mirror of https://github.com/paboyle/Grid.git
synced 2025-07-28 10:17:08 +01:00

Compare commits
41 Commits

SHA1:
41f344bbd3
a77cd50b2f
73af020f98
bffb83c46e
7031f37350
829dd74cb2
66e671985d
5afcbcf0f3
9730579312
bfae14d035
b78fc73d19
709f8ae76c
7aa06329d0
9d6a38c44c
6ec5cee368
f2e9a68825
d88750e6b6
821358eda7
fce6e1f135
8f0bb3e676
262c70d967
da43ef7c2d
7b60ab5df1
f6b961a64e
f1ed988aa3
eea51bb604
9203126aa5
f90ba4712a
3737a24096
d418f78352
25163998a0
dc546aaa4b
5364d580c9
2a9a6347e3
cfdb56f314
b517e88db3
bb317aba8d
644cc6647e
72397ce23b
d60a80c098
bb8b6d9d73
@@ -51,11 +51,13 @@ directory

#pragma nv_diag_suppress cast_to_qualified_type
//disables nvcc specific warning in many files
#pragma nv_diag_suppress esa_on_defaulted_function_ignored
#pragma nv_diag_suppress declared_but_not_referenced
#pragma nv_diag_suppress extra_semicolon
#else
//disables nvcc specific warning in json.hpp
#pragma diag_suppress unsigned_compare_with_zero
#pragma diag_suppress cast_to_qualified_type
#pragma diag_suppress declared_but_not_referenced
//disables nvcc specific warning in many files
#pragma diag_suppress esa_on_defaulted_function_ignored
#pragma diag_suppress extra_semicolon

@@ -269,7 +269,9 @@ public:
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
Linop.HermOp(T0,y);
grid->Barrier();
axpby(T1,xscale,mscale,y,in);
grid->Barrier();

// sum = .5 c[0] T0 + c[1] T1
// out = ()*T0 + Coeffs[1]*T1;

@@ -65,6 +65,7 @@ NAMESPACE_BEGIN(Grid);
#endif

enum GridBLASOperation_t { GridBLAS_OP_N, GridBLAS_OP_T, GridBLAS_OP_C } ;
enum GridBLASPrecision_t { GridBLAS_PRECISION_DEFAULT, GridBLAS_PRECISION_16F, GridBLAS_PRECISION_16BF, GridBLAS_PRECISION_TF32 };

class GridBLAS {
public:
@@ -97,7 +98,21 @@ public:
gridblasInit=1;
}
}

#ifdef GRID_CUDA
cublasComputeType_t toDataType(GridBLASPrecision_t p) {
switch (p) {
case GridBLAS_PRECISION_16F:
return CUBLAS_COMPUTE_32F_FAST_16F;
case GridBLAS_PRECISION_16BF:
return CUBLAS_COMPUTE_32F_FAST_16BF;
case GridBLAS_PRECISION_TF32:
return CUBLAS_COMPUTE_32F_FAST_TF32;
default:
assert(0);
}
}
#endif
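
The new GridBLASPrecision_t argument lets callers of the batched GEMMs opt into reduced-precision compute; in this patch only the CUDA single-precision complex path honours a non-default setting (via cublasGemmBatchedEx below), while the HIP, SYCL and CPU paths assert GridBLAS_PRECISION_DEFAULT. A caller-side sketch, with hypothetical buffer setup that is not part of the diff:

// Sketch only: batchCount device pointers in Amk/Bkn/Cmn are assumed prepared.
GridBLAS blas;
deviceVector<ComplexF*> Amk, Bkn, Cmn;
int64_t m = 16, n = 16, k = 16;
blas.gemmBatched(m, n, k,
                 ComplexF(1.0, 0.0), Amk, Bkn,
                 ComplexF(0.0, 0.0), Cmn,
                 GridBLAS_PRECISION_TF32);   // TF32 tensor-core compute on CUDA builds
blas.synchronise();                          // the batched GEMM is asynchronous on the GPU back ends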
// Force construct once
GridBLAS() { Init(); };
~GridBLAS() { };
@@ -138,8 +153,10 @@ public:
deviceVector<ComplexD*> &Amk, // pointer list to matrices
deviceVector<ComplexD*> &Bkn,
ComplexD beta,
deviceVector<ComplexD*> &Cmn)
deviceVector<ComplexD*> &Cmn,
GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
{
assert(precision == GridBLAS_PRECISION_DEFAULT);
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
m,n,k,
alpha,
@@ -201,8 +218,10 @@ public:
deviceVector<ComplexD*> &Amk, // pointer list to matrices
deviceVector<ComplexD*> &Bkn,
ComplexD beta,
deviceVector<ComplexD*> &Cmn)
deviceVector<ComplexD*> &Cmn,
GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
{
assert(precision == GridBLAS_PRECISION_DEFAULT);
RealD t2=usecond();
int32_t batchCount = Amk.size();
assert(Bkn.size()==batchCount);
@@ -448,7 +467,8 @@ public:
deviceVector<ComplexF*> &Amk, // pointer list to matrices
deviceVector<ComplexF*> &Bkn,
ComplexF beta,
deviceVector<ComplexF*> &Cmn)
deviceVector<ComplexF*> &Cmn,
GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
{
RealD t2=usecond();
int32_t batchCount = Amk.size();
@@ -473,6 +493,7 @@ public:
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
#ifdef GRID_HIP
assert(precision == GridBLAS_PRECISION_DEFAULT);
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
@@ -503,50 +524,67 @@ public:
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
auto err = cublasCgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(cuComplex *) &alpha_p[0],
(cuComplex **)&Amk[0], lda,
(cuComplex **)&Bkn[0], ldb,
(cuComplex *) &beta_p[0],
(cuComplex **)&Cmn[0], ldc,
batchCount);
cublasStatus_t err;
if (precision == GridBLAS_PRECISION_DEFAULT) {
err = cublasCgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(cuComplex *) &alpha_p[0],
(cuComplex **)&Amk[0], lda,
(cuComplex **)&Bkn[0], ldb,
(cuComplex *) &beta_p[0],
(cuComplex **)&Cmn[0], ldc,
batchCount);
} else {
cublasComputeType_t compute_precision = toDataType(precision);
err = cublasGemmBatchedEx(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(void *) &alpha_p[0],
(void **)&Amk[0], CUDA_C_32F, lda,
(void **)&Bkn[0], CUDA_C_32F, ldb,
(void *) &beta_p[0],
(void **)&Cmn[0], CUDA_C_32F, ldc,
batchCount, compute_precision, CUBLAS_GEMM_DEFAULT);
}
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
int64_t batchCount64=batchCount;

oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;

if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;

oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
&iOpA,
&iOpB,
&m64,&n64,&k64,
(ComplexF *) &alpha_p[0],
(const ComplexF **)&Amk[0], (const int64_t *)&lda64,
(const ComplexF **)&Bkn[0], (const int64_t *)&ldb64,
(ComplexF *) &beta_p[0],
(ComplexF **)&Cmn[0], (const int64_t *)&ldc64,
(int64_t)1,&batchCount64,std::vector<sycl::event>());
assert(precision == GridBLAS_PRECISION_DEFAULT);
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
int64_t batchCount64=batchCount;

oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;

if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;

oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
&iOpA,
&iOpB,
&m64,&n64,&k64,
(ComplexF *) &alpha_p[0],
(const ComplexF **)&Amk[0], (const int64_t *)&lda64,
(const ComplexF **)&Bkn[0], (const int64_t *)&ldb64,
(ComplexF *) &beta_p[0],
(ComplexF **)&Cmn[0], (const int64_t *)&ldc64,
(int64_t)1,&batchCount64,std::vector<sycl::event>());
synchronise();
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
assert(precision == GridBLAS_PRECISION_DEFAULT);
// Need a default/reference implementation; use Eigen
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
@@ -946,6 +984,336 @@ public:
RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
}

/*
Inverse and Determinant

- CPU version uses Eigen
- GPU version uses LAPACK-compatible getrf / getri

Design comment: Eigen does not expose getrf / getri in a LAPACK compatible manner.
Overhead to go through getrf / getri for CPU version too large.
Current interface therefore only guarantees the inverse and determinant
functions on all platforms but not the getrf / getri ones.
*/
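
Under that contract only inverseBatched and determinantBatched are portable; getrfBatched / getriBatched exist solely on the GPU branch that follows. A minimal usage sketch (matrix allocation and filling are assumed and not shown in the diff):

// Sketch only: Ann/Cnn hold batchCount device pointers to n x n column-major
// ComplexD blocks. On the GPU branch Ann[b] is left holding its LU factors,
// so refill the inputs before reusing them.
GridBLAS blas;
int64_t n = 12;
deviceVector<ComplexD*> Ann, Cnn;     // batchCount pointers each, assumed prepared
blas.inverseBatched(n, Ann, Cnn);     // Cnn[b] <- Ann[b]^{-1}
// determinantBatched(n, Ann, dets) has the same shape, with dets[b] pointing at a single ComplexD result per matrix.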
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)

void inverseBatched(int64_t n,
deviceVector<ComplexD*> &Ann,
deviceVector<ComplexD*> &Cnn) {

int64_t batchCount = Ann.size();
assert(batchCount == Cnn.size());
thread_for(p,batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAnn(Ann[p],n,n);
Eigen::Map<Eigen::MatrixXcd> eCnn(Cnn[p],n,n);
eCnn = eAnn.inverse();
});
}

void inverseBatched(int64_t n,
deviceVector<ComplexF*> &Ann,
deviceVector<ComplexF*> &Cnn) {

int64_t batchCount = Ann.size();
assert(batchCount == Cnn.size());
thread_for(p,batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAnn(Ann[p],n,n);
Eigen::Map<Eigen::MatrixXcf> eCnn(Cnn[p],n,n);
eCnn = eAnn.inverse();
});
}

void determinantBatched(int64_t n,
deviceVector<ComplexD*> &Ann,
deviceVector<ComplexD*> &C) {

int64_t batchCount = Ann.size();
assert(batchCount == C.size());
thread_for(p,batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAnn(Ann[p],n,n);
*C[p] = eAnn.determinant();
});
}

void determinantBatched(int64_t n,
deviceVector<ComplexF*> &Ann,
deviceVector<ComplexF*> &C) {

int64_t batchCount = Ann.size();
assert(batchCount == C.size());
thread_for(p,batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAnn(Ann[p],n,n);
*C[p] = eAnn.determinant();
});
}

#else

#ifdef GRID_SYCL
template<typename T>
void getrfBatchedSYCL(int64_t n,
deviceVector<T*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info) {

int64_t batchCount = Ann.size();

static deviceVector<T> scratchpad;
int64_t sp_size = oneapi::mkl::lapack::getrf_batch_scratchpad_size<T>(*gridblasHandle, &n, &n, &n, (int64_t)1, &batchCount);
if (sp_size > scratchpad.size())
scratchpad.resize(sp_size);

static deviceVector<int64_t*> _ipiv;
if (batchCount > _ipiv.size())
_ipiv.resize(batchCount);
int64_t** p_ipiv = &_ipiv[0];
int64_t* pipiv = &ipiv[0];

accelerator_for(i, batchCount, 1, { p_ipiv[i] = &pipiv[i*n]; });

oneapi::mkl::lapack::getrf_batch(*gridblasHandle,
&n, &n,
(T **)&Ann[0],
&n,
(int64_t**)&_ipiv[0],
(int64_t)1, &batchCount,
(T*)&scratchpad[0], (int64_t)scratchpad.size(),
std::vector<sycl::event>());
synchronise();
}
#endif

void getrfBatched(int64_t n,
deviceVector<ComplexD*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info)
{
int64_t batchCount = Ann.size();
assert(ipiv.size()==batchCount*n);
assert(info.size()==batchCount);

#ifdef GRID_HIP
auto err = hipblasZgetrfBatched(gridblasHandle,(int)n,
(hipblasDoubleComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(int*) &info[0],
(int)batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
auto err = cublasZgetrfBatched(gridblasHandle, (int)n,
(cuDoubleComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(int*) &info[0],
(int)batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
getrfBatchedSYCL(n, Ann, ipiv, info);
#endif
}

void getrfBatched(int64_t n,
deviceVector<ComplexF*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info)
{
int64_t batchCount = Ann.size();
assert(ipiv.size()==batchCount*n);
assert(info.size()==batchCount);

#ifdef GRID_HIP
auto err = hipblasCgetrfBatched(gridblasHandle,(int)n,
(hipblasComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(int*) &info[0],
(int)batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
auto err = cublasCgetrfBatched(gridblasHandle, (int)n,
(cuComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(int*) &info[0],
(int)batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
getrfBatchedSYCL(n, Ann, ipiv, info);
#endif
}

#ifdef GRID_SYCL
template<typename T>
void getriBatchedSYCL(int64_t n,
deviceVector<T*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info,
deviceVector<T*> &Cnn) {

int64_t batchCount = Ann.size();

static deviceVector<T> scratchpad;
int64_t sp_size = oneapi::mkl::lapack::getri_batch_scratchpad_size<T>(*gridblasHandle, &n, &n, (int64_t)1, &batchCount);
if (sp_size > scratchpad.size())
scratchpad.resize(sp_size);

static deviceVector<int64_t*> _ipiv;
if (batchCount > _ipiv.size())
_ipiv.resize(batchCount);
int64_t** p_ipiv = &_ipiv[0];
int64_t* pipiv = &ipiv[0];

accelerator_for(i, batchCount, 1, { p_ipiv[i] = &pipiv[i*n]; });

oneapi::mkl::lapack::getri_batch(*gridblasHandle,
&n,
(T **)&Ann[0],
&n,
(int64_t**)p_ipiv,
(int64_t)1, &batchCount,
(T *)&scratchpad[0], (int64_t)scratchpad.size(),
std::vector<sycl::event>());

synchronise();

T** pA = &Ann[0];
T** pC = &Cnn[0];
accelerator_for(i, batchCount*n*n, 1, {
auto j = i / batchCount;
auto k = i % batchCount;
pC[k][j] = pA[k][j];
});
}

#endif

void getriBatched(int64_t n,
deviceVector<ComplexD*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info,
deviceVector<ComplexD*> &Cnn)
{
int64_t batchCount = Ann.size();
assert(ipiv.size()==batchCount*n);
assert(info.size()==batchCount);
assert(Cnn.size()==batchCount);

#ifdef GRID_HIP
auto err = hipblasZgetriBatched(gridblasHandle,(int)n,
(hipblasDoubleComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(hipblasDoubleComplex **)&Cnn[0], (int)n,
(int*) &info[0],
(int)batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
auto err = cublasZgetriBatched(gridblasHandle, (int)n,
(cuDoubleComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(cuDoubleComplex **)&Cnn[0], (int)n,
(int*) &info[0],
(int)batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
getriBatchedSYCL(n, Ann, ipiv, info, Cnn);
#endif
}

void getriBatched(int64_t n,
deviceVector<ComplexF*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info,
deviceVector<ComplexF*> &Cnn)
{
int64_t batchCount = Ann.size();
assert(ipiv.size()==batchCount*n);
assert(info.size()==batchCount);
assert(Cnn.size()==batchCount);

#ifdef GRID_HIP
auto err = hipblasCgetriBatched(gridblasHandle,(int)n,
(hipblasComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(hipblasComplex **)&Cnn[0], (int)n,
(int*) &info[0],
(int)batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
auto err = cublasCgetriBatched(gridblasHandle, (int)n,
(cuComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(cuComplex **)&Cnn[0], (int)n,
(int*) &info[0],
(int)batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
getriBatchedSYCL(n, Ann, ipiv, info, Cnn);
#endif
}

template<typename dtype>
void inverseBatched(int64_t n,
deviceVector<dtype*> &Ann, // this will be overwritten with LU decomposition
deviceVector<dtype*> &Cnn // this will be overwritten with the inverse
) {

int64_t batchCount = Ann.size();
RealD t0 = usecond();
deviceVector<int64_t> ipiv(batchCount*n);
deviceVector<int64_t> info(batchCount);

//RealD t1 = usecond();
getrfBatched(n, Ann, ipiv, info);
// test info for non-invertibility? set to nan if yes?
getriBatched(n, Ann, ipiv, info, Cnn);
//synchronise();
//RealD t2 = usecond();
//std::cout << GridLogMessage << "Temp " << t1-t0 << " rf/ri " << t2-t1 << std::endl;
}

template<typename dtype>
void determinantBatched(int64_t n,
deviceVector<dtype*> &Ann, // this will be overwritten with LU decomposition
deviceVector<dtype*> &C // this will be overwritten with determinant
) {

int64_t batchCount = Ann.size();
//RealD t0 = usecond();
deviceVector<int64_t> ipiv(batchCount*n);
deviceVector<int64_t> info(batchCount);

dtype** pAnn = (dtype**)&Ann[0];
dtype** pC = (dtype**)&C[0];
#if defined(GRID_CUDA) || defined(GRID_HIP)
int* pipiv = (int*)&ipiv[0];
#else
int64_t* pipiv = (int64_t*)&ipiv[0];
#endif

//RealD t1 = usecond();
getrfBatched(n, Ann, ipiv, info);
//RealD t2 = usecond();
accelerator_for(i,batchCount,1,{
dtype det = 1.0;
for (int64_t j=0;j<n;j++) {
det *= pAnn[i][n*j + j];
// branchless signs
det *= (pipiv[i*n + j] == j+1) ? (1.0) : (-1.0);
}
*pC[i] = det;
});

//RealD t3 = usecond();
//std::cout << GridLogMessage << "Temp " << t1 - t0 << " rf/ri " << t2-t1 << "final" << t3 - t2 << std::endl;
}
#endif
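
The accelerator loop above reads the determinant directly off the LU factors that getrf leaves in Ann: with partial pivoting $PA = LU$ and unit-diagonal $L$, so, using the 1-based pivot vector returned in ipiv,

$\det(A) = (-1)^{s}\,\prod_{j=1}^{n} U_{jj}, \qquad s = \#\{\, j : \mathrm{ipiv}_j \neq j \,\},$

which is exactly the product of diagonal entries times the "branchless signs" factor in the loop.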
template<class CComplex>
double benchmark(int M, int N, int K, int BATCH)
{

@@ -183,6 +183,7 @@ public:
int recv_from_rank,
int bytes);

int IsOffNode(int rank);
double StencilSendToRecvFrom(void *xmit,
int xmit_to_rank,int do_xmit,
void *recv,
@@ -201,9 +202,9 @@ public:
void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);

double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
void *xmit,void *xmit_comp,
int xmit_to_rank,int do_xmit,
void *recv,
void *recv,void *recv_comp,
int recv_from_rank,int do_recv,
int xbytes,int rbytes,int dir);

@@ -32,6 +32,10 @@ NAMESPACE_BEGIN(Grid);

Grid_MPI_Comm CartesianCommunicator::communicator_world;
#ifdef GRID_CHECKSUM_COMMS
extern void * Grid_backtrace_buffer[_NBACKTRACE];
uint64_t checksum_index = 1;
#endif

////////////////////////////////////////////
// First initialise of comms system
@@ -260,32 +264,39 @@ CartesianCommunicator::~CartesianCommunicator()
}
#ifdef USE_GRID_REDUCTION
void CartesianCommunicator::GlobalSum(float &f){
FlightRecorder::StepLog("GlobalSumP2P");
CartesianCommunicator::GlobalSumP2P(f);
}
void CartesianCommunicator::GlobalSum(double &d)
{
FlightRecorder::StepLog("GlobalSumP2P");
CartesianCommunicator::GlobalSumP2P(d);
}
#else
void CartesianCommunicator::GlobalSum(float &f){
FlightRecorder::StepLog("AllReduce float");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
FlightRecorder::StepLog("AllReduce double");
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
#endif
void CartesianCommunicator::GlobalSum(uint32_t &u){
FlightRecorder::StepLog("AllReduce uint32_t");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
FlightRecorder::StepLog("AllReduce uint64_t");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){
FlightRecorder::StepLog("AllReduceVector");
int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
@@ -294,26 +305,31 @@ void CartesianCommunicator::GlobalXOR(uint32_t &u){
assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint64_t &u){
FlightRecorder::StepLog("GlobalXOR");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalMax(float &f)
{
FlightRecorder::StepLog("GlobalMax");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalMax(double &d)
{
FlightRecorder::StepLog("GlobalMax");
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
FlightRecorder::StepLog("GlobalSumVector(float *)");
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
FlightRecorder::StepLog("GlobalSumVector(double *)");
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
@@ -388,11 +404,16 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
{
std::vector<CommsRequest_t> list;
double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
offbytes += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
offbytes += StencilSendToRecvFromBegin(list,xmit,xmit,dest,dox,recv,recv,from,dor,bytes,bytes,dir);
StencilSendToRecvFromComplete(list,dir);
return offbytes;
}

int CartesianCommunicator::IsOffNode(int rank)
{
int grank = ShmRanks[rank];
if ( grank == MPI_UNDEFINED ) return true;
else return false;
}

#ifdef ACCELERATOR_AWARE_MPI
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
@@ -407,9 +428,9 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
return 0.0; // Do nothing -- no preparation required
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
void *xmit,void *xmit_comp,
int dest,int dox,
void *recv,
void *recv,void *recv_comp,
int from,int dor,
int xbytes,int rbytes,int dir)
{
@@ -433,7 +454,8 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
// std::cout << " StencilSendToRecvFrom "<<dir<<" MPI_Irecv "<<std::hex<<recv<<std::dec<<std::endl;
ierr=MPI_Irecv(recv_comp, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
list.push_back(rrq);
off_node_bytes+=rbytes;
@@ -442,6 +464,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
else {
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL);
// std::cout << " StencilSendToRecvFrom "<<dir<<" CopyDeviceToDevice recv "<<std::hex<<recv<<" remote "<<shm <<std::dec<<std::endl;
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
}
#endif
@@ -450,7 +473,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
ierr =MPI_Isend(xmit_comp, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=xbytes;
@@ -549,6 +572,11 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
* - post device - host send buffer transfer asynch
*/

#ifdef GRID_CHECKSUM_COMMS
rbytes += 8;
xbytes += 8;
#endif

if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
@@ -561,6 +589,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
srq.req = rrq;
srq.host_buf = host_recv;
srq.device_buf = recv;
srq.tag = tag;
list.push_back(srq);
off_node_bytes+=rbytes;
}
@@ -574,7 +603,16 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
host_xmit = this->HostBufferMalloc(xbytes);
CommsRequest_t srq;

#ifdef GRID_CHECKSUM_COMMS
uint64_t xbytes_data = xbytes - 8;
srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes_data); // Make this Asynch
assert(xbytes % 8 == 0);
// flip one bit so that a zero buffer is not consistent
uint64_t xsum = checksum_gpu((uint64_t*)xmit, xbytes_data / 8) ^ (checksum_index + 1 + 1000 * tag);
*(uint64_t*)(((char*)host_xmit) + xbytes_data) = xsum;
#else
srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
#endif

// ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
// assert(ierr==0);
@@ -616,7 +654,11 @@ void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequ

if ( flag ) {
// std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
#ifdef GRID_CHECKSUM_COMMS
acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes - 8);
#else
acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
#endif
list[idx].PacketType=InterNodeReceiveHtoD;
} else {
pending ++;
@@ -669,9 +711,9 @@ void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsReque
}

double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
void *xmit,void *xmit_comp,
int dest,int dox,
void *recv,
void *recv,void *recv_comp,
int from,int dor,
int xbytes,int rbytes,int dir)
{
@@ -779,7 +821,40 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
// }
// }

#ifdef GRID_CHECKSUM_COMMS
for(int r=0;r<list.size();r++){
if ( list[r].PacketType == InterNodeReceiveHtoD ) {
uint64_t rbytes_data = list[r].bytes - 8;
uint64_t expected_cs = *(uint64_t*)(((char*)list[r].host_buf) + rbytes_data);
uint64_t computed_cs = checksum_gpu((uint64_t*)list[r].device_buf, rbytes_data / 8) ^ (checksum_index + 1 + 1000 * list[r].tag); //
if (expected_cs != computed_cs) {
// TODO: error message, backtrace, quit

fprintf(stderr, "GRID_CHECKSUM_COMMS error:\n");
fprintf(stderr, " processor = %d\n", (int)_processor);
for(int d=0;d<_processors.size();d++)
fprintf(stderr, " processor_coord[%d] = %d\n", d, _processor_coor[d]);
fprintf(stderr, " hostname: %s\n", GridHostname());
fprintf(stderr, " expected_cs: %ld\n", expected_cs);
fprintf(stderr, " computed_cs: %ld\n", computed_cs);
fprintf(stderr, " dest: %d\n", list[r].dest);
fprintf(stderr, " tag: %d\n", list[r].tag);
fprintf(stderr, " commdir: %d\n", list[r].commdir);
fprintf(stderr, " bytes: %ld\n", (uint64_t)list[r].bytes);

fflush(stderr);

// backtrace
int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);
backtrace_symbols_fd(Grid_backtrace_buffer,symbols, 2);

exit(1);
}
}
}

checksum_index += 1;
#endif
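
With GRID_CHECKSUM_COMMS each host staging buffer is a frame of payload bytes followed by one trailing uint64_t: the rotate-and-XOR checksum of the payload, further XORed with a salt built from checksum_index and the message tag so that an all-zero or stale buffer cannot validate. The host-side helpers below illustrate that frame layout only; checksum64 stands in for the device-side checksum_gpu and none of these names are part of the patch.

#include <cstdint>
#include <cstring>
#include <cassert>

// Host stand-in for the rotate-and-XOR reduction of checksum_gpu
// (the l==0 edge case is guarded here to avoid a 64-bit shift).
static uint64_t checksum64(const uint64_t *w, uint64_t n) {
  uint64_t s = 0;
  for (uint64_t i = 0; i < n; i++) {
    unsigned l = i % 61;
    s ^= l ? ((w[i] << l) | (w[i] >> (64 - l))) : w[i];
  }
  return s;
}

// Sender side: frame = payload (nbytes) + 8-byte checksum trailer.
static void pack_frame(const void *payload, uint64_t nbytes, void *frame, uint64_t salt) {
  assert(nbytes % 8 == 0);
  std::memcpy(frame, payload, nbytes);
  uint64_t cs = checksum64((const uint64_t *)payload, nbytes / 8) ^ salt;
  std::memcpy((char *)frame + nbytes, &cs, sizeof(cs));
}

// Receiver side: recompute over the payload and compare with the trailer.
static bool verify_frame(const void *frame, uint64_t nbytes, uint64_t salt) {
  uint64_t expected, computed;
  std::memcpy(&expected, (const char *)frame + nbytes, sizeof(expected));
  computed = checksum64((const uint64_t *)frame, nbytes / 8) ^ salt;
  return expected == computed;
}

In the stencil path above the salt is checksum_index + 1 + 1000*tag; the Cshift path further down uses 1 + checksum_index.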
list.resize(0); // Delete the list
this->HostBufferFreeAll(); // Clean up the buffer allocs
@@ -794,6 +869,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque

void CartesianCommunicator::StencilBarrier(void)
{
FlightRecorder::StepLog("NodeBarrier");
MPI_Barrier (ShmComm);
}
//void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
@@ -801,11 +877,13 @@ void CartesianCommunicator::StencilBarrier(void)
//}
void CartesianCommunicator::Barrier(void)
{
FlightRecorder::StepLog("GridBarrier");
int ierr = MPI_Barrier(communicator);
assert(ierr==0);
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
FlightRecorder::StepLog("Broadcast");
int ierr=MPI_Bcast(data,
bytes,
MPI_BYTE,
@@ -819,11 +897,13 @@ int CartesianCommunicator::RankWorld(void){
return r;
}
void CartesianCommunicator::BarrierWorld(void){
FlightRecorder::StepLog("BarrierWorld");
int ierr = MPI_Barrier(communicator_world);
assert(ierr==0);
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
FlightRecorder::StepLog("BroadcastWorld");
int ierr= MPI_Bcast(data,
bytes,
MPI_BYTE,
@@ -846,6 +926,7 @@ void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,
}
void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
{
FlightRecorder::StepLog("AllToAll");
// MPI is a pain and uses "int" arguments
// 64*64*64*128*16 == 500Million elements of data.
// When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.

@@ -124,6 +124,8 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
dest=0;
}

int CartesianCommunicator::IsOffNode(int rank) { return false; }

double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int xmit_to_rank,int dox,
void *recv,

@@ -43,10 +43,6 @@ Author: Christoph Lehner <christoph@lhnr.de>
#define GRID_SYCL_LEVEL_ZERO_IPC
#define SHM_SOCKETS
#else
#ifdef HAVE_NUMAIF_H
#warning " Using NUMAIF "
#include <numaif.h>
#endif
#endif
#include <syscall.h>
#endif
@@ -543,49 +539,21 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef ACCELERATOR_AWARE_MPI
// printf("Host buffer allocate for GPU non-aware MPI\n");
#if 0
HostCommBuf= acceleratorAllocHost(bytes);
#else
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
#if 0
#warning "Moving host buffers to specific NUMA domain"
int numa;
char *numa_name=(char *)getenv("MPI_BUF_NUMA");
if(numa_name) {
unsigned long page_size = sysconf(_SC_PAGESIZE);
numa = atoi(numa_name);
unsigned long page_count = bytes/page_size;
std::vector<void *> pages(page_count);
std::vector<int> nodes(page_count,numa);
std::vector<int> status(page_count,-1);
for(unsigned long p=0;p<page_count;p++){
pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
}
int ret = move_pages(0,
page_count,
&pages[0],
&nodes[0],
&status[0],
MPOL_MF_MOVE);
printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
if (ret) perror(" move_pages failed for reason:");
}
#endif
acceleratorPin(HostCommBuf,bytes);
#endif

#endif
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
std::cerr << "SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE);
}
if ( WorldRank == 0 ){
std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
std::cout << Mheader " acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
}
SharedMemoryZero(ShmCommBuf,bytes);
std::cout<< "Setting up IPC"<<std::endl;
if ( WorldRank == 0 ){
std::cout<< Mheader "Setting up IPC"<<std::endl;
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Loop over ranks/gpu's on our node
///////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -616,8 +584,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
if ( err != ZE_RESULT_SUCCESS ) {
std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
}
memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
handle.pid = getpid();
@@ -676,12 +642,12 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef SHM_SOCKETS
myfd=UnixSockets::RecvFileDescriptor();
#else
std::cout<<"mapping seeking remote pid/fd "
<<handle.pid<<"/"
<<handle.fd<<std::endl;
// std::cout<<"mapping seeking remote pid/fd "
// <<handle.pid<<"/"
// <<handle.fd<<std::endl;

int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
// std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
// int myfd = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
myfd = syscall(438,pidfd,handle.fd,0);
int err_t = errno;
@@ -691,7 +657,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
assert(0);
}
#endif
std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
// std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle));
memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));

@@ -700,9 +666,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle pointer is "<<std::hex<<thisBuf<<std::dec<<std::endl;
}
assert(thisBuf!=nullptr);
}
@@ -783,6 +746,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
WorldShmCommBufs[r] =ptr;
// std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
}
std::cout<< Mheader " Intra-node IPC setup is complete "<<std::endl;
_ShmAlloc=1;
_ShmAllocBytes = bytes;
};
@@ -990,7 +954,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
}
#endif

SharedMemoryTest();
// SharedMemoryTest();
}
//////////////////////////////////////////////////////////////////
// On node barrier
@@ -1039,11 +1003,13 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
{
int gpeer = ShmRanks[rank];
assert(gpeer!=ShmRank); // never send to self
// std::cout << "ShmBufferTranslate for rank " << rank<<" peer "<<gpeer<<std::endl;
if (gpeer == MPI_UNDEFINED){
return NULL;
} else {
uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
// std::cout << "ShmBufferTranslate : local,offset,remote "<<std::hex<<local_p<<" "<<offset<<" "<<remote<<std::dec<<std::endl;
return (void *) remote;
}
}

@@ -122,10 +122,10 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{
acceleratorMemSet(dest,0,bytes);
}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{
acceleratorCopyToDevice(src,dest,bytes);
}
//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
//{
// acceleratorCopyToDevice(src,dest,bytes);
//}
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality

@@ -202,7 +202,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<
{
auto buffer_p = & buffer[0];
auto table = MapCshiftTable();
autoView( rhs_v, rhs, AcceleratorWrite);
autoView( rhs_v, rhs, AcceleratorWriteDiscard);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
});
@@ -228,7 +228,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
if(cbmask ==0x3 ) {
int _slice_stride = rhs.Grid()->_slice_stride[dimension];
int _slice_block = rhs.Grid()->_slice_block[dimension];
autoView( rhs_v , rhs, AcceleratorWrite);
autoView( rhs_v , rhs, AcceleratorWriteDiscard);
accelerator_for(nn,e1*e2,1,{
int n = nn%e1;
int b = nn/e1;
@@ -302,7 +302,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
{
auto table = MapCshiftTable();
autoView(rhs_v , rhs, AcceleratorRead);
autoView(lhs_v , lhs, AcceleratorWrite);
autoView(lhs_v , lhs, AcceleratorWriteDiscard);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
});
@@ -29,8 +29,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef _GRID_CSHIFT_MPI_H_
#define _GRID_CSHIFT_MPI_H_

NAMESPACE_BEGIN(Grid);

#ifdef GRID_CHECKSUM_COMMS
extern uint64_t checksum_index;
#endif

const int Cshift_verbose=0;
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
{
@@ -126,8 +130,9 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
#ifndef ACCELERATOR_AWARE_MPI
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
int pad = (8 + sizeof(vobj) - 1) / sizeof(vobj);
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size+pad);
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size+pad);
#endif

int cb= (cbmask==0x2)? Odd : Even;
@@ -143,9 +148,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int comm_proc = ((x+sshift)/rd)%pd;

if (comm_proc==0) {
FlightRecorder::StepLog("Cshift_Copy_plane");
tcopy-=usecond();
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
tcopy+=usecond();
FlightRecorder::StepLog("Cshift_Copy_plane_complete");
} else {

int words = buffer_size;
@@ -153,9 +160,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r

int bytes = words * sizeof(vobj);

FlightRecorder::StepLog("Cshift_Gather_plane");
tgather-=usecond();
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
tgather+=usecond();
FlightRecorder::StepLog("Cshift_Gather_plane_complete");

// int rank = grid->_processor;
int recv_from_rank;
@@ -166,6 +175,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
tcomms-=usecond();
grid->Barrier();

FlightRecorder::StepLog("Cshift_SendRecv");
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
@@ -175,17 +185,46 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
#else
// bouncy bouncy
acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);

#ifdef GRID_CHECKSUM_COMMS
assert(bytes % 8 == 0);
checksum_index++;
uint64_t xsum = checksum_gpu((uint64_t*)&send_buf[0], bytes / 8) ^ (1 + checksum_index);
*(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = xsum;
bytes += 8;
#endif

grid->SendToRecvFrom((void *)&hsend_buf[0],
xmit_to_rank,
(void *)&hrecv_buf[0],
recv_from_rank,
bytes);

#ifdef GRID_CHECKSUM_COMMS
bytes -= 8;
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
uint64_t expected_cs = *(uint64_t*)(((char*)&hrecv_buf[0]) + bytes);
uint64_t computed_cs = checksum_gpu((uint64_t*)&recv_buf[0], bytes / 8) ^ (1 + checksum_index);
std::cout << GridLogComms<< " Cshift: "
<<" dim"<<dimension
<<" shift "<<shift
<< " rank "<< grid->ThisRank()
<<" Coor "<<grid->ThisProcessorCoor()
<<" send "<<xsum<<" to "<<xmit_to_rank
<<" recv "<<computed_cs<<" from "<<recv_from_rank
<<std::endl;
assert(expected_cs == computed_cs);
#else
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
#endif

#endif
FlightRecorder::StepLog("Cshift_SendRecv_complete");

xbytes+=bytes;
grid->Barrier();
tcomms+=usecond();
FlightRecorder::StepLog("Cshift_barrier_complete");

tscatter-=usecond();
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
@@ -249,8 +288,16 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
recv_buf_extract[s].resize(buffer_size);
}
#ifndef ACCELERATOR_AWARE_MPI
hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
#ifdef GRID_CHECKSUM_COMMS
buffer_size += (8 + sizeof(vobj) - 1) / sizeof(vobj);
#endif

static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);

#ifdef GRID_CHECKSUM_COMMS
buffer_size -= (8 + sizeof(vobj) - 1) / sizeof(vobj);
#endif
#endif

int bytes = buffer_size*sizeof(scalar_object);
@@ -313,12 +360,37 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
#else
// bouncy bouncy
acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
#ifdef GRID_CHECKSUM_COMMS
assert(bytes % 8 == 0);
checksum_index++;
uint64_t xsum = checksum_gpu((uint64_t*)send_buf_extract_mpi, bytes / 8) ^ (1 + checksum_index);
*(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = xsum;
bytes += 8;
#endif
grid->SendToRecvFrom((void *)&hsend_buf[0],
xmit_to_rank,
(void *)&hrecv_buf[0],
recv_from_rank,
bytes);
#ifdef GRID_CHECKSUM_COMMS
bytes -= 8;
acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
uint64_t expected_cs = *(uint64_t*)(((char*)&hrecv_buf[0]) + bytes);
uint64_t computed_cs = checksum_gpu((uint64_t*)recv_buf_extract_mpi, bytes / 8) ^ (1 + checksum_index);

std::cout << GridLogComms<< " Cshift_comms_simd: "
<<" dim"<<dimension
<<" shift "<<shift
<< " rank "<< grid->ThisRank()
<<" Coor "<<grid->ThisProcessorCoor()
<<" send "<<xsum<<" to "<<xmit_to_rank
<<" recv "<<computed_cs<<" from "<<recv_from_rank
<<std::endl;
assert(expected_cs == computed_cs);
#else
acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
#endif

#endif

xbytes+=bytes;

@@ -236,7 +236,7 @@ public:
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
vobj vtmp;
vtmp = r;
#if 0
#if 1
deviceVector<vobj> vvtmp(1);
acceleratorPut(vvtmp[0],vtmp);
vobj *vvtmp_p = & vvtmp[0];

@@ -325,8 +325,8 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
assert(ok);
}
FlightRecorder::StepLog("Start global sum");
// grid->GlobalSumP2P(nrm);
grid->GlobalSum(nrm);
grid->GlobalSumP2P(nrm);
// grid->GlobalSum(nrm);
FlightRecorder::StepLog("Finished global sum");
// std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
FlightRecorder::ReductionLog(local,real(nrm));

@@ -87,6 +87,25 @@ template<class Word> Word svm_xor(Word *vec,uint64_t L)
theGridAccelerator->wait();
return ret;
}
template<class Word> Word checksum_gpu(Word *vec,uint64_t L)
{
Word identity; identity=0;
Word ret = 0;
{
sycl::buffer<Word, 1> abuff(&ret, {1});
theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
cgh.parallel_for(sycl::range<1>{L},
Reduction,
[=] (sycl::id<1> index, auto &sum) {
auto l = index % 61;
sum ^= vec[index]<<l | vec[index]>>(64-l);
});
});
}
theGridAccelerator->wait();
return ret;
}
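
Written out, this reduction is a position-sensitive XOR: each 64-bit word is rotated left by its index modulo 61 before being folded in,

$\mathrm{checksum}(v) = \bigoplus_{i=0}^{L-1} \mathrm{rotl}_{64}\!\left(v_i,\; i \bmod 61\right),$

so, unlike a plain XOR, swapped or shifted words change the result; the comms code then XORs this value with a per-message salt before comparing the sender and receiver sides.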
NAMESPACE_END(Grid);

@@ -106,6 +106,47 @@ public:
}

};

#ifdef GRID_LOG_VIEWS
// Little autoscope assister
template<class View>
class ViewCloser
{
View v; // Take a copy of view and call view close when I go out of scope automatically
const char* filename; int line, mode;
public:
ViewCloser(View &_v, const char* _filename, int _line, int _mode) :
v(_v), filename(_filename), line(_line), mode(_mode) {

switch (mode){
case AcceleratorRead:
case AcceleratorWrite:
case CpuRead:
case CpuWrite:
ViewLogger::Log(filename, line, 1, mode, &v[0], v.size() * sizeof(v[0]));
break;
}

};
~ViewCloser() {

switch (mode) {
case AcceleratorWriteDiscard:
case AcceleratorWrite:
case CpuWrite:
ViewLogger::Log(filename, line, -1, mode, &v[0], v.size() * sizeof(v[0]));
break;
}

v.ViewClose();
}
};

#define autoView(l_v,l,mode) \
auto l_v = l.View(mode); \
ViewCloser<decltype(l_v)> _autoView##l_v(l_v,__FILE__,__LINE__,mode);

#else
// Little autoscope assister
template<class View>
class ViewCloser
@@ -119,6 +160,7 @@ class ViewCloser
#define autoView(l_v,l,mode) \
auto l_v = l.View(mode); \
ViewCloser<decltype(l_v)> _autoView##l_v(l_v);
#endif
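
Both branches of the conditional expand autoView to the same call-site shape, so enabling GRID_LOG_VIEWS needs no changes where views are opened. A typical call site, mirroring the Copy_plane pattern earlier in this comparison (the helper and field names are assumptions for illustration):

// Sketch only: open a discard-write view on the destination and a read view
// on the source; each ViewCloser logs (when GRID_LOG_VIEWS is on) and closes
// its view automatically at the end of the scope.
template<class vobj>
void copy_field(Lattice<vobj> &out, Lattice<vobj> &in) {
  autoView(out_v, out, AcceleratorWriteDiscard);
  autoView(in_v,  in,  AcceleratorRead);
  accelerator_for(ss, in_v.size(), vobj::Nsimd(), {
    coalescedWrite(out_v[ss], coalescedRead(in_v[ss]));
  });
}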
/////////////////////////////////////////////////////////////////////////////////////////
// Lattice expression types used by ET to assemble the AST

@@ -69,6 +69,7 @@ GridLogger GridLogMemory (1, "Memory", GridLogColours, "NORMAL");
GridLogger GridLogTracing(1, "Tracing", GridLogColours, "NORMAL");
GridLogger GridLogDebug (1, "Debug", GridLogColours, "PURPLE");
GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
GridLogger GridLogComms (1, "Comms", GridLogColours, "BLUE");
GridLogger GridLogDslash (1, "Dslash", GridLogColours, "BLUE");
GridLogger GridLogIterative (1, "Iterative", GridLogColours, "BLUE");
GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
@@ -84,6 +85,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
GridLogDebug.Active(0);
GridLogPerformance.Active(0);
GridLogDslash.Active(0);
GridLogComms.Active(0);
GridLogIntegrator.Active(1);
GridLogColours.Active(0);
GridLogHMC.Active(1);
@@ -97,6 +99,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1);
if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
if (logstreams[i] == std::string("Dslash")) GridLogDslash.Active(1);
if (logstreams[i] == std::string("Comms")) GridLogComms.Active(1);
if (logstreams[i] == std::string("NoIntegrator"))GridLogIntegrator.Active(0);
if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0);
if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1);

@@ -180,6 +180,7 @@ extern GridLogger GridLogError;
extern GridLogger GridLogWarning;
extern GridLogger GridLogMessage;
extern GridLogger GridLogDebug;
extern GridLogger GridLogComms;
extern GridLogger GridLogPerformance;
extern GridLogger GridLogDslash;
extern GridLogger GridLogIterative;

@@ -154,6 +154,12 @@ public:
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
void SloppyComms(int sloppy)
{
Stencil.SetSloppyComms(sloppy);
StencilEven.SetSloppyComms(sloppy);
StencilOdd.SetSloppyComms(sloppy);
}
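
The same forwarder is repeated in the other fermion-operator headers below, so one call switches whatever mode StencilImpl::SetSloppyComms selects on all three stencils of an operator at once. A hypothetical toggle around an inner solve (the operator instance Dw is assumed, not taken from the diff):

Dw.SloppyComms(1);   // put Stencil, StencilEven and StencilOdd into sloppy-comms mode
// ... run the part of the calculation that tolerates sloppy halo exchange ...
Dw.SloppyComms(0);   // restore normal comms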
|
||||
|
||||
// Copy of the gauge field , with even and odd subsets
|
||||
DoubledGaugeField Umu;
|
||||
|
@@ -179,6 +179,12 @@ public:
|
||||
StencilImpl Stencil;
|
||||
StencilImpl StencilEven;
|
||||
StencilImpl StencilOdd;
|
||||
void SloppyComms(int sloppy)
|
||||
{
|
||||
Stencil.SetSloppyComms(sloppy);
|
||||
StencilEven.SetSloppyComms(sloppy);
|
||||
StencilOdd.SetSloppyComms(sloppy);
|
||||
}
|
||||
|
||||
// Copy of the gauge field , with even and odd subsets
|
||||
DoubledGaugeField Umu;
|
||||
|
@@ -146,6 +146,12 @@ public:
|
||||
StencilImpl Stencil;
|
||||
StencilImpl StencilEven;
|
||||
StencilImpl StencilOdd;
|
||||
void SloppyComms(int sloppy)
|
||||
{
|
||||
Stencil.SetSloppyComms(sloppy);
|
||||
StencilEven.SetSloppyComms(sloppy);
|
||||
StencilOdd.SetSloppyComms(sloppy);
|
||||
}
|
||||
|
||||
// Copy of the gauge field , with even and odd subsets
|
||||
DoubledGaugeField Umu;
|
||||
|
@@ -32,209 +32,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Wilson compressor will need FaceGather policies for:
|
||||
// Periodic, Dirichlet, and partial Dirichlet for DWF
|
||||
///////////////////////////////////////////////////////////////
|
||||
const int dwf_compressor_depth=2;
|
||||
#define DWF_COMPRESS
|
||||
class FaceGatherPartialDWF
|
||||
{
|
||||
public:
|
||||
#ifdef DWF_COMPRESS
|
||||
static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
|
||||
#else
|
||||
static int PartialCompressionFactor(GridBase *grid) { return 1;}
|
||||
#endif
|
||||
template<class vobj,class cobj,class compressor>
|
||||
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
|
||||
const Lattice<vobj> &rhs,
|
||||
cobj *buffer,
|
||||
compressor &compress,
|
||||
int off,int so,int partial)
|
||||
{
|
||||
//DWF only hack: If a direction that is OFF node we use Partial Dirichlet
|
||||
// Shrinks local and remote comms buffers
|
||||
GridBase *Grid = rhs.Grid();
|
||||
int Ls = Grid->_rdimensions[0];
|
||||
#ifdef DWF_COMPRESS
|
||||
int depth=dwf_compressor_depth;
|
||||
#else
|
||||
int depth=Ls/2;
|
||||
#endif
|
||||
std::pair<int,int> *table_v = & table[0];
|
||||
auto rhs_v = rhs.View(AcceleratorRead);
|
||||
int vol=table.size()/Ls;
|
||||
accelerator_forNB( idx,table.size(), vobj::Nsimd(), {
|
||||
Integer i=idx/Ls;
|
||||
Integer s=idx%Ls;
|
||||
Integer sc=depth+s-(Ls-depth);
|
||||
if(s<depth) compress.Compress(buffer[off+i+s*vol],rhs_v[so+table_v[idx].second]);
|
||||
if(s>=Ls-depth) compress.Compress(buffer[off+i+sc*vol],rhs_v[so+table_v[idx].second]);
|
||||
});
|
||||
rhs_v.ViewClose();
|
||||
}
|
||||
template<class decompressor,class Decompression>
|
||||
static void DecompressFace(decompressor decompress,Decompression &dd)
|
||||
{
|
||||
auto Ls = dd.dims[0];
|
||||
#ifdef DWF_COMPRESS
|
||||
int depth=dwf_compressor_depth;
|
||||
#else
|
||||
int depth=Ls/2;
|
||||
#endif
|
||||
// Just pass in the Grid
|
||||
auto kp = dd.kernel_p;
|
||||
auto mp = dd.mpi_p;
|
||||
int size= dd.buffer_size;
|
||||
int vol= size/Ls;
|
||||
accelerator_forNB(o,size,1,{
|
||||
int idx=o/Ls;
|
||||
int s=o%Ls;
|
||||
if ( s < depth ) {
|
||||
int oo=s*vol+idx;
|
||||
kp[o]=mp[oo];
|
||||
} else if ( s >= Ls-depth ) {
|
||||
int sc = depth + s - (Ls-depth);
|
||||
int oo=sc*vol+idx;
|
||||
kp[o]=mp[oo];
|
||||
} else {
|
||||
kp[o] = Zero();//fill rest with zero if partial dirichlet
|
||||
}
|
||||
});
|
||||
}
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Need to gather *interior portions* for ALL s-slices in simd directions
|
||||
// Do the gather as need to treat SIMD lanes differently, and insert zeroes on receive side
|
||||
// Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2.
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
template<class vobj,class cobj,class compressor>
|
||||
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
||||
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
|
||||
compressor &compress,int type,int partial)
|
||||
{
|
||||
GridBase *Grid = rhs.Grid();
|
||||
int Ls = Grid->_rdimensions[0];
|
||||
#ifdef DWF_COMPRESS
|
||||
int depth=dwf_compressor_depth;
|
||||
#else
|
||||
int depth = Ls/2;
|
||||
#endif
|
||||
|
||||
// insertion of zeroes...
|
||||
assert( (table.size()&0x1)==0);
|
||||
int num=table.size()/2;
|
||||
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
|
||||
|
||||
auto rhs_v = rhs.View(AcceleratorRead);
|
||||
auto p0=&pointers[0][0];
|
||||
auto p1=&pointers[1][0];
|
||||
auto tp=&table[0];
|
||||
int nnum=num/Ls;
|
||||
accelerator_forNB(j, num, vobj::Nsimd(), {
|
||||
// Reorders both local and remote comms buffers
|
||||
//
|
||||
int s = j % Ls;
|
||||
int sp1 = (s+depth)%Ls; // peri incremented s slice
|
||||
|
||||
int hxyz= j/Ls;
|
||||
|
||||
int xyz0= hxyz*2; // xyzt part of coor
|
||||
int xyz1= hxyz*2+1;
|
||||
|
||||
int jj= hxyz + sp1*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
|
||||
|
||||
int kk0= xyz0*Ls + s ; // s=0 goes to s=1
|
||||
int kk1= xyz1*Ls + s ; // s=Ls-1 -> s=0
|
||||
compress.CompressExchange(p0[jj],p1[jj],
|
||||
rhs_v[so+tp[kk0 ].second], // Same s, consecutive xyz sites
|
||||
rhs_v[so+tp[kk1 ].second],
|
||||
type);
|
||||
});
|
||||
rhs_v.ViewClose();
|
||||
}
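// --- Illustrative sketch, not part of the diff: the fifth-dimension reorder used in the
// --- exchange above. The output slice for input slice s is sp1=(s+depth)%Ls, which appears
// --- to front-load the last `depth` slices so that the 2*depth boundary slices kept by
// --- partial Dirichlet sit contiguously at the start of the exchange buffer.
static inline int exchange_output_slice(int s,int Ls,int depth)
{
  return (s+depth)%Ls;   // e.g. depth=1, Ls=8: s=7 -> 0, s=0 -> 1, ..., s=6 -> 7
}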
|
||||
// Merge routine is for SIMD faces
|
||||
template<class decompressor,class Merger>
|
||||
static void MergeFace(decompressor decompress,Merger &mm)
|
||||
{
|
||||
auto Ls = mm.dims[0];
|
||||
#ifdef DWF_COMPRESS
|
||||
int depth=dwf_compressor_depth;
|
||||
#else
|
||||
int depth = Ls/2;
|
||||
#endif
|
||||
int num= mm.buffer_size/2; // relate vol and Ls to buffer size
|
||||
auto mp = &mm.mpointer[0];
|
||||
auto vp0= &mm.vpointers[0][0]; // First arg is exchange first
|
||||
auto vp1= &mm.vpointers[1][0];
|
||||
auto type= mm.type;
|
||||
int nnum = num/Ls;
|
||||
accelerator_forNB(o,num,Merger::Nsimd,{
|
||||
|
||||
int s=o%Ls;
|
||||
int hxyz=o/Ls; // xyzt related component
|
||||
int xyz0=hxyz*2;
|
||||
int xyz1=hxyz*2+1;
|
||||
|
||||
int sp = (s+depth)%Ls;
|
||||
int jj= hxyz + sp*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
|
||||
|
||||
int oo0= s+xyz0*Ls;
|
||||
int oo1= s+xyz1*Ls;
|
||||
|
||||
// same ss0, ss1 pair goes to new layout
|
||||
decompress.Exchange(mp[oo0],mp[oo1],vp0[jj],vp1[jj],type);
|
||||
});
|
||||
}
|
||||
};
|
||||
class FaceGatherDWFMixedBCs
|
||||
{
|
||||
public:
|
||||
#ifdef DWF_COMPRESS
|
||||
static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
|
||||
#else
|
||||
static int PartialCompressionFactor(GridBase *grid) {return 1;}
|
||||
#endif
|
||||
|
||||
template<class vobj,class cobj,class compressor>
|
||||
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
|
||||
const Lattice<vobj> &rhs,
|
||||
cobj *buffer,
|
||||
compressor &compress,
|
||||
int off,int so,int partial)
|
||||
{
|
||||
// std::cout << " face gather simple DWF partial "<<partial <<std::endl;
|
||||
if(partial) FaceGatherPartialDWF::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
|
||||
else FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
|
||||
}
|
||||
template<class vobj,class cobj,class compressor>
|
||||
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
||||
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
|
||||
compressor &compress,int type,int partial)
|
||||
{
|
||||
// std::cout << " face gather exch DWF partial "<<partial <<std::endl;
|
||||
if(partial) FaceGatherPartialDWF::Gather_plane_exchange(table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
|
||||
else FaceGatherSimple::Gather_plane_exchange (table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
|
||||
}
|
||||
template<class decompressor,class Merger>
|
||||
static void MergeFace(decompressor decompress,Merger &mm)
|
||||
{
|
||||
int partial = mm.partial;
|
||||
// std::cout << " merge DWF partial "<<partial <<std::endl;
|
||||
if ( partial ) FaceGatherPartialDWF::MergeFace(decompress,mm);
|
||||
else FaceGatherSimple::MergeFace(decompress,mm);
|
||||
}
|
||||
|
||||
template<class decompressor,class Decompression>
|
||||
static void DecompressFace(decompressor decompress,Decompression &dd)
|
||||
{
|
||||
int partial = dd.partial;
|
||||
// std::cout << " decompress DWF partial "<<partial <<std::endl;
|
||||
if ( partial ) FaceGatherPartialDWF::DecompressFace(decompress,dd);
|
||||
else FaceGatherSimple::DecompressFace(decompress,dd);
|
||||
}
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// optimised versions supporting half precision too??? Deprecate
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@@ -242,8 +39,7 @@ public:
|
||||
|
||||
//Could make FaceGather a template param, but then behaviour is runtime not compile time
|
||||
template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
|
||||
class WilsonCompressorTemplate : public FaceGatherDWFMixedBCs
|
||||
// : public FaceGatherSimple
|
||||
class WilsonCompressorTemplate : public FaceGatherSimple
|
||||
{
|
||||
public:
|
||||
|
||||
|
@@ -165,6 +165,12 @@ public:
|
||||
StencilImpl Stencil;
|
||||
StencilImpl StencilEven;
|
||||
StencilImpl StencilOdd;
|
||||
void SloppyComms(int sloppy)
|
||||
{
|
||||
Stencil.SetSloppyComms(sloppy);
|
||||
StencilEven.SetSloppyComms(sloppy);
|
||||
StencilOdd.SetSloppyComms(sloppy);
|
||||
}
|
||||
|
||||
// Copy of the gauge field, with even and odd subsets
|
||||
DoubledGaugeField Umu;
|
||||
|
@@ -204,7 +204,14 @@ public:
|
||||
DoubledGaugeField Umu;
|
||||
DoubledGaugeField UmuEven;
|
||||
DoubledGaugeField UmuOdd;
|
||||
|
||||
|
||||
|
||||
void SloppyComms(int sloppy)
|
||||
{
|
||||
Stencil.SetSloppyComms(sloppy);
|
||||
StencilEven.SetSloppyComms(sloppy);
|
||||
StencilOdd.SetSloppyComms(sloppy);
|
||||
}
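// --- Sketch of intended use, not part of the diff (assumes only that an operator owning
// --- these stencils exposes this member). Toggling it per solve lets e.g. a tolerant inner
// --- solver run with the halved-precision halo exchange implemented later in this diff,
// --- while the outer solve keeps full-precision comms.
template<class FermionOperator>
static void with_sloppy_comms(FermionOperator &D,int sloppy)
{
  D.SloppyComms(sloppy);   // forwards to Stencil, StencilEven and StencilOdd
}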
|
||||
// Comms buffer
|
||||
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
||||
|
||||
|
@@ -57,7 +57,7 @@ public:
|
||||
|
||||
{
|
||||
// RealD eps = 1.0;
|
||||
std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
|
||||
// std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
|
||||
std::vector<Coeff_t> zgamma(this->Ls);
|
||||
for(int s=0;s<this->Ls;s++){
|
||||
zgamma[s] = gamma[s];
|
||||
|
@@ -535,7 +535,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
||||
{
|
||||
autoView(U_v , U,AcceleratorRead);
|
||||
autoView(in_v , in,AcceleratorRead);
|
||||
autoView(out_v,out,AcceleratorWrite);
|
||||
autoView(out_v,out,AcceleratorWriteDiscard);
|
||||
autoView(st_v , st,AcceleratorRead);
|
||||
KERNEL_CALL_ID(GenericDhopSite);
|
||||
}
|
||||
|
@@ -118,7 +118,7 @@ protected:
|
||||
GaugeK); // derivative of SmearBase
|
||||
return SigmaK;
|
||||
}
|
||||
|
||||
public:
|
||||
/*! @brief Returns smeared configuration at level 'Level' */
|
||||
const GaugeField &get_smeared_conf(int Level) const
|
||||
{
|
||||
|
@@ -819,7 +819,7 @@ public:
|
||||
} // if smearingLevels = 0 do nothing
|
||||
}
|
||||
|
||||
private:
|
||||
public:
|
||||
//====================================================================
|
||||
// Override base class here to mask it
|
||||
virtual void fill_smearedSet(GaugeField &U)
|
||||
|
@@ -252,6 +252,11 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const{
|
||||
|
||||
out = in;
|
||||
RealD taus = 0.;
|
||||
|
||||
// Perform initial t=0 measurements
|
||||
for(auto const &meas : this->functions)
|
||||
meas.second(0,taus,out);
|
||||
|
||||
for (unsigned int step = 1; step <= Nstep; step++) { //step indicates the number of smearing steps applied at the time of measurement
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
evolve_step(out, taus);
|
||||
@@ -336,6 +341,11 @@ void WilsonFlowAdaptive<Gimpl>::smear(GaugeField& out, const GaugeField& in) con
|
||||
RealD taus = 0.;
|
||||
RealD eps = init_epsilon;
|
||||
unsigned int step = 0;
|
||||
|
||||
// Perform initial t=0 measurements
|
||||
for(auto const &meas : this->functions)
|
||||
meas.second(step,taus,out);
|
||||
|
||||
do{
|
||||
int step_success = evolve_step_adaptive(out, taus, eps);
|
||||
step += step_success; //step will not be incremented if the integration step fails
|
||||
|
@@ -30,25 +30,26 @@
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
uint64_t DslashFullCount;
|
||||
uint64_t DslashPartialCount;
|
||||
//uint64_t DslashPartialCount;
|
||||
uint64_t DslashDirichletCount;
|
||||
|
||||
void DslashResetCounts(void)
|
||||
{
|
||||
DslashFullCount=0;
|
||||
DslashPartialCount=0;
|
||||
// DslashPartialCount=0;
|
||||
DslashDirichletCount=0;
|
||||
}
|
||||
void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full)
|
||||
{
|
||||
dirichlet = DslashDirichletCount;
|
||||
partial = DslashPartialCount;
|
||||
partial = 0;
|
||||
full = DslashFullCount;
|
||||
}
|
||||
void DslashLogFull(void) { DslashFullCount++;}
|
||||
void DslashLogPartial(void) { DslashPartialCount++;}
|
||||
//void DslashLogPartial(void) { DslashPartialCount++;}
|
||||
void DslashLogDirichlet(void){ DslashDirichletCount++;}
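// --- Sketch of an assumed reporting pattern, not part of the diff: the counters above let a
// --- run summarise how many Dslash applications used full comms versus Dirichlet-truncated
// --- comms; with partial Dirichlet retired, the `partial` slot is always returned as zero.
inline void ReportDslashCountsSketch(void)
{
  uint64_t dirichlet,partial,full;
  DslashGetCounts(dirichlet,partial,full);   // partial is now hard-wired to 0
  std::cout << "Dslash calls: full "<<full<<" dirichlet "<<dirichlet<<" partial "<<partial<<std::endl;
}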
|
||||
|
||||
deviceVector<unsigned char> StencilBuffer::DeviceCommBuf;
|
||||
|
||||
void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
|
||||
int off,std::vector<std::pair<int,int> > & table)
|
||||
|
@@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
// These can move into a params header and be given MacroMagic serialisation
struct DefaultImplParams {
Coordinate dirichlet; // Blocksize of dirichlet BCs
int partialDirichlet;
// int partialDirichlet;
DefaultImplParams() {
dirichlet.resize(0);
partialDirichlet=0;
// partialDirichlet=0;
};
};
@@ -69,6 +69,12 @@ struct DefaultImplParams {
void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
int off,std::vector<std::pair<int,int> > & table);

class StencilBuffer
{
public:
static deviceVector<unsigned char> DeviceCommBuf; // placed in Stencil.cc
};

void DslashResetCounts(void);
void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full);
void DslashLogFull(void);
@@ -113,8 +119,8 @@ class CartesianStencilAccelerator {
|
||||
///////////////////////////////////////////////////
|
||||
// If true, this is partially communicated per face
|
||||
///////////////////////////////////////////////////
|
||||
StencilVector _comms_partial_send;
|
||||
StencilVector _comms_partial_recv;
|
||||
// StencilVector _comms_partial_send;
|
||||
// StencilVector _comms_partial_recv;
|
||||
//
|
||||
StencilVector _comm_buf_size;
|
||||
StencilVector _permute_type;
|
||||
@@ -186,6 +192,11 @@ public:
|
||||
|
||||
void ViewClose(void) { }
|
||||
|
||||
#ifdef GRID_LOG_VIEWS
|
||||
size_t size() { return 0; };
|
||||
uint64_t & operator[](size_t i) { static uint64_t v=0; return v; };
|
||||
#endif
|
||||
|
||||
};
|
||||
|
||||
////////////////////////////////////////
|
||||
@@ -205,16 +216,16 @@ public:
|
||||
struct Packet {
|
||||
void * send_buf;
|
||||
void * recv_buf;
|
||||
#ifndef ACCELERATOR_AWARE_MPI
|
||||
void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE
|
||||
void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE
|
||||
#endif
|
||||
void * compressed_send_buf;
|
||||
void * compressed_recv_buf;
|
||||
Integer to_rank;
|
||||
Integer from_rank;
|
||||
Integer do_send;
|
||||
Integer do_recv;
|
||||
Integer xbytes;
|
||||
Integer rbytes;
|
||||
Integer xbytes_compressed;
|
||||
Integer rbytes_compressed;
|
||||
};
|
||||
struct Merge {
|
||||
static constexpr int Nsimd = vobj::Nsimd();
|
||||
@@ -223,7 +234,7 @@ public:
|
||||
std::vector<cobj *> vpointers;
|
||||
Integer buffer_size;
|
||||
Integer type;
|
||||
Integer partial; // partial dirichlet BCs
|
||||
// Integer partial; // partial dirichlet BCs
|
||||
Coordinate dims;
|
||||
};
|
||||
struct Decompress {
|
||||
@@ -231,7 +242,7 @@ public:
|
||||
cobj * kernel_p;
|
||||
cobj * mpi_p;
|
||||
Integer buffer_size;
|
||||
Integer partial; // partial dirichlet BCs
|
||||
// Integer partial; // partial dirichlet BCs
|
||||
Coordinate dims;
|
||||
};
|
||||
struct CopyReceiveBuffer {
|
||||
@@ -252,9 +263,45 @@ public:
|
||||
|
||||
protected:
|
||||
GridBase * _grid;
|
||||
|
||||
///////////////////////////////////////////////////
|
||||
// Sloppy comms will make a second (compressed) buffer during comms
|
||||
///////////////////////////////////////////////////
|
||||
size_t device_heap_top; //
|
||||
size_t device_heap_bytes;//
|
||||
size_t device_heap_size; //
|
||||
void *DeviceBufferMalloc(size_t bytes)
|
||||
{
|
||||
void *ptr = (void *)device_heap_top;
|
||||
device_heap_top += bytes;
|
||||
device_heap_bytes+= bytes;
|
||||
if ( device_heap_bytes > device_heap_size ) {
|
||||
std::cout << "DeviceBufferMalloc overflow bytes "<<bytes<<" heap bytes "<<device_heap_bytes<<" heap size "<<device_heap_size<<std::endl;
|
||||
assert (device_heap_bytes <= device_heap_size);
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
void DeviceBufferFreeAll(void)
|
||||
{
|
||||
device_heap_size = _unified_buffer_size*sizeof(cobj);
|
||||
// Resize up if necessary, never down
|
||||
if ( StencilBuffer::DeviceCommBuf.size() < device_heap_size ) {
|
||||
StencilBuffer::DeviceCommBuf.resize(device_heap_size);
|
||||
}
|
||||
device_heap_top =(size_t) &StencilBuffer::DeviceCommBuf[0];
|
||||
device_heap_size = StencilBuffer::DeviceCommBuf.size();
|
||||
device_heap_bytes=0;
|
||||
}
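// --- Illustrative sketch, not part of the diff: the two routines above implement a bump
// --- allocator over the single static StencilBuffer::DeviceCommBuf. Each comms phase resets
// --- the heap wholesale and then carves out per-packet compressed buffers with one pointer
// --- increment and no device frees. Condensed, with names local to this sketch:
struct BumpHeapSketch {
  unsigned char *base=nullptr; size_t size=0, used=0;
  void reset(unsigned char *b,size_t s) { base=b; size=s; used=0; }   // cf. DeviceBufferFreeAll
  void *alloc(size_t bytes) {                                         // cf. DeviceBufferMalloc
    assert(used+bytes <= size);   // same overflow check as above
    void *p = base+used; used += bytes; return p;
  }
};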
|
||||
|
||||
public:
|
||||
GridBase *Grid(void) const { return _grid; }
|
||||
|
||||
/////////////////////////////////////////////////////////
|
||||
// Control reduced precision comms
|
||||
/////////////////////////////////////////////////////////
|
||||
int SloppyComms;
|
||||
void SetSloppyComms(int sloppy) { SloppyComms = sloppy; };
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Needed to conveniently communicate gparity parameters into GPU memory
|
||||
// without adding parameters. Perhaps a template parameter to StencilView is
|
||||
@@ -268,7 +315,7 @@ public:
|
||||
}
|
||||
|
||||
int face_table_computed;
|
||||
int partialDirichlet;
|
||||
// int partialDirichlet;
|
||||
int fullDirichlet;
|
||||
std::vector<deviceVector<std::pair<int,int> > > face_table ;
|
||||
deviceVector<int> surface_list;
|
||||
@@ -361,24 +408,145 @@ public:
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Non blocking send and receive. Necessarily parallel.
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
void DecompressPacket(Packet &packet)
|
||||
{
|
||||
if ( !SloppyComms ) return;
|
||||
|
||||
if ( packet.do_recv && _grid->IsOffNode(packet.from_rank) ) {
|
||||
|
||||
typedef typename getPrecision<cobj>::real_scalar_type word;
|
||||
uint64_t words = packet.rbytes/sizeof(word);
|
||||
const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
|
||||
const uint64_t outer = words/nsimd;
|
||||
|
||||
if(sizeof(word)==8) {
|
||||
|
||||
// Can either choose to represent as float vs double and prec change
|
||||
// OR
|
||||
// truncate the mantissa bfp16 style
|
||||
double *dbuf =(double *) packet.recv_buf;
|
||||
float *fbuf =(float *) packet.compressed_recv_buf;
|
||||
|
||||
accelerator_forNB(ss,outer,nsimd,{
|
||||
int lane = acceleratorSIMTlane(nsimd);
|
||||
dbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]; //conversion
|
||||
});
|
||||
|
||||
} else if ( sizeof(word)==4){
|
||||
// Can either choose to represent as half vs float and prec change
|
||||
// OR
|
||||
// truncate the mantissa bfp16 style
|
||||
|
||||
uint32_t *fbuf =(uint32_t *) packet.recv_buf;
|
||||
uint16_t *hbuf =(uint16_t *) packet.compressed_recv_buf;
|
||||
|
||||
accelerator_forNB(ss,outer,nsimd,{
|
||||
int lane = acceleratorSIMTlane(nsimd);
|
||||
fbuf[ss*nsimd+lane] = ((uint32_t)hbuf[ss*nsimd+lane])<<16; //copy back and pad each word with zeroes
|
||||
});
|
||||
|
||||
} else {
|
||||
assert(0 && "unknown floating point precision");
|
||||
}
|
||||
}
|
||||
}
|
||||
void CompressPacket(Packet &packet)
|
||||
{
|
||||
packet.xbytes_compressed = packet.xbytes;
|
||||
packet.compressed_send_buf = packet.send_buf;
|
||||
|
||||
packet.rbytes_compressed = packet.rbytes;
|
||||
packet.compressed_recv_buf = packet.recv_buf;
|
||||
|
||||
if ( !SloppyComms ) {
|
||||
return;
|
||||
}
|
||||
|
||||
typedef typename getPrecision<cobj>::real_scalar_type word;
|
||||
uint64_t words = packet.xbytes/sizeof(word);
|
||||
const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
|
||||
const uint64_t outer = words/nsimd;
|
||||
|
||||
if (packet.do_recv && _grid->IsOffNode(packet.from_rank) ) {
|
||||
|
||||
packet.rbytes_compressed = packet.rbytes/2;
|
||||
packet.compressed_recv_buf = DeviceBufferMalloc(packet.rbytes_compressed);
|
||||
// std::cout << " CompressPacket recv from "<<packet.from_rank<<" "<<std::hex<<packet.compressed_recv_buf<<std::dec<<std::endl;
|
||||
|
||||
}
|
||||
//else {
|
||||
// std::cout << " CompressPacket recv is uncompressed from "<<packet.from_rank<<" "<<std::hex<<packet.compressed_recv_buf<<std::dec<<std::endl;
|
||||
// }
|
||||
|
||||
if (packet.do_send && _grid->IsOffNode(packet.to_rank) ) {
|
||||
|
||||
packet.xbytes_compressed = packet.xbytes/2;
|
||||
packet.compressed_send_buf = DeviceBufferMalloc(packet.xbytes_compressed);
|
||||
// std::cout << " CompressPacket send to "<<packet.to_rank<<" "<<std::hex<<packet.compressed_send_buf<<std::dec<<std::endl;
|
||||
|
||||
if(sizeof(word)==8) {
|
||||
|
||||
double *dbuf =(double *) packet.send_buf;
|
||||
float *fbuf =(float *) packet.compressed_send_buf;
|
||||
|
||||
accelerator_forNB(ss,outer,nsimd,{
|
||||
int lane = acceleratorSIMTlane(nsimd);
|
||||
fbuf[ss*nsimd+lane] = dbuf[ss*nsimd+lane]; // convert fp64 to fp32
|
||||
});
|
||||
|
||||
} else if ( sizeof(word)==4){
|
||||
|
||||
uint32_t *fbuf =(uint32_t *) packet.send_buf;
|
||||
uint16_t *hbuf =(uint16_t *) packet.compressed_send_buf;
|
||||
|
||||
accelerator_forNB(ss,outer,nsimd,{
|
||||
int lane = acceleratorSIMTlane(nsimd);
|
||||
hbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]>>16; // convert as in Bagel/BFM ; bfloat16 ; s7e8 Intel patent
|
||||
});
|
||||
|
||||
} else {
|
||||
assert(0 && "unknown floating point precision");
|
||||
}
|
||||
|
||||
}
|
||||
// else {
|
||||
// std::cout << " CompressPacket send is uncompressed to "<<packet.to_rank<<" "<<std::hex<<packet.compressed_send_buf<<std::dec<<std::endl;
|
||||
// }
|
||||
|
||||
return;
|
||||
}
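// --- Illustrative sketch, not part of the diff: the halving applied above. For double fields
// --- the payload is sent as fp32; for single fields only the top 16 bits of each fp32 word
// --- are sent (a bfloat16-style truncation: sign + 8 exponent bits + 7 mantissa bits), and
// --- DecompressPacket pads the dropped bits with zeroes. Host-side equivalent of the fp32 path:
static inline uint16_t fp32_to_bf16_trunc(float f)
{
  uint32_t u; memcpy(&u,&f,sizeof(u));
  return (uint16_t)(u>>16);              // drop the low 16 mantissa bits
}
static inline float bf16_to_fp32(uint16_t h)
{
  uint32_t u = ((uint32_t)h)<<16;        // pad the dropped bits with zeroes
  float f; memcpy(&f,&u,sizeof(f));
  return f;
}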
|
||||
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||
{
|
||||
// std::cout << "Communicate Begin "<<std::endl;
|
||||
// _grid->Barrier();
|
||||
FlightRecorder::StepLog("Communicate begin");
|
||||
///////////////////////////////////////////////
|
||||
// All GPU kernel tasks must complete
|
||||
// accelerator_barrier(); // All kernels should ALREADY be complete
|
||||
// _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
|
||||
// But the HaloGather had a barrier too.
|
||||
// accelerator_barrier(); All kernels should ALREADY be complete
|
||||
// Everyone is here, so no one is running slow and still using the receive buffer
|
||||
_grid->StencilBarrier();
|
||||
// But the HaloGather had a barrier too.
|
||||
///////////////////////////////////////////////
|
||||
if (SloppyComms) {
|
||||
DeviceBufferFreeAll();
|
||||
}
|
||||
for(int i=0;i<Packets.size();i++){
|
||||
this->CompressPacket(Packets[i]);
|
||||
}
|
||||
if (SloppyComms) {
|
||||
accelerator_barrier();
|
||||
#ifdef NVLINK_GET
|
||||
_grid->StencilBarrier();
|
||||
#endif
|
||||
}
|
||||
|
||||
for(int i=0;i<Packets.size();i++){
|
||||
// std::cout << "Communicate prepare "<<i<<std::endl;
|
||||
// _grid->Barrier();
|
||||
_grid->StencilSendToRecvFromPrepare(MpiReqs,
|
||||
Packets[i].send_buf,
|
||||
Packets[i].compressed_send_buf,
|
||||
Packets[i].to_rank,Packets[i].do_send,
|
||||
Packets[i].recv_buf,
|
||||
Packets[i].compressed_recv_buf,
|
||||
Packets[i].from_rank,Packets[i].do_recv,
|
||||
Packets[i].xbytes,Packets[i].rbytes,i);
|
||||
Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
|
||||
}
|
||||
// std::cout << "Communicate PollDtoH "<<std::endl;
|
||||
// _grid->Barrier();
|
||||
@@ -389,18 +557,22 @@ public:
|
||||
// Starts intranode
|
||||
for(int i=0;i<Packets.size();i++){
|
||||
// std::cout << "Communicate Begin "<<i<<std::endl;
|
||||
// _grid->Barrier();
|
||||
_grid->StencilSendToRecvFromBegin(MpiReqs,
|
||||
Packets[i].send_buf,
|
||||
Packets[i].send_buf,Packets[i].compressed_send_buf,
|
||||
Packets[i].to_rank,Packets[i].do_send,
|
||||
Packets[i].recv_buf,
|
||||
Packets[i].recv_buf,Packets[i].compressed_recv_buf,
|
||||
Packets[i].from_rank,Packets[i].do_recv,
|
||||
Packets[i].xbytes,Packets[i].rbytes,i);
|
||||
Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i);
|
||||
// std::cout << "Communicate Begin started "<<i<<std::endl;
|
||||
// _grid->Barrier();
|
||||
}
|
||||
FlightRecorder::StepLog("Communicate begin has finished");
|
||||
// Get comms started then run checksums
|
||||
// Having this PRIOR to the dslash seems to make Sunspot work... (!)
|
||||
for(int i=0;i<Packets.size();i++){
|
||||
if ( Packets[i].do_send )
|
||||
FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes);
|
||||
FlightRecorder::xmitLog(Packets[i].compressed_send_buf,Packets[i].xbytes_compressed);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -415,14 +587,15 @@ public:
|
||||
// std::cout << "Communicate Complete Complete "<<std::endl;
|
||||
// _grid->Barrier();
|
||||
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
|
||||
if ( this->partialDirichlet ) DslashLogPartial();
|
||||
else if ( this->fullDirichlet ) DslashLogDirichlet();
|
||||
// if ( this->partialDirichlet ) DslashLogPartial();
|
||||
if ( this->fullDirichlet ) DslashLogDirichlet();
|
||||
else DslashLogFull();
|
||||
// acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
|
||||
// accelerator_barrier();
|
||||
for(int i=0;i<Packets.size();i++){
|
||||
this->DecompressPacket(Packets[i]);
|
||||
if ( Packets[i].do_recv )
|
||||
FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
|
||||
FlightRecorder::recvLog(Packets[i].compressed_recv_buf,Packets[i].rbytes_compressed,Packets[i].from_rank);
|
||||
}
|
||||
FlightRecorder::StepLog("Finish communicate complete");
|
||||
}
|
||||
@@ -617,7 +790,7 @@ public:
|
||||
}
|
||||
void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector<Decompress> &dv) {
|
||||
Decompress d;
|
||||
d.partial = this->partialDirichlet;
|
||||
// d.partial = this->partialDirichlet;
|
||||
d.dims = _grid->_fdimensions;
|
||||
d.kernel_p = k_p;
|
||||
d.mpi_p = m_p;
|
||||
@@ -626,7 +799,7 @@ public:
|
||||
}
|
||||
void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
|
||||
Merge m;
|
||||
m.partial = this->partialDirichlet;
|
||||
// m.partial = this->partialDirichlet;
|
||||
m.dims = _grid->_fdimensions;
|
||||
m.type = type;
|
||||
m.mpointer = merge_p;
|
||||
@@ -731,8 +904,8 @@ public:
|
||||
int block = dirichlet_block[dimension];
|
||||
this->_comms_send[ii] = comm_dim;
|
||||
this->_comms_recv[ii] = comm_dim;
|
||||
this->_comms_partial_send[ii] = 0;
|
||||
this->_comms_partial_recv[ii] = 0;
|
||||
// this->_comms_partial_send[ii] = 0;
|
||||
// this->_comms_partial_recv[ii] = 0;
|
||||
if ( block && comm_dim ) {
|
||||
assert(abs(displacement) < ld );
|
||||
// Quiesce communication across block boundaries
|
||||
@@ -753,10 +926,10 @@ public:
|
||||
if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0;
|
||||
if ( ( (ld*pc ) % block ) == 0 ) this->_comms_recv[ii] = 0;
|
||||
}
|
||||
if ( partialDirichlet ) {
|
||||
this->_comms_partial_send[ii] = !this->_comms_send[ii];
|
||||
this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
|
||||
}
|
||||
// if ( partialDirichlet ) {
|
||||
// this->_comms_partial_send[ii] = !this->_comms_send[ii];
|
||||
// this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
|
||||
// }
|
||||
}
|
||||
}
|
||||
}
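// --- Illustrative sketch, not part of the diff (the displacement-direction bookkeeping of the
// --- full routine is elided): with Dirichlet block size `block` and local extent `ld` in a
// --- dimension, a rank at processor coordinate `pc` switches off the corresponding face comms
// --- exactly when the products below land on a block boundary, as in the tests above.
static inline bool block_boundary_send(int ld,int pc,int block) { return ((ld*(pc+1)) % block) == 0; }
static inline bool block_boundary_recv(int ld,int pc,int block) { return ((ld*pc    ) % block) == 0; }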
|
||||
@@ -768,6 +941,7 @@ public:
|
||||
Parameters p=Parameters(),
|
||||
bool preserve_shm=false)
|
||||
{
|
||||
SloppyComms = 0;
|
||||
face_table_computed=0;
|
||||
_grid = grid;
|
||||
this->parameters=p;
|
||||
@@ -785,7 +959,7 @@ public:
|
||||
this->same_node.resize(npoints);
|
||||
|
||||
if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
|
||||
partialDirichlet = p.partialDirichlet;
|
||||
// partialDirichlet = p.partialDirichlet;
|
||||
DirichletBlock(p.dirichlet); // comms send/recv set up
|
||||
fullDirichlet=0;
|
||||
for(int d=0;d<p.dirichlet.size();d++){
|
||||
@@ -866,7 +1040,7 @@ public:
|
||||
/////////////////////////////////////////////////////////////////////////////////
|
||||
const int Nsimd = grid->Nsimd();
|
||||
|
||||
// Allow for multiple stencils to exist simultaneously
|
||||
// Allow for multiple stencils to be communicated simultaneously
|
||||
if (!preserve_shm)
|
||||
_grid->ShmBufferFreeAll();
|
||||
|
||||
@@ -934,7 +1108,8 @@ public:
|
||||
GridBase *grid=_grid;
|
||||
const int Nsimd = grid->Nsimd();
|
||||
|
||||
int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
|
||||
// int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
|
||||
int comms_recv = this->_comms_recv[point];
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
int ld = _grid->_ldimensions[dimension];
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
@@ -1123,8 +1298,8 @@ public:
|
||||
|
||||
int comms_send = this->_comms_send[point];
|
||||
int comms_recv = this->_comms_recv[point];
|
||||
int comms_partial_send = this->_comms_partial_send[point] ;
|
||||
int comms_partial_recv = this->_comms_partial_recv[point] ;
|
||||
// int comms_partial_send = this->_comms_partial_send[point] ;
|
||||
// int comms_partial_recv = this->_comms_partial_recv[point] ;
|
||||
|
||||
assert(rhs.Grid()==_grid);
|
||||
// conformable(_grid,rhs.Grid());
|
||||
@@ -1159,11 +1334,11 @@ public:
|
||||
int rbytes;
|
||||
|
||||
if ( comms_send ) xbytes = bytes; // Full send
|
||||
else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||
// else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||
else xbytes = 0; // full dirichlet
|
||||
|
||||
if ( comms_recv ) rbytes = bytes;
|
||||
else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||
// else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||
else rbytes = 0;
|
||||
|
||||
int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
|
||||
@@ -1190,7 +1365,8 @@ public:
|
||||
}
|
||||
|
||||
|
||||
if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
|
||||
// if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
|
||||
if ( compress.DecompressionStep()&&comms_recv) {
|
||||
recv_buf=u_simd_recv_buf[0];
|
||||
} else {
|
||||
recv_buf=this->u_recv_buf_p;
|
||||
@@ -1224,7 +1400,8 @@ public:
|
||||
#endif
|
||||
|
||||
// std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl;
|
||||
compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
|
||||
// compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
|
||||
compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,0);
|
||||
|
||||
int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,xbytes,rbytes,cbmask);
|
||||
if ( !duplicate ) { // Force comms for now
|
||||
@@ -1233,8 +1410,8 @@ public:
|
||||
// Build a list of things to do after we synchronise GPUs
|
||||
// Start comms now???
|
||||
///////////////////////////////////////////////////////////
|
||||
int do_send = (comms_send|comms_partial_send) && (!shm_send );
|
||||
int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
|
||||
int do_send = (comms_send) && (!shm_send );
|
||||
int do_recv = (comms_send) && (!shm_recv );
|
||||
AddPacket((void *)&send_buf[comm_off],
|
||||
(void *)&recv_buf[comm_off],
|
||||
xmit_to_rank, do_send,
|
||||
@@ -1242,7 +1419,7 @@ public:
|
||||
xbytes,rbytes);
|
||||
}
|
||||
|
||||
if ( (compress.DecompressionStep() && comms_recv) || comms_partial_recv ) {
|
||||
if ( (compress.DecompressionStep() && comms_recv) ) {
|
||||
AddDecompress(&this->u_recv_buf_p[comm_off],
|
||||
&recv_buf[comm_off],
|
||||
words,Decompressions);
|
||||
@@ -1264,8 +1441,8 @@ public:
|
||||
|
||||
int comms_send = this->_comms_send[point];
|
||||
int comms_recv = this->_comms_recv[point];
|
||||
int comms_partial_send = this->_comms_partial_send[point] ;
|
||||
int comms_partial_recv = this->_comms_partial_recv[point] ;
|
||||
// int comms_partial_send = this->_comms_partial_send[point] ;
|
||||
// int comms_partial_recv = this->_comms_partial_recv[point] ;
|
||||
|
||||
int fd = _grid->_fdimensions[dimension];
|
||||
int rd = _grid->_rdimensions[dimension];
|
||||
@@ -1340,18 +1517,20 @@ public:
|
||||
|
||||
|
||||
if ( comms_send ) xbytes = bytes;
|
||||
else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||
// else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||
else xbytes = 0;
|
||||
|
||||
if ( comms_recv ) rbytes = bytes;
|
||||
else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||
// else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
|
||||
else rbytes = 0;
|
||||
|
||||
// Gathers SIMD lanes for send and merge
|
||||
// Different faces can be full comms or partial comms with multiple ranks per node
|
||||
if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
|
||||
// if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
|
||||
if ( comms_send || comms_recv ) {
|
||||
|
||||
int partial = partialDirichlet;
|
||||
// int partial = partialDirichlet;
|
||||
int partial = 0;
|
||||
compressor::Gather_plane_exchange(face_table[face_idx],rhs,
|
||||
spointers,dimension,sx,cbmask,
|
||||
compress,permute_type,partial );
|
||||
@@ -1417,7 +1596,8 @@ public:
|
||||
if ( (bytes != rbytes) && (rbytes!=0) ){
|
||||
acceleratorMemSet(rp,0,bytes); // Zero-prefill the comms buffer
|
||||
}
|
||||
int do_send = (comms_send|comms_partial_send) && (!shm_send );
|
||||
// int do_send = (comms_send|comms_partial_send) && (!shm_send );
|
||||
int do_send = (comms_send) && (!shm_send );
|
||||
AddPacket((void *)sp,(void *)rp,
|
||||
xmit_to_rank,do_send,
|
||||
recv_from_rank,do_send,
|
||||
@@ -1431,7 +1611,8 @@ public:
|
||||
}
|
||||
}
|
||||
// rpointer may be doing a remote read in the gather over SHM
|
||||
if ( comms_recv|comms_partial_recv ) {
|
||||
// if ( comms_recv|comms_partial_recv ) {
|
||||
if ( comms_recv ) {
|
||||
AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
|
||||
}
|
||||
|
||||
|
@@ -67,7 +67,7 @@ void acceleratorInit(void)
|
||||
printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name);
|
||||
|
||||
|
||||
GPU_PROP_FMT(totalGlobalMem,"%lld");
|
||||
GPU_PROP_FMT(totalGlobalMem,"%zu");
|
||||
GPU_PROP(managedMemory);
|
||||
GPU_PROP(isMultiGpuBoard);
|
||||
GPU_PROP(warpSize);
|
||||
@@ -240,7 +240,7 @@ void acceleratorInit(void)
|
||||
|
||||
char hostname[HOST_NAME_MAX+1];
|
||||
gethostname(hostname, HOST_NAME_MAX+1);
|
||||
if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname);
|
||||
if ( rank==0 ) printf("AcceleratorSyclInit world_rank %d is host %s \n",world_rank,hostname);
|
||||
|
||||
auto devices = sycl::device::get_devices();
|
||||
for(int d = 0;d<devices.size();d++){
|
||||
|
@@ -215,7 +215,7 @@ inline void *acceleratorAllocHost(size_t bytes)
|
||||
auto err = cudaMallocHost((void **)&ptr,bytes);
|
||||
if( err != cudaSuccess ) {
|
||||
ptr = (void *) NULL;
|
||||
printf(" cudaMallocHost failed for %d %s \n",bytes,cudaGetErrorString(err));
|
||||
printf(" cudaMallocHost failed for %zu %s \n",bytes,cudaGetErrorString(err));
|
||||
assert(0);
|
||||
}
|
||||
return ptr;
|
||||
@@ -226,7 +226,7 @@ inline void *acceleratorAllocShared(size_t bytes)
|
||||
auto err = cudaMallocManaged((void **)&ptr,bytes);
|
||||
if( err != cudaSuccess ) {
|
||||
ptr = (void *) NULL;
|
||||
printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
|
||||
printf(" cudaMallocManaged failed for %zu %s \n",bytes,cudaGetErrorString(err));
|
||||
assert(0);
|
||||
}
|
||||
return ptr;
|
||||
@@ -237,7 +237,7 @@ inline void *acceleratorAllocDevice(size_t bytes)
|
||||
auto err = cudaMalloc((void **)&ptr,bytes);
|
||||
if( err != cudaSuccess ) {
|
||||
ptr = (void *) NULL;
|
||||
printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
|
||||
printf(" cudaMalloc failed for %zu %s \n",bytes,cudaGetErrorString(err));
|
||||
}
|
||||
return ptr;
|
||||
};
|
||||
@@ -251,7 +251,7 @@ inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { c
|
||||
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
|
||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
|
||||
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
|
||||
acceleratorCopyToDevice(to,from,bytes, cudaMemcpyHostToDevice);
|
||||
acceleratorCopyToDevice(from,to,bytes);
|
||||
return 0;
|
||||
}
|
||||
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
|
||||
@@ -337,7 +337,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
||||
cgh.parallel_for( \
|
||||
sycl::nd_range<3>(global,local), \
|
||||
[=] (sycl::nd_item<3> item) /*mutable*/ \
|
||||
[[intel::reqd_sub_group_size(16)]] \
|
||||
[[sycl::reqd_sub_group_size(16)]] \
|
||||
{ \
|
||||
auto iter1 = item.get_global_id(0); \
|
||||
auto iter2 = item.get_global_id(1); \
|
||||
|
@@ -28,11 +28,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
/* END LEGAL */
|
||||
#pragma once
|
||||
|
||||
#ifndef MIN
|
||||
#define MIN(x,y) ((x)>(y)?(y):(x))
|
||||
#endif
|
||||
|
||||
|
||||
// Introduce a class to gain deterministic bit reproducible reduction.
|
||||
// make static; perhaps just a namespace is required.
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
@@ -372,4 +372,53 @@ void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef GRID_LOG_VIEWS
|
||||
|
||||
bool ViewLogger::Enabled = false;
|
||||
std::vector<ViewLogger::Entry_t> ViewLogger::LogVector;
|
||||
|
||||
void ViewLogger::Begin() { Enabled = true; LogVector.resize(0); }
|
||||
void ViewLogger::End() { Enabled = false; }
|
||||
|
||||
void ViewLogger::Log(const char* filename, int line, int index, int mode, void* data, uint64_t bytes)
|
||||
{
|
||||
if (!Enabled)
|
||||
return;
|
||||
|
||||
size_t i = LogVector.size();
|
||||
LogVector.resize(i + 1);
|
||||
auto & n = LogVector[i];
|
||||
|
||||
n.filename = filename;
|
||||
n.line = line;
|
||||
n.index = index;
|
||||
|
||||
if (bytes < sizeof(uint64_t)) {
|
||||
|
||||
n.head = n.tail = 0;
|
||||
|
||||
} else {
|
||||
|
||||
switch (mode) {
|
||||
case AcceleratorRead:
|
||||
case AcceleratorWrite:
|
||||
case AcceleratorWriteDiscard:
|
||||
acceleratorCopyFromDevice((char*)data, &n.head, sizeof(uint64_t));
|
||||
acceleratorCopyFromDevice((char*)data + bytes - sizeof(uint64_t), &n.tail, sizeof(uint64_t));
|
||||
break;
|
||||
|
||||
case CpuRead:
|
||||
case CpuWrite:
|
||||
//case CpuWriteDiscard:
|
||||
n.head = *(uint64_t*)data;
|
||||
n.tail = *(uint64_t*)((char*)data + bytes - sizeof(uint64_t));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@@ -42,5 +42,22 @@ class FlightRecorder {
|
||||
static void xmitLog(void *,uint64_t bytes);
|
||||
static void recvLog(void *,uint64_t bytes,int rank);
|
||||
};
|
||||
#ifdef GRID_LOG_VIEWS
|
||||
class ViewLogger {
|
||||
struct Entry_t {
|
||||
const char* filename;
|
||||
int line;
|
||||
int index;
|
||||
uint64_t head, tail;
|
||||
};
|
||||
|
||||
public:
|
||||
static bool Enabled;
|
||||
static std::vector<Entry_t> LogVector;
|
||||
static void Begin();
|
||||
static void End();
|
||||
static void Log(const char* filename, int line, int index, int mode, void* data, uint64_t bytes);
|
||||
};
|
||||
#endif
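// --- Sketch of intended use, not part of the diff (assumptions: view opens feed
// --- ViewLogger::Log when GRID_LOG_VIEWS is defined, and <iostream> is available). Bracket a
// --- suspect region, then dump the head/tail words recorded for every view and diff the dump
// --- between two runs or two ranks to find the first divergence.
inline void DumpViewLogSketch(void)
{
  ViewLogger::Begin();
  // ... run the kernels under suspicion ...
  ViewLogger::End();
  for(auto &e : ViewLogger::LogVector)
    std::cout << e.filename <<":"<< e.line
              <<" head "<<std::hex<< e.head <<" tail "<< e.tail <<std::dec<<std::endl;
}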
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
|
@@ -46,10 +46,14 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
#include <cstdlib>
|
||||
#include <memory>
|
||||
|
||||
|
||||
#include <Grid/Grid.h>
|
||||
|
||||
#include <Grid/util/CompilerCompatible.h>
|
||||
|
||||
#ifdef HAVE_UNWIND
|
||||
#include <libunwind.h>
|
||||
#endif
|
||||
|
||||
#include <fenv.h>
|
||||
#ifdef __APPLE__
|
||||
@@ -295,6 +299,20 @@ void GridBanner(void)
|
||||
std::cout << std::setprecision(9);
|
||||
}
|
||||
|
||||
//Some file local variables
|
||||
static int fileno_stdout;
|
||||
static int fileno_stderr;
|
||||
static int signal_delay;
|
||||
class dlRegion {
|
||||
public:
|
||||
uint64_t start;
|
||||
uint64_t end;
|
||||
uint64_t size;
|
||||
uint64_t offset;
|
||||
std::string name;
|
||||
};
|
||||
std::vector<dlRegion> dlMap;
|
||||
|
||||
void Grid_init(int *argc,char ***argv)
|
||||
{
|
||||
|
||||
@@ -347,6 +365,19 @@ void Grid_init(int *argc,char ***argv)
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
|
||||
Grid_debug_handler_init();
|
||||
}
|
||||
// Sleep n-seconds at end of handler
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--signal-delay") ){
|
||||
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--signal-delay");
|
||||
GridCmdOptionInt(arg,signal_delay);
|
||||
}
|
||||
// periodic wakeup with stack trace printed
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-heartbeat") ){
|
||||
Grid_debug_heartbeat();
|
||||
}
|
||||
// periodic wakeup with empty handler (interrupts some system calls)
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--heartbeat") ){
|
||||
Grid_heartbeat();
|
||||
}
|
||||
|
||||
#if defined(A64FX)
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
|
||||
@@ -385,26 +416,52 @@ void Grid_init(int *argc,char ***argv)
|
||||
} else {
|
||||
FILE *fp;
|
||||
std::ostringstream fname;
|
||||
|
||||
int rank = CartesianCommunicator::RankWorld();
|
||||
int radix=64;
|
||||
char* root = getenv("GRID_STDOUT_ROOT");
|
||||
if (root) {
|
||||
fname << root ;
|
||||
mkdir(fname.str().c_str(), S_IRWXU );
|
||||
fname << "/";
|
||||
}
|
||||
fname << (rank/radix)*radix ;
|
||||
mkdir(fname.str().c_str(), S_IRWXU );
|
||||
fname << "/";
|
||||
fname<<"Grid.stdout.";
|
||||
fname<<CartesianCommunicator::RankWorld();
|
||||
fp=freopen(fname.str().c_str(),"w",stdout);
|
||||
assert(fp!=(FILE *)NULL);
|
||||
|
||||
std::ostringstream ename;
|
||||
if (root){
|
||||
ename << root << "/";
|
||||
}
|
||||
ename << (rank/radix)*radix << "/";
|
||||
ename<<"Grid.stderr.";
|
||||
ename<<CartesianCommunicator::RankWorld();
|
||||
fp=freopen(ename.str().c_str(),"w",stderr);
|
||||
assert(fp!=(FILE *)NULL);
|
||||
}
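// --- Illustrative sketch, not part of the diff: the layout produced above groups ranks into
// --- directories of radix 64, so with GRID_STDOUT_ROOT=/tmp/grid rank 70 writes
// --- /tmp/grid/64/Grid.stdout.70 (and Grid.stderr.70 alongside). Path construction only,
// --- with the mkdir calls elided; the name is local to this sketch:
auto rank_log_path_sketch = [](const std::string &root,int rank,int radix) {
  std::ostringstream f;
  if ( !root.empty() ) f << root << "/";
  f << (rank/radix)*radix << "/Grid.stdout." << rank;
  return f.str();
};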
|
||||
fileno_stdout = fileno(stdout);
|
||||
fileno_stderr = fileno(stderr) ;
|
||||
|
||||
////////////////////////////////////////////////////
|
||||
// OK to use GridLogMessage etc from here on
|
||||
////////////////////////////////////////////////////
|
||||
std::cout << GridLogMessage << "================================================ "<<std::endl;
|
||||
std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl;
|
||||
std::cout << GridLogMessage << "================================================ "<<std::endl;
|
||||
|
||||
gethostname(hostname, HOST_NAME_MAX+1);
|
||||
std::cout << GridLogMessage << "This rank is running on host "<< hostname<<std::endl;
|
||||
{
|
||||
gethostname(hostname, HOST_NAME_MAX+1);
|
||||
time_t mytime;
|
||||
struct tm *info;
|
||||
char buffer[80];
|
||||
time(&mytime);
|
||||
info = localtime(&mytime);
|
||||
strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", info);
|
||||
std::cout << GridLogMessage << "This rank is running on host "<< hostname<<" at local time "<<buffer<<std::endl;
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////
|
||||
// Reporting
|
||||
@@ -421,6 +478,47 @@ void Grid_init(int *argc,char ***argv)
|
||||
MemoryProfiler::stats = &dbgMemStats;
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////
|
||||
// LD.so space
|
||||
/////////////////////////////////////////////////////////
|
||||
#ifndef __APPLE__
|
||||
{
|
||||
// Provides mapping of .so files
|
||||
FILE *f = fopen("/proc/self/maps", "r");
|
||||
if (f) {
|
||||
char line[256];
|
||||
while (fgets(line, sizeof(line), f)) {
|
||||
if (strstr(line, "r-xp")) {
|
||||
dlRegion region;
|
||||
uint32_t major, minor, inode;
|
||||
uint64_t start,end,offset;
|
||||
char path[PATH_MAX];
|
||||
sscanf(line,"%lx-%lx r-xp %lx %x:%x %d %s",
|
||||
&start,&end,&offset,
|
||||
&major,&minor,&inode,path);
|
||||
region.start=start;
|
||||
region.end =end;
|
||||
region.offset=offset;
|
||||
region.name = std::string(path);
|
||||
region.size = region.end-region.start;
|
||||
dlMap.push_back(region);
|
||||
// std::cout << GridLogMessage<< line;
|
||||
}
|
||||
}
|
||||
fclose(f);
|
||||
}
|
||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--dylib-map") ){
|
||||
std::cout << GridLogMessage << "================================================ "<<std::endl;
|
||||
std::cout << GridLogMessage<< " Dynamic library map: " <<std::endl;
|
||||
std::cout << GridLogMessage << "================================================ "<<std::endl;
|
||||
for(int r=0;r<dlMap.size();r++){
|
||||
auto region = dlMap[r];
|
||||
std::cout << GridLogMessage<<" "<<region.name<<std::hex<<region.start<<"-"<<region.end<<" sz "<<region.size<<std::dec<<std::endl;
|
||||
}
|
||||
std::cout << GridLogMessage << "================================================ "<<std::endl;
|
||||
}
|
||||
}
|
||||
#endif
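// --- Illustrative sketch, not part of the diff: how the recorded r-xp regions are used. The
// --- signal handler below prints a faulting IP as "library+offset"; the same lookup can be
// --- written as follows, and the offset then resolved offline with addr2line or
// --- llvm-symbolizer (modulo the segment base adjustment for position-independent objects).
auto locate_ip_sketch = [](uint64_t ip) -> std::string {
  for(int r=0;r<dlMap.size();r++){
    if ( (ip>=dlMap[r].start) && (ip<dlMap[r].end) ) {
      char off[32];
      snprintf(off,sizeof(off),"+0x%llx",(unsigned long long)(ip-dlMap[r].start));
      return dlMap[r].name + off;
    }
  }
  return std::string("<unknown region>");
};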
|
||||
////////////////////////////////////
|
||||
// Logging
|
||||
////////////////////////////////////
|
||||
@@ -453,14 +551,19 @@ void Grid_init(int *argc,char ***argv)
|
||||
std::cout<<GridLogMessage<<" --shm-hugepages : use explicit huge pages in mmap call "<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --device-mem M : Size of device software cache for lattice fields (MB) "<<std::endl;
|
||||
std::cout<<GridLogMessage<<std::endl;
|
||||
std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
|
||||
std::cout<<GridLogMessage<<"Verbose:"<<std::endl;
|
||||
std::cout<<GridLogMessage<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --log list : comma separated list from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --debug-signals : catch sigsegv and print a blame report"<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --debug-stdout : print stdout from EVERY node"<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --debug-mem : print Grid allocator activity"<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --notimestamp : suppress millisecond resolution stamps"<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
|
||||
std::cout<<GridLogMessage<<"Debug:"<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --dylib-map : print dynamic library map, useful for interpreting signal backtraces "<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --heartbeat : periodic itimer wakeup (interrupts stuck system calls!) "<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --signal-delay n : pause for n seconds after signal handling (useful to get ALL nodes in stuck state) "<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --debug-stdout : print stdout from EVERY node to file Grid.stdout/err.rank "<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --debug-signals : catch sigsegv and print a blame report, handle SIGHUP with a backtrace to stderr"<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --debug-heartbeat : periodically report backtrace "<<std::endl;
|
||||
std::cout<<GridLogMessage<<" --debug-mem : print Grid allocator activity"<<std::endl;
|
||||
std::cout<<GridLogMessage<<std::endl;
|
||||
std::cout<<GridLogMessage<<"Performance:"<<std::endl;
|
||||
std::cout<<GridLogMessage<<std::endl;
|
||||
@@ -555,17 +658,56 @@ void GridLogLayout() {
|
||||
}
|
||||
|
||||
void * Grid_backtrace_buffer[_NBACKTRACE];
|
||||
#define SIGLOG(A) ::write(fileno_stderr,A,strlen(A));
|
||||
|
||||
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
|
||||
void sig_print_dig(uint32_t dig)
|
||||
{
|
||||
fprintf(stderr,"Signal handler on host %s\n",hostname);
|
||||
fprintf(stderr,"FlightRecorder step %d stage %s \n",
|
||||
FlightRecorder::StepLoggingCounter,
|
||||
FlightRecorder::StepName);
|
||||
fprintf(stderr,"Caught signal %d\n",si->si_signo);
|
||||
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
|
||||
fprintf(stderr," code %d\n",si->si_code);
|
||||
// x86 64bit
|
||||
const char *digits[] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "a", "b", "c", "d", "e", "f" };
|
||||
if ( dig>=0 && dig< 16){
|
||||
SIGLOG(digits[dig]);
|
||||
}
|
||||
}
|
||||
void sig_print_uint(uint32_t A)
|
||||
{
|
||||
int dig;
|
||||
int nz=0;
|
||||
#define DIGIT(DIV) dig = (A/DIV)%10 ; if(dig|nz) sig_print_dig(dig); nz = nz|dig;
|
||||
DIGIT(1000000000); // Catches 4BN = 2^32
|
||||
DIGIT(100000000);
|
||||
DIGIT(10000000);
|
||||
DIGIT(1000000);
|
||||
DIGIT(100000);
|
||||
DIGIT(10000);
|
||||
DIGIT(1000);
|
||||
DIGIT(100);
|
||||
DIGIT(10);
|
||||
DIGIT(1);
|
||||
if (nz==0) SIGLOG("0");
|
||||
}
|
||||
void sig_print_hex(uint64_t A)
|
||||
{
|
||||
int nz=0;
|
||||
int dig;
|
||||
#define NIBBLE(A) dig = A ; if(dig|nz) sig_print_dig(dig); nz = nz|dig;
|
||||
SIGLOG("0x");
|
||||
NIBBLE((A>>(15*4))&0xF);
|
||||
NIBBLE((A>>(14*4))&0xF);
|
||||
NIBBLE((A>>(13*4))&0xF);
|
||||
NIBBLE((A>>(12*4))&0xF);
|
||||
NIBBLE((A>>(11*4))&0xF);
|
||||
NIBBLE((A>>(10*4))&0xF);
|
||||
NIBBLE((A>>(9*4))&0xF);
|
||||
NIBBLE((A>>(8*4))&0xF);
|
||||
NIBBLE((A>>(7*4))&0xF);
|
||||
NIBBLE((A>>(6*4))&0xF);
|
||||
NIBBLE((A>>(5*4))&0xF);
|
||||
NIBBLE((A>>(4*4))&0xF);
|
||||
NIBBLE((A>>(3*4))&0xF);
|
||||
NIBBLE((A>>(2*4))&0xF);
|
||||
NIBBLE((A>>4)&0xF);
|
||||
sig_print_dig(A&0xF);
|
||||
}
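// --- Illustrative sketch, not part of the diff: printf is not async-signal-safe, so the
// --- helpers above format numbers digit-by-digit over write(2). A condensed standalone
// --- equivalent (assumed name, buffered variant):
static void sig_write_uint_sketch(int fd,unsigned int v)
{
  char buf[16]; int n=0;
  do { buf[n++] = '0' + (v%10); v/=10; } while (v && n<16);
  while (n--) { ssize_t r = write(fd,&buf[n],1); (void)r; }   // write(2) is async-signal-safe
}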
|
||||
/*
|
||||
#ifdef __linux__
|
||||
#ifdef __x86_64__
|
||||
ucontext_t * uc= (ucontext_t *)ptr;
|
||||
@@ -573,81 +715,158 @@ void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
|
||||
fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
|
||||
#endif
|
||||
#endif
|
||||
fflush(stderr);
|
||||
BACKTRACEFP(stderr);
|
||||
fprintf(stderr,"Called backtrace\n");
|
||||
fflush(stdout);
|
||||
fflush(stderr);
|
||||
*/
|
||||
void Grid_generic_handler(int sig,siginfo_t *si,void * ptr)
|
||||
{
|
||||
SIGLOG("Signal handler on host ");
|
||||
SIGLOG(hostname);
|
||||
SIGLOG(" process id ");
|
||||
sig_print_uint((uint32_t)getpid());
|
||||
SIGLOG("\n");
|
||||
SIGLOG("FlightRecorder step ");
|
||||
sig_print_uint(FlightRecorder::StepLoggingCounter);
|
||||
SIGLOG(" stage ");
|
||||
SIGLOG(FlightRecorder::StepName);
|
||||
SIGLOG("\n");
|
||||
SIGLOG("Caught signal ");
|
||||
sig_print_uint(si->si_signo);
|
||||
SIGLOG("\n");
|
||||
SIGLOG(" mem address ");
|
||||
sig_print_hex((uint64_t)si->si_addr);
|
||||
SIGLOG("\n");
|
||||
SIGLOG(" code ");
|
||||
sig_print_uint(si->si_code);
|
||||
SIGLOG("\n");
|
||||
|
||||
ucontext_t *uc= (ucontext_t *)ptr;
|
||||
|
||||
SIGLOG("Backtrace:\n");
|
||||
#ifdef HAVE_UNWIND
|
||||
// Debug cross check on offsets
|
||||
// int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);
|
||||
// backtrace_symbols_fd(Grid_backtrace_buffer,symbols,fileno_stderr);
|
||||
unw_cursor_t cursor;
|
||||
unw_word_t ip, off;
|
||||
if (!unw_init_local(&cursor, uc) ) {
|
||||
|
||||
SIGLOG(" frame IP function\n");
|
||||
int level = 0;
|
||||
int ret = 0;
|
||||
while(1) {
|
||||
char name[128];
|
||||
if (level >= _NBACKTRACE) return;
|
||||
|
||||
unw_get_reg(&cursor, UNW_REG_IP, &ip);
|
||||
|
||||
sig_print_uint(level); SIGLOG(" ");
|
||||
sig_print_hex(ip); SIGLOG(" ");
|
||||
for(int r=0;r<dlMap.size();r++){
|
||||
if((ip>=dlMap[r].start) &&(ip<dlMap[r].end)){
|
||||
SIGLOG(dlMap[r].name.c_str());
|
||||
SIGLOG("+");
|
||||
sig_print_hex((ip-dlMap[r].start));
|
||||
break;
|
||||
}
|
||||
}
|
||||
SIGLOG("\n");
|
||||
Grid_backtrace_buffer[level]=(void *)ip;
|
||||
level++;
|
||||
ret = unw_step(&cursor);
|
||||
if (ret <= 0) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
// Known async-signal unsafe
|
||||
int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);
|
||||
backtrace_symbols_fd(Grid_backtrace_buffer,symbols,fileno_stderr);
|
||||
#endif
|
||||
}
|
||||
|
||||
void Grid_heartbeat_signal_handler(int sig,siginfo_t *si,void * ptr)
|
||||
{
|
||||
Grid_generic_handler(sig,si,ptr);
|
||||
SIGLOG("\n");
|
||||
}
|
||||
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
|
||||
{
|
||||
Grid_generic_handler(sig,si,ptr);
|
||||
if (signal_delay) {
|
||||
SIGLOG("Adding extra signal delay ");
|
||||
sig_print_uint(signal_delay);
|
||||
SIGLOG(" s\n");
|
||||
usleep( (uint64_t) signal_delay*1000LL*1000LL);
|
||||
}
|
||||
SIGLOG("\n");
|
||||
return;
|
||||
}
|
||||
|
||||
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
|
||||
void Grid_fatal_signal_handler(int sig,siginfo_t *si,void * ptr)
|
||||
{
|
||||
fprintf(stderr,"Signal handler on host %s\n",hostname);
|
||||
fprintf(stderr,"Caught signal %d\n",si->si_signo);
|
||||
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
|
||||
fprintf(stderr," code %d\n",si->si_code);
|
||||
// Linux/Posix
|
||||
#ifdef __linux__
|
||||
// And x86 64bit
|
||||
#ifdef __x86_64__
|
||||
ucontext_t * uc= (ucontext_t *)ptr;
|
||||
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
|
||||
fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
|
||||
#define REG(A) fprintf(stderr," %s %lx\n",#A,sc-> A);
|
||||
REG(rdi);
|
||||
REG(rsi);
|
||||
REG(rbp);
|
||||
REG(rbx);
|
||||
REG(rdx);
|
||||
REG(rax);
|
||||
REG(rcx);
|
||||
REG(rsp);
|
||||
REG(rip);
|
||||
|
||||
|
||||
REG(r8);
|
||||
REG(r9);
|
||||
REG(r10);
|
||||
REG(r11);
|
||||
REG(r12);
|
||||
REG(r13);
|
||||
REG(r14);
|
||||
REG(r15);
|
||||
#endif
|
||||
#endif
|
||||
fflush(stderr);
|
||||
BACKTRACEFP(stderr);
|
||||
fprintf(stderr,"Called backtrace\n");
|
||||
fflush(stdout);
|
||||
fflush(stderr);
|
||||
Grid_generic_handler(sig,si,ptr);
|
||||
SIGLOG("\n");
|
||||
exit(0);
|
||||
return;
|
||||
};
|
||||
void Grid_empty_signal_handler(int sig,siginfo_t *si,void * ptr)
|
||||
{
|
||||
// SIGLOG("heartbeat signal handled\n");
|
||||
return;
|
||||
}
|
||||
void Grid_debug_heartbeat(void)
|
||||
{
|
||||
struct sigaction sa_ping;
|
||||
|
||||
sigemptyset (&sa_ping.sa_mask);
|
||||
sa_ping.sa_sigaction= Grid_usr_signal_handler;
|
||||
sa_ping.sa_flags = SA_SIGINFO;
|
||||
sigaction(SIGALRM,&sa_ping,NULL);
|
||||
|
||||
// repeating 10s heartbeat
|
||||
struct itimerval it_val;
|
||||
it_val.it_value.tv_sec = 10;
|
||||
it_val.it_value.tv_usec = 0;
|
||||
it_val.it_interval = it_val.it_value;
|
||||
setitimer(ITIMER_REAL, &it_val, NULL);
|
||||
}
|
||||
void Grid_heartbeat(void)
|
||||
{
|
||||
struct sigaction sa_ping;
|
||||
|
||||
sigemptyset (&sa_ping.sa_mask);
|
||||
sa_ping.sa_sigaction= Grid_empty_signal_handler;
|
||||
sa_ping.sa_flags = SA_SIGINFO;
|
||||
sigaction(SIGALRM,&sa_ping,NULL);
|
||||
|
||||
// repeating 10s heartbeat
|
||||
struct itimerval it_val;
|
||||
it_val.it_value.tv_sec = 10;
|
||||
it_val.it_value.tv_usec = 1000;
|
||||
it_val.it_interval = it_val.it_value;
|
||||
setitimer(ITIMER_REAL, &it_val, NULL);
|
||||
}
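// --- Illustrative sketch, not part of the diff: the value of the empty-handler heartbeat is
// --- that, installed without SA_RESTART, the periodic SIGALRM makes blocking system calls
// --- return early with EINTR, nudging a stuck rank; callers that want the old behaviour just
// --- retry, as below. Assumed standalone helper (needs <errno.h>/<unistd.h>):
static ssize_t read_with_heartbeat_sketch(int fd,void *buf,size_t n)
{
  ssize_t r;
  do { r = read(fd,buf,n); } while ( (r<0) && (errno==EINTR) );   // heartbeat interrupts land here
  return r;
}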
|
||||
void Grid_exit_handler(void)
|
||||
{
|
||||
// BACKTRACEFP(stdout);
|
||||
// fflush(stdout);
|
||||
BACKTRACEFP(stdout);
|
||||
fflush(stdout);
|
||||
}
|
||||
void Grid_debug_handler_init(void)
|
||||
{
|
||||
struct sigaction sa;
|
||||
sigemptyset (&sa.sa_mask);
|
||||
sa.sa_sigaction= Grid_sa_signal_handler;
|
||||
sa.sa_sigaction= Grid_fatal_signal_handler;
|
||||
sa.sa_flags = SA_SIGINFO;
|
||||
// sigaction(SIGSEGV,&sa,NULL);
|
||||
sigaction(SIGTRAP,&sa,NULL);
|
||||
sigaction(SIGBUS,&sa,NULL);
|
||||
// sigaction(SIGUSR2,&sa,NULL);
|
||||
|
||||
feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
|
||||
|
||||
sigaction(SIGFPE,&sa,NULL);
|
||||
sigaction(SIGKILL,&sa,NULL);
|
||||
sigaction(SIGILL,&sa,NULL);
|
||||
#ifndef GRID_SYCL
|
||||
sigaction(SIGSEGV,&sa,NULL); // SYCL is using SIGSEGV
|
||||
sigaction(SIGBUS,&sa,NULL);
|
||||
feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
|
||||
sigaction(SIGFPE,&sa,NULL);
|
||||
#endif
|
||||
|
||||
// Non terminating SIGUSR1/2 handler
|
||||
// Non terminating SIGHUP handler
|
||||
struct sigaction sa_ping;
|
||||
sigemptyset (&sa_ping.sa_mask);
|
||||
sa_ping.sa_sigaction= Grid_usr_signal_handler;
|
||||
|
@@ -38,7 +38,11 @@ char * GridHostname(void);
|
||||
|
||||
// internal, controlled with --handle
|
||||
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
|
||||
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr);
|
||||
void Grid_empty_signal_handler(int sig,siginfo_t *si,void * ptr);
|
||||
void Grid_debug_handler_init(void);
|
||||
void Grid_debug_heartbeat(void);
|
||||
void Grid_heartbeat(void);
|
||||
void Grid_quiesce_nodes(void);
|
||||
void Grid_unquiesce_nodes(void);
|
||||
|
||||
|
@@ -66,6 +66,7 @@ namespace Grid{
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
template <class T> void writeFile(T& in, std::string const fname){
|
||||
#ifdef HAVE_LIME
|
||||
// Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
|
||||
@@ -73,7 +74,7 @@ template <class T> void writeFile(T& in, std::string const fname){
|
||||
Grid::emptyUserRecord record;
|
||||
Grid::ScidacWriter WR(in.Grid()->IsBoss());
|
||||
WR.open(fname);
|
||||
WR.writeScidacFieldRecord(in,record,0);
|
||||
WR.writeScidacFieldRecord(in,record,0); // Lexico
|
||||
WR.close();
|
||||
#endif
|
||||
// What is the appropriate way to throw error?
|
||||
@@ -107,8 +108,18 @@ int main(int argc, char **argv) {

for (int conf = CPar.StartConfiguration; conf <= CPar.EndConfiguration; conf+= CPar.Skip){

#if 0
CPNersc.CheckpointRestore(conf, Umu, sRNG, pRNG);
#else
// Don't require Grid format RNGs
FieldMetaData header;
std::string file, filesmr;
file = CPar.conf_path + "/" + CPar.conf_prefix + "." + std::to_string(conf);
filesmr = CPar.conf_path + "/" + CPar.conf_smr_prefix + "." + std::to_string(conf);

NerscIO::readConfiguration(Umu,header,file);
#endif

std::cout << std::setprecision(15);
std::cout << GridLogMessage << "Initial plaquette: "<< WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu) << std::endl;

@@ -116,6 +127,7 @@ int main(int argc, char **argv) {
std::string file_post = CPar.conf_prefix + "." + std::to_string(conf);

WilsonFlow<PeriodicGimplR> WF(WFPar.step_size,WFPar.steps,WFPar.meas_interval);

WF.addMeasurement(WFPar.meas_interval_density, [&file_pre,&file_post,&conf](int step, RealD t, const typename PeriodicGimplR::GaugeField &U){

typedef typename PeriodicGimplR::GaugeLinkField GaugeMat;
@@ -165,33 +177,48 @@ int main(int argc, char **argv) {
//double coeff = 2.0 / (1.0 * Nd * (Nd - 1)) / 3.0;
//Plq = coeff * Plq;

int tau = std::round(t);
std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post;
writeFile(R,efile);
std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post;
writeFile(qfield,tfile);

RealD WFlow_TC5Li = WilsonLoops<PeriodicGimplR>::TopologicalCharge5Li(U);

int tau = std::round(t);

std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post;
// writeFile(R,efile);

std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post;
// writeFile(qfield,tfile);

std::string ufile = file_pre + "U_" + std::to_string(tau) + "_" + file_post;
{
// PeriodicGimplR::GaugeField Ucopy = U;
// NerscIO::writeConfiguration(Ucopy,ufile);
}

RealD E = real(sum(R))/ RealD(U.Grid()->gSites());
RealD T = real( sum(qfield) );
Coordinate scoor; for (int mu=0; mu < Nd; mu++) scoor[mu] = 0;
RealD E0 = real(peekSite(R,scoor));
RealD T0 = real(peekSite(qfield,scoor));
std::cout << GridLogMessage << "[WilsonFlow] Saved energy density (clover) & topo. charge density: " << conf << " " << step << " " << tau << " "
<< "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << std::endl;
<< "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << " Q5Li "<< WFlow_TC5Li << std::endl;

});

int t=WFPar.maxTau;
WF.smear(Uflow, Umu);

// NerscIO::writeConfiguration(Uflow,filesmr);

RealD WFlow_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Uflow);
RealD WFlow_TC = WilsonLoops<PeriodicGimplR>::TopologicalCharge(Uflow);
RealD WFlow_TC5Li = WilsonLoops<PeriodicGimplR>::TopologicalCharge5Li(Uflow);
RealD WFlow_T0 = WF.energyDensityPlaquette(t,Uflow); // t
RealD WFlow_EC = WF.energyDensityCloverleaf(t,Uflow);
std::cout << GridLogMessage << "Plaquette "<< conf << " " << WFlow_plaq << std::endl;
std::cout << GridLogMessage << "T0 "<< conf << " " << WFlow_T0 << std::endl;
std::cout << GridLogMessage << "TC0 "<< conf << " " << WFlow_EC << std::endl;
std::cout << GridLogMessage << "TopologicalCharge "<< conf << " " << WFlow_TC << std::endl;
std::cout << GridLogMessage << "Plaquette "<< conf << " " << WFlow_plaq << std::endl;
std::cout << GridLogMessage << "T0 "<< conf << " " << WFlow_T0 << std::endl;
std::cout << GridLogMessage << "TC0 "<< conf << " " << WFlow_EC << std::endl;
std::cout << GridLogMessage << "TopologicalCharge "<< conf << " " << WFlow_TC << std::endl;
std::cout << GridLogMessage << "TopologicalCharge5Li "<< conf << " " << WFlow_TC5Li<< std::endl;

std::cout<< GridLogMessage << " Admissibility check:\n";
const double sp_adm = 0.067; // admissible threshold
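The measurement hook used above is the general WilsonFlow callback mechanism; a stripped-down sketch of that pattern follows. The parameter values and the lambda body are placeholders standing in for the full energy-density and topological-charge measurement of the hunk above.

// Sketch of the WilsonFlow measurement-callback pattern used above.
// Parameter values and the lambda body are placeholders.
#include <Grid/Grid.h>
using namespace Grid;

void flow_with_measurement(const LatticeGaugeField &Umu,
                           RealD step_size, int steps, int meas_interval)
{
  LatticeGaugeField Uflow(Umu.Grid());
  WilsonFlow<PeriodicGimplR> WF(step_size, steps, meas_interval);
  WF.addMeasurement(meas_interval,
    [](int step, RealD t, const PeriodicGimplR::GaugeField &U) {
      // Invoked every meas_interval flow steps; t is the flow time.
      RealD plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(U);
      std::cout << GridLogMessage << "flow step " << step
                << " t " << t << " plaquette " << plaq << std::endl;
    });
  WF.smear(Uflow, Umu); // run the flow; the callback fires along the trajectory
}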
@@ -201,8 +201,7 @@ int main (int argc, char **argv) {

Params.dirichlet=NonDirichlet;
ParamsDir.dirichlet=Dirichlet;
ParamsDir.partialDirichlet=0;
std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
// ParamsDir.partialDirichlet=0;

// double StoppingCondition = 1e-14;
// double MDStoppingCondition = 1e-9;
@@ -298,11 +297,11 @@ int main(int argc, char **argv) {
if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
else ParamsDen.dirichlet = NonDirichlet;

if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
else ParamsNum.partialDirichlet = 0;
// if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
// else ParamsNum.partialDirichlet = 0;

if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
else ParamsDen.partialDirichlet = 0;
// if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
// else ParamsDen.partialDirichlet = 0;

Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));

@@ -333,9 +333,9 @@ int main(int argc, char **argv) {
ParamsF.dirichlet=NonDirichlet;
ParamsDir.dirichlet=Dirichlet;
ParamsDirF.dirichlet=Dirichlet;
ParamsDir.partialDirichlet=1;
ParamsDirF.partialDirichlet=1;
std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
// ParamsDir.partialDirichlet=1;
// ParamsDirF.partialDirichlet=1;
// std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;

// double StoppingCondition = 1e-14;
// double MDStoppingCondition = 1e-9;
@@ -481,21 +481,21 @@ int main(int argc, char **argv) {
if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
else ParamsDen.dirichlet = NonDirichlet;

if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
else ParamsNum.partialDirichlet = 0;
// if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
// else ParamsNum.partialDirichlet = 0;

if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
else ParamsDen.partialDirichlet = 0;
// if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
// else ParamsDen.partialDirichlet = 0;

Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));

ParamsDenF.dirichlet = ParamsDen.dirichlet;
ParamsDenF.partialDirichlet = ParamsDen.partialDirichlet;
// ParamsDenF.partialDirichlet = ParamsDen.partialDirichlet;
DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenF));

ParamsNumF.dirichlet = ParamsNum.dirichlet;
ParamsNumF.partialDirichlet = ParamsNum.partialDirichlet;
// ParamsNumF.partialDirichlet = ParamsNum.partialDirichlet;
NumeratorsF.push_back (new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumF));

LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
@@ -166,18 +166,18 @@ int main (int argc, char ** argv)
|
||||
}
|
||||
|
||||
|
||||
|
||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||
std::cout<<GridLogMessage << "= Benchmarking concurrent STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
|
||||
std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
|
||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||
header();
|
||||
|
||||
for(int lat=8;lat<=maxlat;lat+=4){
|
||||
for(int Ls=8;Ls<=8;Ls*=2){
|
||||
|
||||
Coordinate latt_size ({lat*mpi_layout[0],
|
||||
lat*mpi_layout[1],
|
||||
lat*mpi_layout[2],
|
||||
lat*mpi_layout[3]});
|
||||
lat*mpi_layout[1],
|
||||
lat*mpi_layout[2],
|
||||
lat*mpi_layout[3]});
|
||||
|
||||
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
||||
RealD Nrank = Grid._Nprocessors;
|
||||
@@ -193,101 +193,6 @@ int main (int argc, char ** argv)
|
||||
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
|
||||
}
|
||||
|
||||
int ncomm;
|
||||
|
||||
double dbytes;
|
||||
for(int i=0;i<Nloop;i++){
|
||||
double start=usecond();
|
||||
|
||||
dbytes=0;
|
||||
ncomm=0;
|
||||
|
||||
std::vector<CommsRequest_t> requests;
|
||||
|
||||
for(int mu=0;mu<4;mu++){
|
||||
|
||||
|
||||
if (mpi_layout[mu]>1 ) {
|
||||
|
||||
ncomm++;
|
||||
int comm_proc=1;
|
||||
int xmit_to_rank;
|
||||
int recv_from_rank;
|
||||
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
||||
dbytes+=
|
||||
Grid.StencilSendToRecvFromBegin(requests,
|
||||
(void *)&xbuf[mu][0],
|
||||
xmit_to_rank,1,
|
||||
(void *)&rbuf[mu][0],
|
||||
recv_from_rank,1,
|
||||
bytes,bytes,mu);
|
||||
|
||||
comm_proc = mpi_layout[mu]-1;
|
||||
|
||||
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
||||
dbytes+=
|
||||
Grid.StencilSendToRecvFromBegin(requests,
|
||||
(void *)&xbuf[mu+4][0],
|
||||
xmit_to_rank,1,
|
||||
(void *)&rbuf[mu+4][0],
|
||||
recv_from_rank,1,
|
||||
bytes,bytes,mu+4);
|
||||
|
||||
}
|
||||
}
|
||||
Grid.StencilSendToRecvFromComplete(requests,0);
|
||||
Grid.Barrier();
|
||||
double stop=usecond();
|
||||
t_time[i] = stop-start; // microseconds
|
||||
|
||||
}
|
||||
|
||||
timestat.statistics(t_time);
|
||||
|
||||
dbytes=dbytes*ppn;
|
||||
double xbytes = dbytes*0.5;
|
||||
// double rbytes = dbytes*0.5;
|
||||
double bidibytes = dbytes;
|
||||
|
||||
std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
|
||||
<<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
|
||||
<<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
|
||||
<<xbytes/timestat.max <<" "<< xbytes/timestat.min
|
||||
<< "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
|
||||
<< bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||
std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
|
||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||
header();
|
||||
|
||||
for(int lat=8;lat<=maxlat;lat+=4){
|
||||
for(int Ls=8;Ls<=8;Ls*=2){
|
||||
|
||||
Coordinate latt_size ({lat*mpi_layout[0],
|
||||
lat*mpi_layout[1],
|
||||
lat*mpi_layout[2],
|
||||
lat*mpi_layout[3]});
|
||||
|
||||
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
||||
RealD Nrank = Grid._Nprocessors;
|
||||
RealD Nnode = Grid.NodeCount();
|
||||
RealD ppn = Nrank/Nnode;
|
||||
|
||||
std::vector<HalfSpinColourVectorD *> xbuf(8);
|
||||
std::vector<HalfSpinColourVectorD *> rbuf(8);
|
||||
Grid.ShmBufferFreeAll();
|
||||
uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
|
||||
for(int d=0;d<8;d++){
|
||||
xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
|
||||
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
|
||||
}
|
||||
|
||||
int ncomm;
|
||||
double dbytes;
|
||||
for(int i=0;i<Nloop;i++){
|
||||
@@ -296,45 +201,35 @@ int main (int argc, char ** argv)
|
||||
std::vector<CommsRequest_t> requests;
|
||||
dbytes=0;
|
||||
ncomm=0;
|
||||
for(int mu=0;mu<4;mu++){
|
||||
|
||||
|
||||
for(int dir=0;dir<8;dir++) {
|
||||
|
||||
double tbytes;
|
||||
int mu =dir % 4;
|
||||
|
||||
if (mpi_layout[mu]>1 ) {
|
||||
|
||||
ncomm++;
|
||||
int comm_proc=1;
|
||||
int xmit_to_rank;
|
||||
int recv_from_rank;
|
||||
|
||||
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
||||
dbytes+=
|
||||
Grid.StencilSendToRecvFromBegin(requests,
|
||||
(void *)&xbuf[mu][0],
|
||||
xmit_to_rank,1,
|
||||
(void *)&rbuf[mu][0],
|
||||
recv_from_rank,1,
|
||||
bytes,bytes,mu);
|
||||
Grid.StencilSendToRecvFromComplete(requests,mu);
|
||||
requests.resize(0);
|
||||
if ( dir == mu ) {
|
||||
int comm_proc=1;
|
||||
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
||||
} else {
|
||||
int comm_proc = mpi_layout[mu]-1;
|
||||
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
||||
}
|
||||
// int tid = omp_get_thread_num();
|
||||
int tid = 0;
|
||||
tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,1,
|
||||
(void *)&rbuf[dir][0], recv_from_rank,1, bytes,tid);
|
||||
|
||||
comm_proc = mpi_layout[mu]-1;
|
||||
|
||||
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
||||
dbytes+=
|
||||
Grid.StencilSendToRecvFromBegin(requests,
|
||||
(void *)&xbuf[mu+4][0],
|
||||
xmit_to_rank,1,
|
||||
(void *)&rbuf[mu+4][0],
|
||||
recv_from_rank,1,
|
||||
bytes,bytes,mu+4);
|
||||
Grid.StencilSendToRecvFromComplete(requests,mu+4);
|
||||
requests.resize(0);
|
||||
|
||||
dbytes+=tbytes;
|
||||
}
|
||||
}
|
||||
}
|
||||
Grid.Barrier();
|
||||
double stop=usecond();
|
||||
t_time[i] = stop-start; // microseconds
|
||||
|
||||
}
|
||||
|
||||
timestat.statistics(t_time);
|
||||
|
@@ -32,18 +32,18 @@
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
|
||||
template<class d>
|
||||
struct scal {
|
||||
d internal;
|
||||
////////////////////////
|
||||
/// Move to domains ////
|
||||
////////////////////////
|
||||
|
||||
Gamma::Algebra Gmu [] = {
|
||||
Gamma::Algebra::GammaX,
|
||||
Gamma::Algebra::GammaY,
|
||||
Gamma::Algebra::GammaZ,
|
||||
Gamma::Algebra::GammaT
|
||||
};
|
||||
|
||||
Gamma::Algebra Gmu [] = {
|
||||
Gamma::Algebra::GammaX,
|
||||
Gamma::Algebra::GammaY,
|
||||
Gamma::Algebra::GammaZ,
|
||||
Gamma::Algebra::GammaT
|
||||
};
|
||||
|
||||
void Benchmark(int Ls, Coordinate Dirichlet,bool Sloppy);
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
@@ -52,39 +52,108 @@ int main (int argc, char ** argv)
|
||||
|
||||
int threads = GridThread::GetThreads();
|
||||
|
||||
Coordinate latt4 = GridDefaultLatt();
|
||||
int Ls=8;
|
||||
for(int i=0;i<argc;i++)
|
||||
int Ls=16;
|
||||
for(int i=0;i<argc;i++) {
|
||||
if(std::string(argv[i]) == "-Ls"){
|
||||
std::stringstream ss(argv[i+1]); ss >> Ls;
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////
|
||||
// With comms
|
||||
//////////////////
|
||||
Coordinate Dirichlet(Nd+1,0);
|
||||
|
||||
std::cout << "\n\n\n\n\n\n" <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
|
||||
Benchmark(Ls,Dirichlet,false);
|
||||
|
||||
std::cout << "\n\n\n\n\n\n" <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
|
||||
Benchmark(Ls,Dirichlet,true);
|
||||
|
||||
//////////////////
|
||||
// Domain decomposed
|
||||
//////////////////
|
||||
/*
|
||||
Coordinate latt4 = GridDefaultLatt();
|
||||
Coordinate mpi = GridDefaultMpi();
|
||||
Coordinate CommDim(Nd);
|
||||
Coordinate shm;
|
||||
GlobalSharedMemory::GetShmDims(mpi,shm);
|
||||
|
||||
|
||||
std::cout << "\n\n\n\n\n\n" <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
// std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
|
||||
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
|
||||
Dirichlet[0] = 0;
|
||||
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
|
||||
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
|
||||
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
|
||||
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
|
||||
|
||||
Benchmark(Ls,Dirichlet,false);
|
||||
|
||||
std::cout << "\n\n\n\n\n\n" <<std::endl;
|
||||
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
|
||||
for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
|
||||
|
||||
Benchmark(Ls,Dirichlet,true);
|
||||
*/
|
||||
|
||||
Grid_finalize();
|
||||
exit(0);
|
||||
}
|
||||
void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
|
||||
{
|
||||
Coordinate latt4 = GridDefaultLatt();
|
||||
GridLogLayout();
|
||||
|
||||
long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
|
||||
|
||||
|
||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
||||
std::vector<int> seeds4({1,2,3,4});
|
||||
std::vector<int> seeds5({5,6,7,8});
|
||||
#undef SINGLE
|
||||
#ifdef SINGLE
|
||||
typedef vComplexF Simd;
|
||||
typedef LatticeFermionF FermionField;
|
||||
typedef LatticeGaugeFieldF GaugeField;
|
||||
typedef LatticeColourMatrixF ColourMatrixField;
|
||||
typedef DomainWallFermionF FermionAction;
|
||||
#else
|
||||
typedef vComplexD Simd;
|
||||
typedef LatticeFermionD FermionField;
|
||||
typedef LatticeGaugeFieldD GaugeField;
|
||||
typedef LatticeColourMatrixD ColourMatrixField;
|
||||
typedef DomainWallFermionD FermionAction;
|
||||
#endif
|
||||
|
||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
|
||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||
|
||||
std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
|
||||
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
|
||||
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
|
||||
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
|
||||
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
|
||||
|
||||
std::vector<int> seeds4({1,2,3,4});
|
||||
std::vector<int> seeds5({5,6,7,8});
|
||||
|
||||
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
|
||||
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
|
||||
|
||||
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
|
||||
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
|
||||
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
|
||||
|
||||
LatticeFermion src (FGrid); random(RNG5,src);
|
||||
|
||||
FermionField src (FGrid); random(RNG5,src);
|
||||
#if 0
|
||||
src = Zero();
|
||||
{
|
||||
@@ -100,46 +169,39 @@ int main (int argc, char ** argv)
|
||||
src = src*N2;
|
||||
#endif
|
||||
|
||||
|
||||
LatticeFermion result(FGrid); result=Zero();
|
||||
LatticeFermion ref(FGrid); ref=Zero();
|
||||
LatticeFermion tmp(FGrid);
|
||||
LatticeFermion err(FGrid);
|
||||
FermionField result(FGrid); result=Zero();
|
||||
FermionField ref(FGrid); ref=Zero();
|
||||
FermionField tmp(FGrid);
|
||||
FermionField err(FGrid);
|
||||
|
||||
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
|
||||
LatticeGaugeField Umu(UGrid);
|
||||
GaugeField Umu(UGrid);
|
||||
GaugeField UmuCopy(UGrid);
|
||||
SU<Nc>::HotConfiguration(RNG4,Umu);
|
||||
// SU<Nc>::ColdConfiguration(Umu);
|
||||
UmuCopy=Umu;
|
||||
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
|
||||
#if 0
|
||||
Umu=1.0;
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
LatticeColourMatrix ttmp(UGrid);
|
||||
ttmp = PeekIndex<LorentzIndex>(Umu,mu);
|
||||
// if (mu !=2 ) ttmp = 0;
|
||||
// ttmp = ttmp* pow(10.0,mu);
|
||||
PokeIndex<LorentzIndex>(Umu,ttmp,mu);
|
||||
}
|
||||
std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
|
||||
#endif
|
||||
|
||||
////////////////////////////////////
|
||||
// Apply BCs
|
||||
////////////////////////////////////
|
||||
Coordinate Block(4);
|
||||
for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1];
|
||||
|
||||
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl;
|
||||
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl;
|
||||
|
||||
DirichletFilter<GaugeField> Filter(Block);
|
||||
Filter.applyFilter(Umu);
|
||||
|
||||
////////////////////////////////////
|
||||
// Naive wilson implementation
|
||||
////////////////////////////////////
|
||||
// replicate across fifth dimension
|
||||
LatticeGaugeField Umu5d(FGrid);
|
||||
std::vector<LatticeColourMatrix> U(4,FGrid);
|
||||
{
|
||||
autoView( Umu5d_v, Umu5d, CpuWrite);
|
||||
autoView( Umu_v , Umu , CpuRead);
|
||||
for(int ss=0;ss<Umu.Grid()->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
Umu5d_v[Ls*ss+s] = Umu_v[ss];
|
||||
}
|
||||
}
|
||||
}
|
||||
std::vector<ColourMatrixField> U(4,UGrid);
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
|
||||
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
|
||||
}
|
||||
|
||||
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
|
||||
|
||||
if (1)
|
||||
@@ -147,10 +209,28 @@ int main (int argc, char ** argv)
|
||||
ref = Zero();
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
|
||||
tmp = U[mu]*Cshift(src,mu+1,1);
|
||||
tmp = Cshift(src,mu+1,1);
|
||||
{
|
||||
autoView( tmp_v , tmp , CpuWrite);
|
||||
autoView( U_v , U[mu] , CpuRead);
|
||||
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
|
||||
}
|
||||
}
|
||||
}
|
||||
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
|
||||
|
||||
tmp =adj(U[mu])*src;
|
||||
{
|
||||
autoView( tmp_v , tmp , CpuWrite);
|
||||
autoView( U_v , U[mu] , CpuRead);
|
||||
autoView( src_v, src , CpuRead);
|
||||
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
|
||||
}
|
||||
}
|
||||
}
|
||||
tmp =Cshift(tmp,mu+1,-1);
|
||||
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
|
||||
}
|
||||
@@ -167,11 +247,9 @@ int main (int argc, char ** argv)
|
||||
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
|
||||
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionD::Dhop "<<std::endl;
|
||||
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
|
||||
std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(vComplex)<< " B"<<std::endl;
|
||||
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
|
||||
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
|
||||
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
|
||||
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
|
||||
std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(Simd)<< " B"<<std::endl;
|
||||
#ifdef GRID_OMP
|
||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
|
||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
|
||||
@@ -181,9 +259,15 @@ int main (int argc, char ** argv)
|
||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
|
||||
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||
|
||||
DomainWallFermionD Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
|
||||
int ncall =1000;
|
||||
|
||||
FermionAction::ImplParams p;
|
||||
p.dirichlet=Dirichlet;
|
||||
FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
|
||||
Dw.SloppyComms(sloppy);
|
||||
Dw.ImportGauge(Umu);
|
||||
|
||||
int ncall =300;
|
||||
RealD n2e;
|
||||
|
||||
if (1) {
|
||||
FGrid->Barrier();
|
||||
Dw.Dhop(src,result,0);
|
||||
@@ -198,8 +282,8 @@ int main (int argc, char ** argv)
|
||||
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||
double flops=single_site_flops*volume*ncall;
|
||||
|
||||
auto nsimd = vComplex::Nsimd();
|
||||
auto simdwidth = sizeof(vComplex);
|
||||
auto nsimd = Simd::Nsimd();
|
||||
auto simdwidth = sizeof(Simd);
|
||||
|
||||
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
|
||||
double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
|
||||
@@ -208,28 +292,27 @@ int main (int argc, char ** argv)
|
||||
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
|
||||
|
||||
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
|
||||
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
||||
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
|
||||
std::cout<<GridLogMessage << "RF GiB/s (base 2) = "<< 1000000. * data_rf/((t1-t0))<<std::endl;
|
||||
std::cout<<GridLogMessage << "mem GiB/s (base 2) = "<< 1000000. * data_mem/((t1-t0))<<std::endl;
|
||||
err = ref-result;
|
||||
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
||||
//exit(0);
|
||||
n2e = norm2(err);
|
||||
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
|
||||
|
||||
if(( norm2(err)>1.0e-4) ) {
|
||||
/*
|
||||
std::cout << "RESULT\n " << result<<std::endl;
|
||||
std::cout << "REF \n " << ref <<std::endl;
|
||||
std::cout << "ERR \n " << err <<std::endl;
|
||||
*/
|
||||
if(( n2e>1.0e-4) ) {
|
||||
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
|
||||
FGrid->Barrier();
|
||||
std::cout<<GridLogMessage << "RESULT" << std::endl;
|
||||
// std::cout << result<<std::endl;
|
||||
std::cout << norm2(result)<<std::endl;
|
||||
std::cout<<GridLogMessage << "REF" << std::endl;
|
||||
std::cout << norm2(ref)<<std::endl;
|
||||
std::cout<<GridLogMessage << "ERR" << std::endl;
|
||||
std::cout << norm2(err)<<std::endl;
|
||||
FGrid->Barrier();
|
||||
exit(-1);
|
||||
}
|
||||
assert (norm2(err)< 1.0e-4 );
|
||||
assert (n2e< 1.0e-4 );
|
||||
}
|
||||
|
||||
if (1)
|
||||
@@ -238,16 +321,30 @@ int main (int argc, char ** argv)
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
|
||||
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
|
||||
tmp = U[mu]*Cshift(src,mu+1,1);
|
||||
tmp = Cshift(src,mu+1,1);
|
||||
{
|
||||
autoView( ref_v, ref, CpuWrite);
|
||||
autoView( tmp_v, tmp, CpuRead);
|
||||
for(int i=0;i<ref_v.size();i++){
|
||||
ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
|
||||
autoView( U_v , U[mu] , CpuRead);
|
||||
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
int i=s+Ls*ss;
|
||||
ref_v[i]+= U_v[ss]*(tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]); ;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tmp =adj(U[mu])*src;
|
||||
|
||||
{
|
||||
autoView( tmp_v , tmp , CpuWrite);
|
||||
autoView( U_v , U[mu] , CpuRead);
|
||||
autoView( src_v, src , CpuRead);
|
||||
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
|
||||
}
|
||||
}
|
||||
}
|
||||
// tmp =adj(U[mu])*src;
|
||||
tmp =Cshift(tmp,mu+1,-1);
|
||||
{
|
||||
autoView( ref_v, ref, CpuWrite);
|
||||
@@ -259,27 +356,27 @@ int main (int argc, char ** argv)
|
||||
}
|
||||
ref = -0.5*ref;
|
||||
}
|
||||
// dump=1;
|
||||
Dw.Dhop(src,result,1);
|
||||
|
||||
Dw.Dhop(src,result,DaggerYes);
|
||||
|
||||
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
|
||||
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
|
||||
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
|
||||
|
||||
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
|
||||
err = ref-result;
|
||||
std::cout<<GridLogMessage << "norm dag diff "<< norm2(err)<<std::endl;
|
||||
if((norm2(err)>1.0e-4)){
|
||||
/*
|
||||
std::cout<< "DAG RESULT\n " <<ref << std::endl;
|
||||
std::cout<< "DAG sRESULT\n " <<result << std::endl;
|
||||
std::cout<< "DAG ERR \n " << err <<std::endl;
|
||||
*/
|
||||
}
|
||||
LatticeFermion src_e (FrbGrid);
|
||||
LatticeFermion src_o (FrbGrid);
|
||||
LatticeFermion r_e (FrbGrid);
|
||||
LatticeFermion r_o (FrbGrid);
|
||||
LatticeFermion r_eo (FGrid);
|
||||
n2e= norm2(err);
|
||||
std::cout<<GridLogMessage << "norm dag diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
|
||||
|
||||
assert((n2e)<1.0e-4);
|
||||
|
||||
FermionField src_e (FrbGrid);
|
||||
FermionField src_o (FrbGrid);
|
||||
FermionField r_e (FrbGrid);
|
||||
FermionField r_o (FrbGrid);
|
||||
FermionField r_eo (FGrid);
|
||||
|
||||
std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
|
||||
pickCheckerboard(Even,src_e,src);
|
||||
@@ -291,10 +388,8 @@ int main (int argc, char ** argv)
|
||||
|
||||
// S-direction is INNERMOST and takes no part in the parity.
|
||||
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
||||
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionD::DhopEO "<<std::endl;
|
||||
std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
|
||||
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
|
||||
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
|
||||
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::DhopEO "<<std::endl;
|
||||
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
|
||||
#ifdef GRID_OMP
|
||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
|
||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
|
||||
@@ -308,13 +403,7 @@ int main (int argc, char ** argv)
|
||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||
double t0=usecond();
|
||||
for(int i=0;i<ncall;i++){
|
||||
#ifdef CUDA_PROFILE
|
||||
if(i==10) cudaProfilerStart();
|
||||
#endif
|
||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||
#ifdef CUDA_PROFILE
|
||||
if(i==20) cudaProfilerStop();
|
||||
#endif
|
||||
}
|
||||
double t1=usecond();
|
||||
FGrid->Barrier();
|
||||
@@ -338,14 +427,9 @@ int main (int argc, char ** argv)
|
||||
setCheckerboard(r_eo,r_e);
|
||||
|
||||
err = r_eo-result;
|
||||
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
||||
if((norm2(err)>1.0e-4)){
|
||||
/*
|
||||
std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
|
||||
std::cout<< "Deo REF\n " <<result << std::endl;
|
||||
std::cout<< "Deo ERR \n " << err <<std::endl;
|
||||
*/
|
||||
}
|
||||
n2e= norm2(err);
|
||||
std::cout<<GridLogMessage << "norm diff "<< n2e<<std::endl;
|
||||
assert(n2e<1.0e-4);
|
||||
|
||||
pickCheckerboard(Even,src_e,err);
|
||||
pickCheckerboard(Odd,src_o,err);
|
||||
@@ -354,6 +438,4 @@ int main (int argc, char ** argv)
|
||||
|
||||
assert(norm2(src_e)<1.0e-4);
|
||||
assert(norm2(src_o)<1.0e-4);
|
||||
Grid_finalize();
|
||||
exit(0);
|
||||
}
|
||||
|
@@ -43,7 +43,7 @@ Gamma::Algebra Gmu [] = {
|
||||
Gamma::Algebra::GammaT
|
||||
};
|
||||
|
||||
void Benchmark(int Ls, Coordinate Dirichlet);
|
||||
void Benchmark(int Ls, Coordinate Dirichlet,bool Sloppy);
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
@@ -69,11 +69,19 @@ int main (int argc, char ** argv)
|
||||
std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
|
||||
Benchmark(Ls,Dirichlet);
|
||||
Benchmark(Ls,Dirichlet,false);
|
||||
|
||||
std::cout << "\n\n\n\n\n\n" <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
|
||||
Benchmark(Ls,Dirichlet,true);
|
||||
|
||||
//////////////////
|
||||
// Domain decomposed
|
||||
//////////////////
|
||||
/*
|
||||
Coordinate latt4 = GridDefaultLatt();
|
||||
Coordinate mpi = GridDefaultMpi();
|
||||
Coordinate CommDim(Nd);
|
||||
@@ -81,42 +89,35 @@ int main (int argc, char ** argv)
|
||||
GlobalSharedMemory::GetShmDims(mpi,shm);
|
||||
|
||||
|
||||
//////////////////////
|
||||
// Node level
|
||||
//////////////////////
|
||||
std::cout << "\n\n\n\n\n\n" <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
|
||||
// std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
|
||||
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
|
||||
// Dirichlet[0] = 0;
|
||||
// Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
|
||||
// Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
|
||||
// Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
|
||||
// Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
|
||||
Dirichlet[0] = 0;
|
||||
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
|
||||
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
|
||||
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
|
||||
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
|
||||
|
||||
Benchmark(Ls,Dirichlet);
|
||||
Benchmark(Ls,Dirichlet,false);
|
||||
|
||||
std::cout << "\n\n\n\n\n\n" <<std::endl;
|
||||
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
std::cout << GridLogMessage<< " Testing without intranode communication " <<std::endl;
|
||||
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
|
||||
for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
|
||||
// Dirichlet[0] = 0;
|
||||
// Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
|
||||
// Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
|
||||
// Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
|
||||
// Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
|
||||
|
||||
Benchmark(Ls,Dirichlet);
|
||||
|
||||
Benchmark(Ls,Dirichlet,true);
|
||||
*/
|
||||
|
||||
Grid_finalize();
|
||||
exit(0);
|
||||
}
|
||||
void Benchmark(int Ls, Coordinate Dirichlet)
|
||||
void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
|
||||
{
|
||||
Coordinate latt4 = GridDefaultLatt();
|
||||
GridLogLayout();
|
||||
@@ -132,21 +133,13 @@ void Benchmark(int Ls, Coordinate Dirichlet)
|
||||
typedef LatticeGaugeFieldF GaugeField;
|
||||
typedef LatticeColourMatrixF ColourMatrixField;
|
||||
typedef DomainWallFermionF FermionAction;
|
||||
#endif
|
||||
#ifdef DOUBLE
|
||||
#else
|
||||
typedef vComplexD Simd;
|
||||
typedef LatticeFermionD FermionField;
|
||||
typedef LatticeGaugeFieldD GaugeField;
|
||||
typedef LatticeColourMatrixD ColourMatrixField;
|
||||
typedef DomainWallFermionD FermionAction;
|
||||
#endif
|
||||
#ifdef DOUBLE2
|
||||
typedef vComplexD2 Simd;
|
||||
typedef LatticeFermionD2 FermionField;
|
||||
typedef LatticeGaugeFieldD2 GaugeField;
|
||||
typedef LatticeColourMatrixD2 ColourMatrixField;
|
||||
typedef DomainWallFermionD2 FermionAction;
|
||||
#endif
|
||||
|
||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
|
||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||
@@ -269,6 +262,7 @@ void Benchmark(int Ls, Coordinate Dirichlet)
|
||||
FermionAction::ImplParams p;
|
||||
p.dirichlet=Dirichlet;
|
||||
FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
|
||||
Dw.SloppyComms(sloppy);
|
||||
Dw.ImportGauge(Umu);
|
||||
|
||||
int ncall =300;
|
||||
|
@@ -1,465 +0,0 @@
|
||||
/*************************************************************************************
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
Source file: ./benchmarks/Benchmark_dwf.cc
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#include <Grid/Grid.h>
|
||||
#ifdef GRID_CUDA
|
||||
#define CUDA_PROFILE
|
||||
#endif
|
||||
|
||||
#ifdef CUDA_PROFILE
|
||||
#include <cuda_profiler_api.h>
|
||||
#endif
|
||||
|
||||
using namespace std;
|
||||
using namespace Grid;
|
||||
|
||||
////////////////////////
|
||||
/// Move to domains ////
|
||||
////////////////////////
|
||||
|
||||
Gamma::Algebra Gmu [] = {
|
||||
Gamma::Algebra::GammaX,
|
||||
Gamma::Algebra::GammaY,
|
||||
Gamma::Algebra::GammaZ,
|
||||
Gamma::Algebra::GammaT
|
||||
};
|
||||
|
||||
void Benchmark(int Ls, Coordinate Dirichlet, int partial);
|
||||
|
||||
int main (int argc, char ** argv)
|
||||
{
|
||||
Grid_init(&argc,&argv);
|
||||
|
||||
|
||||
int threads = GridThread::GetThreads();
|
||||
|
||||
int Ls=8;
|
||||
for(int i=0;i<argc;i++) {
|
||||
if(std::string(argv[i]) == "-Ls"){
|
||||
std::stringstream ss(argv[i+1]); ss >> Ls;
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////
|
||||
// With comms
|
||||
//////////////////
|
||||
Coordinate Dirichlet(Nd+1,0);
|
||||
|
||||
for(auto partial : {0}) {
|
||||
std::cout << "\n\n\n\n\n\n" <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
Benchmark(Ls,Dirichlet,partial);
|
||||
}
|
||||
|
||||
//////////////////
|
||||
// Domain decomposed
|
||||
//////////////////
|
||||
Coordinate latt4 = GridDefaultLatt();
|
||||
Coordinate mpi = GridDefaultMpi();
|
||||
Coordinate CommDim(Nd);
|
||||
//Coordinate shm({2,1,1,1});
|
||||
Coordinate shm;
|
||||
GlobalSharedMemory::GetShmDims(mpi,shm);
|
||||
|
||||
std::cout <<GridLogMessage << " Shared memory MPI decomp is " <<shm<<std::endl;
|
||||
|
||||
//////////////////////
|
||||
// Node level
|
||||
//////////////////////
|
||||
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
|
||||
// for(int d=0;d<Nd;d++) CommDim[d]= 1;
|
||||
Dirichlet[0] = 0;
|
||||
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
|
||||
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
|
||||
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
|
||||
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
|
||||
|
||||
for(auto partial : {0,1}) {
|
||||
std::cout << "\n\n\n\n\n\n" <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
std::cout << GridLogMessage<< " Testing without internode communication partial dirichlet="<<partial <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
Benchmark(Ls,Dirichlet,partial);
|
||||
}
|
||||
|
||||
for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
|
||||
Dirichlet[0] = 0;
|
||||
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
|
||||
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
|
||||
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
|
||||
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
|
||||
|
||||
for(auto partial : {0,1}) {
|
||||
std::cout << "\n\n\n\n\n\n" <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
std::cout << GridLogMessage<< " Testing without intranode communication; partial dirichlet= "<<partial <<std::endl;
|
||||
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
|
||||
Benchmark(Ls,Dirichlet,partial);
|
||||
}
|
||||
Grid_finalize();
|
||||
exit(0);
|
||||
}
|
||||
void Benchmark(int Ls, Coordinate Dirichlet, int partial)
|
||||
{
|
||||
Coordinate latt4 = GridDefaultLatt();
|
||||
GridLogLayout();
|
||||
|
||||
long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
|
||||
|
||||
std::vector<int> seeds4({1,2,3,4});
|
||||
std::vector<int> seeds5({5,6,7,8});
|
||||
#define SINGLE
|
||||
#ifdef SINGLE
|
||||
typedef vComplexF Simd;
|
||||
typedef LatticeFermionF FermionField;
|
||||
typedef LatticeGaugeFieldF GaugeField;
|
||||
typedef LatticeColourMatrixF ColourMatrixField;
|
||||
typedef DomainWallFermionF FermionAction;
|
||||
#endif
|
||||
#ifdef DOUBLE
|
||||
typedef vComplexD Simd;
|
||||
typedef LatticeFermionD FermionField;
|
||||
typedef LatticeGaugeFieldD GaugeField;
|
||||
typedef LatticeColourMatrixD ColourMatrixField;
|
||||
typedef DomainWallFermionD FermionAction;
|
||||
#endif
|
||||
#ifdef DOUBLE2
|
||||
typedef vComplexD2 Simd;
|
||||
typedef LatticeFermionD2 FermionField;
|
||||
typedef LatticeGaugeFieldD2 GaugeField;
|
||||
typedef LatticeColourMatrixD2 ColourMatrixField;
|
||||
typedef DomainWallFermionD2 FermionAction;
|
||||
#endif
|
||||
|
||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
|
||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||
|
||||
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
|
||||
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
|
||||
|
||||
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
|
||||
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
|
||||
|
||||
|
||||
FermionField src (FGrid); random(RNG5,src);
|
||||
#if 0
|
||||
src = Zero();
|
||||
{
|
||||
Coordinate origin({0,0,0,latt4[2]-1,0});
|
||||
SpinColourVectorF tmp;
|
||||
tmp=Zero();
|
||||
tmp()(0)(0)=Complex(-2.0,0.0);
|
||||
std::cout << " source site 0 " << tmp<<std::endl;
|
||||
pokeSite(tmp,src,origin);
|
||||
}
|
||||
#else
|
||||
RealD N2 = 1.0/::sqrt(norm2(src));
|
||||
src = src*N2;
|
||||
#endif
|
||||
|
||||
FermionField result(FGrid); result=Zero();
|
||||
FermionField ref(FGrid); ref=Zero();
|
||||
FermionField tmp(FGrid);
|
||||
FermionField err(FGrid);
|
||||
|
||||
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
|
||||
GaugeField Umu(UGrid);
|
||||
GaugeField UmuFull(UGrid);
|
||||
GaugeField UmuCopy(UGrid);
|
||||
SU<Nc>::HotConfiguration(RNG4,Umu);
|
||||
UmuCopy=Umu;
|
||||
UmuFull=Umu;
|
||||
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
|
||||
|
||||
////////////////////////////////////
|
||||
// Apply BCs
|
||||
////////////////////////////////////
|
||||
Coordinate Block(4);
|
||||
for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1];
|
||||
|
||||
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl;
|
||||
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl;
|
||||
|
||||
DirichletFilter<GaugeField> Filter(Block);
|
||||
Filter.applyFilter(Umu);
|
||||
if(!partial) Filter.applyFilter(UmuCopy);
|
||||
|
||||
////////////////////////////////////
|
||||
// Naive wilson implementation
|
||||
////////////////////////////////////
|
||||
std::vector<ColourMatrixField> U(4,UGrid);
|
||||
std::vector<ColourMatrixField> Ucopy(4,UGrid);
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
|
||||
Ucopy[mu] = PeekIndex<LorentzIndex>(UmuCopy,mu);
|
||||
}
|
||||
|
||||
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
|
||||
|
||||
if (1)
|
||||
{
|
||||
ref = Zero();
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
int depth=dwf_compressor_depth;
|
||||
tmp = Cshift(src,mu+1,1);
|
||||
{
|
||||
autoView( tmp_v , tmp , CpuWrite);
|
||||
autoView( U_v , U[mu] , CpuRead);
|
||||
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
|
||||
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
if ( (s<depth) || (s>=Ls-depth)){
|
||||
tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s];
|
||||
} else {
|
||||
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
|
||||
{
|
||||
autoView( tmp_v , tmp , CpuWrite);
|
||||
autoView( U_v , U[mu] , CpuRead);
|
||||
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
|
||||
autoView( src_v, src , CpuRead);
|
||||
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
if ( (s<depth) || (s>=Ls-depth)){
|
||||
tmp_v[Ls*ss+s] = adj(Ucopy_v[ss])*src_v[Ls*ss+s];
|
||||
} else {
|
||||
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
tmp =Cshift(tmp,mu+1,-1);
|
||||
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
|
||||
}
|
||||
ref = -0.5*ref;
|
||||
}
|
||||
|
||||
RealD mass=0.1;
|
||||
RealD M5 =1.8;
|
||||
|
||||
RealD NP = UGrid->_Nprocessors;
|
||||
RealD NN = UGrid->NodeCount();
|
||||
|
||||
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
|
||||
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
|
||||
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
|
||||
std::cout << GridLogMessage <<"* BCs for Dirichlet Block4 " << Block << std::endl;
|
||||
std::cout << GridLogMessage <<"* Partial Dirichlet BC = " << partial << std::endl;
|
||||
std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(Simd)<< " B"<<std::endl;
|
||||
#ifdef GRID_OMP
|
||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
|
||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
|
||||
#endif
|
||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
|
||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
|
||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
|
||||
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
|
||||
|
||||
FermionAction::ImplParams p;
|
||||
p.dirichlet=Dirichlet;
|
||||
p.partialDirichlet=partial;
|
||||
FermionAction Dw(UmuFull,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
|
||||
|
||||
int ncall =1;
|
||||
RealD n2e;
|
||||
|
||||
if (1) {
|
||||
FGrid->Barrier();
|
||||
Dw.Dhop(src,result,0);
|
||||
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
|
||||
double t0=usecond();
|
||||
for(int i=0;i<ncall;i++){
|
||||
Dw.Dhop(src,result,0);
|
||||
}
|
||||
double t1=usecond();
|
||||
FGrid->Barrier();
|
||||
|
||||
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||
double flops=single_site_flops*volume*ncall;
|
||||
|
||||
auto nsimd = Simd::Nsimd();
|
||||
auto simdwidth = sizeof(Simd);
|
||||
|
||||
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
|
||||
double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
|
||||
|
||||
// mem: Nd Wilson * Ls, Nd gauge, Nc colors
|
||||
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
|
||||
|
||||
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
|
||||
err = ref-result;
|
||||
n2e = norm2(err);
|
||||
|
||||
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
|
||||
|
||||
if(( n2e>1.0e-4) ) {
|
||||
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
|
||||
FGrid->Barrier();
|
||||
|
||||
DumpSliceNorm("s-slice ref ",ref,1);
|
||||
DumpSliceNorm("s-slice res ",result,1);
|
||||
DumpSliceNorm("s-slice error ",err,1);
|
||||
exit(-1);
|
||||
}
|
||||
assert (n2e< 1.0e-4 );
|
||||
}
|
||||
|
||||
if (1)
|
||||
{ // Naive wilson dag implementation
|
||||
|
||||
ref = Zero();
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
|
||||
int depth=dwf_compressor_depth;
|
||||
tmp = Cshift(src,mu+1,1);
|
||||
{
|
||||
autoView( tmp_v , tmp , CpuWrite);
|
||||
autoView( U_v , U[mu] , CpuRead);
|
||||
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
|
||||
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
if ( (s<depth) || (s>=Ls-depth)){
|
||||
tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s];
|
||||
} else {
|
||||
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
|
||||
{
|
||||
autoView( tmp_v , tmp , CpuWrite);
|
||||
autoView( U_v , U[mu] , CpuRead);
|
||||
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
|
||||
autoView( src_v, src , CpuRead);
|
||||
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
|
||||
for(int s=0;s<Ls;s++){
|
||||
if ( (s<depth) || (s>=Ls-depth)){
|
||||
tmp_v[Ls*ss+s] = adj(Ucopy_v[ss])*src_v[Ls*ss+s];
|
||||
} else {
|
||||
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
tmp =Cshift(tmp,mu+1,-1);
|
||||
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
|
||||
}
|
||||
ref = -0.5*ref;
|
||||
}
|
||||
|
||||
Dw.Dhop(src,result,DaggerYes);
|
||||
|
||||
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
|
||||
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
|
||||
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
|
||||
|
||||
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
|
||||
err = ref-result;
|
||||
n2e= norm2(err);
|
||||
std::cout<<GridLogMessage << "norm dag diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
|
||||
|
||||
assert((n2e)<1.0e-4);
|
||||
|
||||
FermionField src_e (FrbGrid);
|
||||
FermionField src_o (FrbGrid);
|
||||
FermionField r_e (FrbGrid);
|
||||
FermionField r_o (FrbGrid);
|
||||
FermionField r_eo (FGrid);
|
||||
|
||||
std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
|
||||
pickCheckerboard(Even,src_e,src);
|
||||
pickCheckerboard(Odd,src_o,src);
|
||||
|
||||
std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
|
||||
std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
|
||||
|
||||
|
||||
// S-direction is INNERMOST and takes no part in the parity.
|
||||
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
||||
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::DhopEO "<<std::endl;
|
||||
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
|
||||
#ifdef GRID_OMP
|
||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
|
||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
|
||||
#endif
|
||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
|
||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
|
||||
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
|
||||
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
|
||||
{
|
||||
FGrid->Barrier();
|
||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||
double t0=usecond();
|
||||
for(int i=0;i<ncall;i++){
|
||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||
}
|
||||
double t1=usecond();
|
||||
FGrid->Barrier();
|
||||
|
||||
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||
double flops=(single_site_flops*volume*ncall)/2.0;
|
||||
|
||||
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
|
||||
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NN<<std::endl;
|
||||
}
|
||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||
Dw.DhopOE(src_e,r_o,DaggerNo);
|
||||
Dw.Dhop (src ,result,DaggerNo);
|
||||
|
||||
std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl;
|
||||
std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl;
|
||||
std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl;
|
||||
|
||||
setCheckerboard(r_eo,r_o);
|
||||
setCheckerboard(r_eo,r_e);
|
||||
|
||||
err = r_eo-result;
|
||||
n2e= norm2(err);
|
||||
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
|
||||
assert(n2e<1.0e-4);
|
||||
|
||||
pickCheckerboard(Even,src_e,err);
|
||||
pickCheckerboard(Odd,src_o,err);
|
||||
std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
|
||||
|
||||
assert(norm2(src_e)<1.0e-4);
|
||||
assert(norm2(src_o)<1.0e-4);
|
||||
}
|
@@ -873,7 +873,7 @@ int main (int argc, char ** argv)
int do_su4=0;
int do_memory=1;
int do_comms =1;
int do_blas =0;
int do_blas =1;
int do_dslash=1;

int sel=4;
42 configure.ac
@@ -86,6 +86,7 @@ AC_ARG_WITH([gmp],
[try this for a non-standard install prefix of the GMP library])],
[AM_CXXFLAGS="-I$with_gmp/include $AM_CXXFLAGS"]
[AM_LDFLAGS="-L$with_gmp/lib $AM_LDFLAGS"])

AC_ARG_WITH([mpfr],
[AS_HELP_STRING([--with-mpfr=prefix],
[try this for a non-standard install prefix of the MPFR library])],
@@ -106,6 +107,13 @@ AC_ARG_WITH([lime],
[AM_CXXFLAGS="-I$with_lime/include $AM_CXXFLAGS"]
[AM_LDFLAGS="-L$with_lime/lib $AM_LDFLAGS"])

############### LIBUNWIND
AC_ARG_WITH([unwind],
[AS_HELP_STRING([--with-unwind=prefix],
[try this for a non-standard install prefix of the libunwind library])],
[AM_CXXFLAGS="-I$with_unwind/include $AM_CXXFLAGS"]
[AM_LDFLAGS="-L$with_unwind/lib $AM_LDFLAGS"])

############### OpenSSL
AC_ARG_WITH([openssl],
[AS_HELP_STRING([--with-openssl=prefix],
@@ -214,7 +222,7 @@ esac

############### Symplectic group
AC_ARG_ENABLE([Sp],
[AC_HELP_STRING([--enable-Sp=yes|no], [enable gauge group Sp2n])],
[AS_HELP_STRING([--enable-Sp=yes|no],[enable gauge group Sp2n])],
[ac_ENABLE_SP=${enable_Sp}], [ac_ENABLE_SP=no])

AM_CONDITIONAL(BUILD_SP, [ test "${ac_ENABLE_SP}X" == "yesX" ])
@@ -255,6 +263,28 @@ case ${ac_ACCELERATOR_AWARE_MPI} in
*);;
esac

############### CHECKSUM COMMS
AC_ARG_ENABLE([checksum-comms],
[AS_HELP_STRING([--enable-checksum-comms=yes|no],[checksum all communication])],
[ac_CHECKSUM_COMMS=${enable_checksum_comms}], [ac_CHECKSUM_COMMS=yes])

case ${ac_CHECKSUM_COMMS} in
yes)
AC_DEFINE([GRID_CHECKSUM_COMMS],[1],[checksum all communication]);;
*);;
esac

############### LOG VIEWS
AC_ARG_ENABLE([log-views],
[AS_HELP_STRING([--enable-log-views=yes|no],[log information on all view open/close])],
[ac_LOG_VIEWS=${enable_log_views}], [ac_LOG_VIEWS=yes])

case ${ac_LOG_VIEWS} in
yes)
AC_DEFINE([GRID_LOG_VIEWS],[1],[log information on all view open/close]);;
*);;
esac

############### SYCL/CUDA/HIP/none
AC_ARG_ENABLE([accelerator],
[AS_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none],[enable none,cuda,sycl,hip acceleration])],
@@ -373,6 +403,16 @@ AC_SEARCH_LIBS([limeCreateReader], [lime],
[have_lime=true],
[AC_MSG_WARN(LIME library was not found in your system.)])

AC_SEARCH_LIBS([unw_backtrace], [unwind],
[AC_DEFINE([HAVE_UNWIND], [1], [Define to 1 if you have the `libunwind' library])]
[have_unwind=true],
[AC_MSG_WARN(libunwind library was not found in your system.)])

AC_SEARCH_LIBS([_Ux86_64_step], [unwind-x86_64],
[AC_DEFINE([HAVE_UNWIND_X86_64], [1], [Define to 1 if you have the `libunwind-x86_64' library])]
[have_unwind_x86_64=true],
[AC_MSG_WARN(libunwind library was not found in your system.)])

AC_SEARCH_LIBS([SHA256_Init], [crypto],
[AC_DEFINE([HAVE_CRYPTO], [1], [Define to 1 if you have the `OpenSSL' library])]
[have_crypto=true],
@@ -1,7 +1,8 @@
#!/bin/bash

##PBS -q EarlyAppAccess
#PBS -q debug
#PBS -l filesystems=flare
#PBS -l filesystems=home
#PBS -l select=2
#PBS -l walltime=00:20:00
#PBS -A LatticeQCD_aesp_CNDA
@@ -14,26 +15,18 @@ cp $PBS_NODEFILE nodefile

export OMP_NUM_THREADS=4
export MPICH_OFI_NIC_POLICY=GPU

#export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16

export MPICH_CH4_SHM=XPMEM
export MPIR_CVAR_DEBUG_SUMMARY=1
export MPICH_DBG_LEVEL=VERBOSE
export MPICH_DBG_CLASS=ALL
#
# Local vol 16.16.16.32
#

#VOL=32.64.64.96
mpiexec -np 1 -ppn 1 -envall mpivars

for VOL in 32.32.32.96 32.64.64.96
for VOL in 32.32.32.96
do
for AT in 32
do

@@ -1,26 +1,28 @@
#Ahead of time compile for PVC
export MPFR=`spack find --paths mpfr | grep ^mpfr | awk '{print $2}' `
export GMP=`spack find --paths gmp | grep ^gmp | awk '{print $2}' `
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
export UNWIND=`spack find --paths libunwind | grep ^libunwind | awk '{print $2}' `

export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc"
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/ -fPIC"

#JIT compile
#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions "

../configure \
../../configure \
--enable-simd=GPU \
--enable-reduction=grid \
--enable-gen-simd-width=64 \
--enable-comms=mpi-auto \
--enable-debug \
--prefix $HOME/gpt-install \
--disable-gparity \
--disable-fermion-reps \
--with-lime=$CLIME \
--enable-shm=nvlink \
--enable-checksum-comms=yes \
--enable-log-views=yes \
--enable-accelerator=sycl \
--enable-accelerator-aware-mpi=no\
--enable-accelerator-aware-mpi=no \
--enable-unified=no \
--with-lime=$CLIME \
--with-gmp=$GMP \
--with-mpfr=$MPFR \
--with-unwind=$UNWIND \
MPICXX=mpicxx \
CXX=icpx
CXX=icpx \
LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc -lnuma" \
CXXFLAGS="-fPIC -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel"

@@ -1,16 +1,13 @@
#module load oneapi/release/2023.12.15.001
#module load mpich/icc-all-debug-pmix-gpu/52.2
#module load mpich-config/mode/deterministic
#module load intel_compute_runtime/release/821.35
module load pti-gpu

source ~/spack/share/spack/setup-env.sh
spack load c-lime
spack load openssl
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
export http_proxy=http://proxy.alcf.anl.gov:3128
export https_proxy=http://proxy.alcf.anl.gov:3128
git config --global http.proxy http://proxy.alcf.anl.gov:3128

source ~/spack/share/spack/setup-env.sh
spack load c-lime
spack load openssl@3.3.1%gcc@12.2.0
spack load unwind
export UNWIND=`spack find --paths libunwind | grep ^libunwind | awk '{print $2}' `
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"

systems/Jupiter/benchmarks/dwf.1node.perf (new file, 273 lines)
@@ -0,0 +1,273 @@
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
SLURM detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number    : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA GH200 120GB
AcceleratorCudaInit[0]: totalGlobalMem: 102005473280
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 1
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0009:01:00.0
AcceleratorCudaInit: ================================================
SharedMemoryMpi:  World communicator of size 4
SharedMemoryMpi:  Node  communicator of size 4
0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x4002c0000000 - 40033fffffff for comms buffers
Setting up IPC

__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |

Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=3737a24096282ea179607fc879814710860a0de6: (HEAD -> develop, origin/develop, origin/HEAD) clean

Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : This rank is running on host jpbo-119-30.jupiter.internal
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 81604378624 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent host   allocations: SMALL 8 LARGE 2 HUGE 0
Grid : Message : MemoryManager::Init() cache pool for recent device allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() cache pool for recent shared allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc

Grid : Message : 0.303000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.309000 s : Testing with full communication
Grid : Message : 0.312000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.313000 s : Grid Layout
Grid : Message : 0.313000 s : Global lattice size  : 32 32 64 64
Grid : Message : 0.319000 s : OpenMP threads       : 4
Grid : Message : 0.320000 s : MPI tasks            : 1 1 2 2
Grid : Message : 0.129590 s : Initialising 4d RNG
Grid : Message : 0.764790 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 0.764920 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 0.942440 s : Initialising 5d RNG
Grid : Message : 1.149388 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.149404 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
local rank 1 device 0 bus id: 0019:01:00.0
local rank 2 device 0 bus id: 0029:01:00.0
local rank 3 device 0 bus id: 0039:01:00.0
Grid : Message : 43.893114 s : Drawing gauge field
Grid : Message : 54.574150 s : Random gauge initialised
Grid : Message : 54.574170 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 54.574172 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 54.580032 s : Setting up Cshift based reference
Grid : Message : 60.407451 s : *****************************************************************
Grid : Message : 60.407469 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 60.407470 s : *****************************************************************
Grid : Message : 60.407471 s : *****************************************************************
Grid : Message : 60.407472 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 60.407473 s : * Vectorising space-time by 8
Grid : Message : 60.407475 s : * VComplex size is 64 B
Grid : Message : 60.407477 s : * Using Overlapped Comms/Compute
Grid : Message : 60.407479 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 60.407480 s : *****************************************************************
Grid : Message : 61.102178 s : Called warmup
Grid : Message : 62.177160 s : Called Dw 300 times in 1074958 us
Grid : Message : 62.177198 s : mflop/s = 24721998.6
Grid : Message : 62.177201 s : mflop/s per rank = 6180499.64
Grid : Message : 62.177204 s : mflop/s per node = 24721998.6
Grid : Message : 62.182696 s : norm diff 5.8108784e-14 Line 306
Grid : Message : 71.328862 s : ----------------------------------------------------------------
Grid : Message : 71.328884 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 71.328885 s : ----------------------------------------------------------------
Grid : Message : 71.328886 s : Called DwDag
Grid : Message : 71.328887 s : norm dag result 4.12810493
Grid : Message : 71.329493 s : norm dag ref 4.12810493
Grid : Message : 71.331967 s : norm dag diff 3.40632318e-14 Line 377
Grid : Message : 71.394727 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 71.803650 s : src_e0.500003185
Grid : Message : 71.819727 s : src_o0.499996882
Grid : Message : 71.821991 s : *********************************************************
Grid : Message : 71.821993 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 71.821995 s : * Vectorising space-time by 8
Grid : Message : 71.821998 s : * Using Overlapped Comms/Compute
Grid : Message : 71.822002 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 71.822003 s : *********************************************************
Grid : Message : 72.377054 s : Deo mflop/s = 24065467
Grid : Message : 72.377071 s : Deo mflop/s per rank 6016366.75
Grid : Message : 72.377074 s : Deo mflop/s per node 24065467
Grid : Message : 72.624877 s : r_e2.06377678
Grid : Message : 72.625198 s : r_o2.06381058
Grid : Message : 72.625507 s : res4.12758736
Grid : Message : 73.759140 s : norm diff 0
Grid : Message : 73.868204 s : norm diff even 0
Grid : Message : 73.907201 s : norm diff odd 0

Grid : Message : 74.414580 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 74.414582 s : Testing without internode communication
Grid : Message : 74.414584 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 74.414586 s : Grid Layout
Grid : Message : 74.414586 s : Global lattice size  : 32 32 64 64
Grid : Message : 74.414594 s : OpenMP threads       : 4
Grid : Message : 74.414595 s : MPI tasks            : 1 1 2 2
Grid : Message : 74.679364 s : Initialising 4d RNG
Grid : Message : 74.742332 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 74.742343 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 74.759525 s : Initialising 5d RNG
Grid : Message : 75.812412 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 75.812429 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 119.252016 s : Drawing gauge field
Grid : Message : 129.919846 s : Random gauge initialised
Grid : Message : 129.919863 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 129.919865 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 129.923611 s : Setting up Cshift based reference
Grid : Message : 135.522878 s : *****************************************************************
Grid : Message : 135.522897 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 135.522899 s : *****************************************************************
Grid : Message : 135.522899 s : *****************************************************************
Grid : Message : 135.522900 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 135.522901 s : * Vectorising space-time by 8
Grid : Message : 135.522903 s : * VComplex size is 64 B
Grid : Message : 135.522905 s : * Using Overlapped Comms/Compute
Grid : Message : 135.522907 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 135.522908 s : *****************************************************************
Grid : Message : 136.151202 s : Called warmup
Grid : Message : 137.224721 s : Called Dw 300 times in 1073490 us
Grid : Message : 137.224748 s : mflop/s = 24755806
Grid : Message : 137.224751 s : mflop/s per rank = 6188951.49
Grid : Message : 137.224753 s : mflop/s per node = 24755806
Grid : Message : 137.235239 s : norm diff 5.8108784e-14 Line 306
Grid : Message : 146.451686 s : ----------------------------------------------------------------
Grid : Message : 146.451708 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 146.451710 s : ----------------------------------------------------------------
Grid : Message : 146.451712 s : Called DwDag
Grid : Message : 146.451714 s : norm dag result 4.12810493
Grid : Message : 146.452323 s : norm dag ref 4.12810493
Grid : Message : 146.454799 s : norm dag diff 3.40632318e-14 Line 377
Grid : Message : 146.498557 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 146.940894 s : src_e0.500003185
Grid : Message : 146.953676 s : src_o0.499996882
Grid : Message : 146.955927 s : *********************************************************
Grid : Message : 146.955929 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 146.955932 s : * Vectorising space-time by 8
Grid : Message : 146.955936 s : * Using Overlapped Comms/Compute
Grid : Message : 146.955938 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 146.955941 s : *********************************************************
Grid : Message : 147.511975 s : Deo mflop/s = 24036256.5
Grid : Message : 147.511989 s : Deo mflop/s per rank 6009064.13
Grid : Message : 147.511991 s : Deo mflop/s per node 24036256.5
Grid : Message : 147.522100 s : r_e2.06377678
Grid : Message : 147.522433 s : r_o2.06381058
Grid : Message : 147.522745 s : res4.12758736
Grid : Message : 148.229848 s : norm diff 0
Grid : Message : 149.233474 s : norm diff even 0
Grid : Message : 149.235815 s : norm diff odd 0

Grid : Message : 149.960985 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 149.960990 s : Testing without intranode communication
Grid : Message : 149.960991 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 149.960995 s : Grid Layout
Grid : Message : 149.960995 s : Global lattice size  : 32 32 64 64
Grid : Message : 149.961003 s : OpenMP threads       : 4
Grid : Message : 149.961004 s : MPI tasks            : 1 1 2 2
Grid : Message : 150.155810 s : Initialising 4d RNG
Grid : Message : 150.800200 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 150.800340 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 150.973420 s : Initialising 5d RNG
Grid : Message : 151.131117 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 151.131136 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 193.933765 s : Drawing gauge field
Grid : Message : 204.611551 s : Random gauge initialised
Grid : Message : 204.611574 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 204.611576 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 204.615265 s : Setting up Cshift based reference
Grid : Message : 210.117788 s : *****************************************************************
Grid : Message : 210.117807 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 210.117809 s : *****************************************************************
Grid : Message : 210.117810 s : *****************************************************************
Grid : Message : 210.117812 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 210.117813 s : * Vectorising space-time by 8
Grid : Message : 210.117814 s : * VComplex size is 64 B
Grid : Message : 210.117817 s : * Using Overlapped Comms/Compute
Grid : Message : 210.117818 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 210.117819 s : *****************************************************************
Grid : Message : 210.714641 s : Called warmup
Grid : Message : 211.892227 s : Called Dw 300 times in 1177557 us
Grid : Message : 211.892252 s : mflop/s = 22568003.2
Grid : Message : 211.892255 s : mflop/s per rank = 5642000.8
Grid : Message : 211.892257 s : mflop/s per node = 22568003.2
Grid : Message : 211.896037 s : norm diff 5.8108784e-14 Line 306
Grid : Message : 220.751375 s : ----------------------------------------------------------------
Grid : Message : 220.751406 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 220.751409 s : ----------------------------------------------------------------
Grid : Message : 220.751411 s : Called DwDag
Grid : Message : 220.751412 s : norm dag result 4.12810493
Grid : Message : 220.753307 s : norm dag ref 4.12810493
Grid : Message : 220.755796 s : norm dag diff 3.40632318e-14 Line 377
Grid : Message : 220.813226 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 221.697800 s : src_e0.500003185
Grid : Message : 221.890920 s : src_o0.499996882
Grid : Message : 221.913430 s : *********************************************************
Grid : Message : 221.913450 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 221.913480 s : * Vectorising space-time by 8
Grid : Message : 221.913500 s : * Using Overlapped Comms/Compute
Grid : Message : 221.913530 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 221.913550 s : *********************************************************
Grid : Message : 221.645213 s : Deo mflop/s = 24114032
Grid : Message : 221.645228 s : Deo mflop/s per rank 6028508.01
Grid : Message : 221.645231 s : Deo mflop/s per node 24114032
Grid : Message : 221.656021 s : r_e2.06377678
Grid : Message : 221.656389 s : r_o2.06381058
Grid : Message : 221.656698 s : res4.12758736
Grid : Message : 222.110075 s : norm diff 0
Grid : Message : 222.857692 s : norm diff even 0
Grid : Message : 222.875763 s : norm diff odd 0
Grid : Message : 223.598127 s : *******************************************
Grid : Message : 223.598145 s : ******* Grid Finalize ******
Grid : Message : 223.598146 s : *******************************************
systems/Jupiter/benchmarks/dwf.4node.perf (new file, 286 lines)
@@ -0,0 +1,286 @@
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
SLURM detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number    : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA GH200 120GB
AcceleratorCudaInit[0]: totalGlobalMem: 102005473280
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 1
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0009:01:00.0
AcceleratorCudaInit: ================================================
SharedMemoryMpi:  World communicator of size 16
SharedMemoryMpi:  Node  communicator of size 4
0SharedMemoryMpi:  SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x4002a0000000 - 40031fffffff for comms buffers
Setting up IPC

__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |

Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=3737a24096282ea179607fc879814710860a0de6: (HEAD -> develop, origin/develop, origin/HEAD) clean

Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : This rank is running on host jpbo-012-11.jupiter.internal
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 81604378624 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent host   allocations: SMALL 8 LARGE 2 HUGE 0
Grid : Message : MemoryManager::Init() cache pool for recent device allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() cache pool for recent shared allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc

Grid : Message : 0.834000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.838000 s : Testing with full communication
Grid : Message : 0.839000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.840000 s : Grid Layout
Grid : Message : 0.840000 s : Global lattice size  : 64 64 64 64
Grid : Message : 0.846000 s : OpenMP threads       : 4
Grid : Message : 0.846000 s : MPI tasks            : 2 2 2 2
Grid : Message : 0.165970 s : Initialising 4d RNG
Grid : Message : 0.787270 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 0.787340 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 0.960410 s : Initialising 5d RNG
Grid : Message : 1.142344 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.142352 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
local rank 2 device 0 bus id: 0029:01:00.0
local rank 3 device 0 bus id: 0039:01:00.0
local rank 1 device 0 bus id: 0019:01:00.0
Grid : Message : 44.657270 s : Drawing gauge field
Grid : Message : 55.247733 s : Random gauge initialised
Grid : Message : 55.247745 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 55.247747 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 55.253053 s : Setting up Cshift based reference
Grid : Message : 62.191747 s : *****************************************************************
Grid : Message : 62.191767 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 62.191768 s : *****************************************************************
Grid : Message : 62.191769 s : *****************************************************************
Grid : Message : 62.191769 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 62.191769 s : * Vectorising space-time by 8
Grid : Message : 62.191770 s : * VComplex size is 64 B
Grid : Message : 62.191771 s : * Using Overlapped Comms/Compute
Grid : Message : 62.191771 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 62.191772 s : *****************************************************************
Grid : Message : 62.857568 s : Called warmup
Grid : Message : 65.581790 s : Called Dw 300 times in 2200540 us
Grid : Message : 65.582120 s : mflop/s = 48306525
Grid : Message : 65.582140 s : mflop/s per rank = 3019157.81
Grid : Message : 65.582150 s : mflop/s per node = 12076631.3
Grid : Message : 65.637550 s : norm diff 5.80156793e-14 Line 306
Grid : Message : 75.122153 s : ----------------------------------------------------------------
Grid : Message : 75.122166 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 75.122167 s : ----------------------------------------------------------------
Grid : Message : 75.122167 s : Called DwDag
Grid : Message : 75.122167 s : norm dag result 4.12801829
Grid : Message : 75.123295 s : norm dag ref 4.12801829
Grid : Message : 75.125890 s : norm dag diff 3.42093991e-14 Line 377
Grid : Message : 75.188462 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 75.605683 s : src_e0.500004005
Grid : Message : 75.617824 s : src_o0.499996067
Grid : Message : 75.620089 s : *********************************************************
Grid : Message : 75.620091 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 75.620093 s : * Vectorising space-time by 8
Grid : Message : 75.620094 s : * Using Overlapped Comms/Compute
Grid : Message : 75.620095 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 75.620096 s : *********************************************************
Grid : Message : 76.732272 s : Deo mflop/s = 48068252.4
Grid : Message : 76.732283 s : Deo mflop/s per rank 3004265.77
Grid : Message : 76.732285 s : Deo mflop/s per node 12017063.1
Grid : Message : 76.749317 s : r_e2.06443136
Grid : Message : 76.749652 s : r_o2.06378451
Grid : Message : 76.749955 s : res4.12821587
Grid : Message : 77.198827 s : norm diff 0
Grid : Message : 77.981760 s : norm diff even 0
Grid : Message : 78.455900 s : norm diff odd 0

Grid : Message : 78.539333 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 78.539337 s : Testing without internode communication
Grid : Message : 78.539338 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 78.539339 s : Grid Layout
Grid : Message : 78.539339 s : Global lattice size  : 64 64 64 64
Grid : Message : 78.539347 s : OpenMP threads       : 4
Grid : Message : 78.539348 s : MPI tasks            : 2 2 2 2
Grid : Message : 78.798501 s : Initialising 4d RNG
Grid : Message : 78.862916 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 78.862925 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 78.879916 s : Initialising 5d RNG
Grid : Message : 79.941271 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 79.941280 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 124.586264 s : Drawing gauge field
Grid : Message : 135.338090 s : Random gauge initialised
Grid : Message : 135.338102 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 135.338103 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 135.341266 s : Setting up Cshift based reference
Grid : Message : 142.604280 s : *****************************************************************
Grid : Message : 142.604450 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 142.604460 s : *****************************************************************
Grid : Message : 142.604470 s : *****************************************************************
Grid : Message : 142.604480 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 142.604480 s : * Vectorising space-time by 8
Grid : Message : 142.604500 s : * VComplex size is 64 B
Grid : Message : 142.604510 s : * Using Overlapped Comms/Compute
Grid : Message : 142.604510 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 142.604520 s : *****************************************************************
Grid : Message : 142.686034 s : Called warmup
Grid : Message : 144.868543 s : Called Dw 300 times in 2182483 us
Grid : Message : 144.868559 s : mflop/s = 48706194.1
Grid : Message : 144.868561 s : mflop/s per rank = 3044137.13
Grid : Message : 144.868562 s : mflop/s per node = 12176548.5
Grid : Message : 144.887595 s : norm diff 5.80156793e-14 Line 306
Grid : Message : 153.622978 s : ----------------------------------------------------------------
Grid : Message : 153.622994 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 153.622995 s : ----------------------------------------------------------------
Grid : Message : 153.622995 s : Called DwDag
Grid : Message : 153.622996 s : norm dag result 4.12801829
Grid : Message : 153.623604 s : norm dag ref 4.12801829
Grid : Message : 153.626098 s : norm dag diff 3.42093991e-14 Line 377
Grid : Message : 153.691426 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 154.148319 s : src_e0.500004005
Grid : Message : 154.151454 s : src_o0.499996067
Grid : Message : 154.153722 s : *********************************************************
Grid : Message : 154.153724 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 154.153725 s : * Vectorising space-time by 8
Grid : Message : 154.153726 s : * Using Overlapped Comms/Compute
Grid : Message : 154.153727 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 154.153728 s : *********************************************************
Grid : Message : 155.200671 s : Deo mflop/s = 51121022.4
Grid : Message : 155.200682 s : Deo mflop/s per rank 3195063.9
Grid : Message : 155.200684 s : Deo mflop/s per node 12780255.6
Grid : Message : 155.217204 s : r_e2.06443136
Grid : Message : 155.217550 s : r_o2.06378451
Grid : Message : 155.217869 s : res4.12821587
Grid : Message : 155.673744 s : norm diff 0
Grid : Message : 156.463329 s : norm diff even 0
Grid : Message : 156.878866 s : norm diff odd 0

Grid : Message : 157.620761 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 157.620764 s : Testing without intranode communication
Grid : Message : 157.620765 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 157.620766 s : Grid Layout
Grid : Message : 157.620766 s : Global lattice size  : 64 64 64 64
Grid : Message : 157.620773 s : OpenMP threads       : 4
Grid : Message : 157.620774 s : MPI tasks            : 2 2 2 2
Grid : Message : 157.671479 s : Initialising 4d RNG
Grid : Message : 157.738691 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 157.738698 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 157.755651 s : Initialising 5d RNG
Grid : Message : 158.848676 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 158.848685 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 202.465158 s : Drawing gauge field
Grid : Message : 213.214546 s : Random gauge initialised
Grid : Message : 213.214561 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 213.214563 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 213.217711 s : Setting up Cshift based reference
Grid : Message : 219.662772 s : *****************************************************************
Grid : Message : 219.662786 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 219.662787 s : *****************************************************************
Grid : Message : 219.662788 s : *****************************************************************
Grid : Message : 219.662788 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 219.662789 s : * Vectorising space-time by 8
Grid : Message : 219.662790 s : * VComplex size is 64 B
Grid : Message : 219.662791 s : * Using Overlapped Comms/Compute
Grid : Message : 219.662791 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 219.662791 s : *****************************************************************
Grid : Message : 220.425592 s : Called warmup
Grid : Message : 222.536249 s : Called Dw 300 times in 2110597 us
Grid : Message : 222.536267 s : mflop/s = 50365105.5
Grid : Message : 222.536269 s : mflop/s per rank = 3147819.09
Grid : Message : 222.536270 s : mflop/s per node = 12591276.4
Grid : Message : 222.541053 s : norm diff 5.80156793e-14 Line 306
Grid : Message : 232.135901 s : ----------------------------------------------------------------
Grid : Message : 232.135915 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 232.135916 s : ----------------------------------------------------------------
Grid : Message : 232.135917 s : Called DwDag
Grid : Message : 232.135918 s : norm dag result 4.12801829
Grid : Message : 232.151938 s : norm dag ref 4.12801829
Grid : Message : 232.154451 s : norm dag diff 3.42093991e-14 Line 377
Grid : Message : 232.216117 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 232.630529 s : src_e0.500004005
Grid : Message : 232.643197 s : src_o0.499996067
Grid : Message : 232.645527 s : *********************************************************
Grid : Message : 232.645529 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 232.645532 s : * Vectorising space-time by 8
Grid : Message : 232.645533 s : * Using Overlapped Comms/Compute
Grid : Message : 232.645534 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 232.645535 s : *********************************************************
Grid : Message : 233.774184 s : Deo mflop/s = 47432091.9
Grid : Message : 233.774194 s : Deo mflop/s per rank 2964505.74
Grid : Message : 233.774196 s : Deo mflop/s per node 11858023
Grid : Message : 233.791552 s : r_e2.06443136
Grid : Message : 233.791899 s : r_o2.06378451
Grid : Message : 233.792204 s : res4.12821587
Grid : Message : 234.230783 s : norm diff 0
Grid : Message : 235.162780 s : norm diff even 0
Grid : Message : 235.291950 s : norm diff odd 0
Grid : Message : 235.765411 s : *******************************************
Grid : Message : 235.765424 s : ******* Grid Finalize ******
Grid : Message : 235.765425 s : *******************************************