mirror of https://github.com/paboyle/Grid.git synced 2025-08-01 12:17:06 +01:00

Compare commits


3 Commits

Author        SHA1        Message                                                              Date
Chulwoo Jung  8419cc5c64  specflow evec I/O added,                                             2025-07-11 15:57:23 -04:00
Chulwoo Jung  2cc6deb8e0  Merge branch 'develop' of https://github.com/paboyle/Grid into ic2   2025-04-25 10:48:41 -04:00
Chulwoo Jung  19d0590579  Checking in for merging                                              2025-04-25 10:48:22 -04:00
64 changed files with 1352 additions and 2583 deletions

View File

@@ -51,13 +51,11 @@ directory
#pragma nv_diag_suppress cast_to_qualified_type
//disables nvcc specific warning in many files
#pragma nv_diag_suppress esa_on_defaulted_function_ignored
#pragma nv_diag_suppress declared_but_not_referenced
#pragma nv_diag_suppress extra_semicolon
#else
//disables nvcc specific warning in json.hpp
#pragma diag_suppress unsigned_compare_with_zero
#pragma diag_suppress cast_to_qualified_type
#pragma diag_suppress declared_but_not_referenced
//disables nvcc specific warning in many files
#pragma diag_suppress esa_on_defaulted_function_ignored
#pragma diag_suppress extra_semicolon

View File

@@ -269,9 +269,7 @@ public:
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
Linop.HermOp(T0,y);
grid->Barrier();
axpby(T1,xscale,mscale,y,in);
grid->Barrier();
// sum = .5 c[0] T0 + c[1] T1
// out = ()*T0 + Coeffs[1]*T1;
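Aside, not taken from the patch: the two scale factors in this hunk implement the affine map the Chebyshev recurrence needs, sending the operator spectrum [lo,hi] onto [-1,1]. With axpby(T1,xscale,mscale,y,in) and y = H·in this is

    T1 = (2·H − (hi+lo))·in / (hi − lo),   so an eigenvalue λ of H maps to (2λ − (hi+lo))/(hi − lo) ∈ [−1,1].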

View File

@@ -65,7 +65,6 @@ NAMESPACE_BEGIN(Grid);
#endif
enum GridBLASOperation_t { GridBLAS_OP_N, GridBLAS_OP_T, GridBLAS_OP_C } ;
enum GridBLASPrecision_t { GridBLAS_PRECISION_DEFAULT, GridBLAS_PRECISION_16F, GridBLAS_PRECISION_16BF, GridBLAS_PRECISION_TF32 };
class GridBLAS {
public:
@@ -98,21 +97,7 @@ public:
gridblasInit=1;
}
}
#ifdef GRID_CUDA
cublasComputeType_t toDataType(GridBLASPrecision_t p) {
switch (p) {
case GridBLAS_PRECISION_16F:
return CUBLAS_COMPUTE_32F_FAST_16F;
case GridBLAS_PRECISION_16BF:
return CUBLAS_COMPUTE_32F_FAST_16BF;
case GridBLAS_PRECISION_TF32:
return CUBLAS_COMPUTE_32F_FAST_TF32;
default:
assert(0);
}
}
#endif
// Force construct once
GridBLAS() { Init(); };
~GridBLAS() { };
@@ -153,10 +138,8 @@ public:
deviceVector<ComplexD*> &Amk, // pointer list to matrices
deviceVector<ComplexD*> &Bkn,
ComplexD beta,
deviceVector<ComplexD*> &Cmn,
GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
deviceVector<ComplexD*> &Cmn)
{
assert(precision == GridBLAS_PRECISION_DEFAULT);
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
m,n,k,
alpha,
@@ -218,10 +201,8 @@ public:
deviceVector<ComplexD*> &Amk, // pointer list to matrices
deviceVector<ComplexD*> &Bkn,
ComplexD beta,
deviceVector<ComplexD*> &Cmn,
GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
deviceVector<ComplexD*> &Cmn)
{
assert(precision == GridBLAS_PRECISION_DEFAULT);
RealD t2=usecond();
int32_t batchCount = Amk.size();
assert(Bkn.size()==batchCount);
@@ -467,8 +448,7 @@ public:
deviceVector<ComplexF*> &Amk, // pointer list to matrices
deviceVector<ComplexF*> &Bkn,
ComplexF beta,
deviceVector<ComplexF*> &Cmn,
GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
deviceVector<ComplexF*> &Cmn)
{
RealD t2=usecond();
int32_t batchCount = Amk.size();
@@ -493,7 +473,6 @@ public:
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
#ifdef GRID_HIP
assert(precision == GridBLAS_PRECISION_DEFAULT);
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
@@ -524,67 +503,50 @@ public:
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
cublasStatus_t err;
if (precision == GridBLAS_PRECISION_DEFAULT) {
err = cublasCgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(cuComplex *) &alpha_p[0],
(cuComplex **)&Amk[0], lda,
(cuComplex **)&Bkn[0], ldb,
(cuComplex *) &beta_p[0],
(cuComplex **)&Cmn[0], ldc,
batchCount);
} else {
cublasComputeType_t compute_precision = toDataType(precision);
err = cublasGemmBatchedEx(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(void *) &alpha_p[0],
(void **)&Amk[0], CUDA_C_32F, lda,
(void **)&Bkn[0], CUDA_C_32F, ldb,
(void *) &beta_p[0],
(void **)&Cmn[0], CUDA_C_32F, ldc,
batchCount, compute_precision, CUBLAS_GEMM_DEFAULT);
}
auto err = cublasCgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(cuComplex *) &alpha_p[0],
(cuComplex **)&Amk[0], lda,
(cuComplex **)&Bkn[0], ldb,
(cuComplex *) &beta_p[0],
(cuComplex **)&Cmn[0], ldc,
batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
assert(precision == GridBLAS_PRECISION_DEFAULT);
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
int64_t batchCount64=batchCount;
oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
&iOpA,
&iOpB,
&m64,&n64,&k64,
(ComplexF *) &alpha_p[0],
(const ComplexF **)&Amk[0], (const int64_t *)&lda64,
(const ComplexF **)&Bkn[0], (const int64_t *)&ldb64,
(ComplexF *) &beta_p[0],
(ComplexF **)&Cmn[0], (const int64_t *)&ldc64,
(int64_t)1,&batchCount64,std::vector<sycl::event>());
synchronise();
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
assert(precision == GridBLAS_PRECISION_DEFAULT);
// Need a default/reference implementation; use Eigen
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
@@ -984,336 +946,6 @@ public:
RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
}
/*
Inverse and Determinant
- CPU version uses Eigen
- GPU version uses LAPACK-compatible getrf / getri
Design comment: Eigen does not expose getrf / getri in a LAPACK compatible manner.
Overhead to go through getrf / getri for CPU version too large.
Current interface therefore only guarantees the inverse and determinant
functions on all platforms but not the getrf / getri ones.
*/
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
void inverseBatched(int64_t n,
deviceVector<ComplexD*> &Ann,
deviceVector<ComplexD*> &Cnn) {
int64_t batchCount = Ann.size();
assert(batchCount == Cnn.size());
thread_for(p,batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAnn(Ann[p],n,n);
Eigen::Map<Eigen::MatrixXcd> eCnn(Cnn[p],n,n);
eCnn = eAnn.inverse();
});
}
void inverseBatched(int64_t n,
deviceVector<ComplexF*> &Ann,
deviceVector<ComplexF*> &Cnn) {
int64_t batchCount = Ann.size();
assert(batchCount == Cnn.size());
thread_for(p,batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAnn(Ann[p],n,n);
Eigen::Map<Eigen::MatrixXcf> eCnn(Cnn[p],n,n);
eCnn = eAnn.inverse();
});
}
void determinantBatched(int64_t n,
deviceVector<ComplexD*> &Ann,
deviceVector<ComplexD*> &C) {
int64_t batchCount = Ann.size();
assert(batchCount == C.size());
thread_for(p,batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAnn(Ann[p],n,n);
*C[p] = eAnn.determinant();
});
}
void determinantBatched(int64_t n,
deviceVector<ComplexF*> &Ann,
deviceVector<ComplexF*> &C) {
int64_t batchCount = Ann.size();
assert(batchCount == C.size());
thread_for(p,batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAnn(Ann[p],n,n);
*C[p] = eAnn.determinant();
});
}
#else
#ifdef GRID_SYCL
template<typename T>
void getrfBatchedSYCL(int64_t n,
deviceVector<T*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info) {
int64_t batchCount = Ann.size();
static deviceVector<T> scratchpad;
int64_t sp_size = oneapi::mkl::lapack::getrf_batch_scratchpad_size<T>(*gridblasHandle, &n, &n, &n, (int64_t)1, &batchCount);
if (sp_size > scratchpad.size())
scratchpad.resize(sp_size);
static deviceVector<int64_t*> _ipiv;
if (batchCount > _ipiv.size())
_ipiv.resize(batchCount);
int64_t** p_ipiv = &_ipiv[0];
int64_t* pipiv = &ipiv[0];
accelerator_for(i, batchCount, 1, { p_ipiv[i] = &pipiv[i*n]; });
oneapi::mkl::lapack::getrf_batch(*gridblasHandle,
&n, &n,
(T **)&Ann[0],
&n,
(int64_t**)&_ipiv[0],
(int64_t)1, &batchCount,
(T*)&scratchpad[0], (int64_t)scratchpad.size(),
std::vector<sycl::event>());
synchronise();
}
#endif
void getrfBatched(int64_t n,
deviceVector<ComplexD*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info)
{
int64_t batchCount = Ann.size();
assert(ipiv.size()==batchCount*n);
assert(info.size()==batchCount);
#ifdef GRID_HIP
auto err = hipblasZgetrfBatched(gridblasHandle,(int)n,
(hipblasDoubleComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(int*) &info[0],
(int)batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
auto err = cublasZgetrfBatched(gridblasHandle, (int)n,
(cuDoubleComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(int*) &info[0],
(int)batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
getrfBatchedSYCL(n, Ann, ipiv, info);
#endif
}
void getrfBatched(int64_t n,
deviceVector<ComplexF*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info)
{
int64_t batchCount = Ann.size();
assert(ipiv.size()==batchCount*n);
assert(info.size()==batchCount);
#ifdef GRID_HIP
auto err = hipblasCgetrfBatched(gridblasHandle,(int)n,
(hipblasComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(int*) &info[0],
(int)batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
auto err = cublasCgetrfBatched(gridblasHandle, (int)n,
(cuComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(int*) &info[0],
(int)batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
getrfBatchedSYCL(n, Ann, ipiv, info);
#endif
}
#ifdef GRID_SYCL
template<typename T>
void getriBatchedSYCL(int64_t n,
deviceVector<T*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info,
deviceVector<T*> &Cnn) {
int64_t batchCount = Ann.size();
static deviceVector<T> scratchpad;
int64_t sp_size = oneapi::mkl::lapack::getri_batch_scratchpad_size<T>(*gridblasHandle, &n, &n, (int64_t)1, &batchCount);
if (sp_size > scratchpad.size())
scratchpad.resize(sp_size);
static deviceVector<int64_t*> _ipiv;
if (batchCount > _ipiv.size())
_ipiv.resize(batchCount);
int64_t** p_ipiv = &_ipiv[0];
int64_t* pipiv = &ipiv[0];
accelerator_for(i, batchCount, 1, { p_ipiv[i] = &pipiv[i*n]; });
oneapi::mkl::lapack::getri_batch(*gridblasHandle,
&n,
(T **)&Ann[0],
&n,
(int64_t**)p_ipiv,
(int64_t)1, &batchCount,
(T *)&scratchpad[0], (int64_t)scratchpad.size(),
std::vector<sycl::event>());
synchronise();
T** pA = &Ann[0];
T** pC = &Cnn[0];
accelerator_for(i, batchCount*n*n, 1, {
auto j = i / batchCount;
auto k = i % batchCount;
pC[k][j] = pA[k][j];
});
}
#endif
void getriBatched(int64_t n,
deviceVector<ComplexD*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info,
deviceVector<ComplexD*> &Cnn)
{
int64_t batchCount = Ann.size();
assert(ipiv.size()==batchCount*n);
assert(info.size()==batchCount);
assert(Cnn.size()==batchCount);
#ifdef GRID_HIP
auto err = hipblasZgetriBatched(gridblasHandle,(int)n,
(hipblasDoubleComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(hipblasDoubleComplex **)&Cnn[0], (int)n,
(int*) &info[0],
(int)batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
auto err = cublasZgetriBatched(gridblasHandle, (int)n,
(cuDoubleComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(cuDoubleComplex **)&Cnn[0], (int)n,
(int*) &info[0],
(int)batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
getriBatchedSYCL(n, Ann, ipiv, info, Cnn);
#endif
}
void getriBatched(int64_t n,
deviceVector<ComplexF*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info,
deviceVector<ComplexF*> &Cnn)
{
int64_t batchCount = Ann.size();
assert(ipiv.size()==batchCount*n);
assert(info.size()==batchCount);
assert(Cnn.size()==batchCount);
#ifdef GRID_HIP
auto err = hipblasCgetriBatched(gridblasHandle,(int)n,
(hipblasComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(hipblasComplex **)&Cnn[0], (int)n,
(int*) &info[0],
(int)batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
auto err = cublasCgetriBatched(gridblasHandle, (int)n,
(cuComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(cuComplex **)&Cnn[0], (int)n,
(int*) &info[0],
(int)batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
getriBatchedSYCL(n, Ann, ipiv, info, Cnn);
#endif
}
template<typename dtype>
void inverseBatched(int64_t n,
deviceVector<dtype*> &Ann, // this will be overwritten with LU decomposition
deviceVector<dtype*> &Cnn // this will be overwritten with the inverse
) {
int64_t batchCount = Ann.size();
RealD t0 = usecond();
deviceVector<int64_t> ipiv(batchCount*n);
deviceVector<int64_t> info(batchCount);
//RealD t1 = usecond();
getrfBatched(n, Ann, ipiv, info);
// test info for non-invertibility? set to nan if yes?
getriBatched(n, Ann, ipiv, info, Cnn);
//synchronise();
//RealD t2 = usecond();
//std::cout << GridLogMessage << "Temp " << t1-t0 << " rf/ri " << t2-t1 << std::endl;
}
template<typename dtype>
void determinantBatched(int64_t n,
deviceVector<dtype*> &Ann, // this will be overwritten with LU decomposition
deviceVector<dtype*> &C // this will be overwritten with determinant
) {
int64_t batchCount = Ann.size();
//RealD t0 = usecond();
deviceVector<int64_t> ipiv(batchCount*n);
deviceVector<int64_t> info(batchCount);
dtype** pAnn = (dtype**)&Ann[0];
dtype** pC = (dtype**)&C[0];
#if defined(GRID_CUDA) || defined(GRID_HIP)
int* pipiv = (int*)&ipiv[0];
#else
int64_t* pipiv = (int64_t*)&ipiv[0];
#endif
//RealD t1 = usecond();
getrfBatched(n, Ann, ipiv, info);
//RealD t2 = usecond();
accelerator_for(i,batchCount,1,{
dtype det = 1.0;
for (int64_t j=0;j<n;j++) {
det *= pAnn[i][n*j + j];
// branchless signs
det *= (pipiv[i*n + j] == j+1) ? (1.0) : (-1.0);
}
*pC[i] = det;
});
//RealD t3 = usecond();
//std::cout << GridLogMessage << "Temp " << t1 - t0 << " rf/ri " << t2-t1 << "final" << t3 - t2 << std::endl;
}
#endif
template<class CComplex>
double benchmark(int M, int N, int K, int BATCH)
{
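For orientation, a minimal usage sketch of the batched-inverse interface touched above (illustrative only: the sizes, the fill of A and the acceleratorPut pointer-list pattern are assumptions modelled on the benchmark() routine in this header, not code from the patch):

// Sketch: invert a batch of n x n complex matrices; Ann/Cnn hold one device pointer per matrix.
// On the GPU paths Ann is overwritten by its LU factors, as the comments in the header note.
int64_t n = 8, batchCount = 4;
deviceVector<ComplexD>  A   (batchCount*n*n);   // column-major matrices, back to back (filled elsewhere)
deviceVector<ComplexD>  Ainv(batchCount*n*n);
deviceVector<ComplexD*> Ann(batchCount);
deviceVector<ComplexD*> Cnn(batchCount);
for(int64_t b=0;b<batchCount;b++){
  ComplexD *a = &A[b*n*n];
  ComplexD *c = &Ainv[b*n*n];
  acceleratorPut(Ann[b],a);                     // build the device pointer lists
  acceleratorPut(Cnn[b],c);
}
GridBLAS BLAS;
BLAS.inverseBatched(n,Ann,Cnn);                 // Cnn[b] <- inverse of the b-th matrix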

View File

@@ -183,7 +183,6 @@ public:
int recv_from_rank,
int bytes);
int IsOffNode(int rank);
double StencilSendToRecvFrom(void *xmit,
int xmit_to_rank,int do_xmit,
void *recv,
@@ -202,9 +201,9 @@ public:
void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,void *xmit_comp,
void *xmit,
int xmit_to_rank,int do_xmit,
void *recv,void *recv_comp,
void *recv,
int recv_from_rank,int do_recv,
int xbytes,int rbytes,int dir);

View File

@@ -32,10 +32,6 @@ NAMESPACE_BEGIN(Grid);
Grid_MPI_Comm CartesianCommunicator::communicator_world;
#ifdef GRID_CHECKSUM_COMMS
extern void * Grid_backtrace_buffer[_NBACKTRACE];
uint64_t checksum_index = 1;
#endif
////////////////////////////////////////////
// First initialise of comms system
@@ -264,39 +260,32 @@ CartesianCommunicator::~CartesianCommunicator()
}
#ifdef USE_GRID_REDUCTION
void CartesianCommunicator::GlobalSum(float &f){
FlightRecorder::StepLog("GlobalSumP2P");
CartesianCommunicator::GlobalSumP2P(f);
}
void CartesianCommunicator::GlobalSum(double &d)
{
FlightRecorder::StepLog("GlobalSumP2P");
CartesianCommunicator::GlobalSumP2P(d);
}
#else
void CartesianCommunicator::GlobalSum(float &f){
FlightRecorder::StepLog("AllReduce float");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
FlightRecorder::StepLog("AllReduce double");
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
#endif
void CartesianCommunicator::GlobalSum(uint32_t &u){
FlightRecorder::StepLog("AllReduce uint32_t");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
FlightRecorder::StepLog("AllReduce uint64_t");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){
FlightRecorder::StepLog("AllReduceVector");
int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
@@ -305,31 +294,26 @@ void CartesianCommunicator::GlobalXOR(uint32_t &u){
assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint64_t &u){
FlightRecorder::StepLog("GlobalXOR");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalMax(float &f)
{
FlightRecorder::StepLog("GlobalMax");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalMax(double &d)
{
FlightRecorder::StepLog("GlobalMax");
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
FlightRecorder::StepLog("GlobalSumVector(float *)");
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
FlightRecorder::StepLog("GlobalSumVector(double *)");
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
@@ -404,16 +388,11 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
{
std::vector<CommsRequest_t> list;
double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
offbytes += StencilSendToRecvFromBegin(list,xmit,xmit,dest,dox,recv,recv,from,dor,bytes,bytes,dir);
offbytes += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
StencilSendToRecvFromComplete(list,dir);
return offbytes;
}
int CartesianCommunicator::IsOffNode(int rank)
{
int grank = ShmRanks[rank];
if ( grank == MPI_UNDEFINED ) return true;
else return false;
}
#ifdef ACCELERATOR_AWARE_MPI
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
@@ -428,9 +407,9 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
return 0.0; // Do nothing -- no preparation required
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,void *xmit_comp,
void *xmit,
int dest,int dox,
void *recv,void *recv_comp,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
@@ -454,8 +433,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
// std::cout << " StencilSendToRecvFrom "<<dir<<" MPI_Irecv "<<std::hex<<recv<<std::dec<<std::endl;
ierr=MPI_Irecv(recv_comp, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
list.push_back(rrq);
off_node_bytes+=rbytes;
@@ -464,7 +442,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
else {
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL);
// std::cout << " StencilSendToRecvFrom "<<dir<<" CopyDeviceToDevice recv "<<std::hex<<recv<<" remote "<<shm <<std::dec<<std::endl;
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
}
#endif
@@ -473,7 +450,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
ierr =MPI_Isend(xmit_comp, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=xbytes;
@@ -572,11 +549,6 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
* - post device - host send buffer transfer asynch
*/
#ifdef GRID_CHECKSUM_COMMS
rbytes += 8;
xbytes += 8;
#endif
if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
@@ -589,7 +561,6 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
srq.req = rrq;
srq.host_buf = host_recv;
srq.device_buf = recv;
srq.tag = tag;
list.push_back(srq);
off_node_bytes+=rbytes;
}
@@ -603,16 +574,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
host_xmit = this->HostBufferMalloc(xbytes);
CommsRequest_t srq;
#ifdef GRID_CHECKSUM_COMMS
uint64_t xbytes_data = xbytes - 8;
srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes_data); // Make this Asynch
assert(xbytes % 8 == 0);
// flip one bit so that a zero buffer is not consistent
uint64_t xsum = checksum_gpu((uint64_t*)xmit, xbytes_data / 8) ^ (checksum_index + 1 + 1000 * tag);
*(uint64_t*)(((char*)host_xmit) + xbytes_data) = xsum;
#else
srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
#endif
// ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
// assert(ierr==0);
@@ -654,11 +616,7 @@ void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequ
if ( flag ) {
// std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
#ifdef GRID_CHECKSUM_COMMS
acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes - 8);
#else
acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
#endif
list[idx].PacketType=InterNodeReceiveHtoD;
} else {
pending ++;
@@ -711,9 +669,9 @@ void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsReque
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,void *xmit_comp,
void *xmit,
int dest,int dox,
void *recv,void *recv_comp,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
@@ -821,40 +779,7 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
// }
// }
#ifdef GRID_CHECKSUM_COMMS
for(int r=0;r<list.size();r++){
if ( list[r].PacketType == InterNodeReceiveHtoD ) {
uint64_t rbytes_data = list[r].bytes - 8;
uint64_t expected_cs = *(uint64_t*)(((char*)list[r].host_buf) + rbytes_data);
uint64_t computed_cs = checksum_gpu((uint64_t*)list[r].device_buf, rbytes_data / 8) ^ (checksum_index + 1 + 1000 * list[r].tag); //
if (expected_cs != computed_cs) {
// TODO: error message, backtrace, quit
fprintf(stderr, "GRID_CHECKSUM_COMMS error:\n");
fprintf(stderr, " processor = %d\n", (int)_processor);
for(int d=0;d<_processors.size();d++)
fprintf(stderr, " processor_coord[%d] = %d\n", d, _processor_coor[d]);
fprintf(stderr, " hostname: %s\n", GridHostname());
fprintf(stderr, " expected_cs: %ld\n", expected_cs);
fprintf(stderr, " computed_cs: %ld\n", computed_cs);
fprintf(stderr, " dest: %d\n", list[r].dest);
fprintf(stderr, " tag: %d\n", list[r].tag);
fprintf(stderr, " commdir: %d\n", list[r].commdir);
fprintf(stderr, " bytes: %ld\n", (uint64_t)list[r].bytes);
fflush(stderr);
// backtrace
int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);
backtrace_symbols_fd(Grid_backtrace_buffer,symbols, 2);
exit(1);
}
}
}
checksum_index += 1;
#endif
list.resize(0); // Delete the list
this->HostBufferFreeAll(); // Clean up the buffer allocs
@@ -869,7 +794,6 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
void CartesianCommunicator::StencilBarrier(void)
{
FlightRecorder::StepLog("NodeBarrier");
MPI_Barrier (ShmComm);
}
//void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
@@ -877,13 +801,11 @@ void CartesianCommunicator::StencilBarrier(void)
//}
void CartesianCommunicator::Barrier(void)
{
FlightRecorder::StepLog("GridBarrier");
int ierr = MPI_Barrier(communicator);
assert(ierr==0);
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
FlightRecorder::StepLog("Broadcast");
int ierr=MPI_Bcast(data,
bytes,
MPI_BYTE,
@@ -897,13 +819,11 @@ int CartesianCommunicator::RankWorld(void){
return r;
}
void CartesianCommunicator::BarrierWorld(void){
FlightRecorder::StepLog("BarrierWorld");
int ierr = MPI_Barrier(communicator_world);
assert(ierr==0);
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
FlightRecorder::StepLog("BroadcastWorld");
int ierr= MPI_Bcast(data,
bytes,
MPI_BYTE,
@@ -926,7 +846,6 @@ void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,
}
void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
{
FlightRecorder::StepLog("AllToAll");
// MPI is a pain and uses "int" arguments
// 64*64*64*128*16 == 500Million elements of data.
// When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.

View File

@@ -124,8 +124,6 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
dest=0;
}
int CartesianCommunicator::IsOffNode(int rank) { return false; }
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int xmit_to_rank,int dox,
void *recv,

View File

@@ -43,6 +43,10 @@ Author: Christoph Lehner <christoph@lhnr.de>
#define GRID_SYCL_LEVEL_ZERO_IPC
#define SHM_SOCKETS
#else
#ifdef HAVE_NUMAIF_H
#warning " Using NUMAIF "
#include <numaif.h>
#endif
#endif
#include <syscall.h>
#endif
@@ -539,21 +543,49 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef ACCELERATOR_AWARE_MPI
// printf("Host buffer allocate for GPU non-aware MPI\n");
#if 0
HostCommBuf= acceleratorAllocHost(bytes);
#else
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
#if 0
#warning "Moving host buffers to specific NUMA domain"
int numa;
char *numa_name=(char *)getenv("MPI_BUF_NUMA");
if(numa_name) {
unsigned long page_size = sysconf(_SC_PAGESIZE);
numa = atoi(numa_name);
unsigned long page_count = bytes/page_size;
std::vector<void *> pages(page_count);
std::vector<int> nodes(page_count,numa);
std::vector<int> status(page_count,-1);
for(unsigned long p=0;p<page_count;p++){
pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
}
int ret = move_pages(0,
page_count,
&pages[0],
&nodes[0],
&status[0],
MPOL_MF_MOVE);
printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
if (ret) perror(" move_pages failed for reason:");
}
#endif
acceleratorPin(HostCommBuf,bytes);
#endif
#endif
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {
std::cerr << "SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE);
}
if ( WorldRank == 0 ){
std::cout << Mheader " acceleratorAllocDevice "<< bytes
std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
}
SharedMemoryZero(ShmCommBuf,bytes);
if ( WorldRank == 0 ){
std::cout<< Mheader "Setting up IPC"<<std::endl;
}
std::cout<< "Setting up IPC"<<std::endl;
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Loop over ranks/gpu's on our node
///////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -584,6 +616,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
if ( err != ZE_RESULT_SUCCESS ) {
std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
}
memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
handle.pid = getpid();
@@ -642,12 +676,12 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef SHM_SOCKETS
myfd=UnixSockets::RecvFileDescriptor();
#else
// std::cout<<"mapping seeking remote pid/fd "
// <<handle.pid<<"/"
// <<handle.fd<<std::endl;
std::cout<<"mapping seeking remote pid/fd "
<<handle.pid<<"/"
<<handle.fd<<std::endl;
int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
// std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
// int myfd = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
myfd = syscall(438,pidfd,handle.fd,0);
int err_t = errno;
@@ -657,7 +691,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
assert(0);
}
#endif
// std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle));
memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
@@ -666,6 +700,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl; std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle pointer is "<<std::hex<<thisBuf<<std::dec<<std::endl;
} }
assert(thisBuf!=nullptr); assert(thisBuf!=nullptr);
} }
@@ -746,7 +783,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
WorldShmCommBufs[r] =ptr;
// std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
}
std::cout<< Mheader " Intra-node IPC setup is complete "<<std::endl;
_ShmAlloc=1;
_ShmAllocBytes = bytes;
};
@@ -954,7 +990,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
}
#endif
// SharedMemoryTest();
SharedMemoryTest();
}
//////////////////////////////////////////////////////////////////
// On node barrier
@@ -1003,13 +1039,11 @@ void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
{
int gpeer = ShmRanks[rank];
assert(gpeer!=ShmRank); // never send to self
// std::cout << "ShmBufferTranslate for rank " << rank<<" peer "<<gpeer<<std::endl;
if (gpeer == MPI_UNDEFINED){
return NULL;
} else {
uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
// std::cout << "ShmBufferTranslate : local,offset,remote "<<std::hex<<local_p<<" "<<offset<<" "<<remote<<std::dec<<std::endl;
return (void *) remote;
}
}
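The translation just above assumes every rank allocates its shared comms segment with identical size and internal layout; reduced to a single illustrative line (names as in the hunk, not code from the patch):

// remote pointer = peer's base + (my pointer - my base)
void *remote = (void *)((uint64_t)ShmCommBufs[gpeer] + ((uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank]));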

View File

@@ -122,10 +122,10 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{
acceleratorMemSet(dest,0,bytes);
}
//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
//{
// acceleratorCopyToDevice(src,dest,bytes);
//}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{
acceleratorCopyToDevice(src,dest,bytes);
}
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality

View File

@@ -202,7 +202,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<
{
auto buffer_p = & buffer[0];
auto table = MapCshiftTable();
autoView( rhs_v, rhs, AcceleratorWriteDiscard);
autoView( rhs_v, rhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
});
@@ -228,7 +228,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
if(cbmask ==0x3 ) {
int _slice_stride = rhs.Grid()->_slice_stride[dimension];
int _slice_block = rhs.Grid()->_slice_block[dimension];
autoView( rhs_v , rhs, AcceleratorWriteDiscard);
autoView( rhs_v , rhs, AcceleratorWrite);
accelerator_for(nn,e1*e2,1,{
int n = nn%e1;
int b = nn/e1;
@@ -302,7 +302,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
{
auto table = MapCshiftTable();
autoView(rhs_v , rhs, AcceleratorRead);
autoView(lhs_v , lhs, AcceleratorWriteDiscard);
autoView(lhs_v , lhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
});

View File

@@ -29,12 +29,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef _GRID_CSHIFT_MPI_H_
#define _GRID_CSHIFT_MPI_H_
NAMESPACE_BEGIN(Grid);
#ifdef GRID_CHECKSUM_COMMS
extern uint64_t checksum_index;
#endif
const int Cshift_verbose=0;
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
{
@@ -130,9 +126,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
#ifndef ACCELERATOR_AWARE_MPI
int pad = (8 + sizeof(vobj) - 1) / sizeof(vobj);
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size+pad);
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size+pad);
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
#endif
int cb= (cbmask==0x2)? Odd : Even;
@@ -148,11 +143,9 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) {
FlightRecorder::StepLog("Cshift_Copy_plane");
tcopy-=usecond();
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
tcopy+=usecond();
FlightRecorder::StepLog("Cshift_Copy_plane_complete");
} else {
int words = buffer_size;
@@ -160,11 +153,9 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int bytes = words * sizeof(vobj);
FlightRecorder::StepLog("Cshift_Gather_plane");
tgather-=usecond();
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
tgather+=usecond();
FlightRecorder::StepLog("Cshift_Gather_plane_complete");
// int rank = grid->_processor;
int recv_from_rank;
@@ -175,7 +166,6 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
tcomms-=usecond();
grid->Barrier();
FlightRecorder::StepLog("Cshift_SendRecv");
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
@@ -185,46 +175,17 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
#else
// bouncy bouncy
acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
#ifdef GRID_CHECKSUM_COMMS
assert(bytes % 8 == 0);
checksum_index++;
uint64_t xsum = checksum_gpu((uint64_t*)&send_buf[0], bytes / 8) ^ (1 + checksum_index);
*(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = xsum;
bytes += 8;
#endif
grid->SendToRecvFrom((void *)&hsend_buf[0],
xmit_to_rank,
(void *)&hrecv_buf[0],
recv_from_rank,
bytes);
#ifdef GRID_CHECKSUM_COMMS
bytes -= 8;
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
uint64_t expected_cs = *(uint64_t*)(((char*)&hrecv_buf[0]) + bytes);
uint64_t computed_cs = checksum_gpu((uint64_t*)&recv_buf[0], bytes / 8) ^ (1 + checksum_index);
std::cout << GridLogComms<< " Cshift: "
<<" dim"<<dimension
<<" shift "<<shift
<< " rank "<< grid->ThisRank()
<<" Coor "<<grid->ThisProcessorCoor()
<<" send "<<xsum<<" to "<<xmit_to_rank
<<" recv "<<computed_cs<<" from "<<recv_from_rank
<<std::endl;
assert(expected_cs == computed_cs);
#else
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
#endif
#endif
FlightRecorder::StepLog("Cshift_SendRecv_complete");
xbytes+=bytes;
grid->Barrier();
tcomms+=usecond();
FlightRecorder::StepLog("Cshift_barrier_complete");
tscatter-=usecond();
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
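The GRID_CHECKSUM_COMMS branch visible on one side of this hunk frames every message as the payload followed by an 8-byte checksum trailer. A self-contained host-side sketch of that framing, with hypothetical helper names (not Grid API) and a plain XOR standing in for the GPU checksum shown later in this compare:

#include <cstdint>
#include <cstring>

// Hypothetical illustration of the "payload + 8-byte trailer" framing; bytes must be a multiple
// of 8, mirroring the assert(bytes % 8 == 0) in the checksummed branch.
static uint64_t checksum64(const uint64_t *v, uint64_t words) {   // simplest stand-in: plain XOR
  uint64_t cs = 0;
  for (uint64_t i = 0; i < words; i++) cs ^= v[i];
  return cs;
}
static void pack_with_trailer(const void *payload, uint64_t bytes, char *wire, uint64_t salt) {
  std::memcpy(wire, payload, bytes);                                      // wire must hold bytes + 8
  uint64_t cs = checksum64((const uint64_t *)payload, bytes / 8) ^ salt;  // salt mixes in checksum_index / tag
  std::memcpy(wire + bytes, &cs, 8);
}
static bool verify_trailer(const void *payload, uint64_t bytes, const char *wire, uint64_t salt) {
  uint64_t expected;
  std::memcpy(&expected, wire + bytes, 8);
  return expected == (checksum64((const uint64_t *)payload, bytes / 8) ^ salt);
}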
@@ -288,16 +249,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
recv_buf_extract[s].resize(buffer_size);
}
#ifndef ACCELERATOR_AWARE_MPI
#ifdef GRID_CHECKSUM_COMMS
buffer_size += (8 + sizeof(vobj) - 1) / sizeof(vobj);
#endif
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
#ifdef GRID_CHECKSUM_COMMS
buffer_size -= (8 + sizeof(vobj) - 1) / sizeof(vobj);
#endif
hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
#endif
int bytes = buffer_size*sizeof(scalar_object);
@@ -360,37 +313,12 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
#else
// bouncy bouncy
acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
#ifdef GRID_CHECKSUM_COMMS
assert(bytes % 8 == 0);
checksum_index++;
uint64_t xsum = checksum_gpu((uint64_t*)send_buf_extract_mpi, bytes / 8) ^ (1 + checksum_index);
*(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = xsum;
bytes += 8;
#endif
grid->SendToRecvFrom((void *)&hsend_buf[0],
xmit_to_rank,
(void *)&hrecv_buf[0],
recv_from_rank,
bytes);
#ifdef GRID_CHECKSUM_COMMS
bytes -= 8;
acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
uint64_t expected_cs = *(uint64_t*)(((char*)&hrecv_buf[0]) + bytes);
uint64_t computed_cs = checksum_gpu((uint64_t*)recv_buf_extract_mpi, bytes / 8) ^ (1 + checksum_index);
std::cout << GridLogComms<< " Cshift_comms_simd: "
<<" dim"<<dimension
<<" shift "<<shift
<< " rank "<< grid->ThisRank()
<<" Coor "<<grid->ThisProcessorCoor()
<<" send "<<xsum<<" to "<<xmit_to_rank
<<" recv "<<computed_cs<<" from "<<recv_from_rank
<<std::endl;
assert(expected_cs == computed_cs);
#else
acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
#endif
#endif
xbytes+=bytes;

View File

@@ -236,7 +236,7 @@ public:
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
vobj vtmp;
vtmp = r;
#if 1
#if 0
deviceVector<vobj> vvtmp(1);
acceleratorPut(vvtmp[0],vtmp);
vobj *vvtmp_p = & vvtmp[0];

View File

@@ -325,8 +325,8 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
assert(ok);
}
FlightRecorder::StepLog("Start global sum");
grid->GlobalSumP2P(nrm);
// grid->GlobalSum(nrm);
// grid->GlobalSumP2P(nrm);
grid->GlobalSum(nrm);
FlightRecorder::StepLog("Finished global sum");
// std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
FlightRecorder::ReductionLog(local,real(nrm));

View File

@@ -87,25 +87,6 @@ template<class Word> Word svm_xor(Word *vec,uint64_t L)
theGridAccelerator->wait();
return ret;
}
template<class Word> Word checksum_gpu(Word *vec,uint64_t L)
{
Word identity; identity=0;
Word ret = 0;
{
sycl::buffer<Word, 1> abuff(&ret, {1});
theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
cgh.parallel_for(sycl::range<1>{L},
Reduction,
[=] (sycl::id<1> index, auto &sum) {
auto l = index % 61;
sum ^= vec[index]<<l | vec[index]>>(64-l);
});
});
}
theGridAccelerator->wait();
return ret;
}
NAMESPACE_END(Grid);
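The checksum_gpu SYCL reduction in this hunk computes a rotate-and-XOR digest over 64-bit words; a host-side reference sketch of the same idea (assumption: the l==0 guard is added here to avoid an undefined 64-bit shift, it is not present in the kernel above):

#include <cstdint>

// Each word is rotated left by (index % 61) bits and XORed into the accumulator.
static uint64_t checksum_host(const uint64_t *vec, uint64_t L) {
  uint64_t ret = 0;
  for (uint64_t i = 0; i < L; i++) {
    unsigned l = i % 61;
    uint64_t rot = l ? ((vec[i] << l) | (vec[i] >> (64 - l))) : vec[i];
    ret ^= rot;
  }
  return ret;
}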

View File

@@ -106,47 +106,6 @@ public:
}
};
#ifdef GRID_LOG_VIEWS
// Little autoscope assister
template<class View>
class ViewCloser
{
View v; // Take a copy of view and call view close when I go out of scope automatically
const char* filename; int line, mode;
public:
ViewCloser(View &_v, const char* _filename, int _line, int _mode) :
v(_v), filename(_filename), line(_line), mode(_mode) {
switch (mode){
case AcceleratorRead:
case AcceleratorWrite:
case CpuRead:
case CpuWrite:
ViewLogger::Log(filename, line, 1, mode, &v[0], v.size() * sizeof(v[0]));
break;
}
};
~ViewCloser() {
switch (mode) {
case AcceleratorWriteDiscard:
case AcceleratorWrite:
case CpuWrite:
ViewLogger::Log(filename, line, -1, mode, &v[0], v.size() * sizeof(v[0]));
break;
}
v.ViewClose();
}
};
#define autoView(l_v,l,mode) \
auto l_v = l.View(mode); \
ViewCloser<decltype(l_v)> _autoView##l_v(l_v,__FILE__,__LINE__,mode);
#else
// Little autoscope assister
template<class View>
class ViewCloser
@@ -160,7 +119,6 @@ class ViewCloser
#define autoView(l_v,l,mode) \
auto l_v = l.View(mode); \
ViewCloser<decltype(l_v)> _autoView##l_v(l_v);
#endif
/////////////////////////////////////////////////////////////////////////////////////////
// Lattice expression types used by ET to assemble the AST
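Either variant of the autoView macro above is used the same way: it opens a view for the enclosing scope and the ViewCloser destructor closes it (and, with GRID_LOG_VIEWS, logs the open and close via ViewLogger). A minimal usage sketch; the Lattice field and the loop body are placeholders, not code from this patch:

// The view opens here and is closed by ~ViewCloser() at the end of the scope.
{
  autoView(field_v, field, CpuWrite);          // field: some Lattice<vobj>
  for(int64_t ss=0; ss<field.Grid()->oSites(); ss++) field_v[ss] = Zero();
}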

View File

@@ -69,7 +69,6 @@ GridLogger GridLogMemory (1, "Memory", GridLogColours, "NORMAL");
GridLogger GridLogTracing(1, "Tracing", GridLogColours, "NORMAL");
GridLogger GridLogDebug (1, "Debug", GridLogColours, "PURPLE");
GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
GridLogger GridLogComms (1, "Comms", GridLogColours, "BLUE");
GridLogger GridLogDslash (1, "Dslash", GridLogColours, "BLUE");
GridLogger GridLogIterative (1, "Iterative", GridLogColours, "BLUE");
GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
@@ -85,7 +84,6 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
GridLogDebug.Active(0);
GridLogPerformance.Active(0);
GridLogDslash.Active(0);
GridLogComms.Active(0);
GridLogIntegrator.Active(1);
GridLogColours.Active(0);
GridLogHMC.Active(1);
@@ -99,7 +97,6 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1); if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1);
if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1); if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
if (logstreams[i] == std::string("Dslash")) GridLogDslash.Active(1); if (logstreams[i] == std::string("Dslash")) GridLogDslash.Active(1);
if (logstreams[i] == std::string("Comms")) GridLogComms.Active(1);
if (logstreams[i] == std::string("NoIntegrator"))GridLogIntegrator.Active(0); if (logstreams[i] == std::string("NoIntegrator"))GridLogIntegrator.Active(0);
if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0); if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0);
if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1); if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1);

View File

@@ -180,7 +180,6 @@ extern GridLogger GridLogError;
extern GridLogger GridLogWarning;
extern GridLogger GridLogMessage;
extern GridLogger GridLogDebug;
extern GridLogger GridLogComms;
extern GridLogger GridLogPerformance;
extern GridLogger GridLogDslash;
extern GridLogger GridLogIterative;

View File

@@ -154,12 +154,6 @@ public:
StencilImpl Stencil; StencilImpl Stencil;
StencilImpl StencilEven; StencilImpl StencilEven;
StencilImpl StencilOdd; StencilImpl StencilOdd;
void SloppyComms(int sloppy)
{
Stencil.SetSloppyComms(sloppy);
StencilEven.SetSloppyComms(sloppy);
StencilOdd.SetSloppyComms(sloppy);
}
// Copy of the gauge field , with even and odd subsets // Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu; DoubledGaugeField Umu;

View File

@@ -179,12 +179,6 @@ public:
StencilImpl Stencil; StencilImpl Stencil;
StencilImpl StencilEven; StencilImpl StencilEven;
StencilImpl StencilOdd; StencilImpl StencilOdd;
void SloppyComms(int sloppy)
{
Stencil.SetSloppyComms(sloppy);
StencilEven.SetSloppyComms(sloppy);
StencilOdd.SetSloppyComms(sloppy);
}
// Copy of the gauge field , with even and odd subsets // Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu; DoubledGaugeField Umu;

View File

@@ -146,12 +146,6 @@ public:
StencilImpl Stencil; StencilImpl Stencil;
StencilImpl StencilEven; StencilImpl StencilEven;
StencilImpl StencilOdd; StencilImpl StencilOdd;
void SloppyComms(int sloppy)
{
Stencil.SetSloppyComms(sloppy);
StencilEven.SetSloppyComms(sloppy);
StencilOdd.SetSloppyComms(sloppy);
}
// Copy of the gauge field , with even and odd subsets // Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu; DoubledGaugeField Umu;

View File

@@ -32,6 +32,209 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////
// Wilson compressor will need FaceGather policies for:
// Periodic, Dirichlet, and partial Dirichlet for DWF
///////////////////////////////////////////////////////////////
const int dwf_compressor_depth=2;
#define DWF_COMPRESS
class FaceGatherPartialDWF
{
public:
#ifdef DWF_COMPRESS
static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
#else
static int PartialCompressionFactor(GridBase *grid) { return 1;}
#endif
template<class vobj,class cobj,class compressor>
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs,
cobj *buffer,
compressor &compress,
int off,int so,int partial)
{
//DWF only hack: If a direction that is OFF node we use Partial Dirichlet
// Shrinks local and remote comms buffers
GridBase *Grid = rhs.Grid();
int Ls = Grid->_rdimensions[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth=Ls/2;
#endif
std::pair<int,int> *table_v = & table[0];
auto rhs_v = rhs.View(AcceleratorRead);
int vol=table.size()/Ls;
accelerator_forNB( idx,table.size(), vobj::Nsimd(), {
Integer i=idx/Ls;
Integer s=idx%Ls;
Integer sc=depth+s-(Ls-depth);
if(s<depth) compress.Compress(buffer[off+i+s*vol],rhs_v[so+table_v[idx].second]);
if(s>=Ls-depth) compress.Compress(buffer[off+i+sc*vol],rhs_v[so+table_v[idx].second]);
});
rhs_v.ViewClose();
}
template<class decompressor,class Decompression>
static void DecompressFace(decompressor decompress,Decompression &dd)
{
auto Ls = dd.dims[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth=Ls/2;
#endif
// Just pass in the Grid
auto kp = dd.kernel_p;
auto mp = dd.mpi_p;
int size= dd.buffer_size;
int vol= size/Ls;
accelerator_forNB(o,size,1,{
int idx=o/Ls;
int s=o%Ls;
if ( s < depth ) {
int oo=s*vol+idx;
kp[o]=mp[oo];
} else if ( s >= Ls-depth ) {
int sc = depth + s - (Ls-depth);
int oo=sc*vol+idx;
kp[o]=mp[oo];
} else {
kp[o] = Zero();//fill rest with zero if partial dirichlet
}
});
}
////////////////////////////////////////////////////////////////////////////////////////////
// Need to gather *interior portions* for ALL s-slices in simd directions
// Do the gather as need to treat SIMD lanes differently, and insert zeroes on receive side
// Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2.
////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial)
{
GridBase *Grid = rhs.Grid();
int Ls = Grid->_rdimensions[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth = Ls/2;
#endif
// insertion of zeroes...
assert( (table.size()&0x1)==0);
int num=table.size()/2;
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
auto rhs_v = rhs.View(AcceleratorRead);
auto p0=&pointers[0][0];
auto p1=&pointers[1][0];
auto tp=&table[0];
int nnum=num/Ls;
accelerator_forNB(j, num, vobj::Nsimd(), {
// Reorders both local and remote comms buffers
//
int s = j % Ls;
int sp1 = (s+depth)%Ls; // peri incremented s slice
int hxyz= j/Ls;
int xyz0= hxyz*2; // xyzt part of coor
int xyz1= hxyz*2+1;
int jj= hxyz + sp1*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
int kk0= xyz0*Ls + s ; // s=0 goes to s=1
int kk1= xyz1*Ls + s ; // s=Ls-1 -> s=0
compress.CompressExchange(p0[jj],p1[jj],
rhs_v[so+tp[kk0 ].second], // Same s, consecutive xyz sites
rhs_v[so+tp[kk1 ].second],
type);
});
rhs_v.ViewClose();
}
// Merge routine is for SIMD faces
template<class decompressor,class Merger>
static void MergeFace(decompressor decompress,Merger &mm)
{
auto Ls = mm.dims[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth = Ls/2;
#endif
int num= mm.buffer_size/2; // relate vol and Ls to buffer size
auto mp = &mm.mpointer[0];
auto vp0= &mm.vpointers[0][0]; // First arg is exchange first
auto vp1= &mm.vpointers[1][0];
auto type= mm.type;
int nnum = num/Ls;
accelerator_forNB(o,num,Merger::Nsimd,{
int s=o%Ls;
int hxyz=o/Ls; // xyzt related component
int xyz0=hxyz*2;
int xyz1=hxyz*2+1;
int sp = (s+depth)%Ls;
int jj= hxyz + sp*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
int oo0= s+xyz0*Ls;
int oo1= s+xyz1*Ls;
// same ss0, ss1 pair goes to new layout
decompress.Exchange(mp[oo0],mp[oo1],vp0[jj],vp1[jj],type);
});
}
};
class FaceGatherDWFMixedBCs
{
public:
#ifdef DWF_COMPRESS
static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
#else
static int PartialCompressionFactor(GridBase *grid) {return 1;}
#endif
template<class vobj,class cobj,class compressor>
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs,
cobj *buffer,
compressor &compress,
int off,int so,int partial)
{
// std::cout << " face gather simple DWF partial "<<partial <<std::endl;
if(partial) FaceGatherPartialDWF::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
else FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
}
template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial)
{
// std::cout << " face gather exch DWF partial "<<partial <<std::endl;
if(partial) FaceGatherPartialDWF::Gather_plane_exchange(table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
else FaceGatherSimple::Gather_plane_exchange (table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
}
template<class decompressor,class Merger>
static void MergeFace(decompressor decompress,Merger &mm)
{
int partial = mm.partial;
// std::cout << " merge DWF partial "<<partial <<std::endl;
if ( partial ) FaceGatherPartialDWF::MergeFace(decompress,mm);
else FaceGatherSimple::MergeFace(decompress,mm);
}
template<class decompressor,class Decompression>
static void DecompressFace(decompressor decompress,Decompression &dd)
{
int partial = dd.partial;
// std::cout << " decompress DWF partial "<<partial <<std::endl;
if ( partial ) FaceGatherPartialDWF::DecompressFace(decompress,dd);
else FaceGatherSimple::DecompressFace(decompress,dd);
}
};
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
// optimised versions supporting half precision too??? Deprecate // optimised versions supporting half precision too??? Deprecate
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
@@ -39,7 +242,8 @@ NAMESPACE_BEGIN(Grid);
//Could make FaceGather a template param, but then behaviour is runtime not compile time //Could make FaceGather a template param, but then behaviour is runtime not compile time
template<class _HCspinor,class _Hspinor,class _Spinor, class projector> template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
class WilsonCompressorTemplate : public FaceGatherSimple class WilsonCompressorTemplate : public FaceGatherDWFMixedBCs
// : public FaceGatherSimple
{ {
public: public:
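The FaceGatherPartialDWF class added above sends only the first and last depth s-slices of each face (a factor Ls/(2*depth) saving, matching PartialCompressionFactor) and zero-fills the interior slices on the receiving side. A standalone sketch of just that slice bookkeeping, with the spatial index dropped for clarity (not Grid code):

    // Sketch only: one fifth-dimension column of a face, compressed and restored.
    #include <vector>
    #include <cstdio>

    int main() {
      const int Ls = 12, depth = 2;                 // dwf_compressor_depth = 2 in the hunk
      std::vector<double> face(Ls);
      for (int s = 0; s < Ls; s++) face[s] = 100 + s;

      // Compress: pack 2*depth slices onto the wire instead of Ls
      std::vector<double> wire(2 * depth, 0.0);
      for (int s = 0; s < Ls; s++) {
        if (s < depth)       wire[s] = face[s];                         // low boundary slices
        if (s >= Ls - depth) wire[depth + s - (Ls - depth)] = face[s];  // high boundary slices
      }

      // Decompress: boundary slices restored, interior zero-filled
      std::vector<double> out(Ls, 0.0);
      for (int s = 0; s < Ls; s++) {
        if (s < depth)          out[s] = wire[s];
        else if (s >= Ls-depth) out[s] = wire[depth + s - (Ls - depth)];
        else                    out[s] = 0.0;       // "fill rest with zero if partial dirichlet"
      }
      for (int s = 0; s < Ls; s++)
        std::printf("s=%2d  in=%5.1f  out=%5.1f\n", s, face[s], out[s]);
    }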

View File

@@ -165,12 +165,6 @@ public:
StencilImpl Stencil; StencilImpl Stencil;
StencilImpl StencilEven; StencilImpl StencilEven;
StencilImpl StencilOdd; StencilImpl StencilOdd;
void SloppyComms(int sloppy)
{
Stencil.SetSloppyComms(sloppy);
StencilEven.SetSloppyComms(sloppy);
StencilOdd.SetSloppyComms(sloppy);
}
// Copy of the gauge field , with even and odd subsets // Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu; DoubledGaugeField Umu;

View File

@@ -204,14 +204,7 @@ public:
DoubledGaugeField Umu; DoubledGaugeField Umu;
DoubledGaugeField UmuEven; DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd; DoubledGaugeField UmuOdd;
void SloppyComms(int sloppy)
{
Stencil.SetSloppyComms(sloppy);
StencilEven.SetSloppyComms(sloppy);
StencilOdd.SetSloppyComms(sloppy);
}
// Comms buffer // Comms buffer
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf; // std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;

View File

@@ -57,7 +57,7 @@ public:
{ {
// RealD eps = 1.0; // RealD eps = 1.0;
// std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl; std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
std::vector<Coeff_t> zgamma(this->Ls); std::vector<Coeff_t> zgamma(this->Ls);
for(int s=0;s<this->Ls;s++){ for(int s=0;s<this->Ls;s++){
zgamma[s] = gamma[s]; zgamma[s] = gamma[s];

View File

@@ -535,7 +535,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
{ {
autoView(U_v , U,AcceleratorRead); autoView(U_v , U,AcceleratorRead);
autoView(in_v , in,AcceleratorRead); autoView(in_v , in,AcceleratorRead);
autoView(out_v,out,AcceleratorWriteDiscard); autoView(out_v,out,AcceleratorWrite);
autoView(st_v , st,AcceleratorRead); autoView(st_v , st,AcceleratorRead);
KERNEL_CALL_ID(GenericDhopSite); KERNEL_CALL_ID(GenericDhopSite);
} }

View File

@@ -118,7 +118,7 @@ protected:
GaugeK); // derivative of SmearBase GaugeK); // derivative of SmearBase
return SigmaK; return SigmaK;
} }
public:
/*! @brief Returns smeared configuration at level 'Level' */ /*! @brief Returns smeared configuration at level 'Level' */
const GaugeField &get_smeared_conf(int Level) const const GaugeField &get_smeared_conf(int Level) const
{ {

View File

@@ -819,7 +819,7 @@ public:
} // if smearingLevels = 0 do nothing } // if smearingLevels = 0 do nothing
} }
public: private:
//==================================================================== //====================================================================
// Override base clas here to mask it // Override base clas here to mask it
virtual void fill_smearedSet(GaugeField &U) virtual void fill_smearedSet(GaugeField &U)

View File

@@ -252,11 +252,6 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const{
out = in; out = in;
RealD taus = 0.; RealD taus = 0.;
// Perform initial t=0 measurements
for(auto const &meas : this->functions)
meas.second(0,taus,out);
for (unsigned int step = 1; step <= Nstep; step++) { //step indicates the number of smearing steps applied at the time of measurement for (unsigned int step = 1; step <= Nstep; step++) { //step indicates the number of smearing steps applied at the time of measurement
auto start = std::chrono::high_resolution_clock::now(); auto start = std::chrono::high_resolution_clock::now();
evolve_step(out, taus); evolve_step(out, taus);
@@ -341,11 +336,6 @@ void WilsonFlowAdaptive<Gimpl>::smear(GaugeField& out, const GaugeField& in) con
RealD taus = 0.; RealD taus = 0.;
RealD eps = init_epsilon; RealD eps = init_epsilon;
unsigned int step = 0; unsigned int step = 0;
// Perform initial t=0 measurements
for(auto const &meas : this->functions)
meas.second(step,taus,out);
do{ do{
int step_success = evolve_step_adaptive(out, taus, eps); int step_success = evolve_step_adaptive(out, taus, eps);
step += step_success; //step will not be incremented if the integration step fails step += step_success; //step will not be incremented if the integration step fails

View File

@@ -30,26 +30,25 @@
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
uint64_t DslashFullCount; uint64_t DslashFullCount;
//uint64_t DslashPartialCount; uint64_t DslashPartialCount;
uint64_t DslashDirichletCount; uint64_t DslashDirichletCount;
void DslashResetCounts(void) void DslashResetCounts(void)
{ {
DslashFullCount=0; DslashFullCount=0;
// DslashPartialCount=0; DslashPartialCount=0;
DslashDirichletCount=0; DslashDirichletCount=0;
} }
void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full) void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full)
{ {
dirichlet = DslashDirichletCount; dirichlet = DslashDirichletCount;
partial = 0; partial = DslashPartialCount;
full = DslashFullCount; full = DslashFullCount;
} }
void DslashLogFull(void) { DslashFullCount++;} void DslashLogFull(void) { DslashFullCount++;}
//void DslashLogPartial(void) { DslashPartialCount++;} void DslashLogPartial(void) { DslashPartialCount++;}
void DslashLogDirichlet(void){ DslashDirichletCount++;} void DslashLogDirichlet(void){ DslashDirichletCount++;}
deviceVector<unsigned char> StencilBuffer::DeviceCommBuf;
void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
int off,std::vector<std::pair<int,int> > & table) int off,std::vector<std::pair<int,int> > & table)

View File

@@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
// These can move into a params header and be given MacroMagic serialisation // These can move into a params header and be given MacroMagic serialisation
struct DefaultImplParams { struct DefaultImplParams {
Coordinate dirichlet; // Blocksize of dirichlet BCs Coordinate dirichlet; // Blocksize of dirichlet BCs
// int partialDirichlet; int partialDirichlet;
DefaultImplParams() { DefaultImplParams() {
dirichlet.resize(0); dirichlet.resize(0);
// partialDirichlet=0; partialDirichlet=0;
}; };
}; };
@@ -69,12 +69,6 @@ struct DefaultImplParams {
void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask, void Gather_plane_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
int off,std::vector<std::pair<int,int> > & table); int off,std::vector<std::pair<int,int> > & table);
class StencilBuffer
{
public:
static deviceVector<unsigned char> DeviceCommBuf; // placed in Stencil.cc
};
void DslashResetCounts(void); void DslashResetCounts(void);
void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full); void DslashGetCounts(uint64_t &dirichlet,uint64_t &partial,uint64_t &full);
void DslashLogFull(void); void DslashLogFull(void);
@@ -119,8 +113,8 @@ class CartesianStencilAccelerator {
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// If true, this is partially communicated per face // If true, this is partially communicated per face
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// StencilVector _comms_partial_send; StencilVector _comms_partial_send;
// StencilVector _comms_partial_recv; StencilVector _comms_partial_recv;
// //
StencilVector _comm_buf_size; StencilVector _comm_buf_size;
StencilVector _permute_type; StencilVector _permute_type;
@@ -192,11 +186,6 @@ public:
void ViewClose(void) { } void ViewClose(void) { }
#ifdef GRID_LOG_VIEWS
size_t size() { return 0; };
uint64_t & operator[](size_t i) { static uint64_t v=0; return v; };
#endif
}; };
//////////////////////////////////////// ////////////////////////////////////////
@@ -216,16 +205,16 @@ public:
struct Packet { struct Packet {
void * send_buf; void * send_buf;
void * recv_buf; void * recv_buf;
void * compressed_send_buf; #ifndef ACCELERATOR_AWARE_MPI
void * compressed_recv_buf; void * host_send_buf; // Allocate this if not MPI_CUDA_AWARE
void * host_recv_buf; // Allocate this if not MPI_CUDA_AWARE
#endif
Integer to_rank; Integer to_rank;
Integer from_rank; Integer from_rank;
Integer do_send; Integer do_send;
Integer do_recv; Integer do_recv;
Integer xbytes; Integer xbytes;
Integer rbytes; Integer rbytes;
Integer xbytes_compressed;
Integer rbytes_compressed;
}; };
struct Merge { struct Merge {
static constexpr int Nsimd = vobj::Nsimd(); static constexpr int Nsimd = vobj::Nsimd();
@@ -234,7 +223,7 @@ public:
std::vector<cobj *> vpointers; std::vector<cobj *> vpointers;
Integer buffer_size; Integer buffer_size;
Integer type; Integer type;
// Integer partial; // partial dirichlet BCs Integer partial; // partial dirichlet BCs
Coordinate dims; Coordinate dims;
}; };
struct Decompress { struct Decompress {
@@ -242,7 +231,7 @@ public:
cobj * kernel_p; cobj * kernel_p;
cobj * mpi_p; cobj * mpi_p;
Integer buffer_size; Integer buffer_size;
// Integer partial; // partial dirichlet BCs Integer partial; // partial dirichlet BCs
Coordinate dims; Coordinate dims;
}; };
struct CopyReceiveBuffer { struct CopyReceiveBuffer {
@@ -263,45 +252,9 @@ public:
protected: protected:
GridBase * _grid; GridBase * _grid;
///////////////////////////////////////////////////
// Sloppy comms will make a second buffer upon comms
///////////////////////////////////////////////////
size_t device_heap_top; //
size_t device_heap_bytes;//
size_t device_heap_size; //
void *DeviceBufferMalloc(size_t bytes)
{
void *ptr = (void *)device_heap_top;
device_heap_top += bytes;
device_heap_bytes+= bytes;
if ( device_heap_bytes > device_heap_size ) {
std::cout << "DeviceBufferMalloc overflow bytes "<<bytes<<" heap bytes "<<device_heap_bytes<<" heap size "<<device_heap_size<<std::endl;
assert (device_heap_bytes <= device_heap_size);
}
return ptr;
}
void DeviceBufferFreeAll(void)
{
device_heap_size = _unified_buffer_size*sizeof(cobj);
// Resize up if necessary, never down
if ( StencilBuffer::DeviceCommBuf.size() < device_heap_size ) {
StencilBuffer::DeviceCommBuf.resize(device_heap_size);
}
device_heap_top =(size_t) &StencilBuffer::DeviceCommBuf[0];
device_heap_size = StencilBuffer::DeviceCommBuf.size();
device_heap_bytes=0;
}
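DeviceBufferMalloc/DeviceBufferFreeAll above implement a bump allocator over a single device slab: allocation just advances a pointer, and "free" is one reset of the whole slab before the next communication phase. A host-side sketch of the same pattern, with a plain std::vector standing in for StencilBuffer::DeviceCommBuf (not Grid code):

    // Sketch only: bump allocation over one reusable slab.
    #include <cassert>
    #include <cstddef>
    #include <vector>

    struct BumpHeap {
      std::vector<unsigned char> slab;   // stands in for StencilBuffer::DeviceCommBuf
      size_t top = 0;

      void reset(size_t required) {      // analogue of DeviceBufferFreeAll
        if (slab.size() < required) slab.resize(required);  // grow, never shrink
        top = 0;
      }
      void *alloc(size_t bytes) {        // analogue of DeviceBufferMalloc
        assert(top + bytes <= slab.size() && "bump heap overflow");
        void *p = &slab[top];
        top += bytes;
        return p;
      }
    };

    int main() {
      BumpHeap h;
      h.reset(1 << 20);                  // sized for this communication phase
      void *send = h.alloc(4096);        // e.g. a compressed send buffer
      void *recv = h.alloc(8192);        // e.g. a compressed recv buffer
      (void)send; (void)recv;
      h.reset(1 << 20);                  // next phase reuses the same slab
    }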
public: public:
GridBase *Grid(void) const { return _grid; } GridBase *Grid(void) const { return _grid; }
/////////////////////////////////////////////////////////
// Control reduced precision comms
/////////////////////////////////////////////////////////
int SloppyComms;
void SetSloppyComms(int sloppy) { SloppyComms = sloppy; };
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Needed to conveniently communicate gparity parameters into GPU memory // Needed to conveniently communicate gparity parameters into GPU memory
// without adding parameters. Perhaps a template parameter to StenciView is // without adding parameters. Perhaps a template parameter to StenciView is
@@ -315,7 +268,7 @@ public:
} }
int face_table_computed; int face_table_computed;
// int partialDirichlet; int partialDirichlet;
int fullDirichlet; int fullDirichlet;
std::vector<deviceVector<std::pair<int,int> > > face_table ; std::vector<deviceVector<std::pair<int,int> > > face_table ;
deviceVector<int> surface_list; deviceVector<int> surface_list;
@@ -408,145 +361,24 @@ public:
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Non blocking send and receive. Necessarily parallel. // Non blocking send and receive. Necessarily parallel.
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
void DecompressPacket(Packet &packet)
{
if ( !SloppyComms ) return;
if ( packet.do_recv && _grid->IsOffNode(packet.from_rank) ) {
typedef typename getPrecision<cobj>::real_scalar_type word;
uint64_t words = packet.rbytes/sizeof(word);
const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
const uint64_t outer = words/nsimd;
if(sizeof(word)==8) {
// Can either choose to represent as float vs double and prec change
// OR
// truncate the mantissa bfp16 style
double *dbuf =(double *) packet.recv_buf;
float *fbuf =(float *) packet.compressed_recv_buf;
accelerator_forNB(ss,outer,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
dbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]; //conversion
});
} else if ( sizeof(word)==4){
// Can either choose to represent as half vs float and prec change
// OR
// truncate the mantissa bfp16 style
uint32_t *fbuf =(uint32_t *) packet.recv_buf;
uint16_t *hbuf =(uint16_t *) packet.compressed_recv_buf;
accelerator_forNB(ss,outer,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
fbuf[ss*nsimd+lane] = ((uint32_t)hbuf[ss*nsimd+lane])<<16; //copy back and pad each word with zeroes
});
} else {
assert(0 && "unknown floating point precision");
}
}
}
void CompressPacket(Packet &packet)
{
packet.xbytes_compressed = packet.xbytes;
packet.compressed_send_buf = packet.send_buf;
packet.rbytes_compressed = packet.rbytes;
packet.compressed_recv_buf = packet.recv_buf;
if ( !SloppyComms ) {
return;
}
typedef typename getPrecision<cobj>::real_scalar_type word;
uint64_t words = packet.xbytes/sizeof(word);
const int nsimd = sizeof(typename cobj::vector_type)/sizeof(word);
const uint64_t outer = words/nsimd;
if (packet.do_recv && _grid->IsOffNode(packet.from_rank) ) {
packet.rbytes_compressed = packet.rbytes/2;
packet.compressed_recv_buf = DeviceBufferMalloc(packet.rbytes_compressed);
// std::cout << " CompressPacket recv from "<<packet.from_rank<<" "<<std::hex<<packet.compressed_recv_buf<<std::dec<<std::endl;
}
//else {
// std::cout << " CompressPacket recv is uncompressed from "<<packet.from_rank<<" "<<std::hex<<packet.compressed_recv_buf<<std::dec<<std::endl;
// }
if (packet.do_send && _grid->IsOffNode(packet.to_rank) ) {
packet.xbytes_compressed = packet.xbytes/2;
packet.compressed_send_buf = DeviceBufferMalloc(packet.xbytes_compressed);
// std::cout << " CompressPacket send to "<<packet.to_rank<<" "<<std::hex<<packet.compressed_send_buf<<std::dec<<std::endl;
if(sizeof(word)==8) {
double *dbuf =(double *) packet.send_buf;
float *fbuf =(float *) packet.compressed_send_buf;
accelerator_forNB(ss,outer,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
fbuf[ss*nsimd+lane] = dbuf[ss*nsimd+lane]; // convert fp64 to fp32
});
} else if ( sizeof(word)==4){
uint32_t *fbuf =(uint32_t *) packet.send_buf;
uint16_t *hbuf =(uint16_t *) packet.compressed_send_buf;
accelerator_forNB(ss,outer,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
hbuf[ss*nsimd+lane] = fbuf[ss*nsimd+lane]>>16; // convert as in Bagel/BFM ; bfloat16 ; s7e8 Intel patent
});
} else {
assert(0 && "unknown floating point precision");
}
}
// else {
// std::cout << " CompressPacket send is uncompressed to "<<packet.to_rank<<" "<<std::hex<<packet.compressed_send_buf<<std::dec<<std::endl;
// }
return;
}
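The CompressPacket/DecompressPacket pair above halves off-node traffic by sending fp64 data as fp32, and fp32 data as bfloat16 obtained by truncating the low 16 bits and zero-padding them on receive, as the s7e8 comment notes. A standalone host-side sketch of those two conversions (not Grid code, which applies them in accelerator kernels over whole buffers):

    // Sketch only: lossy "sloppy comms" conversions, truncation not rounding.
    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    static uint16_t fp32_to_bf16(float f) {
      uint32_t u; std::memcpy(&u, &f, sizeof(u));
      return (uint16_t)(u >> 16);        // keep sign, 8-bit exponent, top 7 mantissa bits
    }
    static float bf16_to_fp32(uint16_t h) {
      uint32_t u = ((uint32_t)h) << 16;  // pad the dropped mantissa bits with zeroes
      float f; std::memcpy(&f, &u, sizeof(f));
      return f;
    }

    int main() {
      double d = 3.141592653589793;
      float  as_fp32 = (float)d;                       // fp64 path: ordinary conversion
      float  as_bf16 = bf16_to_fp32(fp32_to_bf16(3.14159265f)); // fp32 path: bf16 truncation
      std::printf("fp64->fp32 %.9f   fp32->bf16->fp32 %.9f\n", as_fp32, as_bf16);
    }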
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs) void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{ {
// std::cout << "Communicate Begin "<<std::endl;
// _grid->Barrier();
FlightRecorder::StepLog("Communicate begin"); FlightRecorder::StepLog("Communicate begin");
///////////////////////////////////////////////
// All GPU kernel tasks must complete // All GPU kernel tasks must complete
// accelerator_barrier(); All kernels should ALREADY be complete // accelerator_barrier(); // All kernels should ALREADY be complete
//Everyone is here, so noone running slow and still using receive buffer // _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
_grid->StencilBarrier(); // But the HaloGather had a barrier too.
// But the HaloGather had a barrier too.
///////////////////////////////////////////////
if (SloppyComms) {
DeviceBufferFreeAll();
}
for(int i=0;i<Packets.size();i++){
this->CompressPacket(Packets[i]);
}
if (SloppyComms) {
accelerator_barrier();
#ifdef NVLINK_GET
_grid->StencilBarrier();
#endif
}
for(int i=0;i<Packets.size();i++){ for(int i=0;i<Packets.size();i++){
// std::cout << "Communicate prepare "<<i<<std::endl; // std::cout << "Communicate prepare "<<i<<std::endl;
// _grid->Barrier(); // _grid->Barrier();
_grid->StencilSendToRecvFromPrepare(MpiReqs, _grid->StencilSendToRecvFromPrepare(MpiReqs,
Packets[i].compressed_send_buf, Packets[i].send_buf,
Packets[i].to_rank,Packets[i].do_send, Packets[i].to_rank,Packets[i].do_send,
Packets[i].compressed_recv_buf, Packets[i].recv_buf,
Packets[i].from_rank,Packets[i].do_recv, Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i); Packets[i].xbytes,Packets[i].rbytes,i);
} }
// std::cout << "Communicate PollDtoH "<<std::endl; // std::cout << "Communicate PollDtoH "<<std::endl;
// _grid->Barrier(); // _grid->Barrier();
@@ -557,22 +389,18 @@ public:
// Starts intranode // Starts intranode
for(int i=0;i<Packets.size();i++){ for(int i=0;i<Packets.size();i++){
// std::cout << "Communicate Begin "<<i<<std::endl; // std::cout << "Communicate Begin "<<i<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromBegin(MpiReqs, _grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].send_buf,Packets[i].compressed_send_buf, Packets[i].send_buf,
Packets[i].to_rank,Packets[i].do_send, Packets[i].to_rank,Packets[i].do_send,
Packets[i].recv_buf,Packets[i].compressed_recv_buf, Packets[i].recv_buf,
Packets[i].from_rank,Packets[i].do_recv, Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes_compressed,Packets[i].rbytes_compressed,i); Packets[i].xbytes,Packets[i].rbytes,i);
// std::cout << "Communicate Begin started "<<i<<std::endl;
// _grid->Barrier();
} }
FlightRecorder::StepLog("Communicate begin has finished");
// Get comms started then run checksums // Get comms started then run checksums
// Having this PRIOR to the dslash seems to make Sunspot work... (!) // Having this PRIOR to the dslash seems to make Sunspot work... (!)
for(int i=0;i<Packets.size();i++){ for(int i=0;i<Packets.size();i++){
if ( Packets[i].do_send ) if ( Packets[i].do_send )
FlightRecorder::xmitLog(Packets[i].compressed_send_buf,Packets[i].xbytes_compressed); FlightRecorder::xmitLog(Packets[i].send_buf,Packets[i].xbytes);
} }
} }
@@ -587,15 +415,14 @@ public:
// std::cout << "Communicate Complete Complete "<<std::endl; // std::cout << "Communicate Complete Complete "<<std::endl;
// _grid->Barrier(); // _grid->Barrier();
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
// if ( this->partialDirichlet ) DslashLogPartial(); if ( this->partialDirichlet ) DslashLogPartial();
if ( this->fullDirichlet ) DslashLogDirichlet(); else if ( this->fullDirichlet ) DslashLogDirichlet();
else DslashLogFull(); else DslashLogFull();
// acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete // acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
// accelerator_barrier(); // accelerator_barrier();
for(int i=0;i<Packets.size();i++){ for(int i=0;i<Packets.size();i++){
this->DecompressPacket(Packets[i]);
if ( Packets[i].do_recv ) if ( Packets[i].do_recv )
FlightRecorder::recvLog(Packets[i].compressed_recv_buf,Packets[i].rbytes_compressed,Packets[i].from_rank); FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
} }
FlightRecorder::StepLog("Finish communicate complete"); FlightRecorder::StepLog("Finish communicate complete");
} }
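CommunicateBegin/CommunicateComplete above follow the usual split of non-blocking halo exchange: post all sends and receives, return so compute can overlap, then wait for everything later. A plain-MPI sketch of that structure, with a hypothetical ToyPacket in place of the Packet struct and of Grid's StencilSendToRecvFrom machinery:

    // Sketch only: begin/complete split with non-blocking MPI.
    #include <mpi.h>
    #include <vector>

    struct ToyPacket {
      void *send_buf; void *recv_buf;
      int   to_rank,  from_rank;
      int   xbytes,   rbytes;
    };

    void CommunicateBegin(std::vector<ToyPacket> &pkts, std::vector<MPI_Request> &reqs) {
      for (auto &p : pkts) {             // post everything, return without waiting
        MPI_Request r;
        MPI_Irecv(p.recv_buf, p.rbytes, MPI_CHAR, p.from_rank, 0, MPI_COMM_WORLD, &r);
        reqs.push_back(r);
        MPI_Isend(p.send_buf, p.xbytes, MPI_CHAR, p.to_rank,   0, MPI_COMM_WORLD, &r);
        reqs.push_back(r);
      }
    }

    void CommunicateComplete(std::vector<MPI_Request> &reqs) {
      MPI_Waitall((int)reqs.size(), reqs.data(), MPI_STATUSES_IGNORE);
      reqs.clear();                      // halo data is now safe to consume
    }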
@@ -790,7 +617,7 @@ public:
} }
void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector<Decompress> &dv) { void AddDecompress(cobj *k_p,cobj *m_p,Integer buffer_size,std::vector<Decompress> &dv) {
Decompress d; Decompress d;
// d.partial = this->partialDirichlet; d.partial = this->partialDirichlet;
d.dims = _grid->_fdimensions; d.dims = _grid->_fdimensions;
d.kernel_p = k_p; d.kernel_p = k_p;
d.mpi_p = m_p; d.mpi_p = m_p;
@@ -799,7 +626,7 @@ public:
} }
void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) { void AddMerge(cobj *merge_p,std::vector<cobj *> &rpointers,Integer buffer_size,Integer type,std::vector<Merge> &mv) {
Merge m; Merge m;
// m.partial = this->partialDirichlet; m.partial = this->partialDirichlet;
m.dims = _grid->_fdimensions; m.dims = _grid->_fdimensions;
m.type = type; m.type = type;
m.mpointer = merge_p; m.mpointer = merge_p;
@@ -904,8 +731,8 @@ public:
int block = dirichlet_block[dimension]; int block = dirichlet_block[dimension];
this->_comms_send[ii] = comm_dim; this->_comms_send[ii] = comm_dim;
this->_comms_recv[ii] = comm_dim; this->_comms_recv[ii] = comm_dim;
// this->_comms_partial_send[ii] = 0; this->_comms_partial_send[ii] = 0;
// this->_comms_partial_recv[ii] = 0; this->_comms_partial_recv[ii] = 0;
if ( block && comm_dim ) { if ( block && comm_dim ) {
assert(abs(displacement) < ld ); assert(abs(displacement) < ld );
// Quiesce communication across block boundaries // Quiesce communication across block boundaries
@@ -926,10 +753,10 @@ public:
if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0; if ( ( (ld*(pc+1) ) % block ) == 0 ) this->_comms_send[ii] = 0;
if ( ( (ld*pc ) % block ) == 0 ) this->_comms_recv[ii] = 0; if ( ( (ld*pc ) % block ) == 0 ) this->_comms_recv[ii] = 0;
} }
// if ( partialDirichlet ) { if ( partialDirichlet ) {
// this->_comms_partial_send[ii] = !this->_comms_send[ii]; this->_comms_partial_send[ii] = !this->_comms_send[ii];
// this->_comms_partial_recv[ii] = !this->_comms_recv[ii]; this->_comms_partial_recv[ii] = !this->_comms_recv[ii];
// } }
} }
} }
} }
@@ -941,7 +768,6 @@ public:
Parameters p=Parameters(), Parameters p=Parameters(),
bool preserve_shm=false) bool preserve_shm=false)
{ {
SloppyComms = 0;
face_table_computed=0; face_table_computed=0;
_grid = grid; _grid = grid;
this->parameters=p; this->parameters=p;
@@ -959,7 +785,7 @@ public:
this->same_node.resize(npoints); this->same_node.resize(npoints);
if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0); if ( p.dirichlet.size() ==0 ) p.dirichlet.resize(grid->Nd(),0);
// partialDirichlet = p.partialDirichlet; partialDirichlet = p.partialDirichlet;
DirichletBlock(p.dirichlet); // comms send/recv set up DirichletBlock(p.dirichlet); // comms send/recv set up
fullDirichlet=0; fullDirichlet=0;
for(int d=0;d<p.dirichlet.size();d++){ for(int d=0;d<p.dirichlet.size();d++){
@@ -1040,7 +866,7 @@ public:
///////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////
const int Nsimd = grid->Nsimd(); const int Nsimd = grid->Nsimd();
// Allow for multiple stencils to be communicated simultaneously // Allow for multiple stencils to exist simultaneously
if (!preserve_shm) if (!preserve_shm)
_grid->ShmBufferFreeAll(); _grid->ShmBufferFreeAll();
@@ -1108,8 +934,7 @@ public:
GridBase *grid=_grid; GridBase *grid=_grid;
const int Nsimd = grid->Nsimd(); const int Nsimd = grid->Nsimd();
// int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ; int comms_recv = this->_comms_recv[point] || this->_comms_partial_recv[point] ;
int comms_recv = this->_comms_recv[point];
int fd = _grid->_fdimensions[dimension]; int fd = _grid->_fdimensions[dimension];
int ld = _grid->_ldimensions[dimension]; int ld = _grid->_ldimensions[dimension];
int rd = _grid->_rdimensions[dimension]; int rd = _grid->_rdimensions[dimension];
@@ -1298,8 +1123,8 @@ public:
int comms_send = this->_comms_send[point]; int comms_send = this->_comms_send[point];
int comms_recv = this->_comms_recv[point]; int comms_recv = this->_comms_recv[point];
// int comms_partial_send = this->_comms_partial_send[point] ; int comms_partial_send = this->_comms_partial_send[point] ;
// int comms_partial_recv = this->_comms_partial_recv[point] ; int comms_partial_recv = this->_comms_partial_recv[point] ;
assert(rhs.Grid()==_grid); assert(rhs.Grid()==_grid);
// conformable(_grid,rhs.Grid()); // conformable(_grid,rhs.Grid());
@@ -1334,11 +1159,11 @@ public:
int rbytes; int rbytes;
if ( comms_send ) xbytes = bytes; // Full send if ( comms_send ) xbytes = bytes; // Full send
// else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid); else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
else xbytes = 0; // full dirichlet else xbytes = 0; // full dirichlet
if ( comms_recv ) rbytes = bytes; if ( comms_recv ) rbytes = bytes;
// else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid); else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
else rbytes = 0; else rbytes = 0;
int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane int so = sx*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
@@ -1365,8 +1190,7 @@ public:
} }
// if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) { if ( (compress.DecompressionStep()&&comms_recv) || comms_partial_recv ) {
if ( compress.DecompressionStep()&&comms_recv) {
recv_buf=u_simd_recv_buf[0]; recv_buf=u_simd_recv_buf[0];
} else { } else {
recv_buf=this->u_recv_buf_p; recv_buf=this->u_recv_buf_p;
@@ -1400,8 +1224,7 @@ public:
#endif #endif
// std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl; // std::cout << " GatherPlaneSimple partial send "<< comms_partial_send<<std::endl;
// compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send); compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,comms_partial_send);
compressor::Gather_plane_simple(face_table[face_idx],rhs,send_buf,compress,comm_off,so,0);
int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,xbytes,rbytes,cbmask); int duplicate = CheckForDuplicate(dimension,sx,comm_proc,(void *)&recv_buf[comm_off],0,xbytes,rbytes,cbmask);
if ( !duplicate ) { // Force comms for now if ( !duplicate ) { // Force comms for now
@@ -1410,8 +1233,8 @@ public:
// Build a list of things to do after we synchronise GPUs // Build a list of things to do after we synchronise GPUs
// Start comms now??? // Start comms now???
/////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////
int do_send = (comms_send) && (!shm_send ); int do_send = (comms_send|comms_partial_send) && (!shm_send );
int do_recv = (comms_send) && (!shm_recv ); int do_recv = (comms_send|comms_partial_send) && (!shm_recv );
AddPacket((void *)&send_buf[comm_off], AddPacket((void *)&send_buf[comm_off],
(void *)&recv_buf[comm_off], (void *)&recv_buf[comm_off],
xmit_to_rank, do_send, xmit_to_rank, do_send,
@@ -1419,7 +1242,7 @@ public:
xbytes,rbytes); xbytes,rbytes);
} }
if ( (compress.DecompressionStep() && comms_recv) ) { if ( (compress.DecompressionStep() && comms_recv) || comms_partial_recv ) {
AddDecompress(&this->u_recv_buf_p[comm_off], AddDecompress(&this->u_recv_buf_p[comm_off],
&recv_buf[comm_off], &recv_buf[comm_off],
words,Decompressions); words,Decompressions);
@@ -1441,8 +1264,8 @@ public:
int comms_send = this->_comms_send[point]; int comms_send = this->_comms_send[point];
int comms_recv = this->_comms_recv[point]; int comms_recv = this->_comms_recv[point];
// int comms_partial_send = this->_comms_partial_send[point] ; int comms_partial_send = this->_comms_partial_send[point] ;
// int comms_partial_recv = this->_comms_partial_recv[point] ; int comms_partial_recv = this->_comms_partial_recv[point] ;
int fd = _grid->_fdimensions[dimension]; int fd = _grid->_fdimensions[dimension];
int rd = _grid->_rdimensions[dimension]; int rd = _grid->_rdimensions[dimension];
@@ -1517,20 +1340,18 @@ public:
if ( comms_send ) xbytes = bytes; if ( comms_send ) xbytes = bytes;
// else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid); else if ( comms_partial_send ) xbytes = bytes/compressor::PartialCompressionFactor(_grid);
else xbytes = 0; else xbytes = 0;
if ( comms_recv ) rbytes = bytes; if ( comms_recv ) rbytes = bytes;
// else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid); else if ( comms_partial_recv ) rbytes = bytes/compressor::PartialCompressionFactor(_grid);
else rbytes = 0; else rbytes = 0;
// Gathers SIMD lanes for send and merge // Gathers SIMD lanes for send and merge
// Different faces can be full comms or partial comms with multiple ranks per node // Different faces can be full comms or partial comms with multiple ranks per node
// if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) { if ( comms_send || comms_recv||comms_partial_send||comms_partial_recv ) {
if ( comms_send || comms_recv ) {
// int partial = partialDirichlet; int partial = partialDirichlet;
int partial = 0;
compressor::Gather_plane_exchange(face_table[face_idx],rhs, compressor::Gather_plane_exchange(face_table[face_idx],rhs,
spointers,dimension,sx,cbmask, spointers,dimension,sx,cbmask,
compress,permute_type,partial ); compress,permute_type,partial );
@@ -1596,8 +1417,7 @@ public:
if ( (bytes != rbytes) && (rbytes!=0) ){ if ( (bytes != rbytes) && (rbytes!=0) ){
acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero acceleratorMemSet(rp,0,bytes); // Zero prefill comms buffer to zero
} }
// int do_send = (comms_send|comms_partial_send) && (!shm_send ); int do_send = (comms_send|comms_partial_send) && (!shm_send );
int do_send = (comms_send) && (!shm_send );
AddPacket((void *)sp,(void *)rp, AddPacket((void *)sp,(void *)rp,
xmit_to_rank,do_send, xmit_to_rank,do_send,
recv_from_rank,do_send, recv_from_rank,do_send,
@@ -1611,8 +1431,7 @@ public:
} }
} }
// rpointer may be doing a remote read in the gather over SHM // rpointer may be doing a remote read in the gather over SHM
// if ( comms_recv|comms_partial_recv ) { if ( comms_recv|comms_partial_recv ) {
if ( comms_recv ) {
AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers); AddMerge(&this->u_recv_buf_p[comm_off],rpointers,reduced_buffer_size,permute_type,Mergers);
} }

View File

@@ -67,7 +67,7 @@ void acceleratorInit(void)
printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name); printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name);
GPU_PROP_FMT(totalGlobalMem,"%zu"); GPU_PROP_FMT(totalGlobalMem,"%lld");
GPU_PROP(managedMemory); GPU_PROP(managedMemory);
GPU_PROP(isMultiGpuBoard); GPU_PROP(isMultiGpuBoard);
GPU_PROP(warpSize); GPU_PROP(warpSize);
@@ -240,7 +240,7 @@ void acceleratorInit(void)
char hostname[HOST_NAME_MAX+1]; char hostname[HOST_NAME_MAX+1];
gethostname(hostname, HOST_NAME_MAX+1); gethostname(hostname, HOST_NAME_MAX+1);
if ( rank==0 ) printf("AcceleratorSyclInit world_rank %d is host %s \n",world_rank,hostname); if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname);
auto devices = sycl::device::get_devices(); auto devices = sycl::device::get_devices();
for(int d = 0;d<devices.size();d++){ for(int d = 0;d<devices.size();d++){

View File

@@ -215,7 +215,7 @@ inline void *acceleratorAllocHost(size_t bytes)
auto err = cudaMallocHost((void **)&ptr,bytes); auto err = cudaMallocHost((void **)&ptr,bytes);
if( err != cudaSuccess ) { if( err != cudaSuccess ) {
ptr = (void *) NULL; ptr = (void *) NULL;
printf(" cudaMallocHost failed for %zu %s \n",bytes,cudaGetErrorString(err)); printf(" cudaMallocHost failed for %d %s \n",bytes,cudaGetErrorString(err));
assert(0); assert(0);
} }
return ptr; return ptr;
@@ -226,7 +226,7 @@ inline void *acceleratorAllocShared(size_t bytes)
auto err = cudaMallocManaged((void **)&ptr,bytes); auto err = cudaMallocManaged((void **)&ptr,bytes);
if( err != cudaSuccess ) { if( err != cudaSuccess ) {
ptr = (void *) NULL; ptr = (void *) NULL;
printf(" cudaMallocManaged failed for %zu %s \n",bytes,cudaGetErrorString(err)); printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
assert(0); assert(0);
} }
return ptr; return ptr;
@@ -237,7 +237,7 @@ inline void *acceleratorAllocDevice(size_t bytes)
auto err = cudaMalloc((void **)&ptr,bytes); auto err = cudaMalloc((void **)&ptr,bytes);
if( err != cudaSuccess ) { if( err != cudaSuccess ) {
ptr = (void *) NULL; ptr = (void *) NULL;
printf(" cudaMalloc failed for %zu %s \n",bytes,cudaGetErrorString(err)); printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
} }
return ptr; return ptr;
}; };
@@ -251,7 +251,7 @@ inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { c
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);} inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);} inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
acceleratorCopyToDevice(from,to,bytes); acceleratorCopyToDevice(to,from,bytes, cudaMemcpyHostToDevice);
return 0; return 0;
} }
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
@@ -337,7 +337,7 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
cgh.parallel_for( \ cgh.parallel_for( \
sycl::nd_range<3>(global,local), \ sycl::nd_range<3>(global,local), \
[=] (sycl::nd_item<3> item) /*mutable*/ \ [=] (sycl::nd_item<3> item) /*mutable*/ \
[[sycl::reqd_sub_group_size(16)]] \ [[intel::reqd_sub_group_size(16)]] \
{ \ { \
auto iter1 = item.get_global_id(0); \ auto iter1 = item.get_global_id(0); \
auto iter2 = item.get_global_id(1); \ auto iter2 = item.get_global_id(1); \

View File

@@ -28,6 +28,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */ /* END LEGAL */
#pragma once #pragma once
#ifndef MIN
#define MIN(x,y) ((x)>(y)?(y):(x))
#endif
// Introduce a class to gain deterministic bit reproducible reduction. // Introduce a class to gain deterministic bit reproducible reduction.
// make static; perhaps just a namespace is required. // make static; perhaps just a namespace is required.
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);

View File

@@ -372,53 +372,4 @@ void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
} }
} }
#ifdef GRID_LOG_VIEWS
bool ViewLogger::Enabled = false;
std::vector<ViewLogger::Entry_t> ViewLogger::LogVector;
void ViewLogger::Begin() { Enabled = true; LogVector.resize(0); }
void ViewLogger::End() { Enabled = false; }
void ViewLogger::Log(const char* filename, int line, int index, int mode, void* data, uint64_t bytes)
{
if (!Enabled)
return;
size_t i = LogVector.size();
LogVector.resize(i + 1);
auto & n = LogVector[i];
n.filename = filename;
n.line = line;
n.index = index;
if (bytes < sizeof(uint64_t)) {
n.head = n.tail = 0;
} else {
switch (mode) {
case AcceleratorRead:
case AcceleratorWrite:
case AcceleratorWriteDiscard:
acceleratorCopyFromDevice((char*)data, &n.head, sizeof(uint64_t));
acceleratorCopyFromDevice((char*)data + bytes - sizeof(uint64_t), &n.tail, sizeof(uint64_t));
break;
case CpuRead:
case CpuWrite:
//case CpuWriteDiscard:
n.head = *(uint64_t*)data;
n.tail = *(uint64_t*)((char*)data + bytes - sizeof(uint64_t));
break;
}
}
}
#endif
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
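The removed ViewLogger records only the first and last 64-bit words of each viewed buffer, a cheap fingerprint that lets two runs be compared without storing whole fields. A standalone sketch of that head/tail capture for a host buffer (not Grid code):

    // Sketch only: head/tail fingerprint of a buffer.
    #include <cstdint>
    #include <cstring>
    #include <cstdio>
    #include <vector>

    struct ViewFingerprint { uint64_t head, tail; };

    ViewFingerprint fingerprint(const void *data, uint64_t bytes) {
      ViewFingerprint n{0, 0};
      if (bytes >= sizeof(uint64_t)) {
        std::memcpy(&n.head, data, sizeof(uint64_t));
        std::memcpy(&n.tail, (const char *)data + bytes - sizeof(uint64_t), sizeof(uint64_t));
      }
      return n;
    }

    int main() {
      std::vector<double> field(64, 1.5);
      auto f = fingerprint(field.data(), field.size() * sizeof(double));
      std::printf("head %016lx tail %016lx\n", (unsigned long)f.head, (unsigned long)f.tail);
    }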

View File

@@ -42,22 +42,5 @@ class FlightRecorder {
static void xmitLog(void *,uint64_t bytes); static void xmitLog(void *,uint64_t bytes);
static void recvLog(void *,uint64_t bytes,int rank); static void recvLog(void *,uint64_t bytes,int rank);
}; };
#ifdef GRID_LOG_VIEWS
class ViewLogger {
struct Entry_t {
const char* filename;
int line;
int index;
uint64_t head, tail;
};
public:
static bool Enabled;
static std::vector<Entry_t> LogVector;
static void Begin();
static void End();
static void Log(const char* filename, int line, int index, int mode, void* data, uint64_t bytes);
};
#endif
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -46,14 +46,10 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <cstdlib> #include <cstdlib>
#include <memory> #include <memory>
#include <Grid/Grid.h> #include <Grid/Grid.h>
#include <Grid/util/CompilerCompatible.h> #include <Grid/util/CompilerCompatible.h>
#ifdef HAVE_UNWIND
#include <libunwind.h>
#endif
#include <fenv.h> #include <fenv.h>
#ifdef __APPLE__ #ifdef __APPLE__
@@ -299,20 +295,6 @@ void GridBanner(void)
std::cout << std::setprecision(9); std::cout << std::setprecision(9);
} }
//Some file local variables
static int fileno_stdout;
static int fileno_stderr;
static int signal_delay;
class dlRegion {
public:
uint64_t start;
uint64_t end;
uint64_t size;
uint64_t offset;
std::string name;
};
std::vector<dlRegion> dlMap;
void Grid_init(int *argc,char ***argv) void Grid_init(int *argc,char ***argv)
{ {
@@ -365,19 +347,6 @@ void Grid_init(int *argc,char ***argv)
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
Grid_debug_handler_init(); Grid_debug_handler_init();
} }
// Sleep n-seconds at end of handler
if( GridCmdOptionExists(*argv,*argv+*argc,"--signal-delay") ){
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--signal-delay");
GridCmdOptionInt(arg,signal_delay);
}
// periodic wakeup with stack trace printed
if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-heartbeat") ){
Grid_debug_heartbeat();
}
// periodic wakeup with empty handler (interrupts some system calls)
if( GridCmdOptionExists(*argv,*argv+*argc,"--heartbeat") ){
Grid_heartbeat();
}
#if defined(A64FX) #if defined(A64FX)
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){ if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
@@ -416,52 +385,26 @@ void Grid_init(int *argc,char ***argv)
} else { } else {
FILE *fp; FILE *fp;
std::ostringstream fname; std::ostringstream fname;
int rank = CartesianCommunicator::RankWorld();
int radix=64;
char* root = getenv("GRID_STDOUT_ROOT");
if (root) {
fname << root ;
mkdir(fname.str().c_str(), S_IRWXU );
fname << "/";
}
fname << (rank/radix)*radix ;
mkdir(fname.str().c_str(), S_IRWXU );
fname << "/";
fname<<"Grid.stdout."; fname<<"Grid.stdout.";
fname<<CartesianCommunicator::RankWorld(); fname<<CartesianCommunicator::RankWorld();
fp=freopen(fname.str().c_str(),"w",stdout); fp=freopen(fname.str().c_str(),"w",stdout);
assert(fp!=(FILE *)NULL); assert(fp!=(FILE *)NULL);
std::ostringstream ename; std::ostringstream ename;
if (root){
ename << root << "/";
}
ename << (rank/radix)*radix << "/";
ename<<"Grid.stderr."; ename<<"Grid.stderr.";
ename<<CartesianCommunicator::RankWorld(); ename<<CartesianCommunicator::RankWorld();
fp=freopen(ename.str().c_str(),"w",stderr); fp=freopen(ename.str().c_str(),"w",stderr);
assert(fp!=(FILE *)NULL); assert(fp!=(FILE *)NULL);
} }
fileno_stdout = fileno(stdout);
fileno_stderr = fileno(stderr) ;
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
// OK to use GridLogMessage etc from here on // OK to use GridLogMessage etc from here on
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
std::cout << GridLogMessage << "================================================ "<<std::endl; std::cout << GridLogMessage << "================================================ "<<std::endl;
std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl; std::cout << GridLogMessage << "MPI is initialised and logging filters activated "<<std::endl;
std::cout << GridLogMessage << "================================================ "<<std::endl; std::cout << GridLogMessage << "================================================ "<<std::endl;
{
gethostname(hostname, HOST_NAME_MAX+1); gethostname(hostname, HOST_NAME_MAX+1);
time_t mytime; std::cout << GridLogMessage << "This rank is running on host "<< hostname<<std::endl;
struct tm *info;
char buffer[80];
time(&mytime);
info = localtime(&mytime);
strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", info);
std::cout << GridLogMessage << "This rank is running on host "<< hostname<<" at local time "<<buffer<<std::endl;
}
///////////////////////////////////////////////////////// /////////////////////////////////////////////////////////
// Reporting // Reporting
@@ -478,47 +421,6 @@ void Grid_init(int *argc,char ***argv)
MemoryProfiler::stats = &dbgMemStats; MemoryProfiler::stats = &dbgMemStats;
} }
/////////////////////////////////////////////////////////
// LD.so space
/////////////////////////////////////////////////////////
#ifndef __APPLE__
{
// Provides mapping of .so files
FILE *f = fopen("/proc/self/maps", "r");
if (f) {
char line[256];
while (fgets(line, sizeof(line), f)) {
if (strstr(line, "r-xp")) {
dlRegion region;
uint32_t major, minor, inode;
uint64_t start,end,offset;
char path[PATH_MAX];
sscanf(line,"%lx-%lx r-xp %lx %x:%x %d %s",
&start,&end,&offset,
&major,&minor,&inode,path);
region.start=start;
region.end =end;
region.offset=offset;
region.name = std::string(path);
region.size = region.end-region.start;
dlMap.push_back(region);
// std::cout << GridLogMessage<< line;
}
}
fclose(f);
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--dylib-map") ){
std::cout << GridLogMessage << "================================================ "<<std::endl;
std::cout << GridLogMessage<< " Dynamic library map: " <<std::endl;
std::cout << GridLogMessage << "================================================ "<<std::endl;
for(int r=0;r<dlMap.size();r++){
auto region = dlMap[r];
std::cout << GridLogMessage<<" "<<region.name<<std::hex<<region.start<<"-"<<region.end<<" sz "<<region.size<<std::dec<<std::endl;
}
std::cout << GridLogMessage << "================================================ "<<std::endl;
}
}
#endif
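The /proc/self/maps scan above records every executable (r-xp) mapping so that a raw instruction pointer can later be reported as library plus offset in a signal backtrace. A standalone, Linux-only sketch of the same scan (not Grid code):

    // Sketch only: collect executable mappings of the current process.
    #include <cstdio>
    #include <cstring>
    #include <string>
    #include <vector>

    struct Region { unsigned long start, end, offset; std::string name; };

    int main() {
      std::vector<Region> map;
      if (FILE *f = std::fopen("/proc/self/maps", "r")) {
        char line[512];
        while (std::fgets(line, sizeof(line), f)) {
          if (!std::strstr(line, "r-xp")) continue;       // executable segments only
          Region r; unsigned maj, min; unsigned long inode; char path[4096] = "";
          std::sscanf(line, "%lx-%lx r-xp %lx %x:%x %lu %4095s",
                      &r.start, &r.end, &r.offset, &maj, &min, &inode, path);
          r.name = path;
          map.push_back(r);
        }
        std::fclose(f);
      }
      for (auto &r : map)
        std::printf("%s %lx-%lx\n", r.name.c_str(), r.start, r.end);
    }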
//////////////////////////////////// ////////////////////////////////////
// Logging // Logging
//////////////////////////////////// ////////////////////////////////////
@@ -551,19 +453,14 @@ void Grid_init(int *argc,char ***argv)
std::cout<<GridLogMessage<<" --shm-hugepages : use explicit huge pages in mmap call "<<std::endl; std::cout<<GridLogMessage<<" --shm-hugepages : use explicit huge pages in mmap call "<<std::endl;
std::cout<<GridLogMessage<<" --device-mem M : Size of device software cache for lattice fields (MB) "<<std::endl; std::cout<<GridLogMessage<<" --device-mem M : Size of device software cache for lattice fields (MB) "<<std::endl;
std::cout<<GridLogMessage<<std::endl; std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"Verbose:"<<std::endl; std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
std::cout<<GridLogMessage<<std::endl; std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<" --log list : comma separated list from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl; std::cout<<GridLogMessage<<" --log list : comma separated list from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
std::cout<<GridLogMessage<<" --notimestamp : suppress millisecond resolution stamps"<<std::endl;
std::cout<<GridLogMessage<<" --decomposition : report on default omp,mpi and simd decomposition"<<std::endl; std::cout<<GridLogMessage<<" --decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
std::cout<<GridLogMessage<<"Debug:"<<std::endl; std::cout<<GridLogMessage<<" --debug-signals : catch sigsegv and print a blame report"<<std::endl;
std::cout<<GridLogMessage<<" --dylib-map : print dynamic library map, useful for interpreting signal backtraces "<<std::endl; std::cout<<GridLogMessage<<" --debug-stdout : print stdout from EVERY node"<<std::endl;
std::cout<<GridLogMessage<<" --heartbeat : periodic itimer wakeup (interrupts stuck system calls!) "<<std::endl;
std::cout<<GridLogMessage<<" --signal-delay n : pause for n seconds after signal handling (useful to get ALL nodes in stuck state) "<<std::endl;
std::cout<<GridLogMessage<<" --debug-stdout : print stdout from EVERY node to file Grid.stdout/err.rank "<<std::endl;
std::cout<<GridLogMessage<<" --debug-signals : catch sigsegv and print a blame report, handle SIGHUP with a backtrace to stderr"<<std::endl;
std::cout<<GridLogMessage<<" --debug-heartbeat : periodically report backtrace "<<std::endl;
std::cout<<GridLogMessage<<" --debug-mem : print Grid allocator activity"<<std::endl; std::cout<<GridLogMessage<<" --debug-mem : print Grid allocator activity"<<std::endl;
std::cout<<GridLogMessage<<" --notimestamp : suppress millisecond resolution stamps"<<std::endl;
std::cout<<GridLogMessage<<std::endl; std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"Performance:"<<std::endl; std::cout<<GridLogMessage<<"Performance:"<<std::endl;
std::cout<<GridLogMessage<<std::endl; std::cout<<GridLogMessage<<std::endl;
@@ -658,56 +555,17 @@ void GridLogLayout() {
} }
void * Grid_backtrace_buffer[_NBACKTRACE]; void * Grid_backtrace_buffer[_NBACKTRACE];
#define SIGLOG(A) ::write(fileno_stderr,A,strlen(A));
void sig_print_dig(uint32_t dig) void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
{ {
const char *digits[] = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "a", "b", "c", "d", "e", "f" }; fprintf(stderr,"Signal handler on host %s\n",hostname);
if ( dig>=0 && dig< 16){ fprintf(stderr,"FlightRecorder step %d stage %s \n",
SIGLOG(digits[dig]); FlightRecorder::StepLoggingCounter,
} FlightRecorder::StepName);
} fprintf(stderr,"Caught signal %d\n",si->si_signo);
void sig_print_uint(uint32_t A) fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
{ fprintf(stderr," code %d\n",si->si_code);
int dig; // x86 64bit
int nz=0;
#define DIGIT(DIV) dig = (A/DIV)%10 ; if(dig|nz) sig_print_dig(dig); nz = nz|dig;
DIGIT(1000000000); // Catches 4BN = 2^32
DIGIT(100000000);
DIGIT(10000000);
DIGIT(1000000);
DIGIT(100000);
DIGIT(10000);
DIGIT(1000);
DIGIT(100);
DIGIT(10);
DIGIT(1);
if (nz==0) SIGLOG("0");
}
void sig_print_hex(uint64_t A)
{
int nz=0;
int dig;
#define NIBBLE(A) dig = A ; if(dig|nz) sig_print_dig(dig); nz = nz|dig;
SIGLOG("0x");
NIBBLE((A>>(15*4))&0xF);
NIBBLE((A>>(14*4))&0xF);
NIBBLE((A>>(13*4))&0xF);
NIBBLE((A>>(12*4))&0xF);
NIBBLE((A>>(11*4))&0xF);
NIBBLE((A>>(10*4))&0xF);
NIBBLE((A>>(9*4))&0xF);
NIBBLE((A>>(8*4))&0xF);
NIBBLE((A>>(7*4))&0xF);
NIBBLE((A>>(6*4))&0xF);
NIBBLE((A>>(5*4))&0xF);
NIBBLE((A>>(4*4))&0xF);
NIBBLE((A>>(3*4))&0xF);
NIBBLE((A>>(2*4))&0xF);
NIBBLE((A>>4)&0xF);
sig_print_dig(A&0xF);
}
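The hand-rolled sig_print_uint/sig_print_hex helpers above exist because a signal handler may only rely on async-signal-safe calls such as write(2); printf and std::cout are off limits there. A standalone sketch of the same write()-only formatting (not Grid code):

    // Sketch only: format a hex value without stdio, suitable for a signal handler.
    #include <unistd.h>
    #include <cstdint>
    #include <cstring>

    static void sig_write(const char *s) {
      ssize_t r = write(STDERR_FILENO, s, strlen(s)); (void)r;
    }

    static void sig_write_hex(uint64_t v) {
      const char *digits = "0123456789abcdef";
      char buf[2 + 16 + 1] = "0x";
      int n = 2;
      bool nz = false;
      for (int shift = 60; shift >= 0; shift -= 4) {
        int d = (v >> shift) & 0xF;
        if (d || nz || shift == 0) { buf[n++] = digits[d]; nz = true; }  // skip leading zeroes
      }
      buf[n] = 0;
      sig_write(buf);
    }

    int main() {                   // demo outside a handler; the point is the calls used
      sig_write("address ");
      sig_write_hex(0xdeadbeef);
      sig_write("\n");
    }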
/*
#ifdef __linux__ #ifdef __linux__
#ifdef __x86_64__ #ifdef __x86_64__
ucontext_t * uc= (ucontext_t *)ptr; ucontext_t * uc= (ucontext_t *)ptr;
@@ -715,158 +573,81 @@ void sig_print_hex(uint64_t A)
fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip); fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
#endif #endif
#endif #endif
*/ fflush(stderr);
void Grid_generic_handler(int sig,siginfo_t *si,void * ptr) BACKTRACEFP(stderr);
{ fprintf(stderr,"Called backtrace\n");
SIGLOG("Signal handler on host "); fflush(stdout);
SIGLOG(hostname); fflush(stderr);
SIGLOG(" process id ");
sig_print_uint((uint32_t)getpid());
SIGLOG("\n");
SIGLOG("FlightRecorder step ");
sig_print_uint(FlightRecorder::StepLoggingCounter);
SIGLOG(" stage ");
SIGLOG(FlightRecorder::StepName);
SIGLOG("\n");
SIGLOG("Caught signal ");
sig_print_uint(si->si_signo);
SIGLOG("\n");
SIGLOG(" mem address ");
sig_print_hex((uint64_t)si->si_addr);
SIGLOG("\n");
SIGLOG(" code ");
sig_print_uint(si->si_code);
SIGLOG("\n");
ucontext_t *uc= (ucontext_t *)ptr;
SIGLOG("Backtrace:\n");
#ifdef HAVE_UNWIND
// Debug cross check on offsets
// int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);
// backtrace_symbols_fd(Grid_backtrace_buffer,symbols,fileno_stderr);
unw_cursor_t cursor;
unw_word_t ip, off;
if (!unw_init_local(&cursor, uc) ) {
SIGLOG(" frame IP function\n");
int level = 0;
int ret = 0;
while(1) {
char name[128];
if (level >= _NBACKTRACE) return;
unw_get_reg(&cursor, UNW_REG_IP, &ip);
sig_print_uint(level); SIGLOG(" ");
sig_print_hex(ip); SIGLOG(" ");
for(int r=0;r<dlMap.size();r++){
if((ip>=dlMap[r].start) &&(ip<dlMap[r].end)){
SIGLOG(dlMap[r].name.c_str());
SIGLOG("+");
sig_print_hex((ip-dlMap[r].start));
break;
}
}
SIGLOG("\n");
Grid_backtrace_buffer[level]=(void *)ip;
level++;
ret = unw_step(&cursor);
if (ret <= 0) {
return;
}
}
}
#else
  // Known async-signal-unsafe
int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);
backtrace_symbols_fd(Grid_backtrace_buffer,symbols,fileno_stderr);
#endif
}
void Grid_heartbeat_signal_handler(int sig,siginfo_t *si,void * ptr)
{
Grid_generic_handler(sig,si,ptr);
SIGLOG("\n");
}
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
{
Grid_generic_handler(sig,si,ptr);
if (signal_delay) {
SIGLOG("Adding extra signal delay ");
sig_print_uint(signal_delay);
SIGLOG(" s\n");
usleep( (uint64_t) signal_delay*1000LL*1000LL);
}
SIGLOG("\n");
return; return;
} }
void Grid_fatal_signal_handler(int sig,siginfo_t *si,void * ptr) void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
{ {
Grid_generic_handler(sig,si,ptr); fprintf(stderr,"Signal handler on host %s\n",hostname);
SIGLOG("\n"); fprintf(stderr,"Caught signal %d\n",si->si_signo);
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
fprintf(stderr," code %d\n",si->si_code);
// Linux/Posix
#ifdef __linux__
// And x86 64bit
#ifdef __x86_64__
ucontext_t * uc= (ucontext_t *)ptr;
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
#define REG(A) fprintf(stderr," %s %lx\n",#A,sc-> A);
REG(rdi);
REG(rsi);
REG(rbp);
REG(rbx);
REG(rdx);
REG(rax);
REG(rcx);
REG(rsp);
REG(rip);
REG(r8);
REG(r9);
REG(r10);
REG(r11);
REG(r12);
REG(r13);
REG(r14);
REG(r15);
#endif
#endif
fflush(stderr);
BACKTRACEFP(stderr);
fprintf(stderr,"Called backtrace\n");
fflush(stdout);
fflush(stderr);
exit(0); exit(0);
return; return;
}; };
void Grid_empty_signal_handler(int sig,siginfo_t *si,void * ptr)
{
// SIGLOG("heartbeat signal handled\n");
return;
}
void Grid_debug_heartbeat(void)
{
struct sigaction sa_ping;
sigemptyset (&sa_ping.sa_mask);
sa_ping.sa_sigaction= Grid_usr_signal_handler;
sa_ping.sa_flags = SA_SIGINFO;
sigaction(SIGALRM,&sa_ping,NULL);
// repeating 10s heartbeat
struct itimerval it_val;
it_val.it_value.tv_sec = 10;
it_val.it_value.tv_usec = 0;
it_val.it_interval = it_val.it_value;
setitimer(ITIMER_REAL, &it_val, NULL);
}
void Grid_heartbeat(void)
{
struct sigaction sa_ping;
sigemptyset (&sa_ping.sa_mask);
sa_ping.sa_sigaction= Grid_empty_signal_handler;
sa_ping.sa_flags = SA_SIGINFO;
sigaction(SIGALRM,&sa_ping,NULL);
// repeating 10s heartbeat
struct itimerval it_val;
it_val.it_value.tv_sec = 10;
it_val.it_value.tv_usec = 1000;
it_val.it_interval = it_val.it_value;
setitimer(ITIMER_REAL, &it_val, NULL);
}
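// A minimal standalone sketch (POSIX assumed, handler body illustrative) of
// the heartbeat wiring above: ITIMER_REAL delivers SIGALRM at the chosen
// interval, and because the handler is installed without SA_RESTART the
// delivery alone interrupts blocking system calls, which is what --heartbeat
// relies on to wake up stuck nodes.
#include <signal.h>
#include <sys/time.h>
#include <unistd.h>

static void on_alarm(int, siginfo_t *, void *) { /* async-signal-safe work only */ }

int main() {
  struct sigaction sa = {};
  sigemptyset(&sa.sa_mask);
  sa.sa_sigaction = on_alarm;
  sa.sa_flags     = SA_SIGINFO;
  sigaction(SIGALRM, &sa, NULL);

  struct itimerval it;
  it.it_value.tv_sec  = 10;                 // first tick after 10 s
  it.it_value.tv_usec = 0;
  it.it_interval      = it.it_value;        // then every 10 s
  setitimer(ITIMER_REAL, &it, NULL);

  for (;;) pause();                         // returns (EINTR) on every tick
}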
void Grid_exit_handler(void) void Grid_exit_handler(void)
{ {
BACKTRACEFP(stdout); // BACKTRACEFP(stdout);
fflush(stdout); // fflush(stdout);
} }
void Grid_debug_handler_init(void) void Grid_debug_handler_init(void)
{ {
struct sigaction sa; struct sigaction sa;
sigemptyset (&sa.sa_mask); sigemptyset (&sa.sa_mask);
sa.sa_sigaction= Grid_fatal_signal_handler; sa.sa_sigaction= Grid_sa_signal_handler;
sa.sa_flags = SA_SIGINFO; sa.sa_flags = SA_SIGINFO;
// sigaction(SIGSEGV,&sa,NULL);
sigaction(SIGTRAP,&sa,NULL); sigaction(SIGTRAP,&sa,NULL);
sigaction(SIGILL,&sa,NULL);
#ifndef GRID_SYCL
sigaction(SIGSEGV,&sa,NULL); // SYCL is using SIGSEGV
sigaction(SIGBUS,&sa,NULL); sigaction(SIGBUS,&sa,NULL);
feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO); // sigaction(SIGUSR2,&sa,NULL);
sigaction(SIGFPE,&sa,NULL);
#endif
// Non terminating SIGHUP handler feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
sigaction(SIGFPE,&sa,NULL);
sigaction(SIGKILL,&sa,NULL);
sigaction(SIGILL,&sa,NULL);
// Non terminating SIGUSR1/2 handler
struct sigaction sa_ping; struct sigaction sa_ping;
sigemptyset (&sa_ping.sa_mask); sigemptyset (&sa_ping.sa_mask);
sa_ping.sa_sigaction= Grid_usr_signal_handler; sa_ping.sa_sigaction= Grid_usr_signal_handler;
@@ -38,11 +38,7 @@ char * GridHostname(void);
// internal, controled with --handle // internal, controled with --handle
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr); void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr);
void Grid_empty_signal_handler(int sig,siginfo_t *si,void * ptr);
void Grid_debug_handler_init(void); void Grid_debug_handler_init(void);
void Grid_debug_heartbeat(void);
void Grid_heartbeat(void);
void Grid_quiesce_nodes(void); void Grid_quiesce_nodes(void);
void Grid_unquiesce_nodes(void); void Grid_unquiesce_nodes(void);
@@ -66,7 +66,6 @@ namespace Grid{
}; };
} }
template <class T> void writeFile(T& in, std::string const fname){ template <class T> void writeFile(T& in, std::string const fname){
#ifdef HAVE_LIME #ifdef HAVE_LIME
// Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111 // Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
@@ -74,7 +73,7 @@ template <class T> void writeFile(T& in, std::string const fname){
Grid::emptyUserRecord record; Grid::emptyUserRecord record;
Grid::ScidacWriter WR(in.Grid()->IsBoss()); Grid::ScidacWriter WR(in.Grid()->IsBoss());
WR.open(fname); WR.open(fname);
WR.writeScidacFieldRecord(in,record,0); // Lexico WR.writeScidacFieldRecord(in,record,0);
WR.close(); WR.close();
#endif #endif
// What is the appropriate way to throw error? // What is the appropriate way to throw error?
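// Hypothetical usage of the writeFile helper above (field type, grid pointer
// and filename are placeholders); it only does anything when Grid is built
// with Lime/SciDAC support:
//   LatticeComplexD qfield(UGrid);
//   writeFile(qfield, std::string("Top_dnsty_10.ensemble"));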
@@ -108,18 +107,8 @@ int main(int argc, char **argv) {
for (int conf = CPar.StartConfiguration; conf <= CPar.EndConfiguration; conf+= CPar.Skip){ for (int conf = CPar.StartConfiguration; conf <= CPar.EndConfiguration; conf+= CPar.Skip){
#if 0
CPNersc.CheckpointRestore(conf, Umu, sRNG, pRNG); CPNersc.CheckpointRestore(conf, Umu, sRNG, pRNG);
#else
// Don't require Grid format RNGs
FieldMetaData header;
std::string file, filesmr;
file = CPar.conf_path + "/" + CPar.conf_prefix + "." + std::to_string(conf);
filesmr = CPar.conf_path + "/" + CPar.conf_smr_prefix + "." + std::to_string(conf);
NerscIO::readConfiguration(Umu,header,file);
#endif
std::cout << std::setprecision(15); std::cout << std::setprecision(15);
std::cout << GridLogMessage << "Initial plaquette: "<< WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu) << std::endl; std::cout << GridLogMessage << "Initial plaquette: "<< WilsonLoops<PeriodicGimplR>::avgPlaquette(Umu) << std::endl;
@@ -127,7 +116,6 @@ int main(int argc, char **argv) {
std::string file_post = CPar.conf_prefix + "." + std::to_string(conf); std::string file_post = CPar.conf_prefix + "." + std::to_string(conf);
WilsonFlow<PeriodicGimplR> WF(WFPar.step_size,WFPar.steps,WFPar.meas_interval); WilsonFlow<PeriodicGimplR> WF(WFPar.step_size,WFPar.steps,WFPar.meas_interval);
WF.addMeasurement(WFPar.meas_interval_density, [&file_pre,&file_post,&conf](int step, RealD t, const typename PeriodicGimplR::GaugeField &U){ WF.addMeasurement(WFPar.meas_interval_density, [&file_pre,&file_post,&conf](int step, RealD t, const typename PeriodicGimplR::GaugeField &U){
typedef typename PeriodicGimplR::GaugeLinkField GaugeMat; typedef typename PeriodicGimplR::GaugeLinkField GaugeMat;
@@ -177,48 +165,33 @@ int main(int argc, char **argv) {
//double coeff = 2.0 / (1.0 * Nd * (Nd - 1)) / 3.0; //double coeff = 2.0 / (1.0 * Nd * (Nd - 1)) / 3.0;
//Plq = coeff * Plq; //Plq = coeff * Plq;
RealD WFlow_TC5Li = WilsonLoops<PeriodicGimplR>::TopologicalCharge5Li(U);
int tau = std::round(t); int tau = std::round(t);
std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post; std::string efile = file_pre + "E_dnsty_" + std::to_string(tau) + "_" + file_post;
// writeFile(R,efile); writeFile(R,efile);
std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post; std::string tfile = file_pre + "Top_dnsty_" + std::to_string(tau) + "_" + file_post;
// writeFile(qfield,tfile); writeFile(qfield,tfile);
std::string ufile = file_pre + "U_" + std::to_string(tau) + "_" + file_post;
{
// PeriodicGimplR::GaugeField Ucopy = U;
// NerscIO::writeConfiguration(Ucopy,ufile);
}
RealD E = real(sum(R))/ RealD(U.Grid()->gSites()); RealD E = real(sum(R))/ RealD(U.Grid()->gSites());
RealD T = real( sum(qfield) ); RealD T = real( sum(qfield) );
Coordinate scoor; for (int mu=0; mu < Nd; mu++) scoor[mu] = 0; Coordinate scoor; for (int mu=0; mu < Nd; mu++) scoor[mu] = 0;
RealD E0 = real(peekSite(R,scoor)); RealD E0 = real(peekSite(R,scoor));
RealD T0 = real(peekSite(qfield,scoor)); RealD T0 = real(peekSite(qfield,scoor));
std::cout << GridLogMessage << "[WilsonFlow] Saved energy density (clover) & topo. charge density: " << conf << " " << step << " " << tau << " " std::cout << GridLogMessage << "[WilsonFlow] Saved energy density (clover) & topo. charge density: " << conf << " " << step << " " << tau << " "
<< "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << " Q5Li "<< WFlow_TC5Li << std::endl; << "(E_avg,T_sum) " << E << " " << T << " (E, T at origin) " << E0 << " " << T0 << std::endl;
}); });
int t=WFPar.maxTau; int t=WFPar.maxTau;
WF.smear(Uflow, Umu); WF.smear(Uflow, Umu);
// NerscIO::writeConfiguration(Uflow,filesmr);
RealD WFlow_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Uflow); RealD WFlow_plaq = WilsonLoops<PeriodicGimplR>::avgPlaquette(Uflow);
RealD WFlow_TC = WilsonLoops<PeriodicGimplR>::TopologicalCharge(Uflow); RealD WFlow_TC = WilsonLoops<PeriodicGimplR>::TopologicalCharge(Uflow);
RealD WFlow_TC5Li = WilsonLoops<PeriodicGimplR>::TopologicalCharge5Li(Uflow);
RealD WFlow_T0 = WF.energyDensityPlaquette(t,Uflow); // t RealD WFlow_T0 = WF.energyDensityPlaquette(t,Uflow); // t
RealD WFlow_EC = WF.energyDensityCloverleaf(t,Uflow); RealD WFlow_EC = WF.energyDensityCloverleaf(t,Uflow);
std::cout << GridLogMessage << "Plaquette "<< conf << " " << WFlow_plaq << std::endl; std::cout << GridLogMessage << "Plaquette "<< conf << " " << WFlow_plaq << std::endl;
std::cout << GridLogMessage << "T0 "<< conf << " " << WFlow_T0 << std::endl; std::cout << GridLogMessage << "T0 "<< conf << " " << WFlow_T0 << std::endl;
std::cout << GridLogMessage << "TC0 "<< conf << " " << WFlow_EC << std::endl; std::cout << GridLogMessage << "TC0 "<< conf << " " << WFlow_EC << std::endl;
std::cout << GridLogMessage << "TopologicalCharge "<< conf << " " << WFlow_TC << std::endl; std::cout << GridLogMessage << "TopologicalCharge "<< conf << " " << WFlow_TC << std::endl;
std::cout << GridLogMessage << "TopologicalCharge5Li "<< conf << " " << WFlow_TC5Li<< std::endl;
std::cout<< GridLogMessage << " Admissibility check:\n"; std::cout<< GridLogMessage << " Admissibility check:\n";
const double sp_adm = 0.067; // admissible threshold const double sp_adm = 0.067; // admissible threshold
@@ -201,7 +201,8 @@ int main(int argc, char **argv) {
Params.dirichlet=NonDirichlet; Params.dirichlet=NonDirichlet;
ParamsDir.dirichlet=Dirichlet; ParamsDir.dirichlet=Dirichlet;
// ParamsDir.partialDirichlet=0; ParamsDir.partialDirichlet=0;
std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
// double StoppingCondition = 1e-14; // double StoppingCondition = 1e-14;
// double MDStoppingCondition = 1e-9; // double MDStoppingCondition = 1e-9;
@@ -297,11 +298,11 @@ int main(int argc, char **argv) {
if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet; if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
else ParamsDen.dirichlet = NonDirichlet; else ParamsDen.dirichlet = NonDirichlet;
// if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1; if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
// else ParamsNum.partialDirichlet = 0; else ParamsNum.partialDirichlet = 0;
// if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1; if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
// else ParamsDen.partialDirichlet = 0; else ParamsDen.partialDirichlet = 0;
Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum)); Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen)); Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
@@ -333,9 +333,9 @@ int main(int argc, char **argv) {
ParamsF.dirichlet=NonDirichlet; ParamsF.dirichlet=NonDirichlet;
ParamsDir.dirichlet=Dirichlet; ParamsDir.dirichlet=Dirichlet;
ParamsDirF.dirichlet=Dirichlet; ParamsDirF.dirichlet=Dirichlet;
// ParamsDir.partialDirichlet=1; ParamsDir.partialDirichlet=1;
// ParamsDirF.partialDirichlet=1; ParamsDirF.partialDirichlet=1;
// std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl; std::cout << GridLogMessage<< "Partial Dirichlet depth is "<<dwf_compressor_depth<<std::endl;
// double StoppingCondition = 1e-14; // double StoppingCondition = 1e-14;
// double MDStoppingCondition = 1e-9; // double MDStoppingCondition = 1e-9;
@@ -481,21 +481,21 @@ int main(int argc, char **argv) {
if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet; if ( dirichlet_den[h]==1) ParamsDen.dirichlet = Dirichlet;
else ParamsDen.dirichlet = NonDirichlet; else ParamsDen.dirichlet = NonDirichlet;
// if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1; if ( dirichlet_num[h]==1) ParamsNum.partialDirichlet = 1;
// else ParamsNum.partialDirichlet = 0; else ParamsNum.partialDirichlet = 0;
// if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1; if ( dirichlet_den[h]==1) ParamsDen.partialDirichlet = 1;
// else ParamsDen.partialDirichlet = 0; else ParamsDen.partialDirichlet = 0;
Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum)); Numerators.push_back (new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_num[h],M5,b,c, ParamsNum));
Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen)); Denominators.push_back(new FermionAction(U,*FGrid,*FrbGrid,*GridPtr,*GridRBPtr,light_den[h],M5,b,c, ParamsDen));
ParamsDenF.dirichlet = ParamsDen.dirichlet; ParamsDenF.dirichlet = ParamsDen.dirichlet;
// ParamsDenF.partialDirichlet = ParamsDen.partialDirichlet; ParamsDenF.partialDirichlet = ParamsDen.partialDirichlet;
DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenF)); DenominatorsF.push_back(new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_den[h],M5,b,c, ParamsDenF));
ParamsNumF.dirichlet = ParamsNum.dirichlet; ParamsNumF.dirichlet = ParamsNum.dirichlet;
// ParamsNumF.partialDirichlet = ParamsNum.partialDirichlet; ParamsNumF.partialDirichlet = ParamsNum.partialDirichlet;
NumeratorsF.push_back (new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumF)); NumeratorsF.push_back (new FermionActionF(UF,*FGridF,*FrbGridF,*GridPtrF,*GridRBPtrF,light_num[h],M5,b,c, ParamsNumF));
LinOpD.push_back(new LinearOperatorD(*Denominators[h])); LinOpD.push_back(new LinearOperatorD(*Denominators[h]));
@@ -166,18 +166,18 @@ int main (int argc, char ** argv)
} }
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl; std::cout<<GridLogMessage << "= Benchmarking concurrent STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl; std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
header(); header();
for(int lat=8;lat<=maxlat;lat+=4){ for(int lat=8;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=8;Ls*=2){ for(int Ls=8;Ls<=8;Ls*=2){
Coordinate latt_size ({lat*mpi_layout[0], Coordinate latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1], lat*mpi_layout[1],
lat*mpi_layout[2], lat*mpi_layout[2],
lat*mpi_layout[3]}); lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout); GridCartesian Grid(latt_size,simd_layout,mpi_layout);
RealD Nrank = Grid._Nprocessors; RealD Nrank = Grid._Nprocessors;
@@ -193,6 +193,101 @@ int main (int argc, char ** argv)
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes); rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
} }
int ncomm;
double dbytes;
for(int i=0;i<Nloop;i++){
double start=usecond();
dbytes=0;
ncomm=0;
std::vector<CommsRequest_t> requests;
for(int mu=0;mu<4;mu++){
if (mpi_layout[mu]>1 ) {
ncomm++;
int comm_proc=1;
int xmit_to_rank;
int recv_from_rank;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu][0],
xmit_to_rank,1,
(void *)&rbuf[mu][0],
recv_from_rank,1,
bytes,bytes,mu);
comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu+4][0],
xmit_to_rank,1,
(void *)&rbuf[mu+4][0],
recv_from_rank,1,
bytes,bytes,mu+4);
}
}
Grid.StencilSendToRecvFromComplete(requests,0);
Grid.Barrier();
double stop=usecond();
t_time[i] = stop-start; // microseconds
}
timestat.statistics(t_time);
dbytes=dbytes*ppn;
double xbytes = dbytes*0.5;
// double rbytes = dbytes*0.5;
double bidibytes = dbytes;
std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
<<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
<<std::right<< xbytes/timestat.mean<<" "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
<<xbytes/timestat.max <<" "<< xbytes/timestat.min
<< "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
<< bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
}
}
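// Rough scale of the messages timed above, assuming HalfSpinColourVectorD is
// 2 spin x 3 colour x complex<double> = 96 B per site: lat=8, Ls=8 gives
// 8*8*8*8*96 B ~ 384 KiB per buffer, growing to ~24 MiB at lat=32. t_time is
// in microseconds, so the printed xbytes/mean ratio is bytes/us, i.e. MB/s.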
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
std::cout<<GridLogMessage << "= Benchmarking sequential STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
header();
for(int lat=8;lat<=maxlat;lat+=4){
for(int Ls=8;Ls<=8;Ls*=2){
Coordinate latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
lat*mpi_layout[2],
lat*mpi_layout[3]});
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
RealD Nrank = Grid._Nprocessors;
RealD Nnode = Grid.NodeCount();
RealD ppn = Nrank/Nnode;
std::vector<HalfSpinColourVectorD *> xbuf(8);
std::vector<HalfSpinColourVectorD *> rbuf(8);
Grid.ShmBufferFreeAll();
uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
for(int d=0;d<8;d++){
xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(bytes);
}
int ncomm; int ncomm;
double dbytes; double dbytes;
for(int i=0;i<Nloop;i++){ for(int i=0;i<Nloop;i++){
@@ -201,35 +296,45 @@ int main (int argc, char ** argv)
std::vector<CommsRequest_t> requests; std::vector<CommsRequest_t> requests;
dbytes=0; dbytes=0;
ncomm=0; ncomm=0;
for(int mu=0;mu<4;mu++){
for(int dir=0;dir<8;dir++) {
double tbytes;
int mu =dir % 4;
if (mpi_layout[mu]>1 ) { if (mpi_layout[mu]>1 ) {
ncomm++; ncomm++;
int comm_proc=1;
int xmit_to_rank; int xmit_to_rank;
int recv_from_rank; int recv_from_rank;
if ( dir == mu ) {
int comm_proc=1; Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); dbytes+=
} else { Grid.StencilSendToRecvFromBegin(requests,
int comm_proc = mpi_layout[mu]-1; (void *)&xbuf[mu][0],
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank); xmit_to_rank,1,
} (void *)&rbuf[mu][0],
// int tid = omp_get_thread_num(); recv_from_rank,1,
int tid = 0; bytes,bytes,mu);
tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,1, Grid.StencilSendToRecvFromComplete(requests,mu);
(void *)&rbuf[dir][0], recv_from_rank,1, bytes,tid); requests.resize(0);
dbytes+=tbytes; comm_proc = mpi_layout[mu]-1;
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
dbytes+=
Grid.StencilSendToRecvFromBegin(requests,
(void *)&xbuf[mu+4][0],
xmit_to_rank,1,
(void *)&rbuf[mu+4][0],
recv_from_rank,1,
bytes,bytes,mu+4);
Grid.StencilSendToRecvFromComplete(requests,mu+4);
requests.resize(0);
} }
} }
Grid.Barrier(); Grid.Barrier();
double stop=usecond(); double stop=usecond();
t_time[i] = stop-start; // microseconds t_time[i] = stop-start; // microseconds
} }
timestat.statistics(t_time); timestat.statistics(t_time);
@@ -32,18 +32,18 @@
using namespace std; using namespace std;
using namespace Grid; using namespace Grid;
//////////////////////// template<class d>
/// Move to domains //// struct scal {
//////////////////////// d internal;
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
}; };
void Benchmark(int Ls, Coordinate Dirichlet,bool Sloppy); Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
int main (int argc, char ** argv) int main (int argc, char ** argv)
{ {
@@ -52,108 +52,39 @@ int main (int argc, char ** argv)
int threads = GridThread::GetThreads(); int threads = GridThread::GetThreads();
int Ls=16; Coordinate latt4 = GridDefaultLatt();
for(int i=0;i<argc;i++) { int Ls=8;
for(int i=0;i<argc;i++)
if(std::string(argv[i]) == "-Ls"){ if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls; std::stringstream ss(argv[i+1]); ss >> Ls;
} }
}
//////////////////
// With comms
//////////////////
Coordinate Dirichlet(Nd+1,0);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,false);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,true);
//////////////////
// Domain decomposed
//////////////////
/*
Coordinate latt4 = GridDefaultLatt();
Coordinate mpi = GridDefaultMpi();
Coordinate CommDim(Nd);
Coordinate shm;
GlobalSharedMemory::GetShmDims(mpi,shm);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
// std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
Benchmark(Ls,Dirichlet,false);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
Benchmark(Ls,Dirichlet,true);
*/
Grid_finalize();
exit(0);
}
void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
{
Coordinate latt4 = GridDefaultLatt();
GridLogLayout(); GridLogLayout();
long unsigned int single_site_flops = 8*Nc*(7+16*Nc); long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8}); GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
#undef SINGLE
#ifdef SINGLE
typedef vComplexF Simd;
typedef LatticeFermionF FermionField;
typedef LatticeGaugeFieldF GaugeField;
typedef LatticeColourMatrixF ColourMatrixField;
typedef DomainWallFermionF FermionAction;
#else
typedef vComplexD Simd;
typedef LatticeFermionD FermionField;
typedef LatticeGaugeFieldD GaugeField;
typedef LatticeColourMatrixD ColourMatrixField;
typedef DomainWallFermionD FermionAction;
#endif
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid); GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid); GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl; std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG")); GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl; std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG")); GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
LatticeFermion src (FGrid); random(RNG5,src);
FermionField src (FGrid); random(RNG5,src);
#if 0 #if 0
src = Zero(); src = Zero();
{ {
@@ -169,39 +100,46 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
src = src*N2; src = src*N2;
#endif #endif
FermionField result(FGrid); result=Zero();
FermionField ref(FGrid); ref=Zero(); LatticeFermion result(FGrid); result=Zero();
FermionField tmp(FGrid); LatticeFermion ref(FGrid); ref=Zero();
FermionField err(FGrid); LatticeFermion tmp(FGrid);
LatticeFermion err(FGrid);
std::cout << GridLogMessage << "Drawing gauge field" << std::endl; std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
GaugeField Umu(UGrid); LatticeGaugeField Umu(UGrid);
GaugeField UmuCopy(UGrid);
SU<Nc>::HotConfiguration(RNG4,Umu); SU<Nc>::HotConfiguration(RNG4,Umu);
// SU<Nc>::ColdConfiguration(Umu);
UmuCopy=Umu;
std::cout << GridLogMessage << "Random gauge initialised " << std::endl; std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
#if 0
Umu=1.0;
for(int mu=0;mu<Nd;mu++){
LatticeColourMatrix ttmp(UGrid);
ttmp = PeekIndex<LorentzIndex>(Umu,mu);
// if (mu !=2 ) ttmp = 0;
// ttmp = ttmp* pow(10.0,mu);
PokeIndex<LorentzIndex>(Umu,ttmp,mu);
}
std::cout << GridLogMessage << "Forced to diagonal " << std::endl;
#endif
////////////////////////////////////
// Apply BCs
////////////////////////////////////
Coordinate Block(4);
for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1];
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl;
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl;
DirichletFilter<GaugeField> Filter(Block);
Filter.applyFilter(Umu);
//////////////////////////////////// ////////////////////////////////////
// Naive wilson implementation // Naive wilson implementation
//////////////////////////////////// ////////////////////////////////////
std::vector<ColourMatrixField> U(4,UGrid); // replicate across fifth dimension
for(int mu=0;mu<Nd;mu++){ LatticeGaugeField Umu5d(FGrid);
U[mu] = PeekIndex<LorentzIndex>(Umu,mu); std::vector<LatticeColourMatrix> U(4,FGrid);
{
autoView( Umu5d_v, Umu5d, CpuWrite);
autoView( Umu_v , Umu , CpuRead);
for(int ss=0;ss<Umu.Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
Umu5d_v[Ls*ss+s] = Umu_v[ss];
}
}
}
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
} }
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl; std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
if (1) if (1)
@@ -209,28 +147,10 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
ref = Zero(); ref = Zero();
for(int mu=0;mu<Nd;mu++){ for(int mu=0;mu<Nd;mu++){
tmp = Cshift(src,mu+1,1); tmp = U[mu]*Cshift(src,mu+1,1);
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
}
}
}
ref=ref + tmp - Gamma(Gmu[mu])*tmp; ref=ref + tmp - Gamma(Gmu[mu])*tmp;
{ tmp =adj(U[mu])*src;
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
tmp =Cshift(tmp,mu+1,-1); tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp + Gamma(Gmu[mu])*tmp; ref=ref + tmp + Gamma(Gmu[mu])*tmp;
} }
@@ -247,9 +167,11 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl; std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl; std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionD::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl; std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(Simd)<< " B"<<std::endl; std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(vComplex)<< " B"<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
#ifdef GRID_OMP #ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl; if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl; if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
@@ -259,15 +181,9 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl; if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl; std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
FermionAction::ImplParams p; DomainWallFermionD Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
p.dirichlet=Dirichlet; int ncall =1000;
FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
Dw.SloppyComms(sloppy);
Dw.ImportGauge(Umu);
int ncall =300;
RealD n2e;
if (1) { if (1) {
FGrid->Barrier(); FGrid->Barrier();
Dw.Dhop(src,result,0); Dw.Dhop(src,result,0);
@@ -282,8 +198,8 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu]; double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=single_site_flops*volume*ncall; double flops=single_site_flops*volume*ncall;
auto nsimd = Simd::Nsimd(); auto nsimd = vComplex::Nsimd();
auto simdwidth = sizeof(Simd); auto simdwidth = sizeof(vComplex);
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors // RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.); double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
@@ -292,27 +208,28 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.); double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl; std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl; std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl; std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl; std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
std::cout<<GridLogMessage << "RF GiB/s (base 2) = "<< 1000000. * data_rf/((t1-t0))<<std::endl;
std::cout<<GridLogMessage << "mem GiB/s (base 2) = "<< 1000000. * data_mem/((t1-t0))<<std::endl;
err = ref-result; err = ref-result;
n2e = norm2(err); std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl; //exit(0);
if(( n2e>1.0e-4) ) { if(( norm2(err)>1.0e-4) ) {
/*
std::cout << "RESULT\n " << result<<std::endl;
std::cout << "REF \n " << ref <<std::endl;
std::cout << "ERR \n " << err <<std::endl;
*/
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl; std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
FGrid->Barrier(); FGrid->Barrier();
std::cout<<GridLogMessage << "RESULT" << std::endl;
// std::cout << result<<std::endl;
std::cout << norm2(result)<<std::endl;
std::cout<<GridLogMessage << "REF" << std::endl;
std::cout << norm2(ref)<<std::endl;
std::cout<<GridLogMessage << "ERR" << std::endl;
std::cout << norm2(err)<<std::endl;
FGrid->Barrier();
exit(-1); exit(-1);
} }
assert (n2e< 1.0e-4 ); assert (norm2(err)< 1.0e-4 );
} }
if (1) if (1)
@@ -321,30 +238,16 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
for(int mu=0;mu<Nd;mu++){ for(int mu=0;mu<Nd;mu++){
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x // ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
tmp = Cshift(src,mu+1,1); tmp = U[mu]*Cshift(src,mu+1,1);
{ {
autoView( ref_v, ref, CpuWrite); autoView( ref_v, ref, CpuWrite);
autoView( tmp_v, tmp, CpuRead); autoView( tmp_v, tmp, CpuRead);
autoView( U_v , U[mu] , CpuRead); for(int i=0;i<ref_v.size();i++){
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){ ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
for(int s=0;s<Ls;s++){
int i=s+Ls*ss;
ref_v[i]+= U_v[ss]*(tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]); ;
}
} }
} }
{ tmp =adj(U[mu])*src;
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
// tmp =adj(U[mu])*src;
tmp =Cshift(tmp,mu+1,-1); tmp =Cshift(tmp,mu+1,-1);
{ {
autoView( ref_v, ref, CpuWrite); autoView( ref_v, ref, CpuWrite);
@@ -356,27 +259,27 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
} }
ref = -0.5*ref; ref = -0.5*ref;
} }
// dump=1;
Dw.Dhop(src,result,DaggerYes); Dw.Dhop(src,result,1);
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl; std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout<<GridLogMessage << "Called DwDag"<<std::endl; std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl; std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl; std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
err = ref-result; err = ref-result;
n2e= norm2(err); std::cout<<GridLogMessage << "norm dag diff "<< norm2(err)<<std::endl;
std::cout<<GridLogMessage << "norm dag diff "<< n2e<< " Line "<<__LINE__ <<std::endl; if((norm2(err)>1.0e-4)){
/*
std::cout<< "DAG RESULT\n " <<ref << std::endl;
std::cout<< "DAG sRESULT\n " <<result << std::endl;
std::cout<< "DAG ERR \n " << err <<std::endl;
*/
}
LatticeFermion src_e (FrbGrid);
LatticeFermion src_o (FrbGrid);
LatticeFermion r_e (FrbGrid);
LatticeFermion r_o (FrbGrid);
LatticeFermion r_eo (FGrid);
assert((n2e)<1.0e-4);
FermionField src_e (FrbGrid);
FermionField src_o (FrbGrid);
FermionField r_e (FrbGrid);
FermionField r_o (FrbGrid);
FermionField r_eo (FGrid);
std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl; std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
pickCheckerboard(Even,src_e,src); pickCheckerboard(Even,src_e,src);
@@ -388,8 +291,10 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
// S-direction is INNERMOST and takes no part in the parity. // S-direction is INNERMOST and takes no part in the parity.
std::cout << GridLogMessage<< "*********************************************************" <<std::endl; std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::DhopEO "<<std::endl; std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionD::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl; std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
#ifdef GRID_OMP #ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl; if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl; if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
@@ -403,7 +308,13 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
Dw.DhopEO(src_o,r_e,DaggerNo); Dw.DhopEO(src_o,r_e,DaggerNo);
double t0=usecond(); double t0=usecond();
for(int i=0;i<ncall;i++){ for(int i=0;i<ncall;i++){
#ifdef CUDA_PROFILE
if(i==10) cudaProfilerStart();
#endif
Dw.DhopEO(src_o,r_e,DaggerNo); Dw.DhopEO(src_o,r_e,DaggerNo);
#ifdef CUDA_PROFILE
if(i==20) cudaProfilerStop();
#endif
} }
double t1=usecond(); double t1=usecond();
FGrid->Barrier(); FGrid->Barrier();
@@ -427,9 +338,14 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
setCheckerboard(r_eo,r_e); setCheckerboard(r_eo,r_e);
err = r_eo-result; err = r_eo-result;
n2e= norm2(err); std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
std::cout<<GridLogMessage << "norm diff "<< n2e<<std::endl; if((norm2(err)>1.0e-4)){
assert(n2e<1.0e-4); /*
std::cout<< "Deo RESULT\n " <<r_eo << std::endl;
std::cout<< "Deo REF\n " <<result << std::endl;
std::cout<< "Deo ERR \n " << err <<std::endl;
*/
}
pickCheckerboard(Even,src_e,err); pickCheckerboard(Even,src_e,err);
pickCheckerboard(Odd,src_o,err); pickCheckerboard(Odd,src_o,err);
@@ -438,4 +354,6 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
assert(norm2(src_e)<1.0e-4); assert(norm2(src_e)<1.0e-4);
assert(norm2(src_o)<1.0e-4); assert(norm2(src_o)<1.0e-4);
Grid_finalize();
exit(0);
} }
@@ -43,7 +43,7 @@ Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaT Gamma::Algebra::GammaT
}; };
void Benchmark(int Ls, Coordinate Dirichlet,bool Sloppy); void Benchmark(int Ls, Coordinate Dirichlet);
int main (int argc, char ** argv) int main (int argc, char ** argv)
{ {
@@ -69,19 +69,11 @@ int main (int argc, char ** argv)
std::cout << GridLogMessage<< " Testing with full communication " <<std::endl; std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl; std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,false); Benchmark(Ls,Dirichlet);
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,true);
////////////////// //////////////////
// Domain decomposed // Domain decomposed
////////////////// //////////////////
/*
Coordinate latt4 = GridDefaultLatt(); Coordinate latt4 = GridDefaultLatt();
Coordinate mpi = GridDefaultMpi(); Coordinate mpi = GridDefaultMpi();
Coordinate CommDim(Nd); Coordinate CommDim(Nd);
@@ -89,35 +81,42 @@ int main (int argc, char ** argv)
GlobalSharedMemory::GetShmDims(mpi,shm); GlobalSharedMemory::GetShmDims(mpi,shm);
//////////////////////
// Node level
//////////////////////
std::cout << "\n\n\n\n\n\n" <<std::endl; std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl; std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
// std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl; std::cout << GridLogMessage<< " Testing without internode communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl; std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0; for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
Dirichlet[0] = 0; // Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0]; // Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1]; // Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2]; // Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3]; // Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
Benchmark(Ls,Dirichlet,false); Benchmark(Ls,Dirichlet);
std::cout << "\n\n\n\n\n\n" <<std::endl; std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl; std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with sloppy communication " <<std::endl; std::cout << GridLogMessage<< " Testing without intranode communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl; std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0; for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
// Dirichlet[0] = 0;
// Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
// Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
// Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
// Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
Benchmark(Ls,Dirichlet,true); Benchmark(Ls,Dirichlet);
*/
Grid_finalize(); Grid_finalize();
exit(0); exit(0);
} }
void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy) void Benchmark(int Ls, Coordinate Dirichlet)
{ {
Coordinate latt4 = GridDefaultLatt(); Coordinate latt4 = GridDefaultLatt();
GridLogLayout(); GridLogLayout();
@@ -133,13 +132,21 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
typedef LatticeGaugeFieldF GaugeField; typedef LatticeGaugeFieldF GaugeField;
typedef LatticeColourMatrixF ColourMatrixField; typedef LatticeColourMatrixF ColourMatrixField;
typedef DomainWallFermionF FermionAction; typedef DomainWallFermionF FermionAction;
#else #endif
#ifdef DOUBLE
typedef vComplexD Simd; typedef vComplexD Simd;
typedef LatticeFermionD FermionField; typedef LatticeFermionD FermionField;
typedef LatticeGaugeFieldD GaugeField; typedef LatticeGaugeFieldD GaugeField;
typedef LatticeColourMatrixD ColourMatrixField; typedef LatticeColourMatrixD ColourMatrixField;
typedef DomainWallFermionD FermionAction; typedef DomainWallFermionD FermionAction;
#endif #endif
#ifdef DOUBLE2
typedef vComplexD2 Simd;
typedef LatticeFermionD2 FermionField;
typedef LatticeGaugeFieldD2 GaugeField;
typedef LatticeColourMatrixD2 ColourMatrixField;
typedef DomainWallFermionD2 FermionAction;
#endif
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi()); GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid); GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
@@ -262,7 +269,6 @@ void Benchmark(int Ls, Coordinate Dirichlet,bool sloppy)
FermionAction::ImplParams p; FermionAction::ImplParams p;
p.dirichlet=Dirichlet; p.dirichlet=Dirichlet;
FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p); FermionAction Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
Dw.SloppyComms(sloppy);
Dw.ImportGauge(Umu); Dw.ImportGauge(Umu);
int ncall =300; int ncall =300;
@@ -0,0 +1,465 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./benchmarks/Benchmark_dwf.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#ifdef GRID_CUDA
#define CUDA_PROFILE
#endif
#ifdef CUDA_PROFILE
#include <cuda_profiler_api.h>
#endif
using namespace std;
using namespace Grid;
////////////////////////
/// Move to domains ////
////////////////////////
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
void Benchmark(int Ls, Coordinate Dirichlet, int partial);
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
int threads = GridThread::GetThreads();
int Ls=8;
for(int i=0;i<argc;i++) {
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;
}
}
//////////////////
// With comms
//////////////////
Coordinate Dirichlet(Nd+1,0);
for(auto partial : {0}) {
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing with full communication " <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,partial);
}
//////////////////
// Domain decomposed
//////////////////
Coordinate latt4 = GridDefaultLatt();
Coordinate mpi = GridDefaultMpi();
Coordinate CommDim(Nd);
//Coordinate shm({2,1,1,1});
Coordinate shm;
GlobalSharedMemory::GetShmDims(mpi,shm);
std::cout <<GridLogMessage << " Shared memory MPI decomp is " <<shm<<std::endl;
//////////////////////
// Node level
//////////////////////
for(int d=0;d<Nd;d++) CommDim[d]= (mpi[d]/shm[d])>1 ? 1 : 0;
// for(int d=0;d<Nd;d++) CommDim[d]= 1;
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0] * shm[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1] * shm[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2] * shm[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3] * shm[3];
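  // Worked example with a hypothetical geometry: latt4={32,32,32,32},
  // mpi={2,2,2,2}, shm={2,1,1,1} gives mpi/shm={1,2,2,2}, so CommDim={0,1,1,1}
  // and Dirichlet={0, 0,16,16,16}. The x direction stays open because its MPI
  // decomposition lives inside the shared-memory node; y,z,t are blocked at
  // the node-local extent of 16 sites.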
for(auto partial : {0,1}) {
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing without internode communication partial dirichlet="<<partial <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,partial);
}
for(int d=0;d<Nd;d++) CommDim[d]= mpi[d]>1 ? 1 : 0;
Dirichlet[0] = 0;
Dirichlet[1] = CommDim[0]*latt4[0]/mpi[0];
Dirichlet[2] = CommDim[1]*latt4[1]/mpi[1];
Dirichlet[3] = CommDim[2]*latt4[2]/mpi[2];
Dirichlet[4] = CommDim[3]*latt4[3]/mpi[3];
for(auto partial : {0,1}) {
std::cout << "\n\n\n\n\n\n" <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
std::cout << GridLogMessage<< " Testing without intranode communication; partial dirichlet= "<<partial <<std::endl;
std::cout << GridLogMessage<< "++++++++++++++++++++++++++++++++++++++++++++++++" <<std::endl;
Benchmark(Ls,Dirichlet,partial);
}
Grid_finalize();
exit(0);
}
void Benchmark(int Ls, Coordinate Dirichlet, int partial)
{
Coordinate latt4 = GridDefaultLatt();
GridLogLayout();
long unsigned int single_site_flops = 8*Nc*(7+16*Nc);
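  // For Nc=3 this evaluates to 8*3*(7+48) = 1320 flops per site, the standard
  // operation count for one Wilson dslash application.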
std::vector<int> seeds4({1,2,3,4});
std::vector<int> seeds5({5,6,7,8});
#define SINGLE
#ifdef SINGLE
typedef vComplexF Simd;
typedef LatticeFermionF FermionField;
typedef LatticeGaugeFieldF GaugeField;
typedef LatticeColourMatrixF ColourMatrixField;
typedef DomainWallFermionF FermionAction;
#endif
#ifdef DOUBLE
typedef vComplexD Simd;
typedef LatticeFermionD FermionField;
typedef LatticeGaugeFieldD GaugeField;
typedef LatticeColourMatrixD ColourMatrixField;
typedef DomainWallFermionD FermionAction;
#endif
#ifdef DOUBLE2
typedef vComplexD2 Simd;
typedef LatticeFermionD2 FermionField;
typedef LatticeGaugeFieldD2 GaugeField;
typedef LatticeColourMatrixD2 ColourMatrixField;
typedef DomainWallFermionD2 FermionAction;
#endif
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,Simd::Nsimd()),GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
std::cout << GridLogMessage << "Initialising 4d RNG" << std::endl;
GridParallelRNG RNG4(UGrid); RNG4.SeedUniqueString(std::string("The 4D RNG"));
std::cout << GridLogMessage << "Initialising 5d RNG" << std::endl;
GridParallelRNG RNG5(FGrid); RNG5.SeedUniqueString(std::string("The 5D RNG"));
FermionField src (FGrid); random(RNG5,src);
#if 0
src = Zero();
{
Coordinate origin({0,0,0,latt4[2]-1,0});
SpinColourVectorF tmp;
tmp=Zero();
tmp()(0)(0)=Complex(-2.0,0.0);
std::cout << " source site 0 " << tmp<<std::endl;
pokeSite(tmp,src,origin);
}
#else
RealD N2 = 1.0/::sqrt(norm2(src));
src = src*N2;
#endif
FermionField result(FGrid); result=Zero();
FermionField ref(FGrid); ref=Zero();
FermionField tmp(FGrid);
FermionField err(FGrid);
std::cout << GridLogMessage << "Drawing gauge field" << std::endl;
GaugeField Umu(UGrid);
GaugeField UmuFull(UGrid);
GaugeField UmuCopy(UGrid);
SU<Nc>::HotConfiguration(RNG4,Umu);
UmuCopy=Umu;
UmuFull=Umu;
std::cout << GridLogMessage << "Random gauge initialised " << std::endl;
////////////////////////////////////
// Apply BCs
////////////////////////////////////
Coordinate Block(4);
for(int d=0;d<4;d++) Block[d]= Dirichlet[d+1];
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block5 " << Dirichlet << std::endl;
std::cout << GridLogMessage << "Applying BCs for Dirichlet Block4 " << Block << std::endl;
DirichletFilter<GaugeField> Filter(Block);
Filter.applyFilter(Umu);
if(!partial) Filter.applyFilter(UmuCopy);
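  // Note: Umu always gets the Dirichlet filter, while UmuCopy stays unfiltered
  // when partial!=0. The Cshift reference below uses UmuCopy on the outer
  // s-slices (s<depth or s>=Ls-depth, depth=dwf_compressor_depth), so under
  // partial Dirichlet only those boundary slices keep coupling across the
  // block; the interior slices always see the cut links.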
////////////////////////////////////
// Naive wilson implementation
////////////////////////////////////
std::vector<ColourMatrixField> U(4,UGrid);
std::vector<ColourMatrixField> Ucopy(4,UGrid);
for(int mu=0;mu<Nd;mu++){
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
Ucopy[mu] = PeekIndex<LorentzIndex>(UmuCopy,mu);
}
std::cout << GridLogMessage << "Setting up Cshift based reference " << std::endl;
if (1)
{
ref = Zero();
for(int mu=0;mu<Nd;mu++){
int depth=dwf_compressor_depth;
tmp = Cshift(src,mu+1,1);
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
}
}
}
}
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = adj(Ucopy_v[ss])*src_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
}
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
}
ref = -0.5*ref;
}
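// The loop above is a Cshift-based reference for the 4d hopping term, applied
// slice by slice in s:
//   Dhop psi(x) = -1/2 sum_mu [ U_mu(x) (1 - gamma_mu) psi(x+mu)
//                             + U_mu^dag(x-mu) (1 + gamma_mu) psi(x-mu) ]
// with Ucopy used on s-slices within the compressor depth of either wall and
// U in the bulk, mirroring the partial-Dirichlet link choice made above.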
RealD mass=0.1;
RealD M5 =1.8;
RealD NP = UGrid->_Nprocessors;
RealD NN = UGrid->NodeCount();
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
std::cout << GridLogMessage <<"* BCs for Dirichlet Block4 " << Block << std::endl;
std::cout << GridLogMessage <<"* Partial Dirichlet BC = " << partial << std::endl;
std::cout << GridLogMessage<< "* VComplex size is "<<sizeof(Simd)<< " B"<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
FermionAction::ImplParams p;
p.dirichlet=Dirichlet;
p.partialDirichlet=partial;
FermionAction Dw(UmuFull,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,p);
int ncall =1;
RealD n2e;
if (1) {
FGrid->Barrier();
Dw.Dhop(src,result,0);
std::cout<<GridLogMessage<<"Called warmup"<<std::endl;
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.Dhop(src,result,0);
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=single_site_flops*volume*ncall;
auto nsimd = Simd::Nsimd();
auto simdwidth = sizeof(Simd);
// RF: Nd Wilson * Ls, Nd gauge * Ls, Nc colors
double data_rf = volume * ((2*Nd+1)*Nd*Nc + 2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
// mem: Nd Wilson * Ls, Nd gauge, Nc colors
double data_mem = (volume * (2*Nd+1)*Nd*Nc + (volume/Ls) *2*Nd*Nc*Nc) * simdwidth / nsimd * ncall / (1024.*1024.*1024.);
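// Two crude traffic models for the same kernel: data_rf charges the 2*Nd
// gauge links once per 5d site (no reuse across Ls), while data_mem charges
// them once per 4d site only (perfect reuse across the Ls slices, hence the
// volume/Ls factor); both charge (2*Nd+1) spinors per 5d site, written as
// Nd*Nc complex numbers per spinor since Ns==Nd here. simdwidth/nsimd is the
// size of one scalar complex, so both figures come out in GiB. The flop count
// uses the per-site Wilson Dslash figure single_site_flops defined earlier in
// the file (conventionally 1320 for Nc=3).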
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
err = ref-result;
n2e = norm2(err);
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
if(( n2e>1.0e-4) ) {
std::cout<<GridLogMessage << "WRONG RESULT" << std::endl;
FGrid->Barrier();
DumpSliceNorm("s-slice ref ",ref,1);
DumpSliceNorm("s-slice res ",result,1);
DumpSliceNorm("s-slice error ",err,1);
exit(-1);
}
assert (n2e< 1.0e-4 );
}
if (1)
{ // Naive wilson dag implementation
ref = Zero();
for(int mu=0;mu<Nd;mu++){
int depth=dwf_compressor_depth;
tmp = Cshift(src,mu+1,1);
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = Ucopy_v[ss]*tmp_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = U_v[ss]*tmp_v[Ls*ss+s];
}
}
}
}
ref=ref + tmp + Gamma(Gmu[mu])*tmp;
{
autoView( tmp_v , tmp , CpuWrite);
autoView( U_v , U[mu] , CpuRead);
autoView( Ucopy_v, Ucopy[mu] , CpuRead);
autoView( src_v, src , CpuRead);
for(int ss=0;ss<U[mu].Grid()->oSites();ss++){
for(int s=0;s<Ls;s++){
if ( (s<depth) || (s>=Ls-depth)){
tmp_v[Ls*ss+s] = adj(Ucopy_v[ss])*src_v[Ls*ss+s];
} else {
tmp_v[Ls*ss+s] = adj(U_v[ss])*src_v[Ls*ss+s];
}
}
}
}
tmp =Cshift(tmp,mu+1,-1);
ref=ref + tmp - Gamma(Gmu[mu])*tmp;
}
ref = -0.5*ref;
}
Dw.Dhop(src,result,DaggerYes);
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
std::cout << GridLogMessage << "----------------------------------------------------------------" << std::endl;
std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
std::cout<<GridLogMessage << "norm dag result "<< norm2(result)<<std::endl;
std::cout<<GridLogMessage << "norm dag ref "<< norm2(ref)<<std::endl;
err = ref-result;
n2e= norm2(err);
std::cout<<GridLogMessage << "norm dag diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
assert((n2e)<1.0e-4);
FermionField src_e (FrbGrid);
FermionField src_o (FrbGrid);
FermionField r_e (FrbGrid);
FermionField r_o (FrbGrid);
FermionField r_eo (FGrid);
std::cout<<GridLogMessage << "Calling Deo and Doe and //assert Deo+Doe == Dunprec"<<std::endl;
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd,src_o,src);
std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
// S-direction is INNERMOST and takes no part in the parity.
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
std::cout << GridLogMessage<< "* Benchmarking DomainWallFermion::DhopEO "<<std::endl;
std::cout << GridLogMessage<< "* Vectorising space-time by "<<Simd::Nsimd()<<std::endl;
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
#endif
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
{
FGrid->Barrier();
Dw.DhopEO(src_o,r_e,DaggerNo);
double t0=usecond();
for(int i=0;i<ncall;i++){
Dw.DhopEO(src_o,r_e,DaggerNo);
}
double t1=usecond();
FGrid->Barrier();
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
double flops=(single_site_flops*volume*ncall)/2.0;
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per rank "<< flops/(t1-t0)/NP<<std::endl;
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NN<<std::endl;
}
Dw.DhopEO(src_o,r_e,DaggerNo);
Dw.DhopOE(src_e,r_o,DaggerNo);
Dw.Dhop (src ,result,DaggerNo);
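// The hopping term only couples opposite parities, so r_e = Deo src_o and
// r_o = Doe src_e; recombining the two checkerboards below via
// setCheckerboard must reproduce Dhop acting on the full field up to
// rounding, which is what the norm-diff checks that follow assert.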
std::cout<<GridLogMessage << "r_e"<<norm2(r_e)<<std::endl;
std::cout<<GridLogMessage << "r_o"<<norm2(r_o)<<std::endl;
std::cout<<GridLogMessage << "res"<<norm2(result)<<std::endl;
setCheckerboard(r_eo,r_o);
setCheckerboard(r_eo,r_e);
err = r_eo-result;
n2e= norm2(err);
std::cout<<GridLogMessage << "norm diff "<< n2e<< " Line "<<__LINE__ <<std::endl;
assert(n2e<1.0e-4);
pickCheckerboard(Even,src_e,err);
pickCheckerboard(Odd,src_o,err);
std::cout<<GridLogMessage << "norm diff even "<< norm2(src_e)<<std::endl;
std::cout<<GridLogMessage << "norm diff odd "<< norm2(src_o)<<std::endl;
assert(norm2(src_e)<1.0e-4);
assert(norm2(src_o)<1.0e-4);
}

View File

@@ -873,7 +873,7 @@ int main (int argc, char ** argv)
int do_su4=0; int do_su4=0;
int do_memory=1; int do_memory=1;
int do_comms =1; int do_comms =1;
int do_blas =1; int do_blas =0;
int do_dslash=1; int do_dslash=1;
int sel=4; int sel=4;

View File

@@ -86,7 +86,6 @@ AC_ARG_WITH([gmp],
[try this for a non-standard install prefix of the GMP library])], [try this for a non-standard install prefix of the GMP library])],
[AM_CXXFLAGS="-I$with_gmp/include $AM_CXXFLAGS"] [AM_CXXFLAGS="-I$with_gmp/include $AM_CXXFLAGS"]
[AM_LDFLAGS="-L$with_gmp/lib $AM_LDFLAGS"]) [AM_LDFLAGS="-L$with_gmp/lib $AM_LDFLAGS"])
AC_ARG_WITH([mpfr], AC_ARG_WITH([mpfr],
[AS_HELP_STRING([--with-mpfr=prefix], [AS_HELP_STRING([--with-mpfr=prefix],
[try this for a non-standard install prefix of the MPFR library])], [try this for a non-standard install prefix of the MPFR library])],
@@ -107,13 +106,6 @@ AC_ARG_WITH([lime],
[AM_CXXFLAGS="-I$with_lime/include $AM_CXXFLAGS"] [AM_CXXFLAGS="-I$with_lime/include $AM_CXXFLAGS"]
[AM_LDFLAGS="-L$with_lime/lib $AM_LDFLAGS"]) [AM_LDFLAGS="-L$with_lime/lib $AM_LDFLAGS"])
############### LIBUNWIND
AC_ARG_WITH([unwind],
[AS_HELP_STRING([--with-unwind=prefix],
[try this for a non-standard install prefix of the libunwind library])],
[AM_CXXFLAGS="-I$with_unwind/include $AM_CXXFLAGS"]
[AM_LDFLAGS="-L$with_unwind/lib $AM_LDFLAGS"])
############### OpenSSL ############### OpenSSL
AC_ARG_WITH([openssl], AC_ARG_WITH([openssl],
[AS_HELP_STRING([--with-openssl=prefix], [AS_HELP_STRING([--with-openssl=prefix],
@@ -222,7 +214,7 @@ esac
############### Symplectic group ############### Symplectic group
AC_ARG_ENABLE([Sp], AC_ARG_ENABLE([Sp],
[AS_HELP_STRING([--enable-Sp=yes|no],[enable gauge group Sp2n])], [AC_HELP_STRING([--enable-Sp=yes|no], [enable gauge group Sp2n])],
[ac_ENABLE_SP=${enable_Sp}], [ac_ENABLE_SP=no]) [ac_ENABLE_SP=${enable_Sp}], [ac_ENABLE_SP=no])
AM_CONDITIONAL(BUILD_SP, [ test "${ac_ENABLE_SP}X" == "yesX" ]) AM_CONDITIONAL(BUILD_SP, [ test "${ac_ENABLE_SP}X" == "yesX" ])
@@ -263,28 +255,6 @@ case ${ac_ACCELERATOR_AWARE_MPI} in
*);; *);;
esac esac
############### CHECKSUM COMMS
AC_ARG_ENABLE([checksum-comms],
[AS_HELP_STRING([--enable-checksum-comms=yes|no],[checksum all communication])],
[ac_CHECKSUM_COMMS=${enable_checksum_comms}], [ac_CHECKSUM_COMMS=yes])
case ${ac_CHECKSUM_COMMS} in
yes)
AC_DEFINE([GRID_CHECKSUM_COMMS],[1],[checksum all communication]);;
*);;
esac
############### LOG VIEWS
AC_ARG_ENABLE([log-views],
[AS_HELP_STRING([--enable-log-views=yes|no],[log information on all view open/close])],
[ac_LOG_VIEWS=${enable_log_views}], [ac_LOG_VIEWS=yes])
case ${ac_LOG_VIEWS} in
yes)
AC_DEFINE([GRID_LOG_VIEWS],[1],[log information on all view open/close]);;
*);;
esac
############### SYCL/CUDA/HIP/none ############### SYCL/CUDA/HIP/none
AC_ARG_ENABLE([accelerator], AC_ARG_ENABLE([accelerator],
[AS_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none],[enable none,cuda,sycl,hip acceleration])], [AS_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none],[enable none,cuda,sycl,hip acceleration])],
@@ -403,16 +373,6 @@ AC_SEARCH_LIBS([limeCreateReader], [lime],
[have_lime=true], [have_lime=true],
[AC_MSG_WARN(LIME library was not found in your system.)]) [AC_MSG_WARN(LIME library was not found in your system.)])
AC_SEARCH_LIBS([unw_backtrace], [unwind],
[AC_DEFINE([HAVE_UNWIND], [1], [Define to 1 if you have the `libunwind' library])]
[have_unwind=true],
[AC_MSG_WARN(libunwind library was not found in your system.)])
AC_SEARCH_LIBS([_Ux86_64_step], [unwind-x86_64],
[AC_DEFINE([HAVE_UNWIND_X86_64], [1], [Define to 1 if you have the `libunwind-x86_64' library])]
[have_unwind_x86_64=true],
[AC_MSG_WARN(libunwind library was not found in your system.)])
AC_SEARCH_LIBS([SHA256_Init], [crypto], AC_SEARCH_LIBS([SHA256_Init], [crypto],
[AC_DEFINE([HAVE_CRYPTO], [1], [Define to 1 if you have the `OpenSSL' library])] [AC_DEFINE([HAVE_CRYPTO], [1], [Define to 1 if you have the `OpenSSL' library])]
[have_crypto=true], [have_crypto=true],

View File

@@ -1,8 +1,7 @@
#!/bin/bash #!/bin/bash
##PBS -q EarlyAppAccess
#PBS -q debug #PBS -q debug
#PBS -l filesystems=flare
#PBS -l filesystems=home
#PBS -l select=2 #PBS -l select=2
#PBS -l walltime=00:20:00 #PBS -l walltime=00:20:00
#PBS -A LatticeQCD_aesp_CNDA #PBS -A LatticeQCD_aesp_CNDA
@@ -15,18 +14,26 @@ cp $PBS_NODEFILE nodefile
export OMP_NUM_THREADS=4 export OMP_NUM_THREADS=4
export MPICH_OFI_NIC_POLICY=GPU export MPICH_OFI_NIC_POLICY=GPU
export MPICH_CH4_SHM=XPMEM
export MPIR_CVAR_DEBUG_SUMMARY=1 #export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
export MPICH_DBG_LEVEL=VERBOSE #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
export MPICH_DBG_CLASS=ALL #unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
# #
# Local vol 16.16.16.32 # Local vol 16.16.16.32
# #
#VOL=32.64.64.96 #VOL=32.64.64.96
mpiexec -np 1 -ppn 1 -envall mpivars
for VOL in 32.32.32.96 for VOL in 32.32.32.96 32.64.64.96
do do
for AT in 32 for AT in 32
do do

View File

@@ -1,28 +1,26 @@
export MPFR=`spack find --paths mpfr | grep ^mpfr | awk '{print $2}' ` #Ahead of time compile for PVC
export GMP=`spack find --paths gmp | grep ^gmp | awk '{print $2}' `
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
export UNWIND=`spack find --paths libunwind | grep ^libunwind | awk '{print $2}' `
../../configure \ export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl -lnuma -L/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/lib -fPIC -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc"
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -I/opt/aurora/24.180.3/spack/unified/0.8.0/install/linux-sles15-x86_64/oneapi-2024.07.30.002/numactl-2.0.14-7v6edad/include/ -fPIC"
#JIT compile
#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions "
../configure \
--enable-simd=GPU \ --enable-simd=GPU \
--enable-reduction=grid \
--enable-gen-simd-width=64 \ --enable-gen-simd-width=64 \
--enable-comms=mpi-auto \ --enable-comms=mpi-auto \
--enable-debug \
--prefix $HOME/gpt-install \
--disable-gparity \ --disable-gparity \
--disable-fermion-reps \ --disable-fermion-reps \
--enable-shm=nvlink \
--enable-checksum-comms=yes \
--enable-log-views=yes \
--enable-accelerator=sycl \
--enable-accelerator-aware-mpi=no \
--enable-unified=no \
--with-lime=$CLIME \ --with-lime=$CLIME \
--with-gmp=$GMP \ --enable-shm=nvlink \
--with-mpfr=$MPFR \ --enable-accelerator=sycl \
--with-unwind=$UNWIND \ --enable-accelerator-aware-mpi=no\
--enable-unified=no \
MPICXX=mpicxx \ MPICXX=mpicxx \
CXX=icpx \ CXX=icpx
LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -lsycl -fsycl-max-parallel-link-jobs=16 -fno-sycl-rdc -lnuma" \
CXXFLAGS="-fPIC -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel"

View File

@@ -1,13 +1,16 @@
#module load oneapi/release/2023.12.15.001
#module load mpich/icc-all-debug-pmix-gpu/52.2
#module load mpich-config/mode/deterministic
#module load intel_compute_runtime/release/821.35
module load pti-gpu
source ~/spack/share/spack/setup-env.sh
spack load c-lime
spack load openssl
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
export HTTP_PROXY=http://proxy.alcf.anl.gov:3128 export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128 export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
export http_proxy=http://proxy.alcf.anl.gov:3128 export http_proxy=http://proxy.alcf.anl.gov:3128
export https_proxy=http://proxy.alcf.anl.gov:3128 export https_proxy=http://proxy.alcf.anl.gov:3128
git config --global http.proxy http://proxy.alcf.anl.gov:3128 git config --global http.proxy http://proxy.alcf.anl.gov:3128
source ~/spack/share/spack/setup-env.sh
spack load c-lime
spack load openssl@3.3.1%gcc@12.2.0
spack load unwind
export UNWIND=`spack find --paths libunwind | grep ^libunwind | awk '{print $2}' `
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file" export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"

View File

@@ -1,273 +0,0 @@
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
SLURM detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA GH200 120GB
AcceleratorCudaInit[0]: totalGlobalMem: 102005473280
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 1
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0009:01:00.0
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 4
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x4002c0000000 - 40033fffffff for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=3737a24096282ea179607fc879814710860a0de6: (HEAD -> develop, origin/develop, origin/HEAD) clean
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : This rank is running on host jpbo-119-30.jupiter.internal
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 81604378624 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent host allocations: SMALL 8 LARGE 2 HUGE 0
Grid : Message : MemoryManager::Init() cache pool for recent device allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() cache pool for recent shared allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 0.303000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.309000 s : Testing with full communication
Grid : Message : 0.312000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.313000 s : Grid Layout
Grid : Message : 0.313000 s : Global lattice size : 32 32 64 64
Grid : Message : 0.319000 s : OpenMP threads : 4
Grid : Message : 0.320000 s : MPI tasks : 1 1 2 2
Grid : Message : 0.129590 s : Initialising 4d RNG
Grid : Message : 0.764790 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 0.764920 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 0.942440 s : Initialising 5d RNG
Grid : Message : 1.149388 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.149404 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
local rank 1 device 0 bus id: 0019:01:00.0
local rank 2 device 0 bus id: 0029:01:00.0
local rank 3 device 0 bus id: 0039:01:00.0
Grid : Message : 43.893114 s : Drawing gauge field
Grid : Message : 54.574150 s : Random gauge initialised
Grid : Message : 54.574170 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 54.574172 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 54.580032 s : Setting up Cshift based reference
Grid : Message : 60.407451 s : *****************************************************************
Grid : Message : 60.407469 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 60.407470 s : *****************************************************************
Grid : Message : 60.407471 s : *****************************************************************
Grid : Message : 60.407472 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 60.407473 s : * Vectorising space-time by 8
Grid : Message : 60.407475 s : * VComplex size is 64 B
Grid : Message : 60.407477 s : * Using Overlapped Comms/Compute
Grid : Message : 60.407479 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 60.407480 s : *****************************************************************
Grid : Message : 61.102178 s : Called warmup
Grid : Message : 62.177160 s : Called Dw 300 times in 1074958 us
Grid : Message : 62.177198 s : mflop/s = 24721998.6
Grid : Message : 62.177201 s : mflop/s per rank = 6180499.64
Grid : Message : 62.177204 s : mflop/s per node = 24721998.6
Grid : Message : 62.182696 s : norm diff 5.8108784e-14 Line 306
Grid : Message : 71.328862 s : ----------------------------------------------------------------
Grid : Message : 71.328884 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 71.328885 s : ----------------------------------------------------------------
Grid : Message : 71.328886 s : Called DwDag
Grid : Message : 71.328887 s : norm dag result 4.12810493
Grid : Message : 71.329493 s : norm dag ref 4.12810493
Grid : Message : 71.331967 s : norm dag diff 3.40632318e-14 Line 377
Grid : Message : 71.394727 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 71.803650 s : src_e0.500003185
Grid : Message : 71.819727 s : src_o0.499996882
Grid : Message : 71.821991 s : *********************************************************
Grid : Message : 71.821993 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 71.821995 s : * Vectorising space-time by 8
Grid : Message : 71.821998 s : * Using Overlapped Comms/Compute
Grid : Message : 71.822002 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 71.822003 s : *********************************************************
Grid : Message : 72.377054 s : Deo mflop/s = 24065467
Grid : Message : 72.377071 s : Deo mflop/s per rank 6016366.75
Grid : Message : 72.377074 s : Deo mflop/s per node 24065467
Grid : Message : 72.624877 s : r_e2.06377678
Grid : Message : 72.625198 s : r_o2.06381058
Grid : Message : 72.625507 s : res4.12758736
Grid : Message : 73.759140 s : norm diff 0
Grid : Message : 73.868204 s : norm diff even 0
Grid : Message : 73.907201 s : norm diff odd 0
Grid : Message : 74.414580 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 74.414582 s : Testing without internode communication
Grid : Message : 74.414584 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 74.414586 s : Grid Layout
Grid : Message : 74.414586 s : Global lattice size : 32 32 64 64
Grid : Message : 74.414594 s : OpenMP threads : 4
Grid : Message : 74.414595 s : MPI tasks : 1 1 2 2
Grid : Message : 74.679364 s : Initialising 4d RNG
Grid : Message : 74.742332 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 74.742343 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 74.759525 s : Initialising 5d RNG
Grid : Message : 75.812412 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 75.812429 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 119.252016 s : Drawing gauge field
Grid : Message : 129.919846 s : Random gauge initialised
Grid : Message : 129.919863 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 129.919865 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 129.923611 s : Setting up Cshift based reference
Grid : Message : 135.522878 s : *****************************************************************
Grid : Message : 135.522897 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 135.522899 s : *****************************************************************
Grid : Message : 135.522899 s : *****************************************************************
Grid : Message : 135.522900 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 135.522901 s : * Vectorising space-time by 8
Grid : Message : 135.522903 s : * VComplex size is 64 B
Grid : Message : 135.522905 s : * Using Overlapped Comms/Compute
Grid : Message : 135.522907 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 135.522908 s : *****************************************************************
Grid : Message : 136.151202 s : Called warmup
Grid : Message : 137.224721 s : Called Dw 300 times in 1073490 us
Grid : Message : 137.224748 s : mflop/s = 24755806
Grid : Message : 137.224751 s : mflop/s per rank = 6188951.49
Grid : Message : 137.224753 s : mflop/s per node = 24755806
Grid : Message : 137.235239 s : norm diff 5.8108784e-14 Line 306
Grid : Message : 146.451686 s : ----------------------------------------------------------------
Grid : Message : 146.451708 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 146.451710 s : ----------------------------------------------------------------
Grid : Message : 146.451712 s : Called DwDag
Grid : Message : 146.451714 s : norm dag result 4.12810493
Grid : Message : 146.452323 s : norm dag ref 4.12810493
Grid : Message : 146.454799 s : norm dag diff 3.40632318e-14 Line 377
Grid : Message : 146.498557 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 146.940894 s : src_e0.500003185
Grid : Message : 146.953676 s : src_o0.499996882
Grid : Message : 146.955927 s : *********************************************************
Grid : Message : 146.955929 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 146.955932 s : * Vectorising space-time by 8
Grid : Message : 146.955936 s : * Using Overlapped Comms/Compute
Grid : Message : 146.955938 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 146.955941 s : *********************************************************
Grid : Message : 147.511975 s : Deo mflop/s = 24036256.5
Grid : Message : 147.511989 s : Deo mflop/s per rank 6009064.13
Grid : Message : 147.511991 s : Deo mflop/s per node 24036256.5
Grid : Message : 147.522100 s : r_e2.06377678
Grid : Message : 147.522433 s : r_o2.06381058
Grid : Message : 147.522745 s : res4.12758736
Grid : Message : 148.229848 s : norm diff 0
Grid : Message : 149.233474 s : norm diff even 0
Grid : Message : 149.235815 s : norm diff odd 0
Grid : Message : 149.960985 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 149.960990 s : Testing without intranode communication
Grid : Message : 149.960991 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 149.960995 s : Grid Layout
Grid : Message : 149.960995 s : Global lattice size : 32 32 64 64
Grid : Message : 149.961003 s : OpenMP threads : 4
Grid : Message : 149.961004 s : MPI tasks : 1 1 2 2
Grid : Message : 150.155810 s : Initialising 4d RNG
Grid : Message : 150.800200 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 150.800340 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 150.973420 s : Initialising 5d RNG
Grid : Message : 151.131117 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 151.131136 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 193.933765 s : Drawing gauge field
Grid : Message : 204.611551 s : Random gauge initialised
Grid : Message : 204.611574 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 204.611576 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 204.615265 s : Setting up Cshift based reference
Grid : Message : 210.117788 s : *****************************************************************
Grid : Message : 210.117807 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 210.117809 s : *****************************************************************
Grid : Message : 210.117810 s : *****************************************************************
Grid : Message : 210.117812 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 210.117813 s : * Vectorising space-time by 8
Grid : Message : 210.117814 s : * VComplex size is 64 B
Grid : Message : 210.117817 s : * Using Overlapped Comms/Compute
Grid : Message : 210.117818 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 210.117819 s : *****************************************************************
Grid : Message : 210.714641 s : Called warmup
Grid : Message : 211.892227 s : Called Dw 300 times in 1177557 us
Grid : Message : 211.892252 s : mflop/s = 22568003.2
Grid : Message : 211.892255 s : mflop/s per rank = 5642000.8
Grid : Message : 211.892257 s : mflop/s per node = 22568003.2
Grid : Message : 211.896037 s : norm diff 5.8108784e-14 Line 306
Grid : Message : 220.751375 s : ----------------------------------------------------------------
Grid : Message : 220.751406 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 220.751409 s : ----------------------------------------------------------------
Grid : Message : 220.751411 s : Called DwDag
Grid : Message : 220.751412 s : norm dag result 4.12810493
Grid : Message : 220.753307 s : norm dag ref 4.12810493
Grid : Message : 220.755796 s : norm dag diff 3.40632318e-14 Line 377
Grid : Message : 220.813226 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 221.697800 s : src_e0.500003185
Grid : Message : 221.890920 s : src_o0.499996882
Grid : Message : 221.913430 s : *********************************************************
Grid : Message : 221.913450 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 221.913480 s : * Vectorising space-time by 8
Grid : Message : 221.913500 s : * Using Overlapped Comms/Compute
Grid : Message : 221.913530 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 221.913550 s : *********************************************************
Grid : Message : 221.645213 s : Deo mflop/s = 24114032
Grid : Message : 221.645228 s : Deo mflop/s per rank 6028508.01
Grid : Message : 221.645231 s : Deo mflop/s per node 24114032
Grid : Message : 221.656021 s : r_e2.06377678
Grid : Message : 221.656389 s : r_o2.06381058
Grid : Message : 221.656698 s : res4.12758736
Grid : Message : 222.110075 s : norm diff 0
Grid : Message : 222.857692 s : norm diff even 0
Grid : Message : 222.875763 s : norm diff odd 0
Grid : Message : 223.598127 s : *******************************************
Grid : Message : 223.598145 s : ******* Grid Finalize ******
Grid : Message : 223.598146 s : *******************************************

View File

@@ -1,286 +0,0 @@
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
RANK 1 using NUMA 1 GPU 1 NIC mlx5_1:1
RANK 3 using NUMA 3 GPU 3 NIC mlx5_3:1
RANK 0 using NUMA 0 GPU 0 NIC mlx5_0:1
RANK 2 using NUMA 2 GPU 2 NIC mlx5_2:1
SLURM detected
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device Number : 0
AcceleratorCudaInit[0]: ========================
AcceleratorCudaInit[0]: Device identifier: NVIDIA GH200 120GB
AcceleratorCudaInit[0]: totalGlobalMem: 102005473280
AcceleratorCudaInit[0]: managedMemory: 1
AcceleratorCudaInit[0]: isMultiGpuBoard: 0
AcceleratorCudaInit[0]: warpSize: 32
AcceleratorCudaInit[0]: pciBusID: 1
AcceleratorCudaInit[0]: pciDeviceID: 0
AcceleratorCudaInit[0]: maxGridSize (2147483647,65535,65535)
AcceleratorCudaInit: using default device
AcceleratorCudaInit: assume user either uses
AcceleratorCudaInit: a) IBM jsrun, or
AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding
AcceleratorCudaInit: Configure options --enable-setdevice=no
local rank 0 device 0 bus id: 0009:01:00.0
AcceleratorCudaInit: ================================================
SharedMemoryMpi: World communicator of size 16
SharedMemoryMpi: Node communicator of size 4
0SharedMemoryMpi: SharedMemoryMPI.cc acceleratorAllocDevice 2147483648bytes at 0x4002a0000000 - 40031fffffff for comms buffers
Setting up IPC
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|_ | | | | | | | | | | | | _|__
__|_ _|__
__|_ GGGG RRRR III DDDD _|__
__|_ G R R I D D _|__
__|_ G R R I D D _|__
__|_ G GG RRRR I D D _|__
__|_ G G R R I D D _|__
__|_ GGGG R R III DDDD _|__
__|_ _|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
__|__|__|__|__|__|__|__|__|__|__|__|__|__|__
| | | | | | | | | | | | | |
Copyright (C) 2015 Peter Boyle, Azusa Yamaguchi, Guido Cossu, Antonin Portelli and other authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Current Grid git commit hash=3737a24096282ea179607fc879814710860a0de6: (HEAD -> develop, origin/develop, origin/HEAD) clean
Grid : Message : ================================================
Grid : Message : MPI is initialised and logging filters activated
Grid : Message : ================================================
Grid : Message : This rank is running on host jpbo-012-11.jupiter.internal
Grid : Message : Requested 2147483648 byte stencil comms buffers
Grid : Message : MemoryManager Cache 81604378624 bytes
Grid : Message : MemoryManager::Init() setting up
Grid : Message : MemoryManager::Init() cache pool for recent host allocations: SMALL 8 LARGE 2 HUGE 0
Grid : Message : MemoryManager::Init() cache pool for recent device allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() cache pool for recent shared allocations: SMALL 16 LARGE 8 Huge 0
Grid : Message : MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory
Grid : Message : MemoryManager::Init() Using cudaMalloc
Grid : Message : 0.834000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.838000 s : Testing with full communication
Grid : Message : 0.839000 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 0.840000 s : Grid Layout
Grid : Message : 0.840000 s : Global lattice size : 64 64 64 64
Grid : Message : 0.846000 s : OpenMP threads : 4
Grid : Message : 0.846000 s : MPI tasks : 2 2 2 2
Grid : Message : 0.165970 s : Initialising 4d RNG
Grid : Message : 0.787270 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 0.787340 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 0.960410 s : Initialising 5d RNG
Grid : Message : 1.142344 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 1.142352 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
local rank 2 device 0 bus id: 0029:01:00.0
local rank 3 device 0 bus id: 0039:01:00.0
local rank 1 device 0 bus id: 0019:01:00.0
Grid : Message : 44.657270 s : Drawing gauge field
Grid : Message : 55.247733 s : Random gauge initialised
Grid : Message : 55.247745 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 55.247747 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 55.253053 s : Setting up Cshift based reference
Grid : Message : 62.191747 s : *****************************************************************
Grid : Message : 62.191767 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 62.191768 s : *****************************************************************
Grid : Message : 62.191769 s : *****************************************************************
Grid : Message : 62.191769 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 62.191769 s : * Vectorising space-time by 8
Grid : Message : 62.191770 s : * VComplex size is 64 B
Grid : Message : 62.191771 s : * Using Overlapped Comms/Compute
Grid : Message : 62.191771 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 62.191772 s : *****************************************************************
Grid : Message : 62.857568 s : Called warmup
Grid : Message : 65.581790 s : Called Dw 300 times in 2200540 us
Grid : Message : 65.582120 s : mflop/s = 48306525
Grid : Message : 65.582140 s : mflop/s per rank = 3019157.81
Grid : Message : 65.582150 s : mflop/s per node = 12076631.3
Grid : Message : 65.637550 s : norm diff 5.80156793e-14 Line 306
Grid : Message : 75.122153 s : ----------------------------------------------------------------
Grid : Message : 75.122166 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 75.122167 s : ----------------------------------------------------------------
Grid : Message : 75.122167 s : Called DwDag
Grid : Message : 75.122167 s : norm dag result 4.12801829
Grid : Message : 75.123295 s : norm dag ref 4.12801829
Grid : Message : 75.125890 s : norm dag diff 3.42093991e-14 Line 377
Grid : Message : 75.188462 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 75.605683 s : src_e0.500004005
Grid : Message : 75.617824 s : src_o0.499996067
Grid : Message : 75.620089 s : *********************************************************
Grid : Message : 75.620091 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 75.620093 s : * Vectorising space-time by 8
Grid : Message : 75.620094 s : * Using Overlapped Comms/Compute
Grid : Message : 75.620095 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 75.620096 s : *********************************************************
Grid : Message : 76.732272 s : Deo mflop/s = 48068252.4
Grid : Message : 76.732283 s : Deo mflop/s per rank 3004265.77
Grid : Message : 76.732285 s : Deo mflop/s per node 12017063.1
Grid : Message : 76.749317 s : r_e2.06443136
Grid : Message : 76.749652 s : r_o2.06378451
Grid : Message : 76.749955 s : res4.12821587
Grid : Message : 77.198827 s : norm diff 0
Grid : Message : 77.981760 s : norm diff even 0
Grid : Message : 78.455900 s : norm diff odd 0
Grid : Message : 78.539333 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 78.539337 s : Testing without internode communication
Grid : Message : 78.539338 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 78.539339 s : Grid Layout
Grid : Message : 78.539339 s : Global lattice size : 64 64 64 64
Grid : Message : 78.539347 s : OpenMP threads : 4
Grid : Message : 78.539348 s : MPI tasks : 2 2 2 2
Grid : Message : 78.798501 s : Initialising 4d RNG
Grid : Message : 78.862916 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 78.862925 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 78.879916 s : Initialising 5d RNG
Grid : Message : 79.941271 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 79.941280 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 124.586264 s : Drawing gauge field
Grid : Message : 135.338090 s : Random gauge initialised
Grid : Message : 135.338102 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 135.338103 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 135.341266 s : Setting up Cshift based reference
Grid : Message : 142.604280 s : *****************************************************************
Grid : Message : 142.604450 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 142.604460 s : *****************************************************************
Grid : Message : 142.604470 s : *****************************************************************
Grid : Message : 142.604480 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 142.604480 s : * Vectorising space-time by 8
Grid : Message : 142.604500 s : * VComplex size is 64 B
Grid : Message : 142.604510 s : * Using Overlapped Comms/Compute
Grid : Message : 142.604510 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 142.604520 s : *****************************************************************
Grid : Message : 142.686034 s : Called warmup
Grid : Message : 144.868543 s : Called Dw 300 times in 2182483 us
Grid : Message : 144.868559 s : mflop/s = 48706194.1
Grid : Message : 144.868561 s : mflop/s per rank = 3044137.13
Grid : Message : 144.868562 s : mflop/s per node = 12176548.5
Grid : Message : 144.887595 s : norm diff 5.80156793e-14 Line 306
Grid : Message : 153.622978 s : ----------------------------------------------------------------
Grid : Message : 153.622994 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 153.622995 s : ----------------------------------------------------------------
Grid : Message : 153.622995 s : Called DwDag
Grid : Message : 153.622996 s : norm dag result 4.12801829
Grid : Message : 153.623604 s : norm dag ref 4.12801829
Grid : Message : 153.626098 s : norm dag diff 3.42093991e-14 Line 377
Grid : Message : 153.691426 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 154.148319 s : src_e0.500004005
Grid : Message : 154.151454 s : src_o0.499996067
Grid : Message : 154.153722 s : *********************************************************
Grid : Message : 154.153724 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 154.153725 s : * Vectorising space-time by 8
Grid : Message : 154.153726 s : * Using Overlapped Comms/Compute
Grid : Message : 154.153727 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 154.153728 s : *********************************************************
Grid : Message : 155.200671 s : Deo mflop/s = 51121022.4
Grid : Message : 155.200682 s : Deo mflop/s per rank 3195063.9
Grid : Message : 155.200684 s : Deo mflop/s per node 12780255.6
Grid : Message : 155.217204 s : r_e2.06443136
Grid : Message : 155.217550 s : r_o2.06378451
Grid : Message : 155.217869 s : res4.12821587
Grid : Message : 155.673744 s : norm diff 0
Grid : Message : 156.463329 s : norm diff even 0
Grid : Message : 156.878866 s : norm diff odd 0
Grid : Message : 157.620761 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 157.620764 s : Testing without intranode communication
Grid : Message : 157.620765 s : ++++++++++++++++++++++++++++++++++++++++++++++++
Grid : Message : 157.620766 s : Grid Layout
Grid : Message : 157.620766 s : Global lattice size : 64 64 64 64
Grid : Message : 157.620773 s : OpenMP threads : 4
Grid : Message : 157.620774 s : MPI tasks : 2 2 2 2
Grid : Message : 157.671479 s : Initialising 4d RNG
Grid : Message : 157.738691 s : Intialising parallel RNG with unique string 'The 4D RNG'
Grid : Message : 157.738698 s : Seed SHA256: 49db4542db694e3b1a74bf2592a8c1b83bfebbe18401693c2609a4c3af1
Grid : Message : 157.755651 s : Initialising 5d RNG
Grid : Message : 158.848676 s : Intialising parallel RNG with unique string 'The 5D RNG'
Grid : Message : 158.848685 s : Seed SHA256: b6316f2fac44ce14111f93e0296389330b077bfd0a7b359f781c58589f8a
Grid : Message : 202.465158 s : Drawing gauge field
Grid : Message : 213.214546 s : Random gauge initialised
Grid : Message : 213.214561 s : Applying BCs for Dirichlet Block5 [0 0 0 0 0]
Grid : Message : 213.214563 s : Applying BCs for Dirichlet Block4 [0 0 0 0]
Grid : Message : 213.217711 s : Setting up Cshift based reference
Grid : Message : 219.662772 s : *****************************************************************
Grid : Message : 219.662786 s : * Kernel options --dslash-generic, --dslash-unroll, --dslash-asm
Grid : Message : 219.662787 s : *****************************************************************
Grid : Message : 219.662788 s : *****************************************************************
Grid : Message : 219.662788 s : * Benchmarking DomainWallFermionR::Dhop
Grid : Message : 219.662789 s : * Vectorising space-time by 8
Grid : Message : 219.662790 s : * VComplex size is 64 B
Grid : Message : 219.662791 s : * Using Overlapped Comms/Compute
Grid : Message : 219.662791 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 219.662791 s : *****************************************************************
Grid : Message : 220.425592 s : Called warmup
Grid : Message : 222.536249 s : Called Dw 300 times in 2110597 us
Grid : Message : 222.536267 s : mflop/s = 50365105.5
Grid : Message : 222.536269 s : mflop/s per rank = 3147819.09
Grid : Message : 222.536270 s : mflop/s per node = 12591276.4
Grid : Message : 222.541053 s : norm diff 5.80156793e-14 Line 306
Grid : Message : 232.135901 s : ----------------------------------------------------------------
Grid : Message : 232.135915 s : Compare to naive wilson implementation Dag to verify correctness
Grid : Message : 232.135916 s : ----------------------------------------------------------------
Grid : Message : 232.135917 s : Called DwDag
Grid : Message : 232.135918 s : norm dag result 4.12801829
Grid : Message : 232.151938 s : norm dag ref 4.12801829
Grid : Message : 232.154451 s : norm dag diff 3.42093991e-14 Line 377
Grid : Message : 232.216117 s : Calling Deo and Doe and //assert Deo+Doe == Dunprec
Grid : Message : 232.630529 s : src_e0.500004005
Grid : Message : 232.643197 s : src_o0.499996067
Grid : Message : 232.645527 s : *********************************************************
Grid : Message : 232.645529 s : * Benchmarking DomainWallFermion::DhopEO
Grid : Message : 232.645532 s : * Vectorising space-time by 8
Grid : Message : 232.645533 s : * Using Overlapped Comms/Compute
Grid : Message : 232.645534 s : * Using GENERIC Nc WilsonKernels
Grid : Message : 232.645535 s : *********************************************************
Grid : Message : 233.774184 s : Deo mflop/s = 47432091.9
Grid : Message : 233.774194 s : Deo mflop/s per rank 2964505.74
Grid : Message : 233.774196 s : Deo mflop/s per node 11858023
Grid : Message : 233.791552 s : r_e2.06443136
Grid : Message : 233.791899 s : r_o2.06378451
Grid : Message : 233.792204 s : res4.12821587
Grid : Message : 234.230783 s : norm diff 0
Grid : Message : 235.162780 s : norm diff even 0
Grid : Message : 235.291950 s : norm diff odd 0
Grid : Message : 235.765411 s : *******************************************
Grid : Message : 235.765424 s : ******* Grid Finalize ******
Grid : Message : 235.765425 s : *******************************************

View File

@@ -1,57 +0,0 @@
#!/bin/sh
#SBATCH --account=jureap14
#SBATCH --nodes=1
#SBATCH --ntasks=4
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=64
#SBATCH --time=2:00:00
#SBATCH --partition=booster
#SBATCH --gres=gpu:4
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
OPT="--comms-overlap"
source ../sourceme.sh
cat << EOF > bind_gpu
#!/bin/bash
export GPU_MAP=(0 1 2 3)
export NUMA_MAP=(0 1 2 3)
export NIC_MAP=(0 1 2 3)
export GPU=\$SLURM_LOCALID
export NUMA=\$SLURM_LOCALID
export NIC=\$SLURM_LOCALID
export CUDA_VISIBLE_DEVICES=\$GPU
export UCX_NET_DEVICES=mlx5_\${NIC}:1
echo RANK \$SLURM_LOCALID using NUMA \$NUMA GPU \$GPU NIC \$UCX_NET_DEVICES
exec numactl -m \$NUMA -N \$NUMA \$*
EOF
chmod +x ./bind_gpu
srun --cpu-bind=no -N 1 -n $SLURM_NTASKS \
./bind_gpu ./Benchmark_dwf_fp32 \
$OPT \
--mpi 1.1.2.2 \
--accelerator-threads 8 \
--grid 32.32.64.64 \
--shm 2048 > dwf.1node.perf
srun --cpu-bind=no -N 1 -n $SLURM_NTASKS \
./bind_gpu ./Benchmark_comms_host_device \
--mpi 1.1.2.2 \
--accelerator-threads 8 \
--grid 32.32.64.64 \
--shm 2048 > comms.1node.perf

View File

@@ -1,57 +0,0 @@
#!/bin/sh
#SBATCH --account=jureap14
#SBATCH --nodes=4
#SBATCH --ntasks=16
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=64
#SBATCH --time=2:00:00
#SBATCH --partition=booster
#SBATCH --gres=gpu:4
export OMP_NUM_THREADS=4
export OMPI_MCA_btl=^uct,openib
export UCX_TLS=gdr_copy,rc,rc_x,sm,cuda_copy,cuda_ipc
export UCX_RNDV_SCHEME=put_zcopy
export UCX_RNDV_THRESH=16384
export UCX_IB_GPU_DIRECT_RDMA=yes
export UCX_MEMTYPE_CACHE=n
OPT="--comms-overlap"
source ../sourceme.sh
cat << EOF > bind_gpu
#!/bin/bash
export GPU_MAP=(0 1 2 3)
export NUMA_MAP=(0 1 2 3)
export NIC_MAP=(0 1 2 3)
export GPU=\$SLURM_LOCALID
export NUMA=\$SLURM_LOCALID
export NIC=\$SLURM_LOCALID
export CUDA_VISIBLE_DEVICES=\$GPU
export UCX_NET_DEVICES=mlx5_\${NIC}:1
echo RANK \$SLURM_LOCALID using NUMA \$NUMA GPU \$GPU NIC \$UCX_NET_DEVICES
exec numactl -m \$NUMA -N \$NUMA \$*
EOF
chmod +x ./bind_gpu
srun --cpu-bind=no -N 4 -n $SLURM_NTASKS \
./bind_gpu ./Benchmark_dwf_fp32 \
$OPT \
--mpi 2.2.2.2 \
--accelerator-threads 8 \
--grid 64.64.64.64 \
--shm 2048 > dwf.4node.perf
srun --cpu-bind=no -N 4 -n $SLURM_NTASKS \
./bind_gpu ./Benchmark_comms_host_device \
--mpi 2.2.2.2 \
--accelerator-threads 8 \
--grid 32.32.64.64 \
--shm 2048 > comms.4node.perf

View File

@@ -1,16 +0,0 @@
export CXX=nvcc
export OPENMPI=/p/software/default/stages/2025/software/OpenMPI/5.0.5-NVHPC-24.9-CUDA-12/
export LDFLAGS="-cudart shared -L${OPENMPI}/lib"
export CXXFLAGS="-ccbin clang++ -gencode arch=compute_90,code=sm_90 -std=c++17 -cudart shared -lcublas -lmpi -I${OPENMPI}/include"
../../configure \
--enable-comms=mpi \
--enable-simd=GPU \
--enable-gen-simd-width=64 \
--enable-shm=nvlink \
--enable-accelerator=cuda \
--with-lime=$CLIME \
--disable-gparity \
--disable-fermion-reps \
--disable-unified

View File

@@ -1,10 +0,0 @@
CLIME=$HOME/install/
module load Clang
module load CUDA
module load FFTW
module load OpenSSL
module load MPFR
module load NVHPC
module load UCX
module load OpenMPI
ulimit -c 0

View File

@@ -7,6 +7,8 @@ CXX=mpicxx ../../configure \
--enable-unified=yes \ --enable-unified=yes \
--prefix /Users/peterboyle/QCD/vtk/Grid/install \ --prefix /Users/peterboyle/QCD/vtk/Grid/install \
--with-lime=$CLIME \ --with-lime=$CLIME \
--with-hdf5=$HDF5 \
--with-fftw=$FFTW \
--with-openssl=$OPENSSL \ --with-openssl=$OPENSSL \
--with-gmp=$GMP \ --with-gmp=$GMP \
--with-mpfr=$MPFR \ --with-mpfr=$MPFR \

View File

@@ -1,12 +1,3 @@
spack load c-lime
spack load fftw
spack load hdf5+cxx
export FFTW=`spack find --paths fftw | grep ^fftw | awk '{print $2}' `
export HDF5=`spack find --paths hdf5+cxx | grep ^hdf5 | awk '{print $2}' `
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
../../configure \ ../../configure \
--enable-comms=mpi-auto \ --enable-comms=mpi-auto \
--enable-unified=yes \ --enable-unified=yes \
@@ -14,16 +5,12 @@ export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
--enable-shm-fast-path=shmopen \ --enable-shm-fast-path=shmopen \
--enable-accelerator=none \ --enable-accelerator=none \
--enable-simd=AVX512 \ --enable-simd=AVX512 \
--with-lime=$CLIME \ --disable-accelerator-cshift \
--with-hdf5=$HDF5 \
--with-fftw=$FFTW \
--disable-fermion-reps \ --disable-fermion-reps \
--disable-gparity \ --disable-gparity \
CXX=clang++ \ CXX=clang++ \
MPICXX=mpicxx \ MPICXX=mpicxx \
LIBS=-llime \ CXXFLAGS="-std=c++17"
LDFLAGS=-L$CLIME/lib/ \
CXXFLAGS="-std=c++17 -fPIE"

View File

@@ -1,5 +1,4 @@
source $HOME/spack/share/spack/setup-env.sh source $HOME/spack/share/spack/setup-env.sh
spack load llvm@17.0.4 spack load llvm@17.0.4
export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-17.0.4-laufdrcip63ivkadmtgoepwmj3dtztdu/lib:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=/direct/sdcc+u/paboyle/spack/opt/spack/linux-almalinux8-icelake/gcc-8.5.0/llvm-17.0.4-laufdrcip63ivkadmtgoepwmj3dtztdu/lib:$LD_LIBRARY_PATH
module load openmpi/4.1.8 module load openmpi
spack load c-lime

View File

@@ -31,23 +31,16 @@ directory
using namespace std; using namespace std;
using namespace Grid; using namespace Grid;
;
//typedef WilsonFermionD FermionOp; #if 0
typedef DomainWallFermionD FermionOp; typedef DomainWallFermionD FermionOp;
typedef typename DomainWallFermionD::FermionField FermionField; typedef typename DomainWallFermionD::FermionField FermionField;
#else
template <class T> void writeFile(T& in, std::string const fname){ typedef MobiusFermionD FermionOp;
#ifdef HAVE_LIME typedef typename MobiusFermionD::FermionField FermionField;
// Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
Grid::emptyUserRecord record;
Grid::ScidacWriter WR(in.Grid()->IsBoss());
WR.open(fname);
WR.writeScidacFieldRecord(in,record,0);
WR.close();
#endif #endif
// What is the appropriate way to throw error?
}
RealD AllZero(RealD x) { return 0.; } RealD AllZero(RealD x) { return 0.; }
@@ -132,7 +125,7 @@ int main(int argc, char** argv) {
int Ls=16; int Ls=16;
RealD M5=1.8; RealD M5=1.8;
RealD mass = 0.01; RealD mass = -1.0;
mass=LanParams.mass; mass=LanParams.mass;
Ls=LanParams.Ls; Ls=LanParams.Ls;
@@ -170,21 +163,22 @@ int main(int argc, char** argv) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu); U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
} }
*/ */
int Nk = 20;
int Nstop = Nk;
int Np = 80;
int Nstop = 10;
int Nk = 20;
int Np = 80;
Nstop=LanParams.Nstop; Nstop=LanParams.Nstop;
Nk=LanParams.Nk; Nk=LanParams.Nk;
Np=LanParams.Np; Np=LanParams.Np;
int Nm = Nk + Np; int Nm = Nk + Np;
int MaxIt = 100; int MaxIt = 10000;
RealD resid = 1.0e-4; RealD resid = 1.0e-5;
RealD mob_b=1.5;
//while ( mass > - 5.0){ //while ( mass > - 5.0){
FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5); // FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
FermionOp Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.);
MdagMLinearOperator<FermionOp,FermionField> HermOp(Ddwf); /// <----- MdagMLinearOperator<FermionOp,FermionField> HermOp(Ddwf); /// <-----
// Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <----- // Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
Gamma5R5HermitianLinearOperator<FermionOp, LatticeFermion> G5R5Herm(Ddwf); Gamma5R5HermitianLinearOperator<FermionOp, LatticeFermion> G5R5Herm(Ddwf);
@@ -212,12 +206,8 @@ int main(int argc, char** argv) {
int Nconv; int Nconv;
IRL.calc(eval, evec, src, Nconv); IRL.calc(eval, evec, src, Nconv);
std::cout << mass <<" : " << eval << std::endl; std::cout << mass <<" : " << eval << std::endl;
std::cout << " #evecs " << evec.size() << std::endl;
std::cout << " Nconv " << Nconv << std::endl;
std::cout << " Nm " << Nm << std::endl;
if ( Nconv > evec.size() ) Nconv = evec.size();
#if 0 #if 0
Gamma g5(Gamma::Algebra::Gamma5) ; Gamma g5(Gamma::Algebra::Gamma5) ;
ComplexD dot; ComplexD dot;
@@ -247,7 +237,6 @@ int main(int argc, char** argv) {
vector<LatticeFermion> finalevec(Nconv, FGrid); vector<LatticeFermion> finalevec(Nconv, FGrid);
vector<RealD> eMe(Nconv), eMMe(Nconv); vector<RealD> eMe(Nconv), eMMe(Nconv);
for(int i = 0; i < Nconv; i++){ for(int i = 0; i < Nconv; i++){
cout << "calculate the matrix element["<<i<<"]" << endl;
G5R5Herm.HermOpAndNorm(evec[i], G5R5Mevec[i], eMe[i], eMMe[i]); G5R5Herm.HermOpAndNorm(evec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
} }
cout << "Re<evec, G5R5M(evec)>: " << endl; cout << "Re<evec, G5R5M(evec)>: " << endl;
@@ -314,7 +303,7 @@ int main(int argc, char** argv) {
} }
} }
} }
for(int i = 0; i < Nconv; i++){ for(int i = 0; i < Nconv; i++){
G5R5Herm.HermOpAndNorm(finalevec[i], G5R5Mevec[i], eMe[i], eMMe[i]); G5R5Herm.HermOpAndNorm(finalevec[i], G5R5Mevec[i], eMe[i], eMMe[i]);
} }
cout << "Re<evec, G5R5M(evec)>: " << endl; cout << "Re<evec, G5R5M(evec)>: " << endl;
@@ -322,7 +311,6 @@ int main(int argc, char** argv) {
cout << "<G5R5M(evec), G5R5M(evec)>" << endl; cout << "<G5R5M(evec), G5R5M(evec)>" << endl;
cout << eMMe << endl; cout << eMMe << endl;
// vector<LatticeFermion> finalevec(Nconv, FGrid); // vector<LatticeFermion> finalevec(Nconv, FGrid);
// temporary, until doing rotation // temporary, until doing rotation
@@ -343,41 +331,13 @@ int main(int argc, char** argv) {
axpby_ssp(G5evec[i], -1., finalevec[i], 0., G5evec[i], j, j); axpby_ssp(G5evec[i], -1., finalevec[i], 0., G5evec[i], j, j);
} }
} }
for(int i = 0; i < Nconv; i++){
Ddwf.M(finalevec[i], G5R5Mevec[i]);
for(int j = 0; j < Nconv; j++){
std::cout << "<"<<j<<"|Ddwf|"<<i<<"> = "<<innerProduct(finalevec[j],G5R5Mevec[i])<<std::endl;
}
}
for(int i = 0; i < Nconv; i++){
RealD t1,t2;
G5R5Herm.HermOpAndNorm(finalevec[i], G5R5Mevec[i], t1, t2);
for(int j = 0; j < Nconv; j++){
std::cout << "<"<<j<<"|G5R5 M|"<<i<<"> = "<<innerProduct(finalevec[j],G5R5Mevec[i])<<std::endl;
}
}
for(int i = 0; i < Nconv; i++){ for(int i = 0; i < Nconv; i++){
chiral_matrix_real[i].resize(Nconv); chiral_matrix_real[i].resize(Nconv);
chiral_matrix[i].resize(Nconv); chiral_matrix[i].resize(Nconv);
std::string evfile("./evec_density");
evfile = evfile+"_"+std::to_string(i);
auto evdensity = localInnerProduct(finalevec[i],finalevec[i] );
writeFile(evdensity,evfile);
for(int j = 0; j < Nconv; j++){ for(int j = 0; j < Nconv; j++){
chiral_matrix[i][j] = innerProduct(finalevec[i], G5evec[j]); chiral_matrix[i][j] = innerProduct(finalevec[i], G5evec[j]);
std::cout <<" chiral_matrix_real signed "<<i<<" "<<j<<" "<< chiral_matrix_real[i][j] << std::endl;
chiral_matrix_real[i][j] = abs(chiral_matrix[i][j]); chiral_matrix_real[i][j] = abs(chiral_matrix[i][j]);
std::cout <<" chiral_matrix_real "<<i<<" "<<j<<" "<< chiral_matrix_real[i][j] << std::endl; std::cout <<" chiral_matrix_real "<<i<<" "<<j<<" "<< chiral_matrix_real[i][j] << std::endl;
if ( chiral_matrix_real[i][j] > 0.8 ) {
auto g5density = localInnerProduct(finalevec[i], G5evec[j]);
std::string chfile("./chiral_density_");
chfile = chfile +std::to_string(i)+"_"+std::to_string(j);
writeFile(g5density,chfile);
}
} }
} }
for(int i = 0; i < Nconv; i++){ for(int i = 0; i < Nconv; i++){
@@ -386,43 +346,6 @@ int main(int argc, char** argv) {
} }
} }
FILE *fp = fopen("lego-plot.py","w"); assert(fp!=NULL);
#define PYTHON_LINE(A) fprintf(fp,A"\n");
PYTHON_LINE("import matplotlib.pyplot as plt");
PYTHON_LINE("import numpy as np");
PYTHON_LINE("");
PYTHON_LINE("fig = plt.figure()");
PYTHON_LINE("ax = fig.add_subplot(projection='3d')");
PYTHON_LINE("");
PYTHON_LINE("x, y = np.random.rand(2, 100) * 4");
fprintf(fp,"hist, xedges, yedges = np.histogram2d(x, y, bins=%d, range=[[0, %d], [0, %d]])\n",Nconv,Nconv-1,Nconv-1);
PYTHON_LINE("");
PYTHON_LINE("# Construct arrays for the anchor positions of the 16 bars");
PYTHON_LINE("xpos, ypos = np.meshgrid(xedges[:-1] + 0.25, yedges[:-1] + 0.25, indexing=\"ij\")");
PYTHON_LINE("xpos = xpos.ravel()");
PYTHON_LINE("ypos = ypos.ravel()");
PYTHON_LINE("zpos = 0");
PYTHON_LINE("");
PYTHON_LINE("# Construct arrays with the dimensions for the 16 bars.");
PYTHON_LINE("dx = dy = 0.5 * np.ones_like(zpos)");
PYTHON_LINE("dz = np.array([");
for(int i = 0; i < Nconv; i++){
fprintf(fp,"\t[ ");
for(int j = 0; j < Nconv; j++){
fprintf(fp,"%lf ",chiral_matrix_real[i][j]);
if(j<Nconv-1) fprintf(fp,",");
else fprintf(fp," ");
}
fprintf(fp,"]");
if(i<Nconv-1) fprintf(fp,",\n");
else fprintf(fp,"\n");
}
PYTHON_LINE("\t])");
PYTHON_LINE("dz = dz.ravel()");
PYTHON_LINE("ax.bar3d(xpos, ypos, zpos, dx, dy, dz, zsort='average')");
PYTHON_LINE("plt.show()");
fclose(fp);
Grid_finalize(); Grid_finalize();
} }

View File

@@ -113,6 +113,9 @@ struct LanczosParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters, GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
RealD, mass , RealD, mass ,
RealD, resid, RealD, resid,
Integer, Nstop,
Integer, Nk,
Integer, Np,
RealD, ChebyLow, RealD, ChebyLow,
RealD, ChebyHigh, RealD, ChebyHigh,
Integer, ChebyOrder) Integer, ChebyOrder)
@@ -204,7 +207,6 @@ int main(int argc, char** argv) {
int Nstop = 5; int Nstop = 5;
int Nk = 10; int Nk = 10;
int Np = 90; int Np = 90;
int Nm = Nk + Np;
int MaxIt = 10000; int MaxIt = 10000;
RealD resid = 1.0e-5; RealD resid = 1.0e-5;
@@ -226,10 +228,14 @@ int main(int argc, char** argv) {
XmlWriter HMCwr("LanParams.xml.out"); XmlWriter HMCwr("LanParams.xml.out");
write(HMCwr,"LanczosParameters",LanParams); write(HMCwr,"LanczosParameters",LanParams);
} }
Nstop=LanParams.Nstop;
Nk=LanParams.Nk;
Np=LanParams.Np;
mass=LanParams.mass; mass=LanParams.mass;
resid=LanParams.resid; resid=LanParams.resid;
int Nm = Nk + Np;
while ( mass > - 5.0){ while ( mass > - 5.0){
FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,2.+mass); FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,2.+mass);

View File

@@ -27,6 +27,7 @@ directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#include <Grid/Grid.h> #include <Grid/Grid.h>
#include <Grid/parallelIO/IldgIOtypes.h>
using namespace std; using namespace std;
using namespace Grid; using namespace Grid;
@@ -38,11 +39,28 @@ typedef typename WilsonFermionD::FermionField FermionField;
RealD AllZero(RealD x) { return 0.; } RealD AllZero(RealD x) { return 0.; }
template <class T> void writeFile(T& in, std::string const fname){
#if 1
// Ref: https://github.com/paboyle/Grid/blob/feature/scidac-wp1/tests/debug/Test_general_coarse_hdcg_phys48.cc#L111
std::cout << Grid::GridLogMessage << "Writes to: " << fname << std::endl;
Grid::emptyUserRecord record;
Grid::ScidacWriter WR(in.Grid()->IsBoss());
WR.open(fname);
WR.writeScidacFieldRecord(in,record,0);
WR.close();
#endif
// What is the appropriate way to throw error?
}
namespace Grid { namespace Grid {
struct LanczosParameters: Serializable { struct LanczosParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters, GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
RealD, mass , RealD, mass ,
Integer, Nstop,
Integer, Nk,
Integer, Np,
RealD, ChebyLow, RealD, ChebyLow,
RealD, ChebyHigh, RealD, ChebyHigh,
Integer, ChebyOrder) Integer, ChebyOrder)
@@ -158,6 +176,10 @@ int main(int argc, char** argv) {
} }
mass=LanParams.mass; mass=LanParams.mass;
Nstop=LanParams.Nstop;
Nk=LanParams.Nk;
Np=LanParams.Np;
Nm = Nk + Np;
while ( mass > - 5.0){ while ( mass > - 5.0){
@@ -202,6 +224,12 @@ while ( mass > - 5.0){
tmp = g5*evec[i]; tmp = g5*evec[i];
dot = innerProduct(tmp,evec[i]); dot = innerProduct(tmp,evec[i]);
std::cout << mass << " : " << eval[i] << " " << real(dot) << " " << imag(dot) << std::endl ; std::cout << mass << " : " << eval[i] << " " << real(dot) << " " << imag(dot) << std::endl ;
// if ( i<1)
{
std::string evfile ("./evec_"+std::to_string(mass)+"_"+std::to_string(i));
auto evdensity = localInnerProduct(evec[i],evec[i] );
writeFile(evdensity,evfile);
}
} }
src = evec[0]+evec[1]+evec[2]; src = evec[0]+evec[1]+evec[2];
mass += -0.1; mass += -0.1;

View File

@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.12 FATAL_ERROR)
project(GridViewer) project(GridViewer)
list(APPEND CMAKE_PREFIX_PATH "/home/paboyle/Visualisation/install/") list(APPEND CMAKE_PREFIX_PATH "/Users/peterboyle/QCD/vtk/VTK-9.4.2-install/")
find_package(VTK COMPONENTS find_package(VTK COMPONENTS
CommonColor CommonColor

View File

@@ -48,14 +48,15 @@ typedef vtkMarchingCubes isosurface;
int mpeg = 0 ; int mpeg = 0 ;
int xlate = 0 ; int xlate = 0 ;
int framerate = 10;
template <class T> void readFile(T& out, std::string const fname){ template <class T> void readFile(T& out, std::string const fname){
#ifdef HAVE_LIME
Grid::emptyUserRecord record; Grid::emptyUserRecord record;
Grid::ScidacReader RD; Grid::ScidacReader RD;
RD.open(fname); RD.open(fname);
RD.readScidacFieldRecord(out,record); RD.readScidacFieldRecord(out,record);
RD.close(); RD.close();
#endif
} }
using namespace Grid; using namespace Grid;
@@ -207,10 +208,6 @@ int main(int argc, char* argv[])
xlate = 1; xlate = 1;
} }
if( GridCmdOptionExists(argv,argv+argc,"--fps") ){
arg=GridCmdOptionPayload(argv,argv+argc,"--fps");
GridCmdOptionInt(arg,framerate);
}
if( GridCmdOptionExists(argv,argv+argc,"--isosurface") ){ if( GridCmdOptionExists(argv,argv+argc,"--isosurface") ){
arg=GridCmdOptionPayload(argv,argv+argc,"--isosurface"); arg=GridCmdOptionPayload(argv,argv+argc,"--isosurface");
GridCmdOptionFloat(arg,default_contour); GridCmdOptionFloat(arg,default_contour);
@@ -423,7 +420,7 @@ int main(int argc, char* argv[])
vtkFFMPEGWriter *writer = vtkFFMPEGWriter::New(); vtkFFMPEGWriter *writer = vtkFFMPEGWriter::New();
writer->SetFileName("movie.avi"); writer->SetFileName("movie.avi");
writer->SetRate(framerate); writer->SetRate(1);
writer->SetInputConnection(imageFilter->GetOutputPort()); writer->SetInputConnection(imageFilter->GetOutputPort());
writer->Start(); writer->Start();
@@ -480,7 +477,7 @@ int main(int argc, char* argv[])
slidercallback->fu_list = fu_list; slidercallback->fu_list = fu_list;
sliderWidget->AddObserver(vtkCommand::InteractionEvent, slidercallback); sliderWidget->AddObserver(vtkCommand::InteractionEvent, slidercallback);
int timerId = iren->CreateRepeatingTimer(1000/framerate); int timerId = iren->CreateRepeatingTimer(300);
std::cout << "timerId: " << timerId << std::endl; std::cout << "timerId: " << timerId << std::endl;
// Start the interaction and timer // Start the interaction and timer

View File

@@ -73,21 +73,6 @@ each to:
VTK really should make it easier to pick up the flags required for FFMPEG linkage, especially as they are very quirky on MacOS. VTK really should make it easier to pick up the flags required for FFMPEG linkage, especially as they are very quirky on MacOS.
========================================
Aurora compilation:
========================================
module load ffmpeg
download & untar: VTK-7.0.2
mkdir build
cd build
ccmake ../
"t"  (toggle advanced mode)
Enable: VTK_MODULE_ENABLE_VTK_IOFFMPEG YES
"configure" ; it should discover the installed ffmpeg module
An "X" connection is still needed to make the MPEG files.
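For reference, a non-interactive sketch of the same steps, assuming the ccmake entry above can be set as a cmake cache variable on the command line; the tarball name, source layout and make parallelism are illustrative only:

module load ffmpeg
tar xf VTK-7.0.2.tar.gz && cd VTK-7.0.2    # untar the downloaded source
mkdir build && cd build
# enable the FFMPEG I/O module without the interactive "t"/"configure" steps
cmake ../ -DVTK_MODULE_ENABLE_VTK_IOFFMPEG=YES
make -j 8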
======================================== ========================================
Grid: Grid:
@@ -125,29 +110,4 @@ Extensions
8) Example python code: FieldDensity.py . This is not interfaced to Grid. 8) Example python code: FieldDensity.py . This is not interfaced to Grid.
================
Windowless generation of AVI files: must enable offscreen rendering. From Shuhei Yamamoto:
================
Hi Peter,
To make visualization work on Frontier, I did the following.
For headless off-screen rendering, the ccmake entries shown below (visible in advanced mode) are set as indicated:
VTK_OPENGL_HAS_* off
VTK_USE_X off
VTK_DEFAULT_RENDER_WINDOW_OFFSCREEN on
VTK_DEFAULT_RENDER_WINDOW_HEADLESS on
The list may include more settings than are strictly necessary.
VTK can fall back to EGL or OSMesa at runtime, so I installed mesa via spack (as well as nasm and yasm). Either the mesa or the meson package requires llvm-config, which is included in ROCm releases after 6.1; on Frontier I used /opt/rocm-6.2.4. The only problem is that llvm-config is located in /opt/rocm-6.2.4/llvm/bin instead of /opt/rocm-6.2.4/bin, so I edited spack's packages.yaml so that the prefix for the rocm compiler is /opt/rocm-6.2.4/llvm. Just in case, I also changed c and cxx to /opt/rocm-6.2.4/llvm/bin/amdclang and amdclang++ respectively, but this change might not be necessary.
After installation I added the path to libOSMesa.so to LD_LIBRARY_PATH; there may be a better way, such as specifying an -rpath for the OSMesa library by editing the cmake files.
In addition, I edited VTK's CMakeLists.txt to force VTK to find the OSMesa package via find_package(OSMesa REQUIRED) after list(INSERT CMAKE_MODULE_PATH 0 "${vtk_cmake_dir}"), since a Find module for it is provided in vtk/CMake. There will be a more elegant method, but I was not able to find a ccmake entry to switch on OSMesa.
When I compiled VTK and linked it against the Grid visualization code with the ffmpeg option enabled, it produced an AVI file.
Best,
Shuhei
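A possible non-interactive rendering of the settings above, assuming the listed ccmake entries accept the usual -D cache syntax and that mesa, nasm and yasm are the spack package names used in the message; the source and library paths are placeholders, and OSMesa discovery itself still needs the CMakeLists.txt edit described above:

spack install mesa nasm yasm               # OSMesa runtime fallback plus assembler dependencies
mkdir build && cd build
cmake ../                                  \
  -DVTK_USE_X=OFF                          \
  -DVTK_DEFAULT_RENDER_WINDOW_OFFSCREEN=ON \
  -DVTK_DEFAULT_RENDER_WINDOW_HEADLESS=ON  \
  -DVTK_MODULE_ENABLE_VTK_IOFFMPEG=YES
# the VTK_OPENGL_HAS_* entries are likewise set to OFF, as in the list above
export LD_LIBRARY_PATH=/path/to/spack/mesa/lib:$LD_LIBRARY_PATH   # so libOSMesa.so is found at runtime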

View File

@@ -1,17 +1,9 @@
export grid_config=/home/paboyle/GPT/install/bin/grid-config libs=`grid-config --libs`
libs=`$grid_config --libs` ldflags=`grid-config --ldflags`
ldflags=`$grid_config --ldflags` cxxflags=`grid-config --cxxflags`
cxxflags=`$grid_config --cxxflags` cxx=`grid-config --cxx`
cxx=`$grid_config --cxx`
cc=icx
mkdir build mkdir build
cd build cd build
echo CC $cc LDFLAGS="$ldflags $libs " cmake .. -DCMAKE_CXX_COMPILER=$cxx -DCMAKE_CXX_FLAGS=$cxxflags
echo CXX $cxx
echo CXXFLAGS $cxxflags
echo LDFLAGS $ldflags
echo LIBS $libs
LDFLAGS="$ldflags $libs " cmake .. -DCMAKE_C_COMPILER=$cc -DCMAKE_CXX_COMPILER="$cxx" -DCMAKE_CXX_FLAGS="$cxxflags "