mirror of https://github.com/paboyle/Grid.git (synced 2025-07-15 20:46:54 +01:00)
Merge with Christoph GPT checksum debug
@@ -65,6 +65,7 @@ NAMESPACE_BEGIN(Grid);
 #endif
 
 enum GridBLASOperation_t { GridBLAS_OP_N, GridBLAS_OP_T, GridBLAS_OP_C } ;
+enum GridBLASPrecision_t { GridBLAS_PRECISION_DEFAULT, GridBLAS_PRECISION_16F, GridBLAS_PRECISION_16BF, GridBLAS_PRECISION_TF32 };
 
 class GridBLAS {
 public:
@@ -98,6 +99,20 @@ public:
     }
   }
 
+#ifdef GRID_CUDA
+  cublasComputeType_t toDataType(GridBLASPrecision_t p) {
+    switch (p) {
+    case GridBLAS_PRECISION_16F:
+      return CUBLAS_COMPUTE_32F_FAST_16F;
+    case GridBLAS_PRECISION_16BF:
+      return CUBLAS_COMPUTE_32F_FAST_16BF;
+    case GridBLAS_PRECISION_TF32:
+      return CUBLAS_COMPUTE_32F_FAST_TF32;
+    default:
+      assert(0);
+    }
+  }
+#endif
   // Force construct once
   GridBLAS() { Init(); };
   ~GridBLAS() { };
@@ -138,8 +153,10 @@ public:
 		   deviceVector<ComplexD*> &Amk,  // pointer list to matrices
 		   deviceVector<ComplexD*> &Bkn,
 		   ComplexD beta,
-		   deviceVector<ComplexD*> &Cmn)
+		   deviceVector<ComplexD*> &Cmn,
+		   GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
   {
+    assert(precision == GridBLAS_PRECISION_DEFAULT);
     gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
 		m,n,k,
 		alpha,
@@ -201,8 +218,10 @@ public:
 		   deviceVector<ComplexD*> &Amk,  // pointer list to matrices
 		   deviceVector<ComplexD*> &Bkn,
 		   ComplexD beta,
-		   deviceVector<ComplexD*> &Cmn)
+		   deviceVector<ComplexD*> &Cmn,
+		   GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
   {
+    assert(precision == GridBLAS_PRECISION_DEFAULT);
     RealD t2=usecond();
     int32_t batchCount = Amk.size();
     assert(Bkn.size()==batchCount);
@@ -448,7 +467,8 @@ public:
 		   deviceVector<ComplexF*> &Amk,  // pointer list to matrices
 		   deviceVector<ComplexF*> &Bkn,
 		   ComplexF beta,
-		   deviceVector<ComplexF*> &Cmn)
+		   deviceVector<ComplexF*> &Cmn,
+		   GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
   {
     RealD t2=usecond();
     int32_t batchCount = Amk.size();
@@ -473,6 +493,7 @@ public:
     assert(Bkn.size()==batchCount);
     assert(Cmn.size()==batchCount);
 #ifdef GRID_HIP
+    assert(precision == GridBLAS_PRECISION_DEFAULT);
     hipblasOperation_t hOpA;
     hipblasOperation_t hOpB;
     if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
@@ -503,50 +524,67 @@ public:
     if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
     if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
     if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
-    auto err = cublasCgemmBatched(gridblasHandle,
-				  hOpA,
-				  hOpB,
-				  m,n,k,
-				  (cuComplex *) &alpha_p[0],
-				  (cuComplex **)&Amk[0], lda,
-				  (cuComplex **)&Bkn[0], ldb,
-				  (cuComplex *) &beta_p[0],
-				  (cuComplex **)&Cmn[0], ldc,
-				  batchCount);
+    cublasStatus_t err;
+    if (precision == GridBLAS_PRECISION_DEFAULT) {
+      err = cublasCgemmBatched(gridblasHandle,
+			       hOpA,
+			       hOpB,
+			       m,n,k,
+			       (cuComplex *) &alpha_p[0],
+			       (cuComplex **)&Amk[0], lda,
+			       (cuComplex **)&Bkn[0], ldb,
+			       (cuComplex *) &beta_p[0],
+			       (cuComplex **)&Cmn[0], ldc,
+			       batchCount);
+    } else {
+      cublasComputeType_t compute_precision = toDataType(precision);
+      err = cublasGemmBatchedEx(gridblasHandle,
+				hOpA,
+				hOpB,
+				m,n,k,
+				(void *) &alpha_p[0],
+				(void **)&Amk[0], CUDA_C_32F, lda,
+				(void **)&Bkn[0], CUDA_C_32F, ldb,
+				(void *) &beta_p[0],
+				(void **)&Cmn[0], CUDA_C_32F, ldc,
+				batchCount, compute_precision, CUBLAS_GEMM_DEFAULT);
+    }
     assert(err==CUBLAS_STATUS_SUCCESS);
 #endif
 #ifdef GRID_SYCL
-    int64_t m64=m;
-    int64_t n64=n;
-    int64_t k64=k;
-    int64_t lda64=lda;
-    int64_t ldb64=ldb;
-    int64_t ldc64=ldc;
-    int64_t batchCount64=batchCount;
+    assert(precision == GridBLAS_PRECISION_DEFAULT);
+    int64_t m64=m;
+    int64_t n64=n;
+    int64_t k64=k;
+    int64_t lda64=lda;
+    int64_t ldb64=ldb;
+    int64_t ldc64=ldc;
+    int64_t batchCount64=batchCount;
 
-    oneapi::mkl::transpose iOpA;
-    oneapi::mkl::transpose iOpB;
+    oneapi::mkl::transpose iOpA;
+    oneapi::mkl::transpose iOpB;
 
-    if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
-    if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
-    if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
-    if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
-    if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
-    if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
+    if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
+    if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
+    if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
+    if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
+    if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
+    if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
 
-    oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
-						&iOpA,
-						&iOpB,
-						&m64,&n64,&k64,
-						(ComplexF *) &alpha_p[0],
-						(const ComplexF **)&Amk[0], (const int64_t *)&lda64,
-						(const ComplexF **)&Bkn[0], (const int64_t *)&ldb64,
-						(ComplexF *) &beta_p[0],
-						(ComplexF **)&Cmn[0], (const int64_t *)&ldc64,
-						(int64_t)1,&batchCount64,std::vector<sycl::event>());
+    oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
+						&iOpA,
+						&iOpB,
+						&m64,&n64,&k64,
+						(ComplexF *) &alpha_p[0],
+						(const ComplexF **)&Amk[0], (const int64_t *)&lda64,
+						(const ComplexF **)&Bkn[0], (const int64_t *)&ldb64,
+						(ComplexF *) &beta_p[0],
+						(ComplexF **)&Cmn[0], (const int64_t *)&ldc64,
+						(int64_t)1,&batchCount64,std::vector<sycl::event>());
     synchronise();
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+    assert(precision == GridBLAS_PRECISION_DEFAULT);
     // Need a default/reference implementation; use Eigen
     if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
       thread_for (p, batchCount, {
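Note that the precision argument never changes the storage type: in the CUDA branch above, cublasGemmBatchedEx still reads and writes CUDA_C_32F data and only the compute type is lowered, while the HIP, SYCL and Eigen paths assert GridBLAS_PRECISION_DEFAULT. A minimal call-site sketch, assuming the single-precision overload from this diff (the pointer lists A, B, C and the dimensions are illustrative names, not from the commit):

  // Batched C = A*B in ComplexF storage, TF32 tensor-core arithmetic on CUDA.
  // On non-CUDA builds a non-default precision asserts, per the paths above.
  GridBLAS blas;
  blas.gemmBatched(m, n, k,
                   ComplexF(1.0, 0.0),       // alpha
                   A, B,                     // deviceVector<ComplexF*> pointer lists
                   ComplexF(0.0, 0.0),       // beta
                   C,
                   GridBLAS_PRECISION_TF32); // routes to cublasGemmBatchedEx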
@@ -946,6 +984,336 @@ public:
     RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
   }
 
+  /*
+    Inverse and Determinant
+
+    - CPU version uses Eigen
+    - GPU version uses LAPACK-compatible getrf / getri
+
+    Design comment: Eigen does not expose getrf / getri in a LAPACK compatible manner.
+    Overhead to go through getrf / getri for CPU version too large.
+    Current interface therefore only guarantees the inverse and determinant
+    functions on all platforms but not the getrf / getri ones.
+  */
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+
+  void inverseBatched(int64_t n,
+		      deviceVector<ComplexD*> &Ann,
+		      deviceVector<ComplexD*> &Cnn) {
+
+    int64_t batchCount = Ann.size();
+    assert(batchCount == Cnn.size());
+    thread_for(p,batchCount, {
+      Eigen::Map<Eigen::MatrixXcd> eAnn(Ann[p],n,n);
+      Eigen::Map<Eigen::MatrixXcd> eCnn(Cnn[p],n,n);
+      eCnn = eAnn.inverse();
+    });
+  }
+
+  void inverseBatched(int64_t n,
+		      deviceVector<ComplexF*> &Ann,
+		      deviceVector<ComplexF*> &Cnn) {
+
+    int64_t batchCount = Ann.size();
+    assert(batchCount == Cnn.size());
+    thread_for(p,batchCount, {
+      Eigen::Map<Eigen::MatrixXcf> eAnn(Ann[p],n,n);
+      Eigen::Map<Eigen::MatrixXcf> eCnn(Cnn[p],n,n);
+      eCnn = eAnn.inverse();
+    });
+  }
+
+  void determinantBatched(int64_t n,
+			  deviceVector<ComplexD*> &Ann,
+			  deviceVector<ComplexD*> &C) {
+
+    int64_t batchCount = Ann.size();
+    assert(batchCount == C.size());
+    thread_for(p,batchCount, {
+      Eigen::Map<Eigen::MatrixXcd> eAnn(Ann[p],n,n);
+      *C[p] = eAnn.determinant();
+    });
+  }
+
+  void determinantBatched(int64_t n,
+			  deviceVector<ComplexF*> &Ann,
+			  deviceVector<ComplexF*> &C) {
+
+    int64_t batchCount = Ann.size();
+    assert(batchCount == C.size());
+    thread_for(p,batchCount, {
+      Eigen::Map<Eigen::MatrixXcf> eAnn(Ann[p],n,n);
+      *C[p] = eAnn.determinant();
+    });
+  }
+
+#else
+
+#ifdef GRID_SYCL
+  template<typename T>
+  void getrfBatchedSYCL(int64_t n,
+			deviceVector<T*> &Ann,
+			deviceVector<int64_t> &ipiv,
+			deviceVector<int64_t> &info) {
+
+    int64_t batchCount = Ann.size();
+
+    static deviceVector<T> scratchpad;
+    int64_t sp_size = oneapi::mkl::lapack::getrf_batch_scratchpad_size<T>(*gridblasHandle, &n, &n, &n, (int64_t)1, &batchCount);
+    if (sp_size > scratchpad.size())
+      scratchpad.resize(sp_size);
+
+    static deviceVector<int64_t*> _ipiv;
+    if (batchCount > _ipiv.size())
+      _ipiv.resize(batchCount);
+    int64_t** p_ipiv = &_ipiv[0];
+    int64_t* pipiv = &ipiv[0];
+
+    accelerator_for(i, batchCount, 1, { p_ipiv[i] = &pipiv[i*n]; });
+
+    oneapi::mkl::lapack::getrf_batch(*gridblasHandle,
+				     &n, &n,
+				     (T **)&Ann[0],
+				     &n,
+				     (int64_t**)&_ipiv[0],
+				     (int64_t)1, &batchCount,
+				     (T*)&scratchpad[0], (int64_t)scratchpad.size(),
+				     std::vector<sycl::event>());
+    synchronise();
+  }
+#endif
+
+  void getrfBatched(int64_t n,
+		    deviceVector<ComplexD*> &Ann,
+		    deviceVector<int64_t> &ipiv,
+		    deviceVector<int64_t> &info)
+  {
+    int64_t batchCount = Ann.size();
+    assert(ipiv.size()==batchCount*n);
+    assert(info.size()==batchCount);
+
+#ifdef GRID_HIP
+    auto err = hipblasZgetrfBatched(gridblasHandle,(int)n,
+				    (hipblasDoubleComplex **)&Ann[0], (int)n,
+				    (int*) &ipiv[0],
+				    (int*) &info[0],
+				    (int)batchCount);
+    assert(err==HIPBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_CUDA
+    auto err = cublasZgetrfBatched(gridblasHandle, (int)n,
+				   (cuDoubleComplex **)&Ann[0], (int)n,
+				   (int*) &ipiv[0],
+				   (int*) &info[0],
+				   (int)batchCount);
+    assert(err==CUBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_SYCL
+    getrfBatchedSYCL(n, Ann, ipiv, info);
+#endif
+  }
+
+  void getrfBatched(int64_t n,
+		    deviceVector<ComplexF*> &Ann,
+		    deviceVector<int64_t> &ipiv,
+		    deviceVector<int64_t> &info)
+  {
+    int64_t batchCount = Ann.size();
+    assert(ipiv.size()==batchCount*n);
+    assert(info.size()==batchCount);
+
+#ifdef GRID_HIP
+    auto err = hipblasCgetrfBatched(gridblasHandle,(int)n,
+				    (hipblasComplex **)&Ann[0], (int)n,
+				    (int*) &ipiv[0],
+				    (int*) &info[0],
+				    (int)batchCount);
+    assert(err==HIPBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_CUDA
+    auto err = cublasCgetrfBatched(gridblasHandle, (int)n,
+				   (cuComplex **)&Ann[0], (int)n,
+				   (int*) &ipiv[0],
+				   (int*) &info[0],
+				   (int)batchCount);
+    assert(err==CUBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_SYCL
+    getrfBatchedSYCL(n, Ann, ipiv, info);
+#endif
+  }
+
+#ifdef GRID_SYCL
+  template<typename T>
+  void getriBatchedSYCL(int64_t n,
+			deviceVector<T*> &Ann,
+			deviceVector<int64_t> &ipiv,
+			deviceVector<int64_t> &info,
+			deviceVector<T*> &Cnn) {
+
+    int64_t batchCount = Ann.size();
+
+    static deviceVector<T> scratchpad;
+    int64_t sp_size = oneapi::mkl::lapack::getri_batch_scratchpad_size<T>(*gridblasHandle, &n, &n, (int64_t)1, &batchCount);
+    if (sp_size > scratchpad.size())
+      scratchpad.resize(sp_size);
+
+    static deviceVector<int64_t*> _ipiv;
+    if (batchCount > _ipiv.size())
+      _ipiv.resize(batchCount);
+    int64_t** p_ipiv = &_ipiv[0];
+    int64_t* pipiv = &ipiv[0];
+
+    accelerator_for(i, batchCount, 1, { p_ipiv[i] = &pipiv[i*n]; });
+
+    oneapi::mkl::lapack::getri_batch(*gridblasHandle,
+				     &n,
+				     (T **)&Ann[0],
+				     &n,
+				     (int64_t**)p_ipiv,
+				     (int64_t)1, &batchCount,
+				     (T *)&scratchpad[0], (int64_t)scratchpad.size(),
+				     std::vector<sycl::event>());
+
+    synchronise();
+
+    T** pA = &Ann[0];
+    T** pC = &Cnn[0];
+    accelerator_for(i, batchCount*n*n, 1, {
+      auto j = i / batchCount;
+      auto k = i % batchCount;
+      pC[k][j] = pA[k][j];
+    });
+  }
+
+#endif
+
+  void getriBatched(int64_t n,
+		    deviceVector<ComplexD*> &Ann,
+		    deviceVector<int64_t> &ipiv,
+		    deviceVector<int64_t> &info,
+		    deviceVector<ComplexD*> &Cnn)
+  {
+    int64_t batchCount = Ann.size();
+    assert(ipiv.size()==batchCount*n);
+    assert(info.size()==batchCount);
+    assert(Cnn.size()==batchCount);
+
+#ifdef GRID_HIP
+    auto err = hipblasZgetriBatched(gridblasHandle,(int)n,
+				    (hipblasDoubleComplex **)&Ann[0], (int)n,
+				    (int*) &ipiv[0],
+				    (hipblasDoubleComplex **)&Cnn[0], (int)n,
+				    (int*) &info[0],
+				    (int)batchCount);
+    assert(err==HIPBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_CUDA
+    auto err = cublasZgetriBatched(gridblasHandle, (int)n,
+				   (cuDoubleComplex **)&Ann[0], (int)n,
+				   (int*) &ipiv[0],
+				   (cuDoubleComplex **)&Cnn[0], (int)n,
+				   (int*) &info[0],
+				   (int)batchCount);
+    assert(err==CUBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_SYCL
+    getriBatchedSYCL(n, Ann, ipiv, info, Cnn);
+#endif
+  }
+
+  void getriBatched(int64_t n,
+		    deviceVector<ComplexF*> &Ann,
+		    deviceVector<int64_t> &ipiv,
+		    deviceVector<int64_t> &info,
+		    deviceVector<ComplexF*> &Cnn)
+  {
+    int64_t batchCount = Ann.size();
+    assert(ipiv.size()==batchCount*n);
+    assert(info.size()==batchCount);
+    assert(Cnn.size()==batchCount);
+
+#ifdef GRID_HIP
+    auto err = hipblasCgetriBatched(gridblasHandle,(int)n,
+				    (hipblasComplex **)&Ann[0], (int)n,
+				    (int*) &ipiv[0],
+				    (hipblasComplex **)&Cnn[0], (int)n,
+				    (int*) &info[0],
+				    (int)batchCount);
+    assert(err==HIPBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_CUDA
+    auto err = cublasCgetriBatched(gridblasHandle, (int)n,
+				   (cuComplex **)&Ann[0], (int)n,
+				   (int*) &ipiv[0],
+				   (cuComplex **)&Cnn[0], (int)n,
+				   (int*) &info[0],
+				   (int)batchCount);
+    assert(err==CUBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_SYCL
+    getriBatchedSYCL(n, Ann, ipiv, info, Cnn);
+#endif
+  }
+
+  template<typename dtype>
+  void inverseBatched(int64_t n,
+		      deviceVector<dtype*> &Ann, // this will be overwritten with LU decomposition
+		      deviceVector<dtype*> &Cnn  // this will be overwritten with the inverse
+		      ) {
+
+    int64_t batchCount = Ann.size();
+    RealD t0 = usecond();
+    deviceVector<int64_t> ipiv(batchCount*n);
+    deviceVector<int64_t> info(batchCount);
+
+    //RealD t1 = usecond();
+    getrfBatched(n, Ann, ipiv, info);
+    // test info for non-invertibility? set to nan if yes?
+    getriBatched(n, Ann, ipiv, info, Cnn);
+    //synchronise();
+    //RealD t2 = usecond();
+    //std::cout << GridLogMessage << "Temp " << t1-t0 << " rf/ri " << t2-t1 << std::endl;
+  }
+
+  template<typename dtype>
+  void determinantBatched(int64_t n,
+			  deviceVector<dtype*> &Ann, // this will be overwritten with LU decomposition
+			  deviceVector<dtype*> &C    // this will be overwritten with determinant
+			  ) {
+
+    int64_t batchCount = Ann.size();
+    //RealD t0 = usecond();
+    deviceVector<int64_t> ipiv(batchCount*n);
+    deviceVector<int64_t> info(batchCount);
+
+    dtype** pAnn = (dtype**)&Ann[0];
+    dtype** pC = (dtype**)&C[0];
+#if defined(GRID_CUDA) || defined(GRID_HIP)
+    int* pipiv = (int*)&ipiv[0];
+#else
+    int64_t* pipiv = (int64_t*)&ipiv[0];
+#endif
+
+    //RealD t1 = usecond();
+    getrfBatched(n, Ann, ipiv, info);
+    //RealD t2 = usecond();
+    accelerator_for(i,batchCount,1,{
+      dtype det = 1.0;
+      for (int64_t j=0;j<n;j++) {
+	det *= pAnn[i][n*j + j];
+	// branchless signs
+	det *= (pipiv[i*n + j] == j+1) ? (1.0) : (-1.0);
+      }
+      *pC[i] = det;
+    });
+
+    //RealD t3 = usecond();
+    //std::cout << GridLogMessage << "Temp " << t1 - t0 << " rf/ri " << t2-t1 << "final" << t3 - t2 << std::endl;
+  }
+#endif
+
 
 template<class CComplex>
 double benchmark(int M, int N, int K, int BATCH)
 {
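The design comment in the hunk above limits the guaranteed cross-platform surface to inverseBatched and determinantBatched; getrf/getri are only reachable on the GPU paths. A minimal usage sketch under that interface (sizes, fill code and variable names are illustrative, not from the commit):

  // Invert a batch of B complex n x n matrices.
  // Each Ann[p] / Cnn[p] must point at device storage for n*n ComplexD;
  // on the GPU paths Ann is overwritten with its LU factors, so keep a copy
  // if the input is still needed (determinantBatched overwrites it the same way).
  int64_t n = 16, B = 1024;
  deviceVector<ComplexD*> Ann(B), Cnn(B);
  // ... populate Ann[p] and Cnn[p] with device pointers ...
  GridBLAS blas;
  blas.inverseBatched(n, Ann, Cnn);   // Cnn[p] = inverse of the original Ann[p]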
@@ -32,6 +32,10 @@ NAMESPACE_BEGIN(Grid);
 
 
 Grid_MPI_Comm       CartesianCommunicator::communicator_world;
+#ifdef GRID_CHECKSUM_COMMS
+extern void * Grid_backtrace_buffer[_NBACKTRACE];
+uint64_t checksum_index = 1;
+#endif
 
 ////////////////////////////////////////////
 // First initialise of comms system
@@ -568,6 +572,11 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
    * - post device - host send buffer transfer asynch
    */
 
+#ifdef GRID_CHECKSUM_COMMS
+  rbytes += 8;
+  xbytes += 8;
+#endif
+
   if ( dor ) {
     if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
       tag= dir+from*32;
@@ -580,6 +589,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
       srq.req  = rrq;
       srq.host_buf = host_recv;
       srq.device_buf = recv;
+      srq.tag = tag;
       list.push_back(srq);
       off_node_bytes+=rbytes;
     }
@@ -593,7 +603,16 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
       host_xmit = this->HostBufferMalloc(xbytes);
       CommsRequest_t srq;
 
-      srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
+#ifdef GRID_CHECKSUM_COMMS
+      uint64_t xbytes_data = xbytes - 8;
+      srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes_data); // Make this Asynch
+      assert(xbytes % 8 == 0);
+      // flip one bit so that a zero buffer is not consistent
+      uint64_t xsum = checksum_gpu((uint64_t*)xmit, xbytes_data / 8) ^ (checksum_index + 1 + 1000 * tag);
+      *(uint64_t*)(((char*)host_xmit) + xbytes_data) = xsum;
+#else
+      srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
+#endif
 
       //      ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
       //      assert(ierr==0);
@@ -635,7 +654,11 @@ void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequ
 
     if ( flag ) {
       //      std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
+#ifdef GRID_CHECKSUM_COMMS
+      acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes - 8);
+#else
       acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
+#endif
       list[idx].PacketType=InterNodeReceiveHtoD;
     } else {
       pending ++;
@@ -798,7 +821,40 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
   //	acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
   //      }
   //    }
+#ifdef GRID_CHECKSUM_COMMS
+  for(int r=0;r<list.size();r++){
+    if ( list[r].PacketType == InterNodeReceiveHtoD ) {
+      uint64_t rbytes_data = list[r].bytes - 8;
+      uint64_t expected_cs = *(uint64_t*)(((char*)list[r].host_buf) + rbytes_data);
+      uint64_t computed_cs = checksum_gpu((uint64_t*)list[r].device_buf, rbytes_data / 8) ^ (checksum_index + 1 + 1000 * list[r].tag);
+      if (expected_cs != computed_cs) {
+	// TODO: error message, backtrace, quit
+
+	fprintf(stderr, "GRID_CHECKSUM_COMMS error:\n");
+	fprintf(stderr, "  processor  = %d\n", (int)_processor);
+	for(int d=0;d<_processors.size();d++)
+	  fprintf(stderr, "  processor_coord[%d] = %d\n", d, _processor_coor[d]);
+	fprintf(stderr, "  hostname: %s\n", GridHostname());
+	fprintf(stderr, "  expected_cs: %ld\n", expected_cs);
+	fprintf(stderr, "  computed_cs: %ld\n", computed_cs);
+	fprintf(stderr, "  dest: %d\n", list[r].dest);
+	fprintf(stderr, "  tag: %d\n", list[r].tag);
+	fprintf(stderr, "  commdir: %d\n", list[r].commdir);
+	fprintf(stderr, "  bytes: %ld\n", (uint64_t)list[r].bytes);
+
+	fflush(stderr);
+
+	// backtrace
+	int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);
+	backtrace_symbols_fd(Grid_backtrace_buffer,symbols, 2);
+
+	exit(1);
+      }
+    }
+  }
+
+  checksum_index += 1;
+#endif
 
   list.resize(0); // Delete the list
   this->HostBufferFreeAll(); // Clean up the buffer allocs
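Taken together, the Prepare/Poll/Complete hooks define a small wire format: each message grows by one trailing uint64_t carrying a device-computed checksum of the payload, salted with the running checksum_index epoch and the MPI tag, so a stale or misrouted (yet internally intact) buffer still fails the compare. A host-side sketch of that framing with illustrative helper names (device_checksum stands in for checksum_gpu; nothing here is verbatim from the commit):

  // Sketch of the GRID_CHECKSUM_COMMS framing; payload_bytes must be a multiple of 8.
  uint64_t device_checksum(const uint64_t *payload, uint64_t words); // stand-in for checksum_gpu

  void seal(char *host_buf, const uint64_t *dev_payload, uint64_t payload_bytes,
            uint64_t checksum_index, uint64_t tag) {
    uint64_t xsum = device_checksum(dev_payload, payload_bytes / 8)
                  ^ (checksum_index + 1 + 1000 * tag);  // per-epoch, per-tag salt
    *(uint64_t *)(host_buf + payload_bytes) = xsum;     // 8-byte trailer; send payload_bytes + 8
  }

  bool verify(const char *host_buf, const uint64_t *dev_payload,
              uint64_t payload_bytes, uint64_t checksum_index, uint64_t tag) {
    uint64_t expected = *(const uint64_t *)(host_buf + payload_bytes);
    uint64_t computed = device_checksum(dev_payload, payload_bytes / 8)
                      ^ (checksum_index + 1 + 1000 * tag);
    return expected == computed;                        // mismatch => corrupted transfer
  }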
@@ -43,10 +43,6 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #define GRID_SYCL_LEVEL_ZERO_IPC
 #define SHM_SOCKETS
 #else
-#ifdef HAVE_NUMAIF_H
-#warning " Using NUMAIF "
-#include <numaif.h>
-#endif
 #endif
 #include <syscall.h>
 #endif
@@ -202,7 +202,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<
 {
   auto buffer_p = & buffer[0];
   auto table = MapCshiftTable();
-  autoView( rhs_v, rhs, AcceleratorWrite);
+  autoView( rhs_v, rhs, AcceleratorWriteDiscard);
   accelerator_for(i,ent,vobj::Nsimd(),{
     coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
   });
@@ -228,7 +228,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
   if(cbmask ==0x3 ) {
     int _slice_stride = rhs.Grid()->_slice_stride[dimension];
     int _slice_block = rhs.Grid()->_slice_block[dimension];
-    autoView( rhs_v , rhs, AcceleratorWrite);
+    autoView( rhs_v , rhs, AcceleratorWriteDiscard);
     accelerator_for(nn,e1*e2,1,{
       int n = nn%e1;
       int b = nn/e1;
@@ -302,7 +302,7 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
 {
   auto table = MapCshiftTable();
   autoView(rhs_v , rhs, AcceleratorRead);
-  autoView(lhs_v , lhs, AcceleratorWrite);
+  autoView(lhs_v , lhs, AcceleratorWriteDiscard);
   accelerator_for(i,ent,vobj::Nsimd(),{
     coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
   });
@@ -29,8 +29,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef _GRID_CSHIFT_MPI_H_
 #define _GRID_CSHIFT_MPI_H_
 
 
 NAMESPACE_BEGIN(Grid);
 
+#ifdef GRID_CHECKSUM_COMMS
+extern uint64_t checksum_index;
+#endif
+
 const int Cshift_verbose=0;
 template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
 {
@@ -126,8 +130,9 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
   static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
   static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
 #ifndef ACCELERATOR_AWARE_MPI
-  static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
-  static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
+  int pad = (8 + sizeof(vobj) - 1) / sizeof(vobj);
+  static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size+pad);
+  static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size+pad);
 #endif
 
   int cb= (cbmask==0x2)? Odd : Even;
@@ -180,20 +185,39 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
 #else
     // bouncy bouncy
     acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
 
+#ifdef GRID_CHECKSUM_COMMS
+    assert(bytes % 8 == 0);
+    checksum_index++;
+    uint64_t xsum = checksum_gpu((uint64_t*)&send_buf[0], bytes / 8) ^ (1 + checksum_index);
+    *(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = xsum;
+    bytes += 8;
+#endif
+
     grid->SendToRecvFrom((void *)&hsend_buf[0],
 			 xmit_to_rank,
 			 (void *)&hrecv_buf[0],
 			 recv_from_rank,
 			 bytes);
 
+#ifdef GRID_CHECKSUM_COMMS
+    bytes -= 8;
     acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
+    uint64_t expected_cs = *(uint64_t*)(((char*)&hrecv_buf[0]) + bytes);
+    uint64_t computed_cs = checksum_gpu((uint64_t*)&recv_buf[0], bytes / 8) ^ (1 + checksum_index);
     std::cout << GridLogComms<< " Cshift: "
 	      <<" dim"<<dimension
 	      <<" shift "<<shift
 	      << " rank "<< grid->ThisRank()
 	      <<" Coor "<<grid->ThisProcessorCoor()
-	      <<" send "<<hsend_buf[0]<<" to "<<xmit_to_rank
-	      <<" recv "<<hrecv_buf[0]<<" from "<<recv_from_rank
+	      <<" send "<<xsum<<" to "<<xmit_to_rank
+	      <<" recv "<<computed_cs<<" from "<<recv_from_rank
 	      <<std::endl;
+    assert(expected_cs == computed_cs);
+#else
+    acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
+#endif
+
 #endif
     FlightRecorder::StepLog("Cshift_SendRecv_complete");
@@ -264,8 +288,16 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
     recv_buf_extract[s].resize(buffer_size);
   }
 #ifndef ACCELERATOR_AWARE_MPI
-  hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
-  hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
+#ifdef GRID_CHECKSUM_COMMS
+  buffer_size += (8 + sizeof(vobj) - 1) / sizeof(vobj);
+#endif
+
+  static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
+  static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
+
+#ifdef GRID_CHECKSUM_COMMS
+  buffer_size -= (8 + sizeof(vobj) - 1) / sizeof(vobj);
+#endif
 #endif
 
   int bytes = buffer_size*sizeof(scalar_object);
@@ -328,21 +360,37 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
 #else
       // bouncy bouncy
       acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
+#ifdef GRID_CHECKSUM_COMMS
+      assert(bytes % 8 == 0);
+      checksum_index++;
+      uint64_t xsum = checksum_gpu((uint64_t*)send_buf_extract_mpi, bytes / 8) ^ (1 + checksum_index);
+      *(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = xsum;
+      bytes += 8;
+#endif
       grid->SendToRecvFrom((void *)&hsend_buf[0],
 			   xmit_to_rank,
 			   (void *)&hrecv_buf[0],
 			   recv_from_rank,
 			   bytes);
+#ifdef GRID_CHECKSUM_COMMS
+      bytes -= 8;
       acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
+      uint64_t expected_cs = *(uint64_t*)(((char*)&hrecv_buf[0]) + bytes);
+      uint64_t computed_cs = checksum_gpu((uint64_t*)recv_buf_extract_mpi, bytes / 8) ^ (1 + checksum_index);
+
       std::cout << GridLogComms<< " Cshift_comms_simd: "
 		<<" dim"<<dimension
 		<<" shift "<<shift
 		<< " rank "<< grid->ThisRank()
 		<<" Coor "<<grid->ThisProcessorCoor()
-		<<" send "<<hsend_buf[0]<<" to "<<xmit_to_rank
-		<<" recv "<<hrecv_buf[0]<<" from "<<recv_from_rank
+		<<" send "<<xsum<<" to "<<xmit_to_rank
+		<<" recv "<<computed_cs<<" from "<<recv_from_rank
 		<<std::endl;
+      assert(expected_cs == computed_cs);
+#else
+      acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
+#endif
+
 #endif
 
       xbytes+=bytes;
@@ -87,6 +87,25 @@ template<class Word> Word svm_xor(Word *vec,uint64_t L)
   theGridAccelerator->wait();
   return ret;
 }
+template<class Word> Word checksum_gpu(Word *vec,uint64_t L)
+{
+  Word identity;  identity=0;
+  Word ret = 0;
+  {
+    sycl::buffer<Word, 1> abuff(&ret, {1});
+    theGridAccelerator->submit([&](sycl::handler &cgh) {
+      auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
+      cgh.parallel_for(sycl::range<1>{L},
+		       Reduction,
+		       [=] (sycl::id<1> index, auto &sum) {
+			 auto l = index % 61;
+			 sum ^= vec[index]<<l | vec[index]>>(64-l);
+		       });
+    });
+  }
+  theGridAccelerator->wait();
+  return ret;
+}
 
 NAMESPACE_END(Grid);
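A plain XOR reduction would be order-insensitive, so two buffers with transposed words would collide; rotating each word by index % 61 before XOR makes the sum position-sensitive at negligible cost. A scalar reference sketch for cross-checking the kernel against a host copy (test scaffolding is illustrative; the rotation is guarded for l == 0, where a 64-bit shift would be undefined):

  // Reference loop matching the SYCL reduction above.
  uint64_t checksum_ref(const uint64_t *v, uint64_t L) {
    uint64_t sum = 0;
    for (uint64_t i = 0; i < L; i++) {
      uint64_t l = i % 61;
      sum ^= l ? ((v[i] << l) | (v[i] >> (64 - l))) : v[i];
    }
    return sum;
  }
  // e.g. assert(checksum_ref(host_copy, L) == checksum_gpu(device_ptr, L));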
@@ -106,6 +106,47 @@ public:
   }
 
 };
 
+#ifdef GRID_LOG_VIEWS
+// Little autoscope assister
+template<class View>
+class ViewCloser
+{
+  View v;  // Take a copy of view and call view close when I go out of scope automatically
+  const char* filename; int line, mode;
+ public:
+  ViewCloser(View &_v, const char* _filename, int _line, int _mode) :
+    v(_v), filename(_filename), line(_line), mode(_mode) {
+
+    switch (mode){
+    case AcceleratorRead:
+    case AcceleratorWrite:
+    case CpuRead:
+    case CpuWrite:
+      ViewLogger::Log(filename, line, 1, mode, &v[0], v.size() * sizeof(v[0]));
+      break;
+    }
+  };
+  ~ViewCloser() {
+
+    switch (mode) {
+    case AcceleratorWriteDiscard:
+    case AcceleratorWrite:
+    case CpuWrite:
+      ViewLogger::Log(filename, line, -1, mode, &v[0], v.size() * sizeof(v[0]));
+      break;
+    }
+
+    v.ViewClose();
+  }
+};
+
+#define autoView(l_v,l,mode)						\
+  auto l_v = l.View(mode);						\
+  ViewCloser<decltype(l_v)> _autoView##l_v(l_v,__FILE__,__LINE__,mode);
+
+#else
 // Little autoscope assister
 template<class View>
 class ViewCloser
@@ -119,6 +160,7 @@ class ViewCloser
 #define autoView(l_v,l,mode)						\
   auto l_v = l.View(mode);						\
   ViewCloser<decltype(l_v)> _autoView##l_v(l_v);
+#endif
 
 /////////////////////////////////////////////////////////////////////////////////////////
 // Lattice expression types used by ET to assemble the AST
@@ -57,7 +57,7 @@ public:
 
   {
     // RealD eps = 1.0;
-    std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
+    // std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
     std::vector<Coeff_t> zgamma(this->Ls);
     for(int s=0;s<this->Ls;s++){
       zgamma[s] = gamma[s];
@@ -535,7 +535,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
   {
     autoView(U_v  ,  U, AcceleratorRead);
     autoView(in_v ,  in, AcceleratorRead);
-    autoView(out_v, out, AcceleratorWrite);
+    autoView(out_v, out, AcceleratorWriteDiscard);
     autoView(st_v ,  st, AcceleratorRead);
     KERNEL_CALL_ID(GenericDhopSite);
   }
@@ -118,7 +118,7 @@ protected:
 			    GaugeK);   // derivative of SmearBase
     return SigmaK;
   }
-
+ public:
   /*! @brief Returns smeared configuration at level 'Level' */
   const GaugeField &get_smeared_conf(int Level) const
   {
@@ -819,7 +819,7 @@ public:
     } // if smearingLevels = 0 do nothing
   }
 
-private:
+public:
   //====================================================================
   // Override base clas here to mask it
   virtual void fill_smearedSet(GaugeField &U)
@@ -192,6 +192,11 @@ public:
 
   void ViewClose(void) { }
 
+#ifdef GRID_LOG_VIEWS
+  size_t size() { return 0; };
+  uint64_t & operator[](size_t i) { static uint64_t v=0; return v; };
+#endif
+
 };
 
 ////////////////////////////////////////
@@ -372,4 +372,53 @@ void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
   }
 }
 
+#ifdef GRID_LOG_VIEWS
+
+bool ViewLogger::Enabled = false;
+std::vector<ViewLogger::Entry_t> ViewLogger::LogVector;
+
+void ViewLogger::Begin() { Enabled = true; LogVector.resize(0); }
+void ViewLogger::End() { Enabled = false; }
+
+void ViewLogger::Log(const char* filename, int line, int index, int mode, void* data, uint64_t bytes)
+{
+  if (!Enabled)
+    return;
+
+  size_t i = LogVector.size();
+  LogVector.resize(i + 1);
+  auto & n = LogVector[i];
+
+  n.filename = filename;
+  n.line = line;
+  n.index = index;
+
+  if (bytes < sizeof(uint64_t)) {
+
+    n.head = n.tail = 0;
+
+  } else {
+
+    switch (mode) {
+    case AcceleratorRead:
+    case AcceleratorWrite:
+    case AcceleratorWriteDiscard:
+      acceleratorCopyFromDevice((char*)data, &n.head, sizeof(uint64_t));
+      acceleratorCopyFromDevice((char*)data + bytes - sizeof(uint64_t), &n.tail, sizeof(uint64_t));
+      break;
+
+    case CpuRead:
+    case CpuWrite:
+      //case CpuWriteDiscard:
+      n.head = *(uint64_t*)data;
+      n.tail = *(uint64_t*)((char*)data + bytes - sizeof(uint64_t));
+      break;
+    }
+  }
+}
+
+#endif
+
 NAMESPACE_END(Grid);
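One way this logger might be driven: bracket a suspect region, run it twice (or on two ranks), and diff the recorded head/tail words per view event to localize where results first diverge. A sketch, assuming a GRID_LOG_VIEWS build (the printing loop is illustrative, not part of the commit):

  ViewLogger::Begin();                  // start recording autoView opens/closes
  // ... run the kernel(s) under suspicion ...
  ViewLogger::End();
  for (auto &e : ViewLogger::LogVector) // index +1 = view open, -1 = view close
    printf("%s:%d %+d head=%016llx tail=%016llx\n",
           e.filename, e.line, e.index,
           (unsigned long long)e.head, (unsigned long long)e.tail);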
@@ -42,5 +42,22 @@ class FlightRecorder {
   static void xmitLog(void *,uint64_t bytes);
   static void recvLog(void *,uint64_t bytes,int rank);
 };
+#ifdef GRID_LOG_VIEWS
+class ViewLogger {
+  struct Entry_t {
+    const char* filename;
+    int line;
+    int index;
+    uint64_t head, tail;
+  };
+
+ public:
+  static bool Enabled;
+  static std::vector<Entry_t> LogVector;
+  static void Begin();
+  static void End();
+  static void Log(const char* filename, int line, int index, int mode, void* data, uint64_t bytes);
+};
+#endif
 NAMESPACE_END(Grid);
|
@@ -416,12 +416,28 @@ void Grid_init(int *argc,char ***argv)
|
||||
} else {
|
||||
FILE *fp;
|
||||
std::ostringstream fname;
|
||||
|
||||
int rank = CartesianCommunicator::RankWorld();
|
||||
int radix=64;
|
||||
char* root = getenv("GRID_STDOUT_ROOT");
|
||||
if (root) {
|
||||
fname << root ;
|
||||
mkdir(fname.str().c_str(), S_IRWXU );
|
||||
fname << "/";
|
||||
}
|
||||
fname << (rank/radix)*radix ;
|
||||
mkdir(fname.str().c_str(), S_IRWXU );
|
||||
fname << "/";
|
||||
fname<<"Grid.stdout.";
|
||||
fname<<CartesianCommunicator::RankWorld();
|
||||
fp=freopen(fname.str().c_str(),"w",stdout);
|
||||
assert(fp!=(FILE *)NULL);
|
||||
|
||||
std::ostringstream ename;
|
||||
if (root){
|
||||
ename << root << "/";
|
||||
}
|
||||
ename << (rank/radix)*radix << "/";
|
||||
ename<<"Grid.stderr.";
|
||||
ename<<CartesianCommunicator::RankWorld();
|
||||
fp=freopen(ename.str().c_str(),"w",stderr);
|
||||
|
configure.ac
@@ -222,7 +222,7 @@ esac
 
 ############### Symplectic group
 AC_ARG_ENABLE([Sp],
-              [AC_HELP_STRING([--enable-Sp=yes|no], [enable gauge group Sp2n])],
+              [AS_HELP_STRING([--enable-Sp=yes|no],[enable gauge group Sp2n])],
               [ac_ENABLE_SP=${enable_Sp}], [ac_ENABLE_SP=no])
 
 AM_CONDITIONAL(BUILD_SP, [ test "${ac_ENABLE_SP}X" == "yesX" ])
@@ -263,6 +263,28 @@ case ${ac_ACCELERATOR_AWARE_MPI} in
   *);;
 esac
 
+############### CHECKSUM COMMS
+AC_ARG_ENABLE([checksum-comms],
+              [AS_HELP_STRING([--enable-checksum-comms=yes|no],[checksum all communication])],
+              [ac_CHECKSUM_COMMS=${enable_checksum_comms}], [ac_CHECKSUM_COMMS=yes])
+
+case ${ac_CHECKSUM_COMMS} in
+  yes)
+    AC_DEFINE([GRID_CHECKSUM_COMMS],[1],[checksum all communication]);;
+  *);;
+esac
+
+############### LOG VIEWS
+AC_ARG_ENABLE([log-views],
+              [AS_HELP_STRING([--enable-log-views=yes|no],[log information on all view open/close])],
+              [ac_LOG_VIEWS=${enable_log_views}], [ac_LOG_VIEWS=yes])
+
+case ${ac_LOG_VIEWS} in
+  yes)
+    AC_DEFINE([GRID_LOG_VIEWS],[1],[log information on all view open/close]);;
+  *);;
+esac
+
 ############### SYCL/CUDA/HIP/none
 AC_ARG_ENABLE([accelerator],
               [AS_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none],[enable none,cuda,sycl,hip acceleration])],
@@ -10,10 +10,11 @@ export UNWIND=`spack find --paths libunwind | grep ^libunwind | awk '{print $2 
 	--disable-gparity \
 	--disable-fermion-reps \
 	--enable-shm=nvlink \
+	--enable-checksum-comms=yes \
+	--enable-log-views=yes \
 	--enable-accelerator=sycl \
 	--enable-accelerator-aware-mpi=no \
 	--enable-unified=no \
-	--enable-debug \
 	--with-lime=$CLIME \
 	--with-gmp=$GMP \
 	--with-mpfr=$MPFR \