mirror of
https://github.com/paboyle/Grid.git
synced 2026-04-03 10:36:10 +01:00
Merge with Christoph GPT checksum debug
This commit is contained in:
@@ -65,6 +65,7 @@ NAMESPACE_BEGIN(Grid);
|
||||
#endif
|
||||
|
||||
enum GridBLASOperation_t { GridBLAS_OP_N, GridBLAS_OP_T, GridBLAS_OP_C } ;
|
||||
enum GridBLASPrecision_t { GridBLAS_PRECISION_DEFAULT, GridBLAS_PRECISION_16F, GridBLAS_PRECISION_16BF, GridBLAS_PRECISION_TF32 };
|
||||
|
||||
class GridBLAS {
|
||||
public:
|
||||
@@ -97,7 +98,21 @@ public:
|
||||
gridblasInit=1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#ifdef GRID_CUDA
  // Translate a GridBLAS precision request into the corresponding cuBLAS
  // compute type. Only the reduced-precision fast-math modes are mapped
  // here; GridBLAS_PRECISION_DEFAULT is deliberately not handled -- callers
  // use the ordinary fixed-precision cuBLAS entry points for that case.
  cublasComputeType_t toDataType(GridBLASPrecision_t p) {
    switch (p) {
    case GridBLAS_PRECISION_16F:
      return CUBLAS_COMPUTE_32F_FAST_16F;
    case GridBLAS_PRECISION_16BF:
      return CUBLAS_COMPUTE_32F_FAST_16BF;
    case GridBLAS_PRECISION_TF32:
      return CUBLAS_COMPUTE_32F_FAST_TF32;
    default:
      assert(0);
      // Unreachable when asserts are enabled; without this return the
      // function falls off the end (UB) in NDEBUG builds.
      return CUBLAS_COMPUTE_32F;
    }
  }
#endif
|
||||
// Force construct once
|
||||
GridBLAS() { Init(); };
|
||||
~GridBLAS() { };
|
||||
@@ -138,8 +153,10 @@ public:
|
||||
deviceVector<ComplexD*> &Amk, // pointer list to matrices
|
||||
deviceVector<ComplexD*> &Bkn,
|
||||
ComplexD beta,
|
||||
deviceVector<ComplexD*> &Cmn)
|
||||
deviceVector<ComplexD*> &Cmn,
|
||||
GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
|
||||
{
|
||||
assert(precision == GridBLAS_PRECISION_DEFAULT);
|
||||
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
|
||||
m,n,k,
|
||||
alpha,
|
||||
@@ -201,8 +218,10 @@ public:
|
||||
deviceVector<ComplexD*> &Amk, // pointer list to matrices
|
||||
deviceVector<ComplexD*> &Bkn,
|
||||
ComplexD beta,
|
||||
deviceVector<ComplexD*> &Cmn)
|
||||
deviceVector<ComplexD*> &Cmn,
|
||||
GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
|
||||
{
|
||||
assert(precision == GridBLAS_PRECISION_DEFAULT);
|
||||
RealD t2=usecond();
|
||||
int32_t batchCount = Amk.size();
|
||||
assert(Bkn.size()==batchCount);
|
||||
@@ -448,7 +467,8 @@ public:
|
||||
deviceVector<ComplexF*> &Amk, // pointer list to matrices
|
||||
deviceVector<ComplexF*> &Bkn,
|
||||
ComplexF beta,
|
||||
deviceVector<ComplexF*> &Cmn)
|
||||
deviceVector<ComplexF*> &Cmn,
|
||||
GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
|
||||
{
|
||||
RealD t2=usecond();
|
||||
int32_t batchCount = Amk.size();
|
||||
@@ -473,6 +493,7 @@ public:
|
||||
assert(Bkn.size()==batchCount);
|
||||
assert(Cmn.size()==batchCount);
|
||||
#ifdef GRID_HIP
|
||||
assert(precision == GridBLAS_PRECISION_DEFAULT);
|
||||
hipblasOperation_t hOpA;
|
||||
hipblasOperation_t hOpB;
|
||||
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
|
||||
@@ -503,50 +524,67 @@ public:
|
||||
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
|
||||
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
|
||||
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
|
||||
auto err = cublasCgemmBatched(gridblasHandle,
|
||||
hOpA,
|
||||
hOpB,
|
||||
m,n,k,
|
||||
(cuComplex *) &alpha_p[0],
|
||||
(cuComplex **)&Amk[0], lda,
|
||||
(cuComplex **)&Bkn[0], ldb,
|
||||
(cuComplex *) &beta_p[0],
|
||||
(cuComplex **)&Cmn[0], ldc,
|
||||
batchCount);
|
||||
cublasStatus_t err;
|
||||
if (precision == GridBLAS_PRECISION_DEFAULT) {
|
||||
err = cublasCgemmBatched(gridblasHandle,
|
||||
hOpA,
|
||||
hOpB,
|
||||
m,n,k,
|
||||
(cuComplex *) &alpha_p[0],
|
||||
(cuComplex **)&Amk[0], lda,
|
||||
(cuComplex **)&Bkn[0], ldb,
|
||||
(cuComplex *) &beta_p[0],
|
||||
(cuComplex **)&Cmn[0], ldc,
|
||||
batchCount);
|
||||
} else {
|
||||
cublasComputeType_t compute_precision = toDataType(precision);
|
||||
err = cublasGemmBatchedEx(gridblasHandle,
|
||||
hOpA,
|
||||
hOpB,
|
||||
m,n,k,
|
||||
(void *) &alpha_p[0],
|
||||
(void **)&Amk[0], CUDA_C_32F, lda,
|
||||
(void **)&Bkn[0], CUDA_C_32F, ldb,
|
||||
(void *) &beta_p[0],
|
||||
(void **)&Cmn[0], CUDA_C_32F, ldc,
|
||||
batchCount, compute_precision, CUBLAS_GEMM_DEFAULT);
|
||||
}
|
||||
assert(err==CUBLAS_STATUS_SUCCESS);
|
||||
#endif
|
||||
#ifdef GRID_SYCL
|
||||
int64_t m64=m;
|
||||
int64_t n64=n;
|
||||
int64_t k64=k;
|
||||
int64_t lda64=lda;
|
||||
int64_t ldb64=ldb;
|
||||
int64_t ldc64=ldc;
|
||||
int64_t batchCount64=batchCount;
|
||||
|
||||
oneapi::mkl::transpose iOpA;
|
||||
oneapi::mkl::transpose iOpB;
|
||||
|
||||
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
|
||||
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
|
||||
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
|
||||
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
|
||||
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
|
||||
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
|
||||
|
||||
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
|
||||
&iOpA,
|
||||
&iOpB,
|
||||
&m64,&n64,&k64,
|
||||
(ComplexF *) &alpha_p[0],
|
||||
(const ComplexF **)&Amk[0], (const int64_t *)&lda64,
|
||||
(const ComplexF **)&Bkn[0], (const int64_t *)&ldb64,
|
||||
(ComplexF *) &beta_p[0],
|
||||
(ComplexF **)&Cmn[0], (const int64_t *)&ldc64,
|
||||
(int64_t)1,&batchCount64,std::vector<sycl::event>());
|
||||
assert(precision == GridBLAS_PRECISION_DEFAULT);
|
||||
int64_t m64=m;
|
||||
int64_t n64=n;
|
||||
int64_t k64=k;
|
||||
int64_t lda64=lda;
|
||||
int64_t ldb64=ldb;
|
||||
int64_t ldc64=ldc;
|
||||
int64_t batchCount64=batchCount;
|
||||
|
||||
oneapi::mkl::transpose iOpA;
|
||||
oneapi::mkl::transpose iOpB;
|
||||
|
||||
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
|
||||
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
|
||||
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
|
||||
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
|
||||
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
|
||||
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
|
||||
|
||||
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
|
||||
&iOpA,
|
||||
&iOpB,
|
||||
&m64,&n64,&k64,
|
||||
(ComplexF *) &alpha_p[0],
|
||||
(const ComplexF **)&Amk[0], (const int64_t *)&lda64,
|
||||
(const ComplexF **)&Bkn[0], (const int64_t *)&ldb64,
|
||||
(ComplexF *) &beta_p[0],
|
||||
(ComplexF **)&Cmn[0], (const int64_t *)&ldc64,
|
||||
(int64_t)1,&batchCount64,std::vector<sycl::event>());
|
||||
synchronise();
|
||||
#endif
|
||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
||||
assert(precision == GridBLAS_PRECISION_DEFAULT);
|
||||
// Need a default/reference implementation; use Eigen
|
||||
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
@@ -946,6 +984,336 @@ public:
|
||||
RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
|
||||
}
|
||||
|
||||
/*
|
||||
Inverse and Determinant
|
||||
|
||||
- CPU version uses Eigen
|
||||
- GPU version uses LAPACK-compatible getrf / getri
|
||||
|
||||
Design comment: Eigen does not expose getrf / getri in a LAPACK compatible manner.
|
||||
Overhead to go through getrf / getri for CPU version too large.
|
||||
Current interface therefore only guarantees the inverse and determinant
|
||||
functions on all platforms but not the getrf / getri ones.
|
||||
*/
|
||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
||||
|
||||
void inverseBatched(int64_t n,
|
||||
deviceVector<ComplexD*> &Ann,
|
||||
deviceVector<ComplexD*> &Cnn) {
|
||||
|
||||
int64_t batchCount = Ann.size();
|
||||
assert(batchCount == Cnn.size());
|
||||
thread_for(p,batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAnn(Ann[p],n,n);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCnn(Cnn[p],n,n);
|
||||
eCnn = eAnn.inverse();
|
||||
});
|
||||
}
|
||||
|
||||
void inverseBatched(int64_t n,
|
||||
deviceVector<ComplexF*> &Ann,
|
||||
deviceVector<ComplexF*> &Cnn) {
|
||||
|
||||
int64_t batchCount = Ann.size();
|
||||
assert(batchCount == Cnn.size());
|
||||
thread_for(p,batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAnn(Ann[p],n,n);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCnn(Cnn[p],n,n);
|
||||
eCnn = eAnn.inverse();
|
||||
});
|
||||
}
|
||||
|
||||
void determinantBatched(int64_t n,
|
||||
deviceVector<ComplexD*> &Ann,
|
||||
deviceVector<ComplexD*> &C) {
|
||||
|
||||
int64_t batchCount = Ann.size();
|
||||
assert(batchCount == C.size());
|
||||
thread_for(p,batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAnn(Ann[p],n,n);
|
||||
*C[p] = eAnn.determinant();
|
||||
});
|
||||
}
|
||||
|
||||
void determinantBatched(int64_t n,
|
||||
deviceVector<ComplexF*> &Ann,
|
||||
deviceVector<ComplexF*> &C) {
|
||||
|
||||
int64_t batchCount = Ann.size();
|
||||
assert(batchCount == C.size());
|
||||
thread_for(p,batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAnn(Ann[p],n,n);
|
||||
*C[p] = eAnn.determinant();
|
||||
});
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#ifdef GRID_SYCL
  // SYCL/oneMKL batched LU factorisation (getrf): factorise batchCount
  // n x n matrices in place in Ann, writing n pivot indices per matrix
  // into the flat ipiv array.
  // NOTE(review): the info vector is never written on this path -- confirm
  // callers do not inspect it when built for SYCL.
  template<typename T>
  void getrfBatchedSYCL(int64_t n,
			deviceVector<T*> &Ann,
			deviceVector<int64_t> &ipiv,
			deviceVector<int64_t> &info) {

    int64_t batchCount = Ann.size();

    // Persistent scratch buffer, grown on demand to the size MKL requests.
    static deviceVector<T> scratchpad;
    int64_t needed = oneapi::mkl::lapack::getrf_batch_scratchpad_size<T>(*gridblasHandle, &n, &n, &n, (int64_t)1, &batchCount);
    if (needed > scratchpad.size())
      scratchpad.resize(needed);

    // MKL takes an array of per-matrix pivot pointers; build it on device,
    // matrix i owning entries [i*n, (i+1)*n) of the flat ipiv storage.
    static deviceVector<int64_t*> _ipiv;
    if (batchCount > _ipiv.size())
      _ipiv.resize(batchCount);
    int64_t** p_ipiv = &_ipiv[0];
    int64_t*  pipiv  = &ipiv[0];
    accelerator_for(i, batchCount, 1, { p_ipiv[i] = &pipiv[i*n]; });

    oneapi::mkl::lapack::getrf_batch(*gridblasHandle,
				     &n, &n,
				     (T **)&Ann[0],
				     &n,
				     p_ipiv,
				     (int64_t)1, &batchCount,
				     (T*)&scratchpad[0], (int64_t)scratchpad.size(),
				     std::vector<sycl::event>());
    synchronise();
  }
#endif
|
||||
|
||||
void getrfBatched(int64_t n,
|
||||
deviceVector<ComplexD*> &Ann,
|
||||
deviceVector<int64_t> &ipiv,
|
||||
deviceVector<int64_t> &info)
|
||||
{
|
||||
int64_t batchCount = Ann.size();
|
||||
assert(ipiv.size()==batchCount*n);
|
||||
assert(info.size()==batchCount);
|
||||
|
||||
#ifdef GRID_HIP
|
||||
auto err = hipblasZgetrfBatched(gridblasHandle,(int)n,
|
||||
(hipblasDoubleComplex **)&Ann[0], (int)n,
|
||||
(int*) &ipiv[0],
|
||||
(int*) &info[0],
|
||||
(int)batchCount);
|
||||
assert(err==HIPBLAS_STATUS_SUCCESS);
|
||||
#endif
|
||||
#ifdef GRID_CUDA
|
||||
auto err = cublasZgetrfBatched(gridblasHandle, (int)n,
|
||||
(cuDoubleComplex **)&Ann[0], (int)n,
|
||||
(int*) &ipiv[0],
|
||||
(int*) &info[0],
|
||||
(int)batchCount);
|
||||
assert(err==CUBLAS_STATUS_SUCCESS);
|
||||
#endif
|
||||
#ifdef GRID_SYCL
|
||||
getrfBatchedSYCL(n, Ann, ipiv, info);
|
||||
#endif
|
||||
}
|
||||
|
||||
void getrfBatched(int64_t n,
|
||||
deviceVector<ComplexF*> &Ann,
|
||||
deviceVector<int64_t> &ipiv,
|
||||
deviceVector<int64_t> &info)
|
||||
{
|
||||
int64_t batchCount = Ann.size();
|
||||
assert(ipiv.size()==batchCount*n);
|
||||
assert(info.size()==batchCount);
|
||||
|
||||
#ifdef GRID_HIP
|
||||
auto err = hipblasCgetrfBatched(gridblasHandle,(int)n,
|
||||
(hipblasComplex **)&Ann[0], (int)n,
|
||||
(int*) &ipiv[0],
|
||||
(int*) &info[0],
|
||||
(int)batchCount);
|
||||
assert(err==HIPBLAS_STATUS_SUCCESS);
|
||||
#endif
|
||||
#ifdef GRID_CUDA
|
||||
auto err = cublasCgetrfBatched(gridblasHandle, (int)n,
|
||||
(cuComplex **)&Ann[0], (int)n,
|
||||
(int*) &ipiv[0],
|
||||
(int*) &info[0],
|
||||
(int)batchCount);
|
||||
assert(err==CUBLAS_STATUS_SUCCESS);
|
||||
#endif
|
||||
#ifdef GRID_SYCL
|
||||
getrfBatchedSYCL(n, Ann, ipiv, info);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef GRID_SYCL
  // SYCL/oneMKL batched inversion from an existing LU factorisation
  // (getri). MKL forms the inverse in place in Ann; the result is then
  // copied element-wise into Cnn.
  // NOTE(review): the info vector is never written on this path -- confirm
  // callers do not inspect it when built for SYCL.
  template<typename T>
  void getriBatchedSYCL(int64_t n,
			deviceVector<T*> &Ann,
			deviceVector<int64_t> &ipiv,
			deviceVector<int64_t> &info,
			deviceVector<T*> &Cnn) {

    int64_t batchCount = Ann.size();

    // Persistent scratch buffer, grown on demand to the size MKL requests.
    static deviceVector<T> scratchpad;
    int64_t needed = oneapi::mkl::lapack::getri_batch_scratchpad_size<T>(*gridblasHandle, &n, &n, (int64_t)1, &batchCount);
    if (needed > scratchpad.size())
      scratchpad.resize(needed);

    // Per-matrix pivot pointers into the flat ipiv array, built on device.
    static deviceVector<int64_t*> _ipiv;
    if (batchCount > _ipiv.size())
      _ipiv.resize(batchCount);
    int64_t** p_ipiv = &_ipiv[0];
    int64_t*  pipiv  = &ipiv[0];
    accelerator_for(i, batchCount, 1, { p_ipiv[i] = &pipiv[i*n]; });

    oneapi::mkl::lapack::getri_batch(*gridblasHandle,
				     &n,
				     (T **)&Ann[0],
				     &n,
				     (int64_t**)p_ipiv,
				     (int64_t)1, &batchCount,
				     (T *)&scratchpad[0], (int64_t)scratchpad.size(),
				     std::vector<sycl::event>());

    synchronise();

    // Replicate the in-place inverse from Ann into the caller's Cnn.
    T** src = &Ann[0];
    T** dst = &Cnn[0];
    accelerator_for(i, batchCount*n*n, 1, {
	auto elem = i / batchCount;  // element index within one matrix
	auto mat  = i % batchCount;  // which matrix of the batch
	dst[mat][elem] = src[mat][elem];
      });
  }

#endif
|
||||
|
||||
void getriBatched(int64_t n,
|
||||
deviceVector<ComplexD*> &Ann,
|
||||
deviceVector<int64_t> &ipiv,
|
||||
deviceVector<int64_t> &info,
|
||||
deviceVector<ComplexD*> &Cnn)
|
||||
{
|
||||
int64_t batchCount = Ann.size();
|
||||
assert(ipiv.size()==batchCount*n);
|
||||
assert(info.size()==batchCount);
|
||||
assert(Cnn.size()==batchCount);
|
||||
|
||||
#ifdef GRID_HIP
|
||||
auto err = hipblasZgetriBatched(gridblasHandle,(int)n,
|
||||
(hipblasDoubleComplex **)&Ann[0], (int)n,
|
||||
(int*) &ipiv[0],
|
||||
(hipblasDoubleComplex **)&Cnn[0], (int)n,
|
||||
(int*) &info[0],
|
||||
(int)batchCount);
|
||||
assert(err==HIPBLAS_STATUS_SUCCESS);
|
||||
#endif
|
||||
#ifdef GRID_CUDA
|
||||
auto err = cublasZgetriBatched(gridblasHandle, (int)n,
|
||||
(cuDoubleComplex **)&Ann[0], (int)n,
|
||||
(int*) &ipiv[0],
|
||||
(cuDoubleComplex **)&Cnn[0], (int)n,
|
||||
(int*) &info[0],
|
||||
(int)batchCount);
|
||||
assert(err==CUBLAS_STATUS_SUCCESS);
|
||||
#endif
|
||||
#ifdef GRID_SYCL
|
||||
getriBatchedSYCL(n, Ann, ipiv, info, Cnn);
|
||||
#endif
|
||||
}
|
||||
|
||||
void getriBatched(int64_t n,
|
||||
deviceVector<ComplexF*> &Ann,
|
||||
deviceVector<int64_t> &ipiv,
|
||||
deviceVector<int64_t> &info,
|
||||
deviceVector<ComplexF*> &Cnn)
|
||||
{
|
||||
int64_t batchCount = Ann.size();
|
||||
assert(ipiv.size()==batchCount*n);
|
||||
assert(info.size()==batchCount);
|
||||
assert(Cnn.size()==batchCount);
|
||||
|
||||
#ifdef GRID_HIP
|
||||
auto err = hipblasCgetriBatched(gridblasHandle,(int)n,
|
||||
(hipblasComplex **)&Ann[0], (int)n,
|
||||
(int*) &ipiv[0],
|
||||
(hipblasComplex **)&Cnn[0], (int)n,
|
||||
(int*) &info[0],
|
||||
(int)batchCount);
|
||||
assert(err==HIPBLAS_STATUS_SUCCESS);
|
||||
#endif
|
||||
#ifdef GRID_CUDA
|
||||
auto err = cublasCgetriBatched(gridblasHandle, (int)n,
|
||||
(cuComplex **)&Ann[0], (int)n,
|
||||
(int*) &ipiv[0],
|
||||
(cuComplex **)&Cnn[0], (int)n,
|
||||
(int*) &info[0],
|
||||
(int)batchCount);
|
||||
assert(err==CUBLAS_STATUS_SUCCESS);
|
||||
#endif
|
||||
#ifdef GRID_SYCL
|
||||
getriBatchedSYCL(n, Ann, ipiv, info, Cnn);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename dtype>
|
||||
void inverseBatched(int64_t n,
|
||||
deviceVector<dtype*> &Ann, // this will be overwritten with LU decomposition
|
||||
deviceVector<dtype*> &Cnn // this will be overwritten with the inverse
|
||||
) {
|
||||
|
||||
int64_t batchCount = Ann.size();
|
||||
RealD t0 = usecond();
|
||||
deviceVector<int64_t> ipiv(batchCount*n);
|
||||
deviceVector<int64_t> info(batchCount);
|
||||
|
||||
//RealD t1 = usecond();
|
||||
getrfBatched(n, Ann, ipiv, info);
|
||||
// test info for non-invertibility? set to nan if yes?
|
||||
getriBatched(n, Ann, ipiv, info, Cnn);
|
||||
//synchronise();
|
||||
//RealD t2 = usecond();
|
||||
//std::cout << GridLogMessage << "Temp " << t1-t0 << " rf/ri " << t2-t1 << std::endl;
|
||||
}
|
||||
|
||||
template<typename dtype>
|
||||
void determinantBatched(int64_t n,
|
||||
deviceVector<dtype*> &Ann, // this will be overwritten with LU decomposition
|
||||
deviceVector<dtype*> &C // this will be overwritten with determinant
|
||||
) {
|
||||
|
||||
int64_t batchCount = Ann.size();
|
||||
//RealD t0 = usecond();
|
||||
deviceVector<int64_t> ipiv(batchCount*n);
|
||||
deviceVector<int64_t> info(batchCount);
|
||||
|
||||
dtype** pAnn = (dtype**)&Ann[0];
|
||||
dtype** pC = (dtype**)&C[0];
|
||||
#if defined(GRID_CUDA) || defined(GRID_HIP)
|
||||
int* pipiv = (int*)&ipiv[0];
|
||||
#else
|
||||
int64_t* pipiv = (int64_t*)&ipiv[0];
|
||||
#endif
|
||||
|
||||
//RealD t1 = usecond();
|
||||
getrfBatched(n, Ann, ipiv, info);
|
||||
//RealD t2 = usecond();
|
||||
accelerator_for(i,batchCount,1,{
|
||||
dtype det = 1.0;
|
||||
for (int64_t j=0;j<n;j++) {
|
||||
det *= pAnn[i][n*j + j];
|
||||
// branchless signs
|
||||
det *= (pipiv[i*n + j] == j+1) ? (1.0) : (-1.0);
|
||||
}
|
||||
*pC[i] = det;
|
||||
});
|
||||
|
||||
//RealD t3 = usecond();
|
||||
//std::cout << GridLogMessage << "Temp " << t1 - t0 << " rf/ri " << t2-t1 << "final" << t3 - t2 << std::endl;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
template<class CComplex>
|
||||
double benchmark(int M, int N, int K, int BATCH)
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user