mirror of https://github.com/paboyle/Grid.git synced 2025-11-10 00:29:32 +00:00

Compare commits


3 Commits

Author SHA1 Message Date
Peter Boyle
bffd30abec Optimise lie algebra project 2024-09-19 15:48:09 -04:00
Peter Boyle
da919949f9 Clean up the accelerator pick/set checkerboard 2024-08-23 12:34:41 -04:00
Peter Boyle
b12b4fdaff Attempt at operating on half checkerboard 2024-08-23 11:05:09 -04:00
429 changed files with 7045 additions and 15717 deletions

View File

@@ -12,13 +12,15 @@
#include <iostream>
#include <sys/time.h>
#define GRID_SYCL
#undef GRID_HIP
#undef GRID_CUDA
#ifdef GRID_HIP
#include <hipblas/hipblas.h>
#endif
#ifdef GRID_CUDA
#include <cublas_v2.h>
#endif
#ifdef GRID_SYCL
#include <oneapi/mkl.hpp>
@@ -43,90 +45,6 @@ inline void acceleratorFreeDevice(void *ptr,size_t bytes){free(ptr,*theAccelerat
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theAccelerator->memset(base,value,bytes); theAccelerator->wait();}
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theAccelerator->memcpy(to,from,bytes); theAccelerator->wait();}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theAccelerator->memcpy(to,from,bytes); theAccelerator->wait();}
#define accelerator_barrier(dummy) { theAccelerator->wait(); }
#endif
#ifdef GRID_HIP
hipStream_t copyStream;
hipStream_t computeStream;
void acceleratorInit(void)
{
int device = 0;
auto discard = hipSetDevice(device);
discard = hipStreamCreate(&copyStream);
discard = hipStreamCreate(&computeStream);
printf("AcceleratorHIPInit\n");
}
inline void *acceleratorAllocDevice(size_t bytes)
{
void *ptr=NULL;
auto err = hipMalloc((void **)&ptr,bytes);
if( err != hipSuccess ) {
ptr = (void *) NULL;
fprintf(stderr," hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
}
return ptr;
};
inline void acceleratorFreeDevice(void *ptr,size_t bytes){ auto discard=hipFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
#define accelerator_barrier(dummy) \
{ \
auto tmp=hipStreamSynchronize(computeStream); \
auto err = hipGetLastError(); \
if ( err != hipSuccess ) { \
printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
puts(__FILE__); \
printf("Line %d\n",__LINE__); \
exit(0); \
} \
}
#endif
#ifdef GRID_CUDA
cudaStream_t copyStream;
cudaStream_t computeStream;
void acceleratorInit(void)
{
int device = 0;
cudaSetDevice(device);
cudaStreamCreate(&copyStream);
cudaStreamCreate(&computeStream);
}
inline void *acceleratorAllocDevice(size_t bytes)
{
void *ptr=NULL;
auto err = cudaMalloc((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (void *) NULL;
printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
}
return ptr;
};
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
#define accelerator_barrier(dummy) \
{ \
cudaStreamSynchronize(computeStream); \
cudaError err = cudaGetLastError(); \
if ( cudaSuccess != err ) { \
printf("accelerator_barrier(): Cuda error %s \n", \
cudaGetErrorString( err )); \
printf("File %s Line %d\n",__FILE__,__LINE__); \
fflush(stdout); \
if (acceleratorAbortOnGpuError) GRID_ASSERT(err==cudaSuccess); \
} \
}
#endif
template<class T> void acceleratorPut(T& dev,T&host)
{
acceleratorCopyToDevice(&host,&dev,sizeof(T));
@@ -137,6 +55,9 @@ template<class T> T acceleratorGet(T& dev)
acceleratorCopyFromDevice(&dev,&host,sizeof(T));
return host;
}
#define accelerator_barrier(dummy) { theAccelerator->wait(); }
#endif
/**************************************************************
 * Allocator
@@ -168,7 +89,7 @@ public:
if ( (_Tp*)ptr == (_Tp *) NULL ) {
printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
}
GRID_ASSERT( ( (_Tp*)ptr != (_Tp *)NULL ) );
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
@@ -276,11 +197,11 @@ public:
{
#ifdef GRID_HIP
auto err = hipDeviceSynchronize();
GRID_ASSERT(err==hipSuccess);
assert(err==hipSuccess);
#endif
#ifdef GRID_CUDA
auto err = cudaDeviceSynchronize();
GRID_ASSERT(err==cudaSuccess);
assert(err==cudaSuccess);
#endif
#ifdef GRID_SYCL
accelerator_barrier();
@@ -290,269 +211,6 @@ public:
#endif
}
/////////////////////////////////////////////////////////////
// Single matrix GEMM -- fp64 and fp32
/////////////////////////////////////////////////////////////
void gemm(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
int m,int n, int k,
ComplexD alpha,
ComplexD* Amk, // Device pointer
ComplexD* Bkn,
ComplexD beta,
ComplexD* Cmn)
{
RealD t2=usecond();
GRID_ASSERT(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
GRID_ASSERT(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
if(OpA!=GridBLAS_OP_N)
lda = k;
if(OpB!=GridBLAS_OP_N)
ldb = n;
static deviceVector<ComplexD> alpha_p(1);
static deviceVector<ComplexD> beta_p(1);
// can prestore the 1 and the zero on device
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
RealD t0=usecond();
#ifdef GRID_HIP
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
auto err = hipblasZgemm(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(hipblasDoubleComplex *) &alpha_p[0],
(hipblasDoubleComplex *) Amk, lda,
(hipblasDoubleComplex *) Bkn, ldb,
(hipblasDoubleComplex *) &beta_p[0],
(hipblasDoubleComplex *) Cmn, ldc);
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
cublasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
auto err = cublasZgemm(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(cuDoubleComplex *) &alpha_p[0],
(cuDoubleComplex *) Amk, lda,
(cuDoubleComplex *) Bkn, ldb,
(cuDoubleComplex *) &beta_p[0],
(cuDoubleComplex *) Cmn, ldc);
GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
oneapi::mkl::blas::column_major::gemm(*gridblasHandle,
iOpA,
iOpB,
m64,n64,k64,
(ComplexD *) &alpha_p[0],
(const ComplexD *)Amk, (int64_t )lda64,
(const ComplexD *)Bkn, (int64_t )ldb64,
(ComplexD *) &beta_p[0],
(ComplexD *)Cmn, (int64_t)ldc64);
synchronise();
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
// Need a default/reference implementation; use Eigen
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
} else {
assert(0);
}
#endif
RealD t1=usecond();
RealD flops = 8.0*m*n*k;
RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n);
}
void gemm(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
int m,int n, int k,
ComplexF alpha,
ComplexF* Amk, // Device pointer
ComplexF* Bkn,
ComplexF beta,
ComplexF* Cmn)
{
RealD t2=usecond();
GRID_ASSERT(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
GRID_ASSERT(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
if(OpA!=GridBLAS_OP_N)
lda = k;
if(OpB!=GridBLAS_OP_N)
ldb = n;
static deviceVector<ComplexF> alpha_p(1);
static deviceVector<ComplexF> beta_p(1);
// can prestore the 1 and the zero on device
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexF));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
RealD t0=usecond();
#ifdef GRID_HIP
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
auto err = hipblasCgemm(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(hipblasComplex *) &alpha_p[0],
(hipblasComplex *) Amk, lda,
(hipblasComplex *) Bkn, ldb,
(hipblasComplex *) &beta_p[0],
(hipblasComplex *) Cmn, ldc);
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
cublasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
auto err = cublasCgemm(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(cuComplex *) &alpha_p[0],
(cuComplex *) Amk, lda,
(cuComplex *) Bkn, ldb,
(cuComplex *) &beta_p[0],
(cuComplex *) Cmn, ldc);
GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
oneapi::mkl::blas::column_major::gemm(*gridblasHandle,
iOpA,
iOpB,
m64,n64,k64,
(ComplexF *) &alpha_p[0],
(const ComplexF *)Amk, (int64_t )lda64,
(const ComplexF *)Bkn, (int64_t )ldb64,
(ComplexF *) &beta_p[0],
(ComplexF *)Cmn, (int64_t )ldc64);
synchronise();
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
// Need a default/reference implementation; use Eigen
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
} else {
assert(0);
}
#endif
RealD t1=usecond();
RealD flops = 8.0*m*n*k;
RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n);
}
/////////////////////////////////////////////////////////////
void gemmBatched(int m,int n, int k,
ComplexD alpha,
deviceVector<ComplexD*> &Amk, // pointer list to matrices
@@ -583,6 +241,36 @@ public:
beta,
Cmn);
}
void gemmBatched(int m,int n, int k,
RealD alpha,
deviceVector<RealD*> &Amk, // pointer list to matrices
deviceVector<RealD*> &Bkn,
RealD beta,
deviceVector<RealD*> &Cmn)
{
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
m,n,k,
alpha,
Amk,
Bkn,
beta,
Cmn);
}
void gemmBatched(int m,int n, int k,
RealF alpha,
deviceVector<RealF*> &Amk, // pointer list to matrices
deviceVector<RealF*> &Bkn,
RealF beta,
deviceVector<RealF*> &Cmn)
{
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
m,n,k,
alpha,
Amk,
Bkn,
beta,
Cmn);
}
void gemmBatched(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
@@ -595,11 +283,11 @@ public:
{
RealD t2=usecond();
int32_t batchCount = Amk.size();
GRID_ASSERT(Bkn.size()==batchCount);
assert(Bkn.size()==batchCount);
GRID_ASSERT(Cmn.size()==batchCount);
assert(Cmn.size()==batchCount);
GRID_ASSERT(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
GRID_ASSERT(OpB!=GridBLAS_OP_T);
assert(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major
int ldb = k; // k x n column major
@@ -636,7 +324,7 @@ public:
(hipblasDoubleComplex **)&Cmn[0], ldc,
batchCount);
// std::cout << " hipblas return code " <<(int)err<<std::endl;
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
@@ -657,7 +345,7 @@ public:
(cuDoubleComplex *) &beta_p[0],
(cuDoubleComplex **)&Cmn[0], ldc,
batchCount);
GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
int64_t m64=m;
@@ -804,8 +492,8 @@ public:
RealD t2=usecond();
int32_t batchCount = Amk.size();
GRID_ASSERT(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
GRID_ASSERT(OpB!=GridBLAS_OP_T);
assert(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major
int ldb = k; // k x n column major
@@ -821,8 +509,8 @@ public:
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
RealD t0=usecond();
GRID_ASSERT(Bkn.size()==batchCount);
assert(Bkn.size()==batchCount);
GRID_ASSERT(Cmn.size()==batchCount);
assert(Cmn.size()==batchCount);
#ifdef GRID_HIP
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
@@ -843,7 +531,7 @@ public:
(hipblasComplex **)&Cmn[0], ldc,
batchCount);
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
@@ -864,7 +552,7 @@ public:
(cuComplex *) &beta_p[0],
(cuComplex **)&Cmn[0], ldc,
batchCount);
GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
int64_t m64=m;
@@ -936,6 +624,301 @@ public:
RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount;
}
///////////////////////////////////////////////////////////////////////////
// Single precision real GEMM
///////////////////////////////////////////////////////////////////////////
void gemmBatched(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
int m,int n, int k,
RealF alpha,
deviceVector<RealF*> &Amk, // pointer list to matrices
deviceVector<RealF*> &Bkn,
RealF beta,
deviceVector<RealF*> &Cmn)
{
RealD t2=usecond();
int32_t batchCount = Amk.size();
assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
assert(OpB!=GridBLAS_OP_C);
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
if(OpA!=GridBLAS_OP_N)
lda = k;
if(OpB!=GridBLAS_OP_N)
ldb = n;
static deviceVector<RealF> alpha_p(1);
static deviceVector<RealF> beta_p(1);
// can prestore the 1 and the zero on device
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealF));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF));
RealD t0=usecond();
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
#ifdef GRID_HIP
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
auto err = hipblasSgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(float *) &alpha_p[0],
(float **)&Amk[0], lda,
(float **)&Bkn[0], ldb,
(float *) &beta_p[0],
(float **)&Cmn[0], ldc,
batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
cublasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
auto err = cublasSgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(float *) &alpha_p[0],
(float **)&Amk[0], lda,
(float **)&Bkn[0], ldb,
(float *) &beta_p[0],
(float **)&Cmn[0], ldc,
batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
int64_t batchCount64=batchCount;
oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
&iOpA,
&iOpB,
&m64,&n64,&k64,
(float *) &alpha_p[0],
(const float **)&Amk[0], (const int64_t *)&lda64,
(const float **)&Bkn[0], (const int64_t *)&ldb64,
(float *) &beta_p[0],
(float **)&Cmn[0], (const int64_t *)&ldc64,
(int64_t)1,&batchCount64,std::vector<sycl::event>());
synchronise();
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
// Need a default/reference implementation; use Eigen
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
} );
} else {
assert(0);
}
#endif
RealD t1=usecond();
RealD flops = 2.0*m*n*k*batchCount;
RealD bytes = 1.0*sizeof(RealF)*(m*k+k*n+m*n)*batchCount;
}
///////////////////////////////////////////////////////////////////////////
// Double precision real GEMM
///////////////////////////////////////////////////////////////////////////
void gemmBatched(GridBLASOperation_t OpA,
GridBLASOperation_t OpB,
int m,int n, int k,
RealD alpha,
deviceVector<RealD*> &Amk, // pointer list to matrices
deviceVector<RealD*> &Bkn,
RealD beta,
deviceVector<RealD*> &Cmn)
{
RealD t2=usecond();
int32_t batchCount = Amk.size();
assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
assert(OpB!=GridBLAS_OP_C);
int lda = m; // m x k column major
int ldb = k; // k x n column major
int ldc = m; // m x b column major
if(OpA!=GridBLAS_OP_N)
lda = k;
if(OpB!=GridBLAS_OP_N)
ldb = n;
static deviceVector<RealD> alpha_p(1);
static deviceVector<RealD> beta_p(1);
// can prestore the 1 and the zero on device
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealD));
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD));
RealD t0=usecond();
assert(Bkn.size()==batchCount);
assert(Cmn.size()==batchCount);
#ifdef GRID_HIP
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
auto err = hipblasDgemmBatched(gridblasHandle,
HIPBLAS_OP_N,
HIPBLAS_OP_N,
m,n,k,
(double *) &alpha_p[0],
(double **)&Amk[0], lda,
(double **)&Bkn[0], ldb,
(double *) &beta_p[0],
(double **)&Cmn[0], ldc,
batchCount);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
cublasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
auto err = cublasDgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(double *) &alpha_p[0],
(double **)&Amk[0], lda,
(double **)&Bkn[0], ldb,
(double *) &beta_p[0],
(double **)&Cmn[0], ldc,
batchCount);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
int64_t lda64=lda;
int64_t ldb64=ldb;
int64_t ldc64=ldc;
int64_t batchCount64=batchCount;
oneapi::mkl::transpose iOpA;
oneapi::mkl::transpose iOpB;
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
&iOpA,
&iOpB,
&m64,&n64,&k64,
(double *) &alpha_p[0],
(const double **)&Amk[0], (const int64_t *)&lda64,
(const double **)&Bkn[0], (const int64_t *)&ldb64,
(double *) &beta_p[0],
(double **)&Cmn[0], (const int64_t *)&ldc64,
(int64_t)1,&batchCount64,std::vector<sycl::event>());
synchronise();
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
// Need a default/reference implementation; use Eigen
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
});
} else {
assert(0);
}
#endif
RealD t1=usecond();
RealD flops = 2.0*m*n*k*batchCount;
RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
}
template<class CComplex>
double benchmark(int M, int N, int K, int BATCH)
{
@@ -984,47 +967,6 @@ public:
return flops; // Returns gigaflops
}
template<class CComplex>
double benchmark(int M, int N, int K)
{
int32_t N_A = M*K;
int32_t N_B = K*N;
int32_t N_C = M*N;
deviceVector<CComplex> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(CComplex));
deviceVector<CComplex> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(CComplex));
deviceVector<CComplex> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(CComplex));
CComplex alpha(1.0);
CComplex beta (1.0);
RealD flops = 8.0*M*N*K;
int ncall=10;
gemm(GridBLAS_OP_C,GridBLAS_OP_N,
M,N,K,
alpha,
&A[0], // m x k
&B[0], // k x n
beta,
&C[0]);
synchronise();
RealD t0 = usecond();
for(int i=0;i<ncall;i++){
gemm(GridBLAS_OP_N,GridBLAS_OP_N,
M,N,K,
alpha,
&A[0], // m x k
&B[0], // k x n
beta,
&C[0]);
synchronise();
}
RealD t1 = usecond();
RealD bytes = 1.0*sizeof(CComplex)*(M*N*2+N*K+M*K);
flops = 8.0*M*N*K*ncall;
flops = flops/(t1-t0)/1.e3;
return flops; // Returns gigaflops
}
};
@@ -1093,21 +1035,6 @@ static void BLAS(void)
std::cout<< M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl;
}}
fprintf(FP,"\n\n\n");
std::cout << "----------------------------------------------------------"<<std::endl;
std::cout << " M "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (inner product matrix)"<<std::endl;
std::cout << "----------------------------------------------------------"<<std::endl;
{
int M=12;
int N=12;
std::vector<int> ks({4*1024*1024, 2*1024*1024, 1024*1024, 256*1024, 1024 });
for( int kk=0;kk<ks.size();kk++ ) {
int K = ks[kk];
double p=blas.benchmark<CComplex>(M,N,K);
fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, 1, p);
std::cout<< M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<1<<"\t\t"<<p<<std::endl;
}
}
std::cout << "=================================================================================="<<std::endl; std::cout << "=================================================================================="<<std::endl;
}; };
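As a usage note for the interface exercised above: GridBLAS works on column-major matrices addressed through device-resident pointer lists. A minimal sketch of driving the convenience gemmBatched overload, assuming the deviceVector, acceleratorPut and GridBLAS definitions from this diff (the driver itself is illustrative, not repository code):

// Illustrative sketch only; assumes GridBLAS, deviceVector<T> and
// acceleratorPut from this diff. Computes C_b = A_b * B_b per batch entry.
GridBLAS blas;
const int m=64, n=64, k=64, batch=8;
deviceVector<ComplexD> A(m*k*batch), B(k*n*batch), C(m*n*batch);
deviceVector<ComplexD*> Ap(batch), Bp(batch), Cp(batch);
for(int b=0;b<batch;b++){
  // store the device address of each matrix into the device pointer lists
  ComplexD *pa=&A[b*m*k], *pb=&B[b*k*n], *pc=&C[b*m*n];
  acceleratorPut(Ap[b],pa); acceleratorPut(Bp[b],pb); acceleratorPut(Cp[b],pc);
}
ComplexD one(1.0,0.0), zero(0.0,0.0);
blas.gemmBatched(m,n,k, one,Ap,Bp, zero,Cp); // dispatches to hipBLAS/cuBLAS/oneMKL or the Eigen fallback
blas.synchronise();                          // block until the batched GEMMs complete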

View File

@@ -1,2 +1,2 @@
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench

View File

@@ -1,5 +0,0 @@
CXX=hipcc
MPICXX=mpicxx
CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 -I/opt/cray/pe/mpich/8.1.28/ofi/gnu/12.3/include -DGRID_HIP"
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas -lmpi_gnu_123"
hipcc $CXXFLAGS $LDFLAGS BatchBlasBench.cc -o BatchBlasBench

View File

@@ -1,2 +0,0 @@
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL

View File

@@ -51,13 +51,11 @@ directory
#pragma nv_diag_suppress cast_to_qualified_type
//disables nvcc specific warning in many files
#pragma nv_diag_suppress esa_on_defaulted_function_ignored
#pragma nv_diag_suppress declared_but_not_referenced
#pragma nv_diag_suppress extra_semicolon
#else
//disables nvcc specific warning in json.hpp
#pragma diag_suppress unsigned_compare_with_zero
#pragma diag_suppress cast_to_qualified_type
#pragma diag_suppress declared_but_not_referenced
//disables nvcc specific warning in many files
#pragma diag_suppress esa_on_defaulted_function_ignored
#pragma diag_suppress extra_semicolon

View File

@@ -1,17 +1,9 @@
#ifndef GRID_STD_H
#define GRID_STD_H
///////////////////
// Grid config
///////////////////
#include "Config.h"
///////////////////
// Std C++ dependencies
///////////////////
#define _NBACKTRACE (256)
extern void * Grid_backtrace_buffer[_NBACKTRACE];
#include <cassert>
#include <complex>
#include <memory>
@@ -24,7 +16,6 @@ extern void * Grid_backtrace_buffer[_NBACKTRACE];
#include <functional>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <strings.h>
#include <stdio.h>
#include <signal.h>
@@ -32,35 +23,11 @@ extern void * Grid_backtrace_buffer[_NBACKTRACE];
#include <sys/time.h>
#include <chrono>
#include <zlib.h>
#ifdef HAVE_EXECINFO_H
#include <execinfo.h>
#endif
void GridAbort(void);
#define ASSLOG(A) ::write(STDERR_FILENO,A,strlen(A));
#ifdef HAVE_EXECINFO_H
#define GRID_ASSERT(b) if(!(b)) { \
ASSLOG(" GRID_ASSERT failure: "); \
ASSLOG(__FILE__); \
ASSLOG(" : "); \
ASSLOG(#b); \
ASSLOG(" : "); \
int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE); \
backtrace_symbols_fd(Grid_backtrace_buffer,symbols,STDERR_FILENO); \
GridAbort(); \
};
#else
#define GRID_ASSERT(b) if(!(b)) { \
ASSLOG(" GRID_ASSERT failure: "); \
ASSLOG(__FILE__); \
ASSLOG(" : "); \
ASSLOG(#b); \
ASSLOG(" : "); \
GridAbort(); \
};
#endif
///////////////////
// Grid config
///////////////////
#include "Config.h"
#ifdef TOFU
#undef GRID_COMMS_THREADS
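For context on the block removed from GRID_STD_H above: unlike plain assert, GRID_ASSERT reports the failing file and expression through async-signal-safe write() calls and, when HAVE_EXECINFO_H is set, dumps a backtrace via execinfo.h before aborting. A self-contained model of the same pattern (MY_ASSERT and bt_buf are invented names for illustration, not Grid code):

#include <cstring>
#include <cstdlib>
#include <unistd.h>
#include <execinfo.h>   // assumes an execinfo.h-capable platform

static void *bt_buf[256];
#define MY_ASSERT(b) if(!(b)) {                                    \
    const char *msg = " MY_ASSERT failure: " #b "\n";              \
    ::write(STDERR_FILENO, msg, strlen(msg)); /* signal-safe IO */ \
    int nsym = backtrace(bt_buf, 256);                             \
    backtrace_symbols_fd(bt_buf, nsym, STDERR_FILENO);             \
    abort();                                                       \
  }

int main(int argc, char **argv) {
  MY_ASSERT(argc > 1);  // prints the expression and a backtrace when run with no arguments
  return 0;
}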

View File

@@ -29,8 +29,8 @@ directory
#pragma once
#include <type_traits>
#include <exception>
#include <cassert>
#include <exception>
#define NAMESPACE_BEGIN(A) namespace A {
#define NAMESPACE_END(A) }

View File

@@ -50,9 +50,6 @@ NAMESPACE_CHECK(approx);
#include <Grid/algorithms/deflation/Deflation.h>
#include <Grid/algorithms/deflation/MultiRHSBlockProject.h>
#include <Grid/algorithms/deflation/MultiRHSDeflation.h>
#include <Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h>
// Not really deflation, but useful
#include <Grid/algorithms/blas/MomentumProject.h>
NAMESPACE_CHECK(deflation);
#include <Grid/algorithms/iterative/ConjugateGradient.h>
NAMESPACE_CHECK(ConjGrad);

View File

@@ -168,8 +168,7 @@ public:
template<class vobj>
void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
#ifndef HAVE_FFTW
std::cerr << "FFTW is not compiled but is called"<<std::endl;
GRID_ASSERT(0);
assert(0);
#else
conformable(result.Grid(),vgrid);
conformable(source.Grid(),vgrid);
@@ -191,7 +190,6 @@ public:
Lattice<sobj> pgbuf(&pencil_g);
autoView(pgbuf_v , pgbuf, CpuWrite);
//std::cout << "CPU view" << std::endl;
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
@@ -213,9 +211,8 @@ public:
scalar div;
if ( sign == backward ) div = 1.0/G;
else if ( sign == forward ) div = 1.0;
else GRID_ASSERT(0);
else assert(0);
//std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
FFTW_plan p;
{
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
@@ -229,7 +226,6 @@ public:
}
// Barrel shift and collect global pencil
//std::cout << GridLogPerformance<<"Making pencil" << std::endl;
Coordinate lcoor(Nd), gcoor(Nd);
result = source;
int pc = processor_coor[dim];
@@ -251,7 +247,6 @@ public:
}
}
//std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
// Loop over orthog coords
int NN=pencil_g.lSites();
GridStopWatch timer;
@@ -274,7 +269,6 @@ public:
usec += timer.useconds();
flops+= flops_call*NN;
//std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
// writing out result
{
autoView(pgbuf_v,pgbuf,CpuRead);
@@ -291,7 +285,6 @@ public:
}
result = result*div;
//std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
// destroying plan
FFTW<scalar>::fftw_destroy_plan(p);
#endif

View File

@@ -64,7 +64,7 @@ public:
//
// I'm not entirely happy with implementation; to share the Schur code between herm and non-herm
// while still having a "OpAndNorm" in the abstract base I had to implement it in both cases
// with an GRID_ASSERT trap in the non-herm. This isn't right; there must be a better C++ way to
// with an assert trap in the non-herm. This isn't right; there must be a better C++ way to
// do it, but I fear it required multiple inheritance and mixed in abstract base classes
/////////////////////////////////////////////////////////////////////////////////////////////
@@ -103,38 +103,6 @@ public:
_Mat.MdagM(in,out);
}
};
template<class Matrix,class Field>
class MMdagLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
public:
MMdagLinearOperator(Matrix &Mat): _Mat(Mat){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){
_Mat.M(in,out);
}
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
_Mat.MMdag(in,out);
ComplexD dot = innerProduct(in,out);
n1=real(dot);
n2=norm2(out);
}
void HermOp(const Field &in, Field &out){
_Mat.MMdag(in,out);
}
};
////////////////////////////////////////////////////////////////////
// Construct herm op and shift it for mgrid smoother
@@ -148,22 +116,22 @@ public:
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
GRID_ASSERT(0);
assert(0);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
GRID_ASSERT(0);
assert(0);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
GRID_ASSERT(0);
assert(0);
};
void Op (const Field &in, Field &out){
_Mat.M(in,out);
GRID_ASSERT(0);
assert(0);
}
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
GRID_ASSERT(0);
assert(0);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
HermOp(in,out);
@@ -188,13 +156,13 @@ public:
ShiftedHermOpLinearOperator(LinearOperatorBase<Field> &Mat,RealD shift): _Mat(Mat), _shift(shift){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
GRID_ASSERT(0);
assert(0);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
GRID_ASSERT(0);
assert(0);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
GRID_ASSERT(0);
assert(0);
};
void Op (const Field &in, Field &out){
HermOp(in,out);
@@ -271,42 +239,10 @@ public:
_Mat.Mdag(in,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
GRID_ASSERT(0);
assert(0);
}
void HermOp(const Field &in, Field &out){
GRID_ASSERT(0);
assert(0);
}
};
template<class Matrix,class Field>
class ShiftedNonHermitianLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
RealD shift;
public:
ShiftedNonHermitianLinearOperator(Matrix &Mat,RealD shft): _Mat(Mat),shift(shft){};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out);
out = out + shift*in;
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
_Mat.Mdir(in,out,dir,disp);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){
_Mat.M(in,out);
out = out + shift * in;
}
void AdjOp (const Field &in, Field &out){
_Mat.Mdag(in,out);
out = out + shift * in;
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
GRID_ASSERT(0);
}
void HermOp(const Field &in, Field &out){
GRID_ASSERT(0);
}
};
@@ -345,13 +281,13 @@ class SchurOperatorBase : public LinearOperatorBase<Field> {
}
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
GRID_ASSERT(0); // must coarsen the unpreconditioned system
assert(0); // must coarsen the unpreconditioned system
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
GRID_ASSERT(0);
assert(0);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
GRID_ASSERT(0);
assert(0);
};
};
template<class Matrix,class Field>
@@ -447,10 +383,10 @@ class NonHermitianSchurOperatorBase : public LinearOperatorBase<Field>
MpcDag(tmp,out);
}
virtual void HermOpAndNorm(const Field& in, Field& out, RealD& n1, RealD& n2) {
GRID_ASSERT(0);
assert(0);
}
virtual void HermOp(const Field& in, Field& out) {
GRID_ASSERT(0);
assert(0);
}
void Op(const Field& in, Field& out) {
Mpc(in, out);
@@ -460,13 +396,13 @@ class NonHermitianSchurOperatorBase : public LinearOperatorBase<Field>
}
// Support for coarsening to a multigrid
void OpDiag(const Field& in, Field& out) {
GRID_ASSERT(0); // must coarsen the unpreconditioned system
assert(0); // must coarsen the unpreconditioned system
}
void OpDir(const Field& in, Field& out, int dir, int disp) {
GRID_ASSERT(0);
assert(0);
}
void OpDirAll(const Field& in, std::vector<Field>& out){
GRID_ASSERT(0);
assert(0);
};
};
@@ -580,7 +516,7 @@ class SchurStaggeredOperator : public SchurOperatorBase<Field> {
public:
SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid())
{
GRID_ASSERT( _Mat.isTrivialEE() );
assert( _Mat.isTrivialEE() );
mass = _Mat.Mass();
}
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
@@ -611,7 +547,7 @@ class SchurStaggeredOperator : public SchurOperatorBase<Field> {
Mpc(in,out);
}
virtual void MpcDagMpc(const Field &in, Field &out) {
GRID_ASSERT(0);// Never need with staggered
assert(0);// Never need with staggered
}
};
template<class Matrix,class Field> using SchurStagOperator = SchurStaggeredOperator<Matrix,Field>;
@@ -623,7 +559,7 @@ template<class Field> class OperatorFunction {
public:
virtual void operator() (LinearOperatorBase<Field> &Linop, const Field &in, Field &out) = 0;
virtual void operator() (LinearOperatorBase<Field> &Linop, const std::vector<Field> &in,std::vector<Field> &out) {
GRID_ASSERT(in.size()==out.size());
assert(in.size()==out.size());
for(int k=0;k<in.size();k++){
(*this)(Linop,in[k],out[k]);
}
@@ -637,7 +573,7 @@ public:
virtual void operator() (const std::vector<Field> &in, std::vector<Field> &out)
{
GRID_ASSERT(in.size() == out.size());
assert(in.size() == out.size());
for (unsigned int i = 0; i < in.size(); ++i)
{

View File

@@ -45,11 +45,6 @@ public:
M(in,tmp);
Mdag(tmp,out);
}
virtual void MMdag(const Field &in, Field &out) {
Field tmp (in.Grid());
Mdag(in,tmp);
M(tmp,out);
}
virtual void Mdiag (const Field &in, Field &out)=0;
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
virtual void MdirAll (const Field &in, std::vector<Field> &out)=0;

View File

@@ -59,7 +59,7 @@ public:
RealD diff = hi-lo;
RealD delta = diff*1.0e-9;
for (RealD x=lo; x<hi; x+=delta) {
delta*=1.02;
delta*=1.1;
RealD f = approx(x);
out<< x<<" "<<f<<std::endl;
}
@@ -131,26 +131,6 @@ public:
Coeffs[j] = s * 2.0/order;
}
};
template<class functor>
void Init(RealD _lo,RealD _hi,int _order, functor & func)
{
lo=_lo;
hi=_hi;
order=_order;
if(order < 2) exit(-1);
Coeffs.resize(order);
for(int j=0;j<order;j++){
RealD s=0;
for(int k=0;k<order;k++){
RealD y=std::cos(M_PI*(k+0.5)/order);
RealD x=0.5*(y*(hi-lo)+(hi+lo));
RealD f=func(x);
s=s+f*std::cos( j*M_PI*(k+0.5)/order );
}
Coeffs[j] = s * 2.0/order;
}
};
void JacksonSmooth(void){
@@ -269,9 +249,7 @@ public:
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
Linop.HermOp(T0,y);
grid->Barrier();
axpby(T1,xscale,mscale,y,in);
grid->Barrier();
// sum = .5 c[0] T0 + c[1] T1
// out = ()*T0 + Coeffs[1]*T1;
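For reference, the functor-based Init removed above evaluates the standard Chebyshev coefficients c_j = (2/N) * sum_{k=0..N-1} f(x_k) * cos(j*pi*(k+1/2)/N), with the nodes x_k mapped from [-1,1] onto [lo,hi]. A standalone sketch of that calculation (plain C++, not Grid code; the target function f is an arbitrary example):

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const double lo = 0.1, hi = 4.0;   // approximation interval
  const int order = 16;              // number of coefficients N
  auto func = [](double x){ return 1.0/std::sqrt(x); };  // example target f(x)
  std::vector<double> Coeffs(order);
  for (int j = 0; j < order; j++) {
    double s = 0;
    for (int k = 0; k < order; k++) {
      double y = std::cos(M_PI*(k+0.5)/order);  // Chebyshev node in [-1,1]
      double x = 0.5*(y*(hi-lo)+(hi+lo));       // node mapped onto [lo,hi]
      s += func(x)*std::cos(j*M_PI*(k+0.5)/order);
    }
    Coeffs[j] = s*2.0/order;
  }
  printf("c0=%g c1=%g\n", Coeffs[0], Coeffs[1]); // c0 enters the Chebyshev sum with weight 1/2
  return 0;
}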

View File

@@ -121,7 +121,7 @@ double AlgRemez::generateApprox(int num_degree, int den_degree,
// Reallocate arrays, since degree has changed
if (num_degree != n || den_degree != d) allocate(num_degree,den_degree);
GRID_ASSERT(a_len<=SUM_MAX);
assert(a_len<=SUM_MAX);
step = new bigfloat[num_degree+den_degree+2];
@@ -151,9 +151,9 @@ double AlgRemez::generateApprox(int num_degree, int den_degree,
equations();
if (delta < tolerance) {
std::cout<<"Delta too small, try increasing precision\n";
GRID_ASSERT(0);
assert(0);
};
GRID_ASSERT( delta>= tolerance);
assert( delta>= tolerance);
search(step);
}

View File

@@ -134,7 +134,7 @@ class AlgRemez
virtual ~AlgRemez();
int getDegree(void){
GRID_ASSERT(n==d);
assert(n==d);
return n;
}
// Reset the bounds of the approximation

View File

@@ -28,11 +28,11 @@ void AlgRemezGeneral::setupPolyProperties(int num_degree, int den_degree, PolyTy
pow_n = num_degree;
pow_d = den_degree;
if(pow_n % 2 == 0 && num_type_in == PolyType::Odd) GRID_ASSERT(0);
if(pow_n % 2 == 0 && num_type_in == PolyType::Odd) assert(0);
if(pow_n % 2 == 1 && num_type_in == PolyType::Even) GRID_ASSERT(0);
if(pow_n % 2 == 1 && num_type_in == PolyType::Even) assert(0);
if(pow_d % 2 == 0 && den_type_in == PolyType::Odd) GRID_ASSERT(0);
if(pow_d % 2 == 0 && den_type_in == PolyType::Odd) assert(0);
if(pow_d % 2 == 1 && den_type_in == PolyType::Even) GRID_ASSERT(0);
if(pow_d % 2 == 1 && den_type_in == PolyType::Even) assert(0);
num_type = num_type_in;
den_type = den_type_in;
@@ -112,9 +112,9 @@ double AlgRemezGeneral::generateApprox(const int num_degree, const int den_degre
equations();
if (delta < tolerance) {
std::cout<<"Iteration " << iter-1 << " delta too small (" << delta << "<" << tolerance << "), try increasing precision\n";
GRID_ASSERT(0);
assert(0);
};
GRID_ASSERT( delta>= tolerance );
assert( delta>= tolerance );
search();
}
@@ -278,7 +278,7 @@ void AlgRemezGeneral::equations(){
if(num_pows[j] != -1){ *aa++ = z; t++; }
z *= x;
}
GRID_ASSERT(t == n+1);
assert(t == n+1);
z = (bigfloat)1l;
t = 0;
@@ -286,7 +286,7 @@ void AlgRemezGeneral::equations(){
if(den_pows[j] != -1){ *aa++ = -y * z; t++; }
z *= x;
}
GRID_ASSERT(t == d);
assert(t == d);
B[i] = y * z; // Right hand side vector
}

View File

@@ -106,7 +106,7 @@ class AlgRemezGeneral{
bigfloat (*f)(bigfloat x, void *data), void *data);
inline int getDegree(void) const{
GRID_ASSERT(n==d);
assert(n==d);
return n;
}
// Reset the bounds of the approximation

View File

@@ -74,7 +74,7 @@ bigfloat epsilonMobius(bigfloat x, void* data){
void computeZmobiusOmega(std::vector<ComplexD> &omega_out, const int Ls_out,
const std::vector<RealD> &omega_in, const int Ls_in,
const RealD lambda_bound){
GRID_ASSERT(omega_in.size() == Ls_in);
assert(omega_in.size() == Ls_in);
omega_out.resize(Ls_out);
//Use the Remez algorithm to generate the appropriate rational polynomial

View File

@@ -55,17 +55,16 @@ NAMESPACE_BEGIN(Grid);
typedef cublasHandle_t gridblasHandle_t;
#endif
#ifdef GRID_SYCL
typedef sycl::queue *gridblasHandle_t;
typedef cl::sycl::queue *gridblasHandle_t;
#endif
#ifdef GRID_ONE_MKL
typedef sycl::queue *gridblasHandle_t;
typedef cl::sycl::queue *gridblasHandle_t;
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
typedef int32_t gridblasHandle_t;
#endif
enum GridBLASOperation_t { GridBLAS_OP_N, GridBLAS_OP_T, GridBLAS_OP_C } ;
enum GridBLASPrecision_t { GridBLAS_PRECISION_DEFAULT, GridBLAS_PRECISION_16F, GridBLAS_PRECISION_16BF, GridBLAS_PRECISION_TF32 };
class GridBLAS {
public:
@@ -90,29 +89,15 @@ public:
gridblasHandle = theGridAccelerator;
#endif
#ifdef GRID_ONE_MKL
sycl::gpu_selector selector;
cl::sycl::gpu_selector selector;
sycl::device selectedDevice { selector };
cl::sycl::device selectedDevice { selector };
sycl::property_list q_prop{sycl::property::queue::in_order()};
cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()};
gridblasHandle =new sycl::queue (selectedDevice,q_prop);
#endif
gridblasInit=1;
}
}
#ifdef GRID_CUDA
cublasComputeType_t toDataType(GridBLASPrecision_t p) {
switch (p) {
case GridBLAS_PRECISION_16F:
return CUBLAS_COMPUTE_32F_FAST_16F;
case GridBLAS_PRECISION_16BF:
return CUBLAS_COMPUTE_32F_FAST_16BF;
case GridBLAS_PRECISION_TF32:
return CUBLAS_COMPUTE_32F_FAST_TF32;
default:
GRID_ASSERT(0);
}
}
#endif
// Force construct once
GridBLAS() { Init(); };
~GridBLAS() { };
@@ -134,11 +119,11 @@ public:
{
#ifdef GRID_HIP
auto err = hipDeviceSynchronize();
GRID_ASSERT(err==hipSuccess);
assert(err==hipSuccess);
#endif
#ifdef GRID_CUDA
auto err = cudaDeviceSynchronize();
GRID_ASSERT(err==cudaSuccess);
assert(err==cudaSuccess);
#endif
#ifdef GRID_SYCL
accelerator_barrier();
@@ -153,10 +138,8 @@ public:
deviceVector<ComplexD*> &Amk, // pointer list to matrices
deviceVector<ComplexD*> &Bkn,
ComplexD beta,
deviceVector<ComplexD*> &Cmn,
deviceVector<ComplexD*> &Cmn)
GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
{
GRID_ASSERT(precision == GridBLAS_PRECISION_DEFAULT);
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
m,n,k,
alpha,
@@ -218,17 +201,15 @@ public:
deviceVector<ComplexD*> &Amk, // pointer list to matrices
deviceVector<ComplexD*> &Bkn,
ComplexD beta,
deviceVector<ComplexD*> &Cmn,
deviceVector<ComplexD*> &Cmn)
GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
{
GRID_ASSERT(precision == GridBLAS_PRECISION_DEFAULT);
RealD t2=usecond();
int32_t batchCount = Amk.size();
GRID_ASSERT(Bkn.size()==batchCount);
assert(Bkn.size()==batchCount);
GRID_ASSERT(Cmn.size()==batchCount);
assert(Cmn.size()==batchCount);
//assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
//assert(OpB!=GridBLAS_OP_T);
assert(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major
int ldb = k; // k x n column major
@@ -265,7 +246,7 @@ public:
(hipblasDoubleComplex **)&Cmn[0], ldc,
batchCount);
// std::cout << " hipblas return code " <<(int)err<<std::endl;
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
assert(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
@@ -286,7 +267,7 @@ public:
(cuDoubleComplex *) &beta_p[0],
(cuDoubleComplex **)&Cmn[0], ldc,
batchCount);
GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
assert(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
int64_t m64=m;
@@ -386,67 +367,28 @@ public:
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
else
eCmn = alpha * eAmk.adjoint() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
else
eCmn = alpha * eAmk * eBkn.adjoint() ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
else
eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
} );
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
} );
} else {
assert(0);
@@ -467,14 +409,13 @@ public:
deviceVector<ComplexF*> &Amk, // pointer list to matrices
deviceVector<ComplexF*> &Bkn,
ComplexF beta,
deviceVector<ComplexF*> &Cmn,
GridBLASPrecision_t precision = GridBLAS_PRECISION_DEFAULT)
{
RealD t2=usecond();
int32_t batchCount = Amk.size();
//assert(OpA!=GridBLAS_OP_T); // Complex case expects no transpose
//assert(OpB!=GridBLAS_OP_T);
int lda = m; // m x k column major
int ldb = k; // k x n column major
@@ -490,10 +431,9 @@ public:
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
RealD t0=usecond();
GRID_ASSERT(Bkn.size()==batchCount);
GRID_ASSERT(Cmn.size()==batchCount);
#ifdef GRID_HIP
GRID_ASSERT(precision == GridBLAS_PRECISION_DEFAULT);
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
@@ -513,7 +453,7 @@ public:
(hipblasComplex **)&Cmn[0], ldc,
batchCount);
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
@@ -524,9 +464,7 @@ public:
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
cublasStatus_t err;
if (precision == GridBLAS_PRECISION_DEFAULT) {
err = cublasCgemmBatched(gridblasHandle,
hOpA,
hOpB,
m,n,k,
@@ -536,23 +474,9 @@ public:
(cuComplex *) &beta_p[0],
(cuComplex **)&Cmn[0], ldc,
batchCount);
} else {
cublasComputeType_t compute_precision = toDataType(precision);
err = cublasGemmBatchedEx(gridblasHandle,
hOpA,
hOpB,
m,n,k,
(void *) &alpha_p[0],
(void **)&Amk[0], CUDA_C_32F, lda,
(void **)&Bkn[0], CUDA_C_32F, ldb,
(void *) &beta_p[0],
(void **)&Cmn[0], CUDA_C_32F, ldc,
batchCount, compute_precision, CUBLAS_GEMM_DEFAULT);
}
GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
GRID_ASSERT(precision == GridBLAS_PRECISION_DEFAULT);
int64_t m64=m;
int64_t n64=n;
int64_t k64=k;
@@ -584,77 +508,34 @@ public:
synchronise();
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
GRID_ASSERT(precision == GridBLAS_PRECISION_DEFAULT);
// Need a default/reference implementation; use Eigen
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
else
eCmn = alpha * eAmk.adjoint() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
else
eCmn = alpha * eAmk * eBkn.adjoint() ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
else
eCmn = alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
else
eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
} );
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
} );
} else {
assert(0);
@@ -681,8 +562,8 @@ public:
RealD t2=usecond();
int32_t batchCount = Amk.size();
GRID_ASSERT(OpA!=GridBLAS_OP_C); // Real case no conjugate
GRID_ASSERT(OpB!=GridBLAS_OP_C);
int lda = m; // m x k column major
int ldb = k; // k x n column major
@@ -698,8 +579,8 @@ public:
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF));
RealD t0=usecond();
GRID_ASSERT(Bkn.size()==batchCount);
GRID_ASSERT(Cmn.size()==batchCount);
#ifdef GRID_HIP
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
@@ -719,7 +600,7 @@ public:
(float *) &beta_p[0],
(float **)&Cmn[0], ldc,
batchCount);
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
@@ -740,7 +621,7 @@ public:
(float *) &beta_p[0],
(float **)&Cmn[0], ldc,
batchCount);
GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
int64_t m64=m;
@@ -780,40 +661,28 @@ public:
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
else
eCmn = alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
} );
} else {
assert(0);
@@ -840,8 +709,8 @@ public:
RealD t2=usecond();
int32_t batchCount = Amk.size();
GRID_ASSERT(OpA!=GridBLAS_OP_C); // Real case no conjugate
GRID_ASSERT(OpB!=GridBLAS_OP_C);
int lda = m; // m x k column major
int ldb = k; // k x n column major
@@ -858,8 +727,8 @@ public:
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD));
RealD t0=usecond();
GRID_ASSERT(Bkn.size()==batchCount);
GRID_ASSERT(Cmn.size()==batchCount);
#ifdef GRID_HIP
hipblasOperation_t hOpA;
hipblasOperation_t hOpB;
@@ -879,7 +748,7 @@ public:
(double *) &beta_p[0],
(double **)&Cmn[0], ldc,
batchCount);
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
cublasOperation_t hOpA;
@@ -900,7 +769,7 @@ public:
(double *) &beta_p[0],
(double **)&Cmn[0], ldc,
batchCount);
GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
int64_t m64=m;
@@ -940,40 +809,28 @@ public:
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
else
eCmn = alpha * eAmk * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
else
eCmn = alpha * eAmk.transpose() * eBkn ;
});
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
else
eCmn = alpha * eAmk * eBkn.transpose() ;
});
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
thread_for (p, batchCount, {
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
if (std::abs(beta) != 0.0)
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
else
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
});
} else {
assert(0);
@@ -984,336 +841,6 @@ public:
RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
}
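The timing lines in these routines use a simple flop/byte model; as a hedged helper mirroring that convention (8 flops per complex multiply-add pair, 2 for real, with times kept in microseconds as elsewhere in this file):
// Sketch only: reproduces the flops/(t)/1.e3 -> GF/s accounting used above.
inline double gemmGFlops(int64_t M,int64_t N,int64_t K,int64_t batchCount,
                         bool isComplex,double usecs) {
  double flops = (isComplex ? 8.0 : 2.0)*M*N*K*batchCount;
  return flops/usecs/1.0e3; // GF/s when usecs is the elapsed time in us
}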
/*
Inverse and Determinant
- CPU version uses Eigen
- GPU version uses LAPACK-compatible getrf / getri
Design comment: Eigen does not expose getrf / getri in a LAPACK-compatible manner,
and the overhead of routing the CPU version through getrf / getri is too large.
Current interface therefore only guarantees the inverse and determinant
functions on all platforms but not the getrf / getri ones.
*/
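As a cross-check of the LU-based determinant used on the accelerator path below (det A = prod_i U_ii times the pivot sign), a minimal standalone Eigen sketch, independent of the Grid classes here:
// Hedged sketch: determinant from an LU factorisation, mirroring the
// diagonal-product and pivot-sign logic of the batched GPU code further down.
#include <Eigen/Dense>
#include <complex>
std::complex<double> detFromLU(const Eigen::MatrixXcd &A) {
  Eigen::PartialPivLU<Eigen::MatrixXcd> lu(A);
  std::complex<double> det = 1.0;
  for (int i = 0; i < A.rows(); i++) det *= lu.matrixLU()(i,i); // U diagonal
  return det * std::complex<double>(lu.permutationP().determinant(),0); // +/-1
}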
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
void inverseBatched(int64_t n,
deviceVector<ComplexD*> &Ann,
deviceVector<ComplexD*> &Cnn) {
int64_t batchCount = Ann.size();
GRID_ASSERT(batchCount == Cnn.size());
thread_for(p,batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAnn(Ann[p],n,n);
Eigen::Map<Eigen::MatrixXcd> eCnn(Cnn[p],n,n);
eCnn = eAnn.inverse();
});
}
void inverseBatched(int64_t n,
deviceVector<ComplexF*> &Ann,
deviceVector<ComplexF*> &Cnn) {
int64_t batchCount = Ann.size();
GRID_ASSERT(batchCount == Cnn.size());
thread_for(p,batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAnn(Ann[p],n,n);
Eigen::Map<Eigen::MatrixXcf> eCnn(Cnn[p],n,n);
eCnn = eAnn.inverse();
});
}
void determinantBatched(int64_t n,
deviceVector<ComplexD*> &Ann,
deviceVector<ComplexD*> &C) {
int64_t batchCount = Ann.size();
GRID_ASSERT(batchCount == C.size());
thread_for(p,batchCount, {
Eigen::Map<Eigen::MatrixXcd> eAnn(Ann[p],n,n);
*C[p] = eAnn.determinant();
});
}
void determinantBatched(int64_t n,
deviceVector<ComplexF*> &Ann,
deviceVector<ComplexF*> &C) {
int64_t batchCount = Ann.size();
GRID_ASSERT(batchCount == C.size());
thread_for(p,batchCount, {
Eigen::Map<Eigen::MatrixXcf> eAnn(Ann[p],n,n);
*C[p] = eAnn.determinant();
});
}
#else
#ifdef GRID_SYCL
template<typename T>
void getrfBatchedSYCL(int64_t n,
deviceVector<T*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info) {
int64_t batchCount = Ann.size();
static deviceVector<T> scratchpad;
int64_t sp_size = oneapi::mkl::lapack::getrf_batch_scratchpad_size<T>(*gridblasHandle, &n, &n, &n, (int64_t)1, &batchCount);
if (sp_size > scratchpad.size())
scratchpad.resize(sp_size);
static deviceVector<int64_t*> _ipiv;
if (batchCount > _ipiv.size())
_ipiv.resize(batchCount);
int64_t** p_ipiv = &_ipiv[0];
int64_t* pipiv = &ipiv[0];
accelerator_for(i, batchCount, 1, { p_ipiv[i] = &pipiv[i*n]; });
oneapi::mkl::lapack::getrf_batch(*gridblasHandle,
&n, &n,
(T **)&Ann[0],
&n,
(int64_t**)&_ipiv[0],
(int64_t)1, &batchCount,
(T*)&scratchpad[0], (int64_t)scratchpad.size(),
std::vector<sycl::event>());
synchronise();
}
#endif
void getrfBatched(int64_t n,
deviceVector<ComplexD*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info)
{
int64_t batchCount = Ann.size();
GRID_ASSERT(ipiv.size()==batchCount*n);
GRID_ASSERT(info.size()==batchCount);
#ifdef GRID_HIP
auto err = hipblasZgetrfBatched(gridblasHandle,(int)n,
(hipblasDoubleComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(int*) &info[0],
(int)batchCount);
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
auto err = cublasZgetrfBatched(gridblasHandle, (int)n,
(cuDoubleComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(int*) &info[0],
(int)batchCount);
GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
getrfBatchedSYCL(n, Ann, ipiv, info);
#endif
}
void getrfBatched(int64_t n,
deviceVector<ComplexF*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info)
{
int64_t batchCount = Ann.size();
GRID_ASSERT(ipiv.size()==batchCount*n);
GRID_ASSERT(info.size()==batchCount);
#ifdef GRID_HIP
auto err = hipblasCgetrfBatched(gridblasHandle,(int)n,
(hipblasComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(int*) &info[0],
(int)batchCount);
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
auto err = cublasCgetrfBatched(gridblasHandle, (int)n,
(cuComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(int*) &info[0],
(int)batchCount);
GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
getrfBatchedSYCL(n, Ann, ipiv, info);
#endif
}
#ifdef GRID_SYCL
template<typename T>
void getriBatchedSYCL(int64_t n,
deviceVector<T*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info,
deviceVector<T*> &Cnn) {
int64_t batchCount = Ann.size();
static deviceVector<T> scratchpad;
int64_t sp_size = oneapi::mkl::lapack::getri_batch_scratchpad_size<T>(*gridblasHandle, &n, &n, (int64_t)1, &batchCount);
if (sp_size > scratchpad.size())
scratchpad.resize(sp_size);
static deviceVector<int64_t*> _ipiv;
if (batchCount > _ipiv.size())
_ipiv.resize(batchCount);
int64_t** p_ipiv = &_ipiv[0];
int64_t* pipiv = &ipiv[0];
accelerator_for(i, batchCount, 1, { p_ipiv[i] = &pipiv[i*n]; });
oneapi::mkl::lapack::getri_batch(*gridblasHandle,
&n,
(T **)&Ann[0],
&n,
(int64_t**)p_ipiv,
(int64_t)1, &batchCount,
(T *)&scratchpad[0], (int64_t)scratchpad.size(),
std::vector<sycl::event>());
synchronise();
T** pA = &Ann[0];
T** pC = &Cnn[0];
accelerator_for(i, batchCount*n*n, 1, {
auto j = i / batchCount;
auto k = i % batchCount;
pC[k][j] = pA[k][j];
});
}
#endif
void getriBatched(int64_t n,
deviceVector<ComplexD*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info,
deviceVector<ComplexD*> &Cnn)
{
int64_t batchCount = Ann.size();
GRID_ASSERT(ipiv.size()==batchCount*n);
GRID_ASSERT(info.size()==batchCount);
GRID_ASSERT(Cnn.size()==batchCount);
#ifdef GRID_HIP
auto err = hipblasZgetriBatched(gridblasHandle,(int)n,
(hipblasDoubleComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(hipblasDoubleComplex **)&Cnn[0], (int)n,
(int*) &info[0],
(int)batchCount);
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
auto err = cublasZgetriBatched(gridblasHandle, (int)n,
(cuDoubleComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(cuDoubleComplex **)&Cnn[0], (int)n,
(int*) &info[0],
(int)batchCount);
GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
getriBatchedSYCL(n, Ann, ipiv, info, Cnn);
#endif
}
void getriBatched(int64_t n,
deviceVector<ComplexF*> &Ann,
deviceVector<int64_t> &ipiv,
deviceVector<int64_t> &info,
deviceVector<ComplexF*> &Cnn)
{
int64_t batchCount = Ann.size();
GRID_ASSERT(ipiv.size()==batchCount*n);
GRID_ASSERT(info.size()==batchCount);
GRID_ASSERT(Cnn.size()==batchCount);
#ifdef GRID_HIP
auto err = hipblasCgetriBatched(gridblasHandle,(int)n,
(hipblasComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(hipblasComplex **)&Cnn[0], (int)n,
(int*) &info[0],
(int)batchCount);
GRID_ASSERT(err==HIPBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_CUDA
auto err = cublasCgetriBatched(gridblasHandle, (int)n,
(cuComplex **)&Ann[0], (int)n,
(int*) &ipiv[0],
(cuComplex **)&Cnn[0], (int)n,
(int*) &info[0],
(int)batchCount);
GRID_ASSERT(err==CUBLAS_STATUS_SUCCESS);
#endif
#ifdef GRID_SYCL
getriBatchedSYCL(n, Ann, ipiv, info, Cnn);
#endif
}
template<typename dtype>
void inverseBatched(int64_t n,
deviceVector<dtype*> &Ann, // this will be overwritten with LU decomposition
deviceVector<dtype*> &Cnn // this will be overwritten with the inverse
) {
int64_t batchCount = Ann.size();
RealD t0 = usecond();
deviceVector<int64_t> ipiv(batchCount*n);
deviceVector<int64_t> info(batchCount);
//RealD t1 = usecond();
getrfBatched(n, Ann, ipiv, info);
// test info for non-invertibility? set to nan if yes?
getriBatched(n, Ann, ipiv, info, Cnn);
//synchronise();
//RealD t2 = usecond();
//std::cout << GridLogMessage << "Temp " << t1-t0 << " rf/ri " << t2-t1 << std::endl;
}
template<typename dtype>
void determinantBatched(int64_t n,
deviceVector<dtype*> &Ann, // this will be overwritten with LU decomposition
deviceVector<dtype*> &C // this will be overwritten with determinant
) {
int64_t batchCount = Ann.size();
//RealD t0 = usecond();
deviceVector<int64_t> ipiv(batchCount*n);
deviceVector<int64_t> info(batchCount);
dtype** pAnn = (dtype**)&Ann[0];
dtype** pC = (dtype**)&C[0];
#if defined(GRID_CUDA) || defined(GRID_HIP)
int* pipiv = (int*)&ipiv[0];
#else
int64_t* pipiv = (int64_t*)&ipiv[0];
#endif
//RealD t1 = usecond();
getrfBatched(n, Ann, ipiv, info);
//RealD t2 = usecond();
accelerator_for(i,batchCount,1,{
dtype det = 1.0;
for (int64_t j=0;j<n;j++) {
det *= pAnn[i][n*j + j];
// branchless signs
det *= (pipiv[i*n + j] == j+1) ? (1.0) : (-1.0);
}
*pC[i] = det;
});
//RealD t3 = usecond();
//std::cout << GridLogMessage << "Temp " << t1 - t0 << " rf/ri " << t2-t1 << "final" << t3 - t2 << std::endl;
}
#endif
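A hedged usage sketch for the batched inversion above (sizes are illustrative; Ann is overwritten with its LU factors, Cnn receives the inverses, as the comments on the template note):
GridBLAS BLAS;
int64_t n = 12, batch = 64;                  // assumed, illustrative only
deviceVector<ComplexD>  Abuf(batch*n*n), Cbuf(batch*n*n);
deviceVector<ComplexD*> Ann(batch), Cnn(batch);
for (int64_t b = 0; b < batch; b++) {
  acceleratorPut(Ann[b], &Abuf[b*n*n]);      // pointer lists into flat buffers
  acceleratorPut(Cnn[b], &Cbuf[b*n*n]);
}
BLAS.inverseBatched(n, Ann, Cnn);            // getrf + getri under the hood
BLAS.synchronise();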
template<class CComplex>
double benchmark(int M, int N, int K, int BATCH)
{
@@ -1,282 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: MomentumProject.h
Copyright (C) 2025
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/*
MomentumProject
Import vectors -> nxyz x (ncomponent x nt)
Import complex phases -> nmom x nxyz
apply = via (possibly batched) GEMM
*/
template<class Field, class ComplexField>
class MomentumProject
{
public:
typedef typename Field::scalar_type scalar;
typedef typename Field::scalar_object scalar_object;
GridBase *grid;
uint64_t nmom;
uint64_t nxyz;
uint64_t nt;
uint64_t nbtw;
uint64_t words;
deviceVector<scalar> BLAS_V; //
deviceVector<scalar> BLAS_M; //
deviceVector<scalar> BLAS_P; //
MomentumProject(){};
~MomentumProject(){ Deallocate(); };
void Deallocate(void)
{
grid=nullptr;
nmom=0;
nxyz=0;
nt=0;
nbtw=0;
words=0;
BLAS_V.resize(0);
BLAS_M.resize(0);
BLAS_P.resize(0);
}
void Allocate(int _nmom,GridBase *_grid)
{
grid=_grid;
Coordinate ldims = grid->LocalDimensions();
nmom=_nmom;
nt = ldims[grid->Nd()-1];
nxyz = grid->lSites()/nt;
words = sizeof(scalar_object)/sizeof(scalar);
nbtw = nt * words;
BLAS_V.resize (nxyz * nt * words );
BLAS_M.resize (nmom * nxyz );
BLAS_P.resize (nmom * nt * words );
}
void ImportMomenta(const std::vector <ComplexField> &momenta)
{
GRID_ASSERT(momenta.size()==nmom);
// might as well just make the momenta here
typedef typename Field::vector_object vobj;
int nd = grid->_ndimension;
uint64_t sz = BLAS_M.size();
GRID_ASSERT(momenta[0].Grid()==grid);
GRID_ASSERT(sz == nxyz * nmom);
Coordinate rdimensions = grid->_rdimensions;
Coordinate ldims = grid->LocalDimensions();
int64_t osites = grid->oSites();
Coordinate simd = grid->_simd_layout;
const int Nsimd = vobj::Nsimd();
uint64_t lwords = words; // local variable for copy in to GPU
int64_t Nxyz = nxyz;
auto blasData_p = &BLAS_M[0];
for(int m=0;m<momenta.size();m++){
autoView( Data , momenta[m], AcceleratorRead);
auto Data_p = &Data[0];
accelerator_for(xyz,nxyz,1,{
//////////////////////////////////////////
// isite -- map lane within buffer to lane within lattice
////////////////////////////////////////////
Coordinate lcoor(nd,0);
Lexicographic::CoorFromIndex(lcoor,xyz,ldims);
Coordinate icoor(nd);
Coordinate ocoor(nd);
for (int d = 0; d < nd; d++) {
icoor[d] = lcoor[d]/rdimensions[d];
ocoor[d] = lcoor[d]%rdimensions[d];
}
int64_t osite;
int64_t isite;
Lexicographic::IndexFromCoor(ocoor,osite,rdimensions);
Lexicographic::IndexFromCoor(icoor,isite,simd);
// BLAS_M[nmom][slice_vol]
// Fortran Column major BLAS layout is M_xyz,mom
scalar data = extractLane(isite,Data[osite]);
uint64_t idx = xyz+m*Nxyz;
blasData_p[idx] = data;
});
}
}
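The coordinate unflattening above uses Grid's lexicographic helpers; for orientation only, a minimal standalone sketch of the convention they implement (dimension 0 runs fastest), not Grid's actual code:
#include <vector>
static void CoorFromIndex(std::vector<int> &coor,int index,const std::vector<int> &dims){
  for(size_t d=0;d<dims.size();d++){ coor[d]=index%dims[d]; index/=dims[d]; } // peel fastest dim
}
static void IndexFromCoor(const std::vector<int> &coor,int &index,const std::vector<int> &dims){
  index=0;
  for(int d=(int)dims.size()-1;d>=0;d--) index = index*dims[d]+coor[d];
}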
void ImportVector(Field &vec)
{
typedef typename Field::vector_object vobj;
int nd = grid->_ndimension;
uint64_t sz = BLAS_V.size();
GRID_ASSERT(sz == nxyz * words * nt);
Coordinate rdimensions = grid->_rdimensions;
Coordinate ldims= grid->LocalDimensions();
int64_t osites = grid->oSites();
Coordinate simd = grid->_simd_layout;
const int Nsimd = vobj::Nsimd();
uint64_t lwords= words; // local variable for copy in to GPU
auto blasData_p = &BLAS_V[0];
autoView( Data , vec, AcceleratorRead);
auto Data_p = &Data[0];
int64_t nwords = words;// for capture
int64_t Nt = nt;// for capture
accelerator_for(sf,osites,Nsimd,{
#ifdef GRID_SIMT
{
int lane=acceleratorSIMTlane(Nsimd); // buffer lane
#else
for(int lane=0;lane<Nsimd;lane++) {
#endif
//////////////////////////////////////////
// isite -- map lane within buffer to lane within lattice
////////////////////////////////////////////
Coordinate lcoor(nd,0);
Coordinate icoor(nd);
Coordinate ocoor(nd);
Lexicographic::CoorFromIndex(icoor,lane,simd);
Lexicographic::CoorFromIndex(ocoor,sf,rdimensions);
int64_t l_xyz = 0;
for (int d = 0; d < nd; d++) {
lcoor[d] = rdimensions[d]*icoor[d] + ocoor[d];
}
uint64_t l_t = lcoor[nd-1];
Coordinate xyz_coor = lcoor;
xyz_coor[nd-1] =0;
Lexicographic::IndexFromCoor(xyz_coor,l_xyz,ldims);
scalar_object data = extractLane(lane,Data[sf]);
scalar *data_words = (scalar *) &data;
for(int w = 0 ; w < nwords; w++) {
// BLAS_V[slice_vol][nt][words]
// Fortran Column major BLAS layout is V_(t,w)_xyz
uint64_t idx = w+l_t*nwords + l_xyz * nwords * Nt;
blasData_p[idx] = data_words[w];
}
#ifdef GRID_SIMT
}
#else
}
#endif
});
}
void ExportMomentumProjection(std::vector<typename Field::scalar_object> &projection)
{
projection.resize(nmom*nt);
acceleratorCopyFromDevice(&BLAS_P[0],(scalar *)&projection[0],BLAS_P.size()*sizeof(scalar));
// Could decide on a layout later?
}
// Row major layout "C" order:
// BLAS_V[slice_vol][nt][words]
// BLAS_M[nmom][slice_vol]
// BLAS_P[nmom][nt][words]
//
// Fortran Column major BLAS layout is V_(w,t)_xyz
// Fortran Column major BLAS layout is M_xyz,mom
// Fortran Column major BLAS layout is P_(w,t),mom
//
// Projected
//
// P = (V * M)_(w,t),mom
//
void Project(Field &data,std::vector< typename Field::scalar_object > & projected_gdata)
{
this->ImportVector(data);
std::vector< typename Field::scalar_object > projected_planes;
deviceVector<scalar *> Vd(1);
deviceVector<scalar *> Md(1);
deviceVector<scalar *> Pd(1);
scalar * Vh = & BLAS_V[0];
scalar * Mh = & BLAS_M[0];
scalar * Ph = & BLAS_P[0];
acceleratorPut(Vd[0],Vh);
acceleratorPut(Md[0],Mh);
acceleratorPut(Pd[0],Ph);
GridBLAS BLAS;
/////////////////////////////////////////
// P_(w,t),mom = V_(w,t),xyz . M_xyz,mom
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
words*nt,nmom,nxyz,
scalar(1.0),
Vd,
Md,
scalar(0.0), // wipe out result
Pd);
BLAS.synchronise();
ExportMomentumProjection(projected_planes); // resizes
/////////////////////////////////
// Reduce across MPI ranks
/////////////////////////////////
int nd = grid->Nd();
int gt = grid->GlobalDimensions()[nd-1];
int lt = grid->LocalDimensions()[nd-1];
projected_gdata.resize(gt*nmom);
for(int t=0;t<gt*nmom;t++){ // global Nt array with zeroes for stuff not on this node
projected_gdata[t]=Zero();
}
for(int t=0;t<lt;t++){
for(int m=0;m<nmom;m++){
int st = grid->LocalStarts()[nd-1];
projected_gdata[t+st + gt*m] = projected_planes[t+lt*m];
}}
grid->GlobalSumVector((scalar *)&projected_gdata[0],gt*nmom*words);
}
};
NAMESPACE_END(Grid);
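A hedged usage sketch of the class above (the names latt, phases and Nmom are illustrative; phases would hold one exp(-i p.x) field per momentum):
MomentumProject<LatticeFermion,LatticeComplex> MP;
MP.Allocate(Nmom,grid);              // local grid; last dimension is time
MP.ImportMomenta(phases);            // builds the nmom x nxyz phase matrix
std::vector<LatticeFermion::scalar_object> proj;
MP.Project(latt,proj);               // proj[t + gt*m] = sum_xyz phase_m * field
MP.Deallocate();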
@@ -69,8 +69,8 @@ public:
DeflatedGuesser(const std::vector<Field> & _evec, const std::vector<RealD> & _eval, const unsigned int _N)
: evec(_evec), eval(_eval), N(_N)
{
GRID_ASSERT(evec.size()==eval.size());
GRID_ASSERT(N <= evec.size());
}
virtual void operator()(const Field &src,Field &guess) {
@@ -141,7 +141,8 @@ public:
}
//postprocessing
std::cout << GridLogMessage << "Start BlockPromote for loop" << std::endl;
for (int j=0;j<Nsrc;j++) {
std::cout << GridLogMessage << "BlockProject iter: " << j << std::endl;
blockPromote(guess_coarse[j],guess[j],subspace);
guess[j].Checkerboard() = src[j].Checkerboard();
@@ -1,376 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: MultiRHSBlockCGLinalg.h
Copyright (C) 2024
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/* Need helper object for BLAS accelerated mrhs blockCG */
template<class Field>
class MultiRHSBlockCGLinalg
{
public:
typedef typename Field::scalar_type scalar;
typedef typename Field::scalar_object scalar_object;
typedef typename Field::vector_object vector_object;
deviceVector<scalar> BLAS_X; // nrhs x vol -- the sources
deviceVector<scalar> BLAS_Y; // nrhs x vol -- the result
deviceVector<scalar> BLAS_C; // nrhs x nrhs -- the coefficients
deviceVector<scalar> BLAS_Cred; // nrhs x nrhs x oSites -- reduction buffer
deviceVector<scalar *> Xdip;
deviceVector<scalar *> Ydip;
deviceVector<scalar *> Cdip;
MultiRHSBlockCGLinalg() {};
~MultiRHSBlockCGLinalg(){ Deallocate(); };
void Deallocate(void)
{
Xdip.resize(0);
Ydip.resize(0);
Cdip.resize(0);
BLAS_Cred.resize(0);
BLAS_C.resize(0);
BLAS_X.resize(0);
BLAS_Y.resize(0);
}
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0)
{
std::vector<Field> Y_copy(AP.size(),AP[0].Grid());
for(int r=0;r<AP.size();r++){
Y_copy[r] = Y[r];
}
MulMatrix(AP,m,X);
for(int r=0;r<AP.size();r++){
AP[r] = scale*AP[r]+Y_copy[r];
}
}
void MulMatrix(std::vector<Field> &Y, Eigen::MatrixXcd &m , const std::vector<Field> &X)
{
typedef typename Field::scalar_type scomplex;
GridBase *grid;
uint64_t vol;
uint64_t words;
int nrhs = Y.size();
grid = X[0].Grid();
vol = grid->lSites();
words = sizeof(scalar_object)/sizeof(scalar);
int64_t vw = vol * words;
RealD t0 = usecond();
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
RealD t1 = usecond();
/////////////////////////////////////////////
// Copy in the multi-rhs sources
/////////////////////////////////////////////
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(x_v,X[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
}
// Assumes Eigen storage contiguous
acceleratorCopyToDevice(&m(0,0),&BLAS_C[0],BLAS_C.size()*sizeof(scalar));
/*
* in Fortran column major notation (cuBlas order)
*
* Xxr = [X1(x)][..][Xn(x)]
* Yxr = [Y1(x)][..][Ym(x)]
* Y = X . C
*/
deviceVector<scalar *> Xd(1);
deviceVector<scalar *> Yd(1);
deviceVector<scalar *> Cd(1);
scalar * Xh = & BLAS_X[0];
scalar * Yh = & BLAS_Y[0];
scalar * Ch = & BLAS_C[0];
acceleratorPut(Xd[0],Xh);
acceleratorPut(Yd[0],Yh);
acceleratorPut(Cd[0],Ch);
RealD t2 = usecond();
GridBLAS BLAS;
/////////////////////////////////////////
// Y = X*C (transpose?)
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
vw,nrhs,nrhs,
scalar(1.0),
Xd,
Cd,
scalar(0.0), // wipe out Y
Yd);
BLAS.synchronise();
RealD t3 = usecond();
// Copy back Y = m X
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(y_v,Y[r],AcceleratorWrite);
acceleratorCopyDeviceToDevice(&BLAS_Y[offset],&y_v[0],sizeof(scalar_object)*vol);
}
RealD t4 = usecond();
std::cout <<GridLogPerformance << "MulMatrix alloc took "<< t1-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix blas took "<< t3-t2<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix copy took "<< t4-t3<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix total "<< t4-t0<<" us"<<std::endl;
}
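One caveat worth recording for MulMatrix above: the flat copy of m into BLAS_C relies on Eigen's default column-major storage, which is what makes the buffer directly consumable as a Fortran-order BLAS operand. A compile-time guard, as a sketch:
// Hedged: documents the storage-order assumption behind the m(0,0) copy.
static_assert(!(Eigen::MatrixXcd::Flags & Eigen::RowMajorBit),
              "MulMatrix assumes column-major Eigen storage for BLAS_C");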
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y)
{
#if 0
int nrhs;
GridBase *grid;
uint64_t vol;
uint64_t words;
nrhs = X.size();
GRID_ASSERT(X.size()==Y.size());
conformable(X[0],Y[0]);
grid = X[0].Grid();
vol = grid->lSites();
words = sizeof(scalar_object)/sizeof(scalar);
int64_t vw = vol * words;
RealD t0 = usecond();
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
RealD t1 = usecond();
/////////////////////////////////////////////
// Copy in the multi-rhs sources
/////////////////////////////////////////////
for(int r=0;r<nrhs;r++){
int64_t offset = r*vw;
autoView(x_v,X[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
autoView(y_v,Y[r],AcceleratorRead);
acceleratorCopyDeviceToDevice(&y_v[0],&BLAS_Y[offset],sizeof(scalar_object)*vol);
}
RealD t2 = usecond();
/*
* in Fortran column major notation (cuBlas order)
*
* Xxr = [X1(x)][..][Xn(x)]
*
* Yxr = [Y1(x)][..][Ym(x)]
*
* C_rs = X^dag Y
*/
deviceVector<scalar *> Xd(1);
deviceVector<scalar *> Yd(1);
deviceVector<scalar *> Cd(1);
scalar * Xh = & BLAS_X[0];
scalar * Yh = & BLAS_Y[0];
scalar * Ch = & BLAS_C[0];
acceleratorPut(Xd[0],Xh);
acceleratorPut(Yd[0],Yh);
acceleratorPut(Cd[0],Ch);
GridBLAS BLAS;
RealD t3 = usecond();
/////////////////////////////////////////
// C_rs = X^dag Y
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nrhs,nrhs,vw,
ComplexD(1.0),
Xd,
Yd,
ComplexD(0.0), // wipe out C
Cd);
BLAS.synchronise();
RealD t4 = usecond();
std::vector<scalar> HOST_C(BLAS_C.size()); // nrhs . nrhs -- the coefficients
acceleratorCopyFromDevice(&BLAS_C[0],&HOST_C[0],BLAS_C.size()*sizeof(scalar));
grid->GlobalSumVector(&HOST_C[0],nrhs*nrhs);
RealD t5 = usecond();
for(int rr=0;rr<nrhs;rr++){
for(int r=0;r<nrhs;r++){
int off = r+nrhs*rr;
m(r,rr)=HOST_C[off];
}
}
RealD t6 = usecond();
uint64_t M=nrhs;
uint64_t N=nrhs;
uint64_t K=vw;
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
RealD flops = 8.0*M*N*K;
flops = flops/(t4-t3)/1.e3;
bytes = bytes/(t4-t3)/1.e3;
std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t6 "<< t6-t5<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
#else
int nrhs;
GridBase *grid;
uint64_t vol;
uint64_t words;
nrhs = X.size();
GRID_ASSERT(X.size()==Y.size());
conformable(X[0],Y[0]);
grid = X[0].Grid();
int rd0 = grid->_rdimensions[0] * grid->_rdimensions[1];
vol = grid->oSites()/rd0;
words = rd0*sizeof(vector_object)/sizeof(scalar);
int64_t vw = vol * words;
GRID_ASSERT(vw == grid->lSites()*sizeof(scalar_object)/sizeof(scalar));
RealD t0 = usecond();
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
BLAS_Cred.resize(nrhs * nrhs * vol);// cost free if size doesn't change
RealD t1 = usecond();
/////////////////////////////////////////////
// Copy in the multi-rhs sources -- layout batched BLAS ready
/////////////////////////////////////////////
for(int r=0;r<nrhs;r++){
autoView(x_v,X[r],AcceleratorRead);
autoView(y_v,Y[r],AcceleratorRead);
scalar *from_x=(scalar *)&x_v[0];
scalar *from_y=(scalar *)&y_v[0];
scalar *BX = &BLAS_X[0];
scalar *BY = &BLAS_Y[0];
accelerator_for(ssw,vw,1,{
uint64_t ss=ssw/words;
uint64_t w=ssw%words;
uint64_t offset = w+r*words+ss*nrhs*words; // [ss][rhs][words]
BX[offset] = from_x[ssw];
BY[offset] = from_y[ssw];
});
}
RealD t2 = usecond();
/*
* in Fortran column major notation (cuBlas order)
*
* Xxr = [X1(x)][..][Xn(x)]
*
* Yxr = [Y1(x)][..][Ym(x)]
*
* C_rs = X^dag Y
*/
Xdip.resize(vol);
Ydip.resize(vol);
Cdip.resize(vol);
std::vector<scalar *> Xh(vol);
std::vector<scalar *> Yh(vol);
std::vector<scalar *> Ch(vol);
for(uint64_t ss=0;ss<vol;ss++){
Xh[ss] = & BLAS_X[ss*nrhs*words];
Yh[ss] = & BLAS_Y[ss*nrhs*words];
Ch[ss] = & BLAS_Cred[ss*nrhs*nrhs];
}
acceleratorCopyToDevice(&Xh[0],&Xdip[0],vol*sizeof(scalar *));
acceleratorCopyToDevice(&Yh[0],&Ydip[0],vol*sizeof(scalar *));
acceleratorCopyToDevice(&Ch[0],&Cdip[0],vol*sizeof(scalar *));
GridBLAS BLAS;
RealD t3 = usecond();
/////////////////////////////////////////
// C_rs = X^dag Y
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nrhs,nrhs,words,
ComplexD(1.0),
Xdip,
Ydip,
ComplexD(0.0), // wipe out C
Cdip);
BLAS.synchronise();
RealD t4 = usecond();
std::vector<scalar> HOST_C(BLAS_Cred.size()); // nrhs . nrhs -- the coefficients
acceleratorCopyFromDevice(&BLAS_Cred[0],&HOST_C[0],BLAS_Cred.size()*sizeof(scalar));
RealD t5 = usecond();
m = Eigen::MatrixXcd::Zero(nrhs,nrhs);
for(int ss=0;ss<vol;ss++){
Eigen::Map<Eigen::MatrixXcd> eC((std::complex<double> *)&HOST_C[ss*nrhs*nrhs],nrhs,nrhs);
m = m + eC;
}
RealD t6l = usecond();
grid->GlobalSumVector((scalar *) &m(0,0),nrhs*nrhs);
RealD t6 = usecond();
uint64_t M=nrhs;
uint64_t N=nrhs;
uint64_t K=vw;
RealD xybytes = grid->lSites()*sizeof(scalar_object);
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
RealD flops = 8.0*M*N*K;
flops = flops/(t4-t3)/1.e3;
bytes = bytes/(t4-t3)/1.e3;
xybytes = 4*xybytes/(t2-t1)/1.e3;
std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t5 "<< t5-t4<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix lsum t6l "<< t6l-t5<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t6 "<< t6-t6l<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
#endif
}
};
NAMESPACE_END(Grid);
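As a standalone check of the chunked reduction used by InnerProductMatrix above (per-site nrhs x nrhs blocks summed afterwards reproduce C = X^dag Y), a minimal Eigen sketch under assumed toy sizes:
#include <Eigen/Dense>
#include <cassert>
int main(){
  const int rows=96, nrhs=4, chunk=8;
  Eigen::MatrixXcd X=Eigen::MatrixXcd::Random(rows,nrhs);
  Eigen::MatrixXcd Y=Eigen::MatrixXcd::Random(rows,nrhs);
  Eigen::MatrixXcd C=Eigen::MatrixXcd::Zero(nrhs,nrhs);
  for(int r0=0;r0<rows;r0+=chunk)  // one "site" per chunk
    C += X.middleRows(r0,chunk).adjoint()*Y.middleRows(r0,chunk);
  assert((C - X.adjoint()*Y).norm() < 1e-10*C.norm());
  return 0;
}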
@@ -131,12 +131,12 @@ public:
typedef typename Field::vector_object vobj;
// std::cout << GridLogMessage <<" BlockProjector importing "<<nvec<< " fine grid vectors" <<std::endl;
GRID_ASSERT(vecs[0].Grid()==fine_grid);
subdivides(coarse_grid,fine_grid); // require they map
int _ndimension = coarse_grid->_ndimension;
GRID_ASSERT(block_vol == fine_grid->oSites() / coarse_grid->oSites());
Coordinate block_r (_ndimension);
for(int d=0 ; d<_ndimension;d++){
@@ -164,7 +164,7 @@ public:
const int Nsimd = vobj::Nsimd();
// std::cout << "sz "<<sz<<std::endl;
// std::cout << "prod "<<Nsimd * coarse_grid->oSites() * block_vol * nvec * words<<std::endl;
GRID_ASSERT(sz == Nsimd * coarse_grid->oSites() * block_vol * nvec * words);
uint64_t lwords= words; // local variable for copy in to GPU
accelerator_for(sf,osites,Nsimd,{
#ifdef GRID_SIMT
@@ -198,7 +198,7 @@ public:
+ v*bv
+ sb;
// GRID_ASSERT(site*lwords<sz);
scalar_object * ptr = (scalar_object *)&blasData_p[site*lwords];
@@ -219,12 +219,12 @@ public:
int nvec = vecs.size();
GRID_ASSERT(vecs[0].Grid()==fine_grid);
subdivides(coarse_grid,fine_grid); // require they map
int _ndimension = coarse_grid->_ndimension;
GRID_ASSERT(block_vol == fine_grid->oSites() / coarse_grid->oSites());
Coordinate block_r (_ndimension);
for(int d=0 ; d<_ndimension;d++){
@@ -299,7 +299,7 @@ public:
// std::cout << " BlockProjector importing "<<nvec<< " coarse grid vectors" <<std::endl; // std::cout << " BlockProjector importing "<<nvec<< " coarse grid vectors" <<std::endl;
GRID_ASSERT(vecs[0].Grid()==coarse_grid); assert(vecs[0].Grid()==coarse_grid);
int _ndimension = coarse_grid->_ndimension; int _ndimension = coarse_grid->_ndimension;
@@ -320,7 +320,7 @@ public:
// loop over fine sites
const int Nsimd = vobj::Nsimd();
uint64_t cwords=sizeof(typename vobj::scalar_object)/sizeof(scalar);
GRID_ASSERT(cwords==nbasis);
accelerator_for(sc,osites,Nsimd,{
#ifdef GRID_SIMT
@@ -353,7 +353,7 @@ public:
typedef typename vobj::scalar_object coarse_scalar_object;
// std::cout << GridLogMessage<<" BlockProjector exporting "<<nvec<< " coarse grid vectors" <<std::endl;
GRID_ASSERT(vecs[0].Grid()==coarse_grid);
int _ndimension = coarse_grid->_ndimension;
@@ -375,7 +375,7 @@ public:
// loop over fine sites
const int Nsimd = vobj::Nsimd();
uint64_t cwords=sizeof(typename vobj::scalar_object)/sizeof(scalar);
GRID_ASSERT(cwords==nbasis);
accelerator_for(sc,osites,Nsimd,{
// Wrap in a macro "FOR_ALL_LANES(lane,{ ... });
@@ -409,7 +409,7 @@ public:
int nrhs=fine.size();
int _nbasis = sizeof(typename cobj::scalar_object)/sizeof(scalar);
// std::cout << "blockProject nbasis " <<nbasis<<" " << _nbasis<<std::endl;
GRID_ASSERT(nbasis==_nbasis);
BLAS_F.resize (fine_vol * words * nrhs );
BLAS_C.resize (coarse_vol * nbasis * nrhs );
@@ -447,10 +447,10 @@ public:
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nbasis,nrhs,vw,
scalar(1.0),
Vd,
Fd,
scalar(0.0), // wipe out C
Cd);
BLAS.synchronise();
// std::cout << "BlockProject done"<<std::endl;
@@ -464,7 +464,7 @@ public:
{
int nrhs=fine.size();
int _nbasis = sizeof(typename cobj::scalar_object)/sizeof(scalar);
GRID_ASSERT(nbasis==_nbasis);
BLAS_F.resize (fine_vol * words * nrhs );
BLAS_C.resize (coarse_vol * nbasis * nrhs );
@@ -497,10 +497,10 @@ public:
int64_t vw = block_vol * words;
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
vw,nrhs,nbasis,
scalar(1.0),
Vd,
Cd,
scalar(0.0), // wipe out C
Fd);
BLAS.synchronise();
// std::cout << " blas call done"<<std::endl;
@@ -98,7 +98,7 @@ public:
void ImportEigenVector(Field &evec,RealD &_eval, int ev)
{
// std::cout << " ev " <<ev<<" eval "<<_eval<< std::endl;
GRID_ASSERT(ev<eval.size());
eval[ev] = _eval;
int64_t offset = ev*vol*words;
@@ -113,7 +113,7 @@ public:
// Could use to import a batch of eigenvectors
void ImportEigenBasis(std::vector<Field> &evec,std::vector<RealD> &_eval, int _ev0, int _nev)
{
GRID_ASSERT(_ev0+_nev<=evec.size());
Allocate(_nev,evec[0].Grid());
@@ -126,8 +126,8 @@ public:
void DeflateSources(std::vector<Field> &source,std::vector<Field> & guess)
{
int nrhs = source.size();
GRID_ASSERT(source.size()==guess.size());
GRID_ASSERT(grid == guess[0].Grid());
conformable(guess[0],source[0]);
int64_t vw = vol * words;
@@ -182,14 +182,14 @@ public:
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
nev,nrhs,vw,
scalar(1.0),
Ed,
Rd,
scalar(0.0), // wipe out C
Cd);
BLAS.synchronise();
GRID_ASSERT(BLAS_C.size()==nev*nrhs);
std::vector<scalar> HOST_C(BLAS_C.size()); // nrhs . nev -- the coefficients
acceleratorCopyFromDevice(&BLAS_C[0],&HOST_C[0],BLAS_C.size()*sizeof(scalar));
@@ -210,10 +210,10 @@ public:
/////////////////////////////////////////
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
vw,nrhs,nev,
scalar(1.0),
Ed, // x . nev
Cd, // nev . nrhs
scalar(0.0),
Gd);
BLAS.synchronise();
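The two GEMMs above implement deflation: coefficients C = E^dag R are formed, rescaled by the inverse eigenvalues (in code between the hunks shown here, not visible), and promoted back as G = E C. A minimal standalone Eigen sketch of that algebra, under assumed toy sizes:
#include <Eigen/Dense>
int main(){
  const int vw=128, nev=8, nrhs=4;                        // illustrative only
  Eigen::MatrixXcd E=Eigen::MatrixXcd::Random(vw,nev);    // eigenvector panel
  Eigen::MatrixXcd R=Eigen::MatrixXcd::Random(vw,nrhs);   // sources
  Eigen::VectorXd lam=Eigen::VectorXd::Random(nev).cwiseAbs()+Eigen::VectorXd::Ones(nev);
  Eigen::MatrixXcd C=E.adjoint()*R;                       // first GEMM: C = E^dag R
  for(int i=0;i<nev;i++) C.row(i)/=lam(i);                // 1/lambda_i rescale
  Eigen::MatrixXcd G=E*C;                                 // second GEMM: guess panel
  (void)G;
  return 0;
}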
@@ -270,7 +270,7 @@ class TwoLevelCG : public LinearFunction<Field>
std::vector<RealD> src_nrm(nrhs);
for(int rhs=0;rhs<nrhs;rhs++) {
src_nrm[rhs]=norm2(src[rhs]);
GRID_ASSERT(src_nrm[rhs]!=0.0);
}
std::vector<RealD> tn(nrhs);
@@ -53,7 +53,6 @@ class TwoLevelCGmrhs
// Fine operator, Smoother, CoarseSolver
LinearOperatorBase<Field> &_FineLinop;
LinearFunction<Field> &_Smoother;
MultiRHSBlockCGLinalg<Field> _BlockCGLinalg;
GridStopWatch ProjectTimer;
GridStopWatch PromoteTimer;
@@ -63,12 +62,7 @@ class TwoLevelCGmrhs
GridStopWatch SmoothTimer; GridStopWatch SmoothTimer;
GridStopWatch InsertTimer; GridStopWatch InsertTimer;
/*
Field rrr;
Field sss;
Field qqq;
Field zzz;
*/
// more most opertor functions // more most opertor functions
TwoLevelCGmrhs(RealD tol, TwoLevelCGmrhs(RealD tol,
Integer maxit, Integer maxit,
@@ -79,313 +73,12 @@ class TwoLevelCGmrhs
    MaxIterations(maxit),
    _FineLinop(FineLinop),
    _Smoother(Smoother)
    /*
    rrr(fine),
    sss(fine),
    qqq(fine),
    zzz(fine)
    */
  {
    grid       = fine;
  };

  // Vector case
  virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
{
// SolveSingleSystem(src,x);
SolvePrecBlockCG(src,x);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Thin QR factorisation (google it)
////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
//Dimensions
// R_{ferm x Nblock} = Q_{ferm x Nblock} x C_{Nblock x Nblock} -> ferm x Nblock
//
// Rdag R = m_rr = Herm = L L^dag <-- Cholesky decomposition (LLT routine in Eigen)
//
// Q C = R => Q = R C^{-1}
//
// Want Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock}
//
// Set C = L^{dag}, and then Q^dag Q = ident
//
// Checks:
// Cdag C = Rdag R ; passes.
// QdagQ = 1 ; passes
////////////////////////////////////////////////////////////////////////////////////////////////////
void ThinQRfact (Eigen::MatrixXcd &m_zz,
Eigen::MatrixXcd &C,
Eigen::MatrixXcd &Cinv,
std::vector<Field> & Q,
std::vector<Field> & MQ,
const std::vector<Field> & Z,
const std::vector<Field> & MZ)
{
RealD t0=usecond();
_BlockCGLinalg.InnerProductMatrix(m_zz,MZ,Z);
RealD t1=usecond();
m_zz = 0.5*(m_zz+m_zz.adjoint());
Eigen::MatrixXcd L = m_zz.llt().matrixL();
C = L.adjoint();
Cinv = C.inverse();
RealD t3=usecond();
_BlockCGLinalg.MulMatrix( Q,Cinv,Z);
_BlockCGLinalg.MulMatrix(MQ,Cinv,MZ);
RealD t4=usecond();
std::cout << " ThinQRfact IP :"<< t1-t0<<" us"<<std::endl;
std::cout << " ThinQRfact Eigen :"<< t3-t1<<" us"<<std::endl;
std::cout << " ThinQRfact MulMat:"<< t4-t3<<" us"<<std::endl;
}
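As a standalone illustration of this Cholesky-based thin QR, with dense Eigen columns standing in for the lattice Fields (a sketch of the factorisation itself, not of Grid's blocked implementation):

#include <Eigen/Dense>

// Z = Q C with Q^dag Q = 1: form the Gram matrix, Cholesky-factor it,
// set C = L^dag and rotate, exactly as derived in the comment block above.
void thinQRRef(const Eigen::MatrixXcd &Z,  // N x Nblock, N >> Nblock
               Eigen::MatrixXcd &Q,        // N x Nblock, orthonormal columns
               Eigen::MatrixXcd &C)        // Nblock x Nblock, upper triangular
{
  Eigen::MatrixXcd m_zz = Z.adjoint() * Z;       // Hermitian Gram matrix
  m_zz = 0.5 * (m_zz + m_zz.adjoint());          // kill rounding asymmetry
  Eigen::MatrixXcd L = m_zz.llt().matrixL();     // m_zz = L L^dag
  C = L.adjoint();                               // C^dag C = Z^dag Z
  Q = Z * C.inverse();                           // Q^dag Q = 1 by construction
}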
virtual void SolvePrecBlockCG (std::vector<Field> &src, std::vector<Field> &X)
{
std::cout << GridLogMessage<<"HDCG: mrhs fPrecBlockcg starting"<<std::endl;
src[0].Grid()->Barrier();
int nrhs = src.size();
// std::vector<RealD> f(nrhs);
// std::vector<RealD> rtzp(nrhs);
// std::vector<RealD> rtz(nrhs);
// std::vector<RealD> a(nrhs);
// std::vector<RealD> d(nrhs);
// std::vector<RealD> b(nrhs);
// std::vector<RealD> rptzp(nrhs);
////////////////////////////////////////////
//Initial residual computation & set up
////////////////////////////////////////////
std::vector<RealD> ssq(nrhs);
for(int rhs=0;rhs<nrhs;rhs++){
ssq[rhs]=norm2(src[rhs]); GRID_ASSERT(ssq[rhs]!=0.0);
}
///////////////////////////
// Fields -- eliminate duplicates between fPcg and block cg
///////////////////////////
std::vector<Field> Mtmp(nrhs,grid);
std::vector<Field> tmp(nrhs,grid);
std::vector<Field> Z(nrhs,grid); // Rename Z to R
std::vector<Field> MZ(nrhs,grid); // Rename MZ to Z
std::vector<Field> Q(nrhs,grid); //
std::vector<Field> MQ(nrhs,grid); // Rename to P
std::vector<Field> D(nrhs,grid);
std::vector<Field> AD(nrhs,grid);
/************************************************************************
* Preconditioned Block conjugate gradient rQ
* Generalise Sebastien Birk Thesis, after Dubrulle 2001.
* Introduce preconditioning following Saad Ch9
************************************************************************
* Dimensions:
*
* X,B etc... ==(Nferm x nrhs)
* Matrix A==(Nferm x Nferm)
*
* Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
* QC => Thin QR factorisation (google it)
*
* R = B-AX
* Z = Mi R
* QC = Z
* D = Q
* for k:
* R = AD
* Z = Mi R
* M = [D^dag R]^{-1}
* X = X + D M C
* QS = Q - Z.M
* D = Q + D S^dag
* C = S C
*/
Eigen::MatrixXcd m_DZ = Eigen::MatrixXcd::Identity(nrhs,nrhs);
Eigen::MatrixXcd m_M = Eigen::MatrixXcd::Identity(nrhs,nrhs);
Eigen::MatrixXcd m_zz = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_C = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_Cinv = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_S = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_Sinv = Eigen::MatrixXcd::Zero(nrhs,nrhs);
Eigen::MatrixXcd m_tmp = Eigen::MatrixXcd::Identity(nrhs,nrhs);
Eigen::MatrixXcd m_tmp1 = Eigen::MatrixXcd::Identity(nrhs,nrhs);
GridStopWatch HDCGTimer;
//////////////////////////
// x0 = Vstart -- possibly modify guess
//////////////////////////
Vstart(X,src);
//////////////////////////
// R = B-AX
//////////////////////////
for(int rhs=0;rhs<nrhs;rhs++){
// r0 = b -A x0
_FineLinop.HermOp(X[rhs],tmp[rhs]);
axpy (Z[rhs], -1.0,tmp[rhs], src[rhs]); // Computes R=Z=src - A X0
}
//////////////////////////////////
// Compute MZ = M1 Z = M1 B - M1 A x0
//////////////////////////////////
PcgM1(Z,MZ);
//////////////////////////////////
// QC = Z
//////////////////////////////////
ThinQRfact (m_zz, m_C, m_Cinv, Q, MQ, Z, MZ);
//////////////////////////////////
// D=MQ
//////////////////////////////////
for(int b=0;b<nrhs;b++) D[b]=MQ[b]; // LLT rotation of the MZ basis of search dirs
std::cout << GridLogMessage<<"PrecBlockCGrQ vec computed initial residual and QR fact " <<std::endl;
ProjectTimer.Reset();
PromoteTimer.Reset();
DeflateTimer.Reset();
CoarseTimer.Reset();
SmoothTimer.Reset();
FineTimer.Reset();
InsertTimer.Reset();
GridStopWatch M1Timer;
GridStopWatch M2Timer;
GridStopWatch M3Timer;
GridStopWatch LinalgTimer;
GridStopWatch InnerProdTimer;
HDCGTimer.Start();
std::vector<RealD> rn(nrhs);
for (int k=0;k<=MaxIterations;k++){
////////////////////
// Z = AD
////////////////////
M3Timer.Start();
for(int b=0;b<nrhs;b++) _FineLinop.HermOp(D[b], Z[b]);
M3Timer.Stop();
////////////////////
// MZ = M1 Z <==== the Multigrid preconditioner
////////////////////
M1Timer.Start();
PcgM1(Z,MZ);
M1Timer.Stop();
FineTimer.Start();
////////////////////
// M = [D^dag Z]^{-1} = (<Ddag MZ>_M)^{-1} inner prod, generalising Saad derivation of Precon CG
////////////////////
InnerProdTimer.Start();
_BlockCGLinalg.InnerProductMatrix(m_DZ,D,Z);
InnerProdTimer.Stop();
m_M = m_DZ.inverse();
///////////////////////////
// X = X + D MC
///////////////////////////
m_tmp = m_M * m_C;
LinalgTimer.Start();
_BlockCGLinalg.MaddMatrix(X,m_tmp, D,X); // D are the search directions and X takes the updates
LinalgTimer.Stop();
///////////////////////////
// QS = Q - M Z
// (MQ) S = MQ - M (M1Z)
///////////////////////////
LinalgTimer.Start();
_BlockCGLinalg.MaddMatrix(tmp ,m_M, Z, Q,-1.0);
_BlockCGLinalg.MaddMatrix(Mtmp,m_M,MZ,MQ,-1.0);
ThinQRfact (m_zz, m_S, m_Sinv, Q, MQ, tmp, Mtmp);
LinalgTimer.Stop();
////////////////////////////
// D = MQ + D S^dag
////////////////////////////
m_tmp = m_S.adjoint();
LinalgTimer.Start();
_BlockCGLinalg.MaddMatrix(D,m_tmp,D,MQ);
LinalgTimer.Stop();
////////////////////////////
// C = S C
////////////////////////////
m_C = m_S*m_C;
////////////////////////////
// convergence monitor
////////////////////////////
m_rr = m_C.adjoint() * m_C;
FineTimer.Stop();
RealD max_resid=0;
RealD rrsum=0;
RealD sssum=0;
RealD rr;
for(int b=0;b<nrhs;b++) {
rrsum+=real(m_rr(b,b));
sssum+=ssq[b];
rr = real(m_rr(b,b))/ssq[b];
if ( rr > max_resid ) max_resid = rr;
}
std::cout << GridLogMessage <<
"\t Prec BlockCGrQ Iteration "<<k<<" ave resid "<< std::sqrt(rrsum/sssum) << " max "<< std::sqrt(max_resid) <<std::endl;
if ( max_resid < Tolerance*Tolerance ) {
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Linalg "<<LinalgTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : fine H "<<M3Timer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : prec M1 "<<M1Timer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"**** M1 breakdown:"<<std::endl;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Project "<<ProjectTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Promote "<<PromoteTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Deflate "<<DeflateTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Coarse "<<CoarseTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Fine "<<FineTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Smooth "<<SmoothTimer.Elapsed()<<std::endl;;
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Insert "<<InsertTimer.Elapsed()<<std::endl;;
for(int rhs=0;rhs<nrhs;rhs++){
_FineLinop.HermOp(X[rhs],tmp[rhs]);
Field mytmp(grid);
axpy(mytmp,-1.0,src[rhs],tmp[rhs]);
RealD xnorm = sqrt(norm2(X[rhs]));
RealD srcnorm = sqrt(norm2(src[rhs]));
RealD tmpnorm = sqrt(norm2(mytmp));
RealD true_residual = tmpnorm/srcnorm;
std::cout<<GridLogMessage
<<"HDCG: true residual ["<<rhs<<"] is "<<true_residual
<<" solution "<<xnorm
<<" source "<<srcnorm
<<std::endl;
}
return;
}
}
HDCGTimer.Stop();
std::cout<<GridLogMessage<<"HDCG: PrecBlockCGrQ not converged "<<HDCGTimer.Elapsed()<<std::endl;
GRID_ASSERT(0);
}
virtual void SolveSingleSystem (std::vector<Field> &src, std::vector<Field> &x)
  {
    std::cout << GridLogMessage<<"HDCG: mrhs fPcg starting"<<std::endl;
    src[0].Grid()->Barrier();
@@ -415,7 +108,7 @@ class TwoLevelCGmrhs
    std::vector<RealD> src_nrm(nrhs);
    for(int rhs=0;rhs<nrhs;rhs++) {
      src_nrm[rhs]=norm2(src[rhs]);
-      GRID_ASSERT(src_nrm[rhs]!=0.0);
+      assert(src_nrm[rhs]!=0.0);
    }
    std::vector<RealD> tn(nrhs);
@@ -668,26 +361,15 @@ public:
    CoarseField PleftProjMrhs(this->coarsegridmrhs);
    CoarseField PleftMss_projMrhs(this->coarsegridmrhs);

    //    this->rrr=in[0];
#undef SMOOTHER_BLOCK_SOLVE
#if SMOOTHER_BLOCK_SOLVE
    this->SmoothTimer.Start();
    this->_Smoother(in,Min);
    this->SmoothTimer.Stop();
#else
    for(int rhs=0;rhs<nrhs;rhs++) {
      this->SmoothTimer.Start();
      this->_Smoother(in[rhs],Min[rhs]);
      this->SmoothTimer.Stop();
    }
#endif
    //    this->sss=Min[0];
    for(int rhs=0;rhs<nrhs;rhs++) {
      this->FineTimer.Start();
      this->_FineLinop.HermOp(Min[rhs],out[rhs]);
      axpy(tmp[rhs],-1.0,out[rhs],in[rhs]);   // resid  = in - A Min
      this->FineTimer.Stop();
@@ -719,11 +401,9 @@ public:
    this->_Projector.blockPromote(tmp,PleftMss_proj);// tmp= Q[in - A Min]
    this->PromoteTimer.Stop();
    this->FineTimer.Start();
    //    this->qqq=tmp[0];
    for(int rhs=0;rhs<nrhs;rhs++) {
      axpy(out[rhs],1.0,Min[rhs],tmp[rhs]); // Min+tmp
    }
    //    this->zzz=out[0];
    this->FineTimer.Stop();
  }
};


@@ -47,7 +47,7 @@ class BiCGSTAB : public OperatorFunction<Field>
 public:
  using OperatorFunction<Field>::operator();

-  bool ErrorOnNoConverge;  // throw an GRID_ASSERT when the CG fails to converge.
+  bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
                           // Defaults true.
  RealD Tolerance;
  Integer MaxIterations;
@@ -77,7 +77,7 @@ class BiCGSTAB : public OperatorFunction<Field>
    // Initial residual computation & set up
    RealD guess = norm2(psi);
-    GRID_ASSERT(std::isnan(guess) == 0);
+    assert(std::isnan(guess) == 0);

    Linop.Op(psi, v);
    b = norm2(v);
@@ -214,7 +214,7 @@ class BiCGSTAB : public OperatorFunction<Field>
      std::cout << GridLogMessage << "\tAxpyNorm   " << AxpyNormTimer.Elapsed() << std::endl;
      std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() << std::endl;

-      if(ErrorOnNoConverge){ GRID_ASSERT(true_residual / Tolerance < 10000.0); }
+      if(ErrorOnNoConverge){ assert(true_residual / Tolerance < 10000.0); }

      IterationsToComplete = k;
@@ -224,7 +224,7 @@ class BiCGSTAB : public OperatorFunction<Field>
    std::cout << GridLogMessage << "BiCGSTAB did NOT converge" << std::endl;

-    if(ErrorOnNoConverge){ GRID_ASSERT(0); }
+    if(ErrorOnNoConverge){ assert(0); }
    IterationsToComplete = k;
  }
};


@@ -31,58 +31,6 @@ directory
NAMESPACE_BEGIN(Grid);
template<class Field>
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
typedef typename Field::scalar_type scomplex;
int Nblock = X.size();
for(int b=0;b<Nblock;b++){
for(int bp=0;bp<Nblock;bp++) {
m(b,bp) = innerProduct(X[b],Y[bp]);
}}
}
template<class Field>
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
// Should make this cache friendly with site outermost, parallel_for
// Deal with case AP aliases with either Y or X
//
//Could pack "X" and "AP" into a Nblock x Volume dense array.
// AP(Nrhs x vol) = Y(Nrhs x vol) + scale * m(nrhs x nrhs) * X(nrhs*vol)
typedef typename Field::scalar_type scomplex;
int Nblock = AP.size();
std::vector<Field> tmp(Nblock,X[0]);
for(int b=0;b<Nblock;b++){
tmp[b] = Y[b];
for(int bp=0;bp<Nblock;bp++) {
tmp[b] = tmp[b] +scomplex(scale*m(bp,b))*X[bp];
}
}
for(int b=0;b<Nblock;b++){
AP[b] = tmp[b];
}
}
template<class Field>
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
// Should make this cache friendly with site outermost, parallel_for
typedef typename Field::scalar_type scomplex;
int Nblock = AP.size();
for(int b=0;b<Nblock;b++){
AP[b] = Zero();
for(int bp=0;bp<Nblock;bp++) {
AP[b] += scomplex(m(bp,b))*X[bp];
}
}
}
template<class Field>
double normv(const std::vector<Field> &P){
int Nblock = P.size();
double nn = 0.0;
for(int b=0;b<Nblock;b++) {
nn+=norm2(P[b]);
}
return nn;
}
enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec };

//////////////////////////////////////////////////////////////////////////
@@ -98,7 +46,7 @@ class BlockConjugateGradient : public OperatorFunction<Field> {
  int Nblock;
  BlockCGtype CGtype;
-  bool ErrorOnNoConverge;  // throw an GRID_ASSERT when the CG fails to converge.
+  bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
                           // Defaults true.
  RealD Tolerance;
  Integer MaxIterations;
@@ -139,19 +87,10 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
  sliceInnerProductMatrix(m_rr,R,R,Orthog);

  // Force manifest hermitian to avoid rounding related
  /*
  int rank=m_rr.rows();
  for(int r=0;r<rank;r++){
    for(int s=0;s<rank;s++){
      std::cout << "QR m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
    }}
  */
  m_rr = 0.5*(m_rr+m_rr.adjoint());

  Eigen::MatrixXcd L    = m_rr.llt().matrixL();

  //  ComplexD det = L.determinant();
  //  std::cout << " Det m_rr "<<det<<std::endl;

  C    = L.adjoint();
  Cinv = C.inverse();
  ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -171,20 +110,11 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
                 const std::vector<Field> & R)
{
  InnerProductMatrix(m_rr,R,R);
  /*
  int rank=m_rr.rows();
  for(int r=0;r<rank;r++){
    for(int s=0;s<rank;s++){
      std::cout << "QRvec m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
    }}
  */
  m_rr = 0.5*(m_rr+m_rr.adjoint());

  Eigen::MatrixXcd L    = m_rr.llt().matrixL();

  //  ComplexD det = L.determinant();
  //  std::cout << " Det m_rr "<<det<<std::endl;

  C    = L.adjoint();
  Cinv = C.inverse();
@@ -201,7 +131,7 @@ void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
  } else if (CGtype == CGmultiRHS ) {
    CGmultiRHSsolve(Linop,Src,Psi);
  } else {
-    GRID_ASSERT(0);
+    assert(0);
  }
}
virtual void operator()(LinearOperatorBase<Field> &Linop, const std::vector<Field> &Src, std::vector<Field> &Psi)
@@ -209,7 +139,7 @@ virtual void operator()(LinearOperatorBase<Field> &Linop, const std::vector<Fiel
  if ( CGtype == BlockCGrQVec ) {
    BlockCGrQsolveVec(Linop,Src,Psi);
  } else {
-    GRID_ASSERT(0);
+    assert(0);
  }
}
@@ -256,13 +186,12 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
  sliceNorm(ssq,B,Orthog);
  RealD sssum=0;
  for(int b=0;b<Nblock;b++) sssum+=ssq[b];
  for(int b=0;b<Nblock;b++) std::cout << "src["<<b<<"]" << ssq[b] <<std::endl;

  sliceNorm(residuals,B,Orthog);
-  for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); }
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }

  sliceNorm(residuals,X,Orthog);
-  for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); }
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }

  /************************************************************************
   * Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
@@ -292,9 +221,6 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
  Linop.HermOp(X, AD);
  tmp = B - AD;
  sliceNorm(residuals,tmp,Orthog);
  for(int b=0;b<Nblock;b++) std::cout << "res["<<b<<"]" << residuals[b] <<std::endl;

  ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
  D=Q;
@@ -310,8 +236,6 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
  GridStopWatch SolverTimer;
  SolverTimer.Start();

  RealD max_resid=0;

  int k;
  for (k = 1; k <= MaxIterations; k++) {
@@ -356,7 +280,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
     */
    m_rr = m_C.adjoint() * m_C;

-    max_resid=0;
+    RealD max_resid=0;
    RealD rrsum=0;
    RealD rr;
@@ -398,11 +322,9 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
    }
  }

-  std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge "<<k<<" / "<<MaxIterations
-            <<" residual "<< std::sqrt(max_resid)<< std::endl;
-  if (ErrorOnNoConverge) GRID_ASSERT(0);
+  std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
+  if (ErrorOnNoConverge) assert(0);

  IterationsToComplete = k;
}
//////////////////////////////////////////////////////////////////////////
@@ -438,10 +360,10 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
  for(int b=0;b<Nblock;b++) sssum+=ssq[b];

  sliceNorm(residuals,Src,Orthog);
-  for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); }
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }

  sliceNorm(residuals,Psi,Orthog);
-  for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); }
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }

  // Initial search dir is guess
  Linop.HermOp(Psi, AP);
@@ -540,10 +462,47 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
  }
  std::cout << GridLogMessage << "MultiRHSConjugateGradient did NOT converge" << std::endl;

-  if (ErrorOnNoConverge) GRID_ASSERT(0);
+  if (ErrorOnNoConverge) assert(0);
  IterationsToComplete = k;
}
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
for(int b=0;b<Nblock;b++){
for(int bp=0;bp<Nblock;bp++) {
m(b,bp) = innerProduct(X[b],Y[bp]);
}}
}
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
// Should make this cache friendly with site outermost, parallel_for
// Deal with case AP aliases with either Y or X
std::vector<Field> tmp(Nblock,X[0]);
for(int b=0;b<Nblock;b++){
tmp[b] = Y[b];
for(int bp=0;bp<Nblock;bp++) {
tmp[b] = tmp[b] + scomplex(scale*m(bp,b))*X[bp];
}
}
for(int b=0;b<Nblock;b++){
AP[b] = tmp[b];
}
}
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
// Should make this cache friendly with site outermost, parallel_for
for(int b=0;b<Nblock;b++){
AP[b] = Zero();
for(int bp=0;bp<Nblock;bp++) {
AP[b] += scomplex(m(bp,b))*X[bp];
}
}
}
double normv(const std::vector<Field> &P){
double nn = 0.0;
for(int b=0;b<Nblock;b++) {
nn+=norm2(P[b]);
}
return nn;
}
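In matrix form, with one column per Field, the MaddMatrix helper above computes AP = Y + scale * X m; note the transposed index order m(bp,b), so column b of AP mixes the columns of X with column b of m. A dense Eigen restatement, purely illustrative:

#include <Eigen/Dense>

// AP = Y + scale * X m, matching tmp[b] = Y[b] + sum_bp scale*m(bp,b)*X[bp].
// Eigen evaluates the matrix product into a temporary, which handles the
// aliasing case (AP overlapping X or Y) that the comment in the diff worries about.
void MaddMatrixRef(Eigen::MatrixXcd &AP, const Eigen::MatrixXcd &m,
                   const Eigen::MatrixXcd &X, const Eigen::MatrixXcd &Y,
                   double scale = 1.0)
{
  AP = Y + scale * (X * m);
}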
////////////////////////////////////////////////////////////////////////////
// BlockCGrQvec implementation:
//--------------------------
@@ -554,7 +513,7 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field> &B, std::vector<Field> &X)
{
  Nblock = B.size();
-  GRID_ASSERT(Nblock == X.size());
+  assert(Nblock == X.size());

  std::cout<<GridLogMessage<<" Block Conjugate Gradient Vec rQ : Nblock "<<Nblock<<std::endl;
@@ -590,14 +549,13 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
  RealD sssum=0;
  for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);}
  for(int b=0;b<Nblock;b++){ std::cout << "ssq["<<b<<"] "<<ssq[b]<<std::endl;}
  for(int b=0;b<Nblock;b++) sssum+=ssq[b];

  for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);}
-  for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); }
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }

  for(int b=0;b<Nblock;b++){ residuals[b] = norm2(X[b]);}
-  for(int b=0;b<Nblock;b++){ GRID_ASSERT(std::isnan(residuals[b])==0); }
+  for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }

  /************************************************************************
   * Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
@@ -627,7 +585,6 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
  for(int b=0;b<Nblock;b++) {
    Linop.HermOp(X[b], AD[b]);
    tmp[b] = B[b] - AD[b];
    std::cout << "r0["<<b<<"] "<<norm2(tmp[b])<<std::endl;
  }

  ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
@@ -731,7 +688,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
  }
  std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;

-  if (ErrorOnNoConverge) GRID_ASSERT(0);
+  if (ErrorOnNoConverge) assert(0);
  IterationsToComplete = k;
}


@@ -36,7 +36,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
 public:
  using OperatorFunction<Field>::operator();

-  bool ErrorOnNoConverge; // Throw an GRID_ASSERT when CAGMRES fails to converge,
+  bool ErrorOnNoConverge; // Throw an assert when CAGMRES fails to converge,
                          // defaults to true

  RealD Tolerance;
@@ -82,7 +82,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
    conformable(psi, src);

    RealD guess = norm2(psi);
-    GRID_ASSERT(std::isnan(guess) == 0);
+    assert(std::isnan(guess) == 0);

    RealD cp;
    RealD ssq = norm2(src);
@@ -137,7 +137,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
    std::cout << GridLogMessage << "CommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl;

    if (ErrorOnNoConverge)
-      GRID_ASSERT(0);
+      assert(0);
  }

  RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
@@ -185,7 +185,7 @@ class CommunicationAvoidingGeneralisedMinimalResidual : public OperatorFunction<
      }
    }

-    GRID_ASSERT(0); // Never reached
+    assert(0); // Never reached
    return cp;
  }


@@ -38,14 +38,13 @@ NAMESPACE_BEGIN(Grid);
//  single input vec, single output vec.
/////////////////////////////////////////////////////////////

template <class Field>
class ConjugateGradient : public OperatorFunction<Field> {
public:
  using OperatorFunction<Field>::operator();

-  bool ErrorOnNoConverge;  // throw an GRID_ASSERT when the CG fails to converge.
+  bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
                           // Defaults true.
  RealD Tolerance;
  Integer MaxIterations;
@@ -58,22 +57,10 @@ public:
      ErrorOnNoConverge(err_on_no_conv)
  {};

  virtual void LogIteration(int k,RealD a,RealD b){
    //    std::cout << "ConjugageGradient::LogIteration() "<<std::endl;
  };
  virtual void LogBegin(void){
    std::cout << "ConjugageGradient::LogBegin() "<<std::endl;
  };

  void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
    this->LogBegin();
    GRID_TRACE("ConjugateGradient");
    GridStopWatch PreambleTimer;
    GridStopWatch ConstructTimer;
    GridStopWatch NormTimer;
    GridStopWatch AssignTimer;
    PreambleTimer.Start();
    psi.Checkerboard() = src.Checkerboard();
@@ -83,19 +70,14 @@ public:
    //RealD b_pred;

    // Was doing copies
    ConstructTimer.Start();
    Field p(src.Grid());
    Field mmp(src.Grid());
    Field r(src.Grid());
    ConstructTimer.Stop();

    // Initial residual computation & set up
    NormTimer.Start();
    ssq = norm2(src);
    RealD guess = norm2(psi);
-    NormTimer.Stop();
-    GRID_ASSERT(std::isnan(guess) == 0);
-    AssignTimer.Start();
+    assert(std::isnan(guess) == 0);
    if ( guess == 0.0 ) {
      r = src;
      p = r;
@@ -107,7 +89,6 @@ public:
      a = norm2(p);
    }
    cp = a;
    AssignTimer.Stop();

    // Handle trivial case of zero src
    if (ssq == 0.){
@@ -183,7 +164,6 @@ public:
      }
      LinearCombTimer.Stop();
      LinalgTimer.Stop();
      LogIteration(k,a,b);
      IterationTimer.Stop();

      if ( (k % 500) == 0 ) {
@@ -222,7 +202,7 @@ public:
      std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;

-      if (ErrorOnNoConverge) GRID_ASSERT(true_residual / Tolerance < 10000.0);
+      if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);

      IterationsToComplete = k;
      TrueResidual = true_residual;
@@ -240,9 +220,6 @@ public:
              <<" residual "<< std::sqrt(cp / ssq)<< std::endl;
    SolverTimer.Stop();
    std::cout << GridLogMessage << "\tPreamble   " << PreambleTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "\tConstruct  " << ConstructTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "\tNorm       " << NormTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "\tAssign     " << AssignTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "\tSolver     " << SolverTimer.Elapsed() <<std::endl;
    std::cout << GridLogMessage << "Solver breakdown "<<std::endl;
    std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
@@ -251,123 +228,10 @@ public:
    std::cout << GridLogPerformance << "\t\tAxpyNorm   " << AxpyNormTimer.Elapsed() <<std::endl;
    std::cout << GridLogPerformance << "\t\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;

-    if (ErrorOnNoConverge) GRID_ASSERT(0);
+    if (ErrorOnNoConverge) assert(0);
    IterationsToComplete = k;
  }
};
template <class Field>
class ConjugateGradientPolynomial : public ConjugateGradient<Field> {
public:
// Optionally record the CG polynomial
std::vector<double> ak;
std::vector<double> bk;
std::vector<double> poly_p;
std::vector<double> poly_r;
std::vector<double> poly_Ap;
std::vector<double> polynomial;
public:
ConjugateGradientPolynomial(RealD tol, Integer maxit, bool err_on_no_conv = true)
: ConjugateGradient<Field>(tol,maxit,err_on_no_conv)
{ };
void PolyHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
{
Field tmp(src.Grid());
Field AtoN(src.Grid());
AtoN = src;
psi=AtoN*polynomial[0];
for(int n=1;n<polynomial.size();n++){
tmp = AtoN;
Linop.HermOp(tmp,AtoN);
psi = psi + polynomial[n]*AtoN;
}
}
void CGsequenceHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &x)
{
Field Ap(src.Grid());
Field r(src.Grid());
Field p(src.Grid());
p=src;
r=src;
x=Zero();
x.Checkerboard()=src.Checkerboard();
for(int k=0;k<ak.size();k++){
x = x + ak[k]*p;
Linop.HermOp(p,Ap);
r = r - ak[k] * Ap;
p = r + bk[k] * p;
}
}
void Solve(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
{
psi=Zero();
this->operator ()(Linop,src,psi);
}
virtual void LogBegin(void)
{
std::cout << "ConjugageGradientPolynomial::LogBegin() "<<std::endl;
ak.resize(0);
bk.resize(0);
polynomial.resize(0);
poly_Ap.resize(0);
poly_Ap.resize(0);
poly_p.resize(1);
poly_r.resize(1);
poly_p[0]=1.0;
poly_r[0]=1.0;
};
virtual void LogIteration(int k,RealD a,RealD b)
{
// With zero guess,
// p = r = src
//
// iterate:
// x = x + a p
// r = r - a A p
// p = r + b p
//
// [0]
// r = x
// p = x
// Ap=0
//
// [1]
// Ap = A x + 0 ==> shift poly P right by 1 and add 0.
// x = x + a p ==> add polynomials term by term
// r = r - a A p ==> add polynomials term by term
// p = r + b p ==> add polynomials term by term
//
std::cout << "ConjugageGradientPolynomial::LogIteration() "<<k<<std::endl;
ak.push_back(a);
bk.push_back(b);
// Ap= right_shift(p)
poly_Ap.resize(k+1);
poly_Ap[0]=0.0;
for(int i=0;i<k;i++){
poly_Ap[i+1]=poly_p[i];
}
// x = x + a p
polynomial.resize(k);
polynomial[k-1]=0.0;
for(int i=0;i<k;i++){
polynomial[i] = polynomial[i] + a * poly_p[i];
}
// r = r - a Ap
// p = r + b p
poly_r.resize(k+1);
poly_p.resize(k+1);
poly_r[k] = poly_p[k] = 0.0;
for(int i=0;i<k+1;i++){
poly_r[i] = poly_r[i] - a * poly_Ap[i];
poly_p[i] = poly_r[i] + b * poly_p[i];
}
}
};
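The LogIteration bookkeeping above builds the CG polynomial from the recurrences spelled out in its comment. A scalar sketch of the same coefficient updates, with made-up a_k, b_k values (illustrative only, not the Grid class):

#include <cstdio>
#include <vector>

int main() {
  std::vector<double> ak = {0.5, 0.25};     // sample CG step sizes
  std::vector<double> bk = {0.1, 0.05};     // sample direction mixings
  std::vector<double> P;                    // solution polynomial, coeffs of A^i
  std::vector<double> p = {1.0}, r = {1.0}; // p_0 = r_0 = 1 (zero initial guess)
  for (size_t k = 1; k <= ak.size(); k++) {
    std::vector<double> Ap(k+1, 0.0);       // Ap = A*p : shift coeffs up one power
    for (size_t i = 0; i < k; i++) Ap[i+1] = p[i];
    P.resize(k, 0.0);                       // x = x + a p
    for (size_t i = 0; i < k; i++) P[i] += ak[k-1]*p[i];
    r.resize(k+1, 0.0); p.resize(k+1, 0.0); // r = r - a Ap ; p = r + b p
    for (size_t i = 0; i < k+1; i++) {
      r[i] = r[i] - ak[k-1]*Ap[i];
      p[i] = r[i] + bk[k-1]*p[i];
    }
  }
  for (size_t i = 0; i < P.size(); i++) std::printf("A^%zu coeff %g\n", i, P[i]);
}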
NAMESPACE_END(Grid);
#endif


@@ -116,14 +116,14 @@ NAMESPACE_BEGIN(Grid);
      //Compute double precision rsd and also new RHS vector.
      Linop_d.HermOp(sol_d, tmp_d);
      RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
      std::cout<<GridLogMessage<<" rsd norm "<<norm<<std::endl;
      std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;

      if(norm < OuterLoopNormMult * stop){
        std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
        break;
      }
-      while(norm * inner_tol * inner_tol < stop*1.01) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
+      while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??

      PrecChangeTimer.Start();
      precisionChange(src_f, src_d, pc_wk_dp_to_sp);
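The doubling loop above grows inner_tol geometrically until norm * inner_tol^2 >= stop, i.e. to roughly sqrt(stop/norm) as the trailing comment suggests. A tiny self-contained demo with made-up numbers:

#include <cmath>
#include <cstdio>

int main() {
  double stop      = 1e-16;  // outer stopping criterion (squared residual)
  double norm      = 1e-8;   // current outer residual
  double inner_tol = 1e-8;   // inner-solver tolerance, grown geometrically
  while (norm * inner_tol * inner_tol < stop) inner_tol *= 2;
  // inner_tol is now the smallest power-of-two multiple of its start value
  // with inner_tol >= sqrt(stop/norm); here the target is sqrt(1e-8) = 1e-4.
  std::printf("inner_tol = %g (target %g)\n", inner_tol, std::sqrt(stop/norm));
}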


@@ -77,7 +77,7 @@ public:
  }

  void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){
-    GRID_ASSERT(src_d_in.size() == sol_d.size());
+    assert(src_d_in.size() == sol_d.size());
    int NBatch = src_d_in.size();

    std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl;


@@ -98,15 +98,15 @@ public:
  std::vector<RealD> alpha(nshift,1.0);
  std::vector<Field>   ps(nshift,grid);// Search directions

-  GRID_ASSERT(psi.size()==nshift);
-  GRID_ASSERT(mass.size()==nshift);
-  GRID_ASSERT(mresidual.size()==nshift);
+  assert(psi.size()==nshift);
+  assert(mass.size()==nshift);
+  assert(mresidual.size()==nshift);

-  // remove dynamic sized arrays on stack; 2d is a pain with vector
-  std::vector<RealD>  bs(nshift);
-  std::vector<RealD>  rsq(nshift);
-  std::vector<std::array<RealD,2> >  z(nshift);
-  std::vector<int>     converged(nshift);
+  // dynamic sized arrays on stack; 2d is a pain with vector
+  RealD  bs[nshift];
+  RealD  rsq[nshift];
+  RealD  z[nshift][2];
+  int     converged[nshift];

  const int       primary =0;
@@ -122,7 +122,7 @@ public:
  // Check lightest mass
  for(int s=0;s<nshift;s++){
-    GRID_ASSERT( mass[s]>= mass[primary] );
+    assert( mass[s]>= mass[primary] );
    converged[s]=0;
  }
@@ -338,7 +338,7 @@ public:
  }
  // ugly hack
  std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
-  //  GRID_ASSERT(0);
+  //  assert(0);
}
};
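The left-hand side of the first hunk above swaps the C-style stack arrays for std::vector because RealD bs[nshift] with a runtime nshift is a variable-length array, a GCC/Clang extension outside ISO C++. A portable sketch of the replacement workspace (names mirror the diff):

#include <array>
#include <vector>
using RealD = double;

void allocate_shift_workspace(int nshift) {
  std::vector<RealD> bs(nshift);               // heap-backed, standard C++
  std::vector<RealD> rsq(nshift);
  std::vector<std::array<RealD,2>> z(nshift);  // replaces RealD z[nshift][2]
  std::vector<int>   converged(nshift, 0);
  (void)bs; (void)rsq; (void)z; (void)converged; // workspace handed to the solver
}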


@@ -118,16 +118,16 @@ public:
  FieldF r_f(SinglePrecGrid);
  FieldD mmp_d(DoublePrecGrid);

-  GRID_ASSERT(psi_d.size()==nshift);
-  GRID_ASSERT(mass.size()==nshift);
-  GRID_ASSERT(mresidual.size()==nshift);
+  assert(psi_d.size()==nshift);
+  assert(mass.size()==nshift);
+  assert(mresidual.size()==nshift);

  // dynamic sized arrays on stack; 2d is a pain with vector
-  std::vector<RealD>  bs(nshift);
-  std::vector<RealD>  rsq(nshift);
-  std::vector<RealD>  rsqf(nshift);
-  std::vector<std::array<RealD,2> >  z(nshift);
-  std::vector<int>     converged(nshift);
+  RealD  bs[nshift];
+  RealD  rsq[nshift];
+  RealD  rsqf[nshift];
+  RealD  z[nshift][2];
+  int     converged[nshift];

  const int       primary =0;
@@ -141,7 +141,7 @@ public:
  // Check lightest mass
  for(int s=0;s<nshift;s++){
-    GRID_ASSERT( mass[s]>= mass[primary] );
+    assert( mass[s]>= mass[primary] );
    converged[s]=0;
  }
@@ -179,7 +179,7 @@ public:
    Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
    tmp_d = tmp_d - mmp_d;
    std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
-    //    GRID_ASSERT(norm2(tmp_d)< 1.0e-4);
+    //    assert(norm2(tmp_d)< 1.0e-4);

    axpy(mmp_d,mass[0],p_d,mmp_d);
    RealD rn = norm2(p_d);
@@ -365,7 +365,7 @@ public:
  }
  std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
-  GRID_ASSERT(0);
+  assert(0);
}
};


@@ -48,12 +48,12 @@ public:
    ShiftedLinop(LinearOperatorBase<Field> &_linop_base, RealD _shift): linop_base(_linop_base), shift(_shift){}

-    void OpDiag (const Field &in, Field &out){ GRID_ASSERT(0); }
-    void OpDir  (const Field &in, Field &out,int dir,int disp){ GRID_ASSERT(0); }
-    void OpDirAll  (const Field &in, std::vector<Field> &out){ GRID_ASSERT(0); }
-    void Op     (const Field &in, Field &out){ GRID_ASSERT(0); }
-    void AdjOp  (const Field &in, Field &out){ GRID_ASSERT(0); }
+    void OpDiag (const Field &in, Field &out){ assert(0); }
+    void OpDir  (const Field &in, Field &out,int dir,int disp){ assert(0); }
+    void OpDirAll  (const Field &in, std::vector<Field> &out){ assert(0); }
+    void Op     (const Field &in, Field &out){ assert(0); }
+    void AdjOp  (const Field &in, Field &out){ assert(0); }

    void HermOp(const Field &in, Field &out){
      linop_base.HermOp(in, out);
@@ -151,16 +151,16 @@ public:
  FieldD r_d(DoublePrecGrid);
  FieldD mmp_d(DoublePrecGrid);

-  GRID_ASSERT(psi_d.size()==nshift);
-  GRID_ASSERT(mass.size()==nshift);
-  GRID_ASSERT(mresidual.size()==nshift);
+  assert(psi_d.size()==nshift);
+  assert(mass.size()==nshift);
+  assert(mresidual.size()==nshift);

  // dynamic sized arrays on stack; 2d is a pain with vector
-  std::vector<RealD>  bs(nshift);
-  std::vector<RealD>  rsq(nshift);
-  std::vector<RealD>  rsqf(nshift);
-  std::vector<std::array<RealD,2> >  z(nshift);
-  std::vector<int>     converged(nshift);
+  RealD  bs[nshift];
+  RealD  rsq[nshift];
+  RealD  rsqf[nshift];
+  RealD  z[nshift][2];
+  int     converged[nshift];

  const int       primary =0;
@@ -174,7 +174,7 @@ public:
  // Check lightest mass
  for(int s=0;s<nshift;s++){
-    GRID_ASSERT( mass[s]>= mass[primary] );
+    assert( mass[s]>= mass[primary] );
    converged[s]=0;
  }
@@ -211,7 +211,7 @@ public:
    Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p        d=real(dot(p, mmp)),  qq=norm2(mmp)
    tmp_d = tmp_d - mmp_d;
    std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
-    GRID_ASSERT(norm2(tmp_d)< 1.0);
+    assert(norm2(tmp_d)< 1.0);

    axpy(mmp_d,mass[0],p_d,mmp_d);
    RealD rn = norm2(p_d);
@@ -408,7 +408,7 @@ public:
  }
  std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
-  GRID_ASSERT(0);
+  assert(0);
}
};


@@ -35,7 +35,7 @@ template<class FieldD,class FieldF,
         typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> {
public:
-  bool ErrorOnNoConverge;  // throw an GRID_ASSERT when the CG fails to converge.
+  bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
                           // Defaults true.
  RealD Tolerance;
  Integer MaxIterations;
@@ -66,7 +66,7 @@ public:
      DoFinalCleanup(true),
      Linop_fallback(NULL)
  {
-    GRID_ASSERT(Delta > 0. && Delta < 1. && "Expect  0 < Delta < 1");
+    assert(Delta > 0. && Delta < 1. && "Expect  0 < Delta < 1");
  };

  void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
@@ -90,7 +90,7 @@ public:
    // Initial residual computation & set up

    RealD guess = norm2(psi);
-    GRID_ASSERT(std::isnan(guess) == 0);
+    assert(std::isnan(guess) == 0);

    Linop_d.HermOpAndNorm(psi, mmp, d, b);
@@ -217,7 +217,7 @@ public:
          CG(Linop_d,src,psi);
          IterationsToCleanup = CG.IterationsToComplete;
        }
-        else if (ErrorOnNoConverge) GRID_ASSERT(true_residual / Tolerance < 10000.0);
+        else if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);

        std::cout << GridLogMessage << "ConjugateGradientReliableUpdate complete.\n";
        return;
@@ -263,7 +263,7 @@ public:
    std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge"
              << std::endl;

-    if (ErrorOnNoConverge) GRID_ASSERT(0);
+    if (ErrorOnNoConverge) assert(0);
    IterationsToComplete = k;
    ReliableUpdatesPerformed = l;
  }


@@ -106,7 +106,7 @@ public:
    }
    std::cout<<GridLogMessage<<"ConjugateResidual did NOT converge"<<std::endl;
-    GRID_ASSERT(0);
+    assert(0);
  }
};
NAMESPACE_END(Grid);


@@ -36,7 +36,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
 public:
  using OperatorFunction<Field>::operator();

-  bool ErrorOnNoConverge; // Throw an GRID_ASSERT when FCAGMRES fails to converge,
+  bool ErrorOnNoConverge; // Throw an assert when FCAGMRES fails to converge,
                          // defaults to true

  RealD Tolerance;
@@ -87,7 +87,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
    conformable(psi, src);

    RealD guess = norm2(psi);
-    GRID_ASSERT(std::isnan(guess) == 0);
+    assert(std::isnan(guess) == 0);

    RealD cp;
    RealD ssq = norm2(src);
@@ -144,7 +144,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
    std::cout << GridLogMessage << "FlexibleCommunicationAvoidingGeneralisedMinimalResidual did NOT converge" << std::endl;

    if (ErrorOnNoConverge)
-      GRID_ASSERT(0);
+      assert(0);
  }

  RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
@@ -191,7 +191,7 @@ class FlexibleCommunicationAvoidingGeneralisedMinimalResidual : public OperatorF
      }
    }

-    GRID_ASSERT(0); // Never reached
+    assert(0); // Never reached
    return cp;
  }


@@ -36,7 +36,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
 public:
  using OperatorFunction<Field>::operator();

-  bool ErrorOnNoConverge; // Throw an GRID_ASSERT when FGMRES fails to converge,
+  bool ErrorOnNoConverge; // Throw an assert when FGMRES fails to converge,
                          // defaults to true

  RealD Tolerance;
@@ -85,7 +85,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
    conformable(psi, src);

    RealD guess = norm2(psi);
-    GRID_ASSERT(std::isnan(guess) == 0);
+    assert(std::isnan(guess) == 0);

    RealD cp;
    RealD ssq = norm2(src);
@@ -142,7 +142,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
    std::cout << GridLogMessage << "FlexibleGeneralisedMinimalResidual did NOT converge" << std::endl;

    if (ErrorOnNoConverge)
-      GRID_ASSERT(0);
+      assert(0);
  }

  RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
@@ -189,7 +189,7 @@ class FlexibleGeneralisedMinimalResidual : public OperatorFunction<Field> {
      }
    }

-    GRID_ASSERT(0); // Never reached
+    assert(0); // Never reached
    return cp;
  }


@@ -36,7 +36,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
 public:
  using OperatorFunction<Field>::operator();

-  bool ErrorOnNoConverge; // Throw an GRID_ASSERT when GMRES fails to converge,
+  bool ErrorOnNoConverge; // Throw an assert when GMRES fails to converge,
                          // defaults to true

  RealD Tolerance;
@@ -80,7 +80,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
    conformable(psi, src);

    RealD guess = norm2(psi);
-    GRID_ASSERT(std::isnan(guess) == 0);
+    assert(std::isnan(guess) == 0);

    RealD cp;
    RealD ssq = norm2(src);
@@ -135,7 +135,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
    std::cout << GridLogMessage << "GeneralisedMinimalResidual did NOT converge" << std::endl;

    if (ErrorOnNoConverge)
-      GRID_ASSERT(0);
+      assert(0);
  }

  RealD outerLoopBody(LinearOperatorBase<Field> &LinOp, const Field &src, Field &psi, RealD rsq) {
@@ -181,7 +181,7 @@ class GeneralisedMinimalResidual : public OperatorFunction<Field> {
      }
    }

-    GRID_ASSERT(0); // Never reached
+    assert(0); // Never reached
    return cp;
  }


@@ -175,7 +175,7 @@ public:
      eresid(_eresid),  MaxIter(_MaxIter),
      diagonalisation(_diagonalisation),split_test(0),
      Nevec_acc(_Nu)
-  { GRID_ASSERT( (Nk%Nu==0) && (Nm%Nu==0) ); };
+  { assert( (Nk%Nu==0) && (Nm%Nu==0) ); };

  ////////////////////////////////
  // Helpers
@@ -206,7 +206,7 @@ public:
        Glog<<"orthogonalize after: "<<j<<" of "<<k<<" "<< ip <<std::endl;
      }
    }
-    GRID_ASSERT(normalize(w,if_print) != 0);
+    assert(normalize(w,if_print) != 0);
  }
  void reorthogonalize(Field& w, std::vector<Field>& evec, int k)
  {
@@ -225,7 +225,7 @@ public:
        w[i] = w[i] - ip * evec[j];
      }}
    for(int i=0; i<_Nu; ++i)
-      GRID_ASSERT(normalize(w[i],if_print) !=0);
+      assert(normalize(w[i],if_print) !=0);
  }
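The orthogonalize/reorthogonalize hunks above are a classical Gram-Schmidt sweep against the stored eigenvectors, followed by a normalisation whose norm must be non-zero. A dense Eigen sketch of one such pass (vectors stand in for the lattice Fields; normalize returning the norm is an assumption read off the asserts):

#include <Eigen/Dense>
#include <complex>
#include <vector>

double normalizeRef(Eigen::VectorXcd &w)
{
  double n = w.norm();
  if (n != 0.0) w /= n;   // caller asserts n != 0
  return n;
}

void orthogonalizeRef(Eigen::VectorXcd &w,
                      const std::vector<Eigen::VectorXcd> &evec, int k)
{
  for (int j = 0; j < k; j++) {
    std::complex<double> ip = evec[j].dot(w);  // <e_j, w>, conjugate-linear in e_j
    w -= ip * evec[j];                         // project out the e_j component
  }
  normalizeRef(w);
}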
@@ -244,7 +244,7 @@ public:
const uint64_t sites = grid->lSites(); const uint64_t sites = grid->lSites();
int Nbatch = R/Nevec_acc; int Nbatch = R/Nevec_acc;
GRID_ASSERT( R%Nevec_acc == 0 ); assert( R%Nevec_acc == 0 );
// Glog << "nBatch, Nevec_acc, R, Nu = " // Glog << "nBatch, Nevec_acc, R, Nu = "
// << Nbatch << "," << Nevec_acc << "," << R << "," << Nu << std::endl; // << Nbatch << "," << Nevec_acc << "," << R << "," << Nu << std::endl;
@@ -302,7 +302,7 @@ public:
} }
} }
for (int i=0; i<Nu; ++i) { for (int i=0; i<Nu; ++i) {
GRID_ASSERT(normalize(w[i],do_print)!=0); assert(normalize(w[i],do_print)!=0);
} }
Glog << "cuBLAS Zgemm done"<< std::endl; Glog << "cuBLAS Zgemm done"<< std::endl;
@@ -374,8 +374,8 @@ cudaStat = cudaMallocManaged((void **)&evec_acc, Nevec_acc*sites*12*sizeof(CUDA_
{ {
std::string fname = std::string(cname+"::calc_irbl()"); std::string fname = std::string(cname+"::calc_irbl()");
GridBase *grid = evec[0].Grid(); GridBase *grid = evec[0].Grid();
GRID_ASSERT(grid == src[0].Grid()); assert(grid == src[0].Grid());
GRID_ASSERT( Nu = src.size() ); assert( Nu = src.size() );
Glog << std::string(74,'*') << std::endl; Glog << std::string(74,'*') << std::endl;
Glog << fname + " starting iteration 0 / "<< MaxIter<< std::endl; Glog << fname + " starting iteration 0 / "<< MaxIter<< std::endl;
@@ -396,7 +396,7 @@ cudaStat = cudaMallocManaged((void **)&evec_acc, Nevec_acc*sites*12*sizeof(CUDA_
} }
Glog << std::string(74,'*') << std::endl; Glog << std::string(74,'*') << std::endl;
GRID_ASSERT(Nm == evec.size() && Nm == eval.size()); assert(Nm == evec.size() && Nm == eval.size());
std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0)); std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0));
std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0)); std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0));
@@ -579,8 +579,8 @@ cudaStat = cudaMallocManaged((void **)&evec_acc, Nevec_acc*sites*12*sizeof(CUDA_
{ {
std::string fname = std::string(cname+"::calc_rbl()"); std::string fname = std::string(cname+"::calc_rbl()");
GridBase *grid = evec[0].Grid(); GridBase *grid = evec[0].Grid();
GRID_ASSERT(grid == src[0].Grid()); assert(grid == src[0].Grid());
GRID_ASSERT( Nu = src.size() ); assert( Nu = src.size() );
int Np = (Nm-Nk); int Np = (Nm-Nk);
if (Np > 0 && MaxIter > 1) Np /= MaxIter; if (Np > 0 && MaxIter > 1) Np /= MaxIter;
@@ -607,7 +607,7 @@ cudaStat = cudaMallocManaged((void **)&evec_acc, Nevec_acc*sites*12*sizeof(CUDA_
} }
Glog << std::string(74,'*') << std::endl; Glog << std::string(74,'*') << std::endl;
GRID_ASSERT(Nm == evec.size() && Nm == eval.size()); assert(Nm == evec.size() && Nm == eval.size());
std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0)); std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0));
std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0)); std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0));
@@ -785,7 +785,7 @@ private:
int Nu = w.size(); int Nu = w.size();
int Nm = evec.size(); int Nm = evec.size();
GRID_ASSERT( b < Nm/Nu ); assert( b < Nm/Nu );
// GridCartesian *grid = evec[0]._grid; // GridCartesian *grid = evec[0]._grid;
// converts block index to full indicies for an interval [L,R) // converts block index to full indicies for an interval [L,R)
@@ -796,7 +796,7 @@ private:
Glog << "Using split grid"<< std::endl; Glog << "Using split grid"<< std::endl;
// LatticeGaugeField s_Umu(SGrid); // LatticeGaugeField s_Umu(SGrid);
GRID_ASSERT((Nu%mrhs)==0); assert((Nu%mrhs)==0);
std::vector<Field> in(mrhs,f_grid); std::vector<Field> in(mrhs,f_grid);
Field s_in(sf_grid); Field s_in(sf_grid);
@@ -906,7 +906,7 @@ if(split_test){
for (int u=0; u<Nu; ++u) { for (int u=0; u<Nu; ++u) {
// Glog << "norm2(w[" << u << "])= "<< norm2(w[u]) << std::endl; // Glog << "norm2(w[" << u << "])= "<< norm2(w[u]) << std::endl;
GRID_ASSERT (!isnan(norm2(w[u]))); assert (!isnan(norm2(w[u])));
for (int k=L+u; k<R; ++k) { for (int k=L+u; k<R; ++k) {
Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl; Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl;
} }
@@ -929,8 +929,8 @@ if(split_test){
Eigen::MatrixXcd & Qt, // Nm x Nm Eigen::MatrixXcd & Qt, // Nm x Nm
GridBase *grid) GridBase *grid)
{ {
GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 ); assert( Nk%Nu == 0 && Nm%Nu == 0 );
GRID_ASSERT( Nk <= Nm ); assert( Nk <= Nm );
Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk); Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk);
for ( int u=0; u<Nu; ++u ) { for ( int u=0; u<Nu; ++u ) {
@@ -970,8 +970,8 @@ if(split_test){
GridBase *grid) GridBase *grid)
{ {
Glog << "diagonalize_lapack: Nu= "<<Nu<<" Nk= "<<Nk<<" Nm= "<<std::endl; Glog << "diagonalize_lapack: Nu= "<<Nu<<" Nk= "<<Nk<<" Nm= "<<std::endl;
GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 ); assert( Nk%Nu == 0 && Nm%Nu == 0 );
GRID_ASSERT( Nk <= Nm ); assert( Nk <= Nm );
Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk); Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk);
for ( int u=0; u<Nu; ++u ) { for ( int u=0; u<Nu; ++u ) {
@@ -1119,7 +1119,7 @@ if (1){
diagonalize_lapack(eval,lmd,lme,Nu,Nk,Nm,Qt,grid); diagonalize_lapack(eval,lmd,lme,Nu,Nk,Nm,Qt,grid);
#endif #endif
} else { } else {
GRID_ASSERT(0); assert(0);
} }
} }
@@ -1131,8 +1131,8 @@ if (1){
Eigen::MatrixXcd& M) Eigen::MatrixXcd& M)
{ {
//Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n'; //Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n';
GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 ); assert( Nk%Nu == 0 && Nm%Nu == 0 );
GRID_ASSERT( Nk <= Nm ); assert( Nk <= Nm );
M = Eigen::MatrixXcd::Zero(Nk,Nk); M = Eigen::MatrixXcd::Zero(Nk,Nk);
// rearrange // rearrange
@@ -1159,8 +1159,8 @@ if (1){
Eigen::MatrixXcd& M) Eigen::MatrixXcd& M)
{ {
//Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n'; //Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n';
GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 ); assert( Nk%Nu == 0 && Nm%Nu == 0 );
GRID_ASSERT( Nk <= Nm ); assert( Nk <= Nm );
// rearrange // rearrange
for ( int u=0; u<Nu; ++u ) { for ( int u=0; u<Nu; ++u ) {
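For orientation, a sketch assuming the standard block-Lanczos convention with block width Nu: the lmd/lme arrays packed and unpacked by these routines represent a Hermitian block tridiagonal matrix

$$ T \;=\; \begin{pmatrix} A_1 & B_1^\dagger & & \\ B_1 & A_2 & B_2^\dagger & \\ & B_2 & \ddots & \ddots \\ & & \ddots & A_{N_k/N_u} \end{pmatrix}, $$

where each diagonal block $A_i$ ($N_u \times N_u$, Hermitian) is assembled from lmd and each sub-diagonal block $B_i$ from lme; the recurring GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 ) checks guarantee that this blocking is exact.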

View File

@@ -121,7 +121,7 @@ public:
eresid(_eresid), MaxIter(_MaxIter), eresid(_eresid), MaxIter(_MaxIter),
diagonalisation(_diagonalisation), diagonalisation(_diagonalisation),
Nevec_acc(_Nu) Nevec_acc(_Nu)
{ GRID_ASSERT( (Nk%Nu==0) && (Nm%Nu==0) ); }; { assert( (Nk%Nu==0) && (Nm%Nu==0) ); };
//////////////////////////////// ////////////////////////////////
// Helpers // Helpers
@@ -151,7 +151,7 @@ public:
Glog<<"orthogonalize after: "<<j<<" of "<<k<<" "<< ip <<std::endl; Glog<<"orthogonalize after: "<<j<<" of "<<k<<" "<< ip <<std::endl;
} }
} }
GRID_ASSERT(normalize(w,if_print) != 0); assert(normalize(w,if_print) != 0);
} }
void reorthogonalize(Field& w, std::vector<Field>& evec, int k) void reorthogonalize(Field& w, std::vector<Field>& evec, int k)
{ {
@@ -169,7 +169,7 @@ public:
w[i] = w[i] - ip * evec[j]; w[i] = w[i] - ip * evec[j];
}} }}
for(int i=0; i<_Nu; ++i) for(int i=0; i<_Nu; ++i)
GRID_ASSERT(normalize(w[i],if_print) !=0); assert(normalize(w[i],if_print) !=0);
} }
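The orthogonalize/reorthogonalize loops above implement Gram-Schmidt projection of a new vector against the accepted basis; schematically, for basis vectors $e_0,\dots,e_{k-1}$,

$$ w \;\leftarrow\; w \;-\; \sum_{j=0}^{k-1} \langle e_j, w \rangle\, e_j, \qquad w \;\leftarrow\; w / \lVert w \rVert, $$

with the GRID_ASSERT(normalize(w,if_print) != 0) guards catching the breakdown case in which w falls entirely inside the span of the existing basis and projects to (numerically) zero.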
void orthogonalize_blockhead(Field& w, std::vector<Field>& evec, int k, int Nu) void orthogonalize_blockhead(Field& w, std::vector<Field>& evec, int k, int Nu)
@@ -205,8 +205,8 @@ public:
{ {
std::string fname = std::string(cname+"::calc_irbl()"); std::string fname = std::string(cname+"::calc_irbl()");
GridBase *grid = evec[0].Grid(); GridBase *grid = evec[0].Grid();
GRID_ASSERT(grid == src[0].Grid()); assert(grid == src[0].Grid());
GRID_ASSERT( Nu == src.size() ); assert( Nu == src.size() );
Glog << std::string(74,'*') << std::endl; Glog << std::string(74,'*') << std::endl;
Glog << fname + " starting iteration 0 / "<< MaxIter<< std::endl; Glog << fname + " starting iteration 0 / "<< MaxIter<< std::endl;
@@ -227,7 +227,7 @@ public:
} }
Glog << std::string(74,'*') << std::endl; Glog << std::string(74,'*') << std::endl;
GRID_ASSERT(Nm == evec.size() && Nm == eval.size()); assert(Nm == evec.size() && Nm == eval.size());
std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0)); std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0));
std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0)); std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0));
@@ -279,16 +279,16 @@ public:
Qt = Eigen::MatrixXcd::Identity(Nm,Nm); Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
diagonalize(eval2,lmd2,lme2,Nu,Nm,Nm,Qt,grid); diagonalize(eval2,lmd2,lme2,Nu,Nm,Nm,Qt,grid);
_sort.push(eval2,Nm); _sort.push(eval2,Nm);
// Glog << "#Ritz value before shift: "<< std::endl; Glog << "#Ritz value before shift: "<< std::endl;
for(int i=0; i<Nm; ++i){ for(int i=0; i<Nm; ++i){
// std::cout.precision(13); std::cout.precision(13);
// std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] "; std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
// std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl; std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
} }
//---------------------------------------------------------------------- //----------------------------------------------------------------------
if ( Nm>Nk ) { if ( Nm>Nk ) {
// Glog <<" #Apply shifted QR transformations "<<std::endl; Glog <<" #Apply shifted QR transformations "<<std::endl;
//int k2 = Nk+Nu; //int k2 = Nk+Nu;
int k2 = Nk; int k2 = Nk;
@@ -326,7 +326,7 @@ public:
Qt = Eigen::MatrixXcd::Identity(Nm,Nm); Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
diagonalize(eval2,lmd2,lme2,Nu,Nk,Nm,Qt,grid); diagonalize(eval2,lmd2,lme2,Nu,Nk,Nm,Qt,grid);
_sort.push(eval2,Nk); _sort.push(eval2,Nk);
// Glog << "#Ritz value after shift: "<< std::endl; Glog << "#Ritz value after shift: "<< std::endl;
for(int i=0; i<Nk; ++i){ for(int i=0; i<Nk; ++i){
// std::cout.precision(13); // std::cout.precision(13);
// std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] "; // std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
@@ -413,8 +413,8 @@ public:
{ {
std::string fname = std::string(cname+"::calc_rbl()"); std::string fname = std::string(cname+"::calc_rbl()");
GridBase *grid = evec[0].Grid(); GridBase *grid = evec[0].Grid();
GRID_ASSERT(grid == src[0].Grid()); assert(grid == src[0].Grid());
GRID_ASSERT( Nu == src.size() ); assert( Nu == src.size() );
int Np = (Nm-Nk); int Np = (Nm-Nk);
if (Np > 0 && MaxIter > 1) Np /= MaxIter; if (Np > 0 && MaxIter > 1) Np /= MaxIter;
@@ -441,7 +441,7 @@ public:
} }
Glog << std::string(74,'*') << std::endl; Glog << std::string(74,'*') << std::endl;
GRID_ASSERT(Nm == evec.size() && Nm == eval.size()); assert(Nm == evec.size() && Nm == eval.size());
std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0)); std::vector<std::vector<ComplexD>> lmd(Nu,std::vector<ComplexD>(Nm,0.0));
std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0)); std::vector<std::vector<ComplexD>> lme(Nu,std::vector<ComplexD>(Nm,0.0));
@@ -622,7 +622,7 @@ private:
int Nu = w.size(); int Nu = w.size();
int Nm = evec.size(); int Nm = evec.size();
GRID_ASSERT( b < Nm/Nu ); assert( b < Nm/Nu );
// converts block index to full indices for an interval [L,R) // converts block index to full indices for an interval [L,R)
int L = Nu*b; int L = Nu*b;
@@ -630,7 +630,7 @@ private:
Real beta; Real beta;
GRID_ASSERT((Nu%mrhs)==0); assert((Nu%mrhs)==0);
std::vector<Field> in(mrhs,f_grid); std::vector<Field> in(mrhs,f_grid);
std::vector<Field> out(mrhs,f_grid); std::vector<Field> out(mrhs,f_grid);
@@ -644,7 +644,7 @@ private:
// for (int u=0; u<mrhs; ++u) Glog << " out["<<u<<"] = "<<norm2(out[u])<<std::endl; // for (int u=0; u<mrhs; ++u) Glog << " out["<<u<<"] = "<<norm2(out[u])<<std::endl;
k_start +=mrhs; k_start +=mrhs;
} }
// Glog << "LinAlg "<< std::endl; Glog << "LinAlg "<< std::endl;
if (b>0) { if (b>0) {
for (int u=0; u<Nu; ++u) { for (int u=0; u<Nu; ++u) {
@@ -678,7 +678,7 @@ private:
} }
w_copy[u] = w[u]; w_copy[u] = w[u];
} }
// Glog << "LinAlg done"<< std::endl; Glog << "LinAlg done"<< std::endl;
// In block version, the steps 6 and 7 in Lanczos construction is // In block version, the steps 6 and 7 in Lanczos construction is
// replaced by the QR decomposition of new basis block. // replaced by the QR decomposition of new basis block.
@@ -691,15 +691,15 @@ private:
} }
// re-orthogonalization for numerical stability // re-orthogonalization for numerical stability
// Glog << "Gram Schmidt"<< std::endl; Glog << "Gram Schmidt"<< std::endl;
orthogonalize(w,Nu,evec,R); orthogonalize(w,Nu,evec,R);
// QR part // QR part
for (int u=1; u<Nu; ++u) { for (int u=1; u<Nu; ++u) {
orthogonalize(w[u],w,u); orthogonalize(w[u],w,u);
} }
// Glog << "Gram Schmidt done "<< std::endl; Glog << "Gram Schmidt done "<< std::endl;
// Glog << "LinAlg "<< std::endl; Glog << "LinAlg "<< std::endl;
for (int u=0; u<Nu; ++u) { for (int u=0; u<Nu; ++u) {
//for (int v=0; v<Nu; ++v) { //for (int v=0; v<Nu; ++v) {
for (int v=u; v<Nu; ++v) { for (int v=u; v<Nu; ++v) {
@@ -711,12 +711,12 @@ private:
for (int u=0; u<Nu; ++u) { for (int u=0; u<Nu; ++u) {
// Glog << "norm2(w[" << u << "])= "<< norm2(w[u]) << std::endl; // Glog << "norm2(w[" << u << "])= "<< norm2(w[u]) << std::endl;
GRID_ASSERT (!isnan(norm2(w[u]))); assert (!isnan(norm2(w[u])));
for (int k=L+u; k<R; ++k) { for (int k=L+u; k<R; ++k) {
// Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl; // Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl;
} }
} }
// Glog << "LinAlg done "<< std::endl; Glog << "LinAlg done "<< std::endl;
if (b < Nm/Nu-1) { if (b < Nm/Nu-1) {
for (int u=0; u<Nu; ++u) { for (int u=0; u<Nu; ++u) {
@@ -734,8 +734,8 @@ private:
Eigen::MatrixXcd & Qt, // Nm x Nm Eigen::MatrixXcd & Qt, // Nm x Nm
GridBase *grid) GridBase *grid)
{ {
GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 ); assert( Nk%Nu == 0 && Nm%Nu == 0 );
GRID_ASSERT( Nk <= Nm ); assert( Nk <= Nm );
Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk); Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk);
for ( int u=0; u<Nu; ++u ) { for ( int u=0; u<Nu; ++u ) {
@@ -775,8 +775,8 @@ private:
GridBase *grid) GridBase *grid)
{ {
Glog << "diagonalize_lapack: Nu= "<<Nu<<" Nk= "<<Nk<<" Nm= "<<Nm<<std::endl; Glog << "diagonalize_lapack: Nu= "<<Nu<<" Nk= "<<Nk<<" Nm= "<<Nm<<std::endl;
GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 ); assert( Nk%Nu == 0 && Nm%Nu == 0 );
GRID_ASSERT( Nk <= Nm ); assert( Nk <= Nm );
Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk); Eigen::MatrixXcd BlockTriDiag = Eigen::MatrixXcd::Zero(Nk,Nk);
for ( int u=0; u<Nu; ++u ) { for ( int u=0; u<Nu; ++u ) {
@@ -924,7 +924,7 @@ if (1){
diagonalize_lapack(eval,lmd,lme,Nu,Nk,Nm,Qt,grid); diagonalize_lapack(eval,lmd,lme,Nu,Nk,Nm,Qt,grid);
#endif #endif
} else { } else {
GRID_ASSERT(0); assert(0);
} }
} }
@@ -935,9 +935,9 @@ if (1){
int Nu, int Nb, int Nk, int Nm, int Nu, int Nb, int Nk, int Nm,
Eigen::MatrixXcd& M) Eigen::MatrixXcd& M)
{ {
// Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n'; Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n';
GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 ); assert( Nk%Nu == 0 && Nm%Nu == 0 );
GRID_ASSERT( Nk <= Nm ); assert( Nk <= Nm );
M = Eigen::MatrixXcd::Zero(Nk,Nk); M = Eigen::MatrixXcd::Zero(Nk,Nk);
// rearrange // rearrange
@@ -953,7 +953,7 @@ if (1){
M(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu]; M(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu];
} }
} }
// Glog << "unpackHermitBlockTriDiagMatToEigen() end" << std::endl; Glog << "unpackHermitBlockTriDiagMatToEigen() end" << std::endl;
} }
@@ -963,9 +963,9 @@ if (1){
int Nu, int Nb, int Nk, int Nm, int Nu, int Nb, int Nk, int Nm,
Eigen::MatrixXcd& M) Eigen::MatrixXcd& M)
{ {
// Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n'; Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n';
GRID_ASSERT( Nk%Nu == 0 && Nm%Nu == 0 ); assert( Nk%Nu == 0 && Nm%Nu == 0 );
GRID_ASSERT( Nk <= Nm ); assert( Nk <= Nm );
// rearrange // rearrange
for ( int u=0; u<Nu; ++u ) { for ( int u=0; u<Nu; ++u ) {
@@ -979,7 +979,7 @@ if (1){
lme[u][k-Nu] = M(u+(k/Nu)*Nu,k-Nu); lme[u][k-Nu] = M(u+(k/Nu)*Nu,k-Nu);
} }
} }
// Glog << "packHermitBlockTriDiagMatfromEigen() end" <<std::endl; Glog << "packHermitBlockTriDiagMatfromEigen() end" <<std::endl;
} }
@@ -988,7 +988,7 @@ if (1){
RealD Dsh, RealD Dsh,
Eigen::MatrixXcd& Qprod) Eigen::MatrixXcd& Qprod)
{ {
// Glog << "shiftedQRDecompEigen() begin" << '\n'; Glog << "shiftedQRDecompEigen() begin" << '\n';
Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm); Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm);
Eigen::MatrixXcd R = Eigen::MatrixXcd::Zero(Nm,Nm); Eigen::MatrixXcd R = Eigen::MatrixXcd::Zero(Nm,Nm);
Eigen::MatrixXcd Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm); Eigen::MatrixXcd Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
@@ -1004,7 +1004,7 @@ if (1){
// lower triangular part used to represent series // lower triangular part used to represent series
// of Q sequence. // of Q sequence.
// Glog << "shiftedQRDecompEigen() Householder & QR" << '\n'; Glog << "shiftedQRDecompEigen() Householder & QR" << '\n';
// equivalent operation of Qprod *= Q // equivalent operation of Qprod *= Q
//M = Eigen::MatrixXcd::Zero(Nm,Nm); //M = Eigen::MatrixXcd::Zero(Nm,Nm);
@@ -1025,7 +1025,7 @@ if (1){
Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm); Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
// Glog << "shiftedQRDecompEigen() Mtmp create" << '\n'; Glog << "shiftedQRDecompEigen() Mtmp create" << '\n';
for (int i=0; i<Nm; ++i) { for (int i=0; i<Nm; ++i) {
for (int j=0; j<Nm-(Nu+1); ++j) { for (int j=0; j<Nm-(Nu+1); ++j) {
for (int k=0; k<Nu+1+j; ++k) { for (int k=0; k<Nu+1+j; ++k) {
@@ -1033,7 +1033,7 @@ if (1){
} }
} }
} }
// Glog << "shiftedQRDecompEigen() Mtmp loop1" << '\n'; Glog << "shiftedQRDecompEigen() Mtmp loop1" << '\n';
for (int i=0; i<Nm; ++i) { for (int i=0; i<Nm; ++i) {
for (int j=Nm-(Nu+1); j<Nm; ++j) { for (int j=Nm-(Nu+1); j<Nm; ++j) {
for (int k=0; k<Nm; ++k) { for (int k=0; k<Nm; ++k) {
@@ -1041,7 +1041,7 @@ if (1){
} }
} }
} }
// Glog << "shiftedQRDecompEigen() Mtmp loop2" << '\n'; Glog << "shiftedQRDecompEigen() Mtmp loop2" << '\n';
//static int ntimes = 2; //static int ntimes = 2;
//for (int j=0; j<Nm-(ntimes*Nu); ++j) { //for (int j=0; j<Nm-(ntimes*Nu); ++j) {
@@ -1067,13 +1067,13 @@ if (1){
Mtmp(j,i) = conj(Mtmp(i,j)); Mtmp(j,i) = conj(Mtmp(i,j));
} }
} }
// Glog << "shiftedQRDecompEigen() Mtmp loop3" << '\n'; Glog << "shiftedQRDecompEigen() Mtmp loop3" << '\n';
for (int i=0; i<Nm; ++i) { for (int i=0; i<Nm; ++i) {
Mtmp(i,i) = real(Mtmp(i,i)) + Dsh; Mtmp(i,i) = real(Mtmp(i,i)) + Dsh;
} }
// Glog << "shiftedQRDecompEigen() Mtmp loop4" << '\n'; Glog << "shiftedQRDecompEigen() Mtmp loop4" << '\n';
M = Mtmp; M = Mtmp;
//M = Q.adjoint()*(M*Q); //M = Q.adjoint()*(M*Q);
@@ -1085,7 +1085,7 @@ if (1){
// } // }
//} //}
// Glog << "shiftedQRDecompEigen() end" <<std::endl; Glog << "shiftedQRDecompEigen() end" <<std::endl;
} }
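As a reminder of what shiftedQRDecompEigen computes (a standard shifted QR step; the triangular bookkeeping in the loops above is the Grid-specific part): with shift Dsh $= \mu$, one iteration maps

$$ M - \mu I = Q R, \qquad M' = R Q + \mu I = Q^\dagger M Q, $$

so $M'$ remains Hermitian and unitarily similar to $M$ while components along eigenvalues near $\mu$ are deflated; Qprod accumulates the successive Q factors so the Ritz basis can be rotated once at the end rather than after every shift.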
void exampleQRDecompEigen(void) void exampleQRDecompEigen(void)

View File

@@ -211,7 +211,7 @@ until convergence
void calc(std::vector<RealD>& eval, std::vector<Field>& evec, const Field& src, int& Nconv, bool reverse=false) void calc(std::vector<RealD>& eval, std::vector<Field>& evec, const Field& src, int& Nconv, bool reverse=false)
{ {
GridBase *grid = src.Grid(); GridBase *grid = src.Grid();
GRID_ASSERT(grid == evec[0].Grid()); assert(grid == evec[0].Grid());
// GridLogIRL.TimingMode(1); // GridLogIRL.TimingMode(1);
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl; std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
@@ -231,7 +231,7 @@ until convergence
} }
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl; std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
GRID_ASSERT(Nm <= evec.size() && Nm <= eval.size()); assert(Nm <= evec.size() && Nm <= eval.size());
// quickly get an idea of the largest eigenvalue to more properly normalize the residuum // quickly get an idea of the largest eigenvalue to more properly normalize the residuum
RealD evalMaxApprox = 0.0; RealD evalMaxApprox = 0.0;
@@ -245,10 +245,9 @@ until convergence
_HermOp(src_n,tmp); _HermOp(src_n,tmp);
// std::cout << GridLogMessage<< tmp<<std::endl; exit(0); // std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
// std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl; // std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
// RealD vnum = real(innerProduct(src_n,tmp)); // HermOp. RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vnum = real(innerProduct(tmp,tmp)); // HermOp^2.
RealD vden = norm2(src_n); RealD vden = norm2(src_n);
RealD na = std::sqrt(vnum/vden); RealD na = vnum/vden;
if (fabs(evalMaxApprox/na - 1.0) < 0.0001) if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
i=_MAX_ITER_IRL_MEVAPP_; i=_MAX_ITER_IRL_MEVAPP_;
evalMaxApprox = na; evalMaxApprox = na;
@@ -256,7 +255,6 @@ until convergence
src_n = tmp; src_n = tmp;
} }
} }
std::cout << GridLogIRL << " Final evalMaxApprox " << evalMaxApprox << std::endl;
std::vector<RealD> lme(Nm); std::vector<RealD> lme(Nm);
std::vector<RealD> lme2(Nm); std::vector<RealD> lme2(Nm);
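The estimator change above is consistent: with vnum now holding the HermOp^2 matrix element $\langle Hv, Hv\rangle$, the largest eigenvalue is approximated by the operator-norm form

$$ \lambda_{\max} \;\approx\; \frac{\lVert H v \rVert}{\lVert v \rVert} \;=\; \sqrt{\frac{\langle Hv,\, Hv\rangle}{\langle v,\, v\rangle}}, $$

hence the added std::sqrt. Both this and the plain Rayleigh quotient $\langle v,Hv\rangle/\langle v,v\rangle$ converge to $|\lambda_{\max}|$ as src_n aligns with the top eigenvector under the repeated _HermOp applications.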
@@ -337,7 +335,7 @@ until convergence
} }
std::cout<<GridLogIRL <<"QR decomposed "<<std::endl; std::cout<<GridLogIRL <<"QR decomposed "<<std::endl;
GRID_ASSERT(k2<Nm); GRID_ASSERT(k1>0); assert(k2<Nm); assert(k1>0);
basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis
std::cout<<GridLogIRL <<"basisRotated by Qt ["<<k1-1<<","<<k2+1<<")"<<std::endl; std::cout<<GridLogIRL <<"basisRotated by Qt ["<<k1-1<<","<<k2+1<<")"<<std::endl;
@@ -463,7 +461,7 @@ until convergence
{ {
std::cout<<GridLogDebug << "Lanczos step " <<k<<std::endl; std::cout<<GridLogDebug << "Lanczos step " <<k<<std::endl;
const RealD tiny = 1.0e-20; const RealD tiny = 1.0e-20;
GRID_ASSERT( k< Nm ); assert( k< Nm );
GridStopWatch gsw_op,gsw_o; GridStopWatch gsw_op,gsw_o;
@@ -597,7 +595,7 @@ until convergence
} else if ( diagonalisation == IRLdiagonaliseWithEigen ) { } else if ( diagonalisation == IRLdiagonaliseWithEigen ) {
diagonalize_Eigen(lmd,lme,Nk,Nm,Qt,grid); diagonalize_Eigen(lmd,lme,Nk,Nm,Qt,grid);
} else { } else {
GRID_ASSERT(0); assert(0);
} }
} }
@@ -687,7 +685,7 @@ void diagonalize_lapack(std::vector<RealD>& lmd,
} }
} }
#else #else
GRID_ASSERT(0); assert(0);
#endif #endif
} }

View File

@@ -80,7 +80,7 @@ public:
ProjectedHermOp(LinearOperatorBase<FineField>& linop, std::vector<FineField> & _subspace) : ProjectedHermOp(LinearOperatorBase<FineField>& linop, std::vector<FineField> & _subspace) :
_Linop(linop), subspace(_subspace) _Linop(linop), subspace(_subspace)
{ {
GRID_ASSERT(subspace.size() >0); assert(subspace.size() >0);
}; };
void operator()(const CoarseField& in, CoarseField& out) { void operator()(const CoarseField& in, CoarseField& out) {
@@ -346,12 +346,12 @@ public:
void testFine(RealD resid) void testFine(RealD resid)
{ {
GRID_ASSERT(evals_fine.size() == nbasis); assert(evals_fine.size() == nbasis);
GRID_ASSERT(subspace.size() == nbasis); assert(subspace.size() == nbasis);
PlainHermOp<FineField> Op(_FineOp); PlainHermOp<FineField> Op(_FineOp);
ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op); ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op);
for(int k=0;k<nbasis;k++){ for(int k=0;k<nbasis;k++){
GRID_ASSERT(SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0)==1); assert(SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0)==1);
} }
} }
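testFine accepts an eigenpair only when ReconstructEval returns success. A hedged sketch of what such a tester conventionally verifies (the exact normalisation inside Grid's ImplicitlyRestartedLanczosHermOpTester is not shown in this hunk) is a relative residual bound of the form

$$ \lVert H v_k - \lambda_k v_k \rVert \;\lesssim\; \mathrm{resid} \times \mathrm{relax}, $$

with the relaxation factor passed as the final argument (1.0 here, i.e. no slack).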
@@ -359,8 +359,8 @@ public:
//hence the smoother can be tuned after running the coarse Lanczos by using a different smoother here //hence the smoother can be tuned after running the coarse Lanczos by using a different smoother here
void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax) void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax)
{ {
GRID_ASSERT(evals_fine.size() == nbasis); assert(evals_fine.size() == nbasis);
GRID_ASSERT(subspace.size() == nbasis); assert(subspace.size() == nbasis);
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
// create a smoother and see if we can get a cheap convergence test and smooth inside the IRL // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
@@ -380,7 +380,7 @@ public:
void calcFine(ChebyParams cheby_parms,int Nstop,int Nk,int Nm,RealD resid, void calcFine(ChebyParams cheby_parms,int Nstop,int Nk,int Nm,RealD resid,
RealD MaxIt, RealD betastp, int MinRes) RealD MaxIt, RealD betastp, int MinRes)
{ {
GRID_ASSERT(nbasis<=Nm); assert(nbasis<=Nm);
Chebyshev<FineField> Cheby(cheby_parms); Chebyshev<FineField> Cheby(cheby_parms);
FunctionHermOp<FineField> ChebyOp(Cheby,_FineOp); FunctionHermOp<FineField> ChebyOp(Cheby,_FineOp);
PlainHermOp<FineField> Op(_FineOp); PlainHermOp<FineField> Op(_FineOp);
@@ -400,8 +400,8 @@ public:
IRL.calc(evals_fine,subspace,src,Nconv,false); IRL.calc(evals_fine,subspace,src,Nconv,false);
// Shrink down to number saved // Shrink down to number saved
GRID_ASSERT(Nstop>=nbasis); assert(Nstop>=nbasis);
GRID_ASSERT(Nconv>=nbasis); assert(Nconv>=nbasis);
evals_fine.resize(nbasis); evals_fine.resize(nbasis);
subspace.resize(nbasis,_FineGrid); subspace.resize(nbasis,_FineGrid);
} }
@@ -433,7 +433,7 @@ public:
ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes); ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
int Nconv=0; int Nconv=0;
IRL.calc(evals_coarse,evec_coarse,src,Nconv,false); IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
GRID_ASSERT(Nconv>=Nstop); assert(Nconv>=Nstop);
evals_coarse.resize(Nstop); evals_coarse.resize(Nstop);
evec_coarse.resize (Nstop,_CoarseGrid); evec_coarse.resize (Nstop,_CoarseGrid);
for (int i=0;i<Nstop;i++){ for (int i=0;i<Nstop;i++){

View File

@@ -35,7 +35,7 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> {
public: public:
using OperatorFunction<Field>::operator(); using OperatorFunction<Field>::operator();
bool ErrorOnNoConverge; // throw a GRID_ASSERT when the MR fails to converge. bool ErrorOnNoConverge; // throw an assert when the MR fails to converge.
// Defaults true. // Defaults true.
RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
@@ -59,7 +59,7 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> {
// Initial residual computation & set up // Initial residual computation & set up
RealD guess = norm2(psi); RealD guess = norm2(psi);
GRID_ASSERT(std::isnan(guess) == 0); assert(std::isnan(guess) == 0);
RealD ssq = norm2(src); RealD ssq = norm2(src);
RealD rsq = Tolerance * Tolerance * ssq; RealD rsq = Tolerance * Tolerance * ssq;
@@ -136,7 +136,7 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> {
std::cout << GridLogMessage << "MR Time elapsed: Linalg " << LinalgTimer.Elapsed() << std::endl; std::cout << GridLogMessage << "MR Time elapsed: Linalg " << LinalgTimer.Elapsed() << std::endl;
if (ErrorOnNoConverge) if (ErrorOnNoConverge)
GRID_ASSERT(true_residual / Tolerance < 10000.0); assert(true_residual / Tolerance < 10000.0);
IterationsToComplete = k; IterationsToComplete = k;
@@ -148,7 +148,7 @@ template<class Field> class MinimalResidual : public OperatorFunction<Field> {
<< std::endl; << std::endl;
if (ErrorOnNoConverge) if (ErrorOnNoConverge)
GRID_ASSERT(0); assert(0);
IterationsToComplete = k; IterationsToComplete = k;
} }
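For context on the convergence checks above: a standard minimal-residual iteration (the loop body is elided from this hunk) steps along the current residual with the coefficient that minimises the next residual norm,

$$ \alpha = \frac{\langle M r,\, r \rangle}{\lVert M r \rVert^2}, \qquad \psi \leftarrow \psi + \alpha\, r, \qquad r \leftarrow r - \alpha\, M r, $$

and rsq = Tolerance^2 * ssq is the squared target, so the solve is declared converged once $\lVert r \rVert^2 \le \mathrm{rsq}$.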

View File

@@ -37,7 +37,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
using OperatorFunction<FieldD>::operator(); using OperatorFunction<FieldD>::operator();
bool ErrorOnNoConverge; // Throw a GRID_ASSERT when MPFGMRES fails to converge, bool ErrorOnNoConverge; // Throw an assert when MPFGMRES fails to converge,
// defaults to true // defaults to true
RealD Tolerance; RealD Tolerance;
@@ -91,7 +91,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
conformable(psi, src); conformable(psi, src);
RealD guess = norm2(psi); RealD guess = norm2(psi);
GRID_ASSERT(std::isnan(guess) == 0); assert(std::isnan(guess) == 0);
RealD cp; RealD cp;
RealD ssq = norm2(src); RealD ssq = norm2(src);
@@ -150,7 +150,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
std::cout << GridLogMessage << "MPFGMRES did NOT converge" << std::endl; std::cout << GridLogMessage << "MPFGMRES did NOT converge" << std::endl;
if (ErrorOnNoConverge) if (ErrorOnNoConverge)
GRID_ASSERT(0); assert(0);
} }
RealD outerLoopBody(LinearOperatorBase<FieldD> &LinOp, const FieldD &src, FieldD &psi, RealD rsq) { RealD outerLoopBody(LinearOperatorBase<FieldD> &LinOp, const FieldD &src, FieldD &psi, RealD rsq) {
@@ -197,7 +197,7 @@ class MixedPrecisionFlexibleGeneralisedMinimalResidual : public OperatorFunction
} }
} }
GRID_ASSERT(0); // Never reached assert(0); // Never reached
return cp; return cp;
} }

View File

@@ -60,32 +60,6 @@ public:
} }
}; };
template<class Field> class NormalResidual : public LinearFunction<Field>{
private:
SparseMatrixBase<Field> & _Matrix;
OperatorFunction<Field> & _HermitianSolver;
LinearFunction<Field> & _Guess;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations trick
/////////////////////////////////////////////////////
NormalResidual(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver,
LinearFunction<Field> &Guess)
: _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {};
void operator() (const Field &in, Field &out){
Field res(in.Grid());
Field tmp(in.Grid());
MMdagLinearOperator<SparseMatrixBase<Field>,Field> MMdagOp(_Matrix);
_Guess(in,res);
_HermitianSolver(MMdagOp,in,res); // M Mdag res = in ;
_Matrix.Mdag(res,out); // out = Mdag res
}
};
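The deleted NormalResidual wrapper implemented the MMdag form of the normal-equations trick: instead of inverting a (possibly non-Hermitian) M directly, it solved a manifestly Hermitian system and back-multiplied,

$$ M M^\dagger y = b, \qquad x = M^\dagger y \;\Rightarrow\; M x = b, $$

which produces the minimum-norm solution and lets the Hermitian-only _HermitianSolver (acting through MMdagLinearOperator) handle M, at the usual cost of squaring the condition number.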
template<class Field> class HPDSolver : public LinearFunction<Field> { template<class Field> class HPDSolver : public LinearFunction<Field> {
private: private:
LinearOperatorBase<Field> & _Matrix; LinearOperatorBase<Field> & _Matrix;

View File

@@ -20,7 +20,7 @@ template<class Field> class PowerMethod
RealD evalMaxApprox = 0.0; RealD evalMaxApprox = 0.0;
auto src_n = src; auto src_n = src;
auto tmp = src; auto tmp = src;
const int _MAX_ITER_EST_ = 200; const int _MAX_ITER_EST_ = 100;
for (int i=0;i<_MAX_ITER_EST_;i++) { for (int i=0;i<_MAX_ITER_EST_;i++) {
@@ -30,17 +30,18 @@ template<class Field> class PowerMethod
RealD vden = norm2(src_n); RealD vden = norm2(src_n);
RealD na = vnum/vden; RealD na = vnum/vden;
std::cout << GridLogMessage << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl; std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
// if ( (fabs(evalMaxApprox/na - 1.0) < 0.0001) || (i==_MAX_ITER_EST_-1) ) { if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) {
// evalMaxApprox = na;
// return evalMaxApprox;
// }
evalMaxApprox = na; evalMaxApprox = na;
src_n = tmp;
}
std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl; std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
return evalMaxApprox; return evalMaxApprox;
} }
evalMaxApprox = na;
src_n = tmp;
}
assert(0);
return 0;
}
}; };
} }
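The control-flow change above drops the early-exit test (commented out on the new side), so the loop now always runs the full _MAX_ITER_EST_ sweeps and returns the final estimate instead of asserting when the tolerance is never met. A stripped-down, self-contained scalar analogue (illustrative only; a dense 2x2 matrix stands in for the Grid Field and LinearOperator):

#include <array>
#include <cmath>
#include <cstdio>

int main() {
  // Dense 2x2 symmetric matrix with eigenvalues 3 and 1.
  std::array<std::array<double,2>,2> H = {{ {{2.0, 1.0}}, {{1.0, 2.0}} }};
  std::array<double,2> v = {1.0, 0.0};          // src_n analogue
  double evalMaxApprox = 0.0;
  const int MAX_ITER_EST = 100;
  for (int i = 0; i < MAX_ITER_EST; i++) {
    std::array<double,2> tmp = {
      H[0][0]*v[0] + H[0][1]*v[1],              // tmp = H v
      H[1][0]*v[0] + H[1][1]*v[1]
    };
    double vnum = tmp[0]*v[0] + tmp[1]*v[1];    // <v, Hv>
    double vden = v[0]*v[0] + v[1]*v[1];        // <v, v>
    evalMaxApprox = vnum / vden;                // Rayleigh quotient, "na"
    double nrm = std::sqrt(tmp[0]*tmp[0] + tmp[1]*tmp[1]);
    v = { tmp[0]/nrm, tmp[1]/nrm };             // src_n = normalised H v
  }
  std::printf("largest eigenvalue ~ %f (exact 3)\n", evalMaxApprox);
  return 0;
}

Run for enough sweeps, the quotient settles at the dominant eigenvalue; the Grid version additionally logs each intermediate estimate through GridLogMessage.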

View File

@@ -1,76 +0,0 @@
#pragma once
namespace Grid {
class Band
{
RealD lo, hi;
public:
Band(RealD _lo,RealD _hi)
{
lo=_lo;
hi=_hi;
}
RealD operator() (RealD x){
if ( x>lo && x<hi ){
return 1.0;
} else {
return 0.0;
}
}
};
class PowerSpectrum
{
public:
template<typename T> static RealD normalise(T& v)
{
RealD nn = norm2(v);
nn = sqrt(nn);
v = v * (1.0/nn);
return nn;
}
std::vector<RealD> ranges;
std::vector<int> order;
PowerSpectrum( std::vector<RealD> &bins, std::vector<int> &_order ) : ranges(bins), order(_order) { };
template<class Field>
RealD operator()(LinearOperatorBase<Field> &HermOp, const Field &src)
{
GridBase *grid = src.Grid();
int N=ranges.size();
RealD hi = ranges[N-1];
RealD lo_band = 0.0;
RealD hi_band;
RealD nn=norm2(src);
RealD ss=0.0;
Field tmp = src;
for(int b=0;b<N;b++){
hi_band = ranges[b];
Band Notch(lo_band,hi_band);
Chebyshev<Field> polynomial;
polynomial.Init(0.0,hi,order[b],Notch);
polynomial.JacksonSmooth();
polynomial(HermOp,src,tmp) ;
RealD p=norm2(tmp);
ss=ss+p;
std::cout << GridLogMessage << " PowerSpectrum Band["<<lo_band<<","<<hi_band<<"] power "<<norm2(tmp)/nn<<std::endl;
lo_band=hi_band;
}
std::cout << GridLogMessage << " PowerSpectrum total power "<<ss/nn<<std::endl;
std::cout << GridLogMessage << " PowerSpectrum total power (unnormalised) "<<nn<<std::endl;
return 0;
};
};
}
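What the removed PowerSpectrum class measured, schematically: for each band b it built a Jackson-smoothed Chebyshev approximation $h_b$ to the indicator function of $[lo_b, hi_b)$ and reported the fraction of the source's spectral weight inside the band,

$$ P_b \;=\; \frac{\lVert h_b(H)\, \mathrm{src} \rVert^2}{\lVert \mathrm{src} \rVert^2} \;\approx\; \frac{1}{\lVert \mathrm{src} \rVert^2} \sum_{\lambda_i \in [lo_b,\, hi_b)} \lvert \langle v_i, \mathrm{src} \rangle \rvert^2, $$

so the per-band numbers should sum to approximately the total power printed at the end (up to smoothing leakage between adjacent bands).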

View File

@@ -112,7 +112,7 @@ public:
} }
std::cout<<GridLogMessage<<"PrecConjugateResidual did NOT converge"<<std::endl; std::cout<<GridLogMessage<<"PrecConjugateResidual did NOT converge"<<std::endl;
GRID_ASSERT(0); assert(0);
} }
}; };
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@@ -118,7 +118,7 @@ public:
} }
GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl; GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
// GRID_ASSERT(0); // assert(0);
} }
RealD GCRnStep(const Field &src, Field &psi,RealD rsq){ RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
@@ -221,7 +221,7 @@ public:
int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history. int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history.
for(int back=0;back<northog;back++){ for(int back=0;back<northog;back++){
int peri_back=(k-back)%mmax; GRID_ASSERT((k-back)>=0); int peri_back=(k-back)%mmax; assert((k-back)>=0);
b=-real(innerProduct(q[peri_back],Az))/qq[peri_back]; b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
p[peri_kp]=p[peri_kp]+b*p[peri_back]; p[peri_kp]=p[peri_kp]+b*p[peri_back];
@@ -231,7 +231,7 @@ public:
qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
LinalgTimer.Stop(); LinalgTimer.Stop();
} }
GRID_ASSERT(0); // never reached assert(0); // never reached
return cp; return cp;
} }
}; };

View File

@@ -74,7 +74,7 @@ public:
void operator() (const Field &src, Field &psi){ void operator() (const Field &src, Field &psi){
// psi=Zero(); psi=Zero();
RealD cp, ssq,rsq; RealD cp, ssq,rsq;
ssq=norm2(src); ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq; rsq=Tolerance*Tolerance*ssq;
@@ -113,7 +113,7 @@ public:
} }
GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl; GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
// GRID_ASSERT(0); // assert(0);
} }
RealD GCRnStep(const Field &src, Field &psi,RealD rsq){ RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
@@ -224,7 +224,7 @@ public:
int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history. int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history.
for(int back=0;back<northog;back++){ for(int back=0;back<northog;back++){
int peri_back=(k-back)%mmax; GRID_ASSERT((k-back)>=0); int peri_back=(k-back)%mmax; assert((k-back)>=0);
b=-real(innerProduct(q[peri_back],Az))/qq[peri_back]; b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
p[peri_kp]=p[peri_kp]+b*p[peri_back]; p[peri_kp]=p[peri_kp]+b*p[peri_back];
@@ -234,7 +234,7 @@ public:
qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
LinalgTimer.Stop(); LinalgTimer.Stop();
} }
GRID_ASSERT(0); // never reached assert(0); // never reached
return cp; return cp;
} }
}; };

View File

@@ -79,7 +79,7 @@ class QuasiMinimalResidual : public OperatorFunction<Field> {
LinOp.Op(x,r); r = b - r; LinOp.Op(x,r); r = b - r;
GRID_ASSERT(normb> 0.0); assert(normb> 0.0);
resid = norm2(r)/normb; resid = norm2(r)/normb;
if (resid <= Tolerance) { if (resid <= Tolerance) {
@@ -105,8 +105,8 @@ class QuasiMinimalResidual : public OperatorFunction<Field> {
for (int i = 1; i <= MaxIterations; i++) { for (int i = 1; i <= MaxIterations; i++) {
// Breakdown tests // Breakdown tests
GRID_ASSERT( rho != 0.0); assert( rho != 0.0);
GRID_ASSERT( xi != 0.0); assert( xi != 0.0);
v = (1. / rho) * v_tld; v = (1. / rho) * v_tld;
y = (1. / rho) * y; y = (1. / rho) * y;
@@ -134,10 +134,10 @@ class QuasiMinimalResidual : public OperatorFunction<Field> {
ep=Zep.real(); ep=Zep.real();
std::cout << "Zep "<<Zep <<std::endl; std::cout << "Zep "<<Zep <<std::endl;
// Complex Audit // Complex Audit
GRID_ASSERT(abs(ep)>0); assert(abs(ep)>0);
beta = ep / delta; beta = ep / delta;
GRID_ASSERT(abs(beta)>0); assert(abs(beta)>0);
v_tld = p_tld - beta * v; v_tld = p_tld - beta * v;
y = v_tld; y = v_tld;
@@ -158,7 +158,7 @@ class QuasiMinimalResidual : public OperatorFunction<Field> {
std::cout << "theta "<<theta<<std::endl; std::cout << "theta "<<theta<<std::endl;
std::cout << "gamma "<<gamma<<std::endl; std::cout << "gamma "<<gamma<<std::endl;
GRID_ASSERT(abs(gamma)> 0.0); assert(abs(gamma)> 0.0);
eta = -eta * rho_1 * gamma* gamma / (beta * gamma_1 * gamma_1); eta = -eta * rho_1 * gamma* gamma / (beta * gamma_1 * gamma_1);
@@ -178,7 +178,7 @@ class QuasiMinimalResidual : public OperatorFunction<Field> {
} }
std::cout << "Iteration "<<i<<" resid " << resid<<std::endl; std::cout << "Iteration "<<i<<" resid " << resid<<std::endl;
} }
GRID_ASSERT(0); assert(0);
return; // no convergence return; // no convergence
} }
#else #else

View File

@@ -327,9 +327,9 @@ namespace Grid {
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// src_o = (source_o - Moe MeeInv source_e) // src_o = (source_o - Moe MeeInv source_e)
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even); _Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); GRID_ASSERT( Mtmp.Checkerboard() ==Odd); _Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; GRID_ASSERT( tmp.Checkerboard() ==Odd); tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd);
_Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm. _Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm.
} }
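All of the RedBlackSource/RedBlackSolution variants in this file are instances of a single block elimination. Writing the matrix in even/odd checkerboard blocks,

$$ M = \begin{pmatrix} M_{ee} & M_{eo} \\ M_{oe} & M_{oo} \end{pmatrix}, \qquad \hat M_{oo} = M_{oo} - M_{oe}\, M_{ee}^{-1} M_{eo}, $$

the source preparation computes $b_o' = b_o - M_{oe} M_{ee}^{-1} b_e$ (the MooeeInv/Meooe lines with their checkerboard asserts), the red-black solver inverts the Schur complement $\hat M_{oo}$ on odd sites only, and the reconstruction finishes with $x_e = M_{ee}^{-1}(b_e - M_{eo} x_o)$ before setCheckerboard stitches the two halves back into a full-grid solution.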
@@ -347,17 +347,17 @@ namespace Grid {
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )... // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even); _Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even);
src_e = src_e-tmp; GRID_ASSERT( src_e.Checkerboard() ==Even); src_e = src_e-tmp; assert( src_e.Checkerboard() ==Even);
_Matrix.MooeeInv(src_e,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even); _Matrix.MooeeInv(src_e,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even); setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); GRID_ASSERT( sol_o.Checkerboard() ==Odd ); setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd );
} }
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o) virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
{ {
SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix); SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); GRID_ASSERT(sol_o.Checkerboard()==Odd); this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.Checkerboard()==Odd);
}; };
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o) virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
{ {
@@ -396,13 +396,13 @@ namespace Grid {
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e) // src_o = Mdag * (source_o - Moe MeeInv source_e)
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even); _Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); GRID_ASSERT( Mtmp.Checkerboard() ==Odd); _Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; GRID_ASSERT( tmp.Checkerboard() ==Odd); tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd);
// get the right MpcDag // get the right MpcDag
SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix); SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
_HermOpEO.MpcDag(tmp,src_o); GRID_ASSERT(src_o.Checkerboard() ==Odd); _HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd);
} }
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
@@ -416,17 +416,17 @@ namespace Grid {
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )... // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even); _Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even);
src_e_i = src_e-tmp; GRID_ASSERT( src_e_i.Checkerboard() ==Even); src_e_i = src_e-tmp; assert( src_e_i.Checkerboard() ==Even);
_Matrix.MooeeInv(src_e_i,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even); _Matrix.MooeeInv(src_e_i,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even); setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); GRID_ASSERT( sol_o.Checkerboard() ==Odd ); setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd );
} }
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o) virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
{ {
SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix); SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); GRID_ASSERT(sol_o.Checkerboard()==Odd); this->_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.Checkerboard()==Odd);
}; };
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o) virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
{ {
@@ -461,9 +461,9 @@ namespace Grid {
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e) // src_o = Mdag * (source_o - Moe MeeInv source_e)
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e, tmp); GRID_ASSERT( tmp.Checkerboard() == Even ); _Matrix.MooeeInv(src_e, tmp); assert( tmp.Checkerboard() == Even );
_Matrix.Meooe (tmp, Mtmp); GRID_ASSERT( Mtmp.Checkerboard() == Odd ); _Matrix.Meooe (tmp, Mtmp); assert( Mtmp.Checkerboard() == Odd );
src_o -= Mtmp; GRID_ASSERT( src_o.Checkerboard() == Odd ); src_o -= Mtmp; assert( src_o.Checkerboard() == Odd );
} }
virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol) virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol)
@@ -478,18 +478,18 @@ namespace Grid {
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )... // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
_Matrix.Meooe(sol_o, tmp); GRID_ASSERT( tmp.Checkerboard() == Even ); _Matrix.Meooe(sol_o, tmp); assert( tmp.Checkerboard() == Even );
src_e_i = src_e - tmp; GRID_ASSERT( src_e_i.Checkerboard() == Even ); src_e_i = src_e - tmp; assert( src_e_i.Checkerboard() == Even );
_Matrix.MooeeInv(src_e_i, sol_e); GRID_ASSERT( sol_e.Checkerboard() == Even ); _Matrix.MooeeInv(src_e_i, sol_e); assert( sol_e.Checkerboard() == Even );
setCheckerboard(sol, sol_e); GRID_ASSERT( sol_e.Checkerboard() == Even ); setCheckerboard(sol, sol_e); assert( sol_e.Checkerboard() == Even );
setCheckerboard(sol, sol_o); GRID_ASSERT( sol_o.Checkerboard() == Odd ); setCheckerboard(sol, sol_o); assert( sol_o.Checkerboard() == Odd );
} }
virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o) virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o)
{ {
NonHermitianSchurDiagMooeeOperator<Matrix,Field> _OpEO(_Matrix); NonHermitianSchurDiagMooeeOperator<Matrix,Field> _OpEO(_Matrix);
this->_HermitianRBSolver(_OpEO, src_o, sol_o); GRID_ASSERT(sol_o.Checkerboard() == Odd); this->_HermitianRBSolver(_OpEO, src_o, sol_o); assert(sol_o.Checkerboard() == Odd);
} }
virtual void RedBlackSolve(Matrix& _Matrix, const std::vector<Field>& src_o, std::vector<Field>& sol_o) virtual void RedBlackSolve(Matrix& _Matrix, const std::vector<Field>& src_o, std::vector<Field>& sol_o)
@@ -539,13 +539,13 @@ namespace Grid {
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// src_o = Mpcdag *MooeeInv * (source_o - Moe MeeInv source_e) // src_o = Mpcdag *MooeeInv * (source_o - Moe MeeInv source_e)
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even); _Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); GRID_ASSERT( Mtmp.Checkerboard() ==Odd); _Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
Mtmp=src_o-Mtmp; Mtmp=src_o-Mtmp;
_Matrix.MooeeInv(Mtmp,tmp); GRID_ASSERT( tmp.Checkerboard() ==Odd); _Matrix.MooeeInv(Mtmp,tmp); assert( tmp.Checkerboard() ==Odd);
// get the right MpcDag // get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); GRID_ASSERT(src_o.Checkerboard() ==Odd); _HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd);
} }
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
@@ -560,12 +560,12 @@ namespace Grid {
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )... // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even); _Matrix.Meooe(sol_o,tmp); assert( tmp.Checkerboard() ==Even);
tmp = src_e-tmp; GRID_ASSERT( src_e.Checkerboard() ==Even); tmp = src_e-tmp; assert( src_e.Checkerboard() ==Even);
_Matrix.MooeeInv(tmp,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even); _Matrix.MooeeInv(tmp,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even); setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); GRID_ASSERT( sol_o.Checkerboard() ==Odd ); setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd );
}; };
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o) virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
@@ -612,12 +612,12 @@ namespace Grid {
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e) // src_o = Mdag * (source_o - Moe MeeInv source_e)
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even); _Matrix.MooeeInv(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.Meooe (tmp,Mtmp); GRID_ASSERT( Mtmp.Checkerboard() ==Odd); _Matrix.Meooe (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; GRID_ASSERT( tmp.Checkerboard() ==Odd); tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd);
// get the right MpcDag // get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); GRID_ASSERT(src_o.Checkerboard() ==Odd); _HermOpEO.MpcDag(tmp,src_o); assert(src_o.Checkerboard() ==Odd);
} }
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
@@ -638,12 +638,12 @@ namespace Grid {
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )... // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
_Matrix.Meooe(sol_o_i,tmp); GRID_ASSERT( tmp.Checkerboard() ==Even); _Matrix.Meooe(sol_o_i,tmp); assert( tmp.Checkerboard() ==Even);
tmp = src_e-tmp; GRID_ASSERT( src_e.Checkerboard() ==Even); tmp = src_e-tmp; assert( src_e.Checkerboard() ==Even);
_Matrix.MooeeInv(tmp,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even); _Matrix.MooeeInv(tmp,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_e); GRID_ASSERT( sol_e.Checkerboard() ==Even); setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o_i); GRID_ASSERT( sol_o_i.Checkerboard() ==Odd ); setCheckerboard(sol,sol_o_i); assert( sol_o_i.Checkerboard() ==Odd );
}; };
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o) virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
@@ -684,9 +684,9 @@ namespace Grid {
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e) // src_o = Mdag * (source_o - Moe MeeInv source_e)
///////////////////////////////////////////////////// /////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e, tmp); GRID_ASSERT( tmp.Checkerboard() == Even ); _Matrix.MooeeInv(src_e, tmp); assert( tmp.Checkerboard() == Even );
_Matrix.Meooe (tmp, Mtmp); GRID_ASSERT( Mtmp.Checkerboard() == Odd ); _Matrix.Meooe (tmp, Mtmp); assert( Mtmp.Checkerboard() == Odd );
src_o -= Mtmp; GRID_ASSERT( src_o.Checkerboard() == Odd ); src_o -= Mtmp; assert( src_o.Checkerboard() == Odd );
} }
virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol) virtual void RedBlackSolution(Matrix& _Matrix, const Field& sol_o, const Field& src_e, Field& sol)
@@ -707,12 +707,12 @@ namespace Grid {
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )... // sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
_Matrix.Meooe(sol_o_i, tmp); GRID_ASSERT( tmp.Checkerboard() == Even ); _Matrix.Meooe(sol_o_i, tmp); assert( tmp.Checkerboard() == Even );
tmp = src_e - tmp; GRID_ASSERT( src_e.Checkerboard() == Even ); tmp = src_e - tmp; assert( src_e.Checkerboard() == Even );
_Matrix.MooeeInv(tmp, sol_e); GRID_ASSERT( sol_e.Checkerboard() == Even ); _Matrix.MooeeInv(tmp, sol_e); assert( sol_e.Checkerboard() == Even );
setCheckerboard(sol, sol_e); GRID_ASSERT( sol_e.Checkerboard() == Even ); setCheckerboard(sol, sol_e); assert( sol_e.Checkerboard() == Even );
setCheckerboard(sol, sol_o_i); GRID_ASSERT( sol_o_i.Checkerboard() == Odd ); setCheckerboard(sol, sol_o_i); assert( sol_o_i.Checkerboard() == Odd );
}; };
virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o) virtual void RedBlackSolve(Matrix& _Matrix, const Field& src_o, Field& sol_o)

View File

@@ -30,8 +30,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */ /* END LEGAL */
#pragma once #pragma once
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
inline RealD AggregatePowerLaw(RealD x) inline RealD AggregatePowerLaw(RealD x)
@@ -97,7 +95,7 @@ public:
RealD scale; RealD scale;
ConjugateGradient<FineField> CG(1.0e-3,400,false); ConjugateGradient<FineField> CG(1.0e-2,100,false);
FineField noise(FineGrid); FineField noise(FineGrid);
FineField Mn(FineGrid); FineField Mn(FineGrid);
@@ -110,7 +108,7 @@ public:
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl; hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
for(int i=0;i<4;i++){ for(int i=0;i<1;i++){
CG(hermop,noise,subspace[b]); CG(hermop,noise,subspace[b]);
@@ -126,53 +124,6 @@ public:
} }
} }
virtual void CreateSubspaceGCR(GridParallelRNG &RNG,LinearOperatorBase<FineField> &DiracOp,int nn=nbasis)
{
RealD scale;
TrivialPrecon<FineField> simple_fine;
PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,12,12);
FineField noise(FineGrid);
FineField src(FineGrid);
FineField guess(FineGrid);
FineField Mn(FineGrid);
for(int b=0;b<nn;b++){
subspace[b] = Zero();
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;
for(int i=0;i<2;i++){
// void operator() (const Field &src, Field &psi){
#if 1
std::cout << GridLogMessage << " inverting on noise "<<std::endl;
src = noise;
guess=Zero();
GCR(src,guess);
subspace[b] = guess;
#else
std::cout << GridLogMessage << " inverting on zero "<<std::endl;
src=Zero();
guess = noise;
GCR(src,guess);
subspace[b] = guess;
#endif
noise = subspace[b];
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
}
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<std::endl;
subspace[b] = noise;
}
}
//////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit) // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
// and this is the best I found // and this is the best I found
@@ -209,21 +160,14 @@ public:
int b =0; int b =0;
{ {
ComplexD ip;
// Filter // Filter
Chebyshev<FineField> Cheb(lo,hi,orderfilter); Chebyshev<FineField> Cheb(lo,hi,orderfilter);
Cheb(hermop,noise,Mn); Cheb(hermop,noise,Mn);
// normalise // normalise
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn; subspace[b] = Mn;
hermop.Op(Mn,tmp); hermop.Op(Mn,tmp);
ip= innerProduct(Mn,tmp); std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
hermop.AdjOp(Mn,tmp);
ip = innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
b++; b++;
} }
@@ -269,18 +213,8 @@ public:
Mn=*Tnp; Mn=*Tnp;
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale; scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn; subspace[b] = Mn;
ComplexD ip;
hermop.Op(Mn,tmp); hermop.Op(Mn,tmp);
ip= innerProduct(Mn,tmp); std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
hermop.AdjOp(Mn,tmp);
ip = innerProduct(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
b++; b++;
} }
@@ -292,72 +226,8 @@ public:
} }
} }
GRID_ASSERT(b==nn); assert(b==nn);
} }
virtual void CreateSubspacePolyCheby(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo1,
int orderfilter,
double lo2,
int orderstep)
{
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
std::cout << GridLogMessage<<" CreateSubspacePolyCheby "<<std::endl;
// Initial matrix element
hermop.Op(noise,Mn);
std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
int b =0;
{
// Filter
std::cout << GridLogMessage << "Cheby "<<lo1<<","<<hi<<" "<<orderfilter<<std::endl;
Chebyshev<FineField> Cheb(lo1,hi,orderfilter);
Cheb(hermop,noise,Mn);
// normalise
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|n> "<<norm2(Mn)<<std::endl;
}
// Generate a full sequence of Chebyshevs
for(int n=1;n<nn;n++){
std::cout << GridLogMessage << "Cheby "<<lo2<<","<<hi<<" "<<orderstep<<std::endl;
Chebyshev<FineField> Cheb(lo2,hi,orderstep);
Cheb(hermop,subspace[n-1],Mn);
for(int m=0;m<n;m++){
ComplexD c = innerProduct(subspace[m],Mn);
Mn = Mn - c*subspace[m];
}
// normalise
scale = std::pow(norm2(Mn),-0.5);
Mn=Mn*scale;
subspace[n]=Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<n<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
std::cout<<GridLogMessage << "filt ["<<n<<"] <n|n> "<<norm2(Mn)<<std::endl;
}
}
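Both the surviving CreateSubspaceChebyshev path and the deleted variants rest on the same filtering idea: applying a Chebyshev polynomial of the operator to random noise suppresses the high end of the spectrum so the near-null space dominates the resulting basis vector. The polynomials come from the three-term recurrence on the spectrum mapped into $[-1,1]$,

$$ T_0(x) = 1, \quad T_1(x) = x, \quad T_{n+1}(x) = 2x\, T_n(x) - T_{n-1}(x), \qquad x = \frac{2H - (hi + lo)}{hi - lo}, $$

which is why the code needs only repeated hermop.Op applications plus vector updates (the *Tnp buffers) to advance from one filtered vector to the next.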
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop, virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn, int nn,
double hi, double hi,

View File

@@ -99,7 +99,7 @@ public:
   CoarseMatrix AselfInvEven;
   CoarseMatrix AselfInvOdd;

-  deviceVector<RealD> dag_factor;
+  Vector<RealD> dag_factor;

   ///////////////////////
   // Interface
@@ -124,13 +124,9 @@ public:
     int npoint = geom.npoint;
     typedef LatticeView<Cobj> Aview;

-    deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
-    hostVector<Aview>  hAcceleratorViewContainer(geom.npoint);
-    for(int p=0;p<geom.npoint;p++) {
-      hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
-      acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
-    }
+    Vector<Aview> AcceleratorViewContainer;
+    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
     Aview *Aview_p = & AcceleratorViewContainer[0];

     const int Nsimd = CComplex::Nsimd();
@@ -165,7 +161,7 @@ public:
       coalescedWrite(out_v[ss](b),res);
     });

-    for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
+    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
   };

   void Mdag (const CoarseVector &in, CoarseVector &out)
@@ -194,14 +190,9 @@ public:
     int npoint = geom.npoint;
     typedef LatticeView<Cobj> Aview;

-    deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
-    hostVector<Aview>  hAcceleratorViewContainer(geom.npoint);
-    for(int p=0;p<geom.npoint;p++) {
-      hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
-      acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
-    }
+    Vector<Aview> AcceleratorViewContainer;
+    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
     Aview *Aview_p = & AcceleratorViewContainer[0];

     const int Nsimd = CComplex::Nsimd();
@@ -210,10 +201,10 @@ public:
     int osites=Grid()->oSites();

-    deviceVector<int> points(geom.npoint);
-    for(int p=0; p<geom.npoint; p++) {
-      acceleratorPut(points[p],geom.points_dagger[p]);
-    }
+    Vector<int> points(geom.npoint, 0);
+    for(int p=0; p<geom.npoint; p++)
+      points[p] = geom.points_dagger[p];
     auto points_p = &points[0];

     RealD* dag_factor_p = &dag_factor[0];
@@ -245,7 +236,7 @@ public:
       coalescedWrite(out_v[ss](b),res);
     });

-    for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
+    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
   }

   void MdirComms(const CoarseVector &in)
@@ -260,14 +251,8 @@ public:
     out.Checkerboard() = in.Checkerboard();
     typedef LatticeView<Cobj> Aview;

-    deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
-    hostVector<Aview>  hAcceleratorViewContainer(geom.npoint);
-    for(int p=0;p<geom.npoint;p++) {
-      hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
-      acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
-    }
+    Vector<Aview> AcceleratorViewContainer;
+    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
     Aview *Aview_p = & AcceleratorViewContainer[0];

     autoView( out_v , out, AcceleratorWrite);
@@ -300,7 +285,7 @@ public:
       }
       coalescedWrite(out_v[ss](b),res);
     });
-    for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
+    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
   }

   void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
   {
@@ -309,7 +294,7 @@ public:
     if ((out.size()!=ndir)&&(out.size()!=ndir+1)) {
       std::cout <<"MdirAll out size "<< out.size()<<std::endl;
       std::cout <<"MdirAll ndir "<< ndir<<std::endl;
-      GRID_ASSERT(0);
+      assert(0);
     }
     for(int p=0;p<ndir;p++){
       MdirCalc(in,out[p],p);
@@ -373,7 +358,7 @@ public:
     conformable(in.Grid(), _cbgrid);    // verifies half grid
     conformable(in.Grid(), out.Grid()); // drops the cb check

-    GRID_ASSERT(in.Checkerboard() == Even);
+    assert(in.Checkerboard() == Even);
     out.Checkerboard() = Odd;

     DhopInternal(StencilEven, Aodd, in, out, dag);
@@ -383,7 +368,7 @@ public:
     conformable(in.Grid(), _cbgrid);    // verifies half grid
     conformable(in.Grid(), out.Grid()); // drops the cb check

-    GRID_ASSERT(in.Checkerboard() == Odd);
+    assert(in.Checkerboard() == Odd);
     out.Checkerboard() = Even;

     DhopInternal(StencilOdd, Aeven, in, out, dag);
@@ -391,7 +376,7 @@ public:
   void MooeeInternal(const CoarseVector &in, CoarseVector &out, int dag, int inv) {
     out.Checkerboard() = in.Checkerboard();
-    GRID_ASSERT(in.Checkerboard() == Odd || in.Checkerboard() == Even);
+    assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);

     CoarseMatrix *Aself = nullptr;
     if(in.Grid()->_isCheckerBoarded) {
@@ -406,7 +391,7 @@ public:
       Aself = (inv) ? &AselfInv : &A[geom.npoint-1];
       DselfInternal(Stencil, *Aself, in, out, dag);
     }
-    GRID_ASSERT(Aself != nullptr);
+    assert(Aself != nullptr);
   }

   void DselfInternal(CartesianStencil<siteVector,siteVector,DefaultImplParams> &st, CoarseMatrix &a,
@@ -484,20 +469,14 @@ public:
     // determine in what order we need the points
     int npoint = geom.npoint-1;
-    deviceVector<int> points(npoint);
-    for(int p=0; p<npoint; p++) {
-      int val = (dag && !hermitian) ? geom.points_dagger[p] : p;
-      acceleratorPut(points[p], val);
-    }
+    Vector<int> points(npoint, 0);
+    for(int p=0; p<npoint; p++)
+      points[p] = (dag && !hermitian) ? geom.points_dagger[p] : p;
     auto points_p = &points[0];

-    deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
-    hostVector<Aview>  hAcceleratorViewContainer(geom.npoint);
-    for(int p=0;p<geom.npoint;p++) {
-      hAcceleratorViewContainer[p] = a[p].View(AcceleratorRead);
-      acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
-    }
+    Vector<Aview> AcceleratorViewContainer;
+    for(int p=0;p<npoint;p++) AcceleratorViewContainer.push_back(a[p].View(AcceleratorRead));
     Aview *Aview_p = & AcceleratorViewContainer[0];

     const int Nsimd = CComplex::Nsimd();
@@ -560,7 +539,7 @@ public:
       });
     }

-    for(int p=0;p<npoint;p++) hAcceleratorViewContainer[p].ViewClose();
+    for(int p=0;p<npoint;p++) AcceleratorViewContainer[p].ViewClose();
   }

   CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) :
@@ -611,13 +590,11 @@ public:
     }

     // GPU readable prefactor
-    std::vector<RealD> h_dag_factor(nbasis*nbasis);
     thread_for(i, nbasis*nbasis, {
       int j = i/nbasis;
       int k = i%nbasis;
-      h_dag_factor[i] = dag_factor_eigen(j, k);
+      dag_factor[i] = dag_factor_eigen(j, k);
     });
-    acceleratorCopyToDevice(&h_dag_factor[0],&dag_factor[0],dag_factor.size()*sizeof(RealD));
   }

   void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
@@ -697,7 +674,7 @@ public:
     evenmask = where(mod(bcb,2)==(Integer)0,one,zero);
     oddmask  = one-evenmask;

-    GRID_ASSERT(self_stencil!=-1);
+    assert(self_stencil!=-1);

     for(int i=0;i<nbasis;i++){
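The container churn above is the whole story of this file's diff: the deviceVector side stages an array of lattice views on the host and mirrors it into device-resident memory element by element, because a raw pointer into a deviceVector is what the accelerator kernels can capture. A minimal sketch of that idiom, using the names from the hunks above (A, geom.npoint and the kernel body are stand-ins, not a complete routine):

    typedef LatticeView<Cobj> Aview;
    deviceVector<Aview> d_views(geom.npoint);   // lives in device memory
    hostVector<Aview>   h_views(geom.npoint);   // host staging copies
    for(int p=0;p<geom.npoint;p++){
      h_views[p] = A[p].View(AcceleratorRead);  // open the view on the host
      acceleratorPut(d_views[p],h_views[p]);    // copy one element host -> device
    }
    Aview *Aview_p = &d_views[0];               // raw device pointer, capturable in kernels
    // ... accelerator_for kernels dereference Aview_p[p] ...
    for(int p=0;p<geom.npoint;p++) h_views[p].ViewClose(); // close via the host copies

Note that the close goes through the host copies: the device array only holds bitwise copies and is never individually closed.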

==== next file ====

@@ -99,7 +99,7 @@ public:
         }
       }
     }
-    GRID_ASSERT(nfound==geom.npoint);
+    assert(nfound==geom.npoint);
     ExchangeCoarseLinks();
   }
   */
@@ -124,7 +124,7 @@ public:
   }
   void Mdag (const CoarseVector &in, CoarseVector &out)
   {
-    GRID_ASSERT(hermitian);
+    assert(hermitian);
     Mult(_A,in,out);
     // if ( hermitian ) M(in,out);
     // else Mult(_Adag,in,out);
@@ -441,20 +441,8 @@ public:
     std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
   }
 #else
-  //////////////////////////////////////////////////////////////////////
-  // Galerkin projection of matrix
-  //////////////////////////////////////////////////////////////////////
   void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
                        Aggregation<Fobj,CComplex,nbasis> & Subspace)
-  {
-    CoarsenOperator(linop,Subspace,Subspace);
-  }
-  //////////////////////////////////////////////////////////////////////
-  // Petrov - Galerkin projection of matrix
-  //////////////////////////////////////////////////////////////////////
-  void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
-                       Aggregation<Fobj,CComplex,nbasis> & U,
-                       Aggregation<Fobj,CComplex,nbasis> & V)
   {
     std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
     GridBase *grid = FineGrid();
@@ -470,9 +458,11 @@ public:
     // Orthogonalise the subblocks over the basis
     /////////////////////////////////////////////////////////////
     CoarseScalar InnerProd(CoarseGrid());
-    blockOrthogonalise(InnerProd,V.subspace);
-    blockOrthogonalise(InnerProd,U.subspace);
+    blockOrthogonalise(InnerProd,Subspace.subspace);
+    //  for(int s=0;s<Subspace.subspace.size();s++){
+    //    std::cout << " subspace norm "<<norm2(Subspace.subspace[s])<<std::endl;
+    //  }
     const int npoint = geom.npoint;

     Coordinate clatt = CoarseGrid()->GlobalDimensions();
@@ -552,7 +542,7 @@ public:
       std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
       for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
        tphaseBZ-=usecond();
-       phaV = phaF[p]*V.subspace[i];
+       phaV = phaF[p]*Subspace.subspace[i];
        tphaseBZ+=usecond();

        /////////////////////////////////////////////////////////////////////
@@ -565,7 +555,7 @@ public:
        //      std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;
        tproj-=usecond();
-       blockProject(coarseInner,MphaV,U.subspace);
+       blockProject(coarseInner,MphaV,Subspace.subspace);
        coarseInner = conjugate(pha[p]) * coarseInner;

        ComputeProj[p] = coarseInner;
@@ -619,7 +609,7 @@ public:
       //      _Adag[p]= Cell.ExchangePeriodic(_Adag[p]);
     }
   }
-  virtual  void Mdiag    (const Field &in, Field &out){ GRID_ASSERT(0);};
+  virtual  void Mdiag    (const Field &in, Field &out){ assert(0);};
   virtual  void Mdir     (const Field &in, Field &out,int dir, int disp){assert(0);};
   virtual  void MdirAll  (const Field &in, std::vector<Field> &out){assert(0);};
 };
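The overloads removed above separate Galerkin from Petrov-Galerkin coarsening. With restriction built from a left aggregation basis U = {u_i} and prolongation from a right basis V = {v_j}, the coarse link at stencil point p is, schematically,

    A^c_ij(p) = < u_i , M v_j >_p        (Galerkin case: u_k = v_k)

so the surviving single-basis CoarsenOperator(linop,Subspace) is the two-basis form evaluated at U = V, which is exactly what the removed one-line wrapper did. The body reflects the same substitution: the momentum phases multiply V.subspace and blockProject projects onto U.subspace in the two-basis version, and both collapse to Subspace.subspace here.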

==== next file ====

@@ -80,12 +80,12 @@ public:
   // Can be used to do I/O on the operator matrices externally
   void SetMatrix (int p,CoarseMatrix & A)
   {
-    GRID_ASSERT(A.size()==geom_srhs.npoint);
+    assert(A.size()==geom_srhs.npoint);
     GridtoBLAS(A[p],BLAS_A[p]);
   }
   void GetMatrix (int p,CoarseMatrix & A)
   {
-    GRID_ASSERT(A.size()==geom_srhs.npoint);
+    assert(A.size()==geom_srhs.npoint);
     BLAStoGrid(A[p],BLAS_A[p]);
   }
   void CopyMatrix (GeneralCoarseOp &_Op)
@@ -178,14 +178,14 @@ public:
        for(int32_t point = 0 ; point < geom.npoint; point++){
          int i=s*orhs*geom.npoint+point;
          int32_t nbr = Stencil._entries[i]._offset*CComplex::Nsimd(); // oSite -> lSite
-         GRID_ASSERT(nbr<BLAS_B.size());
+         assert(nbr<BLAS_B.size());
          ComplexD * ptr = (ComplexD *)&BLAS_B[nbr];
          acceleratorPut(BLAS_BP[point][j],ptr); // neighbour indexing in ghost zone volume
        }
        j++;
       }
     }
-    GRID_ASSERT(j==unpadded_sites);
+    assert(j==unpadded_sites);
   }
   template<class vobj> void GridtoBLAS(const Lattice<vobj> &from,deviceVector<typename vobj::scalar_object> &to)
   {
@@ -194,7 +194,7 @@ public:
     typedef typename vobj::vector_type vector_type;

     GridBase *Fg = from.Grid();
-    GRID_ASSERT(!Fg->_isCheckerBoarded);
+    assert(!Fg->_isCheckerBoarded);
     int nd = Fg->_ndimension;

     to.resize(Fg->lSites());
@@ -241,10 +241,10 @@ public:
     typedef typename vobj::vector_type vector_type;

     GridBase *Tg = grid.Grid();
-    GRID_ASSERT(!Tg->_isCheckerBoarded);
+    assert(!Tg->_isCheckerBoarded);
     int nd = Tg->_ndimension;

-    GRID_ASSERT(in.size()==Tg->lSites());
+    assert(in.size()==Tg->lSites());
     Coordinate LocalLatt = Tg->LocalDimensions();
     size_t nsite = 1;
@@ -669,7 +669,7 @@ Grid : Message : 328.193436 s : CoarsenOperator mat  122213270 us
     const int Nsimd = CComplex::Nsimd();

     int64_t nrhs  =pin.Grid()->GlobalDimensions()[0];
-    GRID_ASSERT(nrhs>=1);
+    assert(nrhs>=1);

     RealD flops,bytes;
     int64_t osites=in.Grid()->oSites(); // unpadded
@@ -721,7 +721,7 @@ Grid : Message : 328.193436 s : CoarsenOperator mat  122213270 us
     //    std::cout << GridLogMessage<<"Coarse overall flops/s "<< flops/t_tot<<" mflop/s"<<std::endl;
     //    std::cout << GridLogMessage<<"Coarse total bytes   "<< bytes/1e6<<" MB"<<std::endl;
   };
-  virtual  void Mdiag    (const Field &in, Field &out){ GRID_ASSERT(0);};
+  virtual  void Mdiag    (const Field &in, Field &out){ assert(0);};
   virtual  void Mdir     (const Field &in, Field &out,int dir, int disp){assert(0);};
   virtual  void MdirAll  (const Field &in, std::vector<Field> &out){assert(0);};
 };

==== next file ====

@@ -67,8 +67,8 @@ public:
   }
   int point(int dir, int disp) {
-    GRID_ASSERT(disp == -1 || disp == 0 || disp == 1);
-    GRID_ASSERT(base+0 <= dir && dir < base+4);
+    assert(disp == -1 || disp == 0 || disp == 1);
+    assert(base+0 <= dir && dir < base+4);

     // directions faster index = new indexing
     // 4d (base = 0):
@@ -131,7 +131,7 @@ public:
        return p;
       }
     }
-    GRID_ASSERT(0);
+    assert(0);
     return -1;
   }
   void BuildShifts(void)

==== next file ====

@@ -57,7 +57,7 @@ public:
     if ( (_Tp*)ptr == (_Tp *) NULL ) {
       printf("Grid CPU Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
     }
-    GRID_ASSERT( ( (_Tp*)ptr != (_Tp *)NULL ) );
+    assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
     return ptr;
   }
@@ -69,7 +69,7 @@ public:
   }

   // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
-  void construct(pointer __p, const _Tp& __val) { };
+  void construct(pointer __p, const _Tp& __val) { assert(0);};
   void construct(pointer __p) { };
   void destroy(pointer __p) { };
 };
@@ -106,7 +106,7 @@ public:
     if ( (_Tp*)ptr == (_Tp *) NULL ) {
       printf("Grid Shared Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
     }
-    GRID_ASSERT( ( (_Tp*)ptr != (_Tp *)NULL ) );
+    assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
     return ptr;
   }
@@ -154,7 +154,7 @@ public:
     if ( (_Tp*)ptr == (_Tp *) NULL ) {
       printf("Grid Device Allocator got NULL for %lu bytes\n",(unsigned long) bytes );
     }
-    GRID_ASSERT( ( (_Tp*)ptr != (_Tp *)NULL ) );
+    assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
     return ptr;
   }
@@ -174,10 +174,19 @@ template<typename _Tp>  inline bool operator!=(const devAllocator<_Tp>&, const d
 ////////////////////////////////////////////////////////////////////////////////
 // Template typedefs
 ////////////////////////////////////////////////////////////////////////////////
-template<class T> using hostVector   = std::vector<T,alignedAllocator<T> >; // Needs autoview
-template<class T> using Vector       = std::vector<T,uvmAllocator<T> >;     // Really want to deprecate
-template<class T> using uvmVector    = std::vector<T,uvmAllocator<T> >;     // auto migrating page
-template<class T> using deviceVector = std::vector<T,devAllocator<T> >;     // device vector
+#ifdef ACCELERATOR_CSHIFT
+// Cshift on device
+template<class T> using cshiftAllocator = devAllocator<T>;
+#else
+// Cshift on host
+template<class T> using cshiftAllocator = std::allocator<T>;
+#endif
+template<class T> using Vector        = std::vector<T,uvmAllocator<T> >;
+template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;
+template<class T> using commVector    = std::vector<T,devAllocator<T> >;
+template<class T> using deviceVector  = std::vector<T,devAllocator<T> >;
+template<class T> using cshiftVector  = std::vector<T,cshiftAllocator<T> >;

 /*
 template<class T> class vecView
@@ -188,9 +197,8 @@ template<class T> class vecView
   ViewMode mode;
   void * cpu_ptr;
  public:
-  // Rvalue accessor
   accelerator_inline T & operator[](size_t i) const { return this->data[i]; };
-  vecView(Vector<T> &refer_to_me,ViewMode _mode)
+  vecView(std::vector<T> &refer_to_me,ViewMode _mode)
   {
     cpu_ptr = &refer_to_me[0];
     size    = refer_to_me.size();
@@ -206,12 +214,22 @@ template<class T> class vecView
   }
 };

-template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode)
+template<class T> vecView<T> VectorView(std::vector<T> &vec,ViewMode _mode)
 {
   vecView<T> ret(vec,_mode); // does the open
   return ret;                // must be closed
 }

+// Little autoscope assister
+template<class View>
+class VectorViewCloser
+{
+  View v;  // Take a copy of view and call view close when I go out of scope automatically
+ public:
+  VectorViewCloser(View &_v) : v(_v) {};
+  ~VectorViewCloser() { auto ptr = v.cpu_ptr; v.ViewClose(); MemoryManager::NotifyDeletion(ptr);}
+};
+
 #define autoVecView(v_v,v,mode)                         \
   auto v_v = VectorView(v,mode);                        \
   ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
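VectorViewCloser added on the plus side is the same scope-guard trick the autoVecView/ViewCloser macro already relies on: take a copy of the opened view and let the destructor close it on every exit path. A stripped-down sketch of the idea (generic, not the Grid class itself):

    template<class View>
    struct ScopedViewCloser {
      View &v;
      explicit ScopedViewCloser(View &v_) : v(v_) {}
      ~ScopedViewCloser() { v.ViewClose(); }  // runs on return, break, or exception
    };

    // usage mirrors the autoVecView macro:
    //   auto v_v = VectorView(vec,mode);
    //   ScopedViewCloser<decltype(v_v)> guard(v_v);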

==== next file ====

@@ -292,7 +292,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
 void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes)
 {
 #ifdef GRID_OMP
-  GRID_ASSERT(omp_in_parallel()==0);
+  assert(omp_in_parallel()==0);
 #endif

   if (ncache == 0) return ptr;
@@ -345,7 +345,7 @@ void *MemoryManager::Lookup(size_t bytes,int type)
 void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes)
 {
 #ifdef GRID_OMP
-  GRID_ASSERT(omp_in_parallel()==0);
+  assert(omp_in_parallel()==0);
 #endif
   for(int e=0;e<ncache;e++){
     if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
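Both routines guard a simple fixed-size free-list: Lookup scans for a cached block of exactly the requested size and hands it back, while Insert parks a freed block, evicting a round-robin victim when the slots are full. A sketch of the Lookup half, assuming the entry layout implied above (valid flag, pointer, byte count):

    struct Entry { void *ptr; size_t bytes; int valid; };

    void *CacheLookup(size_t bytes, Entry *entries, int ncache)
    {
      for(int e=0;e<ncache;e++){
        if ( entries[e].valid && (entries[e].bytes == bytes) ) {
          entries[e].valid = 0;   // slot handed back to the caller
          return entries[e].ptr;
        }
      }
      return NULL;                // miss: caller allocates afresh
    }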

==== next file ====

@@ -1,15 +1,16 @@
 #include <Grid/GridCore.h>
 #ifndef GRID_UVM

+#warning "Using explicit device memory copies"
 NAMESPACE_BEGIN(Grid);

 #define MAXLINE 512
 static char print_buffer [ MAXLINE ];
-#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl;
-#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer << std::endl;
+#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
+#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer;
 //#define dprintf(...)
-//#define mprintf(...)

 ////////////////////////////////////////////////////////////
 // For caching copies of data on device
@@ -50,12 +51,12 @@ int MemoryManager::EntryPresent(uint64_t CpuPtr)
 {
   if(AccViewTable.empty()) return 0;

-  auto count = AccViewTable.count(CpuPtr);  GRID_ASSERT((count==0)||(count==1));
+  auto count = AccViewTable.count(CpuPtr);  assert((count==0)||(count==1));
   return count;
 }
 void  MemoryManager::EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
 {
-  GRID_ASSERT(!EntryPresent(CpuPtr));
+  assert(!EntryPresent(CpuPtr));
   AcceleratorViewEntry AccCache;
   AccCache.CpuPtr = CpuPtr;
   AccCache.AccPtr = (uint64_t)NULL;
@@ -69,9 +70,9 @@ void  MemoryManager::EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,View
 }
 MemoryManager::AccViewTableIterator MemoryManager::EntryLookup(uint64_t CpuPtr)
 {
-  GRID_ASSERT(EntryPresent(CpuPtr));
+  assert(EntryPresent(CpuPtr));
   auto AccCacheIterator = AccViewTable.find(CpuPtr);
-  GRID_ASSERT(AccCacheIterator!=AccViewTable.end());
+  assert(AccCacheIterator!=AccViewTable.end());
   return AccCacheIterator;
 }
 void MemoryManager::EntryErase(uint64_t CpuPtr)
@@ -81,7 +82,7 @@ void MemoryManager::EntryErase(uint64_t CpuPtr)
 }
 void  MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache)
 {
-  GRID_ASSERT(AccCache.LRU_valid==0);
+  assert(AccCache.LRU_valid==0);
   if (AccCache.transient) {
     LRU.push_back(AccCache.CpuPtr);
     AccCache.LRU_entry = --LRU.end();
@@ -94,7 +95,7 @@ void  MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache)
 }
 void  MemoryManager::LRUremove(AcceleratorViewEntry &AccCache)
 {
-  GRID_ASSERT(AccCache.LRU_valid==1);
+  assert(AccCache.LRU_valid==1);
   LRU.erase(AccCache.LRU_entry);
   AccCache.LRU_valid = 0;
   DeviceLRUBytes-=AccCache.bytes;
@@ -108,19 +109,19 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
   // Remove from Accelerator, remove entry, without flush
   // Cannot be locked. If allocated Must be in LRU pool.
   ///////////////////////////////////////////////////////////
-  GRID_ASSERT(AccCache.state!=Empty);
+  assert(AccCache.state!=Empty);

-  dprintf("MemoryManager: Discard(%lx) %lx",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
-  GRID_ASSERT(AccCache.accLock==0);
-  GRID_ASSERT(AccCache.cpuLock==0);
-  GRID_ASSERT(AccCache.CpuPtr!=(uint64_t)NULL);
+  dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
+  assert(AccCache.accLock==0);
+  assert(AccCache.cpuLock==0);
+  assert(AccCache.CpuPtr!=(uint64_t)NULL);
   if(AccCache.AccPtr) {
     AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
     DeviceDestroy++;
     DeviceBytes   -=AccCache.bytes;
     LRUremove(AccCache);
     AccCache.AccPtr=(uint64_t) NULL;
-    dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
+    dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
   }
   uint64_t CpuPtr = AccCache.CpuPtr;
   EntryErase(CpuPtr);
@@ -138,9 +139,9 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
   // Take these OUT LRU queue when CPU locked?
   // Cannot take out the table as cpuLock data is important.
   ///////////////////////////////////////////////////////////////////////////
-  GRID_ASSERT(AccCache.state!=Empty);
+  assert(AccCache.state!=Empty);

-  mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld",
+  mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld\n",
          (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
          (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock);
   if (AccCache.accLock!=0) return;
@@ -154,7 +155,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
     AccCache.AccPtr=(uint64_t)NULL;
     AccCache.state=CpuDirty; // CPU primary now
     DeviceBytes   -=AccCache.bytes;
-    dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld ",(uint64_t)AccCache.AccPtr,DeviceBytes);
+    dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
   }
   //  uint64_t CpuPtr = AccCache.CpuPtr;
   DeviceEvictions++;
@@ -162,30 +163,28 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
 }
 void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
 {
-  GRID_ASSERT(AccCache.state==AccDirty);
-  GRID_ASSERT(AccCache.cpuLock==0);
-  GRID_ASSERT(AccCache.accLock==0);
-  GRID_ASSERT(AccCache.AccPtr!=(uint64_t)NULL);
-  GRID_ASSERT(AccCache.CpuPtr!=(uint64_t)NULL);
+  assert(AccCache.state==AccDirty);
+  assert(AccCache.cpuLock==0);
+  assert(AccCache.accLock==0);
+  assert(AccCache.AccPtr!=(uint64_t)NULL);
+  assert(AccCache.CpuPtr!=(uint64_t)NULL);
   acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
-  mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: acceleratorCopyFromDevice Flush AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
   DeviceToHostBytes+=AccCache.bytes;
   DeviceToHostXfer++;
   AccCache.state=Consistent;
 }
 void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
 {
-  GRID_ASSERT(AccCache.state==CpuDirty);
-  GRID_ASSERT(AccCache.cpuLock==0);
-  GRID_ASSERT(AccCache.accLock==0);
-  GRID_ASSERT(AccCache.CpuPtr!=(uint64_t)NULL);
+  assert(AccCache.state==CpuDirty);
+  assert(AccCache.cpuLock==0);
+  assert(AccCache.accLock==0);
+  assert(AccCache.CpuPtr!=(uint64_t)NULL);
   if(AccCache.AccPtr==(uint64_t)NULL){
     AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
     DeviceBytes+=AccCache.bytes;
   }
-  mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx",
-         (uint64_t)AccCache.bytes,
-         (uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: acceleratorCopyToDevice Clone AccPtr %lx <- CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
   acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
   HostToDeviceBytes+=AccCache.bytes;
   HostToDeviceXfer++;
@@ -194,10 +193,10 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
 void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
 {
-  GRID_ASSERT(AccCache.state!=Empty);
-  GRID_ASSERT(AccCache.cpuLock==0);
-  GRID_ASSERT(AccCache.accLock==0);
-  GRID_ASSERT(AccCache.CpuPtr!=(uint64_t)NULL);
+  assert(AccCache.state!=Empty);
+  assert(AccCache.cpuLock==0);
+  assert(AccCache.accLock==0);
+  assert(AccCache.CpuPtr!=(uint64_t)NULL);
   if(AccCache.AccPtr==(uint64_t)NULL){
     AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
     DeviceBytes+=AccCache.bytes;
@@ -211,36 +210,33 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
 void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
 {
   if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
-    dprintf("AcceleratorViewClose %lx",(uint64_t)Ptr);
+    dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr);
     AcceleratorViewClose((uint64_t)Ptr);
   } else if( (mode==CpuRead)||(mode==CpuWrite)){
     CpuViewClose((uint64_t)Ptr);
   } else {
-    GRID_ASSERT(0);
+    assert(0);
   }
 }
 void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
 {
   uint64_t CpuPtr = (uint64_t)_CpuPtr;
   if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
-    dprintf("AcceleratorViewOpen %lx",(uint64_t)CpuPtr);
+    dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr);
     return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
   } else if( (mode==CpuRead)||(mode==CpuWrite)){
     return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
   } else {
-    GRID_ASSERT(0);
+    assert(0);
     return NULL;
   }
 }
 void MemoryManager::EvictVictims(uint64_t bytes)
 {
-  if(bytes>=DeviceMaxBytes) {
-    printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
-  }
-  GRID_ASSERT(bytes<DeviceMaxBytes);
+  assert(bytes<DeviceMaxBytes);
   while(bytes+DeviceLRUBytes > DeviceMaxBytes){
     if ( DeviceLRUBytes > 0){
-      GRID_ASSERT(LRU.size()>0);
+      assert(LRU.size()>0);
       uint64_t victim = LRU.back(); // From the LRU
       auto AccCacheIterator = EntryLookup(victim);
       auto & AccCache = AccCacheIterator->second;
@@ -264,19 +260,19 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
   if (!AccCache.AccPtr) {
     EvictVictims(bytes);
   }
-  GRID_ASSERT((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
+  assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));

-  GRID_ASSERT(AccCache.cpuLock==0);  // Programming error
+  assert(AccCache.cpuLock==0);  // Programming error

   if(AccCache.state!=Empty) {
-    dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld",
+    dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n",
           (uint64_t)AccCache.CpuPtr,
           (uint64_t)CpuPtr,
           (uint64_t)AccCache.bytes,
           (uint64_t)bytes,
           (uint64_t)AccCache.accLock);
-    GRID_ASSERT(AccCache.CpuPtr == CpuPtr);
-    GRID_ASSERT(AccCache.bytes  ==bytes);
+    assert(AccCache.CpuPtr == CpuPtr);
+    assert(AccCache.bytes  ==bytes);
   }
 /*
  *  State transitions and actions
@@ -293,7 +289,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
  *  AccWrite   AccDirty   AccDirty  -         -
  */
   if(AccCache.state==Empty) {
-    GRID_ASSERT(AccCache.LRU_valid==0);
+    assert(AccCache.LRU_valid==0);
     AccCache.CpuPtr = CpuPtr;
     AccCache.AccPtr = (uint64_t)NULL;
     AccCache.bytes  = bytes;
@@ -309,7 +305,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
       AccCache.state  = Consistent; // Empty + AccRead => Consistent
     }
     AccCache.accLock= 1;
-    dprintf("Copied Empty entry into device accLock= %d",AccCache.accLock);
+    dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock);
   } else if(AccCache.state==CpuDirty ){
     if(mode==AcceleratorWriteDiscard) {
       CpuDiscard(AccCache);
@@ -322,30 +318,30 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
       AccCache.state  = Consistent; // CpuDirty + AccRead => Consistent
     }
     AccCache.accLock++;
-    dprintf("CpuDirty entry into device ++accLock= %d",AccCache.accLock);
+    dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock);
   } else if(AccCache.state==Consistent) {
     if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
       AccCache.state  = AccDirty;   // Consistent + AcceleratorWrite=> AccDirty
     else
       AccCache.state  = Consistent; // Consistent + AccRead => Consistent
     AccCache.accLock++;
-    dprintf("Consistent entry into device ++accLock= %d",AccCache.accLock);
+    dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock);
   } else if(AccCache.state==AccDirty) {
     if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
      AccCache.state  = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
     else
      AccCache.state  = AccDirty; // AccDirty + AccRead => AccDirty
     AccCache.accLock++;
-    dprintf("AccDirty entry ++accLock= %d",AccCache.accLock);
+    dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock);
   } else {
-    GRID_ASSERT(0);
+    assert(0);
   }

-  GRID_ASSERT(AccCache.accLock>0);
+  assert(AccCache.accLock>0);
   // If view is opened on device must remove from LRU
   if(AccCache.LRU_valid==1){
     // must possibly remove from LRU as now locked on GPU
-    dprintf("AccCache entry removed from LRU ");
+    dprintf("AccCache entry removed from LRU \n");
     LRUremove(AccCache);
   }
@@ -362,16 +358,16 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
   auto AccCacheIterator = EntryLookup(CpuPtr);
   auto & AccCache = AccCacheIterator->second;

-  GRID_ASSERT(AccCache.cpuLock==0);
-  GRID_ASSERT(AccCache.accLock>0);
+  assert(AccCache.cpuLock==0);
+  assert(AccCache.accLock>0);

   AccCache.accLock--;
   // Move to LRU queue if not locked and close on device
   if(AccCache.accLock==0) {
-    dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
+    dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
     LRUinsert(AccCache);
   } else {
-    dprintf("AccleratorViewClose %lx AccLock decremented to %ld",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
+    dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
   }
 }
 void MemoryManager::CpuViewClose(uint64_t CpuPtr)
@@ -379,8 +375,8 @@ void MemoryManager::CpuViewClose(uint64_t CpuPtr)
   auto AccCacheIterator = EntryLookup(CpuPtr);
   auto & AccCache = AccCacheIterator->second;

-  GRID_ASSERT(AccCache.cpuLock>0);
-  GRID_ASSERT(AccCache.accLock==0);
+  assert(AccCache.cpuLock>0);
+  assert(AccCache.accLock==0);

   AccCache.cpuLock--;
 }
@@ -413,12 +409,12 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
   //  EvictVictims(bytes);
   // }
-  GRID_ASSERT((mode==CpuRead)||(mode==CpuWrite));
-  GRID_ASSERT(AccCache.accLock==0);  // Programming error
+  assert((mode==CpuRead)||(mode==CpuWrite));
+  assert(AccCache.accLock==0);  // Programming error

   if(AccCache.state!=Empty) {
-    GRID_ASSERT(AccCache.CpuPtr == CpuPtr);
-    GRID_ASSERT(AccCache.bytes==bytes);
+    assert(AccCache.CpuPtr == CpuPtr);
+    assert(AccCache.bytes==bytes);
   }

   if(AccCache.state==Empty) {
@@ -433,20 +429,20 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
     AccCache.state = CpuDirty; // CpuDirty +CpuRead/CpuWrite => CpuDirty
     AccCache.cpuLock++;
   } else if(AccCache.state==Consistent) {
-    GRID_ASSERT(AccCache.AccPtr != (uint64_t)NULL);
+    assert(AccCache.AccPtr != (uint64_t)NULL);
     if(mode==CpuWrite)
       AccCache.state = CpuDirty;   // Consistent +CpuWrite => CpuDirty
     else
       AccCache.state = Consistent; // Consistent +CpuRead  => Consistent
     AccCache.cpuLock++;
   } else if(AccCache.state==AccDirty) {
-    GRID_ASSERT(AccCache.AccPtr != (uint64_t)NULL);
+    assert(AccCache.AccPtr != (uint64_t)NULL);
     Flush(AccCache);
     if(mode==CpuWrite) AccCache.state = CpuDirty;   // AccDirty +CpuWrite => CpuDirty, Flush
     else               AccCache.state = Consistent; // AccDirty +CpuRead  => Consistent, Flush
     AccCache.cpuLock++;
   } else {
-    GRID_ASSERT(0); // should be unreachable
+    assert(0); // should be unreachable
   }

   AccCache.transient= transient? EvictNext : 0;
@@ -528,12 +524,12 @@ void MemoryManager::Audit(std::string s)
   std::cout << " Memory Manager::Audit() from "<<s<<std::endl;
   for(auto it=LRU.begin();it!=LRU.end();it++){
     uint64_t cpuPtr = *it;
-    GRID_ASSERT(EntryPresent(cpuPtr));
+    assert(EntryPresent(cpuPtr));
     auto AccCacheIterator = EntryLookup(cpuPtr);
     auto & AccCache = AccCacheIterator->second;
     LruBytes2+=AccCache.bytes;
-    GRID_ASSERT(AccCache.LRU_valid==1);
-    GRID_ASSERT(AccCache.LRU_entry==it);
+    assert(AccCache.LRU_valid==1);
+    assert(AccCache.LRU_entry==it);
   }
   std::cout << " Memory Manager::Audit() LRU queue matches table entries "<<std::endl;
@@ -552,7 +548,7 @@ void MemoryManager::Audit(std::string s)
     if( AccCache.LRU_valid ) LruCnt++;

     if ( AccCache.cpuLock || AccCache.accLock ) {
-      GRID_ASSERT(AccCache.LRU_valid==0);
+      assert(AccCache.LRU_valid==0);

       std::cout << GridLogError << s<< "\n\t 0x"<<std::hex<<AccCache.CpuPtr<<std::dec
               << "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
@@ -561,16 +557,16 @@ void MemoryManager::Audit(std::string s)
               << "\t LRUvalid  " << AccCache.LRU_valid<<std::endl;
     }

-    GRID_ASSERT( AccCache.cpuLock== 0 ) ;
-    GRID_ASSERT( AccCache.accLock== 0 ) ;
+    assert( AccCache.cpuLock== 0 ) ;
+    assert( AccCache.accLock== 0 ) ;
   }
   std::cout << " Memory Manager::Audit() no locked table entries "<<std::endl;
-  GRID_ASSERT(LruBytes1==LruBytes2);
-  GRID_ASSERT(LruBytes1==DeviceLRUBytes);
+  assert(LruBytes1==LruBytes2);
+  assert(LruBytes1==DeviceLRUBytes);
   std::cout << " Memory Manager::Audit() evictable bytes matches sum over table "<<std::endl;
-  GRID_ASSERT(AccBytes==DeviceBytes);
+  assert(AccBytes==DeviceBytes);
   std::cout << " Memory Manager::Audit() device bytes matches sum over table "<<std::endl;
-  GRID_ASSERT(LruCnt == LRU.size());
+  assert(LruCnt == LRU.size());
   std::cout << " Memory Manager::Audit() LRU entry count matches "<<std::endl;
 }

==== next file ====

@@ -10,16 +10,16 @@ void check_huge_pages(void *Buf,uint64_t BYTES)
 {
 #ifdef __linux__
   int fd = open("/proc/self/pagemap", O_RDONLY);
-  GRID_ASSERT(fd >= 0);
+  assert(fd >= 0);
   const int page_size = 4096;
   uint64_t virt_pfn = (uint64_t)Buf / page_size;
   off_t offset = sizeof(uint64_t) * virt_pfn;
   uint64_t npages = (BYTES + page_size-1) / page_size;
-  std::vector<uint64_t> pagedata(npages);
+  uint64_t pagedata[npages];
   uint64_t ret = lseek(fd, offset, SEEK_SET);
-  GRID_ASSERT(ret == offset);
-  ret = ::read(fd, &pagedata[0], sizeof(uint64_t)*npages);
-  GRID_ASSERT(ret == sizeof(uint64_t) * npages);
+  assert(ret == offset);
+  ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
+  assert(ret == sizeof(uint64_t) * npages);
   int nhugepages = npages / 512;
   int n4ktotal, nnothuge;
   n4ktotal = 0;
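The std::vector on the minus side replaces `uint64_t pagedata[npages]`, a runtime-sized stack array that is a compiler extension rather than standard C++, and a stack-overflow risk for large buffers. A minimal sketch of the portable pattern, assuming the same pagemap read:

    #include <cstdint>
    #include <vector>
    #include <unistd.h>

    ssize_t read_pagemap(int fd, uint64_t npages, std::vector<uint64_t> &pagedata)
    {
      pagedata.assign(npages, 0);   // heap storage instead of a VLA
      return ::read(fd, pagedata.data(), sizeof(uint64_t)*npages);
    }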

==== next file ====

@@ -82,7 +82,6 @@ public:
   bool _isCheckerBoarded;
   int        LocallyPeriodic;
   Coordinate _checker_dim_mask;
-  int        _checker_dim;

 public:
@@ -92,6 +91,7 @@ public:
   ////////////////////////////////////////////////////////////////
   virtual int CheckerBoarded(int dim)=0;
   virtual int CheckerBoard(const Coordinate &site)=0;
+  virtual int CheckerDim(void){ return 0; };
   virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
   virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
   virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0;
@@ -165,7 +165,7 @@ public:
     //
     if ( _simd_layout[dimension] > 2 ) {
       for(int d=0;d<_ndimension;d++){
-       if ( d != dimension ) GRID_ASSERT ( (_simd_layout[d]==1)  );
+       if ( d != dimension ) assert ( (_simd_layout[d]==1)  );
       }
       permute_type = RotateBit; // How to specify distance; this is not just direction.
       return permute_type;
@@ -187,7 +187,7 @@ public:
   inline int64_t gSites(void) const { return (int64_t)_isites*(int64_t)_osites*(int64_t)_Nprocessors; };
   inline int  Nd (void) const { return _ndimension;};

-  inline const Coordinate &LocalStarts(void)            { return _lstart;      };
+  inline const Coordinate LocalStarts(void)             { return _lstart;      };
   inline const Coordinate &FullDimensions(void)         { return _fdimensions;};
   inline const Coordinate &GlobalDimensions(void)       { return _gdimensions;};
   inline const Coordinate &LocalDimensions(void)        { return _ldimensions;};
@@ -216,11 +216,11 @@ public:
   // Global addressing
   ////////////////////////////////////////////////////////////////
   void GlobalIndexToGlobalCoor(int64_t gidx,Coordinate &gcoor){
-    GRID_ASSERT(gidx< gSites());
+    assert(gidx< gSites());
     Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
   }
   void LocalIndexToLocalCoor(int lidx,Coordinate &lcoor){
-    GRID_ASSERT(lidx<lSites());
+    assert(lidx<lSites());
     Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
   }
   void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int64_t & gidx){
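The CheckerDim() virtual added on the plus side gives callers a cast-free way to ask any grid which dimension carries the checkerboard: the full-grid base returns 0 and the red-black grid (two files below) overrides it to return _checker_dim. A schematic of the dispatch, using only the two methods shown in the hunks (the struct names here are illustrative, not Grid classes):

    struct GridBaseSketch {
      virtual int CheckerBoarded(int dim) = 0;
      virtual int CheckerDim(void) { return 0; }   // full grid: no checker dimension
    };
    struct RedBlackSketch : GridBaseSketch {
      int _checker_dim = 0;
      int CheckerBoarded(int dim) override { return dim==_checker_dim; }
      int CheckerDim(void)        override { return _checker_dim; }
    };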

==== next file ====

@@ -38,7 +38,7 @@ class GridCartesian: public GridBase {

 public:
   int dummy;
-  // Coordinate _checker_dim_mask;
+  Coordinate _checker_dim_mask;
   virtual int  CheckerBoardFromOindexTable (int Oindex) {
     return 0;
   }
@@ -106,7 +106,6 @@ public:
   _rdimensions.resize(_ndimension);
   _simd_layout.resize(_ndimension);
   _checker_dim_mask.resize(_ndimension);;
-  _checker_dim = -1;
   _lstart.resize(_ndimension);
   _lend.resize(_ndimension);
@@ -128,10 +127,10 @@ public:
       // Use a reduced simd grid
       _ldimensions[d] = _gdimensions[d] / _processors[d];  //local dimensions
       //std::cout << _ldimensions[d] << "  " << _gdimensions[d] << "  " << _processors[d] << std::endl;
-      GRID_ASSERT(_ldimensions[d] * _processors[d] == _gdimensions[d]);
+      assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);

       _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
-      GRID_ASSERT(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
+      assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);

       _lstart[d] = _processor_coor[d] * _ldimensions[d];
       _lend[d]   = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
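The two asserts encode the per-dimension factorisation global = processors x local and local = simd x reduced. A worked example with hypothetical sizes:

    int gdim = 16, procs = 2, simd = 2;
    int ldim = gdim / procs;   // 8 sites per rank
    int rdim = ldim / simd;    // 4 "outer" sites after SIMD vectorisation
    assert(ldim * procs == gdim);
    assert(rdim * simd  == ldim);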

==== next file ====

@@ -57,17 +57,17 @@ class GridRedBlackCartesian : public GridBase
 {
 public:
   //  Coordinate _checker_dim_mask;
-  //  int              _checker_dim;
+  int              _checker_dim;
   std::vector<int> _checker_board;

-  virtual int isCheckerBoarded(void) const { return 1; };
+  virtual int CheckerDim(void){ return _checker_dim; };
   virtual int CheckerBoarded(int dim){
     if( dim==_checker_dim) return 1;
     else return 0;
   }
   virtual int CheckerBoard(const Coordinate &site){
     int linear=0;
-    GRID_ASSERT(site.size()==_ndimension);
+    assert(site.size()==_ndimension);
     for(int d=0;d<_ndimension;d++){
       if(_checker_dim_mask[d])
        linear=linear+site[d];
@@ -160,11 +160,11 @@ public:
     _isCheckerBoarded = true;
     _checker_dim = checker_dim;
-    GRID_ASSERT(checker_dim_mask[checker_dim] == 1);
+    assert(checker_dim_mask[checker_dim] == 1);
     _ndimension = dimensions.size();
-    GRID_ASSERT(checker_dim_mask.size() == _ndimension);
-    GRID_ASSERT(processor_grid.size() == _ndimension);
-    GRID_ASSERT(simd_layout.size() == _ndimension);
+    assert(checker_dim_mask.size() == _ndimension);
+    assert(processor_grid.size() == _ndimension);
+    assert(simd_layout.size() == _ndimension);

     _fdimensions.resize(_ndimension);
     _gdimensions.resize(_ndimension);
@@ -190,20 +190,20 @@ public:
       if (d == _checker_dim)
       {
-       GRID_ASSERT((_gdimensions[d] & 0x1) == 0);
+       assert((_gdimensions[d] & 0x1) == 0);
        _gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard
        _gsites /= 2;
       }
       _ldimensions[d] = _gdimensions[d] / _processors[d];
-      GRID_ASSERT(_ldimensions[d] * _processors[d] == _gdimensions[d]);
+      assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
       _lstart[d] = _processor_coor[d] * _ldimensions[d];
       _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;

       // Use a reduced simd grid
       _simd_layout[d] = simd_layout[d];
       _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; // this is not checking if this is integer
-      GRID_ASSERT(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
-      GRID_ASSERT(_rdimensions[d] > 0);
+      assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
+      assert(_rdimensions[d] > 0);
       // all elements of a simd vector must have same checkerboard.
       // If Ls vectorised, this must still be the case; e.g. dwf rb5d
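CheckerBoard(site) above computes parity by summing only the coordinates of masked dimensions; the least significant bit decides even or odd. The same logic as a free function (a sketch; Coordinate behaves like a vector of ints):

    int checker_board(const Coordinate &site, const Coordinate &mask)
    {
      int linear = 0;
      for(int d=0;d<(int)site.size();d++)
        if ( mask[d] ) linear += site[d];
      return linear & 0x1;   // 0 = even, 1 = odd
    }

Halving _gdimensions along _checker_dim then stores just one parity, which is why that dimension must be even before the division.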

==== next file ====

@@ -57,29 +57,18 @@ int CartesianCommunicator::ProcessorCount(void)    { return
 // very VERY rarely (Log, serial RNG) we need world without a grid
 ////////////////////////////////////////////////////////////////////////////////
-#ifdef USE_GRID_REDUCTION
-void CartesianCommunicator::GlobalSum(ComplexF &c)
-{
-  GlobalSumP2P(c);
-}
-void CartesianCommunicator::GlobalSum(ComplexD &c)
-{
-  GlobalSumP2P(c);
-}
-#else
 void CartesianCommunicator::GlobalSum(ComplexF &c)
 {
   GlobalSumVector((float *)&c,2);
 }
-void CartesianCommunicator::GlobalSum(ComplexD &c)
-{
-  GlobalSumVector((double *)&c,2);
-}
-#endif
 void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
 {
   GlobalSumVector((float *)c,2*N);
 }
+void CartesianCommunicator::GlobalSum(ComplexD &c)
+{
+  GlobalSumVector((double *)&c,2);
+}
 void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
 {
   GlobalSumVector((double *)c,2*N);

==== next file ====

@@ -33,8 +33,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/////////////////////////////////// ///////////////////////////////////
#include <Grid/communicator/SharedMemory.h> #include <Grid/communicator/SharedMemory.h>
#define NVLINK_GET
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
extern bool Stencil_force_mpi ; extern bool Stencil_force_mpi ;
@@ -130,35 +128,6 @@ public:
void GlobalXOR(uint32_t &); void GlobalXOR(uint32_t &);
void GlobalXOR(uint64_t &); void GlobalXOR(uint64_t &);
template<class obj> void GlobalSumP2P(obj &o)
{
std::vector<obj> column;
obj accum = o;
int source,dest;
for(int d=0;d<_ndimension;d++){
column.resize(_processors[d]);
column[0] = accum;
std::vector<MpiCommsRequest_t> list;
for(int p=1;p<_processors[d];p++){
ShiftedRanks(d,p,source,dest);
SendToRecvFromBegin(list,
&column[0],
dest,
&column[p],
source,
sizeof(obj),d*100+p);
}
if (!list.empty()) // avoid triggering GRID_ASSERT in comms == none
CommsComplete(list);
for(int p=1;p<_processors[d];p++){
accum = accum + column[p];
}
}
Broadcast(0,accum);
o=accum;
}
template<class obj> void GlobalSum(obj &o){ template<class obj> void GlobalSum(obj &o){
typedef typename obj::scalar_type scalar_type; typedef typename obj::scalar_type scalar_type;
int words = sizeof(obj)/sizeof(scalar_type); int words = sizeof(obj)/sizeof(scalar_type);
@@ -169,8 +138,8 @@ public:
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// Face exchange, buffer swap in translational invariant way // Face exchange, buffer swap in translational invariant way
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
void CommsComplete(std::vector<MpiCommsRequest_t> &list); void CommsComplete(std::vector<CommsRequest_t> &list);
void SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list, void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int dest, int dest,
void *recv, void *recv,
@@ -183,31 +152,19 @@ public:
int recv_from_rank, int recv_from_rank,
int bytes); int bytes);
int IsOffNode(int rank);
double StencilSendToRecvFrom(void *xmit, double StencilSendToRecvFrom(void *xmit,
int xmit_to_rank,int do_xmit, int xmit_to_rank,int do_xmit,
void *recv, void *recv,
int recv_from_rank,int do_recv, int recv_from_rank,int do_recv,
int bytes,int dir); int bytes,int dir);
double StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list, double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int xmit_to_rank,int do_xmit, int xmit_to_rank,int do_xmit,
void *recv, void *recv,
int recv_from_rank,int do_recv, int recv_from_rank,int do_recv,
int xbytes,int rbytes,int dir); int xbytes,int rbytes,int dir);
// Could do a PollHtoD and have a CommsMerge dependence
void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);
void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,void *xmit_comp,
int xmit_to_rank,int do_xmit,
void *recv,void *recv_comp,
int recv_from_rank,int do_recv,
int xbytes,int rbytes,int dir);
void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i); void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
void StencilBarrier(void); void StencilBarrier(void);
@@ -226,14 +183,14 @@ public:
// All2All down one dimension // All2All down one dimension
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
template<class T> void AllToAll(int dim,std::vector<T> &in, std::vector<T> &out){ template<class T> void AllToAll(int dim,std::vector<T> &in, std::vector<T> &out){
GRID_ASSERT(dim>=0); assert(dim>=0);
GRID_ASSERT(dim<_ndimension); assert(dim<_ndimension);
GRID_ASSERT(in.size()==out.size()); assert(in.size()==out.size());
int numnode = _processors[dim]; int numnode = _processors[dim];
uint64_t bytes=sizeof(T); uint64_t bytes=sizeof(T);
uint64_t words=in.size()/numnode; uint64_t words=in.size()/numnode;
GRID_ASSERT(numnode * words == in.size()); assert(numnode * words == in.size());
GRID_ASSERT(words < (1ULL<<31)); assert(words < (1ULL<<31));
AllToAll(dim,(void *)&in[0],(void *)&out[0],words,bytes); AllToAll(dim,(void *)&in[0],(void *)&out[0],words,bytes);
} }
void AllToAll(int dim ,void *in,void *out,uint64_t words,uint64_t bytes); void AllToAll(int dim ,void *in,void *out,uint64_t words,uint64_t bytes);
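Usage sketch for the templated AllToAll above (the names numnode, words and grid are placeholders): with numnode ranks along dim, slice p of in is shipped to rank p and slice p of out arrives from rank p, so in.size() must be an exact multiple of numnode.

const int numnode = 4;                 // _processors[dim] in the real code
const uint64_t words = 1024;           // elements destined for each rank
std::vector<ComplexD> in(numnode * words), out(numnode * words);
// ... fill in[p*words .. (p+1)*words) with the block for rank p ...
grid.AllToAll(dim, in, out);           // 'grid' stands in for a CartesianCommunicator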

@@ -28,17 +28,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#include <Grid/communicator/SharedMemory.h> #include <Grid/communicator/SharedMemory.h>
void GridAbort(void) { MPI_Abort(MPI_COMM_WORLD,SIGABRT); }
extern void * Grid_backtrace_buffer[_NBACKTRACE];
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
Grid_MPI_Comm CartesianCommunicator::communicator_world; Grid_MPI_Comm CartesianCommunicator::communicator_world;
#ifdef GRID_CHECKSUM_COMMS
uint64_t checksum_index = 1;
#endif
//////////////////////////////////////////// ////////////////////////////////////////////
// First initialise of comms system // First initialise of comms system
@@ -63,11 +55,11 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
#endif #endif
//If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE //If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) { if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) {
GRID_ASSERT(0); assert(0);
} }
if( (nCommThreads > 1) && (provided != MPI_THREAD_MULTIPLE) ) { if( (nCommThreads > 1) && (provided != MPI_THREAD_MULTIPLE) ) {
GRID_ASSERT(0); assert(0);
} }
} }
@@ -88,20 +80,20 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{ {
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest); int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) int CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor)
{ {
int rank; int rank;
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank); int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
GRID_ASSERT(ierr==0); assert(ierr==0);
return rank; return rank;
} }
void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor) void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor)
{ {
coor.resize(_ndimension); coor.resize(_ndimension);
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]); int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
//////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -128,8 +120,8 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
////////////////////////////////// //////////////////////////////////
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank) CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank)
{ {
_ndimension = processors.size(); GRID_ASSERT(_ndimension>=1); _ndimension = processors.size(); assert(_ndimension>=1);
int parent_ndimension = parent._ndimension; GRID_ASSERT(_ndimension >= parent._ndimension); int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
Coordinate parent_processor_coor(_ndimension,0); Coordinate parent_processor_coor(_ndimension,0);
Coordinate parent_processors (_ndimension,1); Coordinate parent_processors (_ndimension,1);
Coordinate shm_processors (_ndimension,1); Coordinate shm_processors (_ndimension,1);
@@ -153,7 +145,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
childsize *= processors[d]; childsize *= processors[d];
} }
int Nchild = Nparent/childsize; int Nchild = Nparent/childsize;
GRID_ASSERT (childsize * Nchild == Nparent); assert (childsize * Nchild == Nparent);
Coordinate ccoor(_ndimension); // coor within subcommunicator Coordinate ccoor(_ndimension); // coor within subcommunicator
Coordinate scoor(_ndimension); // coor of split within parent Coordinate scoor(_ndimension); // coor of split within parent
@@ -179,12 +171,12 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
// Split the communicator // Split the communicator
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
int ierr= MPI_Comm_split(parent.communicator,srank,crank,&comm_split); int ierr= MPI_Comm_split(parent.communicator,srank,crank,&comm_split);
GRID_ASSERT(ierr==0); assert(ierr==0);
} else { } else {
srank = 0; srank = 0;
int ierr = MPI_Comm_dup (parent.communicator,&comm_split); int ierr = MPI_Comm_dup (parent.communicator,&comm_split);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -209,7 +201,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
} }
} }
for(int d=0;d<processors.size();d++){ for(int d=0;d<processors.size();d++){
GRID_ASSERT(_processor_coor[d] == ccoor[d] ); assert(_processor_coor[d] == ccoor[d] );
} }
} }
@@ -251,7 +243,7 @@ void CartesianCommunicator::InitFromMPICommunicator(const Coordinate &processors
for(int i=0;i<_ndimension*2;i++){ for(int i=0;i<_ndimension*2;i++){
MPI_Comm_dup(communicator,&communicator_halo[i]); MPI_Comm_dup(communicator,&communicator_halo[i]);
} }
GRID_ASSERT(Size==_Nprocessors); assert(Size==_Nprocessors);
} }
CartesianCommunicator::~CartesianCommunicator() CartesianCommunicator::~CartesianCommunicator()
@@ -265,79 +257,57 @@ CartesianCommunicator::~CartesianCommunicator()
} }
} }
} }
#ifdef USE_GRID_REDUCTION
void CartesianCommunicator::GlobalSum(float &f){
FlightRecorder::StepLog("GlobalSumP2P");
CartesianCommunicator::GlobalSumP2P(f);
}
void CartesianCommunicator::GlobalSum(double &d)
{
FlightRecorder::StepLog("GlobalSumP2P");
CartesianCommunicator::GlobalSumP2P(d);
}
#else
void CartesianCommunicator::GlobalSum(float &f){
FlightRecorder::StepLog("AllReduce float");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
GRID_ASSERT(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
FlightRecorder::StepLog("AllReduce double");
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
GRID_ASSERT(ierr==0);
}
#endif
void CartesianCommunicator::GlobalSum(uint32_t &u){ void CartesianCommunicator::GlobalSum(uint32_t &u){
FlightRecorder::StepLog("AllReduce uint32_t");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::GlobalSum(uint64_t &u){ void CartesianCommunicator::GlobalSum(uint64_t &u){
FlightRecorder::StepLog("AllReduce uint64_t");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){ void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){
FlightRecorder::StepLog("AllReduceVector");
int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::GlobalXOR(uint32_t &u){ void CartesianCommunicator::GlobalXOR(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::GlobalXOR(uint64_t &u){ void CartesianCommunicator::GlobalXOR(uint64_t &u){
FlightRecorder::StepLog("GlobalXOR");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::GlobalMax(float &f) void CartesianCommunicator::GlobalMax(float &f)
{ {
FlightRecorder::StepLog("GlobalMax");
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::GlobalMax(double &d) void CartesianCommunicator::GlobalMax(double &d)
{ {
FlightRecorder::StepLog("GlobalMax");
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator); int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
GRID_ASSERT(ierr==0); assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
} }
void CartesianCommunicator::GlobalSumVector(float *f,int N) void CartesianCommunicator::GlobalSumVector(float *f,int N)
{ {
FlightRecorder::StepLog("GlobalSumVector(float *)");
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
GRID_ASSERT(ierr==0); assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
} }
void CartesianCommunicator::GlobalSumVector(double *d,int N) void CartesianCommunicator::GlobalSumVector(double *d,int N)
{ {
FlightRecorder::StepLog("GlobalSumVector(double *)");
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator); int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list, void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int dest, int dest,
void *recv, void *recv,
@@ -347,22 +317,22 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &
MPI_Request xrq; MPI_Request xrq;
MPI_Request rrq; MPI_Request rrq;
GRID_ASSERT(dest != _processor); assert(dest != _processor);
GRID_ASSERT(from != _processor); assert(from != _processor);
int tag; int tag;
tag= dir+from*32; tag= dir+from*32;
int ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator,&rrq); int ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator,&rrq);
GRID_ASSERT(ierr==0); assert(ierr==0);
list.push_back(rrq); list.push_back(rrq);
tag= dir+_processor*32; tag= dir+_processor*32;
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator,&xrq); ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator,&xrq);
GRID_ASSERT(ierr==0); assert(ierr==0);
list.push_back(xrq); list.push_back(xrq);
} }
void CartesianCommunicator::CommsComplete(std::vector<MpiCommsRequest_t> &list) void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list)
{ {
int nreq=list.size(); int nreq=list.size();
@@ -370,7 +340,7 @@ void CartesianCommunicator::CommsComplete(std::vector<MpiCommsRequest_t> &list)
std::vector<MPI_Status> status(nreq); std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]); int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
GRID_ASSERT(ierr==0); assert(ierr==0);
list.resize(0); list.resize(0);
} }
@@ -381,22 +351,27 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
int from, int from,
int bytes) int bytes)
{ {
std::vector<MpiCommsRequest_t> reqs(0); std::vector<CommsRequest_t> reqs(0);
unsigned long xcrc = crc32(0L, Z_NULL, 0);
unsigned long rcrc = crc32(0L, Z_NULL, 0);
int myrank = _processor; int myrank = _processor;
int ierr; int ierr;
// Enforce no UVM in comms, device or host OK // Enforce no UVM in comms, device or host OK
GRID_ASSERT(acceleratorIsCommunicable(xmit)); assert(acceleratorIsCommunicable(xmit));
GRID_ASSERT(acceleratorIsCommunicable(recv)); assert(acceleratorIsCommunicable(recv));
// Give the CPU to MPI immediately; can use threads to overlap optionally // Give the CPU to MPI immediately; can use threads to overlap optionally
// printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes); // printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes);
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank, ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
recv,bytes,MPI_CHAR,from, from, recv,bytes,MPI_CHAR,from, from,
communicator,MPI_STATUS_IGNORE); communicator,MPI_STATUS_IGNORE);
GRID_ASSERT(ierr==0); assert(ierr==0);
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
// printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
} }
// Basic Halo comms primitive // Basic Halo comms primitive
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
@@ -406,36 +381,18 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int bytes,int dir) int bytes,int dir)
{ {
std::vector<CommsRequest_t> list; std::vector<CommsRequest_t> list;
double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir); double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
offbytes += StencilSendToRecvFromBegin(list,xmit,xmit,dest,dox,recv,recv,from,dor,bytes,bytes,dir);
StencilSendToRecvFromComplete(list,dir); StencilSendToRecvFromComplete(list,dir);
return offbytes; return offbytes;
} }
int CartesianCommunicator::IsOffNode(int rank)
{
int grank = ShmRanks[rank];
if ( grank == MPI_UNDEFINED ) return true;
else return false;
}
#ifdef ACCELERATOR_AWARE_MPI #undef NVLINK_GET // Define to use get instead of put DMA
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {}; double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int dest,int dox, int dest,int dox,
void *recv, void *recv,
int from,int dor, int from,int dor,
int xbytes,int rbytes,int dir) int xbytes,int rbytes,int dir)
{
return 0.0; // Do nothing -- no preparation required
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,void *xmit_comp,
int dest,int dox,
void *recv,void *recv_comp,
int from,int dor,
int xbytes,int rbytes,int dir)
{ {
int ncomm =communicator_halo.size(); int ncomm =communicator_halo.size();
int commdir=dir%ncomm; int commdir=dir%ncomm;
@@ -448,431 +405,62 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
int gfrom = ShmRanks[from]; int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor]; int gme = ShmRanks[_processor];
GRID_ASSERT(dest != _processor); assert(dest != _processor);
GRID_ASSERT(from != _processor); assert(from != _processor);
GRID_ASSERT(gme == ShmRank); assert(gme == ShmRank);
double off_node_bytes=0.0; double off_node_bytes=0.0;
int tag; int tag;
if ( dor ) { if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) { if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32; tag= dir+from*32;
// std::cout << " StencilSendToRecvFrom "<<dir<<" MPI_Irecv "<<std::hex<<recv<<std::dec<<std::endl; ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
ierr=MPI_Irecv(recv_comp, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq); assert(ierr==0);
GRID_ASSERT(ierr==0);
list.push_back(rrq); list.push_back(rrq);
off_node_bytes+=rbytes; off_node_bytes+=rbytes;
} }
#ifdef NVLINK_GET #ifdef NVLINK_GET
else {
void *shm = (void *) this->ShmBufferTranslate(from,xmit); void *shm = (void *) this->ShmBufferTranslate(from,xmit);
GRID_ASSERT(shm!=NULL); assert(shm!=NULL);
// std::cout << " StencilSendToRecvFrom "<<dir<<" CopyDeviceToDevice recv "<<std::hex<<recv<<" remote "<<shm <<std::dec<<std::endl;
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes); acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
}
#endif #endif
} }
// This is a NVLINK PUT
if (dox) { if (dox) {
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32; tag= dir+_processor*32;
ierr =MPI_Isend(xmit_comp, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq); ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
GRID_ASSERT(ierr==0); assert(ierr==0);
list.push_back(xrq); list.push_back(xrq);
off_node_bytes+=xbytes; off_node_bytes+=xbytes;
} else { } else {
#ifndef NVLINK_GET #ifndef NVLINK_GET
void *shm = (void *) this->ShmBufferTranslate(dest,recv); void *shm = (void *) this->ShmBufferTranslate(dest,recv);
GRID_ASSERT(shm!=NULL); assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes); acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
#endif #endif
} }
} }
return off_node_bytes;
}
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir) void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{ {
int nreq=list.size(); int nreq=list.size();
/*finishes Get/Put*/
acceleratorCopySynchronise(); acceleratorCopySynchronise();
if (nreq==0) return; if (nreq==0) return;
std::vector<MPI_Status> status(nreq); std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]); int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
GRID_ASSERT(ierr==0);
list.resize(0);
this->StencilBarrier();
}
#else /* NOT ... ACCELERATOR_AWARE_MPI */
///////////////////////////////////////////
// Pipeline mode through host memory
///////////////////////////////////////////
/*
* In prepare (phase 1):
* PHASE 1: (prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
* PHASE 2: (Begin)
* - complete all copies
* - post MPI send asynch
* - post device - device transfers
* PHASE 3: (Complete)
* - MPI_waitall
* - host-device transfers
*
*********************************
* NB could split this further:
*--------------------------------
* PHASE 1: (Prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
* PHASE 2: (BeginInterNode)
* - complete all copies
* - post MPI send asynch
* PHASE 3: (BeginIntraNode)
* - post device - device transfers
* PHASE 4: (Complete)
* - MPI_waitall
* - host-device transfers asynch
* - (complete all copies)
*/
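Read as a call sequence, the phase plan corresponds roughly to the driver below, a sketch inferred from the comments and the signatures that follow; the real driver lives in the stencil code and may interleave the steps per packet.

// Sketch fragment: xmit/recv/dest/from/dox/dor/xbytes/rbytes/dir describe one
// face, comm is the CartesianCommunicator, and 'list' carries per-packet state.
std::vector<CommsRequest_t> list;
// PHASE 1: post MPI_Irecv for each face, queue device->host copies of send data
comm.StencilSendToRecvFromPrepare(list, xmit, dest, dox, recv, from, dor,
                                  xbytes, rbytes, dir);
// As each DtoH copy completes, fire the matching MPI_Isend
comm.StencilSendToRecvFromPollDtoH(list);
// PHASE 2: intranode neighbours go device->device through the shm window
comm.StencilSendToRecvFromBegin(list, xmit, xmit, dest, dox,
                                recv, recv, from, dor, xbytes, rbytes, dir);
// As each MPI_Irecv lands, start the host->device copy of its payload
comm.StencilSendToRecvFromPollIRecv(list);
// PHASE 3: waitall on the Isends, verify checksums, release host buffers
comm.StencilSendToRecvFromComplete(list, dir);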
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
/*
* Bring sequence from Stencil.h down to lower level.
* Assume using XeLink is ok
*/
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
MPI_Request xrq;
MPI_Request rrq;
int ierr;
int gdest = ShmRanks[dest];
int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor];
GRID_ASSERT(dest != _processor);
GRID_ASSERT(from != _processor);
GRID_ASSERT(gme == ShmRank);
double off_node_bytes=0.0;
int tag;
void * host_recv = NULL;
void * host_xmit = NULL;
/*
* PHASE 1: (Prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
*/
#ifdef GRID_CHECKSUM_COMMS
rbytes += 8;
xbytes += 8;
#endif
if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
host_recv = this->HostBufferMalloc(rbytes);
ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
GRID_ASSERT(ierr==0);
CommsRequest_t srq;
srq.PacketType = InterNodeRecv;
srq.bytes = rbytes;
srq.req = rrq;
srq.host_buf = host_recv;
srq.device_buf = recv;
srq.tag = tag;
list.push_back(srq);
off_node_bytes+=rbytes;
}
}
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
host_xmit = this->HostBufferMalloc(xbytes);
CommsRequest_t srq;
#ifdef GRID_CHECKSUM_COMMS
uint64_t xbytes_data = xbytes - 8;
srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes_data); // Make this Asynch
GRID_ASSERT(xbytes % 8 == 0);
// flip one bit so that a zero buffer is not consistent
uint64_t xsum = checksum_gpu((uint64_t*)xmit, xbytes_data / 8) ^ (checksum_index + 1 + 1000 * tag);
*(uint64_t*)(((char*)host_xmit) + xbytes_data) = xsum;
#else
srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
#endif
// ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
// GRID_ASSERT(ierr==0);
// off_node_bytes+=xbytes;
srq.PacketType = InterNodeXmit;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = host_xmit;
srq.device_buf = xmit;
srq.tag = tag;
srq.dest = dest;
srq.commdir = commdir;
list.push_back(srq);
}
}
return off_node_bytes;
}
/*
* In the interest of better pipelining, poll for completion on each DtoH and
* start MPI_ISend in the meantime
*/
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list)
{
int pending = 0;
do {
pending = 0;
for(int idx = 0; idx<list.size();idx++){
if ( list[idx].PacketType==InterNodeRecv ) {
int flag = 0;
MPI_Status status;
int ierr = MPI_Test(&list[idx].req,&flag,&status);
assert(ierr==0); assert(ierr==0);
list.resize(0);
if ( flag ) {
// std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
#ifdef GRID_CHECKSUM_COMMS
acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes - 8);
#else
acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
#endif
list[idx].PacketType=InterNodeReceiveHtoD;
} else {
pending ++;
} }
}
}
// std::cout << " PollIrecv "<<pending<<" pending requests"<<std::endl;
} while ( pending );
}
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
{
int pending = 0;
do {
pending = 0;
for(int idx = 0; idx<list.size();idx++){
if ( list[idx].PacketType==InterNodeXmit ) {
if ( acceleratorEventIsComplete(list[idx].ev) ) {
void *host_xmit = list[idx].host_buf;
uint32_t xbytes = list[idx].bytes;
int dest = list[idx].dest;
int tag = list[idx].tag;
int commdir = list[idx].commdir;
///////////////////
// Send packet
///////////////////
// std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
MPI_Request xrq;
int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
GRID_ASSERT(ierr==0);
list[idx].req = xrq; // Update the MPI request in the list
list[idx].PacketType=InterNodeXmitISend;
} else {
// not done, so return to polling loop
pending++;
}
}
}
} while (pending);
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,void *xmit_comp,
int dest,int dox,
void *recv,void *recv_comp,
int from,int dor,
int xbytes,int rbytes,int dir)
{
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
MPI_Request xrq;
MPI_Request rrq;
int ierr;
int gdest = ShmRanks[dest];
int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor];
GRID_ASSERT(dest != _processor);
GRID_ASSERT(from != _processor);
GRID_ASSERT(gme == ShmRank);
double off_node_bytes=0.0;
int tag;
void * host_xmit = NULL;
////////////////////////////////
// Receives already posted
// Copies already started
////////////////////////////////
/*
* PHASE 2: (Begin)
* - complete all copies
* - post MPI send asynch
*/
#ifdef NVLINK_GET
if ( dor ) {
if ( ! ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) {
// Intranode
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
GRID_ASSERT(shm!=NULL);
CommsRequest_t srq;
srq.ev = acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
srq.PacketType = IntraNodeRecv;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = NULL;
srq.device_buf = xmit;
srq.tag = -1;
srq.dest = dest;
srq.commdir = dir;
list.push_back(srq);
}
}
#else
if (dox) {
if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) {
// Intranode
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
GRID_ASSERT(shm!=NULL);
CommsRequest_t srq;
srq.ev = acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
srq.PacketType = IntraNodeXmit;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = NULL;
srq.device_buf = xmit;
srq.tag = -1;
srq.dest = dest;
srq.commdir = dir;
list.push_back(srq);
}
}
#endif
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
std::vector<MPI_Status> status;
std::vector<MPI_Request> MpiRequests;
for(int r=0;r<list.size();r++){
// Must check each Send buf is clear to reuse
if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
// if ( list[r].PacketType == InterNodeRecv ) MpiRequests.push_back(list[r].req); // Already "Test" passed
}
int nreq=MpiRequests.size();
if (nreq>0) {
status.resize(MpiRequests.size());
int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
GRID_ASSERT(ierr==0);
}
// for(int r=0;r<nreq;r++){
// if ( list[r].PacketType==InterNodeRecv ) {
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
// }
// }
#ifdef GRID_CHECKSUM_COMMS
for(int r=0;r<list.size();r++){
if ( list[r].PacketType == InterNodeReceiveHtoD ) {
uint64_t rbytes_data = list[r].bytes - 8;
uint64_t expected_cs = *(uint64_t*)(((char*)list[r].host_buf) + rbytes_data);
uint64_t computed_cs = checksum_gpu((uint64_t*)list[r].device_buf, rbytes_data / 8) ^ (checksum_index + 1 + 1000 * list[r].tag); //
if (expected_cs != computed_cs) {
// TODO: error message, backtrace, quit
fprintf(stderr, "GRID_CHECKSUM_COMMS error:\n");
fprintf(stderr, " processor = %d\n", (int)_processor);
for(int d=0;d<_processors.size();d++)
fprintf(stderr, " processor_coord[%d] = %d\n", d, _processor_coor[d]);
fprintf(stderr, " hostname: %s\n", GridHostname());
fprintf(stderr, " expected_cs: %ld\n", expected_cs);
fprintf(stderr, " computed_cs: %ld\n", computed_cs);
fprintf(stderr, " dest: %d\n", list[r].dest);
fprintf(stderr, " tag: %d\n", list[r].tag);
fprintf(stderr, " commdir: %d\n", list[r].commdir);
fprintf(stderr, " bytes: %ld\n", (uint64_t)list[r].bytes);
fflush(stderr);
// backtrace
int symbols = backtrace(Grid_backtrace_buffer,_NBACKTRACE);
backtrace_symbols_fd(Grid_backtrace_buffer,symbols, 2);
exit(1);
}
}
}
checksum_index += 1;
#endif
list.resize(0); // Delete the list
this->HostBufferFreeAll(); // Clean up the buffer allocs
#ifndef NVLINK_GET
this->StencilBarrier(); // if PUT must check our nbrs have filled our receive buffers.
#endif
}
#endif
////////////////////////////////////////////
// END PIPELINE MODE / NO CUDA AWARE MPI
////////////////////////////////////////////
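The GRID_CHECKSUM_COMMS trailer used in the pipeline above is simple to state in isolation: the sender checksums the 64-bit words of the payload, salts the result with the running checksum_index and the message tag, and stores it in the final 8 bytes; the receiver recomputes with the same salt and compares. A host-side sketch, assuming checksum_gpu is a plain XOR reduction (the device kernel itself is defined elsewhere in Grid):

#include <cstdint>

static uint64_t checksum_host(const uint64_t *buf, uint64_t n)  // stand-in for checksum_gpu
{
  uint64_t cs = 0;
  for (uint64_t i = 0; i < n; i++) cs ^= buf[i];
  return cs;
}

// Sender: append the salted trailer after xbytes_data payload bytes
void AppendTrailer(char *host_xmit, uint64_t xbytes_data, uint64_t index, int tag)
{
  uint64_t xsum = checksum_host((const uint64_t *)host_xmit, xbytes_data / 8)
                ^ (index + 1 + 1000 * (uint64_t)tag);
  *(uint64_t *)(host_xmit + xbytes_data) = xsum;
}

// Receiver: recompute over the received payload and compare with the trailer
bool TrailerOK(const char *host_recv, uint64_t rbytes_data, uint64_t index, int tag)
{
  uint64_t expected = *(const uint64_t *)(host_recv + rbytes_data);
  uint64_t computed = checksum_host((const uint64_t *)host_recv, rbytes_data / 8)
                    ^ (index + 1 + 1000 * (uint64_t)tag);
  return expected == computed;
}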
void CartesianCommunicator::StencilBarrier(void) void CartesianCommunicator::StencilBarrier(void)
{ {
FlightRecorder::StepLog("NodeBarrier");
MPI_Barrier (ShmComm); MPI_Barrier (ShmComm);
} }
//void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list) //void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
@@ -880,19 +468,17 @@ void CartesianCommunicator::StencilBarrier(void)
//} //}
void CartesianCommunicator::Barrier(void) void CartesianCommunicator::Barrier(void)
{ {
FlightRecorder::StepLog("GridBarrier");
int ierr = MPI_Barrier(communicator); int ierr = MPI_Barrier(communicator);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{ {
FlightRecorder::StepLog("Broadcast");
int ierr=MPI_Bcast(data, int ierr=MPI_Bcast(data,
bytes, bytes,
MPI_BYTE, MPI_BYTE,
root, root,
communicator); communicator);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
int CartesianCommunicator::RankWorld(void){ int CartesianCommunicator::RankWorld(void){
int r; int r;
@@ -900,25 +486,23 @@ int CartesianCommunicator::RankWorld(void){
return r; return r;
} }
void CartesianCommunicator::BarrierWorld(void){ void CartesianCommunicator::BarrierWorld(void){
FlightRecorder::StepLog("BarrierWorld");
int ierr = MPI_Barrier(communicator_world); int ierr = MPI_Barrier(communicator_world);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{ {
FlightRecorder::StepLog("BroadcastWorld");
int ierr= MPI_Bcast(data, int ierr= MPI_Bcast(data,
bytes, bytes,
MPI_BYTE, MPI_BYTE,
root, root,
communicator_world); communicator_world);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes) void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{ {
Coordinate row(_ndimension,1); Coordinate row(_ndimension,1);
GRID_ASSERT(dim>=0 && dim<_ndimension); assert(dim>=0 && dim<_ndimension);
// Split the communicator // Split the communicator
row[dim] = _processors[dim]; row[dim] = _processors[dim];
@@ -929,7 +513,6 @@ void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,
} }
void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes) void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
{ {
FlightRecorder::StepLog("AllToAll");
// MPI is a pain and uses "int" arguments // MPI is a pain and uses "int" arguments
// 64*64*64*128*16 == 500Million elements of data. // 64*64*64*128*16 == 500Million elements of data.
// When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug. // When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.
@@ -939,8 +522,8 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t
int ibytes; int ibytes;
iwords = words; iwords = words;
ibytes = bytes; ibytes = bytes;
GRID_ASSERT(words == iwords); // safe to cast to int ? assert(words == iwords); // safe to cast to int ?
GRID_ASSERT(bytes == ibytes); // safe to cast to int ? assert(bytes == ibytes); // safe to cast to int ?
MPI_Type_contiguous(ibytes,MPI_BYTE,&object); MPI_Type_contiguous(ibytes,MPI_BYTE,&object);
MPI_Type_commit(&object); MPI_Type_commit(&object);
MPI_Alltoall(in,iwords,object,out,iwords,object,communicator); MPI_Alltoall(in,iwords,object,out,iwords,object,communicator);
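A self-contained form of the datatype trick above, with an MPI_Type_free added for completeness (sketch): committing a contiguous type of 'bytes' chars makes the Alltoall count 'words' rather than words*bytes, keeping multi-gigabyte exchanges under the 2^31 ceiling of MPI's int counts.

#include <mpi.h>
#include <cassert>
#include <cstdint>

void AllToAllBytes(void *in, void *out, uint64_t words, uint64_t bytes, MPI_Comm comm)
{
  int iwords = (int)words;
  int ibytes = (int)bytes;
  assert((uint64_t)iwords == words);   // the count itself must still fit in int
  assert((uint64_t)ibytes == bytes);
  MPI_Datatype object;
  MPI_Type_contiguous(ibytes, MPI_BYTE, &object);   // one element = 'bytes' chars
  MPI_Type_commit(&object);
  MPI_Alltoall(in, iwords, object, out, iwords, object, comm);
  MPI_Type_free(&object);
}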

@@ -34,8 +34,6 @@ NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////
Grid_MPI_Comm CartesianCommunicator::communicator_world; Grid_MPI_Comm CartesianCommunicator::communicator_world;
void GridAbort(void) { abort(); }
void CartesianCommunicator::Init(int *argc, char *** arv) void CartesianCommunicator::Init(int *argc, char *** arv)
{ {
GlobalSharedMemory::Init(communicator_world); GlobalSharedMemory::Init(communicator_world);
@@ -56,14 +54,14 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
{ {
_shm_processors = Coordinate(processors.size(),1); _shm_processors = Coordinate(processors.size(),1);
_processors = processors; _processors = processors;
_ndimension = processors.size(); GRID_ASSERT(_ndimension>=1); _ndimension = processors.size(); assert(_ndimension>=1);
_processor_coor.resize(_ndimension); _processor_coor.resize(_ndimension);
// Require 1^N processor grid for fake // Require 1^N processor grid for fake
_Nprocessors=1; _Nprocessors=1;
_processor = 0; _processor = 0;
for(int d=0;d<_ndimension;d++) { for(int d=0;d<_ndimension;d++) {
GRID_ASSERT(_processors[d]==1); assert(_processors[d]==1);
_processor_coor[d] = 0; _processor_coor[d] = 0;
} }
SetCommunicator(communicator_world); SetCommunicator(communicator_world);
@@ -91,9 +89,9 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
int from, int from,
int bytes) int bytes)
{ {
GRID_ASSERT(0); assert(0);
} }
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ GRID_ASSERT(list.size()==0);} void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);}
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list, void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int dest, int dest,
@@ -101,7 +99,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
int from, int from,
int bytes,int dir) int bytes,int dir)
{ {
GRID_ASSERT(0); assert(0);
} }
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes) void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
@@ -126,8 +124,6 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
dest=0; dest=0;
} }
int CartesianCommunicator::IsOffNode(int rank) { return false; }
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int xmit_to_rank,int dox, int xmit_to_rank,int dox,
void *recv, void *recv,
@@ -136,17 +132,6 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
{ {
return 2.0*bytes; return 2.0*bytes;
} }
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int dox,
void *recv,
int recv_from_rank,int dor,
int xbytes,int rbytes, int dir)
{
return 0.0;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit, void *xmit,
int xmit_to_rank,int dox, int xmit_to_rank,int dox,

@@ -58,8 +58,8 @@ int GlobalSharedMemory::WorldNode;
void GlobalSharedMemory::SharedMemoryFree(void) void GlobalSharedMemory::SharedMemoryFree(void)
{ {
GRID_ASSERT(_ShmAlloc); assert(_ShmAlloc);
GRID_ASSERT(_ShmAllocBytes>0); assert(_ShmAllocBytes>0);
for(int r=0;r<WorldShmSize;r++){ for(int r=0;r<WorldShmSize;r++){
munmap(WorldShmCommBufs[r],_ShmAllocBytes); munmap(WorldShmCommBufs[r],_ShmAllocBytes);
} }
@@ -80,7 +80,7 @@ void *SharedMemory::HostBufferMalloc(size_t bytes){
std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl; std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current bytes is " << (host_heap_bytes/(1024*1024)) <<"MB"<<std::endl; std::cout<< " Current bytes is " << (host_heap_bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current heap is " << (host_heap_size/(1024*1024)) <<"MB"<<std::endl; std::cout<< " Current heap is " << (host_heap_size/(1024*1024)) <<"MB"<<std::endl;
GRID_ASSERT(host_heap_bytes<host_heap_size); assert(host_heap_bytes<host_heap_size);
} }
return ptr; return ptr;
} }
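HostBufferMalloc and its shm twin ShmBufferMalloc behave as bump allocators over pre-sized heaps: carve 'bytes' off a high-water mark, never free individually, reset wholesale once the exchange is done. A minimal sketch of the pattern (hypothetical struct, not the Grid class layout):

#include <cstddef>

struct BumpHeap {
  char  *base;        // start of the pre-allocated heap
  size_t size;        // total capacity in bytes
  size_t used = 0;    // high-water mark

  void *alloc(size_t bytes) {
    void *ptr = base + used;
    used += bytes;
    // The real code prints the diagnostics shown above and asserts
    // used < size instead of growing the heap.
    return ptr;
  }
  void free_all() { used = 0; }   // analogue of HostBufferFreeAll()
};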
@@ -100,7 +100,7 @@ void *SharedMemory::ShmBufferMalloc(size_t bytes){
std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl; std::cout<< " Current alloc is " << (bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current bytes is " << (heap_bytes/(1024*1024)) <<"MB"<<std::endl; std::cout<< " Current bytes is " << (heap_bytes/(1024*1024)) <<"MB"<<std::endl;
std::cout<< " Current heap is " << (heap_size/(1024*1024)) <<"MB"<<std::endl; std::cout<< " Current heap is " << (heap_size/(1024*1024)) <<"MB"<<std::endl;
GRID_ASSERT(heap_bytes<heap_size); assert(heap_bytes<heap_size);
} }
//std::cerr << "ShmBufferMalloc "<<std::hex<< ptr<<" - "<<((uint64_t)ptr+bytes)<<std::dec<<std::endl; //std::cerr << "ShmBufferMalloc "<<std::hex<< ptr<<" - "<<((uint64_t)ptr+bytes)<<std::dec<<std::endl;
return ptr; return ptr;
@@ -127,13 +127,13 @@ void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmD
if ( str ) { if ( str ) {
std::vector<int> IntShmDims; std::vector<int> IntShmDims;
GridCmdOptionIntVector(std::string(str),IntShmDims); GridCmdOptionIntVector(std::string(str),IntShmDims);
GRID_ASSERT(IntShmDims.size() == WorldDims.size()); assert(IntShmDims.size() == WorldDims.size());
long ShmSize = 1; long ShmSize = 1;
for (int dim=0;dim<WorldDims.size();dim++) { for (int dim=0;dim<WorldDims.size();dim++) {
ShmSize *= (ShmDims[dim] = IntShmDims[dim]); ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
GRID_ASSERT(divides(ShmDims[dim],WorldDims[dim])); assert(divides(ShmDims[dim],WorldDims[dim]));
} }
GRID_ASSERT(ShmSize == WorldShmSize); assert(ShmSize == WorldShmSize);
return; return;
} }

@@ -46,40 +46,8 @@ NAMESPACE_BEGIN(Grid);
#if defined (GRID_COMMS_MPI3) #if defined (GRID_COMMS_MPI3)
typedef MPI_Comm Grid_MPI_Comm; typedef MPI_Comm Grid_MPI_Comm;
typedef MPI_Request MpiCommsRequest_t;
#ifdef ACCELERATOR_AWARE_MPI
typedef MPI_Request CommsRequest_t; typedef MPI_Request CommsRequest_t;
#else #else
/*
* Enable state transitions as each packet flows.
*/
enum PacketType_t {
FaceGather,
InterNodeXmit,
InterNodeRecv,
IntraNodeXmit,
IntraNodeRecv,
InterNodeXmitISend,
InterNodeReceiveHtoD
};
/*
*Package arguments needed for various actions along packet flow
*/
typedef struct {
PacketType_t PacketType;
void *host_buf;
void *device_buf;
int dest;
int tag;
int commdir;
unsigned long bytes;
acceleratorEvent_t ev;
MpiCommsRequest_t req;
} CommsRequest_t;
#endif
#else
typedef int MpiCommsRequest_t;
typedef int CommsRequest_t; typedef int CommsRequest_t;
typedef int Grid_MPI_Comm; typedef int Grid_MPI_Comm;
#endif #endif
@@ -137,7 +105,7 @@ public:
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
static void SharedMemoryAllocate(uint64_t bytes, int flags); static void SharedMemoryAllocate(uint64_t bytes, int flags);
static void SharedMemoryFree(void); static void SharedMemoryFree(void);
// static void SharedMemoryCopy(void *dest,void *src,size_t bytes); static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
static void SharedMemoryZero(void *dest,size_t bytes); static void SharedMemoryZero(void *dest,size_t bytes);
}; };

@@ -42,7 +42,6 @@ Author: Christoph Lehner <christoph@lhnr.de>
#ifdef ACCELERATOR_AWARE_MPI #ifdef ACCELERATOR_AWARE_MPI
#define GRID_SYCL_LEVEL_ZERO_IPC #define GRID_SYCL_LEVEL_ZERO_IPC
#define SHM_SOCKETS #define SHM_SOCKETS
#else
#endif #endif
#include <syscall.h> #include <syscall.h>
#endif #endif
@@ -67,7 +66,7 @@ public:
{ {
int errnum; int errnum;
sock = socket(AF_UNIX, SOCK_DGRAM, 0); GRID_ASSERT(sock>0); sock = socket(AF_UNIX, SOCK_DGRAM, 0); assert(sock>0);
struct sockaddr_un sa_un = { 0 }; struct sockaddr_un sa_un = { 0 };
sa_un.sun_family = AF_UNIX; sa_un.sun_family = AF_UNIX;
@@ -158,7 +157,7 @@ public:
/*Construct from an MPI communicator*/ /*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm) void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
{ {
GRID_ASSERT(_ShmSetup==0); assert(_ShmSetup==0);
WorldComm = comm; WorldComm = comm;
MPI_Comm_rank(WorldComm,&WorldRank); MPI_Comm_rank(WorldComm,&WorldRank);
MPI_Comm_size(WorldComm,&WorldSize); MPI_Comm_size(WorldComm,&WorldSize);
@@ -184,7 +183,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
// WorldNodes // WorldNodes
WorldNodes = WorldSize/WorldShmSize; WorldNodes = WorldSize/WorldShmSize;
GRID_ASSERT( (WorldNodes * WorldShmSize) == WorldSize ); assert( (WorldNodes * WorldShmSize) == WorldSize );
// FIXME: Check all WorldShmSize are the same ? // FIXME: Check all WorldShmSize are the same ?
@@ -209,7 +208,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
MyGroup.resize(WorldShmSize); MyGroup.resize(WorldShmSize);
for(int rank=0;rank<WorldSize;rank++){ for(int rank=0;rank<WorldSize;rank++){
if(WorldShmRanks[rank]!=MPI_UNDEFINED){ if(WorldShmRanks[rank]!=MPI_UNDEFINED){
GRID_ASSERT(g<WorldShmSize); assert(g<WorldShmSize);
MyGroup[g++] = rank; MyGroup[g++] = rank;
} }
} }
@@ -225,7 +224,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
// global sum leaders over comm world // global sum leaders over comm world
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,WorldComm); int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,WorldComm);
GRID_ASSERT(ierr==0); assert(ierr==0);
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
// find the group leaders world rank // find the group leaders world rank
@@ -246,7 +245,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
WorldNode=g; WorldNode=g;
} }
} }
GRID_ASSERT(WorldNode!=-1); assert(WorldNode!=-1);
_ShmSetup=1; _ShmSetup=1;
} }
// Gray encode support // Gray encode support
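The Gray encode helpers elided by the diff here are the standard binary-reflected construction; a sketch of what they presumably provide for the hypercube mapping below:

// Standard binary-reflected Gray code: consecutive integers differ in one bit,
// so lexicographic neighbours map to hypercube neighbours.
inline int BinaryToGray(int binary) { return binary ^ (binary >> 1); }

// Sketch of Log2Size as used below: log2(size) when size is an exact power
// of two no larger than 2^maxlog2, otherwise -1 (the caller asserts on -1).
inline int Log2Size(int size, int maxlog2)
{
  for (int i = 0; i <= maxlog2; i++)
    if ((1 << i) == size) return i;
  return -1;
}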
@@ -288,7 +287,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
// Assert power of two shm_size. // Assert power of two shm_size.
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE); int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
GRID_ASSERT(log2size != -1); assert(log2size != -1);
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Identify the hypercube coordinate of this node using hostname // Identify the hypercube coordinate of this node using hostname
@@ -309,7 +308,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
// Parse ICE-XA hostname to get hypercube location // Parse ICE-XA hostname to get hypercube location
gethostname(name,namelen); gethostname(name,namelen);
int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ; int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
GRID_ASSERT(nscan==3); assert(nscan==3);
int nlo = N%9; int nlo = N%9;
int nhi = N/9; int nhi = N/9;
@@ -333,8 +332,8 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////
MPI_Bcast(&rootcoor, sizeof(rootcoor), MPI_BYTE, 0, WorldComm); MPI_Bcast(&rootcoor, sizeof(rootcoor), MPI_BYTE, 0, WorldComm);
hypercoor=hypercoor-rootcoor; hypercoor=hypercoor-rootcoor;
GRID_ASSERT(hypercoor<WorldSize); assert(hypercoor<WorldSize);
GRID_ASSERT(hypercoor>=0); assert(hypercoor>=0);
////////////////////////////////////// //////////////////////////////////////
// Printing // Printing
@@ -382,7 +381,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
for(int i=0;i<ndimension;i++){ for(int i=0;i<ndimension;i++){
Nprocessors*=processors[i]; Nprocessors*=processors[i];
} }
GRID_ASSERT(WorldSize==Nprocessors); assert(WorldSize==Nprocessors);
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Establish mapping between lexico physics coord and WorldRank // Establish mapping between lexico physics coord and WorldRank
@@ -401,7 +400,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
// Build the new communicator // Build the new communicator
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm); int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM) void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
{ {
@@ -431,8 +430,7 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
for(int i=0;i<ndimension;i++){ for(int i=0;i<ndimension;i++){
Nprocessors*=processors[i]; Nprocessors*=processors[i];
} }
// std::cerr << " WorldSize "<<WorldSize << " Nprocessors "<<Nprocessors<<" "<<processors<<std::endl; assert(WorldSize==Nprocessors);
GRID_ASSERT(WorldSize==Nprocessors);
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Establish mapping between lexico physics coord and WorldRank // Establish mapping between lexico physics coord and WorldRank
@@ -448,7 +446,7 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
// Build the new communicator // Build the new communicator
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm); int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
// SHMGET // SHMGET
@@ -457,8 +455,8 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl; std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
GRID_ASSERT(_ShmSetup==1); assert(_ShmSetup==1);
GRID_ASSERT(_ShmAlloc==0); assert(_ShmAlloc==0);
////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared windows for our group // allocate the shared windows for our group
@@ -519,8 +517,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
void * ShmCommBuf ; void * ShmCommBuf ;
GRID_ASSERT(_ShmSetup==1); assert(_ShmSetup==1);
GRID_ASSERT(_ShmAlloc==0); assert(_ShmAlloc==0);
////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the pointer array for shared windows for our group // allocate the pointer array for shared windows for our group
@@ -539,8 +537,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// Each MPI rank should allocate our own buffer // Each MPI rank should allocate our own buffer
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef ACCELERATOR_AWARE_MPI #ifndef ACCELERATOR_AWARE_MPI
// printf("Host buffer allocate for GPU non-aware MPI\n"); HostCommBuf= malloc(bytes);
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
#endif #endif
ShmCommBuf = acceleratorAllocDevice(bytes); ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) { if (ShmCommBuf == (void *)NULL ) {
@@ -548,13 +545,11 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
if ( WorldRank == 0 ){ if ( WorldRank == 0 ){
std::cout << Mheader " acceleratorAllocDevice "<< bytes std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl; << "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
} }
SharedMemoryZero(ShmCommBuf,bytes); SharedMemoryZero(ShmCommBuf,bytes);
if ( WorldRank == 0 ){ std::cout<< "Setting up IPC"<<std::endl;
std::cout<< Mheader "Setting up IPC"<<std::endl;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Loop over ranks/gpu's on our node // Loop over ranks/gpu's on our node
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -574,8 +569,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_SYCL_LEVEL_ZERO_IPC #ifdef GRID_SYCL_LEVEL_ZERO_IPC
typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t; typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
auto zeDevice = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device()); auto zeDevice = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
auto zeContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context()); auto zeContext = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
ze_ipc_mem_handle_t ihandle; ze_ipc_mem_handle_t ihandle;
clone_mem_t handle; clone_mem_t handle;
@@ -585,6 +580,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
if ( err != ZE_RESULT_SUCCESS ) { if ( err != ZE_RESULT_SUCCESS ) {
std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
} }
memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int)); memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
handle.pid = getpid(); handle.pid = getpid();
@@ -629,7 +626,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
MPI_BYTE, MPI_BYTE,
r, r,
WorldShmComm); WorldShmComm);
GRID_ASSERT(ierr==0); assert(ierr==0);
} }
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
@@ -643,12 +640,12 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef SHM_SOCKETS #ifdef SHM_SOCKETS
myfd=UnixSockets::RecvFileDescriptor(); myfd=UnixSockets::RecvFileDescriptor();
#else #else
// std::cout<<"mapping seeking remote pid/fd " std::cout<<"mapping seeking remote pid/fd "
// <<handle.pid<<"/" <<handle.pid<<"/"
// <<handle.fd<<std::endl; <<handle.fd<<std::endl;
int pidfd = syscall(SYS_pidfd_open,handle.pid,0); int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
// std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n"; std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
// int myfd = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0); // int myfd = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
myfd = syscall(438,pidfd,handle.fd,0); myfd = syscall(438,pidfd,handle.fd,0);
int err_t = errno; int err_t = errno;
@@ -658,7 +655,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
assert(0); assert(0);
} }
#endif #endif
// std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n"; std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle)); memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle));
memcpy((void *)&ihandle,(void *)&myfd,sizeof(int)); memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
@@ -667,8 +664,11 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl; std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle pointer is "<<std::hex<<thisBuf<<std::dec<<std::endl;
} }
GRID_ASSERT(thisBuf!=nullptr); assert(thisBuf!=nullptr);
} }
#endif #endif
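When SHM_SOCKETS is disabled, the file descriptor behind the Level Zero IPC handle travels by pid instead: pidfd_open attaches to the exporting rank's process and pidfd_getfd (syscall 438, Linux 5.6 or later, subject to ptrace permission) duplicates the descriptor into this one. A standalone sketch of that route:

#include <sys/syscall.h>
#include <unistd.h>
#include <cstdio>

int CloneRemoteFd(pid_t remote_pid, int remote_fd)   // hypothetical helper
{
  int pidfd = (int)syscall(SYS_pidfd_open, remote_pid, 0);
  if (pidfd < 0) { perror("pidfd_open"); return -1; }
  // SYS_pidfd_getfd is 438; older libcs lack the symbolic name, which is
  // why the code above invokes syscall(438,...) directly.
  int myfd = (int)syscall(438 /* SYS_pidfd_getfd */, pidfd, remote_fd, 0);
  if (myfd < 0) perror("pidfd_getfd");
  close(pidfd);
  return myfd;
}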
#ifdef GRID_CUDA #ifdef GRID_CUDA
@@ -709,8 +709,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl; std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
GRID_ASSERT(_ShmSetup==1); assert(_ShmSetup==1);
GRID_ASSERT(_ShmAlloc==0); assert(_ShmAlloc==0);
////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared windows for our group // allocate the shared windows for our group
////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -740,14 +740,13 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0); void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
if ( ptr == (void *)MAP_FAILED ) { if ( ptr == (void *)MAP_FAILED ) {
printf("mmap %s failed\n",shm_name); printf("mmap %s failed\n",shm_name);
perror("failed mmap"); GRID_ASSERT(0); perror("failed mmap"); assert(0);
} }
GRID_ASSERT(((uint64_t)ptr&0x3F)==0); assert(((uint64_t)ptr&0x3F)==0);
close(fd); close(fd);
WorldShmCommBufs[r] =ptr; WorldShmCommBufs[r] =ptr;
// std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl; // std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
} }
std::cout<< Mheader " Intra-node IPC setup is complete "<<std::endl;
_ShmAlloc=1; _ShmAlloc=1;
_ShmAllocBytes = bytes; _ShmAllocBytes = bytes;
}; };
@@ -757,8 +756,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl; std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
GRID_ASSERT(_ShmSetup==1); assert(_ShmSetup==1);
GRID_ASSERT(_ShmAlloc==0); assert(_ShmAlloc==0);
////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared windows for our group // allocate the shared windows for our group
////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -769,7 +768,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// Hugetlbf and others map filesystems as mappable huge pages // Hugetlbf and others map filesystems as mappable huge pages
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
char shm_name [NAME_MAX]; char shm_name [NAME_MAX];
GRID_ASSERT(WorldShmSize == 1); assert(WorldShmSize == 1);
for(int r=0;r<WorldShmSize;r++){ for(int r=0;r<WorldShmSize;r++){
int fd=-1; int fd=-1;
@@ -783,9 +782,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0); void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
if ( ptr == (void *)MAP_FAILED ) { if ( ptr == (void *)MAP_FAILED ) {
printf("mmap %s failed\n",shm_name); printf("mmap %s failed\n",shm_name);
perror("failed mmap"); GRID_ASSERT(0); perror("failed mmap"); assert(0);
} }
GRID_ASSERT(((uint64_t)ptr&0x3F)==0); assert(((uint64_t)ptr&0x3F)==0);
close(fd); close(fd);
WorldShmCommBufs[r] =ptr; WorldShmCommBufs[r] =ptr;
// std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl; // std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
@@ -804,8 +803,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl; std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
GRID_ASSERT(_ShmSetup==1); assert(_ShmSetup==1);
GRID_ASSERT(_ShmAlloc==0); assert(_ShmAlloc==0);
MPI_Barrier(WorldShmComm); MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize); WorldShmCommBufs.resize(WorldShmSize);
@@ -836,7 +835,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
perror("failed mmap"); perror("failed mmap");
assert(0); assert(0);
} }
GRID_ASSERT(((uint64_t)ptr&0x3F)==0); assert(((uint64_t)ptr&0x3F)==0);
WorldShmCommBufs[r] =ptr; WorldShmCommBufs[r] =ptr;
close(fd); close(fd);
@@ -857,8 +856,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
if ( fd<0 ) { perror("failed shm_open"); assert(0); } if ( fd<0 ) { perror("failed shm_open"); assert(0); }
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if ( ptr == MAP_FAILED ) { perror("failed mmap"); GRID_ASSERT(0); } if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
GRID_ASSERT(((uint64_t)ptr&0x3F)==0); assert(((uint64_t)ptr&0x3F)==0);
WorldShmCommBufs[r] =ptr; WorldShmCommBufs[r] =ptr;
close(fd); close(fd);
@@ -881,14 +880,14 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
bzero(dest,bytes); bzero(dest,bytes);
#endif #endif
} }
//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes) void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
//{ {
//#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
// acceleratorCopyToDevice(src,dest,bytes); acceleratorCopyToDevice(src,dest,bytes);
//#else #else
// bcopy(src,dest,bytes); bcopy(src,dest,bytes);
//#endif #endif
//} }
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// Global shared functionality finished // Global shared functionality finished
// Now move to per communicator functionality // Now move to per communicator functionality
@@ -915,7 +914,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
// Map ShmRank to WorldShmRank and use the right buffer // Map ShmRank to WorldShmRank and use the right buffer
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
GRID_ASSERT (GlobalSharedMemory::ShmAlloc()==1); assert (GlobalSharedMemory::ShmAlloc()==1);
heap_size = GlobalSharedMemory::ShmAllocBytes(); heap_size = GlobalSharedMemory::ShmAllocBytes();
for(int r=0;r<ShmSize;r++){ for(int r=0;r<ShmSize;r++){
@@ -924,7 +923,6 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm); MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr]; ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
// std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
} }
ShmBufferFreeAll(); ShmBufferFreeAll();
@@ -977,18 +975,19 @@ void SharedMemory::SharedMemoryTest(void)
check[0]=GlobalSharedMemory::WorldNode; check[0]=GlobalSharedMemory::WorldNode;
check[1]=r; check[1]=r;
check[2]=magic; check[2]=magic;
acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t)); GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
} }
} }
ShmBarrier(); ShmBarrier();
for(uint64_t r=0;r<ShmSize;r++){ for(uint64_t r=0;r<ShmSize;r++){
acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
GRID_ASSERT(check[0]==GlobalSharedMemory::WorldNode);
GRID_ASSERT(check[1]==r);
GRID_ASSERT(check[2]==magic);
}
ShmBarrier(); ShmBarrier();
std::cout << GridLogDebug << " SharedMemoryTest has passed "<<std::endl; GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
ShmBarrier();
assert(check[0]==GlobalSharedMemory::WorldNode);
assert(check[1]==r);
assert(check[2]==magic);
ShmBarrier();
}
} }
void *SharedMemory::ShmBuffer(int rank) void *SharedMemory::ShmBuffer(int rank)
@@ -1003,14 +1002,12 @@ void *SharedMemory::ShmBuffer(int rank)
void *SharedMemory::ShmBufferTranslate(int rank,void * local_p) void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
{ {
int gpeer = ShmRanks[rank]; int gpeer = ShmRanks[rank];
GRID_ASSERT(gpeer!=ShmRank); // never send to self assert(gpeer!=ShmRank); // never send to self
// std::cout << "ShmBufferTranslate for rank " << rank<<" peer "<<gpeer<<std::endl;
if (gpeer == MPI_UNDEFINED){ if (gpeer == MPI_UNDEFINED){
return NULL; return NULL;
} else { } else {
uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank]; uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset; uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
// std::cout << "ShmBufferTranslate : local,offset,remote "<<std::hex<<local_p<<" "<<offset<<" "<<remote<<std::dec<<std::endl;
return (void *) remote; return (void *) remote;
} }
} }
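Both versions of ShmBufferTranslate above depend on the same invariant: every rank maps its peers' shared windows, so a pointer into the local window converts to the peer's copy by carrying the byte offset across. A minimal standalone sketch of that arithmetic, with malloc'd blocks standing in for the mmap'd windows (all names here are illustrative, not Grid API):

#include <cassert>
#include <cstdint>
#include <cstdlib>

// Translate a pointer inside my window to the same offset in a peer's window.
void *translate(void *local_p, void *my_base, void *peer_base) {
  uint64_t offset = (uint64_t)local_p - (uint64_t)my_base;
  return (void *)((uint64_t)peer_base + offset);
}

int main(void) {
  char *mine = (char *)malloc(1024);   // stands in for ShmCommBufs[ShmRank]
  char *peer = (char *)malloc(1024);   // stands in for ShmCommBufs[gpeer]
  void *p = mine + 128;                // some object in my window
  assert(translate(p, mine, peer) == (void *)(peer + 128));
  free(mine); free(peer);
  return 0;
}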


@@ -34,7 +34,7 @@ NAMESPACE_BEGIN(Grid);
/*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
{
- GRID_ASSERT(_ShmSetup==0);
+ assert(_ShmSetup==0);
WorldComm = 0;
WorldRank = 0;
WorldSize = 1;
@@ -62,8 +62,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << header "SharedMemoryAllocate "<< bytes<< " GPU implementation "<<std::endl;
void * ShmCommBuf ;
- GRID_ASSERT(_ShmSetup==1);
- GRID_ASSERT(_ShmAlloc==0);
+ assert(_ShmSetup==1);
+ assert(_ShmAlloc==0);
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer
@@ -92,8 +92,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
void * ShmCommBuf ;
- GRID_ASSERT(_ShmSetup==1);
- GRID_ASSERT(_ShmAlloc==0);
+ assert(_ShmSetup==1);
+ assert(_ShmAlloc==0);
int mmap_flag =0;
#ifdef MAP_ANONYMOUS
mmap_flag = mmap_flag| MAP_SHARED | MAP_ANONYMOUS;
@@ -122,17 +122,17 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{
acceleratorMemSet(dest,0,bytes);
}
- //void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
- //{
- // acceleratorCopyToDevice(src,dest,bytes);
- //}
+ void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
+ {
+ acceleratorCopyToDevice(src,dest,bytes);
+ }
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality
////////////////////////////////////////////////////////
void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
{
- GRID_ASSERT(GlobalSharedMemory::ShmAlloc()==1);
+ assert(GlobalSharedMemory::ShmAlloc()==1);
ShmRanks.resize(1);
ShmCommBufs.resize(1);
ShmRanks[0] = 0;


@@ -51,6 +51,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#endif
NAMESPACE_BEGIN(Grid);
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto Cshift(const Expression &expr,int dim,int shift) -> decltype(closure(expr))
{


@@ -30,11 +30,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
extern std::vector<std::pair<int,int> > Cshift_table;
- extern deviceVector<std::pair<int,int> > Cshift_table_device;
+ extern commVector<std::pair<int,int> > Cshift_table_device;
inline std::pair<int,int> *MapCshiftTable(void)
{
// GPU version
+ #ifdef ACCELERATOR_CSHIFT
uint64_t sz=Cshift_table.size();
if (Cshift_table_device.size()!=sz ) {
Cshift_table_device.resize(sz);
@@ -44,13 +45,16 @@ inline std::pair<int,int> *MapCshiftTable(void)
sizeof(Cshift_table[0])*sz);
return &Cshift_table_device[0];
+ #else
+ return &Cshift_table[0];
+ #endif
// CPU version use identify map
}
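Either way MapCshiftTable resolves, the kernels that follow consume the same thing: a precomputed list of (destination, source) index pairs replayed by a flat parallel loop. A reduced CPU-only model of that table-driven copy, under the assumption that the table has already been built:

#include <cassert>
#include <utility>
#include <vector>

// Replay a precomputed (dst,src) index table: the core of the Cshift kernels.
template <class T>
void apply_table(std::vector<T> &dst, const std::vector<T> &src,
                 const std::vector<std::pair<int, int>> &table) {
  for (auto &e : table) dst[e.first] = src[e.second];
}

int main(void) {
  std::vector<int> src = {10, 11, 12, 13}, dst(4, 0);
  // A shift by one site within a plane, expressed as index pairs.
  std::vector<std::pair<int, int>> table = {{0,1},{1,2},{2,3},{3,0}};
  apply_table(dst, src, table);
  assert(dst[0] == 11 && dst[3] == 10);
  return 0;
}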
///////////////////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split
///////////////////////////////////////////////////////////////////
template<class vobj> void
- Gather_plane_simple (const Lattice<vobj> &rhs,deviceVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
+ Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
{
int rd = rhs.Grid()->_rdimensions[dimension];
@@ -90,10 +94,17 @@ Gather_plane_simple (const Lattice<vobj> &rhs,deviceVector<vobj> &buffer,int dim
{
auto buffer_p = & buffer[0];
auto table = MapCshiftTable();
+ #ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
});
+ #else
+ autoView(rhs_v , rhs, CpuRead);
+ thread_for(i,ent,{
+ buffer_p[table[i].first]=rhs_v[table[i].second];
+ });
+ #endif
}
}
@@ -118,6 +129,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
int n1=rhs.Grid()->_slice_stride[dimension];
if ( cbmask ==0x3){
+ #ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(nn,e1*e2,1,{
int n = nn%e1;
@@ -128,10 +140,21 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
});
+ #else
+ autoView(rhs_v , rhs, CpuRead);
+ thread_for2d(n,e1,b,e2,{
+ int o = n*n1;
+ int offset = b+n*e2;
+ vobj temp =rhs_v[so+o+b];
+ extract<vobj>(temp,pointers,offset);
+ });
+ #endif
} else {
Coordinate rdim=rhs.Grid()->_rdimensions;
Coordinate cdm =rhs.Grid()->_checker_dim_mask;
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
+ #ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(nn,e1*e2,1,{
int n = nn%e1;
@@ -152,13 +175,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
extract<vobj>(temp,pointers,offset);
}
});
+ #else
+ autoView(rhs_v , rhs, CpuRead);
+ thread_for2d(n,e1,b,e2,{
+ Coordinate coor;
+ int o=n*n1;
+ int oindex = o+b;
+ int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
+ int ocb=1<<cb;
+ int offset = b+n*e2;
+ if ( ocb & cbmask ) {
+ vobj temp =rhs_v[so+o+b];
+ extract<vobj>(temp,pointers,offset);
+ }
+ });
+ #endif
}
}
//////////////////////////////////////////////////////
// Scatter for when there is no need to SIMD split
//////////////////////////////////////////////////////
- template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<vobj> &buffer, int dimension,int plane,int cbmask)
+ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask)
{
int rd = rhs.Grid()->_rdimensions[dimension];
@@ -202,10 +245,17 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<
{
auto buffer_p = & buffer[0];
auto table = MapCshiftTable();
- autoView( rhs_v, rhs, AcceleratorWriteDiscard);
+ #ifdef ACCELERATOR_CSHIFT
+ autoView( rhs_v, rhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
});
+ #else
+ autoView( rhs_v, rhs, CpuWrite);
+ thread_for(i,ent,{
+ rhs_v[table[i].first]=buffer_p[table[i].second];
+ });
+ #endif
}
}
@@ -228,7 +278,8 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
if(cbmask ==0x3 ) {
int _slice_stride = rhs.Grid()->_slice_stride[dimension];
int _slice_block = rhs.Grid()->_slice_block[dimension];
- autoView( rhs_v , rhs, AcceleratorWriteDiscard);
+ #ifdef ACCELERATOR_CSHIFT
+ autoView( rhs_v , rhs, AcceleratorWrite);
accelerator_for(nn,e1*e2,1,{
int n = nn%e1;
int b = nn/e1;
@@ -236,13 +287,21 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
int offset = b+n*_slice_block;
merge(rhs_v[so+o+b],pointers,offset);
});
+ #else
+ autoView( rhs_v , rhs, CpuWrite);
+ thread_for2d(n,e1,b,e2,{
+ int o = n*_slice_stride;
+ int offset = b+n*_slice_block;
+ merge(rhs_v[so+o+b],pointers,offset);
+ });
+ #endif
} else {
// Case of SIMD split AND checker dim cannot currently be hit, except in
// Test_cshift_red_black code.
- std::cout << "Scatter_plane merge GRID_ASSERT(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
+ std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
- GRID_ASSERT(0); // This will fail if hit on GPU
+ assert(0); // This will fail if hit on GPU
autoView( rhs_v, rhs, CpuWrite);
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
@@ -301,11 +360,19 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
{
auto table = MapCshiftTable();
+ #ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
- autoView(lhs_v , lhs, AcceleratorWriteDiscard);
+ autoView(lhs_v , lhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
});
+ #else
+ autoView(rhs_v , rhs, CpuRead);
+ autoView(lhs_v , lhs, CpuWrite);
+ thread_for(i,ent,{
+ lhs_v[table[i].first]=rhs_v[table[i].second];
+ });
+ #endif
}
}
@@ -345,11 +412,19 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
{
auto table = MapCshiftTable();
+ #ifdef ACCELERATOR_CSHIFT
autoView( rhs_v, rhs, AcceleratorRead);
autoView( lhs_v, lhs, AcceleratorWrite);
accelerator_for(i,ent,1,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
+ #else
+ autoView( rhs_v, rhs, CpuRead);
+ autoView( lhs_v, lhs, CpuWrite);
+ thread_for(i,ent,{
+ permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
+ });
+ #endif
}
}
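Gather_plane_extract and Scatter_plane_merge above are inverses built on Grid's extract/merge primitives: extract fans the SIMD lanes of one vector object out to per-lane buffers, and merge reassembles a vector object from those lanes. A scalar stand-in for the pair, with a plain std::array playing the role of a vobj (purely illustrative):

#include <array>
#include <cassert>

constexpr int Nsimd = 4;
using Vec = std::array<double, Nsimd>;  // stands in for a vobj of Nsimd lanes

// Split lanes of v out to per-lane buffers at position 'offset'.
void extract(const Vec &v, std::array<double *, Nsimd> &ptrs, int offset) {
  for (int l = 0; l < Nsimd; l++) ptrs[l][offset] = v[l];
}
// Rebuild a vector object from the per-lane buffers.
void merge(Vec &v, std::array<double *, Nsimd> &ptrs, int offset) {
  for (int l = 0; l < Nsimd; l++) v[l] = ptrs[l][offset];
}

int main(void) {
  double lane0[1], lane1[1], lane2[1], lane3[1];
  std::array<double *, Nsimd> ptrs = {lane0, lane1, lane2, lane3};
  Vec in = {0.5, 1.5, 2.5, 3.5}, out = {};
  extract(in, ptrs, 0);
  merge(out, ptrs, 0);
  assert(in == out);
  return 0;
}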


@@ -29,13 +29,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef _GRID_CSHIFT_MPI_H_
#define _GRID_CSHIFT_MPI_H_
NAMESPACE_BEGIN(Grid);
- #ifdef GRID_CHECKSUM_COMMS
- extern uint64_t checksum_index;
- #endif
- const int Cshift_verbose=0;
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
{
typedef typename vobj::vector_type vector_type;
@@ -69,7 +65,7 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
Cshift_comms(ret,rhs,dimension,shift);
}
t1=usecond();
- if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
+ // std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
return ret;
}
@@ -98,7 +94,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
- // std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
+ //std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if ( sshift[0] == sshift[1] ) {
//std::cout << "Single pass Cshift_comms" <<std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
@@ -108,6 +104,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
}
}
+ #define ACCELERATOR_CSHIFT_NO_COPY
+ #ifdef ACCELERATOR_CSHIFT_NO_COPY
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
typedef typename vobj::vector_type vector_type;
@@ -121,19 +119,14 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int pd = rhs.Grid()->_processors[dimension];
int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
- GRID_ASSERT(simd_layout==1);
- GRID_ASSERT(comm_dim==1);
- GRID_ASSERT(shift>=0);
- GRID_ASSERT(shift<fd);
+ assert(simd_layout==1);
+ assert(comm_dim==1);
+ assert(shift>=0);
+ assert(shift<fd);
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
- static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
- static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
- #ifndef ACCELERATOR_AWARE_MPI
- int pad = (8 + sizeof(vobj) - 1) / sizeof(vobj);
- static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size+pad);
- static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size+pad);
- #endif
+ static cshiftVector<vobj> send_buf; send_buf.resize(buffer_size);
+ static cshiftVector<vobj> recv_buf; recv_buf.resize(buffer_size);
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
@@ -148,11 +141,9 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) {
- FlightRecorder::StepLog("Cshift_Copy_plane");
tcopy-=usecond();
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
tcopy+=usecond();
- FlightRecorder::StepLog("Cshift_Copy_plane_complete");
} else {
int words = buffer_size;
@@ -160,84 +151,39 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int bytes = words * sizeof(vobj);
- FlightRecorder::StepLog("Cshift_Gather_plane");
tgather-=usecond();
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
tgather+=usecond();
- FlightRecorder::StepLog("Cshift_Gather_plane_complete");
// int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
- grid->Barrier();
+ // grid->Barrier();
- FlightRecorder::StepLog("Cshift_SendRecv");
- #ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
- #else
- // bouncy bouncy
- acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
- #ifdef GRID_CHECKSUM_COMMS
- GRID_ASSERT(bytes % 8 == 0);
- checksum_index++;
- uint64_t xsum = checksum_gpu((uint64_t*)&send_buf[0], bytes / 8) ^ (1 + checksum_index);
- *(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = xsum;
- bytes += 8;
- #endif
- grid->SendToRecvFrom((void *)&hsend_buf[0],
- xmit_to_rank,
- (void *)&hrecv_buf[0],
- recv_from_rank,
- bytes);
- #ifdef GRID_CHECKSUM_COMMS
- bytes -= 8;
- acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
- uint64_t expected_cs = *(uint64_t*)(((char*)&hrecv_buf[0]) + bytes);
- uint64_t computed_cs = checksum_gpu((uint64_t*)&recv_buf[0], bytes / 8) ^ (1 + checksum_index);
- std::cout << GridLogComms<< " Cshift: "
- <<" dim"<<dimension
- <<" shift "<<shift
- << " rank "<< grid->ThisRank()
- <<" Coor "<<grid->ThisProcessorCoor()
- <<" send "<<xsum<<" to "<<xmit_to_rank
- <<" recv "<<computed_cs<<" from "<<recv_from_rank
- <<std::endl;
- GRID_ASSERT(expected_cs == computed_cs);
- #else
- acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
- #endif
- #endif
- FlightRecorder::StepLog("Cshift_SendRecv_complete");
xbytes+=bytes;
- grid->Barrier();
+ // grid->Barrier();
tcomms+=usecond();
- FlightRecorder::StepLog("Cshift_barrier_complete");
tscatter-=usecond();
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
tscatter+=usecond();
}
}
- if (Cshift_verbose){
+ /*
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
- }
+ */
}
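The GRID_CHECKSUM_COMMS branch deleted above guards host-staged messages: it folds the send buffer into a 64-bit checksum, XORs in a running message index so a stale buffer cannot match, appends the word behind the payload, and re-verifies after the receive. checksum_gpu itself is not part of this hunk; the sketch below substitutes an illustrative XOR-fold to show only the append-and-verify protocol, not Grid's actual kernel:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Illustrative stand-in for checksum_gpu: fold 64-bit words into one value.
static uint64_t checksum(const uint64_t *w, size_t nword) {
  uint64_t x = 0;
  for (size_t i = 0; i < nword; i++) x ^= w[i] + 0x9e3779b97f4a7c15ULL * i;
  return x;
}

int main(void) {
  uint64_t index = 0;                       // per-process message counter
  std::vector<uint64_t> payload(16, 42);
  size_t bytes = payload.size() * 8;

  // Sender: append checksum ^ (1+index) after the payload.
  std::vector<char> wire(bytes + 8);
  index++;
  uint64_t xsum = checksum(payload.data(), payload.size()) ^ (1 + index);
  memcpy(wire.data(), payload.data(), bytes);
  memcpy(wire.data() + bytes, &xsum, 8);

  // Receiver: recompute over the payload and compare with the trailer word.
  uint64_t expected, computed;
  memcpy(&expected, wire.data() + bytes, 8);
  computed = checksum((uint64_t *)wire.data(), payload.size()) ^ (1 + index);
  assert(expected == computed);
  return 0;
}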
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@@ -259,10 +205,10 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
- GRID_ASSERT(comm_dim==1);
- GRID_ASSERT(simd_layout==2);
- GRID_ASSERT(shift>=0);
- GRID_ASSERT(shift<fd);
+ assert(comm_dim==1);
+ assert(simd_layout==2);
+ assert(shift>=0);
+ assert(shift<fd);
RealD tcopy=0.0;
RealD tgather=0.0;
@@ -278,8 +224,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type);
- static std::vector<deviceVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
- static std::vector<deviceVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
+ static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
+ static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
@@ -287,18 +233,6 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
- #ifndef ACCELERATOR_AWARE_MPI
- #ifdef GRID_CHECKSUM_COMMS
- buffer_size += (8 + sizeof(vobj) - 1) / sizeof(vobj);
- #endif
- static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
- static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
- #ifdef GRID_CHECKSUM_COMMS
- buffer_size -= (8 + sizeof(vobj) - 1) / sizeof(vobj);
- #endif
- #endif
int bytes = buffer_size*sizeof(scalar_object);
@@ -341,60 +275,24 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
if (nbr_ic) nbr_lane|=inner_bit;
- GRID_ASSERT (sx == nbr_ox);
+ assert (sx == nbr_ox);
if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
- grid->Barrier();
+ // grid->Barrier();
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
recv_buf_extract_mpi = &recv_buf_extract[i][0];
- #ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank,
(void *)recv_buf_extract_mpi,
recv_from_rank,
bytes);
- #else
- // bouncy bouncy
- acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
- #ifdef GRID_CHECKSUM_COMMS
- assert(bytes % 8 == 0);
- checksum_index++;
- uint64_t xsum = checksum_gpu((uint64_t*)send_buf_extract_mpi, bytes / 8) ^ (1 + checksum_index);
- *(uint64_t*)(((char*)&hsend_buf[0]) + bytes) = xsum;
- bytes += 8;
- #endif
- grid->SendToRecvFrom((void *)&hsend_buf[0],
- xmit_to_rank,
- (void *)&hrecv_buf[0],
- recv_from_rank,
- bytes);
- #ifdef GRID_CHECKSUM_COMMS
- bytes -= 8;
- acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
- uint64_t expected_cs = *(uint64_t*)(((char*)&hrecv_buf[0]) + bytes);
- uint64_t computed_cs = checksum_gpu((uint64_t*)recv_buf_extract_mpi, bytes / 8) ^ (1 + checksum_index);
- std::cout << GridLogComms<< " Cshift_comms_simd: "
- <<" dim"<<dimension
- <<" shift "<<shift
- << " rank "<< grid->ThisRank()
- <<" Coor "<<grid->ThisProcessorCoor()
- <<" send "<<xsum<<" to "<<xmit_to_rank
- <<" recv "<<computed_cs<<" from "<<recv_from_rank
- <<std::endl;
- assert(expected_cs == computed_cs);
- #else
- acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
- #endif
- #endif
xbytes+=bytes;
- grid->Barrier();
+ // grid->Barrier();
tcomms+=usecond();
rpointers[i] = &recv_buf_extract[i][0];
@@ -407,15 +305,242 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
tscatter+=usecond();
}
- if(Cshift_verbose){
+ /*
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
- }
+ */
}
#else
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid=rhs.Grid();
Lattice<vobj> temp(rhs.Grid());
int fd = rhs.Grid()->_fdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
int pd = rhs.Grid()->_processors[dimension];
int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
vobj *send_buf;
vobj *recv_buf;
{
grid->ShmBufferFreeAll();
size_t bytes = buffer_size*sizeof(vobj);
send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
} }
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) {
tcopy-=usecond();
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
tcopy+=usecond();
} else {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
tgather-=usecond();
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
tgather+=usecond();
// int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
// grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
xbytes+=bytes;
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
// grid->Barrier();
tcomms+=usecond();
tscatter-=usecond();
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
tscatter+=usecond();
}
}
/*
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
*/
}
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GridBase *grid=rhs.Grid();
const int Nsimd = grid->Nsimd();
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
int pd = grid->_processors[dimension];
int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ;
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
int permute_type=grid->PermuteType(dimension);
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type);
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
{
size_t bytes = sizeof(scalar_object)*buffer_size;
grid->ShmBufferFreeAll();
send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
}
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
int bytes = buffer_size*sizeof(scalar_object);
ExtractPointerArray<scalar_object> pointers(Nsimd); //
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? Odd : Even;
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
// loop over outer coord planes orthog to dim
for(int x=0;x<rd;x++){
// FIXME call local permute copy if none are offnode.
for(int i=0;i<Nsimd;i++){
pointers[i] = &send_buf_extract[i][0];
}
tgather-=usecond();
int sx = (x+sshift)%rd;
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
tgather+=usecond();
for(int i=0;i<Nsimd;i++){
int inner_bit = (Nsimd>>(permute_type+1));
int ic= (i&inner_bit)? 1:0;
int my_coor = rd*ic + x;
int nbr_coor = my_coor+sshift;
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
int nbr_ox = (nbr_coor%rd); // outer coord of peer
int nbr_lane = (i&(~inner_bit));
int recv_from_rank;
int xmit_to_rank;
if (nbr_ic) nbr_lane|=inner_bit;
assert (sx == nbr_ox);
if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
// grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank,
(void *)recv_buf_extract_mpi,
recv_from_rank,
bytes);
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
xbytes+=bytes;
// grid->Barrier();
tcomms+=usecond();
rpointers[i] = &recv_buf_extract[i][0];
} else {
rpointers[i] = &send_buf_extract[nbr_lane][0];
}
}
tscatter-=usecond();
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
tscatter+=usecond();
}
/*
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
*/
}
#endif
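The fallback Cshift_comms variants added above differ from the no-copy path only in staging order: gather into an ordinary buffer, copy device-to-device into the window obtained from ShmBufferMalloc, exchange, then copy back out before scattering. A compressed model of that round trip with stubbed transport (every function below is a stand-in, not Grid API):

#include <cassert>
#include <cstring>
#include <vector>

// Stand-ins: in Grid these are device buffers, IPC windows, and MPI ranks.
static void copyDeviceToDevice(const void *from, void *to, size_t bytes) {
  memcpy(to, from, bytes);       // models acceleratorCopyDeviceToDevice
}
static void sendToRecvFrom(void *send, void *recv, size_t bytes) {
  memcpy(recv, send, bytes);     // models grid->SendToRecvFrom, here to self
}

int main(void) {
  const size_t bytes = 64;
  std::vector<char> gathered(bytes, 7);     // result of Gather_plane_simple
  std::vector<char> shm_send(bytes), shm_recv(bytes); // ShmBufferMalloc windows
  std::vector<char> scatter_src(bytes);     // fed to Scatter_plane_simple

  copyDeviceToDevice(gathered.data(), shm_send.data(), bytes);    // stage in
  sendToRecvFrom(shm_send.data(), shm_recv.data(), bytes);        // exchange
  copyDeviceToDevice(shm_recv.data(), scatter_src.data(), bytes); // stage out
  assert(scatter_src == gathered);
  return 0;
}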
NAMESPACE_END(Grid);
#endif


@@ -1,5 +1,5 @@
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
std::vector<std::pair<int,int> > Cshift_table;
- deviceVector<std::pair<int,int> > Cshift_table_device;
+ commVector<std::pair<int,int> > Cshift_table_device;
NAMESPACE_END(Grid);


@@ -245,7 +245,7 @@ template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * =
inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf
{
if ((cb == Odd) || (cb == Even)) {
- GRID_ASSERT(cb == lat.Checkerboard());
+ assert(cb == lat.Checkerboard());
}
cb = lat.Checkerboard();
}


@@ -257,30 +257,17 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
});
}
- #define FAST_AXPY_NORM
template<class sobj,class vobj> inline
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
GRID_TRACE("axpy_norm");
- #ifdef FAST_AXPY_NORM
return axpy_norm_fast(ret,a,x,y);
- #else
- ret = a*x+y;
- RealD nn=norm2(ret);
- return nn;
- #endif
}
template<class sobj,class vobj> inline
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
GRID_TRACE("axpby_norm");
- #ifdef FAST_AXPY_NORM
return axpby_norm_fast(ret,a,b,x,y);
- #else
- ret = a*x+b*y;
- RealD nn=norm2(ret);
- return nn;
- #endif
}
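Both sides of this hunk satisfy the same contract: compute ret = a*x + y (or a*x + b*y) and return the squared norm of ret. The _fast variants fuse the update and the reduction into one traversal instead of writing ret and re-reading it for norm2. A scalar model of the fused pass:

#include <cassert>
#include <vector>

// One pass: ret[i] = a*x[i] + y[i], accumulating the squared norm as we go.
double axpy_norm_fused(std::vector<double> &ret, double a,
                       const std::vector<double> &x,
                       const std::vector<double> &y) {
  double nn = 0.0;
  for (size_t i = 0; i < ret.size(); i++) {
    ret[i] = a * x[i] + y[i];
    nn += ret[i] * ret[i];
  }
  return nn;
}

int main(void) {
  std::vector<double> x = {1, 2}, y = {3, 4}, ret(2);
  double nn = axpy_norm_fused(ret, 2.0, x, y);  // ret = {5,8}, |ret|^2 = 89
  assert(nn == 89.0);
  return 0;
}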
/// Trace product


@@ -120,12 +120,12 @@ public:
GRID_TRACE("ExpressionTemplateEval");
GridBase *egrid(nullptr);
GridFromExpression(egrid,expr);
- GRID_ASSERT(egrid!=nullptr);
+ assert(egrid!=nullptr);
conformable(this->_grid,egrid);
int cb=-1;
CBFromExpression(cb,expr);
- GRID_ASSERT( (cb==Odd) || (cb==Even));
+ assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb;
auto exprCopy = expr;
@@ -144,12 +144,12 @@ public:
GRID_TRACE("ExpressionTemplateEval");
GridBase *egrid(nullptr);
GridFromExpression(egrid,expr);
- GRID_ASSERT(egrid!=nullptr);
+ assert(egrid!=nullptr);
conformable(this->_grid,egrid);
int cb=-1;
CBFromExpression(cb,expr);
- GRID_ASSERT( (cb==Odd) || (cb==Even));
+ assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb;
auto exprCopy = expr;
@@ -168,12 +168,12 @@ public:
GRID_TRACE("ExpressionTemplateEval");
GridBase *egrid(nullptr);
GridFromExpression(egrid,expr);
- GRID_ASSERT(egrid!=nullptr);
+ assert(egrid!=nullptr);
conformable(this->_grid,egrid);
int cb=-1;
CBFromExpression(cb,expr);
- GRID_ASSERT( (cb==Odd) || (cb==Even));
+ assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb;
auto exprCopy = expr;
ExpressionViewOpen(exprCopy);
@@ -191,11 +191,11 @@ public:
Lattice(const LatticeUnaryExpression<Op,T1> & expr) {
this->_grid = nullptr;
GridFromExpression(this->_grid,expr);
- GRID_ASSERT(this->_grid!=nullptr);
+ assert(this->_grid!=nullptr);
int cb=-1;
CBFromExpression(cb,expr);
- GRID_ASSERT( (cb==Odd) || (cb==Even));
+ assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb;
resize(this->_grid->oSites());
@@ -206,11 +206,11 @@ public:
Lattice(const LatticeBinaryExpression<Op,T1,T2> & expr) {
this->_grid = nullptr;
GridFromExpression(this->_grid,expr);
- GRID_ASSERT(this->_grid!=nullptr);
+ assert(this->_grid!=nullptr);
int cb=-1;
CBFromExpression(cb,expr);
- GRID_ASSERT( (cb==Odd) || (cb==Even));
+ assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb;
resize(this->_grid->oSites());
@@ -221,11 +221,11 @@ public:
Lattice(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr) {
this->_grid = nullptr;
GridFromExpression(this->_grid,expr);
- GRID_ASSERT(this->_grid!=nullptr);
+ assert(this->_grid!=nullptr);
int cb=-1;
CBFromExpression(cb,expr);
- GRID_ASSERT( (cb==Odd) || (cb==Even));
+ assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb;
resize(this->_grid->oSites());
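Every constructor in this block runs the same two checks: the expression must yield a grid, and CBFromExpression must resolve a definite checkerboard that all leaves agree on. A minimal model of that parity-propagation rule (the enum values are illustrative):

#include <cassert>

enum { Even = 0, Odd = 1, Undefined = -1 };

// Fold one leaf's checkerboard into the running value, as CBFromExpression does.
void cb_from_leaf(int &cb, int leaf_cb) {
  if (cb == Odd || cb == Even) assert(cb == leaf_cb);  // leaves must agree
  cb = leaf_cb;
}

int main(void) {
  int cb = Undefined;
  cb_from_leaf(cb, Odd);   // first leaf fixes the parity
  cb_from_leaf(cb, Odd);   // every further leaf must match it
  assert(cb == Odd);
  return 0;
}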
@@ -237,19 +237,16 @@ public:
vobj vtmp;
vtmp = r;
#if 1
- deviceVector<vobj> vvtmp(1);
- acceleratorPut(vvtmp[0],vtmp);
- vobj *vvtmp_p = & vvtmp[0];
- auto me = View(AcceleratorWrite);
- accelerator_for(ss,me.size(),vobj::Nsimd(),{
- auto stmp=coalescedRead(*vvtmp_p);
- coalescedWrite(me[ss],stmp);
- });
- #else
auto me = View(CpuWrite);
thread_for(ss,me.size(),{
me[ss]= r;
});
+ #else
+ auto me = View(AcceleratorWrite);
+ accelerator_for(ss,me.size(),vobj::Nsimd(),{
+ auto stmp=coalescedRead(vtmp);
+ coalescedWrite(me[ss],stmp);
+ });
#endif
me.ViewClose();
return *this;
@@ -264,7 +261,7 @@ public:
Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) {
this->_grid = grid;
resize(this->_grid->oSites());
- GRID_ASSERT((((uint64_t)&this->_odata[0])&0xF) ==0);
+ assert((((uint64_t)&this->_odata[0])&0xF) ==0);
this->checkerboard=0;
SetViewMode(mode);
}


@@ -53,19 +53,36 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
typedef decltype(basis[0]) Field;
typedef decltype(basis[0].View(AcceleratorRead)) View;
- hostVector<View> h_basis_v(basis.size());
- deviceVector<View> d_basis_v(basis.size());
- typedef typename std::remove_reference<decltype(h_basis_v[0][0])>::type vobj;
+ Vector<View> basis_v; basis_v.reserve(basis.size());
+ typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
GridBase* grid = basis[0].Grid();
for(int k=0;k<basis.size();k++){
- h_basis_v[k] = basis[k].View(AcceleratorWrite);
- acceleratorPut(d_basis_v[k],h_basis_v[k]);
+ basis_v.push_back(basis[k].View(AcceleratorWrite));
}
- View *basis_vp = &d_basis_v[0];
+ #if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) )
+ int max_threads = thread_max();
+ Vector < vobj > Bt(Nm * max_threads);
+ thread_region
+ {
+ vobj* B = &Bt[Nm * thread_num()];
+ thread_for_in_region(ss, grid->oSites(),{
+ for(int j=j0; j<j1; ++j) B[j]=0.;
+ for(int j=j0; j<j1; ++j){
+ for(int k=k0; k<k1; ++k){
+ B[j] +=Qt(j,k) * basis_v[k][ss];
+ }
+ }
+ for(int j=j0; j<j1; ++j){
+ basis_v[j][ss] = B[j];
+ }
+ });
+ }
+ #else
+ View *basis_vp = &basis_v[0];
int nrot = j1-j0;
if (!nrot) // edge case not handled gracefully by Cuda
@@ -74,19 +91,17 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
uint64_t oSites =grid->oSites();
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
- deviceVector <vobj> Bt(siteBlock * nrot);
+ Vector <vobj> Bt(siteBlock * nrot);
auto Bp=&Bt[0];
// GPU readable copy of matrix
- hostVector<Coeff_t> h_Qt_jv(Nm*Nm);
- deviceVector<Coeff_t> Qt_jv(Nm*Nm);
+ Vector<Coeff_t> Qt_jv(Nm*Nm);
Coeff_t *Qt_p = & Qt_jv[0];
thread_for(i,Nm*Nm,{
int j = i/Nm;
int k = i%Nm;
- h_Qt_jv[i]=Qt(j,k);
+ Qt_p[i]=Qt(j,k);
});
- acceleratorCopyToDevice(&h_Qt_jv[0],Qt_p,Nm*Nm*sizeof(Coeff_t));
// Block the loop to keep storage footprint down
for(uint64_t s=0;s<oSites;s+=siteBlock){
@@ -122,8 +137,9 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
});
}
+ #endif
- for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
+ for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
}
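Whichever storage classes win in this hunk, basisRotate performs the same site-local update, basis[j] <- sum over k of Qt(j,k) * basis[k] for j in [j0,j1) and k in [k0,k1), buffered so the in-place rotation never reads a half-updated vector. A scalar model of one site:

#include <cassert>
#include <vector>

// Rotate basis[j0..j1) through the span of basis[k0..k1) with matrix Q.
void rotate_site(std::vector<double> &basis,
                 const std::vector<std::vector<double>> &Q,
                 int j0, int j1, int k0, int k1) {
  std::vector<double> B(basis.size(), 0.0);  // buffer: no half-updated reads
  for (int j = j0; j < j1; j++)
    for (int k = k0; k < k1; k++)
      B[j] += Q[j][k] * basis[k];
  for (int j = j0; j < j1; j++) basis[j] = B[j];
}

int main(void) {
  std::vector<double> basis = {1.0, 2.0};
  std::vector<std::vector<double>> Q = {{0.0, 1.0}, {1.0, 0.0}};  // swap
  rotate_site(basis, Q, 0, 2, 0, 2);
  assert(basis[0] == 2.0 && basis[1] == 1.0);
  return 0;
}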
// Extract a single rotated vector
@@ -136,19 +152,16 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
result.Checkerboard() = basis[0].Checkerboard();
- hostVector<View> h_basis_v(basis.size());
- deviceVector<View> d_basis_v(basis.size());
+ Vector<View> basis_v; basis_v.reserve(basis.size());
for(int k=0;k<basis.size();k++){
- h_basis_v[k]=basis[k].View(AcceleratorRead);
- acceleratorPut(d_basis_v[k],h_basis_v[k]);
+ basis_v.push_back(basis[k].View(AcceleratorRead));
}
vobj zz=Zero();
- deviceVector<double> Qt_jv(Nm);
+ Vector<double> Qt_jv(Nm);
double * Qt_j = & Qt_jv[0];
- for(int k=0;k<Nm;++k) acceleratorPut(Qt_j[k],Qt(j,k));
- auto basis_vp=& d_basis_v[0];
+ for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
+ auto basis_vp=& basis_v[0];
autoView(result_v,result,AcceleratorWrite);
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
vobj zzz=Zero();
@@ -158,7 +171,7 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
}
coalescedWrite(result_v[ss], B);
});
- for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
+ for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
}
template<class Field>
@@ -166,9 +179,9 @@ void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, s
{
int vlen = idx.size();
- GRID_ASSERT(vlen>=1);
- GRID_ASSERT(vlen<=sort_vals.size());
- GRID_ASSERT(vlen<=_v.size());
+ assert(vlen>=1);
+ assert(vlen<=sort_vals.size());
+ assert(vlen<=_v.size());
for (size_t i=0;i<vlen;i++) {
@@ -186,7 +199,7 @@ void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, s
if (idx[j]==i)
break;
- GRID_ASSERT(idx[i] > i); GRID_ASSERT(j!=idx.size()); GRID_ASSERT(idx[j]==i);
+ assert(idx[i] > i); assert(j!=idx.size()); assert(idx[j]==i);
swap(_v[i],_v[idx[i]]); // should use vector move constructor, no data copy
std::swap(sort_vals[i],sort_vals[idx[i]]);
@@ -224,7 +237,7 @@ void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, boo
template<class Field>
void basisDeflate(const std::vector<Field> &_v,const std::vector<RealD>& eval,const Field& src_orig,Field& result) {
result = Zero();
- GRID_ASSERT(_v.size()==eval.size());
+ assert(_v.size()==eval.size());
int N = (int)_v.size();
for (int i=0;i<N;i++) {
Field& tmp = _v[i];


@@ -32,8 +32,8 @@ NAMESPACE_BEGIN(Grid);
template<class obj1,class obj2> void conformable(const Lattice<obj1> &lhs,const Lattice<obj2> &rhs)
{
- GRID_ASSERT(lhs.Grid() == rhs.Grid());
- GRID_ASSERT(lhs.Checkerboard() == rhs.Checkerboard());
+ assert(lhs.Grid() == rhs.Grid());
+ assert(lhs.Checkerboard() == rhs.Checkerboard());
}
NAMESPACE_END(Grid);


@@ -42,7 +42,7 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
// Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
- GRID_ASSERT( FullGrid->_simd_layout[Orthog]==1);
+ assert( FullGrid->_simd_layout[Orthog]==1);
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
@@ -86,7 +86,7 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
GridBase *FullGrid = X.Grid();
- GRID_ASSERT( FullGrid->_simd_layout[Orthog]==1);
+ assert( FullGrid->_simd_layout[Orthog]==1);
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
@@ -140,7 +140,7 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
- GRID_ASSERT( FullGrid->_simd_layout[Orthog]==1);
+ assert( FullGrid->_simd_layout[Orthog]==1);
// int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
// int nl = nh-1;


@@ -98,8 +98,8 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
int Nsimd = grid->Nsimd();
- GRID_ASSERT( l.Checkerboard()== l.Grid()->CheckerBoard(site));
- GRID_ASSERT( sizeof(sobj)*Nsimd == sizeof(vobj));
+ assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
+ assert( sizeof(sobj)*Nsimd == sizeof(vobj));
int rank,odx,idx;
// Optional to broadcast from node 0.
@@ -135,7 +135,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
int Nsimd = grid->Nsimd();
- GRID_ASSERT( l.Checkerboard() == l.Grid()->CheckerBoard(site));
+ assert( l.Checkerboard() == l.Grid()->CheckerBoard(site));
int rank,odx,idx;
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
@@ -159,14 +159,14 @@
inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
{
GridBase *grid = l.getGrid();
- GRID_ASSERT(l.mode==CpuRead);
+ assert(l.mode==CpuRead);
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd();
- // GRID_ASSERT( l.Checkerboard()== grid->CheckerBoard(site));
- GRID_ASSERT( sizeof(sobj)*Nsimd == sizeof(vobj));
+ assert( l.Checkerboard()== grid->CheckerBoard(site));
+ assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type);
int odx,idx;
@@ -179,7 +179,7 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
for(int w=0;w<words;w++){
pt[w] = getlane(vp[w],idx);
}
- // std::cout << "peekLocalSite "<<site<<" "<<odx<<","<<idx<<" "<<s<<std::endl;
return;
};
template<class vobj,class sobj> template<class vobj,class sobj>
@@ -195,15 +195,15 @@ template<class vobj,class sobj>
inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site) inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
{ {
GridBase *grid=l.getGrid(); GridBase *grid=l.getGrid();
GRID_ASSERT(l.mode==CpuWrite); assert(l.mode==CpuWrite);
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd(); int Nsimd = grid->Nsimd();
// GRID_ASSERT( l.Checkerboard()== grid->CheckerBoard(site)); assert( l.Checkerboard()== grid->CheckerBoard(site));
GRID_ASSERT( sizeof(sobj)*Nsimd == sizeof(vobj)); assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type); static const int words=sizeof(vobj)/sizeof(vector_type);
int odx,idx; int odx,idx;
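For orientation (not part of the change set): peekLocalSite and pokeLocalSite act on an already-opened CPU view at a local coordinate, gathering or scattering one scalar object across the SIMD lanes, with no communication. A minimal read-modify-write sketch under those assumptions; the function name is hypothetical:

  // Sketch only: double the value at one local site of a lattice field.
  template<class vobj>
  void doubleLocalSite(Lattice<vobj> &l, Coordinate &site)
  {
    typedef typename vobj::scalar_object sobj;
    sobj s;
    { autoView(lv, l, CpuRead);  peekLocalSite(s, lv, site); } // gather from SIMD lanes
    s = s * 2.0;                                               // scalar-object arithmetic
    { autoView(lv, l, CpuWrite); pokeLocalSite(s, lv, site); } // scatter back
  }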

View File

@@ -46,7 +46,7 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
   //  const int Nsimd = vobj::Nsimd();
   const int nthread = GridThread::GetThreads();
-  std::vector<sobj> sumarray(nthread);
+  Vector<sobj> sumarray(nthread);
   for(int i=0;i<nthread;i++){
     sumarray[i]=Zero();
   }
@@ -75,7 +75,7 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
   const int nthread = GridThread::GetThreads();
-  std::vector<sobj> sumarray(nthread);
+  Vector<sobj> sumarray(nthread);
   for(int i=0;i<nthread;i++){
     sumarray[i]=Zero();
   }
@@ -290,45 +290,23 @@ template<class vobj>
 inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
   GridBase *grid = left.Grid();
-  bool ok;
 #ifdef GRID_SYCL
-  //  uint64_t csum=0;
-  //  uint64_t csum2=0;
-  //  if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
-  //  {
+  uint64_t csum=0;
+  if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
+  {
     // Hack
     // Fast integer xor checksum. Can also be used in comms now.
-  //    autoView(l_v,left,AcceleratorRead);
-  //    Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
-  //    uint64_t *base= (uint64_t *)&l_v[0];
-  //    csum=svm_xor(base,words);
-  //    ok = FlightRecorder::CsumLog(csum);
-  //    if ( !ok ) {
-  //      csum2=svm_xor(base,words);
-  //      std::cerr<< " Bad CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
-  //    } else {
-  //      csum2=svm_xor(base,words);
-  //      std::cerr<< " ok CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
-  //    }
-  //    GRID_ASSERT(ok);
-  //  }
-#endif
-  FlightRecorder::StepLog("rank inner product");
-  ComplexD nrm = rankInnerProduct(left,right);
-  //  ComplexD nrmck=nrm;
-  RealD local = real(nrm);
-  ok = FlightRecorder::NormLog(real(nrm));
-  if ( !ok ) {
-    ComplexD nrm2 = rankInnerProduct(left,right);
-    RealD local2 = real(nrm2);
-    std::cerr<< " Bad NORM " << local << " recomputed as "<<local2<<std::endl;
-    GRID_ASSERT(ok);
-  }
-  FlightRecorder::StepLog("Start global sum");
-  grid->GlobalSumP2P(nrm);
-  //  grid->GlobalSum(nrm);
-  FlightRecorder::StepLog("Finished global sum");
-  //  std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
+    autoView(l_v,left,AcceleratorRead);
+    Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
+    uint64_t *base= (uint64_t *)&l_v[0];
+    csum=svm_xor(base,words);
+  }
+  FlightRecorder::CsumLog(csum);
+#endif
+  ComplexD nrm = rankInnerProduct(left,right);
+  RealD local = real(nrm);
+  FlightRecorder::NormLog(real(nrm));
+  grid->GlobalSum(nrm);
   FlightRecorder::ReductionLog(local,real(nrm));
   return nrm;
 }
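For orientation (not part of the change set): both variants follow the same two-stage shape — a rank-local reduction followed by one global sum — with the flight recorder logging the intermediate values. The pattern in isolation, sketched with plain MPI rather than Grid's communicator:

  // Sketch only: rank-local dot product, then a single MPI allreduce.
  #include <mpi.h>
  #include <complex>
  #include <vector>

  std::complex<double> innerProductTwoStage(const std::vector<std::complex<double>> &x,
                                            const std::vector<std::complex<double>> &y)
  {
    std::complex<double> local(0.0, 0.0);
    for (size_t i = 0; i < x.size(); i++)
      local += std::conj(x[i]) * y[i];        // per-rank partial sum
    std::complex<double> global(0.0, 0.0);
    MPI_Allreduce(&local, &global, 2, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); // re+im
    return global;                            // identical on every rank
  }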
@@ -365,6 +343,18 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
   autoView( x_v, x, AcceleratorRead);
   autoView( y_v, y, AcceleratorRead);
   autoView( z_v, z, AcceleratorWrite);
+#if 0
+  typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
+  Vector<inner_t> inner_tmp(sites);
+  auto inner_tmp_v = &inner_tmp[0];
+  accelerator_for( ss, sites, nsimd,{
+      auto tmp = a*x_v(ss)+b*y_v(ss);
+      coalescedWrite(inner_tmp_v[ss],innerProductD(tmp,tmp));
+      coalescedWrite(z_v[ss],tmp);
+  });
+  nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
+#else
   typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
   deviceVector<inner_t> inner_tmp;
   inner_tmp.resize(sites);
@@ -375,13 +365,9 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
       coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
       coalescedWrite(z_v[ss],tmp);
   });
-  bool ok;
   nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
-  ok = FlightRecorder::NormLog(real(nrm));
-  GRID_ASSERT(ok);
-  RealD local = real(nrm);
+#endif
   grid->GlobalSum(nrm);
-  FlightRecorder::ReductionLog(local,real(nrm));
   return nrm;
 }
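For context (not part of the change set): the fused kernel computes z = a*x + b*y and accumulates |z|^2 in the same pass over memory, which is why conjugate-gradient inner loops prefer it to a separate axpy plus norm2. A typical call, assuming Grid's axpy_norm helper that is built on this routine:

  // Sketch only: CG-style residual update returning the new |r|^2 in one sweep.
  template<class Field>
  RealD residualUpdate(Field &r, const Field &mmp, RealD alpha)
  {
    return axpy_norm(r, -alpha, mmp, r);   // r <- r - alpha*mmp; returns norm2(r)
  }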
@@ -391,7 +377,7 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
   conformable(left,right);
   typedef typename vobj::vector_typeD vector_type;
-  std::vector<ComplexD> tmp(2);
+  Vector<ComplexD> tmp(2);
   GridBase *grid = left.Grid();
@@ -401,8 +387,8 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
   // GPU
   typedef decltype(innerProductD(vobj(),vobj())) inner_t;
   typedef decltype(innerProductD(vobj(),vobj())) norm_t;
-  deviceVector<inner_t> inner_tmp(sites);
-  deviceVector<norm_t>  norm_tmp(sites);
+  Vector<inner_t> inner_tmp(sites);
+  Vector<norm_t>  norm_tmp(sites);
   auto inner_tmp_v = &inner_tmp[0];
   auto norm_tmp_v  = &norm_tmp[0];
   {
@@ -452,9 +438,7 @@ inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
 // sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////
-template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
-                                          std::vector<typename vobj::scalar_object> &result,
-                                          int orthogdim)
+template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
 {
   ///////////////////////////////////////////////////////
   // FIXME precision promoted summation
@@ -464,20 +448,20 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
   typedef typename vobj::scalar_object sobj;
   typedef typename vobj::scalar_object::scalar_type scalar_type;
   GridBase  *grid = Data.Grid();
-  GRID_ASSERT(grid!=NULL);
+  assert(grid!=NULL);
   const int    Nd = grid->_ndimension;
   const int Nsimd = grid->Nsimd();
-  GRID_ASSERT(orthogdim >= 0);
-  GRID_ASSERT(orthogdim < Nd);
+  assert(orthogdim >= 0);
+  assert(orthogdim < Nd);
   int fd=grid->_fdimensions[orthogdim];
   int ld=grid->_ldimensions[orthogdim];
   int rd=grid->_rdimensions[orthogdim];
-  std::vector<vobj> lvSum(rd); // will locally sum vectors first
-  std::vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
+  Vector<vobj> lvSum(rd); // will locally sum vectors first
+  Vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
   ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD
   result.resize(fd); // And then global sum to return the same vector to every node
@@ -525,8 +509,6 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
   scalar_type * ptr = (scalar_type *) &result[0];
   int words = fd*sizeof(sobj)/sizeof(scalar_type);
   grid->GlobalSumVector(ptr, words);
-  //  std::cout << GridLogMessage << " sliceSum local"<<t_sum<<" us, host+mpi "<<t_rest<<std::endl;
 }
 template<class vobj> inline
 std::vector<typename vobj::scalar_object>
@@ -537,41 +519,28 @@ sliceSum(const Lattice<vobj> &Data,int orthogdim)
   return result;
 }
-/*
-  Reimplement
-  1)
-  template<class vobj>
-  static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
-  2)
-  template<class vobj>
-  static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
-  3)
-  -- Make Slice Mul Matrix call sliceMaddMatrix
-*/
 template<class vobj>
 static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
 {
   typedef typename vobj::vector_type   vector_type;
   typedef typename vobj::scalar_type   scalar_type;
   GridBase  *grid = lhs.Grid();
-  GRID_ASSERT(grid!=NULL);
+  assert(grid!=NULL);
   conformable(grid,rhs.Grid());
   const int    Nd = grid->_ndimension;
   const int Nsimd = grid->Nsimd();
-  GRID_ASSERT(orthogdim >= 0);
-  GRID_ASSERT(orthogdim < Nd);
+  assert(orthogdim >= 0);
+  assert(orthogdim < Nd);
   int fd=grid->_fdimensions[orthogdim];
   int ld=grid->_ldimensions[orthogdim];
   int rd=grid->_rdimensions[orthogdim];
-  std::vector<vector_type> lvSum(rd); // will locally sum vectors first
-  std::vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
+  Vector<vector_type> lvSum(rd); // will locally sum vectors first
+  Vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
   ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
   result.resize(fd); // And then global sum to return the same vector to every node for IO to file
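For orientation (not part of the change set): sliceSum reduces over every direction except orthogdim and returns one scalar object per slice, globally summed so each node holds the full vector; the standard use is a correlator as a function of time. A minimal sketch in the usual Grid idiom:

  // Sketch only: per-timeslice sums of a complex field (orthogdim = 3 = time).
  std::vector<TComplex> timesliceSums(const LatticeComplex &obs)
  {
    const int Tdir = 3;
    std::vector<TComplex> corr;
    sliceSum(obs, corr, Tdir);   // corr[t] = sum over x,y,z at fixed t
    return corr;
  }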
@@ -701,96 +670,203 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
   }
 };
+/*
 inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
 {
   int NN    = BlockSolverGrid->_ndimension;
   int nsimd = BlockSolverGrid->Nsimd();
-  std::vector<int> latt_phys(NN-1);
-  Coordinate simd_phys;
-  std::vector<int> mpi_phys(NN-1);
-  Coordinate checker_dim_mask(NN-1);
-  int checker_dim=-1;
-  int dd;
+  std::vector<int> latt_phys(0);
+  std::vector<int> simd_phys(0);
+  std::vector<int> mpi_phys(0);
   for(int d=0;d<NN;d++){
     if( d!=Orthog ) {
-      latt_phys[dd]=BlockSolverGrid->_fdimensions[d];
-      mpi_phys[dd] =BlockSolverGrid->_processors[d];
-      checker_dim_mask[dd] = BlockSolverGrid->_checker_dim_mask[d];
-      if ( d == BlockSolverGrid->_checker_dim ) checker_dim = dd;
-      dd++;
+      latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);
+      simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);
+      mpi_phys.push_back(BlockSolverGrid->_processors[d]);
     }
   }
-  simd_phys=GridDefaultSimd(latt_phys.size(),nsimd);
-  GridCartesian *tmp = new GridCartesian(latt_phys,simd_phys,mpi_phys);
-  if(BlockSolverGrid->_isCheckerBoarded) {
-    GridRedBlackCartesian *ret = new GridRedBlackCartesian(tmp,checker_dim_mask,checker_dim);
-    delete tmp;
-    return (GridBase *) ret;
-  } else {
-    return (GridBase *) tmp;
-  }
+  return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);
 }
+*/
 template<class vobj>
 static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
 {
-  GridBase *FullGrid  = X.Grid();
-  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
-  Lattice<vobj> Ys(SliceGrid);
-  Lattice<vobj> Rs(SliceGrid);
-  Lattice<vobj> Xs(SliceGrid);
-  Lattice<vobj> RR(FullGrid);
-  RR = R; // Copies checkerboard for insert
   typedef typename vobj::scalar_object sobj;
   typedef typename vobj::vector_type vector_type;
-  int Nslice = X.Grid()->GlobalDimensions()[Orthog];
-  for(int i=0;i<Nslice;i++){
-    ExtractSlice(Ys,Y,i,Orthog);
-    ExtractSlice(Rs,R,i,Orthog);
-    Rs=Ys;
-    for(int j=0;j<Nslice;j++){
-      ExtractSlice(Xs,X,j,Orthog);
-      Rs = Rs + Xs*(scale*aa(j,i));
-    }
-    InsertSlice(Rs,RR,i,Orthog);
-  }
-  R=RR; // Copy back handles arguments aliasing case
-  delete SliceGrid;
+  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
+  GridBase *FullGrid  = X.Grid();
+  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+  //  Lattice<vobj> Xslice(SliceGrid);
+  //  Lattice<vobj> Rslice(SliceGrid);
+  assert( FullGrid->_simd_layout[Orthog]==1);
+  //  int nh =  FullGrid->_ndimension;
+  //  int nl = SliceGrid->_ndimension;
+  //  int nl = nh-1;
+  //FIXME package in a convenient iterator
+  //Should loop over a plane orthogonal to direction "Orthog"
+  int stride=FullGrid->_slice_stride[Orthog];
+  int block =FullGrid->_slice_block [Orthog];
+  int nblock=FullGrid->_slice_nblock[Orthog];
+  int ostride=FullGrid->_ostride[Orthog];
+  autoView( X_v, X, CpuRead);
+  autoView( Y_v, Y, CpuRead);
+  autoView( R_v, R, CpuWrite);
+  thread_region
+  {
+    Vector<vobj> s_x(Nblock);
+    thread_for_collapse_in_region(2, n,nblock, {
+    for(int b=0;b<block;b++){
+      int o  = n*stride + b;
+      for(int i=0;i<Nblock;i++){
+        s_x[i] = X_v[o+i*ostride];
+      }
+      vobj dot;
+      for(int i=0;i<Nblock;i++){
+        dot = Y_v[o+i*ostride];
+        for(int j=0;j<Nblock;j++){
+          dot = dot + s_x[j]*(scale*aa(j,i));
+        }
+        R_v[o+i*ostride]=dot;
+      }
+    }});
+  }
 };
 template<class vobj>
 static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
 {
-  R=Zero();
-  sliceMaddMatrix(R,aa,X,R,Orthog,scale);
+  typedef typename vobj::scalar_object sobj;
+  typedef typename vobj::vector_type vector_type;
+  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
+  GridBase *FullGrid  = X.Grid();
+  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+  //  Lattice<vobj> Xslice(SliceGrid);
+  //  Lattice<vobj> Rslice(SliceGrid);
+  assert( FullGrid->_simd_layout[Orthog]==1);
+  //  int nh =  FullGrid->_ndimension;
+  //  int nl = SliceGrid->_ndimension;
+  //  int nl=1;
+  //FIXME package in a convenient iterator
+  //  thread_for2d_in_region
+  //Should loop over a plane orthogonal to direction "Orthog"
+  int stride=FullGrid->_slice_stride[Orthog];
+  int block =FullGrid->_slice_block [Orthog];
+  int nblock=FullGrid->_slice_nblock[Orthog];
+  int ostride=FullGrid->_ostride[Orthog];
+  autoView( R_v, R, CpuWrite);
+  autoView( X_v, X, CpuRead);
+  thread_region
+  {
+    std::vector<vobj> s_x(Nblock);
+    thread_for_collapse_in_region( 2 ,n,nblock,{
+    for(int b=0;b<block;b++){
+      int o=n*stride+b;
+      for(int i=0;i<Nblock;i++){
+        s_x[i] = X_v[o+i*ostride];
+      }
+      vobj dot;
+      for(int i=0;i<Nblock;i++){
+        dot = s_x[0]*(scale*aa(0,i));
+        for(int j=1;j<Nblock;j++){
+          dot = dot + s_x[j]*(scale*aa(j,i));
+        }
+        R_v[o+i*ostride]=dot;
+      }
+    }});
+  }
 };
 template<class vobj>
 static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
 {
-  GridBase *SliceGrid = makeSubSliceGrid(lhs.Grid(),Orthog);
-  Lattice<vobj> ls(SliceGrid);
-  Lattice<vobj> rs(SliceGrid);
   typedef typename vobj::scalar_object sobj;
   typedef typename vobj::vector_type vector_type;
-  int Nslice = lhs.Grid()->GlobalDimensions()[Orthog];
-  mat = Eigen::MatrixXcd::Zero(Nslice,Nslice);
-  for(int s=0;s<Nslice;s++){
-    ExtractSlice(ls,lhs,s,Orthog);
-    for(int ss=0;ss<Nslice;ss++){
-      ExtractSlice(rs,rhs,ss,Orthog);
-      mat(s,ss) = innerProduct(ls,rs);
-    }
-  }
-  delete SliceGrid;
+  GridBase *FullGrid  = lhs.Grid();
+  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+  int Nblock = FullGrid->GlobalDimensions()[Orthog];
+  //  Lattice<vobj> Lslice(SliceGrid);
+  //  Lattice<vobj> Rslice(SliceGrid);
+  mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+  assert( FullGrid->_simd_layout[Orthog]==1);
+  //  int nh =  FullGrid->_ndimension;
+  //  int nl = SliceGrid->_ndimension;
+  //  int nl = nh-1;
+  //FIXME package in a convenient iterator
+  //Should loop over a plane orthogonal to direction "Orthog"
+  int stride=FullGrid->_slice_stride[Orthog];
+  int block =FullGrid->_slice_block [Orthog];
+  int nblock=FullGrid->_slice_nblock[Orthog];
+  int ostride=FullGrid->_ostride[Orthog];
+  typedef typename vobj::vector_typeD vector_typeD;
+  autoView( lhs_v, lhs, CpuRead);
+  autoView( rhs_v, rhs, CpuRead);
+  thread_region
+  {
+    std::vector<vobj> Left(Nblock);
+    std::vector<vobj> Right(Nblock);
+    Eigen::MatrixXcd  mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+    thread_for_collapse_in_region( 2, n,nblock,{
+    for(int b=0;b<block;b++){
+      int o  = n*stride + b;
+      for(int i=0;i<Nblock;i++){
+        Left [i] = lhs_v[o+i*ostride];
+        Right[i] = rhs_v[o+i*ostride];
+      }
+      for(int i=0;i<Nblock;i++){
+      for(int j=0;j<Nblock;j++){
+        auto tmp  = innerProduct(Left[i],Right[j]);
+        auto rtmp = TensorRemove(tmp);
+        auto red  = Reduce(rtmp);
+        mat_thread(i,j) += std::complex<double>(real(red),imag(red));
+      }}
+    }});
+    thread_critical
+    {
+      mat += mat_thread;
+    }
+  }
+  for(int i=0;i<Nblock;i++){
+  for(int j=0;j<Nblock;j++){
+    ComplexD sum = mat(i,j);
+    FullGrid->GlobalSum(sum);
+    mat(i,j)=sum;
+  }}
+  return;
 }
 NAMESPACE_END(Grid);

View File

@@ -208,18 +208,28 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
   Integer numThreads, numBlocks;
   int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
-  GRID_ASSERT(ok);
+  assert(ok);
   Integer smemSize = numThreads * sizeof(sobj);
   // Move out of UVM
   // Turns out I had messed up the synchronise after move to compute stream
   // as running this on the default stream fools the synchronise
-  deviceVector<sobj> buffer(numBlocks);
+#undef UVM_BLOCK_BUFFER
+#ifndef UVM_BLOCK_BUFFER
+  commVector<sobj> buffer(numBlocks);
   sobj *buffer_v = &buffer[0];
   sobj result;
   reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
   accelerator_barrier();
   acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
+#else
+  Vector<sobj> buffer(numBlocks);
+  sobj *buffer_v = &buffer[0];
+  sobj result;
+  reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
+  accelerator_barrier();
+  result = *buffer_v;
+#endif
   return result;
 }
@@ -234,7 +244,7 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi
   const int words = sizeof(vobj)/sizeof(vector);
-  deviceVector<vector> buffer(osites);
+  Vector<vector> buffer(osites);
   vector  *dat = (vector *)lat;
   vector  *buf = &buffer[0];
   iScalar<vector> *tbuf =(iScalar<vector> *)  &buffer[0];

View File

@@ -4,28 +4,33 @@ NAMESPACE_BEGIN(Grid);
 // Possibly promote to double and sum
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)
 {
   typedef typename vobj::scalar_object sobj;
   typedef typename vobj::scalar_objectD sobjD;
+  static Vector<sobj> mysum;
+  mysum.resize(1);
+  sobj *mysum_p = & mysum[0];
   sobj identity; zeroit(identity);
-  sobj ret; zeroit(ret);
+  mysum[0] = identity;
+  sobj ret ;
   Integer nsimd= vobj::Nsimd();
-  {
-    sycl::buffer<sobj, 1> abuff(&ret, {1});
-    theGridAccelerator->submit([&](sycl::handler &cgh) {
-      auto Reduction = sycl::reduction(abuff,cgh,identity,std::plus<>());
-      cgh.parallel_for(sycl::range<1>{osites},
+  const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() });
+  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
+      auto Reduction = cl::sycl::reduction(mysum_p,identity,std::plus<>(),PropList);
+      cgh.parallel_for(cl::sycl::range<1>{osites},
                        Reduction,
-                       [=] (sycl::id<1> item, auto &sum) {
+                       [=] (cl::sycl::id<1> item, auto &sum) {
                          auto osite   = item[0];
                          sum +=Reduce(lat[osite]);
                        });
     });
-  }
+  theGridAccelerator->wait();
+  ret = mysum[0];
+  //  free(mysum,*theGridAccelerator);
   sobjD dret; convertType(dret,ret);
   return dret;
 }
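Aside (not part of the change set): the two variants differ only in where the reduction target lives — a sycl::buffer versus a raw pointer with initialize_to_identity. The underlying SYCL 2020 reduction pattern in a self-contained form, independent of Grid:

  // Sketch only: standalone SYCL 2020 reduction over USM shared memory.
  #include <sycl/sycl.hpp>

  int sumExample()
  {
    sycl::queue q;
    const size_t N = 1024;
    int *dat = sycl::malloc_shared<int>(N, q);
    int *sum = sycl::malloc_shared<int>(1, q);
    for (size_t i = 0; i < N; i++) dat[i] = 1;
    *sum = 0;                                   // combined into the result by default
    q.submit([&](sycl::handler &cgh) {
      auto red = sycl::reduction(sum, sycl::plus<int>());
      cgh.parallel_for(sycl::range<1>{N}, red,
                       [=](sycl::id<1> i, auto &s) { s += dat[i]; });
    }).wait();
    int result = *sum;                          // 1024
    sycl::free(dat, q);
    sycl::free(sum, q);
    return result;
  }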
@@ -71,41 +76,59 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
 template<class Word> Word svm_xor(Word *vec,uint64_t L)
 {
-  Word xorResult; xorResult = 0;
+  static Vector<Word> d_sum;
+  d_sum.resize(1);
+  Word *d_sum_p=&d_sum[0];
   Word identity;  identity=0;
-  Word ret = 0;
-  {
-    sycl::buffer<Word, 1> abuff(&ret, {1});
-    theGridAccelerator->submit([&](sycl::handler &cgh) {
-      auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
-      cgh.parallel_for(sycl::range<1>{L},
+  d_sum[0] = identity;
+  const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() });
+  theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
+      auto Reduction = cl::sycl::reduction(d_sum_p,identity,std::bit_xor<>(),PropList);
+      cgh.parallel_for(cl::sycl::range<1>{L},
                        Reduction,
-                       [=] (sycl::id<1> index, auto &sum) {
+                       [=] (cl::sycl::id<1> index, auto &sum) {
                          sum^=vec[index];
                        });
     });
-  }
-  theGridAccelerator->wait();
-  return ret;
-}
-template<class Word> Word checksum_gpu(Word *vec,uint64_t L)
-{
-  Word identity;  identity=0;
-  Word ret = 0;
-  {
-    sycl::buffer<Word, 1> abuff(&ret, {1});
-    theGridAccelerator->submit([&](sycl::handler &cgh) {
-      auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
-      cgh.parallel_for(sycl::range<1>{L},
-                       Reduction,
-                       [=] (sycl::id<1> index, auto &sum) {
-                         auto l = index % 61;
-                         sum ^= vec[index]<<l | vec[index]>>(64-l);
-                       });
-    });
-  }
   theGridAccelerator->wait();
+  Word ret = d_sum[0];
+  //  free(d_sum,*theGridAccelerator);
   return ret;
 }
 NAMESPACE_END(Grid);
+/*
+template <class vobj>
+inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
+{
+  typedef typename vobj::vector_type  vector;
+  typedef typename vobj::scalar_type  scalar;
+  typedef typename vobj::scalar_typeD scalarD;
+  typedef typename vobj::scalar_objectD sobjD;
+  sobjD ret;
+  scalarD *ret_p = (scalarD *)&ret;
+  const int nsimd = vobj::Nsimd();
+  const int words = sizeof(vobj)/sizeof(vector);
+  Vector<scalar> buffer(osites*nsimd);
+  scalar *buf = &buffer[0];
+  vector *dat = (vector *)lat;
+  for(int w=0;w<words;w++) {
+    accelerator_for(ss,osites,nsimd,{
+	int lane = acceleratorSIMTlane(nsimd);
+	buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane);
+    });
+    //Precision change at this point is to late to gain precision
+    ret_p[w] = svm_reduce(buf,nsimd*osites);
+  }
+  return ret;
+}
+*/
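Aside (not part of the change set): checksum_gpu on the removed side xors each word after a position-dependent rotation, so that reordered buffers no longer cancel the way they do under plain xor. A host-side equivalent, assuming 64-bit words:

  // Sketch only: position-sensitive xor checksum (host version of the kernel above).
  #include <cstdint>
  #include <cstddef>

  uint64_t rotXorChecksum(const uint64_t *vec, size_t L)
  {
    uint64_t sum = 0;
    for (size_t i = 0; i < L; i++) {
      unsigned l = i % 61;                                          // rotation amount
      uint64_t rot = (vec[i] << l) | (l ? (vec[i] >> (64 - l)) : 0); // guard l==0 shift
      sum ^= rot;
    }
    return sum;
  }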

View File

@@ -53,10 +53,10 @@ inline int RNGfillable(GridBase *coarse,GridBase *fine)
   // trivially extended in higher dims, with locality guaranteeing RNG state is local to node
   int lowerdims   = fine->_ndimension - coarse->_ndimension;
-  GRID_ASSERT(lowerdims >= 0);
+  assert(lowerdims >= 0);
   for(int d=0;d<lowerdims;d++){
-    GRID_ASSERT(fine->_simd_layout[d]==1);
-    GRID_ASSERT(fine->_processors[d]==1);
+    assert(fine->_simd_layout[d]==1);
+    assert(fine->_processors[d]==1);
   }
   int multiplicity=1;
@@ -66,9 +66,9 @@ inline int RNGfillable(GridBase *coarse,GridBase *fine)
   // local and global volumes subdivide cleanly after SIMDization
   for(int d=0;d<rngdims;d++){
     int fd= d+lowerdims;
-    GRID_ASSERT(coarse->_processors[d]  == fine->_processors[fd]);
-    GRID_ASSERT(coarse->_simd_layout[d] == fine->_simd_layout[fd]);
-    GRID_ASSERT(((fine->_rdimensions[fd] / coarse->_rdimensions[d])* coarse->_rdimensions[d])==fine->_rdimensions[fd]);
+    assert(coarse->_processors[d]  == fine->_processors[fd]);
+    assert(coarse->_simd_layout[d] == fine->_simd_layout[fd]);
+    assert(((fine->_rdimensions[fd] / coarse->_rdimensions[d])* coarse->_rdimensions[d])==fine->_rdimensions[fd]);
     multiplicity = multiplicity *fine->_rdimensions[fd] / coarse->_rdimensions[d];
   }
@@ -83,18 +83,18 @@ inline int RNGfillable_general(GridBase *coarse,GridBase *fine)
   int rngdims = coarse->_ndimension;
   // trivially extended in higher dims, with locality guaranteeing RNG state is local to node
-  int lowerdims = fine->_ndimension - coarse->_ndimension;  GRID_ASSERT(lowerdims >= 0);
+  int lowerdims = fine->_ndimension - coarse->_ndimension;  assert(lowerdims >= 0);
   // assumes that the higher dimensions are not using more processors
   // all further divisions are local
-  for(int d=0;d<lowerdims;d++) GRID_ASSERT(fine->_processors[d]==1);
-  for(int d=0;d<rngdims;d++) GRID_ASSERT(coarse->_processors[d] == fine->_processors[d+lowerdims]);
+  for(int d=0;d<lowerdims;d++) assert(fine->_processors[d]==1);
+  for(int d=0;d<rngdims;d++) assert(coarse->_processors[d] == fine->_processors[d+lowerdims]);
   // then divide the number of local sites
   // check that the total number of sims agree, meanse the iSites are the same
-  GRID_ASSERT(fine->Nsimd() == coarse->Nsimd());
+  assert(fine->Nsimd() == coarse->Nsimd());
   // check that the two grids divide cleanly
-  GRID_ASSERT( (fine->lSites() / coarse->lSites() ) * coarse->lSites() == fine->lSites() );
+  assert( (fine->lSites() / coarse->lSites() ) * coarse->lSites() == fine->lSites() );
   return fine->lSites() / coarse->lSites();
 }
@@ -177,7 +177,7 @@ public:
   skip = skip<<shift;
-  GRID_ASSERT((skip >> shift)==site); // check for overflow
+  assert((skip >> shift)==site); // check for overflow
   eng.discard(skip);
 #else
@@ -218,7 +218,7 @@ public:
     GetState(saved,_generators[gen]);
   }
   void SetState(std::vector<RngStateType> & saved,RngEngine &eng){
-    GRID_ASSERT(saved.size()==RngStateCount);
+    assert(saved.size()==RngStateCount);
     std::stringstream ss;
     for(int i=0;i<RngStateCount;i++){
       ss<< saved[i]<<" ";
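Aside (not part of the change set): the overflow check before eng.discard works because a left shift that drops high bits cannot be undone by the matching right shift. In isolation:

  // Sketch only: skip-ahead distance site * 2^shift with an overflow guard.
  #include <cstdint>
  #include <cassert>

  uint64_t skipDistance(uint64_t site, unsigned shift)
  {
    uint64_t skip = site << shift;
    assert((skip >> shift) == site);   // fails iff high bits were shifted out
    return skip;
  }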

View File

@@ -21,18 +21,9 @@ NAMESPACE_BEGIN(Grid);
 #if defined(GRID_CUDA) || defined(GRID_HIP)
-template<class vobj>
-inline void sliceSumReduction_cub_small(const vobj *Data,
-                                        std::vector<vobj> &lvSum,
-                                        const int rd,
-                                        const int e1,
-                                        const int e2,
-                                        const int stride,
-                                        const int ostride,
-                                        const int Nsimd)
-{
+template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
   size_t subvol_size = e1*e2;
-  deviceVector<vobj> reduction_buffer(rd*subvol_size);
+  commVector<vobj> reduction_buffer(rd*subvol_size);
   auto rb_p = &reduction_buffer[0];
   vobj zero_init;
   zeroit(zero_init);
@@ -55,7 +46,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
   d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
   //copy offsets to device
-  acceleratorCopyToDeviceAsynch(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
+  acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
   gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
@@ -88,7 +79,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
     exit(EXIT_FAILURE);
   }
-  acceleratorCopyFromDeviceAsynch(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
+  acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
   //sync after copy
   accelerator_barrier();
@@ -103,15 +94,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
 #if defined(GRID_SYCL)
-template<class vobj>
-inline void sliceSumReduction_sycl_small(const vobj *Data,
-                                         std::vector <vobj> &lvSum,
-                                         const int  &rd,
-                                         const int &e1,
-                                         const int &e2,
-                                         const int &stride,
-                                         const int &ostride,
-                                         const int &Nsimd)
+template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int  &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
 {
   size_t subvol_size = e1*e2;
@@ -122,7 +105,7 @@ inline void sliceSumReduction_sycl_small(const vobj *Data,
     mysum[r] = vobj_zero;
   }
-  deviceVector<vobj> reduction_buffer(rd*subvol_size);
+  commVector<vobj> reduction_buffer(rd*subvol_size);
   auto rb_p = &reduction_buffer[0];
@@ -141,11 +124,11 @@ inline void sliceSumReduction_sycl_small(const vobj *Data,
   });
   for (int r = 0; r < rd; r++) {
-    theGridAccelerator->submit([&](sycl::handler &cgh) {
-      auto Reduction = sycl::reduction(&mysum[r],std::plus<>());
-      cgh.parallel_for(sycl::range<1>{subvol_size},
+    theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
+      auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>());
+      cgh.parallel_for(cl::sycl::range<1>{subvol_size},
                        Reduction,
-                       [=](sycl::id<1> item, auto &sum) {
+                       [=](cl::sycl::id<1> item, auto &sum) {
                          auto s = item[0];
                          sum += rb_p[r*subvol_size+s];
                        });
@@ -161,23 +144,14 @@ inline void sliceSumReduction_sycl_small(const vobj *Data,
 }
 #endif
-template<class vobj>
-inline void sliceSumReduction_large(const vobj *Data,
-                                    std::vector<vobj> &lvSum,
-                                    const int rd,
-                                    const int e1,
-                                    const int e2,
-                                    const int stride,
-                                    const int ostride,
-                                    const int Nsimd)
-{
+template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
   typedef typename vobj::vector_type vector;
   const int words = sizeof(vobj)/sizeof(vector);
   const int osites = rd*e1*e2;
-  deviceVector<vector>buffer(osites);
+  commVector<vector>buffer(osites);
   vector *dat = (vector *)Data;
   vector *buf = &buffer[0];
-  std::vector<vector> lvSum_small(rd);
+  Vector<vector> lvSum_small(rd);
   vector *lvSum_ptr = (vector *)&lvSum[0];
   for (int w = 0; w < words; w++) {
@@ -194,18 +168,13 @@ inline void sliceSumReduction_large(const vobj *Data,
     for (int r = 0; r < rd; r++) {
       lvSum_ptr[w+words*r]=lvSum_small[r];
     }
-  }
-}
-template<class vobj>
-inline void sliceSumReduction_gpu(const Lattice<vobj> &Data,
-                                  std::vector<vobj> &lvSum,
-                                  const int rd,
-                                  const int e1,
-                                  const int e2,
-                                  const int stride,
-                                  const int ostride,
-                                  const int Nsimd)
+  }
+
+}
+
+template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
 {
   autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case.
   if constexpr (sizeof(vobj) <= 256) {
@@ -223,15 +192,7 @@ inline void sliceSumReduction_gpu(const Lattice<vobj> &Data,
 }
-template<class vobj>
-inline void sliceSumReduction_cpu(const Lattice<vobj> &Data,
-                                  std::vector<vobj> &lvSum,
-                                  const int &rd,
-                                  const int &e1,
-                                  const int &e2,
-                                  const int &stride,
-                                  const int &ostride,
-                                  const int &Nsimd)
+template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
 {
   // sum over reduced dimension planes, breaking out orthog dir
   // Parallel over orthog direction
@@ -247,19 +208,15 @@ inline void sliceSumReduction_cpu(const Lattice<vobj> &Data,
   });
 }
-template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data,
-                                                   std::vector<vobj> &lvSum,
-                                                   const int &rd,
-                                                   const int &e1,
-                                                   const int &e2,
-                                                   const int &stride,
-                                                   const int &ostride,
-                                                   const int &Nsimd)
+template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
 {
 #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
   sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
 #else
   sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
 #endif
 }

View File

@@ -31,61 +31,32 @@ NAMESPACE_BEGIN(Grid);
 inline void subdivides(GridBase *coarse,GridBase *fine)
 {
-  GRID_ASSERT(coarse->_ndimension == fine->_ndimension);
+  assert(coarse->_ndimension == fine->_ndimension);
   int _ndimension = coarse->_ndimension;
   // local and global volumes subdivide cleanly after SIMDization
   for(int d=0;d<_ndimension;d++){
-    GRID_ASSERT(coarse->_processors[d]  == fine->_processors[d]);
-    GRID_ASSERT(coarse->_simd_layout[d] == fine->_simd_layout[d]);
-    GRID_ASSERT((fine->_rdimensions[d] / coarse->_rdimensions[d])* coarse->_rdimensions[d]==fine->_rdimensions[d]);
+    assert(coarse->_processors[d]  == fine->_processors[d]);
+    assert(coarse->_simd_layout[d] == fine->_simd_layout[d]);
+    assert((fine->_rdimensions[d] / coarse->_rdimensions[d])* coarse->_rdimensions[d]==fine->_rdimensions[d]);
   }
 }
 ////////////////////////////////////////////////////////////////////////////////////////////
 // remove and insert a half checkerboard
 ////////////////////////////////////////////////////////////////////////////////////////////
 template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full)
 {
-  half.Checkerboard() = cb;
-  autoView( half_v, half, CpuWrite);
-  autoView( full_v, full, CpuRead);
-  thread_for(ss, full.Grid()->oSites(),{
-    int cbos;
-    Coordinate coor;
-    full.Grid()->oCoorFromOindex(coor,ss);
-    cbos=half.Grid()->CheckerBoard(coor);
-    if (cbos==cb) {
-      int ssh=half.Grid()->oIndex(coor);
-      half_v[ssh] = full_v[ss];
-    }
-  });
+  acceleratorPickCheckerboard(cb,half,full);
 }
 template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half)
 {
-  int cb = half.Checkerboard();
-  autoView( half_v , half, CpuRead);
-  autoView( full_v , full, CpuWrite);
-  thread_for(ss,full.Grid()->oSites(),{
-    Coordinate coor;
-    int cbos;
-    full.Grid()->oCoorFromOindex(coor,ss);
-    cbos=half.Grid()->CheckerBoard(coor);
-    if (cbos==cb) {
-      int ssh=half.Grid()->oIndex(coor);
-      full_v[ss]=half_v[ssh];
-    }
-  });
+  acceleratorSetCheckerboard(full,half);
 }
-template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0)
+template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int dummy=0)
 {
   half.Checkerboard() = cb;
   autoView(half_v, half, AcceleratorWrite);
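For orientation (not part of the change set): pickCheckerboard copies the sites of one parity into a half-grid field and setCheckerboard scatters them back, so the pair is an exact round trip. A minimal sketch in the usual Grid idiom, assuming matching full and red-black grids:

  // Sketch only: even/odd decomposition and reassembly of a fermion field.
  void evenOddRoundTrip(GridCartesian *FGrid, GridRedBlackCartesian *RBGrid,
                        const LatticeFermion &full)
  {
    LatticeFermion even(RBGrid), odd(RBGrid), back(FGrid);
    pickCheckerboard(Even, even, full);  // even-parity sites only
    pickCheckerboard(Odd,  odd,  full);  // odd-parity sites only
    setCheckerboard(back, even);         // reassemble the full field
    setCheckerboard(back, odd);
    // norm2(back - full) should vanish
  }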
@@ -95,6 +66,7 @@ template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj
   unsigned long ndim_half          = half.Grid()->_ndimension;
   Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
   Coordinate ostride_half          = half.Grid()->_ostride;
+  int checker_dim_half             = half.Grid()->CheckerDim();
   accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{
     Coordinate coor;
@@ -102,7 +74,7 @@ template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj
     int linear=0;
     Lexicographic::CoorFromIndex(coor,ss,rdim_full);
-    GRID_ASSERT(coor.size()==ndim_half);
+    assert(coor.size()==ndim_half);
     for(int d=0;d<ndim_half;d++){
       if(checker_dim_mask_half[d]) linear += coor[d];
@@ -119,7 +91,7 @@ template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj
     }
   });
 }
-template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0)
+template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int dummy=0)
 {
   int cb = half.Checkerboard();
   autoView(half_v , half, AcceleratorRead);
@@ -129,6 +101,7 @@ template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,
   unsigned long ndim_half          = half.Grid()->_ndimension;
   Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
   Coordinate ostride_half          = half.Grid()->_ostride;
+  int checker_dim_half             = half.Grid()->CheckerDim();
   accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{
     Coordinate coor;
@@ -136,7 +109,7 @@ template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,
     int linear=0;
     Lexicographic::CoorFromIndex(coor,ss,rdim_full);
-    GRID_ASSERT(coor.size()==ndim_half);
+    assert(coor.size()==ndim_half);
     for(int d=0;d<ndim_half;d++){
       if(checker_dim_mask_half[d]) linear += coor[d];
@@ -309,7 +282,7 @@ inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &co
                               const VLattice &Basis)
 {
   int NBatch = fineData.size();
-  GRID_ASSERT(coarseData.size() == NBatch);
+  assert(coarseData.size() == NBatch);
   GridBase * fine  = fineData[0].Grid();
   GridBase * coarse= coarseData[0].Grid();
@@ -344,7 +317,7 @@ template<class vobj,class vobj2,class CComplex>
   GridBase * coarse= coarseA.Grid();
   fineZ.Checkerboard()=fineX.Checkerboard();
-  GRID_ASSERT(fineX.Checkerboard()==fineY.Checkerboard());
+  assert(fineX.Checkerboard()==fineY.Checkerboard());
   subdivides(coarse,fine); // require they map
   conformable(fineX,fineY);
   conformable(fineX,fineZ);
@@ -356,7 +329,7 @@ template<class vobj,class vobj2,class CComplex>
   // FIXME merge with subdivide checking routine as this is redundant
   for(int d=0 ; d<_ndimension;d++){
     block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
-    GRID_ASSERT(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
+    assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
   }
   autoView( fineZ_ , fineZ, AcceleratorWrite);
@@ -613,7 +586,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
   int _ndimension = coarse->_ndimension;
   // checks
-  GRID_ASSERT( nbasis == Basis.size() );
+  assert( nbasis == Basis.size() );
   subdivides(coarse,fine);
   for(int i=0;i<nbasis;i++){
     conformable(Basis[i].Grid(),fine);
@@ -687,7 +660,7 @@ inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis>
                               const VLattice &Basis)
 {
   int NBatch = coarseData.size();
-  GRID_ASSERT(fineData.size() == NBatch);
+  assert(fineData.size() == NBatch);
   GridBase * fine   = fineData[0].Grid();
   GridBase * coarse = coarseData[0].Grid();
@@ -715,12 +688,12 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
   int ni = ig->_ndimension;
   int no = og->_ndimension;
-  GRID_ASSERT(ni == no);
+  assert(ni == no);
   for(int d=0;d<no;d++){
-    GRID_ASSERT(ig->_processors[d]  == og->_processors[d]);
-    GRID_ASSERT(ig->_ldimensions[d] == og->_ldimensions[d]);
-    GRID_ASSERT(ig->lSites() == og->lSites());
+    assert(ig->_processors[d]  == og->_processors[d]);
+    assert(ig->_ldimensions[d] == og->_ldimensions[d]);
+    assert(ig->lSites() == og->lSites());
   }
   autoView(in_v,in,CpuRead);
@@ -752,16 +725,16 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
   GridBase *Fg = From.Grid();
   GridBase *Tg = To.Grid();
-  GRID_ASSERT(!Fg->_isCheckerBoarded);
-  GRID_ASSERT(!Tg->_isCheckerBoarded);
+  assert(!Fg->_isCheckerBoarded);
+  assert(!Tg->_isCheckerBoarded);
   int Nsimd = Fg->Nsimd();
   int nF = Fg->_ndimension;
   int nT = Tg->_ndimension;
   int nd = nF;
-  GRID_ASSERT(nF == nT);
+  assert(nF == nT);
   for(int d=0;d<nd;d++){
-    GRID_ASSERT(Fg->_processors[d]  == Tg->_processors[d]);
+    assert(Fg->_processors[d]  == Tg->_processors[d]);
   }
   ///////////////////////////////////////////////////////////
@@ -821,12 +794,12 @@ void InsertSliceFast(const Lattice<vobj> &From,Lattice<vobj> & To,int slice, int
   //////////////////////////////////////////////////////////////////////////////////////////
   GridBase *Fg = From.Grid();
   GridBase *Tg = To.Grid();
-  GRID_ASSERT(!Fg->_isCheckerBoarded);
-  GRID_ASSERT(!Tg->_isCheckerBoarded);
+  assert(!Fg->_isCheckerBoarded);
+  assert(!Tg->_isCheckerBoarded);
   int Nsimd = Fg->Nsimd();
   int nF = Fg->_ndimension;
   int nT = Tg->_ndimension;
-  GRID_ASSERT(nF+1 == nT);
+  assert(nF+1 == nT);
   ///////////////////////////////////////////////////////////
   // do the index calc on the GPU
@@ -890,12 +863,12 @@ void ExtractSliceFast(Lattice<vobj> &To,const Lattice<vobj> & From,int slice, in
   //////////////////////////////////////////////////////////////////////////////////////////
   GridBase *Fg = From.Grid();
   GridBase *Tg = To.Grid();
-  GRID_ASSERT(!Fg->_isCheckerBoarded);
-  GRID_ASSERT(!Tg->_isCheckerBoarded);
+  assert(!Fg->_isCheckerBoarded);
+  assert(!Tg->_isCheckerBoarded);
   int Nsimd = Fg->Nsimd();
   int nF = Fg->_ndimension;
   int nT = Tg->_ndimension;
-  GRID_ASSERT(nT+1 == nF);
+  assert(nT+1 == nF);
   ///////////////////////////////////////////////////////////
   // do the index calc on the GPU
@@ -955,16 +928,16 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
   int nl = lg->_ndimension;
   int nh = hg->_ndimension;
-  GRID_ASSERT(nl+1 == nh);
-  GRID_ASSERT(orthog<nh);
-  GRID_ASSERT(orthog>=0);
-  GRID_ASSERT(hg->_processors[orthog]==1);
+  assert(nl+1 == nh);
+  assert(orthog<nh);
+  assert(orthog>=0);
+  assert(hg->_processors[orthog]==1);
   int dl; dl = 0;
   for(int d=0;d<nh;d++){
     if ( d != orthog) {
-      GRID_ASSERT(lg->_processors[dl]  == hg->_processors[d]);
-      GRID_ASSERT(lg->_ldimensions[dl] == hg->_ldimensions[d]);
+      assert(lg->_processors[dl]  == hg->_processors[d]);
+      assert(lg->_ldimensions[dl] == hg->_ldimensions[d]);
       dl++;
     }
   }
@@ -981,14 +954,8 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
     hcoor[orthog] = slice;
     for(int d=0;d<nh;d++){
       if ( d!=orthog ) {
-	hcoor[d]=lcoor[ddl];
-	if ( hg->_checker_dim == d ) {
-	  hcoor[d]=hcoor[d]*2; // factor in the full coor for peekLocalSite
-	  lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
-	}
-	ddl++;
+	hcoor[d]=lcoor[ddl++];
       }
     }
     peekLocalSite(s,lowDimv,lcoor);
     pokeLocalSite(s,higherDimv,hcoor);
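For orientation (not part of the change set): ExtractSlice and InsertSlice move one hyperplane between a d-dimensional field and a (d-1)-dimensional one, e.g. a single timeslice of a 4-d field. A minimal sketch, assuming the 3-d grid matches the transverse layout of the 4-d one:

  // Sketch only: copy the t_src timeslice of a 4-d field onto timeslice t_dst.
  void copyTimeslice(LatticeComplex &field4d, GridCartesian *Grid3d,
                     int t_src, int t_dst)
  {
    const int Tdir = 3;
    LatticeComplex slice(Grid3d);
    ExtractSlice(slice, field4d, t_src, Tdir); // 4d -> 3d hyperplane
    InsertSlice(slice, field4d, t_dst, Tdir);  // 3d -> 4d hyperplane
  }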
@@ -1005,17 +972,16 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
   int nl = lg->_ndimension;
   int nh = hg->_ndimension;
-  GRID_ASSERT(nl+1 == nh);
-  GRID_ASSERT(orthog<nh);
-  GRID_ASSERT(orthog>=0);
-  GRID_ASSERT(hg->_processors[orthog]==1);
-  lowDim.Checkerboard() = higherDim.Checkerboard();
+  assert(nl+1 == nh);
+  assert(orthog<nh);
+  assert(orthog>=0);
+  assert(hg->_processors[orthog]==1);
   int dl; dl = 0;
   for(int d=0;d<nh;d++){
     if ( d != orthog) {
-      GRID_ASSERT(lg->_processors[dl]  == hg->_processors[d]);
-      GRID_ASSERT(lg->_ldimensions[dl] == hg->_ldimensions[d]);
+      assert(lg->_processors[dl]  == hg->_processors[d]);
+      assert(lg->_ldimensions[dl] == hg->_ldimensions[d]);
       dl++;
     }
   }
@@ -1027,16 +993,11 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
     Coordinate lcoor(nl);
     Coordinate hcoor(nh);
     lg->LocalIndexToLocalCoor(idx,lcoor);
-    hcoor[orthog] = slice;
     int ddl=0;
+    hcoor[orthog] = slice;
     for(int d=0;d<nh;d++){
       if ( d!=orthog ) {
-	hcoor[d]=lcoor[ddl];
-	if ( hg->_checker_dim == d ) {
-	  hcoor[d]=hcoor[d]*2; // factor in the full gridd coor for peekLocalSite
-	  lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
-	}
-	ddl++;
+	hcoor[d]=lcoor[ddl++];
       }
     }
     peekLocalSite(s,higherDimv,hcoor);
@@ -1056,14 +1017,14 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
   int nl = lg->_ndimension;
   int nh = hg->_ndimension;
-  GRID_ASSERT(nl == nh);
-  GRID_ASSERT(orthog<nh);
-  GRID_ASSERT(orthog>=0);
+  assert(nl == nh);
+  assert(orthog<nh);
+  assert(orthog>=0);
   for(int d=0;d<nh;d++){
     if ( d!=orthog ) {
-      GRID_ASSERT(lg->_processors[d]  == hg->_processors[d]);
-      GRID_ASSERT(lg->_ldimensions[d] == hg->_ldimensions[d]);
+      assert(lg->_processors[d]  == hg->_processors[d]);
+      assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
     }
   }
   Coordinate sz = lg->_ldimensions;
@@ -1093,7 +1054,7 @@ void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
   subdivides(cg,fg);
-  GRID_ASSERT(cg->_ndimension==fg->_ndimension);
+  assert(cg->_ndimension==fg->_ndimension);
   Coordinate ratio(cg->_ndimension);
@@ -1157,7 +1118,7 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
       int lex;
       Lexicographic::IndexFromCoor(lcoor, lex, in_grid->_ldimensions);
-      GRID_ASSERT(lex < out.size());
+      assert(lex < out.size());
       out_ptrs[lane] = &out[lex];
     }
@@ -1221,7 +1182,7 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
   typedef typename vobj::vector_type vtype;
   GridBase* grid = out.Grid();
-  GRID_ASSERT(in.size()==grid->lSites());
+  assert(in.size()==grid->lSites());
   const int ndim     = grid->Nd();
   constexpr int nsimd    = vtype::Nsimd();
@@ -1268,7 +1229,7 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
   typedef typename vobj::vector_type vtype;
   GridBase* grid = out._grid;
-  GRID_ASSERT(in.size()==grid->lSites());
+  assert(in.size()==grid->lSites());
   int ndim     = grid->Nd();
   int nsimd    = vtype::Nsimd();
@@ -1329,9 +1290,9 @@ void precisionChangeFast(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
 template<class VobjOut, class VobjIn>
 void precisionChangeOrig(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
 {
-  GRID_ASSERT(out.Grid()->Nd() == in.Grid()->Nd());
+  assert(out.Grid()->Nd() == in.Grid()->Nd());
   for(int d=0;d<out.Grid()->Nd();d++){
-    GRID_ASSERT(out.Grid()->FullDimensions()[d] == in.Grid()->FullDimensions()[d]);
+    assert(out.Grid()->FullDimensions()[d] == in.Grid()->FullDimensions()[d]);
   }
   out.Checkerboard() = in.Checkerboard();
   GridBase *in_grid=in.Grid();
@@ -1382,9 +1343,9 @@ class precisionChangeWorkspace{
 public:
   precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid): _out_grid(out_grid), _in_grid(in_grid){
     //Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device
-    GRID_ASSERT(out_grid->Nd() == in_grid->Nd());
+    assert(out_grid->Nd() == in_grid->Nd());
     for(int d=0;d<out_grid->Nd();d++){
-      GRID_ASSERT(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]);
+      assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]);
    }
    int Nsimd_out = out_grid->Nsimd();
@@ -1549,7 +1510,7 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
   int full_vecs   = full.size();
-  GRID_ASSERT(full_vecs>=1);
+  assert(full_vecs>=1);
   GridBase * full_grid = full[0].Grid();
   GridBase *split_grid = split.Grid();
@@ -1567,18 +1528,18 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
   //////////////////////////////
   // Checks
   //////////////////////////////
-  GRID_ASSERT(full_grid->_ndimension==split_grid->_ndimension);
+  assert(full_grid->_ndimension==split_grid->_ndimension);
   for(int n=0;n<full_vecs;n++){
-    GRID_ASSERT(full[n].Checkerboard() == cb);
+    assert(full[n].Checkerboard() == cb);
     for(int d=0;d<ndim;d++){
-      GRID_ASSERT(full[n].Grid()->_gdimensions[d]==split.Grid()->_gdimensions[d]);
-      GRID_ASSERT(full[n].Grid()->_fdimensions[d]==split.Grid()->_fdimensions[d]);
+      assert(full[n].Grid()->_gdimensions[d]==split.Grid()->_gdimensions[d]);
+      assert(full[n].Grid()->_fdimensions[d]==split.Grid()->_fdimensions[d]);
     }
   }
   int   nvector   =full_nproc/split_nproc;
-  GRID_ASSERT(nvector*split_nproc==full_nproc);
-  GRID_ASSERT(nvector == full_vecs);
+  assert(nvector*split_nproc==full_nproc);
+  assert(nvector == full_vecs);
   Coordinate ratio(ndim);
   for(int d=0;d<ndim;d++){
@@ -1622,7 +1583,7 @@ void Grid_split(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
	int fvol   = lsites;
-	int chunk  = (nvec*fvol)/sP;            GRID_ASSERT(chunk*sP == nvec*fvol);
+	int chunk  = (nvec*fvol)/sP;            assert(chunk*sP == nvec*fvol);
	// Loop over reordered data post A2A
	thread_for(c, chunk, {
@@ -1675,7 +1636,7 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
   int full_vecs   = full.size();
-  GRID_ASSERT(full_vecs>=1);
+  assert(full_vecs>=1);
   GridBase * full_grid = full[0].Grid();
   GridBase *split_grid = split.Grid();
@@ -1693,18 +1654,18 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
   //////////////////////////////
   // Checks
   //////////////////////////////
-  GRID_ASSERT(full_grid->_ndimension==split_grid->_ndimension);
+  assert(full_grid->_ndimension==split_grid->_ndimension);
   for(int n=0;n<full_vecs;n++){
-    GRID_ASSERT(full[n].Checkerboard() == cb);
+    assert(full[n].Checkerboard() == cb);
     for(int d=0;d<ndim;d++){
-      GRID_ASSERT(full[n].Grid()->_gdimensions[d]==split.Grid()->_gdimensions[d]);
-      GRID_ASSERT(full[n].Grid()->_fdimensions[d]==split.Grid()->_fdimensions[d]);
+      assert(full[n].Grid()->_gdimensions[d]==split.Grid()->_gdimensions[d]);
+      assert(full[n].Grid()->_fdimensions[d]==split.Grid()->_fdimensions[d]);
     }
   }
   int   nvector   =full_nproc/split_nproc;
-  GRID_ASSERT(nvector*split_nproc==full_nproc);
-  GRID_ASSERT(nvector == full_vecs);
+  assert(nvector*split_nproc==full_nproc);
+  assert(nvector == full_vecs);
   Coordinate ratio(ndim);
   for(int d=0;d<ndim;d++){
@@ -1740,7 +1701,7 @@ void Grid_unsplit(std::vector<Lattice<Vobj> > & full,Lattice<Vobj> & split)
auto lsites= rsites/M; // Decreases rsites by M auto lsites= rsites/M; // Decreases rsites by M
int fvol = lsites; int fvol = lsites;
int chunk = (nvec*fvol)/sP; GRID_ASSERT(chunk*sP == nvec*fvol); int chunk = (nvec*fvol)/sP; assert(chunk*sP == nvec*fvol);
{ {
// Loop over reordered data post A2A // Loop over reordered data post A2A
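The divisibility checks above pin down the split bookkeeping: the full communicator must factor exactly into the split one, with one source lattice per factor. A worked sketch with illustrative numbers:

  // e.g. full_nproc = 16 ranks split into groups of split_nproc = 4
  int nvector = full_nproc / split_nproc;            // 4 copies of the split grid
  GRID_ASSERT(nvector * split_nproc == full_nproc);  // ranks must divide evenly
  GRID_ASSERT(nvector == full_vecs);                 // one source lattice per copy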

View File

@@ -10,7 +10,7 @@ class LatticeBase {};
/////////////////////////////////////////////////////////////////////////////////////////
void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
{
- GRID_ASSERT(lhs == rhs);
+ assert(lhs == rhs);
}
////////////////////////////////////////////////////////////////////////////
@@ -106,47 +106,6 @@ public:
}
};
- #ifdef GRID_LOG_VIEWS
- // Little autoscope assister
- template<class View>
- class ViewCloser
- {
- View v; // Take a copy of view and call view close when I go out of scope automatically
- const char* filename; int line, mode;
- public:
- ViewCloser(View &_v, const char* _filename, int _line, int _mode) :
- v(_v), filename(_filename), line(_line), mode(_mode) {
- switch (mode){
- case AcceleratorRead:
- case AcceleratorWrite:
- case CpuRead:
- case CpuWrite:
- ViewLogger::LogOpen(filename, line, 1, mode, &v[0], v.size() * sizeof(v[0]));
- break;
- }
- };
- ~ViewCloser() {
- switch (mode) {
- case AcceleratorWriteDiscard:
- case AcceleratorWrite:
- case CpuWrite:
- ViewLogger::LogClose(filename, line, -1, mode, &v[0], v.size() * sizeof(v[0]));
- break;
- }
- v.ViewClose();
- }
- };
- #define autoView(l_v,l,mode) \
- auto l_v = l.View(mode); \
- ViewCloser<decltype(l_v)> _autoView##l_v(l_v,__FILE__,__LINE__,mode);
- #else
// Little autoscope assister
template<class View>
class ViewCloser
@@ -160,7 +119,6 @@ class ViewCloser
#define autoView(l_v,l,mode) \
auto l_v = l.View(mode); \
ViewCloser<decltype(l_v)> _autoView##l_v(l_v);
- #endif
/////////////////////////////////////////////////////////////////////////////////////////
// Lattice expression types used by ET to assemble the AST
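Both variants of ViewCloser are RAII guards: autoView opens a view for the requested mode and guarantees ViewClose() when the scope exits, with the GRID_LOG_VIEWS build additionally logging opens and closes. A minimal usage sketch, assuming a LatticeComplex z on some GridBase *grid:

  {
    autoView(z_v, z, AcceleratorWrite);   // view opened (and logged under GRID_LOG_VIEWS)
    accelerator_for(ss, grid->oSites(), vComplex::Nsimd(), {
      coalescedWrite(z_v[ss], 2.0 * coalescedRead(z_v[ss]));
    });
  } // ViewCloser destructor fires here and calls z_v.ViewClose()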

View File

@@ -54,7 +54,7 @@ struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::ve
*
*/
- template<class vobj> inline void ScatterSlice(const deviceVector<vobj> &buf,
+ template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
Lattice<vobj> &lat,
int x,
int dim,
@@ -82,10 +82,10 @@ template<class vobj> inline void ScatterSlice(const deviceVector<vobj> &buf,
int rNsimd = 1; for(int d=0;d<Nd;d++) rNsimd*=rsimd[d];
int rNsimda= Nsimd/simd[dim]; // should be equal
- GRID_ASSERT(rNsimda==rNsimd);
+ assert(rNsimda==rNsimd);
int face_ovol=block*nblock;
- // GRID_ASSERT(buf.size()==face_ovol*rNsimd);
+ // assert(buf.size()==face_ovol*rNsimd);
/*This will work GPU ONLY unless rNsimd is put in the lexico index*/
//Let's make it work on GPU and then make a special accelerator_for that
@@ -140,7 +140,7 @@ template<class vobj> inline void ScatterSlice(const deviceVector<vobj> &buf,
});
}
- template<class vobj> inline void GatherSlice(deviceVector<vobj> &buf,
+ template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf,
const Lattice<vobj> &lat,
int x,
int dim,
@@ -172,7 +172,7 @@ template<class vobj> inline void GatherSlice(deviceVector<vobj> &buf,
int face_ovol=block*nblock;
- // GRID_ASSERT(buf.size()==face_ovol*rNsimd);
+ // assert(buf.size()==face_ovol*rNsimd);
/*This will work GPU ONLY unless rNsimd is put in the lexico index*/
//Let's make it work on GPU and then make a special accelerator_for that
@@ -247,7 +247,7 @@ public:
Coordinate local =unpadded_grid->LocalDimensions();
Coordinate procs =unpadded_grid->ProcessorGrid();
for(int d=0;d<dims;d++){
- if ( procs[d] > 1 ) GRID_ASSERT(local[d]>=depth);
+ if ( procs[d] > 1 ) assert(local[d]>=depth);
}
}
void DeleteGrids(void)
@@ -448,9 +448,9 @@ public:
int nld = to.Grid()->_ldimensions[dimension];
const int Nsimd = vobj::Nsimd();
- GRID_ASSERT(depth<=lds[dimension]); // A must be on neighbouring node
+ assert(depth<=lds[dimension]); // A must be on neighbouring node
- GRID_ASSERT(depth>0); // A caller bug if zero
+ assert(depth>0); // A caller bug if zero
- GRID_ASSERT(ld+2*depth==nld);
+ assert(ld+2*depth==nld);
////////////////////////////////////////////////////////////////////////////
// Face size and byte calculations
////////////////////////////////////////////////////////////////////////////
@@ -460,21 +460,15 @@ public:
}
buffer_size = buffer_size / Nsimd;
int rNsimd = Nsimd / simd[dimension];
- GRID_ASSERT( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);
+ assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);
- static deviceVector<vobj> send_buf;
+ static cshiftVector<vobj> send_buf;
- static deviceVector<vobj> recv_buf;
+ static cshiftVector<vobj> recv_buf;
send_buf.resize(buffer_size*2*depth);
recv_buf.resize(buffer_size*2*depth);
- #ifndef ACCELERATOR_AWARE_MPI
- static hostVector<vobj> hsend_buf;
- static hostVector<vobj> hrecv_buf;
- hsend_buf.resize(buffer_size*2*depth);
- hrecv_buf.resize(buffer_size*2*depth);
- #endif
- std::vector<MpiCommsRequest_t> fwd_req;
+ std::vector<CommsRequest_t> fwd_req;
- std::vector<MpiCommsRequest_t> bwd_req;
+ std::vector<CommsRequest_t> bwd_req;
int words = buffer_size;
int bytes = words * sizeof(vobj);
@@ -501,16 +495,9 @@ public:
t_gather+=usecond()-t;
t=usecond();
- #ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFromBegin(fwd_req,
(void *)&send_buf[d*buffer_size], xmit_to_rank,
(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
- #else
- acceleratorCopyFromDevice(&send_buf[d*buffer_size],&hsend_buf[d*buffer_size],bytes);
- grid->SendToRecvFromBegin(fwd_req,
- (void *)&hsend_buf[d*buffer_size], xmit_to_rank,
- (void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag);
- #endif
t_comms+=usecond()-t;
}
for ( int d=0;d < depth ; d ++ ) {
@@ -521,16 +508,9 @@ public:
t_gather+= usecond() - t;
t=usecond();
- #ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFromBegin(bwd_req,
(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
- #else
- acceleratorCopyFromDevice(&send_buf[(d+depth)*buffer_size],&hsend_buf[(d+depth)*buffer_size],bytes);
- grid->SendToRecvFromBegin(bwd_req,
- (void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank,
- (void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
- #endif
t_comms+=usecond()-t;
}
@@ -553,11 +533,6 @@ public:
t=usecond();
grid->CommsComplete(fwd_req);
- #ifndef ACCELERATOR_AWARE_MPI
- for ( int d=0;d < depth ; d ++ ) {
- acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
- }
- #endif
t_comms+= usecond() - t;
t=usecond();
@@ -568,11 +543,6 @@ public:
t=usecond();
grid->CommsComplete(bwd_req);
- #ifndef ACCELERATOR_AWARE_MPI
- for ( int d=0;d < depth ; d ++ ) {
- acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
- }
- #endif
t_comms+= usecond() - t;
t=usecond();
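The #ifndef ACCELERATOR_AWARE_MPI branches removed above stage the halo through host mirrors of the device buffers on both sides of the transfer; condensed, using the buffer names from the diff:

  acceleratorCopyFromDevice(&send_buf[d*buffer_size], &hsend_buf[d*buffer_size], bytes); // device -> host
  grid->SendToRecvFromBegin(fwd_req,
                            (void *)&hsend_buf[d*buffer_size], xmit_to_rank,
                            (void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag);
  // ... later, after grid->CommsComplete(fwd_req):
  acceleratorCopyToDevice(&hrecv_buf[d*buffer_size], &recv_buf[d*buffer_size], bytes);   // host -> device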

View File

@@ -69,7 +69,6 @@ GridLogger GridLogMemory (1, "Memory", GridLogColours, "NORMAL");
GridLogger GridLogTracing(1, "Tracing", GridLogColours, "NORMAL");
GridLogger GridLogDebug (1, "Debug", GridLogColours, "PURPLE");
GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
- GridLogger GridLogComms (1, "Comms", GridLogColours, "BLUE");
GridLogger GridLogDslash (1, "Dslash", GridLogColours, "BLUE");
GridLogger GridLogIterative (1, "Iterative", GridLogColours, "BLUE");
GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
@@ -85,7 +84,6 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
GridLogDebug.Active(0);
GridLogPerformance.Active(0);
GridLogDslash.Active(0);
- GridLogComms.Active(0);
GridLogIntegrator.Active(1);
GridLogColours.Active(0);
GridLogHMC.Active(1);
@@ -99,7 +97,6 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1);
if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
if (logstreams[i] == std::string("Dslash")) GridLogDslash.Active(1);
- if (logstreams[i] == std::string("Comms")) GridLogComms.Active(1);
if (logstreams[i] == std::string("NoIntegrator"))GridLogIntegrator.Active(0);
if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0);
if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1);
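GridLogConfigure enables or disables streams by name, so retiring GridLogComms also retires its "Comms" token. A usage sketch with hypothetical stream choices:

  std::vector<std::string> streams = {"Debug", "Comms", "NoIntegrator"};
  GridLogConfigure(streams);  // Debug and Comms on, Integrator silenced
  std::cout << GridLogComms << "halo exchange posted" << std::endl;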

View File

@@ -33,6 +33,10 @@
#ifndef GRID_LOG_H
#define GRID_LOG_H
+ #ifdef HAVE_EXECINFO_H
+ #include <execinfo.h>
+ #endif
NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////////////////////////////////////////
@@ -176,7 +180,6 @@ extern GridLogger GridLogError;
extern GridLogger GridLogWarning;
extern GridLogger GridLogMessage;
extern GridLogger GridLogDebug;
- extern GridLogger GridLogComms;
extern GridLogger GridLogPerformance;
extern GridLogger GridLogDslash;
extern GridLogger GridLogIterative;
@@ -223,6 +226,8 @@ inline void Grid_pass(Args&&... args) {
std::cout << "\033[32m" << GridLogMessage << msg << "\033[0m" << std::endl;
}
+ #define _NBACKTRACE (256)
+ extern void * Grid_backtrace_buffer[_NBACKTRACE];
#define BACKTRACEFILE() { \
char string[20]; \
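Grid_backtrace_buffer and the execinfo.h include restored here feed the standard glibc backtrace API; a minimal sketch of how such a buffer is typically filled and symbolised:

#ifdef HAVE_EXECINFO_H
  int depth = backtrace(Grid_backtrace_buffer, _NBACKTRACE);        // capture return addresses
  char **symbols = backtrace_symbols(Grid_backtrace_buffer, depth); // translate to strings
  for (int i = 0; i < depth; i++) std::cout << symbols[i] << std::endl;
  free(symbols); // backtrace_symbols allocates with malloc
#endif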

View File

@@ -293,9 +293,9 @@ class BinaryIO {
// Flatten the file
uint64_t lsites = grid->lSites();
if ( control & BINARYIO_MASTER_APPEND ) {
- GRID_ASSERT(iodata.size()==1);
+ assert(iodata.size()==1);
} else {
- GRID_ASSERT(lsites==iodata.size());
+ assert(lsites==iodata.size());
}
for(int d=0;d<ndim;d++){
gStart[d] = lLattice[d]*pcoor[d];
@@ -326,20 +326,20 @@ class BinaryIO {
// Sobj in MPI phrasing
//////////////////////////////////////////////////////////////////////////////
int ierr;
- ierr = MPI_Type_contiguous(numword,mpiword,&mpiObject); GRID_ASSERT(ierr==0);
+ ierr = MPI_Type_contiguous(numword,mpiword,&mpiObject); assert(ierr==0);
ierr = MPI_Type_commit(&mpiObject);
//////////////////////////////////////////////////////////////////////////////
// File global array data type
//////////////////////////////////////////////////////////////////////////////
- ierr=MPI_Type_create_subarray(ndim,&gLattice[0],&lLattice[0],&gStart[0],MPI_ORDER_FORTRAN, mpiObject,&fileArray); GRID_ASSERT(ierr==0);
+ ierr=MPI_Type_create_subarray(ndim,&gLattice[0],&lLattice[0],&gStart[0],MPI_ORDER_FORTRAN, mpiObject,&fileArray); assert(ierr==0);
- ierr=MPI_Type_commit(&fileArray); GRID_ASSERT(ierr==0);
+ ierr=MPI_Type_commit(&fileArray); assert(ierr==0);
//////////////////////////////////////////////////////////////////////////////
// local lattice array
//////////////////////////////////////////////////////////////////////////////
- ierr=MPI_Type_create_subarray(ndim,&lLattice[0],&lLattice[0],&lStart[0],MPI_ORDER_FORTRAN, mpiObject,&localArray); GRID_ASSERT(ierr==0);
+ ierr=MPI_Type_create_subarray(ndim,&lLattice[0],&lLattice[0],&lStart[0],MPI_ORDER_FORTRAN, mpiObject,&localArray); assert(ierr==0);
- ierr=MPI_Type_commit(&localArray); GRID_ASSERT(ierr==0);
+ ierr=MPI_Type_commit(&localArray); assert(ierr==0);
#endif
//////////////////////////////////////////////////////////////////////////////
@@ -349,8 +349,8 @@ class BinaryIO {
int ieee32 = (format == std::string("IEEE32"));
int ieee64big = (format == std::string("IEEE64BIG"));
int ieee64 = (format == std::string("IEEE64") || format == std::string("IEEE64LITTLE"));
- GRID_ASSERT(ieee64||ieee32|ieee64big||ieee32big);
+ assert(ieee64||ieee32|ieee64big||ieee32big);
- GRID_ASSERT((ieee64+ieee32+ieee64big+ieee32big)==1);
+ assert((ieee64+ieee32+ieee64big+ieee32big)==1);
//////////////////////////////////////////////////////////////////////////////
// Do the I/O
//////////////////////////////////////////////////////////////////////////////
@@ -361,9 +361,9 @@ class BinaryIO {
if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
#ifdef USE_MPI_IO
std::cout<< GridLogMessage<<"IOobject: MPI read I/O "<< file<< std::endl;
- ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh); GRID_ASSERT(ierr==0);
+ ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh); assert(ierr==0);
- ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); GRID_ASSERT(ierr==0);
+ ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr==0);
- ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status); GRID_ASSERT(ierr==0);
+ ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status); assert(ierr==0);
MPI_File_close(&fh);
MPI_Type_free(&fileArray);
MPI_Type_free(&localArray);
@@ -384,7 +384,7 @@ class BinaryIO {
fin.seekg(offset + myrank * lsites * sizeof(fobj));
}
fin.read((char *)&iodata[0], iodata.size() * sizeof(fobj));
- GRID_ASSERT(fin.fail() == 0);
+ assert(fin.fail() == 0);
fin.close();
}
timer.Stop();
@@ -435,11 +435,11 @@ class BinaryIO {
std::cout << GridLogDebug << "MPI write I/O set view " << file << std::endl;
ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);
- GRID_ASSERT(ierr == 0);
+ assert(ierr == 0);
std::cout << GridLogDebug << "MPI write I/O write all " << file << std::endl;
ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
- GRID_ASSERT(ierr == 0);
+ assert(ierr == 0);
MPI_Offset os;
MPI_File_get_position(fh, &os);
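The MPI I/O path above composes a contiguous type for one site record, a subarray giving this rank's window into the global lattice (the file view), and a subarray for the local layout, then reads or writes collectively. A self-contained sketch of the same construction for a toy 2-d double array (sizes and filename hypothetical):

  MPI_Datatype fileArray;
  int gsizes[2] = {64, 64}, lsizes[2] = {32, 32}, starts[2] = {0, 0};
  MPI_Type_create_subarray(2, gsizes, lsizes, starts,
                           MPI_ORDER_FORTRAN, MPI_DOUBLE, &fileArray);
  MPI_Type_commit(&fileArray);

  MPI_File fh;
  MPI_Status status;
  MPI_File_open(MPI_COMM_WORLD, "lattice.bin", MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);
  MPI_File_set_view(fh, 0, MPI_DOUBLE, fileArray, "native", MPI_INFO_NULL); // this rank's window
  std::vector<double> local(32 * 32);
  MPI_File_read_all(fh, local.data(), (int)local.size(), MPI_DOUBLE, &status); // collective read
  MPI_File_close(&fh);
  MPI_Type_free(&fileArray);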

View File

@@ -289,7 +289,7 @@ class GridLimeReader : public BinaryIO {
return;
}
}
- GRID_ASSERT(0);
+ assert(0);
}
////////////////////////////////////////////
// Read a generic serialisable object
@@ -314,7 +314,7 @@ class GridLimeReader : public BinaryIO {
}
}
- GRID_ASSERT(0);
+ assert(0);
}
template<class serialisable_object>
@@ -348,7 +348,7 @@ class GridLimeWriter : public BinaryIO
filename= _filename;
if ( boss_node ) {
File = fopen(filename.c_str(), "w");
- LimeW = limeCreateWriter(File); GRID_ASSERT(LimeW != NULL );
+ LimeW = limeCreateWriter(File); assert(LimeW != NULL );
}
}
/////////////////////////////////////////////
@@ -368,7 +368,7 @@ class GridLimeWriter : public BinaryIO
if ( boss_node ) {
LimeRecordHeader *h;
h = limeCreateHeader(MB, ME, const_cast<char *>(message.c_str()), PayloadSize);
- GRID_ASSERT(limeWriteRecordHeader(h, LimeW) >= 0);
+ assert(limeWriteRecordHeader(h, LimeW) >= 0);
limeDestroyHeader(h);
}
return LIME_SUCCESS;
@@ -386,11 +386,11 @@ class GridLimeWriter : public BinaryIO
// std::cout << " xmlstring "<< nbytes<< " " << xmlstring <<std::endl;
int err;
LimeRecordHeader *h = limeCreateHeader(MB, ME,const_cast<char *>(record_name.c_str()), nbytes);
- GRID_ASSERT(h!= NULL);
+ assert(h!= NULL);
- err=limeWriteRecordHeader(h, LimeW); GRID_ASSERT(err>=0);
+ err=limeWriteRecordHeader(h, LimeW); assert(err>=0);
- err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); GRID_ASSERT(err>=0);
+ err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); assert(err>=0);
- err=limeWriterCloseRecord(LimeW); GRID_ASSERT(err>=0);
+ err=limeWriterCloseRecord(LimeW); assert(err>=0);
limeDestroyHeader(h);
}
}
@@ -431,7 +431,7 @@ class GridLimeWriter : public BinaryIO
////////////////////////////////////////////////////////////////////
GridBase *grid = field.Grid();
- GRID_ASSERT(boss_node == field.Grid()->IsBoss() );
+ assert(boss_node == field.Grid()->IsBoss() );
FieldNormMetaData FNMD; FNMD.norm2 = norm2(field);
@@ -473,7 +473,7 @@ class GridLimeWriter : public BinaryIO
if ( boss_node ) {
fseek(File,0,SEEK_END);
uint64_t offset2 = ftello(File); // std::cout << " now at offset "<<offset2 << std::endl;
- GRID_ASSERT( (offset2-offset1) == PayloadSize);
+ assert( (offset2-offset1) == PayloadSize);
}
/////////////////////////////////////////////////////////////
@@ -481,7 +481,7 @@ class GridLimeWriter : public BinaryIO
/////////////////////////////////////////////////////////////
if ( boss_node ) {
- err=limeWriterCloseRecord(LimeW); GRID_ASSERT(err>=0);
+ err=limeWriterCloseRecord(LimeW); assert(err>=0);
}
////////////////////////////////////////
// Write checksum element, propagaing forward from the BinaryIO
@@ -621,8 +621,8 @@ class IldgWriter : public ScidacWriter {
uint64_t PayloadSize = LFN.size();
int err;
createLimeRecordHeader(ILDG_DATA_LFN, 0 , 0, PayloadSize);
- err=limeWriteRecordData(const_cast<char*>(LFN.c_str()), &PayloadSize,LimeW); GRID_ASSERT(err>=0);
+ err=limeWriteRecordData(const_cast<char*>(LFN.c_str()), &PayloadSize,LimeW); assert(err>=0);
- err=limeWriterCloseRecord(LimeW); GRID_ASSERT(err>=0);
+ err=limeWriterCloseRecord(LimeW); assert(err>=0);
}
////////////////////////////////////////////////////////////////
@@ -656,7 +656,7 @@ class IldgWriter : public ScidacWriter {
header.sequence_number = sequence;
header.ildg_lfn = LFN;
- GRID_ASSERT ( (format == std::string("IEEE32BIG"))
+ assert ( (format == std::string("IEEE32BIG"))
||(format == std::string("IEEE64BIG")) );
//////////////////////////////////////////////////////
@@ -676,8 +676,8 @@ class IldgWriter : public ScidacWriter {
ildgfmt.ly = header.dimension[1];
ildgfmt.lz = header.dimension[2];
ildgfmt.lt = header.dimension[3];
- GRID_ASSERT(header.nd==4);
+ assert(header.nd==4);
- GRID_ASSERT(header.nd==header.dimension.size());
+ assert(header.nd==header.dimension.size());
//////////////////////////////////////////////////////////////////////////////
// Field norm tests
@@ -734,7 +734,7 @@ class IldgReader : public GridLimeReader {
Coordinate dims = Umu.Grid()->FullDimensions();
- GRID_ASSERT(dims.size()==4);
+ assert(dims.size()==4);
// Metadata holders
ildgFormat ildgFormat_ ;
@@ -793,10 +793,10 @@ class IldgReader : public GridLimeReader {
if ( ildgFormat_.precision == 64 ) format = std::string("IEEE64BIG");
if ( ildgFormat_.precision == 32 ) format = std::string("IEEE32BIG");
- GRID_ASSERT( ildgFormat_.lx == dims[0]);
+ assert( ildgFormat_.lx == dims[0]);
- GRID_ASSERT( ildgFormat_.ly == dims[1]);
+ assert( ildgFormat_.ly == dims[1]);
- GRID_ASSERT( ildgFormat_.lz == dims[2]);
+ assert( ildgFormat_.lz == dims[2]);
- GRID_ASSERT( ildgFormat_.lt == dims[3]);
+ assert( ildgFormat_.lt == dims[3]);
found_ildgFormat = 1;
}
@@ -813,10 +813,10 @@ class IldgReader : public GridLimeReader {
format = FieldMetaData_.floating_point;
- GRID_ASSERT(FieldMetaData_.dimension[0] == dims[0]);
+ assert(FieldMetaData_.dimension[0] == dims[0]);
- GRID_ASSERT(FieldMetaData_.dimension[1] == dims[1]);
+ assert(FieldMetaData_.dimension[1] == dims[1]);
- GRID_ASSERT(FieldMetaData_.dimension[2] == dims[2]);
+ assert(FieldMetaData_.dimension[2] == dims[2]);
- GRID_ASSERT(FieldMetaData_.dimension[3] == dims[3]);
+ assert(FieldMetaData_.dimension[3] == dims[3]);
found_FieldMetaData = 1;
}
@@ -866,13 +866,13 @@ class IldgReader : public GridLimeReader {
// Minimally must find binary segment and checksum
// Since this is an ILDG reader require ILDG format
//////////////////////////////////////////////////////
- GRID_ASSERT(found_ildgLFN);
+ assert(found_ildgLFN);
- GRID_ASSERT(found_ildgBinary);
+ assert(found_ildgBinary);
- GRID_ASSERT(found_ildgFormat);
+ assert(found_ildgFormat);
- GRID_ASSERT(found_scidacChecksum);
+ assert(found_scidacChecksum);
// Must find something with the lattice dimensions
- GRID_ASSERT(found_FieldMetaData||found_ildgFormat);
+ assert(found_FieldMetaData||found_ildgFormat);
if ( found_FieldMetaData ) {
@@ -880,9 +880,9 @@ class IldgReader : public GridLimeReader {
} else {
- GRID_ASSERT(found_ildgFormat);
+ assert(found_ildgFormat);
const std::string stNC = std::to_string( Nc ) ;
- GRID_ASSERT ( ildgFormat_.field == std::string("su"+stNC+"gauge") );
+ assert ( ildgFormat_.field == std::string("su"+stNC+"gauge") );
///////////////////////////////////////////////////////////////////////////////////////
// Populate our Grid metadata as best we can
@@ -927,20 +927,20 @@ class IldgReader : public GridLimeReader {
FieldMetaData_.scidac_checksuma = stoull(scidacChecksum_.suma,0,16);
FieldMetaData_.scidac_checksumb = stoull(scidacChecksum_.sumb,0,16);
scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb);
- GRID_ASSERT( scidac_csuma ==FieldMetaData_.scidac_checksuma);
+ assert( scidac_csuma ==FieldMetaData_.scidac_checksuma);
- GRID_ASSERT( scidac_csumb ==FieldMetaData_.scidac_checksumb);
+ assert( scidac_csumb ==FieldMetaData_.scidac_checksumb);
std::cout << GridLogMessage<<"SciDAC checksums match " << std::endl;
} else {
std::cout << GridLogWarning<<"SciDAC checksums not found. This is unsafe. " << std::endl;
- GRID_ASSERT(0); // Can I insist always checksum ?
+ assert(0); // Can I insist always checksum ?
}
if ( found_FieldMetaData || found_usqcdInfo ) {
FieldMetaData checker;
stats Stats;
Stats(Umu,checker);
- GRID_ASSERT(fabs(checker.plaquette - FieldMetaData_.plaquette )<1.0e-5);
+ assert(fabs(checker.plaquette - FieldMetaData_.plaquette )<1.0e-5);
- GRID_ASSERT(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5);
+ assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5);
std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl;
}
}
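Every Lime record written in this file follows the same header/data/close discipline visible in the hunks above; condensed, with a hypothetical payload buffer and error handling elided:

  uint64_t nbytes = payload.size();
  LimeRecordHeader *h = limeCreateHeader(MB, ME, const_cast<char *>(record_name.c_str()), nbytes);
  limeWriteRecordHeader(h, LimeW);                  // announce record name and payload size
  limeWriteRecordData(&payload[0], &nbytes, LimeW); // stream the payload
  limeWriterCloseRecord(LimeW);                     // pad out to the Lime record boundary
  limeDestroyHeader(h);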

View File

@@ -203,7 +203,7 @@ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzCo
//////////////////////////////////////////////////////////////////////
inline void reconstruct3(LorentzColourMatrix & cm)
{
- GRID_ASSERT( Nc < 4 && Nc > 1 ) ;
+ assert( Nc < 4 && Nc > 1 ) ;
for(int mu=0;mu<Nd;mu++){
#if Nc == 2
cm(mu)()(1,0) = -adj(cm(mu)()(0,y)) ;
@@ -240,7 +240,7 @@ struct BinarySimpleUnmunger {
sobj_stype *in_buffer = (sobj_stype *)&in;
size_t fobj_words = sizeof(out) / sizeof(fobj_stype);
size_t sobj_words = sizeof(in) / sizeof(sobj_stype);
- GRID_ASSERT(fobj_words == sobj_words);
+ assert(fobj_words == sobj_words);
for (unsigned int word = 0; word < sobj_words; word++)
out_buffer[word] = in_buffer[word]; // type conversion on the fly
@@ -259,7 +259,7 @@ struct BinarySimpleMunger {
sobj_stype *out_buffer = (sobj_stype *)&out;
size_t fobj_words = sizeof(in) / sizeof(fobj_stype);
size_t sobj_words = sizeof(out) / sizeof(sobj_stype);
- GRID_ASSERT(fobj_words == sobj_words);
+ assert(fobj_words == sobj_words);
for (unsigned int word = 0; word < sobj_words; word++)
out_buffer[word] = in_buffer[word]; // type conversion on the fly
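Both mungers rely on the file and in-memory objects having identical word counts and convert element by element through flat pointers. Schematically, for a double-precision memory object written to a single-precision file object (instance names hypothetical):

  // sobj_stype = double (memory), fobj_stype = float (file); word counts must match
  size_t words = sizeof(sobj_instance) / sizeof(double);
  double *in_buffer  = (double *)&sobj_instance;
  float  *out_buffer = (float  *)&fobj_instance;
  for (size_t w = 0; w < words; w++)
    out_buffer[w] = (float)in_buffer[w]; // narrowing conversion on the fly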

View File

@@ -76,7 +76,7 @@ public:
removeWhitespace(line);
std::cout << GridLogMessage << "* " << line << std::endl;
- GRID_ASSERT(line==std::string("BEGIN_HEADER"));
+ assert(line==std::string("BEGIN_HEADER"));
do {
getline(fin,line); // read one line
@@ -106,9 +106,9 @@ public:
field.dimension[2] = std::stol(header["DIMENSION_3"]);
field.dimension[3] = std::stol(header["DIMENSION_4"]);
- GRID_ASSERT(grid->_ndimension == 4);
+ assert(grid->_ndimension == 4);
for(int d=0;d<4;d++){
- GRID_ASSERT(grid->_fdimensions[d]==field.dimension[d]);
+ assert(grid->_fdimensions[d]==field.dimension[d]);
}
field.link_trace = std::stod(header["LINK_TRACE"]);
@@ -183,7 +183,7 @@ public:
nersc_csum,scidac_csuma,scidac_csumb);
}
} else {
- GRID_ASSERT(0);
+ assert(0);
}
GaugeStats Stats; Stats(Umu,clone);
@@ -205,9 +205,9 @@ public:
std::cerr << " nersc_csum " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
exit(0);
}
- if(exitOnReadPlaquetteMismatch()) GRID_ASSERT(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
+ if(exitOnReadPlaquetteMismatch()) assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
- GRID_ASSERT(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
+ assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
- GRID_ASSERT(nersc_csum == header.checksum );
+ assert(nersc_csum == header.checksum );
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
}
@@ -246,7 +246,7 @@ public:
GridBase *grid = Umu.Grid();
GridMetaData(grid,header);
- GRID_ASSERT(header.nd==4);
+ assert(header.nd==4);
GaugeStats Stats; Stats(Umu,header);
MachineCharacteristics(header);
@@ -302,7 +302,7 @@ public:
GridBase *grid = parallel.Grid();
GridMetaData(grid,header);
- GRID_ASSERT(header.nd==4);
+ assert(header.nd==4);
header.link_trace=0.0;
header.plaquette=0.0;
MachineCharacteristics(header);
@@ -355,16 +355,16 @@ public:
std::string data_type(header.data_type);
#ifdef RNG_RANLUX
- GRID_ASSERT(format == std::string("UINT64"));
+ assert(format == std::string("UINT64"));
- GRID_ASSERT(data_type == std::string("RANLUX48"));
+ assert(data_type == std::string("RANLUX48"));
#endif
#ifdef RNG_MT19937
- GRID_ASSERT(format == std::string("UINT32"));
+ assert(format == std::string("UINT32"));
- GRID_ASSERT(data_type == std::string("MT19937"));
+ assert(data_type == std::string("MT19937"));
#endif
#ifdef RNG_SITMO
- GRID_ASSERT(format == std::string("UINT64"));
+ assert(format == std::string("UINT64"));
- GRID_ASSERT(data_type == std::string("SITMO"));
+ assert(data_type == std::string("SITMO"));
#endif
// depending on datatype, set up munger;
@@ -376,7 +376,7 @@ public:
std::cerr << "checksum mismatch "<<std::hex<< nersc_csum <<" "<<header.checksum<<std::dec<<std::endl;
exit(0);
}
- GRID_ASSERT(nersc_csum == header.checksum );
+ assert(nersc_csum == header.checksum );
std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl;
}

View File

@@ -49,7 +49,7 @@ public:
{
std::ifstream fin(file, std::ios::in | std::ios::binary);
fin.read(reinterpret_cast<char*>(&header), sizeof(OpenQcdHeader));
- GRID_ASSERT(!fin.fail());
+ assert(!fin.fail());
field.data_start = fin.tellg();
fin.close();
}
@@ -57,10 +57,10 @@ public:
header.plaq /= normalisationFactor;
// sanity check (should trigger on endian issues)
- GRID_ASSERT(0 < header.Nt && header.Nt <= 1024);
+ assert(0 < header.Nt && header.Nt <= 1024);
- GRID_ASSERT(0 < header.Nx && header.Nx <= 1024);
+ assert(0 < header.Nx && header.Nx <= 1024);
- GRID_ASSERT(0 < header.Ny && header.Ny <= 1024);
+ assert(0 < header.Ny && header.Ny <= 1024);
- GRID_ASSERT(0 < header.Nz && header.Nz <= 1024);
+ assert(0 < header.Nz && header.Nz <= 1024);
field.dimension[0] = header.Nx;
field.dimension[1] = header.Ny;
@@ -71,9 +71,9 @@ public:
std::cout << GridLogDebug << "grid dimensions: " << grid->_fdimensions << std::endl;
std::cout << GridLogDebug << "file dimensions: " << field.dimension << std::endl;
- GRID_ASSERT(grid->_ndimension == Nd);
+ assert(grid->_ndimension == Nd);
for(int d = 0; d < Nd; d++)
- GRID_ASSERT(grid->_fdimensions[d] == field.dimension[d]);
+ assert(grid->_fdimensions[d] == field.dimension[d]);
field.plaquette = header.plaq;
@@ -86,10 +86,10 @@ public:
std::string file) {
typedef Lattice<iDoubleStoredColourMatrix<vsimd>> DoubleStoredGaugeField;
- GRID_ASSERT(Ns == 4 and Nd == 4 and Nc == 3);
+ assert(Ns == 4 and Nd == 4 and Nc == 3);
auto grid = dynamic_cast<GridCartesian*>(Umu.Grid());
- GRID_ASSERT(grid != nullptr); GRID_ASSERT(grid->_ndimension == Nd);
+ assert(grid != nullptr); assert(grid->_ndimension == Nd);
uint64_t offset = readHeader(file, Umu.Grid(), header);
@@ -171,7 +171,7 @@ public:
if(plaq_diff >= tol)
std::cout << " Plaquette mismatch (diff = " << plaq_diff << ", tol = " << tol << ")" << std::endl;
- GRID_ASSERT(plaq_diff < tol);
+ assert(plaq_diff < tol);
std::cout << GridLogMessage << "OpenQcd Configuration " << file << " and plaquette agree" << std::endl;
}

View File

@@ -62,7 +62,7 @@ public:
: swap(false)
, grid(gridPtr) {
err = MPI_File_open(comm, const_cast<char*>(filename.c_str()), MPI_MODE_RDONLY, MPI_INFO_NULL, &fp);
- GRID_ASSERT(err == MPI_SUCCESS);
+ assert(err == MPI_SUCCESS);
}
virtual ~ParRdr() { MPI_File_close(&fp); }
@@ -76,8 +76,8 @@ public:
}
int readHeader(FieldMetaData& field) {
- GRID_ASSERT((grid->_ndimension == Nd) && (Nd == 4));
+ assert((grid->_ndimension == Nd) && (Nd == 4));
- GRID_ASSERT(Nc == 3);
+ assert(Nc == 3);
OpenQcdHeader header;
@@ -86,10 +86,10 @@ public:
header.plaq /= 3.; // TODO change this into normalizationfactor
// sanity check (should trigger on endian issues) TODO remove?
- GRID_ASSERT(0 < header.Nt && header.Nt <= 1024);
+ assert(0 < header.Nt && header.Nt <= 1024);
- GRID_ASSERT(0 < header.Nx && header.Nx <= 1024);
+ assert(0 < header.Nx && header.Nx <= 1024);
- GRID_ASSERT(0 < header.Ny && header.Ny <= 1024);
+ assert(0 < header.Ny && header.Ny <= 1024);
- GRID_ASSERT(0 < header.Nz && header.Nz <= 1024);
+ assert(0 < header.Nz && header.Nz <= 1024);
field.dimension[0] = header.Nx;
field.dimension[1] = header.Ny;
@@ -97,7 +97,7 @@ public:
field.dimension[3] = header.Nt;
for(int d = 0; d < Nd; d++)
- GRID_ASSERT(grid->FullDimensions()[d] == field.dimension[d]);
+ assert(grid->FullDimensions()[d] == field.dimension[d]);
field.plaquette = header.plaq;
@@ -114,15 +114,15 @@ public:
int read = -1;
MPI_Get_count(&status, datatype, &read);
// CHECK_VAR(read)
- GRID_ASSERT(nbytes == (uint64_t)read);
+ assert(nbytes == (uint64_t)read);
- GRID_ASSERT(err == MPI_SUCCESS);
+ assert(err == MPI_SUCCESS);
}
void createTypes() {
constexpr int elem_size = Nd * 2 * 2 * Nc * Nc * sizeof(double); // 2_complex 2_fwdbwd
- err = MPI_Type_contiguous(elem_size, MPI_BYTE, &oddSiteType); GRID_ASSERT(err == MPI_SUCCESS);
+ err = MPI_Type_contiguous(elem_size, MPI_BYTE, &oddSiteType); assert(err == MPI_SUCCESS);
- err = MPI_Type_commit(&oddSiteType); GRID_ASSERT(err == MPI_SUCCESS);
+ err = MPI_Type_commit(&oddSiteType); assert(err == MPI_SUCCESS);
Coordinate const L = grid->GlobalDimensions();
Coordinate const l = grid->LocalDimensions();
@@ -132,20 +132,20 @@ public:
Coordinate subsizes({l[2] / 2, l[1], l[0], l[3]});
Coordinate starts({i[2] * l[2] / 2, i[1] * l[1], i[0] * l[0], i[3] * l[3]});
- err = MPI_Type_create_subarray(grid->_ndimension, &sizes[0], &subsizes[0], &starts[0], MPI_ORDER_FORTRAN, oddSiteType, &fileViewType); GRID_ASSERT(err == MPI_SUCCESS);
+ err = MPI_Type_create_subarray(grid->_ndimension, &sizes[0], &subsizes[0], &starts[0], MPI_ORDER_FORTRAN, oddSiteType, &fileViewType); assert(err == MPI_SUCCESS);
- err = MPI_Type_commit(&fileViewType); GRID_ASSERT(err == MPI_SUCCESS);
+ err = MPI_Type_commit(&fileViewType); assert(err == MPI_SUCCESS);
}
void freeTypes() {
- err = MPI_Type_free(&fileViewType); GRID_ASSERT(err == MPI_SUCCESS);
+ err = MPI_Type_free(&fileViewType); assert(err == MPI_SUCCESS);
- err = MPI_Type_free(&oddSiteType); GRID_ASSERT(err == MPI_SUCCESS);
+ err = MPI_Type_free(&oddSiteType); assert(err == MPI_SUCCESS);
}
bool readGauge(std::vector<ColourMatrixD>& domain_buff, FieldMetaData& meta) {
auto hdr_offset = readHeader(meta);
CHECK
createTypes();
- err = MPI_File_set_view(fp, hdr_offset, oddSiteType, fileViewType, "native", MPI_INFO_NULL); errInfo(err, "MPI_File_set_view0"); GRID_ASSERT(err == MPI_SUCCESS);
+ err = MPI_File_set_view(fp, hdr_offset, oddSiteType, fileViewType, "native", MPI_INFO_NULL); errInfo(err, "MPI_File_set_view0"); assert(err == MPI_SUCCESS);
CHECK
int const domainSites = grid->lSites();
domain_buff.resize(Nd * domainSites); // 2_fwdbwd * 4_Nd * domainSites / 2_onlyodd
@@ -166,7 +166,7 @@ public:
CHECK
err = MPI_File_set_view(fp, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL);
errInfo(err, "MPI_File_set_view1");
- GRID_ASSERT(err == MPI_SUCCESS);
+ assert(err == MPI_SUCCESS);
freeTypes();
std::cout << GridLogMessage << "read sum: " << n_os * os_size << " bytes" << std::endl;
@@ -182,7 +182,7 @@ public:
std::string file) {
typedef Lattice<iDoubleStoredColourMatrix<vsimd>> DoubledGaugeField;
- GRID_ASSERT(Ns == 4 and Nd == 4 and Nc == 3);
+ assert(Ns == 4 and Nd == 4 and Nc == 3);
auto grid = Umu.Grid();
@@ -225,7 +225,7 @@ public:
if(plaq_diff >= tol)
std::cout << " Plaquette mismatch (diff = " << plaq_diff << ", tol = " << tol << ")" << std::endl;
- GRID_ASSERT(plaq_diff < tol);
+ assert(plaq_diff < tol);
std::cout << GridLogMessage << "OpenQcd Configuration " << file << " and plaquette agree" << std::endl;
}
@@ -246,7 +246,7 @@ private:
static inline void copyToLatticeObject(std::vector<DoubleStoredColourMatrix>& u_fb,
std::vector<ColourMatrixD> const& node_buff,
GridBase* grid) {
- GRID_ASSERT(node_buff.size() == Nd * grid->lSites());
+ assert(node_buff.size() == Nd * grid->lSites());
Coordinate const& l = grid->LocalDimensions();
@@ -274,7 +274,7 @@ private:
buff_idx += 2 * Nd;
}
- GRID_ASSERT(node_buff.size() == buff_idx);
+ assert(node_buff.size() == buff_idx);
}
};
}; };

Some files were not shown because too many files have changed in this diff Show More