mirror of
https://github.com/paboyle/Grid.git
synced 2025-11-07 07:09:32 +00:00
Compare commits
54 Commits
feature/ft
...
a78a61d76f
| Author | SHA1 | Date | |
|---|---|---|---|
| a78a61d76f | |||
| 2eff3f34ed | |||
| 03687c1d62 | |||
| febfe4e77f | |||
| 4d1aa134b5 | |||
| 5ec879860a | |||
| b728af903c | |||
| 54f1999030 | |||
| fd58f0b669 | |||
| c5c67b706e | |||
| be7a543e2c | |||
| 68f112d576 | |||
| ec1395a304 | |||
| beb0e474ee | |||
| 2b5fdcbbc5 | |||
| 295127d456 | |||
| 7dcfb13694 | |||
| 9fa8bd6438 | |||
| 02c8178f16 | |||
| e637fbacae | |||
| 066544281f | |||
| 11be10d2c0 | |||
| 160969a758 | |||
| 622f78ebea | |||
|
|
aa67a5b095 | ||
|
|
af9ea0864c | ||
|
|
4e2a6d87c4 | ||
|
|
a465ecece9 | ||
|
|
575eb72182 | ||
|
|
3a973914d6 | ||
|
|
f568c07bbd | ||
|
|
2c9878fc3a | ||
|
|
27b1b1b005 | ||
|
|
130d7ab077 | ||
|
|
29f6b8a74a | ||
|
|
9779aaea33 | ||
|
|
ec25604a67 | ||
|
|
3668e81c5e | ||
|
|
d66b2423cb | ||
|
|
15cc78f0b6 | ||
|
|
06db4ddea2 | ||
|
|
6cfb90e99f | ||
|
|
d8be95a2a3 | ||
|
|
f82702872d | ||
|
|
3752c49ef0 | ||
|
|
fe65fa4988 | ||
|
|
1fe4c205a3 | ||
|
|
d4dc5e0f43 | ||
|
|
77944437ce | ||
|
|
c164bff758 | ||
|
|
aa2e3d954a | ||
|
|
de62b04728 | ||
|
|
d0bdb50f24 | ||
|
|
a8fecbc609 |
@@ -12,15 +12,13 @@
|
|||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <sys/time.h>
|
#include <sys/time.h>
|
||||||
|
|
||||||
#define GRID_SYCL
|
|
||||||
#undef GRID_HIP
|
|
||||||
#undef GRID_CUDA
|
|
||||||
|
|
||||||
#ifdef GRID_HIP
|
#ifdef GRID_HIP
|
||||||
#include <hipblas/hipblas.h>
|
#include <hipblas/hipblas.h>
|
||||||
#endif
|
#endif
|
||||||
#ifdef GRID_CUDA
|
#ifdef GRID_CUDA
|
||||||
#include <cublas_v2.h>
|
#include <cublas_v2.h>
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
#ifdef GRID_SYCL
|
#ifdef GRID_SYCL
|
||||||
#include <oneapi/mkl.hpp>
|
#include <oneapi/mkl.hpp>
|
||||||
@@ -45,6 +43,90 @@ inline void acceleratorFreeDevice(void *ptr,size_t bytes){free(ptr,*theAccelerat
|
|||||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theAccelerator->memset(base,value,bytes); theAccelerator->wait();}
|
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theAccelerator->memset(base,value,bytes); theAccelerator->wait();}
|
||||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theAccelerator->memcpy(to,from,bytes); theAccelerator->wait();}
|
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theAccelerator->memcpy(to,from,bytes); theAccelerator->wait();}
|
||||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theAccelerator->memcpy(to,from,bytes); theAccelerator->wait();}
|
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theAccelerator->memcpy(to,from,bytes); theAccelerator->wait();}
|
||||||
|
#define accelerator_barrier(dummy) { theAccelerator->wait(); }
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GRID_HIP
|
||||||
|
hipStream_t copyStream;
|
||||||
|
hipStream_t computeStream;
|
||||||
|
void acceleratorInit(void)
|
||||||
|
{
|
||||||
|
int device = 0;
|
||||||
|
auto discard = hipSetDevice(device);
|
||||||
|
discard = hipStreamCreate(&copyStream);
|
||||||
|
discard = hipStreamCreate(&computeStream);
|
||||||
|
printf("AcceleratorHIPInit\n");
|
||||||
|
}
|
||||||
|
inline void *acceleratorAllocDevice(size_t bytes)
|
||||||
|
{
|
||||||
|
void *ptr=NULL;
|
||||||
|
auto err = hipMalloc((void **)&ptr,bytes);
|
||||||
|
if( err != hipSuccess ) {
|
||||||
|
ptr = (void *) NULL;
|
||||||
|
fprintf(stderr," hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
|
||||||
|
}
|
||||||
|
return ptr;
|
||||||
|
};
|
||||||
|
inline void acceleratorFreeDevice(void *ptr,size_t bytes){ auto discard=hipFree(ptr);};
|
||||||
|
inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
|
||||||
|
inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
|
||||||
|
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
|
||||||
|
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
|
||||||
|
#define accelerator_barrier(dummy) \
|
||||||
|
{ \
|
||||||
|
auto tmp=hipStreamSynchronize(computeStream); \
|
||||||
|
auto err = hipGetLastError(); \
|
||||||
|
if ( err != hipSuccess ) { \
|
||||||
|
printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
|
||||||
|
puts(__FILE__); \
|
||||||
|
printf("Line %d\n",__LINE__); \
|
||||||
|
exit(0); \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef GRID_CUDA
|
||||||
|
cudaStream_t copyStream;
|
||||||
|
cudaStream_t computeStream;
|
||||||
|
void acceleratorInit(void)
|
||||||
|
{
|
||||||
|
int device = 0;
|
||||||
|
cudaSetDevice(device);
|
||||||
|
cudaStreamCreate(&copyStream);
|
||||||
|
cudaStreamCreate(&computeStream);
|
||||||
|
}
|
||||||
|
inline void *acceleratorAllocDevice(size_t bytes)
|
||||||
|
{
|
||||||
|
void *ptr=NULL;
|
||||||
|
auto err = cudaMalloc((void **)&ptr,bytes);
|
||||||
|
if( err != cudaSuccess ) {
|
||||||
|
ptr = (void *) NULL;
|
||||||
|
printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
|
||||||
|
}
|
||||||
|
return ptr;
|
||||||
|
};
|
||||||
|
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
|
||||||
|
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
|
||||||
|
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
|
||||||
|
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
|
||||||
|
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
|
||||||
|
#define accelerator_barrier(dummy) \
|
||||||
|
{ \
|
||||||
|
cudaStreamSynchronize(computeStream); \
|
||||||
|
cudaError err = cudaGetLastError(); \
|
||||||
|
if ( cudaSuccess != err ) { \
|
||||||
|
printf("accelerator_barrier(): Cuda error %s \n", \
|
||||||
|
cudaGetErrorString( err )); \
|
||||||
|
printf("File %s Line %d\n",__FILE__,__LINE__); \
|
||||||
|
fflush(stdout); \
|
||||||
|
if (acceleratorAbortOnGpuError) assert(err==cudaSuccess); \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
template<class T> void acceleratorPut(T& dev,T&host)
|
template<class T> void acceleratorPut(T& dev,T&host)
|
||||||
{
|
{
|
||||||
acceleratorCopyToDevice(&host,&dev,sizeof(T));
|
acceleratorCopyToDevice(&host,&dev,sizeof(T));
|
||||||
@@ -55,9 +137,6 @@ template<class T> T acceleratorGet(T& dev)
|
|||||||
acceleratorCopyFromDevice(&dev,&host,sizeof(T));
|
acceleratorCopyFromDevice(&dev,&host,sizeof(T));
|
||||||
return host;
|
return host;
|
||||||
}
|
}
|
||||||
#define accelerator_barrier(dummy) { theAccelerator->wait(); }
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
/**************************************************************
|
/**************************************************************
|
||||||
* Allocator
|
* Allocator
|
||||||
@@ -210,7 +289,270 @@ public:
|
|||||||
gridblasHandle->wait();
|
gridblasHandle->wait();
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////
|
||||||
|
// Single matrix GEMM -- fp64 and fp32
|
||||||
|
/////////////////////////////////////////////////////////////
|
||||||
|
void gemm(GridBLASOperation_t OpA,
|
||||||
|
GridBLASOperation_t OpB,
|
||||||
|
int m,int n, int k,
|
||||||
|
ComplexD alpha,
|
||||||
|
ComplexD* Amk, // Device pointer
|
||||||
|
ComplexD* Bkn,
|
||||||
|
ComplexD beta,
|
||||||
|
ComplexD* Cmn)
|
||||||
|
{
|
||||||
|
RealD t2=usecond();
|
||||||
|
|
||||||
|
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
||||||
|
assert(OpB!=GridBLAS_OP_T);
|
||||||
|
|
||||||
|
int lda = m; // m x k column major
|
||||||
|
int ldb = k; // k x n column major
|
||||||
|
int ldc = m; // m x b column major
|
||||||
|
if(OpA!=GridBLAS_OP_N)
|
||||||
|
lda = k;
|
||||||
|
if(OpB!=GridBLAS_OP_N)
|
||||||
|
ldb = n;
|
||||||
|
|
||||||
|
static deviceVector<ComplexD> alpha_p(1);
|
||||||
|
static deviceVector<ComplexD> beta_p(1);
|
||||||
|
// can prestore the 1 and the zero on device
|
||||||
|
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
|
||||||
|
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
|
||||||
|
RealD t0=usecond();
|
||||||
|
|
||||||
|
#ifdef GRID_HIP
|
||||||
|
hipblasOperation_t hOpA;
|
||||||
|
hipblasOperation_t hOpB;
|
||||||
|
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
|
||||||
|
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
|
||||||
|
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
|
||||||
|
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
|
||||||
|
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
|
||||||
|
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
|
||||||
|
auto err = hipblasZgemm(gridblasHandle,
|
||||||
|
hOpA,
|
||||||
|
hOpB,
|
||||||
|
m,n,k,
|
||||||
|
(hipblasDoubleComplex *) &alpha_p[0],
|
||||||
|
(hipblasDoubleComplex *) Amk, lda,
|
||||||
|
(hipblasDoubleComplex *) Bkn, ldb,
|
||||||
|
(hipblasDoubleComplex *) &beta_p[0],
|
||||||
|
(hipblasDoubleComplex *) Cmn, ldc);
|
||||||
|
assert(err==HIPBLAS_STATUS_SUCCESS);
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_CUDA
|
||||||
|
cublasOperation_t hOpA;
|
||||||
|
cublasOperation_t hOpB;
|
||||||
|
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
|
||||||
|
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
|
||||||
|
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
|
||||||
|
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
|
||||||
|
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
|
||||||
|
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
|
||||||
|
auto err = cublasZgemm(gridblasHandle,
|
||||||
|
hOpA,
|
||||||
|
hOpB,
|
||||||
|
m,n,k,
|
||||||
|
(cuDoubleComplex *) &alpha_p[0],
|
||||||
|
(cuDoubleComplex *) Amk, lda,
|
||||||
|
(cuDoubleComplex *) Bkn, ldb,
|
||||||
|
(cuDoubleComplex *) &beta_p[0],
|
||||||
|
(cuDoubleComplex *) Cmn, ldc);
|
||||||
|
assert(err==CUBLAS_STATUS_SUCCESS);
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_SYCL
|
||||||
|
int64_t m64=m;
|
||||||
|
int64_t n64=n;
|
||||||
|
int64_t k64=k;
|
||||||
|
int64_t lda64=lda;
|
||||||
|
int64_t ldb64=ldb;
|
||||||
|
int64_t ldc64=ldc;
|
||||||
|
|
||||||
|
oneapi::mkl::transpose iOpA;
|
||||||
|
oneapi::mkl::transpose iOpB;
|
||||||
|
|
||||||
|
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
|
||||||
|
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
|
||||||
|
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
|
||||||
|
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
|
||||||
|
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
|
||||||
|
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
|
||||||
|
|
||||||
|
oneapi::mkl::blas::column_major::gemm(*gridblasHandle,
|
||||||
|
iOpA,
|
||||||
|
iOpB,
|
||||||
|
m64,n64,k64,
|
||||||
|
(ComplexD *) &alpha_p[0],
|
||||||
|
(const ComplexD *)Amk, (int64_t )lda64,
|
||||||
|
(const ComplexD *)Bkn, (int64_t )ldb64,
|
||||||
|
(ComplexD *) &beta_p[0],
|
||||||
|
(ComplexD *)Cmn, (int64_t)ldc64);
|
||||||
|
synchronise();
|
||||||
|
#endif
|
||||||
|
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
||||||
|
// Need a default/reference implementation; use Eigen
|
||||||
|
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,m,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,k,n);
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||||
|
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,k,m);
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,k,n);
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
||||||
|
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,m,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,n,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
||||||
|
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,k,m);
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,n,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||||
|
} else {
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
RealD t1=usecond();
|
||||||
|
RealD flops = 8.0*m*n*k;
|
||||||
|
RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n);
|
||||||
|
}
|
||||||
|
void gemm(GridBLASOperation_t OpA,
|
||||||
|
GridBLASOperation_t OpB,
|
||||||
|
int m,int n, int k,
|
||||||
|
ComplexF alpha,
|
||||||
|
ComplexF* Amk, // Device pointer
|
||||||
|
ComplexF* Bkn,
|
||||||
|
ComplexF beta,
|
||||||
|
ComplexF* Cmn)
|
||||||
|
{
|
||||||
|
RealD t2=usecond();
|
||||||
|
|
||||||
|
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
||||||
|
assert(OpB!=GridBLAS_OP_T);
|
||||||
|
|
||||||
|
int lda = m; // m x k column major
|
||||||
|
int ldb = k; // k x n column major
|
||||||
|
int ldc = m; // m x b column major
|
||||||
|
if(OpA!=GridBLAS_OP_N)
|
||||||
|
lda = k;
|
||||||
|
if(OpB!=GridBLAS_OP_N)
|
||||||
|
ldb = n;
|
||||||
|
|
||||||
|
static deviceVector<ComplexF> alpha_p(1);
|
||||||
|
static deviceVector<ComplexF> beta_p(1);
|
||||||
|
// can prestore the 1 and the zero on device
|
||||||
|
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexF));
|
||||||
|
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
|
||||||
|
RealD t0=usecond();
|
||||||
|
|
||||||
|
#ifdef GRID_HIP
|
||||||
|
hipblasOperation_t hOpA;
|
||||||
|
hipblasOperation_t hOpB;
|
||||||
|
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
|
||||||
|
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
|
||||||
|
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
|
||||||
|
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
|
||||||
|
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
|
||||||
|
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
|
||||||
|
auto err = hipblasCgemm(gridblasHandle,
|
||||||
|
hOpA,
|
||||||
|
hOpB,
|
||||||
|
m,n,k,
|
||||||
|
(hipblasComplex *) &alpha_p[0],
|
||||||
|
(hipblasComplex *) Amk, lda,
|
||||||
|
(hipblasComplex *) Bkn, ldb,
|
||||||
|
(hipblasComplex *) &beta_p[0],
|
||||||
|
(hipblasComplex *) Cmn, ldc);
|
||||||
|
assert(err==HIPBLAS_STATUS_SUCCESS);
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_CUDA
|
||||||
|
cublasOperation_t hOpA;
|
||||||
|
cublasOperation_t hOpB;
|
||||||
|
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
|
||||||
|
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
|
||||||
|
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
|
||||||
|
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
|
||||||
|
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
|
||||||
|
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
|
||||||
|
auto err = cublasCgemm(gridblasHandle,
|
||||||
|
hOpA,
|
||||||
|
hOpB,
|
||||||
|
m,n,k,
|
||||||
|
(cuComplex *) &alpha_p[0],
|
||||||
|
(cuComplex *) Amk, lda,
|
||||||
|
(cuComplex *) Bkn, ldb,
|
||||||
|
(cuComplex *) &beta_p[0],
|
||||||
|
(cuComplex *) Cmn, ldc);
|
||||||
|
assert(err==CUBLAS_STATUS_SUCCESS);
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_SYCL
|
||||||
|
int64_t m64=m;
|
||||||
|
int64_t n64=n;
|
||||||
|
int64_t k64=k;
|
||||||
|
int64_t lda64=lda;
|
||||||
|
int64_t ldb64=ldb;
|
||||||
|
int64_t ldc64=ldc;
|
||||||
|
|
||||||
|
oneapi::mkl::transpose iOpA;
|
||||||
|
oneapi::mkl::transpose iOpB;
|
||||||
|
|
||||||
|
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
|
||||||
|
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
|
||||||
|
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
|
||||||
|
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
|
||||||
|
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
|
||||||
|
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
|
||||||
|
|
||||||
|
oneapi::mkl::blas::column_major::gemm(*gridblasHandle,
|
||||||
|
iOpA,
|
||||||
|
iOpB,
|
||||||
|
m64,n64,k64,
|
||||||
|
(ComplexF *) &alpha_p[0],
|
||||||
|
(const ComplexF *)Amk, (int64_t )lda64,
|
||||||
|
(const ComplexF *)Bkn, (int64_t )ldb64,
|
||||||
|
(ComplexF *) &beta_p[0],
|
||||||
|
(ComplexF *)Cmn, (int64_t )ldc64);
|
||||||
|
synchronise();
|
||||||
|
#endif
|
||||||
|
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
||||||
|
// Need a default/reference implementation; use Eigen
|
||||||
|
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,m,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,k,n);
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||||
|
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,k,m);
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,k,n);
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
||||||
|
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,m,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,n,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
||||||
|
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,k,m);
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,n,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||||
|
} else {
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
RealD t1=usecond();
|
||||||
|
RealD flops = 8.0*m*n*k;
|
||||||
|
RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////
|
||||||
void gemmBatched(int m,int n, int k,
|
void gemmBatched(int m,int n, int k,
|
||||||
ComplexD alpha,
|
ComplexD alpha,
|
||||||
deviceVector<ComplexD*> &Amk, // pointer list to matrices
|
deviceVector<ComplexD*> &Amk, // pointer list to matrices
|
||||||
@@ -241,36 +583,6 @@ public:
|
|||||||
beta,
|
beta,
|
||||||
Cmn);
|
Cmn);
|
||||||
}
|
}
|
||||||
void gemmBatched(int m,int n, int k,
|
|
||||||
RealD alpha,
|
|
||||||
deviceVector<RealD*> &Amk, // pointer list to matrices
|
|
||||||
deviceVector<RealD*> &Bkn,
|
|
||||||
RealD beta,
|
|
||||||
deviceVector<RealD*> &Cmn)
|
|
||||||
{
|
|
||||||
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
|
|
||||||
m,n,k,
|
|
||||||
alpha,
|
|
||||||
Amk,
|
|
||||||
Bkn,
|
|
||||||
beta,
|
|
||||||
Cmn);
|
|
||||||
}
|
|
||||||
void gemmBatched(int m,int n, int k,
|
|
||||||
RealF alpha,
|
|
||||||
deviceVector<RealF*> &Amk, // pointer list to matrices
|
|
||||||
deviceVector<RealF*> &Bkn,
|
|
||||||
RealF beta,
|
|
||||||
deviceVector<RealF*> &Cmn)
|
|
||||||
{
|
|
||||||
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
|
|
||||||
m,n,k,
|
|
||||||
alpha,
|
|
||||||
Amk,
|
|
||||||
Bkn,
|
|
||||||
beta,
|
|
||||||
Cmn);
|
|
||||||
}
|
|
||||||
|
|
||||||
void gemmBatched(GridBLASOperation_t OpA,
|
void gemmBatched(GridBLASOperation_t OpA,
|
||||||
GridBLASOperation_t OpB,
|
GridBLASOperation_t OpB,
|
||||||
@@ -623,301 +935,6 @@ public:
|
|||||||
RealD flops = 8.0*m*n*k*batchCount;
|
RealD flops = 8.0*m*n*k*batchCount;
|
||||||
RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount;
|
RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
|
||||||
// Single precision real GEMM
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
void gemmBatched(GridBLASOperation_t OpA,
|
|
||||||
GridBLASOperation_t OpB,
|
|
||||||
int m,int n, int k,
|
|
||||||
RealF alpha,
|
|
||||||
deviceVector<RealF*> &Amk, // pointer list to matrices
|
|
||||||
deviceVector<RealF*> &Bkn,
|
|
||||||
RealF beta,
|
|
||||||
deviceVector<RealF*> &Cmn)
|
|
||||||
{
|
|
||||||
RealD t2=usecond();
|
|
||||||
int32_t batchCount = Amk.size();
|
|
||||||
|
|
||||||
assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
|
|
||||||
assert(OpB!=GridBLAS_OP_C);
|
|
||||||
|
|
||||||
int lda = m; // m x k column major
|
|
||||||
int ldb = k; // k x n column major
|
|
||||||
int ldc = m; // m x b column major
|
|
||||||
if(OpA!=GridBLAS_OP_N)
|
|
||||||
lda = k;
|
|
||||||
if(OpB!=GridBLAS_OP_N)
|
|
||||||
ldb = n;
|
|
||||||
static deviceVector<RealF> alpha_p(1);
|
|
||||||
static deviceVector<RealF> beta_p(1);
|
|
||||||
// can prestore the 1 and the zero on device
|
|
||||||
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealF));
|
|
||||||
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF));
|
|
||||||
RealD t0=usecond();
|
|
||||||
|
|
||||||
assert(Bkn.size()==batchCount);
|
|
||||||
assert(Cmn.size()==batchCount);
|
|
||||||
#ifdef GRID_HIP
|
|
||||||
hipblasOperation_t hOpA;
|
|
||||||
hipblasOperation_t hOpB;
|
|
||||||
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
|
|
||||||
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
|
|
||||||
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
|
|
||||||
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
|
|
||||||
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
|
|
||||||
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
|
|
||||||
auto err = hipblasSgemmBatched(gridblasHandle,
|
|
||||||
hOpA,
|
|
||||||
hOpB,
|
|
||||||
m,n,k,
|
|
||||||
(float *) &alpha_p[0],
|
|
||||||
(float **)&Amk[0], lda,
|
|
||||||
(float **)&Bkn[0], ldb,
|
|
||||||
(float *) &beta_p[0],
|
|
||||||
(float **)&Cmn[0], ldc,
|
|
||||||
batchCount);
|
|
||||||
assert(err==HIPBLAS_STATUS_SUCCESS);
|
|
||||||
#endif
|
|
||||||
#ifdef GRID_CUDA
|
|
||||||
cublasOperation_t hOpA;
|
|
||||||
cublasOperation_t hOpB;
|
|
||||||
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
|
|
||||||
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
|
|
||||||
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
|
|
||||||
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
|
|
||||||
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
|
|
||||||
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
|
|
||||||
auto err = cublasSgemmBatched(gridblasHandle,
|
|
||||||
hOpA,
|
|
||||||
hOpB,
|
|
||||||
m,n,k,
|
|
||||||
(float *) &alpha_p[0],
|
|
||||||
(float **)&Amk[0], lda,
|
|
||||||
(float **)&Bkn[0], ldb,
|
|
||||||
(float *) &beta_p[0],
|
|
||||||
(float **)&Cmn[0], ldc,
|
|
||||||
batchCount);
|
|
||||||
assert(err==CUBLAS_STATUS_SUCCESS);
|
|
||||||
#endif
|
|
||||||
#ifdef GRID_SYCL
|
|
||||||
int64_t m64=m;
|
|
||||||
int64_t n64=n;
|
|
||||||
int64_t k64=k;
|
|
||||||
int64_t lda64=lda;
|
|
||||||
int64_t ldb64=ldb;
|
|
||||||
int64_t ldc64=ldc;
|
|
||||||
int64_t batchCount64=batchCount;
|
|
||||||
|
|
||||||
oneapi::mkl::transpose iOpA;
|
|
||||||
oneapi::mkl::transpose iOpB;
|
|
||||||
|
|
||||||
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
|
|
||||||
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
|
|
||||||
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
|
|
||||||
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
|
|
||||||
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
|
|
||||||
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
|
|
||||||
|
|
||||||
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
|
|
||||||
&iOpA,
|
|
||||||
&iOpB,
|
|
||||||
&m64,&n64,&k64,
|
|
||||||
(float *) &alpha_p[0],
|
|
||||||
(const float **)&Amk[0], (const int64_t *)&lda64,
|
|
||||||
(const float **)&Bkn[0], (const int64_t *)&ldb64,
|
|
||||||
(float *) &beta_p[0],
|
|
||||||
(float **)&Cmn[0], (const int64_t *)&ldc64,
|
|
||||||
(int64_t)1,&batchCount64,std::vector<sycl::event>());
|
|
||||||
synchronise();
|
|
||||||
#endif
|
|
||||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
|
||||||
// Need a default/reference implementation; use Eigen
|
|
||||||
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
|
|
||||||
thread_for (p, batchCount, {
|
|
||||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
|
|
||||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
|
|
||||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
|
||||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
|
||||||
});
|
|
||||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
|
||||||
thread_for (p, batchCount, {
|
|
||||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
|
|
||||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
|
|
||||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
|
||||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
|
||||||
});
|
|
||||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
|
||||||
thread_for (p, batchCount, {
|
|
||||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
|
|
||||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
|
|
||||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
|
||||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
|
||||||
});
|
|
||||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
|
||||||
thread_for (p, batchCount, {
|
|
||||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
|
|
||||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
|
|
||||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
|
||||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
|
||||||
} );
|
|
||||||
} else {
|
|
||||||
assert(0);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
RealD t1=usecond();
|
|
||||||
RealD flops = 2.0*m*n*k*batchCount;
|
|
||||||
RealD bytes = 1.0*sizeof(RealF)*(m*k+k*n+m*n)*batchCount;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
|
||||||
// Double precision real GEMM
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
|
||||||
void gemmBatched(GridBLASOperation_t OpA,
|
|
||||||
GridBLASOperation_t OpB,
|
|
||||||
int m,int n, int k,
|
|
||||||
RealD alpha,
|
|
||||||
deviceVector<RealD*> &Amk, // pointer list to matrices
|
|
||||||
deviceVector<RealD*> &Bkn,
|
|
||||||
RealD beta,
|
|
||||||
deviceVector<RealD*> &Cmn)
|
|
||||||
{
|
|
||||||
RealD t2=usecond();
|
|
||||||
int32_t batchCount = Amk.size();
|
|
||||||
|
|
||||||
assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
|
|
||||||
assert(OpB!=GridBLAS_OP_C);
|
|
||||||
|
|
||||||
int lda = m; // m x k column major
|
|
||||||
int ldb = k; // k x n column major
|
|
||||||
int ldc = m; // m x b column major
|
|
||||||
if(OpA!=GridBLAS_OP_N)
|
|
||||||
lda = k;
|
|
||||||
if(OpB!=GridBLAS_OP_N)
|
|
||||||
ldb = n;
|
|
||||||
|
|
||||||
static deviceVector<RealD> alpha_p(1);
|
|
||||||
static deviceVector<RealD> beta_p(1);
|
|
||||||
// can prestore the 1 and the zero on device
|
|
||||||
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealD));
|
|
||||||
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD));
|
|
||||||
RealD t0=usecond();
|
|
||||||
|
|
||||||
assert(Bkn.size()==batchCount);
|
|
||||||
assert(Cmn.size()==batchCount);
|
|
||||||
#ifdef GRID_HIP
|
|
||||||
hipblasOperation_t hOpA;
|
|
||||||
hipblasOperation_t hOpB;
|
|
||||||
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
|
|
||||||
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
|
|
||||||
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
|
|
||||||
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
|
|
||||||
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
|
|
||||||
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
|
|
||||||
auto err = hipblasDgemmBatched(gridblasHandle,
|
|
||||||
HIPBLAS_OP_N,
|
|
||||||
HIPBLAS_OP_N,
|
|
||||||
m,n,k,
|
|
||||||
(double *) &alpha_p[0],
|
|
||||||
(double **)&Amk[0], lda,
|
|
||||||
(double **)&Bkn[0], ldb,
|
|
||||||
(double *) &beta_p[0],
|
|
||||||
(double **)&Cmn[0], ldc,
|
|
||||||
batchCount);
|
|
||||||
assert(err==HIPBLAS_STATUS_SUCCESS);
|
|
||||||
#endif
|
|
||||||
#ifdef GRID_CUDA
|
|
||||||
cublasOperation_t hOpA;
|
|
||||||
cublasOperation_t hOpB;
|
|
||||||
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
|
|
||||||
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
|
|
||||||
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
|
|
||||||
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
|
|
||||||
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
|
|
||||||
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
|
|
||||||
auto err = cublasDgemmBatched(gridblasHandle,
|
|
||||||
hOpA,
|
|
||||||
hOpB,
|
|
||||||
m,n,k,
|
|
||||||
(double *) &alpha_p[0],
|
|
||||||
(double **)&Amk[0], lda,
|
|
||||||
(double **)&Bkn[0], ldb,
|
|
||||||
(double *) &beta_p[0],
|
|
||||||
(double **)&Cmn[0], ldc,
|
|
||||||
batchCount);
|
|
||||||
assert(err==CUBLAS_STATUS_SUCCESS);
|
|
||||||
#endif
|
|
||||||
#ifdef GRID_SYCL
|
|
||||||
int64_t m64=m;
|
|
||||||
int64_t n64=n;
|
|
||||||
int64_t k64=k;
|
|
||||||
int64_t lda64=lda;
|
|
||||||
int64_t ldb64=ldb;
|
|
||||||
int64_t ldc64=ldc;
|
|
||||||
int64_t batchCount64=batchCount;
|
|
||||||
|
|
||||||
oneapi::mkl::transpose iOpA;
|
|
||||||
oneapi::mkl::transpose iOpB;
|
|
||||||
|
|
||||||
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
|
|
||||||
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
|
|
||||||
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
|
|
||||||
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
|
|
||||||
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
|
|
||||||
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
|
|
||||||
|
|
||||||
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
|
|
||||||
&iOpA,
|
|
||||||
&iOpB,
|
|
||||||
&m64,&n64,&k64,
|
|
||||||
(double *) &alpha_p[0],
|
|
||||||
(const double **)&Amk[0], (const int64_t *)&lda64,
|
|
||||||
(const double **)&Bkn[0], (const int64_t *)&ldb64,
|
|
||||||
(double *) &beta_p[0],
|
|
||||||
(double **)&Cmn[0], (const int64_t *)&ldc64,
|
|
||||||
(int64_t)1,&batchCount64,std::vector<sycl::event>());
|
|
||||||
synchronise();
|
|
||||||
#endif
|
|
||||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
|
||||||
// Need a default/reference implementation; use Eigen
|
|
||||||
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
|
|
||||||
thread_for (p, batchCount, {
|
|
||||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
|
|
||||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
|
|
||||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
|
||||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
|
||||||
});
|
|
||||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
|
||||||
thread_for (p, batchCount, {
|
|
||||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
|
|
||||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
|
|
||||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
|
||||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
|
||||||
});
|
|
||||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
|
||||||
thread_for (p, batchCount, {
|
|
||||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
|
|
||||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
|
|
||||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
|
||||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
|
||||||
});
|
|
||||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
|
||||||
thread_for (p, batchCount, {
|
|
||||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
|
|
||||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
|
|
||||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
|
||||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
|
||||||
});
|
|
||||||
} else {
|
|
||||||
assert(0);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
RealD t1=usecond();
|
|
||||||
RealD flops = 2.0*m*n*k*batchCount;
|
|
||||||
RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
|
|
||||||
}
|
|
||||||
|
|
||||||
template<class CComplex>
|
template<class CComplex>
|
||||||
double benchmark(int M, int N, int K, int BATCH)
|
double benchmark(int M, int N, int K, int BATCH)
|
||||||
@@ -967,6 +984,47 @@ public:
|
|||||||
return flops; // Returns gigaflops
|
return flops; // Returns gigaflops
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<class CComplex>
|
||||||
|
double benchmark(int M, int N, int K)
|
||||||
|
{
|
||||||
|
int32_t N_A = M*K;
|
||||||
|
int32_t N_B = K*N;
|
||||||
|
int32_t N_C = M*N;
|
||||||
|
deviceVector<CComplex> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(CComplex));
|
||||||
|
deviceVector<CComplex> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(CComplex));
|
||||||
|
deviceVector<CComplex> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(CComplex));
|
||||||
|
CComplex alpha(1.0);
|
||||||
|
CComplex beta (1.0);
|
||||||
|
RealD flops = 8.0*M*N*K;
|
||||||
|
int ncall=10;
|
||||||
|
|
||||||
|
gemm(GridBLAS_OP_C,GridBLAS_OP_N,
|
||||||
|
M,N,K,
|
||||||
|
alpha,
|
||||||
|
&A[0], // m x k
|
||||||
|
&B[0], // k x n
|
||||||
|
beta,
|
||||||
|
&C[0]);
|
||||||
|
synchronise();
|
||||||
|
|
||||||
|
RealD t0 = usecond();
|
||||||
|
for(int i=0;i<ncall;i++){
|
||||||
|
gemm(GridBLAS_OP_N,GridBLAS_OP_N,
|
||||||
|
M,N,K,
|
||||||
|
alpha,
|
||||||
|
&A[0], // m x k
|
||||||
|
&B[0], // k x n
|
||||||
|
beta,
|
||||||
|
&C[0]);
|
||||||
|
synchronise();
|
||||||
|
}
|
||||||
|
RealD t1 = usecond();
|
||||||
|
RealD bytes = 1.0*sizeof(CComplex)*(M*N*2+N*K+M*K);
|
||||||
|
flops = 8.0*M*N*K*ncall;
|
||||||
|
flops = flops/(t1-t0)/1.e3;
|
||||||
|
return flops; // Returns gigaflops
|
||||||
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@@ -1035,6 +1093,21 @@ static void BLAS(void)
|
|||||||
std::cout<< M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl;
|
std::cout<< M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl;
|
||||||
}}
|
}}
|
||||||
fprintf(FP,"\n\n\n");
|
fprintf(FP,"\n\n\n");
|
||||||
|
|
||||||
|
std::cout << "----------------------------------------------------------"<<std::endl;
|
||||||
|
std::cout << " M "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (inner product matrix)"<<std::endl;
|
||||||
|
std::cout << "----------------------------------------------------------"<<std::endl;
|
||||||
|
{
|
||||||
|
int M=12;
|
||||||
|
int N=12;
|
||||||
|
std::vector<int> ks({4*1024*1024, 2*1024*1024, 1024*1024, 256*1024, 1024 });
|
||||||
|
for( int kk=0;kk<ks.size();kk++ ) {
|
||||||
|
int K = ks[kk];
|
||||||
|
double p=blas.benchmark<CComplex>(M,N,K);
|
||||||
|
fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, 1, p);
|
||||||
|
std::cout<< M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<1<<"\t\t"<<p<<std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
std::cout << "=================================================================================="<<std::endl;
|
std::cout << "=================================================================================="<<std::endl;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -1,2 +1,2 @@
|
|||||||
|
|
||||||
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench
|
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL
|
||||||
5
BLAS_benchmark/compile-command-frontier
Normal file
5
BLAS_benchmark/compile-command-frontier
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
# Build BatchBlasBench with hipcc against hipBLAS/rocBLAS and Cray MPICH.
# Expects ROCM_PATH, MPICH_DIR and CRAY_MPICH_ROOTDIR to be set by the
# loaded environment modules.
CXX=hipcc
MPICXX=mpicxx
# Note: ${ROCM_PATH}, not {$ROCM_PATH} -- the latter expands to a path wrapped
# in literal braces, so the ROCm include directory was never found.
CXXFLAGS="-fPIC -I${ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 -I/opt/cray/pe/mpich/8.1.28/ofi/gnu/12.3/include -DGRID_HIP"
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas -lmpi_gnu_123"
# The flag variables are deliberately unquoted so they split into separate arguments.
$CXX $CXXFLAGS $LDFLAGS BatchBlasBench.cc -o BatchBlasBench
|
||||||
2
BLAS_benchmark/compile-command-sunspot
Normal file
2
BLAS_benchmark/compile-command-sunspot
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
|
||||||
|
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL
|
||||||
@@ -50,6 +50,7 @@ NAMESPACE_CHECK(approx);
|
|||||||
#include <Grid/algorithms/deflation/Deflation.h>
|
#include <Grid/algorithms/deflation/Deflation.h>
|
||||||
#include <Grid/algorithms/deflation/MultiRHSBlockProject.h>
|
#include <Grid/algorithms/deflation/MultiRHSBlockProject.h>
|
||||||
#include <Grid/algorithms/deflation/MultiRHSDeflation.h>
|
#include <Grid/algorithms/deflation/MultiRHSDeflation.h>
|
||||||
|
#include <Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h>
|
||||||
NAMESPACE_CHECK(deflation);
|
NAMESPACE_CHECK(deflation);
|
||||||
#include <Grid/algorithms/iterative/ConjugateGradient.h>
|
#include <Grid/algorithms/iterative/ConjugateGradient.h>
|
||||||
NAMESPACE_CHECK(ConjGrad);
|
NAMESPACE_CHECK(ConjGrad);
|
||||||
|
|||||||
@@ -168,6 +168,7 @@ public:
|
|||||||
template<class vobj>
|
template<class vobj>
|
||||||
void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
|
void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
|
||||||
#ifndef HAVE_FFTW
|
#ifndef HAVE_FFTW
|
||||||
|
std::cerr << "FFTW is not compiled but is called"<<std::endl;
|
||||||
assert(0);
|
assert(0);
|
||||||
#else
|
#else
|
||||||
conformable(result.Grid(),vgrid);
|
conformable(result.Grid(),vgrid);
|
||||||
@@ -190,7 +191,8 @@ public:
|
|||||||
|
|
||||||
Lattice<sobj> pgbuf(&pencil_g);
|
Lattice<sobj> pgbuf(&pencil_g);
|
||||||
autoView(pgbuf_v , pgbuf, CpuWrite);
|
autoView(pgbuf_v , pgbuf, CpuWrite);
|
||||||
|
std::cout << "CPU view" << std::endl;
|
||||||
|
|
||||||
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
|
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
|
||||||
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
|
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
|
||||||
|
|
||||||
@@ -213,6 +215,7 @@ public:
|
|||||||
else if ( sign == forward ) div = 1.0;
|
else if ( sign == forward ) div = 1.0;
|
||||||
else assert(0);
|
else assert(0);
|
||||||
|
|
||||||
|
std::cout << "Making FFTW plan" << std::endl;
|
||||||
FFTW_plan p;
|
FFTW_plan p;
|
||||||
{
|
{
|
||||||
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
|
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
|
||||||
@@ -226,6 +229,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Barrel shift and collect global pencil
|
// Barrel shift and collect global pencil
|
||||||
|
std::cout << "Making pencil" << std::endl;
|
||||||
Coordinate lcoor(Nd), gcoor(Nd);
|
Coordinate lcoor(Nd), gcoor(Nd);
|
||||||
result = source;
|
result = source;
|
||||||
int pc = processor_coor[dim];
|
int pc = processor_coor[dim];
|
||||||
@@ -247,6 +251,7 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::cout << "Looping orthog" << std::endl;
|
||||||
// Loop over orthog coords
|
// Loop over orthog coords
|
||||||
int NN=pencil_g.lSites();
|
int NN=pencil_g.lSites();
|
||||||
GridStopWatch timer;
|
GridStopWatch timer;
|
||||||
@@ -269,6 +274,7 @@ public:
|
|||||||
usec += timer.useconds();
|
usec += timer.useconds();
|
||||||
flops+= flops_call*NN;
|
flops+= flops_call*NN;
|
||||||
|
|
||||||
|
std::cout << "Writing back results " << std::endl;
|
||||||
// writing out result
|
// writing out result
|
||||||
{
|
{
|
||||||
autoView(pgbuf_v,pgbuf,CpuRead);
|
autoView(pgbuf_v,pgbuf,CpuRead);
|
||||||
@@ -285,6 +291,7 @@ public:
|
|||||||
}
|
}
|
||||||
result = result*div;
|
result = result*div;
|
||||||
|
|
||||||
|
std::cout << "Destroying plan " << std::endl;
|
||||||
// destroying plan
|
// destroying plan
|
||||||
FFTW<scalar>::fftw_destroy_plan(p);
|
FFTW<scalar>::fftw_destroy_plan(p);
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -103,6 +103,38 @@ public:
|
|||||||
_Mat.MdagM(in,out);
|
_Mat.MdagM(in,out);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
template<class Matrix,class Field>
|
||||||
|
class MMdagLinearOperator : public LinearOperatorBase<Field> {
|
||||||
|
Matrix &_Mat;
|
||||||
|
public:
|
||||||
|
MMdagLinearOperator(Matrix &Mat): _Mat(Mat){};
|
||||||
|
|
||||||
|
// Support for coarsening to a multigrid
|
||||||
|
void OpDiag (const Field &in, Field &out) {
|
||||||
|
_Mat.Mdiag(in,out);
|
||||||
|
}
|
||||||
|
void OpDir (const Field &in, Field &out,int dir,int disp) {
|
||||||
|
_Mat.Mdir(in,out,dir,disp);
|
||||||
|
}
|
||||||
|
void OpDirAll (const Field &in, std::vector<Field> &out){
|
||||||
|
_Mat.MdirAll(in,out);
|
||||||
|
};
|
||||||
|
void Op (const Field &in, Field &out){
|
||||||
|
_Mat.M(in,out);
|
||||||
|
}
|
||||||
|
void AdjOp (const Field &in, Field &out){
|
||||||
|
_Mat.Mdag(in,out);
|
||||||
|
}
|
||||||
|
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
|
||||||
|
_Mat.MMdag(in,out);
|
||||||
|
ComplexD dot = innerProduct(in,out);
|
||||||
|
n1=real(dot);
|
||||||
|
n2=norm2(out);
|
||||||
|
}
|
||||||
|
void HermOp(const Field &in, Field &out){
|
||||||
|
_Mat.MMdag(in,out);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
// Construct herm op and shift it for mgrid smoother
|
// Construct herm op and shift it for mgrid smoother
|
||||||
|
|||||||
@@ -45,6 +45,11 @@ public:
|
|||||||
M(in,tmp);
|
M(in,tmp);
|
||||||
Mdag(tmp,out);
|
Mdag(tmp,out);
|
||||||
}
|
}
|
||||||
|
virtual void MMdag(const Field &in, Field &out) {
|
||||||
|
Field tmp (in.Grid());
|
||||||
|
Mdag(in,tmp);
|
||||||
|
M(tmp,out);
|
||||||
|
}
|
||||||
virtual void Mdiag (const Field &in, Field &out)=0;
|
virtual void Mdiag (const Field &in, Field &out)=0;
|
||||||
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
|
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
|
||||||
virtual void MdirAll (const Field &in, std::vector<Field> &out)=0;
|
virtual void MdirAll (const Field &in, std::vector<Field> &out)=0;
|
||||||
|
|||||||
@@ -59,7 +59,7 @@ public:
|
|||||||
RealD diff = hi-lo;
|
RealD diff = hi-lo;
|
||||||
RealD delta = diff*1.0e-9;
|
RealD delta = diff*1.0e-9;
|
||||||
for (RealD x=lo; x<hi; x+=delta) {
|
for (RealD x=lo; x<hi; x+=delta) {
|
||||||
delta*=1.1;
|
delta*=1.02;
|
||||||
RealD f = approx(x);
|
RealD f = approx(x);
|
||||||
out<< x<<" "<<f<<std::endl;
|
out<< x<<" "<<f<<std::endl;
|
||||||
}
|
}
|
||||||
@@ -131,6 +131,26 @@ public:
|
|||||||
Coeffs[j] = s * 2.0/order;
|
Coeffs[j] = s * 2.0/order;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
template<class functor>
|
||||||
|
void Init(RealD _lo,RealD _hi,int _order, functor & func)
|
||||||
|
{
|
||||||
|
lo=_lo;
|
||||||
|
hi=_hi;
|
||||||
|
order=_order;
|
||||||
|
|
||||||
|
if(order < 2) exit(-1);
|
||||||
|
Coeffs.resize(order);
|
||||||
|
for(int j=0;j<order;j++){
|
||||||
|
RealD s=0;
|
||||||
|
for(int k=0;k<order;k++){
|
||||||
|
RealD y=std::cos(M_PI*(k+0.5)/order);
|
||||||
|
RealD x=0.5*(y*(hi-lo)+(hi+lo));
|
||||||
|
RealD f=func(x);
|
||||||
|
s=s+f*std::cos( j*M_PI*(k+0.5)/order );
|
||||||
|
}
|
||||||
|
Coeffs[j] = s * 2.0/order;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
void JacksonSmooth(void){
|
void JacksonSmooth(void){
|
||||||
|
|||||||
@@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
typedef cublasHandle_t gridblasHandle_t;
|
typedef cublasHandle_t gridblasHandle_t;
|
||||||
#endif
|
#endif
|
||||||
#ifdef GRID_SYCL
|
#ifdef GRID_SYCL
|
||||||
typedef cl::sycl::queue *gridblasHandle_t;
|
typedef sycl::queue *gridblasHandle_t;
|
||||||
#endif
|
#endif
|
||||||
#ifdef GRID_ONE_MKL
|
#ifdef GRID_ONE_MKL
|
||||||
typedef cl::sycl::queue *gridblasHandle_t;
|
typedef sycl::queue *gridblasHandle_t;
|
||||||
#endif
|
#endif
|
||||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
|
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
|
||||||
typedef int32_t gridblasHandle_t;
|
typedef int32_t gridblasHandle_t;
|
||||||
@@ -89,9 +89,9 @@ public:
|
|||||||
gridblasHandle = theGridAccelerator;
|
gridblasHandle = theGridAccelerator;
|
||||||
#endif
|
#endif
|
||||||
#ifdef GRID_ONE_MKL
|
#ifdef GRID_ONE_MKL
|
||||||
cl::sycl::gpu_selector selector;
|
sycl::gpu_selector selector;
|
||||||
cl::sycl::device selectedDevice { selector };
|
sycl::device selectedDevice { selector };
|
||||||
cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()};
|
sycl::property_list q_prop{sycl::property::queue::in_order()};
|
||||||
gridblasHandle =new sycl::queue (selectedDevice,q_prop);
|
gridblasHandle =new sycl::queue (selectedDevice,q_prop);
|
||||||
#endif
|
#endif
|
||||||
gridblasInit=1;
|
gridblasInit=1;
|
||||||
|
|||||||
376
Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h
Normal file
376
Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h
Normal file
@@ -0,0 +1,376 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: MultiRHSBlockCGLinalg.h
|
||||||
|
|
||||||
|
Copyright (C) 2024
|
||||||
|
|
||||||
|
Author: Peter Boyle <pboyle@bnl.gov>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
|
||||||
|
/* Need helper object for BLAS accelerated mrhs blockCG */
|
||||||
|
template<class Field>
|
||||||
|
class MultiRHSBlockCGLinalg
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
|
||||||
|
typedef typename Field::scalar_type scalar;
|
||||||
|
typedef typename Field::scalar_object scalar_object;
|
||||||
|
typedef typename Field::vector_object vector_object;
|
||||||
|
|
||||||
|
deviceVector<scalar> BLAS_X; // nrhs x vol -- the sources
|
||||||
|
deviceVector<scalar> BLAS_Y; // nrhs x vol -- the result
|
||||||
|
deviceVector<scalar> BLAS_C; // nrhs x nrhs -- the coefficients
|
||||||
|
deviceVector<scalar> BLAS_Cred; // nrhs x nrhs x oSites -- reduction buffer
|
||||||
|
deviceVector<scalar *> Xdip;
|
||||||
|
deviceVector<scalar *> Ydip;
|
||||||
|
deviceVector<scalar *> Cdip;
|
||||||
|
|
||||||
|
MultiRHSBlockCGLinalg() {};
|
||||||
|
~MultiRHSBlockCGLinalg(){ Deallocate(); };
|
||||||
|
|
||||||
|
void Deallocate(void)
|
||||||
|
{
|
||||||
|
Xdip.resize(0);
|
||||||
|
Ydip.resize(0);
|
||||||
|
Cdip.resize(0);
|
||||||
|
BLAS_Cred.resize(0);
|
||||||
|
BLAS_C.resize(0);
|
||||||
|
BLAS_X.resize(0);
|
||||||
|
BLAS_Y.resize(0);
|
||||||
|
}
|
||||||
|
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0)
|
||||||
|
{
|
||||||
|
std::vector<Field> Y_copy(AP.size(),AP[0].Grid());
|
||||||
|
for(int r=0;r<AP.size();r++){
|
||||||
|
Y_copy[r] = Y[r];
|
||||||
|
}
|
||||||
|
MulMatrix(AP,m,X);
|
||||||
|
for(int r=0;r<AP.size();r++){
|
||||||
|
AP[r] = scale*AP[r]+Y_copy[r];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void MulMatrix(std::vector<Field> &Y, Eigen::MatrixXcd &m , const std::vector<Field> &X)
|
||||||
|
{
|
||||||
|
typedef typename Field::scalar_type scomplex;
|
||||||
|
GridBase *grid;
|
||||||
|
uint64_t vol;
|
||||||
|
uint64_t words;
|
||||||
|
|
||||||
|
int nrhs = Y.size();
|
||||||
|
grid = X[0].Grid();
|
||||||
|
vol = grid->lSites();
|
||||||
|
words = sizeof(scalar_object)/sizeof(scalar);
|
||||||
|
int64_t vw = vol * words;
|
||||||
|
|
||||||
|
RealD t0 = usecond();
|
||||||
|
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
|
||||||
|
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
|
||||||
|
BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
|
||||||
|
RealD t1 = usecond();
|
||||||
|
|
||||||
|
/////////////////////////////////////////////
|
||||||
|
// Copy in the multi-rhs sources
|
||||||
|
/////////////////////////////////////////////
|
||||||
|
for(int r=0;r<nrhs;r++){
|
||||||
|
int64_t offset = r*vw;
|
||||||
|
autoView(x_v,X[r],AcceleratorRead);
|
||||||
|
acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Assumes Eigen storage contiguous
|
||||||
|
acceleratorCopyToDevice(&m(0,0),&BLAS_C[0],BLAS_C.size()*sizeof(scalar));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* in Fortran column major notation (cuBlas order)
|
||||||
|
*
|
||||||
|
* Xxr = [X1(x)][..][Xn(x)]
|
||||||
|
* Yxr = [Y1(x)][..][Ym(x)]
|
||||||
|
* Y = X . C
|
||||||
|
*/
|
||||||
|
deviceVector<scalar *> Xd(1);
|
||||||
|
deviceVector<scalar *> Yd(1);
|
||||||
|
deviceVector<scalar *> Cd(1);
|
||||||
|
|
||||||
|
scalar * Xh = & BLAS_X[0];
|
||||||
|
scalar * Yh = & BLAS_Y[0];
|
||||||
|
scalar * Ch = & BLAS_C[0];
|
||||||
|
|
||||||
|
acceleratorPut(Xd[0],Xh);
|
||||||
|
acceleratorPut(Yd[0],Yh);
|
||||||
|
acceleratorPut(Cd[0],Ch);
|
||||||
|
|
||||||
|
RealD t2 = usecond();
|
||||||
|
GridBLAS BLAS;
|
||||||
|
/////////////////////////////////////////
|
||||||
|
// Y = X*C (transpose?)
|
||||||
|
/////////////////////////////////////////
|
||||||
|
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
|
||||||
|
vw,nrhs,nrhs,
|
||||||
|
scalar(1.0),
|
||||||
|
Xd,
|
||||||
|
Cd,
|
||||||
|
scalar(0.0), // wipe out Y
|
||||||
|
Yd);
|
||||||
|
BLAS.synchronise();
|
||||||
|
RealD t3 = usecond();
|
||||||
|
|
||||||
|
// Copy back Y = m X
|
||||||
|
for(int r=0;r<nrhs;r++){
|
||||||
|
int64_t offset = r*vw;
|
||||||
|
autoView(y_v,Y[r],AcceleratorWrite);
|
||||||
|
acceleratorCopyDeviceToDevice(&BLAS_Y[offset],&y_v[0],sizeof(scalar_object)*vol);
|
||||||
|
}
|
||||||
|
RealD t4 = usecond();
|
||||||
|
std::cout << "MulMatrix alloc took "<< t1-t0<<" us"<<std::endl;
|
||||||
|
std::cout << "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
|
||||||
|
std::cout << "MulMatrix blas took "<< t3-t2<<" us"<<std::endl;
|
||||||
|
std::cout << "MulMatrix copy took "<< t4-t3<<" us"<<std::endl;
|
||||||
|
std::cout << "MulMatrix total "<< t4-t0<<" us"<<std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y)
|
||||||
|
{
|
||||||
|
#if 0
|
||||||
|
int nrhs;
|
||||||
|
GridBase *grid;
|
||||||
|
uint64_t vol;
|
||||||
|
uint64_t words;
|
||||||
|
|
||||||
|
nrhs = X.size();
|
||||||
|
assert(X.size()==Y.size());
|
||||||
|
conformable(X[0],Y[0]);
|
||||||
|
|
||||||
|
grid = X[0].Grid();
|
||||||
|
vol = grid->lSites();
|
||||||
|
words = sizeof(scalar_object)/sizeof(scalar);
|
||||||
|
int64_t vw = vol * words;
|
||||||
|
|
||||||
|
RealD t0 = usecond();
|
||||||
|
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
|
||||||
|
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
|
||||||
|
BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
|
||||||
|
RealD t1 = usecond();
|
||||||
|
|
||||||
|
/////////////////////////////////////////////
|
||||||
|
// Copy in the multi-rhs sources
|
||||||
|
/////////////////////////////////////////////
|
||||||
|
for(int r=0;r<nrhs;r++){
|
||||||
|
int64_t offset = r*vw;
|
||||||
|
autoView(x_v,X[r],AcceleratorRead);
|
||||||
|
acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
|
||||||
|
autoView(y_v,Y[r],AcceleratorRead);
|
||||||
|
acceleratorCopyDeviceToDevice(&y_v[0],&BLAS_Y[offset],sizeof(scalar_object)*vol);
|
||||||
|
}
|
||||||
|
RealD t2 = usecond();
|
||||||
|
|
||||||
|
/*
|
||||||
|
* in Fortran column major notation (cuBlas order)
|
||||||
|
*
|
||||||
|
* Xxr = [X1(x)][..][Xn(x)]
|
||||||
|
*
|
||||||
|
* Yxr = [Y1(x)][..][Ym(x)]
|
||||||
|
*
|
||||||
|
* C_rs = X^dag Y
|
||||||
|
*/
|
||||||
|
deviceVector<scalar *> Xd(1);
|
||||||
|
deviceVector<scalar *> Yd(1);
|
||||||
|
deviceVector<scalar *> Cd(1);
|
||||||
|
|
||||||
|
scalar * Xh = & BLAS_X[0];
|
||||||
|
scalar * Yh = & BLAS_Y[0];
|
||||||
|
scalar * Ch = & BLAS_C[0];
|
||||||
|
|
||||||
|
acceleratorPut(Xd[0],Xh);
|
||||||
|
acceleratorPut(Yd[0],Yh);
|
||||||
|
acceleratorPut(Cd[0],Ch);
|
||||||
|
|
||||||
|
GridBLAS BLAS;
|
||||||
|
|
||||||
|
RealD t3 = usecond();
|
||||||
|
/////////////////////////////////////////
|
||||||
|
// C_rs = X^dag Y
|
||||||
|
/////////////////////////////////////////
|
||||||
|
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
|
||||||
|
nrhs,nrhs,vw,
|
||||||
|
ComplexD(1.0),
|
||||||
|
Xd,
|
||||||
|
Yd,
|
||||||
|
ComplexD(0.0), // wipe out C
|
||||||
|
Cd);
|
||||||
|
BLAS.synchronise();
|
||||||
|
RealD t4 = usecond();
|
||||||
|
|
||||||
|
std::vector<scalar> HOST_C(BLAS_C.size()); // nrhs . nrhs -- the coefficients
|
||||||
|
acceleratorCopyFromDevice(&BLAS_C[0],&HOST_C[0],BLAS_C.size()*sizeof(scalar));
|
||||||
|
grid->GlobalSumVector(&HOST_C[0],nrhs*nrhs);
|
||||||
|
|
||||||
|
RealD t5 = usecond();
|
||||||
|
for(int rr=0;rr<nrhs;rr++){
|
||||||
|
for(int r=0;r<nrhs;r++){
|
||||||
|
int off = r+nrhs*rr;
|
||||||
|
m(r,rr)=HOST_C[off];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
RealD t6 = usecond();
|
||||||
|
uint64_t M=nrhs;
|
||||||
|
uint64_t N=nrhs;
|
||||||
|
uint64_t K=vw;
|
||||||
|
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
|
||||||
|
RealD flops = 8.0*M*N*K;
|
||||||
|
flops = flops/(t4-t3)/1.e3;
|
||||||
|
bytes = bytes/(t4-t3)/1.e3;
|
||||||
|
std::cout << "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
|
||||||
|
std::cout << "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
|
||||||
|
std::cout << "InnerProductMatrix cp t2 "<< t2-t1<<" us"<<std::endl;
|
||||||
|
std::cout << "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
|
||||||
|
std::cout << "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
|
||||||
|
std::cout << "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
|
||||||
|
std::cout << "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
|
||||||
|
std::cout << "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
|
||||||
|
std::cout << "InnerProductMatrix cp t6 "<< t6-t5<<" us"<<std::endl;
|
||||||
|
std::cout << "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
|
||||||
|
#else
|
||||||
|
int nrhs;
|
||||||
|
GridBase *grid;
|
||||||
|
uint64_t vol;
|
||||||
|
uint64_t words;
|
||||||
|
|
||||||
|
nrhs = X.size();
|
||||||
|
assert(X.size()==Y.size());
|
||||||
|
conformable(X[0],Y[0]);
|
||||||
|
|
||||||
|
grid = X[0].Grid();
|
||||||
|
int rd0 = grid->_rdimensions[0] * grid->_rdimensions[1];
|
||||||
|
vol = grid->oSites()/rd0;
|
||||||
|
words = rd0*sizeof(vector_object)/sizeof(scalar);
|
||||||
|
int64_t vw = vol * words;
|
||||||
|
assert(vw == grid->lSites()*sizeof(scalar_object)/sizeof(scalar));
|
||||||
|
|
||||||
|
RealD t0 = usecond();
|
||||||
|
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
|
||||||
|
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
|
||||||
|
BLAS_Cred.resize(nrhs * nrhs * vol);// cost free if size doesn't change
|
||||||
|
RealD t1 = usecond();
|
||||||
|
|
||||||
|
/////////////////////////////////////////////
|
||||||
|
// Copy in the multi-rhs sources -- layout batched BLAS ready
|
||||||
|
/////////////////////////////////////////////
|
||||||
|
for(int r=0;r<nrhs;r++){
|
||||||
|
autoView(x_v,X[r],AcceleratorRead);
|
||||||
|
autoView(y_v,Y[r],AcceleratorRead);
|
||||||
|
scalar *from_x=(scalar *)&x_v[0];
|
||||||
|
scalar *from_y=(scalar *)&y_v[0];
|
||||||
|
scalar *BX = &BLAS_X[0];
|
||||||
|
scalar *BY = &BLAS_Y[0];
|
||||||
|
accelerator_for(ssw,vw,1,{
|
||||||
|
uint64_t ss=ssw/words;
|
||||||
|
uint64_t w=ssw%words;
|
||||||
|
uint64_t offset = w+r*words+ss*nrhs*words; // [ss][rhs][words]
|
||||||
|
BX[offset] = from_x[ssw];
|
||||||
|
BY[offset] = from_y[ssw];
|
||||||
|
});
|
||||||
|
}
|
||||||
|
RealD t2 = usecond();
|
||||||
|
|
||||||
|
/*
|
||||||
|
* in Fortran column major notation (cuBlas order)
|
||||||
|
*
|
||||||
|
* Xxr = [X1(x)][..][Xn(x)]
|
||||||
|
*
|
||||||
|
* Yxr = [Y1(x)][..][Ym(x)]
|
||||||
|
*
|
||||||
|
* C_rs = X^dag Y
|
||||||
|
*/
|
||||||
|
Xdip.resize(vol);
|
||||||
|
Ydip.resize(vol);
|
||||||
|
Cdip.resize(vol);
|
||||||
|
std::vector<scalar *> Xh(vol);
|
||||||
|
std::vector<scalar *> Yh(vol);
|
||||||
|
std::vector<scalar *> Ch(vol);
|
||||||
|
for(uint64_t ss=0;ss<vol;ss++){
|
||||||
|
|
||||||
|
Xh[ss] = & BLAS_X[ss*nrhs*words];
|
||||||
|
Yh[ss] = & BLAS_Y[ss*nrhs*words];
|
||||||
|
Ch[ss] = & BLAS_Cred[ss*nrhs*nrhs];
|
||||||
|
|
||||||
|
}
|
||||||
|
acceleratorCopyToDevice(&Xh[0],&Xdip[0],vol*sizeof(scalar *));
|
||||||
|
acceleratorCopyToDevice(&Yh[0],&Ydip[0],vol*sizeof(scalar *));
|
||||||
|
acceleratorCopyToDevice(&Ch[0],&Cdip[0],vol*sizeof(scalar *));
|
||||||
|
|
||||||
|
GridBLAS BLAS;
|
||||||
|
|
||||||
|
RealD t3 = usecond();
|
||||||
|
/////////////////////////////////////////
|
||||||
|
// C_rs = X^dag Y
|
||||||
|
/////////////////////////////////////////
|
||||||
|
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
|
||||||
|
nrhs,nrhs,words,
|
||||||
|
ComplexD(1.0),
|
||||||
|
Xdip,
|
||||||
|
Ydip,
|
||||||
|
ComplexD(0.0), // wipe out C
|
||||||
|
Cdip);
|
||||||
|
BLAS.synchronise();
|
||||||
|
RealD t4 = usecond();
|
||||||
|
|
||||||
|
std::vector<scalar> HOST_C(BLAS_Cred.size()); // nrhs . nrhs -- the coefficients
|
||||||
|
acceleratorCopyFromDevice(&BLAS_Cred[0],&HOST_C[0],BLAS_Cred.size()*sizeof(scalar));
|
||||||
|
|
||||||
|
RealD t5 = usecond();
|
||||||
|
m = Eigen::MatrixXcd::Zero(nrhs,nrhs);
|
||||||
|
for(int ss=0;ss<vol;ss++){
|
||||||
|
Eigen::Map<Eigen::MatrixXcd> eC((std::complex<double> *)&HOST_C[ss*nrhs*nrhs],nrhs,nrhs);
|
||||||
|
m = m + eC;
|
||||||
|
}
|
||||||
|
RealD t6l = usecond();
|
||||||
|
grid->GlobalSumVector((scalar *) &m(0,0),nrhs*nrhs);
|
||||||
|
RealD t6 = usecond();
|
||||||
|
uint64_t M=nrhs;
|
||||||
|
uint64_t N=nrhs;
|
||||||
|
uint64_t K=vw;
|
||||||
|
RealD xybytes = grid->lSites()*sizeof(scalar_object);
|
||||||
|
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
|
||||||
|
RealD flops = 8.0*M*N*K;
|
||||||
|
flops = flops/(t4-t3)/1.e3;
|
||||||
|
bytes = bytes/(t4-t3)/1.e3;
|
||||||
|
xybytes = 4*xybytes/(t2-t1)/1.e3;
|
||||||
|
std::cout << "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
|
||||||
|
std::cout << "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
|
||||||
|
std::cout << "InnerProductMatrix cp t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
|
||||||
|
std::cout << "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
|
||||||
|
std::cout << "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
|
||||||
|
std::cout << "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
|
||||||
|
std::cout << "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
|
||||||
|
std::cout << "InnerProductMatrix cp t5 "<< t5-t4<<" us"<<std::endl;
|
||||||
|
std::cout << "InnerProductMatrix lsum t6l "<< t6l-t5<<" us"<<std::endl;
|
||||||
|
std::cout << "InnerProductMatrix gsum t6 "<< t6-t6l<<" us"<<std::endl;
|
||||||
|
std::cout << "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
||||||
@@ -447,10 +447,10 @@ public:
|
|||||||
/////////////////////////////////////////
|
/////////////////////////////////////////
|
||||||
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
|
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
|
||||||
nbasis,nrhs,vw,
|
nbasis,nrhs,vw,
|
||||||
ComplexD(1.0),
|
scalar(1.0),
|
||||||
Vd,
|
Vd,
|
||||||
Fd,
|
Fd,
|
||||||
ComplexD(0.0), // wipe out C
|
scalar(0.0), // wipe out C
|
||||||
Cd);
|
Cd);
|
||||||
BLAS.synchronise();
|
BLAS.synchronise();
|
||||||
// std::cout << "BlockProject done"<<std::endl;
|
// std::cout << "BlockProject done"<<std::endl;
|
||||||
@@ -497,10 +497,10 @@ public:
|
|||||||
int64_t vw = block_vol * words;
|
int64_t vw = block_vol * words;
|
||||||
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
|
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
|
||||||
vw,nrhs,nbasis,
|
vw,nrhs,nbasis,
|
||||||
ComplexD(1.0),
|
scalar(1.0),
|
||||||
Vd,
|
Vd,
|
||||||
Cd,
|
Cd,
|
||||||
ComplexD(0.0), // wipe out C
|
scalar(0.0), // wipe out C
|
||||||
Fd);
|
Fd);
|
||||||
BLAS.synchronise();
|
BLAS.synchronise();
|
||||||
// std::cout << " blas call done"<<std::endl;
|
// std::cout << " blas call done"<<std::endl;
|
||||||
|
|||||||
@@ -182,10 +182,10 @@ public:
|
|||||||
/////////////////////////////////////////
|
/////////////////////////////////////////
|
||||||
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
|
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
|
||||||
nev,nrhs,vw,
|
nev,nrhs,vw,
|
||||||
ComplexD(1.0),
|
scalar(1.0),
|
||||||
Ed,
|
Ed,
|
||||||
Rd,
|
Rd,
|
||||||
ComplexD(0.0), // wipe out C
|
scalar(0.0), // wipe out C
|
||||||
Cd);
|
Cd);
|
||||||
BLAS.synchronise();
|
BLAS.synchronise();
|
||||||
|
|
||||||
@@ -210,10 +210,10 @@ public:
|
|||||||
/////////////////////////////////////////
|
/////////////////////////////////////////
|
||||||
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
|
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
|
||||||
vw,nrhs,nev,
|
vw,nrhs,nev,
|
||||||
ComplexD(1.0),
|
scalar(1.0),
|
||||||
Ed, // x . nev
|
Ed, // x . nev
|
||||||
Cd, // nev . nrhs
|
Cd, // nev . nrhs
|
||||||
ComplexD(0.0),
|
scalar(0.0),
|
||||||
Gd);
|
Gd);
|
||||||
BLAS.synchronise();
|
BLAS.synchronise();
|
||||||
|
|
||||||
|
|||||||
@@ -53,6 +53,7 @@ class TwoLevelCGmrhs
|
|||||||
// Fine operator, Smoother, CoarseSolver
|
// Fine operator, Smoother, CoarseSolver
|
||||||
LinearOperatorBase<Field> &_FineLinop;
|
LinearOperatorBase<Field> &_FineLinop;
|
||||||
LinearFunction<Field> &_Smoother;
|
LinearFunction<Field> &_Smoother;
|
||||||
|
MultiRHSBlockCGLinalg<Field> _BlockCGLinalg;
|
||||||
|
|
||||||
GridStopWatch ProjectTimer;
|
GridStopWatch ProjectTimer;
|
||||||
GridStopWatch PromoteTimer;
|
GridStopWatch PromoteTimer;
|
||||||
@@ -79,6 +80,301 @@ class TwoLevelCGmrhs
|
|||||||
|
|
||||||
// Vector case
|
// Vector case
|
||||||
virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
|
virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
|
||||||
|
{
|
||||||
|
SolveSingleSystem(src,x);
|
||||||
|
// SolvePrecBlockCG(src,x);
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Thin QR factorisation (google it)
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//Dimensions
|
||||||
|
// R_{ferm x Nblock} = Q_{ferm x Nblock} x C_{Nblock x Nblock} -> ferm x Nblock
|
||||||
|
//
|
||||||
|
// Rdag R = m_rr = Herm = L L^dag <-- Cholesky decomposition (LLT routine in Eigen)
|
||||||
|
//
|
||||||
|
// Q C = R => Q = R C^{-1}
|
||||||
|
//
|
||||||
|
// Want Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock}
|
||||||
|
//
|
||||||
|
// Set C = L^{dag}, and then Q^dag Q = ident
|
||||||
|
//
|
||||||
|
// Checks:
|
||||||
|
// Cdag C = Rdag R ; passes.
|
||||||
|
// QdagQ = 1 ; passes
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Thin QR factorisation of the block Z, using the Gram matrix built from (MZ,Z).
//
//   m_zz : out, Gram matrix from _BlockCGLinalg.InnerProductMatrix(m_zz,MZ,Z),
//          symmetrised to be manifestly Hermitian before factorising
//   C    : out, L^dag where m_zz = L L^dag (Cholesky)
//   Cinv : out, inverse of C
//   Q,MQ : out, Z and MZ rotated by Cinv via _BlockCGLinalg.MulMatrix
//   Z,MZ : in,  block to orthonormalise and its preconditioned partner
//
// The Cholesky factorisation only succeeds for a positive definite m_zz; its
// status is now asserted so that a breakdown (e.g. linearly dependent
// right-hand sides) is caught here rather than propagating garbage through
// C and Cinv into the solver.
// Wall-clock timings of the three phases are printed in microseconds.
void ThinQRfact (Eigen::MatrixXcd &m_zz,
		 Eigen::MatrixXcd &C,
		 Eigen::MatrixXcd &Cinv,
		 std::vector<Field> &  Q,
		 std::vector<Field> & MQ,
		 const std::vector<Field> & Z,
		 const std::vector<Field> & MZ)
{
  RealD t0=usecond();
  _BlockCGLinalg.InnerProductMatrix(m_zz,MZ,Z);
  RealD t1=usecond();

  // Remove any anti-Hermitian rounding noise before the Cholesky
  m_zz = 0.5*(m_zz+m_zz.adjoint());

  Eigen::LLT<Eigen::MatrixXcd> llt(m_zz);
  assert(llt.info()==Eigen::Success); // m_zz must be positive definite
  Eigen::MatrixXcd L = llt.matrixL();

  C    = L.adjoint();
  Cinv = C.inverse();

  RealD t3=usecond();
  _BlockCGLinalg.MulMatrix( Q,Cinv,Z);
  _BlockCGLinalg.MulMatrix(MQ,Cinv,MZ);
  RealD t4=usecond();
  std::cout << " ThinQRfact IP :"<< t1-t0<<" us"<<std::endl;
  std::cout << " ThinQRfact Eigen :"<< t3-t1<<" us"<<std::endl;
  std::cout << " ThinQRfact MulMat:"<< t4-t3<<" us"<<std::endl;
}
|
||||||
|
|
||||||
|
virtual void SolvePrecBlockCG (std::vector<Field> &src, std::vector<Field> &X)
|
||||||
|
{
|
||||||
|
std::cout << GridLogMessage<<"HDCG: mrhs fPrecBlockcg starting"<<std::endl;
|
||||||
|
src[0].Grid()->Barrier();
|
||||||
|
int nrhs = src.size();
|
||||||
|
// std::vector<RealD> f(nrhs);
|
||||||
|
// std::vector<RealD> rtzp(nrhs);
|
||||||
|
// std::vector<RealD> rtz(nrhs);
|
||||||
|
// std::vector<RealD> a(nrhs);
|
||||||
|
// std::vector<RealD> d(nrhs);
|
||||||
|
// std::vector<RealD> b(nrhs);
|
||||||
|
// std::vector<RealD> rptzp(nrhs);
|
||||||
|
|
||||||
|
////////////////////////////////////////////
|
||||||
|
//Initial residual computation & set up
|
||||||
|
////////////////////////////////////////////
|
||||||
|
std::vector<RealD> ssq(nrhs);
|
||||||
|
for(int rhs=0;rhs<nrhs;rhs++){
|
||||||
|
ssq[rhs]=norm2(src[rhs]); assert(ssq[rhs]!=0.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
///////////////////////////
|
||||||
|
// Fields -- eliminate duplicates between fPcg and block cg
|
||||||
|
///////////////////////////
|
||||||
|
std::vector<Field> Mtmp(nrhs,grid);
|
||||||
|
std::vector<Field> tmp(nrhs,grid);
|
||||||
|
std::vector<Field> Z(nrhs,grid); // Rename Z to R
|
||||||
|
std::vector<Field> MZ(nrhs,grid); // Rename MZ to Z
|
||||||
|
std::vector<Field> Q(nrhs,grid); //
|
||||||
|
std::vector<Field> MQ(nrhs,grid); // Rename to P
|
||||||
|
std::vector<Field> D(nrhs,grid);
|
||||||
|
std::vector<Field> AD(nrhs,grid);
|
||||||
|
|
||||||
|
/************************************************************************
|
||||||
|
* Preconditioned Block conjugate gradient rQ
|
||||||
|
* Generalise Sebastien Birk Thesis, after Dubrulle 2001.
|
||||||
|
* Introduce preconditioning following Saad Ch9
|
||||||
|
************************************************************************
|
||||||
|
* Dimensions:
|
||||||
|
*
|
||||||
|
* X,B etc... ==(Nferm x nrhs)
|
||||||
|
* Matrix A==(Nferm x Nferm)
|
||||||
|
*
|
||||||
|
* Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
|
||||||
|
* QC => Thin QR factorisation (google it)
|
||||||
|
*
|
||||||
|
* R = B-AX
|
||||||
|
* Z = Mi R
|
||||||
|
* QC = Z
|
||||||
|
* D = Q
|
||||||
|
* for k:
|
||||||
|
* R = AD
|
||||||
|
* Z = Mi R
|
||||||
|
* M = [D^dag R]^{-1}
|
||||||
|
* X = X + D M C
|
||||||
|
* QS = Q - Z.M
|
||||||
|
* D = Q + D S^dag
|
||||||
|
* C = S C
|
||||||
|
*/
|
||||||
|
Eigen::MatrixXcd m_DZ = Eigen::MatrixXcd::Identity(nrhs,nrhs);
|
||||||
|
Eigen::MatrixXcd m_M = Eigen::MatrixXcd::Identity(nrhs,nrhs);
|
||||||
|
Eigen::MatrixXcd m_zz = Eigen::MatrixXcd::Zero(nrhs,nrhs);
|
||||||
|
Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(nrhs,nrhs);
|
||||||
|
|
||||||
|
Eigen::MatrixXcd m_C = Eigen::MatrixXcd::Zero(nrhs,nrhs);
|
||||||
|
Eigen::MatrixXcd m_Cinv = Eigen::MatrixXcd::Zero(nrhs,nrhs);
|
||||||
|
Eigen::MatrixXcd m_S = Eigen::MatrixXcd::Zero(nrhs,nrhs);
|
||||||
|
Eigen::MatrixXcd m_Sinv = Eigen::MatrixXcd::Zero(nrhs,nrhs);
|
||||||
|
|
||||||
|
Eigen::MatrixXcd m_tmp = Eigen::MatrixXcd::Identity(nrhs,nrhs);
|
||||||
|
Eigen::MatrixXcd m_tmp1 = Eigen::MatrixXcd::Identity(nrhs,nrhs);
|
||||||
|
|
||||||
|
GridStopWatch HDCGTimer;
|
||||||
|
|
||||||
|
//////////////////////////
|
||||||
|
// x0 = Vstart -- possibly modify guess
|
||||||
|
//////////////////////////
|
||||||
|
Vstart(X,src);
|
||||||
|
|
||||||
|
//////////////////////////
|
||||||
|
// R = B-AX
|
||||||
|
//////////////////////////
|
||||||
|
for(int rhs=0;rhs<nrhs;rhs++){
|
||||||
|
// r0 = b -A x0
|
||||||
|
_FineLinop.HermOp(X[rhs],tmp[rhs]);
|
||||||
|
axpy (Z[rhs], -1.0,tmp[rhs], src[rhs]); // Computes R=Z=src - A X0
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////////////////////
|
||||||
|
// Compute MZ = M1 Z = M1 B - M1 A x0
|
||||||
|
//////////////////////////////////
|
||||||
|
PcgM1(Z,MZ);
|
||||||
|
|
||||||
|
//////////////////////////////////
|
||||||
|
// QC = Z
|
||||||
|
//////////////////////////////////
|
||||||
|
ThinQRfact (m_zz, m_C, m_Cinv, Q, MQ, Z, MZ);
|
||||||
|
|
||||||
|
//////////////////////////////////
|
||||||
|
// D=MQ
|
||||||
|
//////////////////////////////////
|
||||||
|
for(int b=0;b<nrhs;b++) D[b]=MQ[b]; // LLT rotation of the MZ basis of search dirs
|
||||||
|
|
||||||
|
std::cout << GridLogMessage<<"PrecBlockCGrQ vec computed initial residual and QR fact " <<std::endl;
|
||||||
|
|
||||||
|
ProjectTimer.Reset();
|
||||||
|
PromoteTimer.Reset();
|
||||||
|
DeflateTimer.Reset();
|
||||||
|
CoarseTimer.Reset();
|
||||||
|
SmoothTimer.Reset();
|
||||||
|
FineTimer.Reset();
|
||||||
|
InsertTimer.Reset();
|
||||||
|
|
||||||
|
GridStopWatch M1Timer;
|
||||||
|
GridStopWatch M2Timer;
|
||||||
|
GridStopWatch M3Timer;
|
||||||
|
GridStopWatch LinalgTimer;
|
||||||
|
GridStopWatch InnerProdTimer;
|
||||||
|
|
||||||
|
HDCGTimer.Start();
|
||||||
|
|
||||||
|
std::vector<RealD> rn(nrhs);
|
||||||
|
for (int k=0;k<=MaxIterations;k++){
|
||||||
|
|
||||||
|
////////////////////
|
||||||
|
// Z = AD
|
||||||
|
////////////////////
|
||||||
|
M3Timer.Start();
|
||||||
|
for(int b=0;b<nrhs;b++) _FineLinop.HermOp(D[b], Z[b]);
|
||||||
|
M3Timer.Stop();
|
||||||
|
|
||||||
|
////////////////////
|
||||||
|
// MZ = M1 Z <==== the Multigrid preconditioner
|
||||||
|
////////////////////
|
||||||
|
M1Timer.Start();
|
||||||
|
PcgM1(Z,MZ);
|
||||||
|
M1Timer.Stop();
|
||||||
|
|
||||||
|
FineTimer.Start();
|
||||||
|
////////////////////
|
||||||
|
// M = [D^dag Z]^{-1} = (<Ddag MZ>_M)^{-1} inner prod, generalising Saad derivation of Precon CG
|
||||||
|
////////////////////
|
||||||
|
InnerProdTimer.Start();
|
||||||
|
_BlockCGLinalg.InnerProductMatrix(m_DZ,D,Z);
|
||||||
|
InnerProdTimer.Stop();
|
||||||
|
m_M = m_DZ.inverse();
|
||||||
|
|
||||||
|
///////////////////////////
|
||||||
|
// X = X + D MC
|
||||||
|
///////////////////////////
|
||||||
|
m_tmp = m_M * m_C;
|
||||||
|
LinalgTimer.Start();
|
||||||
|
_BlockCGLinalg.MaddMatrix(X,m_tmp, D,X); // D are the search directions and X takes the updates
|
||||||
|
LinalgTimer.Stop();
|
||||||
|
|
||||||
|
///////////////////////////
|
||||||
|
// QS = Q - M Z
|
||||||
|
// (MQ) S = MQ - M (M1Z)
|
||||||
|
///////////////////////////
|
||||||
|
LinalgTimer.Start();
|
||||||
|
_BlockCGLinalg.MaddMatrix(tmp ,m_M, Z, Q,-1.0);
|
||||||
|
_BlockCGLinalg.MaddMatrix(Mtmp,m_M,MZ,MQ,-1.0);
|
||||||
|
ThinQRfact (m_zz, m_S, m_Sinv, Q, MQ, tmp, Mtmp);
|
||||||
|
LinalgTimer.Stop();
|
||||||
|
|
||||||
|
////////////////////////////
|
||||||
|
// D = MQ + D S^dag
|
||||||
|
////////////////////////////
|
||||||
|
m_tmp = m_S.adjoint();
|
||||||
|
LinalgTimer.Start();
|
||||||
|
_BlockCGLinalg.MaddMatrix(D,m_tmp,D,MQ);
|
||||||
|
LinalgTimer.Stop();
|
||||||
|
|
||||||
|
////////////////////////////
|
||||||
|
// C = S C
|
||||||
|
////////////////////////////
|
||||||
|
m_C = m_S*m_C;
|
||||||
|
|
||||||
|
////////////////////////////
|
||||||
|
// convergence monitor
|
||||||
|
////////////////////////////
|
||||||
|
m_rr = m_C.adjoint() * m_C;
|
||||||
|
|
||||||
|
FineTimer.Stop();
|
||||||
|
|
||||||
|
RealD max_resid=0;
|
||||||
|
RealD rrsum=0;
|
||||||
|
RealD sssum=0;
|
||||||
|
RealD rr;
|
||||||
|
|
||||||
|
for(int b=0;b<nrhs;b++) {
|
||||||
|
rrsum+=real(m_rr(b,b));
|
||||||
|
sssum+=ssq[b];
|
||||||
|
rr = real(m_rr(b,b))/ssq[b];
|
||||||
|
if ( rr > max_resid ) max_resid = rr;
|
||||||
|
}
|
||||||
|
std::cout << GridLogMessage <<
|
||||||
|
"\t Prec BlockCGrQ Iteration "<<k<<" ave resid "<< std::sqrt(rrsum/sssum) << " max "<< std::sqrt(max_resid) <<std::endl;
|
||||||
|
|
||||||
|
|
||||||
|
if ( max_resid < Tolerance*Tolerance ) {
|
||||||
|
|
||||||
|
HDCGTimer.Stop();
|
||||||
|
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
|
||||||
|
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Linalg "<<LinalgTimer.Elapsed()<<std::endl;;
|
||||||
|
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : fine H "<<M3Timer.Elapsed()<<std::endl;;
|
||||||
|
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : prec M1 "<<M1Timer.Elapsed()<<std::endl;;
|
||||||
|
std::cout<<GridLogMessage<<"**** M1 breakdown:"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Project "<<ProjectTimer.Elapsed()<<std::endl;;
|
||||||
|
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Promote "<<PromoteTimer.Elapsed()<<std::endl;;
|
||||||
|
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Deflate "<<DeflateTimer.Elapsed()<<std::endl;;
|
||||||
|
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Coarse "<<CoarseTimer.Elapsed()<<std::endl;;
|
||||||
|
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Fine "<<FineTimer.Elapsed()<<std::endl;;
|
||||||
|
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Smooth "<<SmoothTimer.Elapsed()<<std::endl;;
|
||||||
|
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Insert "<<InsertTimer.Elapsed()<<std::endl;;
|
||||||
|
|
||||||
|
for(int rhs=0;rhs<nrhs;rhs++){
|
||||||
|
|
||||||
|
_FineLinop.HermOp(X[rhs],tmp[rhs]);
|
||||||
|
|
||||||
|
Field mytmp(grid);
|
||||||
|
axpy(mytmp,-1.0,src[rhs],tmp[rhs]);
|
||||||
|
|
||||||
|
RealD xnorm = sqrt(norm2(X[rhs]));
|
||||||
|
RealD srcnorm = sqrt(norm2(src[rhs]));
|
||||||
|
RealD tmpnorm = sqrt(norm2(mytmp));
|
||||||
|
RealD true_residual = tmpnorm/srcnorm;
|
||||||
|
std::cout<<GridLogMessage
|
||||||
|
<<"HDCG: true residual ["<<rhs<<"] is "<<true_residual
|
||||||
|
<<" solution "<<xnorm
|
||||||
|
<<" source "<<srcnorm
|
||||||
|
<<std::endl;
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
HDCGTimer.Stop();
|
||||||
|
std::cout<<GridLogMessage<<"HDCG: PrecBlockCGrQ not converged "<<HDCGTimer.Elapsed()<<std::endl;
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
virtual void SolveSingleSystem (std::vector<Field> &src, std::vector<Field> &x)
|
||||||
{
|
{
|
||||||
std::cout << GridLogMessage<<"HDCG: mrhs fPcg starting"<<std::endl;
|
std::cout << GridLogMessage<<"HDCG: mrhs fPcg starting"<<std::endl;
|
||||||
src[0].Grid()->Barrier();
|
src[0].Grid()->Barrier();
|
||||||
@@ -361,15 +657,23 @@ public:
|
|||||||
CoarseField PleftProjMrhs(this->coarsegridmrhs);
|
CoarseField PleftProjMrhs(this->coarsegridmrhs);
|
||||||
CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
|
CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
|
||||||
|
|
||||||
|
#undef SMOOTHER_BLOCK_SOLVE
|
||||||
|
#if SMOOTHER_BLOCK_SOLVE
|
||||||
|
this->SmoothTimer.Start();
|
||||||
|
this->_Smoother(in,Min);
|
||||||
|
this->SmoothTimer.Stop();
|
||||||
|
#else
|
||||||
for(int rhs=0;rhs<nrhs;rhs++) {
|
for(int rhs=0;rhs<nrhs;rhs++) {
|
||||||
|
|
||||||
this->SmoothTimer.Start();
|
this->SmoothTimer.Start();
|
||||||
this->_Smoother(in[rhs],Min[rhs]);
|
this->_Smoother(in[rhs],Min[rhs]);
|
||||||
this->SmoothTimer.Stop();
|
this->SmoothTimer.Stop();
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
for(int rhs=0;rhs<nrhs;rhs++) {
|
||||||
|
|
||||||
this->FineTimer.Start();
|
this->FineTimer.Start();
|
||||||
this->_FineLinop.HermOp(Min[rhs],out[rhs]);
|
this->_FineLinop.HermOp(Min[rhs],out[rhs]);
|
||||||
|
|
||||||
axpy(tmp[rhs],-1.0,out[rhs],in[rhs]); // resid = in - A Min
|
axpy(tmp[rhs],-1.0,out[rhs],in[rhs]); // resid = in - A Min
|
||||||
this->FineTimer.Stop();
|
this->FineTimer.Stop();
|
||||||
|
|
||||||
@@ -407,7 +711,7 @@ public:
|
|||||||
this->FineTimer.Stop();
|
this->FineTimer.Stop();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
|||||||
@@ -31,6 +31,58 @@ directory
|
|||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
template<class Field>
|
||||||
|
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
|
||||||
|
typedef typename Field::scalar_type scomplex;
|
||||||
|
int Nblock = X.size();
|
||||||
|
for(int b=0;b<Nblock;b++){
|
||||||
|
for(int bp=0;bp<Nblock;bp++) {
|
||||||
|
m(b,bp) = innerProduct(X[b],Y[bp]);
|
||||||
|
}}
|
||||||
|
}
|
||||||
|
template<class Field>
|
||||||
|
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
|
||||||
|
// Should make this cache friendly with site outermost, parallel_for
|
||||||
|
// Deal with case AP aliases with either Y or X
|
||||||
|
//
|
||||||
|
//Could pack "X" and "AP" into a Nblock x Volume dense array.
|
||||||
|
// AP(Nrhs x vol) = Y(Nrhs x vol) + scale * m(nrhs x nrhs) * X(nrhs*vol)
|
||||||
|
typedef typename Field::scalar_type scomplex;
|
||||||
|
int Nblock = AP.size();
|
||||||
|
std::vector<Field> tmp(Nblock,X[0]);
|
||||||
|
for(int b=0;b<Nblock;b++){
|
||||||
|
tmp[b] = Y[b];
|
||||||
|
for(int bp=0;bp<Nblock;bp++) {
|
||||||
|
tmp[b] = tmp[b] +scomplex(scale*m(bp,b))*X[bp];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for(int b=0;b<Nblock;b++){
|
||||||
|
AP[b] = tmp[b];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
template<class Field>
|
||||||
|
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
|
||||||
|
// Should make this cache friendly with site outermost, parallel_for
|
||||||
|
typedef typename Field::scalar_type scomplex;
|
||||||
|
int Nblock = AP.size();
|
||||||
|
for(int b=0;b<Nblock;b++){
|
||||||
|
AP[b] = Zero();
|
||||||
|
for(int bp=0;bp<Nblock;bp++) {
|
||||||
|
AP[b] += scomplex(m(bp,b))*X[bp];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Sum of norm2() over every field in P, accumulated in index order.
template<class Field>
double normv(const std::vector<Field> &P){
  double total = 0.0;
  for(const Field &vec : P) {
    total += norm2(vec);
  }
  return total;
}
|
||||||
|
|
||||||
|
|
||||||
enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec };
|
enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec };
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////
|
||||||
@@ -87,10 +139,19 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
|
|||||||
sliceInnerProductMatrix(m_rr,R,R,Orthog);
|
sliceInnerProductMatrix(m_rr,R,R,Orthog);
|
||||||
|
|
||||||
// Force manifest hermitian to avoid rounding related
|
// Force manifest hermitian to avoid rounding related
|
||||||
|
/*
|
||||||
|
int rank=m_rr.rows();
|
||||||
|
for(int r=0;r<rank;r++){
|
||||||
|
for(int s=0;s<rank;s++){
|
||||||
|
std::cout << "QR m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
|
||||||
|
}}
|
||||||
|
*/
|
||||||
m_rr = 0.5*(m_rr+m_rr.adjoint());
|
m_rr = 0.5*(m_rr+m_rr.adjoint());
|
||||||
|
|
||||||
Eigen::MatrixXcd L = m_rr.llt().matrixL();
|
Eigen::MatrixXcd L = m_rr.llt().matrixL();
|
||||||
|
|
||||||
|
// ComplexD det = L.determinant();
|
||||||
|
// std::cout << " Det m_rr "<<det<<std::endl;
|
||||||
C = L.adjoint();
|
C = L.adjoint();
|
||||||
Cinv = C.inverse();
|
Cinv = C.inverse();
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
@@ -110,11 +171,20 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
|
|||||||
const std::vector<Field> & R)
|
const std::vector<Field> & R)
|
||||||
{
|
{
|
||||||
InnerProductMatrix(m_rr,R,R);
|
InnerProductMatrix(m_rr,R,R);
|
||||||
|
/*
|
||||||
|
int rank=m_rr.rows();
|
||||||
|
for(int r=0;r<rank;r++){
|
||||||
|
for(int s=0;s<rank;s++){
|
||||||
|
std::cout << "QRvec m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
|
||||||
|
}}
|
||||||
|
*/
|
||||||
m_rr = 0.5*(m_rr+m_rr.adjoint());
|
m_rr = 0.5*(m_rr+m_rr.adjoint());
|
||||||
|
|
||||||
Eigen::MatrixXcd L = m_rr.llt().matrixL();
|
Eigen::MatrixXcd L = m_rr.llt().matrixL();
|
||||||
|
|
||||||
|
// ComplexD det = L.determinant();
|
||||||
|
// std::cout << " Det m_rr "<<det<<std::endl;
|
||||||
|
|
||||||
C = L.adjoint();
|
C = L.adjoint();
|
||||||
Cinv = C.inverse();
|
Cinv = C.inverse();
|
||||||
|
|
||||||
@@ -186,6 +256,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
|
|||||||
sliceNorm(ssq,B,Orthog);
|
sliceNorm(ssq,B,Orthog);
|
||||||
RealD sssum=0;
|
RealD sssum=0;
|
||||||
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
|
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
|
||||||
|
for(int b=0;b<Nblock;b++) std::cout << "src["<<b<<"]" << ssq[b] <<std::endl;
|
||||||
|
|
||||||
sliceNorm(residuals,B,Orthog);
|
sliceNorm(residuals,B,Orthog);
|
||||||
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
|
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
|
||||||
@@ -221,6 +292,9 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
|
|||||||
Linop.HermOp(X, AD);
|
Linop.HermOp(X, AD);
|
||||||
tmp = B - AD;
|
tmp = B - AD;
|
||||||
|
|
||||||
|
sliceNorm(residuals,tmp,Orthog);
|
||||||
|
for(int b=0;b<Nblock;b++) std::cout << "res["<<b<<"]" << residuals[b] <<std::endl;
|
||||||
|
|
||||||
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
|
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
|
||||||
D=Q;
|
D=Q;
|
||||||
|
|
||||||
@@ -236,6 +310,8 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
|
|||||||
GridStopWatch SolverTimer;
|
GridStopWatch SolverTimer;
|
||||||
SolverTimer.Start();
|
SolverTimer.Start();
|
||||||
|
|
||||||
|
RealD max_resid=0;
|
||||||
|
|
||||||
int k;
|
int k;
|
||||||
for (k = 1; k <= MaxIterations; k++) {
|
for (k = 1; k <= MaxIterations; k++) {
|
||||||
|
|
||||||
@@ -280,7 +356,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
|
|||||||
*/
|
*/
|
||||||
m_rr = m_C.adjoint() * m_C;
|
m_rr = m_C.adjoint() * m_C;
|
||||||
|
|
||||||
RealD max_resid=0;
|
max_resid=0;
|
||||||
RealD rrsum=0;
|
RealD rrsum=0;
|
||||||
RealD rr;
|
RealD rr;
|
||||||
|
|
||||||
@@ -322,7 +398,9 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
|
|||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
|
|
||||||
|
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge "<<k<<" / "<<MaxIterations
|
||||||
|
<<" residual "<< std::sqrt(max_resid)<< std::endl;
|
||||||
|
|
||||||
if (ErrorOnNoConverge) assert(0);
|
if (ErrorOnNoConverge) assert(0);
|
||||||
IterationsToComplete = k;
|
IterationsToComplete = k;
|
||||||
@@ -466,43 +544,6 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
|
|||||||
IterationsToComplete = k;
|
IterationsToComplete = k;
|
||||||
}
|
}
|
||||||
|
|
||||||
// m(row,col) = innerProduct(X[row],Y[col]) over the Nblock x Nblock block.
// Nblock comes from the enclosing scope; m must already be large enough.
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
  for(int row=0;row<Nblock;row++){
    const Field &left = X[row];
    for(int col=0;col<Nblock;col++) {
      m(row,col) = innerProduct(left,Y[col]);
    }
  }
}
|
|
||||||
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
|
|
||||||
// Should make this cache friendly with site outermost, parallel_for
|
|
||||||
// Deal with case AP aliases with either Y or X
|
|
||||||
std::vector<Field> tmp(Nblock,X[0]);
|
|
||||||
for(int b=0;b<Nblock;b++){
|
|
||||||
tmp[b] = Y[b];
|
|
||||||
for(int bp=0;bp<Nblock;bp++) {
|
|
||||||
tmp[b] = tmp[b] + scomplex(scale*m(bp,b))*X[bp];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for(int b=0;b<Nblock;b++){
|
|
||||||
AP[b] = tmp[b];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
|
|
||||||
// Should make this cache friendly with site outermost, parallel_for
|
|
||||||
for(int b=0;b<Nblock;b++){
|
|
||||||
AP[b] = Zero();
|
|
||||||
for(int bp=0;bp<Nblock;bp++) {
|
|
||||||
AP[b] += scomplex(m(bp,b))*X[bp];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
double normv(const std::vector<Field> &P){
|
|
||||||
double nn = 0.0;
|
|
||||||
for(int b=0;b<Nblock;b++) {
|
|
||||||
nn+=norm2(P[b]);
|
|
||||||
}
|
|
||||||
return nn;
|
|
||||||
}
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////
|
||||||
// BlockCGrQvec implementation:
|
// BlockCGrQvec implementation:
|
||||||
//--------------------------
|
//--------------------------
|
||||||
@@ -549,6 +590,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
|
|||||||
|
|
||||||
RealD sssum=0;
|
RealD sssum=0;
|
||||||
for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);}
|
for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);}
|
||||||
|
for(int b=0;b<Nblock;b++){ std::cout << "ssq["<<b<<"] "<<ssq[b]<<std::endl;}
|
||||||
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
|
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
|
||||||
|
|
||||||
for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);}
|
for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);}
|
||||||
@@ -585,6 +627,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
|
|||||||
for(int b=0;b<Nblock;b++) {
|
for(int b=0;b<Nblock;b++) {
|
||||||
Linop.HermOp(X[b], AD[b]);
|
Linop.HermOp(X[b], AD[b]);
|
||||||
tmp[b] = B[b] - AD[b];
|
tmp[b] = B[b] - AD[b];
|
||||||
|
std::cout << "r0["<<b<<"] "<<norm2(tmp[b])<<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
|
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
|
||||||
|
|||||||
@@ -38,12 +38,13 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
// single input vec, single output vec.
|
// single input vec, single output vec.
|
||||||
/////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
|
||||||
template <class Field>
|
template <class Field>
|
||||||
class ConjugateGradient : public OperatorFunction<Field> {
|
class ConjugateGradient : public OperatorFunction<Field> {
|
||||||
public:
|
public:
|
||||||
|
|
||||||
using OperatorFunction<Field>::operator();
|
using OperatorFunction<Field>::operator();
|
||||||
|
|
||||||
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
|
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
|
||||||
// Defaults true.
|
// Defaults true.
|
||||||
RealD Tolerance;
|
RealD Tolerance;
|
||||||
@@ -57,10 +58,22 @@ public:
|
|||||||
ErrorOnNoConverge(err_on_no_conv)
|
ErrorOnNoConverge(err_on_no_conv)
|
||||||
{};
|
{};
|
||||||
|
|
||||||
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
|
// Per-iteration hook, invoked by the solver as LogIteration(k,a,b).
// The base-class version does nothing; derived classes may override it.
virtual void LogIteration(int k,RealD a,RealD b)
{
  // Tracing intentionally disabled:
  //   std::cout << "ConjugageGradient::LogIteration() "<<std::endl;
}
|
||||||
|
// Start-of-solve hook, invoked at the top of operator().
// The base-class version only announces itself on stdout.
virtual void LogBegin(void)
{
  std::cout << "ConjugageGradient::LogBegin() " << std::endl;
}
|
||||||
|
|
||||||
GRID_TRACE("ConjugateGradient");
|
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
|
||||||
|
|
||||||
|
this->LogBegin();
|
||||||
|
|
||||||
|
GRID_TRACE("ConjugateGradient");
|
||||||
GridStopWatch PreambleTimer;
|
GridStopWatch PreambleTimer;
|
||||||
|
GridStopWatch ConstructTimer;
|
||||||
|
GridStopWatch NormTimer;
|
||||||
|
GridStopWatch AssignTimer;
|
||||||
PreambleTimer.Start();
|
PreambleTimer.Start();
|
||||||
psi.Checkerboard() = src.Checkerboard();
|
psi.Checkerboard() = src.Checkerboard();
|
||||||
|
|
||||||
@@ -70,14 +83,19 @@ public:
|
|||||||
//RealD b_pred;
|
//RealD b_pred;
|
||||||
|
|
||||||
// Was doing copies
|
// Was doing copies
|
||||||
Field p(src.Grid());
|
ConstructTimer.Start();
|
||||||
|
Field p (src.Grid());
|
||||||
Field mmp(src.Grid());
|
Field mmp(src.Grid());
|
||||||
Field r(src.Grid());
|
Field r (src.Grid());
|
||||||
|
ConstructTimer.Stop();
|
||||||
|
|
||||||
// Initial residual computation & set up
|
// Initial residual computation & set up
|
||||||
|
NormTimer.Start();
|
||||||
ssq = norm2(src);
|
ssq = norm2(src);
|
||||||
RealD guess = norm2(psi);
|
RealD guess = norm2(psi);
|
||||||
|
NormTimer.Stop();
|
||||||
assert(std::isnan(guess) == 0);
|
assert(std::isnan(guess) == 0);
|
||||||
|
AssignTimer.Start();
|
||||||
if ( guess == 0.0 ) {
|
if ( guess == 0.0 ) {
|
||||||
r = src;
|
r = src;
|
||||||
p = r;
|
p = r;
|
||||||
@@ -89,6 +107,7 @@ public:
|
|||||||
a = norm2(p);
|
a = norm2(p);
|
||||||
}
|
}
|
||||||
cp = a;
|
cp = a;
|
||||||
|
AssignTimer.Stop();
|
||||||
|
|
||||||
// Handle trivial case of zero src
|
// Handle trivial case of zero src
|
||||||
if (ssq == 0.){
|
if (ssq == 0.){
|
||||||
@@ -164,6 +183,7 @@ public:
|
|||||||
}
|
}
|
||||||
LinearCombTimer.Stop();
|
LinearCombTimer.Stop();
|
||||||
LinalgTimer.Stop();
|
LinalgTimer.Stop();
|
||||||
|
LogIteration(k,a,b);
|
||||||
|
|
||||||
IterationTimer.Stop();
|
IterationTimer.Stop();
|
||||||
if ( (k % 500) == 0 ) {
|
if ( (k % 500) == 0 ) {
|
||||||
@@ -220,6 +240,9 @@ public:
|
|||||||
<<" residual "<< std::sqrt(cp / ssq)<< std::endl;
|
<<" residual "<< std::sqrt(cp / ssq)<< std::endl;
|
||||||
SolverTimer.Stop();
|
SolverTimer.Stop();
|
||||||
std::cout << GridLogMessage << "\tPreamble " << PreambleTimer.Elapsed() <<std::endl;
|
std::cout << GridLogMessage << "\tPreamble " << PreambleTimer.Elapsed() <<std::endl;
|
||||||
|
std::cout << GridLogMessage << "\tConstruct " << ConstructTimer.Elapsed() <<std::endl;
|
||||||
|
std::cout << GridLogMessage << "\tNorm " << NormTimer.Elapsed() <<std::endl;
|
||||||
|
std::cout << GridLogMessage << "\tAssign " << AssignTimer.Elapsed() <<std::endl;
|
||||||
std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
|
std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
|
||||||
std::cout << GridLogMessage << "Solver breakdown "<<std::endl;
|
std::cout << GridLogMessage << "Solver breakdown "<<std::endl;
|
||||||
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
|
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
|
||||||
@@ -233,5 +256,118 @@ public:
|
|||||||
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
template <class Field>
|
||||||
|
class ConjugateGradientPolynomial : public ConjugateGradient<Field> {
|
||||||
|
public:
|
||||||
|
// Optionally record the CG polynomial
|
||||||
|
std::vector<double> ak;
|
||||||
|
std::vector<double> bk;
|
||||||
|
std::vector<double> poly_p;
|
||||||
|
std::vector<double> poly_r;
|
||||||
|
std::vector<double> poly_Ap;
|
||||||
|
std::vector<double> polynomial;
|
||||||
|
|
||||||
|
public:
|
||||||
|
ConjugateGradientPolynomial(RealD tol, Integer maxit, bool err_on_no_conv = true)
|
||||||
|
: ConjugateGradient<Field>(tol,maxit,err_on_no_conv)
|
||||||
|
{ };
|
||||||
|
void PolyHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
|
||||||
|
{
|
||||||
|
Field tmp(src.Grid());
|
||||||
|
Field AtoN(src.Grid());
|
||||||
|
AtoN = src;
|
||||||
|
psi=AtoN*polynomial[0];
|
||||||
|
for(int n=1;n<polynomial.size();n++){
|
||||||
|
tmp = AtoN;
|
||||||
|
Linop.HermOp(tmp,AtoN);
|
||||||
|
psi = psi + polynomial[n]*AtoN;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void CGsequenceHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &x)
|
||||||
|
{
|
||||||
|
Field Ap(src.Grid());
|
||||||
|
Field r(src.Grid());
|
||||||
|
Field p(src.Grid());
|
||||||
|
p=src;
|
||||||
|
r=src;
|
||||||
|
x=Zero();
|
||||||
|
x.Checkerboard()=src.Checkerboard();
|
||||||
|
for(int k=0;k<ak.size();k++){
|
||||||
|
x = x + ak[k]*p;
|
||||||
|
Linop.HermOp(p,Ap);
|
||||||
|
r = r - ak[k] * Ap;
|
||||||
|
p = r + bk[k] * p;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void Solve(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
|
||||||
|
{
|
||||||
|
psi=Zero();
|
||||||
|
this->operator ()(Linop,src,psi);
|
||||||
|
}
|
||||||
|
virtual void LogBegin(void)
|
||||||
|
{
|
||||||
|
std::cout << "ConjugageGradientPolynomial::LogBegin() "<<std::endl;
|
||||||
|
ak.resize(0);
|
||||||
|
bk.resize(0);
|
||||||
|
polynomial.resize(0);
|
||||||
|
poly_Ap.resize(0);
|
||||||
|
poly_Ap.resize(0);
|
||||||
|
poly_p.resize(1);
|
||||||
|
poly_r.resize(1);
|
||||||
|
poly_p[0]=1.0;
|
||||||
|
poly_r[0]=1.0;
|
||||||
|
};
|
||||||
|
virtual void LogIteration(int k,RealD a,RealD b)
|
||||||
|
{
|
||||||
|
// With zero guess,
|
||||||
|
// p = r = src
|
||||||
|
//
|
||||||
|
// iterate:
|
||||||
|
// x = x + a p
|
||||||
|
// r = r - a A p
|
||||||
|
// p = r + b p
|
||||||
|
//
|
||||||
|
// [0]
|
||||||
|
// r = x
|
||||||
|
// p = x
|
||||||
|
// Ap=0
|
||||||
|
//
|
||||||
|
// [1]
|
||||||
|
// Ap = A x + 0 ==> shift poly P right by 1 and add 0.
|
||||||
|
// x = x + a p ==> add polynomials term by term
|
||||||
|
// r = r - a A p ==> add polynomials term by term
|
||||||
|
// p = r + b p ==> add polynomials term by term
|
||||||
|
//
|
||||||
|
std::cout << "ConjugageGradientPolynomial::LogIteration() "<<k<<std::endl;
|
||||||
|
ak.push_back(a);
|
||||||
|
bk.push_back(b);
|
||||||
|
// Ap= right_shift(p)
|
||||||
|
poly_Ap.resize(k+1);
|
||||||
|
poly_Ap[0]=0.0;
|
||||||
|
for(int i=0;i<k;i++){
|
||||||
|
poly_Ap[i+1]=poly_p[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
// x = x + a p
|
||||||
|
polynomial.resize(k);
|
||||||
|
polynomial[k-1]=0.0;
|
||||||
|
for(int i=0;i<k;i++){
|
||||||
|
polynomial[i] = polynomial[i] + a * poly_p[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
// r = r - a Ap
|
||||||
|
// p = r + b p
|
||||||
|
poly_r.resize(k+1);
|
||||||
|
poly_p.resize(k+1);
|
||||||
|
poly_r[k] = poly_p[k] = 0.0;
|
||||||
|
for(int i=0;i<k+1;i++){
|
||||||
|
poly_r[i] = poly_r[i] - a * poly_Ap[i];
|
||||||
|
poly_p[i] = poly_r[i] + b * poly_p[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -116,14 +116,14 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
//Compute double precision rsd and also new RHS vector.
|
//Compute double precision rsd and also new RHS vector.
|
||||||
Linop_d.HermOp(sol_d, tmp_d);
|
Linop_d.HermOp(sol_d, tmp_d);
|
||||||
RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
|
RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
|
||||||
|
std::cout<<GridLogMessage<<" rsd norm "<<norm<<std::endl;
|
||||||
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
|
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
|
||||||
|
|
||||||
if(norm < OuterLoopNormMult * stop){
|
if(norm < OuterLoopNormMult * stop){
|
||||||
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
|
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
|
while(norm * inner_tol * inner_tol < stop*1.01) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
|
||||||
|
|
||||||
PrecChangeTimer.Start();
|
PrecChangeTimer.Start();
|
||||||
precisionChange(src_f, src_d, pc_wk_dp_to_sp);
|
precisionChange(src_f, src_d, pc_wk_dp_to_sp);
|
||||||
|
|||||||
@@ -102,11 +102,11 @@ public:
|
|||||||
assert(mass.size()==nshift);
|
assert(mass.size()==nshift);
|
||||||
assert(mresidual.size()==nshift);
|
assert(mresidual.size()==nshift);
|
||||||
|
|
||||||
// dynamic sized arrays on stack; 2d is a pain with vector
|
// remove dynamic sized arrays on stack; 2d is a pain with vector
|
||||||
RealD bs[nshift];
|
std::vector<RealD> bs(nshift);
|
||||||
RealD rsq[nshift];
|
std::vector<RealD> rsq(nshift);
|
||||||
RealD z[nshift][2];
|
std::vector<std::array<RealD,2> > z(nshift);
|
||||||
int converged[nshift];
|
std::vector<int> converged(nshift);
|
||||||
|
|
||||||
const int primary =0;
|
const int primary =0;
|
||||||
|
|
||||||
|
|||||||
@@ -123,11 +123,11 @@ public:
|
|||||||
assert(mresidual.size()==nshift);
|
assert(mresidual.size()==nshift);
|
||||||
|
|
||||||
// dynamic sized arrays on stack; 2d is a pain with vector
|
// dynamic sized arrays on stack; 2d is a pain with vector
|
||||||
RealD bs[nshift];
|
std::vector<RealD> bs(nshift);
|
||||||
RealD rsq[nshift];
|
std::vector<RealD> rsq(nshift);
|
||||||
RealD rsqf[nshift];
|
std::vector<RealD> rsqf(nshift);
|
||||||
RealD z[nshift][2];
|
std::vector<std::array<RealD,2> > z(nshift);
|
||||||
int converged[nshift];
|
std::vector<int> converged(nshift);
|
||||||
|
|
||||||
const int primary =0;
|
const int primary =0;
|
||||||
|
|
||||||
|
|||||||
@@ -156,11 +156,11 @@ public:
|
|||||||
assert(mresidual.size()==nshift);
|
assert(mresidual.size()==nshift);
|
||||||
|
|
||||||
// dynamic sized arrays on stack; 2d is a pain with vector
|
// dynamic sized arrays on stack; 2d is a pain with vector
|
||||||
RealD bs[nshift];
|
std::vector<RealD> bs(nshift);
|
||||||
RealD rsq[nshift];
|
std::vector<RealD> rsq(nshift);
|
||||||
RealD rsqf[nshift];
|
std::vector<RealD> rsqf(nshift);
|
||||||
RealD z[nshift][2];
|
std::vector<std::array<RealD,2> > z(nshift);
|
||||||
int converged[nshift];
|
std::vector<int> converged(nshift);
|
||||||
|
|
||||||
const int primary =0;
|
const int primary =0;
|
||||||
|
|
||||||
|
|||||||
@@ -143,7 +143,7 @@ public:
|
|||||||
ip = innerProduct(evec[j],w);
|
ip = innerProduct(evec[j],w);
|
||||||
if(if_print)
|
if(if_print)
|
||||||
if( norm(ip)/norm2(w) > 1e-14)
|
if( norm(ip)/norm2(w) > 1e-14)
|
||||||
Glog<<"orthogonalize before: "<<j<<" of "<<k<<" "<< ip <<std::endl;
|
Glog<<"orthogonalize before: "<<j<<" of "<<k<<" "<< ip <<std::endl;
|
||||||
w = w - ip * evec[j];
|
w = w - ip * evec[j];
|
||||||
if(if_print) {
|
if(if_print) {
|
||||||
ip = innerProduct(evec[j],w);
|
ip = innerProduct(evec[j],w);
|
||||||
@@ -279,16 +279,16 @@ public:
|
|||||||
Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
|
Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
|
||||||
diagonalize(eval2,lmd2,lme2,Nu,Nm,Nm,Qt,grid);
|
diagonalize(eval2,lmd2,lme2,Nu,Nm,Nm,Qt,grid);
|
||||||
_sort.push(eval2,Nm);
|
_sort.push(eval2,Nm);
|
||||||
Glog << "#Ritz value before shift: "<< std::endl;
|
// Glog << "#Ritz value before shift: "<< std::endl;
|
||||||
for(int i=0; i<Nm; ++i){
|
for(int i=0; i<Nm; ++i){
|
||||||
std::cout.precision(13);
|
// std::cout.precision(13);
|
||||||
std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
|
// std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
|
||||||
std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
|
// std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
//----------------------------------------------------------------------
|
//----------------------------------------------------------------------
|
||||||
if ( Nm>Nk ) {
|
if ( Nm>Nk ) {
|
||||||
Glog <<" #Apply shifted QR transformations "<<std::endl;
|
// Glog <<" #Apply shifted QR transformations "<<std::endl;
|
||||||
//int k2 = Nk+Nu;
|
//int k2 = Nk+Nu;
|
||||||
int k2 = Nk;
|
int k2 = Nk;
|
||||||
|
|
||||||
@@ -326,11 +326,11 @@ public:
|
|||||||
Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
|
Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
|
||||||
diagonalize(eval2,lmd2,lme2,Nu,Nk,Nm,Qt,grid);
|
diagonalize(eval2,lmd2,lme2,Nu,Nk,Nm,Qt,grid);
|
||||||
_sort.push(eval2,Nk);
|
_sort.push(eval2,Nk);
|
||||||
Glog << "#Ritz value after shift: "<< std::endl;
|
// Glog << "#Ritz value after shift: "<< std::endl;
|
||||||
for(int i=0; i<Nk; ++i){
|
for(int i=0; i<Nk; ++i){
|
||||||
// std::cout.precision(13);
|
// std::cout.precision(13);
|
||||||
// std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
|
// std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
|
||||||
// std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
|
// std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//----------------------------------------------------------------------
|
//----------------------------------------------------------------------
|
||||||
@@ -644,7 +644,7 @@ private:
|
|||||||
// for (int u=0; u<mrhs; ++u) Glog << " out["<<u<<"] = "<<norm2(out[u])<<std::endl;
|
// for (int u=0; u<mrhs; ++u) Glog << " out["<<u<<"] = "<<norm2(out[u])<<std::endl;
|
||||||
k_start +=mrhs;
|
k_start +=mrhs;
|
||||||
}
|
}
|
||||||
Glog << "LinAlg "<< std::endl;
|
// Glog << "LinAlg "<< std::endl;
|
||||||
|
|
||||||
if (b>0) {
|
if (b>0) {
|
||||||
for (int u=0; u<Nu; ++u) {
|
for (int u=0; u<Nu; ++u) {
|
||||||
@@ -678,7 +678,7 @@ private:
|
|||||||
}
|
}
|
||||||
w_copy[u] = w[u];
|
w_copy[u] = w[u];
|
||||||
}
|
}
|
||||||
Glog << "LinAlg done"<< std::endl;
|
// Glog << "LinAlg done"<< std::endl;
|
||||||
|
|
||||||
// In block version, the steps 6 and 7 in Lanczos construction is
|
// In block version, the steps 6 and 7 in Lanczos construction is
|
||||||
// replaced by the QR decomposition of new basis block.
|
// replaced by the QR decomposition of new basis block.
|
||||||
@@ -691,15 +691,15 @@ private:
|
|||||||
}
|
}
|
||||||
|
|
||||||
// re-orthogonalization for numerical stability
|
// re-orthogonalization for numerical stability
|
||||||
Glog << "Gram Schmidt"<< std::endl;
|
// Glog << "Gram Schmidt"<< std::endl;
|
||||||
orthogonalize(w,Nu,evec,R);
|
orthogonalize(w,Nu,evec,R);
|
||||||
// QR part
|
// QR part
|
||||||
for (int u=1; u<Nu; ++u) {
|
for (int u=1; u<Nu; ++u) {
|
||||||
orthogonalize(w[u],w,u);
|
orthogonalize(w[u],w,u);
|
||||||
}
|
}
|
||||||
Glog << "Gram Schmidt done "<< std::endl;
|
// Glog << "Gram Schmidt done "<< std::endl;
|
||||||
|
|
||||||
Glog << "LinAlg "<< std::endl;
|
// Glog << "LinAlg "<< std::endl;
|
||||||
for (int u=0; u<Nu; ++u) {
|
for (int u=0; u<Nu; ++u) {
|
||||||
//for (int v=0; v<Nu; ++v) {
|
//for (int v=0; v<Nu; ++v) {
|
||||||
for (int v=u; v<Nu; ++v) {
|
for (int v=u; v<Nu; ++v) {
|
||||||
@@ -716,7 +716,7 @@ private:
|
|||||||
// Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl;
|
// Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Glog << "LinAlg done "<< std::endl;
|
// Glog << "LinAlg done "<< std::endl;
|
||||||
|
|
||||||
if (b < Nm/Nu-1) {
|
if (b < Nm/Nu-1) {
|
||||||
for (int u=0; u<Nu; ++u) {
|
for (int u=0; u<Nu; ++u) {
|
||||||
@@ -935,7 +935,7 @@ if (1){
|
|||||||
int Nu, int Nb, int Nk, int Nm,
|
int Nu, int Nb, int Nk, int Nm,
|
||||||
Eigen::MatrixXcd& M)
|
Eigen::MatrixXcd& M)
|
||||||
{
|
{
|
||||||
Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n';
|
// Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n';
|
||||||
assert( Nk%Nu == 0 && Nm%Nu == 0 );
|
assert( Nk%Nu == 0 && Nm%Nu == 0 );
|
||||||
assert( Nk <= Nm );
|
assert( Nk <= Nm );
|
||||||
M = Eigen::MatrixXcd::Zero(Nk,Nk);
|
M = Eigen::MatrixXcd::Zero(Nk,Nk);
|
||||||
@@ -953,7 +953,7 @@ if (1){
|
|||||||
M(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu];
|
M(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Glog << "unpackHermitBlockTriDiagMatToEigen() end" << std::endl;
|
// Glog << "unpackHermitBlockTriDiagMatToEigen() end" << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -963,7 +963,7 @@ if (1){
|
|||||||
int Nu, int Nb, int Nk, int Nm,
|
int Nu, int Nb, int Nk, int Nm,
|
||||||
Eigen::MatrixXcd& M)
|
Eigen::MatrixXcd& M)
|
||||||
{
|
{
|
||||||
Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n';
|
// Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n';
|
||||||
assert( Nk%Nu == 0 && Nm%Nu == 0 );
|
assert( Nk%Nu == 0 && Nm%Nu == 0 );
|
||||||
assert( Nk <= Nm );
|
assert( Nk <= Nm );
|
||||||
|
|
||||||
@@ -979,7 +979,7 @@ if (1){
|
|||||||
lme[u][k-Nu] = M(u+(k/Nu)*Nu,k-Nu);
|
lme[u][k-Nu] = M(u+(k/Nu)*Nu,k-Nu);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Glog << "packHermitBlockTriDiagMatfromEigen() end" <<std::endl;
|
// Glog << "packHermitBlockTriDiagMatfromEigen() end" <<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -988,7 +988,7 @@ if (1){
|
|||||||
RealD Dsh,
|
RealD Dsh,
|
||||||
Eigen::MatrixXcd& Qprod)
|
Eigen::MatrixXcd& Qprod)
|
||||||
{
|
{
|
||||||
Glog << "shiftedQRDecompEigen() begin" << '\n';
|
// Glog << "shiftedQRDecompEigen() begin" << '\n';
|
||||||
Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm);
|
Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm);
|
||||||
Eigen::MatrixXcd R = Eigen::MatrixXcd::Zero(Nm,Nm);
|
Eigen::MatrixXcd R = Eigen::MatrixXcd::Zero(Nm,Nm);
|
||||||
Eigen::MatrixXcd Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
|
Eigen::MatrixXcd Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
|
||||||
@@ -1004,7 +1004,7 @@ if (1){
|
|||||||
// lower triangular part used to represent series
|
// lower triangular part used to represent series
|
||||||
// of Q sequence.
|
// of Q sequence.
|
||||||
|
|
||||||
Glog << "shiftedQRDecompEigen() Housholder & QR" << '\n';
|
// Glog << "shiftedQRDecompEigen() Housholder & QR" << '\n';
|
||||||
// equivalent operation of Qprod *= Q
|
// equivalent operation of Qprod *= Q
|
||||||
//M = Eigen::MatrixXcd::Zero(Nm,Nm);
|
//M = Eigen::MatrixXcd::Zero(Nm,Nm);
|
||||||
|
|
||||||
@@ -1025,7 +1025,7 @@ if (1){
|
|||||||
|
|
||||||
Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
|
Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
|
||||||
|
|
||||||
Glog << "shiftedQRDecompEigen() Mtmp create" << '\n';
|
// Glog << "shiftedQRDecompEigen() Mtmp create" << '\n';
|
||||||
for (int i=0; i<Nm; ++i) {
|
for (int i=0; i<Nm; ++i) {
|
||||||
for (int j=0; j<Nm-(Nu+1); ++j) {
|
for (int j=0; j<Nm-(Nu+1); ++j) {
|
||||||
for (int k=0; k<Nu+1+j; ++k) {
|
for (int k=0; k<Nu+1+j; ++k) {
|
||||||
@@ -1033,7 +1033,7 @@ if (1){
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Glog << "shiftedQRDecompEigen() Mtmp loop1" << '\n';
|
// Glog << "shiftedQRDecompEigen() Mtmp loop1" << '\n';
|
||||||
for (int i=0; i<Nm; ++i) {
|
for (int i=0; i<Nm; ++i) {
|
||||||
for (int j=Nm-(Nu+1); j<Nm; ++j) {
|
for (int j=Nm-(Nu+1); j<Nm; ++j) {
|
||||||
for (int k=0; k<Nm; ++k) {
|
for (int k=0; k<Nm; ++k) {
|
||||||
@@ -1041,7 +1041,7 @@ if (1){
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Glog << "shiftedQRDecompEigen() Mtmp loop2" << '\n';
|
// Glog << "shiftedQRDecompEigen() Mtmp loop2" << '\n';
|
||||||
|
|
||||||
//static int ntimes = 2;
|
//static int ntimes = 2;
|
||||||
//for (int j=0; j<Nm-(ntimes*Nu); ++j) {
|
//for (int j=0; j<Nm-(ntimes*Nu); ++j) {
|
||||||
@@ -1067,13 +1067,13 @@ if (1){
|
|||||||
Mtmp(j,i) = conj(Mtmp(i,j));
|
Mtmp(j,i) = conj(Mtmp(i,j));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Glog << "shiftedQRDecompEigen() Mtmp loop3" << '\n';
|
// Glog << "shiftedQRDecompEigen() Mtmp loop3" << '\n';
|
||||||
|
|
||||||
for (int i=0; i<Nm; ++i) {
|
for (int i=0; i<Nm; ++i) {
|
||||||
Mtmp(i,i) = real(Mtmp(i,i)) + Dsh;
|
Mtmp(i,i) = real(Mtmp(i,i)) + Dsh;
|
||||||
}
|
}
|
||||||
|
|
||||||
Glog << "shiftedQRDecompEigen() Mtmp loop4" << '\n';
|
// Glog << "shiftedQRDecompEigen() Mtmp loop4" << '\n';
|
||||||
M = Mtmp;
|
M = Mtmp;
|
||||||
|
|
||||||
//M = Q.adjoint()*(M*Q);
|
//M = Q.adjoint()*(M*Q);
|
||||||
@@ -1085,7 +1085,7 @@ if (1){
|
|||||||
// }
|
// }
|
||||||
//}
|
//}
|
||||||
|
|
||||||
Glog << "shiftedQRDecompEigen() end" <<std::endl;
|
// Glog << "shiftedQRDecompEigen() end" <<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
void exampleQRDecompEigen(void)
|
void exampleQRDecompEigen(void)
|
||||||
|
|||||||
@@ -60,6 +60,32 @@ public:
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////
// LinearFunction wrapper: out = Mdag res, where res is obtained by
// handing the operator M Mdag and the right hand side "in" to the
// supplied Hermitian solver.
// Assuming the solver returns res with M Mdag res = in, the result
// satisfies M out = in.
/////////////////////////////////////////////////////
template<class Field> class NormalResidual : public LinearFunction<Field>{
private:
  SparseMatrixBase<Field> & _Matrix;          // the matrix M
  OperatorFunction<Field> & _HermitianSolver; // solver applied to M Mdag
  LinearFunction<Field>   & _Guess;           // fills the initial guess handed to the solver
public:

  /////////////////////////////////////////////////////
  // Wrap the usual normal equations trick
  // Only references are stored: Matrix, HermitianSolver and Guess
  // must outlive this object.
  /////////////////////////////////////////////////////
  NormalResidual(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver,
                 LinearFunction<Field> &Guess)
    : _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {}

  // in  : right hand side
  // out : overwritten with Mdag res
  void operator() (const Field &in, Field &out){

    // Single work field; the previously declared second temporary was
    // never referenced and only cost an extra field construction per call.
    Field res(in.Grid());

    MMdagLinearOperator<SparseMatrixBase<Field>,Field> MMdagOp(_Matrix);
    _Guess(in,res);                    // initial guess for res
    _HermitianSolver(MMdagOp,in,res);  // M Mdag res = in ;
    _Matrix.Mdag(res,out);             // out = Mdag res
  }
};
|
||||||
|
|
||||||
template<class Field> class HPDSolver : public LinearFunction<Field> {
|
template<class Field> class HPDSolver : public LinearFunction<Field> {
|
||||||
private:
|
private:
|
||||||
LinearOperatorBase<Field> & _Matrix;
|
LinearOperatorBase<Field> & _Matrix;
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ template<class Field> class PowerMethod
|
|||||||
RealD evalMaxApprox = 0.0;
|
RealD evalMaxApprox = 0.0;
|
||||||
auto src_n = src;
|
auto src_n = src;
|
||||||
auto tmp = src;
|
auto tmp = src;
|
||||||
const int _MAX_ITER_EST_ = 100;
|
const int _MAX_ITER_EST_ = 200;
|
||||||
|
|
||||||
for (int i=0;i<_MAX_ITER_EST_;i++) {
|
for (int i=0;i<_MAX_ITER_EST_;i++) {
|
||||||
|
|
||||||
@@ -30,18 +30,17 @@ template<class Field> class PowerMethod
|
|||||||
RealD vden = norm2(src_n);
|
RealD vden = norm2(src_n);
|
||||||
RealD na = vnum/vden;
|
RealD na = vnum/vden;
|
||||||
|
|
||||||
std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
|
std::cout << GridLogMessage << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
|
||||||
|
|
||||||
if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) {
|
// if ( (fabs(evalMaxApprox/na - 1.0) < 0.0001) || (i==_MAX_ITER_EST_-1) ) {
|
||||||
evalMaxApprox = na;
|
// evalMaxApprox = na;
|
||||||
std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
|
// return evalMaxApprox;
|
||||||
return evalMaxApprox;
|
// }
|
||||||
}
|
|
||||||
evalMaxApprox = na;
|
evalMaxApprox = na;
|
||||||
src_n = tmp;
|
src_n = tmp;
|
||||||
}
|
}
|
||||||
assert(0);
|
std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
|
||||||
return 0;
|
return evalMaxApprox;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
76
Grid/algorithms/iterative/PowerSpectrum.h
Normal file
76
Grid/algorithms/iterative/PowerSpectrum.h
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
#pragma once
|
||||||
|
namespace Grid {
|
||||||
|
|
||||||
|
class Band
|
||||||
|
{
|
||||||
|
RealD lo, hi;
|
||||||
|
public:
|
||||||
|
Band(RealD _lo,RealD _hi)
|
||||||
|
{
|
||||||
|
lo=_lo;
|
||||||
|
hi=_hi;
|
||||||
|
}
|
||||||
|
RealD operator() (RealD x){
|
||||||
|
if ( x>lo && x<hi ){
|
||||||
|
return 1.0;
|
||||||
|
} else {
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class PowerSpectrum
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
|
||||||
|
template<typename T> static RealD normalise(T& v)
|
||||||
|
{
|
||||||
|
RealD nn = norm2(v);
|
||||||
|
nn = sqrt(nn);
|
||||||
|
v = v * (1.0/nn);
|
||||||
|
return nn;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<RealD> ranges;
|
||||||
|
std::vector<int> order;
|
||||||
|
|
||||||
|
PowerSpectrum( std::vector<RealD> &bins, std::vector<int> &_order ) : ranges(bins), order(_order) { };
|
||||||
|
|
||||||
|
template<class Field>
|
||||||
|
RealD operator()(LinearOperatorBase<Field> &HermOp, const Field &src)
|
||||||
|
{
|
||||||
|
GridBase *grid = src.Grid();
|
||||||
|
int N=ranges.size();
|
||||||
|
RealD hi = ranges[N-1];
|
||||||
|
|
||||||
|
RealD lo_band = 0.0;
|
||||||
|
RealD hi_band;
|
||||||
|
RealD nn=norm2(src);
|
||||||
|
RealD ss=0.0;
|
||||||
|
|
||||||
|
Field tmp = src;
|
||||||
|
|
||||||
|
for(int b=0;b<N;b++){
|
||||||
|
hi_band = ranges[b];
|
||||||
|
Band Notch(lo_band,hi_band);
|
||||||
|
|
||||||
|
Chebyshev<Field> polynomial;
|
||||||
|
polynomial.Init(0.0,hi,order[b],Notch);
|
||||||
|
polynomial.JacksonSmooth();
|
||||||
|
|
||||||
|
polynomial(HermOp,src,tmp) ;
|
||||||
|
|
||||||
|
RealD p=norm2(tmp);
|
||||||
|
ss=ss+p;
|
||||||
|
std::cout << GridLogMessage << " PowerSpectrum Band["<<lo_band<<","<<hi_band<<"] power "<<norm2(tmp)/nn<<std::endl;
|
||||||
|
|
||||||
|
lo_band=hi_band;
|
||||||
|
}
|
||||||
|
std::cout << GridLogMessage << " PowerSpectrum total power "<<ss/nn<<std::endl;
|
||||||
|
std::cout << GridLogMessage << " PowerSpectrum total power (unnormalised) "<<nn<<std::endl;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
@@ -99,7 +99,7 @@ public:
|
|||||||
CoarseMatrix AselfInvEven;
|
CoarseMatrix AselfInvEven;
|
||||||
CoarseMatrix AselfInvOdd;
|
CoarseMatrix AselfInvOdd;
|
||||||
|
|
||||||
Vector<RealD> dag_factor;
|
deviceVector<RealD> dag_factor;
|
||||||
|
|
||||||
///////////////////////
|
///////////////////////
|
||||||
// Interface
|
// Interface
|
||||||
@@ -124,9 +124,13 @@ public:
|
|||||||
int npoint = geom.npoint;
|
int npoint = geom.npoint;
|
||||||
typedef LatticeView<Cobj> Aview;
|
typedef LatticeView<Cobj> Aview;
|
||||||
|
|
||||||
Vector<Aview> AcceleratorViewContainer;
|
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
|
||||||
|
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
|
||||||
|
|
||||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
|
for(int p=0;p<geom.npoint;p++) {
|
||||||
|
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
|
||||||
|
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
|
||||||
|
}
|
||||||
Aview *Aview_p = & AcceleratorViewContainer[0];
|
Aview *Aview_p = & AcceleratorViewContainer[0];
|
||||||
|
|
||||||
const int Nsimd = CComplex::Nsimd();
|
const int Nsimd = CComplex::Nsimd();
|
||||||
@@ -161,7 +165,7 @@ public:
|
|||||||
coalescedWrite(out_v[ss](b),res);
|
coalescedWrite(out_v[ss](b),res);
|
||||||
});
|
});
|
||||||
|
|
||||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
|
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
|
||||||
};
|
};
|
||||||
|
|
||||||
void Mdag (const CoarseVector &in, CoarseVector &out)
|
void Mdag (const CoarseVector &in, CoarseVector &out)
|
||||||
@@ -190,9 +194,14 @@ public:
|
|||||||
int npoint = geom.npoint;
|
int npoint = geom.npoint;
|
||||||
typedef LatticeView<Cobj> Aview;
|
typedef LatticeView<Cobj> Aview;
|
||||||
|
|
||||||
Vector<Aview> AcceleratorViewContainer;
|
|
||||||
|
|
||||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
|
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
|
||||||
|
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
|
||||||
|
|
||||||
|
for(int p=0;p<geom.npoint;p++) {
|
||||||
|
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
|
||||||
|
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
|
||||||
|
}
|
||||||
Aview *Aview_p = & AcceleratorViewContainer[0];
|
Aview *Aview_p = & AcceleratorViewContainer[0];
|
||||||
|
|
||||||
const int Nsimd = CComplex::Nsimd();
|
const int Nsimd = CComplex::Nsimd();
|
||||||
@@ -201,10 +210,10 @@ public:
|
|||||||
|
|
||||||
int osites=Grid()->oSites();
|
int osites=Grid()->oSites();
|
||||||
|
|
||||||
Vector<int> points(geom.npoint, 0);
|
deviceVector<int> points(geom.npoint);
|
||||||
for(int p=0; p<geom.npoint; p++)
|
for(int p=0; p<geom.npoint; p++) {
|
||||||
points[p] = geom.points_dagger[p];
|
acceleratorPut(points[p],geom.points_dagger[p]);
|
||||||
|
}
|
||||||
auto points_p = &points[0];
|
auto points_p = &points[0];
|
||||||
|
|
||||||
RealD* dag_factor_p = &dag_factor[0];
|
RealD* dag_factor_p = &dag_factor[0];
|
||||||
@@ -236,7 +245,7 @@ public:
|
|||||||
coalescedWrite(out_v[ss](b),res);
|
coalescedWrite(out_v[ss](b),res);
|
||||||
});
|
});
|
||||||
|
|
||||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
|
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
|
||||||
}
|
}
|
||||||
|
|
||||||
void MdirComms(const CoarseVector &in)
|
void MdirComms(const CoarseVector &in)
|
||||||
@@ -251,8 +260,14 @@ public:
|
|||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
|
||||||
typedef LatticeView<Cobj> Aview;
|
typedef LatticeView<Cobj> Aview;
|
||||||
Vector<Aview> AcceleratorViewContainer;
|
|
||||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
|
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
|
||||||
|
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
|
||||||
|
|
||||||
|
for(int p=0;p<geom.npoint;p++) {
|
||||||
|
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
|
||||||
|
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
|
||||||
|
}
|
||||||
Aview *Aview_p = & AcceleratorViewContainer[0];
|
Aview *Aview_p = & AcceleratorViewContainer[0];
|
||||||
|
|
||||||
autoView( out_v , out, AcceleratorWrite);
|
autoView( out_v , out, AcceleratorWrite);
|
||||||
@@ -285,7 +300,7 @@ public:
|
|||||||
}
|
}
|
||||||
coalescedWrite(out_v[ss](b),res);
|
coalescedWrite(out_v[ss](b),res);
|
||||||
});
|
});
|
||||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
|
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
|
||||||
}
|
}
|
||||||
void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
|
void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
|
||||||
{
|
{
|
||||||
@@ -469,14 +484,20 @@ public:
|
|||||||
|
|
||||||
// determine in what order we need the points
|
// determine in what order we need the points
|
||||||
int npoint = geom.npoint-1;
|
int npoint = geom.npoint-1;
|
||||||
Vector<int> points(npoint, 0);
|
deviceVector<int> points(npoint);
|
||||||
for(int p=0; p<npoint; p++)
|
for(int p=0; p<npoint; p++) {
|
||||||
points[p] = (dag && !hermitian) ? geom.points_dagger[p] : p;
|
int val = (dag && !hermitian) ? geom.points_dagger[p] : p;
|
||||||
|
acceleratorPut(points[p], val);
|
||||||
|
}
|
||||||
auto points_p = &points[0];
|
auto points_p = &points[0];
|
||||||
|
|
||||||
Vector<Aview> AcceleratorViewContainer;
|
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
|
||||||
for(int p=0;p<npoint;p++) AcceleratorViewContainer.push_back(a[p].View(AcceleratorRead));
|
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
|
||||||
|
|
||||||
|
for(int p=0;p<geom.npoint;p++) {
|
||||||
|
hAcceleratorViewContainer[p] = a[p].View(AcceleratorRead);
|
||||||
|
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
|
||||||
|
}
|
||||||
Aview *Aview_p = & AcceleratorViewContainer[0];
|
Aview *Aview_p = & AcceleratorViewContainer[0];
|
||||||
|
|
||||||
const int Nsimd = CComplex::Nsimd();
|
const int Nsimd = CComplex::Nsimd();
|
||||||
@@ -539,7 +560,7 @@ public:
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
for(int p=0;p<npoint;p++) AcceleratorViewContainer[p].ViewClose();
|
for(int p=0;p<npoint;p++) hAcceleratorViewContainer[p].ViewClose();
|
||||||
}
|
}
|
||||||
|
|
||||||
CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) :
|
CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) :
|
||||||
@@ -590,11 +611,13 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
// GPU readable prefactor
|
// GPU readable prefactor
|
||||||
|
std::vector<RealD> h_dag_factor(nbasis*nbasis);
|
||||||
thread_for(i, nbasis*nbasis, {
|
thread_for(i, nbasis*nbasis, {
|
||||||
int j = i/nbasis;
|
int j = i/nbasis;
|
||||||
int k = i%nbasis;
|
int k = i%nbasis;
|
||||||
dag_factor[i] = dag_factor_eigen(j, k);
|
h_dag_factor[i] = dag_factor_eigen(j, k);
|
||||||
});
|
});
|
||||||
|
acceleratorCopyToDevice(&h_dag_factor[0],&dag_factor[0],dag_factor.size()*sizeof(RealD));
|
||||||
}
|
}
|
||||||
|
|
||||||
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
|
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
|
||||||
|
|||||||
@@ -174,21 +174,11 @@ template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const d
|
|||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Template typedefs
|
// Template typedefs
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
#ifdef ACCELERATOR_CSHIFT
|
template<class T> using hostVector = std::vector<T,alignedAllocator<T> >; // Needs autoview
|
||||||
// Cshift on device
|
template<class T> using Vector = std::vector<T,uvmAllocator<T> >; //
|
||||||
template<class T> using cshiftAllocator = devAllocator<T>;
|
template<class T> using uvmVector = std::vector<T,uvmAllocator<T> >; // auto migrating page
|
||||||
#else
|
template<class T> using deviceVector = std::vector<T,devAllocator<T> >; // device vector
|
||||||
// Cshift on host
|
|
||||||
template<class T> using cshiftAllocator = std::allocator<T>;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
|
|
||||||
template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;
|
|
||||||
template<class T> using commVector = std::vector<T,devAllocator<T> >;
|
|
||||||
template<class T> using deviceVector = std::vector<T,devAllocator<T> >;
|
|
||||||
template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
|
|
||||||
|
|
||||||
/*
|
|
||||||
template<class T> class vecView
|
template<class T> class vecView
|
||||||
{
|
{
|
||||||
protected:
|
protected:
|
||||||
@@ -197,8 +187,9 @@ template<class T> class vecView
|
|||||||
ViewMode mode;
|
ViewMode mode;
|
||||||
void * cpu_ptr;
|
void * cpu_ptr;
|
||||||
public:
|
public:
|
||||||
|
// Rvalue accessor
|
||||||
accelerator_inline T & operator[](size_t i) const { return this->data[i]; };
|
accelerator_inline T & operator[](size_t i) const { return this->data[i]; };
|
||||||
vecView(std::vector<T> &refer_to_me,ViewMode _mode)
|
vecView(Vector<T> &refer_to_me,ViewMode _mode)
|
||||||
{
|
{
|
||||||
cpu_ptr = &refer_to_me[0];
|
cpu_ptr = &refer_to_me[0];
|
||||||
size = refer_to_me.size();
|
size = refer_to_me.size();
|
||||||
@@ -214,26 +205,15 @@ template<class T> class vecView
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template<class T> vecView<T> VectorView(std::vector<T> &vec,ViewMode _mode)
|
template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode)
|
||||||
{
|
{
|
||||||
vecView<T> ret(vec,_mode); // does the open
|
vecView<T> ret(vec,_mode); // does the open
|
||||||
return ret; // must be closed
|
return ret; // must be closed
|
||||||
}
|
}
|
||||||
|
|
||||||
// Little autoscope assister
|
|
||||||
template<class View>
|
|
||||||
class VectorViewCloser
|
|
||||||
{
|
|
||||||
View v; // Take a copy of view and call view close when I go out of scope automatically
|
|
||||||
public:
|
|
||||||
VectorViewCloser(View &_v) : v(_v) {};
|
|
||||||
~VectorViewCloser() { auto ptr = v.cpu_ptr; v.ViewClose(); MemoryManager::NotifyDeletion(ptr);}
|
|
||||||
};
|
|
||||||
|
|
||||||
#define autoVecView(v_v,v,mode) \
|
#define autoVecView(v_v,v,mode) \
|
||||||
auto v_v = VectorView(v,mode); \
|
auto v_v = VectorView(v,mode); \
|
||||||
ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
|
ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
|
||||||
*/
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
|||||||
@@ -15,10 +15,10 @@ void check_huge_pages(void *Buf,uint64_t BYTES)
|
|||||||
uint64_t virt_pfn = (uint64_t)Buf / page_size;
|
uint64_t virt_pfn = (uint64_t)Buf / page_size;
|
||||||
off_t offset = sizeof(uint64_t) * virt_pfn;
|
off_t offset = sizeof(uint64_t) * virt_pfn;
|
||||||
uint64_t npages = (BYTES + page_size-1) / page_size;
|
uint64_t npages = (BYTES + page_size-1) / page_size;
|
||||||
uint64_t pagedata[npages];
|
std::vector<uint64_t> pagedata(npages);
|
||||||
uint64_t ret = lseek(fd, offset, SEEK_SET);
|
uint64_t ret = lseek(fd, offset, SEEK_SET);
|
||||||
assert(ret == offset);
|
assert(ret == offset);
|
||||||
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
|
ret = ::read(fd, &pagedata[0], sizeof(uint64_t)*npages);
|
||||||
assert(ret == sizeof(uint64_t) * npages);
|
assert(ret == sizeof(uint64_t) * npages);
|
||||||
int nhugepages = npages / 512;
|
int nhugepages = npages / 512;
|
||||||
int n4ktotal, nnothuge;
|
int n4ktotal, nnothuge;
|
||||||
|
|||||||
@@ -82,6 +82,7 @@ public:
|
|||||||
bool _isCheckerBoarded;
|
bool _isCheckerBoarded;
|
||||||
int LocallyPeriodic;
|
int LocallyPeriodic;
|
||||||
Coordinate _checker_dim_mask;
|
Coordinate _checker_dim_mask;
|
||||||
|
int _checker_dim;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
@@ -89,7 +90,7 @@ public:
|
|||||||
// Checkerboarding interface is virtual and overridden by
|
// Checkerboarding interface is virtual and overridden by
|
||||||
// GridCartesian / GridRedBlackCartesian
|
// GridCartesian / GridRedBlackCartesian
|
||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
virtual int CheckerBoarded(int dim)=0;
|
virtual int CheckerBoarded(int dim) =0;
|
||||||
virtual int CheckerBoard(const Coordinate &site)=0;
|
virtual int CheckerBoard(const Coordinate &site)=0;
|
||||||
virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
|
virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
|
||||||
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
|
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ class GridCartesian: public GridBase {
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
int dummy;
|
int dummy;
|
||||||
Coordinate _checker_dim_mask;
|
// Coordinate _checker_dim_mask;
|
||||||
virtual int CheckerBoardFromOindexTable (int Oindex) {
|
virtual int CheckerBoardFromOindexTable (int Oindex) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@@ -46,7 +46,7 @@ public:
|
|||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
virtual int CheckerBoarded(int dim){
|
virtual int CheckerBoarded(int dim) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
virtual int CheckerBoard(const Coordinate &site){
|
virtual int CheckerBoard(const Coordinate &site){
|
||||||
@@ -106,6 +106,7 @@ public:
|
|||||||
_rdimensions.resize(_ndimension);
|
_rdimensions.resize(_ndimension);
|
||||||
_simd_layout.resize(_ndimension);
|
_simd_layout.resize(_ndimension);
|
||||||
_checker_dim_mask.resize(_ndimension);;
|
_checker_dim_mask.resize(_ndimension);;
|
||||||
|
_checker_dim = -1;
|
||||||
_lstart.resize(_ndimension);
|
_lstart.resize(_ndimension);
|
||||||
_lend.resize(_ndimension);
|
_lend.resize(_ndimension);
|
||||||
|
|
||||||
|
|||||||
@@ -57,9 +57,10 @@ class GridRedBlackCartesian : public GridBase
|
|||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
// Coordinate _checker_dim_mask;
|
// Coordinate _checker_dim_mask;
|
||||||
int _checker_dim;
|
// int _checker_dim;
|
||||||
std::vector<int> _checker_board;
|
std::vector<int> _checker_board;
|
||||||
|
|
||||||
|
virtual int isCheckerBoarded(void) const { return 1; };
|
||||||
virtual int CheckerBoarded(int dim){
|
virtual int CheckerBoarded(int dim){
|
||||||
if( dim==_checker_dim) return 1;
|
if( dim==_checker_dim) return 1;
|
||||||
else return 0;
|
else return 0;
|
||||||
@@ -147,7 +148,7 @@ public:
|
|||||||
{
|
{
|
||||||
Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim) ;
|
Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim) ;
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual ~GridRedBlackCartesian() = default;
|
virtual ~GridRedBlackCartesian() = default;
|
||||||
|
|
||||||
void Init(const Coordinate &dimensions,
|
void Init(const Coordinate &dimensions,
|
||||||
|
|||||||
@@ -57,18 +57,29 @@ int CartesianCommunicator::ProcessorCount(void) { return
|
|||||||
// very VERY rarely (Log, serial RNG) we need world without a grid
|
// very VERY rarely (Log, serial RNG) we need world without a grid
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
#ifdef USE_GRID_REDUCTION
|
||||||
|
void CartesianCommunicator::GlobalSum(ComplexF &c)
|
||||||
|
{
|
||||||
|
GlobalSumP2P(c);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSum(ComplexD &c)
|
||||||
|
{
|
||||||
|
GlobalSumP2P(c);
|
||||||
|
}
|
||||||
|
#else
|
||||||
void CartesianCommunicator::GlobalSum(ComplexF &c)
|
void CartesianCommunicator::GlobalSum(ComplexF &c)
|
||||||
{
|
{
|
||||||
GlobalSumVector((float *)&c,2);
|
GlobalSumVector((float *)&c,2);
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
|
|
||||||
{
|
|
||||||
GlobalSumVector((float *)c,2*N);
|
|
||||||
}
|
|
||||||
void CartesianCommunicator::GlobalSum(ComplexD &c)
|
void CartesianCommunicator::GlobalSum(ComplexD &c)
|
||||||
{
|
{
|
||||||
GlobalSumVector((double *)&c,2);
|
GlobalSumVector((double *)&c,2);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
|
||||||
|
{
|
||||||
|
GlobalSumVector((float *)c,2*N);
|
||||||
|
}
|
||||||
void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
|
void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
|
||||||
{
|
{
|
||||||
GlobalSumVector((double *)c,2*N);
|
GlobalSumVector((double *)c,2*N);
|
||||||
|
|||||||
@@ -128,6 +128,34 @@ public:
|
|||||||
void GlobalXOR(uint32_t &);
|
void GlobalXOR(uint32_t &);
|
||||||
void GlobalXOR(uint64_t &);
|
void GlobalXOR(uint64_t &);
|
||||||
|
|
||||||
|
template<class obj> void GlobalSumP2P(obj &o)
|
||||||
|
{
|
||||||
|
std::vector<obj> column;
|
||||||
|
obj accum = o;
|
||||||
|
int source,dest;
|
||||||
|
for(int d=0;d<_ndimension;d++){
|
||||||
|
column.resize(_processors[d]);
|
||||||
|
column[0] = accum;
|
||||||
|
std::vector<CommsRequest_t> list;
|
||||||
|
for(int p=1;p<_processors[d];p++){
|
||||||
|
ShiftedRanks(d,p,source,dest);
|
||||||
|
SendToRecvFromBegin(list,
|
||||||
|
&column[0],
|
||||||
|
dest,
|
||||||
|
&column[p],
|
||||||
|
source,
|
||||||
|
sizeof(obj),d*100+p);
|
||||||
|
|
||||||
|
}
|
||||||
|
CommsComplete(list);
|
||||||
|
for(int p=1;p<_processors[d];p++){
|
||||||
|
accum = accum + column[p];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Broadcast(0,accum);
|
||||||
|
o=accum;
|
||||||
|
}
|
||||||
|
|
||||||
template<class obj> void GlobalSum(obj &o){
|
template<class obj> void GlobalSum(obj &o){
|
||||||
typedef typename obj::scalar_type scalar_type;
|
typedef typename obj::scalar_type scalar_type;
|
||||||
int words = sizeof(obj)/sizeof(scalar_type);
|
int words = sizeof(obj)/sizeof(scalar_type);
|
||||||
|
|||||||
@@ -257,6 +257,25 @@ CartesianCommunicator::~CartesianCommunicator()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#ifdef USE_GRID_REDUCTION
|
||||||
|
void CartesianCommunicator::GlobalSum(float &f){
|
||||||
|
CartesianCommunicator::GlobalSumP2P(f);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSum(double &d)
|
||||||
|
{
|
||||||
|
CartesianCommunicator::GlobalSumP2P(d);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
void CartesianCommunicator::GlobalSum(float &f){
|
||||||
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
void CartesianCommunicator::GlobalSum(double &d)
|
||||||
|
{
|
||||||
|
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
|
||||||
|
assert(ierr==0);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
void CartesianCommunicator::GlobalSum(uint32_t &u){
|
void CartesianCommunicator::GlobalSum(uint32_t &u){
|
||||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
|
||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
@@ -287,20 +306,11 @@ void CartesianCommunicator::GlobalMax(double &d)
|
|||||||
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
|
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
|
||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::GlobalSum(float &f){
|
|
||||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
|
||||||
assert(ierr==0);
|
|
||||||
}
|
|
||||||
void CartesianCommunicator::GlobalSumVector(float *f,int N)
|
void CartesianCommunicator::GlobalSumVector(float *f,int N)
|
||||||
{
|
{
|
||||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
|
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
|
||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
}
|
}
|
||||||
void CartesianCommunicator::GlobalSum(double &d)
|
|
||||||
{
|
|
||||||
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
|
|
||||||
assert(ierr==0);
|
|
||||||
}
|
|
||||||
void CartesianCommunicator::GlobalSumVector(double *d,int N)
|
void CartesianCommunicator::GlobalSumVector(double *d,int N)
|
||||||
{
|
{
|
||||||
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
|
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
|
||||||
|
|||||||
@@ -569,8 +569,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
|
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
|
||||||
typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
|
typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
|
||||||
|
|
||||||
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
|
auto zeDevice = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
|
||||||
auto zeContext = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
|
auto zeContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
|
||||||
|
|
||||||
ze_ipc_mem_handle_t ihandle;
|
ze_ipc_mem_handle_t ihandle;
|
||||||
clone_mem_t handle;
|
clone_mem_t handle;
|
||||||
|
|||||||
@@ -51,7 +51,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
|
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
|
||||||
auto Cshift(const Expression &expr,int dim,int shift) -> decltype(closure(expr))
|
auto Cshift(const Expression &expr,int dim,int shift) -> decltype(closure(expr))
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -30,12 +30,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
extern std::vector<std::pair<int,int> > Cshift_table;
|
extern std::vector<std::pair<int,int> > Cshift_table;
|
||||||
extern commVector<std::pair<int,int> > Cshift_table_device;
|
extern deviceVector<std::pair<int,int> > Cshift_table_device;
|
||||||
|
|
||||||
inline std::pair<int,int> *MapCshiftTable(void)
|
inline std::pair<int,int> *MapCshiftTable(void)
|
||||||
{
|
{
|
||||||
// GPU version
|
// GPU version
|
||||||
#ifdef ACCELERATOR_CSHIFT
|
|
||||||
uint64_t sz=Cshift_table.size();
|
uint64_t sz=Cshift_table.size();
|
||||||
if (Cshift_table_device.size()!=sz ) {
|
if (Cshift_table_device.size()!=sz ) {
|
||||||
Cshift_table_device.resize(sz);
|
Cshift_table_device.resize(sz);
|
||||||
@@ -45,16 +44,13 @@ inline std::pair<int,int> *MapCshiftTable(void)
|
|||||||
sizeof(Cshift_table[0])*sz);
|
sizeof(Cshift_table[0])*sz);
|
||||||
|
|
||||||
return &Cshift_table_device[0];
|
return &Cshift_table_device[0];
|
||||||
#else
|
|
||||||
return &Cshift_table[0];
|
|
||||||
#endif
|
|
||||||
// CPU version use identify map
|
// CPU version use identify map
|
||||||
}
|
}
|
||||||
///////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////
|
||||||
// Gather for when there is no need to SIMD split
|
// Gather for when there is no need to SIMD split
|
||||||
///////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////
|
||||||
template<class vobj> void
|
template<class vobj> void
|
||||||
Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
|
Gather_plane_simple (const Lattice<vobj> &rhs,deviceVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
|
||||||
{
|
{
|
||||||
int rd = rhs.Grid()->_rdimensions[dimension];
|
int rd = rhs.Grid()->_rdimensions[dimension];
|
||||||
|
|
||||||
@@ -94,17 +90,10 @@ Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dim
|
|||||||
{
|
{
|
||||||
auto buffer_p = & buffer[0];
|
auto buffer_p = & buffer[0];
|
||||||
auto table = MapCshiftTable();
|
auto table = MapCshiftTable();
|
||||||
#ifdef ACCELERATOR_CSHIFT
|
|
||||||
autoView(rhs_v , rhs, AcceleratorRead);
|
autoView(rhs_v , rhs, AcceleratorRead);
|
||||||
accelerator_for(i,ent,vobj::Nsimd(),{
|
accelerator_for(i,ent,vobj::Nsimd(),{
|
||||||
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
|
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
|
||||||
});
|
});
|
||||||
#else
|
|
||||||
autoView(rhs_v , rhs, CpuRead);
|
|
||||||
thread_for(i,ent,{
|
|
||||||
buffer_p[table[i].first]=rhs_v[table[i].second];
|
|
||||||
});
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -129,7 +118,6 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
|
|||||||
int n1=rhs.Grid()->_slice_stride[dimension];
|
int n1=rhs.Grid()->_slice_stride[dimension];
|
||||||
|
|
||||||
if ( cbmask ==0x3){
|
if ( cbmask ==0x3){
|
||||||
#ifdef ACCELERATOR_CSHIFT
|
|
||||||
autoView(rhs_v , rhs, AcceleratorRead);
|
autoView(rhs_v , rhs, AcceleratorRead);
|
||||||
accelerator_for(nn,e1*e2,1,{
|
accelerator_for(nn,e1*e2,1,{
|
||||||
int n = nn%e1;
|
int n = nn%e1;
|
||||||
@@ -140,21 +128,10 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
|
|||||||
vobj temp =rhs_v[so+o+b];
|
vobj temp =rhs_v[so+o+b];
|
||||||
extract<vobj>(temp,pointers,offset);
|
extract<vobj>(temp,pointers,offset);
|
||||||
});
|
});
|
||||||
#else
|
|
||||||
autoView(rhs_v , rhs, CpuRead);
|
|
||||||
thread_for2d(n,e1,b,e2,{
|
|
||||||
int o = n*n1;
|
|
||||||
int offset = b+n*e2;
|
|
||||||
|
|
||||||
vobj temp =rhs_v[so+o+b];
|
|
||||||
extract<vobj>(temp,pointers,offset);
|
|
||||||
});
|
|
||||||
#endif
|
|
||||||
} else {
|
} else {
|
||||||
Coordinate rdim=rhs.Grid()->_rdimensions;
|
Coordinate rdim=rhs.Grid()->_rdimensions;
|
||||||
Coordinate cdm =rhs.Grid()->_checker_dim_mask;
|
Coordinate cdm =rhs.Grid()->_checker_dim_mask;
|
||||||
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
|
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
|
||||||
#ifdef ACCELERATOR_CSHIFT
|
|
||||||
autoView(rhs_v , rhs, AcceleratorRead);
|
autoView(rhs_v , rhs, AcceleratorRead);
|
||||||
accelerator_for(nn,e1*e2,1,{
|
accelerator_for(nn,e1*e2,1,{
|
||||||
int n = nn%e1;
|
int n = nn%e1;
|
||||||
@@ -175,33 +152,13 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
|
|||||||
extract<vobj>(temp,pointers,offset);
|
extract<vobj>(temp,pointers,offset);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
#else
|
|
||||||
autoView(rhs_v , rhs, CpuRead);
|
|
||||||
thread_for2d(n,e1,b,e2,{
|
|
||||||
|
|
||||||
Coordinate coor;
|
|
||||||
|
|
||||||
int o=n*n1;
|
|
||||||
int oindex = o+b;
|
|
||||||
|
|
||||||
int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
|
|
||||||
|
|
||||||
int ocb=1<<cb;
|
|
||||||
int offset = b+n*e2;
|
|
||||||
|
|
||||||
if ( ocb & cbmask ) {
|
|
||||||
vobj temp =rhs_v[so+o+b];
|
|
||||||
extract<vobj>(temp,pointers,offset);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
// Scatter for when there is no need to SIMD split
|
// Scatter for when there is no need to SIMD split
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask)
|
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<vobj> &buffer, int dimension,int plane,int cbmask)
|
||||||
{
|
{
|
||||||
int rd = rhs.Grid()->_rdimensions[dimension];
|
int rd = rhs.Grid()->_rdimensions[dimension];
|
||||||
|
|
||||||
@@ -245,17 +202,10 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<
|
|||||||
{
|
{
|
||||||
auto buffer_p = & buffer[0];
|
auto buffer_p = & buffer[0];
|
||||||
auto table = MapCshiftTable();
|
auto table = MapCshiftTable();
|
||||||
#ifdef ACCELERATOR_CSHIFT
|
|
||||||
autoView( rhs_v, rhs, AcceleratorWrite);
|
autoView( rhs_v, rhs, AcceleratorWrite);
|
||||||
accelerator_for(i,ent,vobj::Nsimd(),{
|
accelerator_for(i,ent,vobj::Nsimd(),{
|
||||||
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
|
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
|
||||||
});
|
});
|
||||||
#else
|
|
||||||
autoView( rhs_v, rhs, CpuWrite);
|
|
||||||
thread_for(i,ent,{
|
|
||||||
rhs_v[table[i].first]=buffer_p[table[i].second];
|
|
||||||
});
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -278,7 +228,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
|
|||||||
if(cbmask ==0x3 ) {
|
if(cbmask ==0x3 ) {
|
||||||
int _slice_stride = rhs.Grid()->_slice_stride[dimension];
|
int _slice_stride = rhs.Grid()->_slice_stride[dimension];
|
||||||
int _slice_block = rhs.Grid()->_slice_block[dimension];
|
int _slice_block = rhs.Grid()->_slice_block[dimension];
|
||||||
#ifdef ACCELERATOR_CSHIFT
|
|
||||||
autoView( rhs_v , rhs, AcceleratorWrite);
|
autoView( rhs_v , rhs, AcceleratorWrite);
|
||||||
accelerator_for(nn,e1*e2,1,{
|
accelerator_for(nn,e1*e2,1,{
|
||||||
int n = nn%e1;
|
int n = nn%e1;
|
||||||
@@ -287,14 +236,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
|
|||||||
int offset = b+n*_slice_block;
|
int offset = b+n*_slice_block;
|
||||||
merge(rhs_v[so+o+b],pointers,offset);
|
merge(rhs_v[so+o+b],pointers,offset);
|
||||||
});
|
});
|
||||||
#else
|
|
||||||
autoView( rhs_v , rhs, CpuWrite);
|
|
||||||
thread_for2d(n,e1,b,e2,{
|
|
||||||
int o = n*_slice_stride;
|
|
||||||
int offset = b+n*_slice_block;
|
|
||||||
merge(rhs_v[so+o+b],pointers,offset);
|
|
||||||
});
|
|
||||||
#endif
|
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
// Case of SIMD split AND checker dim cannot currently be hit, except in
|
// Case of SIMD split AND checker dim cannot currently be hit, except in
|
||||||
@@ -360,19 +301,11 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
|
|||||||
|
|
||||||
{
|
{
|
||||||
auto table = MapCshiftTable();
|
auto table = MapCshiftTable();
|
||||||
#ifdef ACCELERATOR_CSHIFT
|
|
||||||
autoView(rhs_v , rhs, AcceleratorRead);
|
autoView(rhs_v , rhs, AcceleratorRead);
|
||||||
autoView(lhs_v , lhs, AcceleratorWrite);
|
autoView(lhs_v , lhs, AcceleratorWrite);
|
||||||
accelerator_for(i,ent,vobj::Nsimd(),{
|
accelerator_for(i,ent,vobj::Nsimd(),{
|
||||||
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
|
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
|
||||||
});
|
});
|
||||||
#else
|
|
||||||
autoView(rhs_v , rhs, CpuRead);
|
|
||||||
autoView(lhs_v , lhs, CpuWrite);
|
|
||||||
thread_for(i,ent,{
|
|
||||||
lhs_v[table[i].first]=rhs_v[table[i].second];
|
|
||||||
});
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -412,19 +345,11 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
|
|||||||
|
|
||||||
{
|
{
|
||||||
auto table = MapCshiftTable();
|
auto table = MapCshiftTable();
|
||||||
#ifdef ACCELERATOR_CSHIFT
|
|
||||||
autoView( rhs_v, rhs, AcceleratorRead);
|
autoView( rhs_v, rhs, AcceleratorRead);
|
||||||
autoView( lhs_v, lhs, AcceleratorWrite);
|
autoView( lhs_v, lhs, AcceleratorWrite);
|
||||||
accelerator_for(i,ent,1,{
|
accelerator_for(i,ent,1,{
|
||||||
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
|
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
|
||||||
});
|
});
|
||||||
#else
|
|
||||||
autoView( rhs_v, rhs, CpuRead);
|
|
||||||
autoView( lhs_v, lhs, CpuWrite);
|
|
||||||
thread_for(i,ent,{
|
|
||||||
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
|
|
||||||
});
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -55,13 +55,13 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
|
|||||||
RealD t1,t0;
|
RealD t1,t0;
|
||||||
t0=usecond();
|
t0=usecond();
|
||||||
if ( !comm_dim ) {
|
if ( !comm_dim ) {
|
||||||
//std::cout << "CSHIFT: Cshift_local" <<std::endl;
|
// std::cout << "CSHIFT: Cshift_local" <<std::endl;
|
||||||
Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
|
Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
|
||||||
} else if ( splice_dim ) {
|
} else if ( splice_dim ) {
|
||||||
//std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
|
// std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
|
||||||
Cshift_comms_simd(ret,rhs,dimension,shift);
|
Cshift_comms_simd(ret,rhs,dimension,shift);
|
||||||
} else {
|
} else {
|
||||||
//std::cout << "CSHIFT: Cshift_comms" <<std::endl;
|
// std::cout << "CSHIFT: Cshift_comms" <<std::endl;
|
||||||
Cshift_comms(ret,rhs,dimension,shift);
|
Cshift_comms(ret,rhs,dimension,shift);
|
||||||
}
|
}
|
||||||
t1=usecond();
|
t1=usecond();
|
||||||
@@ -94,18 +94,16 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
|
|||||||
sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
|
sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
|
||||||
sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
|
sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
|
||||||
|
|
||||||
//std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
|
// std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
|
||||||
if ( sshift[0] == sshift[1] ) {
|
if ( sshift[0] == sshift[1] ) {
|
||||||
//std::cout << "Single pass Cshift_comms" <<std::endl;
|
// std::cout << "Single pass Cshift_comms" <<std::endl;
|
||||||
Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
|
Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
|
||||||
} else {
|
} else {
|
||||||
//std::cout << "Two pass Cshift_comms" <<std::endl;
|
// std::cout << "Two pass Cshift_comms" <<std::endl;
|
||||||
Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
||||||
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
|
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#define ACCELERATOR_CSHIFT_NO_COPY
|
|
||||||
#ifdef ACCELERATOR_CSHIFT_NO_COPY
|
|
||||||
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||||
{
|
{
|
||||||
typedef typename vobj::vector_type vector_type;
|
typedef typename vobj::vector_type vector_type;
|
||||||
@@ -125,8 +123,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
|||||||
assert(shift<fd);
|
assert(shift<fd);
|
||||||
|
|
||||||
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
|
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
|
||||||
static cshiftVector<vobj> send_buf; send_buf.resize(buffer_size);
|
static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
|
||||||
static cshiftVector<vobj> recv_buf; recv_buf.resize(buffer_size);
|
static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
|
||||||
|
|
||||||
int cb= (cbmask==0x2)? Odd : Even;
|
int cb= (cbmask==0x2)? Odd : Even;
|
||||||
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
||||||
@@ -161,7 +159,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
|||||||
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
tcomms-=usecond();
|
tcomms-=usecond();
|
||||||
// grid->Barrier();
|
grid->Barrier();
|
||||||
|
|
||||||
grid->SendToRecvFrom((void *)&send_buf[0],
|
grid->SendToRecvFrom((void *)&send_buf[0],
|
||||||
xmit_to_rank,
|
xmit_to_rank,
|
||||||
@@ -169,7 +167,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
|||||||
recv_from_rank,
|
recv_from_rank,
|
||||||
bytes);
|
bytes);
|
||||||
xbytes+=bytes;
|
xbytes+=bytes;
|
||||||
// grid->Barrier();
|
grid->Barrier();
|
||||||
tcomms+=usecond();
|
tcomms+=usecond();
|
||||||
|
|
||||||
tscatter-=usecond();
|
tscatter-=usecond();
|
||||||
@@ -177,13 +175,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
|||||||
tscatter+=usecond();
|
tscatter+=usecond();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/*
|
|
||||||
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
|
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||||
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
|
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||||
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||||
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
|
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
||||||
*/
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||||
@@ -201,9 +197,9 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
int simd_layout = grid->_simd_layout[dimension];
|
int simd_layout = grid->_simd_layout[dimension];
|
||||||
int comm_dim = grid->_processors[dimension] >1 ;
|
int comm_dim = grid->_processors[dimension] >1 ;
|
||||||
|
|
||||||
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
|
// std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
|
||||||
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
|
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
|
||||||
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
|
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
|
||||||
|
|
||||||
assert(comm_dim==1);
|
assert(comm_dim==1);
|
||||||
assert(simd_layout==2);
|
assert(simd_layout==2);
|
||||||
@@ -224,8 +220,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
|
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
|
||||||
// int words = sizeof(vobj)/sizeof(vector_type);
|
// int words = sizeof(vobj)/sizeof(vector_type);
|
||||||
|
|
||||||
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
|
static std::vector<deviceVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
|
||||||
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
|
static std::vector<deviceVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
|
||||||
scalar_object * recv_buf_extract_mpi;
|
scalar_object * recv_buf_extract_mpi;
|
||||||
scalar_object * send_buf_extract_mpi;
|
scalar_object * send_buf_extract_mpi;
|
||||||
|
|
||||||
@@ -281,7 +277,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
tcomms-=usecond();
|
tcomms-=usecond();
|
||||||
// grid->Barrier();
|
grid->Barrier();
|
||||||
|
|
||||||
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
|
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
|
||||||
recv_buf_extract_mpi = &recv_buf_extract[i][0];
|
recv_buf_extract_mpi = &recv_buf_extract[i][0];
|
||||||
@@ -292,7 +288,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
bytes);
|
bytes);
|
||||||
|
|
||||||
xbytes+=bytes;
|
xbytes+=bytes;
|
||||||
// grid->Barrier();
|
grid->Barrier();
|
||||||
tcomms+=usecond();
|
tcomms+=usecond();
|
||||||
|
|
||||||
rpointers[i] = &recv_buf_extract[i][0];
|
rpointers[i] = &recv_buf_extract[i][0];
|
||||||
@@ -305,242 +301,12 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
|
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
|
||||||
tscatter+=usecond();
|
tscatter+=usecond();
|
||||||
}
|
}
|
||||||
/*
|
|
||||||
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
|
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||||
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
|
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||||
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||||
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
|
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
||||||
*/
|
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
|
||||||
{
|
|
||||||
typedef typename vobj::vector_type vector_type;
|
|
||||||
typedef typename vobj::scalar_type scalar_type;
|
|
||||||
|
|
||||||
GridBase *grid=rhs.Grid();
|
|
||||||
Lattice<vobj> temp(rhs.Grid());
|
|
||||||
|
|
||||||
int fd = rhs.Grid()->_fdimensions[dimension];
|
|
||||||
int rd = rhs.Grid()->_rdimensions[dimension];
|
|
||||||
int pd = rhs.Grid()->_processors[dimension];
|
|
||||||
int simd_layout = rhs.Grid()->_simd_layout[dimension];
|
|
||||||
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
|
|
||||||
assert(simd_layout==1);
|
|
||||||
assert(comm_dim==1);
|
|
||||||
assert(shift>=0);
|
|
||||||
assert(shift<fd);
|
|
||||||
RealD tcopy=0.0;
|
|
||||||
RealD tgather=0.0;
|
|
||||||
RealD tscatter=0.0;
|
|
||||||
RealD tcomms=0.0;
|
|
||||||
uint64_t xbytes=0;
|
|
||||||
|
|
||||||
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
|
|
||||||
static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
|
|
||||||
static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
|
|
||||||
vobj *send_buf;
|
|
||||||
vobj *recv_buf;
|
|
||||||
{
|
|
||||||
grid->ShmBufferFreeAll();
|
|
||||||
size_t bytes = buffer_size*sizeof(vobj);
|
|
||||||
send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
|
|
||||||
recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
|
|
||||||
}
|
|
||||||
|
|
||||||
int cb= (cbmask==0x2)? Odd : Even;
|
|
||||||
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
|
||||||
|
|
||||||
for(int x=0;x<rd;x++){
|
|
||||||
|
|
||||||
int sx = (x+sshift)%rd;
|
|
||||||
int comm_proc = ((x+sshift)/rd)%pd;
|
|
||||||
|
|
||||||
if (comm_proc==0) {
|
|
||||||
|
|
||||||
tcopy-=usecond();
|
|
||||||
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
|
|
||||||
tcopy+=usecond();
|
|
||||||
|
|
||||||
} else {
|
|
||||||
|
|
||||||
int words = buffer_size;
|
|
||||||
if (cbmask != 0x3) words=words>>1;
|
|
||||||
|
|
||||||
int bytes = words * sizeof(vobj);
|
|
||||||
|
|
||||||
tgather-=usecond();
|
|
||||||
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
|
|
||||||
tgather+=usecond();
|
|
||||||
|
|
||||||
// int rank = grid->_processor;
|
|
||||||
int recv_from_rank;
|
|
||||||
int xmit_to_rank;
|
|
||||||
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
|
||||||
|
|
||||||
|
|
||||||
tcomms-=usecond();
|
|
||||||
// grid->Barrier();
|
|
||||||
|
|
||||||
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
|
|
||||||
grid->SendToRecvFrom((void *)&send_buf[0],
|
|
||||||
xmit_to_rank,
|
|
||||||
(void *)&recv_buf[0],
|
|
||||||
recv_from_rank,
|
|
||||||
bytes);
|
|
||||||
xbytes+=bytes;
|
|
||||||
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
|
|
||||||
|
|
||||||
// grid->Barrier();
|
|
||||||
tcomms+=usecond();
|
|
||||||
|
|
||||||
tscatter-=usecond();
|
|
||||||
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
|
|
||||||
tscatter+=usecond();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/*
|
|
||||||
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
|
|
||||||
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
|
|
||||||
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
|
||||||
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
|
|
||||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
|
||||||
*/
|
|
||||||
}
|
|
||||||
|
|
||||||
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
|
||||||
{
|
|
||||||
GridBase *grid=rhs.Grid();
|
|
||||||
const int Nsimd = grid->Nsimd();
|
|
||||||
typedef typename vobj::vector_type vector_type;
|
|
||||||
typedef typename vobj::scalar_object scalar_object;
|
|
||||||
typedef typename vobj::scalar_type scalar_type;
|
|
||||||
|
|
||||||
int fd = grid->_fdimensions[dimension];
|
|
||||||
int rd = grid->_rdimensions[dimension];
|
|
||||||
int ld = grid->_ldimensions[dimension];
|
|
||||||
int pd = grid->_processors[dimension];
|
|
||||||
int simd_layout = grid->_simd_layout[dimension];
|
|
||||||
int comm_dim = grid->_processors[dimension] >1 ;
|
|
||||||
|
|
||||||
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
|
|
||||||
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
|
|
||||||
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
|
|
||||||
|
|
||||||
assert(comm_dim==1);
|
|
||||||
assert(simd_layout==2);
|
|
||||||
assert(shift>=0);
|
|
||||||
assert(shift<fd);
|
|
||||||
RealD tcopy=0.0;
|
|
||||||
RealD tgather=0.0;
|
|
||||||
RealD tscatter=0.0;
|
|
||||||
RealD tcomms=0.0;
|
|
||||||
uint64_t xbytes=0;
|
|
||||||
|
|
||||||
int permute_type=grid->PermuteType(dimension);
|
|
||||||
|
|
||||||
///////////////////////////////////////////////
|
|
||||||
// Simd direction uses an extract/merge pair
|
|
||||||
///////////////////////////////////////////////
|
|
||||||
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
|
|
||||||
// int words = sizeof(vobj)/sizeof(vector_type);
|
|
||||||
|
|
||||||
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
|
|
||||||
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
|
|
||||||
scalar_object * recv_buf_extract_mpi;
|
|
||||||
scalar_object * send_buf_extract_mpi;
|
|
||||||
{
|
|
||||||
size_t bytes = sizeof(scalar_object)*buffer_size;
|
|
||||||
grid->ShmBufferFreeAll();
|
|
||||||
send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
|
|
||||||
recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
|
|
||||||
}
|
|
||||||
for(int s=0;s<Nsimd;s++){
|
|
||||||
send_buf_extract[s].resize(buffer_size);
|
|
||||||
recv_buf_extract[s].resize(buffer_size);
|
|
||||||
}
|
|
||||||
|
|
||||||
int bytes = buffer_size*sizeof(scalar_object);
|
|
||||||
|
|
||||||
ExtractPointerArray<scalar_object> pointers(Nsimd); //
|
|
||||||
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
|
|
||||||
|
|
||||||
///////////////////////////////////////////
|
|
||||||
// Work out what to send where
|
|
||||||
///////////////////////////////////////////
|
|
||||||
int cb = (cbmask==0x2)? Odd : Even;
|
|
||||||
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
|
||||||
|
|
||||||
// loop over outer coord planes orthog to dim
|
|
||||||
for(int x=0;x<rd;x++){
|
|
||||||
|
|
||||||
// FIXME call local permute copy if none are offnode.
|
|
||||||
for(int i=0;i<Nsimd;i++){
|
|
||||||
pointers[i] = &send_buf_extract[i][0];
|
|
||||||
}
|
|
||||||
tgather-=usecond();
|
|
||||||
int sx = (x+sshift)%rd;
|
|
||||||
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
|
|
||||||
tgather+=usecond();
|
|
||||||
|
|
||||||
for(int i=0;i<Nsimd;i++){
|
|
||||||
|
|
||||||
int inner_bit = (Nsimd>>(permute_type+1));
|
|
||||||
int ic= (i&inner_bit)? 1:0;
|
|
||||||
|
|
||||||
int my_coor = rd*ic + x;
|
|
||||||
int nbr_coor = my_coor+sshift;
|
|
||||||
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
|
|
||||||
|
|
||||||
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
|
|
||||||
int nbr_ox = (nbr_coor%rd); // outer coord of peer
|
|
||||||
int nbr_lane = (i&(~inner_bit));
|
|
||||||
|
|
||||||
int recv_from_rank;
|
|
||||||
int xmit_to_rank;
|
|
||||||
|
|
||||||
if (nbr_ic) nbr_lane|=inner_bit;
|
|
||||||
|
|
||||||
assert (sx == nbr_ox);
|
|
||||||
|
|
||||||
if(nbr_proc){
|
|
||||||
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
|
||||||
|
|
||||||
tcomms-=usecond();
|
|
||||||
// grid->Barrier();
|
|
||||||
|
|
||||||
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
|
|
||||||
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
|
|
||||||
xmit_to_rank,
|
|
||||||
(void *)recv_buf_extract_mpi,
|
|
||||||
recv_from_rank,
|
|
||||||
bytes);
|
|
||||||
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
|
|
||||||
xbytes+=bytes;
|
|
||||||
|
|
||||||
// grid->Barrier();
|
|
||||||
tcomms+=usecond();
|
|
||||||
rpointers[i] = &recv_buf_extract[i][0];
|
|
||||||
} else {
|
|
||||||
rpointers[i] = &send_buf_extract[nbr_lane][0];
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
tscatter-=usecond();
|
|
||||||
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
|
|
||||||
tscatter+=usecond();
|
|
||||||
|
|
||||||
}
|
|
||||||
/*
|
|
||||||
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
|
|
||||||
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
|
|
||||||
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
|
||||||
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
|
|
||||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
|
|
||||||
*/
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
#include <Grid/GridCore.h>
|
#include <Grid/GridCore.h>
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
std::vector<std::pair<int,int> > Cshift_table;
|
std::vector<std::pair<int,int> > Cshift_table;
|
||||||
commVector<std::pair<int,int> > Cshift_table_device;
|
deviceVector<std::pair<int,int> > Cshift_table_device;
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|||||||
@@ -257,17 +257,30 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define FAST_AXPY_NORM
|
||||||
template<class sobj,class vobj> inline
|
template<class sobj,class vobj> inline
|
||||||
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
|
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
|
||||||
{
|
{
|
||||||
GRID_TRACE("axpy_norm");
|
GRID_TRACE("axpy_norm");
|
||||||
return axpy_norm_fast(ret,a,x,y);
|
#ifdef FAST_AXPY_NORM
|
||||||
|
return axpy_norm_fast(ret,a,x,y);
|
||||||
|
#else
|
||||||
|
ret = a*x+y;
|
||||||
|
RealD nn=norm2(ret);
|
||||||
|
return nn;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
template<class sobj,class vobj> inline
|
template<class sobj,class vobj> inline
|
||||||
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
|
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
|
||||||
{
|
{
|
||||||
GRID_TRACE("axpby_norm");
|
GRID_TRACE("axpby_norm");
|
||||||
return axpby_norm_fast(ret,a,b,x,y);
|
#ifdef FAST_AXPY_NORM
|
||||||
|
return axpby_norm_fast(ret,a,b,x,y);
|
||||||
|
#else
|
||||||
|
ret = a*x+b*y;
|
||||||
|
RealD nn=norm2(ret);
|
||||||
|
return nn;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Trace product
|
/// Trace product
|
||||||
|
|||||||
@@ -236,17 +236,20 @@ public:
|
|||||||
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
|
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
|
||||||
vobj vtmp;
|
vobj vtmp;
|
||||||
vtmp = r;
|
vtmp = r;
|
||||||
#if 1
|
#if 0
|
||||||
|
deviceVector<vobj> vvtmp(1);
|
||||||
|
acceleratorPut(vvtmp[0],vtmp);
|
||||||
|
vobj *vvtmp_p = & vvtmp[0];
|
||||||
|
auto me = View(AcceleratorWrite);
|
||||||
|
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
||||||
|
auto stmp=coalescedRead(*vvtmp_p);
|
||||||
|
coalescedWrite(me[ss],stmp);
|
||||||
|
});
|
||||||
|
#else
|
||||||
auto me = View(CpuWrite);
|
auto me = View(CpuWrite);
|
||||||
thread_for(ss,me.size(),{
|
thread_for(ss,me.size(),{
|
||||||
me[ss]= r;
|
me[ss]= r;
|
||||||
});
|
});
|
||||||
#else
|
|
||||||
auto me = View(AcceleratorWrite);
|
|
||||||
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
|
||||||
auto stmp=coalescedRead(vtmp);
|
|
||||||
coalescedWrite(me[ss],stmp);
|
|
||||||
});
|
|
||||||
#endif
|
#endif
|
||||||
me.ViewClose();
|
me.ViewClose();
|
||||||
return *this;
|
return *this;
|
||||||
|
|||||||
@@ -53,36 +53,19 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
|
|||||||
typedef decltype(basis[0]) Field;
|
typedef decltype(basis[0]) Field;
|
||||||
typedef decltype(basis[0].View(AcceleratorRead)) View;
|
typedef decltype(basis[0].View(AcceleratorRead)) View;
|
||||||
|
|
||||||
Vector<View> basis_v; basis_v.reserve(basis.size());
|
hostVector<View> h_basis_v(basis.size());
|
||||||
typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
|
deviceVector<View> d_basis_v(basis.size());
|
||||||
|
typedef typename std::remove_reference<decltype(h_basis_v[0][0])>::type vobj;
|
||||||
typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
|
typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
|
||||||
|
|
||||||
GridBase* grid = basis[0].Grid();
|
GridBase* grid = basis[0].Grid();
|
||||||
|
|
||||||
for(int k=0;k<basis.size();k++){
|
for(int k=0;k<basis.size();k++){
|
||||||
basis_v.push_back(basis[k].View(AcceleratorWrite));
|
h_basis_v[k] = basis[k].View(AcceleratorWrite);
|
||||||
|
acceleratorPut(d_basis_v[k],h_basis_v[k]);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) )
|
View *basis_vp = &d_basis_v[0];
|
||||||
int max_threads = thread_max();
|
|
||||||
Vector < vobj > Bt(Nm * max_threads);
|
|
||||||
thread_region
|
|
||||||
{
|
|
||||||
vobj* B = &Bt[Nm * thread_num()];
|
|
||||||
thread_for_in_region(ss, grid->oSites(),{
|
|
||||||
for(int j=j0; j<j1; ++j) B[j]=0.;
|
|
||||||
|
|
||||||
for(int j=j0; j<j1; ++j){
|
|
||||||
for(int k=k0; k<k1; ++k){
|
|
||||||
B[j] +=Qt(j,k) * basis_v[k][ss];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for(int j=j0; j<j1; ++j){
|
|
||||||
basis_v[j][ss] = B[j];
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
View *basis_vp = &basis_v[0];
|
|
||||||
|
|
||||||
int nrot = j1-j0;
|
int nrot = j1-j0;
|
||||||
if (!nrot) // edge case not handled gracefully by Cuda
|
if (!nrot) // edge case not handled gracefully by Cuda
|
||||||
@@ -91,17 +74,19 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
|
|||||||
uint64_t oSites =grid->oSites();
|
uint64_t oSites =grid->oSites();
|
||||||
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
|
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
|
||||||
|
|
||||||
Vector <vobj> Bt(siteBlock * nrot);
|
deviceVector <vobj> Bt(siteBlock * nrot);
|
||||||
auto Bp=&Bt[0];
|
auto Bp=&Bt[0];
|
||||||
|
|
||||||
// GPU readable copy of matrix
|
// GPU readable copy of matrix
|
||||||
Vector<Coeff_t> Qt_jv(Nm*Nm);
|
hostVector<Coeff_t> h_Qt_jv(Nm*Nm);
|
||||||
|
deviceVector<Coeff_t> Qt_jv(Nm*Nm);
|
||||||
Coeff_t *Qt_p = & Qt_jv[0];
|
Coeff_t *Qt_p = & Qt_jv[0];
|
||||||
thread_for(i,Nm*Nm,{
|
thread_for(i,Nm*Nm,{
|
||||||
int j = i/Nm;
|
int j = i/Nm;
|
||||||
int k = i%Nm;
|
int k = i%Nm;
|
||||||
Qt_p[i]=Qt(j,k);
|
h_Qt_jv[i]=Qt(j,k);
|
||||||
});
|
});
|
||||||
|
acceleratorCopyToDevice(&h_Qt_jv[0],Qt_p,Nm*Nm*sizeof(Coeff_t));
|
||||||
|
|
||||||
// Block the loop to keep storage footprint down
|
// Block the loop to keep storage footprint down
|
||||||
for(uint64_t s=0;s<oSites;s+=siteBlock){
|
for(uint64_t s=0;s<oSites;s+=siteBlock){
|
||||||
@@ -137,9 +122,8 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
|
|||||||
coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
|
coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
|
for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Extract a single rotated vector
|
// Extract a single rotated vector
|
||||||
@@ -152,16 +136,19 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
|
|||||||
|
|
||||||
result.Checkerboard() = basis[0].Checkerboard();
|
result.Checkerboard() = basis[0].Checkerboard();
|
||||||
|
|
||||||
Vector<View> basis_v; basis_v.reserve(basis.size());
|
hostVector<View> h_basis_v(basis.size());
|
||||||
|
deviceVector<View> d_basis_v(basis.size());
|
||||||
for(int k=0;k<basis.size();k++){
|
for(int k=0;k<basis.size();k++){
|
||||||
basis_v.push_back(basis[k].View(AcceleratorRead));
|
h_basis_v[k]=basis[k].View(AcceleratorRead);
|
||||||
|
acceleratorPut(d_basis_v[k],h_basis_v[k]);
|
||||||
}
|
}
|
||||||
vobj zz=Zero();
|
|
||||||
Vector<double> Qt_jv(Nm);
|
|
||||||
double * Qt_j = & Qt_jv[0];
|
|
||||||
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
|
|
||||||
|
|
||||||
auto basis_vp=& basis_v[0];
|
vobj zz=Zero();
|
||||||
|
deviceVector<double> Qt_jv(Nm);
|
||||||
|
double * Qt_j = & Qt_jv[0];
|
||||||
|
for(int k=0;k<Nm;++k) acceleratorPut(Qt_j[k],Qt(j,k));
|
||||||
|
|
||||||
|
auto basis_vp=& d_basis_v[0];
|
||||||
autoView(result_v,result,AcceleratorWrite);
|
autoView(result_v,result,AcceleratorWrite);
|
||||||
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
|
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
|
||||||
vobj zzz=Zero();
|
vobj zzz=Zero();
|
||||||
@@ -171,7 +158,7 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
|
|||||||
}
|
}
|
||||||
coalescedWrite(result_v[ss], B);
|
coalescedWrite(result_v[ss], B);
|
||||||
});
|
});
|
||||||
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
|
for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Field>
|
template<class Field>
|
||||||
|
|||||||
@@ -165,7 +165,7 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
|
|||||||
|
|
||||||
int Nsimd = grid->Nsimd();
|
int Nsimd = grid->Nsimd();
|
||||||
|
|
||||||
assert( l.Checkerboard()== grid->CheckerBoard(site));
|
// assert( l.Checkerboard()== grid->CheckerBoard(site));
|
||||||
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
|
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
|
||||||
|
|
||||||
static const int words=sizeof(vobj)/sizeof(vector_type);
|
static const int words=sizeof(vobj)/sizeof(vector_type);
|
||||||
@@ -179,7 +179,7 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
|
|||||||
for(int w=0;w<words;w++){
|
for(int w=0;w<words;w++){
|
||||||
pt[w] = getlane(vp[w],idx);
|
pt[w] = getlane(vp[w],idx);
|
||||||
}
|
}
|
||||||
|
// std::cout << "peekLocalSite "<<site<<" "<<odx<<","<<idx<<" "<<s<<std::endl;
|
||||||
return;
|
return;
|
||||||
};
|
};
|
||||||
template<class vobj,class sobj>
|
template<class vobj,class sobj>
|
||||||
@@ -202,7 +202,7 @@ inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
|
|||||||
|
|
||||||
int Nsimd = grid->Nsimd();
|
int Nsimd = grid->Nsimd();
|
||||||
|
|
||||||
assert( l.Checkerboard()== grid->CheckerBoard(site));
|
// assert( l.Checkerboard()== grid->CheckerBoard(site));
|
||||||
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
|
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
|
||||||
|
|
||||||
static const int words=sizeof(vobj)/sizeof(vector_type);
|
static const int words=sizeof(vobj)/sizeof(vector_type);
|
||||||
|
|||||||
@@ -46,7 +46,7 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
|
|||||||
// const int Nsimd = vobj::Nsimd();
|
// const int Nsimd = vobj::Nsimd();
|
||||||
const int nthread = GridThread::GetThreads();
|
const int nthread = GridThread::GetThreads();
|
||||||
|
|
||||||
Vector<sobj> sumarray(nthread);
|
std::vector<sobj> sumarray(nthread);
|
||||||
for(int i=0;i<nthread;i++){
|
for(int i=0;i<nthread;i++){
|
||||||
sumarray[i]=Zero();
|
sumarray[i]=Zero();
|
||||||
}
|
}
|
||||||
@@ -75,7 +75,7 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
|
|||||||
|
|
||||||
const int nthread = GridThread::GetThreads();
|
const int nthread = GridThread::GetThreads();
|
||||||
|
|
||||||
Vector<sobj> sumarray(nthread);
|
std::vector<sobj> sumarray(nthread);
|
||||||
for(int i=0;i<nthread;i++){
|
for(int i=0;i<nthread;i++){
|
||||||
sumarray[i]=Zero();
|
sumarray[i]=Zero();
|
||||||
}
|
}
|
||||||
@@ -290,8 +290,10 @@ template<class vobj>
|
|||||||
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
|
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
|
||||||
GridBase *grid = left.Grid();
|
GridBase *grid = left.Grid();
|
||||||
|
|
||||||
|
bool ok;
|
||||||
#ifdef GRID_SYCL
|
#ifdef GRID_SYCL
|
||||||
uint64_t csum=0;
|
uint64_t csum=0;
|
||||||
|
uint64_t csum2=0;
|
||||||
if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
|
if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
|
||||||
{
|
{
|
||||||
// Hack
|
// Hack
|
||||||
@@ -300,13 +302,33 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
|
|||||||
Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
|
Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
|
||||||
uint64_t *base= (uint64_t *)&l_v[0];
|
uint64_t *base= (uint64_t *)&l_v[0];
|
||||||
csum=svm_xor(base,words);
|
csum=svm_xor(base,words);
|
||||||
|
ok = FlightRecorder::CsumLog(csum);
|
||||||
|
if ( !ok ) {
|
||||||
|
csum2=svm_xor(base,words);
|
||||||
|
std::cerr<< " Bad CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
|
||||||
|
} else {
|
||||||
|
// csum2=svm_xor(base,words);
|
||||||
|
// std::cerr<< " ok CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
|
||||||
|
}
|
||||||
|
assert(ok);
|
||||||
}
|
}
|
||||||
FlightRecorder::CsumLog(csum);
|
|
||||||
#endif
|
#endif
|
||||||
|
FlightRecorder::StepLog("rank inner product");
|
||||||
ComplexD nrm = rankInnerProduct(left,right);
|
ComplexD nrm = rankInnerProduct(left,right);
|
||||||
|
// ComplexD nrmck=nrm;
|
||||||
RealD local = real(nrm);
|
RealD local = real(nrm);
|
||||||
FlightRecorder::NormLog(real(nrm));
|
ok = FlightRecorder::NormLog(real(nrm));
|
||||||
|
if ( !ok ) {
|
||||||
|
ComplexD nrm2 = rankInnerProduct(left,right);
|
||||||
|
RealD local2 = real(nrm2);
|
||||||
|
std::cerr<< " Bad NORM " << local << " recomputed as "<<local2<<std::endl;
|
||||||
|
assert(ok);
|
||||||
|
}
|
||||||
|
FlightRecorder::StepLog("Start global sum");
|
||||||
|
// grid->GlobalSumP2P(nrm);
|
||||||
grid->GlobalSum(nrm);
|
grid->GlobalSum(nrm);
|
||||||
|
FlightRecorder::StepLog("Finished global sum");
|
||||||
|
// std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
|
||||||
FlightRecorder::ReductionLog(local,real(nrm));
|
FlightRecorder::ReductionLog(local,real(nrm));
|
||||||
return nrm;
|
return nrm;
|
||||||
}
|
}
|
||||||
@@ -343,18 +365,6 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
|
|||||||
autoView( x_v, x, AcceleratorRead);
|
autoView( x_v, x, AcceleratorRead);
|
||||||
autoView( y_v, y, AcceleratorRead);
|
autoView( y_v, y, AcceleratorRead);
|
||||||
autoView( z_v, z, AcceleratorWrite);
|
autoView( z_v, z, AcceleratorWrite);
|
||||||
#if 0
|
|
||||||
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
|
|
||||||
Vector<inner_t> inner_tmp(sites);
|
|
||||||
auto inner_tmp_v = &inner_tmp[0];
|
|
||||||
|
|
||||||
accelerator_for( ss, sites, nsimd,{
|
|
||||||
auto tmp = a*x_v(ss)+b*y_v(ss);
|
|
||||||
coalescedWrite(inner_tmp_v[ss],innerProductD(tmp,tmp));
|
|
||||||
coalescedWrite(z_v[ss],tmp);
|
|
||||||
});
|
|
||||||
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
|
|
||||||
#else
|
|
||||||
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
|
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
|
||||||
deviceVector<inner_t> inner_tmp;
|
deviceVector<inner_t> inner_tmp;
|
||||||
inner_tmp.resize(sites);
|
inner_tmp.resize(sites);
|
||||||
@@ -365,9 +375,44 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
|
|||||||
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
|
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
|
||||||
coalescedWrite(z_v[ss],tmp);
|
coalescedWrite(z_v[ss],tmp);
|
||||||
});
|
});
|
||||||
nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
|
bool ok;
|
||||||
|
uint64_t csum=0;
|
||||||
|
uint64_t csum2=0;
|
||||||
|
#ifdef GRID_SYCL
|
||||||
|
if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
|
||||||
|
{
|
||||||
|
// z_v
|
||||||
|
{
|
||||||
|
Integer words = sites*sizeof(vobj)/sizeof(uint64_t);
|
||||||
|
uint64_t *base= (uint64_t *)&z_v[0];
|
||||||
|
csum=svm_xor(base,words);
|
||||||
|
ok = FlightRecorder::CsumLog(csum);
|
||||||
|
if ( !ok ) {
|
||||||
|
csum2=svm_xor(base,words);
|
||||||
|
std::cerr<< " Bad z_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
|
||||||
|
}
|
||||||
|
assert(ok);
|
||||||
|
}
|
||||||
|
// inner_v
|
||||||
|
{
|
||||||
|
Integer words = sites*sizeof(inner_t)/sizeof(uint64_t);
|
||||||
|
uint64_t *base= (uint64_t *)&inner_tmp_v[0];
|
||||||
|
csum=svm_xor(base,words);
|
||||||
|
ok = FlightRecorder::CsumLog(csum);
|
||||||
|
if ( !ok ) {
|
||||||
|
csum2=svm_xor(base,words);
|
||||||
|
std::cerr<< " Bad inner_tmp_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
|
||||||
|
}
|
||||||
|
assert(ok);
|
||||||
|
}
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
|
||||||
|
ok = FlightRecorder::NormLog(real(nrm));
|
||||||
|
assert(ok);
|
||||||
|
RealD local = real(nrm);
|
||||||
grid->GlobalSum(nrm);
|
grid->GlobalSum(nrm);
|
||||||
|
FlightRecorder::ReductionLog(local,real(nrm));
|
||||||
return nrm;
|
return nrm;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -377,7 +422,7 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
|
|||||||
conformable(left,right);
|
conformable(left,right);
|
||||||
|
|
||||||
typedef typename vobj::vector_typeD vector_type;
|
typedef typename vobj::vector_typeD vector_type;
|
||||||
Vector<ComplexD> tmp(2);
|
std::vector<ComplexD> tmp(2);
|
||||||
|
|
||||||
GridBase *grid = left.Grid();
|
GridBase *grid = left.Grid();
|
||||||
|
|
||||||
@@ -387,8 +432,8 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
|
|||||||
// GPU
|
// GPU
|
||||||
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
|
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
|
||||||
typedef decltype(innerProductD(vobj(),vobj())) norm_t;
|
typedef decltype(innerProductD(vobj(),vobj())) norm_t;
|
||||||
Vector<inner_t> inner_tmp(sites);
|
deviceVector<inner_t> inner_tmp(sites);
|
||||||
Vector<norm_t> norm_tmp(sites);
|
deviceVector<norm_t> norm_tmp(sites);
|
||||||
auto inner_tmp_v = &inner_tmp[0];
|
auto inner_tmp_v = &inner_tmp[0];
|
||||||
auto norm_tmp_v = &norm_tmp[0];
|
auto norm_tmp_v = &norm_tmp[0];
|
||||||
{
|
{
|
||||||
@@ -438,7 +483,9 @@ inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
|
|||||||
// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
|
// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
|
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
|
||||||
|
std::vector<typename vobj::scalar_object> &result,
|
||||||
|
int orthogdim)
|
||||||
{
|
{
|
||||||
///////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////
|
||||||
// FIXME precision promoted summation
|
// FIXME precision promoted summation
|
||||||
@@ -460,8 +507,8 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
|
|||||||
int ld=grid->_ldimensions[orthogdim];
|
int ld=grid->_ldimensions[orthogdim];
|
||||||
int rd=grid->_rdimensions[orthogdim];
|
int rd=grid->_rdimensions[orthogdim];
|
||||||
|
|
||||||
Vector<vobj> lvSum(rd); // will locally sum vectors first
|
std::vector<vobj> lvSum(rd); // will locally sum vectors first
|
||||||
Vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
|
std::vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
|
||||||
ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD
|
ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD
|
||||||
|
|
||||||
result.resize(fd); // And then global sum to return the same vector to every node
|
result.resize(fd); // And then global sum to return the same vector to every node
|
||||||
@@ -519,7 +566,20 @@ sliceSum(const Lattice<vobj> &Data,int orthogdim)
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
Reimplement
|
||||||
|
|
||||||
|
1)
|
||||||
|
template<class vobj>
|
||||||
|
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
|
||||||
|
|
||||||
|
2)
|
||||||
|
template<class vobj>
|
||||||
|
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
|
||||||
|
|
||||||
|
3)
|
||||||
|
-- Make Slice Mul Matrix call sliceMaddMatrix
|
||||||
|
*/
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
|
static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
|
||||||
{
|
{
|
||||||
@@ -539,8 +599,8 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
|
|||||||
int ld=grid->_ldimensions[orthogdim];
|
int ld=grid->_ldimensions[orthogdim];
|
||||||
int rd=grid->_rdimensions[orthogdim];
|
int rd=grid->_rdimensions[orthogdim];
|
||||||
|
|
||||||
Vector<vector_type> lvSum(rd); // will locally sum vectors first
|
std::vector<vector_type> lvSum(rd); // will locally sum vectors first
|
||||||
Vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
|
std::vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
|
||||||
ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
|
ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
|
||||||
|
|
||||||
result.resize(fd); // And then global sum to return the same vector to every node for IO to file
|
result.resize(fd); // And then global sum to return the same vector to every node for IO to file
|
||||||
@@ -670,203 +730,96 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
/*
|
|
||||||
inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
|
inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
|
||||||
{
|
{
|
||||||
int NN = BlockSolverGrid->_ndimension;
|
int NN = BlockSolverGrid->_ndimension;
|
||||||
int nsimd = BlockSolverGrid->Nsimd();
|
int nsimd = BlockSolverGrid->Nsimd();
|
||||||
|
|
||||||
std::vector<int> latt_phys(0);
|
std::vector<int> latt_phys(NN-1);
|
||||||
std::vector<int> simd_phys(0);
|
Coordinate simd_phys;
|
||||||
std::vector<int> mpi_phys(0);
|
std::vector<int> mpi_phys(NN-1);
|
||||||
|
Coordinate checker_dim_mask(NN-1);
|
||||||
|
int checker_dim=-1;
|
||||||
|
|
||||||
|
int dd;
|
||||||
for(int d=0;d<NN;d++){
|
for(int d=0;d<NN;d++){
|
||||||
if( d!=Orthog ) {
|
if( d!=Orthog ) {
|
||||||
latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);
|
latt_phys[dd]=BlockSolverGrid->_fdimensions[d];
|
||||||
simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);
|
mpi_phys[dd] =BlockSolverGrid->_processors[d];
|
||||||
mpi_phys.push_back(BlockSolverGrid->_processors[d]);
|
checker_dim_mask[dd] = BlockSolverGrid->_checker_dim_mask[d];
|
||||||
|
if ( d == BlockSolverGrid->_checker_dim ) checker_dim = dd;
|
||||||
|
dd++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);
|
simd_phys=GridDefaultSimd(latt_phys.size(),nsimd);
|
||||||
|
GridCartesian *tmp = new GridCartesian(latt_phys,simd_phys,mpi_phys);
|
||||||
|
if(BlockSolverGrid->_isCheckerBoarded) {
|
||||||
|
GridRedBlackCartesian *ret = new GridRedBlackCartesian(tmp,checker_dim_mask,checker_dim);
|
||||||
|
delete tmp;
|
||||||
|
return (GridBase *) ret;
|
||||||
|
} else {
|
||||||
|
return (GridBase *) tmp;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
*/
|
|
||||||
|
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
|
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
|
||||||
{
|
{
|
||||||
|
GridBase *FullGrid = X.Grid();
|
||||||
|
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
|
||||||
|
|
||||||
|
Lattice<vobj> Ys(SliceGrid);
|
||||||
|
Lattice<vobj> Rs(SliceGrid);
|
||||||
|
Lattice<vobj> Xs(SliceGrid);
|
||||||
|
Lattice<vobj> RR(FullGrid);
|
||||||
|
|
||||||
|
RR = R; // Copies checkerboard for insert
|
||||||
|
|
||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
typedef typename vobj::vector_type vector_type;
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
int Nslice = X.Grid()->GlobalDimensions()[Orthog];
|
||||||
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
|
for(int i=0;i<Nslice;i++){
|
||||||
|
ExtractSlice(Ys,Y,i,Orthog);
|
||||||
GridBase *FullGrid = X.Grid();
|
ExtractSlice(Rs,R,i,Orthog);
|
||||||
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
|
Rs=Ys;
|
||||||
|
for(int j=0;j<Nslice;j++){
|
||||||
// Lattice<vobj> Xslice(SliceGrid);
|
ExtractSlice(Xs,X,j,Orthog);
|
||||||
// Lattice<vobj> Rslice(SliceGrid);
|
Rs = Rs + Xs*(scale*aa(j,i));
|
||||||
|
}
|
||||||
assert( FullGrid->_simd_layout[Orthog]==1);
|
InsertSlice(Rs,RR,i,Orthog);
|
||||||
// int nh = FullGrid->_ndimension;
|
|
||||||
// int nl = SliceGrid->_ndimension;
|
|
||||||
// int nl = nh-1;
|
|
||||||
|
|
||||||
//FIXME package in a convenient iterator
|
|
||||||
//Should loop over a plane orthogonal to direction "Orthog"
|
|
||||||
int stride=FullGrid->_slice_stride[Orthog];
|
|
||||||
int block =FullGrid->_slice_block [Orthog];
|
|
||||||
int nblock=FullGrid->_slice_nblock[Orthog];
|
|
||||||
int ostride=FullGrid->_ostride[Orthog];
|
|
||||||
|
|
||||||
autoView( X_v, X, CpuRead);
|
|
||||||
autoView( Y_v, Y, CpuRead);
|
|
||||||
autoView( R_v, R, CpuWrite);
|
|
||||||
thread_region
|
|
||||||
{
|
|
||||||
Vector<vobj> s_x(Nblock);
|
|
||||||
|
|
||||||
thread_for_collapse_in_region(2, n,nblock, {
|
|
||||||
for(int b=0;b<block;b++){
|
|
||||||
int o = n*stride + b;
|
|
||||||
|
|
||||||
for(int i=0;i<Nblock;i++){
|
|
||||||
s_x[i] = X_v[o+i*ostride];
|
|
||||||
}
|
|
||||||
|
|
||||||
vobj dot;
|
|
||||||
for(int i=0;i<Nblock;i++){
|
|
||||||
dot = Y_v[o+i*ostride];
|
|
||||||
for(int j=0;j<Nblock;j++){
|
|
||||||
dot = dot + s_x[j]*(scale*aa(j,i));
|
|
||||||
}
|
|
||||||
R_v[o+i*ostride]=dot;
|
|
||||||
}
|
|
||||||
}});
|
|
||||||
}
|
}
|
||||||
|
R=RR; // Copy back handles arguments aliasing case
|
||||||
|
delete SliceGrid;
|
||||||
};
|
};
|
||||||
|
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
|
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
|
||||||
{
|
{
|
||||||
typedef typename vobj::scalar_object sobj;
|
R=Zero();
|
||||||
typedef typename vobj::vector_type vector_type;
|
sliceMaddMatrix(R,aa,X,R,Orthog,scale);
|
||||||
|
|
||||||
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
|
|
||||||
|
|
||||||
GridBase *FullGrid = X.Grid();
|
|
||||||
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
|
|
||||||
// Lattice<vobj> Xslice(SliceGrid);
|
|
||||||
// Lattice<vobj> Rslice(SliceGrid);
|
|
||||||
|
|
||||||
assert( FullGrid->_simd_layout[Orthog]==1);
|
|
||||||
// int nh = FullGrid->_ndimension;
|
|
||||||
// int nl = SliceGrid->_ndimension;
|
|
||||||
// int nl=1;
|
|
||||||
|
|
||||||
//FIXME package in a convenient iterator
|
|
||||||
// thread_for2d_in_region
|
|
||||||
//Should loop over a plane orthogonal to direction "Orthog"
|
|
||||||
int stride=FullGrid->_slice_stride[Orthog];
|
|
||||||
int block =FullGrid->_slice_block [Orthog];
|
|
||||||
int nblock=FullGrid->_slice_nblock[Orthog];
|
|
||||||
int ostride=FullGrid->_ostride[Orthog];
|
|
||||||
autoView( R_v, R, CpuWrite);
|
|
||||||
autoView( X_v, X, CpuRead);
|
|
||||||
thread_region
|
|
||||||
{
|
|
||||||
std::vector<vobj> s_x(Nblock);
|
|
||||||
|
|
||||||
|
|
||||||
thread_for_collapse_in_region( 2 ,n,nblock,{
|
|
||||||
for(int b=0;b<block;b++){
|
|
||||||
int o = n*stride + b;
|
|
||||||
|
|
||||||
for(int i=0;i<Nblock;i++){
|
|
||||||
s_x[i] = X_v[o+i*ostride];
|
|
||||||
}
|
|
||||||
|
|
||||||
vobj dot;
|
|
||||||
for(int i=0;i<Nblock;i++){
|
|
||||||
dot = s_x[0]*(scale*aa(0,i));
|
|
||||||
for(int j=1;j<Nblock;j++){
|
|
||||||
dot = dot + s_x[j]*(scale*aa(j,i));
|
|
||||||
}
|
|
||||||
R_v[o+i*ostride]=dot;
|
|
||||||
}
|
|
||||||
}});
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
|
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
|
||||||
{
|
{
|
||||||
|
GridBase *SliceGrid = makeSubSliceGrid(lhs.Grid(),Orthog);
|
||||||
|
|
||||||
|
Lattice<vobj> ls(SliceGrid);
|
||||||
|
Lattice<vobj> rs(SliceGrid);
|
||||||
|
|
||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
typedef typename vobj::vector_type vector_type;
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
int Nslice = lhs.Grid()->GlobalDimensions()[Orthog];
|
||||||
GridBase *FullGrid = lhs.Grid();
|
mat = Eigen::MatrixXcd::Zero(Nslice,Nslice);
|
||||||
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
|
for(int s=0;s<Nslice;s++){
|
||||||
|
ExtractSlice(ls,lhs,s,Orthog);
|
||||||
int Nblock = FullGrid->GlobalDimensions()[Orthog];
|
for(int ss=0;ss<Nslice;ss++){
|
||||||
|
ExtractSlice(rs,rhs,ss,Orthog);
|
||||||
// Lattice<vobj> Lslice(SliceGrid);
|
mat(s,ss) = innerProduct(ls,rs);
|
||||||
// Lattice<vobj> Rslice(SliceGrid);
|
}
|
||||||
|
|
||||||
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
|
|
||||||
|
|
||||||
assert( FullGrid->_simd_layout[Orthog]==1);
|
|
||||||
// int nh = FullGrid->_ndimension;
|
|
||||||
// int nl = SliceGrid->_ndimension;
|
|
||||||
// int nl = nh-1;
|
|
||||||
|
|
||||||
//FIXME package in a convenient iterator
|
|
||||||
//Should loop over a plane orthogonal to direction "Orthog"
|
|
||||||
int stride=FullGrid->_slice_stride[Orthog];
|
|
||||||
int block =FullGrid->_slice_block [Orthog];
|
|
||||||
int nblock=FullGrid->_slice_nblock[Orthog];
|
|
||||||
int ostride=FullGrid->_ostride[Orthog];
|
|
||||||
|
|
||||||
typedef typename vobj::vector_typeD vector_typeD;
|
|
||||||
|
|
||||||
autoView( lhs_v, lhs, CpuRead);
|
|
||||||
autoView( rhs_v, rhs, CpuRead);
|
|
||||||
thread_region
|
|
||||||
{
|
|
||||||
std::vector<vobj> Left(Nblock);
|
|
||||||
std::vector<vobj> Right(Nblock);
|
|
||||||
Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
|
|
||||||
|
|
||||||
thread_for_collapse_in_region( 2, n,nblock,{
|
|
||||||
for(int b=0;b<block;b++){
|
|
||||||
|
|
||||||
int o = n*stride + b;
|
|
||||||
|
|
||||||
for(int i=0;i<Nblock;i++){
|
|
||||||
Left [i] = lhs_v[o+i*ostride];
|
|
||||||
Right[i] = rhs_v[o+i*ostride];
|
|
||||||
}
|
|
||||||
|
|
||||||
for(int i=0;i<Nblock;i++){
|
|
||||||
for(int j=0;j<Nblock;j++){
|
|
||||||
auto tmp = innerProduct(Left[i],Right[j]);
|
|
||||||
auto rtmp = TensorRemove(tmp);
|
|
||||||
auto red = Reduce(rtmp);
|
|
||||||
mat_thread(i,j) += std::complex<double>(real(red),imag(red));
|
|
||||||
}}
|
|
||||||
}});
|
|
||||||
thread_critical
|
|
||||||
{
|
|
||||||
mat += mat_thread;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
delete SliceGrid;
|
||||||
for(int i=0;i<Nblock;i++){
|
|
||||||
for(int j=0;j<Nblock;j++){
|
|
||||||
ComplexD sum = mat(i,j);
|
|
||||||
FullGrid->GlobalSum(sum);
|
|
||||||
mat(i,j)=sum;
|
|
||||||
}}
|
|
||||||
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|||||||
@@ -214,22 +214,12 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
|
|||||||
// Move out of UVM
|
// Move out of UVM
|
||||||
// Turns out I had messed up the synchronise after move to compute stream
|
// Turns out I had messed up the synchronise after move to compute stream
|
||||||
// as running this on the default stream fools the synchronise
|
// as running this on the default stream fools the synchronise
|
||||||
#undef UVM_BLOCK_BUFFER
|
deviceVector<sobj> buffer(numBlocks);
|
||||||
#ifndef UVM_BLOCK_BUFFER
|
|
||||||
commVector<sobj> buffer(numBlocks);
|
|
||||||
sobj *buffer_v = &buffer[0];
|
sobj *buffer_v = &buffer[0];
|
||||||
sobj result;
|
sobj result;
|
||||||
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
|
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
|
||||||
accelerator_barrier();
|
accelerator_barrier();
|
||||||
acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
|
acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
|
||||||
#else
|
|
||||||
Vector<sobj> buffer(numBlocks);
|
|
||||||
sobj *buffer_v = &buffer[0];
|
|
||||||
sobj result;
|
|
||||||
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
|
|
||||||
accelerator_barrier();
|
|
||||||
result = *buffer_v;
|
|
||||||
#endif
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -244,7 +234,7 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi
|
|||||||
|
|
||||||
const int words = sizeof(vobj)/sizeof(vector);
|
const int words = sizeof(vobj)/sizeof(vector);
|
||||||
|
|
||||||
Vector<vector> buffer(osites);
|
deviceVector<vector> buffer(osites);
|
||||||
vector *dat = (vector *)lat;
|
vector *dat = (vector *)lat;
|
||||||
vector *buf = &buffer[0];
|
vector *buf = &buffer[0];
|
||||||
iScalar<vector> *tbuf =(iScalar<vector> *) &buffer[0];
|
iScalar<vector> *tbuf =(iScalar<vector> *) &buffer[0];
|
||||||
|
|||||||
@@ -4,33 +4,28 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
// Possibly promote to double and sum
|
// Possibly promote to double and sum
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
|
||||||
template <class vobj>
|
template <class vobj>
|
||||||
inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)
|
inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)
|
||||||
{
|
{
|
||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
typedef typename vobj::scalar_objectD sobjD;
|
typedef typename vobj::scalar_objectD sobjD;
|
||||||
static Vector<sobj> mysum;
|
|
||||||
mysum.resize(1);
|
|
||||||
sobj *mysum_p = & mysum[0];
|
|
||||||
sobj identity; zeroit(identity);
|
sobj identity; zeroit(identity);
|
||||||
mysum[0] = identity;
|
sobj ret; zeroit(ret);
|
||||||
sobj ret ;
|
|
||||||
|
|
||||||
Integer nsimd= vobj::Nsimd();
|
Integer nsimd= vobj::Nsimd();
|
||||||
|
{
|
||||||
const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() });
|
sycl::buffer<sobj, 1> abuff(&ret, {1});
|
||||||
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
|
theGridAccelerator->submit([&](sycl::handler &cgh) {
|
||||||
auto Reduction = cl::sycl::reduction(mysum_p,identity,std::plus<>(),PropList);
|
auto Reduction = sycl::reduction(abuff,cgh,identity,std::plus<>());
|
||||||
cgh.parallel_for(cl::sycl::range<1>{osites},
|
cgh.parallel_for(sycl::range<1>{osites},
|
||||||
Reduction,
|
Reduction,
|
||||||
[=] (cl::sycl::id<1> item, auto &sum) {
|
[=] (sycl::id<1> item, auto &sum) {
|
||||||
auto osite = item[0];
|
auto osite = item[0];
|
||||||
sum +=Reduce(lat[osite]);
|
sum +=Reduce(lat[osite]);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
theGridAccelerator->wait();
|
}
|
||||||
ret = mysum[0];
|
|
||||||
// free(mysum,*theGridAccelerator);
|
|
||||||
sobjD dret; convertType(dret,ret);
|
sobjD dret; convertType(dret,ret);
|
||||||
return dret;
|
return dret;
|
||||||
}
|
}
|
||||||
@@ -76,59 +71,22 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
|
|||||||
|
|
||||||
template<class Word> Word svm_xor(Word *vec,uint64_t L)
|
template<class Word> Word svm_xor(Word *vec,uint64_t L)
|
||||||
{
|
{
|
||||||
Word xorResult; xorResult = 0;
|
|
||||||
static Vector<Word> d_sum;
|
|
||||||
d_sum.resize(1);
|
|
||||||
Word *d_sum_p=&d_sum[0];
|
|
||||||
Word identity; identity=0;
|
Word identity; identity=0;
|
||||||
d_sum[0] = identity;
|
Word ret = 0;
|
||||||
const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() });
|
{
|
||||||
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
|
sycl::buffer<Word, 1> abuff(&ret, {1});
|
||||||
auto Reduction = cl::sycl::reduction(d_sum_p,identity,std::bit_xor<>(),PropList);
|
theGridAccelerator->submit([&](sycl::handler &cgh) {
|
||||||
cgh.parallel_for(cl::sycl::range<1>{L},
|
auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
|
||||||
Reduction,
|
cgh.parallel_for(sycl::range<1>{L},
|
||||||
[=] (cl::sycl::id<1> index, auto &sum) {
|
Reduction,
|
||||||
sum^=vec[index];
|
[=] (sycl::id<1> index, auto &sum) {
|
||||||
});
|
sum ^=vec[index];
|
||||||
});
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
theGridAccelerator->wait();
|
theGridAccelerator->wait();
|
||||||
Word ret = d_sum[0];
|
|
||||||
// free(d_sum,*theGridAccelerator);
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
/*
|
|
||||||
|
|
||||||
template <class vobj>
|
|
||||||
inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
|
|
||||||
{
|
|
||||||
typedef typename vobj::vector_type vector;
|
|
||||||
typedef typename vobj::scalar_type scalar;
|
|
||||||
|
|
||||||
typedef typename vobj::scalar_typeD scalarD;
|
|
||||||
typedef typename vobj::scalar_objectD sobjD;
|
|
||||||
|
|
||||||
sobjD ret;
|
|
||||||
scalarD *ret_p = (scalarD *)&ret;
|
|
||||||
|
|
||||||
const int nsimd = vobj::Nsimd();
|
|
||||||
const int words = sizeof(vobj)/sizeof(vector);
|
|
||||||
|
|
||||||
Vector<scalar> buffer(osites*nsimd);
|
|
||||||
scalar *buf = &buffer[0];
|
|
||||||
vector *dat = (vector *)lat;
|
|
||||||
|
|
||||||
for(int w=0;w<words;w++) {
|
|
||||||
|
|
||||||
accelerator_for(ss,osites,nsimd,{
|
|
||||||
int lane = acceleratorSIMTlane(nsimd);
|
|
||||||
buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane);
|
|
||||||
});
|
|
||||||
//Precision change at this point is to late to gain precision
|
|
||||||
ret_p[w] = svm_reduce(buf,nsimd*osites);
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|||||||
@@ -21,9 +21,18 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
|
|
||||||
|
|
||||||
#if defined(GRID_CUDA) || defined(GRID_HIP)
|
#if defined(GRID_CUDA) || defined(GRID_HIP)
|
||||||
template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
|
template<class vobj>
|
||||||
|
inline void sliceSumReduction_cub_small(const vobj *Data,
|
||||||
|
std::vector<vobj> &lvSum,
|
||||||
|
const int rd,
|
||||||
|
const int e1,
|
||||||
|
const int e2,
|
||||||
|
const int stride,
|
||||||
|
const int ostride,
|
||||||
|
const int Nsimd)
|
||||||
|
{
|
||||||
size_t subvol_size = e1*e2;
|
size_t subvol_size = e1*e2;
|
||||||
commVector<vobj> reduction_buffer(rd*subvol_size);
|
deviceVector<vobj> reduction_buffer(rd*subvol_size);
|
||||||
auto rb_p = &reduction_buffer[0];
|
auto rb_p = &reduction_buffer[0];
|
||||||
vobj zero_init;
|
vobj zero_init;
|
||||||
zeroit(zero_init);
|
zeroit(zero_init);
|
||||||
@@ -94,7 +103,15 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
|
|||||||
|
|
||||||
|
|
||||||
#if defined(GRID_SYCL)
|
#if defined(GRID_SYCL)
|
||||||
template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
|
template<class vobj>
|
||||||
|
inline void sliceSumReduction_sycl_small(const vobj *Data,
|
||||||
|
std::vector <vobj> &lvSum,
|
||||||
|
const int &rd,
|
||||||
|
const int &e1,
|
||||||
|
const int &e2,
|
||||||
|
const int &stride,
|
||||||
|
const int &ostride,
|
||||||
|
const int &Nsimd)
|
||||||
{
|
{
|
||||||
size_t subvol_size = e1*e2;
|
size_t subvol_size = e1*e2;
|
||||||
|
|
||||||
@@ -105,7 +122,7 @@ template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data,
|
|||||||
mysum[r] = vobj_zero;
|
mysum[r] = vobj_zero;
|
||||||
}
|
}
|
||||||
|
|
||||||
commVector<vobj> reduction_buffer(rd*subvol_size);
|
deviceVector<vobj> reduction_buffer(rd*subvol_size);
|
||||||
|
|
||||||
auto rb_p = &reduction_buffer[0];
|
auto rb_p = &reduction_buffer[0];
|
||||||
|
|
||||||
@@ -124,11 +141,11 @@ template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data,
|
|||||||
});
|
});
|
||||||
|
|
||||||
for (int r = 0; r < rd; r++) {
|
for (int r = 0; r < rd; r++) {
|
||||||
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
|
theGridAccelerator->submit([&](sycl::handler &cgh) {
|
||||||
auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>());
|
auto Reduction = sycl::reduction(&mysum[r],std::plus<>());
|
||||||
cgh.parallel_for(cl::sycl::range<1>{subvol_size},
|
cgh.parallel_for(sycl::range<1>{subvol_size},
|
||||||
Reduction,
|
Reduction,
|
||||||
[=](cl::sycl::id<1> item, auto &sum) {
|
[=](sycl::id<1> item, auto &sum) {
|
||||||
auto s = item[0];
|
auto s = item[0];
|
||||||
sum += rb_p[r*subvol_size+s];
|
sum += rb_p[r*subvol_size+s];
|
||||||
});
|
});
|
||||||
@@ -144,14 +161,23 @@ template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
|
template<class vobj>
|
||||||
|
inline void sliceSumReduction_large(const vobj *Data,
|
||||||
|
std::vector<vobj> &lvSum,
|
||||||
|
const int rd,
|
||||||
|
const int e1,
|
||||||
|
const int e2,
|
||||||
|
const int stride,
|
||||||
|
const int ostride,
|
||||||
|
const int Nsimd)
|
||||||
|
{
|
||||||
typedef typename vobj::vector_type vector;
|
typedef typename vobj::vector_type vector;
|
||||||
const int words = sizeof(vobj)/sizeof(vector);
|
const int words = sizeof(vobj)/sizeof(vector);
|
||||||
const int osites = rd*e1*e2;
|
const int osites = rd*e1*e2;
|
||||||
commVector<vector>buffer(osites);
|
deviceVector<vector>buffer(osites);
|
||||||
vector *dat = (vector *)Data;
|
vector *dat = (vector *)Data;
|
||||||
vector *buf = &buffer[0];
|
vector *buf = &buffer[0];
|
||||||
Vector<vector> lvSum_small(rd);
|
std::vector<vector> lvSum_small(rd);
|
||||||
vector *lvSum_ptr = (vector *)&lvSum[0];
|
vector *lvSum_ptr = (vector *)&lvSum[0];
|
||||||
|
|
||||||
for (int w = 0; w < words; w++) {
|
for (int w = 0; w < words; w++) {
|
||||||
@@ -168,13 +194,18 @@ template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vecto
|
|||||||
for (int r = 0; r < rd; r++) {
|
for (int r = 0; r < rd; r++) {
|
||||||
lvSum_ptr[w+words*r]=lvSum_small[r];
|
lvSum_ptr[w+words*r]=lvSum_small[r];
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
|
template<class vobj>
|
||||||
|
inline void sliceSumReduction_gpu(const Lattice<vobj> &Data,
|
||||||
|
std::vector<vobj> &lvSum,
|
||||||
|
const int rd,
|
||||||
|
const int e1,
|
||||||
|
const int e2,
|
||||||
|
const int stride,
|
||||||
|
const int ostride,
|
||||||
|
const int Nsimd)
|
||||||
{
|
{
|
||||||
autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case.
|
autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case.
|
||||||
if constexpr (sizeof(vobj) <= 256) {
|
if constexpr (sizeof(vobj) <= 256) {
|
||||||
@@ -192,7 +223,15 @@ template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
|
template<class vobj>
|
||||||
|
inline void sliceSumReduction_cpu(const Lattice<vobj> &Data,
|
||||||
|
std::vector<vobj> &lvSum,
|
||||||
|
const int &rd,
|
||||||
|
const int &e1,
|
||||||
|
const int &e2,
|
||||||
|
const int &stride,
|
||||||
|
const int &ostride,
|
||||||
|
const int &Nsimd)
|
||||||
{
|
{
|
||||||
// sum over reduced dimension planes, breaking out orthog dir
|
// sum over reduced dimension planes, breaking out orthog dir
|
||||||
// Parallel over orthog direction
|
// Parallel over orthog direction
|
||||||
@@ -208,16 +247,20 @@ template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
|
template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data,
|
||||||
|
std::vector<vobj> &lvSum,
|
||||||
|
const int &rd,
|
||||||
|
const int &e1,
|
||||||
|
const int &e2,
|
||||||
|
const int &stride,
|
||||||
|
const int &ostride,
|
||||||
|
const int &Nsimd)
|
||||||
{
|
{
|
||||||
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
|
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
|
||||||
|
|
||||||
sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
|
sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
|
||||||
|
#else
|
||||||
#else
|
|
||||||
sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
|
sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
|
||||||
|
#endif
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -981,8 +981,14 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
|
|||||||
hcoor[orthog] = slice;
|
hcoor[orthog] = slice;
|
||||||
for(int d=0;d<nh;d++){
|
for(int d=0;d<nh;d++){
|
||||||
if ( d!=orthog ) {
|
if ( d!=orthog ) {
|
||||||
hcoor[d]=lcoor[ddl++];
|
hcoor[d]=lcoor[ddl];
|
||||||
|
if ( hg->_checker_dim == d ) {
|
||||||
|
hcoor[d]=hcoor[d]*2; // factor in the full coor for peekLocalSite
|
||||||
|
lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
|
||||||
|
}
|
||||||
|
ddl++;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
peekLocalSite(s,lowDimv,lcoor);
|
peekLocalSite(s,lowDimv,lcoor);
|
||||||
pokeLocalSite(s,higherDimv,hcoor);
|
pokeLocalSite(s,higherDimv,hcoor);
|
||||||
@@ -1003,6 +1009,7 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
|
|||||||
assert(orthog<nh);
|
assert(orthog<nh);
|
||||||
assert(orthog>=0);
|
assert(orthog>=0);
|
||||||
assert(hg->_processors[orthog]==1);
|
assert(hg->_processors[orthog]==1);
|
||||||
|
lowDim.Checkerboard() = higherDim.Checkerboard();
|
||||||
|
|
||||||
int dl; dl = 0;
|
int dl; dl = 0;
|
||||||
for(int d=0;d<nh;d++){
|
for(int d=0;d<nh;d++){
|
||||||
@@ -1020,11 +1027,16 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
|
|||||||
Coordinate lcoor(nl);
|
Coordinate lcoor(nl);
|
||||||
Coordinate hcoor(nh);
|
Coordinate hcoor(nh);
|
||||||
lg->LocalIndexToLocalCoor(idx,lcoor);
|
lg->LocalIndexToLocalCoor(idx,lcoor);
|
||||||
int ddl=0;
|
|
||||||
hcoor[orthog] = slice;
|
hcoor[orthog] = slice;
|
||||||
|
int ddl=0;
|
||||||
for(int d=0;d<nh;d++){
|
for(int d=0;d<nh;d++){
|
||||||
if ( d!=orthog ) {
|
if ( d!=orthog ) {
|
||||||
hcoor[d]=lcoor[ddl++];
|
hcoor[d]=lcoor[ddl];
|
||||||
|
if ( hg->_checker_dim == d ) {
|
||||||
|
hcoor[d]=hcoor[d]*2; // factor in the full gridd coor for peekLocalSite
|
||||||
|
lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
|
||||||
|
}
|
||||||
|
ddl++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
peekLocalSite(s,higherDimv,hcoor);
|
peekLocalSite(s,higherDimv,hcoor);
|
||||||
|
|||||||
@@ -54,7 +54,7 @@ struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::ve
|
|||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
|
||||||
template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
|
template<class vobj> inline void ScatterSlice(const deviceVector<vobj> &buf,
|
||||||
Lattice<vobj> &lat,
|
Lattice<vobj> &lat,
|
||||||
int x,
|
int x,
|
||||||
int dim,
|
int dim,
|
||||||
@@ -140,7 +140,7 @@ template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf,
|
template<class vobj> inline void GatherSlice(deviceVector<vobj> &buf,
|
||||||
const Lattice<vobj> &lat,
|
const Lattice<vobj> &lat,
|
||||||
int x,
|
int x,
|
||||||
int dim,
|
int dim,
|
||||||
@@ -462,8 +462,8 @@ public:
|
|||||||
int rNsimd = Nsimd / simd[dimension];
|
int rNsimd = Nsimd / simd[dimension];
|
||||||
assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);
|
assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);
|
||||||
|
|
||||||
static cshiftVector<vobj> send_buf;
|
static deviceVector<vobj> send_buf;
|
||||||
static cshiftVector<vobj> recv_buf;
|
static deviceVector<vobj> recv_buf;
|
||||||
send_buf.resize(buffer_size*2*depth);
|
send_buf.resize(buffer_size*2*depth);
|
||||||
recv_buf.resize(buffer_size*2*depth);
|
recv_buf.resize(buffer_size*2*depth);
|
||||||
|
|
||||||
|
|||||||
@@ -90,16 +90,16 @@ public:
|
|||||||
void M5D(const FermionField &psi,
|
void M5D(const FermionField &psi,
|
||||||
const FermionField &phi,
|
const FermionField &phi,
|
||||||
FermionField &chi,
|
FermionField &chi,
|
||||||
Vector<Coeff_t> &lower,
|
std::vector<Coeff_t> &lower,
|
||||||
Vector<Coeff_t> &diag,
|
std::vector<Coeff_t> &diag,
|
||||||
Vector<Coeff_t> &upper);
|
std::vector<Coeff_t> &upper);
|
||||||
|
|
||||||
void M5Ddag(const FermionField &psi,
|
void M5Ddag(const FermionField &psi,
|
||||||
const FermionField &phi,
|
const FermionField &phi,
|
||||||
FermionField &chi,
|
FermionField &chi,
|
||||||
Vector<Coeff_t> &lower,
|
std::vector<Coeff_t> &lower,
|
||||||
Vector<Coeff_t> &diag,
|
std::vector<Coeff_t> &diag,
|
||||||
Vector<Coeff_t> &upper);
|
std::vector<Coeff_t> &upper);
|
||||||
|
|
||||||
virtual void Instantiatable(void)=0;
|
virtual void Instantiatable(void)=0;
|
||||||
|
|
||||||
@@ -119,35 +119,35 @@ public:
|
|||||||
RealD mass_plus, mass_minus;
|
RealD mass_plus, mass_minus;
|
||||||
|
|
||||||
// Save arguments to SetCoefficientsInternal
|
// Save arguments to SetCoefficientsInternal
|
||||||
Vector<Coeff_t> _gamma;
|
std::vector<Coeff_t> _gamma;
|
||||||
RealD _zolo_hi;
|
RealD _zolo_hi;
|
||||||
RealD _b;
|
RealD _b;
|
||||||
RealD _c;
|
RealD _c;
|
||||||
|
|
||||||
// Cayley form Moebius (tanh and zolotarev)
|
// Cayley form Moebius (tanh and zolotarev)
|
||||||
Vector<Coeff_t> omega;
|
std::vector<Coeff_t> omega;
|
||||||
Vector<Coeff_t> bs; // S dependent coeffs
|
std::vector<Coeff_t> bs; // S dependent coeffs
|
||||||
Vector<Coeff_t> cs;
|
std::vector<Coeff_t> cs;
|
||||||
Vector<Coeff_t> as;
|
std::vector<Coeff_t> as;
|
||||||
// For preconditioning Cayley form
|
// For preconditioning Cayley form
|
||||||
Vector<Coeff_t> bee;
|
std::vector<Coeff_t> bee;
|
||||||
Vector<Coeff_t> cee;
|
std::vector<Coeff_t> cee;
|
||||||
Vector<Coeff_t> aee;
|
std::vector<Coeff_t> aee;
|
||||||
Vector<Coeff_t> beo;
|
std::vector<Coeff_t> beo;
|
||||||
Vector<Coeff_t> ceo;
|
std::vector<Coeff_t> ceo;
|
||||||
Vector<Coeff_t> aeo;
|
std::vector<Coeff_t> aeo;
|
||||||
// LDU factorisation of the eeoo matrix
|
// LDU factorisation of the eeoo matrix
|
||||||
Vector<Coeff_t> lee;
|
std::vector<Coeff_t> lee;
|
||||||
Vector<Coeff_t> leem;
|
std::vector<Coeff_t> leem;
|
||||||
Vector<Coeff_t> uee;
|
std::vector<Coeff_t> uee;
|
||||||
Vector<Coeff_t> ueem;
|
std::vector<Coeff_t> ueem;
|
||||||
Vector<Coeff_t> dee;
|
std::vector<Coeff_t> dee;
|
||||||
|
|
||||||
// Matrices of 5d ee inverse params
|
// Matrices of 5d ee inverse params
|
||||||
Vector<iSinglet<Simd> > MatpInv;
|
// std::vector<iSinglet<Simd> > MatpInv;
|
||||||
Vector<iSinglet<Simd> > MatmInv;
|
// std::vector<iSinglet<Simd> > MatmInv;
|
||||||
Vector<iSinglet<Simd> > MatpInvDag;
|
// std::vector<iSinglet<Simd> > MatpInvDag;
|
||||||
Vector<iSinglet<Simd> > MatmInvDag;
|
// std::vector<iSinglet<Simd> > MatmInvDag;
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Conserved current utilities
|
// Conserved current utilities
|
||||||
@@ -187,7 +187,7 @@ public:
|
|||||||
protected:
|
protected:
|
||||||
virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
|
virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
|
||||||
virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
|
virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
|
||||||
virtual void SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t> & gamma,RealD b,RealD c);
|
virtual void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
|
||||||
};
|
};
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|||||||
@@ -90,12 +90,12 @@ protected:
|
|||||||
RealD mass;
|
RealD mass;
|
||||||
RealD R;
|
RealD R;
|
||||||
RealD ZoloHiInv;
|
RealD ZoloHiInv;
|
||||||
Vector<double> Beta;
|
std::vector<double> Beta;
|
||||||
Vector<double> cc;;
|
std::vector<double> cc;;
|
||||||
Vector<double> cc_d;;
|
std::vector<double> cc_d;;
|
||||||
Vector<double> sqrt_cc;
|
std::vector<double> sqrt_cc;
|
||||||
Vector<double> See;
|
std::vector<double> See;
|
||||||
Vector<double> Aee;
|
std::vector<double> Aee;
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -69,10 +69,10 @@ public:
|
|||||||
// Instantiate different versions depending on Impl
|
// Instantiate different versions depending on Impl
|
||||||
/////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////
|
||||||
void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
||||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
|
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
|
||||||
|
|
||||||
void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
||||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
|
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
|
||||||
|
|
||||||
virtual void RefreshShiftCoefficients(RealD new_shift);
|
virtual void RefreshShiftCoefficients(RealD new_shift);
|
||||||
|
|
||||||
@@ -83,7 +83,7 @@ public:
|
|||||||
RealD _M5, const ImplParams& p=ImplParams());
|
RealD _M5, const ImplParams& p=ImplParams());
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
void SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c);
|
void SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c);
|
||||||
};
|
};
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|||||||
@@ -102,11 +102,11 @@ public:
|
|||||||
GaugeField &mat,
|
GaugeField &mat,
|
||||||
const FermionField &A, const FermionField &B, int dag);
|
const FermionField &A, const FermionField &B, int dag);
|
||||||
|
|
||||||
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
void DhopInternal(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||||
const FermionField &in, FermionField &out, int dag);
|
const FermionField &in, FermionField &out, int dag);
|
||||||
void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||||
const FermionField &in, FermionField &out, int dag);
|
const FermionField &in, FermionField &out, int dag);
|
||||||
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||||
const FermionField &in, FermionField &out, int dag);
|
const FermionField &in, FermionField &out, int dag);
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////
|
||||||
@@ -164,8 +164,6 @@ public:
|
|||||||
DoubledGaugeField UUUmuEven;
|
DoubledGaugeField UUUmuEven;
|
||||||
DoubledGaugeField UUUmuOdd;
|
DoubledGaugeField UUUmuOdd;
|
||||||
|
|
||||||
LebesgueOrder Lebesgue;
|
|
||||||
LebesgueOrder LebesgueEvenOdd;
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Conserved current utilities
|
// Conserved current utilities
|
||||||
|
|||||||
@@ -100,7 +100,6 @@ public:
|
|||||||
int dag);
|
int dag);
|
||||||
|
|
||||||
void DhopInternal(StencilImpl & st,
|
void DhopInternal(StencilImpl & st,
|
||||||
LebesgueOrder &lo,
|
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
DoubledGaugeField &UUU,
|
DoubledGaugeField &UUU,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
@@ -108,7 +107,6 @@ public:
|
|||||||
int dag);
|
int dag);
|
||||||
|
|
||||||
void DhopInternalOverlappedComms(StencilImpl & st,
|
void DhopInternalOverlappedComms(StencilImpl & st,
|
||||||
LebesgueOrder &lo,
|
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
DoubledGaugeField &UUU,
|
DoubledGaugeField &UUU,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
@@ -116,7 +114,6 @@ public:
|
|||||||
int dag);
|
int dag);
|
||||||
|
|
||||||
void DhopInternalSerialComms(StencilImpl & st,
|
void DhopInternalSerialComms(StencilImpl & st,
|
||||||
LebesgueOrder &lo,
|
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
DoubledGaugeField &UUU,
|
DoubledGaugeField &UUU,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
@@ -192,8 +189,6 @@ public:
|
|||||||
DoubledGaugeField UUUmuEven;
|
DoubledGaugeField UUUmuEven;
|
||||||
DoubledGaugeField UUUmuOdd;
|
DoubledGaugeField UUUmuOdd;
|
||||||
|
|
||||||
LebesgueOrder Lebesgue;
|
|
||||||
LebesgueOrder LebesgueEvenOdd;
|
|
||||||
|
|
||||||
// Comms buffer
|
// Comms buffer
|
||||||
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
||||||
|
|||||||
@@ -42,11 +42,11 @@ public:
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
// Shift operator coefficients for red-black preconditioned Mobius EOFA
|
// Shift operator coefficients for red-black preconditioned Mobius EOFA
|
||||||
Vector<Coeff_t> Mooee_shift;
|
std::vector<Coeff_t> Mooee_shift;
|
||||||
Vector<Coeff_t> MooeeInv_shift_lc;
|
std::vector<Coeff_t> MooeeInv_shift_lc;
|
||||||
Vector<Coeff_t> MooeeInv_shift_norm;
|
std::vector<Coeff_t> MooeeInv_shift_norm;
|
||||||
Vector<Coeff_t> MooeeInvDag_shift_lc;
|
std::vector<Coeff_t> MooeeInvDag_shift_lc;
|
||||||
Vector<Coeff_t> MooeeInvDag_shift_norm;
|
std::vector<Coeff_t> MooeeInvDag_shift_norm;
|
||||||
|
|
||||||
virtual void Instantiatable(void) {};
|
virtual void Instantiatable(void) {};
|
||||||
|
|
||||||
@@ -74,18 +74,18 @@ public:
|
|||||||
// Instantiate different versions depending on Impl
|
// Instantiate different versions depending on Impl
|
||||||
/////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////
|
||||||
void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
||||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
|
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
|
||||||
|
|
||||||
void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
||||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
|
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
|
||||||
Vector<Coeff_t>& shift_coeffs);
|
std::vector<Coeff_t>& shift_coeffs);
|
||||||
|
|
||||||
void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
||||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
|
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
|
||||||
|
|
||||||
void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
||||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
|
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
|
||||||
Vector<Coeff_t>& shift_coeffs);
|
std::vector<Coeff_t>& shift_coeffs);
|
||||||
|
|
||||||
virtual void RefreshShiftCoefficients(RealD new_shift);
|
virtual void RefreshShiftCoefficients(RealD new_shift);
|
||||||
|
|
||||||
|
|||||||
@@ -102,11 +102,11 @@ public:
|
|||||||
GaugeField &mat,
|
GaugeField &mat,
|
||||||
const FermionField &A, const FermionField &B, int dag);
|
const FermionField &A, const FermionField &B, int dag);
|
||||||
|
|
||||||
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
void DhopInternal(StencilImpl &st, DoubledGaugeField &U,
|
||||||
const FermionField &in, FermionField &out, int dag);
|
const FermionField &in, FermionField &out, int dag);
|
||||||
void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U,
|
||||||
const FermionField &in, FermionField &out, int dag);
|
const FermionField &in, FermionField &out, int dag);
|
||||||
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U,
|
||||||
const FermionField &in, FermionField &out, int dag);
|
const FermionField &in, FermionField &out, int dag);
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////
|
||||||
@@ -152,9 +152,6 @@ public:
|
|||||||
DoubledGaugeField UmuEven;
|
DoubledGaugeField UmuEven;
|
||||||
DoubledGaugeField UmuOdd;
|
DoubledGaugeField UmuOdd;
|
||||||
|
|
||||||
LebesgueOrder Lebesgue;
|
|
||||||
LebesgueOrder LebesgueEvenOdd;
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Conserved current utilities
|
// Conserved current utilities
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
|
|||||||
@@ -94,8 +94,8 @@ protected:
|
|||||||
RealD R;
|
RealD R;
|
||||||
RealD amax;
|
RealD amax;
|
||||||
RealD scale;
|
RealD scale;
|
||||||
Vector<double> p;
|
std::vector<double> p;
|
||||||
Vector<double> q;
|
std::vector<double> q;
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ template<class Matrix, class Field>
|
|||||||
class KappaSimilarityTransform {
|
class KappaSimilarityTransform {
|
||||||
public:
|
public:
|
||||||
INHERIT_IMPL_TYPES(Matrix);
|
INHERIT_IMPL_TYPES(Matrix);
|
||||||
Vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag;
|
std::vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag;
|
||||||
|
|
||||||
KappaSimilarityTransform (Matrix &zmob) {
|
KappaSimilarityTransform (Matrix &zmob) {
|
||||||
for (int i=0;i<(int)zmob.bs.size();i++) {
|
for (int i=0;i<(int)zmob.bs.size();i++) {
|
||||||
|
|||||||
@@ -49,10 +49,10 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
void DhopImproved(StencilImpl &st, LebesgueOrder &lo,
|
void DhopImproved(StencilImpl &st,
|
||||||
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
|
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
|
||||||
void DhopNaive(StencilImpl &st, LebesgueOrder &lo,
|
void DhopNaive(StencilImpl &st,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
|
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
|
||||||
|
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ public:
|
|||||||
static int PartialCompressionFactor(GridBase *grid) { return 1;}
|
static int PartialCompressionFactor(GridBase *grid) { return 1;}
|
||||||
#endif
|
#endif
|
||||||
template<class vobj,class cobj,class compressor>
|
template<class vobj,class cobj,class compressor>
|
||||||
static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
|
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
|
||||||
const Lattice<vobj> &rhs,
|
const Lattice<vobj> &rhs,
|
||||||
cobj *buffer,
|
cobj *buffer,
|
||||||
compressor &compress,
|
compressor &compress,
|
||||||
@@ -109,7 +109,7 @@ public:
|
|||||||
// Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2.
|
// Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2.
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
template<class vobj,class cobj,class compressor>
|
template<class vobj,class cobj,class compressor>
|
||||||
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
||||||
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
|
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
|
||||||
compressor &compress,int type,int partial)
|
compressor &compress,int type,int partial)
|
||||||
{
|
{
|
||||||
@@ -197,7 +197,7 @@ public:
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
template<class vobj,class cobj,class compressor>
|
template<class vobj,class cobj,class compressor>
|
||||||
static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
|
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
|
||||||
const Lattice<vobj> &rhs,
|
const Lattice<vobj> &rhs,
|
||||||
cobj *buffer,
|
cobj *buffer,
|
||||||
compressor &compress,
|
compressor &compress,
|
||||||
@@ -208,7 +208,7 @@ public:
|
|||||||
else FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
|
else FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
|
||||||
}
|
}
|
||||||
template<class vobj,class cobj,class compressor>
|
template<class vobj,class cobj,class compressor>
|
||||||
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
||||||
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
|
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
|
||||||
compressor &compress,int type,int partial)
|
compressor &compress,int type,int partial)
|
||||||
{
|
{
|
||||||
@@ -402,7 +402,6 @@ public:
|
|||||||
|
|
||||||
typedef CartesianStencil<vobj,cobj,Parameters> Base;
|
typedef CartesianStencil<vobj,cobj,Parameters> Base;
|
||||||
typedef typename Base::View_type View_type;
|
typedef typename Base::View_type View_type;
|
||||||
typedef typename Base::StencilVector StencilVector;
|
|
||||||
|
|
||||||
// Vector<int> surface_list;
|
// Vector<int> surface_list;
|
||||||
WilsonStencil(GridBase *grid,
|
WilsonStencil(GridBase *grid,
|
||||||
|
|||||||
@@ -126,14 +126,17 @@ public:
|
|||||||
void DerivInternal(StencilImpl &st, DoubledGaugeField &U, GaugeField &mat,
|
void DerivInternal(StencilImpl &st, DoubledGaugeField &U, GaugeField &mat,
|
||||||
const FermionField &A, const FermionField &B, int dag);
|
const FermionField &A, const FermionField &B, int dag);
|
||||||
|
|
||||||
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
void DhopInternal(StencilImpl &st,
|
||||||
|
DoubledGaugeField &U,
|
||||||
const FermionField &in, FermionField &out, int dag);
|
const FermionField &in, FermionField &out, int dag);
|
||||||
|
|
||||||
void DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
void DhopInternalSerial(StencilImpl &st,
|
||||||
const FermionField &in, FermionField &out, int dag);
|
DoubledGaugeField &U,
|
||||||
|
const FermionField &in, FermionField &out, int dag);
|
||||||
|
|
||||||
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
void DhopInternalOverlappedComms(StencilImpl &st,
|
||||||
const FermionField &in, FermionField &out, int dag);
|
DoubledGaugeField &U,
|
||||||
|
const FermionField &in, FermionField &out, int dag);
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
|
WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
|
||||||
@@ -168,9 +171,6 @@ public:
|
|||||||
DoubledGaugeField UmuEven;
|
DoubledGaugeField UmuEven;
|
||||||
DoubledGaugeField UmuOdd;
|
DoubledGaugeField UmuOdd;
|
||||||
|
|
||||||
LebesgueOrder Lebesgue;
|
|
||||||
LebesgueOrder LebesgueEvenOdd;
|
|
||||||
|
|
||||||
WilsonAnisotropyCoefficients anisotropyCoeff;
|
WilsonAnisotropyCoefficients anisotropyCoeff;
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
|
|||||||
@@ -135,21 +135,18 @@ public:
|
|||||||
int dag);
|
int dag);
|
||||||
|
|
||||||
void DhopInternal(StencilImpl & st,
|
void DhopInternal(StencilImpl & st,
|
||||||
LebesgueOrder &lo,
|
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out,
|
FermionField &out,
|
||||||
int dag);
|
int dag);
|
||||||
|
|
||||||
void DhopInternalOverlappedComms(StencilImpl & st,
|
void DhopInternalOverlappedComms(StencilImpl & st,
|
||||||
LebesgueOrder &lo,
|
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out,
|
FermionField &out,
|
||||||
int dag);
|
int dag);
|
||||||
|
|
||||||
void DhopInternalSerialComms(StencilImpl & st,
|
void DhopInternalSerialComms(StencilImpl & st,
|
||||||
LebesgueOrder &lo,
|
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out,
|
FermionField &out,
|
||||||
@@ -203,9 +200,6 @@ public:
|
|||||||
DoubledGaugeField UmuEven;
|
DoubledGaugeField UmuEven;
|
||||||
DoubledGaugeField UmuOdd;
|
DoubledGaugeField UmuOdd;
|
||||||
|
|
||||||
LebesgueOrder Lebesgue;
|
|
||||||
LebesgueOrder LebesgueEvenOdd;
|
|
||||||
|
|
||||||
// Comms buffer
|
// Comms buffer
|
||||||
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
||||||
|
|
||||||
|
|||||||
@@ -58,7 +58,7 @@ public:
|
|||||||
{
|
{
|
||||||
// RealD eps = 1.0;
|
// RealD eps = 1.0;
|
||||||
std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
|
std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
|
||||||
Vector<Coeff_t> zgamma(this->Ls);
|
std::vector<Coeff_t> zgamma(this->Ls);
|
||||||
for(int s=0;s<this->Ls;s++){
|
for(int s=0;s<this->Ls;s++){
|
||||||
zgamma[s] = gamma[s];
|
zgamma[s] = gamma[s];
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
#if 0
|
||||||
|
|
||||||
/*************************************************************************************
|
/*************************************************************************************
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
@@ -818,3 +820,5 @@ CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,
|
|||||||
}
|
}
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
#endif
|
||||||
@@ -1,3 +1,4 @@
|
|||||||
|
#if 0
|
||||||
/*************************************************************************************
|
/*************************************************************************************
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
@@ -241,3 +242,4 @@ void LebesgueOrder::ZGraph(void)
|
|||||||
}
|
}
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
#endif
|
||||||
@@ -72,7 +72,7 @@ public:
|
|||||||
void ThreadInterleave(void);
|
void ThreadInterleave(void);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
Vector<IndexInteger> _LebesgueReorder;
|
deviceVector<IndexInteger> _LebesgueReorder;
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -156,18 +156,18 @@ template<class Impl>
|
|||||||
void CayleyFermion5D<Impl>::M5D (const FermionField &psi, FermionField &chi)
|
void CayleyFermion5D<Impl>::M5D (const FermionField &psi, FermionField &chi)
|
||||||
{
|
{
|
||||||
int Ls=this->Ls;
|
int Ls=this->Ls;
|
||||||
Vector<Coeff_t> diag (Ls,1.0);
|
std::vector<Coeff_t> diag (Ls,1.0);
|
||||||
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus;
|
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus;
|
||||||
Vector<Coeff_t> lower(Ls,-1.0); lower[0] =mass_plus;
|
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] =mass_plus;
|
||||||
M5D(psi,chi,chi,lower,diag,upper);
|
M5D(psi,chi,chi,lower,diag,upper);
|
||||||
}
|
}
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void CayleyFermion5D<Impl>::Meooe5D (const FermionField &psi, FermionField &Din)
|
void CayleyFermion5D<Impl>::Meooe5D (const FermionField &psi, FermionField &Din)
|
||||||
{
|
{
|
||||||
int Ls=this->Ls;
|
int Ls=this->Ls;
|
||||||
Vector<Coeff_t> diag = bs;
|
std::vector<Coeff_t> diag = bs;
|
||||||
Vector<Coeff_t> upper= cs;
|
std::vector<Coeff_t> upper= cs;
|
||||||
Vector<Coeff_t> lower= cs;
|
std::vector<Coeff_t> lower= cs;
|
||||||
upper[Ls-1]=-mass_minus*upper[Ls-1];
|
upper[Ls-1]=-mass_minus*upper[Ls-1];
|
||||||
lower[0] =-mass_plus*lower[0];
|
lower[0] =-mass_plus*lower[0];
|
||||||
M5D(psi,psi,Din,lower,diag,upper);
|
M5D(psi,psi,Din,lower,diag,upper);
|
||||||
@@ -176,9 +176,9 @@ void CayleyFermion5D<Impl>::Meooe5D (const FermionField &psi, FermionField &D
|
|||||||
template<class Impl> void CayleyFermion5D<Impl>::Meo5D (const FermionField &psi, FermionField &chi)
|
template<class Impl> void CayleyFermion5D<Impl>::Meo5D (const FermionField &psi, FermionField &chi)
|
||||||
{
|
{
|
||||||
int Ls=this->Ls;
|
int Ls=this->Ls;
|
||||||
Vector<Coeff_t> diag = beo;
|
std::vector<Coeff_t> diag = beo;
|
||||||
Vector<Coeff_t> upper(Ls);
|
std::vector<Coeff_t> upper(Ls);
|
||||||
Vector<Coeff_t> lower(Ls);
|
std::vector<Coeff_t> lower(Ls);
|
||||||
for(int i=0;i<Ls;i++) {
|
for(int i=0;i<Ls;i++) {
|
||||||
upper[i]=-ceo[i];
|
upper[i]=-ceo[i];
|
||||||
lower[i]=-ceo[i];
|
lower[i]=-ceo[i];
|
||||||
@@ -191,9 +191,9 @@ template<class Impl>
|
|||||||
void CayleyFermion5D<Impl>::Mooee (const FermionField &psi, FermionField &chi)
|
void CayleyFermion5D<Impl>::Mooee (const FermionField &psi, FermionField &chi)
|
||||||
{
|
{
|
||||||
int Ls=this->Ls;
|
int Ls=this->Ls;
|
||||||
Vector<Coeff_t> diag = bee;
|
std::vector<Coeff_t> diag = bee;
|
||||||
Vector<Coeff_t> upper(Ls);
|
std::vector<Coeff_t> upper(Ls);
|
||||||
Vector<Coeff_t> lower(Ls);
|
std::vector<Coeff_t> lower(Ls);
|
||||||
for(int i=0;i<Ls;i++) {
|
for(int i=0;i<Ls;i++) {
|
||||||
upper[i]=-cee[i];
|
upper[i]=-cee[i];
|
||||||
lower[i]=-cee[i];
|
lower[i]=-cee[i];
|
||||||
@@ -206,9 +206,9 @@ template<class Impl>
|
|||||||
void CayleyFermion5D<Impl>::MooeeDag (const FermionField &psi, FermionField &chi)
|
void CayleyFermion5D<Impl>::MooeeDag (const FermionField &psi, FermionField &chi)
|
||||||
{
|
{
|
||||||
int Ls=this->Ls;
|
int Ls=this->Ls;
|
||||||
Vector<Coeff_t> diag = bee;
|
std::vector<Coeff_t> diag = bee;
|
||||||
Vector<Coeff_t> upper(Ls);
|
std::vector<Coeff_t> upper(Ls);
|
||||||
Vector<Coeff_t> lower(Ls);
|
std::vector<Coeff_t> lower(Ls);
|
||||||
|
|
||||||
for (int s=0;s<Ls;s++){
|
for (int s=0;s<Ls;s++){
|
||||||
// Assemble the 5d matrix
|
// Assemble the 5d matrix
|
||||||
@@ -236,9 +236,9 @@ template<class Impl>
|
|||||||
void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
|
void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
|
||||||
{
|
{
|
||||||
int Ls=this->Ls;
|
int Ls=this->Ls;
|
||||||
Vector<Coeff_t> diag(Ls,1.0);
|
std::vector<Coeff_t> diag(Ls,1.0);
|
||||||
Vector<Coeff_t> upper(Ls,-1.0);
|
std::vector<Coeff_t> upper(Ls,-1.0);
|
||||||
Vector<Coeff_t> lower(Ls,-1.0);
|
std::vector<Coeff_t> lower(Ls,-1.0);
|
||||||
upper[Ls-1]=-mass_plus*upper[Ls-1];
|
upper[Ls-1]=-mass_plus*upper[Ls-1];
|
||||||
lower[0] =-mass_minus*lower[0];
|
lower[0] =-mass_minus*lower[0];
|
||||||
M5Ddag(psi,chi,chi,lower,diag,upper);
|
M5Ddag(psi,chi,chi,lower,diag,upper);
|
||||||
@@ -248,9 +248,9 @@ template<class Impl>
|
|||||||
void CayleyFermion5D<Impl>::MeooeDag5D (const FermionField &psi, FermionField &Din)
|
void CayleyFermion5D<Impl>::MeooeDag5D (const FermionField &psi, FermionField &Din)
|
||||||
{
|
{
|
||||||
int Ls=this->Ls;
|
int Ls=this->Ls;
|
||||||
Vector<Coeff_t> diag =bs;
|
std::vector<Coeff_t> diag =bs;
|
||||||
Vector<Coeff_t> upper=cs;
|
std::vector<Coeff_t> upper=cs;
|
||||||
Vector<Coeff_t> lower=cs;
|
std::vector<Coeff_t> lower=cs;
|
||||||
|
|
||||||
for (int s=0;s<Ls;s++){
|
for (int s=0;s<Ls;s++){
|
||||||
if ( s== 0 ) {
|
if ( s== 0 ) {
|
||||||
@@ -394,7 +394,7 @@ void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const
|
|||||||
template<class Impl>
|
template<class Impl>
|
||||||
void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
|
void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
|
||||||
{
|
{
|
||||||
Vector<Coeff_t> gamma(this->Ls);
|
std::vector<Coeff_t> gamma(this->Ls);
|
||||||
for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
|
for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
|
||||||
SetCoefficientsInternal(1.0,gamma,b,c);
|
SetCoefficientsInternal(1.0,gamma,b,c);
|
||||||
}
|
}
|
||||||
@@ -402,13 +402,13 @@ void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,Re
|
|||||||
template<class Impl>
|
template<class Impl>
|
||||||
void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
|
void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
|
||||||
{
|
{
|
||||||
Vector<Coeff_t> gamma(this->Ls);
|
std::vector<Coeff_t> gamma(this->Ls);
|
||||||
for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
|
for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
|
||||||
SetCoefficientsInternal(zolo_hi,gamma,b,c);
|
SetCoefficientsInternal(zolo_hi,gamma,b,c);
|
||||||
}
|
}
|
||||||
//Zolo
|
//Zolo
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t> & gamma,RealD b,RealD c)
|
void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c)
|
||||||
{
|
{
|
||||||
int Ls=this->Ls;
|
int Ls=this->Ls;
|
||||||
|
|
||||||
|
|||||||
@@ -43,9 +43,9 @@ void
|
|||||||
CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
|
CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
|
||||||
const FermionField &phi_i,
|
const FermionField &phi_i,
|
||||||
FermionField &chi_i,
|
FermionField &chi_i,
|
||||||
Vector<Coeff_t> &lower,
|
std::vector<Coeff_t> &lower,
|
||||||
Vector<Coeff_t> &diag,
|
std::vector<Coeff_t> &diag,
|
||||||
Vector<Coeff_t> &upper)
|
std::vector<Coeff_t> &upper)
|
||||||
{
|
{
|
||||||
|
|
||||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||||
@@ -55,12 +55,16 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
|
|||||||
autoView(chi , chi_i,AcceleratorWrite);
|
autoView(chi , chi_i,AcceleratorWrite);
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
|
|
||||||
auto pdiag = &diag[0];
|
|
||||||
auto pupper = &upper[0];
|
|
||||||
auto plower = &lower[0];
|
|
||||||
|
|
||||||
int Ls =this->Ls;
|
int Ls =this->Ls;
|
||||||
|
|
||||||
|
static deviceVector<Coeff_t> d_diag(Ls) ; acceleratorCopyToDevice(&diag[0] ,&d_diag[0],Ls*sizeof(Coeff_t));
|
||||||
|
static deviceVector<Coeff_t> d_upper(Ls); acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
|
||||||
|
static deviceVector<Coeff_t> d_lower(Ls); acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
|
||||||
|
|
||||||
|
auto pdiag = &d_diag[0];
|
||||||
|
auto pupper = &d_upper[0];
|
||||||
|
auto plower = &d_lower[0];
|
||||||
|
|
||||||
// 10 = 3 complex mult + 2 complex add
|
// 10 = 3 complex mult + 2 complex add
|
||||||
// Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)
|
// Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)
|
||||||
uint64_t nloop = grid->oSites();
|
uint64_t nloop = grid->oSites();
|
||||||
@@ -82,9 +86,9 @@ void
|
|||||||
CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
|
CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
|
||||||
const FermionField &phi_i,
|
const FermionField &phi_i,
|
||||||
FermionField &chi_i,
|
FermionField &chi_i,
|
||||||
Vector<Coeff_t> &lower,
|
std::vector<Coeff_t> &lower,
|
||||||
Vector<Coeff_t> &diag,
|
std::vector<Coeff_t> &diag,
|
||||||
Vector<Coeff_t> &upper)
|
std::vector<Coeff_t> &upper)
|
||||||
{
|
{
|
||||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||||
GridBase *grid=psi_i.Grid();
|
GridBase *grid=psi_i.Grid();
|
||||||
@@ -93,12 +97,16 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
|
|||||||
autoView(chi , chi_i,AcceleratorWrite);
|
autoView(chi , chi_i,AcceleratorWrite);
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
|
|
||||||
auto pdiag = &diag[0];
|
|
||||||
auto pupper = &upper[0];
|
|
||||||
auto plower = &lower[0];
|
|
||||||
|
|
||||||
int Ls=this->Ls;
|
int Ls=this->Ls;
|
||||||
|
|
||||||
|
static deviceVector<Coeff_t> d_diag(Ls) ; acceleratorCopyToDevice(&diag[0] ,&d_diag[0],Ls*sizeof(Coeff_t));
|
||||||
|
static deviceVector<Coeff_t> d_upper(Ls); acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
|
||||||
|
static deviceVector<Coeff_t> d_lower(Ls); acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
|
||||||
|
|
||||||
|
auto pdiag = &d_diag[0];
|
||||||
|
auto pupper = &d_upper[0];
|
||||||
|
auto plower = &d_lower[0];
|
||||||
|
|
||||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||||
uint64_t nloop = grid->oSites();
|
uint64_t nloop = grid->oSites();
|
||||||
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
||||||
@@ -126,11 +134,17 @@ CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi_i, FermionField &chi
|
|||||||
|
|
||||||
int Ls=this->Ls;
|
int Ls=this->Ls;
|
||||||
|
|
||||||
auto plee = & lee [0];
|
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
|
||||||
auto pdee = & dee [0];
|
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
|
||||||
auto puee = & uee [0];
|
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
|
||||||
auto pleem = & leem[0];
|
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
|
||||||
auto pueem = & ueem[0];
|
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
|
||||||
|
|
||||||
|
auto plee = & d_lee [0];
|
||||||
|
auto pdee = & d_dee [0];
|
||||||
|
auto puee = & d_uee [0];
|
||||||
|
auto pleem = & d_leem[0];
|
||||||
|
auto pueem = & d_ueem[0];
|
||||||
|
|
||||||
uint64_t nloop = grid->oSites()/Ls;
|
uint64_t nloop = grid->oSites()/Ls;
|
||||||
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
||||||
@@ -182,11 +196,17 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
|
|||||||
autoView(psi , psi_i,AcceleratorRead);
|
autoView(psi , psi_i,AcceleratorRead);
|
||||||
autoView(chi , chi_i,AcceleratorWrite);
|
autoView(chi , chi_i,AcceleratorWrite);
|
||||||
|
|
||||||
auto plee = & lee [0];
|
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
|
||||||
auto pdee = & dee [0];
|
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
|
||||||
auto puee = & uee [0];
|
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
|
||||||
auto pleem = & leem[0];
|
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
|
||||||
auto pueem = & ueem[0];
|
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
|
||||||
|
|
||||||
|
auto plee = & d_lee [0];
|
||||||
|
auto pdee = & d_dee [0];
|
||||||
|
auto puee = & d_uee [0];
|
||||||
|
auto pleem = & d_leem[0];
|
||||||
|
auto pueem = & d_ueem[0];
|
||||||
|
|
||||||
assert(psi.Checkerboard() == psi.Checkerboard());
|
assert(psi.Checkerboard() == psi.Checkerboard());
|
||||||
|
|
||||||
|
|||||||
@@ -41,7 +41,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
// Pplus backwards..
|
// Pplus backwards..
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i,
|
void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i,
|
||||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
|
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
|
||||||
{
|
{
|
||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
@@ -50,9 +50,15 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
|
|||||||
autoView( psi , psi_i, AcceleratorRead);
|
autoView( psi , psi_i, AcceleratorRead);
|
||||||
autoView( chi , chi_i, AcceleratorWrite);
|
autoView( chi , chi_i, AcceleratorWrite);
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
auto pdiag = &diag[0];
|
|
||||||
auto pupper = &upper[0];
|
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t));
|
||||||
auto plower = &lower[0];
|
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
|
||||||
|
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
|
||||||
|
|
||||||
|
auto pdiag = &d_diag[0];
|
||||||
|
auto pupper = &d_upper[0];
|
||||||
|
auto plower = &d_lower[0];
|
||||||
|
|
||||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||||
|
|
||||||
auto nloop=grid->oSites()/Ls;
|
auto nloop=grid->oSites()/Ls;
|
||||||
@@ -73,7 +79,7 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
|
|||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i,
|
void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i,
|
||||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
|
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
|
||||||
{
|
{
|
||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase* grid = psi_i.Grid();
|
GridBase* grid = psi_i.Grid();
|
||||||
@@ -83,9 +89,14 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
|
|||||||
autoView( phi , phi_i, AcceleratorRead);
|
autoView( phi , phi_i, AcceleratorRead);
|
||||||
autoView( chi , chi_i, AcceleratorWrite);
|
autoView( chi , chi_i, AcceleratorWrite);
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
auto pdiag = &diag[0];
|
|
||||||
auto pupper = &upper[0];
|
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t));
|
||||||
auto plower = &lower[0];
|
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
|
||||||
|
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
|
||||||
|
|
||||||
|
auto pdiag = &d_diag[0];
|
||||||
|
auto pupper = &d_upper[0];
|
||||||
|
auto plower = &d_lower[0];
|
||||||
|
|
||||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||||
|
|
||||||
@@ -114,13 +125,18 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
|
|||||||
autoView( chi, chi_i, AcceleratorWrite);
|
autoView( chi, chi_i, AcceleratorWrite);
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
|
|
||||||
auto plee = & this->lee[0];
|
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
|
||||||
auto pdee = & this->dee[0];
|
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
|
||||||
auto puee = & this->uee[0];
|
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
|
||||||
|
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
|
||||||
auto pleem = & this->leem[0];
|
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
|
||||||
auto pueem = & this->ueem[0];
|
|
||||||
|
|
||||||
|
auto plee = & d_lee [0];
|
||||||
|
auto pdee = & d_dee [0];
|
||||||
|
auto puee = & d_uee [0];
|
||||||
|
auto pleem = & d_leem[0];
|
||||||
|
auto pueem = & d_ueem[0];
|
||||||
|
|
||||||
uint64_t nloop=grid->oSites()/Ls;
|
uint64_t nloop=grid->oSites()/Ls;
|
||||||
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
||||||
uint64_t ss=sss*Ls;
|
uint64_t ss=sss*Ls;
|
||||||
|
|||||||
@@ -131,9 +131,9 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi
|
|||||||
else{ shiftm = -shift*(mq3-mq2); }
|
else{ shiftm = -shift*(mq3-mq2); }
|
||||||
}
|
}
|
||||||
|
|
||||||
Vector<Coeff_t> diag(Ls,1.0);
|
std::vector<Coeff_t> diag(Ls,1.0);
|
||||||
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm;
|
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm;
|
||||||
Vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftp;
|
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftp;
|
||||||
|
|
||||||
#if(0)
|
#if(0)
|
||||||
std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl;
|
std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl;
|
||||||
@@ -168,9 +168,9 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField&
|
|||||||
else{ shiftm = -shift*(mq3-mq2); }
|
else{ shiftm = -shift*(mq3-mq2); }
|
||||||
}
|
}
|
||||||
|
|
||||||
Vector<Coeff_t> diag(Ls,1.0);
|
std::vector<Coeff_t> diag(Ls,1.0);
|
||||||
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp;
|
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp;
|
||||||
Vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftm;
|
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftm;
|
||||||
|
|
||||||
this->M5Ddag(psi, chi, chi, lower, diag, upper);
|
this->M5Ddag(psi, chi, chi, lower, diag, upper);
|
||||||
}
|
}
|
||||||
@@ -181,9 +181,9 @@ void DomainWallEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& c
|
|||||||
{
|
{
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
|
|
||||||
Vector<Coeff_t> diag = this->bee;
|
std::vector<Coeff_t> diag = this->bee;
|
||||||
Vector<Coeff_t> upper(Ls);
|
std::vector<Coeff_t> upper(Ls);
|
||||||
Vector<Coeff_t> lower(Ls);
|
std::vector<Coeff_t> lower(Ls);
|
||||||
|
|
||||||
for(int s=0; s<Ls; s++){
|
for(int s=0; s<Ls; s++){
|
||||||
upper[s] = -this->cee[s];
|
upper[s] = -this->cee[s];
|
||||||
@@ -200,9 +200,9 @@ void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField
|
|||||||
{
|
{
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
|
|
||||||
Vector<Coeff_t> diag = this->bee;
|
std::vector<Coeff_t> diag = this->bee;
|
||||||
Vector<Coeff_t> upper(Ls);
|
std::vector<Coeff_t> upper(Ls);
|
||||||
Vector<Coeff_t> lower(Ls);
|
std::vector<Coeff_t> lower(Ls);
|
||||||
|
|
||||||
for(int s=0; s<Ls; s++){
|
for(int s=0; s<Ls; s++){
|
||||||
upper[s] = -this->cee[s];
|
upper[s] = -this->cee[s];
|
||||||
@@ -218,7 +218,7 @@ void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField
|
|||||||
|
|
||||||
//Zolo
|
//Zolo
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c)
|
void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c)
|
||||||
{
|
{
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
int pm = this->pm;
|
int pm = this->pm;
|
||||||
|
|||||||
@@ -61,8 +61,6 @@ ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian
|
|||||||
UUUmu(&FourDimGrid),
|
UUUmu(&FourDimGrid),
|
||||||
UUUmuEven(&FourDimRedBlackGrid),
|
UUUmuEven(&FourDimRedBlackGrid),
|
||||||
UUUmuOdd(&FourDimRedBlackGrid),
|
UUUmuOdd(&FourDimRedBlackGrid),
|
||||||
Lebesgue(&FourDimGrid),
|
|
||||||
LebesgueEvenOdd(&FourDimRedBlackGrid),
|
|
||||||
_tmp(&FiveDimRedBlackGrid)
|
_tmp(&FiveDimRedBlackGrid)
|
||||||
{
|
{
|
||||||
|
|
||||||
@@ -277,18 +275,18 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
|
|||||||
|
|
||||||
/*CHANGE */
|
/*CHANGE */
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st,
|
||||||
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||||
const FermionField &in, FermionField &out,int dag)
|
const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
||||||
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
|
DhopInternalOverlappedComms(st,U,UUU,in,out,dag);
|
||||||
else
|
else
|
||||||
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
|
DhopInternalSerialComms(st,U,UUU,in,out,dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
|
void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
|
||||||
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||||
const FermionField &in, FermionField &out,int dag)
|
const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
@@ -313,7 +311,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
|
|||||||
{
|
{
|
||||||
int interior=1;
|
int interior=1;
|
||||||
int exterior=0;
|
int exterior=0;
|
||||||
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
|
||||||
}
|
}
|
||||||
|
|
||||||
st.CommsMerge(compressor);
|
st.CommsMerge(compressor);
|
||||||
@@ -323,12 +321,12 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
|
|||||||
{
|
{
|
||||||
int interior=0;
|
int interior=0;
|
||||||
int exterior=1;
|
int exterior=1;
|
||||||
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
|
void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
|
||||||
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||||
const FermionField &in, FermionField &out,int dag)
|
const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
@@ -341,7 +339,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
|
|||||||
{
|
{
|
||||||
int interior=1;
|
int interior=1;
|
||||||
int exterior=1;
|
int exterior=1;
|
||||||
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/*CHANGE END*/
|
/*CHANGE END*/
|
||||||
@@ -357,7 +355,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie
|
|||||||
assert(in.Checkerboard()==Even);
|
assert(in.Checkerboard()==Even);
|
||||||
out.Checkerboard() = Odd;
|
out.Checkerboard() = Odd;
|
||||||
|
|
||||||
DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,UUUmuOdd,in,out,dag);
|
DhopInternal(StencilEven,UmuOdd,UUUmuOdd,in,out,dag);
|
||||||
}
|
}
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
|
void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
|
||||||
@@ -368,7 +366,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie
|
|||||||
assert(in.Checkerboard()==Odd);
|
assert(in.Checkerboard()==Odd);
|
||||||
out.Checkerboard() = Even;
|
out.Checkerboard() = Even;
|
||||||
|
|
||||||
DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,UUUmuEven,in,out,dag);
|
DhopInternal(StencilOdd,UmuEven,UUUmuEven,in,out,dag);
|
||||||
}
|
}
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
|
void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
|
||||||
@@ -378,7 +376,7 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField
|
|||||||
|
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
|
||||||
DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
|
DhopInternal(Stencil,Umu,UUUmu,in,out,dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////
|
||||||
|
|||||||
@@ -48,8 +48,6 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, G
|
|||||||
StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even
|
StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even
|
||||||
StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd
|
StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd
|
||||||
mass(_mass),
|
mass(_mass),
|
||||||
Lebesgue(_grid),
|
|
||||||
LebesgueEvenOdd(_cbgrid),
|
|
||||||
Umu(&Fgrid),
|
Umu(&Fgrid),
|
||||||
UmuEven(&Hgrid),
|
UmuEven(&Hgrid),
|
||||||
UmuOdd(&Hgrid),
|
UmuOdd(&Hgrid),
|
||||||
@@ -339,7 +337,7 @@ void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &
|
|||||||
|
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
|
||||||
DhopInternal(Stencil, Lebesgue, Umu, UUUmu, in, out, dag);
|
DhopInternal(Stencil, Umu, UUUmu, in, out, dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@@ -351,7 +349,7 @@ void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField
|
|||||||
assert(in.Checkerboard() == Even);
|
assert(in.Checkerboard() == Even);
|
||||||
out.Checkerboard() = Odd;
|
out.Checkerboard() = Odd;
|
||||||
|
|
||||||
DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, UUUmuOdd, in, out, dag);
|
DhopInternal(StencilEven, UmuOdd, UUUmuOdd, in, out, dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@@ -363,7 +361,7 @@ void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField
|
|||||||
assert(in.Checkerboard() == Odd);
|
assert(in.Checkerboard() == Odd);
|
||||||
out.Checkerboard() = Even;
|
out.Checkerboard() = Even;
|
||||||
|
|
||||||
DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, UUUmuEven, in, out, dag);
|
DhopInternal(StencilOdd, UmuEven, UUUmuEven, in, out, dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@@ -394,19 +392,19 @@ void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionFiel
|
|||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
|
void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
DoubledGaugeField &UUU,
|
DoubledGaugeField &UUU,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out, int dag)
|
FermionField &out, int dag)
|
||||||
{
|
{
|
||||||
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
||||||
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
|
DhopInternalOverlappedComms(st,U,UUU,in,out,dag);
|
||||||
else
|
else
|
||||||
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
|
DhopInternalSerialComms(st,U,UUU,in,out,dag);
|
||||||
}
|
}
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
|
void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
DoubledGaugeField &UUU,
|
DoubledGaugeField &UUU,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
@@ -429,7 +427,7 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
|
|||||||
{
|
{
|
||||||
int interior=1;
|
int interior=1;
|
||||||
int exterior=0;
|
int exterior=0;
|
||||||
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
|
||||||
}
|
}
|
||||||
|
|
||||||
st.CommunicateComplete(requests);
|
st.CommunicateComplete(requests);
|
||||||
@@ -440,13 +438,13 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
|
|||||||
{
|
{
|
||||||
int interior=0;
|
int interior=0;
|
||||||
int exterior=1;
|
int exterior=1;
|
||||||
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
|
void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
DoubledGaugeField &UUU,
|
DoubledGaugeField &UUU,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
@@ -460,7 +458,7 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
|
|||||||
{
|
{
|
||||||
int interior=1;
|
int interior=1;
|
||||||
int exterior=1;
|
int exterior=1;
|
||||||
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
|
void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
|
||||||
Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper)
|
std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
|
||||||
{
|
{
|
||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
@@ -50,9 +50,13 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
|
|||||||
|
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
|
|
||||||
auto pdiag = &diag[0];
|
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t));
|
||||||
auto pupper = &upper[0];
|
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
|
||||||
auto plower = &lower[0];
|
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
|
||||||
|
|
||||||
|
auto pdiag = &d_diag[0];
|
||||||
|
auto pupper = &d_upper[0];
|
||||||
|
auto plower = &d_lower[0];
|
||||||
|
|
||||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||||
int nloop = grid->oSites()/Ls;
|
int nloop = grid->oSites()/Ls;
|
||||||
@@ -74,8 +78,8 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
|
|||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
|
void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
|
||||||
Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper,
|
std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
|
||||||
Vector<Coeff_t> &shift_coeffs)
|
std::vector<Coeff_t> &shift_coeffs)
|
||||||
{
|
{
|
||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
@@ -86,13 +90,18 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
|
|||||||
|
|
||||||
auto pm = this->pm;
|
auto pm = this->pm;
|
||||||
int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
|
int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
|
||||||
|
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
|
|
||||||
auto pdiag = &diag[0];
|
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t));
|
||||||
auto pupper = &upper[0];
|
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
|
||||||
auto plower = &lower[0];
|
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
|
||||||
auto pshift_coeffs = &shift_coeffs[0];
|
static deviceVector<Coeff_t> d_shift_coeffs(Ls);acceleratorCopyToDevice(&shift_coeffs[0],&d_shift_coeffs[0],Ls*sizeof(Coeff_t));
|
||||||
|
|
||||||
|
auto pdiag = &d_diag[0];
|
||||||
|
auto pupper = &d_upper[0];
|
||||||
|
auto plower = &d_lower[0];
|
||||||
|
auto pshift_coeffs = &d_shift_coeffs[0];
|
||||||
|
|
||||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||||
int nloop = grid->oSites()/Ls;
|
int nloop = grid->oSites()/Ls;
|
||||||
@@ -119,7 +128,7 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
|
|||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
|
void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
|
||||||
Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper)
|
std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
|
||||||
{
|
{
|
||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
@@ -130,9 +139,13 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
|
|||||||
|
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
|
|
||||||
auto pdiag = &diag[0];
|
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t));
|
||||||
auto pupper = &upper[0];
|
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
|
||||||
auto plower = &lower[0];
|
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
|
||||||
|
|
||||||
|
auto pdiag = &d_diag[0];
|
||||||
|
auto pupper = &d_upper[0];
|
||||||
|
auto plower = &d_lower[0];
|
||||||
|
|
||||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||||
int nloop = grid->oSites()/Ls;
|
int nloop = grid->oSites()/Ls;
|
||||||
@@ -154,8 +167,8 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
|
|||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
|
void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
|
||||||
Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper,
|
std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
|
||||||
Vector<Coeff_t> &shift_coeffs)
|
std::vector<Coeff_t> &shift_coeffs)
|
||||||
{
|
{
|
||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
@@ -167,10 +180,15 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
|
|||||||
|
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
|
|
||||||
auto pdiag = &diag[0];
|
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t));
|
||||||
auto pupper = &upper[0];
|
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
|
||||||
auto plower = &lower[0];
|
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
|
||||||
auto pshift_coeffs = &shift_coeffs[0];
|
static deviceVector<Coeff_t> d_shift_coeffs(Ls);acceleratorCopyToDevice(&shift_coeffs[0],&d_shift_coeffs[0],Ls*sizeof(Coeff_t));
|
||||||
|
|
||||||
|
auto pdiag = &d_diag[0];
|
||||||
|
auto pupper = &d_upper[0];
|
||||||
|
auto plower = &d_lower[0];
|
||||||
|
auto pshift_coeffs = &d_shift_coeffs[0];
|
||||||
|
|
||||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||||
auto pm = this->pm;
|
auto pm = this->pm;
|
||||||
@@ -212,11 +230,17 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
|
|||||||
autoView(psi , psi_i, AcceleratorRead);
|
autoView(psi , psi_i, AcceleratorRead);
|
||||||
autoView(chi , chi_i, AcceleratorWrite);
|
autoView(chi , chi_i, AcceleratorWrite);
|
||||||
|
|
||||||
auto plee = & this->lee [0];
|
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
|
||||||
auto pdee = & this->dee [0];
|
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
|
||||||
auto puee = & this->uee [0];
|
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
|
||||||
auto pleem= & this->leem[0];
|
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
|
||||||
auto pueem= & this->ueem[0];
|
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
|
||||||
|
|
||||||
|
auto plee = & d_lee [0];
|
||||||
|
auto pdee = & d_dee [0];
|
||||||
|
auto puee = & d_uee [0];
|
||||||
|
auto pleem = & d_leem[0];
|
||||||
|
auto pueem = & d_ueem[0];
|
||||||
|
|
||||||
if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; }
|
if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; }
|
||||||
|
|
||||||
@@ -268,14 +292,24 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
|
|||||||
autoView(psi , psi_i, AcceleratorRead);
|
autoView(psi , psi_i, AcceleratorRead);
|
||||||
autoView(chi , chi_i, AcceleratorWrite);
|
autoView(chi , chi_i, AcceleratorWrite);
|
||||||
|
|
||||||
|
// Move into object and constructor
|
||||||
|
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
|
||||||
|
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
|
||||||
|
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
|
||||||
|
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
|
||||||
|
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
|
||||||
|
|
||||||
auto pm = this->pm;
|
auto pm = this->pm;
|
||||||
auto plee = & this->lee [0];
|
auto plee = & d_lee [0];
|
||||||
auto pdee = & this->dee [0];
|
auto pdee = & d_dee [0];
|
||||||
auto puee = & this->uee [0];
|
auto puee = & d_uee [0];
|
||||||
auto pleem= & this->leem[0];
|
auto pleem = & d_leem[0];
|
||||||
auto pueem= & this->ueem[0];
|
auto pueem = & d_ueem[0];
|
||||||
auto pMooeeInv_shift_lc = &MooeeInv_shift_lc[0];
|
|
||||||
auto pMooeeInv_shift_norm = &MooeeInv_shift_norm[0];
|
static deviceVector<Coeff_t> d_MooeeInv_shift_lc(Ls); acceleratorCopyToDevice(&MooeeInv_shift_lc[0],&d_MooeeInv_shift_lc[0],Ls*sizeof(Coeff_t));
|
||||||
|
static deviceVector<Coeff_t> d_MooeeInv_shift_norm(Ls); acceleratorCopyToDevice(&MooeeInv_shift_norm[0],&d_MooeeInv_shift_norm[0],Ls*sizeof(Coeff_t));
|
||||||
|
auto pMooeeInv_shift_lc = &d_MooeeInv_shift_lc[0];
|
||||||
|
auto pMooeeInv_shift_norm = &d_MooeeInv_shift_norm[0];
|
||||||
|
|
||||||
int nloop = grid->oSites()/Ls;
|
int nloop = grid->oSites()/Ls;
|
||||||
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
||||||
@@ -333,11 +367,17 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
|
|||||||
autoView(psi , psi_i, AcceleratorRead);
|
autoView(psi , psi_i, AcceleratorRead);
|
||||||
autoView(chi , chi_i, AcceleratorWrite);
|
autoView(chi , chi_i, AcceleratorWrite);
|
||||||
|
|
||||||
auto plee = & this->lee [0];
|
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
|
||||||
auto pdee = & this->dee [0];
|
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
|
||||||
auto puee = & this->uee [0];
|
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
|
||||||
auto pleem= & this->leem[0];
|
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
|
||||||
auto pueem= & this->ueem[0];
|
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
|
||||||
|
|
||||||
|
auto plee = & d_lee [0];
|
||||||
|
auto pdee = & d_dee [0];
|
||||||
|
auto puee = & d_uee [0];
|
||||||
|
auto pleem = & d_leem[0];
|
||||||
|
auto pueem = & d_ueem[0];
|
||||||
|
|
||||||
int nloop = grid->oSites()/Ls;
|
int nloop = grid->oSites()/Ls;
|
||||||
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
||||||
@@ -386,14 +426,28 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
|
|||||||
autoView(chi , chi_i, AcceleratorWrite);
|
autoView(chi , chi_i, AcceleratorWrite);
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
|
|
||||||
|
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
|
||||||
|
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
|
||||||
|
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
|
||||||
|
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
|
||||||
|
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
|
||||||
|
|
||||||
auto pm = this->pm;
|
auto pm = this->pm;
|
||||||
auto plee = & this->lee [0];
|
auto plee = & d_lee [0];
|
||||||
auto pdee = & this->dee [0];
|
auto pdee = & d_dee [0];
|
||||||
auto puee = & this->uee [0];
|
auto puee = & d_uee [0];
|
||||||
auto pleem= & this->leem[0];
|
auto pleem = & d_leem[0];
|
||||||
auto pueem= & this->ueem[0];
|
auto pueem = & d_ueem[0];
|
||||||
auto pMooeeInvDag_shift_lc = &MooeeInvDag_shift_lc[0];
|
|
||||||
auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0];
|
static deviceVector<Coeff_t> d_MooeeInvDag_shift_lc(Ls);
|
||||||
|
static deviceVector<Coeff_t> d_MooeeInvDag_shift_norm(Ls);
|
||||||
|
acceleratorCopyToDevice(&MooeeInvDag_shift_lc[0],&d_MooeeInvDag_shift_lc[0],Ls*sizeof(Coeff_t));
|
||||||
|
acceleratorCopyToDevice(&MooeeInvDag_shift_norm[0],&d_MooeeInvDag_shift_norm[0],Ls*sizeof(Coeff_t));
|
||||||
|
auto pMooeeInvDag_shift_lc = &d_MooeeInvDag_shift_lc[0];
|
||||||
|
auto pMooeeInvDag_shift_norm = &d_MooeeInvDag_shift_norm[0];
|
||||||
|
|
||||||
|
// auto pMooeeInvDag_shift_lc = &MooeeInvDag_shift_lc[0];
|
||||||
|
// auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0];
|
||||||
|
|
||||||
int nloop = grid->oSites()/Ls;
|
int nloop = grid->oSites()/Ls;
|
||||||
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
||||||
|
|||||||
@@ -196,9 +196,9 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
|
|||||||
{
|
{
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
|
|
||||||
Vector<Coeff_t> diag(Ls,1.0);
|
std::vector<Coeff_t> diag(Ls,1.0);
|
||||||
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
|
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
|
||||||
Vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;
|
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;
|
||||||
|
|
||||||
// no shift term
|
// no shift term
|
||||||
if(this->shift == 0.0){ this->M5D(psi, chi, chi, lower, diag, upper); }
|
if(this->shift == 0.0){ this->M5D(psi, chi, chi, lower, diag, upper); }
|
||||||
@@ -212,9 +212,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
|
|||||||
{
|
{
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
|
|
||||||
Vector<Coeff_t> diag(Ls,1.0);
|
std::vector<Coeff_t> diag(Ls,1.0);
|
||||||
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
|
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
|
||||||
Vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;
|
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;
|
||||||
|
|
||||||
// no shift term
|
// no shift term
|
||||||
if(this->shift == 0.0){ this->M5Ddag(psi, chi, chi, lower, diag, upper); }
|
if(this->shift == 0.0){ this->M5Ddag(psi, chi, chi, lower, diag, upper); }
|
||||||
@@ -230,9 +230,9 @@ void MobiusEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
|
|||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
|
|
||||||
// coefficients of Mooee
|
// coefficients of Mooee
|
||||||
Vector<Coeff_t> diag = this->bee;
|
std::vector<Coeff_t> diag = this->bee;
|
||||||
Vector<Coeff_t> upper(Ls);
|
std::vector<Coeff_t> upper(Ls);
|
||||||
Vector<Coeff_t> lower(Ls);
|
std::vector<Coeff_t> lower(Ls);
|
||||||
for(int s=0; s<Ls; s++){
|
for(int s=0; s<Ls; s++){
|
||||||
upper[s] = -this->cee[s];
|
upper[s] = -this->cee[s];
|
||||||
lower[s] = -this->cee[s];
|
lower[s] = -this->cee[s];
|
||||||
@@ -253,9 +253,9 @@ void MobiusEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& ch
|
|||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
|
|
||||||
// coefficients of MooeeDag
|
// coefficients of MooeeDag
|
||||||
Vector<Coeff_t> diag = this->bee;
|
std::vector<Coeff_t> diag = this->bee;
|
||||||
Vector<Coeff_t> upper(Ls);
|
std::vector<Coeff_t> upper(Ls);
|
||||||
Vector<Coeff_t> lower(Ls);
|
std::vector<Coeff_t> lower(Ls);
|
||||||
for(int s=0; s<Ls; s++){
|
for(int s=0; s<Ls; s++){
|
||||||
if(s==0) {
|
if(s==0) {
|
||||||
upper[s] = -this->cee[s+1];
|
upper[s] = -this->cee[s+1];
|
||||||
@@ -314,10 +314,10 @@ void MobiusEOFAFermion<Impl>::SetCoefficientsPrecondShiftOps()
|
|||||||
// Tridiagonal solve for MooeeInvDag_shift_lc
|
// Tridiagonal solve for MooeeInvDag_shift_lc
|
||||||
{
|
{
|
||||||
Coeff_t m(0.0);
|
Coeff_t m(0.0);
|
||||||
Vector<Coeff_t> d = Mooee_shift;
|
std::vector<Coeff_t> d = Mooee_shift;
|
||||||
Vector<Coeff_t> u(Ls,0.0);
|
std::vector<Coeff_t> u(Ls,0.0);
|
||||||
Vector<Coeff_t> y(Ls,0.0);
|
std::vector<Coeff_t> y(Ls,0.0);
|
||||||
Vector<Coeff_t> q(Ls,0.0);
|
std::vector<Coeff_t> q(Ls,0.0);
|
||||||
if(pm == 1){ u[0] = 1.0; }
|
if(pm == 1){ u[0] = 1.0; }
|
||||||
else{ u[Ls-1] = 1.0; }
|
else{ u[Ls-1] = 1.0; }
|
||||||
|
|
||||||
|
|||||||
@@ -48,8 +48,6 @@ NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GridCartesian &Fgrid, GridRed
|
|||||||
StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even
|
StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even
|
||||||
StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd
|
StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd
|
||||||
mass(_mass),
|
mass(_mass),
|
||||||
Lebesgue(_grid),
|
|
||||||
LebesgueEvenOdd(_cbgrid),
|
|
||||||
Umu(&Fgrid),
|
Umu(&Fgrid),
|
||||||
UmuEven(&Hgrid),
|
UmuEven(&Hgrid),
|
||||||
UmuOdd(&Hgrid),
|
UmuOdd(&Hgrid),
|
||||||
@@ -268,7 +266,7 @@ void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out
|
|||||||
|
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
|
||||||
DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
|
DhopInternal(Stencil, Umu, in, out, dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@@ -280,7 +278,7 @@ void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &o
|
|||||||
assert(in.Checkerboard() == Even);
|
assert(in.Checkerboard() == Even);
|
||||||
out.Checkerboard() = Odd;
|
out.Checkerboard() = Odd;
|
||||||
|
|
||||||
DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
|
DhopInternal(StencilEven, UmuOdd, in, out, dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@@ -292,7 +290,7 @@ void NaiveStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &o
|
|||||||
assert(in.Checkerboard() == Odd);
|
assert(in.Checkerboard() == Odd);
|
||||||
out.Checkerboard() = Even;
|
out.Checkerboard() = Even;
|
||||||
|
|
||||||
DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
|
DhopInternal(StencilOdd, UmuEven, in, out, dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@@ -323,18 +321,18 @@ void NaiveStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &
|
|||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
|
void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out, int dag)
|
FermionField &out, int dag)
|
||||||
{
|
{
|
||||||
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
||||||
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
|
DhopInternalOverlappedComms(st,U,in,out,dag);
|
||||||
else
|
else
|
||||||
DhopInternalSerialComms(st,lo,U,in,out,dag);
|
DhopInternalSerialComms(st,U,in,out,dag);
|
||||||
}
|
}
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
|
void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out, int dag)
|
FermionField &out, int dag)
|
||||||
@@ -356,7 +354,7 @@ void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, L
|
|||||||
{
|
{
|
||||||
int interior=1;
|
int interior=1;
|
||||||
int exterior=0;
|
int exterior=0;
|
||||||
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
|
Kernels::DhopNaive(st,U,in,out,dag,interior,exterior);
|
||||||
}
|
}
|
||||||
|
|
||||||
st.CommunicateComplete(requests);
|
st.CommunicateComplete(requests);
|
||||||
@@ -367,12 +365,12 @@ void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, L
|
|||||||
{
|
{
|
||||||
int interior=0;
|
int interior=0;
|
||||||
int exterior=1;
|
int exterior=1;
|
||||||
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
|
Kernels::DhopNaive(st,U,in,out,dag,interior,exterior);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
|
void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out, int dag)
|
FermionField &out, int dag)
|
||||||
@@ -385,7 +383,7 @@ void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Lebes
|
|||||||
{
|
{
|
||||||
int interior=1;
|
int interior=1;
|
||||||
int exterior=1;
|
int exterior=1;
|
||||||
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
|
Kernels::DhopNaive(st,U,in,out,dag,interior,exterior);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -375,23 +375,6 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \
|
|
||||||
template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
|
|
||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
|
|
||||||
SiteSpinor *buf, int LLs, int sU, \
|
|
||||||
const FermionFieldView &in, FermionFieldView &out, int dag); \
|
|
||||||
\
|
|
||||||
template void StaggeredKernels<IMPL>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \
|
|
||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
|
|
||||||
SiteSpinor *buf, int LLs, int sU, \
|
|
||||||
const FermionFieldView &in, FermionFieldView &out, int dag); \
|
|
||||||
\
|
|
||||||
template void StaggeredKernels<IMPL>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \
|
|
||||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
|
|
||||||
SiteSpinor *buf, int LLs, int sU, \
|
|
||||||
const FermionFieldView &in, FermionFieldView &out, int dag); \
|
|
||||||
*/
|
|
||||||
#undef LOAD_CHI
|
#undef LOAD_CHI
|
||||||
#undef HAND_DECLARATIONS
|
#undef HAND_DECLARATIONS
|
||||||
|
|
||||||
|
|||||||
@@ -256,7 +256,7 @@ void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldVie
|
|||||||
});
|
});
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
|
void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st,
|
||||||
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
|
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
|
||||||
{
|
{
|
||||||
@@ -294,7 +294,7 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
assert(0 && " Kernel optimisation case not covered ");
|
assert(0 && " Kernel optimisation case not covered ");
|
||||||
}
|
}
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
|
void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
|
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -58,15 +58,9 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
|
|||||||
Umu(_FourDimGrid),
|
Umu(_FourDimGrid),
|
||||||
UmuEven(_FourDimRedBlackGrid),
|
UmuEven(_FourDimRedBlackGrid),
|
||||||
UmuOdd (_FourDimRedBlackGrid),
|
UmuOdd (_FourDimRedBlackGrid),
|
||||||
Lebesgue(_FourDimGrid),
|
|
||||||
LebesgueEvenOdd(_FourDimRedBlackGrid),
|
|
||||||
_tmp(&FiveDimRedBlackGrid),
|
_tmp(&FiveDimRedBlackGrid),
|
||||||
Dirichlet(0)
|
Dirichlet(0)
|
||||||
{
|
{
|
||||||
Stencil.lo = &Lebesgue;
|
|
||||||
StencilEven.lo = &LebesgueEvenOdd;
|
|
||||||
StencilOdd.lo = &LebesgueEvenOdd;
|
|
||||||
|
|
||||||
// some assertions
|
// some assertions
|
||||||
assert(FiveDimGrid._ndimension==5);
|
assert(FiveDimGrid._ndimension==5);
|
||||||
assert(FourDimGrid._ndimension==4);
|
assert(FourDimGrid._ndimension==4);
|
||||||
@@ -305,19 +299,19 @@ void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st,
|
||||||
DoubledGaugeField & U,
|
DoubledGaugeField & U,
|
||||||
const FermionField &in, FermionField &out,int dag)
|
const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
|
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
|
||||||
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
|
DhopInternalOverlappedComms(st,U,in,out,dag);
|
||||||
else
|
else
|
||||||
DhopInternalSerialComms(st,lo,U,in,out,dag);
|
DhopInternalSerialComms(st,U,in,out,dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
|
void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
|
||||||
DoubledGaugeField & U,
|
DoubledGaugeField & U,
|
||||||
const FermionField &in, FermionField &out,int dag)
|
const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
@@ -331,10 +325,12 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
|
|||||||
// Start comms // Gather intranode and extra node differentiated??
|
// Start comms // Gather intranode and extra node differentiated??
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
{
|
{
|
||||||
|
// std::cout << " WilsonFermion5D gather " <<std::endl;
|
||||||
GRID_TRACE("Gather");
|
GRID_TRACE("Gather");
|
||||||
st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
|
st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// std::cout << " WilsonFermion5D Communicate Begin " <<std::endl;
|
||||||
std::vector<std::vector<CommsRequest_t> > requests;
|
std::vector<std::vector<CommsRequest_t> > requests;
|
||||||
auto id=traceStart("Communicate overlapped");
|
auto id=traceStart("Communicate overlapped");
|
||||||
st.CommunicateBegin(requests);
|
st.CommunicateBegin(requests);
|
||||||
@@ -343,6 +339,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
|
|||||||
// Overlap with comms
|
// Overlap with comms
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
{
|
{
|
||||||
|
// std::cout << " WilsonFermion5D Comms merge " <<std::endl;
|
||||||
GRID_TRACE("MergeSHM");
|
GRID_TRACE("MergeSHM");
|
||||||
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
|
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
|
||||||
}
|
}
|
||||||
@@ -350,6 +347,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
|
|||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// do the compute interior
|
// do the compute interior
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
|
// std::cout << " WilsonFermion5D Interior " <<std::endl;
|
||||||
int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
|
int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
|
||||||
if (dag == DaggerYes) {
|
if (dag == DaggerYes) {
|
||||||
GRID_TRACE("DhopDagInterior");
|
GRID_TRACE("DhopDagInterior");
|
||||||
@@ -362,6 +360,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
|
|||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// Complete comms
|
// Complete comms
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
|
// std::cout << " WilsonFermion5D Comms Complete " <<std::endl;
|
||||||
st.CommunicateComplete(requests);
|
st.CommunicateComplete(requests);
|
||||||
traceStop(id);
|
traceStop(id);
|
||||||
|
|
||||||
@@ -369,11 +368,13 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
|
|||||||
// do the compute exterior
|
// do the compute exterior
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
{
|
{
|
||||||
|
// std::cout << " WilsonFermion5D Comms Merge " <<std::endl;
|
||||||
GRID_TRACE("Merge");
|
GRID_TRACE("Merge");
|
||||||
st.CommsMerge(compressor);
|
st.CommsMerge(compressor);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// std::cout << " WilsonFermion5D Exterior " <<std::endl;
|
||||||
if (dag == DaggerYes) {
|
if (dag == DaggerYes) {
|
||||||
GRID_TRACE("DhopDagExterior");
|
GRID_TRACE("DhopDagExterior");
|
||||||
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
|
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
|
||||||
@@ -381,11 +382,12 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
|
|||||||
GRID_TRACE("DhopExterior");
|
GRID_TRACE("DhopExterior");
|
||||||
Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
|
Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
|
||||||
}
|
}
|
||||||
|
// std::cout << " WilsonFermion5D Done " <<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
|
void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
|
||||||
DoubledGaugeField & U,
|
DoubledGaugeField & U,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out,int dag)
|
FermionField &out,int dag)
|
||||||
@@ -395,11 +397,13 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
|
|||||||
|
|
||||||
int LLs = in.Grid()->_rdimensions[0];
|
int LLs = in.Grid()->_rdimensions[0];
|
||||||
|
|
||||||
|
// std::cout << " WilsonFermion5D Halo exch " <<std::endl;
|
||||||
{
|
{
|
||||||
GRID_TRACE("HaloExchange");
|
GRID_TRACE("HaloExchange");
|
||||||
st.HaloExchangeOpt(in,compressor);
|
st.HaloExchangeOpt(in,compressor);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// std::cout << " WilsonFermion5D Dhop " <<std::endl;
|
||||||
int Opt = WilsonKernelsStatic::Opt;
|
int Opt = WilsonKernelsStatic::Opt;
|
||||||
if (dag == DaggerYes) {
|
if (dag == DaggerYes) {
|
||||||
GRID_TRACE("DhopDag");
|
GRID_TRACE("DhopDag");
|
||||||
@@ -408,6 +412,7 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
|
|||||||
GRID_TRACE("Dhop");
|
GRID_TRACE("Dhop");
|
||||||
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
|
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
|
||||||
}
|
}
|
||||||
|
// std::cout << " WilsonFermion5D Done " <<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -420,7 +425,7 @@ void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int
|
|||||||
assert(in.Checkerboard()==Even);
|
assert(in.Checkerboard()==Even);
|
||||||
out.Checkerboard() = Odd;
|
out.Checkerboard() = Odd;
|
||||||
|
|
||||||
DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,in,out,dag);
|
DhopInternal(StencilEven,UmuOdd,in,out,dag);
|
||||||
}
|
}
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
|
void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
|
||||||
@@ -431,7 +436,7 @@ void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int
|
|||||||
assert(in.Checkerboard()==Odd);
|
assert(in.Checkerboard()==Odd);
|
||||||
out.Checkerboard() = Even;
|
out.Checkerboard() = Even;
|
||||||
|
|
||||||
DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,in,out,dag);
|
DhopInternal(StencilOdd,UmuEven,in,out,dag);
|
||||||
}
|
}
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
|
void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
|
||||||
@@ -441,7 +446,7 @@ void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int d
|
|||||||
|
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
|
||||||
DhopInternal(Stencil,Lebesgue,Umu,in,out,dag);
|
DhopInternal(Stencil,Umu,in,out,dag);
|
||||||
}
|
}
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag)
|
void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag)
|
||||||
|
|||||||
@@ -52,17 +52,12 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
|
|||||||
StencilEven(&Hgrid, npoint, Even, directions,displacements,p), // source is Even
|
StencilEven(&Hgrid, npoint, Even, directions,displacements,p), // source is Even
|
||||||
StencilOdd(&Hgrid, npoint, Odd, directions,displacements,p), // source is Odd
|
StencilOdd(&Hgrid, npoint, Odd, directions,displacements,p), // source is Odd
|
||||||
mass(_mass),
|
mass(_mass),
|
||||||
Lebesgue(_grid),
|
|
||||||
LebesgueEvenOdd(_cbgrid),
|
|
||||||
Umu(&Fgrid),
|
Umu(&Fgrid),
|
||||||
UmuEven(&Hgrid),
|
UmuEven(&Hgrid),
|
||||||
UmuOdd(&Hgrid),
|
UmuOdd(&Hgrid),
|
||||||
_tmp(&Hgrid),
|
_tmp(&Hgrid),
|
||||||
anisotropyCoeff(anis)
|
anisotropyCoeff(anis)
|
||||||
{
|
{
|
||||||
Stencil.lo = &Lebesgue;
|
|
||||||
StencilEven.lo = &LebesgueEvenOdd;
|
|
||||||
StencilOdd.lo = &LebesgueEvenOdd;
|
|
||||||
// Allocate the required comms buffer
|
// Allocate the required comms buffer
|
||||||
ImportGauge(_Umu);
|
ImportGauge(_Umu);
|
||||||
if (anisotropyCoeff.isAnisotropic){
|
if (anisotropyCoeff.isAnisotropic){
|
||||||
@@ -314,7 +309,7 @@ void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int da
|
|||||||
|
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
|
||||||
DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
|
DhopInternal(Stencil, Umu, in, out, dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@@ -326,7 +321,7 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int
|
|||||||
assert(in.Checkerboard() == Even);
|
assert(in.Checkerboard() == Even);
|
||||||
out.Checkerboard() = Odd;
|
out.Checkerboard() = Odd;
|
||||||
|
|
||||||
DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
|
DhopInternal(StencilEven, UmuOdd, in, out, dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@@ -338,7 +333,7 @@ void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int d
|
|||||||
assert(in.Checkerboard() == Odd);
|
assert(in.Checkerboard() == Odd);
|
||||||
out.Checkerboard() = Even;
|
out.Checkerboard() = Even;
|
||||||
|
|
||||||
DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
|
DhopInternal(StencilOdd, UmuEven, in, out, dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@@ -391,21 +386,21 @@ void WilsonFermion<Impl>::DhopDirCalc(const FermionField &in, FermionField &out,
|
|||||||
};
|
};
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
|
void WilsonFermion<Impl>::DhopInternal(StencilImpl &st,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out, int dag)
|
FermionField &out, int dag)
|
||||||
{
|
{
|
||||||
#ifdef GRID_OMP
|
#ifdef GRID_OMP
|
||||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
|
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
|
||||||
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
|
DhopInternalOverlappedComms(st,U,in,out,dag);
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
DhopInternalSerial(st,lo,U,in,out,dag);
|
DhopInternalSerial(st,U,in,out,dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
|
void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out, int dag)
|
FermionField &out, int dag)
|
||||||
@@ -474,10 +469,10 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
|
|||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
|
void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out, int dag)
|
FermionField &out, int dag)
|
||||||
{
|
{
|
||||||
GRID_TRACE("DhopSerial");
|
GRID_TRACE("DhopSerial");
|
||||||
assert((dag == DaggerNo) || (dag == DaggerYes));
|
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||||
|
|||||||
@@ -40,11 +40,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
/// Switch off the 5d vectorised code optimisations
|
/// Switch off the 5d vectorised code optimisations
|
||||||
#undef DWFVEC5D
|
#undef DWFVEC5D
|
||||||
|
|
||||||
static Vector<vComplexF> signsF;
|
static std::vector<vComplexF> signsF;
|
||||||
|
|
||||||
template<typename vtype>
|
template<typename vtype>
|
||||||
int setupSigns(Vector<vtype>& signs ){
|
int setupSigns(std::vector<vtype>& signs ){
|
||||||
Vector<vtype> bother(2);
|
std::vector<vtype> bother(2);
|
||||||
signs = bother;
|
signs = bother;
|
||||||
vrsign(signs[0]);
|
vrsign(signs[0]);
|
||||||
visign(signs[1]);
|
visign(signs[1]);
|
||||||
@@ -364,7 +364,7 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilView &st, Doubled
|
|||||||
|
|
||||||
#include <simd/Intel512double.h>
|
#include <simd/Intel512double.h>
|
||||||
|
|
||||||
static Vector<vComplexD> signsD;
|
static std::vector<vComplexD> signsD;
|
||||||
static int signInitD = setupSigns(signsD);
|
static int signInitD = setupSigns(signsD);
|
||||||
|
|
||||||
#define MAYBEPERM(A,perm) if (perm) { A ; }
|
#define MAYBEPERM(A,perm) if (perm) { A ; }
|
||||||
|
|||||||
@@ -434,7 +434,7 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
|
|||||||
|
|
||||||
#define ASM_CALL(A) \
|
#define ASM_CALL(A) \
|
||||||
thread_for( sss, Nsite, { \
|
thread_for( sss, Nsite, { \
|
||||||
int ss = st.lo->Reorder(sss); \
|
int ss = sss; /*st.lo->Reorder(sss);*/ \
|
||||||
int sU = ss; \
|
int sU = ss; \
|
||||||
int sF = ss*Ls; \
|
int sF = ss*Ls; \
|
||||||
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \
|
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \
|
||||||
|
|||||||
@@ -40,7 +40,7 @@ public:
|
|||||||
U = Zero();
|
U = Zero();
|
||||||
LatticeColourMatrix tmp(Uin.Grid());
|
LatticeColourMatrix tmp(Uin.Grid());
|
||||||
|
|
||||||
Vector<typename SU<ncolour>::Matrix> ta(Dimension);
|
std::vector<typename SU<ncolour>::Matrix> ta(Dimension);
|
||||||
|
|
||||||
// Debug lines
|
// Debug lines
|
||||||
// LatticeMatrix uno(Uin.Grid());
|
// LatticeMatrix uno(Uin.Grid());
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ public:
|
|||||||
U = Zero();
|
U = Zero();
|
||||||
LatticeColourMatrix tmp(Uin.Grid());
|
LatticeColourMatrix tmp(Uin.Grid());
|
||||||
|
|
||||||
Vector<typename GaugeGroup<ncolour,group_name>::Matrix> eij(Dimension);
|
std::vector<typename GaugeGroup<ncolour,group_name>::Matrix> eij(Dimension);
|
||||||
|
|
||||||
for (int a = 0; a < Dimension; a++)
|
for (int a = 0; a < Dimension; a++)
|
||||||
GaugeGroupTwoIndex<ncolour, S, group_name>::base(a, eij[a]);
|
GaugeGroupTwoIndex<ncolour, S, group_name>::base(a, eij[a]);
|
||||||
|
|||||||
@@ -158,12 +158,12 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
|
|||||||
int MFrvol = rd*Lblock*Rblock*Nmom;
|
int MFrvol = rd*Lblock*Rblock*Nmom;
|
||||||
int MFlvol = ld*Lblock*Rblock*Nmom;
|
int MFlvol = ld*Lblock*Rblock*Nmom;
|
||||||
|
|
||||||
Vector<SpinMatrix_v > lvSum(MFrvol);
|
std::vector<SpinMatrix_v > lvSum(MFrvol);
|
||||||
thread_for( r, MFrvol,{
|
thread_for( r, MFrvol,{
|
||||||
lvSum[r] = Zero();
|
lvSum[r] = Zero();
|
||||||
});
|
});
|
||||||
|
|
||||||
Vector<SpinMatrix_s > lsSum(MFlvol);
|
std::vector<SpinMatrix_s > lsSum(MFlvol);
|
||||||
thread_for(r,MFlvol,{
|
thread_for(r,MFlvol,{
|
||||||
lsSum[r]=scalar_type(0.0);
|
lsSum[r]=scalar_type(0.0);
|
||||||
});
|
});
|
||||||
@@ -346,12 +346,12 @@ void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
|
|||||||
int MFrvol = rd*Lblock*Rblock;
|
int MFrvol = rd*Lblock*Rblock;
|
||||||
int MFlvol = ld*Lblock*Rblock;
|
int MFlvol = ld*Lblock*Rblock;
|
||||||
|
|
||||||
Vector<vector_type > lvSum(MFrvol);
|
std::vector<vector_type > lvSum(MFrvol);
|
||||||
thread_for(r,MFrvol,{
|
thread_for(r,MFrvol,{
|
||||||
lvSum[r] = Zero();
|
lvSum[r] = Zero();
|
||||||
});
|
});
|
||||||
|
|
||||||
Vector<scalar_type > lsSum(MFlvol);
|
std::vector<scalar_type > lsSum(MFlvol);
|
||||||
thread_for(r,MFlvol,{
|
thread_for(r,MFlvol,{
|
||||||
lsSum[r]=scalar_type(0.0);
|
lsSum[r]=scalar_type(0.0);
|
||||||
});
|
});
|
||||||
@@ -493,12 +493,12 @@ void A2Autils<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
|
|||||||
int MFrvol = rd*Lblock*Rblock*Nmom;
|
int MFrvol = rd*Lblock*Rblock*Nmom;
|
||||||
int MFlvol = ld*Lblock*Rblock*Nmom;
|
int MFlvol = ld*Lblock*Rblock*Nmom;
|
||||||
|
|
||||||
Vector<vector_type > lvSum(MFrvol);
|
std::vector<vector_type > lvSum(MFrvol);
|
||||||
thread_for(r,MFrvol,{
|
thread_for(r,MFrvol,{
|
||||||
lvSum[r] = Zero();
|
lvSum[r] = Zero();
|
||||||
});
|
});
|
||||||
|
|
||||||
Vector<scalar_type > lsSum(MFlvol);
|
std::vector<scalar_type > lsSum(MFlvol);
|
||||||
thread_for(r,MFlvol,{
|
thread_for(r,MFlvol,{
|
||||||
lsSum[r]=scalar_type(0.0);
|
lsSum[r]=scalar_type(0.0);
|
||||||
});
|
});
|
||||||
@@ -700,13 +700,13 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
|
|||||||
int MFrvol = rd*Lblock*Rblock*Nem;
|
int MFrvol = rd*Lblock*Rblock*Nem;
|
||||||
int MFlvol = ld*Lblock*Rblock*Nem;
|
int MFlvol = ld*Lblock*Rblock*Nem;
|
||||||
|
|
||||||
Vector<vector_type> lvSum(MFrvol);
|
std::vector<vector_type> lvSum(MFrvol);
|
||||||
thread_for(r,MFrvol,
|
thread_for(r,MFrvol,
|
||||||
{
|
{
|
||||||
lvSum[r] = Zero();
|
lvSum[r] = Zero();
|
||||||
});
|
});
|
||||||
|
|
||||||
Vector<scalar_type> lsSum(MFlvol);
|
std::vector<scalar_type> lsSum(MFlvol);
|
||||||
thread_for(r,MFlvol,
|
thread_for(r,MFlvol,
|
||||||
{
|
{
|
||||||
lsSum[r] = scalar_type(0.0);
|
lsSum[r] = scalar_type(0.0);
|
||||||
|
|||||||
@@ -971,7 +971,9 @@ void BaryonUtils<FImpl>::BaryonGamma3pt(
|
|||||||
autoView( vq_ti , q_ti , AcceleratorRead);
|
autoView( vq_ti , q_ti , AcceleratorRead);
|
||||||
autoView( vq_tf , q_tf , AcceleratorRead);
|
autoView( vq_tf , q_tf , AcceleratorRead);
|
||||||
|
|
||||||
Vector<mobj> my_Dq_spec{Dq_spec1,Dq_spec2};
|
deviceVector<mobj> my_Dq_spec(2);
|
||||||
|
acceleratorPut(my_Dq_spec[0],Dq_spec1);
|
||||||
|
acceleratorPut(my_Dq_spec[1],Dq_spec2);
|
||||||
mobj * Dq_spec_p = &my_Dq_spec[0];
|
mobj * Dq_spec_p = &my_Dq_spec[0];
|
||||||
|
|
||||||
if (group == 1) {
|
if (group == 1) {
|
||||||
@@ -1300,7 +1302,8 @@ void BaryonUtils<FImpl>::SigmaToNucleonEye(const PropagatorField &qq_loop,
|
|||||||
autoView( vd_tf , qd_tf , AcceleratorRead);
|
autoView( vd_tf , qd_tf , AcceleratorRead);
|
||||||
autoView( vs_ti , qs_ti , AcceleratorRead);
|
autoView( vs_ti , qs_ti , AcceleratorRead);
|
||||||
|
|
||||||
Vector<mobj> my_Dq_spec{Du_spec};
|
deviceVector<mobj> my_Dq_spec(1);
|
||||||
|
acceleratorPut(my_Dq_spec[0],Du_spec);
|
||||||
mobj * Dq_spec_p = &my_Dq_spec[0];
|
mobj * Dq_spec_p = &my_Dq_spec[0];
|
||||||
|
|
||||||
if(op == "Q1"){
|
if(op == "Q1"){
|
||||||
@@ -1353,7 +1356,8 @@ void BaryonUtils<FImpl>::SigmaToNucleonNonEye(const PropagatorField &qq_ti,
|
|||||||
autoView( vd_tf , qd_tf , AcceleratorRead );
|
autoView( vd_tf , qd_tf , AcceleratorRead );
|
||||||
autoView( vs_ti , qs_ti , AcceleratorRead );
|
autoView( vs_ti , qs_ti , AcceleratorRead );
|
||||||
|
|
||||||
Vector<mobj> my_Dq_spec{Du_spec};
|
deviceVector<mobj> my_Dq_spec(1);
|
||||||
|
acceleratorPut(my_Dq_spec[0],Du_spec);
|
||||||
mobj * Dq_spec_p = &my_Dq_spec[0];
|
mobj * Dq_spec_p = &my_Dq_spec[0];
|
||||||
|
|
||||||
if(op == "Q1"){
|
if(op == "Q1"){
|
||||||
@@ -1544,7 +1548,9 @@ void BaryonUtils<FImpl>::XiToSigmaEye(const PropagatorField &qq_loop,
|
|||||||
autoView( vd_tf , qd_tf , AcceleratorRead);
|
autoView( vd_tf , qd_tf , AcceleratorRead);
|
||||||
autoView( vs_ti , qs_ti , AcceleratorRead);
|
autoView( vs_ti , qs_ti , AcceleratorRead);
|
||||||
|
|
||||||
Vector<mobj> my_Dq_spec{Dd_spec,Ds_spec};
|
deviceVector<mobj> my_Dq_spec(2);
|
||||||
|
acceleratorPut(my_Dq_spec[0],Dd_spec);
|
||||||
|
acceleratorPut(my_Dq_spec[1],Ds_spec); // slot 1 holds Ds_spec; slot 0 already holds Dd_spec (matches the former {Dd_spec,Ds_spec} initializer)
|
||||||
mobj * Dq_spec_p = &my_Dq_spec[0];
|
mobj * Dq_spec_p = &my_Dq_spec[0];
|
||||||
|
|
||||||
if(op == "Q1"){
|
if(op == "Q1"){
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ public:
|
|||||||
// returns i(T_Adj)^index necessary for the projectors
|
// returns i(T_Adj)^index necessary for the projectors
|
||||||
// see definitions above
|
// see definitions above
|
||||||
iAdjTa = Zero();
|
iAdjTa = Zero();
|
||||||
Vector<iSUnMatrix<cplx> > ta(ncolour * ncolour - 1);
|
iSUnMatrix<cplx> ta[ncolour * ncolour - 1];
|
||||||
iSUnMatrix<cplx> tmp;
|
iSUnMatrix<cplx> tmp;
|
||||||
|
|
||||||
// FIXME not very efficient to get all the generators everytime
|
// FIXME not very efficient to get all the generators everytime
|
||||||
|
|||||||
@@ -72,7 +72,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Resident in managed memory
|
// Resident in managed memory
|
||||||
Vector<GeneralStencilEntry> _entries;
|
deviceVector<GeneralStencilEntry> _entries;
|
||||||
|
|
||||||
GeneralLocalStencil(GridBase *grid, const std::vector<Coordinate> &shifts)
|
GeneralLocalStencil(GridBase *grid, const std::vector<Coordinate> &shifts)
|
||||||
{
|
{
|
||||||
@@ -141,7 +141,7 @@ public:
|
|||||||
////////////////////////////////////////////////
|
////////////////////////////////////////////////
|
||||||
// Store in look up table
|
// Store in look up table
|
||||||
////////////////////////////////////////////////
|
////////////////////////////////////////////////
|
||||||
this->_entries[lex] = SE;
|
acceleratorPut(this->_entries[lex],SE);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ public:
|
|||||||
static int PartialCompressionFactor(GridBase *grid) {return 1;};
|
static int PartialCompressionFactor(GridBase *grid) {return 1;};
|
||||||
// Decompress is after merge so ok
|
// Decompress is after merge so ok
|
||||||
template<class vobj,class cobj,class compressor>
|
template<class vobj,class cobj,class compressor>
|
||||||
static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
|
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
|
||||||
const Lattice<vobj> &rhs,
|
const Lattice<vobj> &rhs,
|
||||||
cobj *buffer,
|
cobj *buffer,
|
||||||
compressor &compress,
|
compressor &compress,
|
||||||
@@ -35,7 +35,7 @@ public:
|
|||||||
rhs_v.ViewClose();
|
rhs_v.ViewClose();
|
||||||
}
|
}
|
||||||
template<class vobj,class cobj,class compressor>
|
template<class vobj,class cobj,class compressor>
|
||||||
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
||||||
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
|
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
|
||||||
compressor &compress,int type,int partial)
|
compressor &compress,int type,int partial)
|
||||||
{
|
{
|
||||||
@@ -83,25 +83,6 @@ public:
|
|||||||
// Wilson compressor will add alternate policies for Dirichlet
|
// Wilson compressor will add alternate policies for Dirichlet
|
||||||
// and possibly partial Dirichlet for DWF
|
// and possibly partial Dirichlet for DWF
|
||||||
////////////////////////////////////
|
////////////////////////////////////
|
||||||
/*
|
|
||||||
class FaceGatherDirichlet
|
|
||||||
{
|
|
||||||
// If it's dirichlet we don't assemble comms buffers
|
|
||||||
//
|
|
||||||
// Rely on zeroes in gauge field to drive the correct result
|
|
||||||
// NAN propgagation: field will locally wrap, so fermion should NOT contain NAN and just permute
|
|
||||||
template<class vobj,class cobj,class compressor>
|
|
||||||
static void Gather_plane_simple (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so){};
|
|
||||||
template<class vobj,class cobj,class compressor>
|
|
||||||
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
|
||||||
Vector<cobj *> pointers,int dimension,int plane,int cbmask,
|
|
||||||
compressor &compress,int type) {}
|
|
||||||
template<class decompressor,class Merger>
|
|
||||||
static void Merge(decompressor decompress,Merge &mm) { }
|
|
||||||
template<class decompressor,class Decompression>
|
|
||||||
static void Decompress(decompressor decompress,Decompression &dd) {}
|
|
||||||
};
|
|
||||||
*/
|
|
||||||
|
|
||||||
template<class vobj,class FaceGather>
|
template<class vobj,class FaceGather>
|
||||||
class SimpleCompressorGather : public FaceGather {
|
class SimpleCompressorGather : public FaceGather {
|
||||||
|
|||||||
@@ -31,7 +31,6 @@
|
|||||||
#define STENCIL_MAX (16)
|
#define STENCIL_MAX (16)
|
||||||
|
|
||||||
#include <Grid/stencil/SimpleCompressor.h> // subdir aggregate
|
#include <Grid/stencil/SimpleCompressor.h> // subdir aggregate
|
||||||
#include <Grid/stencil/Lebesgue.h> // subdir aggregate
|
|
||||||
#include <Grid/stencil/GeneralLocalStencil.h>
|
#include <Grid/stencil/GeneralLocalStencil.h>
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////
|
||||||
@@ -256,7 +255,6 @@ protected:
|
|||||||
GridBase * _grid;
|
GridBase * _grid;
|
||||||
public:
|
public:
|
||||||
GridBase *Grid(void) const { return _grid; }
|
GridBase *Grid(void) const { return _grid; }
|
||||||
LebesgueOrder *lo;
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
// Needed to conveniently communicate gparity parameters into GPU memory
|
// Needed to conveniently communicate gparity parameters into GPU memory
|
||||||
@@ -273,11 +271,11 @@ public:
|
|||||||
int face_table_computed;
|
int face_table_computed;
|
||||||
int partialDirichlet;
|
int partialDirichlet;
|
||||||
int fullDirichlet;
|
int fullDirichlet;
|
||||||
std::vector<commVector<std::pair<int,int> > > face_table ;
|
std::vector<deviceVector<std::pair<int,int> > > face_table ;
|
||||||
Vector<int> surface_list;
|
deviceVector<int> surface_list;
|
||||||
|
|
||||||
stencilVector<StencilEntry> _entries; // Resident in managed memory
|
std::vector<StencilEntry> _entries; // Resident in host memory
|
||||||
commVector<StencilEntry> _entries_device; // Resident in device memory
|
deviceVector<StencilEntry> _entries_device; // Resident in device memory
|
||||||
std::vector<Packet> Packets;
|
std::vector<Packet> Packets;
|
||||||
std::vector<Merge> Mergers;
|
std::vector<Merge> Mergers;
|
||||||
std::vector<Merge> MergersSHM;
|
std::vector<Merge> MergersSHM;
|
||||||
@@ -366,11 +364,11 @@ public:
|
|||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
|
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||||
{
|
{
|
||||||
|
FlightRecorder::StepLog("Communicate begin");
|
||||||
// All GPU kernel tasks must complete
|
// All GPU kernel tasks must complete
|
||||||
// accelerator_barrier(); // All kernels should ALREADY be complete
|
// accelerator_barrier(); // All kernels should ALREADY be complete
|
||||||
// _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
|
// _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
|
||||||
// But the HaloGather had a barrier too.
|
// But the HaloGather had a barrier too.
|
||||||
#ifdef ACCELERATOR_AWARE_MPI
|
|
||||||
for(int i=0;i<Packets.size();i++){
|
for(int i=0;i<Packets.size();i++){
|
||||||
_grid->StencilSendToRecvFromBegin(MpiReqs,
|
_grid->StencilSendToRecvFromBegin(MpiReqs,
|
||||||
Packets[i].send_buf,
|
Packets[i].send_buf,
|
||||||
@@ -379,23 +377,6 @@ public:
|
|||||||
Packets[i].from_rank,Packets[i].do_recv,
|
Packets[i].from_rank,Packets[i].do_recv,
|
||||||
Packets[i].xbytes,Packets[i].rbytes,i);
|
Packets[i].xbytes,Packets[i].rbytes,i);
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
#warning "Using COPY VIA HOST BUFFERS IN STENCIL"
|
|
||||||
for(int i=0;i<Packets.size();i++){
|
|
||||||
// Introduce a host buffer with a cheap slab allocator and zero cost wipe all
|
|
||||||
Packets[i].host_send_buf = _grid->HostBufferMalloc(Packets[i].xbytes);
|
|
||||||
Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes);
|
|
||||||
if ( Packets[i].do_send ) {
|
|
||||||
acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes);
|
|
||||||
}
|
|
||||||
_grid->StencilSendToRecvFromBegin(MpiReqs,
|
|
||||||
Packets[i].host_send_buf,
|
|
||||||
Packets[i].to_rank,Packets[i].do_send,
|
|
||||||
Packets[i].host_recv_buf,
|
|
||||||
Packets[i].from_rank,Packets[i].do_recv,
|
|
||||||
Packets[i].xbytes,Packets[i].rbytes,i);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
// Get comms started then run checksums
|
// Get comms started then run checksums
|
||||||
// Having this PRIOR to the dslash seems to make Sunspot work... (!)
|
// Having this PRIOR to the dslash seems to make Sunspot work... (!)
|
||||||
for(int i=0;i<Packets.size();i++){
|
for(int i=0;i<Packets.size();i++){
|
||||||
@@ -406,27 +387,20 @@ public:
|
|||||||
|
|
||||||
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
|
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||||
{
|
{
|
||||||
|
FlightRecorder::StepLog("Start communicate complete");
|
||||||
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
|
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
|
||||||
if ( this->partialDirichlet ) DslashLogPartial();
|
if ( this->partialDirichlet ) DslashLogPartial();
|
||||||
else if ( this->fullDirichlet ) DslashLogDirichlet();
|
else if ( this->fullDirichlet ) DslashLogDirichlet();
|
||||||
else DslashLogFull();
|
else DslashLogFull();
|
||||||
// acceleratorCopySynchronise() is in the StencilSendToRecvFromComplete
|
// acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
|
||||||
// accelerator_barrier();
|
// accelerator_barrier();
|
||||||
_grid->StencilBarrier();
|
_grid->StencilBarrier();
|
||||||
#ifndef ACCELERATOR_AWARE_MPI
|
|
||||||
#warning "Using COPY VIA HOST BUFFERS IN STENCIL"
|
|
||||||
for(int i=0;i<Packets.size();i++){
|
|
||||||
if ( Packets[i].do_recv ) {
|
|
||||||
acceleratorCopyToDevice(Packets[i].host_recv_buf, Packets[i].recv_buf,Packets[i].rbytes);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_grid->HostBufferFreeAll();
|
|
||||||
#endif
|
|
||||||
// run any checksums
|
// run any checksums
|
||||||
for(int i=0;i<Packets.size();i++){
|
for(int i=0;i<Packets.size();i++){
|
||||||
if ( Packets[i].do_recv )
|
if ( Packets[i].do_recv )
|
||||||
FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
|
FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
|
||||||
}
|
}
|
||||||
|
FlightRecorder::StepLog("Finish communicate complete");
|
||||||
}
|
}
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
// Blocking send and receive. Either sequential or parallel.
|
// Blocking send and receive. Either sequential or parallel.
|
||||||
@@ -516,6 +490,7 @@ public:
|
|||||||
HaloGatherDir(source,compress,point,face_idx);
|
HaloGatherDir(source,compress,point,face_idx);
|
||||||
}
|
}
|
||||||
accelerator_barrier(); // All my local gathers are complete
|
accelerator_barrier(); // All my local gathers are complete
|
||||||
|
// _grid->StencilBarrier();// Synch shared memory on a single nodes
|
||||||
face_table_computed=1;
|
face_table_computed=1;
|
||||||
assert(u_comm_offset==_unified_buffer_size);
|
assert(u_comm_offset==_unified_buffer_size);
|
||||||
}
|
}
|
||||||
@@ -668,7 +643,7 @@ public:
|
|||||||
for(int point=0;point<this->_npoints;point++){
|
for(int point=0;point<this->_npoints;point++){
|
||||||
this->same_node[point] = this->SameNode(point);
|
this->same_node[point] = this->SameNode(point);
|
||||||
}
|
}
|
||||||
|
int32_t surface_list_size=0;
|
||||||
for(int site = 0 ;site< vol4;site++){
|
for(int site = 0 ;site< vol4;site++){
|
||||||
int local = 1;
|
int local = 1;
|
||||||
for(int point=0;point<this->_npoints;point++){
|
for(int point=0;point<this->_npoints;point++){
|
||||||
@@ -678,11 +653,30 @@ public:
|
|||||||
}
|
}
|
||||||
if(local == 0) {
|
if(local == 0) {
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
surface_list.push_back(site*Ls+s);
|
surface_list_size++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
|
std::cout << "BuildSurfaceList size is "<<surface_list_size<<std::endl; // print the counted size: surface_list itself is only resized by the following statement
|
||||||
|
surface_list.resize(surface_list_size);
|
||||||
|
std::vector<int> surface_list_host(surface_list_size);
|
||||||
|
int32_t ss=0;
|
||||||
|
for(int site = 0 ;site< vol4;site++){
|
||||||
|
int local = 1;
|
||||||
|
for(int point=0;point<this->_npoints;point++){
|
||||||
|
if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){
|
||||||
|
local = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(local == 0) {
|
||||||
|
for(int s=0;s<Ls;s++){
|
||||||
|
int idx=site*Ls+s;
|
||||||
|
surface_list_host[ss]= idx;
|
||||||
|
ss++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int));
|
||||||
}
|
}
|
||||||
/// Introduce a block structure and switch off comms on boundaries
|
/// Introduce a block structure and switch off comms on boundaries
|
||||||
void DirichletBlock(const Coordinate &dirichlet_block)
|
void DirichletBlock(const Coordinate &dirichlet_block)
|
||||||
|
|||||||
@@ -202,15 +202,15 @@ void acceleratorInit(void)
|
|||||||
|
|
||||||
#ifdef GRID_SYCL
|
#ifdef GRID_SYCL
|
||||||
|
|
||||||
cl::sycl::queue *theGridAccelerator;
|
sycl::queue *theGridAccelerator;
|
||||||
cl::sycl::queue *theCopyAccelerator;
|
sycl::queue *theCopyAccelerator;
|
||||||
void acceleratorInit(void)
|
void acceleratorInit(void)
|
||||||
{
|
{
|
||||||
int nDevices = 1;
|
int nDevices = 1;
|
||||||
cl::sycl::gpu_selector selector;
|
// sycl::gpu_selector selector;
|
||||||
cl::sycl::device selectedDevice { selector };
|
// sycl::device selectedDevice { selector };
|
||||||
theGridAccelerator = new sycl::queue (selectedDevice);
|
theGridAccelerator = new sycl::queue (sycl::gpu_selector_v);
|
||||||
theCopyAccelerator = new sycl::queue (selectedDevice);
|
theCopyAccelerator = new sycl::queue (sycl::gpu_selector_v);
|
||||||
// theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway.
|
// theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway.
|
||||||
|
|
||||||
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
|
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
|
||||||
@@ -242,14 +242,14 @@ void acceleratorInit(void)
|
|||||||
gethostname(hostname, HOST_NAME_MAX+1);
|
gethostname(hostname, HOST_NAME_MAX+1);
|
||||||
if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname);
|
if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname);
|
||||||
|
|
||||||
auto devices = cl::sycl::device::get_devices();
|
auto devices = sycl::device::get_devices();
|
||||||
for(int d = 0;d<devices.size();d++){
|
for(int d = 0;d<devices.size();d++){
|
||||||
|
|
||||||
#define GPU_PROP_STR(prop) \
|
#define GPU_PROP_STR(prop) \
|
||||||
printf("AcceleratorSyclInit: " #prop ": %s \n",devices[d].get_info<cl::sycl::info::device::prop>().c_str());
|
printf("AcceleratorSyclInit: " #prop ": %s \n",devices[d].get_info<sycl::info::device::prop>().c_str());
|
||||||
|
|
||||||
#define GPU_PROP_FMT(prop,FMT) \
|
#define GPU_PROP_FMT(prop,FMT) \
|
||||||
printf("AcceleratorSyclInit: " #prop ": " FMT" \n",devices[d].get_info<cl::sycl::info::device::prop>());
|
printf("AcceleratorSyclInit: " #prop ": " FMT" \n",devices[d].get_info<sycl::info::device::prop>());
|
||||||
|
|
||||||
#define GPU_PROP(prop) GPU_PROP_FMT(prop,"%ld");
|
#define GPU_PROP(prop) GPU_PROP_FMT(prop,"%ld");
|
||||||
if ( world_rank == 0) {
|
if ( world_rank == 0) {
|
||||||
|
|||||||
@@ -302,7 +302,7 @@ NAMESPACE_END(Grid);
|
|||||||
|
|
||||||
// Force deterministic reductions
|
// Force deterministic reductions
|
||||||
#define SYCL_REDUCTION_DETERMINISTIC
|
#define SYCL_REDUCTION_DETERMINISTIC
|
||||||
#include <sycl/CL/sycl.hpp>
|
#include <sycl/sycl.hpp>
|
||||||
#include <sycl/usm.hpp>
|
#include <sycl/usm.hpp>
|
||||||
#include <level_zero/ze_api.h>
|
#include <level_zero/ze_api.h>
|
||||||
#include <sycl/ext/oneapi/backend/level_zero.hpp>
|
#include <sycl/ext/oneapi/backend/level_zero.hpp>
|
||||||
@@ -314,8 +314,8 @@ inline void acceleratorMem(void)
|
|||||||
std::cout <<" SYCL acceleratorMem not implemented"<<std::endl;
|
std::cout <<" SYCL acceleratorMem not implemented"<<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
extern cl::sycl::queue *theGridAccelerator;
|
extern sycl::queue *theGridAccelerator;
|
||||||
extern cl::sycl::queue *theCopyAccelerator;
|
extern sycl::queue *theCopyAccelerator;
|
||||||
|
|
||||||
#ifdef __SYCL_DEVICE_ONLY__
|
#ifdef __SYCL_DEVICE_ONLY__
|
||||||
#define GRID_SIMT
|
#define GRID_SIMT
|
||||||
@@ -326,24 +326,24 @@ extern cl::sycl::queue *theCopyAccelerator;
|
|||||||
|
|
||||||
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
||||||
#ifdef GRID_SIMT
|
#ifdef GRID_SIMT
|
||||||
return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2];
|
return __spirv::initLocalInvocationId<3, sycl::id<3>>()[2];
|
||||||
#else
|
#else
|
||||||
return 0;
|
return 0;
|
||||||
#endif
|
#endif
|
||||||
} // SYCL specific
|
} // SYCL specific
|
||||||
|
|
||||||
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
|
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
|
||||||
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { \
|
theGridAccelerator->submit([&](sycl::handler &cgh) { \
|
||||||
unsigned long nt=acceleratorThreads(); \
|
unsigned long nt=acceleratorThreads(); \
|
||||||
if(nt < 8)nt=8; \
|
if(nt < 8)nt=8; \
|
||||||
unsigned long unum1 = num1; \
|
unsigned long unum1 = num1; \
|
||||||
unsigned long unum2 = num2; \
|
unsigned long unum2 = num2; \
|
||||||
unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt; \
|
unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt; \
|
||||||
cl::sycl::range<3> local {nt,1,nsimd}; \
|
sycl::range<3> local {nt,1,nsimd}; \
|
||||||
cl::sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd}; \
|
sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd}; \
|
||||||
cgh.parallel_for( \
|
cgh.parallel_for( \
|
||||||
cl::sycl::nd_range<3>(global,local), \
|
sycl::nd_range<3>(global,local), \
|
||||||
[=] (cl::sycl::nd_item<3> item) /*mutable*/ \
|
[=] (sycl::nd_item<3> item) /*mutable*/ \
|
||||||
[[intel::reqd_sub_group_size(16)]] \
|
[[intel::reqd_sub_group_size(16)]] \
|
||||||
{ \
|
{ \
|
||||||
auto iter1 = item.get_global_id(0); \
|
auto iter1 = item.get_global_id(0); \
|
||||||
@@ -369,8 +369,8 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccele
|
|||||||
inline int acceleratorIsCommunicable(void *ptr)
|
inline int acceleratorIsCommunicable(void *ptr)
|
||||||
{
|
{
|
||||||
#if 0
|
#if 0
|
||||||
auto uvm = cl::sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context());
|
auto uvm = sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context());
|
||||||
if ( uvm = cl::sycl::usm::alloc::shared ) return 1;
|
if ( uvm = sycl::usm::alloc::shared ) return 1;
|
||||||
else return 0;
|
else return 0;
|
||||||
#endif
|
#endif
|
||||||
return 1;
|
return 1;
|
||||||
|
|||||||
@@ -39,6 +39,8 @@ int FlightRecorder::ContinueOnFail;
|
|||||||
int FlightRecorder::LoggingMode;
|
int FlightRecorder::LoggingMode;
|
||||||
int FlightRecorder::ChecksumComms;
|
int FlightRecorder::ChecksumComms;
|
||||||
int FlightRecorder::ChecksumCommsSend;
|
int FlightRecorder::ChecksumCommsSend;
|
||||||
|
const char * FlightRecorder::StepName;
|
||||||
|
int32_t FlightRecorder::StepLoggingCounter;
|
||||||
int32_t FlightRecorder::XmitLoggingCounter;
|
int32_t FlightRecorder::XmitLoggingCounter;
|
||||||
int32_t FlightRecorder::RecvLoggingCounter;
|
int32_t FlightRecorder::RecvLoggingCounter;
|
||||||
int32_t FlightRecorder::CsumLoggingCounter;
|
int32_t FlightRecorder::CsumLoggingCounter;
|
||||||
@@ -58,6 +60,8 @@ void FlightRecorder::ResetCounters(void)
|
|||||||
CsumLoggingCounter=0;
|
CsumLoggingCounter=0;
|
||||||
NormLoggingCounter=0;
|
NormLoggingCounter=0;
|
||||||
ReductionLoggingCounter=0;
|
ReductionLoggingCounter=0;
|
||||||
|
StepName = "No steps started";
|
||||||
|
StepLoggingCounter=0;
|
||||||
}
|
}
|
||||||
void FlightRecorder::Truncate(void)
|
void FlightRecorder::Truncate(void)
|
||||||
{
|
{
|
||||||
@@ -88,6 +92,12 @@ void FlightRecorder::SetLoggingMode(FlightRecorder::LoggingMode_t mode)
|
|||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Marks the start of a named step: stores `name` in StepName and increments StepLoggingCounter.
// Only the pointer is stored (no copy is made), so `name` must remain valid for as long as
// StepName may be printed by later diagnostics. Always returns true.
bool FlightRecorder::StepLog(const char *name)
|
||||||
|
{
|
||||||
|
StepName = name;
|
||||||
|
StepLoggingCounter ++;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
void FlightRecorder::SetLoggingModePrint(void)
|
void FlightRecorder::SetLoggingModePrint(void)
|
||||||
{
|
{
|
||||||
@@ -111,17 +121,19 @@ uint64_t FlightRecorder::ErrorCount(void)
|
|||||||
{
|
{
|
||||||
return ErrorCounter;
|
return ErrorCounter;
|
||||||
}
|
}
|
||||||
void FlightRecorder::NormLog(double value)
|
bool FlightRecorder::NormLog(double value)
|
||||||
{
|
{
|
||||||
uint64_t hex = * ( (uint64_t *)&value );
|
uint64_t hex = * ( (uint64_t *)&value );
|
||||||
if(LoggingMode == LoggingModePrint) {
|
if(LoggingMode == LoggingModePrint) {
|
||||||
std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
|
std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
|
||||||
NormLoggingCounter++;
|
NormLoggingCounter++;
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
if(LoggingMode == LoggingModeRecord) {
|
if(LoggingMode == LoggingModeRecord) {
|
||||||
std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
|
std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
|
||||||
NormLogVector.push_back(value);
|
NormLogVector.push_back(value);
|
||||||
NormLoggingCounter++;
|
NormLoggingCounter++;
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
if(LoggingMode == LoggingModeVerify) {
|
if(LoggingMode == LoggingModeVerify) {
|
||||||
|
|
||||||
@@ -130,6 +142,9 @@ void FlightRecorder::NormLog(double value)
|
|||||||
|
|
||||||
if ( (value != NormLogVector[NormLoggingCounter]) || std::isnan(value) ) {
|
if ( (value != NormLogVector[NormLoggingCounter]) || std::isnan(value) ) {
|
||||||
|
|
||||||
|
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
|
||||||
|
FlightRecorder::StepLoggingCounter,
|
||||||
|
FlightRecorder::StepName);
|
||||||
std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter
|
std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter
|
||||||
<<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" "
|
<<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" "
|
||||||
<<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::endl;
|
<<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::endl;
|
||||||
@@ -142,7 +157,9 @@ void FlightRecorder::NormLog(double value)
|
|||||||
NormLoggingCounter,NormLogVector.size(),
|
NormLoggingCounter,NormLogVector.size(),
|
||||||
value, NormLogVector[NormLoggingCounter]); fflush(stderr);
|
value, NormLogVector[NormLoggingCounter]); fflush(stderr);
|
||||||
|
|
||||||
if(!ContinueOnFail)assert(0); // Force takedown of job
|
BACKTRACEFP(stderr);
|
||||||
|
|
||||||
|
if(!ContinueOnFail) return false;
|
||||||
|
|
||||||
ErrorCounter++;
|
ErrorCounter++;
|
||||||
} else {
|
} else {
|
||||||
@@ -159,18 +176,21 @@ void FlightRecorder::NormLog(double value)
|
|||||||
}
|
}
|
||||||
NormLoggingCounter++;
|
NormLoggingCounter++;
|
||||||
}
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
void FlightRecorder::CsumLog(uint64_t hex)
|
bool FlightRecorder::CsumLog(uint64_t hex)
|
||||||
{
|
{
|
||||||
if(LoggingMode == LoggingModePrint) {
|
if(LoggingMode == LoggingModePrint) {
|
||||||
std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
|
std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
|
||||||
CsumLoggingCounter++;
|
CsumLoggingCounter++;
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(LoggingMode == LoggingModeRecord) {
|
if(LoggingMode == LoggingModeRecord) {
|
||||||
std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
|
std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl; // report the checksum counter (the one this branch increments), not the norm counter
|
||||||
CsumLogVector.push_back(hex);
|
CsumLogVector.push_back(hex);
|
||||||
CsumLoggingCounter++;
|
CsumLoggingCounter++;
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(LoggingMode == LoggingModeVerify) {
|
if(LoggingMode == LoggingModeVerify) {
|
||||||
@@ -181,6 +201,9 @@ void FlightRecorder::CsumLog(uint64_t hex)
|
|||||||
|
|
||||||
if ( hex != hexref ) {
|
if ( hex != hexref ) {
|
||||||
|
|
||||||
|
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
|
||||||
|
FlightRecorder::StepLoggingCounter,
|
||||||
|
FlightRecorder::StepName);
|
||||||
std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter
|
std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter
|
||||||
<<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl;
|
<<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl;
|
||||||
|
|
||||||
@@ -188,9 +211,10 @@ void FlightRecorder::CsumLog(uint64_t hex)
|
|||||||
GridHostname(),
|
GridHostname(),
|
||||||
GlobalSharedMemory::WorldShmRank,
|
GlobalSharedMemory::WorldShmRank,
|
||||||
CsumLoggingCounter,hex, hexref);
|
CsumLoggingCounter,hex, hexref);
|
||||||
|
BACKTRACEFP(stderr);
|
||||||
fflush(stderr);
|
fflush(stderr);
|
||||||
|
|
||||||
if(!ContinueOnFail) assert(0); // Force takedown of job
|
if(!ContinueOnFail) return false;
|
||||||
|
|
||||||
ErrorCounter++;
|
ErrorCounter++;
|
||||||
|
|
||||||
@@ -207,7 +231,9 @@ void FlightRecorder::CsumLog(uint64_t hex)
|
|||||||
}
|
}
|
||||||
CsumLoggingCounter++;
|
CsumLoggingCounter++;
|
||||||
}
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void FlightRecorder::ReductionLog(double local,double global)
|
void FlightRecorder::ReductionLog(double local,double global)
|
||||||
{
|
{
|
||||||
uint64_t hex_l = * ( (uint64_t *)&local );
|
uint64_t hex_l = * ( (uint64_t *)&local );
|
||||||
@@ -224,11 +250,15 @@ void FlightRecorder::ReductionLog(double local,double global)
|
|||||||
if(LoggingMode == LoggingModeVerify) {
|
if(LoggingMode == LoggingModeVerify) {
|
||||||
if(ReductionLoggingCounter < ReductionLogVector.size()){
|
if(ReductionLoggingCounter < ReductionLogVector.size()){
|
||||||
if ( global != ReductionLogVector[ReductionLoggingCounter] ) {
|
if ( global != ReductionLogVector[ReductionLoggingCounter] ) {
|
||||||
|
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
|
||||||
|
FlightRecorder::StepLoggingCounter,
|
||||||
|
FlightRecorder::StepName);
|
||||||
fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n",
|
fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n",
|
||||||
GridHostname(),
|
GridHostname(),
|
||||||
GlobalSharedMemory::WorldShmRank,
|
GlobalSharedMemory::WorldShmRank,
|
||||||
ReductionLoggingCounter,ReductionLogVector.size(),
|
ReductionLoggingCounter,ReductionLogVector.size(),
|
||||||
global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr);
|
global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr);
|
||||||
|
BACKTRACEFP(stderr);
|
||||||
|
|
||||||
if ( !ContinueOnFail ) assert(0);
|
if ( !ContinueOnFail ) assert(0);
|
||||||
|
|
||||||
@@ -267,11 +297,15 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
|
|||||||
if(LoggingMode == LoggingModeVerify) {
|
if(LoggingMode == LoggingModeVerify) {
|
||||||
if(XmitLoggingCounter < XmitLogVector.size()){
|
if(XmitLoggingCounter < XmitLogVector.size()){
|
||||||
if ( _xor != XmitLogVector[XmitLoggingCounter] ) {
|
if ( _xor != XmitLogVector[XmitLoggingCounter] ) {
|
||||||
|
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
|
||||||
|
FlightRecorder::StepLoggingCounter,
|
||||||
|
FlightRecorder::StepName);
|
||||||
fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu %lx expect glb %lx\n",
|
fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu %lx expect glb %lx\n",
|
||||||
GridHostname(),
|
GridHostname(),
|
||||||
GlobalSharedMemory::WorldShmRank,
|
GlobalSharedMemory::WorldShmRank,
|
||||||
XmitLoggingCounter,XmitLogVector.size(),
|
XmitLoggingCounter,XmitLogVector.size(),
|
||||||
_xor, XmitLogVector[XmitLoggingCounter]); fflush(stderr);
|
_xor, XmitLogVector[XmitLoggingCounter]); fflush(stderr);
|
||||||
|
BACKTRACEFP(stderr);
|
||||||
|
|
||||||
if ( !ContinueOnFail ) assert(0);
|
if ( !ContinueOnFail ) assert(0);
|
||||||
|
|
||||||
@@ -309,11 +343,15 @@ void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
|
|||||||
if(LoggingMode == LoggingModeVerify) {
|
if(LoggingMode == LoggingModeVerify) {
|
||||||
if(RecvLoggingCounter < RecvLogVector.size()){
|
if(RecvLoggingCounter < RecvLogVector.size()){
|
||||||
if ( _xor != RecvLogVector[RecvLoggingCounter] ) {
|
if ( _xor != RecvLogVector[RecvLoggingCounter] ) {
|
||||||
|
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
|
||||||
|
FlightRecorder::StepLoggingCounter,
|
||||||
|
FlightRecorder::StepName);
|
||||||
fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu %lx expect glb %lx from MPI rank %d\n",
|
fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu %lx expect glb %lx from MPI rank %d\n",
|
||||||
GridHostname(),
|
GridHostname(),
|
||||||
GlobalSharedMemory::WorldShmRank,
|
GlobalSharedMemory::WorldShmRank,
|
||||||
RecvLoggingCounter,RecvLogVector.size(),
|
RecvLoggingCounter,RecvLogVector.size(),
|
||||||
_xor, RecvLogVector[RecvLoggingCounter],rank); fflush(stderr);
|
_xor, RecvLogVector[RecvLoggingCounter],rank); fflush(stderr);
|
||||||
|
BACKTRACEFP(stderr);
|
||||||
|
|
||||||
if ( !ContinueOnFail ) assert(0);
|
if ( !ContinueOnFail ) assert(0);
|
||||||
|
|
||||||
|
|||||||
@@ -12,6 +12,8 @@ class FlightRecorder {
|
|||||||
|
|
||||||
static int LoggingMode;
|
static int LoggingMode;
|
||||||
static uint64_t ErrorCounter;
|
static uint64_t ErrorCounter;
|
||||||
|
static const char * StepName;
|
||||||
|
static int32_t StepLoggingCounter;
|
||||||
static int32_t XmitLoggingCounter;
|
static int32_t XmitLoggingCounter;
|
||||||
static int32_t RecvLoggingCounter;
|
static int32_t RecvLoggingCounter;
|
||||||
static int32_t CsumLoggingCounter;
|
static int32_t CsumLoggingCounter;
|
||||||
@@ -30,8 +32,9 @@ class FlightRecorder {
|
|||||||
static void SetLoggingModeRecord(void);
|
static void SetLoggingModeRecord(void);
|
||||||
static void SetLoggingModeVerify(void);
|
static void SetLoggingModeVerify(void);
|
||||||
static void SetLoggingMode(LoggingMode_t mode);
|
static void SetLoggingMode(LoggingMode_t mode);
|
||||||
static void NormLog(double value);
|
static bool StepLog(const char *name);
|
||||||
static void CsumLog(uint64_t csum);
|
static bool NormLog(double value);
|
||||||
|
static bool CsumLog(uint64_t csum);
|
||||||
static void ReductionLog(double lcl, double glbl);
|
static void ReductionLog(double lcl, double glbl);
|
||||||
static void Truncate(void);
|
static void Truncate(void);
|
||||||
static void ResetCounters(void);
|
static void ResetCounters(void);
|
||||||
|
|||||||
@@ -464,16 +464,12 @@ void Grid_init(int *argc,char ***argv)
|
|||||||
std::cout<<GridLogMessage<<std::endl;
|
std::cout<<GridLogMessage<<std::endl;
|
||||||
std::cout<<GridLogMessage<<"Performance:"<<std::endl;
|
std::cout<<GridLogMessage<<"Performance:"<<std::endl;
|
||||||
std::cout<<GridLogMessage<<std::endl;
|
std::cout<<GridLogMessage<<std::endl;
|
||||||
std::cout<<GridLogMessage<<" --comms-concurrent : Asynchronous MPI calls; several dirs at a time "<<std::endl;
|
|
||||||
std::cout<<GridLogMessage<<" --comms-sequential : Synchronous MPI calls; one dirs at a time "<<std::endl;
|
|
||||||
std::cout<<GridLogMessage<<" --comms-overlap : Overlap comms with compute "<<std::endl;
|
std::cout<<GridLogMessage<<" --comms-overlap : Overlap comms with compute "<<std::endl;
|
||||||
std::cout<<GridLogMessage<<std::endl;
|
std::cout<<GridLogMessage<<std::endl;
|
||||||
std::cout<<GridLogMessage<<" --dslash-generic: Wilson kernel for generic Nc"<<std::endl;
|
std::cout<<GridLogMessage<<" --dslash-generic: Wilson kernel for generic Nc"<<std::endl;
|
||||||
std::cout<<GridLogMessage<<" --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;
|
std::cout<<GridLogMessage<<" --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;
|
||||||
std::cout<<GridLogMessage<<" --dslash-asm : Wilson kernel for AVX512"<<std::endl;
|
std::cout<<GridLogMessage<<" --dslash-asm : Wilson kernel for AVX512"<<std::endl;
|
||||||
std::cout<<GridLogMessage<<std::endl;
|
std::cout<<GridLogMessage<<std::endl;
|
||||||
std::cout<<GridLogMessage<<" --lebesgue : Cache oblivious Lebesgue curve/Morton order/Z-graph stencil looping"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage<<" --cacheblocking n.m.o.p : Hypercuboidal cache blocking"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage<<std::endl;
|
std::cout<<GridLogMessage<<std::endl;
|
||||||
exit(EXIT_SUCCESS);
|
exit(EXIT_SUCCESS);
|
||||||
}
|
}
|
||||||
@@ -501,28 +497,8 @@ void Grid_init(int *argc,char ***argv)
|
|||||||
WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsThenCompute;
|
WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsThenCompute;
|
||||||
StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsThenCompute;
|
StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsThenCompute;
|
||||||
}
|
}
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-concurrent") ){
|
|
||||||
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyConcurrent);
|
|
||||||
}
|
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-sequential") ){
|
|
||||||
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
|
|
||||||
}
|
|
||||||
|
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
|
|
||||||
LebesgueOrder::UseLebesgueOrder=1;
|
|
||||||
}
|
|
||||||
CartesianCommunicator::nCommThreads = 1;
|
CartesianCommunicator::nCommThreads = 1;
|
||||||
#ifdef GRID_COMMS_THREADS
|
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){
|
|
||||||
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads");
|
|
||||||
GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads);
|
|
||||||
assert(CartesianCommunicator::nCommThreads > 0);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
|
|
||||||
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
|
|
||||||
GridCmdOptionIntVector(arg,LebesgueOrder::Block);
|
|
||||||
}
|
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--notimestamp") ){
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--notimestamp") ){
|
||||||
GridLogTimestamp(0);
|
GridLogTimestamp(0);
|
||||||
} else {
|
} else {
|
||||||
@@ -573,8 +549,34 @@ void GridLogLayout() {
|
|||||||
|
|
||||||
void * Grid_backtrace_buffer[_NBACKTRACE];
|
void * Grid_backtrace_buffer[_NBACKTRACE];
|
||||||
|
|
||||||
|
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
|
||||||
|
{
|
||||||
|
fprintf(stderr,"Signal handler on host %s\n",hostname);
|
||||||
|
fprintf(stderr,"FlightRecorder step %d stage %s \n",
|
||||||
|
FlightRecorder::StepLoggingCounter,
|
||||||
|
FlightRecorder::StepName);
|
||||||
|
fprintf(stderr,"Caught signal %d\n",si->si_signo);
|
||||||
|
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
|
||||||
|
fprintf(stderr," code %d\n",si->si_code);
|
||||||
|
// x86 64bit
|
||||||
|
#ifdef __linux__
|
||||||
|
#ifdef __x86_64__
|
||||||
|
ucontext_t * uc= (ucontext_t *)ptr;
|
||||||
|
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
|
||||||
|
fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
fflush(stderr);
|
||||||
|
BACKTRACEFP(stderr);
|
||||||
|
fprintf(stderr,"Called backtrace\n");
|
||||||
|
fflush(stdout);
|
||||||
|
fflush(stderr);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
|
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
|
||||||
{
|
{
|
||||||
|
fprintf(stderr,"Signal handler on host %s\n",hostname);
|
||||||
fprintf(stderr,"Caught signal %d\n",si->si_signo);
|
fprintf(stderr,"Caught signal %d\n",si->si_signo);
|
||||||
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
|
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
|
||||||
fprintf(stderr," code %d\n",si->si_code);
|
fprintf(stderr," code %d\n",si->si_code);
|
||||||
@@ -585,7 +587,7 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
|
|||||||
ucontext_t * uc= (ucontext_t *)ptr;
|
ucontext_t * uc= (ucontext_t *)ptr;
|
||||||
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
|
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
|
||||||
fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
|
fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
|
||||||
#define REG(A) printf(" %s %lx\n",#A,sc-> A);
|
#define REG(A) fprintf(stderr," %s %lx\n",#A,sc-> A);
|
||||||
REG(rdi);
|
REG(rdi);
|
||||||
REG(rsi);
|
REG(rsi);
|
||||||
REG(rbp);
|
REG(rbp);
|
||||||
@@ -618,8 +620,8 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
|
|||||||
|
|
||||||
void Grid_exit_handler(void)
|
void Grid_exit_handler(void)
|
||||||
{
|
{
|
||||||
BACKTRACEFP(stdout);
|
// BACKTRACEFP(stdout);
|
||||||
fflush(stdout);
|
// fflush(stdout);
|
||||||
}
|
}
|
||||||
void Grid_debug_handler_init(void)
|
void Grid_debug_handler_init(void)
|
||||||
{
|
{
|
||||||
@@ -627,10 +629,10 @@ void Grid_debug_handler_init(void)
|
|||||||
sigemptyset (&sa.sa_mask);
|
sigemptyset (&sa.sa_mask);
|
||||||
sa.sa_sigaction= Grid_sa_signal_handler;
|
sa.sa_sigaction= Grid_sa_signal_handler;
|
||||||
sa.sa_flags = SA_SIGINFO;
|
sa.sa_flags = SA_SIGINFO;
|
||||||
sigaction(SIGSEGV,&sa,NULL);
|
// sigaction(SIGSEGV,&sa,NULL);
|
||||||
sigaction(SIGTRAP,&sa,NULL);
|
sigaction(SIGTRAP,&sa,NULL);
|
||||||
sigaction(SIGBUS,&sa,NULL);
|
sigaction(SIGBUS,&sa,NULL);
|
||||||
sigaction(SIGUSR2,&sa,NULL);
|
// sigaction(SIGUSR2,&sa,NULL);
|
||||||
|
|
||||||
feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
|
feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
|
||||||
|
|
||||||
@@ -638,7 +640,14 @@ void Grid_debug_handler_init(void)
|
|||||||
sigaction(SIGKILL,&sa,NULL);
|
sigaction(SIGKILL,&sa,NULL);
|
||||||
sigaction(SIGILL,&sa,NULL);
|
sigaction(SIGILL,&sa,NULL);
|
||||||
|
|
||||||
atexit(Grid_exit_handler);
|
// Non terminating SIGUSR1/2 handler
|
||||||
|
struct sigaction sa_ping;
|
||||||
|
sigemptyset (&sa_ping.sa_mask);
|
||||||
|
sa_ping.sa_sigaction= Grid_usr_signal_handler;
|
||||||
|
sa_ping.sa_flags = SA_SIGINFO;
|
||||||
|
sigaction(SIGHUP,&sa_ping,NULL);
|
||||||
|
|
||||||
|
// atexit(Grid_exit_handler);
|
||||||
}
|
}
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|||||||
@@ -644,11 +644,6 @@ int main (int argc, char ** argv)
|
|||||||
Grid_init(&argc,&argv);
|
Grid_init(&argc,&argv);
|
||||||
|
|
||||||
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
|
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
|
||||||
#ifdef KNL
|
|
||||||
LebesgueOrder::Block = std::vector<int>({8,2,2,2});
|
|
||||||
#else
|
|
||||||
LebesgueOrder::Block = std::vector<int>({2,2,2,2});
|
|
||||||
#endif
|
|
||||||
Benchmark::Decomposition();
|
Benchmark::Decomposition();
|
||||||
|
|
||||||
int do_su4=1;
|
int do_su4=1;
|
||||||
|
|||||||
@@ -70,7 +70,7 @@ int main (int argc, char ** argv)
|
|||||||
pRNG.SeedFixedIntegers(std::vector<int>({56,17,89,101}));
|
pRNG.SeedFixedIntegers(std::vector<int>({56,17,89,101}));
|
||||||
|
|
||||||
std::vector<double> stop(threads);
|
std::vector<double> stop(threads);
|
||||||
Vector<Vec> sum(threads);
|
std::vector<Vec> sum(threads);
|
||||||
|
|
||||||
std::vector<LatticeVec> x(threads,&Grid);
|
std::vector<LatticeVec> x(threads,&Grid);
|
||||||
for(int t=0;t<threads;t++){
|
for(int t=0;t<threads;t++){
|
||||||
|
|||||||
@@ -78,9 +78,9 @@ int main (int argc, char ** argv)
|
|||||||
double t0,t1;
|
double t0,t1;
|
||||||
|
|
||||||
typedef typename DomainWallFermionD::Coeff_t Coeff_t;
|
typedef typename DomainWallFermionD::Coeff_t Coeff_t;
|
||||||
Vector<Coeff_t> diag = Dw.bs;
|
std::vector<Coeff_t> diag = Dw.bs;
|
||||||
Vector<Coeff_t> upper= Dw.cs;
|
std::vector<Coeff_t> upper= Dw.cs;
|
||||||
Vector<Coeff_t> lower= Dw.cs;
|
std::vector<Coeff_t> lower= Dw.cs;
|
||||||
upper[Ls-1]=-Dw.mass_minus*upper[Ls-1];
|
upper[Ls-1]=-Dw.mass_minus*upper[Ls-1];
|
||||||
lower[0] =-Dw.mass_plus*lower[0];
|
lower[0] =-Dw.mass_plus*lower[0];
|
||||||
|
|
||||||
|
|||||||
@@ -861,7 +861,7 @@ int main (int argc, char ** argv)
|
|||||||
}
|
}
|
||||||
|
|
||||||
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
|
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
|
||||||
LebesgueOrder::Block = std::vector<int>({2,2,2,2});
|
// LebesgueOrder::Block = std::vector<int>({2,2,2,2});
|
||||||
|
|
||||||
Benchmark::Decomposition();
|
Benchmark::Decomposition();
|
||||||
|
|
||||||
|
|||||||
36
configure.ac
36
configure.ac
@@ -128,6 +128,20 @@ case ${ac_LAPACK} in
|
|||||||
AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
|
AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
|
############### internal reduction
|
||||||
|
AC_ARG_ENABLE([reduction],
|
||||||
|
[AS_HELP_STRING([--enable-reduction=mpi|grid],[enable reduction])],
|
||||||
|
[ac_REDUCTION=${enable_reduction}], [ac_REDUCTION=grid])
|
||||||
|
|
||||||
|
case ${ac_REDUCTION} in
|
||||||
|
mpi)
|
||||||
|
;;
|
||||||
|
grid)
|
||||||
|
AC_DEFINE([USE_GRID_REDUCTION],[1],[use GRID REDUCTION]);;
|
||||||
|
*)
|
||||||
|
AC_DEFINE([USE_GRID_REDUCTION],[1],[use GRID REDUCTION]);;
|
||||||
|
esac
|
||||||
|
|
||||||
############### tracing
|
############### tracing
|
||||||
AC_ARG_ENABLE([tracing],
|
AC_ARG_ENABLE([tracing],
|
||||||
[AS_HELP_STRING([--enable-tracing=none|nvtx|roctx|timer],[enable tracing])],
|
[AS_HELP_STRING([--enable-tracing=none|nvtx|roctx|timer],[enable tracing])],
|
||||||
@@ -225,18 +239,6 @@ case ${ac_SFW_FP16} in
|
|||||||
AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);;
|
AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
############### Default to accelerator cshift, but revert to host if UCX is buggy or other reasons
|
|
||||||
AC_ARG_ENABLE([accelerator-aware-mpi],
|
|
||||||
[AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])],
|
|
||||||
[ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes])
|
|
||||||
|
|
||||||
case ${ac_ACCELERATOR_AWARE_MPI} in
|
|
||||||
yes)
|
|
||||||
AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on host])
|
|
||||||
AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);;
|
|
||||||
*);;
|
|
||||||
esac
|
|
||||||
|
|
||||||
|
|
||||||
############### SYCL/CUDA/HIP/none
|
############### SYCL/CUDA/HIP/none
|
||||||
AC_ARG_ENABLE([accelerator],
|
AC_ARG_ENABLE([accelerator],
|
||||||
@@ -664,16 +666,6 @@ case ${ac_SHM_FAST_PATH} in
|
|||||||
*) ;;
|
*) ;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
############### communication type selection
|
|
||||||
AC_ARG_ENABLE([comms-threads],[AS_HELP_STRING([--enable-comms-threads | --disable-comms-threads],[Use multiple threads in MPI calls])],[ac_COMMS_THREADS=${enable_comms_threads}],[ac_COMMS_THREADS=yes])
|
|
||||||
|
|
||||||
case ${ac_COMMS_THREADS} in
|
|
||||||
yes)
|
|
||||||
AC_DEFINE([GRID_COMMS_THREADING],[1],[GRID_COMMS_NONE] )
|
|
||||||
;;
|
|
||||||
*) ;;
|
|
||||||
esac
|
|
||||||
|
|
||||||
############### communication type selection
|
############### communication type selection
|
||||||
AC_ARG_ENABLE([comms],[AS_HELP_STRING([--enable-comms=none|mpi|mpi-auto],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
|
AC_ARG_ENABLE([comms],[AS_HELP_STRING([--enable-comms=none|mpi|mpi-auto],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
|
||||||
|
|
||||||
|
|||||||
23
systems/Aurora-AOT/config-command
Normal file
23
systems/Aurora-AOT/config-command
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
#Ahead of time compile for PVC
|
||||||
|
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
|
||||||
|
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -fsycl-targets=spir64_gen -Xs -device -Xs pvc "
|
||||||
|
|
||||||
|
#JIT compile
|
||||||
|
#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
|
||||||
|
#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions "
|
||||||
|
|
||||||
|
../../configure \
|
||||||
|
--enable-simd=GPU \
|
||||||
|
--enable-gen-simd-width=64 \
|
||||||
|
--enable-comms=mpi-auto \
|
||||||
|
--enable-debug \
|
||||||
|
--disable-gparity \
|
||||||
|
--disable-fermion-reps \
|
||||||
|
--with-lime=$CLIME \
|
||||||
|
--enable-shm=nvlink \
|
||||||
|
--enable-accelerator=sycl \
|
||||||
|
--enable-accelerator-aware-mpi=yes\
|
||||||
|
--enable-unified=no \
|
||||||
|
MPICXX=mpicxx \
|
||||||
|
CXX=icpx
|
||||||
|
|
||||||
15
systems/Aurora-AOT/sourceme.sh
Normal file
15
systems/Aurora-AOT/sourceme.sh
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
#module load oneapi/release/2023.12.15.001
|
||||||
|
#module load mpich/icc-all-debug-pmix-gpu/52.2
|
||||||
|
#module load mpich-config/mode/deterministic
|
||||||
|
#module load intel_compute_runtime/release/821.35
|
||||||
|
|
||||||
|
source ~/spack/share/spack/setup-env.sh
|
||||||
|
spack load c-lime
|
||||||
|
spack load openssl
|
||||||
|
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
|
||||||
|
export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
|
||||||
|
export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
|
||||||
|
export http_proxy=http://proxy.alcf.anl.gov:3128
|
||||||
|
export https_proxy=http://proxy.alcf.anl.gov:3128
|
||||||
|
git config --global http.proxy http://proxy.alcf.anl.gov:3128
|
||||||
|
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user