mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-18 15:57:05 +01:00
Compare commits
3 Commits
9fa8bd6438
...
feature/ft
Author | SHA1 | Date | |
---|---|---|---|
bffd30abec | |||
da919949f9 | |||
b12b4fdaff |
@ -12,13 +12,15 @@
|
|||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <sys/time.h>
|
#include <sys/time.h>
|
||||||
|
|
||||||
|
#define GRID_SYCL
|
||||||
|
#undef GRID_HIP
|
||||||
|
#undef GRID_CUDA
|
||||||
|
|
||||||
#ifdef GRID_HIP
|
#ifdef GRID_HIP
|
||||||
#include <hipblas/hipblas.h>
|
#include <hipblas/hipblas.h>
|
||||||
#endif
|
#endif
|
||||||
#ifdef GRID_CUDA
|
#ifdef GRID_CUDA
|
||||||
#include <cublas_v2.h>
|
#include <cublas_v2.h>
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
#ifdef GRID_SYCL
|
#ifdef GRID_SYCL
|
||||||
#include <oneapi/mkl.hpp>
|
#include <oneapi/mkl.hpp>
|
||||||
@ -43,90 +45,6 @@ inline void acceleratorFreeDevice(void *ptr,size_t bytes){free(ptr,*theAccelerat
|
|||||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theAccelerator->memset(base,value,bytes); theAccelerator->wait();}
|
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theAccelerator->memset(base,value,bytes); theAccelerator->wait();}
|
||||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theAccelerator->memcpy(to,from,bytes); theAccelerator->wait();}
|
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theAccelerator->memcpy(to,from,bytes); theAccelerator->wait();}
|
||||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theAccelerator->memcpy(to,from,bytes); theAccelerator->wait();}
|
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theAccelerator->memcpy(to,from,bytes); theAccelerator->wait();}
|
||||||
#define accelerator_barrier(dummy) { theAccelerator->wait(); }
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef GRID_HIP
|
|
||||||
hipStream_t copyStream;
|
|
||||||
hipStream_t computeStream;
|
|
||||||
void acceleratorInit(void)
|
|
||||||
{
|
|
||||||
int device = 0;
|
|
||||||
auto discard = hipSetDevice(device);
|
|
||||||
discard = hipStreamCreate(&copyStream);
|
|
||||||
discard = hipStreamCreate(&computeStream);
|
|
||||||
printf("AcceleratorHIPInit\n");
|
|
||||||
}
|
|
||||||
inline void *acceleratorAllocDevice(size_t bytes)
|
|
||||||
{
|
|
||||||
void *ptr=NULL;
|
|
||||||
auto err = hipMalloc((void **)&ptr,bytes);
|
|
||||||
if( err != hipSuccess ) {
|
|
||||||
ptr = (void *) NULL;
|
|
||||||
fprintf(stderr," hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
|
|
||||||
}
|
|
||||||
return ptr;
|
|
||||||
};
|
|
||||||
inline void acceleratorFreeDevice(void *ptr,size_t bytes){ auto discard=hipFree(ptr);};
|
|
||||||
inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
|
|
||||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
|
|
||||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
|
|
||||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
|
|
||||||
#define accelerator_barrier(dummy) \
|
|
||||||
{ \
|
|
||||||
auto tmp=hipStreamSynchronize(computeStream); \
|
|
||||||
auto err = hipGetLastError(); \
|
|
||||||
if ( err != hipSuccess ) { \
|
|
||||||
printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
|
|
||||||
puts(__FILE__); \
|
|
||||||
printf("Line %d\n",__LINE__); \
|
|
||||||
exit(0); \
|
|
||||||
} \
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef GRID_CUDA
|
|
||||||
cudaStream_t copyStream;
|
|
||||||
cudaStream_t computeStream;
|
|
||||||
void acceleratorInit(void)
|
|
||||||
{
|
|
||||||
int device = 0;
|
|
||||||
cudaSetDevice(device);
|
|
||||||
cudaStreamCreate(&copyStream);
|
|
||||||
cudaStreamCreate(&computeStream);
|
|
||||||
}
|
|
||||||
inline void *acceleratorAllocDevice(size_t bytes)
|
|
||||||
{
|
|
||||||
void *ptr=NULL;
|
|
||||||
auto err = cudaMalloc((void **)&ptr,bytes);
|
|
||||||
if( err != cudaSuccess ) {
|
|
||||||
ptr = (void *) NULL;
|
|
||||||
printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
|
|
||||||
}
|
|
||||||
return ptr;
|
|
||||||
};
|
|
||||||
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
|
|
||||||
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
|
|
||||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
|
|
||||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
|
|
||||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
|
|
||||||
#define accelerator_barrier(dummy) \
|
|
||||||
{ \
|
|
||||||
cudaStreamSynchronize(computeStream); \
|
|
||||||
cudaError err = cudaGetLastError(); \
|
|
||||||
if ( cudaSuccess != err ) { \
|
|
||||||
printf("accelerator_barrier(): Cuda error %s \n", \
|
|
||||||
cudaGetErrorString( err )); \
|
|
||||||
printf("File %s Line %d\n",__FILE__,__LINE__); \
|
|
||||||
fflush(stdout); \
|
|
||||||
if (acceleratorAbortOnGpuError) assert(err==cudaSuccess); \
|
|
||||||
} \
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
template<class T> void acceleratorPut(T& dev,T&host)
|
template<class T> void acceleratorPut(T& dev,T&host)
|
||||||
{
|
{
|
||||||
acceleratorCopyToDevice(&host,&dev,sizeof(T));
|
acceleratorCopyToDevice(&host,&dev,sizeof(T));
|
||||||
@ -137,6 +55,9 @@ template<class T> T acceleratorGet(T& dev)
|
|||||||
acceleratorCopyFromDevice(&dev,&host,sizeof(T));
|
acceleratorCopyFromDevice(&dev,&host,sizeof(T));
|
||||||
return host;
|
return host;
|
||||||
}
|
}
|
||||||
|
#define accelerator_barrier(dummy) { theAccelerator->wait(); }
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/**************************************************************
|
/**************************************************************
|
||||||
* Allocator
|
* Allocator
|
||||||
@ -290,269 +211,6 @@ public:
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////
|
|
||||||
// Single matrix GEMM -- fp64 and fp32
|
|
||||||
/////////////////////////////////////////////////////////////
|
|
||||||
void gemm(GridBLASOperation_t OpA,
|
|
||||||
GridBLASOperation_t OpB,
|
|
||||||
int m,int n, int k,
|
|
||||||
ComplexD alpha,
|
|
||||||
ComplexD* Amk, // Device pointer
|
|
||||||
ComplexD* Bkn,
|
|
||||||
ComplexD beta,
|
|
||||||
ComplexD* Cmn)
|
|
||||||
{
|
|
||||||
RealD t2=usecond();
|
|
||||||
|
|
||||||
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
|
||||||
assert(OpB!=GridBLAS_OP_T);
|
|
||||||
|
|
||||||
int lda = m; // m x k column major
|
|
||||||
int ldb = k; // k x n column major
|
|
||||||
int ldc = m; // m x b column major
|
|
||||||
if(OpA!=GridBLAS_OP_N)
|
|
||||||
lda = k;
|
|
||||||
if(OpB!=GridBLAS_OP_N)
|
|
||||||
ldb = n;
|
|
||||||
|
|
||||||
static deviceVector<ComplexD> alpha_p(1);
|
|
||||||
static deviceVector<ComplexD> beta_p(1);
|
|
||||||
// can prestore the 1 and the zero on device
|
|
||||||
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
|
|
||||||
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
|
|
||||||
RealD t0=usecond();
|
|
||||||
|
|
||||||
#ifdef GRID_HIP
|
|
||||||
hipblasOperation_t hOpA;
|
|
||||||
hipblasOperation_t hOpB;
|
|
||||||
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
|
|
||||||
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
|
|
||||||
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
|
|
||||||
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
|
|
||||||
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
|
|
||||||
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
|
|
||||||
auto err = hipblasZgemm(gridblasHandle,
|
|
||||||
hOpA,
|
|
||||||
hOpB,
|
|
||||||
m,n,k,
|
|
||||||
(hipblasDoubleComplex *) &alpha_p[0],
|
|
||||||
(hipblasDoubleComplex *) Amk, lda,
|
|
||||||
(hipblasDoubleComplex *) Bkn, ldb,
|
|
||||||
(hipblasDoubleComplex *) &beta_p[0],
|
|
||||||
(hipblasDoubleComplex *) Cmn, ldc);
|
|
||||||
assert(err==HIPBLAS_STATUS_SUCCESS);
|
|
||||||
#endif
|
|
||||||
#ifdef GRID_CUDA
|
|
||||||
cublasOperation_t hOpA;
|
|
||||||
cublasOperation_t hOpB;
|
|
||||||
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
|
|
||||||
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
|
|
||||||
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
|
|
||||||
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
|
|
||||||
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
|
|
||||||
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
|
|
||||||
auto err = cublasZgemm(gridblasHandle,
|
|
||||||
hOpA,
|
|
||||||
hOpB,
|
|
||||||
m,n,k,
|
|
||||||
(cuDoubleComplex *) &alpha_p[0],
|
|
||||||
(cuDoubleComplex *) Amk, lda,
|
|
||||||
(cuDoubleComplex *) Bkn, ldb,
|
|
||||||
(cuDoubleComplex *) &beta_p[0],
|
|
||||||
(cuDoubleComplex *) Cmn, ldc);
|
|
||||||
assert(err==CUBLAS_STATUS_SUCCESS);
|
|
||||||
#endif
|
|
||||||
#ifdef GRID_SYCL
|
|
||||||
int64_t m64=m;
|
|
||||||
int64_t n64=n;
|
|
||||||
int64_t k64=k;
|
|
||||||
int64_t lda64=lda;
|
|
||||||
int64_t ldb64=ldb;
|
|
||||||
int64_t ldc64=ldc;
|
|
||||||
|
|
||||||
oneapi::mkl::transpose iOpA;
|
|
||||||
oneapi::mkl::transpose iOpB;
|
|
||||||
|
|
||||||
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
|
|
||||||
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
|
|
||||||
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
|
|
||||||
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
|
|
||||||
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
|
|
||||||
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
|
|
||||||
|
|
||||||
oneapi::mkl::blas::column_major::gemm(*gridblasHandle,
|
|
||||||
iOpA,
|
|
||||||
iOpB,
|
|
||||||
m64,n64,k64,
|
|
||||||
(ComplexD *) &alpha_p[0],
|
|
||||||
(const ComplexD *)Amk, (int64_t )lda64,
|
|
||||||
(const ComplexD *)Bkn, (int64_t )ldb64,
|
|
||||||
(ComplexD *) &beta_p[0],
|
|
||||||
(ComplexD *)Cmn, (int64_t)ldc64);
|
|
||||||
synchronise();
|
|
||||||
#endif
|
|
||||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
|
||||||
// Need a default/reference implementation; use Eigen
|
|
||||||
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
|
|
||||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,m,k);
|
|
||||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,k,n);
|
|
||||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
|
|
||||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
|
||||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
|
|
||||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,k,m);
|
|
||||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,k,n);
|
|
||||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
|
|
||||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
|
||||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
|
|
||||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,m,k);
|
|
||||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,n,k);
|
|
||||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
|
|
||||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
|
||||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
|
|
||||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,k,m);
|
|
||||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,n,k);
|
|
||||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
|
|
||||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
|
||||||
} else {
|
|
||||||
assert(0);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
RealD t1=usecond();
|
|
||||||
RealD flops = 8.0*m*n*k;
|
|
||||||
RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n);
|
|
||||||
}
|
|
||||||
void gemm(GridBLASOperation_t OpA,
|
|
||||||
GridBLASOperation_t OpB,
|
|
||||||
int m,int n, int k,
|
|
||||||
ComplexF alpha,
|
|
||||||
ComplexF* Amk, // Device pointer
|
|
||||||
ComplexF* Bkn,
|
|
||||||
ComplexF beta,
|
|
||||||
ComplexF* Cmn)
|
|
||||||
{
|
|
||||||
RealD t2=usecond();
|
|
||||||
|
|
||||||
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
|
||||||
assert(OpB!=GridBLAS_OP_T);
|
|
||||||
|
|
||||||
int lda = m; // m x k column major
|
|
||||||
int ldb = k; // k x n column major
|
|
||||||
int ldc = m; // m x b column major
|
|
||||||
if(OpA!=GridBLAS_OP_N)
|
|
||||||
lda = k;
|
|
||||||
if(OpB!=GridBLAS_OP_N)
|
|
||||||
ldb = n;
|
|
||||||
|
|
||||||
static deviceVector<ComplexF> alpha_p(1);
|
|
||||||
static deviceVector<ComplexF> beta_p(1);
|
|
||||||
// can prestore the 1 and the zero on device
|
|
||||||
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexF));
|
|
||||||
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
|
|
||||||
RealD t0=usecond();
|
|
||||||
|
|
||||||
#ifdef GRID_HIP
|
|
||||||
hipblasOperation_t hOpA;
|
|
||||||
hipblasOperation_t hOpB;
|
|
||||||
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
|
|
||||||
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
|
|
||||||
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
|
|
||||||
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
|
|
||||||
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
|
|
||||||
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
|
|
||||||
auto err = hipblasCgemm(gridblasHandle,
|
|
||||||
hOpA,
|
|
||||||
hOpB,
|
|
||||||
m,n,k,
|
|
||||||
(hipblasComplex *) &alpha_p[0],
|
|
||||||
(hipblasComplex *) Amk, lda,
|
|
||||||
(hipblasComplex *) Bkn, ldb,
|
|
||||||
(hipblasComplex *) &beta_p[0],
|
|
||||||
(hipblasComplex *) Cmn, ldc);
|
|
||||||
assert(err==HIPBLAS_STATUS_SUCCESS);
|
|
||||||
#endif
|
|
||||||
#ifdef GRID_CUDA
|
|
||||||
cublasOperation_t hOpA;
|
|
||||||
cublasOperation_t hOpB;
|
|
||||||
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
|
|
||||||
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
|
|
||||||
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
|
|
||||||
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
|
|
||||||
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
|
|
||||||
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
|
|
||||||
auto err = cublasCgemm(gridblasHandle,
|
|
||||||
hOpA,
|
|
||||||
hOpB,
|
|
||||||
m,n,k,
|
|
||||||
(cuComplex *) &alpha_p[0],
|
|
||||||
(cuComplex *) Amk, lda,
|
|
||||||
(cuComplex *) Bkn, ldb,
|
|
||||||
(cuComplex *) &beta_p[0],
|
|
||||||
(cuComplex *) Cmn, ldc);
|
|
||||||
assert(err==CUBLAS_STATUS_SUCCESS);
|
|
||||||
#endif
|
|
||||||
#ifdef GRID_SYCL
|
|
||||||
int64_t m64=m;
|
|
||||||
int64_t n64=n;
|
|
||||||
int64_t k64=k;
|
|
||||||
int64_t lda64=lda;
|
|
||||||
int64_t ldb64=ldb;
|
|
||||||
int64_t ldc64=ldc;
|
|
||||||
|
|
||||||
oneapi::mkl::transpose iOpA;
|
|
||||||
oneapi::mkl::transpose iOpB;
|
|
||||||
|
|
||||||
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
|
|
||||||
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
|
|
||||||
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
|
|
||||||
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
|
|
||||||
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
|
|
||||||
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
|
|
||||||
|
|
||||||
oneapi::mkl::blas::column_major::gemm(*gridblasHandle,
|
|
||||||
iOpA,
|
|
||||||
iOpB,
|
|
||||||
m64,n64,k64,
|
|
||||||
(ComplexF *) &alpha_p[0],
|
|
||||||
(const ComplexF *)Amk, (int64_t )lda64,
|
|
||||||
(const ComplexF *)Bkn, (int64_t )ldb64,
|
|
||||||
(ComplexF *) &beta_p[0],
|
|
||||||
(ComplexF *)Cmn, (int64_t )ldc64);
|
|
||||||
synchronise();
|
|
||||||
#endif
|
|
||||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
|
||||||
// Need a default/reference implementation; use Eigen
|
|
||||||
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
|
|
||||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,m,k);
|
|
||||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,k,n);
|
|
||||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
|
|
||||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
|
||||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
|
|
||||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,k,m);
|
|
||||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,k,n);
|
|
||||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
|
|
||||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
|
||||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
|
|
||||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,m,k);
|
|
||||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,n,k);
|
|
||||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
|
|
||||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
|
||||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
|
|
||||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,k,m);
|
|
||||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,n,k);
|
|
||||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
|
|
||||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
|
||||||
} else {
|
|
||||||
assert(0);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
RealD t1=usecond();
|
|
||||||
RealD flops = 8.0*m*n*k;
|
|
||||||
RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////
|
|
||||||
void gemmBatched(int m,int n, int k,
|
void gemmBatched(int m,int n, int k,
|
||||||
ComplexD alpha,
|
ComplexD alpha,
|
||||||
deviceVector<ComplexD*> &Amk, // pointer list to matrices
|
deviceVector<ComplexD*> &Amk, // pointer list to matrices
|
||||||
@ -583,6 +241,36 @@ public:
|
|||||||
beta,
|
beta,
|
||||||
Cmn);
|
Cmn);
|
||||||
}
|
}
|
||||||
|
void gemmBatched(int m,int n, int k,
|
||||||
|
RealD alpha,
|
||||||
|
deviceVector<RealD*> &Amk, // pointer list to matrices
|
||||||
|
deviceVector<RealD*> &Bkn,
|
||||||
|
RealD beta,
|
||||||
|
deviceVector<RealD*> &Cmn)
|
||||||
|
{
|
||||||
|
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
|
||||||
|
m,n,k,
|
||||||
|
alpha,
|
||||||
|
Amk,
|
||||||
|
Bkn,
|
||||||
|
beta,
|
||||||
|
Cmn);
|
||||||
|
}
|
||||||
|
void gemmBatched(int m,int n, int k,
|
||||||
|
RealF alpha,
|
||||||
|
deviceVector<RealF*> &Amk, // pointer list to matrices
|
||||||
|
deviceVector<RealF*> &Bkn,
|
||||||
|
RealF beta,
|
||||||
|
deviceVector<RealF*> &Cmn)
|
||||||
|
{
|
||||||
|
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
|
||||||
|
m,n,k,
|
||||||
|
alpha,
|
||||||
|
Amk,
|
||||||
|
Bkn,
|
||||||
|
beta,
|
||||||
|
Cmn);
|
||||||
|
}
|
||||||
|
|
||||||
void gemmBatched(GridBLASOperation_t OpA,
|
void gemmBatched(GridBLASOperation_t OpA,
|
||||||
GridBLASOperation_t OpB,
|
GridBLASOperation_t OpB,
|
||||||
@ -936,6 +624,301 @@ public:
|
|||||||
RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount;
|
RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
// Single precision real GEMM
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
void gemmBatched(GridBLASOperation_t OpA,
|
||||||
|
GridBLASOperation_t OpB,
|
||||||
|
int m,int n, int k,
|
||||||
|
RealF alpha,
|
||||||
|
deviceVector<RealF*> &Amk, // pointer list to matrices
|
||||||
|
deviceVector<RealF*> &Bkn,
|
||||||
|
RealF beta,
|
||||||
|
deviceVector<RealF*> &Cmn)
|
||||||
|
{
|
||||||
|
RealD t2=usecond();
|
||||||
|
int32_t batchCount = Amk.size();
|
||||||
|
|
||||||
|
assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
|
||||||
|
assert(OpB!=GridBLAS_OP_C);
|
||||||
|
|
||||||
|
int lda = m; // m x k column major
|
||||||
|
int ldb = k; // k x n column major
|
||||||
|
int ldc = m; // m x b column major
|
||||||
|
if(OpA!=GridBLAS_OP_N)
|
||||||
|
lda = k;
|
||||||
|
if(OpB!=GridBLAS_OP_N)
|
||||||
|
ldb = n;
|
||||||
|
static deviceVector<RealF> alpha_p(1);
|
||||||
|
static deviceVector<RealF> beta_p(1);
|
||||||
|
// can prestore the 1 and the zero on device
|
||||||
|
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealF));
|
||||||
|
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF));
|
||||||
|
RealD t0=usecond();
|
||||||
|
|
||||||
|
assert(Bkn.size()==batchCount);
|
||||||
|
assert(Cmn.size()==batchCount);
|
||||||
|
#ifdef GRID_HIP
|
||||||
|
hipblasOperation_t hOpA;
|
||||||
|
hipblasOperation_t hOpB;
|
||||||
|
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
|
||||||
|
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
|
||||||
|
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
|
||||||
|
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
|
||||||
|
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
|
||||||
|
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
|
||||||
|
auto err = hipblasSgemmBatched(gridblasHandle,
|
||||||
|
hOpA,
|
||||||
|
hOpB,
|
||||||
|
m,n,k,
|
||||||
|
(float *) &alpha_p[0],
|
||||||
|
(float **)&Amk[0], lda,
|
||||||
|
(float **)&Bkn[0], ldb,
|
||||||
|
(float *) &beta_p[0],
|
||||||
|
(float **)&Cmn[0], ldc,
|
||||||
|
batchCount);
|
||||||
|
assert(err==HIPBLAS_STATUS_SUCCESS);
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_CUDA
|
||||||
|
cublasOperation_t hOpA;
|
||||||
|
cublasOperation_t hOpB;
|
||||||
|
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
|
||||||
|
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
|
||||||
|
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
|
||||||
|
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
|
||||||
|
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
|
||||||
|
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
|
||||||
|
auto err = cublasSgemmBatched(gridblasHandle,
|
||||||
|
hOpA,
|
||||||
|
hOpB,
|
||||||
|
m,n,k,
|
||||||
|
(float *) &alpha_p[0],
|
||||||
|
(float **)&Amk[0], lda,
|
||||||
|
(float **)&Bkn[0], ldb,
|
||||||
|
(float *) &beta_p[0],
|
||||||
|
(float **)&Cmn[0], ldc,
|
||||||
|
batchCount);
|
||||||
|
assert(err==CUBLAS_STATUS_SUCCESS);
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_SYCL
|
||||||
|
int64_t m64=m;
|
||||||
|
int64_t n64=n;
|
||||||
|
int64_t k64=k;
|
||||||
|
int64_t lda64=lda;
|
||||||
|
int64_t ldb64=ldb;
|
||||||
|
int64_t ldc64=ldc;
|
||||||
|
int64_t batchCount64=batchCount;
|
||||||
|
|
||||||
|
oneapi::mkl::transpose iOpA;
|
||||||
|
oneapi::mkl::transpose iOpB;
|
||||||
|
|
||||||
|
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
|
||||||
|
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
|
||||||
|
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
|
||||||
|
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
|
||||||
|
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
|
||||||
|
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
|
||||||
|
|
||||||
|
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
|
||||||
|
&iOpA,
|
||||||
|
&iOpB,
|
||||||
|
&m64,&n64,&k64,
|
||||||
|
(float *) &alpha_p[0],
|
||||||
|
(const float **)&Amk[0], (const int64_t *)&lda64,
|
||||||
|
(const float **)&Bkn[0], (const int64_t *)&ldb64,
|
||||||
|
(float *) &beta_p[0],
|
||||||
|
(float **)&Cmn[0], (const int64_t *)&ldc64,
|
||||||
|
(int64_t)1,&batchCount64,std::vector<sycl::event>());
|
||||||
|
synchronise();
|
||||||
|
#endif
|
||||||
|
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
||||||
|
// Need a default/reference implementation; use Eigen
|
||||||
|
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
|
||||||
|
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||||
|
});
|
||||||
|
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
|
||||||
|
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
|
||||||
|
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||||
|
});
|
||||||
|
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||||
|
});
|
||||||
|
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
|
||||||
|
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||||
|
} );
|
||||||
|
} else {
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
RealD t1=usecond();
|
||||||
|
RealD flops = 2.0*m*n*k*batchCount;
|
||||||
|
RealD bytes = 1.0*sizeof(RealF)*(m*k+k*n+m*n)*batchCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
// Double precision real GEMM
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
void gemmBatched(GridBLASOperation_t OpA,
|
||||||
|
GridBLASOperation_t OpB,
|
||||||
|
int m,int n, int k,
|
||||||
|
RealD alpha,
|
||||||
|
deviceVector<RealD*> &Amk, // pointer list to matrices
|
||||||
|
deviceVector<RealD*> &Bkn,
|
||||||
|
RealD beta,
|
||||||
|
deviceVector<RealD*> &Cmn)
|
||||||
|
{
|
||||||
|
RealD t2=usecond();
|
||||||
|
int32_t batchCount = Amk.size();
|
||||||
|
|
||||||
|
assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
|
||||||
|
assert(OpB!=GridBLAS_OP_C);
|
||||||
|
|
||||||
|
int lda = m; // m x k column major
|
||||||
|
int ldb = k; // k x n column major
|
||||||
|
int ldc = m; // m x b column major
|
||||||
|
if(OpA!=GridBLAS_OP_N)
|
||||||
|
lda = k;
|
||||||
|
if(OpB!=GridBLAS_OP_N)
|
||||||
|
ldb = n;
|
||||||
|
|
||||||
|
static deviceVector<RealD> alpha_p(1);
|
||||||
|
static deviceVector<RealD> beta_p(1);
|
||||||
|
// can prestore the 1 and the zero on device
|
||||||
|
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealD));
|
||||||
|
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD));
|
||||||
|
RealD t0=usecond();
|
||||||
|
|
||||||
|
assert(Bkn.size()==batchCount);
|
||||||
|
assert(Cmn.size()==batchCount);
|
||||||
|
#ifdef GRID_HIP
|
||||||
|
hipblasOperation_t hOpA;
|
||||||
|
hipblasOperation_t hOpB;
|
||||||
|
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
|
||||||
|
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
|
||||||
|
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
|
||||||
|
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
|
||||||
|
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
|
||||||
|
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
|
||||||
|
auto err = hipblasDgemmBatched(gridblasHandle,
|
||||||
|
HIPBLAS_OP_N,
|
||||||
|
HIPBLAS_OP_N,
|
||||||
|
m,n,k,
|
||||||
|
(double *) &alpha_p[0],
|
||||||
|
(double **)&Amk[0], lda,
|
||||||
|
(double **)&Bkn[0], ldb,
|
||||||
|
(double *) &beta_p[0],
|
||||||
|
(double **)&Cmn[0], ldc,
|
||||||
|
batchCount);
|
||||||
|
assert(err==HIPBLAS_STATUS_SUCCESS);
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_CUDA
|
||||||
|
cublasOperation_t hOpA;
|
||||||
|
cublasOperation_t hOpB;
|
||||||
|
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
|
||||||
|
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
|
||||||
|
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
|
||||||
|
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
|
||||||
|
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
|
||||||
|
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
|
||||||
|
auto err = cublasDgemmBatched(gridblasHandle,
|
||||||
|
hOpA,
|
||||||
|
hOpB,
|
||||||
|
m,n,k,
|
||||||
|
(double *) &alpha_p[0],
|
||||||
|
(double **)&Amk[0], lda,
|
||||||
|
(double **)&Bkn[0], ldb,
|
||||||
|
(double *) &beta_p[0],
|
||||||
|
(double **)&Cmn[0], ldc,
|
||||||
|
batchCount);
|
||||||
|
assert(err==CUBLAS_STATUS_SUCCESS);
|
||||||
|
#endif
|
||||||
|
#ifdef GRID_SYCL
|
||||||
|
int64_t m64=m;
|
||||||
|
int64_t n64=n;
|
||||||
|
int64_t k64=k;
|
||||||
|
int64_t lda64=lda;
|
||||||
|
int64_t ldb64=ldb;
|
||||||
|
int64_t ldc64=ldc;
|
||||||
|
int64_t batchCount64=batchCount;
|
||||||
|
|
||||||
|
oneapi::mkl::transpose iOpA;
|
||||||
|
oneapi::mkl::transpose iOpB;
|
||||||
|
|
||||||
|
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
|
||||||
|
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
|
||||||
|
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
|
||||||
|
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
|
||||||
|
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
|
||||||
|
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
|
||||||
|
|
||||||
|
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
|
||||||
|
&iOpA,
|
||||||
|
&iOpB,
|
||||||
|
&m64,&n64,&k64,
|
||||||
|
(double *) &alpha_p[0],
|
||||||
|
(const double **)&Amk[0], (const int64_t *)&lda64,
|
||||||
|
(const double **)&Bkn[0], (const int64_t *)&ldb64,
|
||||||
|
(double *) &beta_p[0],
|
||||||
|
(double **)&Cmn[0], (const int64_t *)&ldc64,
|
||||||
|
(int64_t)1,&batchCount64,std::vector<sycl::event>());
|
||||||
|
synchronise();
|
||||||
|
#endif
|
||||||
|
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
||||||
|
// Need a default/reference implementation; use Eigen
|
||||||
|
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
|
||||||
|
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||||
|
});
|
||||||
|
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
|
||||||
|
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
|
||||||
|
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||||
|
});
|
||||||
|
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||||
|
});
|
||||||
|
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||||
|
thread_for (p, batchCount, {
|
||||||
|
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
|
||||||
|
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
|
||||||
|
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||||
|
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
RealD t1=usecond();
|
||||||
|
RealD flops = 2.0*m*n*k*batchCount;
|
||||||
|
RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
|
||||||
|
}
|
||||||
|
|
||||||
template<class CComplex>
|
template<class CComplex>
|
||||||
double benchmark(int M, int N, int K, int BATCH)
|
double benchmark(int M, int N, int K, int BATCH)
|
||||||
{
|
{
|
||||||
@ -984,47 +967,6 @@ public:
|
|||||||
return flops; // Returns gigaflops
|
return flops; // Returns gigaflops
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class CComplex>
|
|
||||||
double benchmark(int M, int N, int K)
|
|
||||||
{
|
|
||||||
int32_t N_A = M*K;
|
|
||||||
int32_t N_B = K*N;
|
|
||||||
int32_t N_C = M*N;
|
|
||||||
deviceVector<CComplex> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(CComplex));
|
|
||||||
deviceVector<CComplex> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(CComplex));
|
|
||||||
deviceVector<CComplex> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(CComplex));
|
|
||||||
CComplex alpha(1.0);
|
|
||||||
CComplex beta (1.0);
|
|
||||||
RealD flops = 8.0*M*N*K;
|
|
||||||
int ncall=10;
|
|
||||||
|
|
||||||
gemm(GridBLAS_OP_C,GridBLAS_OP_N,
|
|
||||||
M,N,K,
|
|
||||||
alpha,
|
|
||||||
&A[0], // m x k
|
|
||||||
&B[0], // k x n
|
|
||||||
beta,
|
|
||||||
&C[0]);
|
|
||||||
synchronise();
|
|
||||||
|
|
||||||
RealD t0 = usecond();
|
|
||||||
for(int i=0;i<ncall;i++){
|
|
||||||
gemm(GridBLAS_OP_N,GridBLAS_OP_N,
|
|
||||||
M,N,K,
|
|
||||||
alpha,
|
|
||||||
&A[0], // m x k
|
|
||||||
&B[0], // k x n
|
|
||||||
beta,
|
|
||||||
&C[0]);
|
|
||||||
synchronise();
|
|
||||||
}
|
|
||||||
RealD t1 = usecond();
|
|
||||||
RealD bytes = 1.0*sizeof(CComplex)*(M*N*2+N*K+M*K);
|
|
||||||
flops = 8.0*M*N*K*ncall;
|
|
||||||
flops = flops/(t1-t0)/1.e3;
|
|
||||||
return flops; // Returns gigaflops
|
|
||||||
}
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@ -1093,21 +1035,6 @@ static void BLAS(void)
|
|||||||
std::cout<< M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl;
|
std::cout<< M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl;
|
||||||
}}
|
}}
|
||||||
fprintf(FP,"\n\n\n");
|
fprintf(FP,"\n\n\n");
|
||||||
|
|
||||||
std::cout << "----------------------------------------------------------"<<std::endl;
|
|
||||||
std::cout << " M "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (inner product matrix)"<<std::endl;
|
|
||||||
std::cout << "----------------------------------------------------------"<<std::endl;
|
|
||||||
{
|
|
||||||
int M=12;
|
|
||||||
int N=12;
|
|
||||||
std::vector<int> ks({4*1024*1024, 2*1024*1024, 1024*1024, 256*1024, 1024 });
|
|
||||||
for( int kk=0;kk<ks.size();kk++ ) {
|
|
||||||
int K = ks[kk];
|
|
||||||
double p=blas.benchmark<CComplex>(M,N,K);
|
|
||||||
fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, 1, p);
|
|
||||||
std::cout<< M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<1<<"\t\t"<<p<<std::endl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
std::cout << "=================================================================================="<<std::endl;
|
std::cout << "=================================================================================="<<std::endl;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -1,2 +1,2 @@
|
|||||||
|
|
||||||
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL
|
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench
|
@ -1,5 +0,0 @@
|
|||||||
CXX=hipcc
|
|
||||||
MPICXX=mpicxx
|
|
||||||
CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 -I/opt/cray/pe/mpich/8.1.28/ofi/gnu/12.3/include -DGRID_HIP"
|
|
||||||
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas -lmpi_gnu_123"
|
|
||||||
hipcc $CXXFLAGS $LDFLAGS BatchBlasBench.cc -o BatchBlasBench
|
|
@ -1,2 +0,0 @@
|
|||||||
|
|
||||||
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL
|
|
@ -50,7 +50,6 @@ NAMESPACE_CHECK(approx);
|
|||||||
#include <Grid/algorithms/deflation/Deflation.h>
|
#include <Grid/algorithms/deflation/Deflation.h>
|
||||||
#include <Grid/algorithms/deflation/MultiRHSBlockProject.h>
|
#include <Grid/algorithms/deflation/MultiRHSBlockProject.h>
|
||||||
#include <Grid/algorithms/deflation/MultiRHSDeflation.h>
|
#include <Grid/algorithms/deflation/MultiRHSDeflation.h>
|
||||||
#include <Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h>
|
|
||||||
NAMESPACE_CHECK(deflation);
|
NAMESPACE_CHECK(deflation);
|
||||||
#include <Grid/algorithms/iterative/ConjugateGradient.h>
|
#include <Grid/algorithms/iterative/ConjugateGradient.h>
|
||||||
NAMESPACE_CHECK(ConjGrad);
|
NAMESPACE_CHECK(ConjGrad);
|
||||||
|
@ -168,7 +168,6 @@ public:
|
|||||||
template<class vobj>
|
template<class vobj>
|
||||||
void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
|
void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
|
||||||
#ifndef HAVE_FFTW
|
#ifndef HAVE_FFTW
|
||||||
std::cerr << "FFTW is not compiled but is called"<<std::endl;
|
|
||||||
assert(0);
|
assert(0);
|
||||||
#else
|
#else
|
||||||
conformable(result.Grid(),vgrid);
|
conformable(result.Grid(),vgrid);
|
||||||
@ -191,7 +190,6 @@ public:
|
|||||||
|
|
||||||
Lattice<sobj> pgbuf(&pencil_g);
|
Lattice<sobj> pgbuf(&pencil_g);
|
||||||
autoView(pgbuf_v , pgbuf, CpuWrite);
|
autoView(pgbuf_v , pgbuf, CpuWrite);
|
||||||
std::cout << "CPU view" << std::endl;
|
|
||||||
|
|
||||||
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
|
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
|
||||||
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
|
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
|
||||||
@ -215,7 +213,6 @@ public:
|
|||||||
else if ( sign == forward ) div = 1.0;
|
else if ( sign == forward ) div = 1.0;
|
||||||
else assert(0);
|
else assert(0);
|
||||||
|
|
||||||
std::cout << "Making FFTW plan" << std::endl;
|
|
||||||
FFTW_plan p;
|
FFTW_plan p;
|
||||||
{
|
{
|
||||||
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
|
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
|
||||||
@ -229,7 +226,6 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Barrel shift and collect global pencil
|
// Barrel shift and collect global pencil
|
||||||
std::cout << "Making pencil" << std::endl;
|
|
||||||
Coordinate lcoor(Nd), gcoor(Nd);
|
Coordinate lcoor(Nd), gcoor(Nd);
|
||||||
result = source;
|
result = source;
|
||||||
int pc = processor_coor[dim];
|
int pc = processor_coor[dim];
|
||||||
@ -251,7 +247,6 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::cout << "Looping orthog" << std::endl;
|
|
||||||
// Loop over orthog coords
|
// Loop over orthog coords
|
||||||
int NN=pencil_g.lSites();
|
int NN=pencil_g.lSites();
|
||||||
GridStopWatch timer;
|
GridStopWatch timer;
|
||||||
@ -274,7 +269,6 @@ public:
|
|||||||
usec += timer.useconds();
|
usec += timer.useconds();
|
||||||
flops+= flops_call*NN;
|
flops+= flops_call*NN;
|
||||||
|
|
||||||
std::cout << "Writing back results " << std::endl;
|
|
||||||
// writing out result
|
// writing out result
|
||||||
{
|
{
|
||||||
autoView(pgbuf_v,pgbuf,CpuRead);
|
autoView(pgbuf_v,pgbuf,CpuRead);
|
||||||
@ -291,7 +285,6 @@ public:
|
|||||||
}
|
}
|
||||||
result = result*div;
|
result = result*div;
|
||||||
|
|
||||||
std::cout << "Destroying plan " << std::endl;
|
|
||||||
// destroying plan
|
// destroying plan
|
||||||
FFTW<scalar>::fftw_destroy_plan(p);
|
FFTW<scalar>::fftw_destroy_plan(p);
|
||||||
#endif
|
#endif
|
||||||
|
@ -103,38 +103,6 @@ public:
|
|||||||
_Mat.MdagM(in,out);
|
_Mat.MdagM(in,out);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
template<class Matrix,class Field>
|
|
||||||
class MMdagLinearOperator : public LinearOperatorBase<Field> {
|
|
||||||
Matrix &_Mat;
|
|
||||||
public:
|
|
||||||
MMdagLinearOperator(Matrix &Mat): _Mat(Mat){};
|
|
||||||
|
|
||||||
// Support for coarsening to a multigrid
|
|
||||||
void OpDiag (const Field &in, Field &out) {
|
|
||||||
_Mat.Mdiag(in,out);
|
|
||||||
}
|
|
||||||
void OpDir (const Field &in, Field &out,int dir,int disp) {
|
|
||||||
_Mat.Mdir(in,out,dir,disp);
|
|
||||||
}
|
|
||||||
void OpDirAll (const Field &in, std::vector<Field> &out){
|
|
||||||
_Mat.MdirAll(in,out);
|
|
||||||
};
|
|
||||||
void Op (const Field &in, Field &out){
|
|
||||||
_Mat.M(in,out);
|
|
||||||
}
|
|
||||||
void AdjOp (const Field &in, Field &out){
|
|
||||||
_Mat.Mdag(in,out);
|
|
||||||
}
|
|
||||||
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
|
|
||||||
_Mat.MMdag(in,out);
|
|
||||||
ComplexD dot = innerProduct(in,out);
|
|
||||||
n1=real(dot);
|
|
||||||
n2=norm2(out);
|
|
||||||
}
|
|
||||||
void HermOp(const Field &in, Field &out){
|
|
||||||
_Mat.MMdag(in,out);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////
|
||||||
// Construct herm op and shift it for mgrid smoother
|
// Construct herm op and shift it for mgrid smoother
|
||||||
|
@ -45,11 +45,6 @@ public:
|
|||||||
M(in,tmp);
|
M(in,tmp);
|
||||||
Mdag(tmp,out);
|
Mdag(tmp,out);
|
||||||
}
|
}
|
||||||
virtual void MMdag(const Field &in, Field &out) {
|
|
||||||
Field tmp (in.Grid());
|
|
||||||
Mdag(in,tmp);
|
|
||||||
M(tmp,out);
|
|
||||||
}
|
|
||||||
virtual void Mdiag (const Field &in, Field &out)=0;
|
virtual void Mdiag (const Field &in, Field &out)=0;
|
||||||
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
|
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
|
||||||
virtual void MdirAll (const Field &in, std::vector<Field> &out)=0;
|
virtual void MdirAll (const Field &in, std::vector<Field> &out)=0;
|
||||||
|
@ -59,7 +59,7 @@ public:
|
|||||||
RealD diff = hi-lo;
|
RealD diff = hi-lo;
|
||||||
RealD delta = diff*1.0e-9;
|
RealD delta = diff*1.0e-9;
|
||||||
for (RealD x=lo; x<hi; x+=delta) {
|
for (RealD x=lo; x<hi; x+=delta) {
|
||||||
delta*=1.02;
|
delta*=1.1;
|
||||||
RealD f = approx(x);
|
RealD f = approx(x);
|
||||||
out<< x<<" "<<f<<std::endl;
|
out<< x<<" "<<f<<std::endl;
|
||||||
}
|
}
|
||||||
@ -131,26 +131,6 @@ public:
|
|||||||
Coeffs[j] = s * 2.0/order;
|
Coeffs[j] = s * 2.0/order;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
template<class functor>
|
|
||||||
void Init(RealD _lo,RealD _hi,int _order, functor & func)
|
|
||||||
{
|
|
||||||
lo=_lo;
|
|
||||||
hi=_hi;
|
|
||||||
order=_order;
|
|
||||||
|
|
||||||
if(order < 2) exit(-1);
|
|
||||||
Coeffs.resize(order);
|
|
||||||
for(int j=0;j<order;j++){
|
|
||||||
RealD s=0;
|
|
||||||
for(int k=0;k<order;k++){
|
|
||||||
RealD y=std::cos(M_PI*(k+0.5)/order);
|
|
||||||
RealD x=0.5*(y*(hi-lo)+(hi+lo));
|
|
||||||
RealD f=func(x);
|
|
||||||
s=s+f*std::cos( j*M_PI*(k+0.5)/order );
|
|
||||||
}
|
|
||||||
Coeffs[j] = s * 2.0/order;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
void JacksonSmooth(void){
|
void JacksonSmooth(void){
|
||||||
|
@ -1,376 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: MultiRHSBlockCGLinalg.h
|
|
||||||
|
|
||||||
Copyright (C) 2024
|
|
||||||
|
|
||||||
Author: Peter Boyle <pboyle@bnl.gov>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#pragma once
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
|
||||||
|
|
||||||
|
|
||||||
/* Need helper object for BLAS accelerated mrhs blockCG */
|
|
||||||
template<class Field>
|
|
||||||
class MultiRHSBlockCGLinalg
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
|
|
||||||
typedef typename Field::scalar_type scalar;
|
|
||||||
typedef typename Field::scalar_object scalar_object;
|
|
||||||
typedef typename Field::vector_object vector_object;
|
|
||||||
|
|
||||||
deviceVector<scalar> BLAS_X; // nrhs x vol -- the sources
|
|
||||||
deviceVector<scalar> BLAS_Y; // nrhs x vol -- the result
|
|
||||||
deviceVector<scalar> BLAS_C; // nrhs x nrhs -- the coefficients
|
|
||||||
deviceVector<scalar> BLAS_Cred; // nrhs x nrhs x oSites -- reduction buffer
|
|
||||||
deviceVector<scalar *> Xdip;
|
|
||||||
deviceVector<scalar *> Ydip;
|
|
||||||
deviceVector<scalar *> Cdip;
|
|
||||||
|
|
||||||
MultiRHSBlockCGLinalg() {};
|
|
||||||
~MultiRHSBlockCGLinalg(){ Deallocate(); };
|
|
||||||
|
|
||||||
void Deallocate(void)
|
|
||||||
{
|
|
||||||
Xdip.resize(0);
|
|
||||||
Ydip.resize(0);
|
|
||||||
Cdip.resize(0);
|
|
||||||
BLAS_Cred.resize(0);
|
|
||||||
BLAS_C.resize(0);
|
|
||||||
BLAS_X.resize(0);
|
|
||||||
BLAS_Y.resize(0);
|
|
||||||
}
|
|
||||||
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0)
|
|
||||||
{
|
|
||||||
std::vector<Field> Y_copy(AP.size(),AP[0].Grid());
|
|
||||||
for(int r=0;r<AP.size();r++){
|
|
||||||
Y_copy[r] = Y[r];
|
|
||||||
}
|
|
||||||
MulMatrix(AP,m,X);
|
|
||||||
for(int r=0;r<AP.size();r++){
|
|
||||||
AP[r] = scale*AP[r]+Y_copy[r];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
void MulMatrix(std::vector<Field> &Y, Eigen::MatrixXcd &m , const std::vector<Field> &X)
|
|
||||||
{
|
|
||||||
typedef typename Field::scalar_type scomplex;
|
|
||||||
GridBase *grid;
|
|
||||||
uint64_t vol;
|
|
||||||
uint64_t words;
|
|
||||||
|
|
||||||
int nrhs = Y.size();
|
|
||||||
grid = X[0].Grid();
|
|
||||||
vol = grid->lSites();
|
|
||||||
words = sizeof(scalar_object)/sizeof(scalar);
|
|
||||||
int64_t vw = vol * words;
|
|
||||||
|
|
||||||
RealD t0 = usecond();
|
|
||||||
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
|
|
||||||
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
|
|
||||||
BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
|
|
||||||
RealD t1 = usecond();
|
|
||||||
|
|
||||||
/////////////////////////////////////////////
|
|
||||||
// Copy in the multi-rhs sources
|
|
||||||
/////////////////////////////////////////////
|
|
||||||
for(int r=0;r<nrhs;r++){
|
|
||||||
int64_t offset = r*vw;
|
|
||||||
autoView(x_v,X[r],AcceleratorRead);
|
|
||||||
acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Assumes Eigen storage contiguous
|
|
||||||
acceleratorCopyToDevice(&m(0,0),&BLAS_C[0],BLAS_C.size()*sizeof(scalar));
|
|
||||||
|
|
||||||
/*
|
|
||||||
* in Fortran column major notation (cuBlas order)
|
|
||||||
*
|
|
||||||
* Xxr = [X1(x)][..][Xn(x)]
|
|
||||||
* Yxr = [Y1(x)][..][Ym(x)]
|
|
||||||
* Y = X . C
|
|
||||||
*/
|
|
||||||
deviceVector<scalar *> Xd(1);
|
|
||||||
deviceVector<scalar *> Yd(1);
|
|
||||||
deviceVector<scalar *> Cd(1);
|
|
||||||
|
|
||||||
scalar * Xh = & BLAS_X[0];
|
|
||||||
scalar * Yh = & BLAS_Y[0];
|
|
||||||
scalar * Ch = & BLAS_C[0];
|
|
||||||
|
|
||||||
acceleratorPut(Xd[0],Xh);
|
|
||||||
acceleratorPut(Yd[0],Yh);
|
|
||||||
acceleratorPut(Cd[0],Ch);
|
|
||||||
|
|
||||||
RealD t2 = usecond();
|
|
||||||
GridBLAS BLAS;
|
|
||||||
/////////////////////////////////////////
|
|
||||||
// Y = X*C (transpose?)
|
|
||||||
/////////////////////////////////////////
|
|
||||||
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
|
|
||||||
vw,nrhs,nrhs,
|
|
||||||
scalar(1.0),
|
|
||||||
Xd,
|
|
||||||
Cd,
|
|
||||||
scalar(0.0), // wipe out Y
|
|
||||||
Yd);
|
|
||||||
BLAS.synchronise();
|
|
||||||
RealD t3 = usecond();
|
|
||||||
|
|
||||||
// Copy back Y = m X
|
|
||||||
for(int r=0;r<nrhs;r++){
|
|
||||||
int64_t offset = r*vw;
|
|
||||||
autoView(y_v,Y[r],AcceleratorWrite);
|
|
||||||
acceleratorCopyDeviceToDevice(&BLAS_Y[offset],&y_v[0],sizeof(scalar_object)*vol);
|
|
||||||
}
|
|
||||||
RealD t4 = usecond();
|
|
||||||
std::cout << "MulMatrix alloc took "<< t1-t0<<" us"<<std::endl;
|
|
||||||
std::cout << "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
|
|
||||||
std::cout << "MulMatrix blas took "<< t3-t2<<" us"<<std::endl;
|
|
||||||
std::cout << "MulMatrix copy took "<< t4-t3<<" us"<<std::endl;
|
|
||||||
std::cout << "MulMatrix total "<< t4-t0<<" us"<<std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y)
|
|
||||||
{
|
|
||||||
#if 0
|
|
||||||
int nrhs;
|
|
||||||
GridBase *grid;
|
|
||||||
uint64_t vol;
|
|
||||||
uint64_t words;
|
|
||||||
|
|
||||||
nrhs = X.size();
|
|
||||||
assert(X.size()==Y.size());
|
|
||||||
conformable(X[0],Y[0]);
|
|
||||||
|
|
||||||
grid = X[0].Grid();
|
|
||||||
vol = grid->lSites();
|
|
||||||
words = sizeof(scalar_object)/sizeof(scalar);
|
|
||||||
int64_t vw = vol * words;
|
|
||||||
|
|
||||||
RealD t0 = usecond();
|
|
||||||
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
|
|
||||||
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
|
|
||||||
BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
|
|
||||||
RealD t1 = usecond();
|
|
||||||
|
|
||||||
/////////////////////////////////////////////
|
|
||||||
// Copy in the multi-rhs sources
|
|
||||||
/////////////////////////////////////////////
|
|
||||||
for(int r=0;r<nrhs;r++){
|
|
||||||
int64_t offset = r*vw;
|
|
||||||
autoView(x_v,X[r],AcceleratorRead);
|
|
||||||
acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
|
|
||||||
autoView(y_v,Y[r],AcceleratorRead);
|
|
||||||
acceleratorCopyDeviceToDevice(&y_v[0],&BLAS_Y[offset],sizeof(scalar_object)*vol);
|
|
||||||
}
|
|
||||||
RealD t2 = usecond();
|
|
||||||
|
|
||||||
/*
|
|
||||||
* in Fortran column major notation (cuBlas order)
|
|
||||||
*
|
|
||||||
* Xxr = [X1(x)][..][Xn(x)]
|
|
||||||
*
|
|
||||||
* Yxr = [Y1(x)][..][Ym(x)]
|
|
||||||
*
|
|
||||||
* C_rs = X^dag Y
|
|
||||||
*/
|
|
||||||
deviceVector<scalar *> Xd(1);
|
|
||||||
deviceVector<scalar *> Yd(1);
|
|
||||||
deviceVector<scalar *> Cd(1);
|
|
||||||
|
|
||||||
scalar * Xh = & BLAS_X[0];
|
|
||||||
scalar * Yh = & BLAS_Y[0];
|
|
||||||
scalar * Ch = & BLAS_C[0];
|
|
||||||
|
|
||||||
acceleratorPut(Xd[0],Xh);
|
|
||||||
acceleratorPut(Yd[0],Yh);
|
|
||||||
acceleratorPut(Cd[0],Ch);
|
|
||||||
|
|
||||||
GridBLAS BLAS;
|
|
||||||
|
|
||||||
RealD t3 = usecond();
|
|
||||||
/////////////////////////////////////////
|
|
||||||
// C_rs = X^dag Y
|
|
||||||
/////////////////////////////////////////
|
|
||||||
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
|
|
||||||
nrhs,nrhs,vw,
|
|
||||||
ComplexD(1.0),
|
|
||||||
Xd,
|
|
||||||
Yd,
|
|
||||||
ComplexD(0.0), // wipe out C
|
|
||||||
Cd);
|
|
||||||
BLAS.synchronise();
|
|
||||||
RealD t4 = usecond();
|
|
||||||
|
|
||||||
std::vector<scalar> HOST_C(BLAS_C.size()); // nrhs . nrhs -- the coefficients
|
|
||||||
acceleratorCopyFromDevice(&BLAS_C[0],&HOST_C[0],BLAS_C.size()*sizeof(scalar));
|
|
||||||
grid->GlobalSumVector(&HOST_C[0],nrhs*nrhs);
|
|
||||||
|
|
||||||
RealD t5 = usecond();
|
|
||||||
for(int rr=0;rr<nrhs;rr++){
|
|
||||||
for(int r=0;r<nrhs;r++){
|
|
||||||
int off = r+nrhs*rr;
|
|
||||||
m(r,rr)=HOST_C[off];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
RealD t6 = usecond();
|
|
||||||
uint64_t M=nrhs;
|
|
||||||
uint64_t N=nrhs;
|
|
||||||
uint64_t K=vw;
|
|
||||||
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
|
|
||||||
RealD flops = 8.0*M*N*K;
|
|
||||||
flops = flops/(t4-t3)/1.e3;
|
|
||||||
bytes = bytes/(t4-t3)/1.e3;
|
|
||||||
std::cout << "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
|
|
||||||
std::cout << "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
|
|
||||||
std::cout << "InnerProductMatrix cp t2 "<< t2-t1<<" us"<<std::endl;
|
|
||||||
std::cout << "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
|
|
||||||
std::cout << "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
|
|
||||||
std::cout << "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
|
|
||||||
std::cout << "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
|
|
||||||
std::cout << "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
|
|
||||||
std::cout << "InnerProductMatrix cp t6 "<< t6-t5<<" us"<<std::endl;
|
|
||||||
std::cout << "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
|
|
||||||
#else
|
|
||||||
int nrhs;
|
|
||||||
GridBase *grid;
|
|
||||||
uint64_t vol;
|
|
||||||
uint64_t words;
|
|
||||||
|
|
||||||
nrhs = X.size();
|
|
||||||
assert(X.size()==Y.size());
|
|
||||||
conformable(X[0],Y[0]);
|
|
||||||
|
|
||||||
grid = X[0].Grid();
|
|
||||||
int rd0 = grid->_rdimensions[0] * grid->_rdimensions[1];
|
|
||||||
vol = grid->oSites()/rd0;
|
|
||||||
words = rd0*sizeof(vector_object)/sizeof(scalar);
|
|
||||||
int64_t vw = vol * words;
|
|
||||||
assert(vw == grid->lSites()*sizeof(scalar_object)/sizeof(scalar));
|
|
||||||
|
|
||||||
RealD t0 = usecond();
|
|
||||||
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
|
|
||||||
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
|
|
||||||
BLAS_Cred.resize(nrhs * nrhs * vol);// cost free if size doesn't change
|
|
||||||
RealD t1 = usecond();
|
|
||||||
|
|
||||||
/////////////////////////////////////////////
|
|
||||||
// Copy in the multi-rhs sources -- layout batched BLAS ready
|
|
||||||
/////////////////////////////////////////////
|
|
||||||
for(int r=0;r<nrhs;r++){
|
|
||||||
autoView(x_v,X[r],AcceleratorRead);
|
|
||||||
autoView(y_v,Y[r],AcceleratorRead);
|
|
||||||
scalar *from_x=(scalar *)&x_v[0];
|
|
||||||
scalar *from_y=(scalar *)&y_v[0];
|
|
||||||
scalar *BX = &BLAS_X[0];
|
|
||||||
scalar *BY = &BLAS_Y[0];
|
|
||||||
accelerator_for(ssw,vw,1,{
|
|
||||||
uint64_t ss=ssw/words;
|
|
||||||
uint64_t w=ssw%words;
|
|
||||||
uint64_t offset = w+r*words+ss*nrhs*words; // [ss][rhs][words]
|
|
||||||
BX[offset] = from_x[ssw];
|
|
||||||
BY[offset] = from_y[ssw];
|
|
||||||
});
|
|
||||||
}
|
|
||||||
RealD t2 = usecond();
|
|
||||||
|
|
||||||
/*
|
|
||||||
* in Fortran column major notation (cuBlas order)
|
|
||||||
*
|
|
||||||
* Xxr = [X1(x)][..][Xn(x)]
|
|
||||||
*
|
|
||||||
* Yxr = [Y1(x)][..][Ym(x)]
|
|
||||||
*
|
|
||||||
* C_rs = X^dag Y
|
|
||||||
*/
|
|
||||||
Xdip.resize(vol);
|
|
||||||
Ydip.resize(vol);
|
|
||||||
Cdip.resize(vol);
|
|
||||||
std::vector<scalar *> Xh(vol);
|
|
||||||
std::vector<scalar *> Yh(vol);
|
|
||||||
std::vector<scalar *> Ch(vol);
|
|
||||||
for(uint64_t ss=0;ss<vol;ss++){
|
|
||||||
|
|
||||||
Xh[ss] = & BLAS_X[ss*nrhs*words];
|
|
||||||
Yh[ss] = & BLAS_Y[ss*nrhs*words];
|
|
||||||
Ch[ss] = & BLAS_Cred[ss*nrhs*nrhs];
|
|
||||||
|
|
||||||
}
|
|
||||||
acceleratorCopyToDevice(&Xh[0],&Xdip[0],vol*sizeof(scalar *));
|
|
||||||
acceleratorCopyToDevice(&Yh[0],&Ydip[0],vol*sizeof(scalar *));
|
|
||||||
acceleratorCopyToDevice(&Ch[0],&Cdip[0],vol*sizeof(scalar *));
|
|
||||||
|
|
||||||
GridBLAS BLAS;
|
|
||||||
|
|
||||||
RealD t3 = usecond();
|
|
||||||
/////////////////////////////////////////
|
|
||||||
// C_rs = X^dag Y
|
|
||||||
/////////////////////////////////////////
|
|
||||||
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
|
|
||||||
nrhs,nrhs,words,
|
|
||||||
ComplexD(1.0),
|
|
||||||
Xdip,
|
|
||||||
Ydip,
|
|
||||||
ComplexD(0.0), // wipe out C
|
|
||||||
Cdip);
|
|
||||||
BLAS.synchronise();
|
|
||||||
RealD t4 = usecond();
|
|
||||||
|
|
||||||
std::vector<scalar> HOST_C(BLAS_Cred.size()); // nrhs . nrhs -- the coefficients
|
|
||||||
acceleratorCopyFromDevice(&BLAS_Cred[0],&HOST_C[0],BLAS_Cred.size()*sizeof(scalar));
|
|
||||||
|
|
||||||
RealD t5 = usecond();
|
|
||||||
m = Eigen::MatrixXcd::Zero(nrhs,nrhs);
|
|
||||||
for(int ss=0;ss<vol;ss++){
|
|
||||||
Eigen::Map<Eigen::MatrixXcd> eC((std::complex<double> *)&HOST_C[ss*nrhs*nrhs],nrhs,nrhs);
|
|
||||||
m = m + eC;
|
|
||||||
}
|
|
||||||
RealD t6l = usecond();
|
|
||||||
grid->GlobalSumVector((scalar *) &m(0,0),nrhs*nrhs);
|
|
||||||
RealD t6 = usecond();
|
|
||||||
uint64_t M=nrhs;
|
|
||||||
uint64_t N=nrhs;
|
|
||||||
uint64_t K=vw;
|
|
||||||
RealD xybytes = grid->lSites()*sizeof(scalar_object);
|
|
||||||
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
|
|
||||||
RealD flops = 8.0*M*N*K;
|
|
||||||
flops = flops/(t4-t3)/1.e3;
|
|
||||||
bytes = bytes/(t4-t3)/1.e3;
|
|
||||||
xybytes = 4*xybytes/(t2-t1)/1.e3;
|
|
||||||
std::cout << "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
|
|
||||||
std::cout << "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
|
|
||||||
std::cout << "InnerProductMatrix cp t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
|
|
||||||
std::cout << "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
|
|
||||||
std::cout << "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
|
|
||||||
std::cout << "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
|
|
||||||
std::cout << "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
|
|
||||||
std::cout << "InnerProductMatrix cp t5 "<< t5-t4<<" us"<<std::endl;
|
|
||||||
std::cout << "InnerProductMatrix lsum t6l "<< t6l-t5<<" us"<<std::endl;
|
|
||||||
std::cout << "InnerProductMatrix gsum t6 "<< t6-t6l<<" us"<<std::endl;
|
|
||||||
std::cout << "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
|
@ -447,10 +447,10 @@ public:
|
|||||||
/////////////////////////////////////////
|
/////////////////////////////////////////
|
||||||
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
|
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
|
||||||
nbasis,nrhs,vw,
|
nbasis,nrhs,vw,
|
||||||
scalar(1.0),
|
ComplexD(1.0),
|
||||||
Vd,
|
Vd,
|
||||||
Fd,
|
Fd,
|
||||||
scalar(0.0), // wipe out C
|
ComplexD(0.0), // wipe out C
|
||||||
Cd);
|
Cd);
|
||||||
BLAS.synchronise();
|
BLAS.synchronise();
|
||||||
// std::cout << "BlockProject done"<<std::endl;
|
// std::cout << "BlockProject done"<<std::endl;
|
||||||
@ -497,10 +497,10 @@ public:
|
|||||||
int64_t vw = block_vol * words;
|
int64_t vw = block_vol * words;
|
||||||
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
|
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
|
||||||
vw,nrhs,nbasis,
|
vw,nrhs,nbasis,
|
||||||
scalar(1.0),
|
ComplexD(1.0),
|
||||||
Vd,
|
Vd,
|
||||||
Cd,
|
Cd,
|
||||||
scalar(0.0), // wipe out C
|
ComplexD(0.0), // wipe out C
|
||||||
Fd);
|
Fd);
|
||||||
BLAS.synchronise();
|
BLAS.synchronise();
|
||||||
// std::cout << " blas call done"<<std::endl;
|
// std::cout << " blas call done"<<std::endl;
|
||||||
|
@ -182,10 +182,10 @@ public:
|
|||||||
/////////////////////////////////////////
|
/////////////////////////////////////////
|
||||||
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
|
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
|
||||||
nev,nrhs,vw,
|
nev,nrhs,vw,
|
||||||
scalar(1.0),
|
ComplexD(1.0),
|
||||||
Ed,
|
Ed,
|
||||||
Rd,
|
Rd,
|
||||||
scalar(0.0), // wipe out C
|
ComplexD(0.0), // wipe out C
|
||||||
Cd);
|
Cd);
|
||||||
BLAS.synchronise();
|
BLAS.synchronise();
|
||||||
|
|
||||||
@ -210,10 +210,10 @@ public:
|
|||||||
/////////////////////////////////////////
|
/////////////////////////////////////////
|
||||||
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
|
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
|
||||||
vw,nrhs,nev,
|
vw,nrhs,nev,
|
||||||
scalar(1.0),
|
ComplexD(1.0),
|
||||||
Ed, // x . nev
|
Ed, // x . nev
|
||||||
Cd, // nev . nrhs
|
Cd, // nev . nrhs
|
||||||
scalar(0.0),
|
ComplexD(0.0),
|
||||||
Gd);
|
Gd);
|
||||||
BLAS.synchronise();
|
BLAS.synchronise();
|
||||||
|
|
||||||
|
@ -53,7 +53,6 @@ class TwoLevelCGmrhs
|
|||||||
// Fine operator, Smoother, CoarseSolver
|
// Fine operator, Smoother, CoarseSolver
|
||||||
LinearOperatorBase<Field> &_FineLinop;
|
LinearOperatorBase<Field> &_FineLinop;
|
||||||
LinearFunction<Field> &_Smoother;
|
LinearFunction<Field> &_Smoother;
|
||||||
MultiRHSBlockCGLinalg<Field> _BlockCGLinalg;
|
|
||||||
|
|
||||||
GridStopWatch ProjectTimer;
|
GridStopWatch ProjectTimer;
|
||||||
GridStopWatch PromoteTimer;
|
GridStopWatch PromoteTimer;
|
||||||
@ -80,301 +79,6 @@ class TwoLevelCGmrhs
|
|||||||
|
|
||||||
// Vector case
|
// Vector case
|
||||||
virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
|
virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
|
||||||
{
|
|
||||||
SolveSingleSystem(src,x);
|
|
||||||
// SolvePrecBlockCG(src,x);
|
|
||||||
}
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Thin QR factorisation (google it)
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
//Dimensions
|
|
||||||
// R_{ferm x Nblock} = Q_{ferm x Nblock} x C_{Nblock x Nblock} -> ferm x Nblock
|
|
||||||
//
|
|
||||||
// Rdag R = m_rr = Herm = L L^dag <-- Cholesky decomposition (LLT routine in Eigen)
|
|
||||||
//
|
|
||||||
// Q C = R => Q = R C^{-1}
|
|
||||||
//
|
|
||||||
// Want Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock}
|
|
||||||
//
|
|
||||||
// Set C = L^{dag}, and then Q^dag Q = ident
|
|
||||||
//
|
|
||||||
// Checks:
|
|
||||||
// Cdag C = Rdag R ; passes.
|
|
||||||
// QdagQ = 1 ; passes
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
void ThinQRfact (Eigen::MatrixXcd &m_zz,
|
|
||||||
Eigen::MatrixXcd &C,
|
|
||||||
Eigen::MatrixXcd &Cinv,
|
|
||||||
std::vector<Field> & Q,
|
|
||||||
std::vector<Field> & MQ,
|
|
||||||
const std::vector<Field> & Z,
|
|
||||||
const std::vector<Field> & MZ)
|
|
||||||
{
|
|
||||||
RealD t0=usecond();
|
|
||||||
_BlockCGLinalg.InnerProductMatrix(m_zz,MZ,Z);
|
|
||||||
RealD t1=usecond();
|
|
||||||
|
|
||||||
m_zz = 0.5*(m_zz+m_zz.adjoint());
|
|
||||||
|
|
||||||
Eigen::MatrixXcd L = m_zz.llt().matrixL();
|
|
||||||
|
|
||||||
C = L.adjoint();
|
|
||||||
Cinv = C.inverse();
|
|
||||||
|
|
||||||
RealD t3=usecond();
|
|
||||||
_BlockCGLinalg.MulMatrix( Q,Cinv,Z);
|
|
||||||
_BlockCGLinalg.MulMatrix(MQ,Cinv,MZ);
|
|
||||||
RealD t4=usecond();
|
|
||||||
std::cout << " ThinQRfact IP :"<< t1-t0<<" us"<<std::endl;
|
|
||||||
std::cout << " ThinQRfact Eigen :"<< t3-t1<<" us"<<std::endl;
|
|
||||||
std::cout << " ThinQRfact MulMat:"<< t4-t3<<" us"<<std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
virtual void SolvePrecBlockCG (std::vector<Field> &src, std::vector<Field> &X)
|
|
||||||
{
|
|
||||||
std::cout << GridLogMessage<<"HDCG: mrhs fPrecBlockcg starting"<<std::endl;
|
|
||||||
src[0].Grid()->Barrier();
|
|
||||||
int nrhs = src.size();
|
|
||||||
// std::vector<RealD> f(nrhs);
|
|
||||||
// std::vector<RealD> rtzp(nrhs);
|
|
||||||
// std::vector<RealD> rtz(nrhs);
|
|
||||||
// std::vector<RealD> a(nrhs);
|
|
||||||
// std::vector<RealD> d(nrhs);
|
|
||||||
// std::vector<RealD> b(nrhs);
|
|
||||||
// std::vector<RealD> rptzp(nrhs);
|
|
||||||
|
|
||||||
////////////////////////////////////////////
|
|
||||||
//Initial residual computation & set up
|
|
||||||
////////////////////////////////////////////
|
|
||||||
std::vector<RealD> ssq(nrhs);
|
|
||||||
for(int rhs=0;rhs<nrhs;rhs++){
|
|
||||||
ssq[rhs]=norm2(src[rhs]); assert(ssq[rhs]!=0.0);
|
|
||||||
}
|
|
||||||
|
|
||||||
///////////////////////////
|
|
||||||
// Fields -- eliminate duplicates between fPcg and block cg
|
|
||||||
///////////////////////////
|
|
||||||
std::vector<Field> Mtmp(nrhs,grid);
|
|
||||||
std::vector<Field> tmp(nrhs,grid);
|
|
||||||
std::vector<Field> Z(nrhs,grid); // Rename Z to R
|
|
||||||
std::vector<Field> MZ(nrhs,grid); // Rename MZ to Z
|
|
||||||
std::vector<Field> Q(nrhs,grid); //
|
|
||||||
std::vector<Field> MQ(nrhs,grid); // Rename to P
|
|
||||||
std::vector<Field> D(nrhs,grid);
|
|
||||||
std::vector<Field> AD(nrhs,grid);
|
|
||||||
|
|
||||||
/************************************************************************
|
|
||||||
* Preconditioned Block conjugate gradient rQ
|
|
||||||
* Generalise Sebastien Birk Thesis, after Dubrulle 2001.
|
|
||||||
* Introduce preconditioning following Saad Ch9
|
|
||||||
************************************************************************
|
|
||||||
* Dimensions:
|
|
||||||
*
|
|
||||||
* X,B etc... ==(Nferm x nrhs)
|
|
||||||
* Matrix A==(Nferm x Nferm)
|
|
||||||
*
|
|
||||||
* Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
|
|
||||||
* QC => Thin QR factorisation (google it)
|
|
||||||
*
|
|
||||||
* R = B-AX
|
|
||||||
* Z = Mi R
|
|
||||||
* QC = Z
|
|
||||||
* D = Q
|
|
||||||
* for k:
|
|
||||||
* R = AD
|
|
||||||
* Z = Mi R
|
|
||||||
* M = [D^dag R]^{-1}
|
|
||||||
* X = X + D M C
|
|
||||||
* QS = Q - Z.M
|
|
||||||
* D = Q + D S^dag
|
|
||||||
* C = S C
|
|
||||||
*/
|
|
||||||
Eigen::MatrixXcd m_DZ = Eigen::MatrixXcd::Identity(nrhs,nrhs);
|
|
||||||
Eigen::MatrixXcd m_M = Eigen::MatrixXcd::Identity(nrhs,nrhs);
|
|
||||||
Eigen::MatrixXcd m_zz = Eigen::MatrixXcd::Zero(nrhs,nrhs);
|
|
||||||
Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(nrhs,nrhs);
|
|
||||||
|
|
||||||
Eigen::MatrixXcd m_C = Eigen::MatrixXcd::Zero(nrhs,nrhs);
|
|
||||||
Eigen::MatrixXcd m_Cinv = Eigen::MatrixXcd::Zero(nrhs,nrhs);
|
|
||||||
Eigen::MatrixXcd m_S = Eigen::MatrixXcd::Zero(nrhs,nrhs);
|
|
||||||
Eigen::MatrixXcd m_Sinv = Eigen::MatrixXcd::Zero(nrhs,nrhs);
|
|
||||||
|
|
||||||
Eigen::MatrixXcd m_tmp = Eigen::MatrixXcd::Identity(nrhs,nrhs);
|
|
||||||
Eigen::MatrixXcd m_tmp1 = Eigen::MatrixXcd::Identity(nrhs,nrhs);
|
|
||||||
|
|
||||||
GridStopWatch HDCGTimer;
|
|
||||||
|
|
||||||
//////////////////////////
|
|
||||||
// x0 = Vstart -- possibly modify guess
|
|
||||||
//////////////////////////
|
|
||||||
Vstart(X,src);
|
|
||||||
|
|
||||||
//////////////////////////
|
|
||||||
// R = B-AX
|
|
||||||
//////////////////////////
|
|
||||||
for(int rhs=0;rhs<nrhs;rhs++){
|
|
||||||
// r0 = b -A x0
|
|
||||||
_FineLinop.HermOp(X[rhs],tmp[rhs]);
|
|
||||||
axpy (Z[rhs], -1.0,tmp[rhs], src[rhs]); // Computes R=Z=src - A X0
|
|
||||||
}
|
|
||||||
|
|
||||||
//////////////////////////////////
|
|
||||||
// Compute MZ = M1 Z = M1 B - M1 A x0
|
|
||||||
//////////////////////////////////
|
|
||||||
PcgM1(Z,MZ);
|
|
||||||
|
|
||||||
//////////////////////////////////
|
|
||||||
// QC = Z
|
|
||||||
//////////////////////////////////
|
|
||||||
ThinQRfact (m_zz, m_C, m_Cinv, Q, MQ, Z, MZ);
|
|
||||||
|
|
||||||
//////////////////////////////////
|
|
||||||
// D=MQ
|
|
||||||
//////////////////////////////////
|
|
||||||
for(int b=0;b<nrhs;b++) D[b]=MQ[b]; // LLT rotation of the MZ basis of search dirs
|
|
||||||
|
|
||||||
std::cout << GridLogMessage<<"PrecBlockCGrQ vec computed initial residual and QR fact " <<std::endl;
|
|
||||||
|
|
||||||
ProjectTimer.Reset();
|
|
||||||
PromoteTimer.Reset();
|
|
||||||
DeflateTimer.Reset();
|
|
||||||
CoarseTimer.Reset();
|
|
||||||
SmoothTimer.Reset();
|
|
||||||
FineTimer.Reset();
|
|
||||||
InsertTimer.Reset();
|
|
||||||
|
|
||||||
GridStopWatch M1Timer;
|
|
||||||
GridStopWatch M2Timer;
|
|
||||||
GridStopWatch M3Timer;
|
|
||||||
GridStopWatch LinalgTimer;
|
|
||||||
GridStopWatch InnerProdTimer;
|
|
||||||
|
|
||||||
HDCGTimer.Start();
|
|
||||||
|
|
||||||
std::vector<RealD> rn(nrhs);
|
|
||||||
for (int k=0;k<=MaxIterations;k++){
|
|
||||||
|
|
||||||
////////////////////
|
|
||||||
// Z = AD
|
|
||||||
////////////////////
|
|
||||||
M3Timer.Start();
|
|
||||||
for(int b=0;b<nrhs;b++) _FineLinop.HermOp(D[b], Z[b]);
|
|
||||||
M3Timer.Stop();
|
|
||||||
|
|
||||||
////////////////////
|
|
||||||
// MZ = M1 Z <==== the Multigrid preconditioner
|
|
||||||
////////////////////
|
|
||||||
M1Timer.Start();
|
|
||||||
PcgM1(Z,MZ);
|
|
||||||
M1Timer.Stop();
|
|
||||||
|
|
||||||
FineTimer.Start();
|
|
||||||
////////////////////
|
|
||||||
// M = [D^dag Z]^{-1} = (<Ddag MZ>_M)^{-1} inner prod, generalising Saad derivation of Precon CG
|
|
||||||
////////////////////
|
|
||||||
InnerProdTimer.Start();
|
|
||||||
_BlockCGLinalg.InnerProductMatrix(m_DZ,D,Z);
|
|
||||||
InnerProdTimer.Stop();
|
|
||||||
m_M = m_DZ.inverse();
|
|
||||||
|
|
||||||
///////////////////////////
|
|
||||||
// X = X + D MC
|
|
||||||
///////////////////////////
|
|
||||||
m_tmp = m_M * m_C;
|
|
||||||
LinalgTimer.Start();
|
|
||||||
_BlockCGLinalg.MaddMatrix(X,m_tmp, D,X); // D are the search directions and X takes the updates
|
|
||||||
LinalgTimer.Stop();
|
|
||||||
|
|
||||||
///////////////////////////
|
|
||||||
// QS = Q - M Z
|
|
||||||
// (MQ) S = MQ - M (M1Z)
|
|
||||||
///////////////////////////
|
|
||||||
LinalgTimer.Start();
|
|
||||||
_BlockCGLinalg.MaddMatrix(tmp ,m_M, Z, Q,-1.0);
|
|
||||||
_BlockCGLinalg.MaddMatrix(Mtmp,m_M,MZ,MQ,-1.0);
|
|
||||||
ThinQRfact (m_zz, m_S, m_Sinv, Q, MQ, tmp, Mtmp);
|
|
||||||
LinalgTimer.Stop();
|
|
||||||
|
|
||||||
////////////////////////////
|
|
||||||
// D = MQ + D S^dag
|
|
||||||
////////////////////////////
|
|
||||||
m_tmp = m_S.adjoint();
|
|
||||||
LinalgTimer.Start();
|
|
||||||
_BlockCGLinalg.MaddMatrix(D,m_tmp,D,MQ);
|
|
||||||
LinalgTimer.Stop();
|
|
||||||
|
|
||||||
////////////////////////////
|
|
||||||
// C = S C
|
|
||||||
////////////////////////////
|
|
||||||
m_C = m_S*m_C;
|
|
||||||
|
|
||||||
////////////////////////////
|
|
||||||
// convergence monitor
|
|
||||||
////////////////////////////
|
|
||||||
m_rr = m_C.adjoint() * m_C;
|
|
||||||
|
|
||||||
FineTimer.Stop();
|
|
||||||
|
|
||||||
RealD max_resid=0;
|
|
||||||
RealD rrsum=0;
|
|
||||||
RealD sssum=0;
|
|
||||||
RealD rr;
|
|
||||||
|
|
||||||
for(int b=0;b<nrhs;b++) {
|
|
||||||
rrsum+=real(m_rr(b,b));
|
|
||||||
sssum+=ssq[b];
|
|
||||||
rr = real(m_rr(b,b))/ssq[b];
|
|
||||||
if ( rr > max_resid ) max_resid = rr;
|
|
||||||
}
|
|
||||||
std::cout << GridLogMessage <<
|
|
||||||
"\t Prec BlockCGrQ Iteration "<<k<<" ave resid "<< std::sqrt(rrsum/sssum) << " max "<< std::sqrt(max_resid) <<std::endl;
|
|
||||||
|
|
||||||
|
|
||||||
if ( max_resid < Tolerance*Tolerance ) {
|
|
||||||
|
|
||||||
HDCGTimer.Stop();
|
|
||||||
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
|
|
||||||
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Linalg "<<LinalgTimer.Elapsed()<<std::endl;;
|
|
||||||
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : fine H "<<M3Timer.Elapsed()<<std::endl;;
|
|
||||||
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : prec M1 "<<M1Timer.Elapsed()<<std::endl;;
|
|
||||||
std::cout<<GridLogMessage<<"**** M1 breakdown:"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Project "<<ProjectTimer.Elapsed()<<std::endl;;
|
|
||||||
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Promote "<<PromoteTimer.Elapsed()<<std::endl;;
|
|
||||||
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Deflate "<<DeflateTimer.Elapsed()<<std::endl;;
|
|
||||||
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Coarse "<<CoarseTimer.Elapsed()<<std::endl;;
|
|
||||||
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Fine "<<FineTimer.Elapsed()<<std::endl;;
|
|
||||||
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Smooth "<<SmoothTimer.Elapsed()<<std::endl;;
|
|
||||||
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Insert "<<InsertTimer.Elapsed()<<std::endl;;
|
|
||||||
|
|
||||||
for(int rhs=0;rhs<nrhs;rhs++){
|
|
||||||
|
|
||||||
_FineLinop.HermOp(X[rhs],tmp[rhs]);
|
|
||||||
|
|
||||||
Field mytmp(grid);
|
|
||||||
axpy(mytmp,-1.0,src[rhs],tmp[rhs]);
|
|
||||||
|
|
||||||
RealD xnorm = sqrt(norm2(X[rhs]));
|
|
||||||
RealD srcnorm = sqrt(norm2(src[rhs]));
|
|
||||||
RealD tmpnorm = sqrt(norm2(mytmp));
|
|
||||||
RealD true_residual = tmpnorm/srcnorm;
|
|
||||||
std::cout<<GridLogMessage
|
|
||||||
<<"HDCG: true residual ["<<rhs<<"] is "<<true_residual
|
|
||||||
<<" solution "<<xnorm
|
|
||||||
<<" source "<<srcnorm
|
|
||||||
<<std::endl;
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
HDCGTimer.Stop();
|
|
||||||
std::cout<<GridLogMessage<<"HDCG: PrecBlockCGrQ not converged "<<HDCGTimer.Elapsed()<<std::endl;
|
|
||||||
assert(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
virtual void SolveSingleSystem (std::vector<Field> &src, std::vector<Field> &x)
|
|
||||||
{
|
{
|
||||||
std::cout << GridLogMessage<<"HDCG: mrhs fPcg starting"<<std::endl;
|
std::cout << GridLogMessage<<"HDCG: mrhs fPcg starting"<<std::endl;
|
||||||
src[0].Grid()->Barrier();
|
src[0].Grid()->Barrier();
|
||||||
@ -657,23 +361,15 @@ public:
|
|||||||
CoarseField PleftProjMrhs(this->coarsegridmrhs);
|
CoarseField PleftProjMrhs(this->coarsegridmrhs);
|
||||||
CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
|
CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
|
||||||
|
|
||||||
#undef SMOOTHER_BLOCK_SOLVE
|
|
||||||
#if SMOOTHER_BLOCK_SOLVE
|
|
||||||
this->SmoothTimer.Start();
|
|
||||||
this->_Smoother(in,Min);
|
|
||||||
this->SmoothTimer.Stop();
|
|
||||||
#else
|
|
||||||
for(int rhs=0;rhs<nrhs;rhs++) {
|
for(int rhs=0;rhs<nrhs;rhs++) {
|
||||||
|
|
||||||
this->SmoothTimer.Start();
|
this->SmoothTimer.Start();
|
||||||
this->_Smoother(in[rhs],Min[rhs]);
|
this->_Smoother(in[rhs],Min[rhs]);
|
||||||
this->SmoothTimer.Stop();
|
this->SmoothTimer.Stop();
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
for(int rhs=0;rhs<nrhs;rhs++) {
|
|
||||||
|
|
||||||
this->FineTimer.Start();
|
this->FineTimer.Start();
|
||||||
this->_FineLinop.HermOp(Min[rhs],out[rhs]);
|
this->_FineLinop.HermOp(Min[rhs],out[rhs]);
|
||||||
|
|
||||||
axpy(tmp[rhs],-1.0,out[rhs],in[rhs]); // resid = in - A Min
|
axpy(tmp[rhs],-1.0,out[rhs],in[rhs]); // resid = in - A Min
|
||||||
this->FineTimer.Stop();
|
this->FineTimer.Stop();
|
||||||
|
|
||||||
|
@ -31,58 +31,6 @@ directory
|
|||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
template<class Field>
|
|
||||||
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
|
|
||||||
typedef typename Field::scalar_type scomplex;
|
|
||||||
int Nblock = X.size();
|
|
||||||
for(int b=0;b<Nblock;b++){
|
|
||||||
for(int bp=0;bp<Nblock;bp++) {
|
|
||||||
m(b,bp) = innerProduct(X[b],Y[bp]);
|
|
||||||
}}
|
|
||||||
}
|
|
||||||
template<class Field>
|
|
||||||
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
|
|
||||||
// Should make this cache friendly with site outermost, parallel_for
|
|
||||||
// Deal with case AP aliases with either Y or X
|
|
||||||
//
|
|
||||||
//Could pack "X" and "AP" into a Nblock x Volume dense array.
|
|
||||||
// AP(Nrhs x vol) = Y(Nrhs x vol) + scale * m(nrhs x nrhs) * X(nrhs*vol)
|
|
||||||
typedef typename Field::scalar_type scomplex;
|
|
||||||
int Nblock = AP.size();
|
|
||||||
std::vector<Field> tmp(Nblock,X[0]);
|
|
||||||
for(int b=0;b<Nblock;b++){
|
|
||||||
tmp[b] = Y[b];
|
|
||||||
for(int bp=0;bp<Nblock;bp++) {
|
|
||||||
tmp[b] = tmp[b] +scomplex(scale*m(bp,b))*X[bp];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for(int b=0;b<Nblock;b++){
|
|
||||||
AP[b] = tmp[b];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
template<class Field>
|
|
||||||
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
|
|
||||||
// Should make this cache friendly with site outermost, parallel_for
|
|
||||||
typedef typename Field::scalar_type scomplex;
|
|
||||||
int Nblock = AP.size();
|
|
||||||
for(int b=0;b<Nblock;b++){
|
|
||||||
AP[b] = Zero();
|
|
||||||
for(int bp=0;bp<Nblock;bp++) {
|
|
||||||
AP[b] += scomplex(m(bp,b))*X[bp];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
template<class Field>
|
|
||||||
double normv(const std::vector<Field> &P){
|
|
||||||
int Nblock = P.size();
|
|
||||||
double nn = 0.0;
|
|
||||||
for(int b=0;b<Nblock;b++) {
|
|
||||||
nn+=norm2(P[b]);
|
|
||||||
}
|
|
||||||
return nn;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec };
|
enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec };
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////
|
||||||
@ -139,19 +87,10 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
|
|||||||
sliceInnerProductMatrix(m_rr,R,R,Orthog);
|
sliceInnerProductMatrix(m_rr,R,R,Orthog);
|
||||||
|
|
||||||
// Force manifest hermitian to avoid rounding related
|
// Force manifest hermitian to avoid rounding related
|
||||||
/*
|
|
||||||
int rank=m_rr.rows();
|
|
||||||
for(int r=0;r<rank;r++){
|
|
||||||
for(int s=0;s<rank;s++){
|
|
||||||
std::cout << "QR m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
|
|
||||||
}}
|
|
||||||
*/
|
|
||||||
m_rr = 0.5*(m_rr+m_rr.adjoint());
|
m_rr = 0.5*(m_rr+m_rr.adjoint());
|
||||||
|
|
||||||
Eigen::MatrixXcd L = m_rr.llt().matrixL();
|
Eigen::MatrixXcd L = m_rr.llt().matrixL();
|
||||||
|
|
||||||
// ComplexD det = L.determinant();
|
|
||||||
// std::cout << " Det m_rr "<<det<<std::endl;
|
|
||||||
C = L.adjoint();
|
C = L.adjoint();
|
||||||
Cinv = C.inverse();
|
Cinv = C.inverse();
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
@ -171,20 +110,11 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
|
|||||||
const std::vector<Field> & R)
|
const std::vector<Field> & R)
|
||||||
{
|
{
|
||||||
InnerProductMatrix(m_rr,R,R);
|
InnerProductMatrix(m_rr,R,R);
|
||||||
/*
|
|
||||||
int rank=m_rr.rows();
|
|
||||||
for(int r=0;r<rank;r++){
|
|
||||||
for(int s=0;s<rank;s++){
|
|
||||||
std::cout << "QRvec m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
|
|
||||||
}}
|
|
||||||
*/
|
|
||||||
m_rr = 0.5*(m_rr+m_rr.adjoint());
|
m_rr = 0.5*(m_rr+m_rr.adjoint());
|
||||||
|
|
||||||
Eigen::MatrixXcd L = m_rr.llt().matrixL();
|
Eigen::MatrixXcd L = m_rr.llt().matrixL();
|
||||||
|
|
||||||
// ComplexD det = L.determinant();
|
|
||||||
// std::cout << " Det m_rr "<<det<<std::endl;
|
|
||||||
|
|
||||||
C = L.adjoint();
|
C = L.adjoint();
|
||||||
Cinv = C.inverse();
|
Cinv = C.inverse();
|
||||||
|
|
||||||
@ -256,7 +186,6 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
|
|||||||
sliceNorm(ssq,B,Orthog);
|
sliceNorm(ssq,B,Orthog);
|
||||||
RealD sssum=0;
|
RealD sssum=0;
|
||||||
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
|
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
|
||||||
for(int b=0;b<Nblock;b++) std::cout << "src["<<b<<"]" << ssq[b] <<std::endl;
|
|
||||||
|
|
||||||
sliceNorm(residuals,B,Orthog);
|
sliceNorm(residuals,B,Orthog);
|
||||||
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
|
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
|
||||||
@ -292,9 +221,6 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
|
|||||||
Linop.HermOp(X, AD);
|
Linop.HermOp(X, AD);
|
||||||
tmp = B - AD;
|
tmp = B - AD;
|
||||||
|
|
||||||
sliceNorm(residuals,tmp,Orthog);
|
|
||||||
for(int b=0;b<Nblock;b++) std::cout << "res["<<b<<"]" << residuals[b] <<std::endl;
|
|
||||||
|
|
||||||
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
|
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
|
||||||
D=Q;
|
D=Q;
|
||||||
|
|
||||||
@ -310,8 +236,6 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
|
|||||||
GridStopWatch SolverTimer;
|
GridStopWatch SolverTimer;
|
||||||
SolverTimer.Start();
|
SolverTimer.Start();
|
||||||
|
|
||||||
RealD max_resid=0;
|
|
||||||
|
|
||||||
int k;
|
int k;
|
||||||
for (k = 1; k <= MaxIterations; k++) {
|
for (k = 1; k <= MaxIterations; k++) {
|
||||||
|
|
||||||
@ -356,7 +280,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
|
|||||||
*/
|
*/
|
||||||
m_rr = m_C.adjoint() * m_C;
|
m_rr = m_C.adjoint() * m_C;
|
||||||
|
|
||||||
max_resid=0;
|
RealD max_resid=0;
|
||||||
RealD rrsum=0;
|
RealD rrsum=0;
|
||||||
RealD rr;
|
RealD rr;
|
||||||
|
|
||||||
@ -398,9 +322,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
|
|||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
|
||||||
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge "<<k<<" / "<<MaxIterations
|
|
||||||
<<" residual "<< std::sqrt(max_resid)<< std::endl;
|
|
||||||
|
|
||||||
if (ErrorOnNoConverge) assert(0);
|
if (ErrorOnNoConverge) assert(0);
|
||||||
IterationsToComplete = k;
|
IterationsToComplete = k;
|
||||||
@ -544,6 +466,43 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
|
|||||||
IterationsToComplete = k;
|
IterationsToComplete = k;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
|
||||||
|
for(int b=0;b<Nblock;b++){
|
||||||
|
for(int bp=0;bp<Nblock;bp++) {
|
||||||
|
m(b,bp) = innerProduct(X[b],Y[bp]);
|
||||||
|
}}
|
||||||
|
}
|
||||||
|
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
|
||||||
|
// Should make this cache friendly with site outermost, parallel_for
|
||||||
|
// Deal with case AP aliases with either Y or X
|
||||||
|
std::vector<Field> tmp(Nblock,X[0]);
|
||||||
|
for(int b=0;b<Nblock;b++){
|
||||||
|
tmp[b] = Y[b];
|
||||||
|
for(int bp=0;bp<Nblock;bp++) {
|
||||||
|
tmp[b] = tmp[b] + scomplex(scale*m(bp,b))*X[bp];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for(int b=0;b<Nblock;b++){
|
||||||
|
AP[b] = tmp[b];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
|
||||||
|
// Should make this cache friendly with site outermost, parallel_for
|
||||||
|
for(int b=0;b<Nblock;b++){
|
||||||
|
AP[b] = Zero();
|
||||||
|
for(int bp=0;bp<Nblock;bp++) {
|
||||||
|
AP[b] += scomplex(m(bp,b))*X[bp];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
double normv(const std::vector<Field> &P){
|
||||||
|
double nn = 0.0;
|
||||||
|
for(int b=0;b<Nblock;b++) {
|
||||||
|
nn+=norm2(P[b]);
|
||||||
|
}
|
||||||
|
return nn;
|
||||||
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////
|
||||||
// BlockCGrQvec implementation:
|
// BlockCGrQvec implementation:
|
||||||
//--------------------------
|
//--------------------------
|
||||||
@ -590,7 +549,6 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
|
|||||||
|
|
||||||
RealD sssum=0;
|
RealD sssum=0;
|
||||||
for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);}
|
for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);}
|
||||||
for(int b=0;b<Nblock;b++){ std::cout << "ssq["<<b<<"] "<<ssq[b]<<std::endl;}
|
|
||||||
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
|
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
|
||||||
|
|
||||||
for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);}
|
for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);}
|
||||||
@ -627,7 +585,6 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
|
|||||||
for(int b=0;b<Nblock;b++) {
|
for(int b=0;b<Nblock;b++) {
|
||||||
Linop.HermOp(X[b], AD[b]);
|
Linop.HermOp(X[b], AD[b]);
|
||||||
tmp[b] = B[b] - AD[b];
|
tmp[b] = B[b] - AD[b];
|
||||||
std::cout << "r0["<<b<<"] "<<norm2(tmp[b])<<std::endl;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
|
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
|
||||||
|
@ -38,7 +38,6 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
// single input vec, single output vec.
|
// single input vec, single output vec.
|
||||||
/////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
|
||||||
template <class Field>
|
template <class Field>
|
||||||
class ConjugateGradient : public OperatorFunction<Field> {
|
class ConjugateGradient : public OperatorFunction<Field> {
|
||||||
public:
|
public:
|
||||||
@ -58,22 +57,10 @@ public:
|
|||||||
ErrorOnNoConverge(err_on_no_conv)
|
ErrorOnNoConverge(err_on_no_conv)
|
||||||
{};
|
{};
|
||||||
|
|
||||||
virtual void LogIteration(int k,RealD a,RealD b){
|
|
||||||
// std::cout << "ConjugageGradient::LogIteration() "<<std::endl;
|
|
||||||
};
|
|
||||||
virtual void LogBegin(void){
|
|
||||||
std::cout << "ConjugageGradient::LogBegin() "<<std::endl;
|
|
||||||
};
|
|
||||||
|
|
||||||
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
|
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
|
||||||
|
|
||||||
this->LogBegin();
|
|
||||||
|
|
||||||
GRID_TRACE("ConjugateGradient");
|
GRID_TRACE("ConjugateGradient");
|
||||||
GridStopWatch PreambleTimer;
|
GridStopWatch PreambleTimer;
|
||||||
GridStopWatch ConstructTimer;
|
|
||||||
GridStopWatch NormTimer;
|
|
||||||
GridStopWatch AssignTimer;
|
|
||||||
PreambleTimer.Start();
|
PreambleTimer.Start();
|
||||||
psi.Checkerboard() = src.Checkerboard();
|
psi.Checkerboard() = src.Checkerboard();
|
||||||
|
|
||||||
@ -83,19 +70,14 @@ public:
|
|||||||
//RealD b_pred;
|
//RealD b_pred;
|
||||||
|
|
||||||
// Was doing copies
|
// Was doing copies
|
||||||
ConstructTimer.Start();
|
Field p(src.Grid());
|
||||||
Field p (src.Grid());
|
|
||||||
Field mmp(src.Grid());
|
Field mmp(src.Grid());
|
||||||
Field r (src.Grid());
|
Field r(src.Grid());
|
||||||
ConstructTimer.Stop();
|
|
||||||
|
|
||||||
// Initial residual computation & set up
|
// Initial residual computation & set up
|
||||||
NormTimer.Start();
|
|
||||||
ssq = norm2(src);
|
ssq = norm2(src);
|
||||||
RealD guess = norm2(psi);
|
RealD guess = norm2(psi);
|
||||||
NormTimer.Stop();
|
|
||||||
assert(std::isnan(guess) == 0);
|
assert(std::isnan(guess) == 0);
|
||||||
AssignTimer.Start();
|
|
||||||
if ( guess == 0.0 ) {
|
if ( guess == 0.0 ) {
|
||||||
r = src;
|
r = src;
|
||||||
p = r;
|
p = r;
|
||||||
@ -107,7 +89,6 @@ public:
|
|||||||
a = norm2(p);
|
a = norm2(p);
|
||||||
}
|
}
|
||||||
cp = a;
|
cp = a;
|
||||||
AssignTimer.Stop();
|
|
||||||
|
|
||||||
// Handle trivial case of zero src
|
// Handle trivial case of zero src
|
||||||
if (ssq == 0.){
|
if (ssq == 0.){
|
||||||
@ -183,7 +164,6 @@ public:
|
|||||||
}
|
}
|
||||||
LinearCombTimer.Stop();
|
LinearCombTimer.Stop();
|
||||||
LinalgTimer.Stop();
|
LinalgTimer.Stop();
|
||||||
LogIteration(k,a,b);
|
|
||||||
|
|
||||||
IterationTimer.Stop();
|
IterationTimer.Stop();
|
||||||
if ( (k % 500) == 0 ) {
|
if ( (k % 500) == 0 ) {
|
||||||
@ -240,9 +220,6 @@ public:
|
|||||||
<<" residual "<< std::sqrt(cp / ssq)<< std::endl;
|
<<" residual "<< std::sqrt(cp / ssq)<< std::endl;
|
||||||
SolverTimer.Stop();
|
SolverTimer.Stop();
|
||||||
std::cout << GridLogMessage << "\tPreamble " << PreambleTimer.Elapsed() <<std::endl;
|
std::cout << GridLogMessage << "\tPreamble " << PreambleTimer.Elapsed() <<std::endl;
|
||||||
std::cout << GridLogMessage << "\tConstruct " << ConstructTimer.Elapsed() <<std::endl;
|
|
||||||
std::cout << GridLogMessage << "\tNorm " << NormTimer.Elapsed() <<std::endl;
|
|
||||||
std::cout << GridLogMessage << "\tAssign " << AssignTimer.Elapsed() <<std::endl;
|
|
||||||
std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
|
std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
|
||||||
std::cout << GridLogMessage << "Solver breakdown "<<std::endl;
|
std::cout << GridLogMessage << "Solver breakdown "<<std::endl;
|
||||||
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
|
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
|
||||||
@ -256,118 +233,5 @@ public:
|
|||||||
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
template <class Field>
|
|
||||||
class ConjugateGradientPolynomial : public ConjugateGradient<Field> {
|
|
||||||
public:
|
|
||||||
// Optionally record the CG polynomial
|
|
||||||
std::vector<double> ak;
|
|
||||||
std::vector<double> bk;
|
|
||||||
std::vector<double> poly_p;
|
|
||||||
std::vector<double> poly_r;
|
|
||||||
std::vector<double> poly_Ap;
|
|
||||||
std::vector<double> polynomial;
|
|
||||||
|
|
||||||
public:
|
|
||||||
ConjugateGradientPolynomial(RealD tol, Integer maxit, bool err_on_no_conv = true)
|
|
||||||
: ConjugateGradient<Field>(tol,maxit,err_on_no_conv)
|
|
||||||
{ };
|
|
||||||
void PolyHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
|
|
||||||
{
|
|
||||||
Field tmp(src.Grid());
|
|
||||||
Field AtoN(src.Grid());
|
|
||||||
AtoN = src;
|
|
||||||
psi=AtoN*polynomial[0];
|
|
||||||
for(int n=1;n<polynomial.size();n++){
|
|
||||||
tmp = AtoN;
|
|
||||||
Linop.HermOp(tmp,AtoN);
|
|
||||||
psi = psi + polynomial[n]*AtoN;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
void CGsequenceHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &x)
|
|
||||||
{
|
|
||||||
Field Ap(src.Grid());
|
|
||||||
Field r(src.Grid());
|
|
||||||
Field p(src.Grid());
|
|
||||||
p=src;
|
|
||||||
r=src;
|
|
||||||
x=Zero();
|
|
||||||
x.Checkerboard()=src.Checkerboard();
|
|
||||||
for(int k=0;k<ak.size();k++){
|
|
||||||
x = x + ak[k]*p;
|
|
||||||
Linop.HermOp(p,Ap);
|
|
||||||
r = r - ak[k] * Ap;
|
|
||||||
p = r + bk[k] * p;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
void Solve(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
|
|
||||||
{
|
|
||||||
psi=Zero();
|
|
||||||
this->operator ()(Linop,src,psi);
|
|
||||||
}
|
|
||||||
virtual void LogBegin(void)
|
|
||||||
{
|
|
||||||
std::cout << "ConjugageGradientPolynomial::LogBegin() "<<std::endl;
|
|
||||||
ak.resize(0);
|
|
||||||
bk.resize(0);
|
|
||||||
polynomial.resize(0);
|
|
||||||
poly_Ap.resize(0);
|
|
||||||
poly_Ap.resize(0);
|
|
||||||
poly_p.resize(1);
|
|
||||||
poly_r.resize(1);
|
|
||||||
poly_p[0]=1.0;
|
|
||||||
poly_r[0]=1.0;
|
|
||||||
};
|
|
||||||
virtual void LogIteration(int k,RealD a,RealD b)
|
|
||||||
{
|
|
||||||
// With zero guess,
|
|
||||||
// p = r = src
|
|
||||||
//
|
|
||||||
// iterate:
|
|
||||||
// x = x + a p
|
|
||||||
// r = r - a A p
|
|
||||||
// p = r + b p
|
|
||||||
//
|
|
||||||
// [0]
|
|
||||||
// r = x
|
|
||||||
// p = x
|
|
||||||
// Ap=0
|
|
||||||
//
|
|
||||||
// [1]
|
|
||||||
// Ap = A x + 0 ==> shift poly P right by 1 and add 0.
|
|
||||||
// x = x + a p ==> add polynomials term by term
|
|
||||||
// r = r - a A p ==> add polynomials term by term
|
|
||||||
// p = r + b p ==> add polynomials term by term
|
|
||||||
//
|
|
||||||
std::cout << "ConjugageGradientPolynomial::LogIteration() "<<k<<std::endl;
|
|
||||||
ak.push_back(a);
|
|
||||||
bk.push_back(b);
|
|
||||||
// Ap= right_shift(p)
|
|
||||||
poly_Ap.resize(k+1);
|
|
||||||
poly_Ap[0]=0.0;
|
|
||||||
for(int i=0;i<k;i++){
|
|
||||||
poly_Ap[i+1]=poly_p[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
// x = x + a p
|
|
||||||
polynomial.resize(k);
|
|
||||||
polynomial[k-1]=0.0;
|
|
||||||
for(int i=0;i<k;i++){
|
|
||||||
polynomial[i] = polynomial[i] + a * poly_p[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
// r = r - a Ap
|
|
||||||
// p = r + b p
|
|
||||||
poly_r.resize(k+1);
|
|
||||||
poly_p.resize(k+1);
|
|
||||||
poly_r[k] = poly_p[k] = 0.0;
|
|
||||||
for(int i=0;i<k+1;i++){
|
|
||||||
poly_r[i] = poly_r[i] - a * poly_Ap[i];
|
|
||||||
poly_p[i] = poly_r[i] + b * poly_p[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
#endif
|
#endif
|
||||||
|
@ -102,11 +102,11 @@ public:
|
|||||||
assert(mass.size()==nshift);
|
assert(mass.size()==nshift);
|
||||||
assert(mresidual.size()==nshift);
|
assert(mresidual.size()==nshift);
|
||||||
|
|
||||||
// remove dynamic sized arrays on stack; 2d is a pain with vector
|
// dynamic sized arrays on stack; 2d is a pain with vector
|
||||||
std::vector<RealD> bs(nshift);
|
RealD bs[nshift];
|
||||||
std::vector<RealD> rsq(nshift);
|
RealD rsq[nshift];
|
||||||
std::vector<std::array<RealD,2> > z(nshift);
|
RealD z[nshift][2];
|
||||||
std::vector<int> converged(nshift);
|
int converged[nshift];
|
||||||
|
|
||||||
const int primary =0;
|
const int primary =0;
|
||||||
|
|
||||||
|
@ -123,11 +123,11 @@ public:
|
|||||||
assert(mresidual.size()==nshift);
|
assert(mresidual.size()==nshift);
|
||||||
|
|
||||||
// dynamic sized arrays on stack; 2d is a pain with vector
|
// dynamic sized arrays on stack; 2d is a pain with vector
|
||||||
std::vector<RealD> bs(nshift);
|
RealD bs[nshift];
|
||||||
std::vector<RealD> rsq(nshift);
|
RealD rsq[nshift];
|
||||||
std::vector<RealD> rsqf(nshift);
|
RealD rsqf[nshift];
|
||||||
std::vector<std::array<RealD,2> > z(nshift);
|
RealD z[nshift][2];
|
||||||
std::vector<int> converged(nshift);
|
int converged[nshift];
|
||||||
|
|
||||||
const int primary =0;
|
const int primary =0;
|
||||||
|
|
||||||
|
@ -156,11 +156,11 @@ public:
|
|||||||
assert(mresidual.size()==nshift);
|
assert(mresidual.size()==nshift);
|
||||||
|
|
||||||
// dynamic sized arrays on stack; 2d is a pain with vector
|
// dynamic sized arrays on stack; 2d is a pain with vector
|
||||||
std::vector<RealD> bs(nshift);
|
RealD bs[nshift];
|
||||||
std::vector<RealD> rsq(nshift);
|
RealD rsq[nshift];
|
||||||
std::vector<RealD> rsqf(nshift);
|
RealD rsqf[nshift];
|
||||||
std::vector<std::array<RealD,2> > z(nshift);
|
RealD z[nshift][2];
|
||||||
std::vector<int> converged(nshift);
|
int converged[nshift];
|
||||||
|
|
||||||
const int primary =0;
|
const int primary =0;
|
||||||
|
|
||||||
|
@ -279,16 +279,16 @@ public:
|
|||||||
Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
|
Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
|
||||||
diagonalize(eval2,lmd2,lme2,Nu,Nm,Nm,Qt,grid);
|
diagonalize(eval2,lmd2,lme2,Nu,Nm,Nm,Qt,grid);
|
||||||
_sort.push(eval2,Nm);
|
_sort.push(eval2,Nm);
|
||||||
// Glog << "#Ritz value before shift: "<< std::endl;
|
Glog << "#Ritz value before shift: "<< std::endl;
|
||||||
for(int i=0; i<Nm; ++i){
|
for(int i=0; i<Nm; ++i){
|
||||||
// std::cout.precision(13);
|
std::cout.precision(13);
|
||||||
// std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
|
std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
|
||||||
// std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
|
std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
//----------------------------------------------------------------------
|
//----------------------------------------------------------------------
|
||||||
if ( Nm>Nk ) {
|
if ( Nm>Nk ) {
|
||||||
// Glog <<" #Apply shifted QR transformations "<<std::endl;
|
Glog <<" #Apply shifted QR transformations "<<std::endl;
|
||||||
//int k2 = Nk+Nu;
|
//int k2 = Nk+Nu;
|
||||||
int k2 = Nk;
|
int k2 = Nk;
|
||||||
|
|
||||||
@ -326,7 +326,7 @@ public:
|
|||||||
Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
|
Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
|
||||||
diagonalize(eval2,lmd2,lme2,Nu,Nk,Nm,Qt,grid);
|
diagonalize(eval2,lmd2,lme2,Nu,Nk,Nm,Qt,grid);
|
||||||
_sort.push(eval2,Nk);
|
_sort.push(eval2,Nk);
|
||||||
// Glog << "#Ritz value after shift: "<< std::endl;
|
Glog << "#Ritz value after shift: "<< std::endl;
|
||||||
for(int i=0; i<Nk; ++i){
|
for(int i=0; i<Nk; ++i){
|
||||||
// std::cout.precision(13);
|
// std::cout.precision(13);
|
||||||
// std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
|
// std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
|
||||||
@ -644,7 +644,7 @@ private:
|
|||||||
// for (int u=0; u<mrhs; ++u) Glog << " out["<<u<<"] = "<<norm2(out[u])<<std::endl;
|
// for (int u=0; u<mrhs; ++u) Glog << " out["<<u<<"] = "<<norm2(out[u])<<std::endl;
|
||||||
k_start +=mrhs;
|
k_start +=mrhs;
|
||||||
}
|
}
|
||||||
// Glog << "LinAlg "<< std::endl;
|
Glog << "LinAlg "<< std::endl;
|
||||||
|
|
||||||
if (b>0) {
|
if (b>0) {
|
||||||
for (int u=0; u<Nu; ++u) {
|
for (int u=0; u<Nu; ++u) {
|
||||||
@ -678,7 +678,7 @@ private:
|
|||||||
}
|
}
|
||||||
w_copy[u] = w[u];
|
w_copy[u] = w[u];
|
||||||
}
|
}
|
||||||
// Glog << "LinAlg done"<< std::endl;
|
Glog << "LinAlg done"<< std::endl;
|
||||||
|
|
||||||
// In block version, the steps 6 and 7 in Lanczos construction is
|
// In block version, the steps 6 and 7 in Lanczos construction is
|
||||||
// replaced by the QR decomposition of new basis block.
|
// replaced by the QR decomposition of new basis block.
|
||||||
@ -691,15 +691,15 @@ private:
|
|||||||
}
|
}
|
||||||
|
|
||||||
// re-orthogonalization for numerical stability
|
// re-orthogonalization for numerical stability
|
||||||
// Glog << "Gram Schmidt"<< std::endl;
|
Glog << "Gram Schmidt"<< std::endl;
|
||||||
orthogonalize(w,Nu,evec,R);
|
orthogonalize(w,Nu,evec,R);
|
||||||
// QR part
|
// QR part
|
||||||
for (int u=1; u<Nu; ++u) {
|
for (int u=1; u<Nu; ++u) {
|
||||||
orthogonalize(w[u],w,u);
|
orthogonalize(w[u],w,u);
|
||||||
}
|
}
|
||||||
// Glog << "Gram Schmidt done "<< std::endl;
|
Glog << "Gram Schmidt done "<< std::endl;
|
||||||
|
|
||||||
// Glog << "LinAlg "<< std::endl;
|
Glog << "LinAlg "<< std::endl;
|
||||||
for (int u=0; u<Nu; ++u) {
|
for (int u=0; u<Nu; ++u) {
|
||||||
//for (int v=0; v<Nu; ++v) {
|
//for (int v=0; v<Nu; ++v) {
|
||||||
for (int v=u; v<Nu; ++v) {
|
for (int v=u; v<Nu; ++v) {
|
||||||
@ -716,7 +716,7 @@ private:
|
|||||||
// Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl;
|
// Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Glog << "LinAlg done "<< std::endl;
|
Glog << "LinAlg done "<< std::endl;
|
||||||
|
|
||||||
if (b < Nm/Nu-1) {
|
if (b < Nm/Nu-1) {
|
||||||
for (int u=0; u<Nu; ++u) {
|
for (int u=0; u<Nu; ++u) {
|
||||||
@ -935,7 +935,7 @@ if (1){
|
|||||||
int Nu, int Nb, int Nk, int Nm,
|
int Nu, int Nb, int Nk, int Nm,
|
||||||
Eigen::MatrixXcd& M)
|
Eigen::MatrixXcd& M)
|
||||||
{
|
{
|
||||||
// Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n';
|
Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n';
|
||||||
assert( Nk%Nu == 0 && Nm%Nu == 0 );
|
assert( Nk%Nu == 0 && Nm%Nu == 0 );
|
||||||
assert( Nk <= Nm );
|
assert( Nk <= Nm );
|
||||||
M = Eigen::MatrixXcd::Zero(Nk,Nk);
|
M = Eigen::MatrixXcd::Zero(Nk,Nk);
|
||||||
@ -953,7 +953,7 @@ if (1){
|
|||||||
M(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu];
|
M(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Glog << "unpackHermitBlockTriDiagMatToEigen() end" << std::endl;
|
Glog << "unpackHermitBlockTriDiagMatToEigen() end" << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -963,7 +963,7 @@ if (1){
|
|||||||
int Nu, int Nb, int Nk, int Nm,
|
int Nu, int Nb, int Nk, int Nm,
|
||||||
Eigen::MatrixXcd& M)
|
Eigen::MatrixXcd& M)
|
||||||
{
|
{
|
||||||
// Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n';
|
Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n';
|
||||||
assert( Nk%Nu == 0 && Nm%Nu == 0 );
|
assert( Nk%Nu == 0 && Nm%Nu == 0 );
|
||||||
assert( Nk <= Nm );
|
assert( Nk <= Nm );
|
||||||
|
|
||||||
@ -979,7 +979,7 @@ if (1){
|
|||||||
lme[u][k-Nu] = M(u+(k/Nu)*Nu,k-Nu);
|
lme[u][k-Nu] = M(u+(k/Nu)*Nu,k-Nu);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Glog << "packHermitBlockTriDiagMatfromEigen() end" <<std::endl;
|
Glog << "packHermitBlockTriDiagMatfromEigen() end" <<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -988,7 +988,7 @@ if (1){
|
|||||||
RealD Dsh,
|
RealD Dsh,
|
||||||
Eigen::MatrixXcd& Qprod)
|
Eigen::MatrixXcd& Qprod)
|
||||||
{
|
{
|
||||||
// Glog << "shiftedQRDecompEigen() begin" << '\n';
|
Glog << "shiftedQRDecompEigen() begin" << '\n';
|
||||||
Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm);
|
Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm);
|
||||||
Eigen::MatrixXcd R = Eigen::MatrixXcd::Zero(Nm,Nm);
|
Eigen::MatrixXcd R = Eigen::MatrixXcd::Zero(Nm,Nm);
|
||||||
Eigen::MatrixXcd Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
|
Eigen::MatrixXcd Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
|
||||||
@ -1004,7 +1004,7 @@ if (1){
|
|||||||
// lower triangular part used to represent series
|
// lower triangular part used to represent series
|
||||||
// of Q sequence.
|
// of Q sequence.
|
||||||
|
|
||||||
// Glog << "shiftedQRDecompEigen() Housholder & QR" << '\n';
|
Glog << "shiftedQRDecompEigen() Housholder & QR" << '\n';
|
||||||
// equivalent operation of Qprod *= Q
|
// equivalent operation of Qprod *= Q
|
||||||
//M = Eigen::MatrixXcd::Zero(Nm,Nm);
|
//M = Eigen::MatrixXcd::Zero(Nm,Nm);
|
||||||
|
|
||||||
@ -1025,7 +1025,7 @@ if (1){
|
|||||||
|
|
||||||
Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
|
Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
|
||||||
|
|
||||||
// Glog << "shiftedQRDecompEigen() Mtmp create" << '\n';
|
Glog << "shiftedQRDecompEigen() Mtmp create" << '\n';
|
||||||
for (int i=0; i<Nm; ++i) {
|
for (int i=0; i<Nm; ++i) {
|
||||||
for (int j=0; j<Nm-(Nu+1); ++j) {
|
for (int j=0; j<Nm-(Nu+1); ++j) {
|
||||||
for (int k=0; k<Nu+1+j; ++k) {
|
for (int k=0; k<Nu+1+j; ++k) {
|
||||||
@ -1033,7 +1033,7 @@ if (1){
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Glog << "shiftedQRDecompEigen() Mtmp loop1" << '\n';
|
Glog << "shiftedQRDecompEigen() Mtmp loop1" << '\n';
|
||||||
for (int i=0; i<Nm; ++i) {
|
for (int i=0; i<Nm; ++i) {
|
||||||
for (int j=Nm-(Nu+1); j<Nm; ++j) {
|
for (int j=Nm-(Nu+1); j<Nm; ++j) {
|
||||||
for (int k=0; k<Nm; ++k) {
|
for (int k=0; k<Nm; ++k) {
|
||||||
@ -1041,7 +1041,7 @@ if (1){
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Glog << "shiftedQRDecompEigen() Mtmp loop2" << '\n';
|
Glog << "shiftedQRDecompEigen() Mtmp loop2" << '\n';
|
||||||
|
|
||||||
//static int ntimes = 2;
|
//static int ntimes = 2;
|
||||||
//for (int j=0; j<Nm-(ntimes*Nu); ++j) {
|
//for (int j=0; j<Nm-(ntimes*Nu); ++j) {
|
||||||
@ -1067,13 +1067,13 @@ if (1){
|
|||||||
Mtmp(j,i) = conj(Mtmp(i,j));
|
Mtmp(j,i) = conj(Mtmp(i,j));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Glog << "shiftedQRDecompEigen() Mtmp loop3" << '\n';
|
Glog << "shiftedQRDecompEigen() Mtmp loop3" << '\n';
|
||||||
|
|
||||||
for (int i=0; i<Nm; ++i) {
|
for (int i=0; i<Nm; ++i) {
|
||||||
Mtmp(i,i) = real(Mtmp(i,i)) + Dsh;
|
Mtmp(i,i) = real(Mtmp(i,i)) + Dsh;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Glog << "shiftedQRDecompEigen() Mtmp loop4" << '\n';
|
Glog << "shiftedQRDecompEigen() Mtmp loop4" << '\n';
|
||||||
M = Mtmp;
|
M = Mtmp;
|
||||||
|
|
||||||
//M = Q.adjoint()*(M*Q);
|
//M = Q.adjoint()*(M*Q);
|
||||||
@ -1085,7 +1085,7 @@ if (1){
|
|||||||
// }
|
// }
|
||||||
//}
|
//}
|
||||||
|
|
||||||
// Glog << "shiftedQRDecompEigen() end" <<std::endl;
|
Glog << "shiftedQRDecompEigen() end" <<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
void exampleQRDecompEigen(void)
|
void exampleQRDecompEigen(void)
|
||||||
|
@ -60,32 +60,6 @@ public:
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template<class Field> class NormalResidual : public LinearFunction<Field>{
|
|
||||||
private:
|
|
||||||
SparseMatrixBase<Field> & _Matrix;
|
|
||||||
OperatorFunction<Field> & _HermitianSolver;
|
|
||||||
LinearFunction<Field> & _Guess;
|
|
||||||
public:
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////
|
|
||||||
// Wrap the usual normal equations trick
|
|
||||||
/////////////////////////////////////////////////////
|
|
||||||
NormalResidual(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver,
|
|
||||||
LinearFunction<Field> &Guess)
|
|
||||||
: _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {};
|
|
||||||
|
|
||||||
void operator() (const Field &in, Field &out){
|
|
||||||
|
|
||||||
Field res(in.Grid());
|
|
||||||
Field tmp(in.Grid());
|
|
||||||
|
|
||||||
MMdagLinearOperator<SparseMatrixBase<Field>,Field> MMdagOp(_Matrix);
|
|
||||||
_Guess(in,res);
|
|
||||||
_HermitianSolver(MMdagOp,in,res); // M Mdag res = in ;
|
|
||||||
_Matrix.Mdag(res,out); // out = Mdag res
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
template<class Field> class HPDSolver : public LinearFunction<Field> {
|
template<class Field> class HPDSolver : public LinearFunction<Field> {
|
||||||
private:
|
private:
|
||||||
LinearOperatorBase<Field> & _Matrix;
|
LinearOperatorBase<Field> & _Matrix;
|
||||||
|
@ -20,7 +20,7 @@ template<class Field> class PowerMethod
|
|||||||
RealD evalMaxApprox = 0.0;
|
RealD evalMaxApprox = 0.0;
|
||||||
auto src_n = src;
|
auto src_n = src;
|
||||||
auto tmp = src;
|
auto tmp = src;
|
||||||
const int _MAX_ITER_EST_ = 200;
|
const int _MAX_ITER_EST_ = 100;
|
||||||
|
|
||||||
for (int i=0;i<_MAX_ITER_EST_;i++) {
|
for (int i=0;i<_MAX_ITER_EST_;i++) {
|
||||||
|
|
||||||
@ -30,17 +30,18 @@ template<class Field> class PowerMethod
|
|||||||
RealD vden = norm2(src_n);
|
RealD vden = norm2(src_n);
|
||||||
RealD na = vnum/vden;
|
RealD na = vnum/vden;
|
||||||
|
|
||||||
std::cout << GridLogMessage << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
|
std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
|
||||||
|
|
||||||
// if ( (fabs(evalMaxApprox/na - 1.0) < 0.0001) || (i==_MAX_ITER_EST_-1) ) {
|
if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) {
|
||||||
// evalMaxApprox = na;
|
|
||||||
// return evalMaxApprox;
|
|
||||||
// }
|
|
||||||
evalMaxApprox = na;
|
evalMaxApprox = na;
|
||||||
src_n = tmp;
|
|
||||||
}
|
|
||||||
std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
|
std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
|
||||||
return evalMaxApprox;
|
return evalMaxApprox;
|
||||||
}
|
}
|
||||||
|
evalMaxApprox = na;
|
||||||
|
src_n = tmp;
|
||||||
|
}
|
||||||
|
assert(0);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -1,76 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
namespace Grid {
|
|
||||||
|
|
||||||
class Band
|
|
||||||
{
|
|
||||||
RealD lo, hi;
|
|
||||||
public:
|
|
||||||
Band(RealD _lo,RealD _hi)
|
|
||||||
{
|
|
||||||
lo=_lo;
|
|
||||||
hi=_hi;
|
|
||||||
}
|
|
||||||
RealD operator() (RealD x){
|
|
||||||
if ( x>lo && x<hi ){
|
|
||||||
return 1.0;
|
|
||||||
} else {
|
|
||||||
return 0.0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
class PowerSpectrum
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
|
|
||||||
template<typename T> static RealD normalise(T& v)
|
|
||||||
{
|
|
||||||
RealD nn = norm2(v);
|
|
||||||
nn = sqrt(nn);
|
|
||||||
v = v * (1.0/nn);
|
|
||||||
return nn;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<RealD> ranges;
|
|
||||||
std::vector<int> order;
|
|
||||||
|
|
||||||
PowerSpectrum( std::vector<RealD> &bins, std::vector<int> &_order ) : ranges(bins), order(_order) { };
|
|
||||||
|
|
||||||
template<class Field>
|
|
||||||
RealD operator()(LinearOperatorBase<Field> &HermOp, const Field &src)
|
|
||||||
{
|
|
||||||
GridBase *grid = src.Grid();
|
|
||||||
int N=ranges.size();
|
|
||||||
RealD hi = ranges[N-1];
|
|
||||||
|
|
||||||
RealD lo_band = 0.0;
|
|
||||||
RealD hi_band;
|
|
||||||
RealD nn=norm2(src);
|
|
||||||
RealD ss=0.0;
|
|
||||||
|
|
||||||
Field tmp = src;
|
|
||||||
|
|
||||||
for(int b=0;b<N;b++){
|
|
||||||
hi_band = ranges[b];
|
|
||||||
Band Notch(lo_band,hi_band);
|
|
||||||
|
|
||||||
Chebyshev<Field> polynomial;
|
|
||||||
polynomial.Init(0.0,hi,order[b],Notch);
|
|
||||||
polynomial.JacksonSmooth();
|
|
||||||
|
|
||||||
polynomial(HermOp,src,tmp) ;
|
|
||||||
|
|
||||||
RealD p=norm2(tmp);
|
|
||||||
ss=ss+p;
|
|
||||||
std::cout << GridLogMessage << " PowerSpectrum Band["<<lo_band<<","<<hi_band<<"] power "<<norm2(tmp)/nn<<std::endl;
|
|
||||||
|
|
||||||
lo_band=hi_band;
|
|
||||||
}
|
|
||||||
std::cout << GridLogMessage << " PowerSpectrum total power "<<ss/nn<<std::endl;
|
|
||||||
std::cout << GridLogMessage << " PowerSpectrum total power (unnormalised) "<<nn<<std::endl;
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
}
|
|
@ -99,7 +99,7 @@ public:
|
|||||||
CoarseMatrix AselfInvEven;
|
CoarseMatrix AselfInvEven;
|
||||||
CoarseMatrix AselfInvOdd;
|
CoarseMatrix AselfInvOdd;
|
||||||
|
|
||||||
deviceVector<RealD> dag_factor;
|
Vector<RealD> dag_factor;
|
||||||
|
|
||||||
///////////////////////
|
///////////////////////
|
||||||
// Interface
|
// Interface
|
||||||
@ -124,13 +124,9 @@ public:
|
|||||||
int npoint = geom.npoint;
|
int npoint = geom.npoint;
|
||||||
typedef LatticeView<Cobj> Aview;
|
typedef LatticeView<Cobj> Aview;
|
||||||
|
|
||||||
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
|
Vector<Aview> AcceleratorViewContainer;
|
||||||
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
|
|
||||||
|
|
||||||
for(int p=0;p<geom.npoint;p++) {
|
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
|
||||||
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
|
|
||||||
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
|
|
||||||
}
|
|
||||||
Aview *Aview_p = & AcceleratorViewContainer[0];
|
Aview *Aview_p = & AcceleratorViewContainer[0];
|
||||||
|
|
||||||
const int Nsimd = CComplex::Nsimd();
|
const int Nsimd = CComplex::Nsimd();
|
||||||
@ -165,7 +161,7 @@ public:
|
|||||||
coalescedWrite(out_v[ss](b),res);
|
coalescedWrite(out_v[ss](b),res);
|
||||||
});
|
});
|
||||||
|
|
||||||
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
|
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
|
||||||
};
|
};
|
||||||
|
|
||||||
void Mdag (const CoarseVector &in, CoarseVector &out)
|
void Mdag (const CoarseVector &in, CoarseVector &out)
|
||||||
@ -194,14 +190,9 @@ public:
|
|||||||
int npoint = geom.npoint;
|
int npoint = geom.npoint;
|
||||||
typedef LatticeView<Cobj> Aview;
|
typedef LatticeView<Cobj> Aview;
|
||||||
|
|
||||||
|
Vector<Aview> AcceleratorViewContainer;
|
||||||
|
|
||||||
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
|
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
|
||||||
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
|
|
||||||
|
|
||||||
for(int p=0;p<geom.npoint;p++) {
|
|
||||||
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
|
|
||||||
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
|
|
||||||
}
|
|
||||||
Aview *Aview_p = & AcceleratorViewContainer[0];
|
Aview *Aview_p = & AcceleratorViewContainer[0];
|
||||||
|
|
||||||
const int Nsimd = CComplex::Nsimd();
|
const int Nsimd = CComplex::Nsimd();
|
||||||
@ -210,10 +201,10 @@ public:
|
|||||||
|
|
||||||
int osites=Grid()->oSites();
|
int osites=Grid()->oSites();
|
||||||
|
|
||||||
deviceVector<int> points(geom.npoint);
|
Vector<int> points(geom.npoint, 0);
|
||||||
for(int p=0; p<geom.npoint; p++) {
|
for(int p=0; p<geom.npoint; p++)
|
||||||
acceleratorPut(points[p],geom.points_dagger[p]);
|
points[p] = geom.points_dagger[p];
|
||||||
}
|
|
||||||
auto points_p = &points[0];
|
auto points_p = &points[0];
|
||||||
|
|
||||||
RealD* dag_factor_p = &dag_factor[0];
|
RealD* dag_factor_p = &dag_factor[0];
|
||||||
@ -245,7 +236,7 @@ public:
|
|||||||
coalescedWrite(out_v[ss](b),res);
|
coalescedWrite(out_v[ss](b),res);
|
||||||
});
|
});
|
||||||
|
|
||||||
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
|
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
|
||||||
}
|
}
|
||||||
|
|
||||||
void MdirComms(const CoarseVector &in)
|
void MdirComms(const CoarseVector &in)
|
||||||
@ -260,14 +251,8 @@ public:
|
|||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
|
||||||
typedef LatticeView<Cobj> Aview;
|
typedef LatticeView<Cobj> Aview;
|
||||||
|
Vector<Aview> AcceleratorViewContainer;
|
||||||
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
|
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
|
||||||
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
|
|
||||||
|
|
||||||
for(int p=0;p<geom.npoint;p++) {
|
|
||||||
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
|
|
||||||
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
|
|
||||||
}
|
|
||||||
Aview *Aview_p = & AcceleratorViewContainer[0];
|
Aview *Aview_p = & AcceleratorViewContainer[0];
|
||||||
|
|
||||||
autoView( out_v , out, AcceleratorWrite);
|
autoView( out_v , out, AcceleratorWrite);
|
||||||
@ -300,7 +285,7 @@ public:
|
|||||||
}
|
}
|
||||||
coalescedWrite(out_v[ss](b),res);
|
coalescedWrite(out_v[ss](b),res);
|
||||||
});
|
});
|
||||||
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
|
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
|
||||||
}
|
}
|
||||||
void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
|
void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
|
||||||
{
|
{
|
||||||
@ -484,20 +469,14 @@ public:
|
|||||||
|
|
||||||
// determine in what order we need the points
|
// determine in what order we need the points
|
||||||
int npoint = geom.npoint-1;
|
int npoint = geom.npoint-1;
|
||||||
deviceVector<int> points(npoint);
|
Vector<int> points(npoint, 0);
|
||||||
for(int p=0; p<npoint; p++) {
|
for(int p=0; p<npoint; p++)
|
||||||
int val = (dag && !hermitian) ? geom.points_dagger[p] : p;
|
points[p] = (dag && !hermitian) ? geom.points_dagger[p] : p;
|
||||||
acceleratorPut(points[p], val);
|
|
||||||
}
|
|
||||||
auto points_p = &points[0];
|
auto points_p = &points[0];
|
||||||
|
|
||||||
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
|
Vector<Aview> AcceleratorViewContainer;
|
||||||
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
|
for(int p=0;p<npoint;p++) AcceleratorViewContainer.push_back(a[p].View(AcceleratorRead));
|
||||||
|
|
||||||
for(int p=0;p<geom.npoint;p++) {
|
|
||||||
hAcceleratorViewContainer[p] = a[p].View(AcceleratorRead);
|
|
||||||
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
|
|
||||||
}
|
|
||||||
Aview *Aview_p = & AcceleratorViewContainer[0];
|
Aview *Aview_p = & AcceleratorViewContainer[0];
|
||||||
|
|
||||||
const int Nsimd = CComplex::Nsimd();
|
const int Nsimd = CComplex::Nsimd();
|
||||||
@ -560,7 +539,7 @@ public:
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
for(int p=0;p<npoint;p++) hAcceleratorViewContainer[p].ViewClose();
|
for(int p=0;p<npoint;p++) AcceleratorViewContainer[p].ViewClose();
|
||||||
}
|
}
|
||||||
|
|
||||||
CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) :
|
CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) :
|
||||||
@ -611,13 +590,11 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
// GPU readable prefactor
|
// GPU readable prefactor
|
||||||
std::vector<RealD> h_dag_factor(nbasis*nbasis);
|
|
||||||
thread_for(i, nbasis*nbasis, {
|
thread_for(i, nbasis*nbasis, {
|
||||||
int j = i/nbasis;
|
int j = i/nbasis;
|
||||||
int k = i%nbasis;
|
int k = i%nbasis;
|
||||||
h_dag_factor[i] = dag_factor_eigen(j, k);
|
dag_factor[i] = dag_factor_eigen(j, k);
|
||||||
});
|
});
|
||||||
acceleratorCopyToDevice(&h_dag_factor[0],&dag_factor[0],dag_factor.size()*sizeof(RealD));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
|
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
|
||||||
|
@ -174,11 +174,21 @@ template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const d
|
|||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Template typedefs
|
// Template typedefs
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
template<class T> using hostVector = std::vector<T,alignedAllocator<T> >; // Needs autoview
|
#ifdef ACCELERATOR_CSHIFT
|
||||||
template<class T> using Vector = std::vector<T,uvmAllocator<T> >; //
|
// Cshift on device
|
||||||
template<class T> using uvmVector = std::vector<T,uvmAllocator<T> >; // auto migrating page
|
template<class T> using cshiftAllocator = devAllocator<T>;
|
||||||
template<class T> using deviceVector = std::vector<T,devAllocator<T> >; // device vector
|
#else
|
||||||
|
// Cshift on host
|
||||||
|
template<class T> using cshiftAllocator = std::allocator<T>;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
|
||||||
|
template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;
|
||||||
|
template<class T> using commVector = std::vector<T,devAllocator<T> >;
|
||||||
|
template<class T> using deviceVector = std::vector<T,devAllocator<T> >;
|
||||||
|
template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
|
||||||
|
|
||||||
|
/*
|
||||||
template<class T> class vecView
|
template<class T> class vecView
|
||||||
{
|
{
|
||||||
protected:
|
protected:
|
||||||
@ -187,9 +197,8 @@ template<class T> class vecView
|
|||||||
ViewMode mode;
|
ViewMode mode;
|
||||||
void * cpu_ptr;
|
void * cpu_ptr;
|
||||||
public:
|
public:
|
||||||
// Rvalue accessor
|
|
||||||
accelerator_inline T & operator[](size_t i) const { return this->data[i]; };
|
accelerator_inline T & operator[](size_t i) const { return this->data[i]; };
|
||||||
vecView(Vector<T> &refer_to_me,ViewMode _mode)
|
vecView(std::vector<T> &refer_to_me,ViewMode _mode)
|
||||||
{
|
{
|
||||||
cpu_ptr = &refer_to_me[0];
|
cpu_ptr = &refer_to_me[0];
|
||||||
size = refer_to_me.size();
|
size = refer_to_me.size();
|
||||||
@ -205,15 +214,26 @@ template<class T> class vecView
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode)
|
template<class T> vecView<T> VectorView(std::vector<T> &vec,ViewMode _mode)
|
||||||
{
|
{
|
||||||
vecView<T> ret(vec,_mode); // does the open
|
vecView<T> ret(vec,_mode); // does the open
|
||||||
return ret; // must be closed
|
return ret; // must be closed
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Little autoscope assister
|
||||||
|
template<class View>
|
||||||
|
class VectorViewCloser
|
||||||
|
{
|
||||||
|
View v; // Take a copy of view and call view close when I go out of scope automatically
|
||||||
|
public:
|
||||||
|
VectorViewCloser(View &_v) : v(_v) {};
|
||||||
|
~VectorViewCloser() { auto ptr = v.cpu_ptr; v.ViewClose(); MemoryManager::NotifyDeletion(ptr);}
|
||||||
|
};
|
||||||
|
|
||||||
#define autoVecView(v_v,v,mode) \
|
#define autoVecView(v_v,v,mode) \
|
||||||
auto v_v = VectorView(v,mode); \
|
auto v_v = VectorView(v,mode); \
|
||||||
ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
|
ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
|
||||||
|
*/
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
@ -15,10 +15,10 @@ void check_huge_pages(void *Buf,uint64_t BYTES)
|
|||||||
uint64_t virt_pfn = (uint64_t)Buf / page_size;
|
uint64_t virt_pfn = (uint64_t)Buf / page_size;
|
||||||
off_t offset = sizeof(uint64_t) * virt_pfn;
|
off_t offset = sizeof(uint64_t) * virt_pfn;
|
||||||
uint64_t npages = (BYTES + page_size-1) / page_size;
|
uint64_t npages = (BYTES + page_size-1) / page_size;
|
||||||
std::vector<uint64_t> pagedata(npages);
|
uint64_t pagedata[npages];
|
||||||
uint64_t ret = lseek(fd, offset, SEEK_SET);
|
uint64_t ret = lseek(fd, offset, SEEK_SET);
|
||||||
assert(ret == offset);
|
assert(ret == offset);
|
||||||
ret = ::read(fd, &pagedata[0], sizeof(uint64_t)*npages);
|
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
|
||||||
assert(ret == sizeof(uint64_t) * npages);
|
assert(ret == sizeof(uint64_t) * npages);
|
||||||
int nhugepages = npages / 512;
|
int nhugepages = npages / 512;
|
||||||
int n4ktotal, nnothuge;
|
int n4ktotal, nnothuge;
|
||||||
|
@ -82,7 +82,6 @@ public:
|
|||||||
bool _isCheckerBoarded;
|
bool _isCheckerBoarded;
|
||||||
int LocallyPeriodic;
|
int LocallyPeriodic;
|
||||||
Coordinate _checker_dim_mask;
|
Coordinate _checker_dim_mask;
|
||||||
int _checker_dim;
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
@ -90,8 +89,9 @@ public:
|
|||||||
// Checkerboarding interface is virtual and overridden by
|
// Checkerboarding interface is virtual and overridden by
|
||||||
// GridCartesian / GridRedBlackCartesian
|
// GridCartesian / GridRedBlackCartesian
|
||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
virtual int CheckerBoarded(int dim) =0;
|
virtual int CheckerBoarded(int dim)=0;
|
||||||
virtual int CheckerBoard(const Coordinate &site)=0;
|
virtual int CheckerBoard(const Coordinate &site)=0;
|
||||||
|
virtual int CheckerDim(void){ return 0; };
|
||||||
virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
|
virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
|
||||||
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
|
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
|
||||||
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0;
|
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0;
|
||||||
|
@ -38,7 +38,7 @@ class GridCartesian: public GridBase {
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
int dummy;
|
int dummy;
|
||||||
// Coordinate _checker_dim_mask;
|
Coordinate _checker_dim_mask;
|
||||||
virtual int CheckerBoardFromOindexTable (int Oindex) {
|
virtual int CheckerBoardFromOindexTable (int Oindex) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@ -46,7 +46,7 @@ public:
|
|||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
virtual int CheckerBoarded(int dim) {
|
virtual int CheckerBoarded(int dim){
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
virtual int CheckerBoard(const Coordinate &site){
|
virtual int CheckerBoard(const Coordinate &site){
|
||||||
@ -106,7 +106,6 @@ public:
|
|||||||
_rdimensions.resize(_ndimension);
|
_rdimensions.resize(_ndimension);
|
||||||
_simd_layout.resize(_ndimension);
|
_simd_layout.resize(_ndimension);
|
||||||
_checker_dim_mask.resize(_ndimension);;
|
_checker_dim_mask.resize(_ndimension);;
|
||||||
_checker_dim = -1;
|
|
||||||
_lstart.resize(_ndimension);
|
_lstart.resize(_ndimension);
|
||||||
_lend.resize(_ndimension);
|
_lend.resize(_ndimension);
|
||||||
|
|
||||||
|
@ -57,10 +57,10 @@ class GridRedBlackCartesian : public GridBase
|
|||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
// Coordinate _checker_dim_mask;
|
// Coordinate _checker_dim_mask;
|
||||||
// int _checker_dim;
|
int _checker_dim;
|
||||||
std::vector<int> _checker_board;
|
std::vector<int> _checker_board;
|
||||||
|
|
||||||
virtual int isCheckerBoarded(void) const { return 1; };
|
virtual int CheckerDim(void){ return _checker_dim; };
|
||||||
virtual int CheckerBoarded(int dim){
|
virtual int CheckerBoarded(int dim){
|
||||||
if( dim==_checker_dim) return 1;
|
if( dim==_checker_dim) return 1;
|
||||||
else return 0;
|
else return 0;
|
||||||
|
@ -51,6 +51,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
|
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
|
||||||
auto Cshift(const Expression &expr,int dim,int shift) -> decltype(closure(expr))
|
auto Cshift(const Expression &expr,int dim,int shift) -> decltype(closure(expr))
|
||||||
{
|
{
|
||||||
|
@ -30,11 +30,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
extern std::vector<std::pair<int,int> > Cshift_table;
|
extern std::vector<std::pair<int,int> > Cshift_table;
|
||||||
extern deviceVector<std::pair<int,int> > Cshift_table_device;
|
extern commVector<std::pair<int,int> > Cshift_table_device;
|
||||||
|
|
||||||
inline std::pair<int,int> *MapCshiftTable(void)
|
inline std::pair<int,int> *MapCshiftTable(void)
|
||||||
{
|
{
|
||||||
// GPU version
|
// GPU version
|
||||||
|
#ifdef ACCELERATOR_CSHIFT
|
||||||
uint64_t sz=Cshift_table.size();
|
uint64_t sz=Cshift_table.size();
|
||||||
if (Cshift_table_device.size()!=sz ) {
|
if (Cshift_table_device.size()!=sz ) {
|
||||||
Cshift_table_device.resize(sz);
|
Cshift_table_device.resize(sz);
|
||||||
@ -44,13 +45,16 @@ inline std::pair<int,int> *MapCshiftTable(void)
|
|||||||
sizeof(Cshift_table[0])*sz);
|
sizeof(Cshift_table[0])*sz);
|
||||||
|
|
||||||
return &Cshift_table_device[0];
|
return &Cshift_table_device[0];
|
||||||
|
#else
|
||||||
|
return &Cshift_table[0];
|
||||||
|
#endif
|
||||||
// CPU version use identify map
|
// CPU version use identify map
|
||||||
}
|
}
|
||||||
///////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////
|
||||||
// Gather for when there is no need to SIMD split
|
// Gather for when there is no need to SIMD split
|
||||||
///////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////
|
||||||
template<class vobj> void
|
template<class vobj> void
|
||||||
Gather_plane_simple (const Lattice<vobj> &rhs,deviceVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
|
Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
|
||||||
{
|
{
|
||||||
int rd = rhs.Grid()->_rdimensions[dimension];
|
int rd = rhs.Grid()->_rdimensions[dimension];
|
||||||
|
|
||||||
@ -90,10 +94,17 @@ Gather_plane_simple (const Lattice<vobj> &rhs,deviceVector<vobj> &buffer,int dim
|
|||||||
{
|
{
|
||||||
auto buffer_p = & buffer[0];
|
auto buffer_p = & buffer[0];
|
||||||
auto table = MapCshiftTable();
|
auto table = MapCshiftTable();
|
||||||
|
#ifdef ACCELERATOR_CSHIFT
|
||||||
autoView(rhs_v , rhs, AcceleratorRead);
|
autoView(rhs_v , rhs, AcceleratorRead);
|
||||||
accelerator_for(i,ent,vobj::Nsimd(),{
|
accelerator_for(i,ent,vobj::Nsimd(),{
|
||||||
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
|
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
|
||||||
});
|
});
|
||||||
|
#else
|
||||||
|
autoView(rhs_v , rhs, CpuRead);
|
||||||
|
thread_for(i,ent,{
|
||||||
|
buffer_p[table[i].first]=rhs_v[table[i].second];
|
||||||
|
});
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -118,6 +129,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
|
|||||||
int n1=rhs.Grid()->_slice_stride[dimension];
|
int n1=rhs.Grid()->_slice_stride[dimension];
|
||||||
|
|
||||||
if ( cbmask ==0x3){
|
if ( cbmask ==0x3){
|
||||||
|
#ifdef ACCELERATOR_CSHIFT
|
||||||
autoView(rhs_v , rhs, AcceleratorRead);
|
autoView(rhs_v , rhs, AcceleratorRead);
|
||||||
accelerator_for(nn,e1*e2,1,{
|
accelerator_for(nn,e1*e2,1,{
|
||||||
int n = nn%e1;
|
int n = nn%e1;
|
||||||
@ -128,10 +140,21 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
|
|||||||
vobj temp =rhs_v[so+o+b];
|
vobj temp =rhs_v[so+o+b];
|
||||||
extract<vobj>(temp,pointers,offset);
|
extract<vobj>(temp,pointers,offset);
|
||||||
});
|
});
|
||||||
|
#else
|
||||||
|
autoView(rhs_v , rhs, CpuRead);
|
||||||
|
thread_for2d(n,e1,b,e2,{
|
||||||
|
int o = n*n1;
|
||||||
|
int offset = b+n*e2;
|
||||||
|
|
||||||
|
vobj temp =rhs_v[so+o+b];
|
||||||
|
extract<vobj>(temp,pointers,offset);
|
||||||
|
});
|
||||||
|
#endif
|
||||||
} else {
|
} else {
|
||||||
Coordinate rdim=rhs.Grid()->_rdimensions;
|
Coordinate rdim=rhs.Grid()->_rdimensions;
|
||||||
Coordinate cdm =rhs.Grid()->_checker_dim_mask;
|
Coordinate cdm =rhs.Grid()->_checker_dim_mask;
|
||||||
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
|
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
|
||||||
|
#ifdef ACCELERATOR_CSHIFT
|
||||||
autoView(rhs_v , rhs, AcceleratorRead);
|
autoView(rhs_v , rhs, AcceleratorRead);
|
||||||
accelerator_for(nn,e1*e2,1,{
|
accelerator_for(nn,e1*e2,1,{
|
||||||
int n = nn%e1;
|
int n = nn%e1;
|
||||||
@ -152,13 +175,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
|
|||||||
extract<vobj>(temp,pointers,offset);
|
extract<vobj>(temp,pointers,offset);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
#else
|
||||||
|
autoView(rhs_v , rhs, CpuRead);
|
||||||
|
thread_for2d(n,e1,b,e2,{
|
||||||
|
|
||||||
|
Coordinate coor;
|
||||||
|
|
||||||
|
int o=n*n1;
|
||||||
|
int oindex = o+b;
|
||||||
|
|
||||||
|
int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
|
||||||
|
|
||||||
|
int ocb=1<<cb;
|
||||||
|
int offset = b+n*e2;
|
||||||
|
|
||||||
|
if ( ocb & cbmask ) {
|
||||||
|
vobj temp =rhs_v[so+o+b];
|
||||||
|
extract<vobj>(temp,pointers,offset);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
// Scatter for when there is no need to SIMD split
|
// Scatter for when there is no need to SIMD split
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<vobj> &buffer, int dimension,int plane,int cbmask)
|
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask)
|
||||||
{
|
{
|
||||||
int rd = rhs.Grid()->_rdimensions[dimension];
|
int rd = rhs.Grid()->_rdimensions[dimension];
|
||||||
|
|
||||||
@ -202,10 +245,17 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<
|
|||||||
{
|
{
|
||||||
auto buffer_p = & buffer[0];
|
auto buffer_p = & buffer[0];
|
||||||
auto table = MapCshiftTable();
|
auto table = MapCshiftTable();
|
||||||
|
#ifdef ACCELERATOR_CSHIFT
|
||||||
autoView( rhs_v, rhs, AcceleratorWrite);
|
autoView( rhs_v, rhs, AcceleratorWrite);
|
||||||
accelerator_for(i,ent,vobj::Nsimd(),{
|
accelerator_for(i,ent,vobj::Nsimd(),{
|
||||||
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
|
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
|
||||||
});
|
});
|
||||||
|
#else
|
||||||
|
autoView( rhs_v, rhs, CpuWrite);
|
||||||
|
thread_for(i,ent,{
|
||||||
|
rhs_v[table[i].first]=buffer_p[table[i].second];
|
||||||
|
});
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -228,6 +278,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
|
|||||||
if(cbmask ==0x3 ) {
|
if(cbmask ==0x3 ) {
|
||||||
int _slice_stride = rhs.Grid()->_slice_stride[dimension];
|
int _slice_stride = rhs.Grid()->_slice_stride[dimension];
|
||||||
int _slice_block = rhs.Grid()->_slice_block[dimension];
|
int _slice_block = rhs.Grid()->_slice_block[dimension];
|
||||||
|
#ifdef ACCELERATOR_CSHIFT
|
||||||
autoView( rhs_v , rhs, AcceleratorWrite);
|
autoView( rhs_v , rhs, AcceleratorWrite);
|
||||||
accelerator_for(nn,e1*e2,1,{
|
accelerator_for(nn,e1*e2,1,{
|
||||||
int n = nn%e1;
|
int n = nn%e1;
|
||||||
@ -236,6 +287,14 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
|
|||||||
int offset = b+n*_slice_block;
|
int offset = b+n*_slice_block;
|
||||||
merge(rhs_v[so+o+b],pointers,offset);
|
merge(rhs_v[so+o+b],pointers,offset);
|
||||||
});
|
});
|
||||||
|
#else
|
||||||
|
autoView( rhs_v , rhs, CpuWrite);
|
||||||
|
thread_for2d(n,e1,b,e2,{
|
||||||
|
int o = n*_slice_stride;
|
||||||
|
int offset = b+n*_slice_block;
|
||||||
|
merge(rhs_v[so+o+b],pointers,offset);
|
||||||
|
});
|
||||||
|
#endif
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
// Case of SIMD split AND checker dim cannot currently be hit, except in
|
// Case of SIMD split AND checker dim cannot currently be hit, except in
|
||||||
@ -301,11 +360,19 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
|
|||||||
|
|
||||||
{
|
{
|
||||||
auto table = MapCshiftTable();
|
auto table = MapCshiftTable();
|
||||||
|
#ifdef ACCELERATOR_CSHIFT
|
||||||
autoView(rhs_v , rhs, AcceleratorRead);
|
autoView(rhs_v , rhs, AcceleratorRead);
|
||||||
autoView(lhs_v , lhs, AcceleratorWrite);
|
autoView(lhs_v , lhs, AcceleratorWrite);
|
||||||
accelerator_for(i,ent,vobj::Nsimd(),{
|
accelerator_for(i,ent,vobj::Nsimd(),{
|
||||||
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
|
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
|
||||||
});
|
});
|
||||||
|
#else
|
||||||
|
autoView(rhs_v , rhs, CpuRead);
|
||||||
|
autoView(lhs_v , lhs, CpuWrite);
|
||||||
|
thread_for(i,ent,{
|
||||||
|
lhs_v[table[i].first]=rhs_v[table[i].second];
|
||||||
|
});
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -345,11 +412,19 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
|
|||||||
|
|
||||||
{
|
{
|
||||||
auto table = MapCshiftTable();
|
auto table = MapCshiftTable();
|
||||||
|
#ifdef ACCELERATOR_CSHIFT
|
||||||
autoView( rhs_v, rhs, AcceleratorRead);
|
autoView( rhs_v, rhs, AcceleratorRead);
|
||||||
autoView( lhs_v, lhs, AcceleratorWrite);
|
autoView( lhs_v, lhs, AcceleratorWrite);
|
||||||
accelerator_for(i,ent,1,{
|
accelerator_for(i,ent,1,{
|
||||||
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
|
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
|
||||||
});
|
});
|
||||||
|
#else
|
||||||
|
autoView( rhs_v, rhs, CpuRead);
|
||||||
|
autoView( lhs_v, lhs, CpuWrite);
|
||||||
|
thread_for(i,ent,{
|
||||||
|
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
|
||||||
|
});
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -55,13 +55,13 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
|
|||||||
RealD t1,t0;
|
RealD t1,t0;
|
||||||
t0=usecond();
|
t0=usecond();
|
||||||
if ( !comm_dim ) {
|
if ( !comm_dim ) {
|
||||||
// std::cout << "CSHIFT: Cshift_local" <<std::endl;
|
//std::cout << "CSHIFT: Cshift_local" <<std::endl;
|
||||||
Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
|
Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
|
||||||
} else if ( splice_dim ) {
|
} else if ( splice_dim ) {
|
||||||
// std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
|
//std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
|
||||||
Cshift_comms_simd(ret,rhs,dimension,shift);
|
Cshift_comms_simd(ret,rhs,dimension,shift);
|
||||||
} else {
|
} else {
|
||||||
// std::cout << "CSHIFT: Cshift_comms" <<std::endl;
|
//std::cout << "CSHIFT: Cshift_comms" <<std::endl;
|
||||||
Cshift_comms(ret,rhs,dimension,shift);
|
Cshift_comms(ret,rhs,dimension,shift);
|
||||||
}
|
}
|
||||||
t1=usecond();
|
t1=usecond();
|
||||||
@ -94,16 +94,18 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
|
|||||||
sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
|
sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
|
||||||
sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
|
sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
|
||||||
|
|
||||||
// std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
|
//std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
|
||||||
if ( sshift[0] == sshift[1] ) {
|
if ( sshift[0] == sshift[1] ) {
|
||||||
// std::cout << "Single pass Cshift_comms" <<std::endl;
|
//std::cout << "Single pass Cshift_comms" <<std::endl;
|
||||||
Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
|
Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
|
||||||
} else {
|
} else {
|
||||||
// std::cout << "Two pass Cshift_comms" <<std::endl;
|
//std::cout << "Two pass Cshift_comms" <<std::endl;
|
||||||
Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
||||||
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
|
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#define ACCELERATOR_CSHIFT_NO_COPY
|
||||||
|
#ifdef ACCELERATOR_CSHIFT_NO_COPY
|
||||||
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||||
{
|
{
|
||||||
typedef typename vobj::vector_type vector_type;
|
typedef typename vobj::vector_type vector_type;
|
||||||
@ -123,8 +125,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
|||||||
assert(shift<fd);
|
assert(shift<fd);
|
||||||
|
|
||||||
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
|
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
|
||||||
static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
|
static cshiftVector<vobj> send_buf; send_buf.resize(buffer_size);
|
||||||
static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
|
static cshiftVector<vobj> recv_buf; recv_buf.resize(buffer_size);
|
||||||
|
|
||||||
int cb= (cbmask==0x2)? Odd : Even;
|
int cb= (cbmask==0x2)? Odd : Even;
|
||||||
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
||||||
@ -159,7 +161,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
|||||||
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
tcomms-=usecond();
|
tcomms-=usecond();
|
||||||
grid->Barrier();
|
// grid->Barrier();
|
||||||
|
|
||||||
grid->SendToRecvFrom((void *)&send_buf[0],
|
grid->SendToRecvFrom((void *)&send_buf[0],
|
||||||
xmit_to_rank,
|
xmit_to_rank,
|
||||||
@ -167,7 +169,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
|||||||
recv_from_rank,
|
recv_from_rank,
|
||||||
bytes);
|
bytes);
|
||||||
xbytes+=bytes;
|
xbytes+=bytes;
|
||||||
grid->Barrier();
|
// grid->Barrier();
|
||||||
tcomms+=usecond();
|
tcomms+=usecond();
|
||||||
|
|
||||||
tscatter-=usecond();
|
tscatter-=usecond();
|
||||||
@ -175,11 +177,13 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
|||||||
tscatter+=usecond();
|
tscatter+=usecond();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
/*
|
||||||
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
|
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||||
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
|
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||||
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||||
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
|
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||||
@ -197,7 +201,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
int simd_layout = grid->_simd_layout[dimension];
|
int simd_layout = grid->_simd_layout[dimension];
|
||||||
int comm_dim = grid->_processors[dimension] >1 ;
|
int comm_dim = grid->_processors[dimension] >1 ;
|
||||||
|
|
||||||
// std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
|
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
|
||||||
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
|
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
|
||||||
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
|
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
|
||||||
|
|
||||||
@ -220,8 +224,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
|
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
|
||||||
// int words = sizeof(vobj)/sizeof(vector_type);
|
// int words = sizeof(vobj)/sizeof(vector_type);
|
||||||
|
|
||||||
static std::vector<deviceVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
|
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
|
||||||
static std::vector<deviceVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
|
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
|
||||||
scalar_object * recv_buf_extract_mpi;
|
scalar_object * recv_buf_extract_mpi;
|
||||||
scalar_object * send_buf_extract_mpi;
|
scalar_object * send_buf_extract_mpi;
|
||||||
|
|
||||||
@ -277,7 +281,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
tcomms-=usecond();
|
tcomms-=usecond();
|
||||||
grid->Barrier();
|
// grid->Barrier();
|
||||||
|
|
||||||
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
|
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
|
||||||
recv_buf_extract_mpi = &recv_buf_extract[i][0];
|
recv_buf_extract_mpi = &recv_buf_extract[i][0];
|
||||||
@ -288,7 +292,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
bytes);
|
bytes);
|
||||||
|
|
||||||
xbytes+=bytes;
|
xbytes+=bytes;
|
||||||
grid->Barrier();
|
// grid->Barrier();
|
||||||
tcomms+=usecond();
|
tcomms+=usecond();
|
||||||
|
|
||||||
rpointers[i] = &recv_buf_extract[i][0];
|
rpointers[i] = &recv_buf_extract[i][0];
|
||||||
@ -301,12 +305,242 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
|||||||
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
|
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
|
||||||
tscatter+=usecond();
|
tscatter+=usecond();
|
||||||
}
|
}
|
||||||
|
/*
|
||||||
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
|
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||||
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
|
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||||
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||||
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
|
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||||
|
{
|
||||||
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
|
|
||||||
|
GridBase *grid=rhs.Grid();
|
||||||
|
Lattice<vobj> temp(rhs.Grid());
|
||||||
|
|
||||||
|
int fd = rhs.Grid()->_fdimensions[dimension];
|
||||||
|
int rd = rhs.Grid()->_rdimensions[dimension];
|
||||||
|
int pd = rhs.Grid()->_processors[dimension];
|
||||||
|
int simd_layout = rhs.Grid()->_simd_layout[dimension];
|
||||||
|
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
|
||||||
|
assert(simd_layout==1);
|
||||||
|
assert(comm_dim==1);
|
||||||
|
assert(shift>=0);
|
||||||
|
assert(shift<fd);
|
||||||
|
RealD tcopy=0.0;
|
||||||
|
RealD tgather=0.0;
|
||||||
|
RealD tscatter=0.0;
|
||||||
|
RealD tcomms=0.0;
|
||||||
|
uint64_t xbytes=0;
|
||||||
|
|
||||||
|
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
|
||||||
|
static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
|
||||||
|
static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
|
||||||
|
vobj *send_buf;
|
||||||
|
vobj *recv_buf;
|
||||||
|
{
|
||||||
|
grid->ShmBufferFreeAll();
|
||||||
|
size_t bytes = buffer_size*sizeof(vobj);
|
||||||
|
send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
|
||||||
|
recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
int cb= (cbmask==0x2)? Odd : Even;
|
||||||
|
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
||||||
|
|
||||||
|
for(int x=0;x<rd;x++){
|
||||||
|
|
||||||
|
int sx = (x+sshift)%rd;
|
||||||
|
int comm_proc = ((x+sshift)/rd)%pd;
|
||||||
|
|
||||||
|
if (comm_proc==0) {
|
||||||
|
|
||||||
|
tcopy-=usecond();
|
||||||
|
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
|
||||||
|
tcopy+=usecond();
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
|
int words = buffer_size;
|
||||||
|
if (cbmask != 0x3) words=words>>1;
|
||||||
|
|
||||||
|
int bytes = words * sizeof(vobj);
|
||||||
|
|
||||||
|
tgather-=usecond();
|
||||||
|
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
|
||||||
|
tgather+=usecond();
|
||||||
|
|
||||||
|
// int rank = grid->_processor;
|
||||||
|
int recv_from_rank;
|
||||||
|
int xmit_to_rank;
|
||||||
|
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
|
|
||||||
|
tcomms-=usecond();
|
||||||
|
// grid->Barrier();
|
||||||
|
|
||||||
|
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
|
||||||
|
grid->SendToRecvFrom((void *)&send_buf[0],
|
||||||
|
xmit_to_rank,
|
||||||
|
(void *)&recv_buf[0],
|
||||||
|
recv_from_rank,
|
||||||
|
bytes);
|
||||||
|
xbytes+=bytes;
|
||||||
|
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
|
||||||
|
|
||||||
|
// grid->Barrier();
|
||||||
|
tcomms+=usecond();
|
||||||
|
|
||||||
|
tscatter-=usecond();
|
||||||
|
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
|
||||||
|
tscatter+=usecond();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
||||||
|
*/
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||||
|
{
|
||||||
|
GridBase *grid=rhs.Grid();
|
||||||
|
const int Nsimd = grid->Nsimd();
|
||||||
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
typedef typename vobj::scalar_object scalar_object;
|
||||||
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
|
|
||||||
|
int fd = grid->_fdimensions[dimension];
|
||||||
|
int rd = grid->_rdimensions[dimension];
|
||||||
|
int ld = grid->_ldimensions[dimension];
|
||||||
|
int pd = grid->_processors[dimension];
|
||||||
|
int simd_layout = grid->_simd_layout[dimension];
|
||||||
|
int comm_dim = grid->_processors[dimension] >1 ;
|
||||||
|
|
||||||
|
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
|
||||||
|
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
|
||||||
|
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
|
||||||
|
|
||||||
|
assert(comm_dim==1);
|
||||||
|
assert(simd_layout==2);
|
||||||
|
assert(shift>=0);
|
||||||
|
assert(shift<fd);
|
||||||
|
RealD tcopy=0.0;
|
||||||
|
RealD tgather=0.0;
|
||||||
|
RealD tscatter=0.0;
|
||||||
|
RealD tcomms=0.0;
|
||||||
|
uint64_t xbytes=0;
|
||||||
|
|
||||||
|
int permute_type=grid->PermuteType(dimension);
|
||||||
|
|
||||||
|
///////////////////////////////////////////////
|
||||||
|
// Simd direction uses an extract/merge pair
|
||||||
|
///////////////////////////////////////////////
|
||||||
|
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
|
||||||
|
// int words = sizeof(vobj)/sizeof(vector_type);
|
||||||
|
|
||||||
|
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
|
||||||
|
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
|
||||||
|
scalar_object * recv_buf_extract_mpi;
|
||||||
|
scalar_object * send_buf_extract_mpi;
|
||||||
|
{
|
||||||
|
size_t bytes = sizeof(scalar_object)*buffer_size;
|
||||||
|
grid->ShmBufferFreeAll();
|
||||||
|
send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
|
||||||
|
recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
|
||||||
|
}
|
||||||
|
for(int s=0;s<Nsimd;s++){
|
||||||
|
send_buf_extract[s].resize(buffer_size);
|
||||||
|
recv_buf_extract[s].resize(buffer_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
int bytes = buffer_size*sizeof(scalar_object);
|
||||||
|
|
||||||
|
ExtractPointerArray<scalar_object> pointers(Nsimd); //
|
||||||
|
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
|
||||||
|
|
||||||
|
///////////////////////////////////////////
|
||||||
|
// Work out what to send where
|
||||||
|
///////////////////////////////////////////
|
||||||
|
int cb = (cbmask==0x2)? Odd : Even;
|
||||||
|
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
||||||
|
|
||||||
|
// loop over outer coord planes orthog to dim
|
||||||
|
for(int x=0;x<rd;x++){
|
||||||
|
|
||||||
|
// FIXME call local permute copy if none are offnode.
|
||||||
|
for(int i=0;i<Nsimd;i++){
|
||||||
|
pointers[i] = &send_buf_extract[i][0];
|
||||||
|
}
|
||||||
|
tgather-=usecond();
|
||||||
|
int sx = (x+sshift)%rd;
|
||||||
|
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
|
||||||
|
tgather+=usecond();
|
||||||
|
|
||||||
|
for(int i=0;i<Nsimd;i++){
|
||||||
|
|
||||||
|
int inner_bit = (Nsimd>>(permute_type+1));
|
||||||
|
int ic= (i&inner_bit)? 1:0;
|
||||||
|
|
||||||
|
int my_coor = rd*ic + x;
|
||||||
|
int nbr_coor = my_coor+sshift;
|
||||||
|
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
|
||||||
|
|
||||||
|
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
|
||||||
|
int nbr_ox = (nbr_coor%rd); // outer coord of peer
|
||||||
|
int nbr_lane = (i&(~inner_bit));
|
||||||
|
|
||||||
|
int recv_from_rank;
|
||||||
|
int xmit_to_rank;
|
||||||
|
|
||||||
|
if (nbr_ic) nbr_lane|=inner_bit;
|
||||||
|
|
||||||
|
assert (sx == nbr_ox);
|
||||||
|
|
||||||
|
if(nbr_proc){
|
||||||
|
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
|
tcomms-=usecond();
|
||||||
|
// grid->Barrier();
|
||||||
|
|
||||||
|
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
|
||||||
|
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
|
||||||
|
xmit_to_rank,
|
||||||
|
(void *)recv_buf_extract_mpi,
|
||||||
|
recv_from_rank,
|
||||||
|
bytes);
|
||||||
|
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
|
||||||
|
xbytes+=bytes;
|
||||||
|
|
||||||
|
// grid->Barrier();
|
||||||
|
tcomms+=usecond();
|
||||||
|
rpointers[i] = &recv_buf_extract[i][0];
|
||||||
|
} else {
|
||||||
|
rpointers[i] = &send_buf_extract[nbr_lane][0];
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
tscatter-=usecond();
|
||||||
|
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
|
||||||
|
tscatter+=usecond();
|
||||||
|
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||||
|
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
|
||||||
|
*/
|
||||||
|
}
|
||||||
|
#endif
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
#include <Grid/GridCore.h>
|
#include <Grid/GridCore.h>
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
std::vector<std::pair<int,int> > Cshift_table;
|
std::vector<std::pair<int,int> > Cshift_table;
|
||||||
deviceVector<std::pair<int,int> > Cshift_table_device;
|
commVector<std::pair<int,int> > Cshift_table_device;
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
@ -236,20 +236,17 @@ public:
|
|||||||
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
|
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
|
||||||
vobj vtmp;
|
vobj vtmp;
|
||||||
vtmp = r;
|
vtmp = r;
|
||||||
#if 0
|
#if 1
|
||||||
deviceVector<vobj> vvtmp(1);
|
|
||||||
acceleratorPut(vvtmp[0],vtmp);
|
|
||||||
vobj *vvtmp_p = & vvtmp[0];
|
|
||||||
auto me = View(AcceleratorWrite);
|
|
||||||
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
|
||||||
auto stmp=coalescedRead(*vvtmp_p);
|
|
||||||
coalescedWrite(me[ss],stmp);
|
|
||||||
});
|
|
||||||
#else
|
|
||||||
auto me = View(CpuWrite);
|
auto me = View(CpuWrite);
|
||||||
thread_for(ss,me.size(),{
|
thread_for(ss,me.size(),{
|
||||||
me[ss]= r;
|
me[ss]= r;
|
||||||
});
|
});
|
||||||
|
#else
|
||||||
|
auto me = View(AcceleratorWrite);
|
||||||
|
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
||||||
|
auto stmp=coalescedRead(vtmp);
|
||||||
|
coalescedWrite(me[ss],stmp);
|
||||||
|
});
|
||||||
#endif
|
#endif
|
||||||
me.ViewClose();
|
me.ViewClose();
|
||||||
return *this;
|
return *this;
|
||||||
|
@ -53,19 +53,36 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
|
|||||||
typedef decltype(basis[0]) Field;
|
typedef decltype(basis[0]) Field;
|
||||||
typedef decltype(basis[0].View(AcceleratorRead)) View;
|
typedef decltype(basis[0].View(AcceleratorRead)) View;
|
||||||
|
|
||||||
hostVector<View> h_basis_v(basis.size());
|
Vector<View> basis_v; basis_v.reserve(basis.size());
|
||||||
deviceVector<View> d_basis_v(basis.size());
|
typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
|
||||||
typedef typename std::remove_reference<decltype(h_basis_v[0][0])>::type vobj;
|
|
||||||
typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
|
typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
|
||||||
|
|
||||||
GridBase* grid = basis[0].Grid();
|
GridBase* grid = basis[0].Grid();
|
||||||
|
|
||||||
for(int k=0;k<basis.size();k++){
|
for(int k=0;k<basis.size();k++){
|
||||||
h_basis_v[k] = basis[k].View(AcceleratorWrite);
|
basis_v.push_back(basis[k].View(AcceleratorWrite));
|
||||||
acceleratorPut(d_basis_v[k],h_basis_v[k]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
View *basis_vp = &d_basis_v[0];
|
#if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) )
|
||||||
|
int max_threads = thread_max();
|
||||||
|
Vector < vobj > Bt(Nm * max_threads);
|
||||||
|
thread_region
|
||||||
|
{
|
||||||
|
vobj* B = &Bt[Nm * thread_num()];
|
||||||
|
thread_for_in_region(ss, grid->oSites(),{
|
||||||
|
for(int j=j0; j<j1; ++j) B[j]=0.;
|
||||||
|
|
||||||
|
for(int j=j0; j<j1; ++j){
|
||||||
|
for(int k=k0; k<k1; ++k){
|
||||||
|
B[j] +=Qt(j,k) * basis_v[k][ss];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for(int j=j0; j<j1; ++j){
|
||||||
|
basis_v[j][ss] = B[j];
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
View *basis_vp = &basis_v[0];
|
||||||
|
|
||||||
int nrot = j1-j0;
|
int nrot = j1-j0;
|
||||||
if (!nrot) // edge case not handled gracefully by Cuda
|
if (!nrot) // edge case not handled gracefully by Cuda
|
||||||
@ -74,19 +91,17 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
|
|||||||
uint64_t oSites =grid->oSites();
|
uint64_t oSites =grid->oSites();
|
||||||
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
|
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
|
||||||
|
|
||||||
deviceVector <vobj> Bt(siteBlock * nrot);
|
Vector <vobj> Bt(siteBlock * nrot);
|
||||||
auto Bp=&Bt[0];
|
auto Bp=&Bt[0];
|
||||||
|
|
||||||
// GPU readable copy of matrix
|
// GPU readable copy of matrix
|
||||||
hostVector<Coeff_t> h_Qt_jv(Nm*Nm);
|
Vector<Coeff_t> Qt_jv(Nm*Nm);
|
||||||
deviceVector<Coeff_t> Qt_jv(Nm*Nm);
|
|
||||||
Coeff_t *Qt_p = & Qt_jv[0];
|
Coeff_t *Qt_p = & Qt_jv[0];
|
||||||
thread_for(i,Nm*Nm,{
|
thread_for(i,Nm*Nm,{
|
||||||
int j = i/Nm;
|
int j = i/Nm;
|
||||||
int k = i%Nm;
|
int k = i%Nm;
|
||||||
h_Qt_jv[i]=Qt(j,k);
|
Qt_p[i]=Qt(j,k);
|
||||||
});
|
});
|
||||||
acceleratorCopyToDevice(&h_Qt_jv[0],Qt_p,Nm*Nm*sizeof(Coeff_t));
|
|
||||||
|
|
||||||
// Block the loop to keep storage footprint down
|
// Block the loop to keep storage footprint down
|
||||||
for(uint64_t s=0;s<oSites;s+=siteBlock){
|
for(uint64_t s=0;s<oSites;s+=siteBlock){
|
||||||
@ -122,8 +137,9 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
|
|||||||
coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
|
coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
|
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Extract a single rotated vector
|
// Extract a single rotated vector
|
||||||
@ -136,19 +152,16 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
|
|||||||
|
|
||||||
result.Checkerboard() = basis[0].Checkerboard();
|
result.Checkerboard() = basis[0].Checkerboard();
|
||||||
|
|
||||||
hostVector<View> h_basis_v(basis.size());
|
Vector<View> basis_v; basis_v.reserve(basis.size());
|
||||||
deviceVector<View> d_basis_v(basis.size());
|
|
||||||
for(int k=0;k<basis.size();k++){
|
for(int k=0;k<basis.size();k++){
|
||||||
h_basis_v[k]=basis[k].View(AcceleratorRead);
|
basis_v.push_back(basis[k].View(AcceleratorRead));
|
||||||
acceleratorPut(d_basis_v[k],h_basis_v[k]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
vobj zz=Zero();
|
vobj zz=Zero();
|
||||||
deviceVector<double> Qt_jv(Nm);
|
Vector<double> Qt_jv(Nm);
|
||||||
double * Qt_j = & Qt_jv[0];
|
double * Qt_j = & Qt_jv[0];
|
||||||
for(int k=0;k<Nm;++k) acceleratorPut(Qt_j[k],Qt(j,k));
|
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
|
||||||
|
|
||||||
auto basis_vp=& d_basis_v[0];
|
auto basis_vp=& basis_v[0];
|
||||||
autoView(result_v,result,AcceleratorWrite);
|
autoView(result_v,result,AcceleratorWrite);
|
||||||
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
|
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
|
||||||
vobj zzz=Zero();
|
vobj zzz=Zero();
|
||||||
@ -158,7 +171,7 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
|
|||||||
}
|
}
|
||||||
coalescedWrite(result_v[ss], B);
|
coalescedWrite(result_v[ss], B);
|
||||||
});
|
});
|
||||||
for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
|
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Field>
|
template<class Field>
|
||||||
|
@ -165,7 +165,7 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
|
|||||||
|
|
||||||
int Nsimd = grid->Nsimd();
|
int Nsimd = grid->Nsimd();
|
||||||
|
|
||||||
// assert( l.Checkerboard()== grid->CheckerBoard(site));
|
assert( l.Checkerboard()== grid->CheckerBoard(site));
|
||||||
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
|
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
|
||||||
|
|
||||||
static const int words=sizeof(vobj)/sizeof(vector_type);
|
static const int words=sizeof(vobj)/sizeof(vector_type);
|
||||||
@ -179,7 +179,7 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
|
|||||||
for(int w=0;w<words;w++){
|
for(int w=0;w<words;w++){
|
||||||
pt[w] = getlane(vp[w],idx);
|
pt[w] = getlane(vp[w],idx);
|
||||||
}
|
}
|
||||||
// std::cout << "peekLocalSite "<<site<<" "<<odx<<","<<idx<<" "<<s<<std::endl;
|
|
||||||
return;
|
return;
|
||||||
};
|
};
|
||||||
template<class vobj,class sobj>
|
template<class vobj,class sobj>
|
||||||
@ -202,7 +202,7 @@ inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
|
|||||||
|
|
||||||
int Nsimd = grid->Nsimd();
|
int Nsimd = grid->Nsimd();
|
||||||
|
|
||||||
// assert( l.Checkerboard()== grid->CheckerBoard(site));
|
assert( l.Checkerboard()== grid->CheckerBoard(site));
|
||||||
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
|
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
|
||||||
|
|
||||||
static const int words=sizeof(vobj)/sizeof(vector_type);
|
static const int words=sizeof(vobj)/sizeof(vector_type);
|
||||||
|
@ -46,7 +46,7 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
|
|||||||
// const int Nsimd = vobj::Nsimd();
|
// const int Nsimd = vobj::Nsimd();
|
||||||
const int nthread = GridThread::GetThreads();
|
const int nthread = GridThread::GetThreads();
|
||||||
|
|
||||||
std::vector<sobj> sumarray(nthread);
|
Vector<sobj> sumarray(nthread);
|
||||||
for(int i=0;i<nthread;i++){
|
for(int i=0;i<nthread;i++){
|
||||||
sumarray[i]=Zero();
|
sumarray[i]=Zero();
|
||||||
}
|
}
|
||||||
@ -75,7 +75,7 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
|
|||||||
|
|
||||||
const int nthread = GridThread::GetThreads();
|
const int nthread = GridThread::GetThreads();
|
||||||
|
|
||||||
std::vector<sobj> sumarray(nthread);
|
Vector<sobj> sumarray(nthread);
|
||||||
for(int i=0;i<nthread;i++){
|
for(int i=0;i<nthread;i++){
|
||||||
sumarray[i]=Zero();
|
sumarray[i]=Zero();
|
||||||
}
|
}
|
||||||
@ -343,6 +343,18 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
|
|||||||
autoView( x_v, x, AcceleratorRead);
|
autoView( x_v, x, AcceleratorRead);
|
||||||
autoView( y_v, y, AcceleratorRead);
|
autoView( y_v, y, AcceleratorRead);
|
||||||
autoView( z_v, z, AcceleratorWrite);
|
autoView( z_v, z, AcceleratorWrite);
|
||||||
|
#if 0
|
||||||
|
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
|
||||||
|
Vector<inner_t> inner_tmp(sites);
|
||||||
|
auto inner_tmp_v = &inner_tmp[0];
|
||||||
|
|
||||||
|
accelerator_for( ss, sites, nsimd,{
|
||||||
|
auto tmp = a*x_v(ss)+b*y_v(ss);
|
||||||
|
coalescedWrite(inner_tmp_v[ss],innerProductD(tmp,tmp));
|
||||||
|
coalescedWrite(z_v[ss],tmp);
|
||||||
|
});
|
||||||
|
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
|
||||||
|
#else
|
||||||
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
|
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
|
||||||
deviceVector<inner_t> inner_tmp;
|
deviceVector<inner_t> inner_tmp;
|
||||||
inner_tmp.resize(sites);
|
inner_tmp.resize(sites);
|
||||||
@ -354,6 +366,7 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
|
|||||||
coalescedWrite(z_v[ss],tmp);
|
coalescedWrite(z_v[ss],tmp);
|
||||||
});
|
});
|
||||||
nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
|
nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
|
||||||
|
#endif
|
||||||
grid->GlobalSum(nrm);
|
grid->GlobalSum(nrm);
|
||||||
return nrm;
|
return nrm;
|
||||||
}
|
}
|
||||||
@ -364,7 +377,7 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
|
|||||||
conformable(left,right);
|
conformable(left,right);
|
||||||
|
|
||||||
typedef typename vobj::vector_typeD vector_type;
|
typedef typename vobj::vector_typeD vector_type;
|
||||||
std::vector<ComplexD> tmp(2);
|
Vector<ComplexD> tmp(2);
|
||||||
|
|
||||||
GridBase *grid = left.Grid();
|
GridBase *grid = left.Grid();
|
||||||
|
|
||||||
@ -374,8 +387,8 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
|
|||||||
// GPU
|
// GPU
|
||||||
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
|
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
|
||||||
typedef decltype(innerProductD(vobj(),vobj())) norm_t;
|
typedef decltype(innerProductD(vobj(),vobj())) norm_t;
|
||||||
deviceVector<inner_t> inner_tmp(sites);
|
Vector<inner_t> inner_tmp(sites);
|
||||||
deviceVector<norm_t> norm_tmp(sites);
|
Vector<norm_t> norm_tmp(sites);
|
||||||
auto inner_tmp_v = &inner_tmp[0];
|
auto inner_tmp_v = &inner_tmp[0];
|
||||||
auto norm_tmp_v = &norm_tmp[0];
|
auto norm_tmp_v = &norm_tmp[0];
|
||||||
{
|
{
|
||||||
@ -425,9 +438,7 @@ inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
|
|||||||
// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
|
// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
|
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
|
||||||
std::vector<typename vobj::scalar_object> &result,
|
|
||||||
int orthogdim)
|
|
||||||
{
|
{
|
||||||
///////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////
|
||||||
// FIXME precision promoted summation
|
// FIXME precision promoted summation
|
||||||
@ -449,8 +460,8 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
|
|||||||
int ld=grid->_ldimensions[orthogdim];
|
int ld=grid->_ldimensions[orthogdim];
|
||||||
int rd=grid->_rdimensions[orthogdim];
|
int rd=grid->_rdimensions[orthogdim];
|
||||||
|
|
||||||
std::vector<vobj> lvSum(rd); // will locally sum vectors first
|
Vector<vobj> lvSum(rd); // will locally sum vectors first
|
||||||
std::vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
|
Vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
|
||||||
ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD
|
ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD
|
||||||
|
|
||||||
result.resize(fd); // And then global sum to return the same vector to every node
|
result.resize(fd); // And then global sum to return the same vector to every node
|
||||||
@ -508,20 +519,7 @@ sliceSum(const Lattice<vobj> &Data,int orthogdim)
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
Reimplement
|
|
||||||
|
|
||||||
1)
|
|
||||||
template<class vobj>
|
|
||||||
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
|
|
||||||
|
|
||||||
2)
|
|
||||||
template<class vobj>
|
|
||||||
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
|
|
||||||
|
|
||||||
3)
|
|
||||||
-- Make Slice Mul Matrix call sliceMaddMatrix
|
|
||||||
*/
|
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
|
static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
|
||||||
{
|
{
|
||||||
@ -541,8 +539,8 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
|
|||||||
int ld=grid->_ldimensions[orthogdim];
|
int ld=grid->_ldimensions[orthogdim];
|
||||||
int rd=grid->_rdimensions[orthogdim];
|
int rd=grid->_rdimensions[orthogdim];
|
||||||
|
|
||||||
std::vector<vector_type> lvSum(rd); // will locally sum vectors first
|
Vector<vector_type> lvSum(rd); // will locally sum vectors first
|
||||||
std::vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
|
Vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
|
||||||
ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
|
ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
|
||||||
|
|
||||||
result.resize(fd); // And then global sum to return the same vector to every node for IO to file
|
result.resize(fd); // And then global sum to return the same vector to every node for IO to file
|
||||||
@ -672,96 +670,203 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
|
inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
|
||||||
{
|
{
|
||||||
int NN = BlockSolverGrid->_ndimension;
|
int NN = BlockSolverGrid->_ndimension;
|
||||||
int nsimd = BlockSolverGrid->Nsimd();
|
int nsimd = BlockSolverGrid->Nsimd();
|
||||||
|
|
||||||
std::vector<int> latt_phys(NN-1);
|
std::vector<int> latt_phys(0);
|
||||||
Coordinate simd_phys;
|
std::vector<int> simd_phys(0);
|
||||||
std::vector<int> mpi_phys(NN-1);
|
std::vector<int> mpi_phys(0);
|
||||||
Coordinate checker_dim_mask(NN-1);
|
|
||||||
int checker_dim=-1;
|
|
||||||
|
|
||||||
int dd;
|
|
||||||
for(int d=0;d<NN;d++){
|
for(int d=0;d<NN;d++){
|
||||||
if( d!=Orthog ) {
|
if( d!=Orthog ) {
|
||||||
latt_phys[dd]=BlockSolverGrid->_fdimensions[d];
|
latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);
|
||||||
mpi_phys[dd] =BlockSolverGrid->_processors[d];
|
simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);
|
||||||
checker_dim_mask[dd] = BlockSolverGrid->_checker_dim_mask[d];
|
mpi_phys.push_back(BlockSolverGrid->_processors[d]);
|
||||||
if ( d == BlockSolverGrid->_checker_dim ) checker_dim = dd;
|
|
||||||
dd++;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
simd_phys=GridDefaultSimd(latt_phys.size(),nsimd);
|
return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);
|
||||||
GridCartesian *tmp = new GridCartesian(latt_phys,simd_phys,mpi_phys);
|
|
||||||
if(BlockSolverGrid->_isCheckerBoarded) {
|
|
||||||
GridRedBlackCartesian *ret = new GridRedBlackCartesian(tmp,checker_dim_mask,checker_dim);
|
|
||||||
delete tmp;
|
|
||||||
return (GridBase *) ret;
|
|
||||||
} else {
|
|
||||||
return (GridBase *) tmp;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
|
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
|
||||||
{
|
{
|
||||||
GridBase *FullGrid = X.Grid();
|
|
||||||
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
|
|
||||||
|
|
||||||
Lattice<vobj> Ys(SliceGrid);
|
|
||||||
Lattice<vobj> Rs(SliceGrid);
|
|
||||||
Lattice<vobj> Xs(SliceGrid);
|
|
||||||
Lattice<vobj> RR(FullGrid);
|
|
||||||
|
|
||||||
RR = R; // Copies checkerboard for insert
|
|
||||||
|
|
||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
typedef typename vobj::vector_type vector_type;
|
typedef typename vobj::vector_type vector_type;
|
||||||
int Nslice = X.Grid()->GlobalDimensions()[Orthog];
|
|
||||||
for(int i=0;i<Nslice;i++){
|
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
|
||||||
ExtractSlice(Ys,Y,i,Orthog);
|
|
||||||
ExtractSlice(Rs,R,i,Orthog);
|
GridBase *FullGrid = X.Grid();
|
||||||
Rs=Ys;
|
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
|
||||||
for(int j=0;j<Nslice;j++){
|
|
||||||
ExtractSlice(Xs,X,j,Orthog);
|
// Lattice<vobj> Xslice(SliceGrid);
|
||||||
Rs = Rs + Xs*(scale*aa(j,i));
|
// Lattice<vobj> Rslice(SliceGrid);
|
||||||
|
|
||||||
|
assert( FullGrid->_simd_layout[Orthog]==1);
|
||||||
|
// int nh = FullGrid->_ndimension;
|
||||||
|
// int nl = SliceGrid->_ndimension;
|
||||||
|
// int nl = nh-1;
|
||||||
|
|
||||||
|
//FIXME package in a convenient iterator
|
||||||
|
//Should loop over a plane orthogonal to direction "Orthog"
|
||||||
|
int stride=FullGrid->_slice_stride[Orthog];
|
||||||
|
int block =FullGrid->_slice_block [Orthog];
|
||||||
|
int nblock=FullGrid->_slice_nblock[Orthog];
|
||||||
|
int ostride=FullGrid->_ostride[Orthog];
|
||||||
|
|
||||||
|
autoView( X_v, X, CpuRead);
|
||||||
|
autoView( Y_v, Y, CpuRead);
|
||||||
|
autoView( R_v, R, CpuWrite);
|
||||||
|
thread_region
|
||||||
|
{
|
||||||
|
Vector<vobj> s_x(Nblock);
|
||||||
|
|
||||||
|
thread_for_collapse_in_region(2, n,nblock, {
|
||||||
|
for(int b=0;b<block;b++){
|
||||||
|
int o = n*stride + b;
|
||||||
|
|
||||||
|
for(int i=0;i<Nblock;i++){
|
||||||
|
s_x[i] = X_v[o+i*ostride];
|
||||||
}
|
}
|
||||||
InsertSlice(Rs,RR,i,Orthog);
|
|
||||||
|
vobj dot;
|
||||||
|
for(int i=0;i<Nblock;i++){
|
||||||
|
dot = Y_v[o+i*ostride];
|
||||||
|
for(int j=0;j<Nblock;j++){
|
||||||
|
dot = dot + s_x[j]*(scale*aa(j,i));
|
||||||
|
}
|
||||||
|
R_v[o+i*ostride]=dot;
|
||||||
|
}
|
||||||
|
}});
|
||||||
}
|
}
|
||||||
R=RR; // Copy back handles arguments aliasing case
|
|
||||||
delete SliceGrid;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
|
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
|
||||||
{
|
{
|
||||||
R=Zero();
|
typedef typename vobj::scalar_object sobj;
|
||||||
sliceMaddMatrix(R,aa,X,R,Orthog,scale);
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
|
||||||
|
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
|
||||||
|
|
||||||
|
GridBase *FullGrid = X.Grid();
|
||||||
|
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
|
||||||
|
// Lattice<vobj> Xslice(SliceGrid);
|
||||||
|
// Lattice<vobj> Rslice(SliceGrid);
|
||||||
|
|
||||||
|
assert( FullGrid->_simd_layout[Orthog]==1);
|
||||||
|
// int nh = FullGrid->_ndimension;
|
||||||
|
// int nl = SliceGrid->_ndimension;
|
||||||
|
// int nl=1;
|
||||||
|
|
||||||
|
//FIXME package in a convenient iterator
|
||||||
|
// thread_for2d_in_region
|
||||||
|
//Should loop over a plane orthogonal to direction "Orthog"
|
||||||
|
int stride=FullGrid->_slice_stride[Orthog];
|
||||||
|
int block =FullGrid->_slice_block [Orthog];
|
||||||
|
int nblock=FullGrid->_slice_nblock[Orthog];
|
||||||
|
int ostride=FullGrid->_ostride[Orthog];
|
||||||
|
autoView( R_v, R, CpuWrite);
|
||||||
|
autoView( X_v, X, CpuRead);
|
||||||
|
thread_region
|
||||||
|
{
|
||||||
|
std::vector<vobj> s_x(Nblock);
|
||||||
|
|
||||||
|
|
||||||
|
thread_for_collapse_in_region( 2 ,n,nblock,{
|
||||||
|
for(int b=0;b<block;b++){
|
||||||
|
int o = n*stride + b;
|
||||||
|
|
||||||
|
for(int i=0;i<Nblock;i++){
|
||||||
|
s_x[i] = X_v[o+i*ostride];
|
||||||
|
}
|
||||||
|
|
||||||
|
vobj dot;
|
||||||
|
for(int i=0;i<Nblock;i++){
|
||||||
|
dot = s_x[0]*(scale*aa(0,i));
|
||||||
|
for(int j=1;j<Nblock;j++){
|
||||||
|
dot = dot + s_x[j]*(scale*aa(j,i));
|
||||||
|
}
|
||||||
|
R_v[o+i*ostride]=dot;
|
||||||
|
}
|
||||||
|
}});
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
|
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
|
||||||
{
|
{
|
||||||
GridBase *SliceGrid = makeSubSliceGrid(lhs.Grid(),Orthog);
|
|
||||||
|
|
||||||
Lattice<vobj> ls(SliceGrid);
|
|
||||||
Lattice<vobj> rs(SliceGrid);
|
|
||||||
|
|
||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
typedef typename vobj::vector_type vector_type;
|
typedef typename vobj::vector_type vector_type;
|
||||||
int Nslice = lhs.Grid()->GlobalDimensions()[Orthog];
|
|
||||||
mat = Eigen::MatrixXcd::Zero(Nslice,Nslice);
|
GridBase *FullGrid = lhs.Grid();
|
||||||
for(int s=0;s<Nslice;s++){
|
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
|
||||||
ExtractSlice(ls,lhs,s,Orthog);
|
|
||||||
for(int ss=0;ss<Nslice;ss++){
|
int Nblock = FullGrid->GlobalDimensions()[Orthog];
|
||||||
ExtractSlice(rs,rhs,ss,Orthog);
|
|
||||||
mat(s,ss) = innerProduct(ls,rs);
|
// Lattice<vobj> Lslice(SliceGrid);
|
||||||
|
// Lattice<vobj> Rslice(SliceGrid);
|
||||||
|
|
||||||
|
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
|
||||||
|
|
||||||
|
assert( FullGrid->_simd_layout[Orthog]==1);
|
||||||
|
// int nh = FullGrid->_ndimension;
|
||||||
|
// int nl = SliceGrid->_ndimension;
|
||||||
|
// int nl = nh-1;
|
||||||
|
|
||||||
|
//FIXME package in a convenient iterator
|
||||||
|
//Should loop over a plane orthogonal to direction "Orthog"
|
||||||
|
int stride=FullGrid->_slice_stride[Orthog];
|
||||||
|
int block =FullGrid->_slice_block [Orthog];
|
||||||
|
int nblock=FullGrid->_slice_nblock[Orthog];
|
||||||
|
int ostride=FullGrid->_ostride[Orthog];
|
||||||
|
|
||||||
|
typedef typename vobj::vector_typeD vector_typeD;
|
||||||
|
|
||||||
|
autoView( lhs_v, lhs, CpuRead);
|
||||||
|
autoView( rhs_v, rhs, CpuRead);
|
||||||
|
thread_region
|
||||||
|
{
|
||||||
|
std::vector<vobj> Left(Nblock);
|
||||||
|
std::vector<vobj> Right(Nblock);
|
||||||
|
Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
|
||||||
|
|
||||||
|
thread_for_collapse_in_region( 2, n,nblock,{
|
||||||
|
for(int b=0;b<block;b++){
|
||||||
|
|
||||||
|
int o = n*stride + b;
|
||||||
|
|
||||||
|
for(int i=0;i<Nblock;i++){
|
||||||
|
Left [i] = lhs_v[o+i*ostride];
|
||||||
|
Right[i] = rhs_v[o+i*ostride];
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int i=0;i<Nblock;i++){
|
||||||
|
for(int j=0;j<Nblock;j++){
|
||||||
|
auto tmp = innerProduct(Left[i],Right[j]);
|
||||||
|
auto rtmp = TensorRemove(tmp);
|
||||||
|
auto red = Reduce(rtmp);
|
||||||
|
mat_thread(i,j) += std::complex<double>(real(red),imag(red));
|
||||||
|
}}
|
||||||
|
}});
|
||||||
|
thread_critical
|
||||||
|
{
|
||||||
|
mat += mat_thread;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
delete SliceGrid;
|
|
||||||
|
for(int i=0;i<Nblock;i++){
|
||||||
|
for(int j=0;j<Nblock;j++){
|
||||||
|
ComplexD sum = mat(i,j);
|
||||||
|
FullGrid->GlobalSum(sum);
|
||||||
|
mat(i,j)=sum;
|
||||||
|
}}
|
||||||
|
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
@ -214,12 +214,22 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
|
|||||||
// Move out of UVM
|
// Move out of UVM
|
||||||
// Turns out I had messed up the synchronise after move to compute stream
|
// Turns out I had messed up the synchronise after move to compute stream
|
||||||
// as running this on the default stream fools the synchronise
|
// as running this on the default stream fools the synchronise
|
||||||
deviceVector<sobj> buffer(numBlocks);
|
#undef UVM_BLOCK_BUFFER
|
||||||
|
#ifndef UVM_BLOCK_BUFFER
|
||||||
|
commVector<sobj> buffer(numBlocks);
|
||||||
sobj *buffer_v = &buffer[0];
|
sobj *buffer_v = &buffer[0];
|
||||||
sobj result;
|
sobj result;
|
||||||
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
|
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
|
||||||
accelerator_barrier();
|
accelerator_barrier();
|
||||||
acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
|
acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
|
||||||
|
#else
|
||||||
|
Vector<sobj> buffer(numBlocks);
|
||||||
|
sobj *buffer_v = &buffer[0];
|
||||||
|
sobj result;
|
||||||
|
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
|
||||||
|
accelerator_barrier();
|
||||||
|
result = *buffer_v;
|
||||||
|
#endif
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -234,7 +244,7 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi
|
|||||||
|
|
||||||
const int words = sizeof(vobj)/sizeof(vector);
|
const int words = sizeof(vobj)/sizeof(vector);
|
||||||
|
|
||||||
deviceVector<vector> buffer(osites);
|
Vector<vector> buffer(osites);
|
||||||
vector *dat = (vector *)lat;
|
vector *dat = (vector *)lat;
|
||||||
vector *buf = &buffer[0];
|
vector *buf = &buffer[0];
|
||||||
iScalar<vector> *tbuf =(iScalar<vector> *) &buffer[0];
|
iScalar<vector> *tbuf =(iScalar<vector> *) &buffer[0];
|
||||||
|
@ -4,20 +4,23 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
// Possibly promote to double and sum
|
// Possibly promote to double and sum
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
|
||||||
template <class vobj>
|
template <class vobj>
|
||||||
inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)
|
inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)
|
||||||
{
|
{
|
||||||
typedef typename vobj::scalar_object sobj;
|
typedef typename vobj::scalar_object sobj;
|
||||||
typedef typename vobj::scalar_objectD sobjD;
|
typedef typename vobj::scalar_objectD sobjD;
|
||||||
|
static Vector<sobj> mysum;
|
||||||
|
mysum.resize(1);
|
||||||
|
sobj *mysum_p = & mysum[0];
|
||||||
sobj identity; zeroit(identity);
|
sobj identity; zeroit(identity);
|
||||||
sobj ret; zeroit(ret);
|
mysum[0] = identity;
|
||||||
|
sobj ret ;
|
||||||
|
|
||||||
Integer nsimd= vobj::Nsimd();
|
Integer nsimd= vobj::Nsimd();
|
||||||
{
|
|
||||||
sycl::buffer<sobj, 1> abuff(&ret, {1});
|
const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() });
|
||||||
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
|
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
|
||||||
auto Reduction = cl::sycl::reduction(abuff,cgh,identity,std::plus<>());
|
auto Reduction = cl::sycl::reduction(mysum_p,identity,std::plus<>(),PropList);
|
||||||
cgh.parallel_for(cl::sycl::range<1>{osites},
|
cgh.parallel_for(cl::sycl::range<1>{osites},
|
||||||
Reduction,
|
Reduction,
|
||||||
[=] (cl::sycl::id<1> item, auto &sum) {
|
[=] (cl::sycl::id<1> item, auto &sum) {
|
||||||
@ -25,7 +28,9 @@ inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer os
|
|||||||
sum +=Reduce(lat[osite]);
|
sum +=Reduce(lat[osite]);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
}
|
theGridAccelerator->wait();
|
||||||
|
ret = mysum[0];
|
||||||
|
// free(mysum,*theGridAccelerator);
|
||||||
sobjD dret; convertType(dret,ret);
|
sobjD dret; convertType(dret,ret);
|
||||||
return dret;
|
return dret;
|
||||||
}
|
}
|
||||||
@ -71,22 +76,59 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
|
|||||||
|
|
||||||
template<class Word> Word svm_xor(Word *vec,uint64_t L)
|
template<class Word> Word svm_xor(Word *vec,uint64_t L)
|
||||||
{
|
{
|
||||||
|
Word xorResult; xorResult = 0;
|
||||||
|
static Vector<Word> d_sum;
|
||||||
|
d_sum.resize(1);
|
||||||
|
Word *d_sum_p=&d_sum[0];
|
||||||
Word identity; identity=0;
|
Word identity; identity=0;
|
||||||
Word ret = 0;
|
d_sum[0] = identity;
|
||||||
{
|
const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() });
|
||||||
sycl::buffer<Word, 1> abuff(&ret, {1});
|
|
||||||
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
|
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
|
||||||
auto Reduction = cl::sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
|
auto Reduction = cl::sycl::reduction(d_sum_p,identity,std::bit_xor<>(),PropList);
|
||||||
cgh.parallel_for(cl::sycl::range<1>{L},
|
cgh.parallel_for(cl::sycl::range<1>{L},
|
||||||
Reduction,
|
Reduction,
|
||||||
[=] (cl::sycl::id<1> index, auto &sum) {
|
[=] (cl::sycl::id<1> index, auto &sum) {
|
||||||
sum ^=vec[index];
|
sum^=vec[index];
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
}
|
|
||||||
theGridAccelerator->wait();
|
theGridAccelerator->wait();
|
||||||
|
Word ret = d_sum[0];
|
||||||
|
// free(d_sum,*theGridAccelerator);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
/*
|
||||||
|
|
||||||
|
template <class vobj>
|
||||||
|
inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
|
||||||
|
{
|
||||||
|
typedef typename vobj::vector_type vector;
|
||||||
|
typedef typename vobj::scalar_type scalar;
|
||||||
|
|
||||||
|
typedef typename vobj::scalar_typeD scalarD;
|
||||||
|
typedef typename vobj::scalar_objectD sobjD;
|
||||||
|
|
||||||
|
sobjD ret;
|
||||||
|
scalarD *ret_p = (scalarD *)&ret;
|
||||||
|
|
||||||
|
const int nsimd = vobj::Nsimd();
|
||||||
|
const int words = sizeof(vobj)/sizeof(vector);
|
||||||
|
|
||||||
|
Vector<scalar> buffer(osites*nsimd);
|
||||||
|
scalar *buf = &buffer[0];
|
||||||
|
vector *dat = (vector *)lat;
|
||||||
|
|
||||||
|
for(int w=0;w<words;w++) {
|
||||||
|
|
||||||
|
accelerator_for(ss,osites,nsimd,{
|
||||||
|
int lane = acceleratorSIMTlane(nsimd);
|
||||||
|
buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane);
|
||||||
|
});
|
||||||
|
//Precision change at this point is to late to gain precision
|
||||||
|
ret_p[w] = svm_reduce(buf,nsimd*osites);
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
@ -21,18 +21,9 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
|
|
||||||
|
|
||||||
#if defined(GRID_CUDA) || defined(GRID_HIP)
|
#if defined(GRID_CUDA) || defined(GRID_HIP)
|
||||||
template<class vobj>
|
template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
|
||||||
inline void sliceSumReduction_cub_small(const vobj *Data,
|
|
||||||
std::vector<vobj> &lvSum,
|
|
||||||
const int rd,
|
|
||||||
const int e1,
|
|
||||||
const int e2,
|
|
||||||
const int stride,
|
|
||||||
const int ostride,
|
|
||||||
const int Nsimd)
|
|
||||||
{
|
|
||||||
size_t subvol_size = e1*e2;
|
size_t subvol_size = e1*e2;
|
||||||
deviceVector<vobj> reduction_buffer(rd*subvol_size);
|
commVector<vobj> reduction_buffer(rd*subvol_size);
|
||||||
auto rb_p = &reduction_buffer[0];
|
auto rb_p = &reduction_buffer[0];
|
||||||
vobj zero_init;
|
vobj zero_init;
|
||||||
zeroit(zero_init);
|
zeroit(zero_init);
|
||||||
@ -103,15 +94,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
|
|||||||
|
|
||||||
|
|
||||||
#if defined(GRID_SYCL)
|
#if defined(GRID_SYCL)
|
||||||
template<class vobj>
|
template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
|
||||||
inline void sliceSumReduction_sycl_small(const vobj *Data,
|
|
||||||
std::vector <vobj> &lvSum,
|
|
||||||
const int &rd,
|
|
||||||
const int &e1,
|
|
||||||
const int &e2,
|
|
||||||
const int &stride,
|
|
||||||
const int &ostride,
|
|
||||||
const int &Nsimd)
|
|
||||||
{
|
{
|
||||||
size_t subvol_size = e1*e2;
|
size_t subvol_size = e1*e2;
|
||||||
|
|
||||||
@ -122,7 +105,7 @@ inline void sliceSumReduction_sycl_small(const vobj *Data,
|
|||||||
mysum[r] = vobj_zero;
|
mysum[r] = vobj_zero;
|
||||||
}
|
}
|
||||||
|
|
||||||
deviceVector<vobj> reduction_buffer(rd*subvol_size);
|
commVector<vobj> reduction_buffer(rd*subvol_size);
|
||||||
|
|
||||||
auto rb_p = &reduction_buffer[0];
|
auto rb_p = &reduction_buffer[0];
|
||||||
|
|
||||||
@ -161,23 +144,14 @@ inline void sliceSumReduction_sycl_small(const vobj *Data,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
template<class vobj>
|
template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
|
||||||
inline void sliceSumReduction_large(const vobj *Data,
|
|
||||||
std::vector<vobj> &lvSum,
|
|
||||||
const int rd,
|
|
||||||
const int e1,
|
|
||||||
const int e2,
|
|
||||||
const int stride,
|
|
||||||
const int ostride,
|
|
||||||
const int Nsimd)
|
|
||||||
{
|
|
||||||
typedef typename vobj::vector_type vector;
|
typedef typename vobj::vector_type vector;
|
||||||
const int words = sizeof(vobj)/sizeof(vector);
|
const int words = sizeof(vobj)/sizeof(vector);
|
||||||
const int osites = rd*e1*e2;
|
const int osites = rd*e1*e2;
|
||||||
deviceVector<vector>buffer(osites);
|
commVector<vector>buffer(osites);
|
||||||
vector *dat = (vector *)Data;
|
vector *dat = (vector *)Data;
|
||||||
vector *buf = &buffer[0];
|
vector *buf = &buffer[0];
|
||||||
std::vector<vector> lvSum_small(rd);
|
Vector<vector> lvSum_small(rd);
|
||||||
vector *lvSum_ptr = (vector *)&lvSum[0];
|
vector *lvSum_ptr = (vector *)&lvSum[0];
|
||||||
|
|
||||||
for (int w = 0; w < words; w++) {
|
for (int w = 0; w < words; w++) {
|
||||||
@ -194,18 +168,13 @@ inline void sliceSumReduction_large(const vobj *Data,
|
|||||||
for (int r = 0; r < rd; r++) {
|
for (int r = 0; r < rd; r++) {
|
||||||
lvSum_ptr[w+words*r]=lvSum_small[r];
|
lvSum_ptr[w+words*r]=lvSum_small[r];
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj>
|
template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
|
||||||
inline void sliceSumReduction_gpu(const Lattice<vobj> &Data,
|
|
||||||
std::vector<vobj> &lvSum,
|
|
||||||
const int rd,
|
|
||||||
const int e1,
|
|
||||||
const int e2,
|
|
||||||
const int stride,
|
|
||||||
const int ostride,
|
|
||||||
const int Nsimd)
|
|
||||||
{
|
{
|
||||||
autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case.
|
autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case.
|
||||||
if constexpr (sizeof(vobj) <= 256) {
|
if constexpr (sizeof(vobj) <= 256) {
|
||||||
@ -223,15 +192,7 @@ inline void sliceSumReduction_gpu(const Lattice<vobj> &Data,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template<class vobj>
|
template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
|
||||||
inline void sliceSumReduction_cpu(const Lattice<vobj> &Data,
|
|
||||||
std::vector<vobj> &lvSum,
|
|
||||||
const int &rd,
|
|
||||||
const int &e1,
|
|
||||||
const int &e2,
|
|
||||||
const int &stride,
|
|
||||||
const int &ostride,
|
|
||||||
const int &Nsimd)
|
|
||||||
{
|
{
|
||||||
// sum over reduced dimension planes, breaking out orthog dir
|
// sum over reduced dimension planes, breaking out orthog dir
|
||||||
// Parallel over orthog direction
|
// Parallel over orthog direction
|
||||||
@ -247,20 +208,16 @@ inline void sliceSumReduction_cpu(const Lattice<vobj> &Data,
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data,
|
template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
|
||||||
std::vector<vobj> &lvSum,
|
|
||||||
const int &rd,
|
|
||||||
const int &e1,
|
|
||||||
const int &e2,
|
|
||||||
const int &stride,
|
|
||||||
const int &ostride,
|
|
||||||
const int &Nsimd)
|
|
||||||
{
|
{
|
||||||
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
|
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
|
||||||
|
|
||||||
sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
|
sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
|
||||||
#else
|
|
||||||
|
#else
|
||||||
sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
|
sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
|
||||||
#endif
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -43,49 +43,20 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// remove and insert a half checkerboard
|
// remove and insert a half checkerboard
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full)
|
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full)
|
||||||
{
|
{
|
||||||
half.Checkerboard() = cb;
|
acceleratorPickCheckerboard(cb,half,full);
|
||||||
|
|
||||||
autoView( half_v, half, CpuWrite);
|
|
||||||
autoView( full_v, full, CpuRead);
|
|
||||||
thread_for(ss, full.Grid()->oSites(),{
|
|
||||||
int cbos;
|
|
||||||
Coordinate coor;
|
|
||||||
full.Grid()->oCoorFromOindex(coor,ss);
|
|
||||||
cbos=half.Grid()->CheckerBoard(coor);
|
|
||||||
|
|
||||||
if (cbos==cb) {
|
|
||||||
int ssh=half.Grid()->oIndex(coor);
|
|
||||||
half_v[ssh] = full_v[ss];
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half)
|
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half)
|
||||||
{
|
{
|
||||||
int cb = half.Checkerboard();
|
acceleratorSetCheckerboard(full,half);
|
||||||
autoView( half_v , half, CpuRead);
|
|
||||||
autoView( full_v , full, CpuWrite);
|
|
||||||
thread_for(ss,full.Grid()->oSites(),{
|
|
||||||
|
|
||||||
Coordinate coor;
|
|
||||||
int cbos;
|
|
||||||
|
|
||||||
full.Grid()->oCoorFromOindex(coor,ss);
|
|
||||||
cbos=half.Grid()->CheckerBoard(coor);
|
|
||||||
|
|
||||||
if (cbos==cb) {
|
|
||||||
int ssh=half.Grid()->oIndex(coor);
|
|
||||||
full_v[ss]=half_v[ssh];
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0)
|
template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int dummy=0)
|
||||||
{
|
{
|
||||||
half.Checkerboard() = cb;
|
half.Checkerboard() = cb;
|
||||||
autoView(half_v, half, AcceleratorWrite);
|
autoView(half_v, half, AcceleratorWrite);
|
||||||
@ -95,6 +66,7 @@ template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj
|
|||||||
unsigned long ndim_half = half.Grid()->_ndimension;
|
unsigned long ndim_half = half.Grid()->_ndimension;
|
||||||
Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
|
Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
|
||||||
Coordinate ostride_half = half.Grid()->_ostride;
|
Coordinate ostride_half = half.Grid()->_ostride;
|
||||||
|
int checker_dim_half = half.Grid()->CheckerDim();
|
||||||
accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{
|
accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{
|
||||||
|
|
||||||
Coordinate coor;
|
Coordinate coor;
|
||||||
@ -119,7 +91,7 @@ template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0)
|
template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int dummy=0)
|
||||||
{
|
{
|
||||||
int cb = half.Checkerboard();
|
int cb = half.Checkerboard();
|
||||||
autoView(half_v , half, AcceleratorRead);
|
autoView(half_v , half, AcceleratorRead);
|
||||||
@ -129,6 +101,7 @@ template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,
|
|||||||
unsigned long ndim_half = half.Grid()->_ndimension;
|
unsigned long ndim_half = half.Grid()->_ndimension;
|
||||||
Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
|
Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
|
||||||
Coordinate ostride_half = half.Grid()->_ostride;
|
Coordinate ostride_half = half.Grid()->_ostride;
|
||||||
|
int checker_dim_half = half.Grid()->CheckerDim();
|
||||||
accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{
|
accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{
|
||||||
|
|
||||||
Coordinate coor;
|
Coordinate coor;
|
||||||
@ -981,14 +954,8 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
|
|||||||
hcoor[orthog] = slice;
|
hcoor[orthog] = slice;
|
||||||
for(int d=0;d<nh;d++){
|
for(int d=0;d<nh;d++){
|
||||||
if ( d!=orthog ) {
|
if ( d!=orthog ) {
|
||||||
hcoor[d]=lcoor[ddl];
|
hcoor[d]=lcoor[ddl++];
|
||||||
if ( hg->_checker_dim == d ) {
|
|
||||||
hcoor[d]=hcoor[d]*2; // factor in the full coor for peekLocalSite
|
|
||||||
lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
|
|
||||||
}
|
}
|
||||||
ddl++;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
peekLocalSite(s,lowDimv,lcoor);
|
peekLocalSite(s,lowDimv,lcoor);
|
||||||
pokeLocalSite(s,higherDimv,hcoor);
|
pokeLocalSite(s,higherDimv,hcoor);
|
||||||
@ -1009,7 +976,6 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
|
|||||||
assert(orthog<nh);
|
assert(orthog<nh);
|
||||||
assert(orthog>=0);
|
assert(orthog>=0);
|
||||||
assert(hg->_processors[orthog]==1);
|
assert(hg->_processors[orthog]==1);
|
||||||
lowDim.Checkerboard() = higherDim.Checkerboard();
|
|
||||||
|
|
||||||
int dl; dl = 0;
|
int dl; dl = 0;
|
||||||
for(int d=0;d<nh;d++){
|
for(int d=0;d<nh;d++){
|
||||||
@ -1027,16 +993,11 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
|
|||||||
Coordinate lcoor(nl);
|
Coordinate lcoor(nl);
|
||||||
Coordinate hcoor(nh);
|
Coordinate hcoor(nh);
|
||||||
lg->LocalIndexToLocalCoor(idx,lcoor);
|
lg->LocalIndexToLocalCoor(idx,lcoor);
|
||||||
hcoor[orthog] = slice;
|
|
||||||
int ddl=0;
|
int ddl=0;
|
||||||
|
hcoor[orthog] = slice;
|
||||||
for(int d=0;d<nh;d++){
|
for(int d=0;d<nh;d++){
|
||||||
if ( d!=orthog ) {
|
if ( d!=orthog ) {
|
||||||
hcoor[d]=lcoor[ddl];
|
hcoor[d]=lcoor[ddl++];
|
||||||
if ( hg->_checker_dim == d ) {
|
|
||||||
hcoor[d]=hcoor[d]*2; // factor in the full gridd coor for peekLocalSite
|
|
||||||
lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
|
|
||||||
}
|
|
||||||
ddl++;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
peekLocalSite(s,higherDimv,hcoor);
|
peekLocalSite(s,higherDimv,hcoor);
|
||||||
|
@ -54,7 +54,7 @@ struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::ve
|
|||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
|
||||||
template<class vobj> inline void ScatterSlice(const deviceVector<vobj> &buf,
|
template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
|
||||||
Lattice<vobj> &lat,
|
Lattice<vobj> &lat,
|
||||||
int x,
|
int x,
|
||||||
int dim,
|
int dim,
|
||||||
@ -140,7 +140,7 @@ template<class vobj> inline void ScatterSlice(const deviceVector<vobj> &buf,
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj> inline void GatherSlice(deviceVector<vobj> &buf,
|
template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf,
|
||||||
const Lattice<vobj> &lat,
|
const Lattice<vobj> &lat,
|
||||||
int x,
|
int x,
|
||||||
int dim,
|
int dim,
|
||||||
@ -462,8 +462,8 @@ public:
|
|||||||
int rNsimd = Nsimd / simd[dimension];
|
int rNsimd = Nsimd / simd[dimension];
|
||||||
assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);
|
assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);
|
||||||
|
|
||||||
static deviceVector<vobj> send_buf;
|
static cshiftVector<vobj> send_buf;
|
||||||
static deviceVector<vobj> recv_buf;
|
static cshiftVector<vobj> recv_buf;
|
||||||
send_buf.resize(buffer_size*2*depth);
|
send_buf.resize(buffer_size*2*depth);
|
||||||
recv_buf.resize(buffer_size*2*depth);
|
recv_buf.resize(buffer_size*2*depth);
|
||||||
|
|
||||||
|
@ -90,16 +90,16 @@ public:
|
|||||||
void M5D(const FermionField &psi,
|
void M5D(const FermionField &psi,
|
||||||
const FermionField &phi,
|
const FermionField &phi,
|
||||||
FermionField &chi,
|
FermionField &chi,
|
||||||
std::vector<Coeff_t> &lower,
|
Vector<Coeff_t> &lower,
|
||||||
std::vector<Coeff_t> &diag,
|
Vector<Coeff_t> &diag,
|
||||||
std::vector<Coeff_t> &upper);
|
Vector<Coeff_t> &upper);
|
||||||
|
|
||||||
void M5Ddag(const FermionField &psi,
|
void M5Ddag(const FermionField &psi,
|
||||||
const FermionField &phi,
|
const FermionField &phi,
|
||||||
FermionField &chi,
|
FermionField &chi,
|
||||||
std::vector<Coeff_t> &lower,
|
Vector<Coeff_t> &lower,
|
||||||
std::vector<Coeff_t> &diag,
|
Vector<Coeff_t> &diag,
|
||||||
std::vector<Coeff_t> &upper);
|
Vector<Coeff_t> &upper);
|
||||||
|
|
||||||
virtual void Instantiatable(void)=0;
|
virtual void Instantiatable(void)=0;
|
||||||
|
|
||||||
@ -119,35 +119,35 @@ public:
|
|||||||
RealD mass_plus, mass_minus;
|
RealD mass_plus, mass_minus;
|
||||||
|
|
||||||
// Save arguments to SetCoefficientsInternal
|
// Save arguments to SetCoefficientsInternal
|
||||||
std::vector<Coeff_t> _gamma;
|
Vector<Coeff_t> _gamma;
|
||||||
RealD _zolo_hi;
|
RealD _zolo_hi;
|
||||||
RealD _b;
|
RealD _b;
|
||||||
RealD _c;
|
RealD _c;
|
||||||
|
|
||||||
// Cayley form Moebius (tanh and zolotarev)
|
// Cayley form Moebius (tanh and zolotarev)
|
||||||
std::vector<Coeff_t> omega;
|
Vector<Coeff_t> omega;
|
||||||
std::vector<Coeff_t> bs; // S dependent coeffs
|
Vector<Coeff_t> bs; // S dependent coeffs
|
||||||
std::vector<Coeff_t> cs;
|
Vector<Coeff_t> cs;
|
||||||
std::vector<Coeff_t> as;
|
Vector<Coeff_t> as;
|
||||||
// For preconditioning Cayley form
|
// For preconditioning Cayley form
|
||||||
std::vector<Coeff_t> bee;
|
Vector<Coeff_t> bee;
|
||||||
std::vector<Coeff_t> cee;
|
Vector<Coeff_t> cee;
|
||||||
std::vector<Coeff_t> aee;
|
Vector<Coeff_t> aee;
|
||||||
std::vector<Coeff_t> beo;
|
Vector<Coeff_t> beo;
|
||||||
std::vector<Coeff_t> ceo;
|
Vector<Coeff_t> ceo;
|
||||||
std::vector<Coeff_t> aeo;
|
Vector<Coeff_t> aeo;
|
||||||
// LDU factorisation of the eeoo matrix
|
// LDU factorisation of the eeoo matrix
|
||||||
std::vector<Coeff_t> lee;
|
Vector<Coeff_t> lee;
|
||||||
std::vector<Coeff_t> leem;
|
Vector<Coeff_t> leem;
|
||||||
std::vector<Coeff_t> uee;
|
Vector<Coeff_t> uee;
|
||||||
std::vector<Coeff_t> ueem;
|
Vector<Coeff_t> ueem;
|
||||||
std::vector<Coeff_t> dee;
|
Vector<Coeff_t> dee;
|
||||||
|
|
||||||
// Matrices of 5d ee inverse params
|
// Matrices of 5d ee inverse params
|
||||||
// std::vector<iSinglet<Simd> > MatpInv;
|
Vector<iSinglet<Simd> > MatpInv;
|
||||||
// std::vector<iSinglet<Simd> > MatmInv;
|
Vector<iSinglet<Simd> > MatmInv;
|
||||||
// std::vector<iSinglet<Simd> > MatpInvDag;
|
Vector<iSinglet<Simd> > MatpInvDag;
|
||||||
// std::vector<iSinglet<Simd> > MatmInvDag;
|
Vector<iSinglet<Simd> > MatmInvDag;
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Conserved current utilities
|
// Conserved current utilities
|
||||||
@ -187,7 +187,7 @@ public:
|
|||||||
protected:
|
protected:
|
||||||
virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
|
virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
|
||||||
virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
|
virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
|
||||||
virtual void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
|
virtual void SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t> & gamma,RealD b,RealD c);
|
||||||
};
|
};
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
@ -90,12 +90,12 @@ protected:
|
|||||||
RealD mass;
|
RealD mass;
|
||||||
RealD R;
|
RealD R;
|
||||||
RealD ZoloHiInv;
|
RealD ZoloHiInv;
|
||||||
std::vector<double> Beta;
|
Vector<double> Beta;
|
||||||
std::vector<double> cc;;
|
Vector<double> cc;;
|
||||||
std::vector<double> cc_d;;
|
Vector<double> cc_d;;
|
||||||
std::vector<double> sqrt_cc;
|
Vector<double> sqrt_cc;
|
||||||
std::vector<double> See;
|
Vector<double> See;
|
||||||
std::vector<double> Aee;
|
Vector<double> Aee;
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -69,10 +69,10 @@ public:
|
|||||||
// Instantiate different versions depending on Impl
|
// Instantiate different versions depending on Impl
|
||||||
/////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////
|
||||||
void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
||||||
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
|
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
|
||||||
|
|
||||||
void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
||||||
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
|
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
|
||||||
|
|
||||||
virtual void RefreshShiftCoefficients(RealD new_shift);
|
virtual void RefreshShiftCoefficients(RealD new_shift);
|
||||||
|
|
||||||
@ -83,7 +83,7 @@ public:
|
|||||||
RealD _M5, const ImplParams& p=ImplParams());
|
RealD _M5, const ImplParams& p=ImplParams());
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
void SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c);
|
void SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c);
|
||||||
};
|
};
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
@ -102,11 +102,11 @@ public:
|
|||||||
GaugeField &mat,
|
GaugeField &mat,
|
||||||
const FermionField &A, const FermionField &B, int dag);
|
const FermionField &A, const FermionField &B, int dag);
|
||||||
|
|
||||||
void DhopInternal(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||||
const FermionField &in, FermionField &out, int dag);
|
const FermionField &in, FermionField &out, int dag);
|
||||||
void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||||
const FermionField &in, FermionField &out, int dag);
|
const FermionField &in, FermionField &out, int dag);
|
||||||
void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||||
const FermionField &in, FermionField &out, int dag);
|
const FermionField &in, FermionField &out, int dag);
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////
|
||||||
@ -164,6 +164,8 @@ public:
|
|||||||
DoubledGaugeField UUUmuEven;
|
DoubledGaugeField UUUmuEven;
|
||||||
DoubledGaugeField UUUmuOdd;
|
DoubledGaugeField UUUmuOdd;
|
||||||
|
|
||||||
|
LebesgueOrder Lebesgue;
|
||||||
|
LebesgueOrder LebesgueEvenOdd;
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Conserved current utilities
|
// Conserved current utilities
|
||||||
|
@ -100,6 +100,7 @@ public:
|
|||||||
int dag);
|
int dag);
|
||||||
|
|
||||||
void DhopInternal(StencilImpl & st,
|
void DhopInternal(StencilImpl & st,
|
||||||
|
LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
DoubledGaugeField &UUU,
|
DoubledGaugeField &UUU,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
@ -107,6 +108,7 @@ public:
|
|||||||
int dag);
|
int dag);
|
||||||
|
|
||||||
void DhopInternalOverlappedComms(StencilImpl & st,
|
void DhopInternalOverlappedComms(StencilImpl & st,
|
||||||
|
LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
DoubledGaugeField &UUU,
|
DoubledGaugeField &UUU,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
@ -114,6 +116,7 @@ public:
|
|||||||
int dag);
|
int dag);
|
||||||
|
|
||||||
void DhopInternalSerialComms(StencilImpl & st,
|
void DhopInternalSerialComms(StencilImpl & st,
|
||||||
|
LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
DoubledGaugeField &UUU,
|
DoubledGaugeField &UUU,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
@ -189,6 +192,8 @@ public:
|
|||||||
DoubledGaugeField UUUmuEven;
|
DoubledGaugeField UUUmuEven;
|
||||||
DoubledGaugeField UUUmuOdd;
|
DoubledGaugeField UUUmuOdd;
|
||||||
|
|
||||||
|
LebesgueOrder Lebesgue;
|
||||||
|
LebesgueOrder LebesgueEvenOdd;
|
||||||
|
|
||||||
// Comms buffer
|
// Comms buffer
|
||||||
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
||||||
|
@ -42,11 +42,11 @@ public:
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
// Shift operator coefficients for red-black preconditioned Mobius EOFA
|
// Shift operator coefficients for red-black preconditioned Mobius EOFA
|
||||||
std::vector<Coeff_t> Mooee_shift;
|
Vector<Coeff_t> Mooee_shift;
|
||||||
std::vector<Coeff_t> MooeeInv_shift_lc;
|
Vector<Coeff_t> MooeeInv_shift_lc;
|
||||||
std::vector<Coeff_t> MooeeInv_shift_norm;
|
Vector<Coeff_t> MooeeInv_shift_norm;
|
||||||
std::vector<Coeff_t> MooeeInvDag_shift_lc;
|
Vector<Coeff_t> MooeeInvDag_shift_lc;
|
||||||
std::vector<Coeff_t> MooeeInvDag_shift_norm;
|
Vector<Coeff_t> MooeeInvDag_shift_norm;
|
||||||
|
|
||||||
virtual void Instantiatable(void) {};
|
virtual void Instantiatable(void) {};
|
||||||
|
|
||||||
@ -74,18 +74,18 @@ public:
|
|||||||
// Instantiate different versions depending on Impl
|
// Instantiate different versions depending on Impl
|
||||||
/////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////
|
||||||
void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
||||||
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
|
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
|
||||||
|
|
||||||
void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
||||||
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
|
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
|
||||||
std::vector<Coeff_t>& shift_coeffs);
|
Vector<Coeff_t>& shift_coeffs);
|
||||||
|
|
||||||
void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
||||||
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
|
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
|
||||||
|
|
||||||
void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
||||||
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
|
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
|
||||||
std::vector<Coeff_t>& shift_coeffs);
|
Vector<Coeff_t>& shift_coeffs);
|
||||||
|
|
||||||
virtual void RefreshShiftCoefficients(RealD new_shift);
|
virtual void RefreshShiftCoefficients(RealD new_shift);
|
||||||
|
|
||||||
|
@ -102,11 +102,11 @@ public:
|
|||||||
GaugeField &mat,
|
GaugeField &mat,
|
||||||
const FermionField &A, const FermionField &B, int dag);
|
const FermionField &A, const FermionField &B, int dag);
|
||||||
|
|
||||||
void DhopInternal(StencilImpl &st, DoubledGaugeField &U,
|
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||||
const FermionField &in, FermionField &out, int dag);
|
const FermionField &in, FermionField &out, int dag);
|
||||||
void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U,
|
void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||||
const FermionField &in, FermionField &out, int dag);
|
const FermionField &in, FermionField &out, int dag);
|
||||||
void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U,
|
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||||
const FermionField &in, FermionField &out, int dag);
|
const FermionField &in, FermionField &out, int dag);
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////
|
||||||
@ -152,6 +152,9 @@ public:
|
|||||||
DoubledGaugeField UmuEven;
|
DoubledGaugeField UmuEven;
|
||||||
DoubledGaugeField UmuOdd;
|
DoubledGaugeField UmuOdd;
|
||||||
|
|
||||||
|
LebesgueOrder Lebesgue;
|
||||||
|
LebesgueOrder LebesgueEvenOdd;
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Conserved current utilities
|
// Conserved current utilities
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
|
@ -94,8 +94,8 @@ protected:
|
|||||||
RealD R;
|
RealD R;
|
||||||
RealD amax;
|
RealD amax;
|
||||||
RealD scale;
|
RealD scale;
|
||||||
std::vector<double> p;
|
Vector<double> p;
|
||||||
std::vector<double> q;
|
Vector<double> q;
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -35,7 +35,7 @@ template<class Matrix, class Field>
|
|||||||
class KappaSimilarityTransform {
|
class KappaSimilarityTransform {
|
||||||
public:
|
public:
|
||||||
INHERIT_IMPL_TYPES(Matrix);
|
INHERIT_IMPL_TYPES(Matrix);
|
||||||
std::vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag;
|
Vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag;
|
||||||
|
|
||||||
KappaSimilarityTransform (Matrix &zmob) {
|
KappaSimilarityTransform (Matrix &zmob) {
|
||||||
for (int i=0;i<(int)zmob.bs.size();i++) {
|
for (int i=0;i<(int)zmob.bs.size();i++) {
|
||||||
|
@ -49,10 +49,10 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
void DhopImproved(StencilImpl &st,
|
void DhopImproved(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
|
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
|
||||||
void DhopNaive(StencilImpl &st,
|
void DhopNaive(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
|
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
|
||||||
|
|
||||||
|
@ -47,7 +47,7 @@ public:
|
|||||||
static int PartialCompressionFactor(GridBase *grid) { return 1;}
|
static int PartialCompressionFactor(GridBase *grid) { return 1;}
|
||||||
#endif
|
#endif
|
||||||
template<class vobj,class cobj,class compressor>
|
template<class vobj,class cobj,class compressor>
|
||||||
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
|
static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
|
||||||
const Lattice<vobj> &rhs,
|
const Lattice<vobj> &rhs,
|
||||||
cobj *buffer,
|
cobj *buffer,
|
||||||
compressor &compress,
|
compressor &compress,
|
||||||
@ -109,7 +109,7 @@ public:
|
|||||||
// Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2.
|
// Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2.
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
template<class vobj,class cobj,class compressor>
|
template<class vobj,class cobj,class compressor>
|
||||||
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
||||||
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
|
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
|
||||||
compressor &compress,int type,int partial)
|
compressor &compress,int type,int partial)
|
||||||
{
|
{
|
||||||
@ -197,7 +197,7 @@ public:
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
template<class vobj,class cobj,class compressor>
|
template<class vobj,class cobj,class compressor>
|
||||||
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
|
static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
|
||||||
const Lattice<vobj> &rhs,
|
const Lattice<vobj> &rhs,
|
||||||
cobj *buffer,
|
cobj *buffer,
|
||||||
compressor &compress,
|
compressor &compress,
|
||||||
@ -208,7 +208,7 @@ public:
|
|||||||
else FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
|
else FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
|
||||||
}
|
}
|
||||||
template<class vobj,class cobj,class compressor>
|
template<class vobj,class cobj,class compressor>
|
||||||
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
||||||
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
|
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
|
||||||
compressor &compress,int type,int partial)
|
compressor &compress,int type,int partial)
|
||||||
{
|
{
|
||||||
@ -402,6 +402,7 @@ public:
|
|||||||
|
|
||||||
typedef CartesianStencil<vobj,cobj,Parameters> Base;
|
typedef CartesianStencil<vobj,cobj,Parameters> Base;
|
||||||
typedef typename Base::View_type View_type;
|
typedef typename Base::View_type View_type;
|
||||||
|
typedef typename Base::StencilVector StencilVector;
|
||||||
|
|
||||||
// Vector<int> surface_list;
|
// Vector<int> surface_list;
|
||||||
WilsonStencil(GridBase *grid,
|
WilsonStencil(GridBase *grid,
|
||||||
|
@ -126,16 +126,13 @@ public:
|
|||||||
void DerivInternal(StencilImpl &st, DoubledGaugeField &U, GaugeField &mat,
|
void DerivInternal(StencilImpl &st, DoubledGaugeField &U, GaugeField &mat,
|
||||||
const FermionField &A, const FermionField &B, int dag);
|
const FermionField &A, const FermionField &B, int dag);
|
||||||
|
|
||||||
void DhopInternal(StencilImpl &st,
|
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||||
DoubledGaugeField &U,
|
|
||||||
const FermionField &in, FermionField &out, int dag);
|
const FermionField &in, FermionField &out, int dag);
|
||||||
|
|
||||||
void DhopInternalSerial(StencilImpl &st,
|
void DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||||
DoubledGaugeField &U,
|
|
||||||
const FermionField &in, FermionField &out, int dag);
|
const FermionField &in, FermionField &out, int dag);
|
||||||
|
|
||||||
void DhopInternalOverlappedComms(StencilImpl &st,
|
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||||
DoubledGaugeField &U,
|
|
||||||
const FermionField &in, FermionField &out, int dag);
|
const FermionField &in, FermionField &out, int dag);
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
@ -171,6 +168,9 @@ public:
|
|||||||
DoubledGaugeField UmuEven;
|
DoubledGaugeField UmuEven;
|
||||||
DoubledGaugeField UmuOdd;
|
DoubledGaugeField UmuOdd;
|
||||||
|
|
||||||
|
LebesgueOrder Lebesgue;
|
||||||
|
LebesgueOrder LebesgueEvenOdd;
|
||||||
|
|
||||||
WilsonAnisotropyCoefficients anisotropyCoeff;
|
WilsonAnisotropyCoefficients anisotropyCoeff;
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
|
@ -135,18 +135,21 @@ public:
|
|||||||
int dag);
|
int dag);
|
||||||
|
|
||||||
void DhopInternal(StencilImpl & st,
|
void DhopInternal(StencilImpl & st,
|
||||||
|
LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out,
|
FermionField &out,
|
||||||
int dag);
|
int dag);
|
||||||
|
|
||||||
void DhopInternalOverlappedComms(StencilImpl & st,
|
void DhopInternalOverlappedComms(StencilImpl & st,
|
||||||
|
LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out,
|
FermionField &out,
|
||||||
int dag);
|
int dag);
|
||||||
|
|
||||||
void DhopInternalSerialComms(StencilImpl & st,
|
void DhopInternalSerialComms(StencilImpl & st,
|
||||||
|
LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out,
|
FermionField &out,
|
||||||
@ -200,6 +203,9 @@ public:
|
|||||||
DoubledGaugeField UmuEven;
|
DoubledGaugeField UmuEven;
|
||||||
DoubledGaugeField UmuOdd;
|
DoubledGaugeField UmuOdd;
|
||||||
|
|
||||||
|
LebesgueOrder Lebesgue;
|
||||||
|
LebesgueOrder LebesgueEvenOdd;
|
||||||
|
|
||||||
// Comms buffer
|
// Comms buffer
|
||||||
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
||||||
|
|
||||||
|
@ -58,7 +58,7 @@ public:
|
|||||||
{
|
{
|
||||||
// RealD eps = 1.0;
|
// RealD eps = 1.0;
|
||||||
std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
|
std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
|
||||||
std::vector<Coeff_t> zgamma(this->Ls);
|
Vector<Coeff_t> zgamma(this->Ls);
|
||||||
for(int s=0;s<this->Ls;s++){
|
for(int s=0;s<this->Ls;s++){
|
||||||
zgamma[s] = gamma[s];
|
zgamma[s] = gamma[s];
|
||||||
}
|
}
|
||||||
|
@ -156,18 +156,18 @@ template<class Impl>
|
|||||||
void CayleyFermion5D<Impl>::M5D (const FermionField &psi, FermionField &chi)
|
void CayleyFermion5D<Impl>::M5D (const FermionField &psi, FermionField &chi)
|
||||||
{
|
{
|
||||||
int Ls=this->Ls;
|
int Ls=this->Ls;
|
||||||
std::vector<Coeff_t> diag (Ls,1.0);
|
Vector<Coeff_t> diag (Ls,1.0);
|
||||||
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus;
|
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus;
|
||||||
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] =mass_plus;
|
Vector<Coeff_t> lower(Ls,-1.0); lower[0] =mass_plus;
|
||||||
M5D(psi,chi,chi,lower,diag,upper);
|
M5D(psi,chi,chi,lower,diag,upper);
|
||||||
}
|
}
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void CayleyFermion5D<Impl>::Meooe5D (const FermionField &psi, FermionField &Din)
|
void CayleyFermion5D<Impl>::Meooe5D (const FermionField &psi, FermionField &Din)
|
||||||
{
|
{
|
||||||
int Ls=this->Ls;
|
int Ls=this->Ls;
|
||||||
std::vector<Coeff_t> diag = bs;
|
Vector<Coeff_t> diag = bs;
|
||||||
std::vector<Coeff_t> upper= cs;
|
Vector<Coeff_t> upper= cs;
|
||||||
std::vector<Coeff_t> lower= cs;
|
Vector<Coeff_t> lower= cs;
|
||||||
upper[Ls-1]=-mass_minus*upper[Ls-1];
|
upper[Ls-1]=-mass_minus*upper[Ls-1];
|
||||||
lower[0] =-mass_plus*lower[0];
|
lower[0] =-mass_plus*lower[0];
|
||||||
M5D(psi,psi,Din,lower,diag,upper);
|
M5D(psi,psi,Din,lower,diag,upper);
|
||||||
@ -176,9 +176,9 @@ void CayleyFermion5D<Impl>::Meooe5D (const FermionField &psi, FermionField &D
|
|||||||
template<class Impl> void CayleyFermion5D<Impl>::Meo5D (const FermionField &psi, FermionField &chi)
|
template<class Impl> void CayleyFermion5D<Impl>::Meo5D (const FermionField &psi, FermionField &chi)
|
||||||
{
|
{
|
||||||
int Ls=this->Ls;
|
int Ls=this->Ls;
|
||||||
std::vector<Coeff_t> diag = beo;
|
Vector<Coeff_t> diag = beo;
|
||||||
std::vector<Coeff_t> upper(Ls);
|
Vector<Coeff_t> upper(Ls);
|
||||||
std::vector<Coeff_t> lower(Ls);
|
Vector<Coeff_t> lower(Ls);
|
||||||
for(int i=0;i<Ls;i++) {
|
for(int i=0;i<Ls;i++) {
|
||||||
upper[i]=-ceo[i];
|
upper[i]=-ceo[i];
|
||||||
lower[i]=-ceo[i];
|
lower[i]=-ceo[i];
|
||||||
@ -191,9 +191,9 @@ template<class Impl>
|
|||||||
void CayleyFermion5D<Impl>::Mooee (const FermionField &psi, FermionField &chi)
|
void CayleyFermion5D<Impl>::Mooee (const FermionField &psi, FermionField &chi)
|
||||||
{
|
{
|
||||||
int Ls=this->Ls;
|
int Ls=this->Ls;
|
||||||
std::vector<Coeff_t> diag = bee;
|
Vector<Coeff_t> diag = bee;
|
||||||
std::vector<Coeff_t> upper(Ls);
|
Vector<Coeff_t> upper(Ls);
|
||||||
std::vector<Coeff_t> lower(Ls);
|
Vector<Coeff_t> lower(Ls);
|
||||||
for(int i=0;i<Ls;i++) {
|
for(int i=0;i<Ls;i++) {
|
||||||
upper[i]=-cee[i];
|
upper[i]=-cee[i];
|
||||||
lower[i]=-cee[i];
|
lower[i]=-cee[i];
|
||||||
@ -206,9 +206,9 @@ template<class Impl>
|
|||||||
void CayleyFermion5D<Impl>::MooeeDag (const FermionField &psi, FermionField &chi)
|
void CayleyFermion5D<Impl>::MooeeDag (const FermionField &psi, FermionField &chi)
|
||||||
{
|
{
|
||||||
int Ls=this->Ls;
|
int Ls=this->Ls;
|
||||||
std::vector<Coeff_t> diag = bee;
|
Vector<Coeff_t> diag = bee;
|
||||||
std::vector<Coeff_t> upper(Ls);
|
Vector<Coeff_t> upper(Ls);
|
||||||
std::vector<Coeff_t> lower(Ls);
|
Vector<Coeff_t> lower(Ls);
|
||||||
|
|
||||||
for (int s=0;s<Ls;s++){
|
for (int s=0;s<Ls;s++){
|
||||||
// Assemble the 5d matrix
|
// Assemble the 5d matrix
|
||||||
@ -236,9 +236,9 @@ template<class Impl>
|
|||||||
void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
|
void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
|
||||||
{
|
{
|
||||||
int Ls=this->Ls;
|
int Ls=this->Ls;
|
||||||
std::vector<Coeff_t> diag(Ls,1.0);
|
Vector<Coeff_t> diag(Ls,1.0);
|
||||||
std::vector<Coeff_t> upper(Ls,-1.0);
|
Vector<Coeff_t> upper(Ls,-1.0);
|
||||||
std::vector<Coeff_t> lower(Ls,-1.0);
|
Vector<Coeff_t> lower(Ls,-1.0);
|
||||||
upper[Ls-1]=-mass_plus*upper[Ls-1];
|
upper[Ls-1]=-mass_plus*upper[Ls-1];
|
||||||
lower[0] =-mass_minus*lower[0];
|
lower[0] =-mass_minus*lower[0];
|
||||||
M5Ddag(psi,chi,chi,lower,diag,upper);
|
M5Ddag(psi,chi,chi,lower,diag,upper);
|
||||||
@ -248,9 +248,9 @@ template<class Impl>
|
|||||||
void CayleyFermion5D<Impl>::MeooeDag5D (const FermionField &psi, FermionField &Din)
|
void CayleyFermion5D<Impl>::MeooeDag5D (const FermionField &psi, FermionField &Din)
|
||||||
{
|
{
|
||||||
int Ls=this->Ls;
|
int Ls=this->Ls;
|
||||||
std::vector<Coeff_t> diag =bs;
|
Vector<Coeff_t> diag =bs;
|
||||||
std::vector<Coeff_t> upper=cs;
|
Vector<Coeff_t> upper=cs;
|
||||||
std::vector<Coeff_t> lower=cs;
|
Vector<Coeff_t> lower=cs;
|
||||||
|
|
||||||
for (int s=0;s<Ls;s++){
|
for (int s=0;s<Ls;s++){
|
||||||
if ( s== 0 ) {
|
if ( s== 0 ) {
|
||||||
@ -394,7 +394,7 @@ void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const
|
|||||||
template<class Impl>
|
template<class Impl>
|
||||||
void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
|
void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
|
||||||
{
|
{
|
||||||
std::vector<Coeff_t> gamma(this->Ls);
|
Vector<Coeff_t> gamma(this->Ls);
|
||||||
for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
|
for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
|
||||||
SetCoefficientsInternal(1.0,gamma,b,c);
|
SetCoefficientsInternal(1.0,gamma,b,c);
|
||||||
}
|
}
|
||||||
@ -402,13 +402,13 @@ void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,Re
|
|||||||
template<class Impl>
|
template<class Impl>
|
||||||
void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
|
void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
|
||||||
{
|
{
|
||||||
std::vector<Coeff_t> gamma(this->Ls);
|
Vector<Coeff_t> gamma(this->Ls);
|
||||||
for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
|
for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
|
||||||
SetCoefficientsInternal(zolo_hi,gamma,b,c);
|
SetCoefficientsInternal(zolo_hi,gamma,b,c);
|
||||||
}
|
}
|
||||||
//Zolo
|
//Zolo
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c)
|
void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t> & gamma,RealD b,RealD c)
|
||||||
{
|
{
|
||||||
int Ls=this->Ls;
|
int Ls=this->Ls;
|
||||||
|
|
||||||
|
@ -43,9 +43,9 @@ void
|
|||||||
CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
|
CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
|
||||||
const FermionField &phi_i,
|
const FermionField &phi_i,
|
||||||
FermionField &chi_i,
|
FermionField &chi_i,
|
||||||
std::vector<Coeff_t> &lower,
|
Vector<Coeff_t> &lower,
|
||||||
std::vector<Coeff_t> &diag,
|
Vector<Coeff_t> &diag,
|
||||||
std::vector<Coeff_t> &upper)
|
Vector<Coeff_t> &upper)
|
||||||
{
|
{
|
||||||
|
|
||||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||||
@ -55,16 +55,12 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
|
|||||||
autoView(chi , chi_i,AcceleratorWrite);
|
autoView(chi , chi_i,AcceleratorWrite);
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
|
|
||||||
|
auto pdiag = &diag[0];
|
||||||
|
auto pupper = &upper[0];
|
||||||
|
auto plower = &lower[0];
|
||||||
|
|
||||||
int Ls =this->Ls;
|
int Ls =this->Ls;
|
||||||
|
|
||||||
static deviceVector<Coeff_t> d_diag(Ls) ; acceleratorCopyToDevice(&diag[0] ,&d_diag[0],Ls*sizeof(Coeff_t));
|
|
||||||
static deviceVector<Coeff_t> d_upper(Ls); acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
|
|
||||||
static deviceVector<Coeff_t> d_lower(Ls); acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
|
|
||||||
|
|
||||||
auto pdiag = &d_diag[0];
|
|
||||||
auto pupper = &d_upper[0];
|
|
||||||
auto plower = &d_lower[0];
|
|
||||||
|
|
||||||
// 10 = 3 complex mult + 2 complex add
|
// 10 = 3 complex mult + 2 complex add
|
||||||
// Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)
|
// Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)
|
||||||
uint64_t nloop = grid->oSites();
|
uint64_t nloop = grid->oSites();
|
||||||
@ -86,9 +82,9 @@ void
|
|||||||
CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
|
CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
|
||||||
const FermionField &phi_i,
|
const FermionField &phi_i,
|
||||||
FermionField &chi_i,
|
FermionField &chi_i,
|
||||||
std::vector<Coeff_t> &lower,
|
Vector<Coeff_t> &lower,
|
||||||
std::vector<Coeff_t> &diag,
|
Vector<Coeff_t> &diag,
|
||||||
std::vector<Coeff_t> &upper)
|
Vector<Coeff_t> &upper)
|
||||||
{
|
{
|
||||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||||
GridBase *grid=psi_i.Grid();
|
GridBase *grid=psi_i.Grid();
|
||||||
@ -97,16 +93,12 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
|
|||||||
autoView(chi , chi_i,AcceleratorWrite);
|
autoView(chi , chi_i,AcceleratorWrite);
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
|
|
||||||
|
auto pdiag = &diag[0];
|
||||||
|
auto pupper = &upper[0];
|
||||||
|
auto plower = &lower[0];
|
||||||
|
|
||||||
int Ls=this->Ls;
|
int Ls=this->Ls;
|
||||||
|
|
||||||
static deviceVector<Coeff_t> d_diag(Ls) ; acceleratorCopyToDevice(&diag[0] ,&d_diag[0],Ls*sizeof(Coeff_t));
|
|
||||||
static deviceVector<Coeff_t> d_upper(Ls); acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
|
|
||||||
static deviceVector<Coeff_t> d_lower(Ls); acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
|
|
||||||
|
|
||||||
auto pdiag = &d_diag[0];
|
|
||||||
auto pupper = &d_upper[0];
|
|
||||||
auto plower = &d_lower[0];
|
|
||||||
|
|
||||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||||
uint64_t nloop = grid->oSites();
|
uint64_t nloop = grid->oSites();
|
||||||
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
||||||
@ -134,17 +126,11 @@ CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi_i, FermionField &chi
|
|||||||
|
|
||||||
int Ls=this->Ls;
|
int Ls=this->Ls;
|
||||||
|
|
||||||
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
|
auto plee = & lee [0];
|
||||||
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
|
auto pdee = & dee [0];
|
||||||
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
|
auto puee = & uee [0];
|
||||||
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
|
auto pleem = & leem[0];
|
||||||
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
|
auto pueem = & ueem[0];
|
||||||
|
|
||||||
auto plee = & d_lee [0];
|
|
||||||
auto pdee = & d_dee [0];
|
|
||||||
auto puee = & d_uee [0];
|
|
||||||
auto pleem = & d_leem[0];
|
|
||||||
auto pueem = & d_ueem[0];
|
|
||||||
|
|
||||||
uint64_t nloop = grid->oSites()/Ls;
|
uint64_t nloop = grid->oSites()/Ls;
|
||||||
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
||||||
@ -196,17 +182,11 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
|
|||||||
autoView(psi , psi_i,AcceleratorRead);
|
autoView(psi , psi_i,AcceleratorRead);
|
||||||
autoView(chi , chi_i,AcceleratorWrite);
|
autoView(chi , chi_i,AcceleratorWrite);
|
||||||
|
|
||||||
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
|
auto plee = & lee [0];
|
||||||
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
|
auto pdee = & dee [0];
|
||||||
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
|
auto puee = & uee [0];
|
||||||
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
|
auto pleem = & leem[0];
|
||||||
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
|
auto pueem = & ueem[0];
|
||||||
|
|
||||||
auto plee = & d_lee [0];
|
|
||||||
auto pdee = & d_dee [0];
|
|
||||||
auto puee = & d_uee [0];
|
|
||||||
auto pleem = & d_leem[0];
|
|
||||||
auto pueem = & d_ueem[0];
|
|
||||||
|
|
||||||
assert(psi.Checkerboard() == psi.Checkerboard());
|
assert(psi.Checkerboard() == psi.Checkerboard());
|
||||||
|
|
||||||
|
@ -1,5 +1,3 @@
|
|||||||
#if 0
|
|
||||||
|
|
||||||
/*************************************************************************************
|
/*************************************************************************************
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
@ -820,5 +818,3 @@ CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,
|
|||||||
}
|
}
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
#endif
|
|
@ -41,7 +41,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
// Pplus backwards..
|
// Pplus backwards..
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i,
|
void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i,
|
||||||
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
|
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
|
||||||
{
|
{
|
||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
@ -50,15 +50,9 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
|
|||||||
autoView( psi , psi_i, AcceleratorRead);
|
autoView( psi , psi_i, AcceleratorRead);
|
||||||
autoView( chi , chi_i, AcceleratorWrite);
|
autoView( chi , chi_i, AcceleratorWrite);
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
|
auto pdiag = &diag[0];
|
||||||
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t));
|
auto pupper = &upper[0];
|
||||||
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
|
auto plower = &lower[0];
|
||||||
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
|
|
||||||
|
|
||||||
auto pdiag = &d_diag[0];
|
|
||||||
auto pupper = &d_upper[0];
|
|
||||||
auto plower = &d_lower[0];
|
|
||||||
|
|
||||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||||
|
|
||||||
auto nloop=grid->oSites()/Ls;
|
auto nloop=grid->oSites()/Ls;
|
||||||
@ -79,7 +73,7 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
|
|||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i,
|
void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i,
|
||||||
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
|
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
|
||||||
{
|
{
|
||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase* grid = psi_i.Grid();
|
GridBase* grid = psi_i.Grid();
|
||||||
@ -89,14 +83,9 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
|
|||||||
autoView( phi , phi_i, AcceleratorRead);
|
autoView( phi , phi_i, AcceleratorRead);
|
||||||
autoView( chi , chi_i, AcceleratorWrite);
|
autoView( chi , chi_i, AcceleratorWrite);
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
|
auto pdiag = &diag[0];
|
||||||
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t));
|
auto pupper = &upper[0];
|
||||||
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
|
auto plower = &lower[0];
|
||||||
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
|
|
||||||
|
|
||||||
auto pdiag = &d_diag[0];
|
|
||||||
auto pupper = &d_upper[0];
|
|
||||||
auto plower = &d_lower[0];
|
|
||||||
|
|
||||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||||
|
|
||||||
@ -125,17 +114,12 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
|
|||||||
autoView( chi, chi_i, AcceleratorWrite);
|
autoView( chi, chi_i, AcceleratorWrite);
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
|
|
||||||
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
|
auto plee = & this->lee[0];
|
||||||
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
|
auto pdee = & this->dee[0];
|
||||||
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
|
auto puee = & this->uee[0];
|
||||||
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
|
|
||||||
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
|
|
||||||
|
|
||||||
auto plee = & d_lee [0];
|
auto pleem = & this->leem[0];
|
||||||
auto pdee = & d_dee [0];
|
auto pueem = & this->ueem[0];
|
||||||
auto puee = & d_uee [0];
|
|
||||||
auto pleem = & d_leem[0];
|
|
||||||
auto pueem = & d_ueem[0];
|
|
||||||
|
|
||||||
uint64_t nloop=grid->oSites()/Ls;
|
uint64_t nloop=grid->oSites()/Ls;
|
||||||
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
||||||
|
@ -131,9 +131,9 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi
|
|||||||
else{ shiftm = -shift*(mq3-mq2); }
|
else{ shiftm = -shift*(mq3-mq2); }
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<Coeff_t> diag(Ls,1.0);
|
Vector<Coeff_t> diag(Ls,1.0);
|
||||||
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm;
|
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm;
|
||||||
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftp;
|
Vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftp;
|
||||||
|
|
||||||
#if(0)
|
#if(0)
|
||||||
std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl;
|
std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl;
|
||||||
@ -168,9 +168,9 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField&
|
|||||||
else{ shiftm = -shift*(mq3-mq2); }
|
else{ shiftm = -shift*(mq3-mq2); }
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<Coeff_t> diag(Ls,1.0);
|
Vector<Coeff_t> diag(Ls,1.0);
|
||||||
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp;
|
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp;
|
||||||
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftm;
|
Vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftm;
|
||||||
|
|
||||||
this->M5Ddag(psi, chi, chi, lower, diag, upper);
|
this->M5Ddag(psi, chi, chi, lower, diag, upper);
|
||||||
}
|
}
|
||||||
@ -181,9 +181,9 @@ void DomainWallEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& c
|
|||||||
{
|
{
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
|
|
||||||
std::vector<Coeff_t> diag = this->bee;
|
Vector<Coeff_t> diag = this->bee;
|
||||||
std::vector<Coeff_t> upper(Ls);
|
Vector<Coeff_t> upper(Ls);
|
||||||
std::vector<Coeff_t> lower(Ls);
|
Vector<Coeff_t> lower(Ls);
|
||||||
|
|
||||||
for(int s=0; s<Ls; s++){
|
for(int s=0; s<Ls; s++){
|
||||||
upper[s] = -this->cee[s];
|
upper[s] = -this->cee[s];
|
||||||
@ -200,9 +200,9 @@ void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField
|
|||||||
{
|
{
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
|
|
||||||
std::vector<Coeff_t> diag = this->bee;
|
Vector<Coeff_t> diag = this->bee;
|
||||||
std::vector<Coeff_t> upper(Ls);
|
Vector<Coeff_t> upper(Ls);
|
||||||
std::vector<Coeff_t> lower(Ls);
|
Vector<Coeff_t> lower(Ls);
|
||||||
|
|
||||||
for(int s=0; s<Ls; s++){
|
for(int s=0; s<Ls; s++){
|
||||||
upper[s] = -this->cee[s];
|
upper[s] = -this->cee[s];
|
||||||
@ -218,7 +218,7 @@ void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField
|
|||||||
|
|
||||||
//Zolo
|
//Zolo
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c)
|
void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c)
|
||||||
{
|
{
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
int pm = this->pm;
|
int pm = this->pm;
|
||||||
|
@ -61,6 +61,8 @@ ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian
|
|||||||
UUUmu(&FourDimGrid),
|
UUUmu(&FourDimGrid),
|
||||||
UUUmuEven(&FourDimRedBlackGrid),
|
UUUmuEven(&FourDimRedBlackGrid),
|
||||||
UUUmuOdd(&FourDimRedBlackGrid),
|
UUUmuOdd(&FourDimRedBlackGrid),
|
||||||
|
Lebesgue(&FourDimGrid),
|
||||||
|
LebesgueEvenOdd(&FourDimRedBlackGrid),
|
||||||
_tmp(&FiveDimRedBlackGrid)
|
_tmp(&FiveDimRedBlackGrid)
|
||||||
{
|
{
|
||||||
|
|
||||||
@ -275,18 +277,18 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
|
|||||||
|
|
||||||
/*CHANGE */
|
/*CHANGE */
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st,
|
void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||||
const FermionField &in, FermionField &out,int dag)
|
const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
||||||
DhopInternalOverlappedComms(st,U,UUU,in,out,dag);
|
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
|
||||||
else
|
else
|
||||||
DhopInternalSerialComms(st,U,UUU,in,out,dag);
|
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
|
void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||||
const FermionField &in, FermionField &out,int dag)
|
const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
@ -311,7 +313,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
|
|||||||
{
|
{
|
||||||
int interior=1;
|
int interior=1;
|
||||||
int exterior=0;
|
int exterior=0;
|
||||||
Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
|
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
||||||
}
|
}
|
||||||
|
|
||||||
st.CommsMerge(compressor);
|
st.CommsMerge(compressor);
|
||||||
@ -321,12 +323,12 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
|
|||||||
{
|
{
|
||||||
int interior=0;
|
int interior=0;
|
||||||
int exterior=1;
|
int exterior=1;
|
||||||
Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
|
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
|
void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||||
const FermionField &in, FermionField &out,int dag)
|
const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
@ -339,7 +341,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
|
|||||||
{
|
{
|
||||||
int interior=1;
|
int interior=1;
|
||||||
int exterior=1;
|
int exterior=1;
|
||||||
Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
|
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/*CHANGE END*/
|
/*CHANGE END*/
|
||||||
@ -355,7 +357,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie
|
|||||||
assert(in.Checkerboard()==Even);
|
assert(in.Checkerboard()==Even);
|
||||||
out.Checkerboard() = Odd;
|
out.Checkerboard() = Odd;
|
||||||
|
|
||||||
DhopInternal(StencilEven,UmuOdd,UUUmuOdd,in,out,dag);
|
DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,UUUmuOdd,in,out,dag);
|
||||||
}
|
}
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
|
void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
|
||||||
@ -366,7 +368,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie
|
|||||||
assert(in.Checkerboard()==Odd);
|
assert(in.Checkerboard()==Odd);
|
||||||
out.Checkerboard() = Even;
|
out.Checkerboard() = Even;
|
||||||
|
|
||||||
DhopInternal(StencilOdd,UmuEven,UUUmuEven,in,out,dag);
|
DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,UUUmuEven,in,out,dag);
|
||||||
}
|
}
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
|
void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
|
||||||
@ -376,7 +378,7 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField
|
|||||||
|
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
|
||||||
DhopInternal(Stencil,Umu,UUUmu,in,out,dag);
|
DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////
|
||||||
|
@ -48,6 +48,8 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, G
|
|||||||
StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even
|
StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even
|
||||||
StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd
|
StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd
|
||||||
mass(_mass),
|
mass(_mass),
|
||||||
|
Lebesgue(_grid),
|
||||||
|
LebesgueEvenOdd(_cbgrid),
|
||||||
Umu(&Fgrid),
|
Umu(&Fgrid),
|
||||||
UmuEven(&Hgrid),
|
UmuEven(&Hgrid),
|
||||||
UmuOdd(&Hgrid),
|
UmuOdd(&Hgrid),
|
||||||
@ -337,7 +339,7 @@ void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &
|
|||||||
|
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
|
||||||
DhopInternal(Stencil, Umu, UUUmu, in, out, dag);
|
DhopInternal(Stencil, Lebesgue, Umu, UUUmu, in, out, dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@ -349,7 +351,7 @@ void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField
|
|||||||
assert(in.Checkerboard() == Even);
|
assert(in.Checkerboard() == Even);
|
||||||
out.Checkerboard() = Odd;
|
out.Checkerboard() = Odd;
|
||||||
|
|
||||||
DhopInternal(StencilEven, UmuOdd, UUUmuOdd, in, out, dag);
|
DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, UUUmuOdd, in, out, dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@ -361,7 +363,7 @@ void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField
|
|||||||
assert(in.Checkerboard() == Odd);
|
assert(in.Checkerboard() == Odd);
|
||||||
out.Checkerboard() = Even;
|
out.Checkerboard() = Even;
|
||||||
|
|
||||||
DhopInternal(StencilOdd, UmuEven, UUUmuEven, in, out, dag);
|
DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, UUUmuEven, in, out, dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@ -392,19 +394,19 @@ void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionFiel
|
|||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st,
|
void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
DoubledGaugeField &UUU,
|
DoubledGaugeField &UUU,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out, int dag)
|
FermionField &out, int dag)
|
||||||
{
|
{
|
||||||
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
||||||
DhopInternalOverlappedComms(st,U,UUU,in,out,dag);
|
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
|
||||||
else
|
else
|
||||||
DhopInternalSerialComms(st,U,UUU,in,out,dag);
|
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
|
||||||
}
|
}
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,
|
void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
DoubledGaugeField &UUU,
|
DoubledGaugeField &UUU,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
@ -427,7 +429,7 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
|
|||||||
{
|
{
|
||||||
int interior=1;
|
int interior=1;
|
||||||
int exterior=0;
|
int exterior=0;
|
||||||
Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
|
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
||||||
}
|
}
|
||||||
|
|
||||||
st.CommunicateComplete(requests);
|
st.CommunicateComplete(requests);
|
||||||
@ -438,13 +440,13 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
|
|||||||
{
|
{
|
||||||
int interior=0;
|
int interior=0;
|
||||||
int exterior=1;
|
int exterior=1;
|
||||||
Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
|
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st,
|
void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
DoubledGaugeField &UUU,
|
DoubledGaugeField &UUU,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
@ -458,7 +460,7 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st,
|
|||||||
{
|
{
|
||||||
int interior=1;
|
int interior=1;
|
||||||
int exterior=1;
|
int exterior=1;
|
||||||
Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
|
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -39,7 +39,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
|
void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
|
||||||
std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
|
Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper)
|
||||||
{
|
{
|
||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
@ -50,13 +50,9 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
|
|||||||
|
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
|
|
||||||
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t));
|
auto pdiag = &diag[0];
|
||||||
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
|
auto pupper = &upper[0];
|
||||||
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
|
auto plower = &lower[0];
|
||||||
|
|
||||||
auto pdiag = &d_diag[0];
|
|
||||||
auto pupper = &d_upper[0];
|
|
||||||
auto plower = &d_lower[0];
|
|
||||||
|
|
||||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||||
int nloop = grid->oSites()/Ls;
|
int nloop = grid->oSites()/Ls;
|
||||||
@ -78,8 +74,8 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
|
|||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
|
void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
|
||||||
std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
|
Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper,
|
||||||
std::vector<Coeff_t> &shift_coeffs)
|
Vector<Coeff_t> &shift_coeffs)
|
||||||
{
|
{
|
||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
@ -93,15 +89,10 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
|
|||||||
|
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
|
|
||||||
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t));
|
auto pdiag = &diag[0];
|
||||||
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
|
auto pupper = &upper[0];
|
||||||
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
|
auto plower = &lower[0];
|
||||||
static deviceVector<Coeff_t> d_shift_coeffs(Ls);acceleratorCopyToDevice(&shift_coeffs[0],&d_shift_coeffs[0],Ls*sizeof(Coeff_t));
|
auto pshift_coeffs = &shift_coeffs[0];
|
||||||
|
|
||||||
auto pdiag = &d_diag[0];
|
|
||||||
auto pupper = &d_upper[0];
|
|
||||||
auto plower = &d_lower[0];
|
|
||||||
auto pshift_coeffs = &d_shift_coeffs[0];
|
|
||||||
|
|
||||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||||
int nloop = grid->oSites()/Ls;
|
int nloop = grid->oSites()/Ls;
|
||||||
@ -128,7 +119,7 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
|
|||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
|
void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
|
||||||
std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
|
Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper)
|
||||||
{
|
{
|
||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
@ -139,13 +130,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
|
|||||||
|
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
|
|
||||||
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t));
|
auto pdiag = &diag[0];
|
||||||
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
|
auto pupper = &upper[0];
|
||||||
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
|
auto plower = &lower[0];
|
||||||
|
|
||||||
auto pdiag = &d_diag[0];
|
|
||||||
auto pupper = &d_upper[0];
|
|
||||||
auto plower = &d_lower[0];
|
|
||||||
|
|
||||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||||
int nloop = grid->oSites()/Ls;
|
int nloop = grid->oSites()/Ls;
|
||||||
@ -167,8 +154,8 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
|
|||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
|
void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
|
||||||
std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
|
Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper,
|
||||||
std::vector<Coeff_t> &shift_coeffs)
|
Vector<Coeff_t> &shift_coeffs)
|
||||||
{
|
{
|
||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
@ -180,15 +167,10 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
|
|||||||
|
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
|
|
||||||
static deviceVector<Coeff_t> d_diag(Ls); acceleratorCopyToDevice(&diag[0],&d_diag[0],Ls*sizeof(Coeff_t));
|
auto pdiag = &diag[0];
|
||||||
static deviceVector<Coeff_t> d_upper(Ls);acceleratorCopyToDevice(&upper[0],&d_upper[0],Ls*sizeof(Coeff_t));
|
auto pupper = &upper[0];
|
||||||
static deviceVector<Coeff_t> d_lower(Ls);acceleratorCopyToDevice(&lower[0],&d_lower[0],Ls*sizeof(Coeff_t));
|
auto plower = &lower[0];
|
||||||
static deviceVector<Coeff_t> d_shift_coeffs(Ls);acceleratorCopyToDevice(&shift_coeffs[0],&d_shift_coeffs[0],Ls*sizeof(Coeff_t));
|
auto pshift_coeffs = &shift_coeffs[0];
|
||||||
|
|
||||||
auto pdiag = &d_diag[0];
|
|
||||||
auto pupper = &d_upper[0];
|
|
||||||
auto plower = &d_lower[0];
|
|
||||||
auto pshift_coeffs = &d_shift_coeffs[0];
|
|
||||||
|
|
||||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||||
auto pm = this->pm;
|
auto pm = this->pm;
|
||||||
@ -230,17 +212,11 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
|
|||||||
autoView(psi , psi_i, AcceleratorRead);
|
autoView(psi , psi_i, AcceleratorRead);
|
||||||
autoView(chi , chi_i, AcceleratorWrite);
|
autoView(chi , chi_i, AcceleratorWrite);
|
||||||
|
|
||||||
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
|
auto plee = & this->lee [0];
|
||||||
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
|
auto pdee = & this->dee [0];
|
||||||
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
|
auto puee = & this->uee [0];
|
||||||
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
|
auto pleem= & this->leem[0];
|
||||||
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
|
auto pueem= & this->ueem[0];
|
||||||
|
|
||||||
auto plee = & d_lee [0];
|
|
||||||
auto pdee = & d_dee [0];
|
|
||||||
auto puee = & d_uee [0];
|
|
||||||
auto pleem = & d_leem[0];
|
|
||||||
auto pueem = & d_ueem[0];
|
|
||||||
|
|
||||||
if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; }
|
if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; }
|
||||||
|
|
||||||
@ -292,24 +268,14 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
|
|||||||
autoView(psi , psi_i, AcceleratorRead);
|
autoView(psi , psi_i, AcceleratorRead);
|
||||||
autoView(chi , chi_i, AcceleratorWrite);
|
autoView(chi , chi_i, AcceleratorWrite);
|
||||||
|
|
||||||
// Move into object and constructor
|
|
||||||
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
|
|
||||||
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
|
|
||||||
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
|
|
||||||
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
|
|
||||||
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
|
|
||||||
|
|
||||||
auto pm = this->pm;
|
auto pm = this->pm;
|
||||||
auto plee = & d_lee [0];
|
auto plee = & this->lee [0];
|
||||||
auto pdee = & d_dee [0];
|
auto pdee = & this->dee [0];
|
||||||
auto puee = & d_uee [0];
|
auto puee = & this->uee [0];
|
||||||
auto pleem = & d_leem[0];
|
auto pleem= & this->leem[0];
|
||||||
auto pueem = & d_ueem[0];
|
auto pueem= & this->ueem[0];
|
||||||
|
auto pMooeeInv_shift_lc = &MooeeInv_shift_lc[0];
|
||||||
static deviceVector<Coeff_t> d_MooeeInv_shift_lc(Ls); acceleratorCopyToDevice(&MooeeInv_shift_lc[0],&d_MooeeInv_shift_lc[0],Ls*sizeof(Coeff_t));
|
auto pMooeeInv_shift_norm = &MooeeInv_shift_norm[0];
|
||||||
static deviceVector<Coeff_t> d_MooeeInv_shift_norm(Ls); acceleratorCopyToDevice(&MooeeInv_shift_norm[0],&d_MooeeInv_shift_norm[0],Ls*sizeof(Coeff_t));
|
|
||||||
auto pMooeeInv_shift_lc = &d_MooeeInv_shift_lc[0];
|
|
||||||
auto pMooeeInv_shift_norm = &d_MooeeInv_shift_norm[0];
|
|
||||||
|
|
||||||
int nloop = grid->oSites()/Ls;
|
int nloop = grid->oSites()/Ls;
|
||||||
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
||||||
@ -367,17 +333,11 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
|
|||||||
autoView(psi , psi_i, AcceleratorRead);
|
autoView(psi , psi_i, AcceleratorRead);
|
||||||
autoView(chi , chi_i, AcceleratorWrite);
|
autoView(chi , chi_i, AcceleratorWrite);
|
||||||
|
|
||||||
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
|
auto plee = & this->lee [0];
|
||||||
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
|
auto pdee = & this->dee [0];
|
||||||
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
|
auto puee = & this->uee [0];
|
||||||
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
|
auto pleem= & this->leem[0];
|
||||||
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
|
auto pueem= & this->ueem[0];
|
||||||
|
|
||||||
auto plee = & d_lee [0];
|
|
||||||
auto pdee = & d_dee [0];
|
|
||||||
auto puee = & d_uee [0];
|
|
||||||
auto pleem = & d_leem[0];
|
|
||||||
auto pueem = & d_ueem[0];
|
|
||||||
|
|
||||||
int nloop = grid->oSites()/Ls;
|
int nloop = grid->oSites()/Ls;
|
||||||
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
||||||
@ -426,28 +386,14 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
|
|||||||
autoView(chi , chi_i, AcceleratorWrite);
|
autoView(chi , chi_i, AcceleratorWrite);
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
|
|
||||||
static deviceVector<Coeff_t> d_lee(Ls); acceleratorCopyToDevice(&this->lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
|
|
||||||
static deviceVector<Coeff_t> d_dee(Ls); acceleratorCopyToDevice(&this->dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
|
|
||||||
static deviceVector<Coeff_t> d_uee(Ls); acceleratorCopyToDevice(&this->uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
|
|
||||||
static deviceVector<Coeff_t> d_leem(Ls); acceleratorCopyToDevice(&this->leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
|
|
||||||
static deviceVector<Coeff_t> d_ueem(Ls); acceleratorCopyToDevice(&this->ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
|
|
||||||
|
|
||||||
auto pm = this->pm;
|
auto pm = this->pm;
|
||||||
auto plee = & d_lee [0];
|
auto plee = & this->lee [0];
|
||||||
auto pdee = & d_dee [0];
|
auto pdee = & this->dee [0];
|
||||||
auto puee = & d_uee [0];
|
auto puee = & this->uee [0];
|
||||||
auto pleem = & d_leem[0];
|
auto pleem= & this->leem[0];
|
||||||
auto pueem = & d_ueem[0];
|
auto pueem= & this->ueem[0];
|
||||||
|
auto pMooeeInvDag_shift_lc = &MooeeInvDag_shift_lc[0];
|
||||||
static deviceVector<Coeff_t> d_MooeeInvDag_shift_lc(Ls);
|
auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0];
|
||||||
static deviceVector<Coeff_t> d_MooeeInvDag_shift_norm(Ls);
|
|
||||||
acceleratorCopyToDevice(&MooeeInvDag_shift_lc[0],&d_MooeeInvDag_shift_lc[0],Ls*sizeof(Coeff_t));
|
|
||||||
acceleratorCopyToDevice(&MooeeInvDag_shift_norm[0],&d_MooeeInvDag_shift_norm[0],Ls*sizeof(Coeff_t));
|
|
||||||
auto pMooeeInvDag_shift_lc = &d_MooeeInvDag_shift_lc[0];
|
|
||||||
auto pMooeeInvDag_shift_norm = &d_MooeeInvDag_shift_norm[0];
|
|
||||||
|
|
||||||
// auto pMooeeInvDag_shift_lc = &MooeeInvDag_shift_lc[0];
|
|
||||||
// auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0];
|
|
||||||
|
|
||||||
int nloop = grid->oSites()/Ls;
|
int nloop = grid->oSites()/Ls;
|
||||||
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
||||||
|
@ -196,9 +196,9 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
|
|||||||
{
|
{
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
|
|
||||||
std::vector<Coeff_t> diag(Ls,1.0);
|
Vector<Coeff_t> diag(Ls,1.0);
|
||||||
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
|
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
|
||||||
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;
|
Vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;
|
||||||
|
|
||||||
// no shift term
|
// no shift term
|
||||||
if(this->shift == 0.0){ this->M5D(psi, chi, chi, lower, diag, upper); }
|
if(this->shift == 0.0){ this->M5D(psi, chi, chi, lower, diag, upper); }
|
||||||
@ -212,9 +212,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
|
|||||||
{
|
{
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
|
|
||||||
std::vector<Coeff_t> diag(Ls,1.0);
|
Vector<Coeff_t> diag(Ls,1.0);
|
||||||
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
|
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
|
||||||
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;
|
Vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;
|
||||||
|
|
||||||
// no shift term
|
// no shift term
|
||||||
if(this->shift == 0.0){ this->M5Ddag(psi, chi, chi, lower, diag, upper); }
|
if(this->shift == 0.0){ this->M5Ddag(psi, chi, chi, lower, diag, upper); }
|
||||||
@ -230,9 +230,9 @@ void MobiusEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
|
|||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
|
|
||||||
// coefficients of Mooee
|
// coefficients of Mooee
|
||||||
std::vector<Coeff_t> diag = this->bee;
|
Vector<Coeff_t> diag = this->bee;
|
||||||
std::vector<Coeff_t> upper(Ls);
|
Vector<Coeff_t> upper(Ls);
|
||||||
std::vector<Coeff_t> lower(Ls);
|
Vector<Coeff_t> lower(Ls);
|
||||||
for(int s=0; s<Ls; s++){
|
for(int s=0; s<Ls; s++){
|
||||||
upper[s] = -this->cee[s];
|
upper[s] = -this->cee[s];
|
||||||
lower[s] = -this->cee[s];
|
lower[s] = -this->cee[s];
|
||||||
@ -253,9 +253,9 @@ void MobiusEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& ch
|
|||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
|
|
||||||
// coefficients of MooeeDag
|
// coefficients of MooeeDag
|
||||||
std::vector<Coeff_t> diag = this->bee;
|
Vector<Coeff_t> diag = this->bee;
|
||||||
std::vector<Coeff_t> upper(Ls);
|
Vector<Coeff_t> upper(Ls);
|
||||||
std::vector<Coeff_t> lower(Ls);
|
Vector<Coeff_t> lower(Ls);
|
||||||
for(int s=0; s<Ls; s++){
|
for(int s=0; s<Ls; s++){
|
||||||
if(s==0) {
|
if(s==0) {
|
||||||
upper[s] = -this->cee[s+1];
|
upper[s] = -this->cee[s+1];
|
||||||
@ -314,10 +314,10 @@ void MobiusEOFAFermion<Impl>::SetCoefficientsPrecondShiftOps()
|
|||||||
// Tridiagonal solve for MooeeInvDag_shift_lc
|
// Tridiagonal solve for MooeeInvDag_shift_lc
|
||||||
{
|
{
|
||||||
Coeff_t m(0.0);
|
Coeff_t m(0.0);
|
||||||
std::vector<Coeff_t> d = Mooee_shift;
|
Vector<Coeff_t> d = Mooee_shift;
|
||||||
std::vector<Coeff_t> u(Ls,0.0);
|
Vector<Coeff_t> u(Ls,0.0);
|
||||||
std::vector<Coeff_t> y(Ls,0.0);
|
Vector<Coeff_t> y(Ls,0.0);
|
||||||
std::vector<Coeff_t> q(Ls,0.0);
|
Vector<Coeff_t> q(Ls,0.0);
|
||||||
if(pm == 1){ u[0] = 1.0; }
|
if(pm == 1){ u[0] = 1.0; }
|
||||||
else{ u[Ls-1] = 1.0; }
|
else{ u[Ls-1] = 1.0; }
|
||||||
|
|
||||||
|
@ -48,6 +48,8 @@ NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GridCartesian &Fgrid, GridRed
|
|||||||
StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even
|
StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even
|
||||||
StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd
|
StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd
|
||||||
mass(_mass),
|
mass(_mass),
|
||||||
|
Lebesgue(_grid),
|
||||||
|
LebesgueEvenOdd(_cbgrid),
|
||||||
Umu(&Fgrid),
|
Umu(&Fgrid),
|
||||||
UmuEven(&Hgrid),
|
UmuEven(&Hgrid),
|
||||||
UmuOdd(&Hgrid),
|
UmuOdd(&Hgrid),
|
||||||
@ -266,7 +268,7 @@ void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out
|
|||||||
|
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
|
||||||
DhopInternal(Stencil, Umu, in, out, dag);
|
DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@ -278,7 +280,7 @@ void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &o
|
|||||||
assert(in.Checkerboard() == Even);
|
assert(in.Checkerboard() == Even);
|
||||||
out.Checkerboard() = Odd;
|
out.Checkerboard() = Odd;
|
||||||
|
|
||||||
DhopInternal(StencilEven, UmuOdd, in, out, dag);
|
DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@ -290,7 +292,7 @@ void NaiveStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &o
|
|||||||
assert(in.Checkerboard() == Odd);
|
assert(in.Checkerboard() == Odd);
|
||||||
out.Checkerboard() = Even;
|
out.Checkerboard() = Even;
|
||||||
|
|
||||||
DhopInternal(StencilOdd, UmuEven, in, out, dag);
|
DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@ -321,18 +323,18 @@ void NaiveStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &
|
|||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st,
|
void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out, int dag)
|
FermionField &out, int dag)
|
||||||
{
|
{
|
||||||
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
||||||
DhopInternalOverlappedComms(st,U,in,out,dag);
|
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
|
||||||
else
|
else
|
||||||
DhopInternalSerialComms(st,U,in,out,dag);
|
DhopInternalSerialComms(st,lo,U,in,out,dag);
|
||||||
}
|
}
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,
|
void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out, int dag)
|
FermionField &out, int dag)
|
||||||
@ -354,7 +356,7 @@ void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,
|
|||||||
{
|
{
|
||||||
int interior=1;
|
int interior=1;
|
||||||
int exterior=0;
|
int exterior=0;
|
||||||
Kernels::DhopNaive(st,U,in,out,dag,interior,exterior);
|
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
|
||||||
}
|
}
|
||||||
|
|
||||||
st.CommunicateComplete(requests);
|
st.CommunicateComplete(requests);
|
||||||
@ -365,12 +367,12 @@ void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,
|
|||||||
{
|
{
|
||||||
int interior=0;
|
int interior=0;
|
||||||
int exterior=1;
|
int exterior=1;
|
||||||
Kernels::DhopNaive(st,U,in,out,dag,interior,exterior);
|
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st,
|
void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out, int dag)
|
FermionField &out, int dag)
|
||||||
@ -383,7 +385,7 @@ void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st,
|
|||||||
{
|
{
|
||||||
int interior=1;
|
int interior=1;
|
||||||
int exterior=1;
|
int exterior=1;
|
||||||
Kernels::DhopNaive(st,U,in,out,dag,interior,exterior);
|
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -375,6 +375,23 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \
|
||||||
|
template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
|
||||||
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
|
||||||
|
SiteSpinor *buf, int LLs, int sU, \
|
||||||
|
const FermionFieldView &in, FermionFieldView &out, int dag); \
|
||||||
|
\
|
||||||
|
template void StaggeredKernels<IMPL>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \
|
||||||
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
|
||||||
|
SiteSpinor *buf, int LLs, int sU, \
|
||||||
|
const FermionFieldView &in, FermionFieldView &out, int dag); \
|
||||||
|
\
|
||||||
|
template void StaggeredKernels<IMPL>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \
|
||||||
|
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
|
||||||
|
SiteSpinor *buf, int LLs, int sU, \
|
||||||
|
const FermionFieldView &in, FermionFieldView &out, int dag); \
|
||||||
|
*/
|
||||||
#undef LOAD_CHI
|
#undef LOAD_CHI
|
||||||
#undef HAND_DECLARATIONS
|
#undef HAND_DECLARATIONS
|
||||||
|
|
||||||
|
@ -256,7 +256,7 @@ void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldVie
|
|||||||
});
|
});
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st,
|
void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
|
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
|
||||||
{
|
{
|
||||||
@ -294,7 +294,7 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st,
|
|||||||
assert(0 && " Kernel optimisation case not covered ");
|
assert(0 && " Kernel optimisation case not covered ");
|
||||||
}
|
}
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st,
|
void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
|
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
|
||||||
{
|
{
|
||||||
|
@ -58,9 +58,15 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
|
|||||||
Umu(_FourDimGrid),
|
Umu(_FourDimGrid),
|
||||||
UmuEven(_FourDimRedBlackGrid),
|
UmuEven(_FourDimRedBlackGrid),
|
||||||
UmuOdd (_FourDimRedBlackGrid),
|
UmuOdd (_FourDimRedBlackGrid),
|
||||||
|
Lebesgue(_FourDimGrid),
|
||||||
|
LebesgueEvenOdd(_FourDimRedBlackGrid),
|
||||||
_tmp(&FiveDimRedBlackGrid),
|
_tmp(&FiveDimRedBlackGrid),
|
||||||
Dirichlet(0)
|
Dirichlet(0)
|
||||||
{
|
{
|
||||||
|
Stencil.lo = &Lebesgue;
|
||||||
|
StencilEven.lo = &LebesgueEvenOdd;
|
||||||
|
StencilOdd.lo = &LebesgueEvenOdd;
|
||||||
|
|
||||||
// some assertions
|
// some assertions
|
||||||
assert(FiveDimGrid._ndimension==5);
|
assert(FiveDimGrid._ndimension==5);
|
||||||
assert(FourDimGrid._ndimension==4);
|
assert(FourDimGrid._ndimension==4);
|
||||||
@ -299,19 +305,19 @@ void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st,
|
void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField & U,
|
DoubledGaugeField & U,
|
||||||
const FermionField &in, FermionField &out,int dag)
|
const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
|
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
|
||||||
DhopInternalOverlappedComms(st,U,in,out,dag);
|
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
|
||||||
else
|
else
|
||||||
DhopInternalSerialComms(st,U,in,out,dag);
|
DhopInternalSerialComms(st,lo,U,in,out,dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
|
void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField & U,
|
DoubledGaugeField & U,
|
||||||
const FermionField &in, FermionField &out,int dag)
|
const FermionField &in, FermionField &out,int dag)
|
||||||
{
|
{
|
||||||
@ -325,12 +331,10 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
|
|||||||
// Start comms // Gather intranode and extra node differentiated??
|
// Start comms // Gather intranode and extra node differentiated??
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
{
|
{
|
||||||
// std::cout << " WilsonFermion5D gather " <<std::endl;
|
|
||||||
GRID_TRACE("Gather");
|
GRID_TRACE("Gather");
|
||||||
st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
|
st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
|
||||||
}
|
}
|
||||||
|
|
||||||
// std::cout << " WilsonFermion5D Communicate Begin " <<std::endl;
|
|
||||||
std::vector<std::vector<CommsRequest_t> > requests;
|
std::vector<std::vector<CommsRequest_t> > requests;
|
||||||
auto id=traceStart("Communicate overlapped");
|
auto id=traceStart("Communicate overlapped");
|
||||||
st.CommunicateBegin(requests);
|
st.CommunicateBegin(requests);
|
||||||
@ -339,7 +343,6 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
|
|||||||
// Overlap with comms
|
// Overlap with comms
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
{
|
{
|
||||||
// std::cout << " WilsonFermion5D Comms merge " <<std::endl;
|
|
||||||
GRID_TRACE("MergeSHM");
|
GRID_TRACE("MergeSHM");
|
||||||
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
|
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
|
||||||
}
|
}
|
||||||
@ -347,7 +350,6 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
|
|||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// do the compute interior
|
// do the compute interior
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// std::cout << " WilsonFermion5D Interior " <<std::endl;
|
|
||||||
int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
|
int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
|
||||||
if (dag == DaggerYes) {
|
if (dag == DaggerYes) {
|
||||||
GRID_TRACE("DhopDagInterior");
|
GRID_TRACE("DhopDagInterior");
|
||||||
@ -360,7 +362,6 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
|
|||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// Complete comms
|
// Complete comms
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
// std::cout << " WilsonFermion5D Comms Complete " <<std::endl;
|
|
||||||
st.CommunicateComplete(requests);
|
st.CommunicateComplete(requests);
|
||||||
traceStop(id);
|
traceStop(id);
|
||||||
|
|
||||||
@ -368,13 +369,11 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
|
|||||||
// do the compute exterior
|
// do the compute exterior
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
{
|
{
|
||||||
// std::cout << " WilsonFermion5D Comms Merge " <<std::endl;
|
|
||||||
GRID_TRACE("Merge");
|
GRID_TRACE("Merge");
|
||||||
st.CommsMerge(compressor);
|
st.CommsMerge(compressor);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// std::cout << " WilsonFermion5D Exterior " <<std::endl;
|
|
||||||
if (dag == DaggerYes) {
|
if (dag == DaggerYes) {
|
||||||
GRID_TRACE("DhopDagExterior");
|
GRID_TRACE("DhopDagExterior");
|
||||||
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
|
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
|
||||||
@ -382,12 +381,11 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
|
|||||||
GRID_TRACE("DhopExterior");
|
GRID_TRACE("DhopExterior");
|
||||||
Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
|
Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
|
||||||
}
|
}
|
||||||
// std::cout << " WilsonFermion5D Done " <<std::endl;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
|
void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField & U,
|
DoubledGaugeField & U,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out,int dag)
|
FermionField &out,int dag)
|
||||||
@ -397,13 +395,11 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
|
|||||||
|
|
||||||
int LLs = in.Grid()->_rdimensions[0];
|
int LLs = in.Grid()->_rdimensions[0];
|
||||||
|
|
||||||
// std::cout << " WilsonFermion5D Halo exch " <<std::endl;
|
|
||||||
{
|
{
|
||||||
GRID_TRACE("HaloExchange");
|
GRID_TRACE("HaloExchange");
|
||||||
st.HaloExchangeOpt(in,compressor);
|
st.HaloExchangeOpt(in,compressor);
|
||||||
}
|
}
|
||||||
|
|
||||||
// std::cout << " WilsonFermion5D Dhop " <<std::endl;
|
|
||||||
int Opt = WilsonKernelsStatic::Opt;
|
int Opt = WilsonKernelsStatic::Opt;
|
||||||
if (dag == DaggerYes) {
|
if (dag == DaggerYes) {
|
||||||
GRID_TRACE("DhopDag");
|
GRID_TRACE("DhopDag");
|
||||||
@ -412,7 +408,6 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
|
|||||||
GRID_TRACE("Dhop");
|
GRID_TRACE("Dhop");
|
||||||
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
|
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
|
||||||
}
|
}
|
||||||
// std::cout << " WilsonFermion5D Done " <<std::endl;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -425,7 +420,7 @@ void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int
|
|||||||
assert(in.Checkerboard()==Even);
|
assert(in.Checkerboard()==Even);
|
||||||
out.Checkerboard() = Odd;
|
out.Checkerboard() = Odd;
|
||||||
|
|
||||||
DhopInternal(StencilEven,UmuOdd,in,out,dag);
|
DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,in,out,dag);
|
||||||
}
|
}
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
|
void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
|
||||||
@ -436,7 +431,7 @@ void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int
|
|||||||
assert(in.Checkerboard()==Odd);
|
assert(in.Checkerboard()==Odd);
|
||||||
out.Checkerboard() = Even;
|
out.Checkerboard() = Even;
|
||||||
|
|
||||||
DhopInternal(StencilOdd,UmuEven,in,out,dag);
|
DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,in,out,dag);
|
||||||
}
|
}
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
|
void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
|
||||||
@ -446,7 +441,7 @@ void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int d
|
|||||||
|
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
|
||||||
DhopInternal(Stencil,Umu,in,out,dag);
|
DhopInternal(Stencil,Lebesgue,Umu,in,out,dag);
|
||||||
}
|
}
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag)
|
void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag)
|
||||||
|
@ -52,12 +52,17 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
|
|||||||
StencilEven(&Hgrid, npoint, Even, directions,displacements,p), // source is Even
|
StencilEven(&Hgrid, npoint, Even, directions,displacements,p), // source is Even
|
||||||
StencilOdd(&Hgrid, npoint, Odd, directions,displacements,p), // source is Odd
|
StencilOdd(&Hgrid, npoint, Odd, directions,displacements,p), // source is Odd
|
||||||
mass(_mass),
|
mass(_mass),
|
||||||
|
Lebesgue(_grid),
|
||||||
|
LebesgueEvenOdd(_cbgrid),
|
||||||
Umu(&Fgrid),
|
Umu(&Fgrid),
|
||||||
UmuEven(&Hgrid),
|
UmuEven(&Hgrid),
|
||||||
UmuOdd(&Hgrid),
|
UmuOdd(&Hgrid),
|
||||||
_tmp(&Hgrid),
|
_tmp(&Hgrid),
|
||||||
anisotropyCoeff(anis)
|
anisotropyCoeff(anis)
|
||||||
{
|
{
|
||||||
|
Stencil.lo = &Lebesgue;
|
||||||
|
StencilEven.lo = &LebesgueEvenOdd;
|
||||||
|
StencilOdd.lo = &LebesgueEvenOdd;
|
||||||
// Allocate the required comms buffer
|
// Allocate the required comms buffer
|
||||||
ImportGauge(_Umu);
|
ImportGauge(_Umu);
|
||||||
if (anisotropyCoeff.isAnisotropic){
|
if (anisotropyCoeff.isAnisotropic){
|
||||||
@ -309,7 +314,7 @@ void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int da
|
|||||||
|
|
||||||
out.Checkerboard() = in.Checkerboard();
|
out.Checkerboard() = in.Checkerboard();
|
||||||
|
|
||||||
DhopInternal(Stencil, Umu, in, out, dag);
|
DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@ -321,7 +326,7 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int
|
|||||||
assert(in.Checkerboard() == Even);
|
assert(in.Checkerboard() == Even);
|
||||||
out.Checkerboard() = Odd;
|
out.Checkerboard() = Odd;
|
||||||
|
|
||||||
DhopInternal(StencilEven, UmuOdd, in, out, dag);
|
DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@ -333,7 +338,7 @@ void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int d
|
|||||||
assert(in.Checkerboard() == Odd);
|
assert(in.Checkerboard() == Odd);
|
||||||
out.Checkerboard() = Even;
|
out.Checkerboard() = Even;
|
||||||
|
|
||||||
DhopInternal(StencilOdd, UmuEven, in, out, dag);
|
DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@ -386,21 +391,21 @@ void WilsonFermion<Impl>::DhopDirCalc(const FermionField &in, FermionField &out,
|
|||||||
};
|
};
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::DhopInternal(StencilImpl &st,
|
void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out, int dag)
|
FermionField &out, int dag)
|
||||||
{
|
{
|
||||||
#ifdef GRID_OMP
|
#ifdef GRID_OMP
|
||||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
|
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
|
||||||
DhopInternalOverlappedComms(st,U,in,out,dag);
|
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
DhopInternalSerial(st,U,in,out,dag);
|
DhopInternalSerial(st,lo,U,in,out,dag);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,
|
void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out, int dag)
|
FermionField &out, int dag)
|
||||||
@ -469,7 +474,7 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,
|
|||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st,
|
void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out, int dag)
|
FermionField &out, int dag)
|
||||||
|
@ -40,11 +40,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
/// Switch off the 5d vectorised code optimisations
|
/// Switch off the 5d vectorised code optimisations
|
||||||
#undef DWFVEC5D
|
#undef DWFVEC5D
|
||||||
|
|
||||||
static std::vector<vComplexF> signsF;
|
static Vector<vComplexF> signsF;
|
||||||
|
|
||||||
template<typename vtype>
|
template<typename vtype>
|
||||||
int setupSigns(std::vector<vtype>& signs ){
|
int setupSigns(Vector<vtype>& signs ){
|
||||||
std::vector<vtype> bother(2);
|
Vector<vtype> bother(2);
|
||||||
signs = bother;
|
signs = bother;
|
||||||
vrsign(signs[0]);
|
vrsign(signs[0]);
|
||||||
visign(signs[1]);
|
visign(signs[1]);
|
||||||
@ -364,7 +364,7 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilView &st, Doubled
|
|||||||
|
|
||||||
#include <simd/Intel512double.h>
|
#include <simd/Intel512double.h>
|
||||||
|
|
||||||
static std::vector<vComplexD> signsD;
|
static Vector<vComplexD> signsD;
|
||||||
static int signInitD = setupSigns(signsD);
|
static int signInitD = setupSigns(signsD);
|
||||||
|
|
||||||
#define MAYBEPERM(A,perm) if (perm) { A ; }
|
#define MAYBEPERM(A,perm) if (perm) { A ; }
|
||||||
|
@ -434,7 +434,7 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
|
|||||||
|
|
||||||
#define ASM_CALL(A) \
|
#define ASM_CALL(A) \
|
||||||
thread_for( sss, Nsite, { \
|
thread_for( sss, Nsite, { \
|
||||||
int ss = sss; /*st.lo->Reorder(sss);*/ \
|
int ss = st.lo->Reorder(sss); \
|
||||||
int sU = ss; \
|
int sU = ss; \
|
||||||
int sF = ss*Ls; \
|
int sF = ss*Ls; \
|
||||||
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \
|
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \
|
||||||
|
@ -40,7 +40,7 @@ public:
|
|||||||
U = Zero();
|
U = Zero();
|
||||||
LatticeColourMatrix tmp(Uin.Grid());
|
LatticeColourMatrix tmp(Uin.Grid());
|
||||||
|
|
||||||
std::vector<typename SU<ncolour>::Matrix> ta(Dimension);
|
Vector<typename SU<ncolour>::Matrix> ta(Dimension);
|
||||||
|
|
||||||
// Debug lines
|
// Debug lines
|
||||||
// LatticeMatrix uno(Uin.Grid());
|
// LatticeMatrix uno(Uin.Grid());
|
||||||
|
@ -43,7 +43,7 @@ public:
|
|||||||
U = Zero();
|
U = Zero();
|
||||||
LatticeColourMatrix tmp(Uin.Grid());
|
LatticeColourMatrix tmp(Uin.Grid());
|
||||||
|
|
||||||
std::vector<typename GaugeGroup<ncolour,group_name>::Matrix> eij(Dimension);
|
Vector<typename GaugeGroup<ncolour,group_name>::Matrix> eij(Dimension);
|
||||||
|
|
||||||
for (int a = 0; a < Dimension; a++)
|
for (int a = 0; a < Dimension; a++)
|
||||||
GaugeGroupTwoIndex<ncolour, S, group_name>::base(a, eij[a]);
|
GaugeGroupTwoIndex<ncolour, S, group_name>::base(a, eij[a]);
|
||||||
|
@ -32,7 +32,9 @@ private:
|
|||||||
// Smear_Stout<Gimpl> *StoutSmearing;
|
// Smear_Stout<Gimpl> *StoutSmearing;
|
||||||
// std::vector<GaugeField> SmearedSet;
|
// std::vector<GaugeField> SmearedSet;
|
||||||
|
|
||||||
|
GridRedBlackCartesian * UrbGrid; // keep a copy of the redblack grid for life of object
|
||||||
std::vector<LatticeLorentzComplex> masks;
|
std::vector<LatticeLorentzComplex> masks;
|
||||||
|
std::vector<int> cbs;
|
||||||
|
|
||||||
typedef typename SU3Adjoint::AMatrix AdjMatrix;
|
typedef typename SU3Adjoint::AMatrix AdjMatrix;
|
||||||
typedef typename SU3Adjoint::LatticeAdjMatrix AdjMatrixField;
|
typedef typename SU3Adjoint::LatticeAdjMatrix AdjMatrixField;
|
||||||
@ -147,6 +149,25 @@ private:
|
|||||||
}
|
}
|
||||||
pokeLorentz(Fdet, Fdet_pol, nu);
|
pokeLorentz(Fdet, Fdet_pol, nu);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void Compute_MpInvJx_dNxxdSy(int cb,
|
||||||
|
const GaugeLinkField &PlaqL,
|
||||||
|
const GaugeLinkField &PlaqR,
|
||||||
|
AdjMatrixField MpInvJx,
|
||||||
|
AdjVectorField &Fdet2 )
|
||||||
|
{
|
||||||
|
GaugeLinkField PlaqLeo(UrbGrid);
|
||||||
|
GaugeLinkField PlaqReo(UrbGrid);
|
||||||
|
AdjMatrixField MpInvJxeo(UrbGrid);
|
||||||
|
AdjVectorField Fdet2eo(UrbGrid);
|
||||||
|
pickCheckerboard(cb,PlaqLeo,PlaqL);
|
||||||
|
pickCheckerboard(cb,PlaqReo,PlaqR);
|
||||||
|
pickCheckerboard(cb,MpInvJxeo,MpInvJx);
|
||||||
|
Fdet2eo.Checkerboard()=cb;
|
||||||
|
Compute_MpInvJx_dNxxdSy(PlaqLeo,PlaqReo,MpInvJxeo,Fdet2eo);
|
||||||
|
setCheckerboard(Fdet2,Fdet2eo);
|
||||||
|
}
|
||||||
|
|
||||||
void Compute_MpInvJx_dNxxdSy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR, AdjMatrixField MpInvJx,AdjVectorField &Fdet2 )
|
void Compute_MpInvJx_dNxxdSy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR, AdjMatrixField MpInvJx,AdjVectorField &Fdet2 )
|
||||||
{
|
{
|
||||||
GaugeLinkField UtaU(PlaqL.Grid());
|
GaugeLinkField UtaU(PlaqL.Grid());
|
||||||
@ -278,6 +299,7 @@ public:
|
|||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Mask the gauge field
|
// Mask the gauge field
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
int cb = cbs[smr];
|
||||||
auto mask=PeekIndex<LorentzIndex>(masks[smr],mu); // the cb mask
|
auto mask=PeekIndex<LorentzIndex>(masks[smr],mu); // the cb mask
|
||||||
|
|
||||||
Umsk = U;
|
Umsk = U;
|
||||||
@ -442,7 +464,7 @@ public:
|
|||||||
AdjMatrixField MpInvJx_nu(grid);
|
AdjMatrixField MpInvJx_nu(grid);
|
||||||
MpInvJx = (-1.0)*MpAdInv * JxAd;// rho is on the plaq factor
|
MpInvJx = (-1.0)*MpAdInv * JxAd;// rho is on the plaq factor
|
||||||
|
|
||||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV);
|
Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx,FdetV);
|
||||||
Fdet2_mu=FdetV;
|
Fdet2_mu=FdetV;
|
||||||
Fdet1_mu=Zero();
|
Fdet1_mu=Zero();
|
||||||
|
|
||||||
@ -499,7 +521,7 @@ public:
|
|||||||
|
|
||||||
time=-usecond();
|
time=-usecond();
|
||||||
PlaqR=(-1.0)*PlaqR;
|
PlaqR=(-1.0)*PlaqR;
|
||||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV);
|
Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx,FdetV);
|
||||||
Fdet2_nu = FdetV;
|
Fdet2_nu = FdetV;
|
||||||
time+=usecond();
|
time+=usecond();
|
||||||
std::cout << GridLogMessage << "Compute_MpInvJx_dNxxSy (occurs 6x) took "<<time<< " us"<<std::endl;
|
std::cout << GridLogMessage << "Compute_MpInvJx_dNxxSy (occurs 6x) took "<<time<< " us"<<std::endl;
|
||||||
@ -520,7 +542,7 @@ public:
|
|||||||
|
|
||||||
|
|
||||||
MpInvJx_nu = Cshift(MpInvJx,mu,-1);
|
MpInvJx_nu = Cshift(MpInvJx,mu,-1);
|
||||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
||||||
Fdet2_nu = Fdet2_nu+FdetV;
|
Fdet2_nu = Fdet2_nu+FdetV;
|
||||||
|
|
||||||
///////////////// -ve nu /////////////////
|
///////////////// -ve nu /////////////////
|
||||||
@ -539,7 +561,7 @@ public:
|
|||||||
Fdet1_nu = Fdet1_nu + transpose(Nxy)*dJdXe_nMpInv_y;
|
Fdet1_nu = Fdet1_nu + transpose(Nxy)*dJdXe_nMpInv_y;
|
||||||
|
|
||||||
MpInvJx_nu = Cshift(MpInvJx,nu,1);
|
MpInvJx_nu = Cshift(MpInvJx,nu,1);
|
||||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
||||||
Fdet2_nu = Fdet2_nu+FdetV;
|
Fdet2_nu = Fdet2_nu+FdetV;
|
||||||
|
|
||||||
// x==
|
// x==
|
||||||
@ -560,7 +582,7 @@ public:
|
|||||||
|
|
||||||
MpInvJx_nu = Cshift(MpInvJx,mu,-1);
|
MpInvJx_nu = Cshift(MpInvJx,mu,-1);
|
||||||
MpInvJx_nu = Cshift(MpInvJx_nu,nu,1);
|
MpInvJx_nu = Cshift(MpInvJx_nu,nu,1);
|
||||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
||||||
Fdet2_nu = Fdet2_nu+FdetV;
|
Fdet2_nu = Fdet2_nu+FdetV;
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////
|
||||||
@ -589,7 +611,7 @@ public:
|
|||||||
|
|
||||||
MpInvJx_nu = Cshift(MpInvJx,nu,-1);
|
MpInvJx_nu = Cshift(MpInvJx,nu,-1);
|
||||||
|
|
||||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
||||||
Fdet2_mu = Fdet2_mu+FdetV;
|
Fdet2_mu = Fdet2_mu+FdetV;
|
||||||
|
|
||||||
// __
|
// __
|
||||||
@ -609,7 +631,7 @@ public:
|
|||||||
|
|
||||||
MpInvJx_nu = Cshift(MpInvJx,nu,1);
|
MpInvJx_nu = Cshift(MpInvJx,nu,1);
|
||||||
|
|
||||||
Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
|
||||||
Fdet2_mu = Fdet2_mu+FdetV;
|
Fdet2_mu = Fdet2_mu+FdetV;
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -931,6 +953,10 @@ private:
|
|||||||
public:
|
public:
|
||||||
|
|
||||||
/* Standard constructor */
|
/* Standard constructor */
|
||||||
|
virtual ~SmearedConfigurationMasked()
|
||||||
|
{
|
||||||
|
delete UrbGrid;
|
||||||
|
}
|
||||||
SmearedConfigurationMasked(GridCartesian* _UGrid, unsigned int Nsmear, Smear_Stout<Gimpl>& Stout)
|
SmearedConfigurationMasked(GridCartesian* _UGrid, unsigned int Nsmear, Smear_Stout<Gimpl>& Stout)
|
||||||
: SmearedConfiguration<Gimpl>(_UGrid, Nsmear,Stout)
|
: SmearedConfiguration<Gimpl>(_UGrid, Nsmear,Stout)
|
||||||
{
|
{
|
||||||
@ -939,7 +965,6 @@ public:
|
|||||||
// was resized in base class
|
// was resized in base class
|
||||||
assert(this->SmearedSet.size()==Nsmear);
|
assert(this->SmearedSet.size()==Nsmear);
|
||||||
|
|
||||||
GridRedBlackCartesian * UrbGrid;
|
|
||||||
UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(_UGrid);
|
UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(_UGrid);
|
||||||
LatticeComplex one(_UGrid); one = ComplexD(1.0,0.0);
|
LatticeComplex one(_UGrid); one = ComplexD(1.0,0.0);
|
||||||
LatticeComplex tmp(_UGrid);
|
LatticeComplex tmp(_UGrid);
|
||||||
@ -947,11 +972,12 @@ public:
|
|||||||
for (unsigned int i = 0; i < this->smearingLevels; ++i) {
|
for (unsigned int i = 0; i < this->smearingLevels; ++i) {
|
||||||
|
|
||||||
masks.push_back(*(new LatticeLorentzComplex(_UGrid)));
|
masks.push_back(*(new LatticeLorentzComplex(_UGrid)));
|
||||||
|
|
||||||
int mu= (i/2) %Nd;
|
int mu= (i/2) %Nd;
|
||||||
int cb= (i%2);
|
int cb= (i%2);
|
||||||
LatticeComplex tmpcb(UrbGrid);
|
LatticeComplex tmpcb(UrbGrid);
|
||||||
|
|
||||||
|
cbs.push_back(cb);
|
||||||
|
|
||||||
masks[i]=Zero();
|
masks[i]=Zero();
|
||||||
////////////////////
|
////////////////////
|
||||||
// Setup the mask
|
// Setup the mask
|
||||||
@ -962,7 +988,6 @@ public:
|
|||||||
PokeIndex<LorentzIndex>(masks[i],tmp, mu);
|
PokeIndex<LorentzIndex>(masks[i],tmp, mu);
|
||||||
|
|
||||||
}
|
}
|
||||||
delete UrbGrid;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual void smeared_force(GaugeField &SigmaTilde)
|
virtual void smeared_force(GaugeField &SigmaTilde)
|
||||||
|
@ -158,12 +158,12 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
|
|||||||
int MFrvol = rd*Lblock*Rblock*Nmom;
|
int MFrvol = rd*Lblock*Rblock*Nmom;
|
||||||
int MFlvol = ld*Lblock*Rblock*Nmom;
|
int MFlvol = ld*Lblock*Rblock*Nmom;
|
||||||
|
|
||||||
std::vector<SpinMatrix_v > lvSum(MFrvol);
|
Vector<SpinMatrix_v > lvSum(MFrvol);
|
||||||
thread_for( r, MFrvol,{
|
thread_for( r, MFrvol,{
|
||||||
lvSum[r] = Zero();
|
lvSum[r] = Zero();
|
||||||
});
|
});
|
||||||
|
|
||||||
std::vector<SpinMatrix_s > lsSum(MFlvol);
|
Vector<SpinMatrix_s > lsSum(MFlvol);
|
||||||
thread_for(r,MFlvol,{
|
thread_for(r,MFlvol,{
|
||||||
lsSum[r]=scalar_type(0.0);
|
lsSum[r]=scalar_type(0.0);
|
||||||
});
|
});
|
||||||
@ -346,12 +346,12 @@ void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
|
|||||||
int MFrvol = rd*Lblock*Rblock;
|
int MFrvol = rd*Lblock*Rblock;
|
||||||
int MFlvol = ld*Lblock*Rblock;
|
int MFlvol = ld*Lblock*Rblock;
|
||||||
|
|
||||||
std::vector<vector_type > lvSum(MFrvol);
|
Vector<vector_type > lvSum(MFrvol);
|
||||||
thread_for(r,MFrvol,{
|
thread_for(r,MFrvol,{
|
||||||
lvSum[r] = Zero();
|
lvSum[r] = Zero();
|
||||||
});
|
});
|
||||||
|
|
||||||
std::vector<scalar_type > lsSum(MFlvol);
|
Vector<scalar_type > lsSum(MFlvol);
|
||||||
thread_for(r,MFlvol,{
|
thread_for(r,MFlvol,{
|
||||||
lsSum[r]=scalar_type(0.0);
|
lsSum[r]=scalar_type(0.0);
|
||||||
});
|
});
|
||||||
@ -493,12 +493,12 @@ void A2Autils<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
|
|||||||
int MFrvol = rd*Lblock*Rblock*Nmom;
|
int MFrvol = rd*Lblock*Rblock*Nmom;
|
||||||
int MFlvol = ld*Lblock*Rblock*Nmom;
|
int MFlvol = ld*Lblock*Rblock*Nmom;
|
||||||
|
|
||||||
std::vector<vector_type > lvSum(MFrvol);
|
Vector<vector_type > lvSum(MFrvol);
|
||||||
thread_for(r,MFrvol,{
|
thread_for(r,MFrvol,{
|
||||||
lvSum[r] = Zero();
|
lvSum[r] = Zero();
|
||||||
});
|
});
|
||||||
|
|
||||||
std::vector<scalar_type > lsSum(MFlvol);
|
Vector<scalar_type > lsSum(MFlvol);
|
||||||
thread_for(r,MFlvol,{
|
thread_for(r,MFlvol,{
|
||||||
lsSum[r]=scalar_type(0.0);
|
lsSum[r]=scalar_type(0.0);
|
||||||
});
|
});
|
||||||
@ -700,13 +700,13 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
|
|||||||
int MFrvol = rd*Lblock*Rblock*Nem;
|
int MFrvol = rd*Lblock*Rblock*Nem;
|
||||||
int MFlvol = ld*Lblock*Rblock*Nem;
|
int MFlvol = ld*Lblock*Rblock*Nem;
|
||||||
|
|
||||||
std::vector<vector_type> lvSum(MFrvol);
|
Vector<vector_type> lvSum(MFrvol);
|
||||||
thread_for(r,MFrvol,
|
thread_for(r,MFrvol,
|
||||||
{
|
{
|
||||||
lvSum[r] = Zero();
|
lvSum[r] = Zero();
|
||||||
});
|
});
|
||||||
|
|
||||||
std::vector<scalar_type> lsSum(MFlvol);
|
Vector<scalar_type> lsSum(MFlvol);
|
||||||
thread_for(r,MFlvol,
|
thread_for(r,MFlvol,
|
||||||
{
|
{
|
||||||
lsSum[r] = scalar_type(0.0);
|
lsSum[r] = scalar_type(0.0);
|
||||||
|
@ -971,9 +971,7 @@ void BaryonUtils<FImpl>::BaryonGamma3pt(
|
|||||||
autoView( vq_ti , q_ti , AcceleratorRead);
|
autoView( vq_ti , q_ti , AcceleratorRead);
|
||||||
autoView( vq_tf , q_tf , AcceleratorRead);
|
autoView( vq_tf , q_tf , AcceleratorRead);
|
||||||
|
|
||||||
deviceVector<mobj> my_Dq_spec(2);
|
Vector<mobj> my_Dq_spec{Dq_spec1,Dq_spec2};
|
||||||
acceleratorPut(my_Dq_spec[0],Dq_spec1);
|
|
||||||
acceleratorPut(my_Dq_spec[1],Dq_spec2);
|
|
||||||
mobj * Dq_spec_p = &my_Dq_spec[0];
|
mobj * Dq_spec_p = &my_Dq_spec[0];
|
||||||
|
|
||||||
if (group == 1) {
|
if (group == 1) {
|
||||||
@ -1302,8 +1300,7 @@ void BaryonUtils<FImpl>::SigmaToNucleonEye(const PropagatorField &qq_loop,
|
|||||||
autoView( vd_tf , qd_tf , AcceleratorRead);
|
autoView( vd_tf , qd_tf , AcceleratorRead);
|
||||||
autoView( vs_ti , qs_ti , AcceleratorRead);
|
autoView( vs_ti , qs_ti , AcceleratorRead);
|
||||||
|
|
||||||
deviceVector<mobj> my_Dq_spec(1);
|
Vector<mobj> my_Dq_spec{Du_spec};
|
||||||
acceleratorPut(my_Dq_spec[0],Du_spec);
|
|
||||||
mobj * Dq_spec_p = &my_Dq_spec[0];
|
mobj * Dq_spec_p = &my_Dq_spec[0];
|
||||||
|
|
||||||
if(op == "Q1"){
|
if(op == "Q1"){
|
||||||
@ -1356,8 +1353,7 @@ void BaryonUtils<FImpl>::SigmaToNucleonNonEye(const PropagatorField &qq_ti,
|
|||||||
autoView( vd_tf , qd_tf , AcceleratorRead );
|
autoView( vd_tf , qd_tf , AcceleratorRead );
|
||||||
autoView( vs_ti , qs_ti , AcceleratorRead );
|
autoView( vs_ti , qs_ti , AcceleratorRead );
|
||||||
|
|
||||||
deviceVector<mobj> my_Dq_spec(1);
|
Vector<mobj> my_Dq_spec{Du_spec};
|
||||||
acceleratorPut(my_Dq_spec[0],Du_spec);
|
|
||||||
mobj * Dq_spec_p = &my_Dq_spec[0];
|
mobj * Dq_spec_p = &my_Dq_spec[0];
|
||||||
|
|
||||||
if(op == "Q1"){
|
if(op == "Q1"){
|
||||||
@ -1548,9 +1544,7 @@ void BaryonUtils<FImpl>::XiToSigmaEye(const PropagatorField &qq_loop,
|
|||||||
autoView( vd_tf , qd_tf , AcceleratorRead);
|
autoView( vd_tf , qd_tf , AcceleratorRead);
|
||||||
autoView( vs_ti , qs_ti , AcceleratorRead);
|
autoView( vs_ti , qs_ti , AcceleratorRead);
|
||||||
|
|
||||||
deviceVector<mobj> my_Dq_spec(2);
|
Vector<mobj> my_Dq_spec{Dd_spec,Ds_spec};
|
||||||
acceleratorPut(my_Dq_spec[0],Dd_spec);
|
|
||||||
acceleratorPut(my_Dq_spec[0],Ds_spec);
|
|
||||||
mobj * Dq_spec_p = &my_Dq_spec[0];
|
mobj * Dq_spec_p = &my_Dq_spec[0];
|
||||||
|
|
||||||
if(op == "Q1"){
|
if(op == "Q1"){
|
||||||
|
@ -418,32 +418,32 @@ static void LieAlgebraProject(LatticeAlgebraMatrix &out,const LatticeMatrix &in,
|
|||||||
int hNNm1= NNm1/2;
|
int hNNm1= NNm1/2;
|
||||||
RealD sqrt_2 = sqrt(2.0);
|
RealD sqrt_2 = sqrt(2.0);
|
||||||
Complex ci(0.0,1.0);
|
Complex ci(0.0,1.0);
|
||||||
|
|
||||||
|
const int nsimd= Matrix::Nsimd();
|
||||||
|
accelerator_for(ss,grid->oSites(),nsimd,{
|
||||||
for(int su2Index=0;su2Index<hNNm1;su2Index++){
|
for(int su2Index=0;su2Index<hNNm1;su2Index++){
|
||||||
int i1, i2;
|
int i1, i2;
|
||||||
su2SubGroupIndex(i1, i2, su2Index);
|
su2SubGroupIndex(i1, i2, su2Index);
|
||||||
int ax = su2Index*2;
|
int ax = su2Index*2;
|
||||||
int ay = su2Index*2+1;
|
int ay = su2Index*2+1;
|
||||||
accelerator_for(ss,grid->oSites(),1,{
|
|
||||||
// in is traceless ANTI-hermitian whereas Grid generators are Hermitian.
|
// in is traceless ANTI-hermitian whereas Grid generators are Hermitian.
|
||||||
// trace( Ta x Ci in)
|
// trace( Ta x Ci in)
|
||||||
// Bet I need to move to real part with mult by -i
|
// Bet I need to move to real part with mult by -i
|
||||||
out_v[ss]()()(ax,b) = 0.5*(real(in_v[ss]()()(i2,i1)) - real(in_v[ss]()()(i1,i2)));
|
coalescedWrite(out_v[ss]()()(ax,b),0.5*(real(in_v(ss)()()(i2,i1)) - real(in_v(ss)()()(i1,i2))));
|
||||||
out_v[ss]()()(ay,b) = 0.5*(imag(in_v[ss]()()(i1,i2)) + imag(in_v[ss]()()(i2,i1)));
|
coalescedWrite(out_v[ss]()()(ay,b),0.5*(imag(in_v(ss)()()(i1,i2)) + imag(in_v(ss)()()(i2,i1))));
|
||||||
});
|
|
||||||
}
|
}
|
||||||
for(int diagIndex=0;diagIndex<N-1;diagIndex++){
|
for(int diagIndex=0;diagIndex<N-1;diagIndex++){
|
||||||
int k = diagIndex + 1; // diagIndex starts from 0
|
int k = diagIndex + 1; // diagIndex starts from 0
|
||||||
int a = NNm1+diagIndex;
|
int a = NNm1+diagIndex;
|
||||||
RealD scale = 1.0/sqrt(2.0*k*(k+1));
|
RealD scale = 1.0/sqrt(2.0*k*(k+1));
|
||||||
accelerator_for(ss,grid->oSites(),vComplex::Nsimd(),{
|
auto tmp = in_v(ss)()()(0,0);
|
||||||
auto tmp = in_v[ss]()()(0,0);
|
|
||||||
for(int i=1;i<k;i++){
|
for(int i=1;i<k;i++){
|
||||||
tmp=tmp+in_v[ss]()()(i,i);
|
tmp=tmp+in_v(ss)()()(i,i);
|
||||||
|
}
|
||||||
|
tmp = tmp - in_v(ss)()()(k,k)*k;
|
||||||
|
coalescedWrite(out_v[ss]()()(a,b),imag(tmp) * scale);
|
||||||
}
|
}
|
||||||
tmp = tmp - in_v[ss]()()(k,k)*k;
|
|
||||||
out_v[ss]()()(a,b) =imag(tmp) * scale;
|
|
||||||
});
|
});
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -118,7 +118,7 @@ static void generatorDiagonal(int diagIndex, iGroupMatrix<cplx> &ta) {
|
|||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
// Map a su2 subgroup number to the pair of rows that are non zero
|
// Map a su2 subgroup number to the pair of rows that are non zero
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) {
|
static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) {
|
||||||
assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2));
|
assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2));
|
||||||
|
|
||||||
int spare = su2_index;
|
int spare = su2_index;
|
||||||
|
@ -62,7 +62,7 @@ public:
|
|||||||
// returns i(T_Adj)^index necessary for the projectors
|
// returns i(T_Adj)^index necessary for the projectors
|
||||||
// see definitions above
|
// see definitions above
|
||||||
iAdjTa = Zero();
|
iAdjTa = Zero();
|
||||||
iSUnMatrix<cplx> ta[ncolour * ncolour - 1];
|
Vector<iSUnMatrix<cplx> > ta(ncolour * ncolour - 1);
|
||||||
iSUnMatrix<cplx> tmp;
|
iSUnMatrix<cplx> tmp;
|
||||||
|
|
||||||
// FIXME not very efficient to get all the generators everytime
|
// FIXME not very efficient to get all the generators everytime
|
||||||
|
@ -72,7 +72,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Resident in managed memory
|
// Resident in managed memory
|
||||||
deviceVector<GeneralStencilEntry> _entries;
|
Vector<GeneralStencilEntry> _entries;
|
||||||
|
|
||||||
GeneralLocalStencil(GridBase *grid, const std::vector<Coordinate> &shifts)
|
GeneralLocalStencil(GridBase *grid, const std::vector<Coordinate> &shifts)
|
||||||
{
|
{
|
||||||
@ -141,7 +141,7 @@ public:
|
|||||||
////////////////////////////////////////////////
|
////////////////////////////////////////////////
|
||||||
// Store in look up table
|
// Store in look up table
|
||||||
////////////////////////////////////////////////
|
////////////////////////////////////////////////
|
||||||
acceleratorPut(this->_entries[lex],SE);
|
this->_entries[lex] = SE;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
#if 0
|
|
||||||
/*************************************************************************************
|
/*************************************************************************************
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
@ -242,4 +241,3 @@ void LebesgueOrder::ZGraph(void)
|
|||||||
}
|
}
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
#endif
|
|
@ -72,7 +72,7 @@ public:
|
|||||||
void ThreadInterleave(void);
|
void ThreadInterleave(void);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
deviceVector<IndexInteger> _LebesgueReorder;
|
Vector<IndexInteger> _LebesgueReorder;
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
@ -19,7 +19,7 @@ public:
|
|||||||
static int PartialCompressionFactor(GridBase *grid) {return 1;};
|
static int PartialCompressionFactor(GridBase *grid) {return 1;};
|
||||||
// Decompress is after merge so ok
|
// Decompress is after merge so ok
|
||||||
template<class vobj,class cobj,class compressor>
|
template<class vobj,class cobj,class compressor>
|
||||||
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
|
static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
|
||||||
const Lattice<vobj> &rhs,
|
const Lattice<vobj> &rhs,
|
||||||
cobj *buffer,
|
cobj *buffer,
|
||||||
compressor &compress,
|
compressor &compress,
|
||||||
@ -35,7 +35,7 @@ public:
|
|||||||
rhs_v.ViewClose();
|
rhs_v.ViewClose();
|
||||||
}
|
}
|
||||||
template<class vobj,class cobj,class compressor>
|
template<class vobj,class cobj,class compressor>
|
||||||
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
||||||
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
|
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
|
||||||
compressor &compress,int type,int partial)
|
compressor &compress,int type,int partial)
|
||||||
{
|
{
|
||||||
@ -83,6 +83,25 @@ public:
|
|||||||
// Wilson compressor will add alternate policies for Dirichlet
|
// Wilson compressor will add alternate policies for Dirichlet
|
||||||
// and possibly partial Dirichlet for DWF
|
// and possibly partial Dirichlet for DWF
|
||||||
////////////////////////////////////
|
////////////////////////////////////
|
||||||
|
/*
|
||||||
|
class FaceGatherDirichlet
|
||||||
|
{
|
||||||
|
// If it's dirichlet we don't assemble comms buffers
|
||||||
|
//
|
||||||
|
// Rely on zeroes in gauge field to drive the correct result
|
||||||
|
// NAN propgagation: field will locally wrap, so fermion should NOT contain NAN and just permute
|
||||||
|
template<class vobj,class cobj,class compressor>
|
||||||
|
static void Gather_plane_simple (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so){};
|
||||||
|
template<class vobj,class cobj,class compressor>
|
||||||
|
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
||||||
|
Vector<cobj *> pointers,int dimension,int plane,int cbmask,
|
||||||
|
compressor &compress,int type) {}
|
||||||
|
template<class decompressor,class Merger>
|
||||||
|
static void Merge(decompressor decompress,Merge &mm) { }
|
||||||
|
template<class decompressor,class Decompression>
|
||||||
|
static void Decompress(decompressor decompress,Decompression &dd) {}
|
||||||
|
};
|
||||||
|
*/
|
||||||
|
|
||||||
template<class vobj,class FaceGather>
|
template<class vobj,class FaceGather>
|
||||||
class SimpleCompressorGather : public FaceGather {
|
class SimpleCompressorGather : public FaceGather {
|
||||||
|
@ -31,6 +31,7 @@
|
|||||||
#define STENCIL_MAX (16)
|
#define STENCIL_MAX (16)
|
||||||
|
|
||||||
#include <Grid/stencil/SimpleCompressor.h> // subdir aggregate
|
#include <Grid/stencil/SimpleCompressor.h> // subdir aggregate
|
||||||
|
#include <Grid/stencil/Lebesgue.h> // subdir aggregate
|
||||||
#include <Grid/stencil/GeneralLocalStencil.h>
|
#include <Grid/stencil/GeneralLocalStencil.h>
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////
|
||||||
@ -255,6 +256,7 @@ protected:
|
|||||||
GridBase * _grid;
|
GridBase * _grid;
|
||||||
public:
|
public:
|
||||||
GridBase *Grid(void) const { return _grid; }
|
GridBase *Grid(void) const { return _grid; }
|
||||||
|
LebesgueOrder *lo;
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
// Needed to conveniently communicate gparity parameters into GPU memory
|
// Needed to conveniently communicate gparity parameters into GPU memory
|
||||||
@ -271,11 +273,11 @@ public:
|
|||||||
int face_table_computed;
|
int face_table_computed;
|
||||||
int partialDirichlet;
|
int partialDirichlet;
|
||||||
int fullDirichlet;
|
int fullDirichlet;
|
||||||
std::vector<deviceVector<std::pair<int,int> > > face_table ;
|
std::vector<commVector<std::pair<int,int> > > face_table ;
|
||||||
deviceVector<int> surface_list;
|
Vector<int> surface_list;
|
||||||
|
|
||||||
std::vector<StencilEntry> _entries; // Resident in host memory
|
stencilVector<StencilEntry> _entries; // Resident in managed memory
|
||||||
deviceVector<StencilEntry> _entries_device; // Resident in device memory
|
commVector<StencilEntry> _entries_device; // Resident in device memory
|
||||||
std::vector<Packet> Packets;
|
std::vector<Packet> Packets;
|
||||||
std::vector<Merge> Mergers;
|
std::vector<Merge> Mergers;
|
||||||
std::vector<Merge> MergersSHM;
|
std::vector<Merge> MergersSHM;
|
||||||
@ -365,9 +367,10 @@ public:
|
|||||||
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
|
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
|
||||||
{
|
{
|
||||||
// All GPU kernel tasks must complete
|
// All GPU kernel tasks must complete
|
||||||
accelerator_barrier(); // All kernels should ALREADY be complete
|
// accelerator_barrier(); // All kernels should ALREADY be complete
|
||||||
_grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
|
// _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
|
||||||
// But the HaloGather had a barrier too.
|
// But the HaloGather had a barrier too.
|
||||||
|
#ifdef ACCELERATOR_AWARE_MPI
|
||||||
for(int i=0;i<Packets.size();i++){
|
for(int i=0;i<Packets.size();i++){
|
||||||
_grid->StencilSendToRecvFromBegin(MpiReqs,
|
_grid->StencilSendToRecvFromBegin(MpiReqs,
|
||||||
Packets[i].send_buf,
|
Packets[i].send_buf,
|
||||||
@ -376,6 +379,23 @@ public:
|
|||||||
Packets[i].from_rank,Packets[i].do_recv,
|
Packets[i].from_rank,Packets[i].do_recv,
|
||||||
Packets[i].xbytes,Packets[i].rbytes,i);
|
Packets[i].xbytes,Packets[i].rbytes,i);
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
#warning "Using COPY VIA HOST BUFFERS IN STENCIL"
|
||||||
|
for(int i=0;i<Packets.size();i++){
|
||||||
|
// Introduce a host buffer with a cheap slab allocator and zero cost wipe all
|
||||||
|
Packets[i].host_send_buf = _grid->HostBufferMalloc(Packets[i].xbytes);
|
||||||
|
Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes);
|
||||||
|
if ( Packets[i].do_send ) {
|
||||||
|
acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes);
|
||||||
|
}
|
||||||
|
_grid->StencilSendToRecvFromBegin(MpiReqs,
|
||||||
|
Packets[i].host_send_buf,
|
||||||
|
Packets[i].to_rank,Packets[i].do_send,
|
||||||
|
Packets[i].host_recv_buf,
|
||||||
|
Packets[i].from_rank,Packets[i].do_recv,
|
||||||
|
Packets[i].xbytes,Packets[i].rbytes,i);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
// Get comms started then run checksums
|
// Get comms started then run checksums
|
||||||
// Having this PRIOR to the dslash seems to make Sunspot work... (!)
|
// Having this PRIOR to the dslash seems to make Sunspot work... (!)
|
||||||
for(int i=0;i<Packets.size();i++){
|
for(int i=0;i<Packets.size();i++){
|
||||||
@ -390,9 +410,18 @@ public:
|
|||||||
if ( this->partialDirichlet ) DslashLogPartial();
|
if ( this->partialDirichlet ) DslashLogPartial();
|
||||||
else if ( this->fullDirichlet ) DslashLogDirichlet();
|
else if ( this->fullDirichlet ) DslashLogDirichlet();
|
||||||
else DslashLogFull();
|
else DslashLogFull();
|
||||||
acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
|
// acceleratorCopySynchronise() is in the StencilSendToRecvFromComplete
|
||||||
accelerator_barrier();
|
// accelerator_barrier();
|
||||||
_grid->StencilBarrier();
|
_grid->StencilBarrier();
|
||||||
|
#ifndef ACCELERATOR_AWARE_MPI
|
||||||
|
#warning "Using COPY VIA HOST BUFFERS IN STENCIL"
|
||||||
|
for(int i=0;i<Packets.size();i++){
|
||||||
|
if ( Packets[i].do_recv ) {
|
||||||
|
acceleratorCopyToDevice(Packets[i].host_recv_buf, Packets[i].recv_buf,Packets[i].rbytes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_grid->HostBufferFreeAll();
|
||||||
|
#endif
|
||||||
// run any checksums
|
// run any checksums
|
||||||
for(int i=0;i<Packets.size();i++){
|
for(int i=0;i<Packets.size();i++){
|
||||||
if ( Packets[i].do_recv )
|
if ( Packets[i].do_recv )
|
||||||
@ -473,7 +502,7 @@ public:
|
|||||||
template<class compressor>
|
template<class compressor>
|
||||||
void HaloGather(const Lattice<vobj> &source,compressor &compress)
|
void HaloGather(const Lattice<vobj> &source,compressor &compress)
|
||||||
{
|
{
|
||||||
accelerator_barrier();
|
// accelerator_barrier();
|
||||||
_grid->StencilBarrier();// Synch shared memory on a single nodes
|
_grid->StencilBarrier();// Synch shared memory on a single nodes
|
||||||
|
|
||||||
assert(source.Grid()==_grid);
|
assert(source.Grid()==_grid);
|
||||||
@ -487,7 +516,6 @@ public:
|
|||||||
HaloGatherDir(source,compress,point,face_idx);
|
HaloGatherDir(source,compress,point,face_idx);
|
||||||
}
|
}
|
||||||
accelerator_barrier(); // All my local gathers are complete
|
accelerator_barrier(); // All my local gathers are complete
|
||||||
_grid->StencilBarrier();// Synch shared memory on a single nodes
|
|
||||||
face_table_computed=1;
|
face_table_computed=1;
|
||||||
assert(u_comm_offset==_unified_buffer_size);
|
assert(u_comm_offset==_unified_buffer_size);
|
||||||
}
|
}
|
||||||
@ -640,7 +668,7 @@ public:
|
|||||||
for(int point=0;point<this->_npoints;point++){
|
for(int point=0;point<this->_npoints;point++){
|
||||||
this->same_node[point] = this->SameNode(point);
|
this->same_node[point] = this->SameNode(point);
|
||||||
}
|
}
|
||||||
int32_t surface_list_size=0;
|
|
||||||
for(int site = 0 ;site< vol4;site++){
|
for(int site = 0 ;site< vol4;site++){
|
||||||
int local = 1;
|
int local = 1;
|
||||||
for(int point=0;point<this->_npoints;point++){
|
for(int point=0;point<this->_npoints;point++){
|
||||||
@ -650,30 +678,11 @@ public:
|
|||||||
}
|
}
|
||||||
if(local == 0) {
|
if(local == 0) {
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
surface_list_size++;
|
surface_list.push_back(site*Ls+s);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
|
//std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
|
||||||
surface_list.resize(surface_list_size);
|
|
||||||
std::vector<int> surface_list_host(surface_list_size);
|
|
||||||
int32_t ss=0;
|
|
||||||
for(int site = 0 ;site< vol4;site++){
|
|
||||||
int local = 1;
|
|
||||||
for(int point=0;point<this->_npoints;point++){
|
|
||||||
if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){
|
|
||||||
local = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if(local == 0) {
|
|
||||||
for(int s=0;s<Ls;s++){
|
|
||||||
int idx=site*Ls+s;
|
|
||||||
surface_list_host[ss]= idx;
|
|
||||||
ss++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int));
|
|
||||||
}
|
}
|
||||||
/// Introduce a block structure and switch off comms on boundaries
|
/// Introduce a block structure and switch off comms on boundaries
|
||||||
void DirichletBlock(const Coordinate &dirichlet_block)
|
void DirichletBlock(const Coordinate &dirichlet_block)
|
||||||
|
@ -207,10 +207,10 @@ cl::sycl::queue *theCopyAccelerator;
|
|||||||
void acceleratorInit(void)
|
void acceleratorInit(void)
|
||||||
{
|
{
|
||||||
int nDevices = 1;
|
int nDevices = 1;
|
||||||
// cl::sycl::gpu_selector selector;
|
cl::sycl::gpu_selector selector;
|
||||||
// cl::sycl::device selectedDevice { selector };
|
cl::sycl::device selectedDevice { selector };
|
||||||
theGridAccelerator = new sycl::queue (sycl::gpu_selector_v);
|
theGridAccelerator = new sycl::queue (selectedDevice);
|
||||||
theCopyAccelerator = new sycl::queue (sycl::gpu_selector_v);
|
theCopyAccelerator = new sycl::queue (selectedDevice);
|
||||||
// theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway.
|
// theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway.
|
||||||
|
|
||||||
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
|
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
|
||||||
|
@ -464,12 +464,16 @@ void Grid_init(int *argc,char ***argv)
|
|||||||
std::cout<<GridLogMessage<<std::endl;
|
std::cout<<GridLogMessage<<std::endl;
|
||||||
std::cout<<GridLogMessage<<"Performance:"<<std::endl;
|
std::cout<<GridLogMessage<<"Performance:"<<std::endl;
|
||||||
std::cout<<GridLogMessage<<std::endl;
|
std::cout<<GridLogMessage<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<" --comms-concurrent : Asynchronous MPI calls; several dirs at a time "<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<" --comms-sequential : Synchronous MPI calls; one dirs at a time "<<std::endl;
|
||||||
std::cout<<GridLogMessage<<" --comms-overlap : Overlap comms with compute "<<std::endl;
|
std::cout<<GridLogMessage<<" --comms-overlap : Overlap comms with compute "<<std::endl;
|
||||||
std::cout<<GridLogMessage<<std::endl;
|
std::cout<<GridLogMessage<<std::endl;
|
||||||
std::cout<<GridLogMessage<<" --dslash-generic: Wilson kernel for generic Nc"<<std::endl;
|
std::cout<<GridLogMessage<<" --dslash-generic: Wilson kernel for generic Nc"<<std::endl;
|
||||||
std::cout<<GridLogMessage<<" --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;
|
std::cout<<GridLogMessage<<" --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;
|
||||||
std::cout<<GridLogMessage<<" --dslash-asm : Wilson kernel for AVX512"<<std::endl;
|
std::cout<<GridLogMessage<<" --dslash-asm : Wilson kernel for AVX512"<<std::endl;
|
||||||
std::cout<<GridLogMessage<<std::endl;
|
std::cout<<GridLogMessage<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<" --lebesgue : Cache oblivious Lebesgue curve/Morton order/Z-graph stencil looping"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<" --cacheblocking n.m.o.p : Hypercuboidal cache blocking"<<std::endl;
|
||||||
std::cout<<GridLogMessage<<std::endl;
|
std::cout<<GridLogMessage<<std::endl;
|
||||||
exit(EXIT_SUCCESS);
|
exit(EXIT_SUCCESS);
|
||||||
}
|
}
|
||||||
@ -497,8 +501,28 @@ void Grid_init(int *argc,char ***argv)
|
|||||||
WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsThenCompute;
|
WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsThenCompute;
|
||||||
StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsThenCompute;
|
StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsThenCompute;
|
||||||
}
|
}
|
||||||
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-concurrent") ){
|
||||||
|
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyConcurrent);
|
||||||
|
}
|
||||||
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-sequential") ){
|
||||||
|
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
|
||||||
|
}
|
||||||
|
|
||||||
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
|
||||||
|
LebesgueOrder::UseLebesgueOrder=1;
|
||||||
|
}
|
||||||
CartesianCommunicator::nCommThreads = 1;
|
CartesianCommunicator::nCommThreads = 1;
|
||||||
|
#ifdef GRID_COMMS_THREADS
|
||||||
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){
|
||||||
|
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads");
|
||||||
|
GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads);
|
||||||
|
assert(CartesianCommunicator::nCommThreads > 0);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
|
||||||
|
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
|
||||||
|
GridCmdOptionIntVector(arg,LebesgueOrder::Block);
|
||||||
|
}
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--notimestamp") ){
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--notimestamp") ){
|
||||||
GridLogTimestamp(0);
|
GridLogTimestamp(0);
|
||||||
} else {
|
} else {
|
||||||
@ -549,31 +573,8 @@ void GridLogLayout() {
|
|||||||
|
|
||||||
void * Grid_backtrace_buffer[_NBACKTRACE];
|
void * Grid_backtrace_buffer[_NBACKTRACE];
|
||||||
|
|
||||||
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
|
|
||||||
{
|
|
||||||
fprintf(stderr,"Signal handler on host %s\n",hostname);
|
|
||||||
fprintf(stderr,"Caught signal %d\n",si->si_signo);
|
|
||||||
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
|
|
||||||
fprintf(stderr," code %d\n",si->si_code);
|
|
||||||
// x86 64bit
|
|
||||||
#ifdef __linux__
|
|
||||||
#ifdef __x86_64__
|
|
||||||
ucontext_t * uc= (ucontext_t *)ptr;
|
|
||||||
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
|
|
||||||
fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
fflush(stderr);
|
|
||||||
BACKTRACEFP(stderr);
|
|
||||||
fprintf(stderr,"Called backtrace\n");
|
|
||||||
fflush(stdout);
|
|
||||||
fflush(stderr);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
|
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
|
||||||
{
|
{
|
||||||
fprintf(stderr,"Signal handler on host %s\n",hostname);
|
|
||||||
fprintf(stderr,"Caught signal %d\n",si->si_signo);
|
fprintf(stderr,"Caught signal %d\n",si->si_signo);
|
||||||
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
|
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
|
||||||
fprintf(stderr," code %d\n",si->si_code);
|
fprintf(stderr," code %d\n",si->si_code);
|
||||||
@ -584,7 +585,7 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
|
|||||||
ucontext_t * uc= (ucontext_t *)ptr;
|
ucontext_t * uc= (ucontext_t *)ptr;
|
||||||
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
|
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
|
||||||
fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
|
fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
|
||||||
#define REG(A) fprintf(stderr," %s %lx\n",#A,sc-> A);
|
#define REG(A) printf(" %s %lx\n",#A,sc-> A);
|
||||||
REG(rdi);
|
REG(rdi);
|
||||||
REG(rsi);
|
REG(rsi);
|
||||||
REG(rbp);
|
REG(rbp);
|
||||||
@ -617,8 +618,8 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
|
|||||||
|
|
||||||
void Grid_exit_handler(void)
|
void Grid_exit_handler(void)
|
||||||
{
|
{
|
||||||
// BACKTRACEFP(stdout);
|
BACKTRACEFP(stdout);
|
||||||
// fflush(stdout);
|
fflush(stdout);
|
||||||
}
|
}
|
||||||
void Grid_debug_handler_init(void)
|
void Grid_debug_handler_init(void)
|
||||||
{
|
{
|
||||||
@ -626,10 +627,10 @@ void Grid_debug_handler_init(void)
|
|||||||
sigemptyset (&sa.sa_mask);
|
sigemptyset (&sa.sa_mask);
|
||||||
sa.sa_sigaction= Grid_sa_signal_handler;
|
sa.sa_sigaction= Grid_sa_signal_handler;
|
||||||
sa.sa_flags = SA_SIGINFO;
|
sa.sa_flags = SA_SIGINFO;
|
||||||
// sigaction(SIGSEGV,&sa,NULL);
|
sigaction(SIGSEGV,&sa,NULL);
|
||||||
sigaction(SIGTRAP,&sa,NULL);
|
sigaction(SIGTRAP,&sa,NULL);
|
||||||
sigaction(SIGBUS,&sa,NULL);
|
sigaction(SIGBUS,&sa,NULL);
|
||||||
// sigaction(SIGUSR2,&sa,NULL);
|
sigaction(SIGUSR2,&sa,NULL);
|
||||||
|
|
||||||
feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
|
feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
|
||||||
|
|
||||||
@ -637,14 +638,7 @@ void Grid_debug_handler_init(void)
|
|||||||
sigaction(SIGKILL,&sa,NULL);
|
sigaction(SIGKILL,&sa,NULL);
|
||||||
sigaction(SIGILL,&sa,NULL);
|
sigaction(SIGILL,&sa,NULL);
|
||||||
|
|
||||||
// Non terminating SIGUSR1/2 handler
|
atexit(Grid_exit_handler);
|
||||||
struct sigaction sa_ping;
|
|
||||||
sigemptyset (&sa_ping.sa_mask);
|
|
||||||
sa_ping.sa_sigaction= Grid_usr_signal_handler;
|
|
||||||
sa_ping.sa_flags = SA_SIGINFO;
|
|
||||||
sigaction(SIGHUP,&sa_ping,NULL);
|
|
||||||
|
|
||||||
// atexit(Grid_exit_handler);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
@ -644,6 +644,11 @@ int main (int argc, char ** argv)
|
|||||||
Grid_init(&argc,&argv);
|
Grid_init(&argc,&argv);
|
||||||
|
|
||||||
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
|
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
|
||||||
|
#ifdef KNL
|
||||||
|
LebesgueOrder::Block = std::vector<int>({8,2,2,2});
|
||||||
|
#else
|
||||||
|
LebesgueOrder::Block = std::vector<int>({2,2,2,2});
|
||||||
|
#endif
|
||||||
Benchmark::Decomposition();
|
Benchmark::Decomposition();
|
||||||
|
|
||||||
int do_su4=1;
|
int do_su4=1;
|
||||||
|
@ -70,7 +70,7 @@ int main (int argc, char ** argv)
|
|||||||
pRNG.SeedFixedIntegers(std::vector<int>({56,17,89,101}));
|
pRNG.SeedFixedIntegers(std::vector<int>({56,17,89,101}));
|
||||||
|
|
||||||
std::vector<double> stop(threads);
|
std::vector<double> stop(threads);
|
||||||
std::vector<Vec> sum(threads);
|
Vector<Vec> sum(threads);
|
||||||
|
|
||||||
std::vector<LatticeVec> x(threads,&Grid);
|
std::vector<LatticeVec> x(threads,&Grid);
|
||||||
for(int t=0;t<threads;t++){
|
for(int t=0;t<threads;t++){
|
||||||
|
@ -78,9 +78,9 @@ int main (int argc, char ** argv)
|
|||||||
double t0,t1;
|
double t0,t1;
|
||||||
|
|
||||||
typedef typename DomainWallFermionD::Coeff_t Coeff_t;
|
typedef typename DomainWallFermionD::Coeff_t Coeff_t;
|
||||||
std::vector<Coeff_t> diag = Dw.bs;
|
Vector<Coeff_t> diag = Dw.bs;
|
||||||
std::vector<Coeff_t> upper= Dw.cs;
|
Vector<Coeff_t> upper= Dw.cs;
|
||||||
std::vector<Coeff_t> lower= Dw.cs;
|
Vector<Coeff_t> lower= Dw.cs;
|
||||||
upper[Ls-1]=-Dw.mass_minus*upper[Ls-1];
|
upper[Ls-1]=-Dw.mass_minus*upper[Ls-1];
|
||||||
lower[0] =-Dw.mass_plus*lower[0];
|
lower[0] =-Dw.mass_plus*lower[0];
|
||||||
|
|
||||||
|
@ -861,7 +861,7 @@ int main (int argc, char ** argv)
|
|||||||
}
|
}
|
||||||
|
|
||||||
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
|
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
|
||||||
// LebesgueOrder::Block = std::vector<int>({2,2,2,2});
|
LebesgueOrder::Block = std::vector<int>({2,2,2,2});
|
||||||
|
|
||||||
Benchmark::Decomposition();
|
Benchmark::Decomposition();
|
||||||
|
|
||||||
|
22
configure.ac
22
configure.ac
@ -225,6 +225,18 @@ case ${ac_SFW_FP16} in
|
|||||||
AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);;
|
AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
|
############### Default to accelerator cshift, but revert to host if UCX is buggy or other reasons
|
||||||
|
AC_ARG_ENABLE([accelerator-aware-mpi],
|
||||||
|
[AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])],
|
||||||
|
[ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes])
|
||||||
|
|
||||||
|
case ${ac_ACCELERATOR_AWARE_MPI} in
|
||||||
|
yes)
|
||||||
|
AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on host])
|
||||||
|
AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);;
|
||||||
|
*);;
|
||||||
|
esac
|
||||||
|
|
||||||
|
|
||||||
############### SYCL/CUDA/HIP/none
|
############### SYCL/CUDA/HIP/none
|
||||||
AC_ARG_ENABLE([accelerator],
|
AC_ARG_ENABLE([accelerator],
|
||||||
@ -652,6 +664,16 @@ case ${ac_SHM_FAST_PATH} in
|
|||||||
*) ;;
|
*) ;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
|
############### communication type selection
|
||||||
|
AC_ARG_ENABLE([comms-threads],[AS_HELP_STRING([--enable-comms-threads | --disable-comms-threads],[Use multiple threads in MPI calls])],[ac_COMMS_THREADS=${enable_comms_threads}],[ac_COMMS_THREADS=yes])
|
||||||
|
|
||||||
|
case ${ac_COMMS_THREADS} in
|
||||||
|
yes)
|
||||||
|
AC_DEFINE([GRID_COMMS_THREADING],[1],[GRID_COMMS_NONE] )
|
||||||
|
;;
|
||||||
|
*) ;;
|
||||||
|
esac
|
||||||
|
|
||||||
############### communication type selection
|
############### communication type selection
|
||||||
AC_ARG_ENABLE([comms],[AS_HELP_STRING([--enable-comms=none|mpi|mpi-auto],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
|
AC_ARG_ENABLE([comms],[AS_HELP_STRING([--enable-comms=none|mpi|mpi-auto],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])
|
||||||
|
|
||||||
|
@ -1,23 +0,0 @@
|
|||||||
#Ahead of time compile for PVC
|
|
||||||
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
|
|
||||||
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions -fsycl-targets=spir64_gen -Xs -device -Xs pvc "
|
|
||||||
|
|
||||||
#JIT compile
|
|
||||||
#export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
|
|
||||||
#export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -Wno-tautological-compare -qmkl=parallel -fsycl -fno-exceptions "
|
|
||||||
|
|
||||||
../../configure \
|
|
||||||
--enable-simd=GPU \
|
|
||||||
--enable-gen-simd-width=64 \
|
|
||||||
--enable-comms=mpi-auto \
|
|
||||||
--enable-debug \
|
|
||||||
--disable-gparity \
|
|
||||||
--disable-fermion-reps \
|
|
||||||
--with-lime=$CLIME \
|
|
||||||
--enable-shm=nvlink \
|
|
||||||
--enable-accelerator=sycl \
|
|
||||||
--enable-accelerator-aware-mpi=yes\
|
|
||||||
--enable-unified=no \
|
|
||||||
MPICXX=mpicxx \
|
|
||||||
CXX=icpx
|
|
||||||
|
|
@ -1,15 +0,0 @@
|
|||||||
#module load oneapi/release/2023.12.15.001
|
|
||||||
#module load mpich/icc-all-debug-pmix-gpu/52.2
|
|
||||||
#module load mpich-config/mode/deterministic
|
|
||||||
#module load intel_compute_runtime/release/821.35
|
|
||||||
|
|
||||||
source ~/spack/share/spack/setup-env.sh
|
|
||||||
spack load c-lime
|
|
||||||
spack load openssl
|
|
||||||
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
|
|
||||||
export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
|
|
||||||
export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
|
|
||||||
export http_proxy=http://proxy.alcf.anl.gov:3128
|
|
||||||
export https_proxy=http://proxy.alcf.anl.gov:3128
|
|
||||||
git config --global http.proxy http://proxy.alcf.anl.gov:3128
|
|
||||||
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
|
|
@ -1,74 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
#PBS -l select=512
|
|
||||||
#PBS -q EarlyAppAccess
|
|
||||||
#PBS -A LatticeQCD_aesp_CNDA
|
|
||||||
#PBS -l walltime=6:00:00
|
|
||||||
#PBS -N reproBigJob
|
|
||||||
#PBS -k doe
|
|
||||||
|
|
||||||
#export OMP_PROC_BIND=spread
|
|
||||||
#unset OMP_PLACES
|
|
||||||
|
|
||||||
#module load oneapi/eng-compiler/2023.05.15.003
|
|
||||||
#module load mpich/51.2/icc-all-deterministic-pmix-gpu
|
|
||||||
|
|
||||||
# 56 cores / 6 threads ~9
|
|
||||||
export OMP_NUM_THREADS=6
|
|
||||||
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
|
|
||||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
|
|
||||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
|
|
||||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=10485760
|
|
||||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
|
|
||||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
|
|
||||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
|
|
||||||
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
|
|
||||||
|
|
||||||
#export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
|
|
||||||
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1
|
|
||||||
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
|
|
||||||
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
|
|
||||||
|
|
||||||
export GRID_PRINT_ENTIRE_LOG=0
|
|
||||||
export GRID_CHECKSUM_RECV_BUF=0
|
|
||||||
export GRID_CHECKSUM_SEND_BUF=0
|
|
||||||
|
|
||||||
export MPICH_OFI_NIC_POLICY=GPU
|
|
||||||
|
|
||||||
#export MPIR_CVAR_ALLREDUCE_DEVICE_COLLECTIVE=0
|
|
||||||
#export MPIR_CVAR_REDUCE_DEVICE_COLLECTIVE=0
|
|
||||||
#export MPIR_CVAR_ALLREDUCE_INTRA_ALGORITHM=recursive_doubling
|
|
||||||
#unset MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE
|
|
||||||
#unset MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE
|
|
||||||
#unset MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE
|
|
||||||
|
|
||||||
cd $PBS_O_WORKDIR
|
|
||||||
|
|
||||||
cp $PBS_NODEFILE nodefile
|
|
||||||
|
|
||||||
DIR=reproBigJob.$PBS_JOBID
|
|
||||||
|
|
||||||
mkdir -p $DIR
|
|
||||||
cd $DIR
|
|
||||||
|
|
||||||
cp $PBS_NODEFILE nodefile
|
|
||||||
|
|
||||||
BINARY=../Test_dwf_mixedcg_prec
|
|
||||||
|
|
||||||
echo > pingjob <<EOF
|
|
||||||
while read node ;
|
|
||||||
do
|
|
||||||
echo ssh $node killall -s USR1 -- ../Test_dwf_mixedcg_prec
|
|
||||||
done < nodefile
|
|
||||||
EOF
|
|
||||||
|
|
||||||
CMD="mpiexec -np 6144 -ppn 12 -envall --hostfile nodefile \
|
|
||||||
../gpu_tile_compact.sh \
|
|
||||||
$BINARY --mpi 8.8.8.12 --grid 128.128.128.288 \
|
|
||||||
--shm-mpi 0 --shm 4096 --device-mem 32000 --accelerator-threads 32 --seconds 18000 --debug-stdout --log Message --debug-signals --comms-overlap"
|
|
||||||
|
|
||||||
echo $CMD > command-line
|
|
||||||
env > environment
|
|
||||||
$CMD
|
|
||||||
grep Oops Grid.stderr.* > failures.$PBS_JOBID
|
|
||||||
rm core.*
|
|
@ -1,38 +1,67 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
#PBS -q EarlyAppAccess
|
#PBS -q debug
|
||||||
#PBS -l select=1
|
#PBS -l select=1
|
||||||
#PBS -l walltime=00:20:00
|
#PBS -l walltime=00:20:00
|
||||||
#PBS -A LatticeQCD_aesp_CNDA
|
#PBS -A LatticeQCD_aesp_CNDA
|
||||||
|
|
||||||
|
#export OMP_PROC_BIND=spread
|
||||||
|
#unset OMP_PLACES
|
||||||
|
|
||||||
cd $PBS_O_WORKDIR
|
cd $PBS_O_WORKDIR
|
||||||
|
|
||||||
source ../sourceme.sh
|
source ../sourceme.sh
|
||||||
|
module load pti-gpu
|
||||||
|
|
||||||
cp $PBS_NODEFILE nodefile
|
#cat $PBS_NODEFILE
|
||||||
|
|
||||||
export OMP_NUM_THREADS=4
|
export OMP_NUM_THREADS=4
|
||||||
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
|
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
|
||||||
unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
|
|
||||||
unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
|
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
|
||||||
unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
|
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
|
||||||
|
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
|
||||||
|
|
||||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
|
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
|
||||||
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
|
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
|
||||||
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
|
#export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
|
||||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
|
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ=1048576
|
||||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
|
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD=131072
|
||||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
|
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK=16
|
||||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
|
#export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS=16
|
||||||
export MPICH_OFI_NIC_POLICY=GPU
|
export MPICH_OFI_NIC_POLICY=GPU
|
||||||
|
|
||||||
|
# 12 ppn, 2 nodes, 24 ranks
|
||||||
|
#
|
||||||
CMD="mpiexec -np 12 -ppn 12 -envall \
|
CMD="mpiexec -np 12 -ppn 12 -envall \
|
||||||
./Benchmark_dwf_fp32 --mpi 2.1.2.3 --grid 32.32.64.48 \
|
./gpu_tile_compact.sh \
|
||||||
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --debug-signals"
|
./Benchmark_comms_host_device --mpi 2.2.1.3 --grid 24.32.32.24 \
|
||||||
|
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
|
||||||
|
#$CMD | tee 1node.comms
|
||||||
|
|
||||||
#for f in 1 2 3 4 5 6 7 8
|
|
||||||
for f in 1
|
CMD="mpiexec -np 1 -ppn 1 -envall \
|
||||||
do
|
./gpu_tile_compact.sh \
|
||||||
echo $CMD
|
./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 16.32.32.32 \
|
||||||
$CMD | tee 1node.32.32.64.48.dwf.hbm.$f
|
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 "
|
||||||
done
|
#$CMD | tee 1tile.dwf
|
||||||
|
|
||||||
|
CMD="mpiexec -np 12 -ppn 12 -envall \
|
||||||
|
./gpu_tile_compact.sh \
|
||||||
|
./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 32.32.32.48 \
|
||||||
|
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
|
||||||
|
$CMD | tee 1node.32.32.32.48.dwf
|
||||||
|
|
||||||
|
|
||||||
|
CMD="mpiexec -np 12 -ppn 12 -envall \
|
||||||
|
./gpu_tile_compact.sh \
|
||||||
|
./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.64.32.96 \
|
||||||
|
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
|
||||||
|
#$CMD | tee 1node.64.64.32.96.dwf
|
||||||
|
|
||||||
|
CMD="mpiexec -np 12 -ppn 12 -envall \
|
||||||
|
./gpu_tile_compact.sh \
|
||||||
|
./Benchmark_dwf_fp32 --mpi 2.2.1.3 --grid 64.32.32.48 \
|
||||||
|
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
|
||||||
|
#$CMD | tee 1node.64.32.32.48.dwf
|
||||||
|
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
#PBS -q EarlyAppAccess
|
#PBS -q workq
|
||||||
#PBS -l select=2
|
#PBS -l select=2
|
||||||
#PBS -l walltime=00:20:00
|
#PBS -l walltime=00:20:00
|
||||||
#PBS -A LatticeQCD_aesp_CNDA
|
#PBS -A LatticeQCD_aesp_CNDA
|
||||||
@ -11,16 +11,17 @@
|
|||||||
cd $PBS_O_WORKDIR
|
cd $PBS_O_WORKDIR
|
||||||
|
|
||||||
source ../sourceme.sh
|
source ../sourceme.sh
|
||||||
#module load pti-gpu
|
module load pti-gpu
|
||||||
|
|
||||||
|
#cat $PBS_NODEFILE
|
||||||
cp $PBS_NODEFILE nodefile
|
|
||||||
|
|
||||||
export OMP_NUM_THREADS=4
|
export OMP_NUM_THREADS=4
|
||||||
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
|
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
|
||||||
|
|
||||||
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
|
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
|
||||||
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
|
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
|
||||||
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
|
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
|
||||||
|
|
||||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
|
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE=0
|
||||||
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
|
export MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE=0
|
||||||
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
|
export MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST=1
|
||||||
@ -33,26 +34,22 @@ export MPICH_OFI_NIC_POLICY=GPU
|
|||||||
# 12 ppn, 2 nodes, 24 ranks
|
# 12 ppn, 2 nodes, 24 ranks
|
||||||
#
|
#
|
||||||
CMD="mpiexec -np 24 -ppn 12 -envall \
|
CMD="mpiexec -np 24 -ppn 12 -envall \
|
||||||
./gpu_tile.sh \
|
./gpu_tile_compact.sh \
|
||||||
./Benchmark_comms_host_device --mpi 2.2.2.3 --grid 24.32.32.24 \
|
./Benchmark_comms_host_device --mpi 2.2.2.3 --grid 24.32.32.24 \
|
||||||
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
|
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32"
|
||||||
#$CMD | tee 2node.comms.hbm
|
$CMD | tee 2node.comms
|
||||||
|
|
||||||
|
|
||||||
CMD="mpiexec -np 24 -ppn 12 -envall \
|
CMD="mpiexec -np 24 -ppn 12 -envall \
|
||||||
|
./gpu_tile_compact.sh \
|
||||||
./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.32.64.48 \
|
./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 32.32.64.48 \
|
||||||
--shm-mpi 1 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap --debug-signals"
|
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
|
||||||
|
$CMD | tee 2node.32.32.64.48.dwf
|
||||||
|
|
||||||
#for f in 1 2 3 4 5 6 7 8
|
|
||||||
for f in 1
|
|
||||||
do
|
|
||||||
echo $CMD
|
|
||||||
$CMD | tee 2node.32.32.64.48.dwf.hbm.$f
|
|
||||||
done
|
|
||||||
|
|
||||||
CMD="mpiexec -np 24 -ppn 12 -envall \
|
CMD="mpiexec -np 24 -ppn 12 -envall \
|
||||||
./gpu_tile.sh \
|
./gpu_tile_compact.sh \
|
||||||
./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 64.64.64.96 \
|
./Benchmark_dwf_fp32 --mpi 2.2.2.3 --grid 64.64.64.96 \
|
||||||
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
|
--shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap"
|
||||||
#$CMD | tee 2node.64.64.64.96.dwf.hbm
|
$CMD | tee 2node.64.64.64.96.dwf
|
||||||
|
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
|
|
||||||
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
|
export LDFLAGS="-fiopenmp -fsycl -fsycl-device-code-split=per_kernel -fsycl-targets=spir64_gen -Xs -device -Xs pvc -fsycl-device-lib=all -lze_loader -L${MKLROOT}/lib -qmkl=parallel -fsycl -lsycl "
|
||||||
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel -fsycl -fno-exceptions "
|
export CXXFLAGS="-O3 -fiopenmp -fsycl-unnamed-lambda -fsycl -I$INSTALL/include -Wno-tautological-compare -I$HOME/ -qmkl=parallel -fsycl -fno-exceptions -fsycl-targets=spir64_gen -Xs -device -Xs pvc "
|
||||||
../../configure \
|
../../configure \
|
||||||
--enable-simd=GPU \
|
--enable-simd=GPU \
|
||||||
--enable-gen-simd-width=64 \
|
--enable-gen-simd-width=64 \
|
||||||
|
@ -1,14 +1,40 @@
|
|||||||
module load oneapi/release/2023.12.15.001
|
|
||||||
#module load mpich/icc-all-debug-pmix-gpu/52.2
|
|
||||||
#module load mpich-config/mode/deterministic
|
|
||||||
#module load intel_compute_runtime/release/821.35
|
|
||||||
source ~/spack/share/spack/setup-env.sh
|
source ~/spack/share/spack/setup-env.sh
|
||||||
spack load c-lime
|
spack load c-lime
|
||||||
spack load openssl
|
|
||||||
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
|
export CLIME=`spack find --paths c-lime | grep ^c-lime | awk '{print $2}' `
|
||||||
|
#spack load libefence
|
||||||
|
#export EFENCE=`spack find --paths libefence | grep ^libefence | awk '{print $2}' `
|
||||||
|
#export LD_LIBRARY_PATH=${EFENCE}/lib:$LD_LIBRARY_PATH
|
||||||
|
#spack load gperftools
|
||||||
|
export TCMALLOC=/home/paboyle/gperftools/install
|
||||||
|
export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH
|
||||||
|
export INTELGT_AUTO_ATTACH_DISABLE=1
|
||||||
|
|
||||||
|
#export ONEAPI_DEVICE_SELECTOR=level_zero:0.0
|
||||||
|
#module load oneapi/release/2023.12.15.001
|
||||||
|
#module use /soft/modulefiles
|
||||||
|
#module load intel_compute_runtime/release/agama-devel-682.22
|
||||||
|
|
||||||
|
#export FI_CXI_DEFAULT_CQ_SIZE=131072
|
||||||
|
#export FI_CXI_CQ_FILL_PERCENT=20
|
||||||
|
#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
|
||||||
|
#export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-intel-enable-auto-large-GRF-mode"
|
||||||
|
|
||||||
|
#
|
||||||
|
# -ftarget-register-alloc-mode=pvc:default
|
||||||
|
# -ftarget-register-alloc-mode=pvc:small
|
||||||
|
# -ftarget-register-alloc-mode=pvc:large
|
||||||
|
# -ftarget-register-alloc-mode=pvc:auto
|
||||||
|
#export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1
|
||||||
|
|
||||||
export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
|
export HTTP_PROXY=http://proxy.alcf.anl.gov:3128
|
||||||
export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
|
export HTTPS_PROXY=http://proxy.alcf.anl.gov:3128
|
||||||
export http_proxy=http://proxy.alcf.anl.gov:3128
|
export http_proxy=http://proxy.alcf.anl.gov:3128
|
||||||
export https_proxy=http://proxy.alcf.anl.gov:3128
|
export https_proxy=http://proxy.alcf.anl.gov:3128
|
||||||
git config --global http.proxy http://proxy.alcf.anl.gov:3128
|
git config --global http.proxy http://proxy.alcf.anl.gov:3128
|
||||||
|
|
||||||
|
#source ~/spack/share/spack/setup-env.sh
|
||||||
|
#spack load gperftools
|
||||||
|
#export TCMALLOC=`spack find --paths gperftools | grep ^gperftools | awk '{print $2}' `
|
||||||
|
#export LD_LIBRARY_PATH=${TCMALLOC}/lib:$LD_LIBRARY_PATH
|
||||||
|
|
||||||
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
|
export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
|
||||||
|
@ -2,8 +2,7 @@
|
|||||||
|
|
||||||
## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
|
## qsub -q EarlyAppAccess -A Aurora_Deployment -I -l select=1 -l walltime=60:00
|
||||||
|
|
||||||
#PBS -l select=16
|
#PBS -l select=16:system=sunspot,place=scatter
|
||||||
#PBS -q EarlyAppAccess
|
|
||||||
#PBS -A LatticeQCD_aesp_CNDA
|
#PBS -A LatticeQCD_aesp_CNDA
|
||||||
#PBS -l walltime=01:00:00
|
#PBS -l walltime=01:00:00
|
||||||
#PBS -N dwf
|
#PBS -N dwf
|
||||||
@ -14,14 +13,19 @@
|
|||||||
|
|
||||||
cd $PBS_O_WORKDIR
|
cd $PBS_O_WORKDIR
|
||||||
|
|
||||||
source ../sourceme.sh
|
#source ../sourceme.sh
|
||||||
|
|
||||||
cat $PBS_NODEFILE
|
cat $PBS_NODEFILE
|
||||||
|
|
||||||
|
#export MPICH_COLL_SYNC=1
|
||||||
|
#export MPICH_ENV_DISPLAY=1
|
||||||
|
export MPICH_
|
||||||
export OMP_NUM_THREADS=3
|
export OMP_NUM_THREADS=3
|
||||||
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
|
export MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE=1
|
||||||
|
module load oneapi/eng-compiler/2023.05.15.003
|
||||||
|
module load mpich/51.2/icc-all-deterministic-pmix-gpu
|
||||||
|
#export LD_LIBRARY_PATH=/soft/restricted/CNDA/updates/2023.05.15.001/oneapi/compiler/eng-20230512/compiler/linux/lib/:$LD_LIBRARY_PATH
|
||||||
|
|
||||||
#module load mpich/51.2/icc-all-deterministic-pmix-gpu
|
|
||||||
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
|
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE
|
||||||
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
|
#unset MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE
|
||||||
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
|
#unset MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
#PBS -l select=16
|
#PBS -l select=16:system=sunspot,place=scatter
|
||||||
#PBS -q EarlyAppAccess
|
|
||||||
#PBS -A LatticeQCD_aesp_CNDA
|
#PBS -A LatticeQCD_aesp_CNDA
|
||||||
#PBS -l walltime=02:00:00
|
#PBS -l walltime=02:00:00
|
||||||
#PBS -N repro1gpu
|
#PBS -N repro1gpu
|
||||||
@ -10,9 +9,8 @@
|
|||||||
#export OMP_PROC_BIND=spread
|
#export OMP_PROC_BIND=spread
|
||||||
#unset OMP_PLACES
|
#unset OMP_PLACES
|
||||||
|
|
||||||
|
module load oneapi/eng-compiler/2023.05.15.003
|
||||||
#module load oneapi/eng-compiler/2023.05.15.003
|
module load mpich/51.2/icc-all-deterministic-pmix-gpu
|
||||||
#module load mpich/51.2/icc-all-deterministic-pmix-gpu
|
|
||||||
|
|
||||||
# 56 cores / 6 threads ~9
|
# 56 cores / 6 threads ~9
|
||||||
export OMP_NUM_THREADS=6
|
export OMP_NUM_THREADS=6
|
||||||
@ -36,8 +34,6 @@ export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file"
|
|||||||
|
|
||||||
cd $PBS_O_WORKDIR
|
cd $PBS_O_WORKDIR
|
||||||
|
|
||||||
source ../sourceme.sh
|
|
||||||
|
|
||||||
NN=`cat $PBS_NODEFILE | wc -l`
|
NN=`cat $PBS_NODEFILE | wc -l`
|
||||||
echo $PBS_NODEFILE
|
echo $PBS_NODEFILE
|
||||||
cat $PBS_NODEFILE
|
cat $PBS_NODEFILE
|
||||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user