mirror of https://github.com/paboyle/Grid.git
synced 2025-06-15 14:27:06 +01:00

Compare commits: 25ab9325e7 ... feature/ft (3 commits)

Author | SHA1       | Date
-------|------------|-----
       | bffd30abec |
       | da919949f9 |
       | b12b4fdaff |
@@ -12,13 +12,15 @@
 #include <iostream>
 #include <sys/time.h>
 
-
+#define GRID_SYCL
+#undef GRID_HIP
+#undef GRID_CUDA
 #ifdef GRID_HIP
 #include <hipblas/hipblas.h>
 #endif
 #ifdef GRID_CUDA
 #include <cublas_v2.h>
 
 #endif
 #ifdef GRID_SYCL
 #include <oneapi/mkl.hpp>
@@ -43,90 +45,6 @@ inline void acceleratorFreeDevice(void *ptr,size_t bytes){free(ptr,*theAccelerat
 inline void acceleratorMemSet(void *base,int value,size_t bytes) { theAccelerator->memset(base,value,bytes); theAccelerator->wait();}
 inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theAccelerator->memcpy(to,from,bytes); theAccelerator->wait();}
 inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theAccelerator->memcpy(to,from,bytes); theAccelerator->wait();}
-#define accelerator_barrier(dummy) { theAccelerator->wait(); }
-#endif
-
-#ifdef GRID_HIP
-hipStream_t copyStream;
-hipStream_t computeStream;
-void acceleratorInit(void)
-{
-  int device = 0;
-  auto discard = hipSetDevice(device);
-  discard = hipStreamCreate(&copyStream);
-  discard = hipStreamCreate(&computeStream);
-  printf("AcceleratorHIPInit\n");
-}
-inline void *acceleratorAllocDevice(size_t bytes)
-{
-  void *ptr=NULL;
-  auto err = hipMalloc((void **)&ptr,bytes);
-  if( err != hipSuccess ) {
-    ptr = (void *) NULL;
-    fprintf(stderr," hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
-  }
-  return ptr;
-};
-inline void acceleratorFreeDevice(void *ptr,size_t bytes){ auto discard=hipFree(ptr);};
-inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
-inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
-#define accelerator_barrier(dummy) \
-  { \
-    auto tmp=hipStreamSynchronize(computeStream); \
-    auto err = hipGetLastError(); \
-    if ( err != hipSuccess ) { \
-      printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
-      puts(__FILE__); \
-      printf("Line %d\n",__LINE__); \
-      exit(0); \
-    } \
-  }
-
-#endif
-
-#ifdef GRID_CUDA
-cudaStream_t copyStream;
-cudaStream_t computeStream;
-void acceleratorInit(void)
-{
-  int device = 0;
-  cudaSetDevice(device);
-  cudaStreamCreate(&copyStream);
-  cudaStreamCreate(&computeStream);
-}
-inline void *acceleratorAllocDevice(size_t bytes)
-{
-  void *ptr=NULL;
-  auto err = cudaMalloc((void **)&ptr,bytes);
-  if( err != cudaSuccess ) {
-    ptr = (void *) NULL;
-    printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
-  }
-  return ptr;
-};
-inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
-inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
-inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
-inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
-inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
-#define accelerator_barrier(dummy) \
-  { \
-    cudaStreamSynchronize(computeStream); \
-    cudaError err = cudaGetLastError(); \
-    if ( cudaSuccess != err ) { \
-      printf("accelerator_barrier(): Cuda error %s \n", \
-             cudaGetErrorString( err )); \
-      printf("File %s Line %d\n",__FILE__,__LINE__); \
-      fflush(stdout); \
-      if (acceleratorAbortOnGpuError) assert(err==cudaSuccess); \
-    } \
-  }
-
-#endif
-
 template<class T> void acceleratorPut(T& dev,T&host)
 {
   acceleratorCopyToDevice(&host,&dev,sizeof(T));
@@ -137,6 +55,9 @@ template<class T> T acceleratorGet(T& dev)
   acceleratorCopyFromDevice(&dev,&host,sizeof(T));
   return host;
 }
+
+#define accelerator_barrier(dummy) { theAccelerator->wait(); }
+#endif
 
 /**************************************************************
  * Allocator
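
For reference, a minimal usage sketch of the `acceleratorPut`/`acceleratorGet` helpers retained above, assuming Grid's accelerator headers are in scope; the variable names and the `int` payload are illustrative assumptions, not from the diff:

    // Sketch only: round-trip one POD value through device memory.
    deviceVector<int> dev(1);           // one device-resident int (Grid container)
    int host_val = 42;
    acceleratorPut(dev[0], host_val);   // copies sizeof(int) bytes host -> device
    int back = acceleratorGet(dev[0]);  // copies sizeof(int) bytes device -> host
    assert(back == 42);                 // needs <cassert>
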
@@ -289,270 +210,7 @@ public:
     gridblasHandle->wait();
 #endif
   }
 
-
-  /////////////////////////////////////////////////////////////
-  // Single matrix GEMM -- fp64 and fp32
-  /////////////////////////////////////////////////////////////
-  void gemm(GridBLASOperation_t OpA,
-            GridBLASOperation_t OpB,
-            int m,int n, int k,
-            ComplexD alpha,
-            ComplexD* Amk,  // Device pointer
-            ComplexD* Bkn,
-            ComplexD beta,
-            ComplexD* Cmn)
-  {
-    RealD t2=usecond();
-
-    assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
-    assert(OpB!=GridBLAS_OP_T);
-
-    int lda = m; // m x k column major
-    int ldb = k; // k x n column major
-    int ldc = m; // m x b column major
-    if(OpA!=GridBLAS_OP_N)
-      lda = k;
-    if(OpB!=GridBLAS_OP_N)
-      ldb = n;
-
-    static deviceVector<ComplexD> alpha_p(1);
-    static deviceVector<ComplexD> beta_p(1);
-    // can prestore the 1 and the zero on device
-    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
-    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
-    RealD t0=usecond();
-
-#ifdef GRID_HIP
-    hipblasOperation_t hOpA;
-    hipblasOperation_t hOpB;
-    if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
-    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
-    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
-    if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
-    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
-    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
-    auto err = hipblasZgemm(gridblasHandle,
-                            hOpA,
-                            hOpB,
-                            m,n,k,
-                            (hipblasDoubleComplex *) &alpha_p[0],
-                            (hipblasDoubleComplex *) Amk, lda,
-                            (hipblasDoubleComplex *) Bkn, ldb,
-                            (hipblasDoubleComplex *) &beta_p[0],
-                            (hipblasDoubleComplex *) Cmn, ldc);
-    assert(err==HIPBLAS_STATUS_SUCCESS);
-#endif
-#ifdef GRID_CUDA
-    cublasOperation_t hOpA;
-    cublasOperation_t hOpB;
-    if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
-    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
-    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
-    if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
-    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
-    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
-    auto err = cublasZgemm(gridblasHandle,
-                           hOpA,
-                           hOpB,
-                           m,n,k,
-                           (cuDoubleComplex *) &alpha_p[0],
-                           (cuDoubleComplex *) Amk, lda,
-                           (cuDoubleComplex *) Bkn, ldb,
-                           (cuDoubleComplex *) &beta_p[0],
-                           (cuDoubleComplex *) Cmn, ldc);
-    assert(err==CUBLAS_STATUS_SUCCESS);
-#endif
-#ifdef GRID_SYCL
-    int64_t m64=m;
-    int64_t n64=n;
-    int64_t k64=k;
-    int64_t lda64=lda;
-    int64_t ldb64=ldb;
-    int64_t ldc64=ldc;
-
-    oneapi::mkl::transpose iOpA;
-    oneapi::mkl::transpose iOpB;
-
-    if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
-    if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
-    if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
-    if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
-    if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
-    if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
-
-    oneapi::mkl::blas::column_major::gemm(*gridblasHandle,
-                                          iOpA,
-                                          iOpB,
-                                          m64,n64,k64,
-                                          (ComplexD *) &alpha_p[0],
-                                          (const ComplexD *)Amk, (int64_t )lda64,
-                                          (const ComplexD *)Bkn, (int64_t )ldb64,
-                                          (ComplexD *) &beta_p[0],
-                                          (ComplexD *)Cmn, (int64_t)ldc64);
-    synchronise();
-#endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
-    // Need a default/reference implementation; use Eigen
-    if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
-      Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,m,k);
-      Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,k,n);
-      Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
-      eCmn = beta * eCmn + alpha * eAmk * eBkn ;
-    } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
-      Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,k,m);
-      Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,k,n);
-      Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
-      eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
-    } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
-      Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,m,k);
-      Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,n,k);
-      Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
-      eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
-    } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
-      Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,k,m);
-      Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,n,k);
-      Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
-      eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
-    } else {
-      assert(0);
-    }
-#endif
-    RealD t1=usecond();
-    RealD flops = 8.0*m*n*k;
-    RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n);
-  }
-  void gemm(GridBLASOperation_t OpA,
-            GridBLASOperation_t OpB,
-            int m,int n, int k,
-            ComplexF alpha,
-            ComplexF* Amk,  // Device pointer
-            ComplexF* Bkn,
-            ComplexF beta,
-            ComplexF* Cmn)
-  {
-    RealD t2=usecond();
-
-    assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
-    assert(OpB!=GridBLAS_OP_T);
-
-    int lda = m; // m x k column major
-    int ldb = k; // k x n column major
-    int ldc = m; // m x b column major
-    if(OpA!=GridBLAS_OP_N)
-      lda = k;
-    if(OpB!=GridBLAS_OP_N)
-      ldb = n;
-
-    static deviceVector<ComplexF> alpha_p(1);
-    static deviceVector<ComplexF> beta_p(1);
-    // can prestore the 1 and the zero on device
-    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexF));
-    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
-    RealD t0=usecond();
-
-#ifdef GRID_HIP
-    hipblasOperation_t hOpA;
-    hipblasOperation_t hOpB;
-    if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
-    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
-    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
-    if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
-    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
-    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
-    auto err = hipblasCgemm(gridblasHandle,
-                            hOpA,
-                            hOpB,
-                            m,n,k,
-                            (hipblasComplex *) &alpha_p[0],
-                            (hipblasComplex *) Amk, lda,
-                            (hipblasComplex *) Bkn, ldb,
-                            (hipblasComplex *) &beta_p[0],
-                            (hipblasComplex *) Cmn, ldc);
-    assert(err==HIPBLAS_STATUS_SUCCESS);
-#endif
-#ifdef GRID_CUDA
-    cublasOperation_t hOpA;
-    cublasOperation_t hOpB;
-    if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
-    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
-    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
-    if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
-    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
-    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
-    auto err = cublasCgemm(gridblasHandle,
-                           hOpA,
-                           hOpB,
-                           m,n,k,
-                           (cuComplex *) &alpha_p[0],
-                           (cuComplex *) Amk, lda,
-                           (cuComplex *) Bkn, ldb,
-                           (cuComplex *) &beta_p[0],
-                           (cuComplex *) Cmn, ldc);
-    assert(err==CUBLAS_STATUS_SUCCESS);
-#endif
-#ifdef GRID_SYCL
-    int64_t m64=m;
-    int64_t n64=n;
-    int64_t k64=k;
-    int64_t lda64=lda;
-    int64_t ldb64=ldb;
-    int64_t ldc64=ldc;
-
-    oneapi::mkl::transpose iOpA;
-    oneapi::mkl::transpose iOpB;
-
-    if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
-    if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
-    if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
-    if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
-    if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
-    if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
-
-    oneapi::mkl::blas::column_major::gemm(*gridblasHandle,
-                                          iOpA,
-                                          iOpB,
-                                          m64,n64,k64,
-                                          (ComplexF *) &alpha_p[0],
-                                          (const ComplexF *)Amk, (int64_t )lda64,
-                                          (const ComplexF *)Bkn, (int64_t )ldb64,
-                                          (ComplexF *) &beta_p[0],
-                                          (ComplexF *)Cmn, (int64_t )ldc64);
-    synchronise();
-#endif
-#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
-    // Need a default/reference implementation; use Eigen
-    if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
-      Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,m,k);
-      Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,k,n);
-      Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
-      eCmn = beta * eCmn + alpha * eAmk * eBkn ;
-    } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
-      Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,k,m);
-      Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,k,n);
-      Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
-      eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
-    } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
-      Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,m,k);
-      Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,n,k);
-      Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
-      eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
-    } else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
-      Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,k,m);
-      Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,n,k);
-      Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
-      eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
-    } else {
-      assert(0);
-    }
-#endif
-    RealD t1=usecond();
-    RealD flops = 8.0*m*n*k;
-    RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n);
-  }
-
-
-  /////////////////////////////////////////////////////////////
   void gemmBatched(int m,int n, int k,
                    ComplexD alpha,
                    deviceVector<ComplexD*> &Amk,  // pointer list to matrices
@@ -583,6 +241,36 @@ public:
                 beta,
                 Cmn);
   }
+  void gemmBatched(int m,int n, int k,
+                   RealD alpha,
+                   deviceVector<RealD*> &Amk,  // pointer list to matrices
+                   deviceVector<RealD*> &Bkn,
+                   RealD beta,
+                   deviceVector<RealD*> &Cmn)
+  {
+    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
+                m,n,k,
+                alpha,
+                Amk,
+                Bkn,
+                beta,
+                Cmn);
+  }
+  void gemmBatched(int m,int n, int k,
+                   RealF alpha,
+                   deviceVector<RealF*> &Amk,  // pointer list to matrices
+                   deviceVector<RealF*> &Bkn,
+                   RealF beta,
+                   deviceVector<RealF*> &Cmn)
+  {
+    gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
+                m,n,k,
+                alpha,
+                Amk,
+                Bkn,
+                beta,
+                Cmn);
+  }
 
   void gemmBatched(GridBLASOperation_t OpA,
                    GridBLASOperation_t OpB,
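
A sketch of how the real-precision convenience overloads added above would be driven, assuming Grid's `GridBLAS` and `deviceVector` as used elsewhere in this diff; the sizes, buffer names, and the `blas` object are illustrative assumptions, not from the commit:

    // Sketch only: C_p = 1.0 * A_p * B_p + 0.0 * C_p for a batch of real matrices.
    GridBLAS blas;
    const int m=16, n=16, k=64, batch=32;               // illustrative sizes
    deviceVector<RealD>  Abuf(batch*m*k), Bbuf(batch*k*n), Cbuf(batch*m*n);
    deviceVector<RealD*> A(batch), B(batch), C(batch);  // pointer lists to matrices
    for(int p=0;p<batch;p++){
      RealD *a=&Abuf[p*m*k], *b=&Bbuf[p*k*n], *c=&Cbuf[p*m*n];
      acceleratorPut(A[p],a);  // store each device pointer in the pointer list
      acceleratorPut(B[p],b);
      acceleratorPut(C[p],c);
    }
    blas.gemmBatched(m,n,k,
                     RealD(1.0),   // alpha
                     A,B,
                     RealD(0.0),   // beta: overwrite C
                     C);
    blas.synchronise();            // wait for the batched GEMM to complete
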
@@ -935,6 +623,301 @@ public:
     RealD flops = 8.0*m*n*k*batchCount;
     RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount;
   }
+
+  ///////////////////////////////////////////////////////////////////////////
+  // Single precision real GEMM
+  ///////////////////////////////////////////////////////////////////////////
+
+  void gemmBatched(GridBLASOperation_t OpA,
+                   GridBLASOperation_t OpB,
+                   int m,int n, int k,
+                   RealF alpha,
+                   deviceVector<RealF*> &Amk,  // pointer list to matrices
+                   deviceVector<RealF*> &Bkn,
+                   RealF beta,
+                   deviceVector<RealF*> &Cmn)
+  {
+    RealD t2=usecond();
+    int32_t batchCount = Amk.size();
+
+    assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
+    assert(OpB!=GridBLAS_OP_C);
+
+    int lda = m; // m x k column major
+    int ldb = k; // k x n column major
+    int ldc = m; // m x b column major
+    if(OpA!=GridBLAS_OP_N)
+      lda = k;
+    if(OpB!=GridBLAS_OP_N)
+      ldb = n;
+    static deviceVector<RealF> alpha_p(1);
+    static deviceVector<RealF> beta_p(1);
+    // can prestore the 1 and the zero on device
+    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealF));
+    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF));
+    RealD t0=usecond();
+
+    assert(Bkn.size()==batchCount);
+    assert(Cmn.size()==batchCount);
+#ifdef GRID_HIP
+    hipblasOperation_t hOpA;
+    hipblasOperation_t hOpB;
+    if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
+    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
+    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
+    if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
+    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
+    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
+    auto err = hipblasSgemmBatched(gridblasHandle,
+                                   hOpA,
+                                   hOpB,
+                                   m,n,k,
+                                   (float *) &alpha_p[0],
+                                   (float **)&Amk[0], lda,
+                                   (float **)&Bkn[0], ldb,
+                                   (float *) &beta_p[0],
+                                   (float **)&Cmn[0], ldc,
+                                   batchCount);
+    assert(err==HIPBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_CUDA
+    cublasOperation_t hOpA;
+    cublasOperation_t hOpB;
+    if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
+    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
+    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
+    if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
+    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
+    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
+    auto err = cublasSgemmBatched(gridblasHandle,
+                                  hOpA,
+                                  hOpB,
+                                  m,n,k,
+                                  (float *) &alpha_p[0],
+                                  (float **)&Amk[0], lda,
+                                  (float **)&Bkn[0], ldb,
+                                  (float *) &beta_p[0],
+                                  (float **)&Cmn[0], ldc,
+                                  batchCount);
+    assert(err==CUBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_SYCL
+    int64_t m64=m;
+    int64_t n64=n;
+    int64_t k64=k;
+    int64_t lda64=lda;
+    int64_t ldb64=ldb;
+    int64_t ldc64=ldc;
+    int64_t batchCount64=batchCount;
+
+    oneapi::mkl::transpose iOpA;
+    oneapi::mkl::transpose iOpB;
+
+    if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
+    if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
+    if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
+    if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
+    if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
+    if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
+
+    oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
+                                                &iOpA,
+                                                &iOpB,
+                                                &m64,&n64,&k64,
+                                                (float *) &alpha_p[0],
+                                                (const float **)&Amk[0], (const int64_t *)&lda64,
+                                                (const float **)&Bkn[0], (const int64_t *)&ldb64,
+                                                (float *) &beta_p[0],
+                                                (float **)&Cmn[0], (const int64_t *)&ldc64,
+                                                (int64_t)1,&batchCount64,std::vector<sycl::event>());
+    synchronise();
+#endif
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+    // Need a default/reference implementation; use Eigen
+    if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
+      thread_for (p, batchCount, {
+        Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
+        Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
+        Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
+        eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+      });
+    } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
+      thread_for (p, batchCount, {
+        Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
+        Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
+        Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
+        eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+      });
+    } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
+      thread_for (p, batchCount, {
+        Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
+        Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
+        Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
+        eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+      });
+    } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
+      thread_for (p, batchCount, {
+        Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
+        Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
+        Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
+        eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+      } );
+    } else {
+      assert(0);
+    }
+#endif
+    RealD t1=usecond();
+    RealD flops = 2.0*m*n*k*batchCount;
+    RealD bytes = 1.0*sizeof(RealF)*(m*k+k*n+m*n)*batchCount;
+  }
+
+  ///////////////////////////////////////////////////////////////////////////
+  // Double precision real GEMM
+  ///////////////////////////////////////////////////////////////////////////
+  void gemmBatched(GridBLASOperation_t OpA,
+                   GridBLASOperation_t OpB,
+                   int m,int n, int k,
+                   RealD alpha,
+                   deviceVector<RealD*> &Amk,  // pointer list to matrices
+                   deviceVector<RealD*> &Bkn,
+                   RealD beta,
+                   deviceVector<RealD*> &Cmn)
+  {
+    RealD t2=usecond();
+    int32_t batchCount = Amk.size();
+
+    assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
+    assert(OpB!=GridBLAS_OP_C);
+
+    int lda = m; // m x k column major
+    int ldb = k; // k x n column major
+    int ldc = m; // m x b column major
+    if(OpA!=GridBLAS_OP_N)
+      lda = k;
+    if(OpB!=GridBLAS_OP_N)
+      ldb = n;
+
+    static deviceVector<RealD> alpha_p(1);
+    static deviceVector<RealD> beta_p(1);
+    // can prestore the 1 and the zero on device
+    acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealD));
+    acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD));
+    RealD t0=usecond();
+
+    assert(Bkn.size()==batchCount);
+    assert(Cmn.size()==batchCount);
+#ifdef GRID_HIP
+    hipblasOperation_t hOpA;
+    hipblasOperation_t hOpB;
+    if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
+    if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
+    if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
+    if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
+    if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
+    if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
+    auto err = hipblasDgemmBatched(gridblasHandle,
+                                   hOpA,
+                                   hOpB,
+                                   m,n,k,
+                                   (double *) &alpha_p[0],
+                                   (double **)&Amk[0], lda,
+                                   (double **)&Bkn[0], ldb,
+                                   (double *) &beta_p[0],
+                                   (double **)&Cmn[0], ldc,
+                                   batchCount);
+    assert(err==HIPBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_CUDA
+    cublasOperation_t hOpA;
+    cublasOperation_t hOpB;
+    if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
+    if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
+    if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
+    if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
+    if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
+    if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
+    auto err = cublasDgemmBatched(gridblasHandle,
+                                  hOpA,
+                                  hOpB,
+                                  m,n,k,
+                                  (double *) &alpha_p[0],
+                                  (double **)&Amk[0], lda,
+                                  (double **)&Bkn[0], ldb,
+                                  (double *) &beta_p[0],
+                                  (double **)&Cmn[0], ldc,
+                                  batchCount);
+    assert(err==CUBLAS_STATUS_SUCCESS);
+#endif
+#ifdef GRID_SYCL
+    int64_t m64=m;
+    int64_t n64=n;
+    int64_t k64=k;
+    int64_t lda64=lda;
+    int64_t ldb64=ldb;
+    int64_t ldc64=ldc;
+    int64_t batchCount64=batchCount;
+
+    oneapi::mkl::transpose iOpA;
+    oneapi::mkl::transpose iOpB;
+
+    if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
+    if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
+    if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
+    if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
+    if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
+    if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
+
+    oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
+                                                &iOpA,
+                                                &iOpB,
+                                                &m64,&n64,&k64,
+                                                (double *) &alpha_p[0],
+                                                (const double **)&Amk[0], (const int64_t *)&lda64,
+                                                (const double **)&Bkn[0], (const int64_t *)&ldb64,
+                                                (double *) &beta_p[0],
+                                                (double **)&Cmn[0], (const int64_t *)&ldc64,
+                                                (int64_t)1,&batchCount64,std::vector<sycl::event>());
+    synchronise();
+#endif
+#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
+    // Need a default/reference implementation; use Eigen
+    if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
+      thread_for (p, batchCount, {
+        Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
+        Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
+        Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
+        eCmn = beta * eCmn + alpha * eAmk * eBkn ;
+      });
+    } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
+      thread_for (p, batchCount, {
+        Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
+        Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
+        Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
+        eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
+      });
+    } else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
+      thread_for (p, batchCount, {
+        Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
+        Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
+        Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
+        eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
+      });
+    } else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
+      thread_for (p, batchCount, {
+        Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
+        Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
+        Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
+        eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
+      });
+    } else {
+      assert(0);
+    }
+#endif
+    RealD t1=usecond();
+    RealD flops = 2.0*m*n*k*batchCount;
+    RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
+  }
 
 template<class CComplex>
 double benchmark(int M, int N, int K, int BATCH)
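
As a cross-check of the flop counters used in these kernels (8.0*m*n*k in the complex paths, 2.0*m*n*k in the new real paths), the standard GEMM cost model for C <- alpha op(A) op(B) + beta C is:

    F_real = 2 m n k,        F_complex = 8 m n k,

since each of the m*n*k multiply-add pairs costs 2 real flops for real data, while a complex multiply-add costs 4 real multiplies plus 4 real adds. The batched variants simply scale both counts by batchCount.
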
@@ -984,47 +967,6 @@ public:
     return flops; // Returns gigaflops
   }
-
-  template<class CComplex>
-  double benchmark(int M, int N, int K)
-  {
-    int32_t N_A = M*K;
-    int32_t N_B = K*N;
-    int32_t N_C = M*N;
-    deviceVector<CComplex> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(CComplex));
-    deviceVector<CComplex> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(CComplex));
-    deviceVector<CComplex> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(CComplex));
-    CComplex alpha(1.0);
-    CComplex beta (1.0);
-    RealD flops = 8.0*M*N*K;
-    int ncall=10;
-
-    gemm(GridBLAS_OP_C,GridBLAS_OP_N,
-         M,N,K,
-         alpha,
-         &A[0], // m x k
-         &B[0], // k x n
-         beta,
-         &C[0]);
-    synchronise();
-
-    RealD t0 = usecond();
-    for(int i=0;i<ncall;i++){
-      gemm(GridBLAS_OP_N,GridBLAS_OP_N,
-           M,N,K,
-           alpha,
-           &A[0], // m x k
-           &B[0], // k x n
-           beta,
-           &C[0]);
-      synchronise();
-    }
-    RealD t1 = usecond();
-    RealD bytes = 1.0*sizeof(CComplex)*(M*N*2+N*K+M*K);
-    flops = 8.0*M*N*K*ncall;
-    flops = flops/(t1-t0)/1.e3;
-    return flops; // Returns gigaflops
-  }
-
 };
 
 
@@ -1093,21 +1035,6 @@ static void BLAS(void)
       std::cout<< M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl;
     }}
     fprintf(FP,"\n\n\n");
-
-    std::cout << "----------------------------------------------------------"<<std::endl;
-    std::cout << " M "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (inner product matrix)"<<std::endl;
-    std::cout << "----------------------------------------------------------"<<std::endl;
-    {
-      int M=12;
-      int N=12;
-      std::vector<int> ks({4*1024*1024, 2*1024*1024, 1024*1024, 256*1024, 1024 });
-      for( int kk=0;kk<ks.size();kk++ ) {
-        int K = ks[kk];
-        double p=blas.benchmark<CComplex>(M,N,K);
-        fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, 1, p);
-        std::cout<< M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<1<<"\t\t"<<p<<std::endl;
-      }
-    }
   std::cout << "=================================================================================="<<std::endl;
 };
 
@@ -1,2 +1,2 @@
 
-mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL
+mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench
@@ -1,5 +0,0 @@
-CXX=hipcc
-MPICXX=mpicxx
-CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 -I/opt/cray/pe/mpich/8.1.28/ofi/gnu/12.3/include -DGRID_HIP"
-LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas -lmpi_gnu_123"
-hipcc $CXXFLAGS $LDFLAGS BatchBlasBench.cc -o BatchBlasBench
@@ -1,2 +0,0 @@
-
-mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL
@@ -50,7 +50,6 @@ NAMESPACE_CHECK(approx);
 #include <Grid/algorithms/deflation/Deflation.h>
 #include <Grid/algorithms/deflation/MultiRHSBlockProject.h>
 #include <Grid/algorithms/deflation/MultiRHSDeflation.h>
-#include <Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h>
 NAMESPACE_CHECK(deflation);
 #include <Grid/algorithms/iterative/ConjugateGradient.h>
 NAMESPACE_CHECK(ConjGrad);
@@ -168,7 +168,6 @@ public:
 template<class vobj>
 void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
 #ifndef HAVE_FFTW
-    std::cerr << "FFTW is not compiled but is called"<<std::endl;
   assert(0);
 #else
   conformable(result.Grid(),vgrid);
@@ -191,8 +190,7 @@ public:
 
   Lattice<sobj> pgbuf(&pencil_g);
   autoView(pgbuf_v , pgbuf, CpuWrite);
-  std::cout << "CPU view" << std::endl;
 
   typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
   typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
@@ -215,7 +213,6 @@ public:
   else if ( sign == forward ) div = 1.0;
   else assert(0);
 
-  std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
   FFTW_plan p;
   {
     FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
@@ -229,7 +226,6 @@ public:
   }
 
   // Barrel shift and collect global pencil
-  std::cout << GridLogPerformance<<"Making pencil" << std::endl;
   Coordinate lcoor(Nd), gcoor(Nd);
   result = source;
   int pc = processor_coor[dim];
@@ -251,7 +247,6 @@ public:
     }
   }
 
-  std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
   // Loop over orthog coords
   int NN=pencil_g.lSites();
   GridStopWatch timer;
@@ -274,7 +269,6 @@ public:
   usec += timer.useconds();
   flops+= flops_call*NN;
 
-  std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
   // writing out result
   {
     autoView(pgbuf_v,pgbuf,CpuRead);
@@ -291,7 +285,6 @@ public:
   }
   result = result*div;
 
-  std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
   // destroying plan
   FFTW<scalar>::fftw_destroy_plan(p);
 #endif
@@ -103,38 +103,6 @@ public:
     _Mat.MdagM(in,out);
   }
 };
-template<class Matrix,class Field>
-class MMdagLinearOperator : public LinearOperatorBase<Field> {
-  Matrix &_Mat;
-public:
-  MMdagLinearOperator(Matrix &Mat): _Mat(Mat){};
-
-  // Support for coarsening to a multigrid
-  void OpDiag (const Field &in, Field &out) {
-    _Mat.Mdiag(in,out);
-  }
-  void OpDir  (const Field &in, Field &out,int dir,int disp) {
-    _Mat.Mdir(in,out,dir,disp);
-  }
-  void OpDirAll  (const Field &in, std::vector<Field> &out){
-    _Mat.MdirAll(in,out);
-  };
-  void Op     (const Field &in, Field &out){
-    _Mat.M(in,out);
-  }
-  void AdjOp  (const Field &in, Field &out){
-    _Mat.Mdag(in,out);
-  }
-  void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
-    _Mat.MMdag(in,out);
-    ComplexD dot = innerProduct(in,out);
-    n1=real(dot);
-    n2=norm2(out);
-  }
-  void HermOp(const Field &in, Field &out){
-    _Mat.MMdag(in,out);
-  }
-};
-
 ////////////////////////////////////////////////////////////////////
 // Construct herm op and shift it for mgrid smoother
@@ -45,11 +45,6 @@ public:
     M(in,tmp);
     Mdag(tmp,out);
   }
-  virtual void MMdag(const Field &in, Field &out) {
-    Field tmp (in.Grid());
-    Mdag(in,tmp);
-    M(tmp,out);
-  }
   virtual void Mdiag    (const Field &in, Field &out)=0;
   virtual void Mdir     (const Field &in, Field &out,int dir, int disp)=0;
   virtual void MdirAll  (const Field &in, std::vector<Field> &out)=0;
@@ -59,7 +59,7 @@ public:
   RealD diff = hi-lo;
   RealD delta = diff*1.0e-9;
   for (RealD x=lo; x<hi; x+=delta) {
-    delta*=1.02;
+    delta*=1.1;
     RealD f = approx(x);
     out<< x<<" "<<f<<std::endl;
   }
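
A rough estimate of what the csv change above does to the sample count: the step sizes form a geometric series, so with growth ratio r and initial step delta_0 = 1e-9*(hi-lo), the number of points N satisfies approximately

    sum_{j=1..N} delta_0 r^j = hi - lo   =>   N ~ ln((r-1)*1e9) / ln(r),

which gives roughly 850 points at r = 1.02 and roughly 190 at r = 1.1, i.e. a csv output around 4-5x smaller under this approximation.
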
@@ -131,26 +131,6 @@ public:
       Coeffs[j] = s * 2.0/order;
     }
   };
-  template<class functor>
-  void Init(RealD _lo,RealD _hi,int _order, functor & func)
-  {
-    lo=_lo;
-    hi=_hi;
-    order=_order;
-
-    if(order < 2) exit(-1);
-    Coeffs.resize(order);
-    for(int j=0;j<order;j++){
-      RealD s=0;
-      for(int k=0;k<order;k++){
-        RealD y=std::cos(M_PI*(k+0.5)/order);
-        RealD x=0.5*(y*(hi-lo)+(hi+lo));
-        RealD f=func(x);
-        s=s+f*std::cos( j*M_PI*(k+0.5)/order );
-      }
-      Coeffs[j] = s * 2.0/order;
-    }
-  };
-
-
 void JacksonSmooth(void){
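
Both the retained Init (context above) and the functor overload deleted here evaluate the Chebyshev expansion coefficients by Gauss-Chebyshev quadrature. In the code's variables, with nodes y_k = cos(pi (k+1/2)/order):

    x_k = [ (hi-lo) y_k + (hi+lo) ] / 2,
    c_j = (2/order) * sum_{k=0..order-1} f(x_k) cos( j pi (k+1/2) / order ),

which is exactly `Coeffs[j] = s * 2.0/order` with `s` the accumulated sum.
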
@@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
 typedef cublasHandle_t gridblasHandle_t;
 #endif
 #ifdef GRID_SYCL
-typedef sycl::queue *gridblasHandle_t;
+typedef cl::sycl::queue *gridblasHandle_t;
 #endif
 #ifdef GRID_ONE_MKL
-typedef sycl::queue *gridblasHandle_t;
+typedef cl::sycl::queue *gridblasHandle_t;
 #endif
 #if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
 typedef int32_t gridblasHandle_t;
@@ -89,9 +89,9 @@ public:
     gridblasHandle = theGridAccelerator;
 #endif
 #ifdef GRID_ONE_MKL
-    sycl::gpu_selector selector;
-    sycl::device selectedDevice { selector };
-    sycl::property_list q_prop{sycl::property::queue::in_order()};
+    cl::sycl::gpu_selector selector;
+    cl::sycl::device selectedDevice { selector };
+    cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()};
     gridblasHandle =new sycl::queue (selectedDevice,q_prop);
 #endif
     gridblasInit=1;
@ -1,376 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: MultiRHSBlockCGLinalg.h
|
|
||||||
|
|
||||||
Copyright (C) 2024
|
|
||||||
|
|
||||||
Author: Peter Boyle <pboyle@bnl.gov>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#pragma once
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
|
||||||
|
|
||||||
|
|
||||||
/* Need helper object for BLAS accelerated mrhs blockCG */
|
|
||||||
template<class Field>
|
|
||||||
class MultiRHSBlockCGLinalg
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
|
|
||||||
typedef typename Field::scalar_type scalar;
|
|
||||||
typedef typename Field::scalar_object scalar_object;
|
|
||||||
typedef typename Field::vector_object vector_object;
|
|
||||||
|
|
||||||
deviceVector<scalar> BLAS_X; // nrhs x vol -- the sources
|
|
||||||
deviceVector<scalar> BLAS_Y; // nrhs x vol -- the result
|
|
||||||
deviceVector<scalar> BLAS_C; // nrhs x nrhs -- the coefficients
|
|
||||||
deviceVector<scalar> BLAS_Cred; // nrhs x nrhs x oSites -- reduction buffer
|
|
||||||
deviceVector<scalar *> Xdip;
|
|
||||||
deviceVector<scalar *> Ydip;
|
|
||||||
deviceVector<scalar *> Cdip;
|
|
||||||
|
|
||||||
MultiRHSBlockCGLinalg() {};
|
|
||||||
~MultiRHSBlockCGLinalg(){ Deallocate(); };
|
|
||||||
|
|
||||||
void Deallocate(void)
|
|
||||||
{
|
|
||||||
Xdip.resize(0);
|
|
||||||
Ydip.resize(0);
|
|
||||||
Cdip.resize(0);
|
|
||||||
BLAS_Cred.resize(0);
|
|
||||||
BLAS_C.resize(0);
|
|
||||||
BLAS_X.resize(0);
|
|
||||||
BLAS_Y.resize(0);
|
|
||||||
}
|
|
||||||
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0)
|
|
||||||
{
|
|
||||||
std::vector<Field> Y_copy(AP.size(),AP[0].Grid());
|
|
||||||
for(int r=0;r<AP.size();r++){
|
|
||||||
Y_copy[r] = Y[r];
|
|
||||||
}
|
|
||||||
MulMatrix(AP,m,X);
|
|
||||||
for(int r=0;r<AP.size();r++){
|
|
||||||
AP[r] = scale*AP[r]+Y_copy[r];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
void MulMatrix(std::vector<Field> &Y, Eigen::MatrixXcd &m , const std::vector<Field> &X)
|
|
||||||
{
|
|
||||||
typedef typename Field::scalar_type scomplex;
|
|
||||||
GridBase *grid;
|
|
||||||
uint64_t vol;
|
|
||||||
uint64_t words;
|
|
||||||
|
|
||||||
int nrhs = Y.size();
|
|
||||||
grid = X[0].Grid();
|
|
||||||
vol = grid->lSites();
|
|
||||||
words = sizeof(scalar_object)/sizeof(scalar);
|
|
||||||
int64_t vw = vol * words;
|
|
||||||
|
|
||||||
RealD t0 = usecond();
|
|
||||||
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
|
|
||||||
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
|
|
||||||
BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
|
|
||||||
RealD t1 = usecond();
|
|
||||||
|
|
||||||
/////////////////////////////////////////////
|
|
||||||
// Copy in the multi-rhs sources
|
|
||||||
/////////////////////////////////////////////
|
|
||||||
for(int r=0;r<nrhs;r++){
|
|
||||||
int64_t offset = r*vw;
|
|
||||||
autoView(x_v,X[r],AcceleratorRead);
|
|
||||||
acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Assumes Eigen storage contiguous
|
|
||||||
acceleratorCopyToDevice(&m(0,0),&BLAS_C[0],BLAS_C.size()*sizeof(scalar));
|
|
||||||
|
|
||||||
/*
|
|
||||||
* in Fortran column major notation (cuBlas order)
|
|
||||||
*
|
|
||||||
* Xxr = [X1(x)][..][Xn(x)]
|
|
||||||
* Yxr = [Y1(x)][..][Ym(x)]
|
|
||||||
* Y = X . C
|
|
||||||
*/
|
|
||||||
deviceVector<scalar *> Xd(1);
|
|
||||||
deviceVector<scalar *> Yd(1);
|
|
||||||
deviceVector<scalar *> Cd(1);
|
|
||||||
|
|
||||||
scalar * Xh = & BLAS_X[0];
|
|
||||||
scalar * Yh = & BLAS_Y[0];
|
|
||||||
scalar * Ch = & BLAS_C[0];
|
|
||||||
|
|
||||||
acceleratorPut(Xd[0],Xh);
|
|
||||||
acceleratorPut(Yd[0],Yh);
|
|
||||||
acceleratorPut(Cd[0],Ch);
|
|
||||||
|
|
||||||
RealD t2 = usecond();
|
|
||||||
GridBLAS BLAS;
|
|
||||||
/////////////////////////////////////////
|
|
||||||
// Y = X*C (transpose?)
|
|
||||||
/////////////////////////////////////////
|
|
||||||
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
|
|
||||||
vw,nrhs,nrhs,
|
|
||||||
scalar(1.0),
|
|
||||||
Xd,
|
|
||||||
Cd,
|
|
||||||
scalar(0.0), // wipe out Y
|
|
||||||
Yd);
|
|
||||||
BLAS.synchronise();
|
|
||||||
RealD t3 = usecond();
|
|
||||||
|
|
||||||
// Copy back Y = m X
|
|
||||||
for(int r=0;r<nrhs;r++){
|
|
||||||
int64_t offset = r*vw;
|
|
||||||
autoView(y_v,Y[r],AcceleratorWrite);
|
|
||||||
acceleratorCopyDeviceToDevice(&BLAS_Y[offset],&y_v[0],sizeof(scalar_object)*vol);
|
|
||||||
}
|
|
||||||
RealD t4 = usecond();
|
|
||||||
std::cout <<GridLogPerformance << "MulMatrix alloc took "<< t1-t0<<" us"<<std::endl;
|
|
||||||
std::cout <<GridLogPerformance<< "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
|
|
||||||
std::cout <<GridLogPerformance<< "MulMatrix blas took "<< t3-t2<<" us"<<std::endl;
|
|
||||||
std::cout <<GridLogPerformance<< "MulMatrix copy took "<< t4-t3<<" us"<<std::endl;
|
|
||||||
std::cout <<GridLogPerformance<< "MulMatrix total "<< t4-t0<<" us"<<std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y)
|
|
||||||
{
|
|
||||||
#if 0
|
|
||||||
int nrhs;
|
|
||||||
GridBase *grid;
|
|
||||||
uint64_t vol;
|
|
||||||
uint64_t words;
|
|
||||||
|
|
||||||
nrhs = X.size();
|
|
||||||
assert(X.size()==Y.size());
|
|
||||||
conformable(X[0],Y[0]);
|
|
||||||
|
|
||||||
grid = X[0].Grid();
|
|
||||||
vol = grid->lSites();
|
|
||||||
words = sizeof(scalar_object)/sizeof(scalar);
|
|
||||||
int64_t vw = vol * words;
|
|
||||||
|
|
||||||
RealD t0 = usecond();
|
|
||||||
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
|
|
||||||
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
|
|
||||||
BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
|
|
||||||
RealD t1 = usecond();
|
|
||||||
|
|
||||||
/////////////////////////////////////////////
|
|
||||||
// Copy in the multi-rhs sources
|
|
||||||
/////////////////////////////////////////////
|
|
||||||
for(int r=0;r<nrhs;r++){
|
|
||||||
int64_t offset = r*vw;
|
|
||||||
autoView(x_v,X[r],AcceleratorRead);
|
|
||||||
acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
|
|
||||||
autoView(y_v,Y[r],AcceleratorRead);
|
|
||||||
acceleratorCopyDeviceToDevice(&y_v[0],&BLAS_Y[offset],sizeof(scalar_object)*vol);
|
|
||||||
}
|
|
||||||
RealD t2 = usecond();
|
|
||||||
|
|
||||||
/*
|
|
||||||
* in Fortran column major notation (cuBlas order)
|
|
||||||
*
|
|
||||||
* Xxr = [X1(x)][..][Xn(x)]
|
|
||||||
*
|
|
||||||
* Yxr = [Y1(x)][..][Ym(x)]
|
|
||||||
*
|
|
||||||
* C_rs = X^dag Y
|
|
||||||
*/
|
|
||||||
deviceVector<scalar *> Xd(1);
|
|
||||||
deviceVector<scalar *> Yd(1);
|
|
||||||
deviceVector<scalar *> Cd(1);
|
|
||||||
|
|
||||||
scalar * Xh = & BLAS_X[0];
|
|
||||||
scalar * Yh = & BLAS_Y[0];
|
|
||||||
scalar * Ch = & BLAS_C[0];
|
|
||||||
|
|
||||||
acceleratorPut(Xd[0],Xh);
|
|
||||||
acceleratorPut(Yd[0],Yh);
|
|
||||||
acceleratorPut(Cd[0],Ch);
|
|
||||||
|
|
||||||
GridBLAS BLAS;
|
|
||||||
|
|
||||||
RealD t3 = usecond();
|
|
||||||
/////////////////////////////////////////
|
|
||||||
// C_rs = X^dag Y
|
|
||||||
/////////////////////////////////////////
|
|
||||||
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
|
|
||||||
nrhs,nrhs,vw,
|
|
||||||
ComplexD(1.0),
|
|
||||||
Xd,
|
|
||||||
Yd,
|
|
||||||
ComplexD(0.0), // wipe out C
|
|
||||||
Cd);
|
|
||||||
BLAS.synchronise();
|
|
||||||
RealD t4 = usecond();
|
|
||||||
|
|
||||||
std::vector<scalar> HOST_C(BLAS_C.size()); // nrhs . nrhs -- the coefficients
|
|
||||||
acceleratorCopyFromDevice(&BLAS_C[0],&HOST_C[0],BLAS_C.size()*sizeof(scalar));
|
|
||||||
grid->GlobalSumVector(&HOST_C[0],nrhs*nrhs);
|
|
||||||
|
|
||||||
RealD t5 = usecond();
|
|
||||||
for(int rr=0;rr<nrhs;rr++){
|
|
||||||
for(int r=0;r<nrhs;r++){
|
|
||||||
int off = r+nrhs*rr;
|
|
||||||
m(r,rr)=HOST_C[off];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
RealD t6 = usecond();
|
|
||||||
-	uint64_t M=nrhs;
-	uint64_t N=nrhs;
-	uint64_t K=vw;
-	RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
-	RealD flops = 8.0*M*N*K;
-	flops = flops/(t4-t3)/1.e3;
-	bytes = bytes/(t4-t3)/1.e3;
-	std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
-	std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
-	std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us"<<std::endl;
-	std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
-	std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
-	std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
-	std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
-	std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
-	std::cout <<GridLogPerformance<< "InnerProductMatrix cp t6 "<< t6-t5<<" us"<<std::endl;
-	std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
-#else
-	int nrhs;
-	GridBase *grid;
-	uint64_t vol;
-	uint64_t words;
-
-	nrhs = X.size();
-	assert(X.size()==Y.size());
-	conformable(X[0],Y[0]);
-
-	grid  = X[0].Grid();
-	int rd0 = grid->_rdimensions[0] * grid->_rdimensions[1];
-	vol   = grid->oSites()/rd0;
-	words = rd0*sizeof(vector_object)/sizeof(scalar);
-	int64_t vw = vol * words;
-	assert(vw == grid->lSites()*sizeof(scalar_object)/sizeof(scalar));
-
-	RealD t0 = usecond();
-	BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
-	BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
-	BLAS_Cred.resize(nrhs * nrhs * vol);// cost free if size doesn't change
-	RealD t1 = usecond();
-
-	/////////////////////////////////////////////
-	// Copy in the multi-rhs sources -- layout batched BLAS ready
-	/////////////////////////////////////////////
-	for(int r=0;r<nrhs;r++){
-	  autoView(x_v,X[r],AcceleratorRead);
-	  autoView(y_v,Y[r],AcceleratorRead);
-	  scalar *from_x=(scalar *)&x_v[0];
-	  scalar *from_y=(scalar *)&y_v[0];
-	  scalar *BX = &BLAS_X[0];
-	  scalar *BY = &BLAS_Y[0];
-	  accelerator_for(ssw,vw,1,{
-	      uint64_t ss=ssw/words;
-	      uint64_t w=ssw%words;
-	      uint64_t offset = w+r*words+ss*nrhs*words; // [ss][rhs][words]
-	      BX[offset] = from_x[ssw];
-	      BY[offset] = from_y[ssw];
-	  });
-	}
-	RealD t2 = usecond();
-
-	/*
-	 * in Fortran column major notation (cuBlas order)
-	 *
-	 * Xxr = [X1(x)][..][Xn(x)]
-	 *
-	 * Yxr = [Y1(x)][..][Ym(x)]
-	 *
-	 * C_rs = X^dag Y
-	 */
-	Xdip.resize(vol);
-	Ydip.resize(vol);
-	Cdip.resize(vol);
-	std::vector<scalar *> Xh(vol);
-	std::vector<scalar *> Yh(vol);
-	std::vector<scalar *> Ch(vol);
-	for(uint64_t ss=0;ss<vol;ss++){
-
-	  Xh[ss] = & BLAS_X[ss*nrhs*words];
-	  Yh[ss] = & BLAS_Y[ss*nrhs*words];
-	  Ch[ss] = & BLAS_Cred[ss*nrhs*nrhs];
-
-	}
-	acceleratorCopyToDevice(&Xh[0],&Xdip[0],vol*sizeof(scalar *));
-	acceleratorCopyToDevice(&Yh[0],&Ydip[0],vol*sizeof(scalar *));
-	acceleratorCopyToDevice(&Ch[0],&Cdip[0],vol*sizeof(scalar *));
-
-	GridBLAS BLAS;
-
-	RealD t3 = usecond();
-	/////////////////////////////////////////
-	// C_rs = X^dag Y
-	/////////////////////////////////////////
-	BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
-			 nrhs,nrhs,words,
-			 ComplexD(1.0),
-			 Xdip,
-			 Ydip,
-			 ComplexD(0.0),  // wipe out C
-			 Cdip);
-	BLAS.synchronise();
-	RealD t4 = usecond();
-
-	std::vector<scalar> HOST_C(BLAS_Cred.size());      // nrhs . nrhs -- the coefficients
-	acceleratorCopyFromDevice(&BLAS_Cred[0],&HOST_C[0],BLAS_Cred.size()*sizeof(scalar));
-
-	RealD t5 = usecond();
-	m = Eigen::MatrixXcd::Zero(nrhs,nrhs);
-	for(int ss=0;ss<vol;ss++){
-	  Eigen::Map<Eigen::MatrixXcd> eC((std::complex<double> *)&HOST_C[ss*nrhs*nrhs],nrhs,nrhs);
-	  m = m + eC;
-	}
-	RealD t6l = usecond();
-	grid->GlobalSumVector((scalar *) &m(0,0),nrhs*nrhs);
-	RealD t6 = usecond();
-	uint64_t M=nrhs;
-	uint64_t N=nrhs;
-	uint64_t K=vw;
-	RealD xybytes = grid->lSites()*sizeof(scalar_object);
-	RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
-	RealD flops = 8.0*M*N*K;
-	flops = flops/(t4-t3)/1.e3;
-	bytes = bytes/(t4-t3)/1.e3;
-	xybytes = 4*xybytes/(t2-t1)/1.e3;
-	std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
-	std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
-	std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
-	std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
-	std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
-	std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
-	std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
-	std::cout <<GridLogPerformance<< "InnerProductMatrix cp t5 "<< t5-t4<<" us"<<std::endl;
-	std::cout <<GridLogPerformance<< "InnerProductMatrix lsum t6l "<< t6l-t5<<" us"<<std::endl;
-	std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t6 "<< t6-t6l<<" us"<<std::endl;
-	std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
-#endif
-
-  }
-};
-
-NAMESPACE_END(Grid);
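An editorial note on the packing above: BLAS_X and BLAS_Y store one contiguous (words x nrhs) column-major panel per site (offset = w + r*words + ss*nrhs*words), so every site becomes one small GEMM in the batch, and the per-site nrhs x nrhs products are then summed. A minimal host-side sketch of the same reduction, assuming only Eigen and std::complex; innerProductMatrixRef and its flat std::vector inputs are illustrative, not Grid API:

#include <Eigen/Dense>
#include <complex>
#include <vector>

// Reference reduction: X,Y hold vol panels of shape (words x nrhs), column
// major, matching the offset = w + r*words + ss*nrhs*words layout above.
Eigen::MatrixXcd innerProductMatrixRef(const std::vector<std::complex<double>> &X,
                                       const std::vector<std::complex<double>> &Y,
                                       int nrhs, long vol, long words)
{
  Eigen::MatrixXcd C = Eigen::MatrixXcd::Zero(nrhs,nrhs);
  for(long ss=0; ss<vol; ss++){
    Eigen::Map<const Eigen::MatrixXcd> Xs(X.data()+ss*nrhs*words, words, nrhs);
    Eigen::Map<const Eigen::MatrixXcd> Ys(Y.data()+ss*nrhs*words, words, nrhs);
    C += Xs.adjoint() * Ys;   // per-site C_rs += X_r^dag Y_s, as in the batched GEMM
  }
  return C;                   // the parallel code follows with a GlobalSumVector
}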
@@ -447,10 +447,10 @@ public:
     /////////////////////////////////////////
     BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
		     nbasis,nrhs,vw,
-		     scalar(1.0),
+		     ComplexD(1.0),
		     Vd,
		     Fd,
-		     scalar(0.0), // wipe out C
+		     ComplexD(0.0), // wipe out C
		     Cd);
     BLAS.synchronise();
     //    std::cout << "BlockProject done"<<std::endl;
@@ -497,10 +497,10 @@ public:
     int64_t vw = block_vol * words;
     BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
		     vw,nrhs,nbasis,
-		     scalar(1.0),
+		     ComplexD(1.0),
		     Vd,
		     Cd,
-		     scalar(0.0), // wipe out C
+		     ComplexD(0.0), // wipe out C
		     Fd);
     BLAS.synchronise();
     //    std::cout << " blas call done"<<std::endl;
@@ -182,10 +182,10 @@ public:
     /////////////////////////////////////////
     BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
		     nev,nrhs,vw,
-		     scalar(1.0),
+		     ComplexD(1.0),
		     Ed,
		     Rd,
-		     scalar(0.0), // wipe out C
+		     ComplexD(0.0), // wipe out C
		     Cd);
     BLAS.synchronise();

@@ -210,10 +210,10 @@ public:
     /////////////////////////////////////////
     BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
		     vw,nrhs,nev,
-		     scalar(1.0),
+		     ComplexD(1.0),
		     Ed, // x . nev
		     Cd, // nev . nrhs
-		     scalar(0.0),
+		     ComplexD(0.0),
		     Gd);
     BLAS.synchronise();
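The four hunks above all make the same change: the gemmBatched alpha/beta coefficients move between a fixed ComplexD and the field's own scalar type, which is what lets one code path serve both double- and single-precision fields. A hypothetical (non-Grid) reference of a batched GEMM with a templated coefficient type; gemmBatchedRef is an assumption for illustration, and it performs plain N,N products where the calls above apply an adjoint to the left operand:

#include <complex>
#include <vector>

// Plain batched GEMM with the coefficient type templated on the field scalar;
// all matrices are column major. A: m x k, B: k x n, C: m x n per batch entry.
template<class scalar>
void gemmBatchedRef(int m, int n, int k, scalar alpha,
                    const std::vector<const scalar*> &A,
                    const std::vector<const scalar*> &B,
                    scalar beta,
                    const std::vector<scalar*> &C)
{
  for(size_t b=0; b<C.size(); b++){
    for(int j=0; j<n; j++){
      for(int i=0; i<m; i++){
        scalar acc = scalar(0.0);
        for(int l=0; l<k; l++) acc += A[b][i+l*m] * B[b][l+j*k];
        C[b][i+j*m] = alpha*acc + beta*C[b][i+j*m];   // beta = 0 wipes out C
      }
    }
  }
}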
@@ -53,7 +53,6 @@ class TwoLevelCGmrhs
   // Fine operator, Smoother, CoarseSolver
   LinearOperatorBase<Field> &_FineLinop;
   LinearFunction<Field>   &_Smoother;
-  MultiRHSBlockCGLinalg<Field> _BlockCGLinalg;

   GridStopWatch ProjectTimer;
   GridStopWatch PromoteTimer;
@@ -63,12 +62,7 @@ class TwoLevelCGmrhs
   GridStopWatch SmoothTimer;
   GridStopWatch InsertTimer;

-  /*
-  Field rrr;
-  Field sss;
-  Field qqq;
-  Field zzz;
-  */
   // more most opertor functions
   TwoLevelCGmrhs(RealD tol,
		 Integer maxit,
@@ -79,313 +73,12 @@ class TwoLevelCGmrhs
     MaxIterations(maxit),
     _FineLinop(FineLinop),
     _Smoother(Smoother)
-    /*
-    rrr(fine),
-    sss(fine),
-    qqq(fine),
-    zzz(fine)
-    */
   {
     grid       = fine;
   };

   // Vector case
   virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
-  {
-    //    SolveSingleSystem(src,x);
-    SolvePrecBlockCG(src,x);
-  }
-
-  ////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Thin QR factorisation (google it)
-  ////////////////////////////////////////////////////////////////////////////////////////////////////
-  ////////////////////////////////////////////////////////////////////////////////////////////////////
-  //Dimensions
-  // R_{ferm x Nblock} =  Q_{ferm x Nblock} x  C_{Nblock x Nblock} -> ferm x Nblock
-  //
-  // Rdag R = m_rr = Herm = L L^dag        <-- Cholesky decomposition (LLT routine in Eigen)
-  //
-  //   Q  C = R => Q = R C^{-1}
-  //
-  // Want  Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock}
-  //
-  // Set C = L^{dag}, and then Q^dag Q = ident
-  //
-  // Checks:
-  // Cdag C = Rdag R ; passes.
-  // QdagQ  = 1      ; passes
-  ////////////////////////////////////////////////////////////////////////////////////////////////////
-  void ThinQRfact (Eigen::MatrixXcd &m_zz,
-		   Eigen::MatrixXcd &C,
-		   Eigen::MatrixXcd &Cinv,
-		   std::vector<Field> & Q,
-		   std::vector<Field> & MQ,
-		   const std::vector<Field> & Z,
-		   const std::vector<Field> & MZ)
-  {
-    RealD t0=usecond();
-    _BlockCGLinalg.InnerProductMatrix(m_zz,MZ,Z);
-    RealD t1=usecond();
-
-    m_zz = 0.5*(m_zz+m_zz.adjoint());
-
-    Eigen::MatrixXcd L    = m_zz.llt().matrixL();
-
-    C    = L.adjoint();
-    Cinv = C.inverse();
-
-    RealD t3=usecond();
-    _BlockCGLinalg.MulMatrix( Q,Cinv,Z);
-    _BlockCGLinalg.MulMatrix(MQ,Cinv,MZ);
-    RealD t4=usecond();
-    std::cout << " ThinQRfact IP    :"<< t1-t0<<" us"<<std::endl;
-    std::cout << " ThinQRfact Eigen :"<< t3-t1<<" us"<<std::endl;
-    std::cout << " ThinQRfact MulMat:"<< t4-t3<<" us"<<std::endl;
-  }
-
-  virtual void SolvePrecBlockCG (std::vector<Field> &src, std::vector<Field> &X)
-  {
-    std::cout << GridLogMessage<<"HDCG: mrhs fPrecBlockcg starting"<<std::endl;
-    src[0].Grid()->Barrier();
-    int nrhs = src.size();
-    //    std::vector<RealD> f(nrhs);
-    //    std::vector<RealD> rtzp(nrhs);
-    //    std::vector<RealD> rtz(nrhs);
-    //    std::vector<RealD> a(nrhs);
-    //    std::vector<RealD> d(nrhs);
-    //    std::vector<RealD> b(nrhs);
-    //    std::vector<RealD> rptzp(nrhs);
-
-    ////////////////////////////////////////////
-    //Initial residual computation & set up
-    ////////////////////////////////////////////
-    std::vector<RealD> ssq(nrhs);
-    for(int rhs=0;rhs<nrhs;rhs++){
-      ssq[rhs]=norm2(src[rhs]); assert(ssq[rhs]!=0.0);
-    }
-
-    ///////////////////////////
-    // Fields -- eliminate duplicates between fPcg and block cg
-    ///////////////////////////
-    std::vector<Field> Mtmp(nrhs,grid);
-    std::vector<Field> tmp(nrhs,grid);
-    std::vector<Field>   Z(nrhs,grid); // Rename Z to R
-    std::vector<Field>  MZ(nrhs,grid); // Rename MZ to Z
-    std::vector<Field>   Q(nrhs,grid); //
-    std::vector<Field>  MQ(nrhs,grid); // Rename to P
-    std::vector<Field>   D(nrhs,grid);
-    std::vector<Field>  AD(nrhs,grid);
-
-    /************************************************************************
-     * Preconditioned Block conjugate gradient rQ
-     * Generalise Sebastien Birk Thesis, after Dubrulle 2001.
-     * Introduce preconditioning following Saad Ch9
-     ************************************************************************
-     * Dimensions:
-     *
-     *   X,B etc... ==(Nferm x nrhs)
-     *   Matrix A==(Nferm x Nferm)
-     *
-     * Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
-     * QC => Thin QR factorisation (google it)
-     *
-     * R = B-AX
-     * Z = Mi R
-     * QC = Z
-     * D = Q
-     * for k:
-     *   R  = AD
-     *   Z  = Mi R
-     *   M  = [D^dag R]^{-1}
-     *   X  = X + D M C
-     *   QS = Q - Z.M
-     *   D  = Q + D S^dag
-     *   C  = S C
-     */
-    Eigen::MatrixXcd m_DZ     = Eigen::MatrixXcd::Identity(nrhs,nrhs);
-    Eigen::MatrixXcd m_M      = Eigen::MatrixXcd::Identity(nrhs,nrhs);
-    Eigen::MatrixXcd m_zz     = Eigen::MatrixXcd::Zero(nrhs,nrhs);
-    Eigen::MatrixXcd m_rr     = Eigen::MatrixXcd::Zero(nrhs,nrhs);
-
-    Eigen::MatrixXcd m_C      = Eigen::MatrixXcd::Zero(nrhs,nrhs);
-    Eigen::MatrixXcd m_Cinv   = Eigen::MatrixXcd::Zero(nrhs,nrhs);
-    Eigen::MatrixXcd m_S      = Eigen::MatrixXcd::Zero(nrhs,nrhs);
-    Eigen::MatrixXcd m_Sinv   = Eigen::MatrixXcd::Zero(nrhs,nrhs);
-
-    Eigen::MatrixXcd m_tmp    = Eigen::MatrixXcd::Identity(nrhs,nrhs);
-    Eigen::MatrixXcd m_tmp1   = Eigen::MatrixXcd::Identity(nrhs,nrhs);
-
-    GridStopWatch HDCGTimer;
-
-    //////////////////////////
-    // x0 = Vstart -- possibly modify guess
-    //////////////////////////
-    Vstart(X,src);
-
-    //////////////////////////
-    // R = B-AX
-    //////////////////////////
-    for(int rhs=0;rhs<nrhs;rhs++){
-      // r0 = b -A x0
-      _FineLinop.HermOp(X[rhs],tmp[rhs]);
-      axpy (Z[rhs], -1.0,tmp[rhs], src[rhs]);    // Computes R=Z=src - A X0
-    }
-
-    //////////////////////////////////
-    // Compute MZ = M1 Z = M1 B - M1 A x0
-    //////////////////////////////////
-    PcgM1(Z,MZ);
-
-    //////////////////////////////////
-    // QC = Z
-    //////////////////////////////////
-    ThinQRfact (m_zz, m_C, m_Cinv, Q, MQ, Z, MZ);
-
-    //////////////////////////////////
-    // D=MQ
-    //////////////////////////////////
-    for(int b=0;b<nrhs;b++) D[b]=MQ[b]; // LLT rotation of the MZ basis of search dirs
-
-    std::cout << GridLogMessage<<"PrecBlockCGrQ vec computed initial residual and QR fact " <<std::endl;
-
-    ProjectTimer.Reset();
-    PromoteTimer.Reset();
-    DeflateTimer.Reset();
-    CoarseTimer.Reset();
-    SmoothTimer.Reset();
-    FineTimer.Reset();
-    InsertTimer.Reset();
-
-    GridStopWatch M1Timer;
-    GridStopWatch M2Timer;
-    GridStopWatch M3Timer;
-    GridStopWatch LinalgTimer;
-    GridStopWatch InnerProdTimer;
-
-    HDCGTimer.Start();
-
-    std::vector<RealD> rn(nrhs);
-    for (int k=0;k<=MaxIterations;k++){
-
-      ////////////////////
-      // Z  = AD
-      ////////////////////
-      M3Timer.Start();
-      for(int b=0;b<nrhs;b++) _FineLinop.HermOp(D[b], Z[b]);
-      M3Timer.Stop();
-
-      ////////////////////
-      // MZ = M1 Z <==== the Multigrid preconditioner
-      ////////////////////
-      M1Timer.Start();
-      PcgM1(Z,MZ);
-      M1Timer.Stop();
-
-      FineTimer.Start();
-      ////////////////////
-      // M  = [D^dag Z]^{-1} = (<Ddag MZ>_M)^{-1} inner prod, generalising Saad derivation of Precon CG
-      ////////////////////
-      InnerProdTimer.Start();
-      _BlockCGLinalg.InnerProductMatrix(m_DZ,D,Z);
-      InnerProdTimer.Stop();
-      m_M       = m_DZ.inverse();
-
-      ///////////////////////////
-      // X  = X + D MC
-      ///////////////////////////
-      m_tmp     = m_M * m_C;
-      LinalgTimer.Start();
-      _BlockCGLinalg.MaddMatrix(X,m_tmp, D,X);     // D are the search directions and X takes the updates
-      LinalgTimer.Stop();
-
-      ///////////////////////////
-      // QS = Q - M Z
-      // (MQ) S = MQ - M (M1Z)
-      ///////////////////////////
-      LinalgTimer.Start();
-      _BlockCGLinalg.MaddMatrix(tmp ,m_M, Z, Q,-1.0);
-      _BlockCGLinalg.MaddMatrix(Mtmp,m_M,MZ,MQ,-1.0);
-      ThinQRfact (m_zz, m_S, m_Sinv, Q, MQ, tmp, Mtmp);
-      LinalgTimer.Stop();
-
-      ////////////////////////////
-      // D  = MQ + D S^dag
-      ////////////////////////////
-      m_tmp = m_S.adjoint();
-      LinalgTimer.Start();
-      _BlockCGLinalg.MaddMatrix(D,m_tmp,D,MQ);
-      LinalgTimer.Stop();
-
-      ////////////////////////////
-      // C  = S C
-      ////////////////////////////
-      m_C = m_S*m_C;
-
-      ////////////////////////////
-      // convergence monitor
-      ////////////////////////////
-      m_rr = m_C.adjoint() * m_C;
-
-      FineTimer.Stop();
-
-      RealD max_resid=0;
-      RealD rrsum=0;
-      RealD sssum=0;
-      RealD rr;
-
-      for(int b=0;b<nrhs;b++) {
-	rrsum+=real(m_rr(b,b));
-	sssum+=ssq[b];
-	rr = real(m_rr(b,b))/ssq[b];
-	if ( rr > max_resid ) max_resid = rr;
-      }
-      std::cout << GridLogMessage <<
-	"\t Prec BlockCGrQ Iteration "<<k<<" ave resid "<< std::sqrt(rrsum/sssum) << " max "<< std::sqrt(max_resid) <<std::endl;
-
-      if ( max_resid < Tolerance*Tolerance ) {
-
-	HDCGTimer.Stop();
-	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
-	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Linalg  "<<LinalgTimer.Elapsed()<<std::endl;;
-	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : fine H  "<<M3Timer.Elapsed()<<std::endl;;
-	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : prec M1 "<<M1Timer.Elapsed()<<std::endl;;
-	std::cout<<GridLogMessage<<"**** M1 breakdown:"<<std::endl;
-	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Project "<<ProjectTimer.Elapsed()<<std::endl;;
-	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Promote "<<PromoteTimer.Elapsed()<<std::endl;;
-	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Deflate "<<DeflateTimer.Elapsed()<<std::endl;;
-	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Coarse  "<<CoarseTimer.Elapsed()<<std::endl;;
-	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Fine    "<<FineTimer.Elapsed()<<std::endl;;
-	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Smooth  "<<SmoothTimer.Elapsed()<<std::endl;;
-	std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Insert  "<<InsertTimer.Elapsed()<<std::endl;;
-
-	for(int rhs=0;rhs<nrhs;rhs++){
-
-	  _FineLinop.HermOp(X[rhs],tmp[rhs]);
-
-	  Field mytmp(grid);
-	  axpy(mytmp,-1.0,src[rhs],tmp[rhs]);
-
-	  RealD xnorm   = sqrt(norm2(X[rhs]));
-	  RealD srcnorm = sqrt(norm2(src[rhs]));
-	  RealD tmpnorm = sqrt(norm2(mytmp));
-	  RealD true_residual = tmpnorm/srcnorm;
-	  std::cout<<GridLogMessage
-		   <<"HDCG: true residual ["<<rhs<<"] is "<<true_residual
-		   <<" solution "<<xnorm
-		   <<" source "<<srcnorm
-		   <<std::endl;
-	}
-	return;
-      }
-
-    }
-    HDCGTimer.Stop();
-    std::cout<<GridLogMessage<<"HDCG: PrecBlockCGrQ not converged "<<HDCGTimer.Elapsed()<<std::endl;
-    assert(0);
-  }
-
-  virtual void SolveSingleSystem (std::vector<Field> &src, std::vector<Field> &x)
   {
     std::cout << GridLogMessage<<"HDCG: mrhs fPcg starting"<<std::endl;
     src[0].Grid()->Barrier();
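The removed ThinQRfact implements the Cholesky route to a thin QR factorisation spelled out in its comment block: form m_zz = Z^dag Z, Cholesky it as L L^dag, set C = L^dag and rotate Q = Z C^{-1}, so that Q^dag Q = 1 and Q C = Z. A dense Eigen-only sketch of the same identity; thinQRviaCholesky is a hypothetical name and Z stands in for the block of field vectors:

#include <Eigen/Dense>

void thinQRviaCholesky(const Eigen::MatrixXcd &Z,   // tall: Nferm x Nblock
                       Eigen::MatrixXcd &Q,
                       Eigen::MatrixXcd &C,
                       Eigen::MatrixXcd &Cinv)
{
  Eigen::MatrixXcd zz = Z.adjoint()*Z;        // m_zz = Z^dag Z
  zz = 0.5*(zz + zz.adjoint());               // force exact hermiticity
  Eigen::MatrixXcd L = zz.llt().matrixL();    // zz = L L^dag
  C    = L.adjoint();
  Cinv = C.inverse();
  Q    = Z * Cinv;                            // Q^dag Q = C^{-dag} zz C^{-1} = 1
}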
@@ -668,26 +361,15 @@ public:
     CoarseField PleftProjMrhs(this->coarsegridmrhs);
     CoarseField PleftMss_projMrhs(this->coarsegridmrhs);

-    //    this->rrr=in[0];
-
-#undef SMOOTHER_BLOCK_SOLVE
-#if SMOOTHER_BLOCK_SOLVE
-    this->SmoothTimer.Start();
-    this->_Smoother(in,Min);
-    this->SmoothTimer.Stop();
-#else
     for(int rhs=0;rhs<nrhs;rhs++) {

      this->SmoothTimer.Start();
      this->_Smoother(in[rhs],Min[rhs]);
      this->SmoothTimer.Stop();
-    }
-#endif
-    //    this->sss=Min[0];
-
-    for(int rhs=0;rhs<nrhs;rhs++) {

      this->FineTimer.Start();
      this->_FineLinop.HermOp(Min[rhs],out[rhs]);

      axpy(tmp[rhs],-1.0,out[rhs],in[rhs]);  // resid  = in - A Min
      this->FineTimer.Stop();

@@ -719,15 +401,13 @@ public:
     this->_Projector.blockPromote(tmp,PleftMss_proj);// tmp= Q[in - A Min]
     this->PromoteTimer.Stop();
     this->FineTimer.Start();
-    //    this->qqq=tmp[0];
     for(int rhs=0;rhs<nrhs;rhs++) {
       axpy(out[rhs],1.0,Min[rhs],tmp[rhs]); // Min+tmp
     }
-    //    this->zzz=out[0];
     this->FineTimer.Stop();
   }
 };

 NAMESPACE_END(Grid);
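The two hunks above leave the additive two-level structure of the preconditioner intact: smooth, form the fine residual, coarse-correct it, and add, giving out = Min + Q(in - A Min). A compact sketch of that combination with Eigen vectors and std::function stand-ins; twoLevelApply and the callables are illustrative, not Grid types:

#include <Eigen/Dense>
#include <functional>

using Vec = Eigen::VectorXd;
using Fn  = std::function<Vec(const Vec&)>;

Vec twoLevelApply(const Vec &in, const Fn &applyA,
                  const Fn &smooth, const Fn &coarseCorrect)
{
  Vec Min  = smooth(in);            // Min = Smoother(in)
  Vec res  = in - applyA(Min);      // resid = in - A Min
  Vec corr = coarseCorrect(res);    // Q [ in - A Min ] : project, coarse solve, promote
  return Min + corr;                // out = Min + tmp
}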
@@ -31,58 +31,6 @@ directory

 NAMESPACE_BEGIN(Grid);

-template<class Field>
-void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
-  typedef typename Field::scalar_type scomplex;
-  int Nblock = X.size();
-  for(int b=0;b<Nblock;b++){
-    for(int bp=0;bp<Nblock;bp++) {
-      m(b,bp) = innerProduct(X[b],Y[bp]);
-    }}
-}
-template<class Field>
-void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
-  // Should make this cache friendly with site outermost, parallel_for
-  // Deal with case AP aliases with either Y or X
-  //
-  //Could pack "X" and "AP" into a Nblock x Volume dense array.
-  // AP(Nrhs x vol) = Y(Nrhs x vol) + scale * m(nrhs x nrhs) * X(nrhs*vol)
-  typedef typename Field::scalar_type scomplex;
-  int Nblock = AP.size();
-  std::vector<Field> tmp(Nblock,X[0]);
-  for(int b=0;b<Nblock;b++){
-    tmp[b] = Y[b];
-    for(int bp=0;bp<Nblock;bp++) {
-      tmp[b] = tmp[b] +scomplex(scale*m(bp,b))*X[bp];
-    }
-  }
-  for(int b=0;b<Nblock;b++){
-    AP[b] = tmp[b];
-  }
-}
-template<class Field>
-void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
-  // Should make this cache friendly with site outermost, parallel_for
-  typedef typename Field::scalar_type scomplex;
-  int Nblock = AP.size();
-  for(int b=0;b<Nblock;b++){
-    AP[b] = Zero();
-    for(int bp=0;bp<Nblock;bp++) {
-      AP[b] += scomplex(m(bp,b))*X[bp];
-    }
-  }
-}
-template<class Field>
-double normv(const std::vector<Field> &P){
-  int Nblock = P.size();
-  double nn = 0.0;
-  for(int b=0;b<Nblock;b++) {
-    nn+=norm2(P[b]);
-  }
-  return nn;
-}

 enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec };

 //////////////////////////////////////////////////////////////////////////
@@ -139,19 +87,10 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
   sliceInnerProductMatrix(m_rr,R,R,Orthog);

   // Force manifest hermitian to avoid rounding related
-  /*
-  int rank=m_rr.rows();
-  for(int r=0;r<rank;r++){
-    for(int s=0;s<rank;s++){
-      std::cout << "QR m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
-    }}
-  */
   m_rr = 0.5*(m_rr+m_rr.adjoint());

   Eigen::MatrixXcd L    = m_rr.llt().matrixL();

-  //  ComplexD det = L.determinant();
-  //  std::cout << " Det m_rr "<<det<<std::endl;
   C    = L.adjoint();
   Cinv = C.inverse();
   ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -171,20 +110,11 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
		  const std::vector<Field> & R)
 {
   InnerProductMatrix(m_rr,R,R);
-  /*
-  int rank=m_rr.rows();
-  for(int r=0;r<rank;r++){
-    for(int s=0;s<rank;s++){
-      std::cout << "QRvec m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
-    }}
-  */
   m_rr = 0.5*(m_rr+m_rr.adjoint());

   Eigen::MatrixXcd L    = m_rr.llt().matrixL();

-  //  ComplexD det = L.determinant();
-  //  std::cout << " Det m_rr "<<det<<std::endl;
-
   C    = L.adjoint();
   Cinv = C.inverse();

@@ -256,7 +186,6 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
   sliceNorm(ssq,B,Orthog);
   RealD sssum=0;
   for(int b=0;b<Nblock;b++) sssum+=ssq[b];
-  for(int b=0;b<Nblock;b++) std::cout << "src["<<b<<"]" << ssq[b] <<std::endl;

   sliceNorm(residuals,B,Orthog);
   for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
@@ -292,9 +221,6 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
   Linop.HermOp(X, AD);
   tmp = B - AD;
-
-  sliceNorm(residuals,tmp,Orthog);
-  for(int b=0;b<Nblock;b++) std::cout << "res["<<b<<"]" << residuals[b] <<std::endl;

   ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
   D=Q;

@@ -310,8 +236,6 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
   GridStopWatch SolverTimer;
   SolverTimer.Start();

-  RealD max_resid=0;
-
   int k;
   for (k = 1; k <= MaxIterations; k++) {

@@ -356,7 +280,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
     */
     m_rr = m_C.adjoint() * m_C;

-    max_resid=0;
+    RealD max_resid=0;
     RealD rrsum=0;
     RealD rr;

@@ -398,9 +322,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
   }

   }
-  std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge "<<k<<" / "<<MaxIterations
-	    <<" residual "<< std::sqrt(max_resid)<< std::endl;
-
+  std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
   if (ErrorOnNoConverge) assert(0);
   IterationsToComplete = k;
@@ -544,6 +466,43 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
   IterationsToComplete = k;
 }

+void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
+  for(int b=0;b<Nblock;b++){
+    for(int bp=0;bp<Nblock;bp++) {
+      m(b,bp) = innerProduct(X[b],Y[bp]);
+    }}
+}
+void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
+  // Should make this cache friendly with site outermost, parallel_for
+  // Deal with case AP aliases with either Y or X
+  std::vector<Field> tmp(Nblock,X[0]);
+  for(int b=0;b<Nblock;b++){
+    tmp[b] = Y[b];
+    for(int bp=0;bp<Nblock;bp++) {
+      tmp[b] = tmp[b] + scomplex(scale*m(bp,b))*X[bp];
+    }
+  }
+  for(int b=0;b<Nblock;b++){
+    AP[b] = tmp[b];
+  }
+}
+void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
+  // Should make this cache friendly with site outermost, parallel_for
+  for(int b=0;b<Nblock;b++){
+    AP[b] = Zero();
+    for(int bp=0;bp<Nblock;bp++) {
+      AP[b] += scomplex(m(bp,b))*X[bp];
+    }
+  }
+}
+double normv(const std::vector<Field> &P){
+  double nn = 0.0;
+  for(int b=0;b<Nblock;b++) {
+    nn+=norm2(P[b]);
+  }
+  return nn;
+}

 ////////////////////////////////////////////////////////////////////////////
 // BlockCGrQvec implementation:
 //--------------------------
@@ -590,7 +549,6 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field

   RealD sssum=0;
   for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);}
-  for(int b=0;b<Nblock;b++){ std::cout << "ssq["<<b<<"] "<<ssq[b]<<std::endl;}
   for(int b=0;b<Nblock;b++) sssum+=ssq[b];

   for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);}
@@ -627,7 +585,6 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
   for(int b=0;b<Nblock;b++) {
     Linop.HermOp(X[b], AD[b]);
     tmp[b] = B[b] - AD[b];
-    std::cout << "r0["<<b<<"] "<<norm2(tmp[b])<<std::endl;
   }

   ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
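The member functions added in the @@ -544,6 +466,43 @@ hunk adopt the column convention m(bp,b): the coefficient matrix acts on the right of the block of vectors, so AP[b] = Y[b] + scale * sum_bp m(bp,b) X[bp] is simply AP = Y + scale * X * m with the block vectors as matrix columns. A small dense check of that convention, assuming Eigen; the names are illustrative:

#include <Eigen/Dense>
#include <complex>
#include <iostream>

int main()
{
  int Nblock=3, N=5;
  std::complex<double> scale(0.5,0.0);
  Eigen::MatrixXcd X = Eigen::MatrixXcd::Random(N,Nblock);  // columns = block vectors
  Eigen::MatrixXcd Y = Eigen::MatrixXcd::Random(N,Nblock);
  Eigen::MatrixXcd m = Eigen::MatrixXcd::Random(Nblock,Nblock);
  Eigen::MatrixXcd AP = Y + scale * X * m;  // AP[b] = Y[b] + scale*sum_bp m(bp,b)*X[bp]
  std::cout << "AP(0,0) = " << AP(0,0) << std::endl;
  return 0;
}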
@@ -38,13 +38,12 @@ NAMESPACE_BEGIN(Grid);
 // single input vec, single output vec.
 /////////////////////////////////////////////////////////////


 template <class Field>
 class ConjugateGradient : public OperatorFunction<Field> {
 public:

   using OperatorFunction<Field>::operator();

   bool ErrorOnNoConverge;  // throw an assert when the CG fails to converge.
                            // Defaults true.
   RealD Tolerance;
@@ -58,22 +57,10 @@ public:
       ErrorOnNoConverge(err_on_no_conv)
   {};

-  virtual void LogIteration(int k,RealD a,RealD b){
-    //    std::cout << "ConjugageGradient::LogIteration() "<<std::endl;
-  };
-  virtual void LogBegin(void){
-    std::cout << "ConjugageGradient::LogBegin() "<<std::endl;
-  };
-
-  void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
-
-    this->LogBegin();
-
-    GRID_TRACE("ConjugateGradient");
+  void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
+    GRID_TRACE("ConjugateGradient");

     GridStopWatch PreambleTimer;
-    GridStopWatch ConstructTimer;
-    GridStopWatch NormTimer;
-    GridStopWatch AssignTimer;
     PreambleTimer.Start();
     psi.Checkerboard() = src.Checkerboard();

@@ -83,19 +70,14 @@ public:
     //RealD b_pred;

     // Was doing copies
-    ConstructTimer.Start();
-    Field p  (src.Grid());
+    Field p(src.Grid());
     Field mmp(src.Grid());
-    Field r  (src.Grid());
-    ConstructTimer.Stop();
+    Field r(src.Grid());

     // Initial residual computation & set up
-    NormTimer.Start();
     ssq = norm2(src);
     RealD guess = norm2(psi);
-    NormTimer.Stop();
     assert(std::isnan(guess) == 0);
-    AssignTimer.Start();
     if ( guess == 0.0 ) {
       r = src;
       p = r;
@@ -107,7 +89,6 @@ public:
       a = norm2(p);
     }
     cp = a;
-    AssignTimer.Stop();

     // Handle trivial case of zero src
     if (ssq == 0.){
@@ -183,7 +164,6 @@ public:
       }
       LinearCombTimer.Stop();
       LinalgTimer.Stop();
-      LogIteration(k,a,b);

       IterationTimer.Stop();
       if ( (k % 500) == 0 ) {
@@ -240,9 +220,6 @@ public:
	      <<" residual "<< std::sqrt(cp / ssq)<< std::endl;
     SolverTimer.Stop();
     std::cout << GridLogMessage << "\tPreamble   " << PreambleTimer.Elapsed() <<std::endl;
-    std::cout << GridLogMessage << "\tConstruct  " << ConstructTimer.Elapsed() <<std::endl;
-    std::cout << GridLogMessage << "\tNorm       " << NormTimer.Elapsed() <<std::endl;
-    std::cout << GridLogMessage << "\tAssign     " << AssignTimer.Elapsed() <<std::endl;
     std::cout << GridLogMessage << "\tSolver     " << SolverTimer.Elapsed() <<std::endl;
     std::cout << GridLogMessage << "Solver breakdown "<<std::endl;
     std::cout << GridLogMessage << "\tMatrix     " << MatrixTimer.Elapsed() <<std::endl;
@@ -256,118 +233,5 @@ public:

   }
 };

-
-template <class Field>
-class ConjugateGradientPolynomial : public ConjugateGradient<Field> {
- public:
-  // Optionally record the CG polynomial
-  std::vector<double> ak;
-  std::vector<double> bk;
-  std::vector<double> poly_p;
-  std::vector<double> poly_r;
-  std::vector<double> poly_Ap;
-  std::vector<double> polynomial;
-
- public:
-  ConjugateGradientPolynomial(RealD tol, Integer maxit, bool err_on_no_conv = true)
-    : ConjugateGradient<Field>(tol,maxit,err_on_no_conv)
-  { };
-  void PolyHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
-  {
-    Field tmp(src.Grid());
-    Field AtoN(src.Grid());
-    AtoN = src;
-    psi=AtoN*polynomial[0];
-    for(int n=1;n<polynomial.size();n++){
-      tmp = AtoN;
-      Linop.HermOp(tmp,AtoN);
-      psi = psi + polynomial[n]*AtoN;
-    }
-  }
-  void CGsequenceHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &x)
-  {
-    Field Ap(src.Grid());
-    Field r(src.Grid());
-    Field p(src.Grid());
-    p=src;
-    r=src;
-    x=Zero();
-    x.Checkerboard()=src.Checkerboard();
-    for(int k=0;k<ak.size();k++){
-      x = x + ak[k]*p;
-      Linop.HermOp(p,Ap);
-      r = r - ak[k] * Ap;
-      p = r + bk[k] * p;
-    }
-  }
-  void Solve(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
-  {
-    psi=Zero();
-    this->operator ()(Linop,src,psi);
-  }
-  virtual void LogBegin(void)
-  {
-    std::cout << "ConjugageGradientPolynomial::LogBegin() "<<std::endl;
-    ak.resize(0);
-    bk.resize(0);
-    polynomial.resize(0);
-    poly_Ap.resize(0);
-    poly_Ap.resize(0);
-    poly_p.resize(1);
-    poly_r.resize(1);
-    poly_p[0]=1.0;
-    poly_r[0]=1.0;
-  };
-  virtual void LogIteration(int k,RealD a,RealD b)
-  {
-    // With zero guess,
-    // p = r = src
-    //
-    // iterate:
-    //   x =  x + a p
-    //   r =  r - a A p
-    //   p =  r + b p
-    //
-    // [0]
-    // r = x
-    // p = x
-    // Ap=0
-    //
-    // [1]
-    // Ap = A x + 0 ==> shift poly P right by 1 and add 0.
-    // x = x + a p ==> add polynomials term by term
-    // r = r - a A p ==> add polynomials term by term
-    // p = r + b p ==> add polynomials term by term
-    //
-    std::cout << "ConjugageGradientPolynomial::LogIteration() "<<k<<std::endl;
-    ak.push_back(a);
-    bk.push_back(b);
-    // Ap= right_shift(p)
-    poly_Ap.resize(k+1);
-    poly_Ap[0]=0.0;
-    for(int i=0;i<k;i++){
-      poly_Ap[i+1]=poly_p[i];
-    }
-
-    // x = x + a p
-    polynomial.resize(k);
-    polynomial[k-1]=0.0;
-    for(int i=0;i<k;i++){
-      polynomial[i] = polynomial[i] + a * poly_p[i];
-    }
-
-    // r = r - a Ap
-    // p = r + b p
-    poly_r.resize(k+1);
-    poly_p.resize(k+1);
-    poly_r[k] = poly_p[k] = 0.0;
-    for(int i=0;i<k+1;i++){
-      poly_r[i] = poly_r[i] - a * poly_Ap[i];
-      poly_p[i] = poly_r[i] + b * poly_p[i];
-    }
-  }
-};

 NAMESPACE_END(Grid);
 #endif
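The removed ConjugateGradientPolynomial tracks, per iteration, the coefficient vectors of the CG polynomials in powers of A: applying A shifts a coefficient vector up by one power, and the per-iteration (a,b) scalars combine the vectors exactly as the field recurrences do. A free-standing sketch of that bookkeeping; struct CGPoly is an illustrative stand-in:

#include <vector>

struct CGPoly {
  std::vector<double> p{1.0}, r{1.0}, x;        // x starts empty (x0 = 0)
  void iterate(double a, double b) {
    std::vector<double> Ap(p.size()+1, 0.0);
    for(size_t i=0;i<p.size();i++) Ap[i+1]=p[i]; // Ap = A p : shift by one power
    x.resize(p.size(), 0.0);
    for(size_t i=0;i<p.size();i++) x[i] += a*p[i]; // x = x + a p
    r.resize(Ap.size(), 0.0);
    p.resize(Ap.size(), 0.0);
    for(size_t i=0;i<Ap.size();i++){
      r[i] = r[i] - a*Ap[i];                     // r = r - a A p
      p[i] = r[i] + b*p[i];                      // p = r + b p
    }
  }
};

Feeding the recorded (ak,bk) sequence back through such recurrences is what lets CGsequenceHermOp replay the Krylov polynomial that the solver implicitly constructed.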
@@ -116,14 +116,14 @@ NAMESPACE_BEGIN(Grid);
       //Compute double precision rsd and also new RHS vector.
       Linop_d.HermOp(sol_d, tmp_d);
       RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
-      std::cout<<GridLogMessage<<" rsd norm "<<norm<<std::endl;
       std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;

       if(norm < OuterLoopNormMult * stop){
	 std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
	 break;
       }
-      while(norm * inner_tol * inner_tol < stop*1.01) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??
+      while(norm * inner_tol * inner_tol < stop) inner_tol *= 2;  // inner_tol = sqrt(stop/norm) ??

       PrecChangeTimer.Start();
       precisionChange(src_f, src_d, pc_wk_dp_to_sp);
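The inner_tol loop above relaxes the single-precision inner tolerance so that norm * inner_tol^2 just reaches the outer stopping target; the extra 1.01 factor on one side of the diff reads like a small safety pad (an inference, not documented in the diff itself). A scalar sketch of the schedule in isolation:

#include <cstdio>

int main()
{
  double stop = 1.0e-16;       // outer stopping target
  double norm = 1.0e-4;        // current outer residual
  double inner_tol = 1.0e-6;   // tolerance handed to the inner single-precision CG
  while (norm * inner_tol * inner_tol < stop * 1.01) inner_tol *= 2;
  std::printf("inner_tol relaxed to %g\n", inner_tol);
  return 0;
}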
@@ -102,11 +102,11 @@ public:
   assert(mass.size()==nshift);
   assert(mresidual.size()==nshift);

-  // remove dynamic sized arrays on stack; 2d is a pain with vector
-  std::vector<RealD> bs(nshift);
-  std::vector<RealD> rsq(nshift);
-  std::vector<std::array<RealD,2> > z(nshift);
-  std::vector<int> converged(nshift);
+  // dynamic sized arrays on stack; 2d is a pain with vector
+  RealD  bs[nshift];
+  RealD  rsq[nshift];
+  RealD  z[nshift][2];
+  int     converged[nshift];

   const int       primary =0;

@@ -123,11 +123,11 @@ public:
   assert(mresidual.size()==nshift);

   // dynamic sized arrays on stack; 2d is a pain with vector
-  std::vector<RealD> bs(nshift);
-  std::vector<RealD> rsq(nshift);
-  std::vector<RealD> rsqf(nshift);
-  std::vector<std::array<RealD,2> > z(nshift);
-  std::vector<int> converged(nshift);
+  RealD  bs[nshift];
+  RealD  rsq[nshift];
+  RealD  rsqf[nshift];
+  RealD  z[nshift][2];
+  int     converged[nshift];

   const int       primary =0;

@@ -156,11 +156,11 @@ public:
   assert(mresidual.size()==nshift);

   // dynamic sized arrays on stack; 2d is a pain with vector
-  std::vector<RealD> bs(nshift);
-  std::vector<RealD> rsq(nshift);
-  std::vector<RealD> rsqf(nshift);
-  std::vector<std::array<RealD,2> > z(nshift);
-  std::vector<int> converged(nshift);
+  RealD  bs[nshift];
+  RealD  rsq[nshift];
+  RealD  rsqf[nshift];
+  RealD  z[nshift][2];
+  int     converged[nshift];

   const int       primary =0;
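All three hunks make the same swap: the left-hand side replaces C-style variable-length arrays (a compiler extension, not standard C++) with std::vector, using std::array for the 2-d case. The pattern in isolation:

#include <array>
#include <vector>

void shifts(int nshift)
{
  // RealD bs[nshift];              // VLA: non-standard C++, stack-sized at runtime
  std::vector<double> bs(nshift);   // portable replacement
  // RealD z[nshift][2];            // 2-d VLA
  std::vector<std::array<double,2>> z(nshift);
  (void)bs; (void)z;
}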
@@ -143,7 +143,7 @@ public:
       ip = innerProduct(evec[j],w);
       if(if_print)
	 if( norm(ip)/norm2(w) > 1e-14)
	   Glog<<"orthogonalize before: "<<j<<" of "<<k<<" "<< ip <<std::endl;
       w = w - ip * evec[j];
       if(if_print) {
	 ip = innerProduct(evec[j],w);
@@ -279,16 +279,16 @@ public:
     Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
     diagonalize(eval2,lmd2,lme2,Nu,Nm,Nm,Qt,grid);
     _sort.push(eval2,Nm);
-    //    Glog << "#Ritz value before shift: "<< std::endl;
+    Glog << "#Ritz value before shift: "<< std::endl;
     for(int i=0; i<Nm; ++i){
-      //      std::cout.precision(13);
-      //      std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
-      //      std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
+      std::cout.precision(13);
+      std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
+      std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
     }

     //----------------------------------------------------------------------
     if ( Nm>Nk ) {
-      //      Glog <<" #Apply shifted QR transformations "<<std::endl;
+      Glog <<" #Apply shifted QR transformations "<<std::endl;
       //int k2 = Nk+Nu;
       int k2 = Nk;

@@ -326,11 +326,11 @@ public:
     Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
     diagonalize(eval2,lmd2,lme2,Nu,Nk,Nm,Qt,grid);
     _sort.push(eval2,Nk);
-    //    Glog << "#Ritz value after shift: "<< std::endl;
+    Glog << "#Ritz value after shift: "<< std::endl;
     for(int i=0; i<Nk; ++i){
       //  std::cout.precision(13);
       //  std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
       //  std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
     }
   }
   //----------------------------------------------------------------------
@@ -644,7 +644,7 @@ private:
     // for (int u=0; u<mrhs; ++u) Glog << " out["<<u<<"] = "<<norm2(out[u])<<std::endl;
     k_start +=mrhs;
   }
-  //  Glog << "LinAlg "<< std::endl;
+  Glog << "LinAlg "<< std::endl;

   if (b>0) {
     for (int u=0; u<Nu; ++u) {
@@ -678,7 +678,7 @@ private:
       }
       w_copy[u] = w[u];
     }
-    //    Glog << "LinAlg done"<< std::endl;
+    Glog << "LinAlg done"<< std::endl;

     // In block version, the steps 6 and 7 in Lanczos construction is
     // replaced by the QR decomposition of new basis block.
@@ -691,15 +691,15 @@ private:
     }

     // re-orthogonalization for numerical stability
-    //    Glog << "Gram Schmidt"<< std::endl;
+    Glog << "Gram Schmidt"<< std::endl;
     orthogonalize(w,Nu,evec,R);
     // QR part
     for (int u=1; u<Nu; ++u) {
       orthogonalize(w[u],w,u);
     }
-    //    Glog << "Gram Schmidt done "<< std::endl;
+    Glog << "Gram Schmidt done "<< std::endl;

-    //    Glog << "LinAlg "<< std::endl;
+    Glog << "LinAlg "<< std::endl;
     for (int u=0; u<Nu; ++u) {
       //for (int v=0; v<Nu; ++v) {
       for (int v=u; v<Nu; ++v) {
@@ -716,7 +716,7 @@ private:
	 // Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl;
       }
     }
-    //    Glog << "LinAlg done "<< std::endl;
+    Glog << "LinAlg done "<< std::endl;

     if (b < Nm/Nu-1) {
       for (int u=0; u<Nu; ++u) {
@@ -935,7 +935,7 @@ if (1){
	       int Nu, int Nb, int Nk, int Nm,
	       Eigen::MatrixXcd& M)
   {
-    //    Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n';
+    Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n';
     assert( Nk%Nu == 0 && Nm%Nu == 0 );
     assert( Nk <= Nm );
     M = Eigen::MatrixXcd::Zero(Nk,Nk);
@@ -953,7 +953,7 @@ if (1){
	 M(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu];
       }
     }
-    //    Glog << "unpackHermitBlockTriDiagMatToEigen() end" << std::endl;
+    Glog << "unpackHermitBlockTriDiagMatToEigen() end" << std::endl;
   }


@@ -963,7 +963,7 @@ if (1){
	       int Nu, int Nb, int Nk, int Nm,
	       Eigen::MatrixXcd& M)
   {
-    //    Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n';
+    Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n';
     assert( Nk%Nu == 0 && Nm%Nu == 0 );
     assert( Nk <= Nm );

@@ -979,7 +979,7 @@ if (1){
	 lme[u][k-Nu] = M(u+(k/Nu)*Nu,k-Nu);
       }
     }
-    //    Glog << "packHermitBlockTriDiagMatfromEigen() end" <<std::endl;
+    Glog << "packHermitBlockTriDiagMatfromEigen() end" <<std::endl;
   }


@@ -988,7 +988,7 @@ if (1){
	      RealD Dsh,
	      Eigen::MatrixXcd& Qprod)
   {
-    //    Glog << "shiftedQRDecompEigen() begin" << '\n';
+    Glog << "shiftedQRDecompEigen() begin" << '\n';
     Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm);
     Eigen::MatrixXcd R = Eigen::MatrixXcd::Zero(Nm,Nm);
     Eigen::MatrixXcd Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
@@ -1004,7 +1004,7 @@ if (1){
     // lower triangular part used to represent series
     // of Q sequence.

-    //    Glog << "shiftedQRDecompEigen() Housholder & QR" << '\n';
+    Glog << "shiftedQRDecompEigen() Housholder & QR" << '\n';
     // equivalent operation of Qprod *= Q
     //M = Eigen::MatrixXcd::Zero(Nm,Nm);

@@ -1025,7 +1025,7 @@ if (1){

     Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);

-    //    Glog << "shiftedQRDecompEigen() Mtmp create" << '\n';
+    Glog << "shiftedQRDecompEigen() Mtmp create" << '\n';
     for (int i=0; i<Nm; ++i) {
       for (int j=0; j<Nm-(Nu+1); ++j) {
	 for (int k=0; k<Nu+1+j; ++k) {
@@ -1033,7 +1033,7 @@ if (1){
	 }
       }
     }
-    //    Glog << "shiftedQRDecompEigen() Mtmp loop1" << '\n';
+    Glog << "shiftedQRDecompEigen() Mtmp loop1" << '\n';
     for (int i=0; i<Nm; ++i) {
       for (int j=Nm-(Nu+1); j<Nm; ++j) {
	 for (int k=0; k<Nm; ++k) {
@@ -1041,7 +1041,7 @@ if (1){
	 }
       }
     }
-    //    Glog << "shiftedQRDecompEigen() Mtmp loop2" << '\n';
+    Glog << "shiftedQRDecompEigen() Mtmp loop2" << '\n';

     //static int ntimes = 2;
     //for (int j=0; j<Nm-(ntimes*Nu); ++j) {
@@ -1067,13 +1067,13 @@ if (1){
	 Mtmp(j,i) = conj(Mtmp(i,j));
       }
     }
-    //    Glog << "shiftedQRDecompEigen() Mtmp loop3" << '\n';
+    Glog << "shiftedQRDecompEigen() Mtmp loop3" << '\n';

     for (int i=0; i<Nm; ++i) {
       Mtmp(i,i) = real(Mtmp(i,i)) + Dsh;
     }

-    //    Glog << "shiftedQRDecompEigen() Mtmp loop4" << '\n';
+    Glog << "shiftedQRDecompEigen() Mtmp loop4" << '\n';
     M = Mtmp;

     //M = Q.adjoint()*(M*Q);
@@ -1085,7 +1085,7 @@ if (1){
     //    }
     //}

-    //    Glog << "shiftedQRDecompEigen() end" <<std::endl;
+    Glog << "shiftedQRDecompEigen() end" <<std::endl;
   }

   void exampleQRDecompEigen(void)
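shiftedQRDecompEigen above accumulates the product of shifted-QR rotations while exploiting the block tri-diagonal band structure. For reference, one shifted QR step in its textbook dense form, using Eigen's HouseholderQR; shiftedQRStep is an illustrative stand-in, not the Grid routine:

#include <Eigen/Dense>

Eigen::MatrixXcd shiftedQRStep(const Eigen::MatrixXcd &M, double sigma)
{
  int n = M.rows();
  Eigen::MatrixXcd I = Eigen::MatrixXcd::Identity(n,n);
  Eigen::HouseholderQR<Eigen::MatrixXcd> qr(M - sigma*I);  // M - sigma I = Q R
  Eigen::MatrixXcd Q = qr.householderQ();
  Eigen::MatrixXcd R = Q.adjoint()*(M - sigma*I);          // R = Q^dag (M - sigma I)
  return R*Q + sigma*I;            // similar to M; the shift is added back
}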
@@ -60,32 +60,6 @@ public:
   }
 };

-template<class Field> class NormalResidual : public LinearFunction<Field>{
-private:
-  SparseMatrixBase<Field> & _Matrix;
-  OperatorFunction<Field> & _HermitianSolver;
-  LinearFunction<Field> & _Guess;
-public:
-
-  /////////////////////////////////////////////////////
-  // Wrap the usual normal equations trick
-  /////////////////////////////////////////////////////
-  NormalResidual(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver,
-		 LinearFunction<Field> &Guess)
-    :  _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {};
-
-  void operator() (const Field &in, Field &out){
-
-    Field res(in.Grid());
-    Field tmp(in.Grid());
-
-    MMdagLinearOperator<SparseMatrixBase<Field>,Field> MMdagOp(_Matrix);
-    _Guess(in,res);
-    _HermitianSolver(MMdagOp,in,res);  // M Mdag res = in ;
-    _Matrix.Mdag(res,out);             // out = Mdag res
-  }
-};
-
 template<class Field> class HPDSolver : public LinearFunction<Field> {
 private:
   LinearOperatorBase<Field> & _Matrix;
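The removed NormalResidual class wraps the standard normal-equations trick: solve the Hermitian positive-definite system M M^dag y = b with a Hermitian solver, then out = M^dag y satisfies M out = b. A dense Eigen sketch of the same identity, with an LDLT solve standing in for the HPD solver:

#include <Eigen/Dense>
#include <iostream>

int main()
{
  Eigen::MatrixXcd M = Eigen::MatrixXcd::Random(6,6);
  Eigen::VectorXcd b = Eigen::VectorXcd::Random(6);
  Eigen::MatrixXcd MMdag = M*M.adjoint();      // Hermitian positive definite
  Eigen::VectorXcd y = MMdag.ldlt().solve(b);  // M Mdag y = b  (the HPD solve)
  Eigen::VectorXcd x = M.adjoint()*y;          // out = Mdag y
  std::cout << "residual " << (M*x - b).norm() << std::endl;
  return 0;
}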
@@ -20,7 +20,7 @@ template<class Field> class PowerMethod
   RealD evalMaxApprox = 0.0;
   auto src_n = src;
   auto tmp = src;
-  const int _MAX_ITER_EST_ = 200;
+  const int _MAX_ITER_EST_ = 100;

   for (int i=0;i<_MAX_ITER_EST_;i++) {

@@ -30,17 +30,18 @@ template<class Field> class PowerMethod
     RealD vden = norm2(src_n);
     RealD na = vnum/vden;

-    std::cout << GridLogMessage << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
+    std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;

-    // if ( (fabs(evalMaxApprox/na - 1.0) < 0.0001) || (i==_MAX_ITER_EST_-1) ) {
-    //   evalMaxApprox = na;
-    //   return evalMaxApprox;
-    // }
+    if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) {
+      evalMaxApprox = na;
+      std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
+      return evalMaxApprox;
+    }
     evalMaxApprox = na;
     src_n = tmp;
   }
-  std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
-  return evalMaxApprox;
+  assert(0);
+  return 0;
   }
 };
 }
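For comparison with the PowerMethod loop above, a free-standing power iteration on a dense symmetric matrix. Here na is |A x|^2 / |x|^2, which tends to lambda_max^2 for symmetric A, so the square root is returned; this is a generic sketch, not the Grid implementation:

#include <Eigen/Dense>
#include <cmath>

double powerMethodRef(const Eigen::MatrixXd &A, Eigen::VectorXd x,
                      int maxit=100, double rtol=1.0e-3)
{
  double eval = 0.0;
  for(int i=0; i<maxit; i++){
    Eigen::VectorXd tmp = A*x;                      // tmp = A x
    double na = tmp.squaredNorm()/x.squaredNorm();  // -> lambda_max^2 for symmetric A
    if (i>0 && std::fabs(eval/na - 1.0) < rtol) return std::sqrt(na);
    eval = na;
    x = tmp/tmp.norm();                             // renormalise the iterate
  }
  return std::sqrt(eval);                           // best estimate if not converged
}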
@@ -1,76 +0,0 @@
-#pragma once
-
-namespace Grid {
-
-class Band
-{
-  RealD lo, hi;
-public:
-  Band(RealD _lo,RealD _hi)
-  {
-    lo=_lo;
-    hi=_hi;
-  }
-  RealD operator() (RealD x){
-    if ( x>lo && x<hi ){
-      return 1.0;
-    } else {
-      return 0.0;
-    }
-  }
-};
-
-class PowerSpectrum
-{
-public:
-
-  template<typename T> static RealD normalise(T& v)
-  {
-    RealD nn = norm2(v);
-    nn = sqrt(nn);
-    v = v * (1.0/nn);
-    return nn;
-  }
-
-  std::vector<RealD> ranges;
-  std::vector<int> order;
-
-  PowerSpectrum( std::vector<RealD> &bins, std::vector<int> &_order ) : ranges(bins), order(_order) { };
-
-  template<class Field>
-  RealD operator()(LinearOperatorBase<Field> &HermOp, const Field &src)
-  {
-    GridBase *grid = src.Grid();
-    int N=ranges.size();
-    RealD hi = ranges[N-1];
-
-    RealD lo_band = 0.0;
-    RealD hi_band;
-    RealD nn=norm2(src);
-    RealD ss=0.0;
-
-    Field tmp = src;
-
-    for(int b=0;b<N;b++){
-      hi_band = ranges[b];
-      Band Notch(lo_band,hi_band);
-
-      Chebyshev<Field> polynomial;
-      polynomial.Init(0.0,hi,order[b],Notch);
-      polynomial.JacksonSmooth();
-
-      polynomial(HermOp,src,tmp) ;
-
-      RealD p=norm2(tmp);
-      ss=ss+p;
-      std::cout << GridLogMessage << " PowerSpectrum Band["<<lo_band<<","<<hi_band<<"] power "<<norm2(tmp)/nn<<std::endl;
-
-      lo_band=hi_band;
-    }
-    std::cout << GridLogMessage << " PowerSpectrum total power "<<ss/nn<<std::endl;
-    std::cout << GridLogMessage << " PowerSpectrum total power (unnormalised) "<<nn<<std::endl;
-
-    return 0;
-  };
-};
-
-}
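For reference, the deleted header measured how the power of a source vector distributes across eigenvalue bands of a Hermitian operator: each Band is a top-hat window on (lo,hi), approximated in the operator by a Jackson-smoothed Chebyshev of the given order, and norm2 of the filtered vector gives the power in that band. A toy standalone sketch of the same binning logic, with explicit eigenvalues and amplitudes standing in for the operator and source (the Chebyshev approximation is deliberately replaced by the exact window):

```cpp
#include <cstdio>

// Same top-hat window as the deleted Band class: 1 inside (lo,hi), 0 outside.
struct Band {
  double lo, hi;
  double operator()(double x) const { return (x > lo && x < hi) ? 1.0 : 0.0; }
};

int main() {
  // Toy "spectrum": eigenvalues and the squared amplitude of the source on each.
  double eval[4] = {0.05, 0.7, 3.0, 40.0};
  double amp2[4] = {0.4, 0.3, 0.2, 0.1};
  double bins[4] = {0.1, 1.0, 10.0, 64.0};  // band upper edges, as in PowerSpectrum::ranges

  double lo = 0.0, total = 0.0;
  for (int b = 0; b < 4; b++) {
    Band notch{lo, bins[b]};
    double power = 0.0;                     // exact band power; Grid approximated the
    for (int k = 0; k < 4; k++)             // window with a Jackson-smoothed Chebyshev
      power += notch(eval[k]) * amp2[k];
    std::printf("Band[%g,%g] power %g\n", lo, bins[b], power);
    total += power;
    lo = bins[b];
  }
  std::printf("total power %g\n", total);   // 1.0 for a normalised source
}
```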
@@ -74,7 +74,7 @@ public:

   void operator() (const Field &src, Field &psi){

-    // psi=Zero();
+    psi=Zero();
     RealD cp, ssq,rsq;
     ssq=norm2(src);
     rsq=Tolerance*Tolerance*ssq;
@@ -30,8 +30,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 /* END LEGAL */
 #pragma once

-#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
-
 NAMESPACE_BEGIN(Grid);

 inline RealD AggregatePowerLaw(RealD x)
@@ -126,53 +124,6 @@ public:
   }
 }

-virtual void CreateSubspaceGCR(GridParallelRNG &RNG,LinearOperatorBase<FineField> &DiracOp,int nn=nbasis)
-{
-  RealD scale;
-
-  TrivialPrecon<FineField> simple_fine;
-  PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,12,12);
-  FineField noise(FineGrid);
-  FineField src(FineGrid);
-  FineField guess(FineGrid);
-  FineField Mn(FineGrid);
-
-  for(int b=0;b<nn;b++){
-
-    subspace[b] = Zero();
-    gaussian(RNG,noise);
-    scale = std::pow(norm2(noise),-0.5);
-    noise=noise*scale;
-
-    DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;
-
-    for(int i=0;i<3;i++){
-      //  void operator() (const Field &src, Field &psi){
-#if 1
-      std::cout << GridLogMessage << " inverting on noise "<<std::endl;
-      src = noise;
-      guess=Zero();
-      GCR(src,guess);
-      subspace[b] = guess;
-#else
-      std::cout << GridLogMessage << " inverting on zero "<<std::endl;
-      src=Zero();
-      guess = noise;
-      GCR(src,guess);
-      subspace[b] = guess;
-#endif
-      noise = subspace[b];
-      scale = std::pow(norm2(noise),-0.5);
-      noise=noise*scale;
-
-    }
-
-    DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<std::endl;
-    subspace[b] = noise;
-
-  }
-}
-
 ////////////////////////////////////////////////////////////////////////////////////////////////
 // World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
 // and this is the best I found
@@ -209,21 +160,14 @@ public:

     int b =0;
     {
-      ComplexD ip;
       // Filter
       Chebyshev<FineField> Cheb(lo,hi,orderfilter);
       Cheb(hermop,noise,Mn);
       // normalise
       scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
       subspace[b] = Mn;
       hermop.Op(Mn,tmp);
-      ip= innerProduct(Mn,tmp);
-      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
-
-      hermop.AdjOp(Mn,tmp);
-      ip = innerProduct(Mn,tmp);
-      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
+      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
       b++;
     }

@@ -269,18 +213,8 @@ public:
       Mn=*Tnp;
       scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
       subspace[b] = Mn;
-      ComplexD ip;
-
-      hermop.Op(Mn,tmp);
-      ip= innerProduct(Mn,tmp);
-      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
-
-      hermop.AdjOp(Mn,tmp);
-      ip = innerProduct(Mn,tmp);
-      std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
-
+      hermop.Op(Mn,tmp);
+      std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
       b++;
     }

@@ -294,70 +228,6 @@ public:
   }
   assert(b==nn);
 }

-
-virtual void CreateSubspacePolyCheby(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
-                                     int nn,
-                                     double hi,
-                                     double lo1,
-                                     int orderfilter,
-                                     double lo2,
-                                     int orderstep)
-{
-  RealD scale;
-
-  FineField noise(FineGrid);
-  FineField Mn(FineGrid);
-  FineField tmp(FineGrid);
-
-  // New normalised noise
-  gaussian(RNG,noise);
-  scale = std::pow(norm2(noise),-0.5);
-  noise=noise*scale;
-
-  std::cout << GridLogMessage<<" CreateSubspacePolyCheby "<<std::endl;
-  // Initial matrix element
-  hermop.Op(noise,Mn);
-  std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
-
-  int b =0;
-  {
-    // Filter
-    std::cout << GridLogMessage << "Cheby "<<lo1<<","<<hi<<" "<<orderstep<<std::endl;
-    Chebyshev<FineField> Cheb(lo1,hi,orderfilter);
-    Cheb(hermop,noise,Mn);
-    // normalise
-    scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
-    subspace[b] = Mn;
-    hermop.Op(Mn,tmp);
-    std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
-    std::cout<<GridLogMessage << "filt ["<<b<<"] <n|n> "<<norm2(Mn)<<std::endl;
-  }
-
-  // Generate a full sequence of Chebyshevs
-  for(int n=1;n<nn;n++){
-    std::cout << GridLogMessage << "Cheby "<<lo2<<","<<hi<<" "<<orderstep<<std::endl;
-    Chebyshev<FineField> Cheb(lo2,hi,orderstep);
-    Cheb(hermop,subspace[n-1],Mn);
-
-    for(int m=0;m<n;m++){
-      ComplexD c = innerProduct(subspace[m],Mn);
-      Mn = Mn - c*subspace[m];
-    }
-
-    // normalise
-    scale = std::pow(norm2(Mn),-0.5);
-    Mn=Mn*scale;
-
-    subspace[n]=Mn;
-
-    hermop.Op(Mn,tmp);
-    std::cout<<GridLogMessage << "filt ["<<n<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
-    std::cout<<GridLogMessage << "filt ["<<n<<"] <n|n> "<<norm2(Mn)<<std::endl;
-
-  }
-}
-
 virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
                                      int nn,
                                      double hi,
@@ -99,7 +99,7 @@ public:
   CoarseMatrix AselfInvEven;
   CoarseMatrix AselfInvOdd;

-  deviceVector<RealD> dag_factor;
+  Vector<RealD> dag_factor;

   ///////////////////////
   // Interface
@@ -124,13 +124,9 @@ public:
     int npoint = geom.npoint;
     typedef LatticeView<Cobj> Aview;

-    deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
-    hostVector<Aview>   hAcceleratorViewContainer(geom.npoint);
-
-    for(int p=0;p<geom.npoint;p++) {
-      hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
-      acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
-    }
+    Vector<Aview> AcceleratorViewContainer;
+    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
     Aview *Aview_p = & AcceleratorViewContainer[0];

     const int Nsimd = CComplex::Nsimd();
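The left-hand side of this hunk shows the pattern this comparison reverts: because each LatticeView is opened on the host, the view handles are first kept in a hostVector and then copied one by one into device-resident storage with acceleratorPut, so the kernel can dereference Aview_p directly. A toy standalone sketch of that staging idiom; acceleratorPut here is a memcpy stand-in for the Grid helper, and View stands for any trivially-copyable handle (all names illustrative):

```cpp
#include <vector>
#include <cstring>
#include <cstdio>

// Stand-in for Grid's acceleratorPut: on a real GPU build this is a small
// host-to-device copy of one POD object into device memory.
template<class T> void acceleratorPut(T &dev, const T &host) {
  std::memcpy(&dev, &host, sizeof(T));
}

struct View { const double *data; int n; };   // trivially-copyable kernel handle

int main() {
  std::vector<double> field(16, 1.0);
  std::vector<View> hViews(1), dViews(1);     // hostVector / deviceVector stand-ins
  hViews[0] = View{field.data(), (int)field.size()};  // "open" the view on the host
  acceleratorPut(dViews[0], hViews[0]);       // stage the handle to "device" memory
  std::printf("staged view of %d sites\n", dViews[0].n);
  // In Grid the views are later closed through the HOST copies
  // (hAcceleratorViewContainer[p].ViewClose()); the device copies are plain bytes.
}
```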
@@ -165,7 +161,7 @@ public:
       coalescedWrite(out_v[ss](b),res);
     });

-    for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
+    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
   };

   void Mdag (const CoarseVector &in, CoarseVector &out)
@@ -194,14 +190,9 @@ public:
     int npoint = geom.npoint;
     typedef LatticeView<Cobj> Aview;

-    deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
-    hostVector<Aview>   hAcceleratorViewContainer(geom.npoint);
-
-    for(int p=0;p<geom.npoint;p++) {
-      hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
-      acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
-    }
+    Vector<Aview> AcceleratorViewContainer;
+
+    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
     Aview *Aview_p = & AcceleratorViewContainer[0];

     const int Nsimd = CComplex::Nsimd();
@@ -210,10 +201,10 @@ public:

     int osites=Grid()->oSites();

-    deviceVector<int> points(geom.npoint);
-    for(int p=0; p<geom.npoint; p++) {
-      acceleratorPut(points[p],geom.points_dagger[p]);
-    }
+    Vector<int> points(geom.npoint, 0);
+    for(int p=0; p<geom.npoint; p++)
+      points[p] = geom.points_dagger[p];
     auto points_p = &points[0];

     RealD* dag_factor_p = &dag_factor[0];
@@ -245,7 +236,7 @@ public:
       coalescedWrite(out_v[ss](b),res);
     });

-    for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
+    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
   }

   void MdirComms(const CoarseVector &in)
@@ -260,14 +251,8 @@ public:
     out.Checkerboard() = in.Checkerboard();

     typedef LatticeView<Cobj> Aview;
-    deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
-    hostVector<Aview>   hAcceleratorViewContainer(geom.npoint);
-
-    for(int p=0;p<geom.npoint;p++) {
-      hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
-      acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
-    }
+    Vector<Aview> AcceleratorViewContainer;
+    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
     Aview *Aview_p = & AcceleratorViewContainer[0];

     autoView( out_v , out, AcceleratorWrite);
@@ -300,7 +285,7 @@ public:
       }
       coalescedWrite(out_v[ss](b),res);
     });
-    for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
+    for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
   }
   void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
   {
@@ -484,20 +469,14 @@ public:

     // determine in what order we need the points
     int npoint = geom.npoint-1;
-    deviceVector<int> points(npoint);
-    for(int p=0; p<npoint; p++) {
-      int val = (dag && !hermitian) ? geom.points_dagger[p] : p;
-      acceleratorPut(points[p], val);
-    }
+    Vector<int> points(npoint, 0);
+    for(int p=0; p<npoint; p++)
+      points[p] = (dag && !hermitian) ? geom.points_dagger[p] : p;
     auto points_p = &points[0];

-    deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
-    hostVector<Aview>   hAcceleratorViewContainer(geom.npoint);
-
-    for(int p=0;p<geom.npoint;p++) {
-      hAcceleratorViewContainer[p] = a[p].View(AcceleratorRead);
-      acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
-    }
+    Vector<Aview> AcceleratorViewContainer;
+    for(int p=0;p<npoint;p++) AcceleratorViewContainer.push_back(a[p].View(AcceleratorRead));
     Aview *Aview_p = & AcceleratorViewContainer[0];

     const int Nsimd = CComplex::Nsimd();
@@ -560,7 +539,7 @@ public:
     });
   }

-  for(int p=0;p<npoint;p++) hAcceleratorViewContainer[p].ViewClose();
+  for(int p=0;p<npoint;p++) AcceleratorViewContainer[p].ViewClose();
 }

 CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) :
@@ -611,13 +590,11 @@ public:
   }

   // GPU readable prefactor
-  std::vector<RealD> h_dag_factor(nbasis*nbasis);
   thread_for(i, nbasis*nbasis, {
     int j = i/nbasis;
     int k = i%nbasis;
-    h_dag_factor[i] = dag_factor_eigen(j, k);
+    dag_factor[i] = dag_factor_eigen(j, k);
   });
-  acceleratorCopyToDevice(&h_dag_factor[0],&dag_factor[0],dag_factor.size()*sizeof(RealD));
 }

 void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
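Same theme for the dag_factor table: the left-hand side fills a plain std::vector on the host (in parallel, via thread_for) and then moves it to the device in one bulk acceleratorCopyToDevice, instead of writing element-wise through unified memory. A standalone sketch of that fill-then-copy idiom, with a memcpy standing in for the accelerator helper and a placeholder in place of dag_factor_eigen:

```cpp
#include <vector>
#include <cstring>
#include <cstdio>

// Stand-in for Grid's acceleratorCopyToDevice: one bulk H2D transfer.
void acceleratorCopyToDevice(const void *from, void *to, size_t bytes) {
  std::memcpy(to, from, bytes);
}

int main() {
  const int nbasis = 4;
  std::vector<double> h_table(nbasis*nbasis);  // host scratch (h_dag_factor above)
  for (int i = 0; i < nbasis*nbasis; i++) {    // thread_for in Grid
    int j = i / nbasis, k = i % nbasis;
    h_table[i] = (j <= k) ? 1.0 : -1.0;        // placeholder for dag_factor_eigen(j,k)
  }
  std::vector<double> d_table(nbasis*nbasis);  // deviceVector stand-in
  acceleratorCopyToDevice(&h_table[0], &d_table[0],
                          h_table.size()*sizeof(double));
  std::printf("copied %zu bytes in one transfer\n", h_table.size()*sizeof(double));
}
```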
@@ -69,7 +69,7 @@ public:
   }

   // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
-  void construct(pointer __p, const _Tp& __val) { };
+  void construct(pointer __p, const _Tp& __val) { assert(0);};
   void construct(pointer __p) { };
   void destroy(pointer __p) { };
 };
@@ -174,10 +174,19 @@ template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const d
 ////////////////////////////////////////////////////////////////////////////////
 // Template typedefs
 ////////////////////////////////////////////////////////////////////////////////
-template<class T> using hostVector   = std::vector<T,alignedAllocator<T> >; // Needs autoview
-template<class T> using Vector       = std::vector<T,uvmAllocator<T> >;     // Really want to deprecate
-template<class T> using uvmVector    = std::vector<T,uvmAllocator<T> >;     // auto migrating page
-template<class T> using deviceVector = std::vector<T,devAllocator<T> >;     // device vector
+#ifdef ACCELERATOR_CSHIFT
+// Cshift on device
+template<class T> using cshiftAllocator = devAllocator<T>;
+#else
+// Cshift on host
+template<class T> using cshiftAllocator = std::allocator<T>;
+#endif

+template<class T> using Vector        = std::vector<T,uvmAllocator<T> >;
+template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;
+template<class T> using commVector    = std::vector<T,devAllocator<T> >;
+template<class T> using deviceVector  = std::vector<T,devAllocator<T> >;
+template<class T> using cshiftVector  = std::vector<T,cshiftAllocator<T> >;
+
 /*
 template<class T> class vecView
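The two sides of this hunk are two allocation policies expressed purely through the allocator bound to std::vector: the left keeps distinct aliases for host-pinned (hostVector), unified-memory (Vector/uvmVector) and device-only (deviceVector) storage, while the right derives cshiftVector from a compile-time ACCELERATOR_CSHIFT switch. A minimal standalone illustration of the mechanism, with a logging allocator standing in for devAllocator (names illustrative):

```cpp
#include <vector>
#include <memory>
#include <cstdio>

// A custom allocator standing in for devAllocator; a real one would call
// cudaMalloc/hipMalloc/sycl::malloc_device instead of the default heap.
template<class T>
struct LoggingAllocator : std::allocator<T> {
  T* allocate(size_t n) {
    std::printf("allocating %zu elements\n", n);
    return std::allocator<T>::allocate(n);
  }
};

template<class T> using hostVector   = std::vector<T>;                       // host-side
template<class T> using deviceVector = std::vector<T, LoggingAllocator<T>>;  // "device" side

int main() {
  hostVector<double>   h(8, 1.0);  // safe to touch on the host
  deviceVector<double> d(8);       // in Grid: device memory, host dereference invalid
  std::printf("h[0]=%g, d.size()=%zu\n", h[0], d.size());
}
```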
@@ -188,9 +197,8 @@ template<class T> class vecView
   ViewMode mode;
   void * cpu_ptr;
 public:
-  // Rvalue accessor
   accelerator_inline T & operator[](size_t i) const { return this->data[i]; };
-  vecView(Vector<T> &refer_to_me,ViewMode _mode)
+  vecView(std::vector<T> &refer_to_me,ViewMode _mode)
   {
     cpu_ptr = &refer_to_me[0];
     size = refer_to_me.size();
@@ -206,12 +214,22 @@ template<class T> class vecView
   }
 };

-template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode)
+template<class T> vecView<T> VectorView(std::vector<T> &vec,ViewMode _mode)
 {
   vecView<T> ret(vec,_mode); // does the open
   return ret;                // must be closed
 }

+// Little autoscope assister
+template<class View>
+class VectorViewCloser
+{
+  View v;  // Take a copy of view and call view close when I go out of scope automatically
+ public:
+  VectorViewCloser(View &_v) : v(_v) {};
+  ~VectorViewCloser() { auto ptr = v.cpu_ptr; v.ViewClose(); MemoryManager::NotifyDeletion(ptr);}
+};
+
 #define autoVecView(v_v,v,mode)   \
   auto v_v = VectorView(v,mode);  \
   ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
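The added VectorViewCloser is a scope guard: it keeps a copy of the view and calls ViewClose when that copy goes out of scope, which is what the autoVecView macro relies on so callers cannot forget to close. The general shape of the idiom, as a standalone sketch:

```cpp
#include <cstdio>

struct DemoView {                // stands in for vecView<T>
  void ViewClose() { std::puts("view closed"); }
};

template<class View>
class ScopedViewCloser {         // same shape as the added VectorViewCloser
  View v;                        // keep a copy; close it on scope exit
public:
  ScopedViewCloser(View &_v) : v(_v) {}
  ~ScopedViewCloser() { v.ViewClose(); }
};

int main() {
  DemoView v;
  ScopedViewCloser<DemoView> guard(v); // closes automatically at '}'
  std::puts("using view");
}
```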
@@ -1,15 +1,16 @@
 #include <Grid/GridCore.h>
 #ifndef GRID_UVM

+#warning "Using explicit device memory copies"
 NAMESPACE_BEGIN(Grid);

 #define MAXLINE 512
 static char print_buffer [ MAXLINE ];

-#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl;
-#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer << std::endl;
+#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
+#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer;
 //#define dprintf(...)
-//#define mprintf(...)

 ////////////////////////////////////////////////////////////
 // For caching copies of data on device
@@ -110,7 +111,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
   ///////////////////////////////////////////////////////////
   assert(AccCache.state!=Empty);

-  dprintf("MemoryManager: Discard(%lx) %lx",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
+  dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
   assert(AccCache.accLock==0);
   assert(AccCache.cpuLock==0);
   assert(AccCache.CpuPtr!=(uint64_t)NULL);
@@ -120,7 +121,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
     DeviceBytes   -=AccCache.bytes;
     LRUremove(AccCache);
     AccCache.AccPtr=(uint64_t) NULL;
-    dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
+    dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
   }
   uint64_t CpuPtr = AccCache.CpuPtr;
   EntryErase(CpuPtr);
@@ -140,7 +141,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
   ///////////////////////////////////////////////////////////////////////////
   assert(AccCache.state!=Empty);

-  mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld",
+  mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld\n",
           (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
           (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock);
   if (AccCache.accLock!=0) return;
@@ -154,7 +155,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
     AccCache.AccPtr=(uint64_t)NULL;
     AccCache.state=CpuDirty; // CPU primary now
     DeviceBytes   -=AccCache.bytes;
-    dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld ",(uint64_t)AccCache.AccPtr,DeviceBytes);
+    dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
   }
   // uint64_t CpuPtr = AccCache.CpuPtr;
   DeviceEvictions++;
@@ -168,7 +169,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
   assert(AccCache.AccPtr!=(uint64_t)NULL);
   assert(AccCache.CpuPtr!=(uint64_t)NULL);
   acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
-  mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: acceleratorCopyFromDevice Flush AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
   DeviceToHostBytes+=AccCache.bytes;
   DeviceToHostXfer++;
   AccCache.state=Consistent;
@@ -183,9 +184,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
     AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
     DeviceBytes+=AccCache.bytes;
   }
-  mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx",
-          (uint64_t)AccCache.bytes,
-          (uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: acceleratorCopyToDevice Clone AccPtr %lx <- CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
   acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
   HostToDeviceBytes+=AccCache.bytes;
   HostToDeviceXfer++;
@@ -211,7 +210,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
 void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
 {
   if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
-    dprintf("AcceleratorViewClose %lx",(uint64_t)Ptr);
+    dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr);
     AcceleratorViewClose((uint64_t)Ptr);
   } else if( (mode==CpuRead)||(mode==CpuWrite)){
     CpuViewClose((uint64_t)Ptr);
@@ -223,7 +222,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
 {
   uint64_t CpuPtr = (uint64_t)_CpuPtr;
   if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
-    dprintf("AcceleratorViewOpen %lx",(uint64_t)CpuPtr);
+    dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr);
     return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
   } else if( (mode==CpuRead)||(mode==CpuWrite)){
     return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
@@ -234,9 +233,6 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
 }
 void MemoryManager::EvictVictims(uint64_t bytes)
 {
-  if(bytes>=DeviceMaxBytes) {
-    printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
-  }
   assert(bytes<DeviceMaxBytes);
   while(bytes+DeviceLRUBytes > DeviceMaxBytes){
     if ( DeviceLRUBytes > 0){
@@ -269,7 +265,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
   assert(AccCache.cpuLock==0);  // Programming error

   if(AccCache.state!=Empty) {
-    dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld",
+    dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n",
             (uint64_t)AccCache.CpuPtr,
             (uint64_t)CpuPtr,
             (uint64_t)AccCache.bytes,
@@ -309,7 +305,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
       AccCache.state  = Consistent; // Empty + AccRead => Consistent
     }
     AccCache.accLock= 1;
-    dprintf("Copied Empty entry into device accLock= %d",AccCache.accLock);
+    dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock);
   } else if(AccCache.state==CpuDirty ){
     if(mode==AcceleratorWriteDiscard) {
       CpuDiscard(AccCache);
@@ -322,21 +318,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
       AccCache.state  = Consistent; // CpuDirty + AccRead => Consistent
     }
     AccCache.accLock++;
-    dprintf("CpuDirty entry into device ++accLock= %d",AccCache.accLock);
+    dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock);
   } else if(AccCache.state==Consistent) {
     if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
       AccCache.state  = AccDirty;   // Consistent + AcceleratorWrite=> AccDirty
     else
       AccCache.state  = Consistent; // Consistent + AccRead => Consistent
     AccCache.accLock++;
-    dprintf("Consistent entry into device ++accLock= %d",AccCache.accLock);
+    dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock);
   } else if(AccCache.state==AccDirty) {
     if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
       AccCache.state  = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
     else
       AccCache.state  = AccDirty; // AccDirty + AccRead => AccDirty
     AccCache.accLock++;
-    dprintf("AccDirty entry ++accLock= %d",AccCache.accLock);
+    dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock);
   } else {
     assert(0);
   }
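The dprintf changes above all sit inside the cache's state machine, whose transition table is spelled out in the comments. The same logic, condensed into a standalone sketch so the transitions are easy to audit (allocation and copy side effects elided; this is a summary of the hunk's comments, not the Grid member function):

```cpp
#include <cstdio>

enum State { Empty, CpuDirty, Consistent, AccDirty };
enum Mode  { AcceleratorRead, AcceleratorWrite, AcceleratorWriteDiscard };

// Transition when a view is opened on the accelerator.
State accViewOpen(State s, Mode m) {
  bool writing = (m == AcceleratorWrite) || (m == AcceleratorWriteDiscard);
  switch (s) {
  case CpuDirty:   return writing ? AccDirty : Consistent; // CpuDirty + AccRead => Consistent
  case Consistent: return writing ? AccDirty : Consistent; // Consistent + AccRead => Consistent
  case AccDirty:   return AccDirty;                        // stays device-primary either way
  default:         return writing ? AccDirty : Consistent; // Empty + AccRead => Consistent
  }
}

int main() {
  std::printf("CpuDirty + AcceleratorRead -> %d (Consistent=%d)\n",
              accViewOpen(CpuDirty, AcceleratorRead), Consistent);
}
```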
@@ -345,7 +341,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
   // If view is opened on device must remove from LRU
   if(AccCache.LRU_valid==1){
     // must possibly remove from LRU as now locked on GPU
-    dprintf("AccCache entry removed from LRU ");
+    dprintf("AccCache entry removed from LRU \n");
     LRUremove(AccCache);
   }

@@ -368,10 +364,10 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
   AccCache.accLock--;
   // Move to LRU queue if not locked and close on device
   if(AccCache.accLock==0) {
-    dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
+    dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
     LRUinsert(AccCache);
   } else {
-    dprintf("AccleratorViewClose %lx AccLock decremented to %ld",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
+    dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
   }
 }
 void MemoryManager::CpuViewClose(uint64_t CpuPtr)
@@ -15,10 +15,10 @@ void check_huge_pages(void *Buf,uint64_t BYTES)
   uint64_t virt_pfn = (uint64_t)Buf / page_size;
   off_t offset = sizeof(uint64_t) * virt_pfn;
   uint64_t npages = (BYTES + page_size-1) / page_size;
-  std::vector<uint64_t> pagedata(npages);
+  uint64_t pagedata[npages];
   uint64_t ret = lseek(fd, offset, SEEK_SET);
   assert(ret == offset);
-  ret = ::read(fd, &pagedata[0], sizeof(uint64_t)*npages);
+  ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
   assert(ret == sizeof(uint64_t) * npages);
   int nhugepages = npages / 512;
   int n4ktotal, nnothuge;
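The left-hand side of this hunk is the safer form: `uint64_t pagedata[npages]` is a variable-length array, which is a compiler extension in C++ and overflows the stack for a large npages, while std::vector heap-allocates and keeps the same ::read call through &pagedata[0]. A compilable sketch of that form under the same pagemap-reading assumptions:

```cpp
#include <vector>
#include <cstdint>
#include <cassert>
#include <unistd.h>

// Read npages pagemap entries starting at 'offset' into a heap buffer.
void read_pagemap(int fd, off_t offset, uint64_t npages) {
  std::vector<uint64_t> pagedata(npages);            // heap, not stack (no VLA)
  assert(lseek(fd, offset, SEEK_SET) == offset);
  ssize_t ret = ::read(fd, &pagedata[0], sizeof(uint64_t)*npages);
  assert(ret == (ssize_t)(sizeof(uint64_t)*npages));
  // ... inspect pagedata[i] bits for huge-page backing ...
}
```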
@@ -82,7 +82,6 @@ public:
   bool _isCheckerBoarded;
   int        LocallyPeriodic;
   Coordinate _checker_dim_mask;
-  int        _checker_dim;

 public:

@@ -90,8 +89,9 @@ public:
   // Checkerboarding interface is virtual and overridden by
   // GridCartesian / GridRedBlackCartesian
   ////////////////////////////////////////////////////////////////
-  virtual int CheckerBoarded(int dim) =0;
+  virtual int CheckerBoarded(int dim)=0;
   virtual int CheckerBoard(const Coordinate &site)=0;
+  virtual int CheckerDim(void){ return 0; };
   virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
   virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
   virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0;
@@ -38,7 +38,7 @@ class GridCartesian: public GridBase {

 public:
   int dummy;
-  // Coordinate _checker_dim_mask;
+  Coordinate _checker_dim_mask;
   virtual int CheckerBoardFromOindexTable (int Oindex) {
     return 0;
   }
@@ -46,7 +46,7 @@ public:
   {
     return 0;
   }
-  virtual int CheckerBoarded(int dim) {
+  virtual int CheckerBoarded(int dim){
     return 0;
   }
   virtual int CheckerBoard(const Coordinate &site){
@@ -106,7 +106,6 @@ public:
   _rdimensions.resize(_ndimension);
   _simd_layout.resize(_ndimension);
   _checker_dim_mask.resize(_ndimension);;
-  _checker_dim = -1;
   _lstart.resize(_ndimension);
   _lend.resize(_ndimension);

|
@ -57,10 +57,10 @@ class GridRedBlackCartesian : public GridBase
|
|||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
// Coordinate _checker_dim_mask;
|
// Coordinate _checker_dim_mask;
|
||||||
// int _checker_dim;
|
int _checker_dim;
|
||||||
std::vector<int> _checker_board;
|
std::vector<int> _checker_board;
|
||||||
|
|
||||||
virtual int isCheckerBoarded(void) const { return 1; };
|
virtual int CheckerDim(void){ return _checker_dim; };
|
||||||
virtual int CheckerBoarded(int dim){
|
virtual int CheckerBoarded(int dim){
|
||||||
if( dim==_checker_dim) return 1;
|
if( dim==_checker_dim) return 1;
|
||||||
else return 0;
|
else return 0;
|
||||||
@@ -148,7 +148,7 @@ public:
   {
     Init(base->_fdimensions,base->_simd_layout,base->_processors,checker_dim_mask,checker_dim) ;
   }

   virtual ~GridRedBlackCartesian() = default;

   void Init(const Coordinate &dimensions,
@@ -57,29 +57,18 @@ int CartesianCommunicator::ProcessorCount(void)     { return
 // very VERY rarely (Log, serial RNG) we need world without a grid
 ////////////////////////////////////////////////////////////////////////////////

-#ifdef USE_GRID_REDUCTION
-void CartesianCommunicator::GlobalSum(ComplexF &c)
-{
-  GlobalSumP2P(c);
-}
-void CartesianCommunicator::GlobalSum(ComplexD &c)
-{
-  GlobalSumP2P(c);
-}
-#else
 void CartesianCommunicator::GlobalSum(ComplexF &c)
 {
   GlobalSumVector((float *)&c,2);
 }
-void CartesianCommunicator::GlobalSum(ComplexD &c)
-{
-  GlobalSumVector((double *)&c,2);
-}
-#endif
 void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
 {
   GlobalSumVector((float *)c,2*N);
 }
+void CartesianCommunicator::GlobalSum(ComplexD &c)
+{
+  GlobalSumVector((double *)&c,2);
+}
 void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
 {
   GlobalSumVector((double *)c,2*N);
@@ -33,8 +33,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 ///////////////////////////////////
 #include <Grid/communicator/SharedMemory.h>

-#define NVLINK_GET
-
 NAMESPACE_BEGIN(Grid);

 extern bool Stencil_force_mpi ;
@@ -129,35 +127,7 @@ public:
   void GlobalSumVector(ComplexD *c,int N);
   void GlobalXOR(uint32_t &);
   void GlobalXOR(uint64_t &);
-
-  template<class obj> void GlobalSumP2P(obj &o)
-  {
-    std::vector<obj> column;
-    obj accum = o;
-    int source,dest;
-    for(int d=0;d<_ndimension;d++){
-      column.resize(_processors[d]);
-      column[0] = accum;
-      std::vector<MpiCommsRequest_t> list;
-      for(int p=1;p<_processors[d];p++){
-        ShiftedRanks(d,p,source,dest);
-        SendToRecvFromBegin(list,
-                            &column[0],
-                            dest,
-                            &column[p],
-                            source,
-                            sizeof(obj),d*100+p);
-
-      }
-      CommsComplete(list);
-      for(int p=1;p<_processors[d];p++){
-        accum = accum + column[p];
-      }
-    }
-    Broadcast(0,accum);
-    o=accum;
-  }
-
   template<class obj> void GlobalSum(obj &o){
     typedef typename obj::scalar_type scalar_type;
     int words = sizeof(obj)/sizeof(scalar_type);
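The removed GlobalSumP2P builds an allreduce out of point-to-point primitives: along each processor-grid dimension every rank contributes its partial sum, rank 0 accumulates, and a final Broadcast distributes the result. A dimension-free sketch of the same gather-accumulate-broadcast idea in plain MPI (illustrative, not the Grid member function; a production allreduce would of course just call MPI_Allreduce):

```cpp
#include <mpi.h>

// Allreduce assembled from point-to-point calls plus one broadcast.
double globalSumP2P(double x, MPI_Comm comm) {
  int rank, size;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);
  if (rank == 0) {
    double partial;
    for (int p = 1; p < size; p++) {          // gather partial sums to rank 0
      MPI_Recv(&partial, 1, MPI_DOUBLE, p, 0, comm, MPI_STATUS_IGNORE);
      x += partial;
    }
  } else {
    MPI_Send(&x, 1, MPI_DOUBLE, 0, 0, comm);
  }
  MPI_Bcast(&x, 1, MPI_DOUBLE, 0, comm);      // everyone gets the total
  return x;
}
```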
@@ -168,8 +138,8 @@ public:
   ////////////////////////////////////////////////////////////
   // Face exchange, buffer swap in translational invariant way
   ////////////////////////////////////////////////////////////
-  void CommsComplete(std::vector<MpiCommsRequest_t> &list);
-  void SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
+  void CommsComplete(std::vector<CommsRequest_t> &list);
+  void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                            void *xmit,
                            int dest,
                            void *recv,
@@ -188,17 +158,6 @@ public:
                            int recv_from_rank,int do_recv,
                            int bytes,int dir);

-  double StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
-                                      void *xmit,
-                                      int xmit_to_rank,int do_xmit,
-                                      void *recv,
-                                      int recv_from_rank,int do_recv,
-                                      int xbytes,int rbytes,int dir);
-
-  // Could do a PollHtoD and have a CommsMerge dependence
-  void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);
-  void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
-
   double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                     void *xmit,
                                     int xmit_to_rank,int do_xmit,
@@ -30,7 +30,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

-
 Grid_MPI_Comm       CartesianCommunicator::communicator_world;

 ////////////////////////////////////////////
@@ -258,25 +257,6 @@ CartesianCommunicator::~CartesianCommunicator()
     }
   }
 }
-#ifdef USE_GRID_REDUCTION
-void CartesianCommunicator::GlobalSum(float &f){
-  CartesianCommunicator::GlobalSumP2P(f);
-}
-void CartesianCommunicator::GlobalSum(double &d)
-{
-  CartesianCommunicator::GlobalSumP2P(d);
-}
-#else
-void CartesianCommunicator::GlobalSum(float &f){
-  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-void CartesianCommunicator::GlobalSum(double &d)
-{
-  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
-  assert(ierr==0);
-}
-#endif
 void CartesianCommunicator::GlobalSum(uint32_t &u){
   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
   assert(ierr==0);
@@ -307,18 +287,27 @@ void CartesianCommunicator::GlobalMax(double &d)
   int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
   assert(ierr==0);
 }
+void CartesianCommunicator::GlobalSum(float &f){
+  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
+  assert(ierr==0);
+}
 void CartesianCommunicator::GlobalSumVector(float *f,int N)
 {
   int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
   assert(ierr==0);
 }
+void CartesianCommunicator::GlobalSum(double &d)
+{
+  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
+  assert(ierr==0);
+}
 void CartesianCommunicator::GlobalSumVector(double *d,int N)
 {
   int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
   assert(ierr==0);
 }

-void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                 void *xmit,
                                                 int dest,
                                                 void *recv,
@@ -343,7 +332,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &
   assert(ierr==0);
   list.push_back(xrq);
 }
-void CartesianCommunicator::CommsComplete(std::vector<MpiCommsRequest_t> &list)
+void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list)
 {
   int nreq=list.size();

@@ -362,7 +351,9 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
                                            int from,
                                            int bytes)
{
-  std::vector<MpiCommsRequest_t> reqs(0);
+  std::vector<CommsRequest_t> reqs(0);
+  unsigned long  xcrc = crc32(0L, Z_NULL, 0);
+  unsigned long  rcrc = crc32(0L, Z_NULL, 0);

   int myrank = _processor;
   int ierr;
@@ -378,6 +369,9 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
                     communicator,MPI_STATUS_IGNORE);
   assert(ierr==0);

+  //  xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
+  //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
+  //  printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
 }
 // Basic Halo comms primitive
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
@@ -387,25 +381,12 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
                                                      int bytes,int dir)
 {
   std::vector<CommsRequest_t> list;
-  double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
-  offbytes       += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
+  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
   StencilSendToRecvFromComplete(list,dir);
   return offbytes;
 }

+#undef NVLINK_GET // Define to use get instead of put DMA
-#ifdef ACCELERATOR_AWARE_MPI
-void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
-void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
-double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
-                                                           void *xmit,
-                                                           int dest,int dox,
-                                                           void *recv,
-                                                           int from,int dor,
-                                                           int xbytes,int rbytes,int dir)
-{
-  return 0.0; // Do nothing -- no preparation required
-}
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                          void *xmit,
                                                          int dest,int dox,
@@ -429,7 +410,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
   assert(gme == ShmRank);
   double off_node_bytes=0.0;
   int tag;

   if ( dor ) {
     if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
       tag= dir+from*32;
@@ -438,9 +419,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
       list.push_back(rrq);
       off_node_bytes+=rbytes;
     }
+#ifdef NVLINK_GET
+    void *shm = (void *) this->ShmBufferTranslate(from,xmit);
+    assert(shm!=NULL);
+    acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
+#endif
   }

   if (dox) {
+    //  rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
     if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
       tag= dir+_processor*32;
       ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
@@ -448,14 +435,17 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
       list.push_back(xrq);
       off_node_bytes+=xbytes;
     } else {
+#ifndef NVLINK_GET
       void *shm = (void *) this->ShmBufferTranslate(dest,recv);
       assert(shm!=NULL);
       acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
+#endif
+
     }
   }

   return off_node_bytes;
 }

 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
   int nreq=list.size();
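The NVLINK_GET additions flip the direction of the intra-node DMA: with put (the #ifndef branch) the sender copies its xmit buffer into the peer's shared-memory window, while with get (the #ifdef branch) the receiver pulls the peer's xmit buffer into its own recv buffer; in both cases ShmBufferTranslate maps a peer rank's pointer into the local address space. Schematically, as comments mirroring the two branches above:

```cpp
// Put-style: sender-driven copy into the peer's window (the #ifndef branch).
//   shm = ShmBufferTranslate(dest, recv);            // peer's recv buffer, mapped locally
//   acceleratorCopyDeviceToDeviceAsynch(xmit, shm, xbytes);
//
// Get-style: receiver-driven copy out of the peer's window (the #ifdef branch).
//   shm = ShmBufferTranslate(from, xmit);            // peer's xmit buffer, mapped locally
//   acceleratorCopyDeviceToDeviceAsynch(shm, recv, rbytes);
```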
@ -463,326 +453,12 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
|
|||||||
acceleratorCopySynchronise();
|
acceleratorCopySynchronise();
|
||||||
|
|
||||||
if (nreq==0) return;
|
if (nreq==0) return;
|
||||||
|
|
||||||
std::vector<MPI_Status> status(nreq);
|
std::vector<MPI_Status> status(nreq);
|
||||||
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
|
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
|
||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
list.resize(0);
|
list.resize(0);
|
||||||
this->StencilBarrier();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#else /* NOT ... ACCELERATOR_AWARE_MPI */
|
|
||||||
///////////////////////////////////////////
|
|
||||||
// Pipeline mode through host memory
|
|
||||||
///////////////////////////////////////////
|
|
||||||
/*
|
|
||||||
* In prepare (phase 1):
|
|
||||||
* PHASE 1: (prepare)
|
|
||||||
* - post MPI receive buffers asynch
|
|
||||||
* - post device - host send buffer transfer asynch
|
|
||||||
* PHASE 2: (Begin)
|
|
||||||
* - complete all copies
|
|
||||||
* - post MPI send asynch
|
|
||||||
* - post device - device transfers
|
|
||||||
* PHASE 3: (Complete)
|
|
||||||
* - MPI_waitall
|
|
||||||
* - host-device transfers
|
|
||||||
*
|
|
||||||
*********************************
|
|
||||||
* NB could split this further:
|
|
||||||
*--------------------------------
|
|
||||||
* PHASE 1: (Prepare)
|
|
||||||
* - post MPI receive buffers asynch
|
|
||||||
* - post device - host send buffer transfer asynch
|
|
||||||
* PHASE 2: (BeginInterNode)
|
|
||||||
* - complete all copies
|
|
||||||
* - post MPI send asynch
|
|
||||||
* PHASE 3: (BeginIntraNode)
|
|
||||||
* - post device - device transfers
|
|
||||||
* PHASE 4: (Complete)
|
|
||||||
* - MPI_waitall
|
|
||||||
* - host-device transfers asynch
|
|
||||||
* - (complete all copies)
|
|
||||||
*/
|
|
||||||
-double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
-                                                           void *xmit,
-                                                           int dest,int dox,
-                                                           void *recv,
-                                                           int from,int dor,
-                                                           int xbytes,int rbytes,int dir)
-{
-/*
- * Bring sequence from Stencil.h down to lower level.
- * Assume using XeLink is ok
- */
-  int ncomm  =communicator_halo.size();
-  int commdir=dir%ncomm;
-
-  MPI_Request xrq;
-  MPI_Request rrq;
-
-  int ierr;
-  int gdest = ShmRanks[dest];
-  int gfrom = ShmRanks[from];
-  int gme   = ShmRanks[_processor];
-
-  assert(dest != _processor);
-  assert(from != _processor);
-  assert(gme  == ShmRank);
-  double off_node_bytes=0.0;
-  int tag;
-
-  void * host_recv = NULL;
-  void * host_xmit = NULL;
-
-  /*
-   * PHASE 1: (Prepare)
-   * - post MPI receive buffers asynch
-   * - post device - host send buffer transfer asynch
-   */
-
-  if ( dor ) {
-    if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
-      tag= dir+from*32;
-      host_recv = this->HostBufferMalloc(rbytes);
-      ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
-      assert(ierr==0);
-      CommsRequest_t srq;
-      srq.PacketType = InterNodeRecv;
-      srq.bytes      = rbytes;
-      srq.req        = rrq;
-      srq.host_buf   = host_recv;
-      srq.device_buf = recv;
-      list.push_back(srq);
-      off_node_bytes+=rbytes;
-    }
-  }
-
-  if (dox) {
-    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
-
-      tag= dir+_processor*32;
-
-      host_xmit = this->HostBufferMalloc(xbytes);
-      CommsRequest_t srq;
-
-      srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
-
-      //      ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
-      //      assert(ierr==0);
-      //      off_node_bytes+=xbytes;
-
-      srq.PacketType = InterNodeXmit;
-      srq.bytes      = xbytes;
-      //      srq.req        = xrq;
-      srq.host_buf   = host_xmit;
-      srq.device_buf = xmit;
-      srq.tag        = tag;
-      srq.dest       = dest;
-      srq.commdir    = commdir;
-      list.push_back(srq);
-    }
-  }
-
-  return off_node_bytes;
-}
-/*
- * In the interest of better pipelining, poll for completion on each DtoH and
- * start MPI_ISend in the meantime
- */
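The poll-then-send idea stated in this comment is just MPI_Test in a loop. A minimal self-contained illustration (the overlap placeholder is hypothetical):

    #include <mpi.h>
    #include <cassert>
    // Poll a nonblocking request without blocking the caller; mirrors the
    // PollDtoH/PollIRecv pattern above. Grid interleaves copies and Isends
    // where the placeholder comment sits.
    void poll_until_done(MPI_Request &req)
    {
      int flag = 0;
      MPI_Status status;
      while (!flag) {
        int ierr = MPI_Test(&req, &flag, &status); // non-blocking completion check
        assert(ierr == MPI_SUCCESS);
        if (!flag) { /* overlap other work here: copies, other sends */ }
      }
    }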
-void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list)
-{
-  int pending = 0;
-  do {
-
-    pending = 0;
-
-    for(int idx = 0; idx<list.size();idx++){
-
-      if ( list[idx].PacketType==InterNodeRecv ) {
-
-        int flag = 0;
-        MPI_Status status;
-        int ierr = MPI_Test(&list[idx].req,&flag,&status);
-        assert(ierr==0);
-
-        if ( flag ) {
-          //  std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
-          acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
-          list[idx].PacketType=InterNodeReceiveHtoD;
-        } else {
-          pending ++;
-        }
-      }
-    }
-    //    std::cout << " PollIrecv "<<pending<<" pending requests"<<std::endl;
-  } while ( pending );
-
-}
-void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
-{
-  int pending = 0;
-  do {
-
-    pending = 0;
-
-    for(int idx = 0; idx<list.size();idx++){
-
-      if ( list[idx].PacketType==InterNodeXmit ) {
-
-        if ( acceleratorEventIsComplete(list[idx].ev) ) {
-
-          void *host_xmit = list[idx].host_buf;
-          uint32_t xbytes = list[idx].bytes;
-          int dest        = list[idx].dest;
-          int tag         = list[idx].tag;
-          int commdir     = list[idx].commdir;
-          ///////////////////
-          // Send packet
-          ///////////////////
-
-          //  std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
-
-          MPI_Request xrq;
-          int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
-          assert(ierr==0);
-
-          list[idx].req = xrq; // Update the MPI request in the list
-
-          list[idx].PacketType=InterNodeXmitISend;
-
-        } else {
-          // not done, so return to polling loop
-          pending++;
-        }
-      }
-    }
-  } while (pending);
-}
-
-double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-                                                         void *xmit,
-                                                         int dest,int dox,
-                                                         void *recv,
-                                                         int from,int dor,
-                                                         int xbytes,int rbytes,int dir)
-{
-  int ncomm  =communicator_halo.size();
-  int commdir=dir%ncomm;
-
-  MPI_Request xrq;
-  MPI_Request rrq;
-
-  int ierr;
-  int gdest = ShmRanks[dest];
-  int gfrom = ShmRanks[from];
-  int gme   = ShmRanks[_processor];
-
-  assert(dest != _processor);
-  assert(from != _processor);
-  assert(gme  == ShmRank);
-  double off_node_bytes=0.0;
-  int tag;
-
-  void * host_xmit = NULL;
-
-  ////////////////////////////////
-  // Receives already posted
-  // Copies already started
-  ////////////////////////////////
-  /*
-   * PHASE 2: (Begin)
-   * - complete all copies
-   * - post MPI send asynch
-   */
-#ifdef NVLINK_GET
-  if ( dor ) {
-
-    if ( ! ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) {
-      // Intranode
-      void *shm = (void *) this->ShmBufferTranslate(from,xmit);
-      assert(shm!=NULL);
-
-      CommsRequest_t srq;
-
-      srq.ev = acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
-
-      srq.PacketType = IntraNodeRecv;
-      srq.bytes      = xbytes;
-      //      srq.req        = xrq;
-      srq.host_buf   = NULL;
-      srq.device_buf = xmit;
-      srq.tag        = -1;
-      srq.dest       = dest;
-      srq.commdir    = dir;
-      list.push_back(srq);
-    }
-  }
-#else
-  if (dox) {
-
-    if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) {
-      // Intranode
-      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
-      assert(shm!=NULL);
-
-      CommsRequest_t srq;
-
-      srq.ev = acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
-
-      srq.PacketType = IntraNodeXmit;
-      srq.bytes      = xbytes;
-      //      srq.req        = xrq;
-      srq.host_buf   = NULL;
-      srq.device_buf = xmit;
-      srq.tag        = -1;
-      srq.dest       = dest;
-      srq.commdir    = dir;
-      list.push_back(srq);
-
-    }
-  }
-#endif
-  return off_node_bytes;
-}
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
-{
-  acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
-
-  std::vector<MPI_Status> status;
-  std::vector<MPI_Request> MpiRequests;
-
-  for(int r=0;r<list.size();r++){
-    // Must check each Send buf is clear to reuse
-    if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
-    //    if ( list[r].PacketType == InterNodeRecv ) MpiRequests.push_back(list[r].req); // Already "Test" passed
-  }
-
-  int nreq=MpiRequests.size();
-
-  if (nreq>0) {
-    status.resize(MpiRequests.size());
-    int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
-    assert(ierr==0);
-  }
-
-  //  for(int r=0;r<nreq;r++){
-  //    if ( list[r].PacketType==InterNodeRecv ) {
-  //      acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
-  //    }
-  //  }
-
-  list.resize(0);            // Delete the list
-  this->HostBufferFreeAll(); // Clean up the buffer allocs
-#ifndef NVLINK_GET
-  this->StencilBarrier(); // if PUT must check our nbrs have filled our receive buffers.
-#endif
-}
-#endif
-////////////////////////////////////////////
-// END PIPELINE MODE / NO CUDA AWARE MPI
-////////////////////////////////////////////

 void CartesianCommunicator::StencilBarrier(void)
 {
   MPI_Barrier  (ShmComm);

@@ -91,7 +91,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 {
   assert(0);
 }
-void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(list.size()==0);}
+void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);}
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                 void *xmit,
                                                 int dest,
@@ -132,17 +132,6 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
 {
   return 2.0*bytes;
 }
-void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
-void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
-double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
-                                                           void *xmit,
-                                                           int xmit_to_rank,int dox,
-                                                           void *recv,
-                                                           int recv_from_rank,int dor,
-                                                           int xbytes,int rbytes, int dir)
-{
-  return 0.0;
-}
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                          void *xmit,
                                                          int xmit_to_rank,int dox,
@@ -46,40 +46,8 @@ NAMESPACE_BEGIN(Grid);

 #if defined (GRID_COMMS_MPI3)
 typedef MPI_Comm    Grid_MPI_Comm;
-typedef MPI_Request MpiCommsRequest_t;
-#ifdef ACCELERATOR_AWARE_MPI
 typedef MPI_Request CommsRequest_t;
-#else
-/*
- * Enable state transitions as each packet flows.
- */
-enum PacketType_t {
-  FaceGather,
-  InterNodeXmit,
-  InterNodeRecv,
-  IntraNodeXmit,
-  IntraNodeRecv,
-  InterNodeXmitISend,
-  InterNodeReceiveHtoD
-};
-/*
- *Package arguments needed for various actions along packet flow
- */
-typedef struct {
-  PacketType_t PacketType;
-  void *host_buf;
-  void *device_buf;
-  int dest;
-  int tag;
-  int commdir;
-  unsigned long bytes;
-  acceleratorEvent_t ev;
-  MpiCommsRequest_t req;
-} CommsRequest_t;
-#endif
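Taken together, the enum and the polling functions define a small per-packet state machine. The transitions implied by the code above can be summarised as follows (descriptive comments only, not Grid code):

    // Packet lifecycle implied by Prepare/PollDtoH/PollIRecv/Complete:
    //   InterNodeXmit        -- DtoH copy event completes (PollDtoH) --> InterNodeXmitISend
    //   InterNodeXmitISend   -- MPI_Waitall in Complete              --> done
    //   InterNodeRecv        -- MPI_Test succeeds (PollIRecv)        --> InterNodeReceiveHtoD
    //   InterNodeReceiveHtoD -- acceleratorCopySynchronise           --> done
    //   IntraNodeXmit/Recv   -- device-to-device copy, then barrier  --> done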

 #else
-typedef int MpiCommsRequest_t;
 typedef int CommsRequest_t;
 typedef int Grid_MPI_Comm;
 #endif
@@ -42,11 +42,6 @@ Author: Christoph Lehner <christoph@lhnr.de>
 #ifdef ACCELERATOR_AWARE_MPI
 #define GRID_SYCL_LEVEL_ZERO_IPC
 #define SHM_SOCKETS
-#else
-#ifdef HAVE_NUMAIF_H
-#warning " Using NUMAIF "
-#include <numaif.h>
-#endif
 #endif
 #include <syscall.h>
 #endif
@@ -542,38 +537,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 // Each MPI rank should allocate our own buffer
 ///////////////////////////////////////////////////////////////////////////////////////////////////////////
 #ifndef ACCELERATOR_AWARE_MPI
-  printf("Host buffer allocate for GPU non-aware MPI\n");
+  HostCommBuf= malloc(bytes);
-#if 0
-  HostCommBuf= acceleratorAllocHost(bytes);
-#else
-  HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
-#ifdef HAVE_NUMAIF_H
-#warning "Moving host buffers to specific NUMA domain"
-  int numa;
-  char *numa_name=(char *)getenv("MPI_BUF_NUMA");
-  if(numa_name) {
-    unsigned long page_size = sysconf(_SC_PAGESIZE);
-    numa = atoi(numa_name);
-    unsigned long page_count = bytes/page_size;
-    std::vector<void *> pages(page_count);
-    std::vector<int>    nodes(page_count,numa);
-    std::vector<int>    status(page_count,-1);
-    for(unsigned long p=0;p<page_count;p++){
-      pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
-    }
-    int ret = move_pages(0,
-                         page_count,
-                         &pages[0],
-                         &nodes[0],
-                         &status[0],
-                         MPOL_MF_MOVE);
-    printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
-    if (ret) perror(" move_pages failed for reason:");
-  }
-#endif
-  acceleratorPin(HostCommBuf,bytes);
-#endif
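As a usage note on the block being removed: when built with HAVE_NUMAIF_H, it steered the freshly allocated host communication buffer onto a chosen NUMA domain at startup, selected by setting the MPI_BUF_NUMA environment variable to the desired node number before launch. The move_pages call is best-effort, so a nonzero return only prints a warning via perror rather than aborting.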

 #endif
 ShmCommBuf = acceleratorAllocDevice(bytes);
 if (ShmCommBuf == (void *)NULL ) {
@@ -605,8 +569,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #ifdef GRID_SYCL_LEVEL_ZERO_IPC
   typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;

-  auto zeDevice  = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
-  auto zeContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
+  auto zeDevice  = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
+  auto zeContext = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());

   ze_ipc_mem_handle_t ihandle;
   clone_mem_t handle;

@@ -51,6 +51,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #endif

 NAMESPACE_BEGIN(Grid);

 template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
 auto Cshift(const Expression &expr,int dim,int shift)  -> decltype(closure(expr))
 {
@@ -30,11 +30,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);

 extern std::vector<std::pair<int,int> > Cshift_table;
-extern deviceVector<std::pair<int,int> > Cshift_table_device;
+extern commVector<std::pair<int,int> > Cshift_table_device;

 inline std::pair<int,int> *MapCshiftTable(void)
 {
   // GPU version
+#ifdef ACCELERATOR_CSHIFT
   uint64_t sz=Cshift_table.size();
   if (Cshift_table_device.size()!=sz ) {
     Cshift_table_device.resize(sz);
@@ -44,13 +45,16 @@ inline std::pair<int,int> *MapCshiftTable(void)
                      sizeof(Cshift_table[0])*sz);

   return &Cshift_table_device[0];
+#else
+  return &Cshift_table[0];
+#endif
   // CPU version use identify map
 }
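MapCshiftTable follows a common pattern: a host-side table mirrored into device memory, refreshed when the host copy changes. A condensed sketch of the idea, using the deviceVector and acceleratorCopyToDevice names that appear in this diff (the refresh policy here is simplified; the real code may re-upload more eagerly):

    // Sketch: lazily mirror a host table into device memory for kernel use.
    std::vector<std::pair<int,int> >          host_table;   // built on the host
    static deviceVector<std::pair<int,int> >  device_table; // device-resident mirror

    std::pair<int,int> *map_table(void)
    {
      uint64_t sz = host_table.size();
      if (device_table.size() != sz) {     // first use, or table rebuilt
        device_table.resize(sz);
        acceleratorCopyToDevice(&host_table[0],&device_table[0],
                                sizeof(host_table[0])*sz);
      }
      return &device_table[0];             // pointer safe to capture in kernels
    }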
 ///////////////////////////////////////////////////////////////////
 // Gather for when there is no need to SIMD split
 ///////////////////////////////////////////////////////////////////
 template<class vobj> void
-Gather_plane_simple (const Lattice<vobj> &rhs,deviceVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
+Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
 {
   int rd = rhs.Grid()->_rdimensions[dimension];

@@ -90,10 +94,17 @@ Gather_plane_simple (const Lattice<vobj> &rhs,deviceVector<vobj> &buffer,int dim
   {
     auto buffer_p = & buffer[0];
     auto table = MapCshiftTable();
+#ifdef ACCELERATOR_CSHIFT
     autoView(rhs_v , rhs, AcceleratorRead);
     accelerator_for(i,ent,vobj::Nsimd(),{
       coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
     });
+#else
+    autoView(rhs_v , rhs, CpuRead);
+    thread_for(i,ent,{
+      buffer_p[table[i].first]=rhs_v[table[i].second];
+    });
+#endif
   }
 }

@@ -118,6 +129,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
   int n1=rhs.Grid()->_slice_stride[dimension];

   if ( cbmask ==0x3){
+#ifdef ACCELERATOR_CSHIFT
     autoView(rhs_v , rhs, AcceleratorRead);
     accelerator_for(nn,e1*e2,1,{
       int n = nn%e1;
@@ -128,10 +140,21 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
       vobj temp =rhs_v[so+o+b];
       extract<vobj>(temp,pointers,offset);
     });
+#else
+    autoView(rhs_v , rhs, CpuRead);
+    thread_for2d(n,e1,b,e2,{
+      int o      = n*n1;
+      int offset = b+n*e2;
+
+      vobj temp =rhs_v[so+o+b];
+      extract<vobj>(temp,pointers,offset);
+    });
+#endif
   } else {
     Coordinate rdim=rhs.Grid()->_rdimensions;
     Coordinate cdm =rhs.Grid()->_checker_dim_mask;
     std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
+#ifdef ACCELERATOR_CSHIFT
     autoView(rhs_v , rhs, AcceleratorRead);
     accelerator_for(nn,e1*e2,1,{
       int n = nn%e1;
@@ -152,13 +175,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
         extract<vobj>(temp,pointers,offset);
       }
     });
+#else
+    autoView(rhs_v , rhs, CpuRead);
+    thread_for2d(n,e1,b,e2,{
+
+      Coordinate coor;
+
+      int o=n*n1;
+      int oindex = o+b;
+
+      int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
+
+      int ocb=1<<cb;
+      int offset = b+n*e2;
+
+      if ( ocb & cbmask ) {
+        vobj temp =rhs_v[so+o+b];
+        extract<vobj>(temp,pointers,offset);
+      }
+    });
+#endif
   }
 }

 //////////////////////////////////////////////////////
 // Scatter for when there is no need to SIMD split
 //////////////////////////////////////////////////////
-template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<vobj> &buffer, int dimension,int plane,int cbmask)
+template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask)
 {
   int rd = rhs.Grid()->_rdimensions[dimension];

@@ -202,10 +245,17 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<
   {
     auto buffer_p = & buffer[0];
     auto table = MapCshiftTable();
+#ifdef ACCELERATOR_CSHIFT
     autoView( rhs_v, rhs, AcceleratorWrite);
     accelerator_for(i,ent,vobj::Nsimd(),{
       coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
     });
+#else
+    autoView( rhs_v, rhs, CpuWrite);
+    thread_for(i,ent,{
+      rhs_v[table[i].first]=buffer_p[table[i].second];
+    });
+#endif
   }
 }

@@ -228,6 +278,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
  if(cbmask ==0x3 ) {
    int _slice_stride = rhs.Grid()->_slice_stride[dimension];
    int _slice_block = rhs.Grid()->_slice_block[dimension];
+#ifdef ACCELERATOR_CSHIFT
    autoView( rhs_v , rhs, AcceleratorWrite);
    accelerator_for(nn,e1*e2,1,{
      int n = nn%e1;
@@ -236,6 +287,14 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
      int offset = b+n*_slice_block;
      merge(rhs_v[so+o+b],pointers,offset);
    });
+#else
+   autoView( rhs_v , rhs, CpuWrite);
+   thread_for2d(n,e1,b,e2,{
+     int o      = n*_slice_stride;
+     int offset = b+n*_slice_block;
+     merge(rhs_v[so+o+b],pointers,offset);
+   });
+#endif
  } else {

    // Case of SIMD split AND checker dim cannot currently be hit, except in
@@ -301,11 +360,19 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs

 {
   auto table = MapCshiftTable();
+#ifdef ACCELERATOR_CSHIFT
   autoView(rhs_v , rhs, AcceleratorRead);
   autoView(lhs_v , lhs, AcceleratorWrite);
   accelerator_for(i,ent,vobj::Nsimd(),{
     coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
   });
+#else
+  autoView(rhs_v , rhs, CpuRead);
+  autoView(lhs_v , lhs, CpuWrite);
+  thread_for(i,ent,{
+    lhs_v[table[i].first]=rhs_v[table[i].second];
+  });
+#endif
 }
 }

@@ -345,11 +412,19 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo

 {
   auto table = MapCshiftTable();
+#ifdef ACCELERATOR_CSHIFT
   autoView( rhs_v, rhs, AcceleratorRead);
   autoView( lhs_v, lhs, AcceleratorWrite);
   accelerator_for(i,ent,1,{
     permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
   });
+#else
+  autoView( rhs_v, rhs, CpuRead);
+  autoView( lhs_v, lhs, CpuWrite);
+  thread_for(i,ent,{
+    permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
+  });
+#endif
 }
 }

@@ -31,7 +31,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>


 NAMESPACE_BEGIN(Grid);
-const int Cshift_verbose=0;
 template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
 {
   typedef typename vobj::vector_type vector_type;
@@ -55,17 +55,17 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
   RealD t1,t0;
   t0=usecond();
   if ( !comm_dim ) {
-    //    std::cout << "CSHIFT: Cshift_local" <<std::endl;
+    //std::cout << "CSHIFT: Cshift_local" <<std::endl;
     Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
   } else if ( splice_dim ) {
-    //    std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
+    //std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
     Cshift_comms_simd(ret,rhs,dimension,shift);
   } else {
-    //    std::cout << "CSHIFT: Cshift_comms" <<std::endl;
+    //std::cout << "CSHIFT: Cshift_comms" <<std::endl;
     Cshift_comms(ret,rhs,dimension,shift);
   }
   t1=usecond();
-  if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
+  //  std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
   return ret;
 }

@@ -94,16 +94,18 @@ template<class vobj> void Cshift_comms_simd(Lattice<vob
   sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
   sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);

-  //  std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
+  //std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
   if ( sshift[0] == sshift[1] ) {
-    //    std::cout << "Single pass Cshift_comms" <<std::endl;
+    //std::cout << "Single pass Cshift_comms" <<std::endl;
     Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
   } else {
-    //    std::cout << "Two pass Cshift_comms" <<std::endl;
+    //std::cout << "Two pass Cshift_comms" <<std::endl;
     Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
     Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
   }
 }
+#define ACCELERATOR_CSHIFT_NO_COPY
+#ifdef ACCELERATOR_CSHIFT_NO_COPY
 template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
 {
   typedef typename vobj::vector_type vector_type;
@@ -123,13 +125,9 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
   assert(shift<fd);

   int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
-  static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
-  static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
-#ifndef ACCELERATOR_AWARE_MPI
-  static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
-  static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
-#endif
+  static cshiftVector<vobj> send_buf; send_buf.resize(buffer_size);
+  static cshiftVector<vobj> recv_buf; recv_buf.resize(buffer_size);

   int cb= (cbmask==0x2)? Odd : Even;
   int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
   RealD tcopy=0.0;
@@ -160,31 +158,18 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
       //      int rank           = grid->_processor;
       int recv_from_rank;
       int xmit_to_rank;

       grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);

       tcomms-=usecond();
-      grid->Barrier();
+      //      grid->Barrier();

-#ifdef ACCELERATOR_AWARE_MPI
       grid->SendToRecvFrom((void *)&send_buf[0],
                            xmit_to_rank,
                            (void *)&recv_buf[0],
                            recv_from_rank,
                            bytes);
-#else
-      // bouncy bouncy
-      acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
-      grid->SendToRecvFrom((void *)&hsend_buf[0],
-                           xmit_to_rank,
-                           (void *)&hrecv_buf[0],
-                           recv_from_rank,
-                           bytes);
-      acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
-#endif
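The "bouncy bouncy" branch being removed is the stock workaround for an MPI library that cannot dereference device pointers: stage both ends of the exchange through host shadow buffers. In isolation (buffer names are illustrative; the three calls are the ones visible above):

    // Host-staged exchange for non-GPU-aware MPI (illustrative sketch).
    // d_send/d_recv are device buffers; h_send/h_recv are host shadows.
    acceleratorCopyFromDevice(d_send, h_send, bytes);            // device -> host
    grid->SendToRecvFrom((void *)h_send, xmit_to_rank,
                         (void *)h_recv, recv_from_rank, bytes); // MPI on host memory
    acceleratorCopyToDevice(h_recv, d_recv, bytes);              // host -> device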

       xbytes+=bytes;
-      grid->Barrier();
+      //      grid->Barrier();
       tcomms+=usecond();

       tscatter-=usecond();
@@ -192,13 +177,13 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
       tscatter+=usecond();
     }
   }
-  if (Cshift_verbose){
+  /*
     std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl;
     std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl;
     std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
     std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl;
     std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
-  }
+  */
 }

 template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@@ -216,9 +201,9 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
   int simd_layout     = grid->_simd_layout[dimension];
   int comm_dim        = grid->_processors[dimension] >1 ;

-  //  std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
+  //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
   //          << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
   //          << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;

   assert(comm_dim==1);
   assert(simd_layout==2);
@@ -239,20 +224,16 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
   int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
   //  int words = sizeof(vobj)/sizeof(vector_type);

-  static std::vector<deviceVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
-  static std::vector<deviceVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
+  static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
+  static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
   scalar_object *  recv_buf_extract_mpi;
   scalar_object *  send_buf_extract_mpi;

   for(int s=0;s<Nsimd;s++){
     send_buf_extract[s].resize(buffer_size);
     recv_buf_extract[s].resize(buffer_size);
   }
-#ifndef ACCELERATOR_AWARE_MPI
-  hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
-  hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
-#endif

   int bytes = buffer_size*sizeof(scalar_object);

   ExtractPointerArray<scalar_object>  pointers(Nsimd); //
@@ -300,29 +281,18 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
        grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);

        tcomms-=usecond();
-       grid->Barrier();
+       //       grid->Barrier();

        send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
        recv_buf_extract_mpi = &recv_buf_extract[i][0];
-#ifdef ACCELERATOR_AWARE_MPI
        grid->SendToRecvFrom((void *)send_buf_extract_mpi,
                             xmit_to_rank,
                             (void *)recv_buf_extract_mpi,
                             recv_from_rank,
                             bytes);
-#else
-       // bouncy bouncy
-       acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
-       grid->SendToRecvFrom((void *)&hsend_buf[0],
-                            xmit_to_rank,
-                            (void *)&hrecv_buf[0],
-                            recv_from_rank,
-                            bytes);
-       acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
-#endif

        xbytes+=bytes;
-       grid->Barrier();
+       //       grid->Barrier();
        tcomms+=usecond();

        rpointers[i] = &recv_buf_extract[i][0];
@@ -335,15 +305,242 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
     Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
     tscatter+=usecond();
   }
-  if(Cshift_verbose){
+  /*
     std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl;
     std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl;
     std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
     std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl;
     std::cout << GridLogPerformance << " Cshift BW          "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
+  */
 }
+#else
+template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
+{
+  typedef typename vobj::vector_type vector_type;
+  typedef typename vobj::scalar_type scalar_type;
+
+  GridBase *grid=rhs.Grid();
+  Lattice<vobj> temp(rhs.Grid());
+
+  int fd              = rhs.Grid()->_fdimensions[dimension];
+  int rd              = rhs.Grid()->_rdimensions[dimension];
+  int pd              = rhs.Grid()->_processors[dimension];
+  int simd_layout     = rhs.Grid()->_simd_layout[dimension];
+  int comm_dim        = rhs.Grid()->_processors[dimension] >1 ;
+  assert(simd_layout==1);
+  assert(comm_dim==1);
+  assert(shift>=0);
+  assert(shift<fd);
+  RealD tcopy=0.0;
+  RealD tgather=0.0;
+  RealD tscatter=0.0;
+  RealD tcomms=0.0;
+  uint64_t xbytes=0;
+
+  int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
+  static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
+  static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
+  vobj *send_buf;
+  vobj *recv_buf;
+  {
+    grid->ShmBufferFreeAll();
+    size_t bytes = buffer_size*sizeof(vobj);
+    send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
+    recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
+  }
+
+  int cb= (cbmask==0x2)? Odd : Even;
+  int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
+
+  for(int x=0;x<rd;x++){
+
+    int sx        =  (x+sshift)%rd;
+    int comm_proc = ((x+sshift)/rd)%pd;
+
+    if (comm_proc==0) {
+
+      tcopy-=usecond();
+      Copy_plane(ret,rhs,dimension,x,sx,cbmask);
+      tcopy+=usecond();
+
+    } else {
+
+      int words = buffer_size;
+      if (cbmask != 0x3) words=words>>1;
+
+      int bytes = words * sizeof(vobj);
+
+      tgather-=usecond();
+      Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
+      tgather+=usecond();
+
+      //      int rank           = grid->_processor;
+      int recv_from_rank;
+      int xmit_to_rank;
+      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
+
+
+      tcomms-=usecond();
+      //      grid->Barrier();
+
+      acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
+      grid->SendToRecvFrom((void *)&send_buf[0],
+                           xmit_to_rank,
+                           (void *)&recv_buf[0],
+                           recv_from_rank,
+                           bytes);
+      xbytes+=bytes;
+      acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
+
+      //      grid->Barrier();
+      tcomms+=usecond();
+
+      tscatter-=usecond();
+      Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
+      tscatter+=usecond();
+    }
+  }
+  /*
+  std::cout << GridLogPerformance << " Cshift copy    "<<tcopy/1e3<<" ms"<<std::endl;
+  std::cout << GridLogPerformance << " Cshift gather  "<<tgather/1e3<<" ms"<<std::endl;
+  std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
+  std::cout << GridLogPerformance << " Cshift comm    "<<tcomms/1e3<<" ms"<<std::endl;
+  std::cout << GridLogPerformance << " Cshift BW      "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
+  */
+}
+
+template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
+{
+  GridBase *grid=rhs.Grid();
+  const int Nsimd = grid->Nsimd();
+  typedef typename vobj::vector_type vector_type;
+  typedef typename vobj::scalar_object scalar_object;
+  typedef typename vobj::scalar_type scalar_type;
+
+  int fd = grid->_fdimensions[dimension];
+  int rd = grid->_rdimensions[dimension];
+  int ld = grid->_ldimensions[dimension];
+  int pd = grid->_processors[dimension];
+  int simd_layout     = grid->_simd_layout[dimension];
+  int comm_dim        = grid->_processors[dimension] >1 ;
+
+  //std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
+  //          << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
+  //          << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
+
+  assert(comm_dim==1);
+  assert(simd_layout==2);
+  assert(shift>=0);
+  assert(shift<fd);
+  RealD tcopy=0.0;
+  RealD tgather=0.0;
+  RealD tscatter=0.0;
+  RealD tcomms=0.0;
+  uint64_t xbytes=0;
+
+  int permute_type=grid->PermuteType(dimension);
+
+  ///////////////////////////////////////////////
+  // Simd direction uses an extract/merge pair
+  ///////////////////////////////////////////////
+  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
+  //  int words = sizeof(vobj)/sizeof(vector_type);
+
+  static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
+  static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
+  scalar_object *  recv_buf_extract_mpi;
+  scalar_object *  send_buf_extract_mpi;
+  {
+    size_t bytes = sizeof(scalar_object)*buffer_size;
+    grid->ShmBufferFreeAll();
+    send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
+    recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
+  }
+  for(int s=0;s<Nsimd;s++){
+    send_buf_extract[s].resize(buffer_size);
+    recv_buf_extract[s].resize(buffer_size);
+  }
+
+  int bytes = buffer_size*sizeof(scalar_object);
+
+  ExtractPointerArray<scalar_object>  pointers(Nsimd); //
+  ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
+
+  ///////////////////////////////////////////
+  // Work out what to send where
+  ///////////////////////////////////////////
+  int cb    = (cbmask==0x2)? Odd : Even;
+  int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
+
+  //  loop over outer coord planes orthog to dim
+  for(int x=0;x<rd;x++){
+
+    // FIXME call local permute copy if none are offnode.
+    for(int i=0;i<Nsimd;i++){
+      pointers[i] = &send_buf_extract[i][0];
+    }
+    tgather-=usecond();
+    int sx   = (x+sshift)%rd;
+    Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
+    tgather+=usecond();
+
+    for(int i=0;i<Nsimd;i++){
+
+      int inner_bit = (Nsimd>>(permute_type+1));
+      int ic= (i&inner_bit)? 1:0;
+
+      int my_coor          = rd*ic + x;
+      int nbr_coor         = my_coor+sshift;
+      int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
+
+      int nbr_ic   = (nbr_coor%ld)/rd;    // inner coord of peer
+      int nbr_ox   = (nbr_coor%rd);       // outer coord of peer
+      int nbr_lane = (i&(~inner_bit));
+
+      int recv_from_rank;
+      int xmit_to_rank;
+
+      if (nbr_ic) nbr_lane|=inner_bit;
+
+      assert (sx == nbr_ox);
+
+      if(nbr_proc){
+        grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
+
+        tcomms-=usecond();
+        //      grid->Barrier();
+
+        acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
+        grid->SendToRecvFrom((void *)send_buf_extract_mpi,
+                             xmit_to_rank,
+                             (void *)recv_buf_extract_mpi,
+                             recv_from_rank,
+                             bytes);
+        acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
+        xbytes+=bytes;
+
+        //      grid->Barrier();
+        tcomms+=usecond();
+        rpointers[i] = &recv_buf_extract[i][0];
+      } else {
+        rpointers[i] = &send_buf_extract[nbr_lane][0];
+      }
+
+    }
+    tscatter-=usecond();
+    Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
+    tscatter+=usecond();
+
+  }
+  /*
+  std::cout << GridLogPerformance << " Cshift (s) copy    "<<tcopy/1e3<<" ms"<<std::endl;
+  std::cout << GridLogPerformance << " Cshift (s) gather  "<<tgather/1e3<<" ms"<<std::endl;
+  std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
+  std::cout << GridLogPerformance << " Cshift (s) comm    "<<tcomms/1e3<<" ms"<<std::endl;
+  std::cout << GridLogPerformance << " Cshift BW          "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
+  */
+}
+#endif
 NAMESPACE_END(Grid);

 #endif
@@ -1,5 +1,5 @@
 #include <Grid/GridCore.h>
 NAMESPACE_BEGIN(Grid);
 std::vector<std::pair<int,int> > Cshift_table;
-deviceVector<std::pair<int,int> > Cshift_table_device;
+commVector<std::pair<int,int> > Cshift_table_device;
 NAMESPACE_END(Grid);
@@ -257,30 +257,17 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
   });
 }

-#define FAST_AXPY_NORM
 template<class sobj,class vobj> inline
 RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
 {
   GRID_TRACE("axpy_norm");
-#ifdef FAST_AXPY_NORM
   return axpy_norm_fast(ret,a,x,y);
-#else
-  ret = a*x+y;
-  RealD nn=norm2(ret);
-  return nn;
-#endif
 }
 template<class sobj,class vobj> inline
 RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
 {
   GRID_TRACE("axpby_norm");
-#ifdef FAST_AXPY_NORM
   return axpby_norm_fast(ret,a,b,x,y);
-#else
-  ret = a*x+b*y;
-  RealD nn=norm2(ret);
-  return nn;
-#endif
 }
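The macro choice above trades a second sweep over memory for a fused kernel: the fast path accumulates the norm while writing the AXPY result, whereas the removed fallback wrote the result and then re-read it for norm2. A scalar analogue of the two strategies (illustrative only, not Grid code):

    #include <cstddef>
    #include <vector>
    // Two-pass version: write the AXPY result, then re-read it to reduce.
    double axpy_norm_two_pass(std::vector<double> &r, double a,
                              const std::vector<double> &x, const std::vector<double> &y)
    {
      for (std::size_t i=0;i<r.size();i++) r[i] = a*x[i] + y[i]; // pass 1: AXPY
      double nn = 0.0;
      for (std::size_t i=0;i<r.size();i++) nn += r[i]*r[i];      // pass 2: norm
      return nn;
    }
    // Fused version: accumulate the norm while the result is still at hand.
    double axpy_norm_fused(std::vector<double> &r, double a,
                           const std::vector<double> &x, const std::vector<double> &y)
    {
      double nn = 0.0;
      for (std::size_t i=0;i<r.size();i++) { r[i] = a*x[i] + y[i]; nn += r[i]*r[i]; }
      return nn;
    }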
|
|
||||||
/// Trace product
|
/// Trace product
|
||||||
|
@ -236,20 +236,17 @@ public:
|
|||||||
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
|
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
|
||||||
vobj vtmp;
|
vobj vtmp;
|
||||||
vtmp = r;
|
vtmp = r;
|
||||||
#if 0
|
#if 1
|
||||||
deviceVector<vobj> vvtmp(1);
|
|
||||||
acceleratorPut(vvtmp[0],vtmp);
|
|
||||||
vobj *vvtmp_p = & vvtmp[0];
|
|
||||||
auto me = View(AcceleratorWrite);
|
|
||||||
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
|
||||||
auto stmp=coalescedRead(*vvtmp_p);
|
|
||||||
coalescedWrite(me[ss],stmp);
|
|
||||||
});
|
|
||||||
#else
|
|
||||||
auto me = View(CpuWrite);
|
auto me = View(CpuWrite);
|
||||||
thread_for(ss,me.size(),{
|
thread_for(ss,me.size(),{
|
||||||
me[ss]= r;
|
me[ss]= r;
|
||||||
});
|
});
|
||||||
|
#else
|
||||||
|
auto me = View(AcceleratorWrite);
|
||||||
|
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
||||||
|
auto stmp=coalescedRead(vtmp);
|
||||||
|
coalescedWrite(me[ss],stmp);
|
||||||
|
});
|
||||||
#endif
|
#endif
|
||||||
me.ViewClose();
|
me.ViewClose();
|
||||||
return *this;
|
return *this;
|
||||||
|
@ -53,19 +53,36 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
|
|||||||
typedef decltype(basis[0]) Field;
|
typedef decltype(basis[0]) Field;
|
||||||
typedef decltype(basis[0].View(AcceleratorRead)) View;
|
typedef decltype(basis[0].View(AcceleratorRead)) View;
|
||||||
|
|
||||||
hostVector<View> h_basis_v(basis.size());
|
Vector<View> basis_v; basis_v.reserve(basis.size());
|
||||||
deviceVector<View> d_basis_v(basis.size());
|
typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
|
||||||
typedef typename std::remove_reference<decltype(h_basis_v[0][0])>::type vobj;
|
|
||||||
typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
|
typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
|
||||||
|
|
||||||
GridBase* grid = basis[0].Grid();
|
GridBase* grid = basis[0].Grid();
|
||||||
|
|
||||||
for(int k=0;k<basis.size();k++){
|
for(int k=0;k<basis.size();k++){
|
||||||
h_basis_v[k] = basis[k].View(AcceleratorWrite);
|
basis_v.push_back(basis[k].View(AcceleratorWrite));
|
||||||
acceleratorPut(d_basis_v[k],h_basis_v[k]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
View *basis_vp = &d_basis_v[0];
|
#if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) )
|
||||||
|
int max_threads = thread_max();
|
||||||
|
Vector < vobj > Bt(Nm * max_threads);
|
||||||
|
thread_region
|
||||||
|
{
|
||||||
|
vobj* B = &Bt[Nm * thread_num()];
|
||||||
|
thread_for_in_region(ss, grid->oSites(),{
|
||||||
|
for(int j=j0; j<j1; ++j) B[j]=0.;
|
||||||
|
|
||||||
|
for(int j=j0; j<j1; ++j){
|
||||||
|
for(int k=k0; k<k1; ++k){
|
||||||
|
B[j] +=Qt(j,k) * basis_v[k][ss];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for(int j=j0; j<j1; ++j){
|
||||||
|
basis_v[j][ss] = B[j];
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
View *basis_vp = &basis_v[0];
|
||||||
|
|
||||||
int nrot = j1-j0;
|
int nrot = j1-j0;
|
||||||
if (!nrot) // edge case not handled gracefully by Cuda
|
if (!nrot) // edge case not handled gracefully by Cuda
|
||||||
@ -74,19 +91,17 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
|
|||||||
uint64_t oSites =grid->oSites();
|
uint64_t oSites =grid->oSites();
|
||||||
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
|
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
|
||||||
|
|
||||||
deviceVector <vobj> Bt(siteBlock * nrot);
|
Vector <vobj> Bt(siteBlock * nrot);
|
||||||
auto Bp=&Bt[0];
|
auto Bp=&Bt[0];
|
||||||
|
|
||||||
// GPU readable copy of matrix
|
// GPU readable copy of matrix
|
||||||
hostVector<Coeff_t> h_Qt_jv(Nm*Nm);
|
Vector<Coeff_t> Qt_jv(Nm*Nm);
|
||||||
deviceVector<Coeff_t> Qt_jv(Nm*Nm);
|
|
||||||
Coeff_t *Qt_p = & Qt_jv[0];
|
Coeff_t *Qt_p = & Qt_jv[0];
|
||||||
thread_for(i,Nm*Nm,{
|
thread_for(i,Nm*Nm,{
|
||||||
int j = i/Nm;
|
int j = i/Nm;
|
||||||
int k = i%Nm;
|
int k = i%Nm;
|
||||||
h_Qt_jv[i]=Qt(j,k);
|
Qt_p[i]=Qt(j,k);
|
||||||
});
|
});
|
||||||
acceleratorCopyToDevice(&h_Qt_jv[0],Qt_p,Nm*Nm*sizeof(Coeff_t));
|
|
||||||
|
|
||||||
// Block the loop to keep storage footprint down
|
// Block the loop to keep storage footprint down
|
||||||
for(uint64_t s=0;s<oSites;s+=siteBlock){
|
for(uint64_t s=0;s<oSites;s+=siteBlock){
|
||||||
@ -122,8 +137,9 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
|
|||||||
coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
|
coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
|
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Extract a single rotated vector
|
// Extract a single rotated vector
|
||||||
@@ -136,19 +152,16 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
 
 result.Checkerboard() = basis[0].Checkerboard();
 
-hostVector<View> h_basis_v(basis.size());
-deviceVector<View> d_basis_v(basis.size());
+Vector<View> basis_v; basis_v.reserve(basis.size());
 for(int k=0;k<basis.size();k++){
-h_basis_v[k]=basis[k].View(AcceleratorRead);
-acceleratorPut(d_basis_v[k],h_basis_v[k]);
+basis_v.push_back(basis[k].View(AcceleratorRead));
 }
 
 vobj zz=Zero();
-deviceVector<double> Qt_jv(Nm);
+Vector<double> Qt_jv(Nm);
 double * Qt_j = & Qt_jv[0];
-for(int k=0;k<Nm;++k) acceleratorPut(Qt_j[k],Qt(j,k));
+for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
 
-auto basis_vp=& d_basis_v[0];
+auto basis_vp=& basis_v[0];
 autoView(result_v,result,AcceleratorWrite);
 accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
 vobj zzz=Zero();
@@ -158,7 +171,7 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
 }
 coalescedWrite(result_v[ss], B);
 });
-for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
+for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
 }
 
 template<class Field>
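A sketch of the per-element device write the old side uses, assuming the acceleratorPut(dest,value) helper shown in this hunk: each coefficient is stored directly into device-resident memory instead of being dereferenced through UVM. Illustrative only.

template<class Matrix>
void putRowToDevice(const Matrix &Qt, int j, int Nm, deviceVector<double> &Qt_jv)
{
  Qt_jv.resize(Nm);
  double *Qt_j = &Qt_jv[0];
  for(int k=0;k<Nm;++k) acceleratorPut(Qt_j[k], Qt(j,k)); // element-wise host->device store
}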
@@ -165,7 +165,7 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
 
 int Nsimd = grid->Nsimd();
 
-// assert( l.Checkerboard()== grid->CheckerBoard(site));
+assert( l.Checkerboard()== grid->CheckerBoard(site));
 assert( sizeof(sobj)*Nsimd == sizeof(vobj));
 
 static const int words=sizeof(vobj)/sizeof(vector_type);
@@ -179,7 +179,7 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
 for(int w=0;w<words;w++){
 pt[w] = getlane(vp[w],idx);
 }
-// std::cout << "peekLocalSite "<<site<<" "<<odx<<","<<idx<<" "<<s<<std::endl;
+
 return;
 };
 template<class vobj,class sobj>
@@ -202,7 +202,7 @@ inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
 
 int Nsimd = grid->Nsimd();
 
-// assert( l.Checkerboard()== grid->CheckerBoard(site));
+assert( l.Checkerboard()== grid->CheckerBoard(site));
 assert( sizeof(sobj)*Nsimd == sizeof(vobj));
 
 static const int words=sizeof(vobj)/sizeof(vector_type);
@@ -46,7 +46,7 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
 // const int Nsimd = vobj::Nsimd();
 const int nthread = GridThread::GetThreads();
 
-std::vector<sobj> sumarray(nthread);
+Vector<sobj> sumarray(nthread);
 for(int i=0;i<nthread;i++){
 sumarray[i]=Zero();
 }
@@ -75,7 +75,7 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
 
 const int nthread = GridThread::GetThreads();
 
-std::vector<sobj> sumarray(nthread);
+Vector<sobj> sumarray(nthread);
 for(int i=0;i<nthread;i++){
 sumarray[i]=Zero();
 }
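A standalone host model of the sum_cpu pattern in this hunk: one partial accumulator per thread, then a serial combine, so no atomics are needed. The Grid version parallelises the outer loop with thread_for; a serial stand-in is used here.

#include <vector>
#include <numeric>

double sum_cpu_toy(const std::vector<double> &arg, int nthread)
{
  std::vector<double> sumarray(nthread, 0.0);   // one slot per thread
  for(int thr=0; thr<nthread; thr++){           // thread_for(thr,...) in Grid
    size_t lo = arg.size()* (size_t)thr   /nthread;
    size_t hi = arg.size()*((size_t)thr+1)/nthread;
    sumarray[thr] = std::accumulate(arg.begin()+lo, arg.begin()+hi, 0.0);
  }
  return std::accumulate(sumarray.begin(), sumarray.end(), 0.0); // serial combine
}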
@@ -290,10 +290,8 @@ template<class vobj>
 inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
 GridBase *grid = left.Grid();
 
-bool ok;
 #ifdef GRID_SYCL
 uint64_t csum=0;
-uint64_t csum2=0;
 if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
 {
 // Hack
@@ -302,33 +300,13 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
 Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
 uint64_t *base= (uint64_t *)&l_v[0];
 csum=svm_xor(base,words);
-ok = FlightRecorder::CsumLog(csum);
-if ( !ok ) {
-csum2=svm_xor(base,words);
-std::cerr<< " Bad CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
-} else {
-// csum2=svm_xor(base,words);
-// std::cerr<< " ok CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
-}
-assert(ok);
 }
+FlightRecorder::CsumLog(csum);
 #endif
-FlightRecorder::StepLog("rank inner product");
 ComplexD nrm = rankInnerProduct(left,right);
-// ComplexD nrmck=nrm;
 RealD local = real(nrm);
-ok = FlightRecorder::NormLog(real(nrm));
-if ( !ok ) {
-ComplexD nrm2 = rankInnerProduct(left,right);
-RealD local2 = real(nrm2);
-std::cerr<< " Bad NORM " << local << " recomputed as "<<local2<<std::endl;
-assert(ok);
-}
-FlightRecorder::StepLog("Start global sum");
-// grid->GlobalSumP2P(nrm);
+FlightRecorder::NormLog(real(nrm));
 grid->GlobalSum(nrm);
-FlightRecorder::StepLog("Finished global sum");
-// std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
 FlightRecorder::ReductionLog(local,real(nrm));
 return nrm;
 }
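A standalone analogue of the verify-and-retry logic this hunk drops: log a checksum, and on mismatch recompute once to tell a transient corruption from a reproducible one. The real code uses FlightRecorder::CsumLog and the device-side svm_xor; the stand-in below only models the control flow.

#include <cassert>
#include <cstdint>
#include <cstdio>

static uint64_t expected = 0;
static bool first = true;

bool csumLog(uint64_t csum){                  // stand-in for FlightRecorder::CsumLog
  if (first){ expected = csum; first = false; return true; }
  return csum == expected;
}

void checkBuffer(const uint64_t *base, uint64_t words){
  uint64_t csum = 0;
  for(uint64_t i=0;i<words;i++) csum ^= base[i];
  if (!csumLog(csum)){
    uint64_t csum2 = 0;                       // recompute: does the fault repeat?
    for(uint64_t i=0;i<words;i++) csum2 ^= base[i];
    std::fprintf(stderr," Bad CSUM %llx recomputed as %llx\n",
                 (unsigned long long)csum,(unsigned long long)csum2);
    assert(0);
  }
}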
@@ -365,6 +343,18 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
 autoView( x_v, x, AcceleratorRead);
 autoView( y_v, y, AcceleratorRead);
 autoView( z_v, z, AcceleratorWrite);
+#if 0
+typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
+Vector<inner_t> inner_tmp(sites);
+auto inner_tmp_v = &inner_tmp[0];
+
+accelerator_for( ss, sites, nsimd,{
+auto tmp = a*x_v(ss)+b*y_v(ss);
+coalescedWrite(inner_tmp_v[ss],innerProductD(tmp,tmp));
+coalescedWrite(z_v[ss],tmp);
+});
+nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
+#else
 typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
 deviceVector<inner_t> inner_tmp;
 inner_tmp.resize(sites);
@@ -375,44 +365,9 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
 coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
 coalescedWrite(z_v[ss],tmp);
 });
-bool ok;
-#ifdef GRID_SYCL
-uint64_t csum=0;
-uint64_t csum2=0;
-if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
-{
-// z_v
-{
-Integer words = sites*sizeof(vobj)/sizeof(uint64_t);
-uint64_t *base= (uint64_t *)&z_v[0];
-csum=svm_xor(base,words);
-ok = FlightRecorder::CsumLog(csum);
-if ( !ok ) {
-csum2=svm_xor(base,words);
-std::cerr<< " Bad z_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
-}
-assert(ok);
-}
-// inner_v
-{
-Integer words = sites*sizeof(inner_t)/sizeof(uint64_t);
-uint64_t *base= (uint64_t *)&inner_tmp_v[0];
-csum=svm_xor(base,words);
-ok = FlightRecorder::CsumLog(csum);
-if ( !ok ) {
-csum2=svm_xor(base,words);
-std::cerr<< " Bad inner_tmp_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
-}
-assert(ok);
-}
-}
-#endif
 nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
-ok = FlightRecorder::NormLog(real(nrm));
-assert(ok);
-RealD local = real(nrm);
+#endif
 grid->GlobalSum(nrm);
-FlightRecorder::ReductionLog(local,real(nrm));
 return nrm;
 }
 
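The point of axpby_norm_fast, on both sides of this diff, is fusion: z = a*x + b*y and |z|^2 are produced in one pass so z is never re-read for the norm. A scalar, single-rank analogue:

#include <complex>
#include <vector>

double axpby_norm_scalar(std::complex<double> a, std::complex<double> b,
                         const std::vector<std::complex<double>> &x,
                         const std::vector<std::complex<double>> &y,
                         std::vector<std::complex<double>> &z)
{
  double nrm = 0.0;
  for(size_t ss=0; ss<x.size(); ss++){
    auto tmp = a*x[ss] + b*y[ss];
    nrm += std::norm(tmp);     // |tmp|^2 accumulated alongside the write
    z[ss] = tmp;
  }
  return nrm;                  // the lattice code still GlobalSum()s this across ranks
}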
@@ -422,7 +377,7 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
 conformable(left,right);
 
 typedef typename vobj::vector_typeD vector_type;
-std::vector<ComplexD> tmp(2);
+Vector<ComplexD> tmp(2);
 
 GridBase *grid = left.Grid();
 
@@ -432,8 +387,8 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
 // GPU
 typedef decltype(innerProductD(vobj(),vobj())) inner_t;
 typedef decltype(innerProductD(vobj(),vobj())) norm_t;
-deviceVector<inner_t> inner_tmp(sites);
-deviceVector<norm_t> norm_tmp(sites);
+Vector<inner_t> inner_tmp(sites);
+Vector<norm_t> norm_tmp(sites);
 auto inner_tmp_v = &inner_tmp[0];
 auto norm_tmp_v = &norm_tmp[0];
 {
@@ -483,9 +438,7 @@ inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
 // sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
-std::vector<typename vobj::scalar_object> &result,
-int orthogdim)
+template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
 {
 ///////////////////////////////////////////////////////
 // FIXME precision promoted summation
@@ -507,8 +460,8 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
 int ld=grid->_ldimensions[orthogdim];
 int rd=grid->_rdimensions[orthogdim];
 
-std::vector<vobj> lvSum(rd); // will locally sum vectors first
-std::vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
+Vector<vobj> lvSum(rd); // will locally sum vectors first
+Vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
 ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD
 
 result.resize(fd); // And then global sum to return the same vector to every node
@@ -556,8 +509,6 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
 scalar_type * ptr = (scalar_type *) &result[0];
 int words = fd*sizeof(sobj)/sizeof(scalar_type);
 grid->GlobalSumVector(ptr, words);
-// std::cout << GridLogMessage << " sliceSum local"<<t_sum<<" us, host+mpi "<<t_rest<<std::endl;
-
 }
 template<class vobj> inline
 std::vector<typename vobj::scalar_object>
@@ -568,20 +519,7 @@ sliceSum(const Lattice<vobj> &Data,int orthogdim)
 return result;
 }
 
-/*
-Reimplement
-
-1)
-template<class vobj>
-static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
-
-2)
-template<class vobj>
-static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
-
-3)
--- Make Slice Mul Matrix call sliceMaddMatrix
-*/
 template<class vobj>
 static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
 {
@@ -601,8 +539,8 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
 int ld=grid->_ldimensions[orthogdim];
 int rd=grid->_rdimensions[orthogdim];
 
-std::vector<vector_type> lvSum(rd); // will locally sum vectors first
-std::vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
+Vector<vector_type> lvSum(rd); // will locally sum vectors first
+Vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
 ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
 
 result.resize(fd); // And then global sum to return the same vector to every node for IO to file
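The sliceSum flow in these hunks is: sum locally within each slice of the orthogonal dimension, then one global reduction returns the same vector on every node. A toy single-rank model with a flat 1-d "orthogonal" index:

#include <complex>
#include <vector>

std::vector<std::complex<double>>
sliceSumToy(const std::vector<std::complex<double>> &data,
            int fd /* number of slices */, int vol_per_slice)
{
  std::vector<std::complex<double>> result(fd, 0.0);
  for(int t=0; t<fd; t++)
    for(int s=0; s<vol_per_slice; s++)
      result[t] += data[(size_t)t*vol_per_slice + s];  // local per-slice sum
  // grid->GlobalSumVector((scalar_type*)&result[0], words);  // MPI step, omitted here
  return result;
}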
@@ -732,96 +670,203 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
 }
 };
 
+/*
 inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
 {
 int NN = BlockSolverGrid->_ndimension;
 int nsimd = BlockSolverGrid->Nsimd();
 
-std::vector<int> latt_phys(NN-1);
-Coordinate simd_phys;
-std::vector<int> mpi_phys(NN-1);
-Coordinate checker_dim_mask(NN-1);
-int checker_dim=-1;
+std::vector<int> latt_phys(0);
+std::vector<int> simd_phys(0);
+std::vector<int> mpi_phys(0);
 
-int dd;
 for(int d=0;d<NN;d++){
 if( d!=Orthog ) {
-latt_phys[dd]=BlockSolverGrid->_fdimensions[d];
-mpi_phys[dd] =BlockSolverGrid->_processors[d];
-checker_dim_mask[dd] = BlockSolverGrid->_checker_dim_mask[d];
-if ( d == BlockSolverGrid->_checker_dim ) checker_dim = dd;
-dd++;
+latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);
+simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);
+mpi_phys.push_back(BlockSolverGrid->_processors[d]);
 }
 }
-simd_phys=GridDefaultSimd(latt_phys.size(),nsimd);
-GridCartesian *tmp = new GridCartesian(latt_phys,simd_phys,mpi_phys);
-if(BlockSolverGrid->_isCheckerBoarded) {
-GridRedBlackCartesian *ret = new GridRedBlackCartesian(tmp,checker_dim_mask,checker_dim);
-delete tmp;
-return (GridBase *) ret;
-} else {
-return (GridBase *) tmp;
-}
+return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);
 }
+*/
 
 template<class vobj>
 static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
 {
-GridBase *FullGrid = X.Grid();
-GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
 
-Lattice<vobj> Ys(SliceGrid);
-Lattice<vobj> Rs(SliceGrid);
-Lattice<vobj> Xs(SliceGrid);
-Lattice<vobj> RR(FullGrid);
 
-RR = R; // Copies checkerboard for insert
 
 typedef typename vobj::scalar_object sobj;
 typedef typename vobj::vector_type vector_type;
-int Nslice = X.Grid()->GlobalDimensions()[Orthog];
-for(int i=0;i<Nslice;i++){
-ExtractSlice(Ys,Y,i,Orthog);
-ExtractSlice(Rs,R,i,Orthog);
-Rs=Ys;
-for(int j=0;j<Nslice;j++){
-ExtractSlice(Xs,X,j,Orthog);
-Rs = Rs + Xs*(scale*aa(j,i));
-}
-InsertSlice(Rs,RR,i,Orthog);
+int Nblock = X.Grid()->GlobalDimensions()[Orthog];
+GridBase *FullGrid = X.Grid();
+// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+// Lattice<vobj> Xslice(SliceGrid);
+// Lattice<vobj> Rslice(SliceGrid);
+assert( FullGrid->_simd_layout[Orthog]==1);
+// int nh = FullGrid->_ndimension;
+// int nl = SliceGrid->_ndimension;
+// int nl = nh-1;
+
+//FIXME package in a convenient iterator
+//Should loop over a plane orthogonal to direction "Orthog"
+int stride=FullGrid->_slice_stride[Orthog];
+int block =FullGrid->_slice_block [Orthog];
+int nblock=FullGrid->_slice_nblock[Orthog];
+int ostride=FullGrid->_ostride[Orthog];
+
+autoView( X_v, X, CpuRead);
+autoView( Y_v, Y, CpuRead);
+autoView( R_v, R, CpuWrite);
+thread_region
+{
+Vector<vobj> s_x(Nblock);
+
+thread_for_collapse_in_region(2, n,nblock, {
+for(int b=0;b<block;b++){
+int o = n*stride + b;
+
+for(int i=0;i<Nblock;i++){
+s_x[i] = X_v[o+i*ostride];
+}
+
+vobj dot;
+for(int i=0;i<Nblock;i++){
+dot = Y_v[o+i*ostride];
+for(int j=0;j<Nblock;j++){
+dot = dot + s_x[j]*(scale*aa(j,i));
+}
+R_v[o+i*ostride]=dot;
+}
+}});
 }
-R=RR; // Copy back handles arguments aliasing case
-delete SliceGrid;
 };
 
 template<class vobj>
 static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
 {
-R=Zero();
-sliceMaddMatrix(R,aa,X,R,Orthog,scale);
+typedef typename vobj::scalar_object sobj;
+typedef typename vobj::vector_type vector_type;
 
+int Nblock = X.Grid()->GlobalDimensions()[Orthog];
+
+GridBase *FullGrid = X.Grid();
+// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+// Lattice<vobj> Xslice(SliceGrid);
+// Lattice<vobj> Rslice(SliceGrid);
+
+assert( FullGrid->_simd_layout[Orthog]==1);
+// int nh = FullGrid->_ndimension;
+// int nl = SliceGrid->_ndimension;
+// int nl=1;
+
+//FIXME package in a convenient iterator
+// thread_for2d_in_region
+//Should loop over a plane orthogonal to direction "Orthog"
+int stride=FullGrid->_slice_stride[Orthog];
+int block =FullGrid->_slice_block [Orthog];
+int nblock=FullGrid->_slice_nblock[Orthog];
+int ostride=FullGrid->_ostride[Orthog];
+autoView( R_v, R, CpuWrite);
+autoView( X_v, X, CpuRead);
+thread_region
+{
+std::vector<vobj> s_x(Nblock);
+
+
+thread_for_collapse_in_region( 2 ,n,nblock,{
+for(int b=0;b<block;b++){
+int o = n*stride + b;
+
+for(int i=0;i<Nblock;i++){
+s_x[i] = X_v[o+i*ostride];
+}
+
+vobj dot;
+for(int i=0;i<Nblock;i++){
+dot = s_x[0]*(scale*aa(0,i));
+for(int j=1;j<Nblock;j++){
+dot = dot + s_x[j]*(scale*aa(j,i));
+}
+R_v[o+i*ostride]=dot;
+}
+}});
+}
 };
 
 
 template<class vobj>
 static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
 {
-GridBase *SliceGrid = makeSubSliceGrid(lhs.Grid(),Orthog);
 
-Lattice<vobj> ls(SliceGrid);
-Lattice<vobj> rs(SliceGrid);
 
 typedef typename vobj::scalar_object sobj;
 typedef typename vobj::vector_type vector_type;
-int Nslice = lhs.Grid()->GlobalDimensions()[Orthog];
-mat = Eigen::MatrixXcd::Zero(Nslice,Nslice);
-for(int s=0;s<Nslice;s++){
-ExtractSlice(ls,lhs,s,Orthog);
-for(int ss=0;ss<Nslice;ss++){
-ExtractSlice(rs,rhs,ss,Orthog);
-mat(s,ss) = innerProduct(ls,rs);
-}
-}
+GridBase *FullGrid = lhs.Grid();
+// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+int Nblock = FullGrid->GlobalDimensions()[Orthog];
+// Lattice<vobj> Lslice(SliceGrid);
+// Lattice<vobj> Rslice(SliceGrid);
 
+mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+assert( FullGrid->_simd_layout[Orthog]==1);
+// int nh = FullGrid->_ndimension;
+// int nl = SliceGrid->_ndimension;
+// int nl = nh-1;
+
+//FIXME package in a convenient iterator
+//Should loop over a plane orthogonal to direction "Orthog"
+int stride=FullGrid->_slice_stride[Orthog];
+int block =FullGrid->_slice_block [Orthog];
+int nblock=FullGrid->_slice_nblock[Orthog];
+int ostride=FullGrid->_ostride[Orthog];
+
+typedef typename vobj::vector_typeD vector_typeD;
+
+autoView( lhs_v, lhs, CpuRead);
+autoView( rhs_v, rhs, CpuRead);
+thread_region
+{
+std::vector<vobj> Left(Nblock);
+std::vector<vobj> Right(Nblock);
+Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+thread_for_collapse_in_region( 2, n,nblock,{
+for(int b=0;b<block;b++){
+
+int o = n*stride + b;
+
+for(int i=0;i<Nblock;i++){
+Left [i] = lhs_v[o+i*ostride];
+Right[i] = rhs_v[o+i*ostride];
+}
+
+for(int i=0;i<Nblock;i++){
+for(int j=0;j<Nblock;j++){
+auto tmp = innerProduct(Left[i],Right[j]);
+auto rtmp = TensorRemove(tmp);
+auto red = Reduce(rtmp);
+mat_thread(i,j) += std::complex<double>(real(red),imag(red));
+}}
+}});
+thread_critical
+{
+mat += mat_thread;
+}
 }
-delete SliceGrid;
+for(int i=0;i<Nblock;i++){
+for(int j=0;j<Nblock;j++){
+ComplexD sum = mat(i,j);
+FullGrid->GlobalSum(sum);
+mat(i,j)=sum;
+}}
+
+return;
 }
 
 NAMESPACE_END(Grid);
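The per-site block update that sliceMaddMatrix introduces above is, at each site o of a slice plane, R_i = Y_i + scale * sum_j X_j * aa(j,i). A minimal single-site model of that kernel, using the same Eigen matrix type the diff uses:

#include <Eigen/Dense>
#include <complex>
#include <vector>

using Cvec = std::vector<std::complex<double>>;

void sliceMaddSite(const Eigen::MatrixXcd &aa, double scale,
                   const Cvec &X, const Cvec &Y, Cvec &R)
{
  int Nblock = (int)X.size();
  for(int i=0;i<Nblock;i++){
    std::complex<double> dot = Y[i];
    for(int j=0;j<Nblock;j++) dot += X[j]*(scale*aa(j,i));  // small dense GEMM column
    R[i] = dot;
  }
}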
@@ -214,12 +214,22 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
 // Move out of UVM
 // Turns out I had messed up the synchronise after move to compute stream
 // as running this on the default stream fools the synchronise
-deviceVector<sobj> buffer(numBlocks);
+#undef UVM_BLOCK_BUFFER
+#ifndef UVM_BLOCK_BUFFER
+commVector<sobj> buffer(numBlocks);
 sobj *buffer_v = &buffer[0];
 sobj result;
 reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
 accelerator_barrier();
 acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
+#else
+Vector<sobj> buffer(numBlocks);
+sobj *buffer_v = &buffer[0];
+sobj result;
+reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
+accelerator_barrier();
+result = *buffer_v;
+#endif
 return result;
 }
 
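Both branches of the hunk above are the tail of a two-stage block reduction: a kernel leaves one partial sum per block, and the host finishes the sum (after an explicit device-to-host copy, or via UVM). A host-only toy of the same structure:

#include <algorithm>
#include <numeric>
#include <vector>

double blockReduceToy(const std::vector<double> &lat, size_t blockSize)
{
  std::vector<double> partials;                 // plays the role of buffer_v
  for(size_t b=0; b<lat.size(); b+=blockSize){  // stage 1: one sum per block
    size_t e = std::min(lat.size(), b+blockSize);
    partials.push_back(std::accumulate(lat.begin()+b, lat.begin()+e, 0.0));
  }
  return std::accumulate(partials.begin(), partials.end(), 0.0); // stage 2 on the host
}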
@@ -234,7 +244,7 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi
 
 const int words = sizeof(vobj)/sizeof(vector);
 
-deviceVector<vector> buffer(osites);
+Vector<vector> buffer(osites);
 vector *dat = (vector *)lat;
 vector *buf = &buffer[0];
 iScalar<vector> *tbuf =(iScalar<vector> *) &buffer[0];
@@ -4,28 +4,33 @@ NAMESPACE_BEGIN(Grid);
 // Possibly promote to double and sum
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 
 
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)
 {
 typedef typename vobj::scalar_object sobj;
 typedef typename vobj::scalar_objectD sobjD;
+static Vector<sobj> mysum;
+mysum.resize(1);
+sobj *mysum_p = & mysum[0];
 sobj identity; zeroit(identity);
-sobj ret; zeroit(ret);
+mysum[0] = identity;
+sobj ret ;
+
 Integer nsimd= vobj::Nsimd();
-{
-sycl::buffer<sobj, 1> abuff(&ret, {1});
-theGridAccelerator->submit([&](sycl::handler &cgh) {
-auto Reduction = sycl::reduction(abuff,cgh,identity,std::plus<>());
-cgh.parallel_for(sycl::range<1>{osites},
+const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() });
+theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
+auto Reduction = cl::sycl::reduction(mysum_p,identity,std::plus<>(),PropList);
+cgh.parallel_for(cl::sycl::range<1>{osites},
 Reduction,
-[=] (sycl::id<1> item, auto &sum) {
+[=] (cl::sycl::id<1> item, auto &sum) {
 auto osite = item[0];
 sum +=Reduce(lat[osite]);
 });
 });
-}
+theGridAccelerator->wait();
+ret = mysum[0];
+// free(mysum,*theGridAccelerator);
 sobjD dret; convertType(dret,ret);
 return dret;
 }
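The two sides of this hunk are the buffer-based and pointer-based flavours of a SYCL 2020 reduction. A self-contained sketch of the buffer flavour, where the result is written back into a host scalar when the buffer is destroyed; the input lives in USM shared memory here purely to keep the example short:

#include <sycl/sycl.hpp>
#include <cstdio>

int main(){
  sycl::queue q;
  const size_t N = 1024;
  double *data = sycl::malloc_shared<double>(N, q);
  for(size_t i=0;i<N;i++) data[i] = 1.0;
  double ret = 0.0;
  {
    sycl::buffer<double,1> abuff(&ret, sycl::range<1>(1));
    q.submit([&](sycl::handler &cgh){
      auto red = sycl::reduction(abuff, cgh, 0.0, std::plus<>());
      cgh.parallel_for(sycl::range<1>(N), red,
                       [=](sycl::id<1> i, auto &sum){ sum += data[i]; });
    });
  } // buffer destructor copies the reduced value back into ret
  std::printf("sum = %f\n", ret);
  sycl::free(data, q);
  return 0;
}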
@@ -71,22 +76,59 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
 
 template<class Word> Word svm_xor(Word *vec,uint64_t L)
 {
+Word xorResult; xorResult = 0;
+static Vector<Word> d_sum;
+d_sum.resize(1);
+Word *d_sum_p=&d_sum[0];
 Word identity; identity=0;
-Word ret = 0;
-{
-sycl::buffer<Word, 1> abuff(&ret, {1});
-theGridAccelerator->submit([&](sycl::handler &cgh) {
-auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
-cgh.parallel_for(sycl::range<1>{L},
-Reduction,
-[=] (sycl::id<1> index, auto &sum) {
-sum ^=vec[index];
+d_sum[0] = identity;
+const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() });
+theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
+auto Reduction = cl::sycl::reduction(d_sum_p,identity,std::bit_xor<>(),PropList);
+cgh.parallel_for(cl::sycl::range<1>{L},
+Reduction,
+[=] (cl::sycl::id<1> index, auto &sum) {
+sum^=vec[index];
+});
 });
-});
-}
 theGridAccelerator->wait();
+Word ret = d_sum[0];
+// free(d_sum,*theGridAccelerator);
 return ret;
 }
 
 NAMESPACE_END(Grid);
 
+/*
+
+template <class vobj>
+inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
+{
+typedef typename vobj::vector_type vector;
+typedef typename vobj::scalar_type scalar;
+
+typedef typename vobj::scalar_typeD scalarD;
+typedef typename vobj::scalar_objectD sobjD;
+
+sobjD ret;
+scalarD *ret_p = (scalarD *)&ret;
+
+const int nsimd = vobj::Nsimd();
+const int words = sizeof(vobj)/sizeof(vector);
+
+Vector<scalar> buffer(osites*nsimd);
+scalar *buf = &buffer[0];
+vector *dat = (vector *)lat;
+
+for(int w=0;w<words;w++) {
+
+accelerator_for(ss,osites,nsimd,{
+int lane = acceleratorSIMTlane(nsimd);
+buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane);
+});
+//Precision change at this point is too late to gain precision
+ret_p[w] = svm_reduce(buf,nsimd*osites);
+}
+return ret;
+}
+*/
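A host-side model of what svm_xor computes: a word-wise XOR checksum. Two passes over identical data must reproduce the same value, which is exactly what the flight recorder compares when it validates reductions.

#include <cstdint>

uint64_t xor_checksum(const uint64_t *vec, uint64_t L){
  uint64_t sum = 0;
  for(uint64_t i=0;i<L;i++) sum ^= vec[i];  // order-independent, so safe to parallelise
  return sum;
}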
@@ -21,18 +21,9 @@ NAMESPACE_BEGIN(Grid);
 
 
 #if defined(GRID_CUDA) || defined(GRID_HIP)
-template<class vobj>
-inline void sliceSumReduction_cub_small(const vobj *Data,
-std::vector<vobj> &lvSum,
-const int rd,
-const int e1,
-const int e2,
-const int stride,
-const int ostride,
-const int Nsimd)
-{
+template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
 size_t subvol_size = e1*e2;
-deviceVector<vobj> reduction_buffer(rd*subvol_size);
+commVector<vobj> reduction_buffer(rd*subvol_size);
 auto rb_p = &reduction_buffer[0];
 vobj zero_init;
 zeroit(zero_init);
@@ -103,15 +94,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
 
 
 #if defined(GRID_SYCL)
-template<class vobj>
-inline void sliceSumReduction_sycl_small(const vobj *Data,
-std::vector <vobj> &lvSum,
-const int &rd,
-const int &e1,
-const int &e2,
-const int &stride,
-const int &ostride,
-const int &Nsimd)
+template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
 {
 size_t subvol_size = e1*e2;
 
@@ -122,7 +105,7 @@ inline void sliceSumReduction_sycl_small(const vobj *Data,
 mysum[r] = vobj_zero;
 }
 
-deviceVector<vobj> reduction_buffer(rd*subvol_size);
+commVector<vobj> reduction_buffer(rd*subvol_size);
 
 auto rb_p = &reduction_buffer[0];
 
@@ -141,11 +124,11 @@ inline void sliceSumReduction_sycl_small(const vobj *Data,
 });
 
 for (int r = 0; r < rd; r++) {
-theGridAccelerator->submit([&](sycl::handler &cgh) {
-auto Reduction = sycl::reduction(&mysum[r],std::plus<>());
-cgh.parallel_for(sycl::range<1>{subvol_size},
+theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
+auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>());
+cgh.parallel_for(cl::sycl::range<1>{subvol_size},
 Reduction,
-[=](sycl::id<1> item, auto &sum) {
+[=](cl::sycl::id<1> item, auto &sum) {
 auto s = item[0];
 sum += rb_p[r*subvol_size+s];
 });
@@ -161,23 +144,14 @@ inline void sliceSumReduction_sycl_small(const vobj *Data,
 }
 #endif
 
-template<class vobj>
-inline void sliceSumReduction_large(const vobj *Data,
-std::vector<vobj> &lvSum,
-const int rd,
-const int e1,
-const int e2,
-const int stride,
-const int ostride,
-const int Nsimd)
-{
+template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
 typedef typename vobj::vector_type vector;
 const int words = sizeof(vobj)/sizeof(vector);
 const int osites = rd*e1*e2;
-deviceVector<vector>buffer(osites);
+commVector<vector>buffer(osites);
 vector *dat = (vector *)Data;
 vector *buf = &buffer[0];
-std::vector<vector> lvSum_small(rd);
+Vector<vector> lvSum_small(rd);
 vector *lvSum_ptr = (vector *)&lvSum[0];
 
 for (int w = 0; w < words; w++) {
@@ -194,18 +168,13 @@ inline void sliceSumReduction_large(const vobj *Data,
 for (int r = 0; r < rd; r++) {
 lvSum_ptr[w+words*r]=lvSum_small[r];
 }
 
 }
 
 
 }
 
-template<class vobj>
-inline void sliceSumReduction_gpu(const Lattice<vobj> &Data,
-std::vector<vobj> &lvSum,
-const int rd,
-const int e1,
-const int e2,
-const int stride,
-const int ostride,
-const int Nsimd)
+template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
 {
 autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case.
 if constexpr (sizeof(vobj) <= 256) {
@@ -223,15 +192,7 @@ inline void sliceSumReduction_gpu(const Lattice<vobj> &Data,
 }
 
 
-template<class vobj>
-inline void sliceSumReduction_cpu(const Lattice<vobj> &Data,
-std::vector<vobj> &lvSum,
-const int &rd,
-const int &e1,
-const int &e2,
-const int &stride,
-const int &ostride,
-const int &Nsimd)
+template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
 {
 // sum over reduced dimension planes, breaking out orthog dir
 // Parallel over orthog direction
@@ -247,20 +208,16 @@ inline void sliceSumReduction_cpu(const Lattice<vobj> &Data,
 });
 }
 
-template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data,
-std::vector<vobj> &lvSum,
-const int &rd,
-const int &e1,
-const int &e2,
-const int &stride,
-const int &ostride,
-const int &Nsimd)
+template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
 {
 #if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
 
 sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
 #else
 sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
 #endif
 }
 
 
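sliceSumReduction_gpu above splits on sizeof(vobj): objects up to 256 bytes go straight to the library reduction, larger ones are repacked and reduced one SIMD word at a time so the reduction library only ever sees small elements. A host-only model of the "large" strategy:

#include <vector>

template<int words>
struct BigObj { double w[words]; };   // stand-in for a large vobj of `words` vectors

template<int words>
BigObj<words> reduceLarge(const std::vector<BigObj<words>> &data){
  BigObj<words> sum{};                // zero-initialised accumulator
  for(int w=0; w<words; w++)          // one small reduction per word
    for(const auto &d : data) sum.w[w] += d.w[w];
  return sum;
}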
@@ -42,50 +42,21 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
 assert((fine->_rdimensions[d] / coarse->_rdimensions[d])* coarse->_rdimensions[d]==fine->_rdimensions[d]);
 }
 }
 
 
 ////////////////////////////////////////////////////////////////////////////////////////////
 // remove and insert a half checkerboard
 ////////////////////////////////////////////////////////////////////////////////////////////
 
 template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full)
 {
-half.Checkerboard() = cb;
-
-autoView( half_v, half, CpuWrite);
-autoView( full_v, full, CpuRead);
-thread_for(ss, full.Grid()->oSites(),{
-int cbos;
-Coordinate coor;
-full.Grid()->oCoorFromOindex(coor,ss);
-cbos=half.Grid()->CheckerBoard(coor);
-
-if (cbos==cb) {
-int ssh=half.Grid()->oIndex(coor);
-half_v[ssh] = full_v[ss];
-}
-});
+acceleratorPickCheckerboard(cb,half,full);
 }
 template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half)
 {
-int cb = half.Checkerboard();
-autoView( half_v , half, CpuRead);
-autoView( full_v , full, CpuWrite);
-thread_for(ss,full.Grid()->oSites(),{
-
-Coordinate coor;
-int cbos;
-
-full.Grid()->oCoorFromOindex(coor,ss);
-cbos=half.Grid()->CheckerBoard(coor);
-
-if (cbos==cb) {
-int ssh=half.Grid()->oIndex(coor);
-full_v[ss]=half_v[ssh];
-}
-});
+acceleratorSetCheckerboard(full,half);
 }
 
-template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0)
+template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int dummy=0)
 {
 half.Checkerboard() = cb;
 autoView(half_v, half, AcceleratorWrite);
@@ -95,6 +66,7 @@ template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj
 unsigned long ndim_half = half.Grid()->_ndimension;
 Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
 Coordinate ostride_half = half.Grid()->_ostride;
+int checker_dim_half = half.Grid()->CheckerDim();
 accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{
 
 Coordinate coor;
@@ -119,7 +91,7 @@ template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj
 }
 });
 }
-template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0)
+template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int dummy=0)
 {
 int cb = half.Checkerboard();
 autoView(half_v , half, AcceleratorRead);
@@ -129,6 +101,7 @@ template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,
 unsigned long ndim_half = half.Grid()->_ndimension;
 Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
 Coordinate ostride_half = half.Grid()->_ostride;
+int checker_dim_half = half.Grid()->CheckerDim();
 accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{
 
 Coordinate coor;
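A toy 1-d model of the pick/setCheckerboard pair above: sites whose coordinate parity matches cb are copied into the half lattice, and the inverse operation scatters them back. The real routines compute the parity from the full multi-dimensional coordinate; the parity of a single index stands in for that here.

#include <vector>

void pickCB(int cb, std::vector<double> &half, const std::vector<double> &full){
  half.resize(full.size()/2);
  for(size_t ss=0; ss<full.size(); ss++)
    if ((int)(ss & 1) == cb) half[ss/2] = full[ss];   // keep matching-parity sites
}

void setCB(int cb, std::vector<double> &full, const std::vector<double> &half){
  for(size_t ss=0; ss<full.size(); ss++)
    if ((int)(ss & 1) == cb) full[ss] = half[ss/2];   // scatter them back
}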
@@ -981,14 +954,8 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
 hcoor[orthog] = slice;
 for(int d=0;d<nh;d++){
 if ( d!=orthog ) {
-hcoor[d]=lcoor[ddl];
-if ( hg->_checker_dim == d ) {
-hcoor[d]=hcoor[d]*2; // factor in the full coor for peekLocalSite
-lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
-}
-ddl++;
+hcoor[d]=lcoor[ddl++];
 }
-
 }
 peekLocalSite(s,lowDimv,lcoor);
 pokeLocalSite(s,higherDimv,hcoor);
@@ -1009,7 +976,6 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
 assert(orthog<nh);
 assert(orthog>=0);
 assert(hg->_processors[orthog]==1);
-lowDim.Checkerboard() = higherDim.Checkerboard();
 
 int dl; dl = 0;
 for(int d=0;d<nh;d++){
@@ -1027,16 +993,11 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
 Coordinate lcoor(nl);
 Coordinate hcoor(nh);
 lg->LocalIndexToLocalCoor(idx,lcoor);
-hcoor[orthog] = slice;
 int ddl=0;
+hcoor[orthog] = slice;
 for(int d=0;d<nh;d++){
 if ( d!=orthog ) {
-hcoor[d]=lcoor[ddl];
-if ( hg->_checker_dim == d ) {
-hcoor[d]=hcoor[d]*2; // factor in the full grid coor for peekLocalSite
-lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
-}
-ddl++;
+hcoor[d]=lcoor[ddl++];
 }
 }
 peekLocalSite(s,higherDimv,hcoor);
@@ -54,7 +54,7 @@ struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::ve
 *
 */
 
-template<class vobj> inline void ScatterSlice(const deviceVector<vobj> &buf,
+template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
 Lattice<vobj> &lat,
 int x,
 int dim,
@@ -140,7 +140,7 @@ template<class vobj> inline void ScatterSlice(const deviceVector<vobj> &buf,
 });
 }
 
-template<class vobj> inline void GatherSlice(deviceVector<vobj> &buf,
+template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf,
 const Lattice<vobj> &lat,
 int x,
 int dim,
@@ -462,19 +462,13 @@ public:
 int rNsimd = Nsimd / simd[dimension];
 assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);
 
-static deviceVector<vobj> send_buf;
-static deviceVector<vobj> recv_buf;
+static cshiftVector<vobj> send_buf;
+static cshiftVector<vobj> recv_buf;
 send_buf.resize(buffer_size*2*depth);
 recv_buf.resize(buffer_size*2*depth);
-#ifndef ACCELERATOR_AWARE_MPI
-static hostVector<vobj> hsend_buf;
-static hostVector<vobj> hrecv_buf;
-hsend_buf.resize(buffer_size*2*depth);
-hrecv_buf.resize(buffer_size*2*depth);
-#endif
 
-std::vector<MpiCommsRequest_t> fwd_req;
-std::vector<MpiCommsRequest_t> bwd_req;
+std::vector<CommsRequest_t> fwd_req;
+std::vector<CommsRequest_t> bwd_req;
 
 int words = buffer_size;
 int bytes = words * sizeof(vobj);
@@ -501,17 +495,9 @@ public:
 t_gather+=usecond()-t;
 
 t=usecond();
-#ifdef ACCELERATOR_AWARE_MPI
 grid->SendToRecvFromBegin(fwd_req,
 (void *)&send_buf[d*buffer_size], xmit_to_rank,
 (void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
-#else
-acceleratorCopyFromDevice(&send_buf[d*buffer_size],&hsend_buf[d*buffer_size],bytes);
-grid->SendToRecvFromBegin(fwd_req,
-(void *)&hsend_buf[d*buffer_size], xmit_to_rank,
-(void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag);
-acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
-#endif
 t_comms+=usecond()-t;
 }
 for ( int d=0;d < depth ; d ++ ) {
@@ -522,17 +508,9 @@ public:
 t_gather+= usecond() - t;
 
 t=usecond();
-#ifdef ACCELERATOR_AWARE_MPI
 grid->SendToRecvFromBegin(bwd_req,
 (void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
 (void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
-#else
-acceleratorCopyFromDevice(&send_buf[(d+depth)*buffer_size],&hsend_buf[(d+depth)*buffer_size],bytes);
-grid->SendToRecvFromBegin(bwd_req,
-(void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank,
-(void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
-acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
-#endif
 t_comms+=usecond()-t;
 }
 
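The non-accelerator-aware-MPI path removed above stages device buffers through host copies around the MPI call. A standalone model of that pattern; the device-to-host and host-to-device steps are stubbed with memcpy, where the Grid code uses acceleratorCopyFromDevice/acceleratorCopyToDevice:

#include <mpi.h>
#include <cstring>
#include <vector>

void haloExchangeStaged(std::vector<double> &d_send, std::vector<double> &d_recv,
                        int to, int from, MPI_Comm comm)
{
  std::vector<double> h_send(d_send.size()), h_recv(d_recv.size());
  std::memcpy(h_send.data(), d_send.data(), d_send.size()*sizeof(double)); // D2H stand-in
  MPI_Sendrecv(h_send.data(), (int)h_send.size(), MPI_DOUBLE, to,   0,
               h_recv.data(), (int)h_recv.size(), MPI_DOUBLE, from, 0,
               comm, MPI_STATUS_IGNORE);
  std::memcpy(d_recv.data(), h_recv.data(), h_recv.size()*sizeof(double)); // H2D stand-in
}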
@@ -98,7 +98,7 @@ public:
 virtual RealD S(const GaugeField& U) = 0; // evaluate the action
 virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ; // if the refresh computes the action, can cache it. Alternately refreshAndAction() ?
 virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative
 
 /////////////////////////////////////////////////////////////
 // virtual smeared interface through configuration container
 /////////////////////////////////////////////////////////////
@@ -132,10 +132,6 @@ public:
 template <class GaugeField >
 class EmptyAction : public Action <GaugeField>
 {
-using Action<GaugeField>::refresh;
-using Action<GaugeField>::Sinitial;
-using Action<GaugeField>::deriv;
-
 virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions
 virtual RealD S(const GaugeField& U) { return 0.0;}; // evaluate the action
 virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); }; // evaluate the action derivative
@@ -55,11 +55,6 @@ public:
 RealD alpha; // Mobius scale
 RealD k; // EOFA normalization constant
 
-// Device resident
-deviceVector<Coeff_t> d_shift_coefficients;
-deviceVector<Coeff_t> d_MooeeInv_shift_lc;
-deviceVector<Coeff_t> d_MooeeInv_shift_norm;
-
 virtual void Instantiatable(void) = 0;
 
 // EOFA-specific operations
@@ -97,11 +92,6 @@ public:
 this->k = this->alpha * (_mq3-_mq2) * std::pow(this->alpha+1.0,2*Ls) /
 ( std::pow(this->alpha+1.0,Ls) + _mq2*std::pow(this->alpha-1.0,Ls) ) /
 ( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) );
-
-d_shift_coefficients.resize(Ls);
-d_MooeeInv_shift_lc.resize(Ls);
-d_MooeeInv_shift_norm.resize(Ls);
-
 };
 };
 
|
@ -90,16 +90,16 @@ public:
|
|||||||
void M5D(const FermionField &psi,
|
void M5D(const FermionField &psi,
|
||||||
const FermionField &phi,
|
const FermionField &phi,
|
||||||
FermionField &chi,
|
FermionField &chi,
|
||||||
std::vector<Coeff_t> &lower,
|
Vector<Coeff_t> &lower,
|
||||||
std::vector<Coeff_t> &diag,
|
Vector<Coeff_t> &diag,
|
||||||
std::vector<Coeff_t> &upper);
|
Vector<Coeff_t> &upper);
|
||||||
|
|
||||||
void M5Ddag(const FermionField &psi,
|
void M5Ddag(const FermionField &psi,
|
||||||
const FermionField &phi,
|
const FermionField &phi,
|
||||||
FermionField &chi,
|
FermionField &chi,
|
||||||
std::vector<Coeff_t> &lower,
|
Vector<Coeff_t> &lower,
|
||||||
std::vector<Coeff_t> &diag,
|
Vector<Coeff_t> &diag,
|
||||||
std::vector<Coeff_t> &upper);
|
Vector<Coeff_t> &upper);
|
||||||
|
|
||||||
virtual void Instantiatable(void)=0;
|
virtual void Instantiatable(void)=0;
|
||||||
|
|
||||||
@@ -119,51 +119,35 @@ public:
   RealD mass_plus, mass_minus;
 
   // Save arguments to SetCoefficientsInternal
-  std::vector<Coeff_t> _gamma;
+  Vector<Coeff_t> _gamma;
   RealD _zolo_hi;
   RealD _b;
   RealD _c;
 
-  // possible boost
-  std::vector<ComplexD> qmu;
-  void set_qmu(std::vector<ComplexD> _qmu) { qmu=_qmu; assert(qmu.size()==Nd);};
-  void addQmu(const FermionField &in, FermionField &out, int dag);
-
   // Cayley form Moebius (tanh and zolotarev)
-  std::vector<Coeff_t> omega;
-  std::vector<Coeff_t> bs;    // S dependent coeffs
-  std::vector<Coeff_t> cs;
-  std::vector<Coeff_t> as;
+  Vector<Coeff_t> omega;
+  Vector<Coeff_t> bs;    // S dependent coeffs
+  Vector<Coeff_t> cs;
+  Vector<Coeff_t> as;
   // For preconditioning Cayley form
-  std::vector<Coeff_t> bee;
-  std::vector<Coeff_t> cee;
-  std::vector<Coeff_t> aee;
-  std::vector<Coeff_t> beo;
-  std::vector<Coeff_t> ceo;
-  std::vector<Coeff_t> aeo;
+  Vector<Coeff_t> bee;
+  Vector<Coeff_t> cee;
+  Vector<Coeff_t> aee;
+  Vector<Coeff_t> beo;
+  Vector<Coeff_t> ceo;
+  Vector<Coeff_t> aeo;
   // LDU factorisation of the eeoo matrix
-  std::vector<Coeff_t> lee;
-  std::vector<Coeff_t> leem;
-  std::vector<Coeff_t> uee;
-  std::vector<Coeff_t> ueem;
-  std::vector<Coeff_t> dee;
+  Vector<Coeff_t> lee;
+  Vector<Coeff_t> leem;
+  Vector<Coeff_t> uee;
+  Vector<Coeff_t> ueem;
+  Vector<Coeff_t> dee;
 
-  // Device memory
-  deviceVector<Coeff_t> d_diag;
-  deviceVector<Coeff_t> d_upper;
-  deviceVector<Coeff_t> d_lower;
-
-  deviceVector<Coeff_t> d_lee;
-  deviceVector<Coeff_t> d_dee;
-  deviceVector<Coeff_t> d_uee;
-  deviceVector<Coeff_t> d_leem;
-  deviceVector<Coeff_t> d_ueem;
-
   // Matrices of 5d ee inverse params
-  //  std::vector<iSinglet<Simd> > MatpInv;
-  //  std::vector<iSinglet<Simd> > MatmInv;
-  //  std::vector<iSinglet<Simd> > MatpInvDag;
-  //  std::vector<iSinglet<Simd> > MatmInvDag;
+  Vector<iSinglet<Simd> > MatpInv;
+  Vector<iSinglet<Simd> > MatmInv;
+  Vector<iSinglet<Simd> > MatpInvDag;
+  Vector<iSinglet<Simd> > MatmInvDag;
 
   ///////////////////////////////////////////////////////////////
   // Conserved current utilities
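The deviceVector members removed above exist to give each host-side coefficient array a persistent device mirror, so the kernels dereference device-resident pointers; the matching copies appear in the M5D/MooeeInv hunks further down. A hedged sketch of the pairing, using Grid's acceleratorCopyToDevice but an otherwise hypothetical helper name:

    // Sketch under assumptions: Coeff_t, deviceVector and acceleratorCopyToDevice
    // as in Grid; "stage" is an illustrative helper, not a library API.
    template<class Coeff_t>
    void stage(const std::vector<Coeff_t> &host, deviceVector<Coeff_t> &dev)
    {
      dev.resize(host.size());                              // persistent device mirror
      acceleratorCopyToDevice((void *)&host[0], (void *)&dev[0],
                              host.size()*sizeof(Coeff_t)); // H2D copy before launch
    }

Resizing once at coefficient setup and re-copying per call, as the removed code does, trades a small per-call copy for never touching host memory from device code.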
@@ -203,7 +187,7 @@ public:
 protected:
   virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
   virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
-  virtual void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
+  virtual void SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t> & gamma,RealD b,RealD c);
 };
 
 NAMESPACE_END(Grid);

@@ -60,50 +60,6 @@ public:
   // virtual void Instantiatable(void)=0;
   virtual void Instantiatable(void) =0;
 
-  void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist)
-  {
-    std::cout << "Free Propagator for PartialFraction"<<std::endl;
-    FermionField in_k(in.Grid());
-    FermionField prop_k(in.Grid());
-
-    FFT theFFT((GridCartesian *) in.Grid());
-
-    //phase for boundary condition
-    ComplexField coor(in.Grid());
-    ComplexField ph(in.Grid());  ph = Zero();
-    FermionField in_buf(in.Grid()); in_buf = Zero();
-    typedef typename Simd::scalar_type Scalar;
-    Scalar ci(0.0,1.0);
-    assert(twist.size() == Nd);//check that twist is Nd
-    assert(boundary.size() == Nd);//check that boundary conditions is Nd
-    int shift = 0;
-    for(unsigned int nu = 0; nu < Nd; nu++)
-    {
-      // Shift coordinate lattice index by 1 to account for 5th dimension.
-      LatticeCoordinate(coor, nu + shift);
-      double boundary_phase = ::acos(real(boundary[nu]));
-      ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
-      //momenta for propagator shifted by twist+boundary
-      twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
-    }
-    in_buf = exp(ci*ph*(-1.0))*in;
-
-    theFFT.FFT_all_dim(in_k,in,FFT::forward);
-    this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist);
-    theFFT.FFT_all_dim(out,prop_k,FFT::backward);
-
-    //phase for boundary condition
-    out = out * exp(ci*ph);
-  };
-
-  virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
-    std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions
-    std::vector<Complex> boundary;
-    for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions
-    FreePropagator(in,out,mass,boundary,twist);
-  };
-
   // Efficient support for multigrid coarsening
   virtual void Mdir   (const FermionField &in, FermionField &out,int dir,int disp);
   virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out);
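Restating the boundary handling in the block removed above, not an independent derivation: the boundary condition enters in two pieces, a per-site phase and a twist of the Fourier momenta. Writing the boundary factor as b_nu with theta_nu = arccos(Re b_nu), the source is first rotated by exp(-i sum_nu theta_nu x_nu / L_nu), and the twist handed to the momentum-space propagator is shifted by

    t_nu -> t_nu + theta_nu / (2*pi),

so the effective momenta become p_nu = 2*pi*(n_nu + t_nu)/L_nu; the inverse per-site phase is reapplied after the backward FFT.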
@@ -134,12 +90,12 @@ protected:
   RealD mass;
   RealD R;
   RealD ZoloHiInv;
-  std::vector<double> Beta;
-  std::vector<double> cc;;
-  std::vector<double> cc_d;;
-  std::vector<double> sqrt_cc;
-  std::vector<double> See;
-  std::vector<double> Aee;
+  Vector<double> Beta;
+  Vector<double> cc;;
+  Vector<double> cc_d;;
+  Vector<double> sqrt_cc;
+  Vector<double> See;
+  Vector<double> Aee;
 
 };
 

@@ -69,10 +69,10 @@ public:
   // Instantiate different versions depending on Impl
   /////////////////////////////////////////////////////
   void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
-           std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
+           Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
 
   void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
-              std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
+              Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
 
   virtual void RefreshShiftCoefficients(RealD new_shift);
 

@@ -83,7 +83,7 @@ public:
               RealD _M5, const ImplParams& p=ImplParams());
 
 protected:
-  void SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c);
+  void SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c);
 };
 
 NAMESPACE_END(Grid);
@@ -102,11 +102,11 @@ public:
                      GaugeField &mat,
                      const FermionField &A, const FermionField &B, int dag);
 
-  void DhopInternal(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
+  void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
                     const FermionField &in, FermionField &out, int dag);
-  void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
+  void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
                                const FermionField &in, FermionField &out, int dag);
-  void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
+  void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
                                    const FermionField &in, FermionField &out, int dag);
 
   //////////////////////////////////////////////////////////////////////////

@@ -164,6 +164,8 @@ public:
   DoubledGaugeField UUUmuEven;
   DoubledGaugeField UUUmuOdd;
 
+  LebesgueOrder Lebesgue;
+  LebesgueOrder LebesgueEvenOdd;
 
   ///////////////////////////////////////////////////////////////
   // Conserved current utilities

@@ -100,6 +100,7 @@ public:
                     int dag);
 
   void DhopInternal(StencilImpl & st,
+                    LebesgueOrder &lo,
                     DoubledGaugeField &U,
                     DoubledGaugeField &UUU,
                     const FermionField &in,

@@ -107,6 +108,7 @@ public:
                     int dag);
 
   void DhopInternalOverlappedComms(StencilImpl & st,
+                                   LebesgueOrder &lo,
                                    DoubledGaugeField &U,
                                    DoubledGaugeField &UUU,
                                    const FermionField &in,

@@ -114,6 +116,7 @@ public:
                     int dag);
 
   void DhopInternalSerialComms(StencilImpl & st,
+                               LebesgueOrder &lo,
                                DoubledGaugeField &U,
                                DoubledGaugeField &UUU,
                                const FermionField &in,

@@ -189,6 +192,8 @@ public:
   DoubledGaugeField UUUmuEven;
   DoubledGaugeField UUUmuOdd;
 
+  LebesgueOrder Lebesgue;
+  LebesgueOrder LebesgueEvenOdd;
 
   // Comms buffer
   //  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
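The hunks above thread a LebesgueOrder argument (a cache-friendly site ordering) through every Dhop entry point and re-add the per-grid orderings as members. For orientation, a generic sketch of the idea behind a Lebesgue/Morton traversal, not Grid's LebesgueOrder implementation: interleave coordinate bits so that sites that are near in space stay near in the visit order.

    // Hedged illustration only: 2d Morton (Z-curve) key; sorting sites by this
    // key yields a locality-preserving traversal order.
    #include <cstdint>

    uint64_t morton2d(uint32_t x, uint32_t y)
    {
      uint64_t z = 0;
      for (int b = 0; b < 32; b++) {
        z |= (uint64_t)((x >> b) & 1) << (2*b);     // even bits from x
        z |= (uint64_t)((y >> b) & 1) << (2*b + 1); // odd bits from y
      }
      return z;
    }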
@@ -42,11 +42,11 @@ public:
 
 public:
   // Shift operator coefficients for red-black preconditioned Mobius EOFA
-  std::vector<Coeff_t> Mooee_shift;
-  std::vector<Coeff_t> MooeeInv_shift_lc;
-  std::vector<Coeff_t> MooeeInv_shift_norm;
-  std::vector<Coeff_t> MooeeInvDag_shift_lc;
-  std::vector<Coeff_t> MooeeInvDag_shift_norm;
+  Vector<Coeff_t> Mooee_shift;
+  Vector<Coeff_t> MooeeInv_shift_lc;
+  Vector<Coeff_t> MooeeInv_shift_norm;
+  Vector<Coeff_t> MooeeInvDag_shift_lc;
+  Vector<Coeff_t> MooeeInvDag_shift_norm;
 
   virtual void Instantiatable(void) {};
 

@@ -74,18 +74,18 @@ public:
   // Instantiate different versions depending on Impl
   /////////////////////////////////////////////////////
   void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
-           std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
+           Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
 
   void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
-                 std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
-                 std::vector<Coeff_t>& shift_coeffs);
+                 Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
+                 Vector<Coeff_t>& shift_coeffs);
 
   void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
-              std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
+              Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
 
   void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
-                    std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
-                    std::vector<Coeff_t>& shift_coeffs);
+                    Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
+                    Vector<Coeff_t>& shift_coeffs);
 
   virtual void RefreshShiftCoefficients(RealD new_shift);
 
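The *_shift variants above add a low-rank surface term, weighted by shift_coeffs, on top of the plain tridiagonal action; MooeeInv_shift_lc and MooeeInv_shift_norm hold the corresponding linear-combination and normalisation factors for the shifted inverse. A hedged scalar sketch, with the caveat that the real code selects the wall slice (s=0 or s=Ls-1) and a chiral projection depending on the sign parameter:

    // Illustrative only: base tridiagonal apply plus a shift coupling every s
    // to a single boundary slice of psi.
    #include <vector>
    #include <complex>
    using Field = std::vector<std::complex<double>>;

    void M5D_shift_sketch(const Field &psi, Field &chi,
                          const Field &lower, const Field &diag, const Field &upper,
                          const Field &shift_coeffs)
    {
      int Ls = psi.size();
      for (int s = 0; s < Ls; s++) {
        int sp = (s + 1) % Ls, sm = (s + Ls - 1) % Ls;
        chi[s] = diag[s]*psi[s] + upper[s]*psi[sp] + lower[s]*psi[sm]
               + shift_coeffs[s]*psi[Ls-1];  // shift term reaches one wall slice
      }
    }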
@@ -102,11 +102,11 @@ public:
                      GaugeField &mat,
                      const FermionField &A, const FermionField &B, int dag);
 
-  void DhopInternal(StencilImpl &st, DoubledGaugeField &U,
+  void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
                     const FermionField &in, FermionField &out, int dag);
-  void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U,
+  void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
                                const FermionField &in, FermionField &out, int dag);
-  void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U,
+  void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
                                    const FermionField &in, FermionField &out, int dag);
 
   //////////////////////////////////////////////////////////////////////////

@@ -152,6 +152,9 @@ public:
   DoubledGaugeField UmuEven;
   DoubledGaugeField UmuOdd;
 
+  LebesgueOrder Lebesgue;
+  LebesgueOrder LebesgueEvenOdd;
+
   ///////////////////////////////////////////////////////////////
   // Conserved current utilities
   ///////////////////////////////////////////////////////////////

@@ -42,7 +42,7 @@ public:
 
   void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
     this->MomentumSpacePropagatorHw(out,in,_m,twist);
   };
 
   // Constructors
   OverlapWilsonCayleyTanhFermion(GaugeField &_Umu,

@@ -41,10 +41,6 @@ public:
 public:
 
   // Constructors
-  virtual void Instantiatable(void){};
-  void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
-    this->MomentumSpacePropagatorHw(out,in,_m,twist);
-  };
 
   OverlapWilsonCayleyZolotarevFermion(GaugeField &_Umu,
                                       GridCartesian &FiveDimGrid,

@@ -41,9 +41,6 @@ public:
 public:
 
   virtual void Instantiatable(void){};
-  void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
-    this->MomentumSpacePropagatorHw(out,in,_m,twist);
-  };
   // Constructors
   OverlapWilsonContFracTanhFermion(GaugeField &_Umu,
                                    GridCartesian &FiveDimGrid,

@@ -40,9 +40,6 @@ public:
   INHERIT_IMPL_TYPES(Impl);
 
   virtual void Instantiatable(void){};
-  void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
-    this->MomentumSpacePropagatorHw(out,in,_m,twist);
-  };
   // Constructors
   OverlapWilsonContFracZolotarevFermion(GaugeField &_Umu,
                                         GridCartesian &FiveDimGrid,

@@ -41,9 +41,6 @@ public:
 public:
 
   virtual void Instantiatable(void){};
-  void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
-    this->MomentumSpacePropagatorHw(out,in,_m,twist);
-  };
   // Constructors
   OverlapWilsonPartialFractionTanhFermion(GaugeField &_Umu,
                                           GridCartesian &FiveDimGrid,

@@ -40,11 +40,6 @@ public:
   INHERIT_IMPL_TYPES(Impl);
 
   virtual void Instantiatable(void){};
-
-  void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
-    this->MomentumSpacePropagatorHw(out,in,_m,twist);
-  };
-
   // Constructors
   OverlapWilsonPartialFractionZolotarevFermion(GaugeField &_Umu,
                                                GridCartesian &FiveDimGrid,
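For reference only, the Hw momentum-space propagator that these thin per-class overrides forward to is the standard free propagator of the Wilson kernel; in the usual textbook form, with lattice momenta pbar_mu = sin p_mu and phat_mu = 2 sin(p_mu/2),

    S(p) = ( -i gamma_mu pbar_mu + M(p) ) / ( pbar^2 + M(p)^2 ),
    M(p) = m + (1/2) sum_mu phat_mu^2   (Wilson parameter r = 1).

This is stated for orientation; the Grid routine handles the overlap and domain-wall specifics internally.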
@@ -39,7 +39,7 @@ class PartialFractionFermion5D : public WilsonFermion5D<Impl>
 public:
   INHERIT_IMPL_TYPES(Impl);
 
-  const int part_frac_chroma_convention=0;
+  const int part_frac_chroma_convention=1;
 
   void   Meooe_internal(const FermionField &in, FermionField &out,int dag);
   void   Mooee_internal(const FermionField &in, FermionField &out,int dag);

@@ -83,78 +83,19 @@ public:
                            GridRedBlackCartesian &FourDimRedBlackGrid,
                            RealD _mass,RealD M5,const ImplParams &p= ImplParams());
 
-  PartialFractionFermion5D(GaugeField &_Umu,
-                           GridCartesian &FiveDimGrid,
-                           GridRedBlackCartesian &FiveDimRedBlackGrid,
-                           GridCartesian &FourDimGrid,
-                           GridRedBlackCartesian &FourDimRedBlackGrid,
-                           RealD _mass,RealD M5,std::vector<RealD> &_qmu,const ImplParams &p= ImplParams());
-
-  void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist)
-  {
-    std::cout << "Free Propagator for PartialFraction"<<std::endl;
-    FermionField in_k(in.Grid());
-    FermionField prop_k(in.Grid());
-
-    FFT theFFT((GridCartesian *) in.Grid());
-
-    //phase for boundary condition
-    ComplexField coor(in.Grid());
-    ComplexField ph(in.Grid());  ph = Zero();
-    FermionField in_buf(in.Grid()); in_buf = Zero();
-    typedef typename Simd::scalar_type Scalar;
-    Scalar ci(0.0,1.0);
-    assert(twist.size() == Nd);//check that twist is Nd
-    assert(boundary.size() == Nd);//check that boundary conditions is Nd
-    int shift = 0;
-    for(unsigned int nu = 0; nu < Nd; nu++)
-    {
-      // Shift coordinate lattice index by 1 to account for 5th dimension.
-      LatticeCoordinate(coor, nu + shift);
-      double boundary_phase = ::acos(real(boundary[nu]));
-      ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
-      //momenta for propagator shifted by twist+boundary
-      twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
-    }
-    in_buf = exp(ci*ph*(-1.0))*in;
-
-    theFFT.FFT_all_dim(in_k,in,FFT::forward);
-    if ( this->qmu.size() ){
-      this->MomentumSpacePropagatorHwQ(prop_k,in_k,mass,twist,this->qmu);
-    } else {
-      this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist);
-    }
-    theFFT.FFT_all_dim(out,prop_k,FFT::backward);
-
-    //phase for boundary condition
-    out = out * exp(ci*ph);
-  };
-
-  virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
-    std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions
-    std::vector<Complex> boundary;
-    for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions
-    FreePropagator(in,out,mass,boundary,twist);
-  };
-
-  void set_qmu(std::vector<RealD> _qmu) { qmu=_qmu; assert(qmu.size()==Nd);};
-  void addQmu(const FermionField &in, FermionField &out, int dag);
-
 protected:
 
   virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale);
   virtual void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata);
 
-  std::vector<RealD> qmu;
-
   // Part frac
   RealD mass;
   RealD dw_diag;
   RealD R;
   RealD amax;
   RealD scale;
-  std::vector<double> p;
-  std::vector<double> q;
+  Vector<double> p;
+  Vector<double> q;
 
 };
 
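The qmu machinery stripped in the hunk above implements an optional constant boost of the kernel: when a four-vector q_mu is set, the free propagator is built from the HwQ variant rather than the plain Hw one, and on the operator level the effect is

    chi -> chi + i sum_mu q_mu gamma_mu psi,

with the coefficients conjugated when the daggered operator is requested. This matches the addQmu body that appears verbatim in the CayleyFermion5D hunk further down this diff.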
@@ -35,7 +35,7 @@ template<class Matrix, class Field>
 class KappaSimilarityTransform {
 public:
   INHERIT_IMPL_TYPES(Matrix);
-  std::vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag;
+  Vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag;
 
   KappaSimilarityTransform (Matrix &zmob) {
     for (int i=0;i<(int)zmob.bs.size();i++) {

@@ -49,10 +49,10 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
 
 public:
 
-  void DhopImproved(StencilImpl &st,
+  void DhopImproved(StencilImpl &st, LebesgueOrder &lo,
                     DoubledGaugeField &U, DoubledGaugeField &UUU,
                     const FermionField &in, FermionField &out, int dag, int interior,int exterior);
-  void DhopNaive(StencilImpl &st,
+  void DhopNaive(StencilImpl &st, LebesgueOrder &lo,
                  DoubledGaugeField &U,
                  const FermionField &in, FermionField &out, int dag, int interior,int exterior);
 

@@ -47,7 +47,7 @@ public:
   static int PartialCompressionFactor(GridBase *grid) { return 1;}
 #endif
   template<class vobj,class cobj,class compressor>
-  static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
+  static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
                                    const Lattice<vobj> &rhs,
                                    cobj *buffer,
                                    compressor &compress,

@@ -109,7 +109,7 @@ public:
   // Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2.
   ////////////////////////////////////////////////////////////////////////////////////////////
   template<class vobj,class cobj,class compressor>
-  static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
+  static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
                                     std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
                                     compressor &compress,int type,int partial)
   {

@@ -197,7 +197,7 @@ public:
 #endif
 
   template<class vobj,class cobj,class compressor>
-  static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
+  static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
                                    const Lattice<vobj> &rhs,
                                    cobj *buffer,
                                    compressor &compress,

@@ -208,7 +208,7 @@ public:
     else FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
   }
   template<class vobj,class cobj,class compressor>
-  static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
+  static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
                                     std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
                                     compressor &compress,int type,int partial)
   {
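Both container types in the Gather_plane hunks hold the same thing, a precomputed list of (buffer slot, site offset) pairs describing one face of the halo; the diff only changes where that table lives (device-resident versus comms-oriented storage). A hedged, library-free sketch of how such a table drives a face gather:

    // Generic sketch of a table-driven plane gather, independent of Grid's types.
    #include <utility>
    #include <vector>

    template<class obj>
    void gather_plane_sketch(const std::vector<std::pair<int,int>> &table,
                             const obj *lattice, obj *buffer)
    {
      for (auto &e : table)
        buffer[e.first] = lattice[e.second]; // first: buffer slot, second: site index
    }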
@@ -402,6 +402,7 @@ public:
 
   typedef CartesianStencil<vobj,cobj,Parameters> Base;
   typedef typename Base::View_type View_type;
+  typedef typename Base::StencilVector StencilVector;
 
   //  Vector<int> surface_list;
   WilsonStencil(GridBase *grid,

@@ -414,6 +415,29 @@ public:
     //    surface_list.resize(0);
     this->same_node.resize(npoints);
   };
 
+  /*
+  void BuildSurfaceList(int Ls,int vol4){
+
+    // find same node for SHM
+    // Here we know the distance is 1 for WilsonStencil
+    for(int point=0;point<this->_npoints;point++){
+      this->same_node[point] = this->SameNode(point);
+    }
+
+    for(int site = 0 ;site< vol4;site++){
+      int local = 1;
+      for(int point=0;point<this->_npoints;point++){
+        if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){
+          local = 0;
+        }
+      }
+      if(local == 0) {
+        surface_list.push_back(site);
+      }
+    }
+  }
+  */
 
   template < class compressor>
   void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
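A note on the commented-out BuildSurfaceList reinstated above: it records the interior/surface split that overlapped communication needs. A site is pushed onto the surface list as soon as any stencil point reaches off-node without a shared-memory path, so the kernel can sweep the interior sites while the halo for surface sites is still in flight.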
@@ -484,11 +508,6 @@ public:
     this->face_table_computed=1;
     assert(this->u_comm_offset==this->_unified_buffer_size);
     accelerator_barrier();
-#ifdef NVLINK_GET
-    this->_grid->StencilBarrier(); // He can now get mu local gather, I can get his
-    // Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
-    // Or issue barrier AFTER the DMA is running
-#endif
   }
 
 };

@@ -126,17 +126,14 @@ public:
   void DerivInternal(StencilImpl &st, DoubledGaugeField &U, GaugeField &mat,
                      const FermionField &A, const FermionField &B, int dag);
 
-  void DhopInternal(StencilImpl &st,
-                    DoubledGaugeField &U,
+  void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
                     const FermionField &in, FermionField &out, int dag);
 
-  void DhopInternalSerial(StencilImpl &st,
-                          DoubledGaugeField &U,
-                          const FermionField &in, FermionField &out, int dag);
+  void DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+                          const FermionField &in, FermionField &out, int dag);
 
-  void DhopInternalOverlappedComms(StencilImpl &st,
-                                   DoubledGaugeField &U,
-                                   const FermionField &in, FermionField &out, int dag);
+  void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
+                                   const FermionField &in, FermionField &out, int dag);
 
   // Constructor
   WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,

@@ -171,6 +168,9 @@ public:
   DoubledGaugeField UmuEven;
   DoubledGaugeField UmuOdd;
 
+  LebesgueOrder Lebesgue;
+  LebesgueOrder LebesgueEvenOdd;
+
   WilsonAnisotropyCoefficients anisotropyCoeff;
 
   ///////////////////////////////////////////////////////////////

@@ -109,8 +109,6 @@ public:
   void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
   void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
   void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
-  void MomentumSpacePropagatorHwQ(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist,
-                                  std::vector<double> qmu) ;
 
   // Implement hopping term non-hermitian hopping term; half cb or both
   // Implement s-diagonal DW

@@ -119,9 +117,6 @@ public:
   void DhopOE(const FermionField &in, FermionField &out,int dag);
   void DhopEO(const FermionField &in, FermionField &out,int dag);
 
-  void DhopComms  (const FermionField &in, FermionField &out);
-  void DhopCalc   (const FermionField &in, FermionField &out,uint64_t *ids);
-
   // add a DhopComm
   // -- suboptimal interface will presently trigger multiple comms.
   void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);

@@ -140,18 +135,21 @@ public:
                     int dag);
 
   void DhopInternal(StencilImpl & st,
+                    LebesgueOrder &lo,
                     DoubledGaugeField &U,
                     const FermionField &in,
                     FermionField &out,
                     int dag);
 
   void DhopInternalOverlappedComms(StencilImpl & st,
+                                   LebesgueOrder &lo,
                                    DoubledGaugeField &U,
                                    const FermionField &in,
                                    FermionField &out,
                                    int dag);
 
   void DhopInternalSerialComms(StencilImpl & st,
+                               LebesgueOrder &lo,
                                DoubledGaugeField &U,
                                const FermionField &in,
                                FermionField &out,

@@ -205,6 +203,9 @@ public:
   DoubledGaugeField UmuEven;
   DoubledGaugeField UmuOdd;
 
+  LebesgueOrder Lebesgue;
+  LebesgueOrder LebesgueEvenOdd;
+
   // Comms buffer
   //  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
 

@@ -57,10 +57,6 @@ public:
                          int Ls, int Nsite, const FermionField &in, FermionField &out,
                          int interior=1,int exterior=1) ;
 
-  static void DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
-                         int Ls, int Nsite, const FermionField &in, FermionField &out,
-                         uint64_t *ids);
-
   static void DhopDagKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
                             int Ls, int Nsite, const FermionField &in, FermionField &out,
                             int interior=1,int exterior=1) ;

@@ -58,7 +58,7 @@ public:
   {
     //      RealD eps = 1.0;
     std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
-    std::vector<Coeff_t> zgamma(this->Ls);
+    Vector<Coeff_t> zgamma(this->Ls);
     for(int s=0;s<this->Ls;s++){
       zgamma[s] = gamma[s];
     }

@@ -48,8 +48,7 @@ CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
                          FourDimGrid,
                          FourDimRedBlackGrid,_M5,p),
   mass_plus(_mass), mass_minus(_mass)
 {
-  // qmu defaults to zero size;
 }
 
 ///////////////////////////////////////////////////////////////
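On the OverlappedComms variants that recur throughout these hunks: the interior/exterior flags split the stencil sweep so communication hides behind computation. A hedged sketch of the schedule, with all callables as illustrative stand-ins:

    #include <functional>

    // Sketch of the comms/compute overlap implied by the interior/exterior split.
    void dhop_overlapped_sketch(std::function<void()> start_halo,
                                std::function<void()> wait_halo,
                                std::function<void(bool)> kernel)
    {
      start_halo();   // post non-blocking halo exchange
      kernel(true);   // interior sites: need no remote neighbours
      wait_halo();    // halo now resident in the comms buffers
      kernel(false);  // exterior/surface sites consume the received halo
    }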
@@ -157,18 +156,18 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5D   (const FermionField &psi, FermionField &chi)
 {
   int Ls=this->Ls;
-  std::vector<Coeff_t> diag (Ls,1.0);
-  std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus;
-  std::vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass_plus;
+  Vector<Coeff_t> diag (Ls,1.0);
+  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus;
+  Vector<Coeff_t> lower(Ls,-1.0); lower[0]   =mass_plus;
   M5D(psi,chi,chi,lower,diag,upper);
 }
 template<class Impl>
 void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &Din)
 {
   int Ls=this->Ls;
-  std::vector<Coeff_t> diag = bs;
-  std::vector<Coeff_t> upper= cs;
-  std::vector<Coeff_t> lower= cs;
+  Vector<Coeff_t> diag = bs;
+  Vector<Coeff_t> upper= cs;
+  Vector<Coeff_t> lower= cs;
   upper[Ls-1]=-mass_minus*upper[Ls-1];
   lower[0]   =-mass_plus*lower[0];
   M5D(psi,psi,Din,lower,diag,upper);

@@ -177,9 +176,9 @@ void CayleyFermion5D<Impl>::Meooe5D    (const FermionField &psi, FermionField &D
 template<class Impl> void CayleyFermion5D<Impl>::Meo5D     (const FermionField &psi, FermionField &chi)
 {
   int Ls=this->Ls;
-  std::vector<Coeff_t> diag = beo;
-  std::vector<Coeff_t> upper(Ls);
-  std::vector<Coeff_t> lower(Ls);
+  Vector<Coeff_t> diag = beo;
+  Vector<Coeff_t> upper(Ls);
+  Vector<Coeff_t> lower(Ls);
   for(int i=0;i<Ls;i++) {
     upper[i]=-ceo[i];
     lower[i]=-ceo[i];

@@ -192,9 +191,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::Mooee       (const FermionField &psi, FermionField &chi)
 {
   int Ls=this->Ls;
-  std::vector<Coeff_t> diag = bee;
-  std::vector<Coeff_t> upper(Ls);
-  std::vector<Coeff_t> lower(Ls);
+  Vector<Coeff_t> diag = bee;
+  Vector<Coeff_t> upper(Ls);
+  Vector<Coeff_t> lower(Ls);
   for(int i=0;i<Ls;i++) {
     upper[i]=-cee[i];
     lower[i]=-cee[i];

@@ -207,9 +206,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::MooeeDag    (const FermionField &psi, FermionField &chi)
 {
   int Ls=this->Ls;
-  std::vector<Coeff_t> diag = bee;
-  std::vector<Coeff_t> upper(Ls);
-  std::vector<Coeff_t> lower(Ls);
+  Vector<Coeff_t> diag = bee;
+  Vector<Coeff_t> upper(Ls);
+  Vector<Coeff_t> lower(Ls);
 
   for (int s=0;s<Ls;s++){
     // Assemble the 5d matrix

@@ -237,9 +236,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
 {
   int Ls=this->Ls;
-  std::vector<Coeff_t> diag(Ls,1.0);
-  std::vector<Coeff_t> upper(Ls,-1.0);
-  std::vector<Coeff_t> lower(Ls,-1.0);
+  Vector<Coeff_t> diag(Ls,1.0);
+  Vector<Coeff_t> upper(Ls,-1.0);
+  Vector<Coeff_t> lower(Ls,-1.0);
   upper[Ls-1]=-mass_plus*upper[Ls-1];
   lower[0]   =-mass_minus*lower[0];
   M5Ddag(psi,chi,chi,lower,diag,upper);

@@ -249,9 +248,9 @@ template<class Impl>
 void CayleyFermion5D<Impl>::MeooeDag5D  (const FermionField &psi, FermionField &Din)
 {
   int Ls=this->Ls;
-  std::vector<Coeff_t> diag =bs;
-  std::vector<Coeff_t> upper=cs;
-  std::vector<Coeff_t> lower=cs;
+  Vector<Coeff_t> diag =bs;
+  Vector<Coeff_t> upper=cs;
+  Vector<Coeff_t> lower=cs;
 
   for (int s=0;s<Ls;s++){
     if ( s== 0 ) {
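These routines all funnel into the same tridiagonal M5D/M5Ddag core; only the coefficient fill differs. As a concrete check of the mass wrap-around, with Ls=3 the default M5D fill above (diag=1, upper=lower=-1, upper[Ls-1]=mass_minus, lower[0]=mass_plus) assembles, in s-space,

    [     1          -1      mass_plus ]
    [    -1           1         -1     ]
    [ mass_minus     -1          1     ]

with the mass entries sitting on the periodic wrap: row 0 reaches back to s=Ls-1 via lower[0], and row Ls-1 reaches forward to s=0 via upper[Ls-1].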
@@ -271,34 +270,6 @@ void CayleyFermion5D<Impl>::MeooeDag5D  (const FermionField &psi, FermionField
   M5Ddag(psi,psi,Din,lower,diag,upper);
 }
 
-template<class Impl>
-void CayleyFermion5D<Impl>::addQmu(const FermionField &psi,FermionField &chi, int dag)
-{
-  if ( qmu.size() ) {
-
-    Gamma::Algebra Gmu [] = {
-      Gamma::Algebra::GammaX,
-      Gamma::Algebra::GammaY,
-      Gamma::Algebra::GammaZ,
-      Gamma::Algebra::GammaT
-    };
-    std::vector<ComplexD> coeff(Nd);
-    ComplexD ci(0,1);
-
-    assert(qmu.size()==Nd);
-
-    for(int mu=0;mu<Nd;mu++){
-      coeff[mu] = ci*qmu[mu];
-      if ( dag ) coeff[mu] = conjugate(coeff[mu]);
-    }
-
-    chi = chi + Gamma(Gmu[0])*psi*coeff[0];
-    for(int mu=1;mu<Nd;mu++){
-      chi = chi + Gamma(Gmu[mu])*psi*coeff[mu];
-    }
-  }
-}
-
 template<class Impl>
 void CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
 {
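The removed addQmu above is the whole content of the optional boost: it accumulates

    chi -> chi + sum_mu (i q_mu) gamma_mu psi,

with each coefficient i q_mu conjugated when the daggered variant is requested. As the comment in the Mdag hunk below notes, this preserves gamma_5-hermiticity only for real q_mu.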
@@ -306,12 +277,8 @@ void CayleyFermion5D<Impl>::M    (const FermionField &psi, FermionField &chi)
 
   // Assemble Din
   Meooe5D(psi,Din);
 
-  this->DW(Din,chi,DaggerNo);
-
-  // add i q_mu gamma_mu here
-  addQmu(Din,chi,DaggerNo);
-
+  this->DW(Din,chi,DaggerNo);
   // ((b D_W + D_w hop terms +1) on s-diag
   axpby(chi,1.0,1.0,chi,psi);
 

@@ -328,9 +295,6 @@ void CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
   FermionField Din(psi.Grid());
   // Apply Dw
   this->DW(psi,Din,DaggerYes);
 
-  // add -i conj(q_mu) gamma_mu here ... if qmu is real, gammm_5 hermitian, otherwise not.
-  addQmu(psi,Din,DaggerYes);
-
   MeooeDag5D(Din,chi);
 

@@ -430,7 +394,7 @@ void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const
 template<class Impl>
 void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
 {
-  std::vector<Coeff_t> gamma(this->Ls);
+  Vector<Coeff_t> gamma(this->Ls);
   for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
   SetCoefficientsInternal(1.0,gamma,b,c);
 }

@@ -438,13 +402,13 @@ void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,Re
 template<class Impl>
 void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
 {
-  std::vector<Coeff_t> gamma(this->Ls);
+  Vector<Coeff_t> gamma(this->Ls);
   for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
   SetCoefficientsInternal(zolo_hi,gamma,b,c);
 }
 //Zolo
 template<class Impl>
-void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c)
+void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t> & gamma,RealD b,RealD c)
 {
   int Ls=this->Ls;
 

@@ -524,7 +488,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
   leem.resize(Ls);
   uee.resize(Ls);
   ueem.resize(Ls);
 
   for(int i=0;i<Ls;i++){
 
     dee[i] = bee[i];

@@ -565,18 +529,6 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
     dee[Ls-1] += delta_d;
   }
 
-  //////////////////////////////////////////
-  // Device buffers
-  //////////////////////////////////////////
-  d_diag.resize(Ls);
-  d_upper.resize(Ls);
-  d_lower.resize(Ls);
-
-  d_dee.resize(Ls);
-  d_lee.resize(Ls);
-  d_uee.resize(Ls);
-  d_leem.resize(Ls);
-  d_ueem.resize(Ls);
   //  int inv=1;
   //  this->MooeeInternalCompute(0,inv,MatpInv,MatmInv);
   //  this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag);
@@ -43,9 +43,9 @@ void
 CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
                            const FermionField &phi_i,
                            FermionField &chi_i,
-                           std::vector<Coeff_t> &lower,
-                           std::vector<Coeff_t> &diag,
-                           std::vector<Coeff_t> &upper)
+                           Vector<Coeff_t> &lower,
+                           Vector<Coeff_t> &diag,
+                           Vector<Coeff_t> &upper)
 {
 
   chi_i.Checkerboard()=psi_i.Checkerboard();

@@ -55,15 +55,11 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
   autoView(chi , chi_i,AcceleratorWrite);
   assert(phi.Checkerboard() == psi.Checkerboard());
 
-  int Ls =this->Ls;
-
-  acceleratorCopyToDevice(&diag[0] ,&this->d_diag[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&upper[0],&this->d_upper[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&lower[0],&this->d_lower[0],Ls*sizeof(Coeff_t));
-
-  auto pdiag = &d_diag[0];
-  auto pupper = &d_upper[0];
-  auto plower = &d_lower[0];
+  auto pdiag = &diag[0];
+  auto pupper = &upper[0];
+  auto plower = &lower[0];
+
+  int Ls =this->Ls;
 
   // 10 = 3 complex mult + 2 complex add
   // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)

@@ -86,9 +82,9 @@ void
 CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
                               const FermionField &phi_i,
                               FermionField &chi_i,
-                              std::vector<Coeff_t> &lower,
-                              std::vector<Coeff_t> &diag,
-                              std::vector<Coeff_t> &upper)
+                              Vector<Coeff_t> &lower,
+                              Vector<Coeff_t> &diag,
+                              Vector<Coeff_t> &upper)
 {
   chi_i.Checkerboard()=psi_i.Checkerboard();
   GridBase *grid=psi_i.Grid();

@@ -97,15 +93,11 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
   autoView(chi , chi_i,AcceleratorWrite);
   assert(phi.Checkerboard() == psi.Checkerboard());
 
-  int Ls=this->Ls;
-
-  acceleratorCopyToDevice(&diag[0] ,&this->d_diag[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&upper[0],&this->d_upper[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&lower[0],&this->d_lower[0],Ls*sizeof(Coeff_t));
-
-  auto pdiag = &d_diag[0];
-  auto pupper = &d_upper[0];
-  auto plower = &d_lower[0];
+  auto pdiag = &diag[0];
+  auto pupper = &upper[0];
+  auto plower = &lower[0];
+
+  int Ls=this->Ls;
 
   // Flops = 6.0*(Nc*Ns) *Ls*vol
   uint64_t nloop = grid->oSites();

@@ -134,17 +126,11 @@ CayleyFermion5D<Impl>::MooeeInv    (const FermionField &psi_i, FermionField &chi
 
   int Ls=this->Ls;
 
-  acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
-
-  auto plee  = & d_lee [0];
-  auto pdee  = & d_dee [0];
-  auto puee  = & d_uee [0];
-  auto pleem = & d_leem[0];
-  auto pueem = & d_ueem[0];
+  auto plee  = & lee [0];
+  auto pdee  = & dee [0];
+  auto puee  = & uee [0];
+  auto pleem = & leem[0];
+  auto pueem = & ueem[0];
 
   uint64_t nloop = grid->oSites()/Ls;
   accelerator_for(sss,nloop,Simd::Nsimd(),{
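On the MooeeInv coefficients being staged here: lee/dee/uee (with the leem/ueem rank-one tails) are the LDU factors of the even-even fifth-dimension matrix, and the kernel consumes them as a forward substitution, diagonal divide, and back substitution. A hedged scalar sketch of that solve pattern, omitting the leem/ueem tails and spinor projections:

    #include <vector>
    #include <complex>
    using Field = std::vector<std::complex<double>>;

    // Illustrative LDU solve: chi = U^{-1} D^{-1} L^{-1} psi.
    void ldu_solve_sketch(Field &chi, const Field &psi,
                          const Field &lee, const Field &dee, const Field &uee)
    {
      int Ls = psi.size();
      chi = psi;
      for (int s = 1; s < Ls; s++)    chi[s] -= lee[s-1]*chi[s-1]; // forward (L^{-1})
      for (int s = 0; s < Ls; s++)    chi[s] /= dee[s];            // diagonal (D^{-1})
      for (int s = Ls-2; s >= 0; s--) chi[s] -= uee[s]*chi[s+1];   // backward (U^{-1})
    }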
@@ -196,17 +182,11 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
   autoView(psi , psi_i,AcceleratorRead);
   autoView(chi , chi_i,AcceleratorWrite);
 
-  acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
-
-  auto plee  = & d_lee [0];
-  auto pdee  = & d_dee [0];
-  auto puee  = & d_uee [0];
-  auto pleem = & d_leem[0];
-  auto pueem = & d_ueem[0];
+  auto plee  = & lee [0];
+  auto pdee  = & dee [0];
+  auto puee  = & uee [0];
+  auto pleem = & leem[0];
+  auto pueem = & ueem[0];
 
   assert(psi.Checkerboard() == psi.Checkerboard());
 

@@ -1,5 +1,3 @@
-#if 0
-
 /*************************************************************************************
 
 Grid physics library, www.github.com/paboyle/Grid

@@ -820,5 +818,3 @@ CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,
 }
 
 NAMESPACE_END(Grid);
-
-#endif

@@ -42,13 +42,13 @@ template<class Impl>
 void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata)
 {
   // How to check Ls matches??
-  std::cout<<GridLogMessage << zdata->n  << " - n"<<std::endl;
-  std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
-  std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
-  std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
-  std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
+  //  std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
+  //  std::cout<<GridLogMessage << zdata->n << " - n"<<std::endl;
+  //  std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
+  //  std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
+  //  std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
+  //  std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
   int Ls = this->Ls;
-  std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
   assert(zdata->db==Ls);// Beta has Ls coeffs
 
   R=(1+this->mass)/(1-this->mass);
@@ -320,7 +320,7 @@ ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
   int Ls = this->Ls;
   conformable(solution5d.Grid(),this->FermionGrid());
   conformable(exported4d.Grid(),this->GaugeGrid());
-  ExtractSlice(exported4d, solution5d, Ls-1, 0);
+  ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
 }
 template<class Impl>
 void ContinuedFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)

@@ -330,7 +330,7 @@ ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
   conformable(input4d.Grid() ,this->GaugeGrid());
   FermionField tmp(this->FermionGrid());
   tmp=Zero();
-  InsertSlice(input4d, tmp, Ls-1, 0);
+  InsertSlice(input4d, tmp, Ls-1, Ls-1);
   tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
   this->Dminus(tmp,imported5d);
 }
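Context for the two hunks above: in the continued-fraction representation the physical four-dimensional field lives on a single s-slice of the 5d solution, so export takes that slice and import plants the gamma_5-rotated 4d source back into it before applying Dminus. The two sides of the diff agree on using the Ls-1 slice and differ only in the final slice/orthogonal-index argument passed through to Insert/ExtractSlice.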
@@ -41,7 +41,7 @@ NAMESPACE_BEGIN(Grid);
 // Pplus backwards..
 template<class Impl>
 void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i,
-  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
+  Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
 {
   chi_i.Checkerboard() = psi_i.Checkerboard();
   int Ls = this->Ls;
@@ -50,15 +50,9 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
   autoView( psi , psi_i, AcceleratorRead);
   autoView( chi , chi_i, AcceleratorWrite);
   assert(phi.Checkerboard() == psi.Checkerboard());

-  auto pdiag = &this->d_diag[0];
-  auto pupper = &this->d_upper[0];
-  auto plower = &this->d_lower[0];
-
-  acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
+  auto pdiag = &diag[0];
+  auto pupper = &upper[0];
+  auto plower = &lower[0];

   // Flops = 6.0*(Nc*Ns) *Ls*vol

   auto nloop=grid->oSites()/Ls;
@@ -79,7 +73,7 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi

 template<class Impl>
 void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i,
-  std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
+  Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
 {
   chi_i.Checkerboard() = psi_i.Checkerboard();
   GridBase* grid = psi_i.Grid();
@@ -89,14 +83,9 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
   autoView( phi , phi_i, AcceleratorRead);
   autoView( chi , chi_i, AcceleratorWrite);
   assert(phi.Checkerboard() == psi.Checkerboard());

-  auto pdiag = &this->d_diag[0];
-  auto pupper = &this->d_upper[0];
-  auto plower = &this->d_lower[0];
-
-  acceleratorCopyToDevice(&diag[0] ,&pdiag[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
+  auto pdiag = &diag[0];
+  auto pupper = &upper[0];
+  auto plower = &lower[0];

   // Flops = 6.0*(Nc*Ns) *Ls*vol

@@ -125,17 +114,12 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
   autoView( chi, chi_i, AcceleratorWrite);
   int Ls = this->Ls;

-  auto plee  = & this->d_lee [0];
-  auto pdee  = & this->d_dee [0];
-  auto puee  = & this->d_uee [0];
-  auto pleem = & this->d_leem[0];
-  auto pueem = & this->d_ueem[0];
-
-  acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
+  auto plee  = & this->lee[0];
+  auto pdee  = & this->dee[0];
+  auto puee  = & this->uee[0];
+  auto pleem = & this->leem[0];
+  auto pueem = & this->ueem[0];

   uint64_t nloop=grid->oSites()/Ls;
   accelerator_for(sss,nloop,Simd::Nsimd(),{
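The idiom removed on the feature/ft side of these hunks is explicit host-to-device staging: when the coefficients live in a plain std::vector, the base code copies them into preallocated device buffers (the d_diag/d_lee members) before the kernel dereferences them. A minimal sketch of that pattern, assuming d_diag is a device-resident member of size Ls as in the base code (error handling omitted):

    std::vector<Coeff_t> diag(Ls, 1.0);               // host-side coefficients
    auto pdiag = &this->d_diag[0];                    // pointer into the device buffer
    acceleratorCopyToDevice(&diag[0], &pdiag[0], Ls*sizeof(Coeff_t));
    accelerator_for(sss, nloop, Simd::Nsimd(), {
      // the kernel may now read pdiag[s]: it points at device memory
    });

The feature/ft side instead takes Grid's Vector<Coeff_t>, whose allocator is unified-memory aware, so the kernel can dereference the host-constructed array directly and the explicit copies disappear.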
@@ -131,9 +131,9 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi
     else{ shiftm = -shift*(mq3-mq2); }
   }

-  std::vector<Coeff_t> diag(Ls,1.0);
-  std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm;
-  std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftp;
+  Vector<Coeff_t> diag(Ls,1.0);
+  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm;
+  Vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftp;

 #if(0)
   std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl;
@@ -168,9 +168,9 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField&
     else{ shiftm = -shift*(mq3-mq2); }
   }

-  std::vector<Coeff_t> diag(Ls,1.0);
-  std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp;
-  std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftm;
+  Vector<Coeff_t> diag(Ls,1.0);
+  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp;
+  Vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftm;

   this->M5Ddag(psi, chi, chi, lower, diag, upper);
 }
@@ -181,9 +181,9 @@ void DomainWallEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& c
 {
   int Ls = this->Ls;

-  std::vector<Coeff_t> diag = this->bee;
-  std::vector<Coeff_t> upper(Ls);
-  std::vector<Coeff_t> lower(Ls);
+  Vector<Coeff_t> diag = this->bee;
+  Vector<Coeff_t> upper(Ls);
+  Vector<Coeff_t> lower(Ls);

   for(int s=0; s<Ls; s++){
     upper[s] = -this->cee[s];
@@ -200,9 +200,9 @@ void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField
 {
   int Ls = this->Ls;

-  std::vector<Coeff_t> diag = this->bee;
-  std::vector<Coeff_t> upper(Ls);
-  std::vector<Coeff_t> lower(Ls);
+  Vector<Coeff_t> diag = this->bee;
+  Vector<Coeff_t> upper(Ls);
+  Vector<Coeff_t> lower(Ls);

   for(int s=0; s<Ls; s++){
     upper[s] = -this->cee[s];
@@ -218,7 +218,7 @@ void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField

 //Zolo
 template<class Impl>
-void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c)
+void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c)
 {
   int Ls = this->Ls;
   int pm = this->pm;
@@ -61,6 +61,8 @@ ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian
   UUUmu(&FourDimGrid),
   UUUmuEven(&FourDimRedBlackGrid),
   UUUmuOdd(&FourDimRedBlackGrid),
+  Lebesgue(&FourDimGrid),
+  LebesgueEvenOdd(&FourDimRedBlackGrid),
   _tmp(&FiveDimRedBlackGrid)
 {

@@ -275,18 +277,18 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDerivOE(GaugeField &mat,

 /*CHANGE */
 template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st,
+void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
     DoubledGaugeField & U,DoubledGaugeField & UUU,
     const FermionField &in, FermionField &out,int dag)
 {
   if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
-    DhopInternalOverlappedComms(st,U,UUU,in,out,dag);
+    DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
   else
-    DhopInternalSerialComms(st,U,UUU,in,out,dag);
+    DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
 }

 template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
+void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
     DoubledGaugeField & U,DoubledGaugeField & UUU,
     const FermionField &in, FermionField &out,int dag)
 {
@@ -311,7 +313,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
   {
     int interior=1;
     int exterior=0;
-    Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
+    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
   }

   st.CommsMerge(compressor);
@@ -321,12 +323,12 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
   {
     int interior=0;
     int exterior=1;
-    Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
+    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
   }
 }

 template<class Impl>
-void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
+void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
     DoubledGaugeField & U,DoubledGaugeField & UUU,
     const FermionField &in, FermionField &out,int dag)
 {
@@ -339,7 +341,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
   {
     int interior=1;
     int exterior=1;
-    Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
+    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
   }
 }
 /*CHANGE END*/
@@ -355,7 +357,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie
   assert(in.Checkerboard()==Even);
   out.Checkerboard() = Odd;

-  DhopInternal(StencilEven,UmuOdd,UUUmuOdd,in,out,dag);
+  DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,UUUmuOdd,in,out,dag);
 }
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
@@ -366,7 +368,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie
   assert(in.Checkerboard()==Odd);
   out.Checkerboard() = Even;

-  DhopInternal(StencilOdd,UmuEven,UUUmuEven,in,out,dag);
+  DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,UUUmuEven,in,out,dag);
 }
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
@@ -376,7 +378,7 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField

   out.Checkerboard() = in.Checkerboard();

-  DhopInternal(Stencil,Umu,UUUmu,in,out,dag);
+  DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
 }

 /////////////////////////////////////////////////////////////////////////
@@ -48,6 +48,8 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, G
   StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even
   StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd
   mass(_mass),
+  Lebesgue(_grid),
+  LebesgueEvenOdd(_cbgrid),
   Umu(&Fgrid),
   UmuEven(&Hgrid),
   UmuOdd(&Hgrid),
@@ -337,7 +339,7 @@ void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &

   out.Checkerboard() = in.Checkerboard();

-  DhopInternal(Stencil, Umu, UUUmu, in, out, dag);
+  DhopInternal(Stencil, Lebesgue, Umu, UUUmu, in, out, dag);
 }

 template <class Impl>
@@ -349,7 +351,7 @@ void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField
   assert(in.Checkerboard() == Even);
   out.Checkerboard() = Odd;

-  DhopInternal(StencilEven, UmuOdd, UUUmuOdd, in, out, dag);
+  DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, UUUmuOdd, in, out, dag);
 }

 template <class Impl>
@@ -361,7 +363,7 @@ void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField
   assert(in.Checkerboard() == Odd);
   out.Checkerboard() = Even;

-  DhopInternal(StencilOdd, UmuEven, UUUmuEven, in, out, dag);
+  DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, UUUmuEven, in, out, dag);
 }

 template <class Impl>
@@ -392,19 +394,19 @@ void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionFiel


 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st,
+void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
     DoubledGaugeField &U,
     DoubledGaugeField &UUU,
     const FermionField &in,
     FermionField &out, int dag)
 {
   if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
-    DhopInternalOverlappedComms(st,U,UUU,in,out,dag);
+    DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
   else
-    DhopInternalSerialComms(st,U,UUU,in,out,dag);
+    DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
 }
 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,
+void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
     DoubledGaugeField &U,
     DoubledGaugeField &UUU,
     const FermionField &in,
@@ -427,7 +429,7 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
   {
     int interior=1;
     int exterior=0;
-    Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
+    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
   }

   st.CommunicateComplete(requests);
@@ -438,13 +440,13 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
   {
     int interior=0;
     int exterior=1;
-    Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
+    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
   }
 }


 template <class Impl>
-void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st,
+void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
     DoubledGaugeField &U,
     DoubledGaugeField &UUU,
     const FermionField &in,
@@ -458,7 +460,7 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st,
   {
     int interior=1;
     int exterior=1;
-    Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
+    Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
   }
 };

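Across all of the staggered hunks the change is mechanical: a LebesgueOrder reference is reintroduced between the stencil and the gauge-field arguments at every level of the Dhop call chain, with the ordering objects constructed next to the stencils. A sketch of the call shape on the feature/ft side (condensed from the hunks above, not new API):

    // members, built alongside the stencils in the constructor:
    //   LebesgueOrder Lebesgue;         // full grid
    //   LebesgueOrder LebesgueEvenOdd;  // checkerboarded grid
    void Dhop  (const FermionField &in, FermionField &out, int dag) {
      DhopInternal(Stencil,     Lebesgue,        Umu,    UUUmu,    in, out, dag);
    }
    void DhopOE(const FermionField &in, FermionField &out, int dag) {
      DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, UUUmuOdd, in, out, dag);
    }
    // DhopInternal then forwards (st, lo, U, UUU, ...) unchanged to Kernels::DhopImproved.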
@@ -39,7 +39,7 @@ NAMESPACE_BEGIN(Grid);

 template<class Impl>
 void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
-  std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
+  Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper)
 {
   chi_i.Checkerboard() = psi_i.Checkerboard();
   GridBase *grid = psi_i.Grid();
@@ -50,14 +50,10 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField

   assert(phi.Checkerboard() == psi.Checkerboard());

-  auto pdiag = &this->d_diag[0];
-  auto pupper = &this->d_upper[0];
-  auto plower = &this->d_lower[0];
-
-  acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
+  auto pdiag = &diag[0];
+  auto pupper = &upper[0];
+  auto plower = &lower[0];

   // Flops = 6.0*(Nc*Ns) *Ls*vol
   int nloop = grid->oSites()/Ls;
   accelerator_for(sss,nloop,Simd::Nsimd(),{
@@ -78,8 +74,8 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField

 template<class Impl>
 void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
-  std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
-  std::vector<Coeff_t> &shift_coeffs)
+  Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper,
+  Vector<Coeff_t> &shift_coeffs)
 {
   chi_i.Checkerboard() = psi_i.Checkerboard();
   GridBase *grid = psi_i.Grid();
@@ -90,18 +86,13 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion

   auto pm = this->pm;
   int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator

   assert(phi.Checkerboard() == psi.Checkerboard());

-  auto pdiag = &this->d_diag[0];
-  auto pupper = &this->d_upper[0];
-  auto plower = &this->d_lower[0];
-  auto pshift_coeffs = &this->d_shift_coefficients[0];
-
-  acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&shift_coeffs[0],&pshift_coeffs[0],Ls*sizeof(Coeff_t));
+  auto pdiag = &diag[0];
+  auto pupper = &upper[0];
+  auto plower = &lower[0];
+  auto pshift_coeffs = &shift_coeffs[0];

   // Flops = 6.0*(Nc*Ns) *Ls*vol
   int nloop = grid->oSites()/Ls;
@@ -128,7 +119,7 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion

 template<class Impl>
 void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
-  std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
+  Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper)
 {
   chi_i.Checkerboard() = psi_i.Checkerboard();
   GridBase *grid = psi_i.Grid();
@@ -138,14 +129,10 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
   autoView(chi , chi_i, AcceleratorWrite);

   assert(phi.Checkerboard() == psi.Checkerboard());

-  auto pdiag = &this->d_diag[0];
-  auto pupper = &this->d_upper[0];
-  auto plower = &this->d_lower[0];
-
-  acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
+  auto pdiag = &diag[0];
+  auto pupper = &upper[0];
+  auto plower = &lower[0];

   // Flops = 6.0*(Nc*Ns) *Ls*vol
   int nloop = grid->oSites()/Ls;
@@ -167,8 +154,8 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie

 template<class Impl>
 void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
-  std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
-  std::vector<Coeff_t> &shift_coeffs)
+  Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper,
+  Vector<Coeff_t> &shift_coeffs)
 {
   chi_i.Checkerboard() = psi_i.Checkerboard();
   GridBase *grid = psi_i.Grid();
@@ -180,16 +167,11 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm

   assert(phi.Checkerboard() == psi.Checkerboard());

-  auto pdiag = &this->d_diag[0];
-  auto pupper = &this->d_upper[0];
-  auto plower = &this->d_lower[0];
-  auto pshift_coeffs = &this->d_shift_coefficients[0];
-
-  acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&shift_coeffs[0],&pshift_coeffs[0],Ls*sizeof(Coeff_t));
+  auto pdiag = &diag[0];
+  auto pupper = &upper[0];
+  auto plower = &lower[0];
+  auto pshift_coeffs = &shift_coeffs[0];

   // Flops = 6.0*(Nc*Ns) *Ls*vol
   auto pm = this->pm;

@@ -230,17 +212,11 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
   autoView(psi , psi_i, AcceleratorRead);
   autoView(chi , chi_i, AcceleratorWrite);

-  auto plee = & this->d_lee [0];
-  auto pdee = & this->d_dee [0];
-  auto puee = & this->d_uee [0];
-  auto pleem = & this->d_leem[0];
-  auto pueem = & this->d_ueem[0];
-
-  acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
+  auto plee = & this->lee [0];
+  auto pdee = & this->dee [0];
+  auto puee = & this->uee [0];
+  auto pleem= & this->leem[0];
+  auto pueem= & this->ueem[0];

   if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; }

@@ -292,23 +268,14 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
   autoView(psi , psi_i, AcceleratorRead);
   autoView(chi , chi_i, AcceleratorWrite);

-  // Move into object and constructor
   auto pm = this->pm;
-  auto plee = & this->d_lee [0];
-  auto pdee = & this->d_dee [0];
-  auto puee = & this->d_uee [0];
-  auto pleem = & this->d_leem[0];
-  auto pueem = & this->d_ueem[0];
-  auto pMooeeInv_shift_lc = &this->d_MooeeInv_shift_lc[0];
-  auto pMooeeInv_shift_norm = &this->d_MooeeInv_shift_norm[0];
-
-  acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&MooeeInv_shift_lc[0],&pMooeeInv_shift_lc[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&MooeeInv_shift_norm[0],&pMooeeInv_shift_norm[0],Ls*sizeof(Coeff_t));
+  auto plee = & this->lee [0];
+  auto pdee = & this->dee [0];
+  auto puee = & this->uee [0];
+  auto pleem= & this->leem[0];
+  auto pueem= & this->ueem[0];
+  auto pMooeeInv_shift_lc = &MooeeInv_shift_lc[0];
+  auto pMooeeInv_shift_norm = &MooeeInv_shift_norm[0];

   int nloop = grid->oSites()/Ls;
   accelerator_for(sss,nloop,Simd::Nsimd(),{
@@ -366,17 +333,11 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
   autoView(psi , psi_i, AcceleratorRead);
   autoView(chi , chi_i, AcceleratorWrite);

-  auto plee = &this->d_lee [0];
-  auto pdee = &this->d_dee [0];
-  auto puee = &this->d_uee [0];
-  auto pleem = &this->d_leem[0];
-  auto pueem = &this->d_ueem[0];
-
-  acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
+  auto plee = & this->lee [0];
+  auto pdee = & this->dee [0];
+  auto puee = & this->uee [0];
+  auto pleem= & this->leem[0];
+  auto pueem= & this->ueem[0];

   int nloop = grid->oSites()/Ls;
   accelerator_for(sss,nloop,Simd::Nsimd(),{
@@ -426,25 +387,13 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
   int Ls = this->Ls;

   auto pm = this->pm;
-  auto plee = & this->d_lee [0];
-  auto pdee = & this->d_dee [0];
-  auto puee = & this->d_uee [0];
-  auto pleem = & this->d_leem[0];
-  auto pueem = & this->d_ueem[0];
-  auto pMooeeInvDag_shift_lc = &this->d_MooeeInv_shift_lc[0];
-  auto pMooeeInvDag_shift_norm = &this->d_MooeeInv_shift_norm[0];
-
-  acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&MooeeInvDag_shift_lc[0],&pMooeeInvDag_shift_lc[0],Ls*sizeof(Coeff_t));
-  acceleratorCopyToDevice(&MooeeInvDag_shift_norm[0],&pMooeeInvDag_shift_norm[0],Ls*sizeof(Coeff_t));
-
-  // auto pMooeeInvDag_shift_lc = &MooeeInvDag_shift_lc[0];
-  // auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0];
+  auto plee = & this->lee [0];
+  auto pdee = & this->dee [0];
+  auto puee = & this->uee [0];
+  auto pleem= & this->leem[0];
+  auto pueem= & this->ueem[0];
+  auto pMooeeInvDag_shift_lc = &MooeeInvDag_shift_lc[0];
+  auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0];

   int nloop = grid->oSites()/Ls;
   accelerator_for(sss,nloop,Simd::Nsimd(),{
@@ -196,9 +196,9 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
 {
   int Ls = this->Ls;

-  std::vector<Coeff_t> diag(Ls,1.0);
-  std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
-  std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;
+  Vector<Coeff_t> diag(Ls,1.0);
+  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
+  Vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;

   // no shift term
   if(this->shift == 0.0){ this->M5D(psi, chi, chi, lower, diag, upper); }
@@ -212,9 +212,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
 {
   int Ls = this->Ls;

-  std::vector<Coeff_t> diag(Ls,1.0);
-  std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
-  std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;
+  Vector<Coeff_t> diag(Ls,1.0);
+  Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
+  Vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;

   // no shift term
   if(this->shift == 0.0){ this->M5Ddag(psi, chi, chi, lower, diag, upper); }
@@ -230,9 +230,9 @@ void MobiusEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
   int Ls = this->Ls;

   // coefficients of Mooee
-  std::vector<Coeff_t> diag = this->bee;
-  std::vector<Coeff_t> upper(Ls);
-  std::vector<Coeff_t> lower(Ls);
+  Vector<Coeff_t> diag = this->bee;
+  Vector<Coeff_t> upper(Ls);
+  Vector<Coeff_t> lower(Ls);
   for(int s=0; s<Ls; s++){
     upper[s] = -this->cee[s];
     lower[s] = -this->cee[s];
@@ -253,9 +253,9 @@ void MobiusEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& ch
   int Ls = this->Ls;

   // coefficients of MooeeDag
-  std::vector<Coeff_t> diag = this->bee;
-  std::vector<Coeff_t> upper(Ls);
-  std::vector<Coeff_t> lower(Ls);
+  Vector<Coeff_t> diag = this->bee;
+  Vector<Coeff_t> upper(Ls);
+  Vector<Coeff_t> lower(Ls);
   for(int s=0; s<Ls; s++){
     if(s==0) {
       upper[s] = -this->cee[s+1];
@@ -314,10 +314,10 @@ void MobiusEOFAFermion<Impl>::SetCoefficientsPrecondShiftOps()
   // Tridiagonal solve for MooeeInvDag_shift_lc
   {
     Coeff_t m(0.0);
-    std::vector<Coeff_t> d = Mooee_shift;
-    std::vector<Coeff_t> u(Ls,0.0);
-    std::vector<Coeff_t> y(Ls,0.0);
-    std::vector<Coeff_t> q(Ls,0.0);
+    Vector<Coeff_t> d = Mooee_shift;
+    Vector<Coeff_t> u(Ls,0.0);
+    Vector<Coeff_t> y(Ls,0.0);
+    Vector<Coeff_t> q(Ls,0.0);
     if(pm == 1){ u[0] = 1.0; }
     else{ u[Ls-1] = 1.0; }

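The coefficient vectors above fully determine the one-flavour M5D stencil in s. With diag = 1, upper = lower = -1, and the two boundary entries replaced by mq1, the coefficient matrix (illustrated for Ls = 4; in the usual domain-wall convention the off-diagonals act through the chiral projectors P_\pm, a convention assumption rather than something spelled out in this diff) is

    M5D ~ \begin{pmatrix}
            1      & -1 & 0  & m_{q1} \\
            -1     & 1  & -1 & 0      \\
            0      & -1 & 1  & -1     \\
            m_{q1} & 0  & -1 & 1
          \end{pmatrix}

i.e. a tridiagonal matrix whose periodic wrap-around entries carry the quark mass.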
@@ -48,6 +48,8 @@ NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GridCartesian &Fgrid, GridRed
   StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even
   StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd
   mass(_mass),
+  Lebesgue(_grid),
+  LebesgueEvenOdd(_cbgrid),
   Umu(&Fgrid),
   UmuEven(&Hgrid),
   UmuOdd(&Hgrid),
@@ -266,7 +268,7 @@ void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out

   out.Checkerboard() = in.Checkerboard();

-  DhopInternal(Stencil, Umu, in, out, dag);
+  DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
 }

 template <class Impl>
@@ -278,7 +280,7 @@ void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &o
   assert(in.Checkerboard() == Even);
   out.Checkerboard() = Odd;

-  DhopInternal(StencilEven, UmuOdd, in, out, dag);
+  DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
 }

 template <class Impl>
@@ -290,7 +292,7 @@ void NaiveStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &o
   assert(in.Checkerboard() == Odd);
   out.Checkerboard() = Even;

-  DhopInternal(StencilOdd, UmuEven, in, out, dag);
+  DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
 }

 template <class Impl>
@@ -321,18 +323,18 @@ void NaiveStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &


 template <class Impl>
-void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st,
+void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
     DoubledGaugeField &U,
     const FermionField &in,
     FermionField &out, int dag)
 {
   if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
-    DhopInternalOverlappedComms(st,U,in,out,dag);
+    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
   else
-    DhopInternalSerialComms(st,U,in,out,dag);
+    DhopInternalSerialComms(st,lo,U,in,out,dag);
 }
 template <class Impl>
-void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,
+void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
     DoubledGaugeField &U,
     const FermionField &in,
     FermionField &out, int dag)
@@ -354,7 +356,7 @@ void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,
   {
     int interior=1;
     int exterior=0;
-    Kernels::DhopNaive(st,U,in,out,dag,interior,exterior);
+    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
   }

   st.CommunicateComplete(requests);
@@ -365,12 +367,12 @@ void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,
   {
     int interior=0;
     int exterior=1;
-    Kernels::DhopNaive(st,U,in,out,dag,interior,exterior);
+    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
   }
 }

 template <class Impl>
-void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st,
+void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
     DoubledGaugeField &U,
     const FermionField &in,
     FermionField &out, int dag)
@@ -383,7 +385,7 @@ void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st,
   {
     int interior=1;
     int exterior=1;
-    Kernels::DhopNaive(st,U,in,out,dag,interior,exterior);
+    Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
   }
 };

|
|||||||
// ( 0 -sqrt(p_i)*amax | 2 R gamma_5 + p0/amax 2H
|
// ( 0 -sqrt(p_i)*amax | 2 R gamma_5 + p0/amax 2H
|
||||||
//
|
//
|
||||||
|
|
||||||
this->DW(psi,D,DaggerNo);
|
this->DW(psi,D,DaggerNo);
|
||||||
|
|
||||||
// DW - DW+iqslash
|
|
||||||
// (g5 Dw)^dag = g5 Dw
|
|
||||||
// (iqmu g5 gmu)^dag = (-i qmu gmu^dag g5^dag) = i qmu g5 gmu
|
|
||||||
if ( qmu.size() ) {
|
|
||||||
|
|
||||||
std::cout<< "Mat" << "qmu ("<<qmu[0]<<","<<qmu[1]<<","<<qmu[2]<<","<<qmu[3]<<")"<<std::endl;
|
|
||||||
assert(qmu.size()==Nd);
|
|
||||||
|
|
||||||
FermionField qslash_psi(psi.Grid());
|
|
||||||
|
|
||||||
Gamma::Algebra Gmu [] = {
|
|
||||||
Gamma::Algebra::GammaX,
|
|
||||||
Gamma::Algebra::GammaY,
|
|
||||||
Gamma::Algebra::GammaZ,
|
|
||||||
Gamma::Algebra::GammaT
|
|
||||||
};
|
|
||||||
qslash_psi = qmu[0]*(Gamma(Gmu[0])*psi);
|
|
||||||
for(int mu=1;mu<Nd;mu++){
|
|
||||||
qslash_psi = qslash_psi + qmu[mu]*(Gamma(Gmu[mu])*psi);
|
|
||||||
}
|
|
||||||
ComplexD ci(0.0,1.0);
|
|
||||||
qslash_psi = ci*qslash_psi ; // i qslash
|
|
||||||
D = D + qslash_psi;
|
|
||||||
}
|
|
||||||
|
|
||||||
int nblock=(Ls-1)/2;
|
int nblock=(Ls-1)/2;
|
||||||
for(int b=0;b<nblock;b++){
|
for(int b=0;b<nblock;b++){
|
||||||
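The block deleted above added a constant background i q.gamma term to the Wilson operator, DW -> DW + i qslash. Reassembled from the deletion as a self-contained fragment (Grid types; qmu a std::vector<RealD> of length Nd = 4):

    FermionField qslash_psi(psi.Grid());
    Gamma::Algebra Gmu [] = { Gamma::Algebra::GammaX, Gamma::Algebra::GammaY,
                              Gamma::Algebra::GammaZ, Gamma::Algebra::GammaT };
    qslash_psi = qmu[0]*(Gamma(Gmu[0])*psi);            // mu = 0 term
    for(int mu=1;mu<Nd;mu++){                           // accumulate the remaining directions
      qslash_psi = qslash_psi + qmu[mu]*(Gamma(Gmu[mu])*psi);
    }
    ComplexD ci(0.0,1.0);
    qslash_psi = ci*qslash_psi;                         // i qslash
    D = D + qslash_psi;                                 // DW -> DW + i qslash

As the deleted comment notes, (g5 Dw)^dag = g5 Dw and (i qmu g5 gmu)^dag = i qmu g5 gmu, so the modified operator remains gamma5-hermitian.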
@@ -280,55 +255,15 @@ void PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, Fermi
   }

   {
-    // The 'conventional' Cayley overlap operator is
-    //
-    // Dov = (1+m)/2 + (1-m)/2 g5 sgn Hw
-    //
-    //
-    // With massless limit 1/2(1+g5 sgnHw)
-    //
-    // Luscher shows quite neatly that 1+g5 sgn Hw has tree level propagator i qslash +O(a^2)
-    //
-    // However, the conventional normalisation has both a leading order factor of 2 in Zq
-    // at tree level AND a mass dependent (1-m) that are convenient to absorb.
-    //
-    // In WilsonFermion5DImplementation.h, the tree level propagator for Hw is
-    //
-    // num = -i sin kmu gmu
-    //
-    // denom ( sqrt(sk^2 + (2shk^2 - 1)^2
-    // b_k = sk2 - M5;
-    //
-    // w_k = sqrt(sk + b_k*b_k);
-    //
-    // denom= ( w_k + b_k + mass*mass) ;
-    //
-    // denom= one/denom;
-    // out = num*denom;
-    //
-    // Chroma, and Grid define partial fraction via 4d operator
-    //
-    // Dpf = 2/(1-m) x Dov = (1+m)/(1-m) + g5 sgn Hw
-    //
-    // Now since:
-    //
-    // (1+m)/(1-m) = (1-m)/(1-m) + 2m/(1-m) = 1 + 2m/(1-m)
-    //
-    // This corresponds to a modified mass parameter
-    //
-    // It has an annoying
-    //
-    //
     double R=(1+this->mass)/(1-this->mass);
-    //R g5 psi[Ls] + p[0] Hw
+    //R g5 psi[Ls] + p[0] H
     ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale/amax,D,Ls-1,Ls-1);

     for(int b=0;b<nblock;b++){
       int s = 2*b+1;
       double pp = p[nblock-1-b];
       axpby_ssp(chi,1.0,chi,-sqrt(amax*pp)*scale*sign,psi,Ls-1,s);
     }

   }

 }
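The long comment deleted in this hunk recorded the normalisation relating the conventional overlap operator to the Chroma/Grid partial-fraction convention. In compact form it reads

    D_{ov} = \frac{1+m}{2} + \frac{1-m}{2}\,\gamma_5\,\mathrm{sgn}\,H_w ,
    \qquad
    D_{pf} = \frac{2}{1-m}\,D_{ov} = \frac{1+m}{1-m} + \gamma_5\,\mathrm{sgn}\,H_w ,
    \qquad
    \frac{1+m}{1-m} = 1 + \frac{2m}{1-m} ,

which is exactly the R = (1+m)/(1-m) factor kept in the code above. The massless limit of D_ov is (1 + gamma_5 sgn H_w)/2, whose tree-level propagator is i qslash + O(a^2) (Luscher), which is why the factor of 2 in Zq and the (1-m) are absorbed into the partial-fraction normalisation.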
@@ -476,18 +411,17 @@ void PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,App
   int Ls = this->Ls;
   conformable(solution5d.Grid(),this->FermionGrid());
   conformable(exported4d.Grid(),this->GaugeGrid());
-  ExtractSlice(exported4d, solution5d, Ls-1, 0);
+  ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
 }
 template<class Impl>
 void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
 {
-  //void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog)
   int Ls = this->Ls;
   conformable(imported5d.Grid(),this->FermionGrid());
   conformable(input4d.Grid() ,this->GaugeGrid());
   FermionField tmp(this->FermionGrid());
   tmp=Zero();
-  InsertSlice(input4d, tmp, Ls-1, 0);
+  InsertSlice(input4d, tmp, Ls-1, Ls-1);
   tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
   this->Dminus(tmp,imported5d);
 }
@@ -508,7 +442,7 @@ PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,

 {
   int Ls = this->Ls;
-  qmu.resize(0);
   assert((Ls&0x1)==1); // Odd Ls required
   int nrational=Ls-1;

@@ -526,22 +460,6 @@ PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
   Approx::zolotarev_free(zdata);

 }
-template<class Impl>
-PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
-    GridCartesian &FiveDimGrid,
-    GridRedBlackCartesian &FiveDimRedBlackGrid,
-    GridCartesian &FourDimGrid,
-    GridRedBlackCartesian &FourDimRedBlackGrid,
-    RealD _mass,RealD M5,
-    std::vector<RealD> &_qmu,
-    const ImplParams &p)
-  : PartialFractionFermion5D<Impl>(_Umu,
-      FiveDimGrid,FiveDimRedBlackGrid,
-      FourDimGrid,FourDimRedBlackGrid,
-      _mass,M5,p)
-{
-  qmu=_qmu;
-}

 NAMESPACE_END(Grid);

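The overload removed in the last hunk above was a C++11 delegating constructor: it forwarded every grid/mass argument to the primary constructor and then simply stored the background momentum, so its whole body reduced to qmu=_qmu; after the delegation. A minimal self-contained sketch of the idiom (hypothetical names, not Grid API):

    #include <vector>
    #include <utility>
    struct PF {
      std::vector<double> qmu;
      explicit PF(double mass) { /* primary ctor: full setup happens here */ }
      PF(double mass, std::vector<double> q)
        : PF(mass)                 // delegate to the primary constructor first
      { qmu = std::move(q); }      // then record the extra state
    };

Delegation guarantees the heavyweight setup is written once; the removal here simply drops the qmu-carrying entry point along with the qmu machinery deleted earlier in the diff.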
@@ -375,6 +375,23 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
     }
   }

+/*
+#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \
+  template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
+    DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
+    SiteSpinor *buf, int LLs, int sU, \
+    const FermionFieldView &in, FermionFieldView &out, int dag); \
+  \
+  template void StaggeredKernels<IMPL>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \
+    DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
+    SiteSpinor *buf, int LLs, int sU, \
+    const FermionFieldView &in, FermionFieldView &out, int dag); \
+  \
+  template void StaggeredKernels<IMPL>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \
+    DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
+    SiteSpinor *buf, int LLs, int sU, \
+    const FermionFieldView &in, FermionFieldView &out, int dag); \
+*/
 #undef LOAD_CHI
 #undef HAND_DECLARATIONS

@@ -256,7 +256,7 @@ void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldVie
   });

 template <class Impl>
-void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st,
+void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
     DoubledGaugeField &U, DoubledGaugeField &UUU,
     const FermionField &in, FermionField &out, int dag, int interior,int exterior)
 {
@@ -294,7 +294,7 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st,
   assert(0 && " Kernel optimisation case not covered ");
 }
 template <class Impl>
-void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st,
+void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
     DoubledGaugeField &U,
     const FermionField &in, FermionField &out, int dag, int interior,int exterior)
 {
@@ -58,9 +58,15 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
   Umu(_FourDimGrid),
   UmuEven(_FourDimRedBlackGrid),
   UmuOdd (_FourDimRedBlackGrid),
+  Lebesgue(_FourDimGrid),
+  LebesgueEvenOdd(_FourDimRedBlackGrid),
   _tmp(&FiveDimRedBlackGrid),
   Dirichlet(0)
 {
+  Stencil.lo = &Lebesgue;
+  StencilEven.lo = &LebesgueEvenOdd;
+  StencilOdd.lo = &LebesgueEvenOdd;
+
   // some assertions
   assert(FiveDimGrid._ndimension==5);
   assert(FourDimGrid._ndimension==4);
@@ -299,19 +305,19 @@ void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
 }

 template<class Impl>
-void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st,
+void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
     DoubledGaugeField & U,
     const FermionField &in, FermionField &out,int dag)
 {
   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
-    DhopInternalOverlappedComms(st,U,in,out,dag);
+    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
   else
-    DhopInternalSerialComms(st,U,in,out,dag);
+    DhopInternalSerialComms(st,lo,U,in,out,dag);
 }


 template<class Impl>
-void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
+void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
     DoubledGaugeField & U,
     const FermionField &in, FermionField &out,int dag)
 {
@@ -325,22 +331,22 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
   // Start comms  // Gather intranode and extra node differentiated??
   /////////////////////////////
   {
-    // std::cout << " WilsonFermion5D gather " <<std::endl;
     GRID_TRACE("Gather");
     st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
   }

-  // std::cout << " WilsonFermion5D Communicate Begin " <<std::endl;
   std::vector<std::vector<CommsRequest_t> > requests;
+  auto id=traceStart("Communicate overlapped");
+  st.CommunicateBegin(requests);

-#if 1
   /////////////////////////////
   // Overlap with comms
   /////////////////////////////
-  st.CommunicateBegin(requests);
-  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
-#endif
+  {
+    GRID_TRACE("MergeSHM");
+    st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
+  }

   /////////////////////////////
   // do the compute interior
   /////////////////////////////
@@ -352,35 +358,22 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
     GRID_TRACE("DhopInterior");
     Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
   }

-  //ifdef GRID_ACCELERATED
-#if 0
-  /////////////////////////////
-  // Overlap with comms -- on GPU the interior kernel call is nonblocking
-  /////////////////////////////
-  st.CommunicateBegin(requests);
-  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
-#endif
-
   /////////////////////////////
   // Complete comms
   /////////////////////////////
-  // std::cout << " WilsonFermion5D Comms Complete " <<std::endl;
   st.CommunicateComplete(requests);
-  // traceStop(id);
+  traceStop(id);

   /////////////////////////////
   // do the compute exterior
   /////////////////////////////
   {
-    // std::cout << " WilsonFermion5D Comms Merge " <<std::endl;
     GRID_TRACE("Merge");
     st.CommsMerge(compressor);
   }

-  // std::cout << " WilsonFermion5D Exterior " <<std::endl;
   if (dag == DaggerYes) {
     GRID_TRACE("DhopDagExterior");
     Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
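Pulled out of the hunk above, the comms/compute overlap on the feature/ft side has the following skeleton (a sketch; the tracing braces and the dagger branch are elided, and every call shown is taken from the diff itself):

    st.HaloExchangeOptGather(in,compressor);          // gather send buffers (barrier inside)
    std::vector<std::vector<CommsRequest_t> > requests;
    auto id = traceStart("Communicate overlapped");
    st.CommunicateBegin(requests);                    // post asynchronous transfers
    st.CommsMergeSHM(compressor);                     // merge intranode (shared-memory) faces now
    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0); // interior sites only
    st.CommunicateComplete(requests);                 // wait for off-node faces
    traceStop(id);
    st.CommsMerge(compressor);                        // fold received halo into comm buffers
    Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1); // exterior sites only

The interior kernel runs while the halo is in flight; only the exterior pass depends on completed communication, which is the whole point of the restructuring in this hunk.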
@@ -388,12 +381,11 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
     GRID_TRACE("DhopExterior");
     Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
   }
-  // std::cout << " WilsonFermion5D Done " <<std::endl;
 }


 template<class Impl>
-void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
+void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
     DoubledGaugeField & U,
     const FermionField &in,
     FermionField &out,int dag)
@@ -403,13 +395,11 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,

   int LLs = in.Grid()->_rdimensions[0];

-  // std::cout << " WilsonFermion5D Halo exch " <<std::endl;
   {
     GRID_TRACE("HaloExchange");
     st.HaloExchangeOpt(in,compressor);
   }

-  // std::cout << " WilsonFermion5D Dhop " <<std::endl;
   int Opt = WilsonKernelsStatic::Opt;
   if (dag == DaggerYes) {
     GRID_TRACE("DhopDag");
@@ -418,7 +408,6 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
     GRID_TRACE("Dhop");
     Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
   }
-  // std::cout << " WilsonFermion5D Done " <<std::endl;
 }


@@ -431,7 +420,7 @@ void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int
   assert(in.Checkerboard()==Even);
   out.Checkerboard() = Odd;

-  DhopInternal(StencilEven,UmuOdd,in,out,dag);
+  DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,in,out,dag);
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
@@ -442,31 +431,8 @@ void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int
   assert(in.Checkerboard()==Odd);
   out.Checkerboard() = Even;

-  DhopInternal(StencilOdd,UmuEven,in,out,dag);
+  DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,in,out,dag);
 }
-template<class Impl>
-void WilsonFermion5D<Impl>::DhopComms(const FermionField &in, FermionField &out)
-{
-  int dag =0 ;
-  conformable(in.Grid(),FermionGrid()); // verifies full grid
-  conformable(in.Grid(),out.Grid());
-  out.Checkerboard() = in.Checkerboard();
-  Compressor compressor(dag);
-  Stencil.HaloExchangeOpt(in,compressor);
-}
-template<class Impl>
-void WilsonFermion5D<Impl>::DhopCalc(const FermionField &in, FermionField &out,uint64_t *ids)
-{
-  conformable(in.Grid(),FermionGrid()); // verifies full grid
-  conformable(in.Grid(),out.Grid());
-
-  out.Checkerboard() = in.Checkerboard();
-
-  int LLs = in.Grid()->_rdimensions[0];
-  int Opt = WilsonKernelsStatic::Opt;
-  Kernels::DhopKernel(Opt,Stencil,Umu,Stencil.CommBuf(),LLs,Umu.oSites(),in,out,ids);
-}
-
 template<class Impl>
 void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
 {
@@ -475,7 +441,7 @@ void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int d

   out.Checkerboard() = in.Checkerboard();

-  DhopInternal(Stencil,Umu,in,out,dag);
+  DhopInternal(Stencil,Lebesgue,Umu,in,out,dag);
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag)
@@ -769,15 +735,6 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe

 template<class Impl>
 void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist)
-{
-  std::vector<double> empty_q(Nd,0.0);
-  MomentumSpacePropagatorHwQ(out,in,mass,twist,empty_q);
-}
-template<class Impl>
-void WilsonFermion5D<Impl>::MomentumSpacePropagatorHwQ(FermionField &out,const FermionField &in,
-    RealD mass,
-    std::vector<double> twist,
-    std::vector<double> qmu)
 {
   Gamma::Algebra Gmu [] = {
     Gamma::Algebra::GammaX,
@@ -793,7 +750,6 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHwQ(FermionField &out,const F
   typedef typename FermionField::scalar_type ScalComplex;

   typedef Lattice<iSinglet<vector_type> > LatComplex;
-  typedef iSpinMatrix<ScalComplex> SpinMat;


   Coordinate latt_size = _grid->_fdimensions;
@@ -811,10 +767,8 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHwQ(FermionField &out,const F
   LatComplex kmu(_grid);
   ScalComplex ci(0.0,1.0);

-  std::cout<< "Feynman Rule" << "qmu ("<<qmu[0]<<","<<qmu[1]<<","<<qmu[2]<<","<<qmu[3]<<")"<<std::endl;
-
   for(int mu=0;mu<Nd;mu++) {

     LatticeCoordinate(kmu,mu);

     RealD TwoPiL = M_PI * 2.0/ latt_size[mu];
@@ -823,18 +777,9 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHwQ(FermionField &out,const F
|
|||||||
kmu = kmu + TwoPiL * one * twist[mu];//momentum for twisted boundary conditions
|
kmu = kmu + TwoPiL * one * twist[mu];//momentum for twisted boundary conditions
|
||||||
|
|
||||||
sk2 = sk2 + 2.0*sin(kmu*0.5)*sin(kmu*0.5);
|
sk2 = sk2 + 2.0*sin(kmu*0.5)*sin(kmu*0.5);
|
||||||
|
sk = sk + sin(kmu)*sin(kmu);
|
||||||
|
|
||||||
sk = sk + (sin(kmu)+qmu[mu])*(sin(kmu)+qmu[mu]);
|
num = num - sin(kmu)*ci*(Gamma(Gmu[mu])*in);
|
||||||
|
|
||||||
// Terms for boosted Fermion
|
|
||||||
// 1/2 [ -i gamma.(sin p + q ) ]
|
|
||||||
// [ --------------------- + 1 ]
|
|
||||||
// [ wq + b ]
|
|
||||||
//
|
|
||||||
// wq = sqrt( (sinp+q)^2 + b^2 )
|
|
||||||
//
|
|
||||||
|
|
||||||
num = num - (sin(kmu)+qmu[mu])*ci*(Gamma(Gmu[mu])*in);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
num = num + mass * in ;
|
num = num + mass * in ;
|
||||||
|
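For reference, the sk/sk2/num lines in the last hunk assemble the standard free momentum-space Wilson kernel, and the change simply drops the qmu boost. Restating the loop in formulas (a restatement of the code, not new material): with \(\bar k_\mu=\sin k_\mu\) and \(\hat k_\mu = 2\sin(k_\mu/2)\), the loop accumulates

\[ \mathrm{sk}=\sum_\mu \bar k_\mu^{\,2},\qquad \mathrm{sk2}=\sum_\mu \tfrac12\,\hat k_\mu^{\,2}=\sum_\mu 2\sin^2(k_\mu/2),\qquad \mathrm{num}=-\,i\sum_\mu \bar k_\mu\,\gamma_\mu\,\psi + m\,\psi, \]

i.e. the numerator of the free Wilson propagator evaluated at the twist-shifted lattice momenta.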
@@ -52,12 +52,17 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
       StencilEven(&Hgrid, npoint, Even, directions,displacements,p),  // source is Even
       StencilOdd(&Hgrid, npoint, Odd, directions,displacements,p),  // source is Odd
       mass(_mass),
+      Lebesgue(_grid),
+      LebesgueEvenOdd(_cbgrid),
       Umu(&Fgrid),
       UmuEven(&Hgrid),
       UmuOdd(&Hgrid),
       _tmp(&Hgrid),
       anisotropyCoeff(anis)
 {
+  Stencil.lo = &Lebesgue;
+  StencilEven.lo = &LebesgueEvenOdd;
+  StencilOdd.lo = &LebesgueEvenOdd;
   // Allocate the required comms buffer
   ImportGauge(_Umu);
   if (anisotropyCoeff.isAnisotropic){
@@ -309,7 +314,7 @@ void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int da

   out.Checkerboard() = in.Checkerboard();

-  DhopInternal(Stencil, Umu, in, out, dag);
+  DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
 }

 template <class Impl>
@@ -321,7 +326,7 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int
   assert(in.Checkerboard() == Even);
   out.Checkerboard() = Odd;

-  DhopInternal(StencilEven, UmuOdd, in, out, dag);
+  DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
 }

 template <class Impl>
@@ -333,7 +338,7 @@ void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int d
   assert(in.Checkerboard() == Odd);
   out.Checkerboard() = Even;

-  DhopInternal(StencilOdd, UmuEven, in, out, dag);
+  DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
 }

 template <class Impl>
@@ -386,21 +391,21 @@ void WilsonFermion<Impl>::DhopDirCalc(const FermionField &in, FermionField &out,
 };

 template <class Impl>
-void WilsonFermion<Impl>::DhopInternal(StencilImpl &st,
+void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
                                        DoubledGaugeField &U,
                                        const FermionField &in,
                                        FermionField &out, int dag)
 {
 #ifdef GRID_OMP
   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
-    DhopInternalOverlappedComms(st,U,in,out,dag);
+    DhopInternalOverlappedComms(st,lo,U,in,out,dag);
   else
 #endif
-    DhopInternalSerial(st,U,in,out,dag);
+    DhopInternalSerial(st,lo,U,in,out,dag);
 }

 template <class Impl>
-void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,
+void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
                                                       DoubledGaugeField &U,
                                                       const FermionField &in,
                                                       FermionField &out, int dag)
@@ -469,10 +474,10 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,


 template <class Impl>
-void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st,
+void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
                                              DoubledGaugeField &U,
                                              const FermionField &in,
                                              FermionField &out, int dag)
 {
   GRID_TRACE("DhopSerial");
   assert((dag == DaggerNo) || (dag == DaggerYes));
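The LebesgueOrder members restored in the constructor above (and threaded through the WilsonFermion5D entry points earlier) choose the order in which the serial kernels visit lattice sites; the ASM_CALL hunk below consumes them through st.lo->Reorder(sss). Lebesgue order is a space-filling (Morton/Z-) curve, so spatially neighbouring sites stay close in memory and cache reuse improves. A minimal self-contained sketch of the idea for a 2-d power-of-two grid; this is illustrative only, not Grid's LebesgueOrder implementation:

#include <cstdint>
#include <cstdio>

// Interleave the bits of (x,y) to get a Morton (Z-curve) index. Visiting
// sites in increasing Morton order keeps spatial neighbours close in
// memory, which is the point of a Lebesgue-style site ordering.
static uint32_t morton2d(uint16_t x, uint16_t y) {
  uint32_t z = 0;
  for (int b = 0; b < 16; b++) {
    z |= ((uint32_t)(x >> b) & 1u) << (2 * b);
    z |= ((uint32_t)(y >> b) & 1u) << (2 * b + 1);
  }
  return z;
}

int main() {
  // 4x4 grid: print the visit order of each site under the Z-curve.
  for (uint16_t y = 0; y < 4; y++) {
    for (uint16_t x = 0; x < 4; x++) printf("%2u ", morton2d(x, y));
    printf("\n");
  }
  return 0;
}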
@@ -40,11 +40,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 /// Switch off the 5d vectorised code optimisations
 #undef DWFVEC5D

-static std::vector<vComplexF> signsF;
+static Vector<vComplexF> signsF;

 template<typename vtype>
-int setupSigns(std::vector<vtype>& signs ){
-  std::vector<vtype> bother(2);
+int setupSigns(Vector<vtype>& signs ){
+  Vector<vtype> bother(2);
   signs = bother;
   vrsign(signs[0]);
   visign(signs[1]);
@@ -364,7 +364,7 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilView &st, Doubled

 #include <simd/Intel512double.h>

-static std::vector<vComplexD> signsD;
+static Vector<vComplexD> signsD;
 static int signInitD = setupSigns(signsD);

 #define MAYBEPERM(A,perm) if (perm) { A ; }
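The std::vector to Vector substitutions in this hunk (and in the representation headers further down) swap in Grid's allocator-aware container so that the sign tables live in memory the accelerator can dereference. A rough sketch of the pattern, under the assumption that Vector is std::vector with a custom allocator; the allocator below is a stand-in, not Grid's actual one:

#include <cstddef>
#include <new>
#include <vector>

// Minimal stand-in for an accelerator-visible allocator: in Grid this would
// call a unified-memory/aligned malloc; plain aligned new suffices to show
// the shape of the type.
template <class T>
struct AcceleratorAllocatorSketch {
  using value_type = T;
  AcceleratorAllocatorSketch() = default;
  template <class U> AcceleratorAllocatorSketch(const AcceleratorAllocatorSketch<U>&) {}
  T* allocate(std::size_t n) {
    return static_cast<T*>(::operator new(n * sizeof(T), std::align_val_t(64)));
  }
  void deallocate(T* p, std::size_t n) {
    ::operator delete(p, n * sizeof(T), std::align_val_t(64));
  }
};
template <class T, class U>
bool operator==(const AcceleratorAllocatorSketch<T>&, const AcceleratorAllocatorSketch<U>&) { return true; }
template <class T, class U>
bool operator!=(const AcceleratorAllocatorSketch<T>&, const AcceleratorAllocatorSketch<U>&) { return false; }

// The substitution in the diff: same std::vector interface, different
// memory placement, so code like setupSigns() compiles unchanged.
template <class T>
using Vector = std::vector<T, AcceleratorAllocatorSketch<T>>;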
@@ -411,46 +411,6 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
 #undef LoopBody
 }

-#ifdef GRID_SYCL
-extern "C" {
-  ulong SYCL_EXTERNAL __attribute__((overloadable)) intel_get_cycle_counter( void );
-  uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_active_channel_mask( void );
-  uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_grf_register( uint reg );
-  uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_flag_register( uint flag );
-  uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_control_register( uint reg );
-  uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_hw_thread_id( void );
-  uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_slice_id( void );
-  uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_subslice_id( void );
-  uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_eu_id( void );
-  uint  SYCL_EXTERNAL __attribute__((overloadable)) intel_get_eu_thread_id( void );
-  void  SYCL_EXTERNAL __attribute__((overloadable)) intel_eu_thread_pause( uint value );
-}
-#ifdef GRID_SIMT
-#define MAKE_ID(A) (intel_get_eu_id()<<16)|(intel_get_slice_id()<<8)|(intel_get_subslice_id())
-#else
-#define MAKE_ID(A) (0)
-#endif
-
-#else
-
-#define MAKE_ID(A) (0)
-
-#endif
-
-
-#define KERNEL_CALL_ID(A)                                       \
-  const uint64_t    NN = Nsite*Ls;                              \
-  accelerator_forNB( ss, NN, Simd::Nsimd(), {                   \
-      int sF = ss;                                              \
-      int sU = ss/Ls;                                           \
-      WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);    \
-      const int Nsimd = SiteHalfSpinor::Nsimd();                \
-      const int lane=acceleratorSIMTlane(Nsimd);                \
-      int idx=sF*Nsimd+lane;                                    \
-      uint64_t id = MAKE_ID();                                  \
-      ids[idx]=id;                                              \
-    });                                                         \
-  accelerator_barrier();
-
 #define KERNEL_CALLNB(A)                                        \
   const uint64_t    NN = Nsite*Ls;                              \
@@ -458,7 +418,7 @@ extern "C" {
       int sF = ss;                                              \
       int sU = ss/Ls;                                           \
       WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);    \
   });

 #define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();

@@ -474,7 +434,7 @@ extern "C" {

 #define ASM_CALL(A)                                             \
   thread_for( sss, Nsite, {                                     \
-    int ss = sss; /*st.lo->Reorder(sss);*/                      \
+    int ss = st.lo->Reorder(sss);                               \
     int sU = ss;                                                \
     int sF = ss*Ls;                                             \
     WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \
@@ -491,8 +451,6 @@ extern "C" {
       WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \
   });}

-
-
 template <class Impl>
 void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
                                      int Ls, int Nsite, const FermionField &in, FermionField &out,
@@ -504,7 +462,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
   autoView(st_v , st,AcceleratorRead);

   if( interior && exterior ) {
-    // acceleratorFenceComputeStream();
+    acceleratorFenceComputeStream();
     if (Opt == WilsonKernelsStatic::OptGeneric   ) { KERNEL_CALL(GenericDhopSite); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll) { KERNEL_CALL(HandDhopSite);    return;}
 #ifndef GRID_CUDA
@@ -517,7 +475,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
     if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
 #endif
   } else if( exterior ) {
-    // // dependent on result of merge
+    // dependent on result of merge
     acceleratorFenceComputeStream();
     if (Opt == WilsonKernelsStatic::OptGeneric   ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;}
     if (Opt == WilsonKernelsStatic::OptHandUnroll) { KERNEL_CALL_EXT(HandDhopSiteExt);    return;}
@@ -527,18 +485,6 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField
   }
   assert(0 && " Kernel optimisation case not covered ");
 }
-
-template <class Impl>
-void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
-                                     int Ls, int Nsite, const FermionField &in, FermionField &out,
-                                     uint64_t *ids)
-{
-  autoView(U_v  ,   U,AcceleratorRead);
-  autoView(in_v ,  in,AcceleratorRead);
-  autoView(out_v, out,AcceleratorWrite);
-  autoView(st_v ,  st,AcceleratorRead);
-  KERNEL_CALL_ID(GenericDhopSite);
-}
 template <class Impl>
 void WilsonKernels<Impl>::DhopDagKernel(int Opt,StencilImpl &st,  DoubledGaugeField &U, SiteHalfSpinor * buf,
                                         int Ls, int Nsite, const FermionField &in, FermionField &out,
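The deleted block above was one-off instrumentation: KERNEL_CALL_ID ran the generic Dhop kernel while recording, per site and SIMD lane, a packed id of the Intel Xe execution unit that processed it, using the intel_get_* SYCL builtins. For reference, a small sketch of how an id packed by the removed MAKE_ID macro would be unpacked; the struct and function names are illustrative:

#include <cstdint>

// Field layout from the removed MAKE_ID macro:
//   bits [16..) EU id, bits [8..16) slice id, bits [0..8) subslice id.
struct XeThreadId {
  uint32_t eu;
  uint32_t slice;
  uint32_t subslice;
};

inline XeThreadId decodeMakeId(uint64_t id) {
  XeThreadId t;
  t.eu       = static_cast<uint32_t>(id >> 16);
  t.slice    = static_cast<uint32_t>((id >> 8) & 0xff);
  t.subslice = static_cast<uint32_t>(id & 0xff);
  return t;
}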
@@ -40,11 +40,6 @@ public:

   INHERIT_GIMPL_TYPES(Gimpl);

-  using Action<GaugeField>::S;
-  using Action<GaugeField>::Sinitial;
-  using Action<GaugeField>::deriv;
-  using Action<GaugeField>::refresh;
-
 private:
   RealD c_plaq;
   RealD c_rect;
@@ -43,11 +43,6 @@ class WilsonGaugeAction : public Action<typename Gimpl::GaugeField> {
 public:
   INHERIT_GIMPL_TYPES(Gimpl);

-  using Action<GaugeField>::S;
-  using Action<GaugeField>::Sinitial;
-  using Action<GaugeField>::deriv;
-  using Action<GaugeField>::refresh;
-
   /////////////////////////// constructors
   explicit WilsonGaugeAction(RealD beta_):beta(beta_){};
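The using Action<GaugeField>::S declarations deleted from both action headers are the usual C++ idiom for defeating name hiding: an overload declared in a derived class hides every base-class overload of the same name unless it is re-exposed. Presumably they are no longer needed after this change. A generic illustration of the rule (plain C++, not Grid code):

struct Base {
  void S(int)    {}
  void S(double) {}
};

struct Derived : Base {
  // Without this using-declaration, S(const char*) hides BOTH Base::S
  // overloads; with it, all three are visible through Derived.
  using Base::S;
  void S(const char*) {}
};

void demo(Derived& d) {
  d.S("ok");  // Derived::S(const char*)
  d.S(1);     // Base::S(int): only found because of `using Base::S;`
}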
@@ -40,7 +40,7 @@ public:
     U = Zero();
     LatticeColourMatrix tmp(Uin.Grid());

-    std::vector<typename SU<ncolour>::Matrix> ta(Dimension);
+    Vector<typename SU<ncolour>::Matrix> ta(Dimension);

     // Debug lines
     // LatticeMatrix uno(Uin.Grid());
@@ -43,7 +43,7 @@ public:
     U = Zero();
     LatticeColourMatrix tmp(Uin.Grid());

-    std::vector<typename GaugeGroup<ncolour,group_name>::Matrix> eij(Dimension);
+    Vector<typename GaugeGroup<ncolour,group_name>::Matrix> eij(Dimension);

     for (int a = 0; a < Dimension; a++)
       GaugeGroupTwoIndex<ncolour, S, group_name>::base(a, eij[a]);
@@ -32,7 +32,9 @@ private:
   // Smear_Stout<Gimpl> *StoutSmearing;
   // std::vector<GaugeField> SmearedSet;

+  GridRedBlackCartesian * UrbGrid; // keep a copy of the redblack grid for life of object
   std::vector<LatticeLorentzComplex> masks;
+  std::vector<int> cbs;

   typedef typename SU3Adjoint::AMatrix AdjMatrix;
   typedef typename SU3Adjoint::LatticeAdjMatrix AdjMatrixField;
@@ -147,6 +149,25 @@ private:
     }
     pokeLorentz(Fdet, Fdet_pol, nu);
   }
+
+  void Compute_MpInvJx_dNxxdSy(int cb,
+                               const GaugeLinkField &PlaqL,
+                               const GaugeLinkField &PlaqR,
+                               AdjMatrixField MpInvJx,
+                               AdjVectorField &Fdet2 )
+  {
+    GaugeLinkField PlaqLeo(UrbGrid);
+    GaugeLinkField PlaqReo(UrbGrid);
+    AdjMatrixField MpInvJxeo(UrbGrid);
+    AdjVectorField Fdet2eo(UrbGrid);
+    pickCheckerboard(cb,PlaqLeo,PlaqL);
+    pickCheckerboard(cb,PlaqReo,PlaqR);
+    pickCheckerboard(cb,MpInvJxeo,MpInvJx);
+    Fdet2eo.Checkerboard()=cb;
+    Compute_MpInvJx_dNxxdSy(PlaqLeo,PlaqReo,MpInvJxeo,Fdet2eo);
+    setCheckerboard(Fdet2,Fdet2eo);
+  }
+
   void Compute_MpInvJx_dNxxdSy(const GaugeLinkField &PlaqL,const GaugeLinkField &PlaqR, AdjMatrixField MpInvJx,AdjVectorField &Fdet2 )
   {
     GaugeLinkField UtaU(PlaqL.Grid());
@@ -278,8 +299,9 @@ public:
     ////////////////////////////////////////////////////////////////////////////////
     // Mask the gauge field
     ////////////////////////////////////////////////////////////////////////////////
+    int cb = cbs[smr];
     auto mask=PeekIndex<LorentzIndex>(masks[smr],mu); // the cb mask

     Umsk = U;
     ApplyMask(Umsk,smr);
     Utmp = peekLorentz(Umsk,mu);
@@ -442,7 +464,7 @@ public:
     AdjMatrixField MpInvJx_nu(grid);
     MpInvJx = (-1.0)*MpAdInv * JxAd; // rho is on the plaq factor

-    Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV);
+    Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx,FdetV);
     Fdet2_mu=FdetV;
     Fdet1_mu=Zero();

@@ -499,7 +521,7 @@ public:

     time=-usecond();
     PlaqR=(-1.0)*PlaqR;
-    Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx,FdetV);
+    Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx,FdetV);
     Fdet2_nu = FdetV;
     time+=usecond();
     std::cout << GridLogMessage << "Compute_MpInvJx_dNxxSy (occurs 6x) took "<<time<< " us"<<std::endl;
@@ -520,7 +542,7 @@ public:


     MpInvJx_nu = Cshift(MpInvJx,mu,-1);
-    Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
+    Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
     Fdet2_nu = Fdet2_nu+FdetV;

     ///////////////// -ve nu /////////////////
@@ -539,7 +561,7 @@ public:
     Fdet1_nu = Fdet1_nu + transpose(Nxy)*dJdXe_nMpInv_y;

     MpInvJx_nu = Cshift(MpInvJx,nu,1);
-    Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
+    Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
     Fdet2_nu = Fdet2_nu+FdetV;

     // x==
@@ -560,7 +582,7 @@ public:

     MpInvJx_nu = Cshift(MpInvJx,mu,-1);
     MpInvJx_nu = Cshift(MpInvJx_nu,nu,1);
-    Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
+    Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
     Fdet2_nu = Fdet2_nu+FdetV;

     /////////////////////////////////////////////////////////////////////
@@ -589,7 +611,7 @@ public:

     MpInvJx_nu = Cshift(MpInvJx,nu,-1);

-    Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
+    Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
     Fdet2_mu = Fdet2_mu+FdetV;

     // __
@@ -609,7 +631,7 @@ public:

     MpInvJx_nu = Cshift(MpInvJx,nu,1);

-    Compute_MpInvJx_dNxxdSy(PlaqL,PlaqR,MpInvJx_nu,FdetV);
+    Compute_MpInvJx_dNxxdSy(cb,PlaqL,PlaqR,MpInvJx_nu,FdetV);
     Fdet2_mu = Fdet2_mu+FdetV;

   }
@@ -931,6 +953,10 @@ private:
 public:

   /* Standard constructor */
+  virtual ~SmearedConfigurationMasked()
+  {
+    delete UrbGrid;
+  }
   SmearedConfigurationMasked(GridCartesian* _UGrid, unsigned int Nsmear, Smear_Stout<Gimpl>& Stout)
     : SmearedConfiguration<Gimpl>(_UGrid, Nsmear,Stout)
   {
@@ -939,7 +965,6 @@ public:
     // was resized in base class
     assert(this->SmearedSet.size()==Nsmear);

-    GridRedBlackCartesian * UrbGrid;
     UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(_UGrid);
     LatticeComplex one(_UGrid); one = ComplexD(1.0,0.0);
     LatticeComplex tmp(_UGrid);
@@ -947,10 +972,11 @@ public:
     for (unsigned int i = 0; i < this->smearingLevels; ++i) {

       masks.push_back(*(new LatticeLorentzComplex(_UGrid)));

       int mu= (i/2) %Nd;
       int cb= (i%2);
       LatticeComplex tmpcb(UrbGrid);

+      cbs.push_back(cb);
       masks[i]=Zero();
       ////////////////////
@@ -962,7 +988,6 @@ public:
       PokeIndex<LorentzIndex>(masks[i],tmp, mu);

     }
-    delete UrbGrid;
   }

   virtual void smeared_force(GaugeField &SigmaTilde)
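The new Compute_MpInvJx_dNxxdSy overload added in this file is the usual Grid even/odd reduction: each smearing level touches only one parity (the cbs[smr] recorded in the constructor), so the expensive inner computation can run on half the sites and be scattered back into the full field. A schematic of the pick/compute/set pattern, assuming Grid's pickCheckerboard/setCheckerboard API; applyOnParity is a made-up name:

#include <Grid/Grid.h>
using namespace Grid;

// Schematic of the pattern used by the new overload above. Field and
// HalfGridOp are placeholders for any checkerboardable lattice field and
// any operation defined on the half grid.
template <class Field, class HalfGridOp>
void applyOnParity(int cb, GridRedBlackCartesian* rbGrid,
                   const Field& full_in, Field& full_out, HalfGridOp op)
{
  Field half_in(rbGrid), half_out(rbGrid);
  pickCheckerboard(cb, half_in, full_in);  // gather the sites of parity cb
  half_out.Checkerboard() = cb;
  op(half_in, half_out);                   // do the work on half the volume
  setCheckerboard(full_out, half_out);     // scatter the result back
}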
File diff suppressed because it is too large.
Some files were not shown because too many files have changed in this diff.