mirror of
https://github.com/paboyle/Grid.git
synced 2025-10-24 09:44:47 +01:00
Compare commits
219 Commits
feature/ft
...
feature/S2
Author | SHA1 | Date | |
---|---|---|---|
|
d0ee38d1da | ||
|
da8dc3da0d | ||
|
21514d8487 | ||
|
77b2e9fb61 | ||
|
a71ba05bd7 | ||
|
1e95e64035 | ||
|
defcac92ab | ||
|
4869378f1e | ||
|
c7b74db317 | ||
|
0ce201efbe | ||
|
6d8a3d8bb2 | ||
|
7dfd207ebb | ||
|
3a65a096f2 | ||
|
85b2bd4c93 | ||
|
35e10a1159 | ||
d418f78352 | |||
25163998a0 | |||
|
dc546aaa4b | ||
|
5364d580c9 | ||
|
2a9a6347e3 | ||
|
cfdb56f314 | ||
|
b517e88db3 | ||
bb317aba8d | |||
644cc6647e | |||
72397ce23b | |||
|
d60a80c098 | ||
|
bb8b6d9d73 | ||
|
677b4cc5b0 | ||
|
be565ffab6 | ||
|
df6120e5f6 | ||
|
21de6f7da8 | ||
|
dbe39f9ce0 | ||
|
ab3de50d5e | ||
|
c545bd2139 | ||
|
6a1c64fbdd | ||
|
b75809ed61 | ||
|
ecaf228e5c | ||
|
6d015ae8fc | ||
|
233150d93f | ||
|
7af8c77a52 | ||
|
a957e7bfa1 | ||
|
cee4c8ce8c | ||
|
96bf814d8c | ||
|
7ddc422788 | ||
|
e652fc2825 | ||
|
a49fa3f8d0 | ||
|
cd452a2f91 | ||
|
4f89f603ae | ||
|
11dc2c5e1d | ||
|
6fec3c15ca | ||
|
938c47480f | ||
|
3811d19298 | ||
|
83a3ab6b6f | ||
|
d66a9af6a3 | ||
|
adc90d3a86 | ||
|
ebbd015c5c | ||
|
4ab73b36b2 | ||
|
130e07a422 | ||
|
8f47bb367e | ||
|
0c3cb60135 | ||
|
9eae8fca5d | ||
|
882a217074 | ||
|
e465fce201 | ||
|
d41542c64b | ||
|
199818bd6c | ||
|
fe66c7ca30 | ||
|
e9177e4af3 | ||
|
d15a6c5933 | ||
25ab9325e7 | |||
19f9378b98 | |||
|
785bc7a14f | ||
|
1a1fe85428 | ||
|
0000d2e558 | ||
|
9ffd1ed4ce | ||
|
3d014864e2 | ||
1d22841811 | |||
|
a1cdda833f | ||
|
ad6db92690 | ||
|
e8ff9d8e50 | ||
|
795769c636 | ||
|
267a39d943 | ||
|
3624bd3d22 | ||
|
bc12dbbb38 | ||
|
eb8a008a8f | ||
c4d9aa1a21 | |||
6ae809ed40 | |||
|
311e2aab3f | ||
438dfbdb83 | |||
b2ce760cf4 | |||
|
b1ba209696 | ||
|
cb3e529b1e | ||
|
717f647418 | ||
|
98e7418187 | ||
|
fe05bf48b1 | ||
|
d2dd8f54e2 | ||
|
7726ee4b16 | ||
ba9bbe0221 | |||
4c3dd82d84 | |||
44e911b5b7 | |||
a7a16df9d0 | |||
382e0abefd | |||
6fdefe5b90 | |||
4788dd8e2e | |||
1cc5f221f3 | |||
93251bfba0 | |||
18b79508b8 | |||
4de5ed1613 | |||
0baaddbe98 | |||
8729c46169 | |||
09f81fe7c3 | |||
1876e5b7c0 | |||
|
355ec76257 | ||
b50fb34e71 | |||
de84d730ff | |||
|
c74d11e3d7 | ||
|
84cab5e6e7 | ||
c4fc972fec | |||
8cf809e231 | |||
94019a922e | |||
|
4f17c8d081 | ||
|
aaab753982 | ||
d6b2727f86 | |||
74a4f43946 | |||
1caf8b0f86 | |||
|
570b72a47b | ||
|
a5798a89ed | ||
|
3f3661a86f | ||
|
f7e2f9a401 | ||
|
2848a9b558 | ||
|
d4868991af | ||
|
e99d42404e | ||
|
3ba019c747 | ||
|
47429218bb | ||
8fe429346f | |||
|
5a4f9bf2e3 | ||
|
b91fc1b6b4 | ||
|
eafc150034 | ||
|
2877f1a268 | ||
|
1e893af775 | ||
|
d9f430a575 | ||
|
63abe87f36 | ||
|
368d649c8a | ||
|
5603464f39 | ||
|
655c79f39e | ||
|
565b231c03 | ||
|
62a9f180fa | ||
|
5ae77876a8 | ||
|
4ed2c2c74f | ||
|
955da582b6 | ||
|
11b07b950d | ||
|
8f70cfeda9 | ||
|
ce64271048 | ||
5cc4f3241d | |||
|
6815e138b4 | ||
a78a61d76f | |||
2eff3f34ed | |||
03687c1d62 | |||
febfe4e77f | |||
4d1aa134b5 | |||
5ec879860a | |||
|
f617468e04 | ||
b728af903c | |||
54f1999030 | |||
fd58f0b669 | |||
c5c67b706e | |||
be7a543e2c | |||
68f112d576 | |||
ec1395a304 | |||
beb0e474ee | |||
2b5fdcbbc5 | |||
295127d456 | |||
7dcfb13694 | |||
|
ee4046fe92 | ||
|
2a9cfeb9ea | ||
|
1147b8ea40 | ||
|
3f9119b39d | ||
|
35e8225abd | ||
|
bdbfbb7a14 | ||
|
f7d4be8d96 | ||
9fa8bd6438 | |||
02c8178f16 | |||
e637fbacae | |||
066544281f | |||
11be10d2c0 | |||
160969a758 | |||
622f78ebea | |||
|
aa67a5b095 | ||
|
af9ea0864c | ||
|
4e2a6d87c4 | ||
|
a465ecece9 | ||
|
575eb72182 | ||
|
3a973914d6 | ||
|
f568c07bbd | ||
|
2c9878fc3a | ||
|
27b1b1b005 | ||
|
130d7ab077 | ||
|
29f6b8a74a | ||
|
9779aaea33 | ||
|
ec25604a67 | ||
|
3668e81c5e | ||
|
d66b2423cb | ||
|
15cc78f0b6 | ||
|
06db4ddea2 | ||
|
6cfb90e99f | ||
|
d8be95a2a3 | ||
|
f82702872d | ||
|
3752c49ef0 | ||
|
fe65fa4988 | ||
|
1fe4c205a3 | ||
|
d4dc5e0f43 | ||
|
77944437ce | ||
|
c164bff758 | ||
|
aa2e3d954a | ||
|
de62b04728 | ||
|
d0bdb50f24 | ||
|
a8fecbc609 | ||
8d305df0db | |||
|
e29b97b3ea | ||
|
ad2b699d2b |
@@ -12,15 +12,13 @@
|
||||
#include <iostream>
|
||||
#include <sys/time.h>
|
||||
|
||||
#define GRID_SYCL
|
||||
#undef GRID_HIP
|
||||
#undef GRID_CUDA
|
||||
|
||||
#ifdef GRID_HIP
|
||||
#include <hipblas/hipblas.h>
|
||||
#endif
|
||||
#ifdef GRID_CUDA
|
||||
#include <cublas_v2.h>
|
||||
|
||||
#endif
|
||||
#ifdef GRID_SYCL
|
||||
#include <oneapi/mkl.hpp>
|
||||
@@ -45,6 +43,90 @@ inline void acceleratorFreeDevice(void *ptr,size_t bytes){free(ptr,*theAccelerat
|
||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theAccelerator->memset(base,value,bytes); theAccelerator->wait();}
|
||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theAccelerator->memcpy(to,from,bytes); theAccelerator->wait();}
|
||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theAccelerator->memcpy(to,from,bytes); theAccelerator->wait();}
|
||||
#define accelerator_barrier(dummy) { theAccelerator->wait(); }
|
||||
#endif
|
||||
|
||||
#ifdef GRID_HIP
|
||||
hipStream_t copyStream;
|
||||
hipStream_t computeStream;
|
||||
void acceleratorInit(void)
|
||||
{
|
||||
int device = 0;
|
||||
auto discard = hipSetDevice(device);
|
||||
discard = hipStreamCreate(©Stream);
|
||||
discard = hipStreamCreate(&computeStream);
|
||||
printf("AcceleratorHIPInit\n");
|
||||
}
|
||||
inline void *acceleratorAllocDevice(size_t bytes)
|
||||
{
|
||||
void *ptr=NULL;
|
||||
auto err = hipMalloc((void **)&ptr,bytes);
|
||||
if( err != hipSuccess ) {
|
||||
ptr = (void *) NULL;
|
||||
fprintf(stderr," hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
|
||||
}
|
||||
return ptr;
|
||||
};
|
||||
inline void acceleratorFreeDevice(void *ptr,size_t bytes){ auto discard=hipFree(ptr);};
|
||||
inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
|
||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
|
||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
|
||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
|
||||
#define accelerator_barrier(dummy) \
|
||||
{ \
|
||||
auto tmp=hipStreamSynchronize(computeStream); \
|
||||
auto err = hipGetLastError(); \
|
||||
if ( err != hipSuccess ) { \
|
||||
printf("After hipDeviceSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
|
||||
puts(__FILE__); \
|
||||
printf("Line %d\n",__LINE__); \
|
||||
exit(0); \
|
||||
} \
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef GRID_CUDA
|
||||
cudaStream_t copyStream;
|
||||
cudaStream_t computeStream;
|
||||
void acceleratorInit(void)
|
||||
{
|
||||
int device = 0;
|
||||
cudaSetDevice(device);
|
||||
cudaStreamCreate(©Stream);
|
||||
cudaStreamCreate(&computeStream);
|
||||
}
|
||||
inline void *acceleratorAllocDevice(size_t bytes)
|
||||
{
|
||||
void *ptr=NULL;
|
||||
auto err = cudaMalloc((void **)&ptr,bytes);
|
||||
if( err != cudaSuccess ) {
|
||||
ptr = (void *) NULL;
|
||||
printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
|
||||
}
|
||||
return ptr;
|
||||
};
|
||||
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
|
||||
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
|
||||
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
|
||||
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
|
||||
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
|
||||
#define accelerator_barrier(dummy) \
|
||||
{ \
|
||||
cudaStreamSynchronize(computeStream); \
|
||||
cudaError err = cudaGetLastError(); \
|
||||
if ( cudaSuccess != err ) { \
|
||||
printf("accelerator_barrier(): Cuda error %s \n", \
|
||||
cudaGetErrorString( err )); \
|
||||
printf("File %s Line %d\n",__FILE__,__LINE__); \
|
||||
fflush(stdout); \
|
||||
if (acceleratorAbortOnGpuError) assert(err==cudaSuccess); \
|
||||
} \
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
template<class T> void acceleratorPut(T& dev,T&host)
|
||||
{
|
||||
acceleratorCopyToDevice(&host,&dev,sizeof(T));
|
||||
@@ -55,9 +137,6 @@ template<class T> T acceleratorGet(T& dev)
|
||||
acceleratorCopyFromDevice(&dev,&host,sizeof(T));
|
||||
return host;
|
||||
}
|
||||
#define accelerator_barrier(dummy) { theAccelerator->wait(); }
|
||||
#endif
|
||||
|
||||
|
||||
/**************************************************************
|
||||
* Allocator
|
||||
@@ -211,6 +290,269 @@ public:
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////
|
||||
// Single matrix GEMM -- fp64 and fp32
|
||||
/////////////////////////////////////////////////////////////
|
||||
void gemm(GridBLASOperation_t OpA,
|
||||
GridBLASOperation_t OpB,
|
||||
int m,int n, int k,
|
||||
ComplexD alpha,
|
||||
ComplexD* Amk, // Device pointer
|
||||
ComplexD* Bkn,
|
||||
ComplexD beta,
|
||||
ComplexD* Cmn)
|
||||
{
|
||||
RealD t2=usecond();
|
||||
|
||||
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
||||
assert(OpB!=GridBLAS_OP_T);
|
||||
|
||||
int lda = m; // m x k column major
|
||||
int ldb = k; // k x n column major
|
||||
int ldc = m; // m x b column major
|
||||
if(OpA!=GridBLAS_OP_N)
|
||||
lda = k;
|
||||
if(OpB!=GridBLAS_OP_N)
|
||||
ldb = n;
|
||||
|
||||
static deviceVector<ComplexD> alpha_p(1);
|
||||
static deviceVector<ComplexD> beta_p(1);
|
||||
// can prestore the 1 and the zero on device
|
||||
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexD));
|
||||
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexD));
|
||||
RealD t0=usecond();
|
||||
|
||||
#ifdef GRID_HIP
|
||||
hipblasOperation_t hOpA;
|
||||
hipblasOperation_t hOpB;
|
||||
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
|
||||
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
|
||||
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
|
||||
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
|
||||
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
|
||||
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
|
||||
auto err = hipblasZgemm(gridblasHandle,
|
||||
hOpA,
|
||||
hOpB,
|
||||
m,n,k,
|
||||
(hipblasDoubleComplex *) &alpha_p[0],
|
||||
(hipblasDoubleComplex *) Amk, lda,
|
||||
(hipblasDoubleComplex *) Bkn, ldb,
|
||||
(hipblasDoubleComplex *) &beta_p[0],
|
||||
(hipblasDoubleComplex *) Cmn, ldc);
|
||||
assert(err==HIPBLAS_STATUS_SUCCESS);
|
||||
#endif
|
||||
#ifdef GRID_CUDA
|
||||
cublasOperation_t hOpA;
|
||||
cublasOperation_t hOpB;
|
||||
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
|
||||
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
|
||||
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
|
||||
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
|
||||
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
|
||||
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
|
||||
auto err = cublasZgemm(gridblasHandle,
|
||||
hOpA,
|
||||
hOpB,
|
||||
m,n,k,
|
||||
(cuDoubleComplex *) &alpha_p[0],
|
||||
(cuDoubleComplex *) Amk, lda,
|
||||
(cuDoubleComplex *) Bkn, ldb,
|
||||
(cuDoubleComplex *) &beta_p[0],
|
||||
(cuDoubleComplex *) Cmn, ldc);
|
||||
assert(err==CUBLAS_STATUS_SUCCESS);
|
||||
#endif
|
||||
#ifdef GRID_SYCL
|
||||
int64_t m64=m;
|
||||
int64_t n64=n;
|
||||
int64_t k64=k;
|
||||
int64_t lda64=lda;
|
||||
int64_t ldb64=ldb;
|
||||
int64_t ldc64=ldc;
|
||||
|
||||
oneapi::mkl::transpose iOpA;
|
||||
oneapi::mkl::transpose iOpB;
|
||||
|
||||
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
|
||||
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
|
||||
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
|
||||
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
|
||||
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
|
||||
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
|
||||
|
||||
oneapi::mkl::blas::column_major::gemm(*gridblasHandle,
|
||||
iOpA,
|
||||
iOpB,
|
||||
m64,n64,k64,
|
||||
(ComplexD *) &alpha_p[0],
|
||||
(const ComplexD *)Amk, (int64_t )lda64,
|
||||
(const ComplexD *)Bkn, (int64_t )ldb64,
|
||||
(ComplexD *) &beta_p[0],
|
||||
(ComplexD *)Cmn, (int64_t)ldc64);
|
||||
synchronise();
|
||||
#endif
|
||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
||||
// Need a default/reference implementation; use Eigen
|
||||
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,m,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,k,n);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,k,m);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,k,n);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,m,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,n,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk,k,m);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn,n,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn,m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||
} else {
|
||||
assert(0);
|
||||
}
|
||||
#endif
|
||||
RealD t1=usecond();
|
||||
RealD flops = 8.0*m*n*k;
|
||||
RealD bytes = 1.0*sizeof(ComplexD)*(m*k+k*n+m*n);
|
||||
}
|
||||
void gemm(GridBLASOperation_t OpA,
|
||||
GridBLASOperation_t OpB,
|
||||
int m,int n, int k,
|
||||
ComplexF alpha,
|
||||
ComplexF* Amk, // Device pointer
|
||||
ComplexF* Bkn,
|
||||
ComplexF beta,
|
||||
ComplexF* Cmn)
|
||||
{
|
||||
RealD t2=usecond();
|
||||
|
||||
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
||||
assert(OpB!=GridBLAS_OP_T);
|
||||
|
||||
int lda = m; // m x k column major
|
||||
int ldb = k; // k x n column major
|
||||
int ldc = m; // m x b column major
|
||||
if(OpA!=GridBLAS_OP_N)
|
||||
lda = k;
|
||||
if(OpB!=GridBLAS_OP_N)
|
||||
ldb = n;
|
||||
|
||||
static deviceVector<ComplexF> alpha_p(1);
|
||||
static deviceVector<ComplexF> beta_p(1);
|
||||
// can prestore the 1 and the zero on device
|
||||
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(ComplexF));
|
||||
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(ComplexF));
|
||||
RealD t0=usecond();
|
||||
|
||||
#ifdef GRID_HIP
|
||||
hipblasOperation_t hOpA;
|
||||
hipblasOperation_t hOpB;
|
||||
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
|
||||
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
|
||||
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
|
||||
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
|
||||
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
|
||||
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
|
||||
auto err = hipblasCgemm(gridblasHandle,
|
||||
hOpA,
|
||||
hOpB,
|
||||
m,n,k,
|
||||
(hipblasComplex *) &alpha_p[0],
|
||||
(hipblasComplex *) Amk, lda,
|
||||
(hipblasComplex *) Bkn, ldb,
|
||||
(hipblasComplex *) &beta_p[0],
|
||||
(hipblasComplex *) Cmn, ldc);
|
||||
assert(err==HIPBLAS_STATUS_SUCCESS);
|
||||
#endif
|
||||
#ifdef GRID_CUDA
|
||||
cublasOperation_t hOpA;
|
||||
cublasOperation_t hOpB;
|
||||
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
|
||||
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
|
||||
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
|
||||
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
|
||||
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
|
||||
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
|
||||
auto err = cublasCgemm(gridblasHandle,
|
||||
hOpA,
|
||||
hOpB,
|
||||
m,n,k,
|
||||
(cuComplex *) &alpha_p[0],
|
||||
(cuComplex *) Amk, lda,
|
||||
(cuComplex *) Bkn, ldb,
|
||||
(cuComplex *) &beta_p[0],
|
||||
(cuComplex *) Cmn, ldc);
|
||||
assert(err==CUBLAS_STATUS_SUCCESS);
|
||||
#endif
|
||||
#ifdef GRID_SYCL
|
||||
int64_t m64=m;
|
||||
int64_t n64=n;
|
||||
int64_t k64=k;
|
||||
int64_t lda64=lda;
|
||||
int64_t ldb64=ldb;
|
||||
int64_t ldc64=ldc;
|
||||
|
||||
oneapi::mkl::transpose iOpA;
|
||||
oneapi::mkl::transpose iOpB;
|
||||
|
||||
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
|
||||
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
|
||||
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
|
||||
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
|
||||
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
|
||||
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
|
||||
|
||||
oneapi::mkl::blas::column_major::gemm(*gridblasHandle,
|
||||
iOpA,
|
||||
iOpB,
|
||||
m64,n64,k64,
|
||||
(ComplexF *) &alpha_p[0],
|
||||
(const ComplexF *)Amk, (int64_t )lda64,
|
||||
(const ComplexF *)Bkn, (int64_t )ldb64,
|
||||
(ComplexF *) &beta_p[0],
|
||||
(ComplexF *)Cmn, (int64_t )ldc64);
|
||||
synchronise();
|
||||
#endif
|
||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
||||
// Need a default/reference implementation; use Eigen
|
||||
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,m,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,k,n);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,k,m);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,k,n);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,m,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,n,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk,k,m);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn,n,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn,m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||
} else {
|
||||
assert(0);
|
||||
}
|
||||
#endif
|
||||
RealD t1=usecond();
|
||||
RealD flops = 8.0*m*n*k;
|
||||
RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n);
|
||||
}
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////
|
||||
void gemmBatched(int m,int n, int k,
|
||||
ComplexD alpha,
|
||||
deviceVector<ComplexD*> &Amk, // pointer list to matrices
|
||||
@@ -241,36 +583,6 @@ public:
|
||||
beta,
|
||||
Cmn);
|
||||
}
|
||||
void gemmBatched(int m,int n, int k,
|
||||
RealD alpha,
|
||||
deviceVector<RealD*> &Amk, // pointer list to matrices
|
||||
deviceVector<RealD*> &Bkn,
|
||||
RealD beta,
|
||||
deviceVector<RealD*> &Cmn)
|
||||
{
|
||||
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
|
||||
m,n,k,
|
||||
alpha,
|
||||
Amk,
|
||||
Bkn,
|
||||
beta,
|
||||
Cmn);
|
||||
}
|
||||
void gemmBatched(int m,int n, int k,
|
||||
RealF alpha,
|
||||
deviceVector<RealF*> &Amk, // pointer list to matrices
|
||||
deviceVector<RealF*> &Bkn,
|
||||
RealF beta,
|
||||
deviceVector<RealF*> &Cmn)
|
||||
{
|
||||
gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
|
||||
m,n,k,
|
||||
alpha,
|
||||
Amk,
|
||||
Bkn,
|
||||
beta,
|
||||
Cmn);
|
||||
}
|
||||
|
||||
void gemmBatched(GridBLASOperation_t OpA,
|
||||
GridBLASOperation_t OpB,
|
||||
@@ -624,301 +936,6 @@ public:
|
||||
RealD bytes = 1.0*sizeof(ComplexF)*(m*k+k*n+m*n)*batchCount;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Single precision real GEMM
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
void gemmBatched(GridBLASOperation_t OpA,
|
||||
GridBLASOperation_t OpB,
|
||||
int m,int n, int k,
|
||||
RealF alpha,
|
||||
deviceVector<RealF*> &Amk, // pointer list to matrices
|
||||
deviceVector<RealF*> &Bkn,
|
||||
RealF beta,
|
||||
deviceVector<RealF*> &Cmn)
|
||||
{
|
||||
RealD t2=usecond();
|
||||
int32_t batchCount = Amk.size();
|
||||
|
||||
assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
|
||||
assert(OpB!=GridBLAS_OP_C);
|
||||
|
||||
int lda = m; // m x k column major
|
||||
int ldb = k; // k x n column major
|
||||
int ldc = m; // m x b column major
|
||||
if(OpA!=GridBLAS_OP_N)
|
||||
lda = k;
|
||||
if(OpB!=GridBLAS_OP_N)
|
||||
ldb = n;
|
||||
static deviceVector<RealF> alpha_p(1);
|
||||
static deviceVector<RealF> beta_p(1);
|
||||
// can prestore the 1 and the zero on device
|
||||
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealF));
|
||||
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealF));
|
||||
RealD t0=usecond();
|
||||
|
||||
assert(Bkn.size()==batchCount);
|
||||
assert(Cmn.size()==batchCount);
|
||||
#ifdef GRID_HIP
|
||||
hipblasOperation_t hOpA;
|
||||
hipblasOperation_t hOpB;
|
||||
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
|
||||
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
|
||||
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
|
||||
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
|
||||
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
|
||||
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
|
||||
auto err = hipblasSgemmBatched(gridblasHandle,
|
||||
hOpA,
|
||||
hOpB,
|
||||
m,n,k,
|
||||
(float *) &alpha_p[0],
|
||||
(float **)&Amk[0], lda,
|
||||
(float **)&Bkn[0], ldb,
|
||||
(float *) &beta_p[0],
|
||||
(float **)&Cmn[0], ldc,
|
||||
batchCount);
|
||||
assert(err==HIPBLAS_STATUS_SUCCESS);
|
||||
#endif
|
||||
#ifdef GRID_CUDA
|
||||
cublasOperation_t hOpA;
|
||||
cublasOperation_t hOpB;
|
||||
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
|
||||
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
|
||||
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
|
||||
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
|
||||
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
|
||||
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
|
||||
auto err = cublasSgemmBatched(gridblasHandle,
|
||||
hOpA,
|
||||
hOpB,
|
||||
m,n,k,
|
||||
(float *) &alpha_p[0],
|
||||
(float **)&Amk[0], lda,
|
||||
(float **)&Bkn[0], ldb,
|
||||
(float *) &beta_p[0],
|
||||
(float **)&Cmn[0], ldc,
|
||||
batchCount);
|
||||
assert(err==CUBLAS_STATUS_SUCCESS);
|
||||
#endif
|
||||
#ifdef GRID_SYCL
|
||||
int64_t m64=m;
|
||||
int64_t n64=n;
|
||||
int64_t k64=k;
|
||||
int64_t lda64=lda;
|
||||
int64_t ldb64=ldb;
|
||||
int64_t ldc64=ldc;
|
||||
int64_t batchCount64=batchCount;
|
||||
|
||||
oneapi::mkl::transpose iOpA;
|
||||
oneapi::mkl::transpose iOpB;
|
||||
|
||||
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
|
||||
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
|
||||
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
|
||||
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
|
||||
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
|
||||
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
|
||||
|
||||
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
|
||||
&iOpA,
|
||||
&iOpB,
|
||||
&m64,&n64,&k64,
|
||||
(float *) &alpha_p[0],
|
||||
(const float **)&Amk[0], (const int64_t *)&lda64,
|
||||
(const float **)&Bkn[0], (const int64_t *)&ldb64,
|
||||
(float *) &beta_p[0],
|
||||
(float **)&Cmn[0], (const int64_t *)&ldc64,
|
||||
(int64_t)1,&batchCount64,std::vector<sycl::event>());
|
||||
synchronise();
|
||||
#endif
|
||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
||||
// Need a default/reference implementation; use Eigen
|
||||
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
} );
|
||||
} else {
|
||||
assert(0);
|
||||
}
|
||||
#endif
|
||||
RealD t1=usecond();
|
||||
RealD flops = 2.0*m*n*k*batchCount;
|
||||
RealD bytes = 1.0*sizeof(RealF)*(m*k+k*n+m*n)*batchCount;
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Double precision real GEMM
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
void gemmBatched(GridBLASOperation_t OpA,
|
||||
GridBLASOperation_t OpB,
|
||||
int m,int n, int k,
|
||||
RealD alpha,
|
||||
deviceVector<RealD*> &Amk, // pointer list to matrices
|
||||
deviceVector<RealD*> &Bkn,
|
||||
RealD beta,
|
||||
deviceVector<RealD*> &Cmn)
|
||||
{
|
||||
RealD t2=usecond();
|
||||
int32_t batchCount = Amk.size();
|
||||
|
||||
assert(OpA!=GridBLAS_OP_C); // Real case no conjugate
|
||||
assert(OpB!=GridBLAS_OP_C);
|
||||
|
||||
int lda = m; // m x k column major
|
||||
int ldb = k; // k x n column major
|
||||
int ldc = m; // m x b column major
|
||||
if(OpA!=GridBLAS_OP_N)
|
||||
lda = k;
|
||||
if(OpB!=GridBLAS_OP_N)
|
||||
ldb = n;
|
||||
|
||||
static deviceVector<RealD> alpha_p(1);
|
||||
static deviceVector<RealD> beta_p(1);
|
||||
// can prestore the 1 and the zero on device
|
||||
acceleratorCopyToDevice((void *)&alpha,(void *)&alpha_p[0],sizeof(RealD));
|
||||
acceleratorCopyToDevice((void *)&beta ,(void *)&beta_p[0],sizeof(RealD));
|
||||
RealD t0=usecond();
|
||||
|
||||
assert(Bkn.size()==batchCount);
|
||||
assert(Cmn.size()==batchCount);
|
||||
#ifdef GRID_HIP
|
||||
hipblasOperation_t hOpA;
|
||||
hipblasOperation_t hOpB;
|
||||
if ( OpA == GridBLAS_OP_N ) hOpA = HIPBLAS_OP_N;
|
||||
if ( OpA == GridBLAS_OP_T ) hOpA = HIPBLAS_OP_T;
|
||||
if ( OpA == GridBLAS_OP_C ) hOpA = HIPBLAS_OP_C;
|
||||
if ( OpB == GridBLAS_OP_N ) hOpB = HIPBLAS_OP_N;
|
||||
if ( OpB == GridBLAS_OP_T ) hOpB = HIPBLAS_OP_T;
|
||||
if ( OpB == GridBLAS_OP_C ) hOpB = HIPBLAS_OP_C;
|
||||
auto err = hipblasDgemmBatched(gridblasHandle,
|
||||
HIPBLAS_OP_N,
|
||||
HIPBLAS_OP_N,
|
||||
m,n,k,
|
||||
(double *) &alpha_p[0],
|
||||
(double **)&Amk[0], lda,
|
||||
(double **)&Bkn[0], ldb,
|
||||
(double *) &beta_p[0],
|
||||
(double **)&Cmn[0], ldc,
|
||||
batchCount);
|
||||
assert(err==HIPBLAS_STATUS_SUCCESS);
|
||||
#endif
|
||||
#ifdef GRID_CUDA
|
||||
cublasOperation_t hOpA;
|
||||
cublasOperation_t hOpB;
|
||||
if ( OpA == GridBLAS_OP_N ) hOpA = CUBLAS_OP_N;
|
||||
if ( OpA == GridBLAS_OP_T ) hOpA = CUBLAS_OP_T;
|
||||
if ( OpA == GridBLAS_OP_C ) hOpA = CUBLAS_OP_C;
|
||||
if ( OpB == GridBLAS_OP_N ) hOpB = CUBLAS_OP_N;
|
||||
if ( OpB == GridBLAS_OP_T ) hOpB = CUBLAS_OP_T;
|
||||
if ( OpB == GridBLAS_OP_C ) hOpB = CUBLAS_OP_C;
|
||||
auto err = cublasDgemmBatched(gridblasHandle,
|
||||
hOpA,
|
||||
hOpB,
|
||||
m,n,k,
|
||||
(double *) &alpha_p[0],
|
||||
(double **)&Amk[0], lda,
|
||||
(double **)&Bkn[0], ldb,
|
||||
(double *) &beta_p[0],
|
||||
(double **)&Cmn[0], ldc,
|
||||
batchCount);
|
||||
assert(err==CUBLAS_STATUS_SUCCESS);
|
||||
#endif
|
||||
#ifdef GRID_SYCL
|
||||
int64_t m64=m;
|
||||
int64_t n64=n;
|
||||
int64_t k64=k;
|
||||
int64_t lda64=lda;
|
||||
int64_t ldb64=ldb;
|
||||
int64_t ldc64=ldc;
|
||||
int64_t batchCount64=batchCount;
|
||||
|
||||
oneapi::mkl::transpose iOpA;
|
||||
oneapi::mkl::transpose iOpB;
|
||||
|
||||
if ( OpA == GridBLAS_OP_N ) iOpA = oneapi::mkl::transpose::N;
|
||||
if ( OpA == GridBLAS_OP_T ) iOpA = oneapi::mkl::transpose::T;
|
||||
if ( OpA == GridBLAS_OP_C ) iOpA = oneapi::mkl::transpose::C;
|
||||
if ( OpB == GridBLAS_OP_N ) iOpB = oneapi::mkl::transpose::N;
|
||||
if ( OpB == GridBLAS_OP_T ) iOpB = oneapi::mkl::transpose::T;
|
||||
if ( OpB == GridBLAS_OP_C ) iOpB = oneapi::mkl::transpose::C;
|
||||
|
||||
oneapi::mkl::blas::column_major::gemm_batch(*gridblasHandle,
|
||||
&iOpA,
|
||||
&iOpB,
|
||||
&m64,&n64,&k64,
|
||||
(double *) &alpha_p[0],
|
||||
(const double **)&Amk[0], (const int64_t *)&lda64,
|
||||
(const double **)&Bkn[0], (const int64_t *)&ldb64,
|
||||
(double *) &beta_p[0],
|
||||
(double **)&Cmn[0], (const int64_t *)&ldc64,
|
||||
(int64_t)1,&batchCount64,std::vector<sycl::event>());
|
||||
synchronise();
|
||||
#endif
|
||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP)
|
||||
// Need a default/reference implementation; use Eigen
|
||||
if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
});
|
||||
} else {
|
||||
assert(0);
|
||||
}
|
||||
#endif
|
||||
RealD t1=usecond();
|
||||
RealD flops = 2.0*m*n*k*batchCount;
|
||||
RealD bytes = 1.0*sizeof(RealD)*(m*k+k*n+m*n)*batchCount;
|
||||
}
|
||||
|
||||
template<class CComplex>
|
||||
double benchmark(int M, int N, int K, int BATCH)
|
||||
{
|
||||
@@ -967,6 +984,47 @@ public:
|
||||
return flops; // Returns gigaflops
|
||||
}
|
||||
|
||||
template<class CComplex>
|
||||
double benchmark(int M, int N, int K)
|
||||
{
|
||||
int32_t N_A = M*K;
|
||||
int32_t N_B = K*N;
|
||||
int32_t N_C = M*N;
|
||||
deviceVector<CComplex> A(N_A); acceleratorMemSet(&A[0],0,N_A*sizeof(CComplex));
|
||||
deviceVector<CComplex> B(N_B); acceleratorMemSet(&B[0],0,N_B*sizeof(CComplex));
|
||||
deviceVector<CComplex> C(N_C); acceleratorMemSet(&C[0],0,N_C*sizeof(CComplex));
|
||||
CComplex alpha(1.0);
|
||||
CComplex beta (1.0);
|
||||
RealD flops = 8.0*M*N*K;
|
||||
int ncall=10;
|
||||
|
||||
gemm(GridBLAS_OP_C,GridBLAS_OP_N,
|
||||
M,N,K,
|
||||
alpha,
|
||||
&A[0], // m x k
|
||||
&B[0], // k x n
|
||||
beta,
|
||||
&C[0]);
|
||||
synchronise();
|
||||
|
||||
RealD t0 = usecond();
|
||||
for(int i=0;i<ncall;i++){
|
||||
gemm(GridBLAS_OP_N,GridBLAS_OP_N,
|
||||
M,N,K,
|
||||
alpha,
|
||||
&A[0], // m x k
|
||||
&B[0], // k x n
|
||||
beta,
|
||||
&C[0]);
|
||||
synchronise();
|
||||
}
|
||||
RealD t1 = usecond();
|
||||
RealD bytes = 1.0*sizeof(CComplex)*(M*N*2+N*K+M*K);
|
||||
flops = 8.0*M*N*K*ncall;
|
||||
flops = flops/(t1-t0)/1.e3;
|
||||
return flops; // Returns gigaflops
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
||||
@@ -1035,6 +1093,21 @@ static void BLAS(void)
|
||||
std::cout<< M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<BATCH<<"\t\t"<<p<<std::endl;
|
||||
}}
|
||||
fprintf(FP,"\n\n\n");
|
||||
|
||||
std::cout << "----------------------------------------------------------"<<std::endl;
|
||||
std::cout << " M "<<"\t\t"<<"N"<<"\t\t\t"<<"K"<<"\t\t"<<"Gflop/s / rank (inner product matrix)"<<std::endl;
|
||||
std::cout << "----------------------------------------------------------"<<std::endl;
|
||||
{
|
||||
int M=12;
|
||||
int N=12;
|
||||
std::vector<int> ks({4*1024*1024, 2*1024*1024, 1024*1024, 256*1024, 1024 });
|
||||
for( int kk=0;kk<ks.size();kk++ ) {
|
||||
int K = ks[kk];
|
||||
double p=blas.benchmark<CComplex>(M,N,K);
|
||||
fprintf(FP,"%d, %d, %d, %d, %f\n", M, N, K, 1, p);
|
||||
std::cout<< M<<"\t\t"<<N<<"\t\t"<<K<<"\t\t"<<1<<"\t\t"<<p<<std::endl;
|
||||
}
|
||||
}
|
||||
std::cout << "=================================================================================="<<std::endl;
|
||||
};
|
||||
|
||||
|
@@ -1,2 +1,2 @@
|
||||
|
||||
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench
|
||||
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL
|
5
BLAS_benchmark/compile-command-frontier
Normal file
5
BLAS_benchmark/compile-command-frontier
Normal file
@@ -0,0 +1,5 @@
|
||||
CXX=hipcc
|
||||
MPICXX=mpicxx
|
||||
CXXFLAGS="-fPIC -I{$ROCM_PATH}/include/ -I${MPICH_DIR}/include -L/lib64 -I/opt/cray/pe/mpich/8.1.28/ofi/gnu/12.3/include -DGRID_HIP"
|
||||
LDFLAGS="-L/lib64 -L${MPICH_DIR}/lib -lmpi -L${CRAY_MPICH_ROOTDIR}/gtl/lib -lmpi_gtl_hsa -lamdhip64 -lhipblas -lrocblas -lmpi_gnu_123"
|
||||
hipcc $CXXFLAGS $LDFLAGS BatchBlasBench.cc -o BatchBlasBench
|
2
BLAS_benchmark/compile-command-sunspot
Normal file
2
BLAS_benchmark/compile-command-sunspot
Normal file
@@ -0,0 +1,2 @@
|
||||
|
||||
mpicxx -qmkl=parallel -fsycl BatchBlasBench.cc -o BatchBlasBench -DGRID_SYCL
|
@@ -37,6 +37,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
#include <Grid/qcd/QCD.h>
|
||||
#include <Grid/qcd/spin/Spin.h>
|
||||
#include <Grid/qcd/gparity/Gparity.h>
|
||||
#include <Grid/qcd/spin/Pauli.h> // depends on Gparity
|
||||
#include <Grid/qcd/utils/Utils.h>
|
||||
#include <Grid/qcd/representations/Representations.h>
|
||||
NAMESPACE_CHECK(GridQCDCore);
|
||||
|
@@ -50,6 +50,7 @@ NAMESPACE_CHECK(approx);
|
||||
#include <Grid/algorithms/deflation/Deflation.h>
|
||||
#include <Grid/algorithms/deflation/MultiRHSBlockProject.h>
|
||||
#include <Grid/algorithms/deflation/MultiRHSDeflation.h>
|
||||
#include <Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h>
|
||||
NAMESPACE_CHECK(deflation);
|
||||
#include <Grid/algorithms/iterative/ConjugateGradient.h>
|
||||
NAMESPACE_CHECK(ConjGrad);
|
||||
|
@@ -168,6 +168,7 @@ public:
|
||||
template<class vobj>
|
||||
void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
|
||||
#ifndef HAVE_FFTW
|
||||
std::cerr << "FFTW is not compiled but is called"<<std::endl;
|
||||
assert(0);
|
||||
#else
|
||||
conformable(result.Grid(),vgrid);
|
||||
@@ -190,6 +191,7 @@ public:
|
||||
|
||||
Lattice<sobj> pgbuf(&pencil_g);
|
||||
autoView(pgbuf_v , pgbuf, CpuWrite);
|
||||
//std::cout << "CPU view" << std::endl;
|
||||
|
||||
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
|
||||
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
|
||||
@@ -213,6 +215,7 @@ public:
|
||||
else if ( sign == forward ) div = 1.0;
|
||||
else assert(0);
|
||||
|
||||
//std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
|
||||
FFTW_plan p;
|
||||
{
|
||||
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
|
||||
@@ -226,6 +229,7 @@ public:
|
||||
}
|
||||
|
||||
// Barrel shift and collect global pencil
|
||||
//std::cout << GridLogPerformance<<"Making pencil" << std::endl;
|
||||
Coordinate lcoor(Nd), gcoor(Nd);
|
||||
result = source;
|
||||
int pc = processor_coor[dim];
|
||||
@@ -247,6 +251,7 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
//std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
|
||||
// Loop over orthog coords
|
||||
int NN=pencil_g.lSites();
|
||||
GridStopWatch timer;
|
||||
@@ -269,6 +274,7 @@ public:
|
||||
usec += timer.useconds();
|
||||
flops+= flops_call*NN;
|
||||
|
||||
//std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
|
||||
// writing out result
|
||||
{
|
||||
autoView(pgbuf_v,pgbuf,CpuRead);
|
||||
@@ -285,6 +291,7 @@ public:
|
||||
}
|
||||
result = result*div;
|
||||
|
||||
//std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
|
||||
// destroying plan
|
||||
FFTW<scalar>::fftw_destroy_plan(p);
|
||||
#endif
|
||||
|
@@ -103,6 +103,38 @@ public:
|
||||
_Mat.MdagM(in,out);
|
||||
}
|
||||
};
|
||||
template<class Matrix,class Field>
|
||||
class MMdagLinearOperator : public LinearOperatorBase<Field> {
|
||||
Matrix &_Mat;
|
||||
public:
|
||||
MMdagLinearOperator(Matrix &Mat): _Mat(Mat){};
|
||||
|
||||
// Support for coarsening to a multigrid
|
||||
void OpDiag (const Field &in, Field &out) {
|
||||
_Mat.Mdiag(in,out);
|
||||
}
|
||||
void OpDir (const Field &in, Field &out,int dir,int disp) {
|
||||
_Mat.Mdir(in,out,dir,disp);
|
||||
}
|
||||
void OpDirAll (const Field &in, std::vector<Field> &out){
|
||||
_Mat.MdirAll(in,out);
|
||||
};
|
||||
void Op (const Field &in, Field &out){
|
||||
_Mat.M(in,out);
|
||||
}
|
||||
void AdjOp (const Field &in, Field &out){
|
||||
_Mat.Mdag(in,out);
|
||||
}
|
||||
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
|
||||
_Mat.MMdag(in,out);
|
||||
ComplexD dot = innerProduct(in,out);
|
||||
n1=real(dot);
|
||||
n2=norm2(out);
|
||||
}
|
||||
void HermOp(const Field &in, Field &out){
|
||||
_Mat.MMdag(in,out);
|
||||
}
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// Construct herm op and shift it for mgrid smoother
|
||||
@@ -245,6 +277,38 @@ public:
|
||||
assert(0);
|
||||
}
|
||||
};
|
||||
template<class Matrix,class Field>
|
||||
class ShiftedNonHermitianLinearOperator : public LinearOperatorBase<Field> {
|
||||
Matrix &_Mat;
|
||||
RealD shift;
|
||||
public:
|
||||
ShiftedNonHermitianLinearOperator(Matrix &Mat,RealD shft): _Mat(Mat),shift(shft){};
|
||||
// Support for coarsening to a multigrid
|
||||
void OpDiag (const Field &in, Field &out) {
|
||||
_Mat.Mdiag(in,out);
|
||||
out = out + shift*in;
|
||||
}
|
||||
void OpDir (const Field &in, Field &out,int dir,int disp) {
|
||||
_Mat.Mdir(in,out,dir,disp);
|
||||
}
|
||||
void OpDirAll (const Field &in, std::vector<Field> &out){
|
||||
_Mat.MdirAll(in,out);
|
||||
};
|
||||
void Op (const Field &in, Field &out){
|
||||
_Mat.M(in,out);
|
||||
out = out + shift * in;
|
||||
}
|
||||
void AdjOp (const Field &in, Field &out){
|
||||
_Mat.Mdag(in,out);
|
||||
out = out + shift * in;
|
||||
}
|
||||
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
|
||||
assert(0);
|
||||
}
|
||||
void HermOp(const Field &in, Field &out){
|
||||
assert(0);
|
||||
}
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////
|
||||
// Even Odd Schur decomp operators; there are several
|
||||
|
@@ -45,6 +45,11 @@ public:
|
||||
M(in,tmp);
|
||||
Mdag(tmp,out);
|
||||
}
|
||||
virtual void MMdag(const Field &in, Field &out) {
|
||||
Field tmp (in.Grid());
|
||||
Mdag(in,tmp);
|
||||
M(tmp,out);
|
||||
}
|
||||
virtual void Mdiag (const Field &in, Field &out)=0;
|
||||
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
|
||||
virtual void MdirAll (const Field &in, std::vector<Field> &out)=0;
|
||||
|
@@ -59,7 +59,7 @@ public:
|
||||
RealD diff = hi-lo;
|
||||
RealD delta = diff*1.0e-9;
|
||||
for (RealD x=lo; x<hi; x+=delta) {
|
||||
delta*=1.1;
|
||||
delta*=1.02;
|
||||
RealD f = approx(x);
|
||||
out<< x<<" "<<f<<std::endl;
|
||||
}
|
||||
@@ -131,6 +131,26 @@ public:
|
||||
Coeffs[j] = s * 2.0/order;
|
||||
}
|
||||
};
|
||||
template<class functor>
|
||||
void Init(RealD _lo,RealD _hi,int _order, functor & func)
|
||||
{
|
||||
lo=_lo;
|
||||
hi=_hi;
|
||||
order=_order;
|
||||
|
||||
if(order < 2) exit(-1);
|
||||
Coeffs.resize(order);
|
||||
for(int j=0;j<order;j++){
|
||||
RealD s=0;
|
||||
for(int k=0;k<order;k++){
|
||||
RealD y=std::cos(M_PI*(k+0.5)/order);
|
||||
RealD x=0.5*(y*(hi-lo)+(hi+lo));
|
||||
RealD f=func(x);
|
||||
s=s+f*std::cos( j*M_PI*(k+0.5)/order );
|
||||
}
|
||||
Coeffs[j] = s * 2.0/order;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
void JacksonSmooth(void){
|
||||
@@ -249,7 +269,9 @@ public:
|
||||
RealD xscale = 2.0/(hi-lo);
|
||||
RealD mscale = -(hi+lo)/(hi-lo);
|
||||
Linop.HermOp(T0,y);
|
||||
grid->Barrier();
|
||||
axpby(T1,xscale,mscale,y,in);
|
||||
grid->Barrier();
|
||||
|
||||
// sum = .5 c[0] T0 + c[1] T1
|
||||
// out = ()*T0 + Coeffs[1]*T1;
|
||||
|
@@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
|
||||
typedef cublasHandle_t gridblasHandle_t;
|
||||
#endif
|
||||
#ifdef GRID_SYCL
|
||||
typedef cl::sycl::queue *gridblasHandle_t;
|
||||
typedef sycl::queue *gridblasHandle_t;
|
||||
#endif
|
||||
#ifdef GRID_ONE_MKL
|
||||
typedef cl::sycl::queue *gridblasHandle_t;
|
||||
typedef sycl::queue *gridblasHandle_t;
|
||||
#endif
|
||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
|
||||
typedef int32_t gridblasHandle_t;
|
||||
@@ -89,9 +89,9 @@ public:
|
||||
gridblasHandle = theGridAccelerator;
|
||||
#endif
|
||||
#ifdef GRID_ONE_MKL
|
||||
cl::sycl::gpu_selector selector;
|
||||
cl::sycl::device selectedDevice { selector };
|
||||
cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()};
|
||||
sycl::gpu_selector selector;
|
||||
sycl::device selectedDevice { selector };
|
||||
sycl::property_list q_prop{sycl::property::queue::in_order()};
|
||||
gridblasHandle =new sycl::queue (selectedDevice,q_prop);
|
||||
#endif
|
||||
gridblasInit=1;
|
||||
@@ -208,8 +208,8 @@ public:
|
||||
assert(Bkn.size()==batchCount);
|
||||
assert(Cmn.size()==batchCount);
|
||||
|
||||
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
||||
assert(OpB!=GridBLAS_OP_T);
|
||||
//assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
||||
//assert(OpB!=GridBLAS_OP_T);
|
||||
|
||||
int lda = m; // m x k column major
|
||||
int ldb = k; // k x n column major
|
||||
@@ -367,28 +367,67 @@ public:
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk.adjoint() * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn.adjoint() ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||
else
|
||||
eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||
} );
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
} );
|
||||
} else {
|
||||
assert(0);
|
||||
@@ -414,8 +453,8 @@ public:
|
||||
RealD t2=usecond();
|
||||
int32_t batchCount = Amk.size();
|
||||
|
||||
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
||||
assert(OpB!=GridBLAS_OP_T);
|
||||
//assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
||||
//assert(OpB!=GridBLAS_OP_T);
|
||||
|
||||
int lda = m; // m x k column major
|
||||
int ldb = k; // k x n column major
|
||||
@@ -514,28 +553,70 @@ public:
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk.adjoint() * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn.adjoint() ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn.transpose() ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||
else
|
||||
eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||
} );
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
} );
|
||||
} else {
|
||||
assert(0);
|
||||
@@ -661,28 +742,40 @@ public:
|
||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn.transpose() ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
});
|
||||
} else {
|
||||
assert(0);
|
||||
@@ -809,28 +902,40 @@ public:
|
||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn.transpose() ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
});
|
||||
} else {
|
||||
assert(0);
|
||||
|
376
Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h
Normal file
376
Grid/algorithms/deflation/MultiRHSBlockCGLinalg.h
Normal file
@@ -0,0 +1,376 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: MultiRHSBlockCGLinalg.h
|
||||
|
||||
Copyright (C) 2024
|
||||
|
||||
Author: Peter Boyle <pboyle@bnl.gov>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#pragma once
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
|
||||
/* Need helper object for BLAS accelerated mrhs blockCG */
|
||||
template<class Field>
|
||||
class MultiRHSBlockCGLinalg
|
||||
{
|
||||
public:
|
||||
|
||||
typedef typename Field::scalar_type scalar;
|
||||
typedef typename Field::scalar_object scalar_object;
|
||||
typedef typename Field::vector_object vector_object;
|
||||
|
||||
deviceVector<scalar> BLAS_X; // nrhs x vol -- the sources
|
||||
deviceVector<scalar> BLAS_Y; // nrhs x vol -- the result
|
||||
deviceVector<scalar> BLAS_C; // nrhs x nrhs -- the coefficients
|
||||
deviceVector<scalar> BLAS_Cred; // nrhs x nrhs x oSites -- reduction buffer
|
||||
deviceVector<scalar *> Xdip;
|
||||
deviceVector<scalar *> Ydip;
|
||||
deviceVector<scalar *> Cdip;
|
||||
|
||||
MultiRHSBlockCGLinalg() {};
|
||||
~MultiRHSBlockCGLinalg(){ Deallocate(); };
|
||||
|
||||
void Deallocate(void)
|
||||
{
|
||||
Xdip.resize(0);
|
||||
Ydip.resize(0);
|
||||
Cdip.resize(0);
|
||||
BLAS_Cred.resize(0);
|
||||
BLAS_C.resize(0);
|
||||
BLAS_X.resize(0);
|
||||
BLAS_Y.resize(0);
|
||||
}
|
||||
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0)
|
||||
{
|
||||
std::vector<Field> Y_copy(AP.size(),AP[0].Grid());
|
||||
for(int r=0;r<AP.size();r++){
|
||||
Y_copy[r] = Y[r];
|
||||
}
|
||||
MulMatrix(AP,m,X);
|
||||
for(int r=0;r<AP.size();r++){
|
||||
AP[r] = scale*AP[r]+Y_copy[r];
|
||||
}
|
||||
}
|
||||
void MulMatrix(std::vector<Field> &Y, Eigen::MatrixXcd &m , const std::vector<Field> &X)
|
||||
{
|
||||
typedef typename Field::scalar_type scomplex;
|
||||
GridBase *grid;
|
||||
uint64_t vol;
|
||||
uint64_t words;
|
||||
|
||||
int nrhs = Y.size();
|
||||
grid = X[0].Grid();
|
||||
vol = grid->lSites();
|
||||
words = sizeof(scalar_object)/sizeof(scalar);
|
||||
int64_t vw = vol * words;
|
||||
|
||||
RealD t0 = usecond();
|
||||
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
|
||||
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
|
||||
BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
|
||||
RealD t1 = usecond();
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Copy in the multi-rhs sources
|
||||
/////////////////////////////////////////////
|
||||
for(int r=0;r<nrhs;r++){
|
||||
int64_t offset = r*vw;
|
||||
autoView(x_v,X[r],AcceleratorRead);
|
||||
acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
|
||||
}
|
||||
|
||||
// Assumes Eigen storage contiguous
|
||||
acceleratorCopyToDevice(&m(0,0),&BLAS_C[0],BLAS_C.size()*sizeof(scalar));
|
||||
|
||||
/*
|
||||
* in Fortran column major notation (cuBlas order)
|
||||
*
|
||||
* Xxr = [X1(x)][..][Xn(x)]
|
||||
* Yxr = [Y1(x)][..][Ym(x)]
|
||||
* Y = X . C
|
||||
*/
|
||||
deviceVector<scalar *> Xd(1);
|
||||
deviceVector<scalar *> Yd(1);
|
||||
deviceVector<scalar *> Cd(1);
|
||||
|
||||
scalar * Xh = & BLAS_X[0];
|
||||
scalar * Yh = & BLAS_Y[0];
|
||||
scalar * Ch = & BLAS_C[0];
|
||||
|
||||
acceleratorPut(Xd[0],Xh);
|
||||
acceleratorPut(Yd[0],Yh);
|
||||
acceleratorPut(Cd[0],Ch);
|
||||
|
||||
RealD t2 = usecond();
|
||||
GridBLAS BLAS;
|
||||
/////////////////////////////////////////
|
||||
// Y = X*C (transpose?)
|
||||
/////////////////////////////////////////
|
||||
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
|
||||
vw,nrhs,nrhs,
|
||||
scalar(1.0),
|
||||
Xd,
|
||||
Cd,
|
||||
scalar(0.0), // wipe out Y
|
||||
Yd);
|
||||
BLAS.synchronise();
|
||||
RealD t3 = usecond();
|
||||
|
||||
// Copy back Y = m X
|
||||
for(int r=0;r<nrhs;r++){
|
||||
int64_t offset = r*vw;
|
||||
autoView(y_v,Y[r],AcceleratorWrite);
|
||||
acceleratorCopyDeviceToDevice(&BLAS_Y[offset],&y_v[0],sizeof(scalar_object)*vol);
|
||||
}
|
||||
RealD t4 = usecond();
|
||||
std::cout <<GridLogPerformance << "MulMatrix alloc took "<< t1-t0<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "MulMatrix blas took "<< t3-t2<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "MulMatrix copy took "<< t4-t3<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "MulMatrix total "<< t4-t0<<" us"<<std::endl;
|
||||
}
|
||||
|
||||
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y)
|
||||
{
|
||||
#if 0
|
||||
int nrhs;
|
||||
GridBase *grid;
|
||||
uint64_t vol;
|
||||
uint64_t words;
|
||||
|
||||
nrhs = X.size();
|
||||
assert(X.size()==Y.size());
|
||||
conformable(X[0],Y[0]);
|
||||
|
||||
grid = X[0].Grid();
|
||||
vol = grid->lSites();
|
||||
words = sizeof(scalar_object)/sizeof(scalar);
|
||||
int64_t vw = vol * words;
|
||||
|
||||
RealD t0 = usecond();
|
||||
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
|
||||
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
|
||||
BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
|
||||
RealD t1 = usecond();
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Copy in the multi-rhs sources
|
||||
/////////////////////////////////////////////
|
||||
for(int r=0;r<nrhs;r++){
|
||||
int64_t offset = r*vw;
|
||||
autoView(x_v,X[r],AcceleratorRead);
|
||||
acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
|
||||
autoView(y_v,Y[r],AcceleratorRead);
|
||||
acceleratorCopyDeviceToDevice(&y_v[0],&BLAS_Y[offset],sizeof(scalar_object)*vol);
|
||||
}
|
||||
RealD t2 = usecond();
|
||||
|
||||
/*
|
||||
* in Fortran column major notation (cuBlas order)
|
||||
*
|
||||
* Xxr = [X1(x)][..][Xn(x)]
|
||||
*
|
||||
* Yxr = [Y1(x)][..][Ym(x)]
|
||||
*
|
||||
* C_rs = X^dag Y
|
||||
*/
|
||||
deviceVector<scalar *> Xd(1);
|
||||
deviceVector<scalar *> Yd(1);
|
||||
deviceVector<scalar *> Cd(1);
|
||||
|
||||
scalar * Xh = & BLAS_X[0];
|
||||
scalar * Yh = & BLAS_Y[0];
|
||||
scalar * Ch = & BLAS_C[0];
|
||||
|
||||
acceleratorPut(Xd[0],Xh);
|
||||
acceleratorPut(Yd[0],Yh);
|
||||
acceleratorPut(Cd[0],Ch);
|
||||
|
||||
GridBLAS BLAS;
|
||||
|
||||
RealD t3 = usecond();
|
||||
/////////////////////////////////////////
|
||||
// C_rs = X^dag Y
|
||||
/////////////////////////////////////////
|
||||
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
|
||||
nrhs,nrhs,vw,
|
||||
ComplexD(1.0),
|
||||
Xd,
|
||||
Yd,
|
||||
ComplexD(0.0), // wipe out C
|
||||
Cd);
|
||||
BLAS.synchronise();
|
||||
RealD t4 = usecond();
|
||||
|
||||
std::vector<scalar> HOST_C(BLAS_C.size()); // nrhs . nrhs -- the coefficients
|
||||
acceleratorCopyFromDevice(&BLAS_C[0],&HOST_C[0],BLAS_C.size()*sizeof(scalar));
|
||||
grid->GlobalSumVector(&HOST_C[0],nrhs*nrhs);
|
||||
|
||||
RealD t5 = usecond();
|
||||
for(int rr=0;rr<nrhs;rr++){
|
||||
for(int r=0;r<nrhs;r++){
|
||||
int off = r+nrhs*rr;
|
||||
m(r,rr)=HOST_C[off];
|
||||
}
|
||||
}
|
||||
RealD t6 = usecond();
|
||||
uint64_t M=nrhs;
|
||||
uint64_t N=nrhs;
|
||||
uint64_t K=vw;
|
||||
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
|
||||
RealD flops = 8.0*M*N*K;
|
||||
flops = flops/(t4-t3)/1.e3;
|
||||
bytes = bytes/(t4-t3)/1.e3;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t6 "<< t6-t5<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
|
||||
#else
|
||||
int nrhs;
|
||||
GridBase *grid;
|
||||
uint64_t vol;
|
||||
uint64_t words;
|
||||
|
||||
nrhs = X.size();
|
||||
assert(X.size()==Y.size());
|
||||
conformable(X[0],Y[0]);
|
||||
|
||||
grid = X[0].Grid();
|
||||
int rd0 = grid->_rdimensions[0] * grid->_rdimensions[1];
|
||||
vol = grid->oSites()/rd0;
|
||||
words = rd0*sizeof(vector_object)/sizeof(scalar);
|
||||
int64_t vw = vol * words;
|
||||
assert(vw == grid->lSites()*sizeof(scalar_object)/sizeof(scalar));
|
||||
|
||||
RealD t0 = usecond();
|
||||
BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
|
||||
BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
|
||||
BLAS_Cred.resize(nrhs * nrhs * vol);// cost free if size doesn't change
|
||||
RealD t1 = usecond();
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Copy in the multi-rhs sources -- layout batched BLAS ready
|
||||
/////////////////////////////////////////////
|
||||
for(int r=0;r<nrhs;r++){
|
||||
autoView(x_v,X[r],AcceleratorRead);
|
||||
autoView(y_v,Y[r],AcceleratorRead);
|
||||
scalar *from_x=(scalar *)&x_v[0];
|
||||
scalar *from_y=(scalar *)&y_v[0];
|
||||
scalar *BX = &BLAS_X[0];
|
||||
scalar *BY = &BLAS_Y[0];
|
||||
accelerator_for(ssw,vw,1,{
|
||||
uint64_t ss=ssw/words;
|
||||
uint64_t w=ssw%words;
|
||||
uint64_t offset = w+r*words+ss*nrhs*words; // [ss][rhs][words]
|
||||
BX[offset] = from_x[ssw];
|
||||
BY[offset] = from_y[ssw];
|
||||
});
|
||||
}
|
||||
RealD t2 = usecond();
|
||||
|
||||
/*
|
||||
* in Fortran column major notation (cuBlas order)
|
||||
*
|
||||
* Xxr = [X1(x)][..][Xn(x)]
|
||||
*
|
||||
* Yxr = [Y1(x)][..][Ym(x)]
|
||||
*
|
||||
* C_rs = X^dag Y
|
||||
*/
|
||||
Xdip.resize(vol);
|
||||
Ydip.resize(vol);
|
||||
Cdip.resize(vol);
|
||||
std::vector<scalar *> Xh(vol);
|
||||
std::vector<scalar *> Yh(vol);
|
||||
std::vector<scalar *> Ch(vol);
|
||||
for(uint64_t ss=0;ss<vol;ss++){
|
||||
|
||||
Xh[ss] = & BLAS_X[ss*nrhs*words];
|
||||
Yh[ss] = & BLAS_Y[ss*nrhs*words];
|
||||
Ch[ss] = & BLAS_Cred[ss*nrhs*nrhs];
|
||||
|
||||
}
|
||||
acceleratorCopyToDevice(&Xh[0],&Xdip[0],vol*sizeof(scalar *));
|
||||
acceleratorCopyToDevice(&Yh[0],&Ydip[0],vol*sizeof(scalar *));
|
||||
acceleratorCopyToDevice(&Ch[0],&Cdip[0],vol*sizeof(scalar *));
|
||||
|
||||
GridBLAS BLAS;
|
||||
|
||||
RealD t3 = usecond();
|
||||
/////////////////////////////////////////
|
||||
// C_rs = X^dag Y
|
||||
/////////////////////////////////////////
|
||||
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
|
||||
nrhs,nrhs,words,
|
||||
ComplexD(1.0),
|
||||
Xdip,
|
||||
Ydip,
|
||||
ComplexD(0.0), // wipe out C
|
||||
Cdip);
|
||||
BLAS.synchronise();
|
||||
RealD t4 = usecond();
|
||||
|
||||
std::vector<scalar> HOST_C(BLAS_Cred.size()); // nrhs . nrhs -- the coefficients
|
||||
acceleratorCopyFromDevice(&BLAS_Cred[0],&HOST_C[0],BLAS_Cred.size()*sizeof(scalar));
|
||||
|
||||
RealD t5 = usecond();
|
||||
m = Eigen::MatrixXcd::Zero(nrhs,nrhs);
|
||||
for(int ss=0;ss<vol;ss++){
|
||||
Eigen::Map<Eigen::MatrixXcd> eC((std::complex<double> *)&HOST_C[ss*nrhs*nrhs],nrhs,nrhs);
|
||||
m = m + eC;
|
||||
}
|
||||
RealD t6l = usecond();
|
||||
grid->GlobalSumVector((scalar *) &m(0,0),nrhs*nrhs);
|
||||
RealD t6 = usecond();
|
||||
uint64_t M=nrhs;
|
||||
uint64_t N=nrhs;
|
||||
uint64_t K=vw;
|
||||
RealD xybytes = grid->lSites()*sizeof(scalar_object);
|
||||
RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
|
||||
RealD flops = 8.0*M*N*K;
|
||||
flops = flops/(t4-t3)/1.e3;
|
||||
bytes = bytes/(t4-t3)/1.e3;
|
||||
xybytes = 4*xybytes/(t2-t1)/1.e3;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t5 "<< t5-t4<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix lsum t6l "<< t6l-t5<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t6 "<< t6-t6l<<" us"<<std::endl;
|
||||
std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
NAMESPACE_END(Grid);
|
@@ -447,10 +447,10 @@ public:
|
||||
/////////////////////////////////////////
|
||||
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
|
||||
nbasis,nrhs,vw,
|
||||
ComplexD(1.0),
|
||||
scalar(1.0),
|
||||
Vd,
|
||||
Fd,
|
||||
ComplexD(0.0), // wipe out C
|
||||
scalar(0.0), // wipe out C
|
||||
Cd);
|
||||
BLAS.synchronise();
|
||||
// std::cout << "BlockProject done"<<std::endl;
|
||||
@@ -497,10 +497,10 @@ public:
|
||||
int64_t vw = block_vol * words;
|
||||
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
|
||||
vw,nrhs,nbasis,
|
||||
ComplexD(1.0),
|
||||
scalar(1.0),
|
||||
Vd,
|
||||
Cd,
|
||||
ComplexD(0.0), // wipe out C
|
||||
scalar(0.0), // wipe out C
|
||||
Fd);
|
||||
BLAS.synchronise();
|
||||
// std::cout << " blas call done"<<std::endl;
|
||||
|
@@ -182,10 +182,10 @@ public:
|
||||
/////////////////////////////////////////
|
||||
BLAS.gemmBatched(GridBLAS_OP_C,GridBLAS_OP_N,
|
||||
nev,nrhs,vw,
|
||||
ComplexD(1.0),
|
||||
scalar(1.0),
|
||||
Ed,
|
||||
Rd,
|
||||
ComplexD(0.0), // wipe out C
|
||||
scalar(0.0), // wipe out C
|
||||
Cd);
|
||||
BLAS.synchronise();
|
||||
|
||||
@@ -210,10 +210,10 @@ public:
|
||||
/////////////////////////////////////////
|
||||
BLAS.gemmBatched(GridBLAS_OP_N,GridBLAS_OP_N,
|
||||
vw,nrhs,nev,
|
||||
ComplexD(1.0),
|
||||
scalar(1.0),
|
||||
Ed, // x . nev
|
||||
Cd, // nev . nrhs
|
||||
ComplexD(0.0),
|
||||
scalar(0.0),
|
||||
Gd);
|
||||
BLAS.synchronise();
|
||||
|
||||
|
@@ -53,6 +53,7 @@ class TwoLevelCGmrhs
|
||||
// Fine operator, Smoother, CoarseSolver
|
||||
LinearOperatorBase<Field> &_FineLinop;
|
||||
LinearFunction<Field> &_Smoother;
|
||||
MultiRHSBlockCGLinalg<Field> _BlockCGLinalg;
|
||||
|
||||
GridStopWatch ProjectTimer;
|
||||
GridStopWatch PromoteTimer;
|
||||
@@ -62,7 +63,12 @@ class TwoLevelCGmrhs
|
||||
GridStopWatch SmoothTimer;
|
||||
GridStopWatch InsertTimer;
|
||||
|
||||
|
||||
/*
|
||||
Field rrr;
|
||||
Field sss;
|
||||
Field qqq;
|
||||
Field zzz;
|
||||
*/
|
||||
// more most opertor functions
|
||||
TwoLevelCGmrhs(RealD tol,
|
||||
Integer maxit,
|
||||
@@ -73,12 +79,313 @@ class TwoLevelCGmrhs
|
||||
MaxIterations(maxit),
|
||||
_FineLinop(FineLinop),
|
||||
_Smoother(Smoother)
|
||||
/*
|
||||
rrr(fine),
|
||||
sss(fine),
|
||||
qqq(fine),
|
||||
zzz(fine)
|
||||
*/
|
||||
{
|
||||
grid = fine;
|
||||
};
|
||||
|
||||
// Vector case
|
||||
virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
|
||||
{
|
||||
// SolveSingleSystem(src,x);
|
||||
SolvePrecBlockCG(src,x);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Thin QR factorisation (google it)
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
//Dimensions
|
||||
// R_{ferm x Nblock} = Q_{ferm x Nblock} x C_{Nblock x Nblock} -> ferm x Nblock
|
||||
//
|
||||
// Rdag R = m_rr = Herm = L L^dag <-- Cholesky decomposition (LLT routine in Eigen)
|
||||
//
|
||||
// Q C = R => Q = R C^{-1}
|
||||
//
|
||||
// Want Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock}
|
||||
//
|
||||
// Set C = L^{dag}, and then Q^dag Q = ident
|
||||
//
|
||||
// Checks:
|
||||
// Cdag C = Rdag R ; passes.
|
||||
// QdagQ = 1 ; passes
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
void ThinQRfact (Eigen::MatrixXcd &m_zz,
|
||||
Eigen::MatrixXcd &C,
|
||||
Eigen::MatrixXcd &Cinv,
|
||||
std::vector<Field> & Q,
|
||||
std::vector<Field> & MQ,
|
||||
const std::vector<Field> & Z,
|
||||
const std::vector<Field> & MZ)
|
||||
{
|
||||
RealD t0=usecond();
|
||||
_BlockCGLinalg.InnerProductMatrix(m_zz,MZ,Z);
|
||||
RealD t1=usecond();
|
||||
|
||||
m_zz = 0.5*(m_zz+m_zz.adjoint());
|
||||
|
||||
Eigen::MatrixXcd L = m_zz.llt().matrixL();
|
||||
|
||||
C = L.adjoint();
|
||||
Cinv = C.inverse();
|
||||
|
||||
RealD t3=usecond();
|
||||
_BlockCGLinalg.MulMatrix( Q,Cinv,Z);
|
||||
_BlockCGLinalg.MulMatrix(MQ,Cinv,MZ);
|
||||
RealD t4=usecond();
|
||||
std::cout << " ThinQRfact IP :"<< t1-t0<<" us"<<std::endl;
|
||||
std::cout << " ThinQRfact Eigen :"<< t3-t1<<" us"<<std::endl;
|
||||
std::cout << " ThinQRfact MulMat:"<< t4-t3<<" us"<<std::endl;
|
||||
}
|
||||
|
||||
virtual void SolvePrecBlockCG (std::vector<Field> &src, std::vector<Field> &X)
|
||||
{
|
||||
std::cout << GridLogMessage<<"HDCG: mrhs fPrecBlockcg starting"<<std::endl;
|
||||
src[0].Grid()->Barrier();
|
||||
int nrhs = src.size();
|
||||
// std::vector<RealD> f(nrhs);
|
||||
// std::vector<RealD> rtzp(nrhs);
|
||||
// std::vector<RealD> rtz(nrhs);
|
||||
// std::vector<RealD> a(nrhs);
|
||||
// std::vector<RealD> d(nrhs);
|
||||
// std::vector<RealD> b(nrhs);
|
||||
// std::vector<RealD> rptzp(nrhs);
|
||||
|
||||
////////////////////////////////////////////
|
||||
//Initial residual computation & set up
|
||||
////////////////////////////////////////////
|
||||
std::vector<RealD> ssq(nrhs);
|
||||
for(int rhs=0;rhs<nrhs;rhs++){
|
||||
ssq[rhs]=norm2(src[rhs]); assert(ssq[rhs]!=0.0);
|
||||
}
|
||||
|
||||
///////////////////////////
|
||||
// Fields -- eliminate duplicates between fPcg and block cg
|
||||
///////////////////////////
|
||||
std::vector<Field> Mtmp(nrhs,grid);
|
||||
std::vector<Field> tmp(nrhs,grid);
|
||||
std::vector<Field> Z(nrhs,grid); // Rename Z to R
|
||||
std::vector<Field> MZ(nrhs,grid); // Rename MZ to Z
|
||||
std::vector<Field> Q(nrhs,grid); //
|
||||
std::vector<Field> MQ(nrhs,grid); // Rename to P
|
||||
std::vector<Field> D(nrhs,grid);
|
||||
std::vector<Field> AD(nrhs,grid);
|
||||
|
||||
/************************************************************************
|
||||
* Preconditioned Block conjugate gradient rQ
|
||||
* Generalise Sebastien Birk Thesis, after Dubrulle 2001.
|
||||
* Introduce preconditioning following Saad Ch9
|
||||
************************************************************************
|
||||
* Dimensions:
|
||||
*
|
||||
* X,B etc... ==(Nferm x nrhs)
|
||||
* Matrix A==(Nferm x Nferm)
|
||||
*
|
||||
* Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
|
||||
* QC => Thin QR factorisation (google it)
|
||||
*
|
||||
* R = B-AX
|
||||
* Z = Mi R
|
||||
* QC = Z
|
||||
* D = Q
|
||||
* for k:
|
||||
* R = AD
|
||||
* Z = Mi R
|
||||
* M = [D^dag R]^{-1}
|
||||
* X = X + D M C
|
||||
* QS = Q - Z.M
|
||||
* D = Q + D S^dag
|
||||
* C = S C
|
||||
*/
|
||||
Eigen::MatrixXcd m_DZ = Eigen::MatrixXcd::Identity(nrhs,nrhs);
|
||||
Eigen::MatrixXcd m_M = Eigen::MatrixXcd::Identity(nrhs,nrhs);
|
||||
Eigen::MatrixXcd m_zz = Eigen::MatrixXcd::Zero(nrhs,nrhs);
|
||||
Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(nrhs,nrhs);
|
||||
|
||||
Eigen::MatrixXcd m_C = Eigen::MatrixXcd::Zero(nrhs,nrhs);
|
||||
Eigen::MatrixXcd m_Cinv = Eigen::MatrixXcd::Zero(nrhs,nrhs);
|
||||
Eigen::MatrixXcd m_S = Eigen::MatrixXcd::Zero(nrhs,nrhs);
|
||||
Eigen::MatrixXcd m_Sinv = Eigen::MatrixXcd::Zero(nrhs,nrhs);
|
||||
|
||||
Eigen::MatrixXcd m_tmp = Eigen::MatrixXcd::Identity(nrhs,nrhs);
|
||||
Eigen::MatrixXcd m_tmp1 = Eigen::MatrixXcd::Identity(nrhs,nrhs);
|
||||
|
||||
GridStopWatch HDCGTimer;
|
||||
|
||||
//////////////////////////
|
||||
// x0 = Vstart -- possibly modify guess
|
||||
//////////////////////////
|
||||
Vstart(X,src);
|
||||
|
||||
//////////////////////////
|
||||
// R = B-AX
|
||||
//////////////////////////
|
||||
for(int rhs=0;rhs<nrhs;rhs++){
|
||||
// r0 = b -A x0
|
||||
_FineLinop.HermOp(X[rhs],tmp[rhs]);
|
||||
axpy (Z[rhs], -1.0,tmp[rhs], src[rhs]); // Computes R=Z=src - A X0
|
||||
}
|
||||
|
||||
//////////////////////////////////
|
||||
// Compute MZ = M1 Z = M1 B - M1 A x0
|
||||
//////////////////////////////////
|
||||
PcgM1(Z,MZ);
|
||||
|
||||
//////////////////////////////////
|
||||
// QC = Z
|
||||
//////////////////////////////////
|
||||
ThinQRfact (m_zz, m_C, m_Cinv, Q, MQ, Z, MZ);
|
||||
|
||||
//////////////////////////////////
|
||||
// D=MQ
|
||||
//////////////////////////////////
|
||||
for(int b=0;b<nrhs;b++) D[b]=MQ[b]; // LLT rotation of the MZ basis of search dirs
|
||||
|
||||
std::cout << GridLogMessage<<"PrecBlockCGrQ vec computed initial residual and QR fact " <<std::endl;
|
||||
|
||||
ProjectTimer.Reset();
|
||||
PromoteTimer.Reset();
|
||||
DeflateTimer.Reset();
|
||||
CoarseTimer.Reset();
|
||||
SmoothTimer.Reset();
|
||||
FineTimer.Reset();
|
||||
InsertTimer.Reset();
|
||||
|
||||
GridStopWatch M1Timer;
|
||||
GridStopWatch M2Timer;
|
||||
GridStopWatch M3Timer;
|
||||
GridStopWatch LinalgTimer;
|
||||
GridStopWatch InnerProdTimer;
|
||||
|
||||
HDCGTimer.Start();
|
||||
|
||||
std::vector<RealD> rn(nrhs);
|
||||
for (int k=0;k<=MaxIterations;k++){
|
||||
|
||||
////////////////////
|
||||
// Z = AD
|
||||
////////////////////
|
||||
M3Timer.Start();
|
||||
for(int b=0;b<nrhs;b++) _FineLinop.HermOp(D[b], Z[b]);
|
||||
M3Timer.Stop();
|
||||
|
||||
////////////////////
|
||||
// MZ = M1 Z <==== the Multigrid preconditioner
|
||||
////////////////////
|
||||
M1Timer.Start();
|
||||
PcgM1(Z,MZ);
|
||||
M1Timer.Stop();
|
||||
|
||||
FineTimer.Start();
|
||||
////////////////////
|
||||
// M = [D^dag Z]^{-1} = (<Ddag MZ>_M)^{-1} inner prod, generalising Saad derivation of Precon CG
|
||||
////////////////////
|
||||
InnerProdTimer.Start();
|
||||
_BlockCGLinalg.InnerProductMatrix(m_DZ,D,Z);
|
||||
InnerProdTimer.Stop();
|
||||
m_M = m_DZ.inverse();
|
||||
|
||||
///////////////////////////
|
||||
// X = X + D MC
|
||||
///////////////////////////
|
||||
m_tmp = m_M * m_C;
|
||||
LinalgTimer.Start();
|
||||
_BlockCGLinalg.MaddMatrix(X,m_tmp, D,X); // D are the search directions and X takes the updates
|
||||
LinalgTimer.Stop();
|
||||
|
||||
///////////////////////////
|
||||
// QS = Q - M Z
|
||||
// (MQ) S = MQ - M (M1Z)
|
||||
///////////////////////////
|
||||
LinalgTimer.Start();
|
||||
_BlockCGLinalg.MaddMatrix(tmp ,m_M, Z, Q,-1.0);
|
||||
_BlockCGLinalg.MaddMatrix(Mtmp,m_M,MZ,MQ,-1.0);
|
||||
ThinQRfact (m_zz, m_S, m_Sinv, Q, MQ, tmp, Mtmp);
|
||||
LinalgTimer.Stop();
|
||||
|
||||
////////////////////////////
|
||||
// D = MQ + D S^dag
|
||||
////////////////////////////
|
||||
m_tmp = m_S.adjoint();
|
||||
LinalgTimer.Start();
|
||||
_BlockCGLinalg.MaddMatrix(D,m_tmp,D,MQ);
|
||||
LinalgTimer.Stop();
|
||||
|
||||
////////////////////////////
|
||||
// C = S C
|
||||
////////////////////////////
|
||||
m_C = m_S*m_C;
|
||||
|
||||
////////////////////////////
|
||||
// convergence monitor
|
||||
////////////////////////////
|
||||
m_rr = m_C.adjoint() * m_C;
|
||||
|
||||
FineTimer.Stop();
|
||||
|
||||
RealD max_resid=0;
|
||||
RealD rrsum=0;
|
||||
RealD sssum=0;
|
||||
RealD rr;
|
||||
|
||||
for(int b=0;b<nrhs;b++) {
|
||||
rrsum+=real(m_rr(b,b));
|
||||
sssum+=ssq[b];
|
||||
rr = real(m_rr(b,b))/ssq[b];
|
||||
if ( rr > max_resid ) max_resid = rr;
|
||||
}
|
||||
std::cout << GridLogMessage <<
|
||||
"\t Prec BlockCGrQ Iteration "<<k<<" ave resid "<< std::sqrt(rrsum/sssum) << " max "<< std::sqrt(max_resid) <<std::endl;
|
||||
|
||||
|
||||
if ( max_resid < Tolerance*Tolerance ) {
|
||||
|
||||
HDCGTimer.Stop();
|
||||
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;;
|
||||
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Linalg "<<LinalgTimer.Elapsed()<<std::endl;;
|
||||
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : fine H "<<M3Timer.Elapsed()<<std::endl;;
|
||||
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : prec M1 "<<M1Timer.Elapsed()<<std::endl;;
|
||||
std::cout<<GridLogMessage<<"**** M1 breakdown:"<<std::endl;
|
||||
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Project "<<ProjectTimer.Elapsed()<<std::endl;;
|
||||
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Promote "<<PromoteTimer.Elapsed()<<std::endl;;
|
||||
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Deflate "<<DeflateTimer.Elapsed()<<std::endl;;
|
||||
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Coarse "<<CoarseTimer.Elapsed()<<std::endl;;
|
||||
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Fine "<<FineTimer.Elapsed()<<std::endl;;
|
||||
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Smooth "<<SmoothTimer.Elapsed()<<std::endl;;
|
||||
std::cout<<GridLogMessage<<"HDCG: mrhs PrecBlockCGrQ : Insert "<<InsertTimer.Elapsed()<<std::endl;;
|
||||
|
||||
for(int rhs=0;rhs<nrhs;rhs++){
|
||||
|
||||
_FineLinop.HermOp(X[rhs],tmp[rhs]);
|
||||
|
||||
Field mytmp(grid);
|
||||
axpy(mytmp,-1.0,src[rhs],tmp[rhs]);
|
||||
|
||||
RealD xnorm = sqrt(norm2(X[rhs]));
|
||||
RealD srcnorm = sqrt(norm2(src[rhs]));
|
||||
RealD tmpnorm = sqrt(norm2(mytmp));
|
||||
RealD true_residual = tmpnorm/srcnorm;
|
||||
std::cout<<GridLogMessage
|
||||
<<"HDCG: true residual ["<<rhs<<"] is "<<true_residual
|
||||
<<" solution "<<xnorm
|
||||
<<" source "<<srcnorm
|
||||
<<std::endl;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
}
|
||||
HDCGTimer.Stop();
|
||||
std::cout<<GridLogMessage<<"HDCG: PrecBlockCGrQ not converged "<<HDCGTimer.Elapsed()<<std::endl;
|
||||
assert(0);
|
||||
}
|
||||
|
||||
virtual void SolveSingleSystem (std::vector<Field> &src, std::vector<Field> &x)
|
||||
{
|
||||
std::cout << GridLogMessage<<"HDCG: mrhs fPcg starting"<<std::endl;
|
||||
src[0].Grid()->Barrier();
|
||||
@@ -361,15 +668,26 @@ public:
|
||||
CoarseField PleftProjMrhs(this->coarsegridmrhs);
|
||||
CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
|
||||
|
||||
for(int rhs=0;rhs<nrhs;rhs++) {
|
||||
// this->rrr=in[0];
|
||||
|
||||
#undef SMOOTHER_BLOCK_SOLVE
|
||||
#if SMOOTHER_BLOCK_SOLVE
|
||||
this->SmoothTimer.Start();
|
||||
this->_Smoother(in,Min);
|
||||
this->SmoothTimer.Stop();
|
||||
#else
|
||||
for(int rhs=0;rhs<nrhs;rhs++) {
|
||||
this->SmoothTimer.Start();
|
||||
this->_Smoother(in[rhs],Min[rhs]);
|
||||
this->SmoothTimer.Stop();
|
||||
}
|
||||
#endif
|
||||
// this->sss=Min[0];
|
||||
|
||||
for(int rhs=0;rhs<nrhs;rhs++) {
|
||||
|
||||
this->FineTimer.Start();
|
||||
this->_FineLinop.HermOp(Min[rhs],out[rhs]);
|
||||
|
||||
axpy(tmp[rhs],-1.0,out[rhs],in[rhs]); // resid = in - A Min
|
||||
this->FineTimer.Stop();
|
||||
|
||||
@@ -401,9 +719,11 @@ public:
|
||||
this->_Projector.blockPromote(tmp,PleftMss_proj);// tmp= Q[in - A Min]
|
||||
this->PromoteTimer.Stop();
|
||||
this->FineTimer.Start();
|
||||
// this->qqq=tmp[0];
|
||||
for(int rhs=0;rhs<nrhs;rhs++) {
|
||||
axpy(out[rhs],1.0,Min[rhs],tmp[rhs]); // Min+tmp
|
||||
}
|
||||
// this->zzz=out[0];
|
||||
this->FineTimer.Stop();
|
||||
}
|
||||
};
|
||||
|
@@ -31,6 +31,58 @@ directory
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
template<class Field>
|
||||
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
|
||||
typedef typename Field::scalar_type scomplex;
|
||||
int Nblock = X.size();
|
||||
for(int b=0;b<Nblock;b++){
|
||||
for(int bp=0;bp<Nblock;bp++) {
|
||||
m(b,bp) = innerProduct(X[b],Y[bp]);
|
||||
}}
|
||||
}
|
||||
template<class Field>
|
||||
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
|
||||
// Should make this cache friendly with site outermost, parallel_for
|
||||
// Deal with case AP aliases with either Y or X
|
||||
//
|
||||
//Could pack "X" and "AP" into a Nblock x Volume dense array.
|
||||
// AP(Nrhs x vol) = Y(Nrhs x vol) + scale * m(nrhs x nrhs) * X(nrhs*vol)
|
||||
typedef typename Field::scalar_type scomplex;
|
||||
int Nblock = AP.size();
|
||||
std::vector<Field> tmp(Nblock,X[0]);
|
||||
for(int b=0;b<Nblock;b++){
|
||||
tmp[b] = Y[b];
|
||||
for(int bp=0;bp<Nblock;bp++) {
|
||||
tmp[b] = tmp[b] +scomplex(scale*m(bp,b))*X[bp];
|
||||
}
|
||||
}
|
||||
for(int b=0;b<Nblock;b++){
|
||||
AP[b] = tmp[b];
|
||||
}
|
||||
}
|
||||
template<class Field>
|
||||
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
|
||||
// Should make this cache friendly with site outermost, parallel_for
|
||||
typedef typename Field::scalar_type scomplex;
|
||||
int Nblock = AP.size();
|
||||
for(int b=0;b<Nblock;b++){
|
||||
AP[b] = Zero();
|
||||
for(int bp=0;bp<Nblock;bp++) {
|
||||
AP[b] += scomplex(m(bp,b))*X[bp];
|
||||
}
|
||||
}
|
||||
}
|
||||
template<class Field>
|
||||
double normv(const std::vector<Field> &P){
|
||||
int Nblock = P.size();
|
||||
double nn = 0.0;
|
||||
for(int b=0;b<Nblock;b++) {
|
||||
nn+=norm2(P[b]);
|
||||
}
|
||||
return nn;
|
||||
}
|
||||
|
||||
|
||||
enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS, BlockCGVec, BlockCGrQVec };
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
@@ -87,10 +139,19 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
|
||||
sliceInnerProductMatrix(m_rr,R,R,Orthog);
|
||||
|
||||
// Force manifest hermitian to avoid rounding related
|
||||
/*
|
||||
int rank=m_rr.rows();
|
||||
for(int r=0;r<rank;r++){
|
||||
for(int s=0;s<rank;s++){
|
||||
std::cout << "QR m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
|
||||
}}
|
||||
*/
|
||||
m_rr = 0.5*(m_rr+m_rr.adjoint());
|
||||
|
||||
Eigen::MatrixXcd L = m_rr.llt().matrixL();
|
||||
|
||||
// ComplexD det = L.determinant();
|
||||
// std::cout << " Det m_rr "<<det<<std::endl;
|
||||
C = L.adjoint();
|
||||
Cinv = C.inverse();
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
@@ -110,11 +171,20 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
|
||||
const std::vector<Field> & R)
|
||||
{
|
||||
InnerProductMatrix(m_rr,R,R);
|
||||
|
||||
/*
|
||||
int rank=m_rr.rows();
|
||||
for(int r=0;r<rank;r++){
|
||||
for(int s=0;s<rank;s++){
|
||||
std::cout << "QRvec m_rr["<<r<<","<<s<<"] "<<m_rr(r,s)<<std::endl;
|
||||
}}
|
||||
*/
|
||||
m_rr = 0.5*(m_rr+m_rr.adjoint());
|
||||
|
||||
Eigen::MatrixXcd L = m_rr.llt().matrixL();
|
||||
|
||||
// ComplexD det = L.determinant();
|
||||
// std::cout << " Det m_rr "<<det<<std::endl;
|
||||
|
||||
C = L.adjoint();
|
||||
Cinv = C.inverse();
|
||||
|
||||
@@ -186,6 +256,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
|
||||
sliceNorm(ssq,B,Orthog);
|
||||
RealD sssum=0;
|
||||
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
|
||||
for(int b=0;b<Nblock;b++) std::cout << "src["<<b<<"]" << ssq[b] <<std::endl;
|
||||
|
||||
sliceNorm(residuals,B,Orthog);
|
||||
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
|
||||
@@ -221,6 +292,9 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
|
||||
Linop.HermOp(X, AD);
|
||||
tmp = B - AD;
|
||||
|
||||
sliceNorm(residuals,tmp,Orthog);
|
||||
for(int b=0;b<Nblock;b++) std::cout << "res["<<b<<"]" << residuals[b] <<std::endl;
|
||||
|
||||
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
|
||||
D=Q;
|
||||
|
||||
@@ -236,6 +310,8 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
|
||||
GridStopWatch SolverTimer;
|
||||
SolverTimer.Start();
|
||||
|
||||
RealD max_resid=0;
|
||||
|
||||
int k;
|
||||
for (k = 1; k <= MaxIterations; k++) {
|
||||
|
||||
@@ -280,7 +356,7 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
|
||||
*/
|
||||
m_rr = m_C.adjoint() * m_C;
|
||||
|
||||
RealD max_resid=0;
|
||||
max_resid=0;
|
||||
RealD rrsum=0;
|
||||
RealD rr;
|
||||
|
||||
@@ -322,7 +398,9 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
|
||||
}
|
||||
|
||||
}
|
||||
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
|
||||
|
||||
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge "<<k<<" / "<<MaxIterations
|
||||
<<" residual "<< std::sqrt(max_resid)<< std::endl;
|
||||
|
||||
if (ErrorOnNoConverge) assert(0);
|
||||
IterationsToComplete = k;
|
||||
@@ -466,43 +544,6 @@ void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &
|
||||
IterationsToComplete = k;
|
||||
}
|
||||
|
||||
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y){
|
||||
for(int b=0;b<Nblock;b++){
|
||||
for(int bp=0;bp<Nblock;bp++) {
|
||||
m(b,bp) = innerProduct(X[b],Y[bp]);
|
||||
}}
|
||||
}
|
||||
void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0){
|
||||
// Should make this cache friendly with site outermost, parallel_for
|
||||
// Deal with case AP aliases with either Y or X
|
||||
std::vector<Field> tmp(Nblock,X[0]);
|
||||
for(int b=0;b<Nblock;b++){
|
||||
tmp[b] = Y[b];
|
||||
for(int bp=0;bp<Nblock;bp++) {
|
||||
tmp[b] = tmp[b] + scomplex(scale*m(bp,b))*X[bp];
|
||||
}
|
||||
}
|
||||
for(int b=0;b<Nblock;b++){
|
||||
AP[b] = tmp[b];
|
||||
}
|
||||
}
|
||||
void MulMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X){
|
||||
// Should make this cache friendly with site outermost, parallel_for
|
||||
for(int b=0;b<Nblock;b++){
|
||||
AP[b] = Zero();
|
||||
for(int bp=0;bp<Nblock;bp++) {
|
||||
AP[b] += scomplex(m(bp,b))*X[bp];
|
||||
}
|
||||
}
|
||||
}
|
||||
double normv(const std::vector<Field> &P){
|
||||
double nn = 0.0;
|
||||
for(int b=0;b<Nblock;b++) {
|
||||
nn+=norm2(P[b]);
|
||||
}
|
||||
return nn;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// BlockCGrQvec implementation:
|
||||
//--------------------------
|
||||
@@ -549,6 +590,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
|
||||
|
||||
RealD sssum=0;
|
||||
for(int b=0;b<Nblock;b++){ ssq[b] = norm2(B[b]);}
|
||||
for(int b=0;b<Nblock;b++){ std::cout << "ssq["<<b<<"] "<<ssq[b]<<std::endl;}
|
||||
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
|
||||
|
||||
for(int b=0;b<Nblock;b++){ residuals[b] = norm2(B[b]);}
|
||||
@@ -585,6 +627,7 @@ void BlockCGrQsolveVec(LinearOperatorBase<Field> &Linop, const std::vector<Field
|
||||
for(int b=0;b<Nblock;b++) {
|
||||
Linop.HermOp(X[b], AD[b]);
|
||||
tmp[b] = B[b] - AD[b];
|
||||
std::cout << "r0["<<b<<"] "<<norm2(tmp[b])<<std::endl;
|
||||
}
|
||||
|
||||
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
|
||||
|
@@ -38,6 +38,7 @@ NAMESPACE_BEGIN(Grid);
|
||||
// single input vec, single output vec.
|
||||
/////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
template <class Field>
|
||||
class ConjugateGradient : public OperatorFunction<Field> {
|
||||
public:
|
||||
@@ -57,10 +58,22 @@ public:
|
||||
ErrorOnNoConverge(err_on_no_conv)
|
||||
{};
|
||||
|
||||
virtual void LogIteration(int k,RealD a,RealD b){
|
||||
// std::cout << "ConjugageGradient::LogIteration() "<<std::endl;
|
||||
};
|
||||
virtual void LogBegin(void){
|
||||
std::cout << "ConjugageGradient::LogBegin() "<<std::endl;
|
||||
};
|
||||
|
||||
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
|
||||
|
||||
this->LogBegin();
|
||||
|
||||
GRID_TRACE("ConjugateGradient");
|
||||
GridStopWatch PreambleTimer;
|
||||
GridStopWatch ConstructTimer;
|
||||
GridStopWatch NormTimer;
|
||||
GridStopWatch AssignTimer;
|
||||
PreambleTimer.Start();
|
||||
psi.Checkerboard() = src.Checkerboard();
|
||||
|
||||
@@ -70,14 +83,19 @@ public:
|
||||
//RealD b_pred;
|
||||
|
||||
// Was doing copies
|
||||
ConstructTimer.Start();
|
||||
Field p (src.Grid());
|
||||
Field mmp(src.Grid());
|
||||
Field r (src.Grid());
|
||||
ConstructTimer.Stop();
|
||||
|
||||
// Initial residual computation & set up
|
||||
NormTimer.Start();
|
||||
ssq = norm2(src);
|
||||
RealD guess = norm2(psi);
|
||||
NormTimer.Stop();
|
||||
assert(std::isnan(guess) == 0);
|
||||
AssignTimer.Start();
|
||||
if ( guess == 0.0 ) {
|
||||
r = src;
|
||||
p = r;
|
||||
@@ -89,6 +107,7 @@ public:
|
||||
a = norm2(p);
|
||||
}
|
||||
cp = a;
|
||||
AssignTimer.Stop();
|
||||
|
||||
// Handle trivial case of zero src
|
||||
if (ssq == 0.){
|
||||
@@ -164,6 +183,7 @@ public:
|
||||
}
|
||||
LinearCombTimer.Stop();
|
||||
LinalgTimer.Stop();
|
||||
LogIteration(k,a,b);
|
||||
|
||||
IterationTimer.Stop();
|
||||
if ( (k % 500) == 0 ) {
|
||||
@@ -220,6 +240,9 @@ public:
|
||||
<<" residual "<< std::sqrt(cp / ssq)<< std::endl;
|
||||
SolverTimer.Stop();
|
||||
std::cout << GridLogMessage << "\tPreamble " << PreambleTimer.Elapsed() <<std::endl;
|
||||
std::cout << GridLogMessage << "\tConstruct " << ConstructTimer.Elapsed() <<std::endl;
|
||||
std::cout << GridLogMessage << "\tNorm " << NormTimer.Elapsed() <<std::endl;
|
||||
std::cout << GridLogMessage << "\tAssign " << AssignTimer.Elapsed() <<std::endl;
|
||||
std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
|
||||
std::cout << GridLogMessage << "Solver breakdown "<<std::endl;
|
||||
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
|
||||
@@ -233,5 +256,118 @@ public:
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <class Field>
|
||||
class ConjugateGradientPolynomial : public ConjugateGradient<Field> {
|
||||
public:
|
||||
// Optionally record the CG polynomial
|
||||
std::vector<double> ak;
|
||||
std::vector<double> bk;
|
||||
std::vector<double> poly_p;
|
||||
std::vector<double> poly_r;
|
||||
std::vector<double> poly_Ap;
|
||||
std::vector<double> polynomial;
|
||||
|
||||
public:
|
||||
ConjugateGradientPolynomial(RealD tol, Integer maxit, bool err_on_no_conv = true)
|
||||
: ConjugateGradient<Field>(tol,maxit,err_on_no_conv)
|
||||
{ };
|
||||
void PolyHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
|
||||
{
|
||||
Field tmp(src.Grid());
|
||||
Field AtoN(src.Grid());
|
||||
AtoN = src;
|
||||
psi=AtoN*polynomial[0];
|
||||
for(int n=1;n<polynomial.size();n++){
|
||||
tmp = AtoN;
|
||||
Linop.HermOp(tmp,AtoN);
|
||||
psi = psi + polynomial[n]*AtoN;
|
||||
}
|
||||
}
|
||||
void CGsequenceHermOp(LinearOperatorBase<Field> &Linop, const Field &src, Field &x)
|
||||
{
|
||||
Field Ap(src.Grid());
|
||||
Field r(src.Grid());
|
||||
Field p(src.Grid());
|
||||
p=src;
|
||||
r=src;
|
||||
x=Zero();
|
||||
x.Checkerboard()=src.Checkerboard();
|
||||
for(int k=0;k<ak.size();k++){
|
||||
x = x + ak[k]*p;
|
||||
Linop.HermOp(p,Ap);
|
||||
r = r - ak[k] * Ap;
|
||||
p = r + bk[k] * p;
|
||||
}
|
||||
}
|
||||
void Solve(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi)
|
||||
{
|
||||
psi=Zero();
|
||||
this->operator ()(Linop,src,psi);
|
||||
}
|
||||
virtual void LogBegin(void)
|
||||
{
|
||||
std::cout << "ConjugageGradientPolynomial::LogBegin() "<<std::endl;
|
||||
ak.resize(0);
|
||||
bk.resize(0);
|
||||
polynomial.resize(0);
|
||||
poly_Ap.resize(0);
|
||||
poly_Ap.resize(0);
|
||||
poly_p.resize(1);
|
||||
poly_r.resize(1);
|
||||
poly_p[0]=1.0;
|
||||
poly_r[0]=1.0;
|
||||
};
|
||||
virtual void LogIteration(int k,RealD a,RealD b)
|
||||
{
|
||||
// With zero guess,
|
||||
// p = r = src
|
||||
//
|
||||
// iterate:
|
||||
// x = x + a p
|
||||
// r = r - a A p
|
||||
// p = r + b p
|
||||
//
|
||||
// [0]
|
||||
// r = x
|
||||
// p = x
|
||||
// Ap=0
|
||||
//
|
||||
// [1]
|
||||
// Ap = A x + 0 ==> shift poly P right by 1 and add 0.
|
||||
// x = x + a p ==> add polynomials term by term
|
||||
// r = r - a A p ==> add polynomials term by term
|
||||
// p = r + b p ==> add polynomials term by term
|
||||
//
|
||||
std::cout << "ConjugageGradientPolynomial::LogIteration() "<<k<<std::endl;
|
||||
ak.push_back(a);
|
||||
bk.push_back(b);
|
||||
// Ap= right_shift(p)
|
||||
poly_Ap.resize(k+1);
|
||||
poly_Ap[0]=0.0;
|
||||
for(int i=0;i<k;i++){
|
||||
poly_Ap[i+1]=poly_p[i];
|
||||
}
|
||||
|
||||
// x = x + a p
|
||||
polynomial.resize(k);
|
||||
polynomial[k-1]=0.0;
|
||||
for(int i=0;i<k;i++){
|
||||
polynomial[i] = polynomial[i] + a * poly_p[i];
|
||||
}
|
||||
|
||||
// r = r - a Ap
|
||||
// p = r + b p
|
||||
poly_r.resize(k+1);
|
||||
poly_p.resize(k+1);
|
||||
poly_r[k] = poly_p[k] = 0.0;
|
||||
for(int i=0;i<k+1;i++){
|
||||
poly_r[i] = poly_r[i] - a * poly_Ap[i];
|
||||
poly_p[i] = poly_r[i] + b * poly_p[i];
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
#endif
|
||||
|
@@ -116,14 +116,14 @@ NAMESPACE_BEGIN(Grid);
|
||||
//Compute double precision rsd and also new RHS vector.
|
||||
Linop_d.HermOp(sol_d, tmp_d);
|
||||
RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
|
||||
|
||||
std::cout<<GridLogMessage<<" rsd norm "<<norm<<std::endl;
|
||||
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
|
||||
|
||||
if(norm < OuterLoopNormMult * stop){
|
||||
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
|
||||
break;
|
||||
}
|
||||
while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
|
||||
while(norm * inner_tol * inner_tol < stop*1.01) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
|
||||
|
||||
PrecChangeTimer.Start();
|
||||
precisionChange(src_f, src_d, pc_wk_dp_to_sp);
|
||||
|
@@ -102,11 +102,11 @@ public:
|
||||
assert(mass.size()==nshift);
|
||||
assert(mresidual.size()==nshift);
|
||||
|
||||
// dynamic sized arrays on stack; 2d is a pain with vector
|
||||
RealD bs[nshift];
|
||||
RealD rsq[nshift];
|
||||
RealD z[nshift][2];
|
||||
int converged[nshift];
|
||||
// remove dynamic sized arrays on stack; 2d is a pain with vector
|
||||
std::vector<RealD> bs(nshift);
|
||||
std::vector<RealD> rsq(nshift);
|
||||
std::vector<std::array<RealD,2> > z(nshift);
|
||||
std::vector<int> converged(nshift);
|
||||
|
||||
const int primary =0;
|
||||
|
||||
|
@@ -123,11 +123,11 @@ public:
|
||||
assert(mresidual.size()==nshift);
|
||||
|
||||
// dynamic sized arrays on stack; 2d is a pain with vector
|
||||
RealD bs[nshift];
|
||||
RealD rsq[nshift];
|
||||
RealD rsqf[nshift];
|
||||
RealD z[nshift][2];
|
||||
int converged[nshift];
|
||||
std::vector<RealD> bs(nshift);
|
||||
std::vector<RealD> rsq(nshift);
|
||||
std::vector<RealD> rsqf(nshift);
|
||||
std::vector<std::array<RealD,2> > z(nshift);
|
||||
std::vector<int> converged(nshift);
|
||||
|
||||
const int primary =0;
|
||||
|
||||
|
@@ -156,11 +156,11 @@ public:
|
||||
assert(mresidual.size()==nshift);
|
||||
|
||||
// dynamic sized arrays on stack; 2d is a pain with vector
|
||||
RealD bs[nshift];
|
||||
RealD rsq[nshift];
|
||||
RealD rsqf[nshift];
|
||||
RealD z[nshift][2];
|
||||
int converged[nshift];
|
||||
std::vector<RealD> bs(nshift);
|
||||
std::vector<RealD> rsq(nshift);
|
||||
std::vector<RealD> rsqf(nshift);
|
||||
std::vector<std::array<RealD,2> > z(nshift);
|
||||
std::vector<int> converged(nshift);
|
||||
|
||||
const int primary =0;
|
||||
|
||||
|
@@ -279,16 +279,16 @@ public:
|
||||
Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
|
||||
diagonalize(eval2,lmd2,lme2,Nu,Nm,Nm,Qt,grid);
|
||||
_sort.push(eval2,Nm);
|
||||
Glog << "#Ritz value before shift: "<< std::endl;
|
||||
// Glog << "#Ritz value before shift: "<< std::endl;
|
||||
for(int i=0; i<Nm; ++i){
|
||||
std::cout.precision(13);
|
||||
std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
|
||||
std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
|
||||
// std::cout.precision(13);
|
||||
// std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
|
||||
// std::cout << "Rval = "<<std::setw(20)<< std::setiosflags(std::ios_base::left)<< eval2[i] << std::endl;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
if ( Nm>Nk ) {
|
||||
Glog <<" #Apply shifted QR transformations "<<std::endl;
|
||||
// Glog <<" #Apply shifted QR transformations "<<std::endl;
|
||||
//int k2 = Nk+Nu;
|
||||
int k2 = Nk;
|
||||
|
||||
@@ -326,7 +326,7 @@ public:
|
||||
Qt = Eigen::MatrixXcd::Identity(Nm,Nm);
|
||||
diagonalize(eval2,lmd2,lme2,Nu,Nk,Nm,Qt,grid);
|
||||
_sort.push(eval2,Nk);
|
||||
Glog << "#Ritz value after shift: "<< std::endl;
|
||||
// Glog << "#Ritz value after shift: "<< std::endl;
|
||||
for(int i=0; i<Nk; ++i){
|
||||
// std::cout.precision(13);
|
||||
// std::cout << "[" << std::setw(4)<< std::setiosflags(std::ios_base::right) <<i<<"] ";
|
||||
@@ -644,7 +644,7 @@ private:
|
||||
// for (int u=0; u<mrhs; ++u) Glog << " out["<<u<<"] = "<<norm2(out[u])<<std::endl;
|
||||
k_start +=mrhs;
|
||||
}
|
||||
Glog << "LinAlg "<< std::endl;
|
||||
// Glog << "LinAlg "<< std::endl;
|
||||
|
||||
if (b>0) {
|
||||
for (int u=0; u<Nu; ++u) {
|
||||
@@ -678,7 +678,7 @@ private:
|
||||
}
|
||||
w_copy[u] = w[u];
|
||||
}
|
||||
Glog << "LinAlg done"<< std::endl;
|
||||
// Glog << "LinAlg done"<< std::endl;
|
||||
|
||||
// In block version, the steps 6 and 7 in Lanczos construction is
|
||||
// replaced by the QR decomposition of new basis block.
|
||||
@@ -691,15 +691,15 @@ private:
|
||||
}
|
||||
|
||||
// re-orthogonalization for numerical stability
|
||||
Glog << "Gram Schmidt"<< std::endl;
|
||||
// Glog << "Gram Schmidt"<< std::endl;
|
||||
orthogonalize(w,Nu,evec,R);
|
||||
// QR part
|
||||
for (int u=1; u<Nu; ++u) {
|
||||
orthogonalize(w[u],w,u);
|
||||
}
|
||||
Glog << "Gram Schmidt done "<< std::endl;
|
||||
// Glog << "Gram Schmidt done "<< std::endl;
|
||||
|
||||
Glog << "LinAlg "<< std::endl;
|
||||
// Glog << "LinAlg "<< std::endl;
|
||||
for (int u=0; u<Nu; ++u) {
|
||||
//for (int v=0; v<Nu; ++v) {
|
||||
for (int v=u; v<Nu; ++v) {
|
||||
@@ -716,7 +716,7 @@ private:
|
||||
// Glog <<" In block "<< b << "," <<" beta[" << u << "," << k-L << "] = " << lme[u][k] << std::endl;
|
||||
}
|
||||
}
|
||||
Glog << "LinAlg done "<< std::endl;
|
||||
// Glog << "LinAlg done "<< std::endl;
|
||||
|
||||
if (b < Nm/Nu-1) {
|
||||
for (int u=0; u<Nu; ++u) {
|
||||
@@ -935,7 +935,7 @@ if (1){
|
||||
int Nu, int Nb, int Nk, int Nm,
|
||||
Eigen::MatrixXcd& M)
|
||||
{
|
||||
Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n';
|
||||
// Glog << "unpackHermitBlockTriDiagMatToEigen() begin" << '\n';
|
||||
assert( Nk%Nu == 0 && Nm%Nu == 0 );
|
||||
assert( Nk <= Nm );
|
||||
M = Eigen::MatrixXcd::Zero(Nk,Nk);
|
||||
@@ -953,7 +953,7 @@ if (1){
|
||||
M(u+(k/Nu)*Nu,k-Nu) = lme[u][k-Nu];
|
||||
}
|
||||
}
|
||||
Glog << "unpackHermitBlockTriDiagMatToEigen() end" << std::endl;
|
||||
// Glog << "unpackHermitBlockTriDiagMatToEigen() end" << std::endl;
|
||||
}
|
||||
|
||||
|
||||
@@ -963,7 +963,7 @@ if (1){
|
||||
int Nu, int Nb, int Nk, int Nm,
|
||||
Eigen::MatrixXcd& M)
|
||||
{
|
||||
Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n';
|
||||
// Glog << "packHermitBlockTriDiagMatfromEigen() begin" << '\n';
|
||||
assert( Nk%Nu == 0 && Nm%Nu == 0 );
|
||||
assert( Nk <= Nm );
|
||||
|
||||
@@ -979,7 +979,7 @@ if (1){
|
||||
lme[u][k-Nu] = M(u+(k/Nu)*Nu,k-Nu);
|
||||
}
|
||||
}
|
||||
Glog << "packHermitBlockTriDiagMatfromEigen() end" <<std::endl;
|
||||
// Glog << "packHermitBlockTriDiagMatfromEigen() end" <<std::endl;
|
||||
}
|
||||
|
||||
|
||||
@@ -988,7 +988,7 @@ if (1){
|
||||
RealD Dsh,
|
||||
Eigen::MatrixXcd& Qprod)
|
||||
{
|
||||
Glog << "shiftedQRDecompEigen() begin" << '\n';
|
||||
// Glog << "shiftedQRDecompEigen() begin" << '\n';
|
||||
Eigen::MatrixXcd Q = Eigen::MatrixXcd::Zero(Nm,Nm);
|
||||
Eigen::MatrixXcd R = Eigen::MatrixXcd::Zero(Nm,Nm);
|
||||
Eigen::MatrixXcd Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
|
||||
@@ -1004,7 +1004,7 @@ if (1){
|
||||
// lower triangular part used to represent series
|
||||
// of Q sequence.
|
||||
|
||||
Glog << "shiftedQRDecompEigen() Housholder & QR" << '\n';
|
||||
// Glog << "shiftedQRDecompEigen() Housholder & QR" << '\n';
|
||||
// equivalent operation of Qprod *= Q
|
||||
//M = Eigen::MatrixXcd::Zero(Nm,Nm);
|
||||
|
||||
@@ -1025,7 +1025,7 @@ if (1){
|
||||
|
||||
Mtmp = Eigen::MatrixXcd::Zero(Nm,Nm);
|
||||
|
||||
Glog << "shiftedQRDecompEigen() Mtmp create" << '\n';
|
||||
// Glog << "shiftedQRDecompEigen() Mtmp create" << '\n';
|
||||
for (int i=0; i<Nm; ++i) {
|
||||
for (int j=0; j<Nm-(Nu+1); ++j) {
|
||||
for (int k=0; k<Nu+1+j; ++k) {
|
||||
@@ -1033,7 +1033,7 @@ if (1){
|
||||
}
|
||||
}
|
||||
}
|
||||
Glog << "shiftedQRDecompEigen() Mtmp loop1" << '\n';
|
||||
// Glog << "shiftedQRDecompEigen() Mtmp loop1" << '\n';
|
||||
for (int i=0; i<Nm; ++i) {
|
||||
for (int j=Nm-(Nu+1); j<Nm; ++j) {
|
||||
for (int k=0; k<Nm; ++k) {
|
||||
@@ -1041,7 +1041,7 @@ if (1){
|
||||
}
|
||||
}
|
||||
}
|
||||
Glog << "shiftedQRDecompEigen() Mtmp loop2" << '\n';
|
||||
// Glog << "shiftedQRDecompEigen() Mtmp loop2" << '\n';
|
||||
|
||||
//static int ntimes = 2;
|
||||
//for (int j=0; j<Nm-(ntimes*Nu); ++j) {
|
||||
@@ -1067,13 +1067,13 @@ if (1){
|
||||
Mtmp(j,i) = conj(Mtmp(i,j));
|
||||
}
|
||||
}
|
||||
Glog << "shiftedQRDecompEigen() Mtmp loop3" << '\n';
|
||||
// Glog << "shiftedQRDecompEigen() Mtmp loop3" << '\n';
|
||||
|
||||
for (int i=0; i<Nm; ++i) {
|
||||
Mtmp(i,i) = real(Mtmp(i,i)) + Dsh;
|
||||
}
|
||||
|
||||
Glog << "shiftedQRDecompEigen() Mtmp loop4" << '\n';
|
||||
// Glog << "shiftedQRDecompEigen() Mtmp loop4" << '\n';
|
||||
M = Mtmp;
|
||||
|
||||
//M = Q.adjoint()*(M*Q);
|
||||
@@ -1085,7 +1085,7 @@ if (1){
|
||||
// }
|
||||
//}
|
||||
|
||||
Glog << "shiftedQRDecompEigen() end" <<std::endl;
|
||||
// Glog << "shiftedQRDecompEigen() end" <<std::endl;
|
||||
}
|
||||
|
||||
void exampleQRDecompEigen(void)
|
||||
|
@@ -245,9 +245,10 @@ until convergence
|
||||
_HermOp(src_n,tmp);
|
||||
// std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
|
||||
// std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
|
||||
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
|
||||
// RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
|
||||
RealD vnum = real(innerProduct(tmp,tmp)); // HermOp^2.
|
||||
RealD vden = norm2(src_n);
|
||||
RealD na = vnum/vden;
|
||||
RealD na = std::sqrt(vnum/vden);
|
||||
if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
|
||||
i=_MAX_ITER_IRL_MEVAPP_;
|
||||
evalMaxApprox = na;
|
||||
@@ -255,6 +256,7 @@ until convergence
|
||||
src_n = tmp;
|
||||
}
|
||||
}
|
||||
std::cout << GridLogIRL << " Final evalMaxApprox " << evalMaxApprox << std::endl;
|
||||
|
||||
std::vector<RealD> lme(Nm);
|
||||
std::vector<RealD> lme2(Nm);
|
||||
|
@@ -60,6 +60,32 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
template<class Field> class NormalResidual : public LinearFunction<Field>{
|
||||
private:
|
||||
SparseMatrixBase<Field> & _Matrix;
|
||||
OperatorFunction<Field> & _HermitianSolver;
|
||||
LinearFunction<Field> & _Guess;
|
||||
public:
|
||||
|
||||
/////////////////////////////////////////////////////
|
||||
// Wrap the usual normal equations trick
|
||||
/////////////////////////////////////////////////////
|
||||
NormalResidual(SparseMatrixBase<Field> &Matrix, OperatorFunction<Field> &HermitianSolver,
|
||||
LinearFunction<Field> &Guess)
|
||||
: _Matrix(Matrix), _HermitianSolver(HermitianSolver), _Guess(Guess) {};
|
||||
|
||||
void operator() (const Field &in, Field &out){
|
||||
|
||||
Field res(in.Grid());
|
||||
Field tmp(in.Grid());
|
||||
|
||||
MMdagLinearOperator<SparseMatrixBase<Field>,Field> MMdagOp(_Matrix);
|
||||
_Guess(in,res);
|
||||
_HermitianSolver(MMdagOp,in,res); // M Mdag res = in ;
|
||||
_Matrix.Mdag(res,out); // out = Mdag res
|
||||
}
|
||||
};
|
||||
|
||||
template<class Field> class HPDSolver : public LinearFunction<Field> {
|
||||
private:
|
||||
LinearOperatorBase<Field> & _Matrix;
|
||||
|
@@ -20,7 +20,7 @@ template<class Field> class PowerMethod
|
||||
RealD evalMaxApprox = 0.0;
|
||||
auto src_n = src;
|
||||
auto tmp = src;
|
||||
const int _MAX_ITER_EST_ = 100;
|
||||
const int _MAX_ITER_EST_ = 200;
|
||||
|
||||
for (int i=0;i<_MAX_ITER_EST_;i++) {
|
||||
|
||||
@@ -30,18 +30,17 @@ template<class Field> class PowerMethod
|
||||
RealD vden = norm2(src_n);
|
||||
RealD na = vnum/vden;
|
||||
|
||||
std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
|
||||
std::cout << GridLogMessage << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
|
||||
|
||||
if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) {
|
||||
evalMaxApprox = na;
|
||||
std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
|
||||
return evalMaxApprox;
|
||||
}
|
||||
// if ( (fabs(evalMaxApprox/na - 1.0) < 0.0001) || (i==_MAX_ITER_EST_-1) ) {
|
||||
// evalMaxApprox = na;
|
||||
// return evalMaxApprox;
|
||||
// }
|
||||
evalMaxApprox = na;
|
||||
src_n = tmp;
|
||||
}
|
||||
assert(0);
|
||||
return 0;
|
||||
std::cout << GridLogMessage << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
|
||||
return evalMaxApprox;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
76
Grid/algorithms/iterative/PowerSpectrum.h
Normal file
76
Grid/algorithms/iterative/PowerSpectrum.h
Normal file
@@ -0,0 +1,76 @@
|
||||
#pragma once
|
||||
namespace Grid {
|
||||
|
||||
class Band
|
||||
{
|
||||
RealD lo, hi;
|
||||
public:
|
||||
Band(RealD _lo,RealD _hi)
|
||||
{
|
||||
lo=_lo;
|
||||
hi=_hi;
|
||||
}
|
||||
RealD operator() (RealD x){
|
||||
if ( x>lo && x<hi ){
|
||||
return 1.0;
|
||||
} else {
|
||||
return 0.0;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class PowerSpectrum
|
||||
{
|
||||
public:
|
||||
|
||||
template<typename T> static RealD normalise(T& v)
|
||||
{
|
||||
RealD nn = norm2(v);
|
||||
nn = sqrt(nn);
|
||||
v = v * (1.0/nn);
|
||||
return nn;
|
||||
}
|
||||
|
||||
std::vector<RealD> ranges;
|
||||
std::vector<int> order;
|
||||
|
||||
PowerSpectrum( std::vector<RealD> &bins, std::vector<int> &_order ) : ranges(bins), order(_order) { };
|
||||
|
||||
template<class Field>
|
||||
RealD operator()(LinearOperatorBase<Field> &HermOp, const Field &src)
|
||||
{
|
||||
GridBase *grid = src.Grid();
|
||||
int N=ranges.size();
|
||||
RealD hi = ranges[N-1];
|
||||
|
||||
RealD lo_band = 0.0;
|
||||
RealD hi_band;
|
||||
RealD nn=norm2(src);
|
||||
RealD ss=0.0;
|
||||
|
||||
Field tmp = src;
|
||||
|
||||
for(int b=0;b<N;b++){
|
||||
hi_band = ranges[b];
|
||||
Band Notch(lo_band,hi_band);
|
||||
|
||||
Chebyshev<Field> polynomial;
|
||||
polynomial.Init(0.0,hi,order[b],Notch);
|
||||
polynomial.JacksonSmooth();
|
||||
|
||||
polynomial(HermOp,src,tmp) ;
|
||||
|
||||
RealD p=norm2(tmp);
|
||||
ss=ss+p;
|
||||
std::cout << GridLogMessage << " PowerSpectrum Band["<<lo_band<<","<<hi_band<<"] power "<<norm2(tmp)/nn<<std::endl;
|
||||
|
||||
lo_band=hi_band;
|
||||
}
|
||||
std::cout << GridLogMessage << " PowerSpectrum total power "<<ss/nn<<std::endl;
|
||||
std::cout << GridLogMessage << " PowerSpectrum total power (unnormalised) "<<nn<<std::endl;
|
||||
|
||||
return 0;
|
||||
};
|
||||
};
|
||||
|
||||
}
|
@@ -74,7 +74,7 @@ public:
|
||||
|
||||
void operator() (const Field &src, Field &psi){
|
||||
|
||||
psi=Zero();
|
||||
// psi=Zero();
|
||||
RealD cp, ssq,rsq;
|
||||
ssq=norm2(src);
|
||||
rsq=Tolerance*Tolerance*ssq;
|
||||
|
@@ -30,6 +30,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
/* END LEGAL */
|
||||
#pragma once
|
||||
|
||||
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
inline RealD AggregatePowerLaw(RealD x)
|
||||
@@ -95,7 +97,7 @@ public:
|
||||
|
||||
RealD scale;
|
||||
|
||||
ConjugateGradient<FineField> CG(1.0e-2,100,false);
|
||||
ConjugateGradient<FineField> CG(1.0e-3,400,false);
|
||||
FineField noise(FineGrid);
|
||||
FineField Mn(FineGrid);
|
||||
|
||||
@@ -108,7 +110,7 @@ public:
|
||||
|
||||
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
|
||||
|
||||
for(int i=0;i<1;i++){
|
||||
for(int i=0;i<4;i++){
|
||||
|
||||
CG(hermop,noise,subspace[b]);
|
||||
|
||||
@@ -124,6 +126,53 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
virtual void CreateSubspaceGCR(GridParallelRNG &RNG,LinearOperatorBase<FineField> &DiracOp,int nn=nbasis)
|
||||
{
|
||||
RealD scale;
|
||||
|
||||
TrivialPrecon<FineField> simple_fine;
|
||||
PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,12,12);
|
||||
FineField noise(FineGrid);
|
||||
FineField src(FineGrid);
|
||||
FineField guess(FineGrid);
|
||||
FineField Mn(FineGrid);
|
||||
|
||||
for(int b=0;b<nn;b++){
|
||||
|
||||
subspace[b] = Zero();
|
||||
gaussian(RNG,noise);
|
||||
scale = std::pow(norm2(noise),-0.5);
|
||||
noise=noise*scale;
|
||||
|
||||
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;
|
||||
|
||||
for(int i=0;i<2;i++){
|
||||
// void operator() (const Field &src, Field &psi){
|
||||
#if 1
|
||||
std::cout << GridLogMessage << " inverting on noise "<<std::endl;
|
||||
src = noise;
|
||||
guess=Zero();
|
||||
GCR(src,guess);
|
||||
subspace[b] = guess;
|
||||
#else
|
||||
std::cout << GridLogMessage << " inverting on zero "<<std::endl;
|
||||
src=Zero();
|
||||
guess = noise;
|
||||
GCR(src,guess);
|
||||
subspace[b] = guess;
|
||||
#endif
|
||||
noise = subspace[b];
|
||||
scale = std::pow(norm2(noise),-0.5);
|
||||
noise=noise*scale;
|
||||
|
||||
}
|
||||
|
||||
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<std::endl;
|
||||
subspace[b] = noise;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
|
||||
// and this is the best I found
|
||||
@@ -160,14 +209,21 @@ public:
|
||||
|
||||
int b =0;
|
||||
{
|
||||
ComplexD ip;
|
||||
// Filter
|
||||
Chebyshev<FineField> Cheb(lo,hi,orderfilter);
|
||||
Cheb(hermop,noise,Mn);
|
||||
// normalise
|
||||
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
|
||||
subspace[b] = Mn;
|
||||
|
||||
hermop.Op(Mn,tmp);
|
||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
|
||||
ip= innerProduct(Mn,tmp);
|
||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
|
||||
|
||||
hermop.AdjOp(Mn,tmp);
|
||||
ip = innerProduct(Mn,tmp);
|
||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
|
||||
b++;
|
||||
}
|
||||
|
||||
@@ -213,8 +269,18 @@ public:
|
||||
Mn=*Tnp;
|
||||
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
|
||||
subspace[b] = Mn;
|
||||
|
||||
|
||||
ComplexD ip;
|
||||
|
||||
hermop.Op(Mn,tmp);
|
||||
std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
|
||||
ip= innerProduct(Mn,tmp);
|
||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
|
||||
|
||||
hermop.AdjOp(Mn,tmp);
|
||||
ip = innerProduct(Mn,tmp);
|
||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
|
||||
|
||||
b++;
|
||||
}
|
||||
|
||||
@@ -228,6 +294,70 @@ public:
|
||||
}
|
||||
assert(b==nn);
|
||||
}
|
||||
|
||||
|
||||
virtual void CreateSubspacePolyCheby(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
|
||||
int nn,
|
||||
double hi,
|
||||
double lo1,
|
||||
int orderfilter,
|
||||
double lo2,
|
||||
int orderstep)
|
||||
{
|
||||
RealD scale;
|
||||
|
||||
FineField noise(FineGrid);
|
||||
FineField Mn(FineGrid);
|
||||
FineField tmp(FineGrid);
|
||||
|
||||
// New normalised noise
|
||||
gaussian(RNG,noise);
|
||||
scale = std::pow(norm2(noise),-0.5);
|
||||
noise=noise*scale;
|
||||
|
||||
std::cout << GridLogMessage<<" CreateSubspacePolyCheby "<<std::endl;
|
||||
// Initial matrix element
|
||||
hermop.Op(noise,Mn);
|
||||
std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
|
||||
|
||||
int b =0;
|
||||
{
|
||||
// Filter
|
||||
std::cout << GridLogMessage << "Cheby "<<lo1<<","<<hi<<" "<<orderstep<<std::endl;
|
||||
Chebyshev<FineField> Cheb(lo1,hi,orderfilter);
|
||||
Cheb(hermop,noise,Mn);
|
||||
// normalise
|
||||
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
|
||||
subspace[b] = Mn;
|
||||
hermop.Op(Mn,tmp);
|
||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
|
||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|n> "<<norm2(Mn)<<std::endl;
|
||||
}
|
||||
|
||||
// Generate a full sequence of Chebyshevs
|
||||
for(int n=1;n<nn;n++){
|
||||
std::cout << GridLogMessage << "Cheby "<<lo2<<","<<hi<<" "<<orderstep<<std::endl;
|
||||
Chebyshev<FineField> Cheb(lo2,hi,orderstep);
|
||||
Cheb(hermop,subspace[n-1],Mn);
|
||||
|
||||
for(int m=0;m<n;m++){
|
||||
ComplexD c = innerProduct(subspace[m],Mn);
|
||||
Mn = Mn - c*subspace[m];
|
||||
}
|
||||
|
||||
// normalise
|
||||
scale = std::pow(norm2(Mn),-0.5);
|
||||
Mn=Mn*scale;
|
||||
|
||||
subspace[n]=Mn;
|
||||
|
||||
hermop.Op(Mn,tmp);
|
||||
std::cout<<GridLogMessage << "filt ["<<n<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
|
||||
std::cout<<GridLogMessage << "filt ["<<n<<"] <n|n> "<<norm2(Mn)<<std::endl;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
|
||||
int nn,
|
||||
double hi,
|
||||
|
@@ -99,7 +99,7 @@ public:
|
||||
CoarseMatrix AselfInvEven;
|
||||
CoarseMatrix AselfInvOdd;
|
||||
|
||||
Vector<RealD> dag_factor;
|
||||
deviceVector<RealD> dag_factor;
|
||||
|
||||
///////////////////////
|
||||
// Interface
|
||||
@@ -124,9 +124,13 @@ public:
|
||||
int npoint = geom.npoint;
|
||||
typedef LatticeView<Cobj> Aview;
|
||||
|
||||
Vector<Aview> AcceleratorViewContainer;
|
||||
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
|
||||
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
|
||||
|
||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
|
||||
for(int p=0;p<geom.npoint;p++) {
|
||||
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
|
||||
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
|
||||
}
|
||||
Aview *Aview_p = & AcceleratorViewContainer[0];
|
||||
|
||||
const int Nsimd = CComplex::Nsimd();
|
||||
@@ -161,7 +165,7 @@ public:
|
||||
coalescedWrite(out_v[ss](b),res);
|
||||
});
|
||||
|
||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
|
||||
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
|
||||
};
|
||||
|
||||
void Mdag (const CoarseVector &in, CoarseVector &out)
|
||||
@@ -190,9 +194,14 @@ public:
|
||||
int npoint = geom.npoint;
|
||||
typedef LatticeView<Cobj> Aview;
|
||||
|
||||
Vector<Aview> AcceleratorViewContainer;
|
||||
|
||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
|
||||
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
|
||||
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
|
||||
|
||||
for(int p=0;p<geom.npoint;p++) {
|
||||
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
|
||||
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
|
||||
}
|
||||
Aview *Aview_p = & AcceleratorViewContainer[0];
|
||||
|
||||
const int Nsimd = CComplex::Nsimd();
|
||||
@@ -201,10 +210,10 @@ public:
|
||||
|
||||
int osites=Grid()->oSites();
|
||||
|
||||
Vector<int> points(geom.npoint, 0);
|
||||
for(int p=0; p<geom.npoint; p++)
|
||||
points[p] = geom.points_dagger[p];
|
||||
|
||||
deviceVector<int> points(geom.npoint);
|
||||
for(int p=0; p<geom.npoint; p++) {
|
||||
acceleratorPut(points[p],geom.points_dagger[p]);
|
||||
}
|
||||
auto points_p = &points[0];
|
||||
|
||||
RealD* dag_factor_p = &dag_factor[0];
|
||||
@@ -236,7 +245,7 @@ public:
|
||||
coalescedWrite(out_v[ss](b),res);
|
||||
});
|
||||
|
||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
|
||||
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
|
||||
}
|
||||
|
||||
void MdirComms(const CoarseVector &in)
|
||||
@@ -251,8 +260,14 @@ public:
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
|
||||
typedef LatticeView<Cobj> Aview;
|
||||
Vector<Aview> AcceleratorViewContainer;
|
||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
|
||||
|
||||
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
|
||||
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
|
||||
|
||||
for(int p=0;p<geom.npoint;p++) {
|
||||
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
|
||||
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
|
||||
}
|
||||
Aview *Aview_p = & AcceleratorViewContainer[0];
|
||||
|
||||
autoView( out_v , out, AcceleratorWrite);
|
||||
@@ -285,7 +300,7 @@ public:
|
||||
}
|
||||
coalescedWrite(out_v[ss](b),res);
|
||||
});
|
||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
|
||||
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
|
||||
}
|
||||
void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
|
||||
{
|
||||
@@ -469,14 +484,20 @@ public:
|
||||
|
||||
// determine in what order we need the points
|
||||
int npoint = geom.npoint-1;
|
||||
Vector<int> points(npoint, 0);
|
||||
for(int p=0; p<npoint; p++)
|
||||
points[p] = (dag && !hermitian) ? geom.points_dagger[p] : p;
|
||||
|
||||
deviceVector<int> points(npoint);
|
||||
for(int p=0; p<npoint; p++) {
|
||||
int val = (dag && !hermitian) ? geom.points_dagger[p] : p;
|
||||
acceleratorPut(points[p], val);
|
||||
}
|
||||
auto points_p = &points[0];
|
||||
|
||||
Vector<Aview> AcceleratorViewContainer;
|
||||
for(int p=0;p<npoint;p++) AcceleratorViewContainer.push_back(a[p].View(AcceleratorRead));
|
||||
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
|
||||
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
|
||||
|
||||
for(int p=0;p<geom.npoint;p++) {
|
||||
hAcceleratorViewContainer[p] = a[p].View(AcceleratorRead);
|
||||
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
|
||||
}
|
||||
Aview *Aview_p = & AcceleratorViewContainer[0];
|
||||
|
||||
const int Nsimd = CComplex::Nsimd();
|
||||
@@ -539,7 +560,7 @@ public:
|
||||
});
|
||||
}
|
||||
|
||||
for(int p=0;p<npoint;p++) AcceleratorViewContainer[p].ViewClose();
|
||||
for(int p=0;p<npoint;p++) hAcceleratorViewContainer[p].ViewClose();
|
||||
}
|
||||
|
||||
CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) :
|
||||
@@ -590,11 +611,13 @@ public:
|
||||
}
|
||||
|
||||
// GPU readable prefactor
|
||||
std::vector<RealD> h_dag_factor(nbasis*nbasis);
|
||||
thread_for(i, nbasis*nbasis, {
|
||||
int j = i/nbasis;
|
||||
int k = i%nbasis;
|
||||
dag_factor[i] = dag_factor_eigen(j, k);
|
||||
h_dag_factor[i] = dag_factor_eigen(j, k);
|
||||
});
|
||||
acceleratorCopyToDevice(&h_dag_factor[0],&dag_factor[0],dag_factor.size()*sizeof(RealD));
|
||||
}
|
||||
|
||||
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
|
||||
|
@@ -441,8 +441,20 @@ public:
|
||||
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
|
||||
}
|
||||
#else
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// Galerkin projection of matrix
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
|
||||
Aggregation<Fobj,CComplex,nbasis> & Subspace)
|
||||
{
|
||||
CoarsenOperator(linop,Subspace,Subspace);
|
||||
}
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// Petrov - Galerkin projection of matrix
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
|
||||
Aggregation<Fobj,CComplex,nbasis> & U,
|
||||
Aggregation<Fobj,CComplex,nbasis> & V)
|
||||
{
|
||||
std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
|
||||
GridBase *grid = FineGrid();
|
||||
@@ -458,11 +470,9 @@ public:
|
||||
// Orthogonalise the subblocks over the basis
|
||||
/////////////////////////////////////////////////////////////
|
||||
CoarseScalar InnerProd(CoarseGrid());
|
||||
blockOrthogonalise(InnerProd,Subspace.subspace);
|
||||
blockOrthogonalise(InnerProd,V.subspace);
|
||||
blockOrthogonalise(InnerProd,U.subspace);
|
||||
|
||||
// for(int s=0;s<Subspace.subspace.size();s++){
|
||||
// std::cout << " subspace norm "<<norm2(Subspace.subspace[s])<<std::endl;
|
||||
// }
|
||||
const int npoint = geom.npoint;
|
||||
|
||||
Coordinate clatt = CoarseGrid()->GlobalDimensions();
|
||||
@@ -542,7 +552,7 @@ public:
|
||||
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
|
||||
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
|
||||
tphaseBZ-=usecond();
|
||||
phaV = phaF[p]*Subspace.subspace[i];
|
||||
phaV = phaF[p]*V.subspace[i];
|
||||
tphaseBZ+=usecond();
|
||||
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
@@ -555,7 +565,7 @@ public:
|
||||
// std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;
|
||||
|
||||
tproj-=usecond();
|
||||
blockProject(coarseInner,MphaV,Subspace.subspace);
|
||||
blockProject(coarseInner,MphaV,U.subspace);
|
||||
coarseInner = conjugate(pha[p]) * coarseInner;
|
||||
|
||||
ComputeProj[p] = coarseInner;
|
||||
|
@@ -69,7 +69,7 @@ public:
|
||||
}
|
||||
|
||||
// FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
|
||||
void construct(pointer __p, const _Tp& __val) { assert(0);};
|
||||
void construct(pointer __p, const _Tp& __val) { };
|
||||
void construct(pointer __p) { };
|
||||
void destroy(pointer __p) { };
|
||||
};
|
||||
@@ -174,19 +174,10 @@ template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const d
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Template typedefs
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
#ifdef ACCELERATOR_CSHIFT
|
||||
// Cshift on device
|
||||
template<class T> using cshiftAllocator = devAllocator<T>;
|
||||
#else
|
||||
// Cshift on host
|
||||
template<class T> using cshiftAllocator = std::allocator<T>;
|
||||
#endif
|
||||
|
||||
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
|
||||
template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;
|
||||
template<class T> using commVector = std::vector<T,devAllocator<T> >;
|
||||
template<class T> using deviceVector = std::vector<T,devAllocator<T> >;
|
||||
template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
|
||||
template<class T> using hostVector = std::vector<T,alignedAllocator<T> >; // Needs autoview
|
||||
template<class T> using Vector = std::vector<T,uvmAllocator<T> >; // Really want to deprecate
|
||||
template<class T> using uvmVector = std::vector<T,uvmAllocator<T> >; // auto migrating page
|
||||
template<class T> using deviceVector = std::vector<T,devAllocator<T> >; // device vector
|
||||
|
||||
/*
|
||||
template<class T> class vecView
|
||||
@@ -197,8 +188,9 @@ template<class T> class vecView
|
||||
ViewMode mode;
|
||||
void * cpu_ptr;
|
||||
public:
|
||||
// Rvalue accessor
|
||||
accelerator_inline T & operator[](size_t i) const { return this->data[i]; };
|
||||
vecView(std::vector<T> &refer_to_me,ViewMode _mode)
|
||||
vecView(Vector<T> &refer_to_me,ViewMode _mode)
|
||||
{
|
||||
cpu_ptr = &refer_to_me[0];
|
||||
size = refer_to_me.size();
|
||||
@@ -214,22 +206,12 @@ template<class T> class vecView
|
||||
}
|
||||
};
|
||||
|
||||
template<class T> vecView<T> VectorView(std::vector<T> &vec,ViewMode _mode)
|
||||
template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode)
|
||||
{
|
||||
vecView<T> ret(vec,_mode); // does the open
|
||||
return ret; // must be closed
|
||||
}
|
||||
|
||||
// Little autoscope assister
|
||||
template<class View>
|
||||
class VectorViewCloser
|
||||
{
|
||||
View v; // Take a copy of view and call view close when I go out of scope automatically
|
||||
public:
|
||||
VectorViewCloser(View &_v) : v(_v) {};
|
||||
~VectorViewCloser() { auto ptr = v.cpu_ptr; v.ViewClose(); MemoryManager::NotifyDeletion(ptr);}
|
||||
};
|
||||
|
||||
#define autoVecView(v_v,v,mode) \
|
||||
auto v_v = VectorView(v,mode); \
|
||||
ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
|
||||
|
@@ -1,16 +1,15 @@
|
||||
#include <Grid/GridCore.h>
|
||||
#ifndef GRID_UVM
|
||||
|
||||
#warning "Using explicit device memory copies"
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
#define MAXLINE 512
|
||||
static char print_buffer [ MAXLINE ];
|
||||
|
||||
#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
|
||||
#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer;
|
||||
#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl;
|
||||
#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer << std::endl;
|
||||
//#define dprintf(...)
|
||||
|
||||
//#define mprintf(...)
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
// For caching copies of data on device
|
||||
@@ -111,7 +110,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
|
||||
///////////////////////////////////////////////////////////
|
||||
assert(AccCache.state!=Empty);
|
||||
|
||||
dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
|
||||
dprintf("MemoryManager: Discard(%lx) %lx",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
|
||||
assert(AccCache.accLock==0);
|
||||
assert(AccCache.cpuLock==0);
|
||||
assert(AccCache.CpuPtr!=(uint64_t)NULL);
|
||||
@@ -121,7 +120,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
|
||||
DeviceBytes -=AccCache.bytes;
|
||||
LRUremove(AccCache);
|
||||
AccCache.AccPtr=(uint64_t) NULL;
|
||||
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
|
||||
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
|
||||
}
|
||||
uint64_t CpuPtr = AccCache.CpuPtr;
|
||||
EntryErase(CpuPtr);
|
||||
@@ -141,7 +140,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
assert(AccCache.state!=Empty);
|
||||
|
||||
mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld\n",
|
||||
mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld",
|
||||
(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
|
||||
(uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock);
|
||||
if (AccCache.accLock!=0) return;
|
||||
@@ -155,7 +154,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
|
||||
AccCache.AccPtr=(uint64_t)NULL;
|
||||
AccCache.state=CpuDirty; // CPU primary now
|
||||
DeviceBytes -=AccCache.bytes;
|
||||
dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
|
||||
dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld ",(uint64_t)AccCache.AccPtr,DeviceBytes);
|
||||
}
|
||||
// uint64_t CpuPtr = AccCache.CpuPtr;
|
||||
DeviceEvictions++;
|
||||
@@ -169,7 +168,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
|
||||
assert(AccCache.AccPtr!=(uint64_t)NULL);
|
||||
assert(AccCache.CpuPtr!=(uint64_t)NULL);
|
||||
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
|
||||
mprintf("MemoryManager: acceleratorCopyFromDevice Flush AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
||||
mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
||||
DeviceToHostBytes+=AccCache.bytes;
|
||||
DeviceToHostXfer++;
|
||||
AccCache.state=Consistent;
|
||||
@@ -184,7 +183,9 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
|
||||
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
|
||||
DeviceBytes+=AccCache.bytes;
|
||||
}
|
||||
mprintf("MemoryManager: acceleratorCopyToDevice Clone AccPtr %lx <- CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
||||
mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx",
|
||||
(uint64_t)AccCache.bytes,
|
||||
(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
||||
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
|
||||
HostToDeviceBytes+=AccCache.bytes;
|
||||
HostToDeviceXfer++;
|
||||
@@ -210,7 +211,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
|
||||
void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
|
||||
{
|
||||
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
|
||||
dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr);
|
||||
dprintf("AcceleratorViewClose %lx",(uint64_t)Ptr);
|
||||
AcceleratorViewClose((uint64_t)Ptr);
|
||||
} else if( (mode==CpuRead)||(mode==CpuWrite)){
|
||||
CpuViewClose((uint64_t)Ptr);
|
||||
@@ -222,7 +223,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
|
||||
{
|
||||
uint64_t CpuPtr = (uint64_t)_CpuPtr;
|
||||
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
|
||||
dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr);
|
||||
dprintf("AcceleratorViewOpen %lx",(uint64_t)CpuPtr);
|
||||
return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
|
||||
} else if( (mode==CpuRead)||(mode==CpuWrite)){
|
||||
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
|
||||
@@ -233,6 +234,9 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
|
||||
}
|
||||
void MemoryManager::EvictVictims(uint64_t bytes)
|
||||
{
|
||||
if(bytes>=DeviceMaxBytes) {
|
||||
printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
|
||||
}
|
||||
assert(bytes<DeviceMaxBytes);
|
||||
while(bytes+DeviceLRUBytes > DeviceMaxBytes){
|
||||
if ( DeviceLRUBytes > 0){
|
||||
@@ -265,7 +269,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
|
||||
assert(AccCache.cpuLock==0); // Programming error
|
||||
|
||||
if(AccCache.state!=Empty) {
|
||||
dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n",
|
||||
dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld",
|
||||
(uint64_t)AccCache.CpuPtr,
|
||||
(uint64_t)CpuPtr,
|
||||
(uint64_t)AccCache.bytes,
|
||||
@@ -305,7 +309,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
|
||||
AccCache.state = Consistent; // Empty + AccRead => Consistent
|
||||
}
|
||||
AccCache.accLock= 1;
|
||||
dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock);
|
||||
dprintf("Copied Empty entry into device accLock= %d",AccCache.accLock);
|
||||
} else if(AccCache.state==CpuDirty ){
|
||||
if(mode==AcceleratorWriteDiscard) {
|
||||
CpuDiscard(AccCache);
|
||||
@@ -318,21 +322,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
|
||||
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
|
||||
}
|
||||
AccCache.accLock++;
|
||||
dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock);
|
||||
dprintf("CpuDirty entry into device ++accLock= %d",AccCache.accLock);
|
||||
} else if(AccCache.state==Consistent) {
|
||||
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
|
||||
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
|
||||
else
|
||||
AccCache.state = Consistent; // Consistent + AccRead => Consistent
|
||||
AccCache.accLock++;
|
||||
dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock);
|
||||
dprintf("Consistent entry into device ++accLock= %d",AccCache.accLock);
|
||||
} else if(AccCache.state==AccDirty) {
|
||||
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
|
||||
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
|
||||
else
|
||||
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
|
||||
AccCache.accLock++;
|
||||
dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock);
|
||||
dprintf("AccDirty entry ++accLock= %d",AccCache.accLock);
|
||||
} else {
|
||||
assert(0);
|
||||
}
|
||||
@@ -341,7 +345,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
|
||||
// If view is opened on device must remove from LRU
|
||||
if(AccCache.LRU_valid==1){
|
||||
// must possibly remove from LRU as now locked on GPU
|
||||
dprintf("AccCache entry removed from LRU \n");
|
||||
dprintf("AccCache entry removed from LRU ");
|
||||
LRUremove(AccCache);
|
||||
}
|
||||
|
||||
@@ -364,10 +368,10 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
|
||||
AccCache.accLock--;
|
||||
// Move to LRU queue if not locked and close on device
|
||||
if(AccCache.accLock==0) {
|
||||
dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
|
||||
dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
|
||||
LRUinsert(AccCache);
|
||||
} else {
|
||||
dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
|
||||
dprintf("AccleratorViewClose %lx AccLock decremented to %ld",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
|
||||
}
|
||||
}
|
||||
void MemoryManager::CpuViewClose(uint64_t CpuPtr)
|
||||
|
@@ -15,10 +15,10 @@ void check_huge_pages(void *Buf,uint64_t BYTES)
|
||||
uint64_t virt_pfn = (uint64_t)Buf / page_size;
|
||||
off_t offset = sizeof(uint64_t) * virt_pfn;
|
||||
uint64_t npages = (BYTES + page_size-1) / page_size;
|
||||
uint64_t pagedata[npages];
|
||||
std::vector<uint64_t> pagedata(npages);
|
||||
uint64_t ret = lseek(fd, offset, SEEK_SET);
|
||||
assert(ret == offset);
|
||||
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
|
||||
ret = ::read(fd, &pagedata[0], sizeof(uint64_t)*npages);
|
||||
assert(ret == sizeof(uint64_t) * npages);
|
||||
int nhugepages = npages / 512;
|
||||
int n4ktotal, nnothuge;
|
||||
|
@@ -31,5 +31,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
#include <Grid/cartesian/Cartesian_base.h>
|
||||
#include <Grid/cartesian/Cartesian_full.h>
|
||||
#include <Grid/cartesian/Cartesian_red_black.h>
|
||||
#include <Grid/cartesian/CartesianCrossIcosahedron.h>
|
||||
|
||||
#endif
|
||||
|
235
Grid/cartesian/CartesianCrossIcosahedron.h
Normal file
235
Grid/cartesian/CartesianCrossIcosahedron.h
Normal file
@@ -0,0 +1,235 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/cartesian/CartesianCrossIcosahedron.h
|
||||
|
||||
Copyright (C) 2025
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#pragma once
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Grid Support.
|
||||
/////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
enum IcosahedralMeshType {
|
||||
IcosahedralVertices,
|
||||
IcosahedralEdges
|
||||
} ;
|
||||
enum NorthSouth {
|
||||
North = 1,
|
||||
South = 0
|
||||
};
|
||||
|
||||
const int IcosahedralPatches = 10;
|
||||
const int HemiPatches=IcosahedralPatches/2;
|
||||
const int NorthernHemisphere = HemiPatches;
|
||||
const int SouthernHemisphere = 0;
|
||||
|
||||
class GridCartesianCrossIcosahedron: public GridCartesian {
|
||||
|
||||
public:
|
||||
|
||||
IcosahedralMeshType meshType;
|
||||
|
||||
IcosahedralMeshType MeshType(void) { return meshType; };
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
// Constructor takes a parent grid and possibly subdivides communicator.
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
/*
|
||||
GridCartesian(const Coordinate &dimensions,
|
||||
const Coordinate &simd_layout,
|
||||
const Coordinate &processor_grid,
|
||||
const GridCartesian &parent) : GridBase(processor_grid,parent,dummy)
|
||||
{
|
||||
assert(0); // No subdivision
|
||||
}
|
||||
GridCartesian(const Coordinate &dimensions,
|
||||
const Coordinate &simd_layout,
|
||||
const Coordinate &processor_grid,
|
||||
const GridCartesian &parent,int &split_rank) : GridBase(processor_grid,parent,split_rank)
|
||||
{
|
||||
assert(0); // No subdivision
|
||||
}
|
||||
*/
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
// Construct from comm world
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
GridCartesianCrossIcosahedron(const Coordinate &dimensions,
|
||||
const Coordinate &simd_layout,
|
||||
const Coordinate &processor_grid,
|
||||
IcosahedralMeshType _meshType) : GridCartesian(dimensions,simd_layout,processor_grid)
|
||||
{
|
||||
meshType = _meshType;
|
||||
Coordinate S2dimensions=dimensions;
|
||||
Coordinate S2simd =simd_layout;
|
||||
Coordinate S2procs =processor_grid;
|
||||
|
||||
assert(simd_layout[0]==1); // Force simd into perpendicular dimensions
|
||||
assert(simd_layout[1]==1); // to avoid pole storage complexity interacting with SIMD.
|
||||
assert(dimensions[_ndimension-1]==IcosahedralPatches);
|
||||
assert(processor_grid[_ndimension-1]<=2); // Keeps the patches that need a pole on the same node
|
||||
|
||||
// Save a copy of the basic cartesian initialisation volume
|
||||
cartesianOsites = this->_osites;
|
||||
|
||||
// allocate the pole storage if we are seeking vertex domain data
|
||||
if ( meshType == IcosahedralVertices ) {
|
||||
InitPoles();
|
||||
}
|
||||
}
|
||||
|
||||
virtual ~GridCartesianCrossIcosahedron() = default;
|
||||
|
||||
////////////////////////////////////////////////
|
||||
// Use to decide if a given grid is icosahedral
|
||||
////////////////////////////////////////////////
|
||||
int hasNorthPole;
|
||||
int hasSouthPole;
|
||||
int northPoleOsite;
|
||||
int southPoleOsite;
|
||||
int northPoleOsites;
|
||||
int southPoleOsites;
|
||||
int cartesianOsites;
|
||||
|
||||
virtual int isIcosahedral(void) override { return 1;}
|
||||
virtual int isIcosahedralVertex(void) override { return meshType==IcosahedralVertices;}
|
||||
virtual int isIcosahedralEdge (void) override { return meshType==IcosahedralEdges;}
|
||||
virtual int NorthPoleOsite(void) const override { return northPoleOsite; };
|
||||
virtual int NorthPoleOsites(void) const override { return northPoleOsites; };
|
||||
virtual int SouthPoleOsite(void) const override { return southPoleOsite; };
|
||||
virtual int SouthPoleOsites(void) const override { return southPoleOsites; };
|
||||
virtual int ownsNorthPole(void) const override { return hasNorthPole; };
|
||||
virtual int ownsSouthPole(void) const override { return hasSouthPole; };
|
||||
virtual int CartesianOsites(void) const override { return cartesianOsites; };
|
||||
virtual int64_t PoleIdxForOcoor(Coordinate &Coor) override
|
||||
{
|
||||
// Work out the pole_osite. Pick the higher dims
|
||||
Coordinate rdims;
|
||||
Coordinate ocoor;
|
||||
int64_t pole_idx;
|
||||
int Ndm1 = this->Nd()-1;
|
||||
for(int d=2;d<Ndm1;d++){
|
||||
int dd=d-2;
|
||||
rdims.push_back(this->_rdimensions[d]);
|
||||
ocoor.push_back(Coor[d]%this->_rdimensions[d]);
|
||||
}
|
||||
Lexicographic::IndexFromCoor(ocoor,pole_idx,rdims);
|
||||
return pole_idx;
|
||||
}
|
||||
virtual int64_t PoleSiteForOcoor(Coordinate &Coor) override
|
||||
{
|
||||
int Ndm1 = this->Nd()-1;
|
||||
int64_t pole_idx = this->PoleIdxForOcoor(Coor);
|
||||
int64_t pole_osite;
|
||||
if ( Coor[Ndm1] >= HemiPatches ) {
|
||||
pole_osite = pole_idx + this->NorthPoleOsite();
|
||||
} else {
|
||||
pole_osite = pole_idx + this->SouthPoleOsite();
|
||||
}
|
||||
return pole_osite;
|
||||
}
|
||||
|
||||
|
||||
void InitPoles(void)
|
||||
{
|
||||
int Ndm1 = _ndimension-1;
|
||||
///////////////////////
|
||||
// Add the extra pole storage
|
||||
///////////////////////
|
||||
// Vertices = 1x LxLx D1...Dn + 2.D1...Dn
|
||||
// Start after the LxL and don't include the 10 patch dim
|
||||
int OrthogSize = 1;
|
||||
for (int d = 2; d < Ndm1; d++) {
|
||||
OrthogSize *= _gdimensions[d];
|
||||
}
|
||||
_fsites += OrthogSize*2;
|
||||
_gsites += OrthogSize*2;
|
||||
|
||||
// Simd reduced sizes are multiplied up.
|
||||
// If the leading LxL are simd-ized, the vector objects will contain "redundant" lanes
|
||||
// which should contain identical north (south) pole data
|
||||
OrthogSize = 1;
|
||||
for (int d = 2; d < Ndm1; d++) {
|
||||
OrthogSize *= _rdimensions[d];
|
||||
}
|
||||
|
||||
// Grow the local volume to hold pole data
|
||||
// on rank (0,0) in the LxL planes
|
||||
// since SIMD must be placed in the orthogonal directions
|
||||
Coordinate pcoor = this->ThisProcessorCoor();
|
||||
Coordinate pgrid = this->ProcessorGrid();
|
||||
|
||||
const int xdim=0;
|
||||
const int ydim=1;
|
||||
/*
|
||||
*
|
||||
* /\/\/\/\/\
|
||||
* /\/\/\/\/\/
|
||||
* \/\/\/\/\/
|
||||
*
|
||||
* y
|
||||
* /
|
||||
* \x
|
||||
*
|
||||
* Labelling patches as 5 6 7 8 9
|
||||
* 0 1 2 3 4
|
||||
*
|
||||
* Will ban distribution of the patch dimension by more than 2.
|
||||
*
|
||||
* Hence all 5 patches associated with the pole must have the
|
||||
* appropriate "corner" of the patch L^2 located on the SAME rank.
|
||||
*/
|
||||
|
||||
if( (pcoor[xdim]==pgrid[xdim]-1) && (pcoor[ydim]==0) && (pcoor[Ndm1]==0) ){
|
||||
hasSouthPole =1;
|
||||
southPoleOsite=this->_osites;
|
||||
southPoleOsites=OrthogSize;
|
||||
this->_osites += OrthogSize;
|
||||
} else {
|
||||
hasSouthPole =0;
|
||||
southPoleOsites=0;
|
||||
southPoleOsite=0;
|
||||
}
|
||||
if( (pcoor[xdim]==0) && (pcoor[ydim]==pgrid[ydim]-1) && (pcoor[Ndm1]==pgrid[Ndm1]-1) ){
|
||||
hasNorthPole =1;
|
||||
northPoleOsite=this->_osites;
|
||||
northPoleOsites=OrthogSize;
|
||||
this->_osites += OrthogSize;
|
||||
} else {
|
||||
hasNorthPole =0;
|
||||
northPoleOsites=0;
|
||||
northPoleOsite=0;
|
||||
}
|
||||
std::cout << GridLogDebug<<"Icosahedral vertex field volume " << this->_osites<<std::endl;
|
||||
std::cout << GridLogDebug<<"Icosahedral south pole offset " << this->southPoleOsite<<std::endl;
|
||||
std::cout << GridLogDebug<<"Icosahedral north pole offset " << this->northPoleOsite<<std::endl;
|
||||
std::cout << GridLogDebug<<"Icosahedral south pole size " << this->southPoleOsites<<std::endl;
|
||||
std::cout << GridLogDebug<<"Icosahedral north pole size " << this->northPoleOsites<<std::endl;
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
NAMESPACE_END(Grid);
|
@@ -82,13 +82,29 @@ public:
|
||||
bool _isCheckerBoarded;
|
||||
int LocallyPeriodic;
|
||||
Coordinate _checker_dim_mask;
|
||||
int _checker_dim;
|
||||
|
||||
public:
|
||||
|
||||
// Icosahedral decisions
|
||||
virtual int isIcosahedral(void) { return 0;}
|
||||
virtual int isIcosahedralVertex(void) { return 0;}
|
||||
virtual int isIcosahedralEdge (void) { return 0;}
|
||||
virtual int ownsNorthPole(void) const { return 0; };
|
||||
virtual int ownsSouthPole(void) const { return 0; };
|
||||
virtual int NorthPoleOsite(void) const { return 0; };
|
||||
virtual int SouthPoleOsite(void) const { return 0; };
|
||||
virtual int NorthPoleOsites(void) const { std::cout << "base osites" <<std::endl;return 0; };
|
||||
virtual int SouthPoleOsites(void) const { std::cout << "base osites" <<std::endl;return 0; };
|
||||
virtual int CartesianOsites(void) const { return this->oSites(); };
|
||||
virtual int64_t PoleIdxForOcoor(Coordinate &Coor) { return 0;};
|
||||
virtual int64_t PoleSiteForOcoor(Coordinate &Coor){ return 0;}
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Checkerboarding interface is virtual and overridden by
|
||||
// GridCartesian / GridRedBlackCartesian
|
||||
////////////////////////////////////////////////////////////////
|
||||
|
||||
virtual int CheckerBoarded(int dim) =0;
|
||||
virtual int CheckerBoard(const Coordinate &site)=0;
|
||||
virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0;
|
||||
@@ -175,6 +191,8 @@ public:
|
||||
}
|
||||
return permute_type;
|
||||
}
|
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// Array sizing queries
|
||||
////////////////////////////////////////////////////////////////
|
||||
|
@@ -38,7 +38,7 @@ class GridCartesian: public GridBase {
|
||||
|
||||
public:
|
||||
int dummy;
|
||||
Coordinate _checker_dim_mask;
|
||||
// Coordinate _checker_dim_mask;
|
||||
virtual int CheckerBoardFromOindexTable (int Oindex) {
|
||||
return 0;
|
||||
}
|
||||
@@ -106,6 +106,7 @@ public:
|
||||
_rdimensions.resize(_ndimension);
|
||||
_simd_layout.resize(_ndimension);
|
||||
_checker_dim_mask.resize(_ndimension);;
|
||||
_checker_dim = -1;
|
||||
_lstart.resize(_ndimension);
|
||||
_lend.resize(_ndimension);
|
||||
|
||||
|
@@ -57,9 +57,10 @@ class GridRedBlackCartesian : public GridBase
|
||||
{
|
||||
public:
|
||||
// Coordinate _checker_dim_mask;
|
||||
int _checker_dim;
|
||||
// int _checker_dim;
|
||||
std::vector<int> _checker_board;
|
||||
|
||||
virtual int isCheckerBoarded(void) const { return 1; };
|
||||
virtual int CheckerBoarded(int dim){
|
||||
if( dim==_checker_dim) return 1;
|
||||
else return 0;
|
||||
|
@@ -57,18 +57,29 @@ int CartesianCommunicator::ProcessorCount(void) { return
|
||||
// very VERY rarely (Log, serial RNG) we need world without a grid
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifdef USE_GRID_REDUCTION
|
||||
void CartesianCommunicator::GlobalSum(ComplexF &c)
|
||||
{
|
||||
GlobalSumP2P(c);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(ComplexD &c)
|
||||
{
|
||||
GlobalSumP2P(c);
|
||||
}
|
||||
#else
|
||||
void CartesianCommunicator::GlobalSum(ComplexF &c)
|
||||
{
|
||||
GlobalSumVector((float *)&c,2);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
|
||||
{
|
||||
GlobalSumVector((float *)c,2*N);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(ComplexD &c)
|
||||
{
|
||||
GlobalSumVector((double *)&c,2);
|
||||
}
|
||||
#endif
|
||||
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
|
||||
{
|
||||
GlobalSumVector((float *)c,2*N);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
|
||||
{
|
||||
GlobalSumVector((double *)c,2*N);
|
||||
|
@@ -33,6 +33,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
///////////////////////////////////
|
||||
#include <Grid/communicator/SharedMemory.h>
|
||||
|
||||
#define NVLINK_GET
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
extern bool Stencil_force_mpi ;
|
||||
@@ -128,6 +130,35 @@ public:
|
||||
void GlobalXOR(uint32_t &);
|
||||
void GlobalXOR(uint64_t &);
|
||||
|
||||
template<class obj> void GlobalSumP2P(obj &o)
|
||||
{
|
||||
std::vector<obj> column;
|
||||
obj accum = o;
|
||||
int source,dest;
|
||||
for(int d=0;d<_ndimension;d++){
|
||||
column.resize(_processors[d]);
|
||||
column[0] = accum;
|
||||
std::vector<MpiCommsRequest_t> list;
|
||||
for(int p=1;p<_processors[d];p++){
|
||||
ShiftedRanks(d,p,source,dest);
|
||||
SendToRecvFromBegin(list,
|
||||
&column[0],
|
||||
dest,
|
||||
&column[p],
|
||||
source,
|
||||
sizeof(obj),d*100+p);
|
||||
|
||||
}
|
||||
if (!list.empty()) // avoid triggering assert in comms == none
|
||||
CommsComplete(list);
|
||||
for(int p=1;p<_processors[d];p++){
|
||||
accum = accum + column[p];
|
||||
}
|
||||
}
|
||||
Broadcast(0,accum);
|
||||
o=accum;
|
||||
}
|
||||
|
||||
template<class obj> void GlobalSum(obj &o){
|
||||
typedef typename obj::scalar_type scalar_type;
|
||||
int words = sizeof(obj)/sizeof(scalar_type);
|
||||
@@ -138,8 +169,8 @@ public:
|
||||
////////////////////////////////////////////////////////////
|
||||
// Face exchange, buffer swap in translational invariant way
|
||||
////////////////////////////////////////////////////////////
|
||||
void CommsComplete(std::vector<CommsRequest_t> &list);
|
||||
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void CommsComplete(std::vector<MpiCommsRequest_t> &list);
|
||||
void SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,
|
||||
void *recv,
|
||||
@@ -158,6 +189,17 @@ public:
|
||||
int recv_from_rank,int do_recv,
|
||||
int bytes,int dir);
|
||||
|
||||
double StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int xmit_to_rank,int do_xmit,
|
||||
void *recv,
|
||||
int recv_from_rank,int do_recv,
|
||||
int xbytes,int rbytes,int dir);
|
||||
|
||||
// Could do a PollHtoD and have a CommsMerge dependence
|
||||
void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);
|
||||
void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
|
||||
|
||||
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int xmit_to_rank,int do_xmit,
|
||||
|
@@ -30,6 +30,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
|
||||
Grid_MPI_Comm CartesianCommunicator::communicator_world;
|
||||
|
||||
////////////////////////////////////////////
|
||||
@@ -257,15 +258,41 @@ CartesianCommunicator::~CartesianCommunicator()
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef USE_GRID_REDUCTION
|
||||
void CartesianCommunicator::GlobalSum(float &f){
|
||||
FlightRecorder::StepLog("GlobalSumP2P");
|
||||
CartesianCommunicator::GlobalSumP2P(f);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(double &d)
|
||||
{
|
||||
FlightRecorder::StepLog("GlobalSumP2P");
|
||||
CartesianCommunicator::GlobalSumP2P(d);
|
||||
}
|
||||
#else
|
||||
void CartesianCommunicator::GlobalSum(float &f){
|
||||
FlightRecorder::StepLog("AllReduce");
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(double &d)
|
||||
{
|
||||
FlightRecorder::StepLog("AllReduce");
|
||||
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
#endif
|
||||
void CartesianCommunicator::GlobalSum(uint32_t &u){
|
||||
FlightRecorder::StepLog("AllReduce");
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(uint64_t &u){
|
||||
FlightRecorder::StepLog("AllReduce");
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){
|
||||
FlightRecorder::StepLog("AllReduceVector");
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
@@ -287,27 +314,18 @@ void CartesianCommunicator::GlobalMax(double &d)
|
||||
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(float &f){
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumVector(float *f,int N)
|
||||
{
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(double &d)
|
||||
{
|
||||
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumVector(double *d,int N)
|
||||
{
|
||||
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
|
||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,
|
||||
void *recv,
|
||||
@@ -332,7 +350,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
|
||||
assert(ierr==0);
|
||||
list.push_back(xrq);
|
||||
}
|
||||
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list)
|
||||
void CartesianCommunicator::CommsComplete(std::vector<MpiCommsRequest_t> &list)
|
||||
{
|
||||
int nreq=list.size();
|
||||
|
||||
@@ -351,9 +369,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||
int from,
|
||||
int bytes)
|
||||
{
|
||||
std::vector<CommsRequest_t> reqs(0);
|
||||
unsigned long xcrc = crc32(0L, Z_NULL, 0);
|
||||
unsigned long rcrc = crc32(0L, Z_NULL, 0);
|
||||
std::vector<MpiCommsRequest_t> reqs(0);
|
||||
|
||||
int myrank = _processor;
|
||||
int ierr;
|
||||
@@ -369,9 +385,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||
communicator,MPI_STATUS_IGNORE);
|
||||
assert(ierr==0);
|
||||
|
||||
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
|
||||
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
|
||||
// printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
|
||||
}
|
||||
// Basic Halo comms primitive
|
||||
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
||||
@@ -381,12 +394,25 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
||||
int bytes,int dir)
|
||||
{
|
||||
std::vector<CommsRequest_t> list;
|
||||
double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
|
||||
double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
|
||||
offbytes += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
|
||||
StencilSendToRecvFromComplete(list,dir);
|
||||
return offbytes;
|
||||
}
|
||||
|
||||
#undef NVLINK_GET // Define to use get instead of put DMA
|
||||
|
||||
#ifdef ACCELERATOR_AWARE_MPI
|
||||
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
|
||||
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
|
||||
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,int dox,
|
||||
void *recv,
|
||||
int from,int dor,
|
||||
int xbytes,int rbytes,int dir)
|
||||
{
|
||||
return 0.0; // Do nothing -- no preparation required
|
||||
}
|
||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,int dox,
|
||||
@@ -420,14 +446,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
||||
off_node_bytes+=rbytes;
|
||||
}
|
||||
#ifdef NVLINK_GET
|
||||
else {
|
||||
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
|
||||
assert(shm!=NULL);
|
||||
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// This is a NVLINK PUT
|
||||
if (dox) {
|
||||
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
|
||||
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
|
||||
tag= dir+_processor*32;
|
||||
ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
|
||||
@@ -440,27 +467,341 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
||||
assert(shm!=NULL);
|
||||
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
return off_node_bytes;
|
||||
}
|
||||
|
||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
|
||||
{
|
||||
int nreq=list.size();
|
||||
/*finishes Get/Put*/
|
||||
acceleratorCopySynchronise();
|
||||
|
||||
if (nreq==0) return;
|
||||
std::vector<MPI_Status> status(nreq);
|
||||
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
|
||||
assert(ierr==0);
|
||||
list.resize(0);
|
||||
this->StencilBarrier();
|
||||
}
|
||||
|
||||
#else /* NOT ... ACCELERATOR_AWARE_MPI */
|
||||
///////////////////////////////////////////
|
||||
// Pipeline mode through host memory
|
||||
///////////////////////////////////////////
|
||||
/*
|
||||
* In prepare (phase 1):
|
||||
* PHASE 1: (prepare)
|
||||
* - post MPI receive buffers asynch
|
||||
* - post device - host send buffer transfer asynch
|
||||
* PHASE 2: (Begin)
|
||||
* - complete all copies
|
||||
* - post MPI send asynch
|
||||
* - post device - device transfers
|
||||
* PHASE 3: (Complete)
|
||||
* - MPI_waitall
|
||||
* - host-device transfers
|
||||
*
|
||||
*********************************
|
||||
* NB could split this further:
|
||||
*--------------------------------
|
||||
* PHASE 1: (Prepare)
|
||||
* - post MPI receive buffers asynch
|
||||
* - post device - host send buffer transfer asynch
|
||||
* PHASE 2: (BeginInterNode)
|
||||
* - complete all copies
|
||||
* - post MPI send asynch
|
||||
* PHASE 3: (BeginIntraNode)
|
||||
* - post device - device transfers
|
||||
* PHASE 4: (Complete)
|
||||
* - MPI_waitall
|
||||
* - host-device transfers asynch
|
||||
* - (complete all copies)
|
||||
*/
|
||||
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,int dox,
|
||||
void *recv,
|
||||
int from,int dor,
|
||||
int xbytes,int rbytes,int dir)
|
||||
{
|
||||
/*
|
||||
* Bring sequence from Stencil.h down to lower level.
|
||||
* Assume using XeLink is ok
|
||||
*/
|
||||
int ncomm =communicator_halo.size();
|
||||
int commdir=dir%ncomm;
|
||||
|
||||
MPI_Request xrq;
|
||||
MPI_Request rrq;
|
||||
|
||||
int ierr;
|
||||
int gdest = ShmRanks[dest];
|
||||
int gfrom = ShmRanks[from];
|
||||
int gme = ShmRanks[_processor];
|
||||
|
||||
assert(dest != _processor);
|
||||
assert(from != _processor);
|
||||
assert(gme == ShmRank);
|
||||
double off_node_bytes=0.0;
|
||||
int tag;
|
||||
|
||||
void * host_recv = NULL;
|
||||
void * host_xmit = NULL;
|
||||
|
||||
/*
|
||||
* PHASE 1: (Prepare)
|
||||
* - post MPI receive buffers asynch
|
||||
* - post device - host send buffer transfer asynch
|
||||
*/
|
||||
|
||||
if ( dor ) {
|
||||
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
|
||||
tag= dir+from*32;
|
||||
host_recv = this->HostBufferMalloc(rbytes);
|
||||
ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
|
||||
assert(ierr==0);
|
||||
CommsRequest_t srq;
|
||||
srq.PacketType = InterNodeRecv;
|
||||
srq.bytes = rbytes;
|
||||
srq.req = rrq;
|
||||
srq.host_buf = host_recv;
|
||||
srq.device_buf = recv;
|
||||
list.push_back(srq);
|
||||
off_node_bytes+=rbytes;
|
||||
}
|
||||
}
|
||||
|
||||
if (dox) {
|
||||
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
|
||||
|
||||
tag= dir+_processor*32;
|
||||
|
||||
host_xmit = this->HostBufferMalloc(xbytes);
|
||||
CommsRequest_t srq;
|
||||
|
||||
srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
|
||||
|
||||
// ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
|
||||
// assert(ierr==0);
|
||||
// off_node_bytes+=xbytes;
|
||||
|
||||
srq.PacketType = InterNodeXmit;
|
||||
srq.bytes = xbytes;
|
||||
// srq.req = xrq;
|
||||
srq.host_buf = host_xmit;
|
||||
srq.device_buf = xmit;
|
||||
srq.tag = tag;
|
||||
srq.dest = dest;
|
||||
srq.commdir = commdir;
|
||||
list.push_back(srq);
|
||||
}
|
||||
}
|
||||
|
||||
return off_node_bytes;
|
||||
}
|
||||
/*
|
||||
* In the interest of better pipelining, poll for completion on each DtoH and
|
||||
* start MPI_ISend in the meantime
|
||||
*/
|
||||
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list)
|
||||
{
|
||||
int pending = 0;
|
||||
do {
|
||||
|
||||
pending = 0;
|
||||
|
||||
for(int idx = 0; idx<list.size();idx++){
|
||||
|
||||
if ( list[idx].PacketType==InterNodeRecv ) {
|
||||
|
||||
int flag = 0;
|
||||
MPI_Status status;
|
||||
int ierr = MPI_Test(&list[idx].req,&flag,&status);
|
||||
assert(ierr==0);
|
||||
|
||||
if ( flag ) {
|
||||
// std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
|
||||
acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
|
||||
list[idx].PacketType=InterNodeReceiveHtoD;
|
||||
} else {
|
||||
pending ++;
|
||||
}
|
||||
}
|
||||
}
|
||||
// std::cout << " PollIrecv "<<pending<<" pending requests"<<std::endl;
|
||||
} while ( pending );
|
||||
|
||||
}
|
||||
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
|
||||
{
|
||||
int pending = 0;
|
||||
do {
|
||||
|
||||
pending = 0;
|
||||
|
||||
for(int idx = 0; idx<list.size();idx++){
|
||||
|
||||
if ( list[idx].PacketType==InterNodeXmit ) {
|
||||
|
||||
if ( acceleratorEventIsComplete(list[idx].ev) ) {
|
||||
|
||||
void *host_xmit = list[idx].host_buf;
|
||||
uint32_t xbytes = list[idx].bytes;
|
||||
int dest = list[idx].dest;
|
||||
int tag = list[idx].tag;
|
||||
int commdir = list[idx].commdir;
|
||||
///////////////////
|
||||
// Send packet
|
||||
///////////////////
|
||||
|
||||
// std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
|
||||
|
||||
MPI_Request xrq;
|
||||
int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
|
||||
assert(ierr==0);
|
||||
|
||||
list[idx].req = xrq; // Update the MPI request in the list
|
||||
|
||||
list[idx].PacketType=InterNodeXmitISend;
|
||||
|
||||
} else {
|
||||
// not done, so return to polling loop
|
||||
pending++;
|
||||
}
|
||||
}
|
||||
}
|
||||
} while (pending);
|
||||
}
|
||||
|
||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,int dox,
|
||||
void *recv,
|
||||
int from,int dor,
|
||||
int xbytes,int rbytes,int dir)
|
||||
{
|
||||
int ncomm =communicator_halo.size();
|
||||
int commdir=dir%ncomm;
|
||||
|
||||
MPI_Request xrq;
|
||||
MPI_Request rrq;
|
||||
|
||||
int ierr;
|
||||
int gdest = ShmRanks[dest];
|
||||
int gfrom = ShmRanks[from];
|
||||
int gme = ShmRanks[_processor];
|
||||
|
||||
assert(dest != _processor);
|
||||
assert(from != _processor);
|
||||
assert(gme == ShmRank);
|
||||
double off_node_bytes=0.0;
|
||||
int tag;
|
||||
|
||||
void * host_xmit = NULL;
|
||||
|
||||
////////////////////////////////
|
||||
// Receives already posted
|
||||
// Copies already started
|
||||
////////////////////////////////
|
||||
/*
|
||||
* PHASE 2: (Begin)
|
||||
* - complete all copies
|
||||
* - post MPI send asynch
|
||||
*/
|
||||
#ifdef NVLINK_GET
|
||||
if ( dor ) {
|
||||
|
||||
if ( ! ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) {
|
||||
// Intranode
|
||||
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
|
||||
assert(shm!=NULL);
|
||||
|
||||
CommsRequest_t srq;
|
||||
|
||||
srq.ev = acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
|
||||
|
||||
srq.PacketType = IntraNodeRecv;
|
||||
srq.bytes = xbytes;
|
||||
// srq.req = xrq;
|
||||
srq.host_buf = NULL;
|
||||
srq.device_buf = xmit;
|
||||
srq.tag = -1;
|
||||
srq.dest = dest;
|
||||
srq.commdir = dir;
|
||||
list.push_back(srq);
|
||||
}
|
||||
}
|
||||
#else
|
||||
if (dox) {
|
||||
|
||||
if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) {
|
||||
// Intranode
|
||||
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
|
||||
assert(shm!=NULL);
|
||||
|
||||
CommsRequest_t srq;
|
||||
|
||||
srq.ev = acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
|
||||
|
||||
srq.PacketType = IntraNodeXmit;
|
||||
srq.bytes = xbytes;
|
||||
// srq.req = xrq;
|
||||
srq.host_buf = NULL;
|
||||
srq.device_buf = xmit;
|
||||
srq.tag = -1;
|
||||
srq.dest = dest;
|
||||
srq.commdir = dir;
|
||||
list.push_back(srq);
|
||||
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return off_node_bytes;
|
||||
}
|
||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
|
||||
{
|
||||
int nreq=list.size();
|
||||
acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
|
||||
|
||||
acceleratorCopySynchronise();
|
||||
std::vector<MPI_Status> status;
|
||||
std::vector<MPI_Request> MpiRequests;
|
||||
|
||||
if (nreq==0) return;
|
||||
|
||||
std::vector<MPI_Status> status(nreq);
|
||||
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
|
||||
assert(ierr==0);
|
||||
list.resize(0);
|
||||
for(int r=0;r<list.size();r++){
|
||||
// Must check each Send buf is clear to reuse
|
||||
if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
|
||||
// if ( list[r].PacketType == InterNodeRecv ) MpiRequests.push_back(list[r].req); // Already "Test" passed
|
||||
}
|
||||
|
||||
int nreq=MpiRequests.size();
|
||||
|
||||
if (nreq>0) {
|
||||
status.resize(MpiRequests.size());
|
||||
int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
|
||||
assert(ierr==0);
|
||||
}
|
||||
|
||||
// for(int r=0;r<nreq;r++){
|
||||
// if ( list[r].PacketType==InterNodeRecv ) {
|
||||
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
|
||||
// }
|
||||
// }
|
||||
|
||||
|
||||
list.resize(0); // Delete the list
|
||||
this->HostBufferFreeAll(); // Clean up the buffer allocs
|
||||
#ifndef NVLINK_GET
|
||||
this->StencilBarrier(); // if PUT must check our nbrs have filled our receive buffers.
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
////////////////////////////////////////////
|
||||
// END PIPELINE MODE / NO CUDA AWARE MPI
|
||||
////////////////////////////////////////////
|
||||
|
||||
void CartesianCommunicator::StencilBarrier(void)
|
||||
{
|
||||
FlightRecorder::StepLog("NodeBarrier");
|
||||
MPI_Barrier (ShmComm);
|
||||
}
|
||||
//void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
||||
@@ -468,11 +809,13 @@ void CartesianCommunicator::StencilBarrier(void)
|
||||
//}
|
||||
void CartesianCommunicator::Barrier(void)
|
||||
{
|
||||
FlightRecorder::StepLog("GridBarrier");
|
||||
int ierr = MPI_Barrier(communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
|
||||
{
|
||||
FlightRecorder::StepLog("Broadcast");
|
||||
int ierr=MPI_Bcast(data,
|
||||
bytes,
|
||||
MPI_BYTE,
|
||||
@@ -491,6 +834,7 @@ void CartesianCommunicator::BarrierWorld(void){
|
||||
}
|
||||
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
|
||||
{
|
||||
FlightRecorder::StepLog("BroadcastWorld");
|
||||
int ierr= MPI_Bcast(data,
|
||||
bytes,
|
||||
MPI_BYTE,
|
||||
@@ -513,6 +857,7 @@ void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,
|
||||
}
|
||||
void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
|
||||
{
|
||||
FlightRecorder::StepLog("AllToAll");
|
||||
// MPI is a pain and uses "int" arguments
|
||||
// 64*64*64*128*16 == 500Million elements of data.
|
||||
// When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.
|
||||
|
@@ -91,7 +91,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);}
|
||||
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(list.size()==0);}
|
||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,
|
||||
@@ -132,6 +132,17 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
||||
{
|
||||
return 2.0*bytes;
|
||||
}
|
||||
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
|
||||
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
|
||||
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int xmit_to_rank,int dox,
|
||||
void *recv,
|
||||
int recv_from_rank,int dor,
|
||||
int xbytes,int rbytes, int dir)
|
||||
{
|
||||
return 0.0;
|
||||
}
|
||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int xmit_to_rank,int dox,
|
||||
|
@@ -46,8 +46,40 @@ NAMESPACE_BEGIN(Grid);
|
||||
|
||||
#if defined (GRID_COMMS_MPI3)
|
||||
typedef MPI_Comm Grid_MPI_Comm;
|
||||
typedef MPI_Request MpiCommsRequest_t;
|
||||
#ifdef ACCELERATOR_AWARE_MPI
|
||||
typedef MPI_Request CommsRequest_t;
|
||||
#else
|
||||
/*
|
||||
* Enable state transitions as each packet flows.
|
||||
*/
|
||||
enum PacketType_t {
|
||||
FaceGather,
|
||||
InterNodeXmit,
|
||||
InterNodeRecv,
|
||||
IntraNodeXmit,
|
||||
IntraNodeRecv,
|
||||
InterNodeXmitISend,
|
||||
InterNodeReceiveHtoD
|
||||
};
|
||||
/*
|
||||
*Package arguments needed for various actions along packet flow
|
||||
*/
|
||||
typedef struct {
|
||||
PacketType_t PacketType;
|
||||
void *host_buf;
|
||||
void *device_buf;
|
||||
int dest;
|
||||
int tag;
|
||||
int commdir;
|
||||
unsigned long bytes;
|
||||
acceleratorEvent_t ev;
|
||||
MpiCommsRequest_t req;
|
||||
} CommsRequest_t;
|
||||
#endif
|
||||
|
||||
#else
|
||||
typedef int MpiCommsRequest_t;
|
||||
typedef int CommsRequest_t;
|
||||
typedef int Grid_MPI_Comm;
|
||||
#endif
|
||||
@@ -105,7 +137,7 @@ public:
|
||||
///////////////////////////////////////////////////
|
||||
static void SharedMemoryAllocate(uint64_t bytes, int flags);
|
||||
static void SharedMemoryFree(void);
|
||||
static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
|
||||
// static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
|
||||
static void SharedMemoryZero(void *dest,size_t bytes);
|
||||
|
||||
};
|
||||
|
@@ -42,6 +42,11 @@ Author: Christoph Lehner <christoph@lhnr.de>
|
||||
#ifdef ACCELERATOR_AWARE_MPI
|
||||
#define GRID_SYCL_LEVEL_ZERO_IPC
|
||||
#define SHM_SOCKETS
|
||||
#else
|
||||
#ifdef HAVE_NUMAIF_H
|
||||
#warning " Using NUMAIF "
|
||||
#include <numaif.h>
|
||||
#endif
|
||||
#endif
|
||||
#include <syscall.h>
|
||||
#endif
|
||||
@@ -537,7 +542,38 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
// Each MPI rank should allocate our own buffer
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#ifndef ACCELERATOR_AWARE_MPI
|
||||
HostCommBuf= malloc(bytes);
|
||||
// printf("Host buffer allocate for GPU non-aware MPI\n");
|
||||
#if 0
|
||||
HostCommBuf= acceleratorAllocHost(bytes);
|
||||
#else
|
||||
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
|
||||
#if 0
|
||||
#warning "Moving host buffers to specific NUMA domain"
|
||||
int numa;
|
||||
char *numa_name=(char *)getenv("MPI_BUF_NUMA");
|
||||
if(numa_name) {
|
||||
unsigned long page_size = sysconf(_SC_PAGESIZE);
|
||||
numa = atoi(numa_name);
|
||||
unsigned long page_count = bytes/page_size;
|
||||
std::vector<void *> pages(page_count);
|
||||
std::vector<int> nodes(page_count,numa);
|
||||
std::vector<int> status(page_count,-1);
|
||||
for(unsigned long p=0;p<page_count;p++){
|
||||
pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
|
||||
}
|
||||
int ret = move_pages(0,
|
||||
page_count,
|
||||
&pages[0],
|
||||
&nodes[0],
|
||||
&status[0],
|
||||
MPOL_MF_MOVE);
|
||||
printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
|
||||
if (ret) perror(" move_pages failed for reason:");
|
||||
}
|
||||
#endif
|
||||
acceleratorPin(HostCommBuf,bytes);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
ShmCommBuf = acceleratorAllocDevice(bytes);
|
||||
if (ShmCommBuf == (void *)NULL ) {
|
||||
@@ -569,8 +605,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
|
||||
typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
|
||||
|
||||
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
|
||||
auto zeContext = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
|
||||
auto zeDevice = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
|
||||
auto zeContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
|
||||
|
||||
ze_ipc_mem_handle_t ihandle;
|
||||
clone_mem_t handle;
|
||||
@@ -880,14 +916,14 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
|
||||
bzero(dest,bytes);
|
||||
#endif
|
||||
}
|
||||
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
|
||||
{
|
||||
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
|
||||
acceleratorCopyToDevice(src,dest,bytes);
|
||||
#else
|
||||
bcopy(src,dest,bytes);
|
||||
#endif
|
||||
}
|
||||
//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
|
||||
//{
|
||||
//#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
|
||||
// acceleratorCopyToDevice(src,dest,bytes);
|
||||
//#else
|
||||
// bcopy(src,dest,bytes);
|
||||
//#endif
|
||||
//}
|
||||
////////////////////////////////////////////////////////
|
||||
// Global shared functionality finished
|
||||
// Now move to per communicator functionality
|
||||
@@ -923,6 +959,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
|
||||
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
|
||||
|
||||
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
|
||||
// std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
|
||||
}
|
||||
ShmBufferFreeAll();
|
||||
|
||||
@@ -975,19 +1012,18 @@ void SharedMemory::SharedMemoryTest(void)
|
||||
check[0]=GlobalSharedMemory::WorldNode;
|
||||
check[1]=r;
|
||||
check[2]=magic;
|
||||
GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
|
||||
acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t));
|
||||
}
|
||||
}
|
||||
ShmBarrier();
|
||||
for(uint64_t r=0;r<ShmSize;r++){
|
||||
ShmBarrier();
|
||||
GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
|
||||
ShmBarrier();
|
||||
acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
|
||||
assert(check[0]==GlobalSharedMemory::WorldNode);
|
||||
assert(check[1]==r);
|
||||
assert(check[2]==magic);
|
||||
ShmBarrier();
|
||||
}
|
||||
ShmBarrier();
|
||||
std::cout << GridLogDebug << " SharedMemoryTest has passed "<<std::endl;
|
||||
}
|
||||
|
||||
void *SharedMemory::ShmBuffer(int rank)
|
||||
|
@@ -122,10 +122,10 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
|
||||
{
|
||||
acceleratorMemSet(dest,0,bytes);
|
||||
}
|
||||
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
|
||||
{
|
||||
acceleratorCopyToDevice(src,dest,bytes);
|
||||
}
|
||||
//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
|
||||
//{
|
||||
// acceleratorCopyToDevice(src,dest,bytes);
|
||||
//}
|
||||
////////////////////////////////////////////////////////
|
||||
// Global shared functionality finished
|
||||
// Now move to per communicator functionality
|
||||
|
@@ -51,7 +51,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
#endif
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
|
||||
auto Cshift(const Expression &expr,int dim,int shift) -> decltype(closure(expr))
|
||||
{
|
||||
|
@@ -30,12 +30,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
extern std::vector<std::pair<int,int> > Cshift_table;
|
||||
extern commVector<std::pair<int,int> > Cshift_table_device;
|
||||
extern deviceVector<std::pair<int,int> > Cshift_table_device;
|
||||
|
||||
inline std::pair<int,int> *MapCshiftTable(void)
|
||||
{
|
||||
// GPU version
|
||||
#ifdef ACCELERATOR_CSHIFT
|
||||
uint64_t sz=Cshift_table.size();
|
||||
if (Cshift_table_device.size()!=sz ) {
|
||||
Cshift_table_device.resize(sz);
|
||||
@@ -45,16 +44,13 @@ inline std::pair<int,int> *MapCshiftTable(void)
|
||||
sizeof(Cshift_table[0])*sz);
|
||||
|
||||
return &Cshift_table_device[0];
|
||||
#else
|
||||
return &Cshift_table[0];
|
||||
#endif
|
||||
// CPU version use identify map
|
||||
}
|
||||
///////////////////////////////////////////////////////////////////
|
||||
// Gather for when there is no need to SIMD split
|
||||
///////////////////////////////////////////////////////////////////
|
||||
template<class vobj> void
|
||||
Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
|
||||
Gather_plane_simple (const Lattice<vobj> &rhs,deviceVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
|
||||
{
|
||||
int rd = rhs.Grid()->_rdimensions[dimension];
|
||||
|
||||
@@ -94,17 +90,10 @@ Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dim
|
||||
{
|
||||
auto buffer_p = & buffer[0];
|
||||
auto table = MapCshiftTable();
|
||||
#ifdef ACCELERATOR_CSHIFT
|
||||
autoView(rhs_v , rhs, AcceleratorRead);
|
||||
accelerator_for(i,ent,vobj::Nsimd(),{
|
||||
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
|
||||
});
|
||||
#else
|
||||
autoView(rhs_v , rhs, CpuRead);
|
||||
thread_for(i,ent,{
|
||||
buffer_p[table[i].first]=rhs_v[table[i].second];
|
||||
});
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@@ -129,7 +118,6 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
|
||||
int n1=rhs.Grid()->_slice_stride[dimension];
|
||||
|
||||
if ( cbmask ==0x3){
|
||||
#ifdef ACCELERATOR_CSHIFT
|
||||
autoView(rhs_v , rhs, AcceleratorRead);
|
||||
accelerator_for(nn,e1*e2,1,{
|
||||
int n = nn%e1;
|
||||
@@ -140,21 +128,10 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
|
||||
vobj temp =rhs_v[so+o+b];
|
||||
extract<vobj>(temp,pointers,offset);
|
||||
});
|
||||
#else
|
||||
autoView(rhs_v , rhs, CpuRead);
|
||||
thread_for2d(n,e1,b,e2,{
|
||||
int o = n*n1;
|
||||
int offset = b+n*e2;
|
||||
|
||||
vobj temp =rhs_v[so+o+b];
|
||||
extract<vobj>(temp,pointers,offset);
|
||||
});
|
||||
#endif
|
||||
} else {
|
||||
Coordinate rdim=rhs.Grid()->_rdimensions;
|
||||
Coordinate cdm =rhs.Grid()->_checker_dim_mask;
|
||||
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
|
||||
#ifdef ACCELERATOR_CSHIFT
|
||||
autoView(rhs_v , rhs, AcceleratorRead);
|
||||
accelerator_for(nn,e1*e2,1,{
|
||||
int n = nn%e1;
|
||||
@@ -175,33 +152,13 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
|
||||
extract<vobj>(temp,pointers,offset);
|
||||
}
|
||||
});
|
||||
#else
|
||||
autoView(rhs_v , rhs, CpuRead);
|
||||
thread_for2d(n,e1,b,e2,{
|
||||
|
||||
Coordinate coor;
|
||||
|
||||
int o=n*n1;
|
||||
int oindex = o+b;
|
||||
|
||||
int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
|
||||
|
||||
int ocb=1<<cb;
|
||||
int offset = b+n*e2;
|
||||
|
||||
if ( ocb & cbmask ) {
|
||||
vobj temp =rhs_v[so+o+b];
|
||||
extract<vobj>(temp,pointers,offset);
|
||||
}
|
||||
});
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// Scatter for when there is no need to SIMD split
|
||||
//////////////////////////////////////////////////////
|
||||
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask)
|
||||
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<vobj> &buffer, int dimension,int plane,int cbmask)
|
||||
{
|
||||
int rd = rhs.Grid()->_rdimensions[dimension];
|
||||
|
||||
@@ -245,17 +202,10 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<
|
||||
{
|
||||
auto buffer_p = & buffer[0];
|
||||
auto table = MapCshiftTable();
|
||||
#ifdef ACCELERATOR_CSHIFT
|
||||
autoView( rhs_v, rhs, AcceleratorWrite);
|
||||
accelerator_for(i,ent,vobj::Nsimd(),{
|
||||
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
|
||||
});
|
||||
#else
|
||||
autoView( rhs_v, rhs, CpuWrite);
|
||||
thread_for(i,ent,{
|
||||
rhs_v[table[i].first]=buffer_p[table[i].second];
|
||||
});
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@@ -278,7 +228,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
|
||||
if(cbmask ==0x3 ) {
|
||||
int _slice_stride = rhs.Grid()->_slice_stride[dimension];
|
||||
int _slice_block = rhs.Grid()->_slice_block[dimension];
|
||||
#ifdef ACCELERATOR_CSHIFT
|
||||
autoView( rhs_v , rhs, AcceleratorWrite);
|
||||
accelerator_for(nn,e1*e2,1,{
|
||||
int n = nn%e1;
|
||||
@@ -287,14 +236,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
|
||||
int offset = b+n*_slice_block;
|
||||
merge(rhs_v[so+o+b],pointers,offset);
|
||||
});
|
||||
#else
|
||||
autoView( rhs_v , rhs, CpuWrite);
|
||||
thread_for2d(n,e1,b,e2,{
|
||||
int o = n*_slice_stride;
|
||||
int offset = b+n*_slice_block;
|
||||
merge(rhs_v[so+o+b],pointers,offset);
|
||||
});
|
||||
#endif
|
||||
} else {
|
||||
|
||||
// Case of SIMD split AND checker dim cannot currently be hit, except in
|
||||
@@ -360,19 +301,11 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
|
||||
|
||||
{
|
||||
auto table = MapCshiftTable();
|
||||
#ifdef ACCELERATOR_CSHIFT
|
||||
autoView(rhs_v , rhs, AcceleratorRead);
|
||||
autoView(lhs_v , lhs, AcceleratorWrite);
|
||||
accelerator_for(i,ent,vobj::Nsimd(),{
|
||||
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
|
||||
});
|
||||
#else
|
||||
autoView(rhs_v , rhs, CpuRead);
|
||||
autoView(lhs_v , lhs, CpuWrite);
|
||||
thread_for(i,ent,{
|
||||
lhs_v[table[i].first]=rhs_v[table[i].second];
|
||||
});
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@@ -412,19 +345,11 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
|
||||
|
||||
{
|
||||
auto table = MapCshiftTable();
|
||||
#ifdef ACCELERATOR_CSHIFT
|
||||
autoView( rhs_v, rhs, AcceleratorRead);
|
||||
autoView( lhs_v, lhs, AcceleratorWrite);
|
||||
accelerator_for(i,ent,1,{
|
||||
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
|
||||
});
|
||||
#else
|
||||
autoView( rhs_v, rhs, CpuRead);
|
||||
autoView( lhs_v, lhs, CpuWrite);
|
||||
thread_for(i,ent,{
|
||||
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
|
||||
});
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -31,9 +31,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
const int Cshift_verbose=0;
|
||||
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
|
||||
{
|
||||
assert(!rhs.Grid()->isIcosahedral());
|
||||
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
|
||||
@@ -65,7 +67,7 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
|
||||
Cshift_comms(ret,rhs,dimension,shift);
|
||||
}
|
||||
t1=usecond();
|
||||
// std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
|
||||
if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -94,7 +96,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
|
||||
sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
|
||||
sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
|
||||
|
||||
//std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
|
||||
// std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
|
||||
if ( sshift[0] == sshift[1] ) {
|
||||
// std::cout << "Single pass Cshift_comms" <<std::endl;
|
||||
Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
|
||||
@@ -104,8 +106,6 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
|
||||
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
|
||||
}
|
||||
}
|
||||
#define ACCELERATOR_CSHIFT_NO_COPY
|
||||
#ifdef ACCELERATOR_CSHIFT_NO_COPY
|
||||
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||
{
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
@@ -125,8 +125,12 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
||||
assert(shift<fd);
|
||||
|
||||
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
|
||||
static cshiftVector<vobj> send_buf; send_buf.resize(buffer_size);
|
||||
static cshiftVector<vobj> recv_buf; recv_buf.resize(buffer_size);
|
||||
static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
|
||||
static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
|
||||
#ifndef ACCELERATOR_AWARE_MPI
|
||||
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
|
||||
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
|
||||
#endif
|
||||
|
||||
int cb= (cbmask==0x2)? Odd : Even;
|
||||
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
||||
@@ -158,18 +162,31 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
||||
// int rank = grid->_processor;
|
||||
int recv_from_rank;
|
||||
int xmit_to_rank;
|
||||
|
||||
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
||||
|
||||
tcomms-=usecond();
|
||||
// grid->Barrier();
|
||||
grid->Barrier();
|
||||
|
||||
#ifdef ACCELERATOR_AWARE_MPI
|
||||
grid->SendToRecvFrom((void *)&send_buf[0],
|
||||
xmit_to_rank,
|
||||
(void *)&recv_buf[0],
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
#else
|
||||
// bouncy bouncy
|
||||
acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
|
||||
grid->SendToRecvFrom((void *)&hsend_buf[0],
|
||||
xmit_to_rank,
|
||||
(void *)&hrecv_buf[0],
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
|
||||
#endif
|
||||
|
||||
xbytes+=bytes;
|
||||
// grid->Barrier();
|
||||
grid->Barrier();
|
||||
tcomms+=usecond();
|
||||
|
||||
tscatter-=usecond();
|
||||
@@ -177,13 +194,13 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
||||
tscatter+=usecond();
|
||||
}
|
||||
}
|
||||
/*
|
||||
if (Cshift_verbose){
|
||||
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
||||
*/
|
||||
}
|
||||
}
|
||||
|
||||
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||
@@ -224,8 +241,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
||||
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
|
||||
// int words = sizeof(vobj)/sizeof(vector_type);
|
||||
|
||||
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
|
||||
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
|
||||
static std::vector<deviceVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
|
||||
static std::vector<deviceVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
|
||||
scalar_object * recv_buf_extract_mpi;
|
||||
scalar_object * send_buf_extract_mpi;
|
||||
|
||||
@@ -233,6 +250,10 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
||||
send_buf_extract[s].resize(buffer_size);
|
||||
recv_buf_extract[s].resize(buffer_size);
|
||||
}
|
||||
#ifndef ACCELERATOR_AWARE_MPI
|
||||
hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
|
||||
hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
|
||||
#endif
|
||||
|
||||
int bytes = buffer_size*sizeof(scalar_object);
|
||||
|
||||
@@ -281,246 +302,31 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
||||
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||
|
||||
tcomms-=usecond();
|
||||
// grid->Barrier();
|
||||
grid->Barrier();
|
||||
|
||||
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
|
||||
recv_buf_extract_mpi = &recv_buf_extract[i][0];
|
||||
#ifdef ACCELERATOR_AWARE_MPI
|
||||
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
|
||||
xmit_to_rank,
|
||||
(void *)recv_buf_extract_mpi,
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
|
||||
xbytes+=bytes;
|
||||
// grid->Barrier();
|
||||
tcomms+=usecond();
|
||||
|
||||
rpointers[i] = &recv_buf_extract[i][0];
|
||||
} else {
|
||||
rpointers[i] = &send_buf_extract[nbr_lane][0];
|
||||
}
|
||||
|
||||
}
|
||||
tscatter-=usecond();
|
||||
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
|
||||
tscatter+=usecond();
|
||||
}
|
||||
/*
|
||||
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
||||
*/
|
||||
}
|
||||
#else
|
||||
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||
{
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
|
||||
GridBase *grid=rhs.Grid();
|
||||
Lattice<vobj> temp(rhs.Grid());
|
||||
|
||||
int fd = rhs.Grid()->_fdimensions[dimension];
|
||||
int rd = rhs.Grid()->_rdimensions[dimension];
|
||||
int pd = rhs.Grid()->_processors[dimension];
|
||||
int simd_layout = rhs.Grid()->_simd_layout[dimension];
|
||||
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
|
||||
assert(simd_layout==1);
|
||||
assert(comm_dim==1);
|
||||
assert(shift>=0);
|
||||
assert(shift<fd);
|
||||
RealD tcopy=0.0;
|
||||
RealD tgather=0.0;
|
||||
RealD tscatter=0.0;
|
||||
RealD tcomms=0.0;
|
||||
uint64_t xbytes=0;
|
||||
|
||||
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
|
||||
static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
|
||||
static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
|
||||
vobj *send_buf;
|
||||
vobj *recv_buf;
|
||||
{
|
||||
grid->ShmBufferFreeAll();
|
||||
size_t bytes = buffer_size*sizeof(vobj);
|
||||
send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
|
||||
recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
|
||||
}
|
||||
|
||||
int cb= (cbmask==0x2)? Odd : Even;
|
||||
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
int sx = (x+sshift)%rd;
|
||||
int comm_proc = ((x+sshift)/rd)%pd;
|
||||
|
||||
if (comm_proc==0) {
|
||||
|
||||
tcopy-=usecond();
|
||||
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
|
||||
tcopy+=usecond();
|
||||
|
||||
} else {
|
||||
|
||||
int words = buffer_size;
|
||||
if (cbmask != 0x3) words=words>>1;
|
||||
|
||||
int bytes = words * sizeof(vobj);
|
||||
|
||||
tgather-=usecond();
|
||||
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
|
||||
tgather+=usecond();
|
||||
|
||||
// int rank = grid->_processor;
|
||||
int recv_from_rank;
|
||||
int xmit_to_rank;
|
||||
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
||||
|
||||
|
||||
tcomms-=usecond();
|
||||
// grid->Barrier();
|
||||
|
||||
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
|
||||
grid->SendToRecvFrom((void *)&send_buf[0],
|
||||
// bouncy bouncy
|
||||
acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
|
||||
grid->SendToRecvFrom((void *)&hsend_buf[0],
|
||||
xmit_to_rank,
|
||||
(void *)&recv_buf[0],
|
||||
(void *)&hrecv_buf[0],
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
xbytes+=bytes;
|
||||
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
|
||||
acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
|
||||
#endif
|
||||
|
||||
// grid->Barrier();
|
||||
xbytes+=bytes;
|
||||
grid->Barrier();
|
||||
tcomms+=usecond();
|
||||
|
||||
tscatter-=usecond();
|
||||
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
|
||||
tscatter+=usecond();
|
||||
}
|
||||
}
|
||||
/*
|
||||
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
||||
*/
|
||||
}
|
||||
|
||||
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||
{
|
||||
GridBase *grid=rhs.Grid();
|
||||
const int Nsimd = grid->Nsimd();
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef typename vobj::scalar_object scalar_object;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
|
||||
int fd = grid->_fdimensions[dimension];
|
||||
int rd = grid->_rdimensions[dimension];
|
||||
int ld = grid->_ldimensions[dimension];
|
||||
int pd = grid->_processors[dimension];
|
||||
int simd_layout = grid->_simd_layout[dimension];
|
||||
int comm_dim = grid->_processors[dimension] >1 ;
|
||||
|
||||
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
|
||||
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
|
||||
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
|
||||
|
||||
assert(comm_dim==1);
|
||||
assert(simd_layout==2);
|
||||
assert(shift>=0);
|
||||
assert(shift<fd);
|
||||
RealD tcopy=0.0;
|
||||
RealD tgather=0.0;
|
||||
RealD tscatter=0.0;
|
||||
RealD tcomms=0.0;
|
||||
uint64_t xbytes=0;
|
||||
|
||||
int permute_type=grid->PermuteType(dimension);
|
||||
|
||||
///////////////////////////////////////////////
|
||||
// Simd direction uses an extract/merge pair
|
||||
///////////////////////////////////////////////
|
||||
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
|
||||
// int words = sizeof(vobj)/sizeof(vector_type);
|
||||
|
||||
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
|
||||
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
|
||||
scalar_object * recv_buf_extract_mpi;
|
||||
scalar_object * send_buf_extract_mpi;
|
||||
{
|
||||
size_t bytes = sizeof(scalar_object)*buffer_size;
|
||||
grid->ShmBufferFreeAll();
|
||||
send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
|
||||
recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
|
||||
}
|
||||
for(int s=0;s<Nsimd;s++){
|
||||
send_buf_extract[s].resize(buffer_size);
|
||||
recv_buf_extract[s].resize(buffer_size);
|
||||
}
|
||||
|
||||
int bytes = buffer_size*sizeof(scalar_object);
|
||||
|
||||
ExtractPointerArray<scalar_object> pointers(Nsimd); //
|
||||
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
|
||||
|
||||
///////////////////////////////////////////
|
||||
// Work out what to send where
|
||||
///////////////////////////////////////////
|
||||
int cb = (cbmask==0x2)? Odd : Even;
|
||||
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
||||
|
||||
// loop over outer coord planes orthog to dim
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
// FIXME call local permute copy if none are offnode.
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
pointers[i] = &send_buf_extract[i][0];
|
||||
}
|
||||
tgather-=usecond();
|
||||
int sx = (x+sshift)%rd;
|
||||
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
|
||||
tgather+=usecond();
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
|
||||
int inner_bit = (Nsimd>>(permute_type+1));
|
||||
int ic= (i&inner_bit)? 1:0;
|
||||
|
||||
int my_coor = rd*ic + x;
|
||||
int nbr_coor = my_coor+sshift;
|
||||
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
|
||||
|
||||
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
|
||||
int nbr_ox = (nbr_coor%rd); // outer coord of peer
|
||||
int nbr_lane = (i&(~inner_bit));
|
||||
|
||||
int recv_from_rank;
|
||||
int xmit_to_rank;
|
||||
|
||||
if (nbr_ic) nbr_lane|=inner_bit;
|
||||
|
||||
assert (sx == nbr_ox);
|
||||
|
||||
if(nbr_proc){
|
||||
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||
|
||||
tcomms-=usecond();
|
||||
// grid->Barrier();
|
||||
|
||||
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
|
||||
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
|
||||
xmit_to_rank,
|
||||
(void *)recv_buf_extract_mpi,
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
|
||||
xbytes+=bytes;
|
||||
|
||||
// grid->Barrier();
|
||||
tcomms+=usecond();
|
||||
rpointers[i] = &recv_buf_extract[i][0];
|
||||
} else {
|
||||
rpointers[i] = &send_buf_extract[nbr_lane][0];
|
||||
@@ -530,17 +336,16 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
||||
tscatter-=usecond();
|
||||
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
|
||||
tscatter+=usecond();
|
||||
|
||||
}
|
||||
/*
|
||||
if(Cshift_verbose){
|
||||
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
|
||||
*/
|
||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
#endif
|
||||
|
@@ -30,6 +30,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
|
||||
{
|
||||
assert(!rhs.Grid()->isIcosahedral());
|
||||
Lattice<vobj> ret(rhs.Grid());
|
||||
ret.Checkerboard() = rhs.Grid()->CheckerBoardDestination(rhs.Checkerboard(),shift,dimension);
|
||||
Cshift_local(ret,rhs,dimension,shift);
|
||||
|
@@ -1,5 +1,5 @@
|
||||
#include <Grid/GridCore.h>
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
std::vector<std::pair<int,int> > Cshift_table;
|
||||
commVector<std::pair<int,int> > Cshift_table_device;
|
||||
deviceVector<std::pair<int,int> > Cshift_table_device;
|
||||
NAMESPACE_END(Grid);
|
||||
|
@@ -257,17 +257,30 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
|
||||
});
|
||||
}
|
||||
|
||||
#define FAST_AXPY_NORM
|
||||
template<class sobj,class vobj> inline
|
||||
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
|
||||
{
|
||||
GRID_TRACE("axpy_norm");
|
||||
#ifdef FAST_AXPY_NORM
|
||||
return axpy_norm_fast(ret,a,x,y);
|
||||
#else
|
||||
ret = a*x+y;
|
||||
RealD nn=norm2(ret);
|
||||
return nn;
|
||||
#endif
|
||||
}
|
||||
template<class sobj,class vobj> inline
|
||||
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
|
||||
{
|
||||
GRID_TRACE("axpby_norm");
|
||||
#ifdef FAST_AXPY_NORM
|
||||
return axpby_norm_fast(ret,a,b,x,y);
|
||||
#else
|
||||
ret = a*x+b*y;
|
||||
RealD nn=norm2(ret);
|
||||
return nn;
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Trace product
|
||||
|
@@ -237,16 +237,19 @@ public:
|
||||
vobj vtmp;
|
||||
vtmp = r;
|
||||
#if 1
|
||||
deviceVector<vobj> vvtmp(1);
|
||||
acceleratorPut(vvtmp[0],vtmp);
|
||||
vobj *vvtmp_p = & vvtmp[0];
|
||||
auto me = View(AcceleratorWrite);
|
||||
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
||||
auto stmp=coalescedRead(*vvtmp_p);
|
||||
coalescedWrite(me[ss],stmp);
|
||||
});
|
||||
#else
|
||||
auto me = View(CpuWrite);
|
||||
thread_for(ss,me.size(),{
|
||||
me[ss]= r;
|
||||
});
|
||||
#else
|
||||
auto me = View(AcceleratorWrite);
|
||||
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
||||
auto stmp=coalescedRead(vtmp);
|
||||
coalescedWrite(me[ss],stmp);
|
||||
});
|
||||
#endif
|
||||
me.ViewClose();
|
||||
return *this;
|
||||
@@ -370,14 +373,17 @@ public:
|
||||
|
||||
template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
|
||||
typedef typename vobj::scalar_object sobj;
|
||||
for(int64_t g=0;g<o.Grid()->_gsites;g++){
|
||||
uint64_t gsites=1;
|
||||
uint64_t polesites=0;
|
||||
for(int d=0;d<o.Grid()->_ndimension;d++) gsites *= o.Grid()->_gdimensions[d];
|
||||
for(int64_t g=0;g<gsites;g++){
|
||||
|
||||
Coordinate gcoor;
|
||||
o.Grid()->GlobalIndexToGlobalCoor(g,gcoor);
|
||||
|
||||
sobj ss;
|
||||
peekSite(ss,o,gcoor);
|
||||
stream<<"[";
|
||||
stream<<"["<< g<<" : ";
|
||||
for(int d=0;d<gcoor.size();d++){
|
||||
stream<<gcoor[d];
|
||||
if(d!=gcoor.size()-1) stream<<",";
|
||||
@@ -385,6 +391,41 @@ template<class vobj> std::ostream& operator<< (std::ostream& stream, const Latti
|
||||
stream<<"]\t";
|
||||
stream<<ss<<std::endl;
|
||||
}
|
||||
if ( o.Grid()->isIcosahedralVertex() ) {
|
||||
uint64_t psites=1;
|
||||
Coordinate perpdims;
|
||||
for(int d=2;d<o.Grid()->_ndimension-1;d++){
|
||||
int pd=o.Grid()->_gdimensions[d];
|
||||
psites*=pd;
|
||||
perpdims.push_back(pd);
|
||||
}
|
||||
for(uint64_t p=0;p<psites;p++){
|
||||
sobj ss;
|
||||
Coordinate orthog;
|
||||
Lexicographic::CoorFromIndex(orthog,p,perpdims);
|
||||
peekPole(ss,o,orthog,South);
|
||||
stream<<"[ SouthPole : ";
|
||||
for(int d=0;d<orthog.size();d++){
|
||||
stream<<orthog[d];
|
||||
if(d!=orthog.size()-1) stream<<",";
|
||||
}
|
||||
stream<<"]\t";
|
||||
stream<<ss<<std::endl;
|
||||
}
|
||||
for(uint64_t p=0;p<psites;p++){
|
||||
sobj ss;
|
||||
Coordinate orthog;
|
||||
Lexicographic::CoorFromIndex(orthog,p,perpdims);
|
||||
peekPole(ss,o,orthog,North);
|
||||
stream<<"[ NorthPole : ";
|
||||
for(int d=0;d<orthog.size();d++){
|
||||
stream<<orthog[d];
|
||||
if(d!=orthog.size()-1) stream<<",";
|
||||
}
|
||||
stream<<"]\t";
|
||||
stream<<ss<<std::endl;
|
||||
}
|
||||
}
|
||||
return stream;
|
||||
}
|
||||
|
||||
|
@@ -53,36 +53,19 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
|
||||
typedef decltype(basis[0]) Field;
|
||||
typedef decltype(basis[0].View(AcceleratorRead)) View;
|
||||
|
||||
Vector<View> basis_v; basis_v.reserve(basis.size());
|
||||
typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
|
||||
hostVector<View> h_basis_v(basis.size());
|
||||
deviceVector<View> d_basis_v(basis.size());
|
||||
typedef typename std::remove_reference<decltype(h_basis_v[0][0])>::type vobj;
|
||||
typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
|
||||
|
||||
GridBase* grid = basis[0].Grid();
|
||||
|
||||
for(int k=0;k<basis.size();k++){
|
||||
basis_v.push_back(basis[k].View(AcceleratorWrite));
|
||||
h_basis_v[k] = basis[k].View(AcceleratorWrite);
|
||||
acceleratorPut(d_basis_v[k],h_basis_v[k]);
|
||||
}
|
||||
|
||||
#if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) )
|
||||
int max_threads = thread_max();
|
||||
Vector < vobj > Bt(Nm * max_threads);
|
||||
thread_region
|
||||
{
|
||||
vobj* B = &Bt[Nm * thread_num()];
|
||||
thread_for_in_region(ss, grid->oSites(),{
|
||||
for(int j=j0; j<j1; ++j) B[j]=0.;
|
||||
|
||||
for(int j=j0; j<j1; ++j){
|
||||
for(int k=k0; k<k1; ++k){
|
||||
B[j] +=Qt(j,k) * basis_v[k][ss];
|
||||
}
|
||||
}
|
||||
for(int j=j0; j<j1; ++j){
|
||||
basis_v[j][ss] = B[j];
|
||||
}
|
||||
});
|
||||
}
|
||||
#else
|
||||
View *basis_vp = &basis_v[0];
|
||||
View *basis_vp = &d_basis_v[0];
|
||||
|
||||
int nrot = j1-j0;
|
||||
if (!nrot) // edge case not handled gracefully by Cuda
|
||||
@@ -91,17 +74,19 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
|
||||
uint64_t oSites =grid->oSites();
|
||||
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
|
||||
|
||||
Vector <vobj> Bt(siteBlock * nrot);
|
||||
deviceVector <vobj> Bt(siteBlock * nrot);
|
||||
auto Bp=&Bt[0];
|
||||
|
||||
// GPU readable copy of matrix
|
||||
Vector<Coeff_t> Qt_jv(Nm*Nm);
|
||||
hostVector<Coeff_t> h_Qt_jv(Nm*Nm);
|
||||
deviceVector<Coeff_t> Qt_jv(Nm*Nm);
|
||||
Coeff_t *Qt_p = & Qt_jv[0];
|
||||
thread_for(i,Nm*Nm,{
|
||||
int j = i/Nm;
|
||||
int k = i%Nm;
|
||||
Qt_p[i]=Qt(j,k);
|
||||
h_Qt_jv[i]=Qt(j,k);
|
||||
});
|
||||
acceleratorCopyToDevice(&h_Qt_jv[0],Qt_p,Nm*Nm*sizeof(Coeff_t));
|
||||
|
||||
// Block the loop to keep storage footprint down
|
||||
for(uint64_t s=0;s<oSites;s+=siteBlock){
|
||||
@@ -137,9 +122,8 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
|
||||
coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
|
||||
});
|
||||
}
|
||||
#endif
|
||||
|
||||
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
|
||||
for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
|
||||
}
|
||||
|
||||
// Extract a single rotated vector
|
||||
@@ -152,16 +136,19 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
|
||||
|
||||
result.Checkerboard() = basis[0].Checkerboard();
|
||||
|
||||
Vector<View> basis_v; basis_v.reserve(basis.size());
|
||||
hostVector<View> h_basis_v(basis.size());
|
||||
deviceVector<View> d_basis_v(basis.size());
|
||||
for(int k=0;k<basis.size();k++){
|
||||
basis_v.push_back(basis[k].View(AcceleratorRead));
|
||||
h_basis_v[k]=basis[k].View(AcceleratorRead);
|
||||
acceleratorPut(d_basis_v[k],h_basis_v[k]);
|
||||
}
|
||||
vobj zz=Zero();
|
||||
Vector<double> Qt_jv(Nm);
|
||||
double * Qt_j = & Qt_jv[0];
|
||||
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
|
||||
|
||||
auto basis_vp=& basis_v[0];
|
||||
vobj zz=Zero();
|
||||
deviceVector<double> Qt_jv(Nm);
|
||||
double * Qt_j = & Qt_jv[0];
|
||||
for(int k=0;k<Nm;++k) acceleratorPut(Qt_j[k],Qt(j,k));
|
||||
|
||||
auto basis_vp=& d_basis_v[0];
|
||||
autoView(result_v,result,AcceleratorWrite);
|
||||
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
|
||||
vobj zzz=Zero();
|
||||
@@ -171,7 +158,7 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
|
||||
}
|
||||
coalescedWrite(result_v[ss], B);
|
||||
});
|
||||
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
|
||||
for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
|
||||
}
|
||||
|
||||
template<class Field>
|
||||
|
@@ -34,11 +34,18 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
|
||||
typedef typename iobj::scalar_type scalar_type;
|
||||
typedef typename iobj::vector_type vector_type;
|
||||
|
||||
l=Zero();
|
||||
|
||||
GridBase *grid = l.Grid();
|
||||
int Nsimd = grid->iSites();
|
||||
|
||||
int cartesian_vol = grid->oSites();
|
||||
if ( grid->isIcosahedral() ) {
|
||||
cartesian_vol = cartesian_vol - grid->NorthPoleOsites()-grid->SouthPoleOsites();
|
||||
}
|
||||
{
|
||||
autoView(l_v, l, CpuWrite);
|
||||
thread_for( o, grid->oSites(), {
|
||||
thread_for( o, cartesian_vol, {
|
||||
vector_type vI;
|
||||
Coordinate gcoor;
|
||||
ExtractBuffer<scalar_type> mergebuf(Nsimd);
|
||||
@@ -49,7 +56,64 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
|
||||
merge<vector_type,scalar_type>(vI,mergebuf);
|
||||
l_v[o]=vI;
|
||||
});
|
||||
}
|
||||
|
||||
if (grid->isIcosahedralVertex()) {
|
||||
uint64_t psites=1;
|
||||
Coordinate perpdims;
|
||||
typename iobj::scalar_object ss;
|
||||
for(int d=2;d<grid->_ndimension-1;d++){
|
||||
int pd=grid->_gdimensions[d];
|
||||
psites*=pd;
|
||||
perpdims.push_back(pd);
|
||||
}
|
||||
for(uint64_t p=0;p<psites;p++){
|
||||
Coordinate orthog;
|
||||
Lexicographic::CoorFromIndex(orthog,p,perpdims);
|
||||
|
||||
int icoor;
|
||||
if ( mu>=2 && mu < grid->_ndimension-1) {
|
||||
icoor = orthog[mu-2];
|
||||
} else {
|
||||
icoor = -1;
|
||||
}
|
||||
|
||||
ss=scalar_type(icoor);
|
||||
|
||||
pokePole(ss,l,orthog,South);
|
||||
pokePole(ss,l,orthog,North);
|
||||
}
|
||||
}
|
||||
};
|
||||
template<class iobj> inline void LatticePole(Lattice<iobj> &l,NorthSouth pole)
|
||||
{
|
||||
typedef typename iobj::scalar_object sobj;
|
||||
typedef typename iobj::scalar_type scalar_type;
|
||||
typedef typename iobj::vector_type vector_type;
|
||||
|
||||
GridBase *grid = l.Grid();
|
||||
|
||||
l=Zero();
|
||||
|
||||
assert(grid->isIcosahedralVertex());
|
||||
|
||||
if (grid->isIcosahedralVertex()) {
|
||||
uint64_t psites=1;
|
||||
Coordinate perpdims;
|
||||
sobj ss;
|
||||
scalar_type one(1.0);
|
||||
ss=one;
|
||||
for(int d=2;d<l.Grid()->_ndimension-1;d++){
|
||||
int pd=l.Grid()->_gdimensions[d];
|
||||
psites*=pd;
|
||||
perpdims.push_back(pd);
|
||||
}
|
||||
for(uint64_t p=0;p<psites;p++){
|
||||
Coordinate orthog;
|
||||
Lexicographic::CoorFromIndex(orthog,p,perpdims);
|
||||
pokePole(ss,l,orthog,pole);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
|
@@ -141,7 +141,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
|
||||
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
|
||||
|
||||
ExtractBuffer<sobj> buf(Nsimd);
|
||||
autoView( l_v , l, CpuWrite);
|
||||
autoView( l_v , l, CpuRead);
|
||||
extract(l_v[odx],buf);
|
||||
|
||||
s = buf[idx];
|
||||
@@ -151,6 +151,261 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
|
||||
return;
|
||||
};
|
||||
|
||||
// zero for south pole, one for north pole
|
||||
template<class vobj,class sobj>
|
||||
void peekPole(sobj &s,const Lattice<vobj> &l,const Coordinate &orthog,NorthSouth isNorth)
|
||||
{
|
||||
s=Zero();
|
||||
|
||||
GridBase *grid=l.Grid();
|
||||
|
||||
assert(grid->isIcosahedral());
|
||||
assert(grid->isIcosahedralVertex());
|
||||
|
||||
int Nsimd = grid->Nsimd();
|
||||
|
||||
int rank;
|
||||
|
||||
int Ndm1 = grid->_ndimension-1;
|
||||
Coordinate pgrid = grid->ProcessorGrid();
|
||||
const int xdim=0;
|
||||
const int ydim=1;
|
||||
const int pdim=Ndm1;
|
||||
|
||||
int64_t pole_osite;
|
||||
int64_t pole_isite;
|
||||
Coordinate rdims;
|
||||
Coordinate idims;
|
||||
Coordinate ocoor;
|
||||
Coordinate icoor;
|
||||
Coordinate pcoor(grid->_ndimension);
|
||||
for(int d=2;d<Ndm1;d++){
|
||||
int dd=d-2;
|
||||
rdims.push_back(grid->_rdimensions[d]);
|
||||
idims.push_back(grid->_simd_layout[d]);
|
||||
icoor.push_back((orthog[dd]%grid->_ldimensions[d])/grid->_rdimensions[d]);
|
||||
ocoor.push_back(orthog[dd]%grid->_rdimensions[d]);
|
||||
pcoor[d] = orthog[dd]/grid->_ldimensions[d];
|
||||
}
|
||||
Lexicographic::IndexFromCoor(ocoor,pole_osite,rdims);
|
||||
Lexicographic::IndexFromCoor(icoor,pole_isite,idims);
|
||||
|
||||
int64_t osite;
|
||||
if(isNorth == North){
|
||||
pcoor[xdim] = 0;
|
||||
pcoor[ydim] = pgrid[ydim]-1;
|
||||
pcoor[Ndm1] = pgrid[Ndm1]-1;
|
||||
osite = pole_osite + grid->NorthPoleOsite();
|
||||
} else {
|
||||
pcoor[xdim] = pgrid[xdim]-1;
|
||||
pcoor[ydim] = 0;
|
||||
pcoor[Ndm1] = 0;
|
||||
osite = pole_osite + grid->SouthPoleOsite();
|
||||
}
|
||||
|
||||
rank = grid->RankFromProcessorCoor(pcoor);
|
||||
|
||||
if ( rank == grid->ThisRank() ) {
|
||||
ExtractBuffer<sobj> buf(Nsimd);
|
||||
autoView( l_v , l, CpuWrite);
|
||||
extract(l_v[osite],buf);
|
||||
s = buf[pole_isite];
|
||||
}
|
||||
grid->Broadcast(rank,s);
|
||||
|
||||
return;
|
||||
};
|
||||
template<class vobj,class sobj>
|
||||
void pokePole(const sobj &s,Lattice<vobj> &l,const Coordinate &orthog,NorthSouth isNorth)
|
||||
{
|
||||
GridBase *grid=l.Grid();
|
||||
|
||||
assert(grid->isIcosahedral());
|
||||
assert(grid->isIcosahedralVertex());
|
||||
|
||||
grid->Broadcast(grid->BossRank(),s);
|
||||
|
||||
int Nsimd = grid->Nsimd();
|
||||
int rank;
|
||||
int Ndm1 = grid->_ndimension-1;
|
||||
Coordinate pgrid = grid->ProcessorGrid();
|
||||
const int xdim=0;
|
||||
const int ydim=1;
|
||||
const int pdim=Ndm1;
|
||||
|
||||
int64_t pole_osite;
|
||||
int64_t pole_isite;
|
||||
Coordinate rdims;
|
||||
Coordinate idims;
|
||||
Coordinate ocoor;
|
||||
Coordinate icoor;
|
||||
Coordinate pcoor(grid->_ndimension,0);
|
||||
for(int d=2;d<Ndm1;d++){
|
||||
int dd = d-2;
|
||||
rdims.push_back(grid->_rdimensions[d]);
|
||||
idims.push_back(grid->_simd_layout[d]);
|
||||
icoor.push_back((orthog[dd]%grid->_ldimensions[d])/grid->_rdimensions[d]);
|
||||
ocoor.push_back(orthog[dd]%grid->_rdimensions[d]);
|
||||
pcoor[d] = orthog[dd]/grid->_ldimensions[d];
|
||||
|
||||
int o = orthog[dd];
|
||||
int r = grid->_rdimensions[d];
|
||||
int omr = o % r;
|
||||
}
|
||||
Lexicographic::IndexFromCoor(ocoor,pole_osite,rdims);
|
||||
Lexicographic::IndexFromCoor(icoor,pole_isite,idims);
|
||||
|
||||
int64_t osite;
|
||||
if(isNorth ==North){
|
||||
pcoor[xdim] = 0;
|
||||
pcoor[ydim] = pgrid[ydim]-1;
|
||||
pcoor[Ndm1] = pgrid[Ndm1]-1;
|
||||
osite = pole_osite + grid->NorthPoleOsite();
|
||||
} else {
|
||||
pcoor[xdim] = pgrid[xdim]-1;
|
||||
pcoor[ydim] = 0;
|
||||
pcoor[Ndm1] = 0;
|
||||
osite = pole_osite + grid->SouthPoleOsite();
|
||||
}
|
||||
|
||||
rank = grid->RankFromProcessorCoor(pcoor);
|
||||
|
||||
// extract-modify-merge cycle is easiest way and this is not perf critical
|
||||
if ( rank == grid->ThisRank() ) {
|
||||
ExtractBuffer<sobj> buf(Nsimd);
|
||||
autoView( l_v , l, CpuWrite);
|
||||
extract(l_v[osite],buf);
|
||||
buf[pole_isite] = s;
|
||||
merge(l_v[osite],buf);
|
||||
}
|
||||
return;
|
||||
};
|
||||
|
||||
|
||||
template<class vobj,class sobj>
|
||||
void peekLocalPole(sobj &s,const Lattice<vobj> &l,const Coordinate &orthog,NorthSouth isNorth)
|
||||
{
|
||||
s=Zero();
|
||||
|
||||
GridBase *grid=l.Grid();
|
||||
|
||||
assert(grid->isIcosahedral());
|
||||
assert(grid->isIcosahedralVertex());
|
||||
|
||||
int Nsimd = grid->Nsimd();
|
||||
|
||||
int rank;
|
||||
|
||||
int Ndm1 = grid->_ndimension-1;
|
||||
Coordinate pgrid = grid->ProcessorGrid();
|
||||
const int xdim=0;
|
||||
const int ydim=1;
|
||||
const int pdim=Ndm1;
|
||||
|
||||
int64_t pole_osite;
|
||||
int64_t pole_isite;
|
||||
Coordinate rdims;
|
||||
Coordinate idims;
|
||||
Coordinate ocoor;
|
||||
Coordinate icoor;
|
||||
// Coordinate pcoor(grid->_ndimension);
|
||||
for(int d=2;d<Ndm1;d++){
|
||||
int dd=d-2;
|
||||
rdims.push_back(grid->_rdimensions[d]);
|
||||
idims.push_back(grid->_simd_layout[d]);
|
||||
icoor.push_back((orthog[dd]%grid->_ldimensions[d])/grid->_rdimensions[d]);
|
||||
ocoor.push_back(orthog[dd]%grid->_rdimensions[d]);
|
||||
// pcoor[d] = orthog[dd]/grid->_ldimensions[d];
|
||||
}
|
||||
Lexicographic::IndexFromCoor(ocoor,pole_osite,rdims);
|
||||
Lexicographic::IndexFromCoor(icoor,pole_isite,idims);
|
||||
|
||||
int64_t osite;
|
||||
if(isNorth == North){
|
||||
// pcoor[xdim] = 0;
|
||||
// pcoor[ydim] = pgrid[ydim]-1;
|
||||
// pcoor[Ndm1] = pgrid[Ndm1]-1;
|
||||
osite = pole_osite + grid->NorthPoleOsite();
|
||||
assert(grid->ownsNorthPole());
|
||||
} else {
|
||||
// pcoor[xdim] = pgrid[xdim]-1;
|
||||
// pcoor[ydim] = 0;
|
||||
// pcoor[Ndm1] = 0;
|
||||
osite = pole_osite + grid->SouthPoleOsite();
|
||||
assert(grid->ownsSouthPole());
|
||||
}
|
||||
|
||||
ExtractBuffer<sobj> buf(Nsimd);
|
||||
autoView( l_v , l, CpuWrite);
|
||||
extract(l_v[osite],buf);
|
||||
s = buf[pole_isite];
|
||||
|
||||
return;
|
||||
};
|
||||
template<class vobj,class sobj>
|
||||
void pokeLocalPole(const sobj &s,Lattice<vobj> &l,const Coordinate &orthog,NorthSouth isNorth)
|
||||
{
|
||||
GridBase *grid=l.Grid();
|
||||
|
||||
assert(grid->isIcosahedral());
|
||||
assert(grid->isIcosahedralVertex());
|
||||
|
||||
int Nsimd = grid->Nsimd();
|
||||
int rank;
|
||||
int Ndm1 = grid->_ndimension-1;
|
||||
|
||||
const int xdim=0;
|
||||
const int ydim=1;
|
||||
const int pdim=Ndm1;
|
||||
|
||||
int64_t pole_osite;
|
||||
int64_t pole_isite;
|
||||
Coordinate rdims;
|
||||
Coordinate idims;
|
||||
Coordinate ocoor;
|
||||
Coordinate icoor;
|
||||
// Coordinate pcoor(grid->_ndimension,0);
|
||||
for(int d=2;d<Ndm1;d++){
|
||||
int dd = d-2;
|
||||
rdims.push_back(grid->_rdimensions[d]);
|
||||
idims.push_back(grid->_simd_layout[d]);
|
||||
icoor.push_back((orthog[dd]%grid->_ldimensions[d])/grid->_rdimensions[d]);
|
||||
ocoor.push_back(orthog[dd]%grid->_rdimensions[d]);
|
||||
// pcoor[d] = orthog[dd]/grid->_ldimensions[d];
|
||||
|
||||
int o = orthog[dd];
|
||||
int r = grid->_rdimensions[d];
|
||||
int omr = o % r;
|
||||
}
|
||||
Lexicographic::IndexFromCoor(ocoor,pole_osite,rdims);
|
||||
Lexicographic::IndexFromCoor(icoor,pole_isite,idims);
|
||||
|
||||
int64_t osite;
|
||||
int insert=0;
|
||||
if(isNorth ==North){
|
||||
// pcoor[xdim] = 0;
|
||||
// pcoor[ydim] = pgrid[ydim]-1;
|
||||
// pcoor[Ndm1] = pgrid[Ndm1]-1;
|
||||
osite = pole_osite + grid->NorthPoleOsite();
|
||||
assert(grid->ownsNorthPole());
|
||||
} else {
|
||||
// pcoor[xdim] = pgrid[xdim]-1;
|
||||
// pcoor[ydim] = 0;
|
||||
// pcoor[Ndm1] = 0;
|
||||
osite = pole_osite + grid->SouthPoleOsite();
|
||||
assert(grid->ownsSouthPole());
|
||||
}
|
||||
|
||||
// extract-modify-merge cycle is easiest way and this is not perf critical
|
||||
ExtractBuffer<sobj> buf(Nsimd);
|
||||
autoView( l_v , l, CpuWrite);
|
||||
extract(l_v[osite],buf);
|
||||
buf[pole_isite] = s;
|
||||
merge(l_v[osite],buf);
|
||||
|
||||
return;
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////
|
||||
// Peek a scalar object from the SIMD array
|
||||
//////////////////////////////////////////////////////////
|
||||
@@ -165,7 +420,7 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
|
||||
|
||||
int Nsimd = grid->Nsimd();
|
||||
|
||||
assert( l.Checkerboard()== grid->CheckerBoard(site));
|
||||
// assert( l.Checkerboard()== grid->CheckerBoard(site));
|
||||
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
|
||||
|
||||
static const int words=sizeof(vobj)/sizeof(vector_type);
|
||||
@@ -202,7 +457,7 @@ inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
|
||||
|
||||
int Nsimd = grid->Nsimd();
|
||||
|
||||
assert( l.Checkerboard()== grid->CheckerBoard(site));
|
||||
// assert( l.Checkerboard()== grid->CheckerBoard(site));
|
||||
assert( sizeof(sobj)*Nsimd == sizeof(vobj));
|
||||
|
||||
static const int words=sizeof(vobj)/sizeof(vector_type);
|
||||
|
@@ -46,7 +46,7 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
|
||||
// const int Nsimd = vobj::Nsimd();
|
||||
const int nthread = GridThread::GetThreads();
|
||||
|
||||
Vector<sobj> sumarray(nthread);
|
||||
std::vector<sobj> sumarray(nthread);
|
||||
for(int i=0;i<nthread;i++){
|
||||
sumarray[i]=Zero();
|
||||
}
|
||||
@@ -75,7 +75,7 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
|
||||
|
||||
const int nthread = GridThread::GetThreads();
|
||||
|
||||
Vector<sobj> sumarray(nthread);
|
||||
std::vector<sobj> sumarray(nthread);
|
||||
for(int i=0;i<nthread;i++){
|
||||
sumarray[i]=Zero();
|
||||
}
|
||||
@@ -290,8 +290,10 @@ template<class vobj>
|
||||
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
|
||||
GridBase *grid = left.Grid();
|
||||
|
||||
bool ok;
|
||||
#ifdef GRID_SYCL
|
||||
uint64_t csum=0;
|
||||
uint64_t csum2=0;
|
||||
if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
|
||||
{
|
||||
// Hack
|
||||
@@ -300,13 +302,33 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
|
||||
Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
|
||||
uint64_t *base= (uint64_t *)&l_v[0];
|
||||
csum=svm_xor(base,words);
|
||||
ok = FlightRecorder::CsumLog(csum);
|
||||
if ( !ok ) {
|
||||
csum2=svm_xor(base,words);
|
||||
std::cerr<< " Bad CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
|
||||
} else {
|
||||
// csum2=svm_xor(base,words);
|
||||
// std::cerr<< " ok CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
|
||||
}
|
||||
assert(ok);
|
||||
}
|
||||
FlightRecorder::CsumLog(csum);
|
||||
#endif
|
||||
FlightRecorder::StepLog("rank inner product");
|
||||
ComplexD nrm = rankInnerProduct(left,right);
|
||||
// ComplexD nrmck=nrm;
|
||||
RealD local = real(nrm);
|
||||
FlightRecorder::NormLog(real(nrm));
|
||||
ok = FlightRecorder::NormLog(real(nrm));
|
||||
if ( !ok ) {
|
||||
ComplexD nrm2 = rankInnerProduct(left,right);
|
||||
RealD local2 = real(nrm2);
|
||||
std::cerr<< " Bad NORM " << local << " recomputed as "<<local2<<std::endl;
|
||||
assert(ok);
|
||||
}
|
||||
FlightRecorder::StepLog("Start global sum");
|
||||
// grid->GlobalSumP2P(nrm);
|
||||
grid->GlobalSum(nrm);
|
||||
FlightRecorder::StepLog("Finished global sum");
|
||||
// std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
|
||||
FlightRecorder::ReductionLog(local,real(nrm));
|
||||
return nrm;
|
||||
}
|
||||
@@ -343,18 +365,6 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
|
||||
autoView( x_v, x, AcceleratorRead);
|
||||
autoView( y_v, y, AcceleratorRead);
|
||||
autoView( z_v, z, AcceleratorWrite);
|
||||
#if 0
|
||||
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
|
||||
Vector<inner_t> inner_tmp(sites);
|
||||
auto inner_tmp_v = &inner_tmp[0];
|
||||
|
||||
accelerator_for( ss, sites, nsimd,{
|
||||
auto tmp = a*x_v(ss)+b*y_v(ss);
|
||||
coalescedWrite(inner_tmp_v[ss],innerProductD(tmp,tmp));
|
||||
coalescedWrite(z_v[ss],tmp);
|
||||
});
|
||||
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
|
||||
#else
|
||||
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
|
||||
deviceVector<inner_t> inner_tmp;
|
||||
inner_tmp.resize(sites);
|
||||
@@ -365,9 +375,44 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
|
||||
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
|
||||
coalescedWrite(z_v[ss],tmp);
|
||||
});
|
||||
nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
|
||||
bool ok;
|
||||
#ifdef GRID_SYCL
|
||||
uint64_t csum=0;
|
||||
uint64_t csum2=0;
|
||||
if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
|
||||
{
|
||||
// z_v
|
||||
{
|
||||
Integer words = sites*sizeof(vobj)/sizeof(uint64_t);
|
||||
uint64_t *base= (uint64_t *)&z_v[0];
|
||||
csum=svm_xor(base,words);
|
||||
ok = FlightRecorder::CsumLog(csum);
|
||||
if ( !ok ) {
|
||||
csum2=svm_xor(base,words);
|
||||
std::cerr<< " Bad z_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
|
||||
}
|
||||
assert(ok);
|
||||
}
|
||||
// inner_v
|
||||
{
|
||||
Integer words = sites*sizeof(inner_t)/sizeof(uint64_t);
|
||||
uint64_t *base= (uint64_t *)&inner_tmp_v[0];
|
||||
csum=svm_xor(base,words);
|
||||
ok = FlightRecorder::CsumLog(csum);
|
||||
if ( !ok ) {
|
||||
csum2=svm_xor(base,words);
|
||||
std::cerr<< " Bad inner_tmp_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
|
||||
}
|
||||
assert(ok);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
|
||||
ok = FlightRecorder::NormLog(real(nrm));
|
||||
assert(ok);
|
||||
RealD local = real(nrm);
|
||||
grid->GlobalSum(nrm);
|
||||
FlightRecorder::ReductionLog(local,real(nrm));
|
||||
return nrm;
|
||||
}
|
||||
|
||||
@@ -377,7 +422,7 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
|
||||
conformable(left,right);
|
||||
|
||||
typedef typename vobj::vector_typeD vector_type;
|
||||
Vector<ComplexD> tmp(2);
|
||||
std::vector<ComplexD> tmp(2);
|
||||
|
||||
GridBase *grid = left.Grid();
|
||||
|
||||
@@ -387,8 +432,8 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
|
||||
// GPU
|
||||
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
|
||||
typedef decltype(innerProductD(vobj(),vobj())) norm_t;
|
||||
Vector<inner_t> inner_tmp(sites);
|
||||
Vector<norm_t> norm_tmp(sites);
|
||||
deviceVector<inner_t> inner_tmp(sites);
|
||||
deviceVector<norm_t> norm_tmp(sites);
|
||||
auto inner_tmp_v = &inner_tmp[0];
|
||||
auto norm_tmp_v = &norm_tmp[0];
|
||||
{
|
||||
@@ -438,7 +483,9 @@ inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
|
||||
// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
|
||||
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
|
||||
std::vector<typename vobj::scalar_object> &result,
|
||||
int orthogdim)
|
||||
{
|
||||
///////////////////////////////////////////////////////
|
||||
// FIXME precision promoted summation
|
||||
@@ -460,8 +507,8 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
|
||||
int ld=grid->_ldimensions[orthogdim];
|
||||
int rd=grid->_rdimensions[orthogdim];
|
||||
|
||||
Vector<vobj> lvSum(rd); // will locally sum vectors first
|
||||
Vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
|
||||
std::vector<vobj> lvSum(rd); // will locally sum vectors first
|
||||
std::vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
|
||||
ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD
|
||||
|
||||
result.resize(fd); // And then global sum to return the same vector to every node
|
||||
@@ -509,6 +556,8 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
|
||||
scalar_type * ptr = (scalar_type *) &result[0];
|
||||
int words = fd*sizeof(sobj)/sizeof(scalar_type);
|
||||
grid->GlobalSumVector(ptr, words);
|
||||
// std::cout << GridLogMessage << " sliceSum local"<<t_sum<<" us, host+mpi "<<t_rest<<std::endl;
|
||||
|
||||
}
|
||||
template<class vobj> inline
|
||||
std::vector<typename vobj::scalar_object>
|
||||
@@ -519,7 +568,20 @@ sliceSum(const Lattice<vobj> &Data,int orthogdim)
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
|
||||
Reimplement
|
||||
|
||||
1)
|
||||
template<class vobj>
|
||||
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
|
||||
|
||||
2)
|
||||
template<class vobj>
|
||||
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
|
||||
|
||||
3)
|
||||
-- Make Slice Mul Matrix call sliceMaddMatrix
|
||||
*/
|
||||
template<class vobj>
|
||||
static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
|
||||
{
|
||||
@@ -539,8 +601,8 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
|
||||
int ld=grid->_ldimensions[orthogdim];
|
||||
int rd=grid->_rdimensions[orthogdim];
|
||||
|
||||
Vector<vector_type> lvSum(rd); // will locally sum vectors first
|
||||
Vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
|
||||
std::vector<vector_type> lvSum(rd); // will locally sum vectors first
|
||||
std::vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
|
||||
ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
|
||||
|
||||
result.resize(fd); // And then global sum to return the same vector to every node for IO to file
|
||||
@@ -670,203 +732,96 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
|
||||
}
|
||||
};
|
||||
|
||||
/*
|
||||
inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
|
||||
{
|
||||
int NN = BlockSolverGrid->_ndimension;
|
||||
int nsimd = BlockSolverGrid->Nsimd();
|
||||
|
||||
std::vector<int> latt_phys(0);
|
||||
std::vector<int> simd_phys(0);
|
||||
std::vector<int> mpi_phys(0);
|
||||
std::vector<int> latt_phys(NN-1);
|
||||
Coordinate simd_phys;
|
||||
std::vector<int> mpi_phys(NN-1);
|
||||
Coordinate checker_dim_mask(NN-1);
|
||||
int checker_dim=-1;
|
||||
|
||||
int dd;
|
||||
for(int d=0;d<NN;d++){
|
||||
if( d!=Orthog ) {
|
||||
latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);
|
||||
simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);
|
||||
mpi_phys.push_back(BlockSolverGrid->_processors[d]);
|
||||
latt_phys[dd]=BlockSolverGrid->_fdimensions[d];
|
||||
mpi_phys[dd] =BlockSolverGrid->_processors[d];
|
||||
checker_dim_mask[dd] = BlockSolverGrid->_checker_dim_mask[d];
|
||||
if ( d == BlockSolverGrid->_checker_dim ) checker_dim = dd;
|
||||
dd++;
|
||||
}
|
||||
}
|
||||
return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);
|
||||
simd_phys=GridDefaultSimd(latt_phys.size(),nsimd);
|
||||
GridCartesian *tmp = new GridCartesian(latt_phys,simd_phys,mpi_phys);
|
||||
if(BlockSolverGrid->_isCheckerBoarded) {
|
||||
GridRedBlackCartesian *ret = new GridRedBlackCartesian(tmp,checker_dim_mask,checker_dim);
|
||||
delete tmp;
|
||||
return (GridBase *) ret;
|
||||
} else {
|
||||
return (GridBase *) tmp;
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
template<class vobj>
|
||||
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
|
||||
{
|
||||
GridBase *FullGrid = X.Grid();
|
||||
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
|
||||
|
||||
Lattice<vobj> Ys(SliceGrid);
|
||||
Lattice<vobj> Rs(SliceGrid);
|
||||
Lattice<vobj> Xs(SliceGrid);
|
||||
Lattice<vobj> RR(FullGrid);
|
||||
|
||||
RR = R; // Copies checkerboard for insert
|
||||
|
||||
typedef typename vobj::scalar_object sobj;
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
|
||||
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
|
||||
|
||||
GridBase *FullGrid = X.Grid();
|
||||
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
|
||||
|
||||
// Lattice<vobj> Xslice(SliceGrid);
|
||||
// Lattice<vobj> Rslice(SliceGrid);
|
||||
|
||||
assert( FullGrid->_simd_layout[Orthog]==1);
|
||||
// int nh = FullGrid->_ndimension;
|
||||
// int nl = SliceGrid->_ndimension;
|
||||
// int nl = nh-1;
|
||||
|
||||
//FIXME package in a convenient iterator
|
||||
//Should loop over a plane orthogonal to direction "Orthog"
|
||||
int stride=FullGrid->_slice_stride[Orthog];
|
||||
int block =FullGrid->_slice_block [Orthog];
|
||||
int nblock=FullGrid->_slice_nblock[Orthog];
|
||||
int ostride=FullGrid->_ostride[Orthog];
|
||||
|
||||
autoView( X_v, X, CpuRead);
|
||||
autoView( Y_v, Y, CpuRead);
|
||||
autoView( R_v, R, CpuWrite);
|
||||
thread_region
|
||||
{
|
||||
Vector<vobj> s_x(Nblock);
|
||||
|
||||
thread_for_collapse_in_region(2, n,nblock, {
|
||||
for(int b=0;b<block;b++){
|
||||
int o = n*stride + b;
|
||||
|
||||
for(int i=0;i<Nblock;i++){
|
||||
s_x[i] = X_v[o+i*ostride];
|
||||
int Nslice = X.Grid()->GlobalDimensions()[Orthog];
|
||||
for(int i=0;i<Nslice;i++){
|
||||
ExtractSlice(Ys,Y,i,Orthog);
|
||||
ExtractSlice(Rs,R,i,Orthog);
|
||||
Rs=Ys;
|
||||
for(int j=0;j<Nslice;j++){
|
||||
ExtractSlice(Xs,X,j,Orthog);
|
||||
Rs = Rs + Xs*(scale*aa(j,i));
|
||||
}
|
||||
|
||||
vobj dot;
|
||||
for(int i=0;i<Nblock;i++){
|
||||
dot = Y_v[o+i*ostride];
|
||||
for(int j=0;j<Nblock;j++){
|
||||
dot = dot + s_x[j]*(scale*aa(j,i));
|
||||
}
|
||||
R_v[o+i*ostride]=dot;
|
||||
}
|
||||
}});
|
||||
InsertSlice(Rs,RR,i,Orthog);
|
||||
}
|
||||
R=RR; // Copy back handles arguments aliasing case
|
||||
delete SliceGrid;
|
||||
};
|
||||
|
||||
template<class vobj>
|
||||
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
|
||||
{
|
||||
typedef typename vobj::scalar_object sobj;
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
|
||||
int Nblock = X.Grid()->GlobalDimensions()[Orthog];
|
||||
|
||||
GridBase *FullGrid = X.Grid();
|
||||
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
|
||||
// Lattice<vobj> Xslice(SliceGrid);
|
||||
// Lattice<vobj> Rslice(SliceGrid);
|
||||
|
||||
assert( FullGrid->_simd_layout[Orthog]==1);
|
||||
// int nh = FullGrid->_ndimension;
|
||||
// int nl = SliceGrid->_ndimension;
|
||||
// int nl=1;
|
||||
|
||||
//FIXME package in a convenient iterator
|
||||
// thread_for2d_in_region
|
||||
//Should loop over a plane orthogonal to direction "Orthog"
|
||||
int stride=FullGrid->_slice_stride[Orthog];
|
||||
int block =FullGrid->_slice_block [Orthog];
|
||||
int nblock=FullGrid->_slice_nblock[Orthog];
|
||||
int ostride=FullGrid->_ostride[Orthog];
|
||||
autoView( R_v, R, CpuWrite);
|
||||
autoView( X_v, X, CpuRead);
|
||||
thread_region
|
||||
{
|
||||
std::vector<vobj> s_x(Nblock);
|
||||
|
||||
|
||||
thread_for_collapse_in_region( 2 ,n,nblock,{
|
||||
for(int b=0;b<block;b++){
|
||||
int o = n*stride + b;
|
||||
|
||||
for(int i=0;i<Nblock;i++){
|
||||
s_x[i] = X_v[o+i*ostride];
|
||||
}
|
||||
|
||||
vobj dot;
|
||||
for(int i=0;i<Nblock;i++){
|
||||
dot = s_x[0]*(scale*aa(0,i));
|
||||
for(int j=1;j<Nblock;j++){
|
||||
dot = dot + s_x[j]*(scale*aa(j,i));
|
||||
}
|
||||
R_v[o+i*ostride]=dot;
|
||||
}
|
||||
}});
|
||||
}
|
||||
R=Zero();
|
||||
sliceMaddMatrix(R,aa,X,R,Orthog,scale);
|
||||
};
|
||||
|
||||
|
||||
template<class vobj>
|
||||
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
|
||||
{
|
||||
GridBase *SliceGrid = makeSubSliceGrid(lhs.Grid(),Orthog);
|
||||
|
||||
Lattice<vobj> ls(SliceGrid);
|
||||
Lattice<vobj> rs(SliceGrid);
|
||||
|
||||
typedef typename vobj::scalar_object sobj;
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
|
||||
GridBase *FullGrid = lhs.Grid();
|
||||
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
|
||||
|
||||
int Nblock = FullGrid->GlobalDimensions()[Orthog];
|
||||
|
||||
// Lattice<vobj> Lslice(SliceGrid);
|
||||
// Lattice<vobj> Rslice(SliceGrid);
|
||||
|
||||
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
|
||||
|
||||
assert( FullGrid->_simd_layout[Orthog]==1);
|
||||
// int nh = FullGrid->_ndimension;
|
||||
// int nl = SliceGrid->_ndimension;
|
||||
// int nl = nh-1;
|
||||
|
||||
//FIXME package in a convenient iterator
|
||||
//Should loop over a plane orthogonal to direction "Orthog"
|
||||
int stride=FullGrid->_slice_stride[Orthog];
|
||||
int block =FullGrid->_slice_block [Orthog];
|
||||
int nblock=FullGrid->_slice_nblock[Orthog];
|
||||
int ostride=FullGrid->_ostride[Orthog];
|
||||
|
||||
typedef typename vobj::vector_typeD vector_typeD;
|
||||
|
||||
autoView( lhs_v, lhs, CpuRead);
|
||||
autoView( rhs_v, rhs, CpuRead);
|
||||
thread_region
|
||||
{
|
||||
std::vector<vobj> Left(Nblock);
|
||||
std::vector<vobj> Right(Nblock);
|
||||
Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
|
||||
|
||||
thread_for_collapse_in_region( 2, n,nblock,{
|
||||
for(int b=0;b<block;b++){
|
||||
|
||||
int o = n*stride + b;
|
||||
|
||||
for(int i=0;i<Nblock;i++){
|
||||
Left [i] = lhs_v[o+i*ostride];
|
||||
Right[i] = rhs_v[o+i*ostride];
|
||||
}
|
||||
|
||||
for(int i=0;i<Nblock;i++){
|
||||
for(int j=0;j<Nblock;j++){
|
||||
auto tmp = innerProduct(Left[i],Right[j]);
|
||||
auto rtmp = TensorRemove(tmp);
|
||||
auto red = Reduce(rtmp);
|
||||
mat_thread(i,j) += std::complex<double>(real(red),imag(red));
|
||||
}}
|
||||
}});
|
||||
thread_critical
|
||||
{
|
||||
mat += mat_thread;
|
||||
int Nslice = lhs.Grid()->GlobalDimensions()[Orthog];
|
||||
mat = Eigen::MatrixXcd::Zero(Nslice,Nslice);
|
||||
for(int s=0;s<Nslice;s++){
|
||||
ExtractSlice(ls,lhs,s,Orthog);
|
||||
for(int ss=0;ss<Nslice;ss++){
|
||||
ExtractSlice(rs,rhs,ss,Orthog);
|
||||
mat(s,ss) = innerProduct(ls,rs);
|
||||
}
|
||||
}
|
||||
|
||||
for(int i=0;i<Nblock;i++){
|
||||
for(int j=0;j<Nblock;j++){
|
||||
ComplexD sum = mat(i,j);
|
||||
FullGrid->GlobalSum(sum);
|
||||
mat(i,j)=sum;
|
||||
}}
|
||||
|
||||
return;
|
||||
delete SliceGrid;
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@@ -214,22 +214,12 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
|
||||
// Move out of UVM
|
||||
// Turns out I had messed up the synchronise after move to compute stream
|
||||
// as running this on the default stream fools the synchronise
|
||||
#undef UVM_BLOCK_BUFFER
|
||||
#ifndef UVM_BLOCK_BUFFER
|
||||
commVector<sobj> buffer(numBlocks);
|
||||
deviceVector<sobj> buffer(numBlocks);
|
||||
sobj *buffer_v = &buffer[0];
|
||||
sobj result;
|
||||
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
|
||||
accelerator_barrier();
|
||||
acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
|
||||
#else
|
||||
Vector<sobj> buffer(numBlocks);
|
||||
sobj *buffer_v = &buffer[0];
|
||||
sobj result;
|
||||
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
|
||||
accelerator_barrier();
|
||||
result = *buffer_v;
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -244,7 +234,7 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi
|
||||
|
||||
const int words = sizeof(vobj)/sizeof(vector);
|
||||
|
||||
Vector<vector> buffer(osites);
|
||||
deviceVector<vector> buffer(osites);
|
||||
vector *dat = (vector *)lat;
|
||||
vector *buf = &buffer[0];
|
||||
iScalar<vector> *tbuf =(iScalar<vector> *) &buffer[0];
|
||||
|
@@ -4,33 +4,28 @@ NAMESPACE_BEGIN(Grid);
|
||||
// Possibly promote to double and sum
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
template <class vobj>
|
||||
inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)
|
||||
{
|
||||
typedef typename vobj::scalar_object sobj;
|
||||
typedef typename vobj::scalar_objectD sobjD;
|
||||
static Vector<sobj> mysum;
|
||||
mysum.resize(1);
|
||||
sobj *mysum_p = & mysum[0];
|
||||
|
||||
sobj identity; zeroit(identity);
|
||||
mysum[0] = identity;
|
||||
sobj ret ;
|
||||
|
||||
sobj ret; zeroit(ret);
|
||||
Integer nsimd= vobj::Nsimd();
|
||||
|
||||
const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() });
|
||||
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
|
||||
auto Reduction = cl::sycl::reduction(mysum_p,identity,std::plus<>(),PropList);
|
||||
cgh.parallel_for(cl::sycl::range<1>{osites},
|
||||
{
|
||||
sycl::buffer<sobj, 1> abuff(&ret, {1});
|
||||
theGridAccelerator->submit([&](sycl::handler &cgh) {
|
||||
auto Reduction = sycl::reduction(abuff,cgh,identity,std::plus<>());
|
||||
cgh.parallel_for(sycl::range<1>{osites},
|
||||
Reduction,
|
||||
[=] (cl::sycl::id<1> item, auto &sum) {
|
||||
[=] (sycl::id<1> item, auto &sum) {
|
||||
auto osite = item[0];
|
||||
sum +=Reduce(lat[osite]);
|
||||
});
|
||||
});
|
||||
theGridAccelerator->wait();
|
||||
ret = mysum[0];
|
||||
// free(mysum,*theGridAccelerator);
|
||||
}
|
||||
sobjD dret; convertType(dret,ret);
|
||||
return dret;
|
||||
}
|
||||
@@ -76,59 +71,22 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
|
||||
|
||||
template<class Word> Word svm_xor(Word *vec,uint64_t L)
|
||||
{
|
||||
Word xorResult; xorResult = 0;
|
||||
static Vector<Word> d_sum;
|
||||
d_sum.resize(1);
|
||||
Word *d_sum_p=&d_sum[0];
|
||||
Word identity; identity=0;
|
||||
d_sum[0] = identity;
|
||||
const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() });
|
||||
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
|
||||
auto Reduction = cl::sycl::reduction(d_sum_p,identity,std::bit_xor<>(),PropList);
|
||||
cgh.parallel_for(cl::sycl::range<1>{L},
|
||||
Word ret = 0;
|
||||
{
|
||||
sycl::buffer<Word, 1> abuff(&ret, {1});
|
||||
theGridAccelerator->submit([&](sycl::handler &cgh) {
|
||||
auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
|
||||
cgh.parallel_for(sycl::range<1>{L},
|
||||
Reduction,
|
||||
[=] (cl::sycl::id<1> index, auto &sum) {
|
||||
[=] (sycl::id<1> index, auto &sum) {
|
||||
sum ^=vec[index];
|
||||
});
|
||||
});
|
||||
}
|
||||
theGridAccelerator->wait();
|
||||
Word ret = d_sum[0];
|
||||
// free(d_sum,*theGridAccelerator);
|
||||
return ret;
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
/*
|
||||
|
||||
template <class vobj>
|
||||
inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
|
||||
{
|
||||
typedef typename vobj::vector_type vector;
|
||||
typedef typename vobj::scalar_type scalar;
|
||||
|
||||
typedef typename vobj::scalar_typeD scalarD;
|
||||
typedef typename vobj::scalar_objectD sobjD;
|
||||
|
||||
sobjD ret;
|
||||
scalarD *ret_p = (scalarD *)&ret;
|
||||
|
||||
const int nsimd = vobj::Nsimd();
|
||||
const int words = sizeof(vobj)/sizeof(vector);
|
||||
|
||||
Vector<scalar> buffer(osites*nsimd);
|
||||
scalar *buf = &buffer[0];
|
||||
vector *dat = (vector *)lat;
|
||||
|
||||
for(int w=0;w<words;w++) {
|
||||
|
||||
accelerator_for(ss,osites,nsimd,{
|
||||
int lane = acceleratorSIMTlane(nsimd);
|
||||
buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane);
|
||||
});
|
||||
//Precision change at this point is to late to gain precision
|
||||
ret_p[w] = svm_reduce(buf,nsimd*osites);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
*/
|
||||
|
@@ -47,6 +47,19 @@ NAMESPACE_BEGIN(Grid);
|
||||
// Allow the RNG state to be less dense than the fine grid
|
||||
//////////////////////////////////////////////////////////////
|
||||
inline int RNGfillable(GridBase *coarse,GridBase *fine)
|
||||
{
|
||||
if ( coarse == fine ) return 1;
|
||||
|
||||
if ( coarse->isIcosahedral()) assert(coarse->isIcosahedralEdge());
|
||||
|
||||
if ( fine->isIcosahedralVertex() && coarse->isIcosahedralEdge() ) {
|
||||
assert(fine->Nd()==coarse->Nd());
|
||||
for(int d=0;d<fine->Nd();d++){
|
||||
assert(fine->LocalDimensions()[d] == coarse->LocalDimensions()[d]);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
{
|
||||
|
||||
int rngdims = coarse->_ndimension;
|
||||
@@ -74,12 +87,26 @@ inline int RNGfillable(GridBase *coarse,GridBase *fine)
|
||||
}
|
||||
return multiplicity;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// merge of April 11 2017
|
||||
// this function is necessary for the LS vectorised field
|
||||
inline int RNGfillable_general(GridBase *coarse,GridBase *fine)
|
||||
{
|
||||
|
||||
if ( coarse == fine ) return 1;
|
||||
|
||||
if ( coarse->isIcosahedral()) assert(coarse->isIcosahedralEdge());
|
||||
|
||||
if ( fine->isIcosahedralVertex() && coarse->isIcosahedralEdge() ) {
|
||||
assert(fine->Nd()==coarse->Nd());
|
||||
for(int d=0;d<fine->Nd();d++){
|
||||
assert(fine->LocalDimensions()[d] == coarse->LocalDimensions()[d]);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
int rngdims = coarse->_ndimension;
|
||||
|
||||
// trivially extended in higher dims, with locality guaranteeing RNG state is local to node
|
||||
@@ -352,12 +379,12 @@ private:
|
||||
public:
|
||||
GridBase *Grid(void) const { return _grid; }
|
||||
int generator_idx(int os,int is) {
|
||||
return is*_grid->oSites()+os;
|
||||
return (is*_grid->CartesianOsites()+os)%_grid->lSites(); // On the pole sites wrap back to normal generators; Icosahedral hack
|
||||
}
|
||||
|
||||
GridParallelRNG(GridBase *grid) : GridRNGbase() {
|
||||
_grid = grid;
|
||||
_vol =_grid->iSites()*_grid->oSites();
|
||||
_vol =_grid->lSites();
|
||||
|
||||
_generators.resize(_vol);
|
||||
_uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
|
||||
@@ -381,7 +408,7 @@ public:
|
||||
|
||||
int multiplicity = RNGfillable_general(_grid, l.Grid()); // l has finer or same grid
|
||||
int Nsimd = _grid->Nsimd(); // guaranteed to be the same for l.Grid() too
|
||||
int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity
|
||||
int osites = _grid->CartesianOsites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity, except on Icosahedral
|
||||
int words = sizeof(scalar_object) / sizeof(scalar_type);
|
||||
|
||||
autoView(l_v, l, CpuWrite);
|
||||
@@ -403,7 +430,26 @@ public:
|
||||
merge(l_v[sm], buf);
|
||||
}
|
||||
});
|
||||
// });
|
||||
|
||||
/*
|
||||
* Fill in the poles for an Icosahedral vertex mesh
|
||||
*/
|
||||
if (l.Grid()->isIcosahedralVertex()) {
|
||||
int64_t pole_sites=l.Grid()->NorthPoleOsites()+l.Grid()->SouthPoleOsites();
|
||||
int64_t pole_base =l.Grid()->CartesianOsites();
|
||||
|
||||
ExtractBuffer<scalar_object> buf(Nsimd);
|
||||
for (int m = 0; m < pole_sites; m++) { // Draw from same generator multiplicity times
|
||||
for (int si = 0; si < Nsimd; si++) {
|
||||
int gdx = 0;
|
||||
scalar_type *pointer = (scalar_type *)&buf[si];
|
||||
dist[gdx].reset();
|
||||
for (int idx = 0; idx < words; idx++)
|
||||
fillScalar(pointer[idx], dist[gdx], _generators[gdx]);
|
||||
}
|
||||
merge(l_v[pole_base+m], buf);
|
||||
}
|
||||
}
|
||||
|
||||
_time_counter += usecond()- inner_time_counter;
|
||||
}
|
||||
|
@@ -21,9 +21,18 @@ NAMESPACE_BEGIN(Grid);
|
||||
|
||||
|
||||
#if defined(GRID_CUDA) || defined(GRID_HIP)
|
||||
template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
|
||||
template<class vobj>
|
||||
inline void sliceSumReduction_cub_small(const vobj *Data,
|
||||
std::vector<vobj> &lvSum,
|
||||
const int rd,
|
||||
const int e1,
|
||||
const int e2,
|
||||
const int stride,
|
||||
const int ostride,
|
||||
const int Nsimd)
|
||||
{
|
||||
size_t subvol_size = e1*e2;
|
||||
commVector<vobj> reduction_buffer(rd*subvol_size);
|
||||
deviceVector<vobj> reduction_buffer(rd*subvol_size);
|
||||
auto rb_p = &reduction_buffer[0];
|
||||
vobj zero_init;
|
||||
zeroit(zero_init);
|
||||
@@ -46,7 +55,7 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
|
||||
d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
|
||||
|
||||
//copy offsets to device
|
||||
acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
|
||||
acceleratorCopyToDeviceAsynch(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
|
||||
|
||||
|
||||
gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
|
||||
@@ -79,7 +88,7 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
|
||||
acceleratorCopyFromDeviceAsynch(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
|
||||
|
||||
//sync after copy
|
||||
accelerator_barrier();
|
||||
@@ -94,7 +103,15 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
|
||||
|
||||
|
||||
#if defined(GRID_SYCL)
|
||||
template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
|
||||
template<class vobj>
|
||||
inline void sliceSumReduction_sycl_small(const vobj *Data,
|
||||
std::vector <vobj> &lvSum,
|
||||
const int &rd,
|
||||
const int &e1,
|
||||
const int &e2,
|
||||
const int &stride,
|
||||
const int &ostride,
|
||||
const int &Nsimd)
|
||||
{
|
||||
size_t subvol_size = e1*e2;
|
||||
|
||||
@@ -105,7 +122,7 @@ template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data,
|
||||
mysum[r] = vobj_zero;
|
||||
}
|
||||
|
||||
commVector<vobj> reduction_buffer(rd*subvol_size);
|
||||
deviceVector<vobj> reduction_buffer(rd*subvol_size);
|
||||
|
||||
auto rb_p = &reduction_buffer[0];
|
||||
|
||||
@@ -124,11 +141,11 @@ template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data,
|
||||
});
|
||||
|
||||
for (int r = 0; r < rd; r++) {
|
||||
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
|
||||
auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>());
|
||||
cgh.parallel_for(cl::sycl::range<1>{subvol_size},
|
||||
theGridAccelerator->submit([&](sycl::handler &cgh) {
|
||||
auto Reduction = sycl::reduction(&mysum[r],std::plus<>());
|
||||
cgh.parallel_for(sycl::range<1>{subvol_size},
|
||||
Reduction,
|
||||
[=](cl::sycl::id<1> item, auto &sum) {
|
||||
[=](sycl::id<1> item, auto &sum) {
|
||||
auto s = item[0];
|
||||
sum += rb_p[r*subvol_size+s];
|
||||
});
|
||||
@@ -144,14 +161,23 @@ template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data,
|
||||
}
|
||||
#endif
|
||||
|
||||
template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
|
||||
template<class vobj>
|
||||
inline void sliceSumReduction_large(const vobj *Data,
|
||||
std::vector<vobj> &lvSum,
|
||||
const int rd,
|
||||
const int e1,
|
||||
const int e2,
|
||||
const int stride,
|
||||
const int ostride,
|
||||
const int Nsimd)
|
||||
{
|
||||
typedef typename vobj::vector_type vector;
|
||||
const int words = sizeof(vobj)/sizeof(vector);
|
||||
const int osites = rd*e1*e2;
|
||||
commVector<vector>buffer(osites);
|
||||
deviceVector<vector>buffer(osites);
|
||||
vector *dat = (vector *)Data;
|
||||
vector *buf = &buffer[0];
|
||||
Vector<vector> lvSum_small(rd);
|
||||
std::vector<vector> lvSum_small(rd);
|
||||
vector *lvSum_ptr = (vector *)&lvSum[0];
|
||||
|
||||
for (int w = 0; w < words; w++) {
|
||||
@@ -168,13 +194,18 @@ template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vecto
|
||||
for (int r = 0; r < rd; r++) {
|
||||
lvSum_ptr[w+words*r]=lvSum_small[r];
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
|
||||
template<class vobj>
|
||||
inline void sliceSumReduction_gpu(const Lattice<vobj> &Data,
|
||||
std::vector<vobj> &lvSum,
|
||||
const int rd,
|
||||
const int e1,
|
||||
const int e2,
|
||||
const int stride,
|
||||
const int ostride,
|
||||
const int Nsimd)
|
||||
{
|
||||
autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case.
|
||||
if constexpr (sizeof(vobj) <= 256) {
|
||||
@@ -192,7 +223,15 @@ template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data
|
||||
}
|
||||
|
||||
|
||||
template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
|
||||
template<class vobj>
|
||||
inline void sliceSumReduction_cpu(const Lattice<vobj> &Data,
|
||||
std::vector<vobj> &lvSum,
|
||||
const int &rd,
|
||||
const int &e1,
|
||||
const int &e2,
|
||||
const int &stride,
|
||||
const int &ostride,
|
||||
const int &Nsimd)
|
||||
{
|
||||
// sum over reduced dimension planes, breaking out orthog dir
|
||||
// Parallel over orthog direction
|
||||
@@ -208,15 +247,19 @@ template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data
|
||||
});
|
||||
}
|
||||
|
||||
template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
|
||||
template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data,
|
||||
std::vector<vobj> &lvSum,
|
||||
const int &rd,
|
||||
const int &e1,
|
||||
const int &e2,
|
||||
const int &stride,
|
||||
const int &ostride,
|
||||
const int &Nsimd)
|
||||
{
|
||||
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
|
||||
|
||||
sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
|
||||
|
||||
#else
|
||||
sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@@ -981,8 +981,14 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
|
||||
hcoor[orthog] = slice;
|
||||
for(int d=0;d<nh;d++){
|
||||
if ( d!=orthog ) {
|
||||
hcoor[d]=lcoor[ddl++];
|
||||
hcoor[d]=lcoor[ddl];
|
||||
if ( hg->_checker_dim == d ) {
|
||||
hcoor[d]=hcoor[d]*2; // factor in the full coor for peekLocalSite
|
||||
lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
|
||||
}
|
||||
ddl++;
|
||||
}
|
||||
|
||||
}
|
||||
peekLocalSite(s,lowDimv,lcoor);
|
||||
pokeLocalSite(s,higherDimv,hcoor);
|
||||
@@ -1003,6 +1009,7 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
|
||||
assert(orthog<nh);
|
||||
assert(orthog>=0);
|
||||
assert(hg->_processors[orthog]==1);
|
||||
lowDim.Checkerboard() = higherDim.Checkerboard();
|
||||
|
||||
int dl; dl = 0;
|
||||
for(int d=0;d<nh;d++){
|
||||
@@ -1020,11 +1027,16 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
|
||||
Coordinate lcoor(nl);
|
||||
Coordinate hcoor(nh);
|
||||
lg->LocalIndexToLocalCoor(idx,lcoor);
|
||||
int ddl=0;
|
||||
hcoor[orthog] = slice;
|
||||
int ddl=0;
|
||||
for(int d=0;d<nh;d++){
|
||||
if ( d!=orthog ) {
|
||||
hcoor[d]=lcoor[ddl++];
|
||||
hcoor[d]=lcoor[ddl];
|
||||
if ( hg->_checker_dim == d ) {
|
||||
hcoor[d]=hcoor[d]*2; // factor in the full gridd coor for peekLocalSite
|
||||
lcoor[ddl]=lcoor[ddl]*2; // factor in the full coor for peekLocalSite
|
||||
}
|
||||
ddl++;
|
||||
}
|
||||
}
|
||||
peekLocalSite(s,higherDimv,hcoor);
|
||||
|
@@ -54,7 +54,7 @@ struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::ve
|
||||
*
|
||||
*/
|
||||
|
||||
template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
|
||||
template<class vobj> inline void ScatterSlice(const deviceVector<vobj> &buf,
|
||||
Lattice<vobj> &lat,
|
||||
int x,
|
||||
int dim,
|
||||
@@ -140,7 +140,7 @@ template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
|
||||
});
|
||||
}
|
||||
|
||||
template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf,
|
||||
template<class vobj> inline void GatherSlice(deviceVector<vobj> &buf,
|
||||
const Lattice<vobj> &lat,
|
||||
int x,
|
||||
int dim,
|
||||
@@ -462,13 +462,19 @@ public:
|
||||
int rNsimd = Nsimd / simd[dimension];
|
||||
assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);
|
||||
|
||||
static cshiftVector<vobj> send_buf;
|
||||
static cshiftVector<vobj> recv_buf;
|
||||
static deviceVector<vobj> send_buf;
|
||||
static deviceVector<vobj> recv_buf;
|
||||
send_buf.resize(buffer_size*2*depth);
|
||||
recv_buf.resize(buffer_size*2*depth);
|
||||
#ifndef ACCELERATOR_AWARE_MPI
|
||||
static hostVector<vobj> hsend_buf;
|
||||
static hostVector<vobj> hrecv_buf;
|
||||
hsend_buf.resize(buffer_size*2*depth);
|
||||
hrecv_buf.resize(buffer_size*2*depth);
|
||||
#endif
|
||||
|
||||
std::vector<CommsRequest_t> fwd_req;
|
||||
std::vector<CommsRequest_t> bwd_req;
|
||||
std::vector<MpiCommsRequest_t> fwd_req;
|
||||
std::vector<MpiCommsRequest_t> bwd_req;
|
||||
|
||||
int words = buffer_size;
|
||||
int bytes = words * sizeof(vobj);
|
||||
@@ -495,9 +501,16 @@ public:
|
||||
t_gather+=usecond()-t;
|
||||
|
||||
t=usecond();
|
||||
#ifdef ACCELERATOR_AWARE_MPI
|
||||
grid->SendToRecvFromBegin(fwd_req,
|
||||
(void *)&send_buf[d*buffer_size], xmit_to_rank,
|
||||
(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
|
||||
#else
|
||||
acceleratorCopyFromDevice(&send_buf[d*buffer_size],&hsend_buf[d*buffer_size],bytes);
|
||||
grid->SendToRecvFromBegin(fwd_req,
|
||||
(void *)&hsend_buf[d*buffer_size], xmit_to_rank,
|
||||
(void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag);
|
||||
#endif
|
||||
t_comms+=usecond()-t;
|
||||
}
|
||||
for ( int d=0;d < depth ; d ++ ) {
|
||||
@@ -508,9 +521,16 @@ public:
|
||||
t_gather+= usecond() - t;
|
||||
|
||||
t=usecond();
|
||||
#ifdef ACCELERATOR_AWARE_MPI
|
||||
grid->SendToRecvFromBegin(bwd_req,
|
||||
(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
|
||||
(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
|
||||
#else
|
||||
acceleratorCopyFromDevice(&send_buf[(d+depth)*buffer_size],&hsend_buf[(d+depth)*buffer_size],bytes);
|
||||
grid->SendToRecvFromBegin(bwd_req,
|
||||
(void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank,
|
||||
(void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
|
||||
#endif
|
||||
t_comms+=usecond()-t;
|
||||
}
|
||||
|
||||
@@ -533,6 +553,11 @@ public:
|
||||
|
||||
t=usecond();
|
||||
grid->CommsComplete(fwd_req);
|
||||
#ifndef ACCELERATOR_AWARE_MPI
|
||||
for ( int d=0;d < depth ; d ++ ) {
|
||||
acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
|
||||
}
|
||||
#endif
|
||||
t_comms+= usecond() - t;
|
||||
|
||||
t=usecond();
|
||||
@@ -543,6 +568,11 @@ public:
|
||||
|
||||
t=usecond();
|
||||
grid->CommsComplete(bwd_req);
|
||||
#ifndef ACCELERATOR_AWARE_MPI
|
||||
for ( int d=0;d < depth ; d ++ ) {
|
||||
acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
|
||||
}
|
||||
#endif
|
||||
t_comms+= usecond() - t;
|
||||
|
||||
t=usecond();
|
||||
|
@@ -49,7 +49,7 @@ static constexpr int Tm = 7;
|
||||
|
||||
static constexpr int Nc=Config_Nc;
|
||||
static constexpr int Ns=4;
|
||||
static constexpr int Nd=4;
|
||||
static constexpr int Nd=Config_Nd;
|
||||
static constexpr int Nhs=2; // half spinor
|
||||
static constexpr int Nds=8; // double stored gauge field
|
||||
static constexpr int Ngp=2; // gparity index range
|
||||
@@ -75,6 +75,7 @@ static constexpr int InverseYes=1;
|
||||
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
|
||||
|
||||
const int SpinorIndex = 2;
|
||||
const int PauliIndex = 2; //TensorLevel counts from the bottom!
|
||||
template<typename T> struct isSpinor {
|
||||
static constexpr bool value = (SpinorIndex==T::TensorLevel);
|
||||
};
|
||||
|
@@ -132,6 +132,10 @@ public:
|
||||
template <class GaugeField >
|
||||
class EmptyAction : public Action <GaugeField>
|
||||
{
|
||||
using Action<GaugeField>::refresh;
|
||||
using Action<GaugeField>::Sinitial;
|
||||
using Action<GaugeField>::deriv;
|
||||
|
||||
virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions
|
||||
virtual RealD S(const GaugeField& U) { return 0.0;}; // evaluate the action
|
||||
virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); }; // evaluate the action derivative
|
||||
|
@@ -55,6 +55,11 @@ public:
|
||||
RealD alpha; // Mobius scale
|
||||
RealD k; // EOFA normalization constant
|
||||
|
||||
// Device resident
|
||||
deviceVector<Coeff_t> d_shift_coefficients;
|
||||
deviceVector<Coeff_t> d_MooeeInv_shift_lc;
|
||||
deviceVector<Coeff_t> d_MooeeInv_shift_norm;
|
||||
|
||||
virtual void Instantiatable(void) = 0;
|
||||
|
||||
// EOFA-specific operations
|
||||
@@ -92,6 +97,11 @@ public:
|
||||
this->k = this->alpha * (_mq3-_mq2) * std::pow(this->alpha+1.0,2*Ls) /
|
||||
( std::pow(this->alpha+1.0,Ls) + _mq2*std::pow(this->alpha-1.0,Ls) ) /
|
||||
( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) );
|
||||
|
||||
d_shift_coefficients.resize(Ls);
|
||||
d_MooeeInv_shift_lc.resize(Ls);
|
||||
d_MooeeInv_shift_norm.resize(Ls);
|
||||
|
||||
};
|
||||
};
|
||||
|
||||
|
@@ -90,16 +90,16 @@ public:
|
||||
void M5D(const FermionField &psi,
|
||||
const FermionField &phi,
|
||||
FermionField &chi,
|
||||
Vector<Coeff_t> &lower,
|
||||
Vector<Coeff_t> &diag,
|
||||
Vector<Coeff_t> &upper);
|
||||
std::vector<Coeff_t> &lower,
|
||||
std::vector<Coeff_t> &diag,
|
||||
std::vector<Coeff_t> &upper);
|
||||
|
||||
void M5Ddag(const FermionField &psi,
|
||||
const FermionField &phi,
|
||||
FermionField &chi,
|
||||
Vector<Coeff_t> &lower,
|
||||
Vector<Coeff_t> &diag,
|
||||
Vector<Coeff_t> &upper);
|
||||
std::vector<Coeff_t> &lower,
|
||||
std::vector<Coeff_t> &diag,
|
||||
std::vector<Coeff_t> &upper);
|
||||
|
||||
virtual void Instantiatable(void)=0;
|
||||
|
||||
@@ -119,35 +119,51 @@ public:
|
||||
RealD mass_plus, mass_minus;
|
||||
|
||||
// Save arguments to SetCoefficientsInternal
|
||||
Vector<Coeff_t> _gamma;
|
||||
std::vector<Coeff_t> _gamma;
|
||||
RealD _zolo_hi;
|
||||
RealD _b;
|
||||
RealD _c;
|
||||
|
||||
// possible boost
|
||||
std::vector<ComplexD> qmu;
|
||||
void set_qmu(std::vector<ComplexD> _qmu) { qmu=_qmu; assert(qmu.size()==Nd);};
|
||||
void addQmu(const FermionField &in, FermionField &out, int dag);
|
||||
|
||||
// Cayley form Moebius (tanh and zolotarev)
|
||||
Vector<Coeff_t> omega;
|
||||
Vector<Coeff_t> bs; // S dependent coeffs
|
||||
Vector<Coeff_t> cs;
|
||||
Vector<Coeff_t> as;
|
||||
std::vector<Coeff_t> omega;
|
||||
std::vector<Coeff_t> bs; // S dependent coeffs
|
||||
std::vector<Coeff_t> cs;
|
||||
std::vector<Coeff_t> as;
|
||||
// For preconditioning Cayley form
|
||||
Vector<Coeff_t> bee;
|
||||
Vector<Coeff_t> cee;
|
||||
Vector<Coeff_t> aee;
|
||||
Vector<Coeff_t> beo;
|
||||
Vector<Coeff_t> ceo;
|
||||
Vector<Coeff_t> aeo;
|
||||
std::vector<Coeff_t> bee;
|
||||
std::vector<Coeff_t> cee;
|
||||
std::vector<Coeff_t> aee;
|
||||
std::vector<Coeff_t> beo;
|
||||
std::vector<Coeff_t> ceo;
|
||||
std::vector<Coeff_t> aeo;
|
||||
// LDU factorisation of the eeoo matrix
|
||||
Vector<Coeff_t> lee;
|
||||
Vector<Coeff_t> leem;
|
||||
Vector<Coeff_t> uee;
|
||||
Vector<Coeff_t> ueem;
|
||||
Vector<Coeff_t> dee;
|
||||
std::vector<Coeff_t> lee;
|
||||
std::vector<Coeff_t> leem;
|
||||
std::vector<Coeff_t> uee;
|
||||
std::vector<Coeff_t> ueem;
|
||||
std::vector<Coeff_t> dee;
|
||||
|
||||
// Device memory
|
||||
deviceVector<Coeff_t> d_diag;
|
||||
deviceVector<Coeff_t> d_upper;
|
||||
deviceVector<Coeff_t> d_lower;
|
||||
|
||||
deviceVector<Coeff_t> d_lee;
|
||||
deviceVector<Coeff_t> d_dee;
|
||||
deviceVector<Coeff_t> d_uee;
|
||||
deviceVector<Coeff_t> d_leem;
|
||||
deviceVector<Coeff_t> d_ueem;
|
||||
|
||||
// Matrices of 5d ee inverse params
|
||||
Vector<iSinglet<Simd> > MatpInv;
|
||||
Vector<iSinglet<Simd> > MatmInv;
|
||||
Vector<iSinglet<Simd> > MatpInvDag;
|
||||
Vector<iSinglet<Simd> > MatmInvDag;
|
||||
// std::vector<iSinglet<Simd> > MatpInv;
|
||||
// std::vector<iSinglet<Simd> > MatmInv;
|
||||
// std::vector<iSinglet<Simd> > MatpInvDag;
|
||||
// std::vector<iSinglet<Simd> > MatmInvDag;
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Conserved current utilities
|
||||
@@ -187,7 +203,7 @@ public:
|
||||
protected:
|
||||
virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
|
||||
virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
|
||||
virtual void SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t> & gamma,RealD b,RealD c);
|
||||
virtual void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
|
||||
};
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
196
Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h
Normal file
196
Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h
Normal file
@@ -0,0 +1,196 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion5D.h
|
||||
|
||||
Copyright (C) 2020 - 2025
|
||||
|
||||
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
|
||||
Author: Nils Meyer <nils.meyer@ur.de>
|
||||
Author: Christoph Lehner <christoph@lhnr.de>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <Grid/qcd/action/fermion/WilsonFermion5D.h>
|
||||
#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
|
||||
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
|
||||
#include <Grid/qcd/action/fermion/CloverHelpers.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
// see Grid/qcd/action/fermion/CompactWilsonCloverFermion.h for description
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
class CompactWilsonCloverFermion5D : public WilsonFermion5D<Impl>,
|
||||
public WilsonCloverHelpers<Impl>,
|
||||
public CompactWilsonCloverHelpers<Impl> {
|
||||
/////////////////////////////////////////////
|
||||
// Sizes
|
||||
/////////////////////////////////////////////
|
||||
|
||||
public:
|
||||
|
||||
INHERIT_COMPACT_CLOVER_SIZES(Impl);
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Type definitions
|
||||
/////////////////////////////////////////////
|
||||
|
||||
public:
|
||||
|
||||
INHERIT_IMPL_TYPES(Impl);
|
||||
INHERIT_CLOVER_TYPES(Impl);
|
||||
INHERIT_COMPACT_CLOVER_TYPES(Impl);
|
||||
|
||||
typedef WilsonFermion5D<Impl> WilsonBase;
|
||||
typedef WilsonCloverHelpers<Impl> Helpers;
|
||||
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Constructors
|
||||
/////////////////////////////////////////////
|
||||
|
||||
public:
|
||||
|
||||
CompactWilsonCloverFermion5D(GaugeField& _Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
const RealD _mass,
|
||||
const RealD _csw_r = 0.0,
|
||||
const RealD _csw_t = 0.0,
|
||||
const RealD _cF = 1.0,
|
||||
const ImplParams& impl_p = ImplParams());
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Member functions (implementing interface)
|
||||
/////////////////////////////////////////////
|
||||
|
||||
public:
|
||||
|
||||
virtual void Instantiatable() {};
|
||||
int ConstEE() override { return 0; };
|
||||
int isTrivialEE() override { return 0; };
|
||||
|
||||
void Dhop(const FermionField& in, FermionField& out, int dag) override;
|
||||
|
||||
void DhopOE(const FermionField& in, FermionField& out, int dag) override;
|
||||
|
||||
void DhopEO(const FermionField& in, FermionField& out, int dag) override;
|
||||
|
||||
void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override;
|
||||
|
||||
void DhopDirAll(const FermionField& in, std::vector<FermionField>& out) /* override */;
|
||||
|
||||
void M(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void Mdag(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void Meooe(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void MeooeDag(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void Mooee(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void MooeeDag(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void MooeeInv(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void MooeeInvDag(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override;
|
||||
|
||||
void MdirAll(const FermionField& in, std::vector<FermionField>& out) override;
|
||||
|
||||
void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override;
|
||||
|
||||
void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
|
||||
|
||||
void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Member functions (internals)
|
||||
/////////////////////////////////////////////
|
||||
|
||||
void MooeeInternal(const FermionField& in,
|
||||
FermionField& out,
|
||||
const CloverDiagonalField& diagonal,
|
||||
const CloverTriangleField& triangle);
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Helpers
|
||||
/////////////////////////////////////////////
|
||||
|
||||
void ImportGauge(const GaugeField& _Umu) override;
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Helpers
|
||||
/////////////////////////////////////////////
|
||||
|
||||
private:
|
||||
|
||||
template<class Field>
|
||||
const MaskField* getCorrectMaskField(const Field &in) const {
|
||||
if(in.Grid()->_isCheckerBoarded) {
|
||||
if(in.Checkerboard() == Odd) {
|
||||
return &this->BoundaryMaskOdd;
|
||||
} else {
|
||||
return &this->BoundaryMaskEven;
|
||||
}
|
||||
} else {
|
||||
return &this->BoundaryMask;
|
||||
}
|
||||
}
|
||||
|
||||
template<class Field>
|
||||
void ApplyBoundaryMask(Field& f) {
|
||||
const MaskField* m = getCorrectMaskField(f); assert(m != nullptr);
|
||||
assert(m != nullptr);
|
||||
CompactHelpers::ApplyBoundaryMask(f, *m);
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Member Data
|
||||
/////////////////////////////////////////////
|
||||
|
||||
public:
|
||||
|
||||
RealD csw_r;
|
||||
RealD csw_t;
|
||||
RealD cF;
|
||||
int n_rhs;
|
||||
|
||||
bool fixedBoundaries;
|
||||
|
||||
CloverDiagonalField Diagonal, DiagonalEven, DiagonalOdd;
|
||||
CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
|
||||
|
||||
CloverTriangleField Triangle, TriangleEven, TriangleOdd;
|
||||
CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd;
|
||||
|
||||
FermionField Tmp;
|
||||
|
||||
MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd;
|
||||
};
|
||||
|
||||
NAMESPACE_END(Grid);
|
@@ -60,6 +60,50 @@ public:
|
||||
// virtual void Instantiatable(void)=0;
|
||||
virtual void Instantiatable(void) =0;
|
||||
|
||||
void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist)
|
||||
{
|
||||
std::cout << "Free Propagator for PartialFraction"<<std::endl;
|
||||
FermionField in_k(in.Grid());
|
||||
FermionField prop_k(in.Grid());
|
||||
|
||||
FFT theFFT((GridCartesian *) in.Grid());
|
||||
|
||||
//phase for boundary condition
|
||||
ComplexField coor(in.Grid());
|
||||
ComplexField ph(in.Grid()); ph = Zero();
|
||||
FermionField in_buf(in.Grid()); in_buf = Zero();
|
||||
typedef typename Simd::scalar_type Scalar;
|
||||
Scalar ci(0.0,1.0);
|
||||
assert(twist.size() == Nd);//check that twist is Nd
|
||||
assert(boundary.size() == Nd);//check that boundary conditions is Nd
|
||||
int shift = 0;
|
||||
for(unsigned int nu = 0; nu < Nd; nu++)
|
||||
{
|
||||
// Shift coordinate lattice index by 1 to account for 5th dimension.
|
||||
LatticeCoordinate(coor, nu + shift);
|
||||
double boundary_phase = ::acos(real(boundary[nu]));
|
||||
ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
|
||||
//momenta for propagator shifted by twist+boundary
|
||||
twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
|
||||
}
|
||||
in_buf = exp(ci*ph*(-1.0))*in;
|
||||
|
||||
theFFT.FFT_all_dim(in_k,in,FFT::forward);
|
||||
this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist);
|
||||
theFFT.FFT_all_dim(out,prop_k,FFT::backward);
|
||||
|
||||
//phase for boundary condition
|
||||
out = out * exp(ci*ph);
|
||||
};
|
||||
|
||||
virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
|
||||
std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions
|
||||
std::vector<Complex> boundary;
|
||||
for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions
|
||||
FreePropagator(in,out,mass,boundary,twist);
|
||||
};
|
||||
|
||||
|
||||
// Efficient support for multigrid coarsening
|
||||
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp);
|
||||
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out);
|
||||
@@ -90,12 +134,12 @@ protected:
|
||||
RealD mass;
|
||||
RealD R;
|
||||
RealD ZoloHiInv;
|
||||
Vector<double> Beta;
|
||||
Vector<double> cc;;
|
||||
Vector<double> cc_d;;
|
||||
Vector<double> sqrt_cc;
|
||||
Vector<double> See;
|
||||
Vector<double> Aee;
|
||||
std::vector<double> Beta;
|
||||
std::vector<double> cc;;
|
||||
std::vector<double> cc_d;;
|
||||
std::vector<double> sqrt_cc;
|
||||
std::vector<double> See;
|
||||
std::vector<double> Aee;
|
||||
|
||||
};
|
||||
|
||||
|
@@ -69,10 +69,10 @@ public:
|
||||
// Instantiate different versions depending on Impl
|
||||
/////////////////////////////////////////////////////
|
||||
void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
|
||||
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
|
||||
|
||||
void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
|
||||
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
|
||||
|
||||
virtual void RefreshShiftCoefficients(RealD new_shift);
|
||||
|
||||
@@ -83,7 +83,7 @@ public:
|
||||
RealD _M5, const ImplParams& p=ImplParams());
|
||||
|
||||
protected:
|
||||
void SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c);
|
||||
void SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c);
|
||||
};
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@@ -123,10 +123,10 @@ public:
|
||||
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
|
||||
|
||||
peekLocalSite(ScalarUmu, Umu_v, lcoor);
|
||||
for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu);
|
||||
for (int mu = 0; mu < Nd; mu++) ScalarUds(mu) = ScalarUmu(mu);
|
||||
|
||||
peekLocalSite(ScalarUmu, Uadj_v, lcoor);
|
||||
for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu);
|
||||
for (int mu = 0; mu < Nd; mu++) ScalarUds(mu + Nd) = ScalarUmu(mu);
|
||||
|
||||
pokeLocalSite(ScalarUds, Uds_v, lcoor);
|
||||
});
|
||||
|
@@ -55,6 +55,7 @@ NAMESPACE_CHECK(Wilson);
|
||||
NAMESPACE_CHECK(WilsonTM);
|
||||
#include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
|
||||
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h> // 4d compact wilson clover fermions
|
||||
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h> // 5d compact wilson clover fermions
|
||||
NAMESPACE_CHECK(WilsonClover);
|
||||
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
|
||||
NAMESPACE_CHECK(Wilson5D);
|
||||
@@ -84,6 +85,15 @@ NAMESPACE_CHECK(DomainWall);
|
||||
#include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h>
|
||||
#include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h>
|
||||
NAMESPACE_CHECK(Overlap);
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Two spin wilson fermion based
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include <Grid/qcd/action/fermion/TwoSpinWilsonFermion3plus1D.h>
|
||||
NAMESPACE_CHECK(TwoSpinWilson);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
@@ -164,12 +174,17 @@ typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiS
|
||||
|
||||
// Compact Clover fermions
|
||||
template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
|
||||
template <typename WImpl> using CompactWilsonClover5D = CompactWilsonCloverFermion5D<WImpl, CompactCloverHelpers<WImpl>>;
|
||||
template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;
|
||||
|
||||
typedef CompactWilsonClover<WilsonImplD2> CompactWilsonCloverFermionD2;
|
||||
typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
|
||||
typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;
|
||||
|
||||
typedef CompactWilsonClover5D<WilsonImplD2> CompactWilsonCloverFermion5DD2;
|
||||
typedef CompactWilsonClover5D<WilsonImplF> CompactWilsonCloverFermion5DF;
|
||||
typedef CompactWilsonClover5D<WilsonImplD> CompactWilsonCloverFermion5DD;
|
||||
|
||||
typedef CompactWilsonExpClover<WilsonImplD2> CompactWilsonExpCloverFermionD2;
|
||||
typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
|
||||
typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;
|
||||
|
@@ -43,6 +43,7 @@ NAMESPACE_CHECK(FermionOperatorImpl);
|
||||
NAMESPACE_CHECK(FermionOperator);
|
||||
#include <Grid/qcd/action/fermion/WilsonKernels.h> //used by all wilson type fermions
|
||||
#include <Grid/qcd/action/fermion/StaggeredKernels.h> //used by all wilson type fermions
|
||||
#include <Grid/qcd/action/fermion/TwoSpinWilsonKernels.h> //used for 3D fermions, pauli in place of Dirac
|
||||
NAMESPACE_CHECK(Kernels);
|
||||
|
||||
#endif
|
||||
|
@@ -180,6 +180,12 @@ NAMESPACE_CHECK(ImplGparityWilson);
|
||||
#include <Grid/qcd/action/fermion/StaggeredImpl.h>
|
||||
NAMESPACE_CHECK(ImplStaggered);
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
// Two component spinor Wilson action for 3d / Boston
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
#include <Grid/qcd/action/fermion/TwoSpinWilsonImpl.h>
|
||||
NAMESPACE_CHECK(ImplTwoSpinWilson);
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
// Single flavour one component spinors with colour index. 5d vec
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
|
@@ -274,7 +274,7 @@ public:
|
||||
autoView( Uds_v , Uds, CpuWrite);
|
||||
autoView( Utmp_v, Utmp, CpuWrite);
|
||||
thread_foreach(ss,Utmp_v,{
|
||||
Uds_v[ss](0)(mu+4) = Utmp_v[ss]();
|
||||
Uds_v[ss](0)(mu+Nd) = Utmp_v[ss]();
|
||||
});
|
||||
}
|
||||
Utmp = Uconj;
|
||||
@@ -286,7 +286,7 @@ public:
|
||||
autoView( Uds_v , Uds, CpuWrite);
|
||||
autoView( Utmp_v, Utmp, CpuWrite);
|
||||
thread_foreach(ss,Utmp_v,{
|
||||
Uds_v[ss](1)(mu+4) = Utmp_v[ss]();
|
||||
Uds_v[ss](1)(mu+Nd) = Utmp_v[ss]();
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -320,7 +320,7 @@ public:
|
||||
}
|
||||
|
||||
Uconj = conjugate(*Upoke);
|
||||
pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + 4);
|
||||
pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + Nd);
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -36,6 +36,8 @@ public:
|
||||
static const std::vector<int> directions;
|
||||
static const std::vector<int> displacements;
|
||||
static const int npoint = 16;
|
||||
static std::vector<int> MakeDirections(void);
|
||||
static std::vector<int> MakeDisplacements(void);
|
||||
};
|
||||
|
||||
template <class Impl>
|
||||
@@ -102,11 +104,11 @@ public:
|
||||
GaugeField &mat,
|
||||
const FermionField &A, const FermionField &B, int dag);
|
||||
|
||||
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||
void DhopInternal(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||
const FermionField &in, FermionField &out, int dag);
|
||||
void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||
void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||
const FermionField &in, FermionField &out, int dag);
|
||||
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||
void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||
const FermionField &in, FermionField &out, int dag);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
@@ -164,8 +166,6 @@ public:
|
||||
DoubledGaugeField UUUmuEven;
|
||||
DoubledGaugeField UUUmuOdd;
|
||||
|
||||
LebesgueOrder Lebesgue;
|
||||
LebesgueOrder LebesgueEvenOdd;
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Conserved current utilities
|
||||
|
@@ -40,6 +40,8 @@ public:
|
||||
static const std::vector<int> directions;
|
||||
static const std::vector<int> displacements;
|
||||
const int npoint = 16;
|
||||
static std::vector<int> MakeDirections(void);
|
||||
static std::vector<int> MakeDisplacements(void);
|
||||
};
|
||||
|
||||
template<class Impl>
|
||||
@@ -100,7 +102,6 @@ public:
|
||||
int dag);
|
||||
|
||||
void DhopInternal(StencilImpl & st,
|
||||
LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
DoubledGaugeField &UUU,
|
||||
const FermionField &in,
|
||||
@@ -108,7 +109,6 @@ public:
|
||||
int dag);
|
||||
|
||||
void DhopInternalOverlappedComms(StencilImpl & st,
|
||||
LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
DoubledGaugeField &UUU,
|
||||
const FermionField &in,
|
||||
@@ -116,7 +116,6 @@ public:
|
||||
int dag);
|
||||
|
||||
void DhopInternalSerialComms(StencilImpl & st,
|
||||
LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
DoubledGaugeField &UUU,
|
||||
const FermionField &in,
|
||||
@@ -192,8 +191,6 @@ public:
|
||||
DoubledGaugeField UUUmuEven;
|
||||
DoubledGaugeField UUUmuOdd;
|
||||
|
||||
LebesgueOrder Lebesgue;
|
||||
LebesgueOrder LebesgueEvenOdd;
|
||||
|
||||
// Comms buffer
|
||||
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
||||
|
@@ -42,11 +42,11 @@ public:
|
||||
|
||||
public:
|
||||
// Shift operator coefficients for red-black preconditioned Mobius EOFA
|
||||
Vector<Coeff_t> Mooee_shift;
|
||||
Vector<Coeff_t> MooeeInv_shift_lc;
|
||||
Vector<Coeff_t> MooeeInv_shift_norm;
|
||||
Vector<Coeff_t> MooeeInvDag_shift_lc;
|
||||
Vector<Coeff_t> MooeeInvDag_shift_norm;
|
||||
std::vector<Coeff_t> Mooee_shift;
|
||||
std::vector<Coeff_t> MooeeInv_shift_lc;
|
||||
std::vector<Coeff_t> MooeeInv_shift_norm;
|
||||
std::vector<Coeff_t> MooeeInvDag_shift_lc;
|
||||
std::vector<Coeff_t> MooeeInvDag_shift_norm;
|
||||
|
||||
virtual void Instantiatable(void) {};
|
||||
|
||||
@@ -74,18 +74,18 @@ public:
|
||||
// Instantiate different versions depending on Impl
|
||||
/////////////////////////////////////////////////////
|
||||
void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
|
||||
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
|
||||
|
||||
void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
|
||||
Vector<Coeff_t>& shift_coeffs);
|
||||
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
|
||||
std::vector<Coeff_t>& shift_coeffs);
|
||||
|
||||
void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
|
||||
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
|
||||
|
||||
void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
|
||||
Vector<Coeff_t>& shift_coeffs);
|
||||
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
|
||||
std::vector<Coeff_t>& shift_coeffs);
|
||||
|
||||
virtual void RefreshShiftCoefficients(RealD new_shift);
|
||||
|
||||
|
@@ -36,6 +36,8 @@ public:
|
||||
static const std::vector<int> directions;
|
||||
static const std::vector<int> displacements;
|
||||
static const int npoint = 8;
|
||||
static std::vector<int> MakeDirections(void);
|
||||
static std::vector<int> MakeDisplacements(void);
|
||||
};
|
||||
|
||||
template <class Impl>
|
||||
@@ -102,11 +104,11 @@ public:
|
||||
GaugeField &mat,
|
||||
const FermionField &A, const FermionField &B, int dag);
|
||||
|
||||
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
void DhopInternal(StencilImpl &st, DoubledGaugeField &U,
|
||||
const FermionField &in, FermionField &out, int dag);
|
||||
void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U,
|
||||
const FermionField &in, FermionField &out, int dag);
|
||||
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U,
|
||||
const FermionField &in, FermionField &out, int dag);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
@@ -152,9 +154,6 @@ public:
|
||||
DoubledGaugeField UmuEven;
|
||||
DoubledGaugeField UmuOdd;
|
||||
|
||||
LebesgueOrder Lebesgue;
|
||||
LebesgueOrder LebesgueEvenOdd;
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Conserved current utilities
|
||||
///////////////////////////////////////////////////////////////
|
||||
|
@@ -41,6 +41,10 @@ public:
|
||||
public:
|
||||
|
||||
// Constructors
|
||||
virtual void Instantiatable(void){};
|
||||
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
|
||||
this->MomentumSpacePropagatorHw(out,in,_m,twist);
|
||||
};
|
||||
|
||||
OverlapWilsonCayleyZolotarevFermion(GaugeField &_Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
|
@@ -41,6 +41,9 @@ public:
|
||||
public:
|
||||
|
||||
virtual void Instantiatable(void){};
|
||||
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
|
||||
this->MomentumSpacePropagatorHw(out,in,_m,twist);
|
||||
};
|
||||
// Constructors
|
||||
OverlapWilsonContFracTanhFermion(GaugeField &_Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
|
@@ -40,6 +40,9 @@ public:
|
||||
INHERIT_IMPL_TYPES(Impl);
|
||||
|
||||
virtual void Instantiatable(void){};
|
||||
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
|
||||
this->MomentumSpacePropagatorHw(out,in,_m,twist);
|
||||
};
|
||||
// Constructors
|
||||
OverlapWilsonContFracZolotarevFermion(GaugeField &_Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
|
@@ -41,6 +41,9 @@ public:
|
||||
public:
|
||||
|
||||
virtual void Instantiatable(void){};
|
||||
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
|
||||
this->MomentumSpacePropagatorHw(out,in,_m,twist);
|
||||
};
|
||||
// Constructors
|
||||
OverlapWilsonPartialFractionTanhFermion(GaugeField &_Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
|
@@ -40,6 +40,11 @@ public:
|
||||
INHERIT_IMPL_TYPES(Impl);
|
||||
|
||||
virtual void Instantiatable(void){};
|
||||
|
||||
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
|
||||
this->MomentumSpacePropagatorHw(out,in,_m,twist);
|
||||
};
|
||||
|
||||
// Constructors
|
||||
OverlapWilsonPartialFractionZolotarevFermion(GaugeField &_Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
|
@@ -39,7 +39,7 @@ class PartialFractionFermion5D : public WilsonFermion5D<Impl>
|
||||
public:
|
||||
INHERIT_IMPL_TYPES(Impl);
|
||||
|
||||
const int part_frac_chroma_convention=1;
|
||||
const int part_frac_chroma_convention=0;
|
||||
|
||||
void Meooe_internal(const FermionField &in, FermionField &out,int dag);
|
||||
void Mooee_internal(const FermionField &in, FermionField &out,int dag);
|
||||
@@ -83,19 +83,78 @@ public:
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
RealD _mass,RealD M5,const ImplParams &p= ImplParams());
|
||||
|
||||
PartialFractionFermion5D(GaugeField &_Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
RealD _mass,RealD M5,std::vector<RealD> &_qmu,const ImplParams &p= ImplParams());
|
||||
|
||||
void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist)
|
||||
{
|
||||
std::cout << "Free Propagator for PartialFraction"<<std::endl;
|
||||
FermionField in_k(in.Grid());
|
||||
FermionField prop_k(in.Grid());
|
||||
|
||||
FFT theFFT((GridCartesian *) in.Grid());
|
||||
|
||||
//phase for boundary condition
|
||||
ComplexField coor(in.Grid());
|
||||
ComplexField ph(in.Grid()); ph = Zero();
|
||||
FermionField in_buf(in.Grid()); in_buf = Zero();
|
||||
typedef typename Simd::scalar_type Scalar;
|
||||
Scalar ci(0.0,1.0);
|
||||
assert(twist.size() == Nd);//check that twist is Nd
|
||||
assert(boundary.size() == Nd);//check that boundary conditions is Nd
|
||||
int shift = 0;
|
||||
for(unsigned int nu = 0; nu < Nd; nu++)
|
||||
{
|
||||
// Shift coordinate lattice index by 1 to account for 5th dimension.
|
||||
LatticeCoordinate(coor, nu + shift);
|
||||
double boundary_phase = ::acos(real(boundary[nu]));
|
||||
ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
|
||||
//momenta for propagator shifted by twist+boundary
|
||||
twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
|
||||
}
|
||||
in_buf = exp(ci*ph*(-1.0))*in;
|
||||
|
||||
theFFT.FFT_all_dim(in_k,in,FFT::forward);
|
||||
if ( this->qmu.size() ){
|
||||
this->MomentumSpacePropagatorHwQ(prop_k,in_k,mass,twist,this->qmu);
|
||||
} else {
|
||||
this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist);
|
||||
}
|
||||
theFFT.FFT_all_dim(out,prop_k,FFT::backward);
|
||||
|
||||
//phase for boundary condition
|
||||
out = out * exp(ci*ph);
|
||||
};
|
||||
|
||||
virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
|
||||
std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions
|
||||
std::vector<Complex> boundary;
|
||||
for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions
|
||||
FreePropagator(in,out,mass,boundary,twist);
|
||||
};
|
||||
|
||||
void set_qmu(std::vector<RealD> _qmu) { qmu=_qmu; assert(qmu.size()==Nd);};
|
||||
void addQmu(const FermionField &in, FermionField &out, int dag);
|
||||
|
||||
protected:
|
||||
|
||||
virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale);
|
||||
virtual void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata);
|
||||
|
||||
std::vector<RealD> qmu;
|
||||
|
||||
// Part frac
|
||||
RealD mass;
|
||||
RealD dw_diag;
|
||||
RealD R;
|
||||
RealD amax;
|
||||
RealD scale;
|
||||
Vector<double> p;
|
||||
Vector<double> q;
|
||||
std::vector<double> p;
|
||||
std::vector<double> q;
|
||||
|
||||
};
|
||||
|
||||
|
@@ -35,7 +35,7 @@ template<class Matrix, class Field>
|
||||
class KappaSimilarityTransform {
|
||||
public:
|
||||
INHERIT_IMPL_TYPES(Matrix);
|
||||
Vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag;
|
||||
std::vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag;
|
||||
|
||||
KappaSimilarityTransform (Matrix &zmob) {
|
||||
for (int i=0;i<(int)zmob.bs.size();i++) {
|
||||
|
@@ -141,9 +141,9 @@ public:
|
||||
Udag = Udag *phases;
|
||||
|
||||
InsertGaugeField(Uds,U,mu);
|
||||
InsertGaugeField(Uds,Udag,mu+4);
|
||||
InsertGaugeField(Uds,Udag,mu+Nd);
|
||||
// PokeIndex<LorentzIndex>(Uds, U, mu);
|
||||
// PokeIndex<LorentzIndex>(Uds, Udag, mu + 4);
|
||||
// PokeIndex<LorentzIndex>(Uds, Udag, mu + Nd);
|
||||
|
||||
// 3 hop based on thin links. Crazy huh ?
|
||||
U = PeekIndex<LorentzIndex>(Uthin, mu);
|
||||
@@ -156,7 +156,7 @@ public:
|
||||
UUUdag = UUUdag *phases;
|
||||
|
||||
InsertGaugeField(UUUds,UUU,mu);
|
||||
InsertGaugeField(UUUds,UUUdag,mu+4);
|
||||
InsertGaugeField(UUUds,UUUdag,mu+Nd);
|
||||
|
||||
}
|
||||
}
|
||||
|
@@ -49,10 +49,10 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
|
||||
|
||||
public:
|
||||
|
||||
void DhopImproved(StencilImpl &st, LebesgueOrder &lo,
|
||||
void DhopImproved(StencilImpl &st,
|
||||
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
|
||||
void DhopNaive(StencilImpl &st, LebesgueOrder &lo,
|
||||
void DhopNaive(StencilImpl &st,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
|
||||
|
||||
|
175
Grid/qcd/action/fermion/TwoSpinWilsonFermion3plus1D.h
Normal file
175
Grid/qcd/action/fermion/TwoSpinWilsonFermion3plus1D.h
Normal file
@@ -0,0 +1,175 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/TwoSpinWilsonFermion3plus1D.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#pragma one
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
class TwoSpinWilsonFermion3plus1DStatic {
|
||||
public:
|
||||
// S-direction is INNERMOST and takes no part in the parity.
|
||||
static const std::vector<int> directions;
|
||||
static const std::vector<int> displacements;
|
||||
static constexpr int npoint = 6;
|
||||
static std::vector<int> MakeDirections(void);
|
||||
static std::vector<int> MakeDisplacements(void);
|
||||
};
|
||||
|
||||
template<class Impl>
|
||||
class TwoSpinWilsonFermion3plus1D : public TwoSpinWilsonKernels<Impl>, public TwoSpinWilsonFermion3plus1DStatic
|
||||
{
|
||||
public:
|
||||
INHERIT_IMPL_TYPES(Impl);
|
||||
typedef TwoSpinWilsonKernels<Impl> Kernels;
|
||||
|
||||
FermionField _tmp;
|
||||
FermionField &tmp(void) { return _tmp; }
|
||||
|
||||
int Dirichlet;
|
||||
Coordinate Block;
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Implement the abstract base
|
||||
///////////////////////////////////////////////////////////////
|
||||
GridBase *GaugeGrid(void) { return _ThreeDimGrid ;}
|
||||
GridBase *GaugeRedBlackGrid(void) { return _ThreeDimRedBlackGrid ;}
|
||||
GridBase *FermionGrid(void) { return _FourDimGrid;}
|
||||
GridBase *FermionRedBlackGrid(void) { return _FourDimRedBlackGrid;}
|
||||
|
||||
// full checkerboard operations; leave unimplemented as abstract for now
|
||||
virtual void M (const FermionField &in, FermionField &out){assert(0);};
|
||||
virtual void Mdag (const FermionField &in, FermionField &out){assert(0);};
|
||||
|
||||
// half checkerboard operations; leave unimplemented as abstract for now
|
||||
virtual void Meooe (const FermionField &in, FermionField &out);
|
||||
virtual void Mooee (const FermionField &in, FermionField &out);
|
||||
virtual void MooeeInv (const FermionField &in, FermionField &out);
|
||||
|
||||
virtual void MeooeDag (const FermionField &in, FermionField &out);
|
||||
virtual void MooeeDag (const FermionField &in, FermionField &out);
|
||||
virtual void MooeeInvDag (const FermionField &in, FermionField &out);
|
||||
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
|
||||
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
|
||||
|
||||
// These can be overridden by fancy 5d chiral action
|
||||
virtual void DhopDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
|
||||
virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
|
||||
virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
|
||||
|
||||
// void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
|
||||
void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
|
||||
void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
|
||||
|
||||
// Implement hopping term non-hermitian hopping term; half cb or both
|
||||
// Implement s-diagonal DW
|
||||
void DW (const FermionField &in, FermionField &out,int dag);
|
||||
void Dhop (const FermionField &in, FermionField &out,int dag);
|
||||
void DhopOE(const FermionField &in, FermionField &out,int dag);
|
||||
void DhopEO(const FermionField &in, FermionField &out,int dag);
|
||||
|
||||
void DhopComms (const FermionField &in, FermionField &out);
|
||||
void DhopCalc (const FermionField &in, FermionField &out,uint64_t *ids);
|
||||
|
||||
// add a DhopComm
|
||||
// -- suboptimal interface will presently trigger multiple comms.
|
||||
void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
|
||||
void DhopDirAll(const FermionField &in,std::vector<FermionField> &out);
|
||||
void DhopDirComms(const FermionField &in);
|
||||
void DhopDirCalc(const FermionField &in, FermionField &out,int point);
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// New methods added
|
||||
///////////////////////////////////////////////////////////////
|
||||
void DerivInternal(StencilImpl & st,
|
||||
DoubledGaugeField & U,
|
||||
GaugeField &mat,
|
||||
const FermionField &A,
|
||||
const FermionField &B,
|
||||
int dag);
|
||||
|
||||
void DhopInternal(StencilImpl & st,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out,
|
||||
int dag);
|
||||
|
||||
void DhopInternalOverlappedComms(StencilImpl & st,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out,
|
||||
int dag);
|
||||
|
||||
void DhopInternalSerialComms(StencilImpl & st,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out,
|
||||
int dag);
|
||||
|
||||
// Constructors
|
||||
TwoSpinWilsonFermion3plus1D(GaugeField &_Umu,
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
GridCartesian &ThreeDimGrid,
|
||||
GridRedBlackCartesian &ThreeDimRedBlackGrid,
|
||||
double _M5,const ImplParams &p= ImplParams());
|
||||
|
||||
virtual void DirichletBlock(const Coordinate & block)
|
||||
{
|
||||
}
|
||||
|
||||
// DoubleStore
|
||||
void ImportGauge(const GaugeField &_Umu);
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Data members require to support the functionality
|
||||
///////////////////////////////////////////////////////////////
|
||||
public:
|
||||
|
||||
// Add these to the support from Wilson
|
||||
GridBase *_ThreeDimGrid;
|
||||
GridBase *_ThreeDimRedBlackGrid;
|
||||
GridBase *_FourDimGrid;
|
||||
GridBase *_FourDimRedBlackGrid;
|
||||
|
||||
double M5;
|
||||
int Ls;
|
||||
|
||||
//Defines the stencils for even and odd
|
||||
StencilImpl Stencil;
|
||||
StencilImpl StencilEven;
|
||||
StencilImpl StencilOdd;
|
||||
|
||||
// Copy of the gauge field , with even and odd subsets
|
||||
DoubledGaugeField Umu;
|
||||
DoubledGaugeField UmuEven;
|
||||
DoubledGaugeField UmuOdd;
|
||||
|
||||
};
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
222
Grid/qcd/action/fermion/TwoSpinWilsonImpl.h
Normal file
222
Grid/qcd/action/fermion/TwoSpinWilsonImpl.h
Normal file
@@ -0,0 +1,222 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/FermionOperatorImpl.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#pragma once
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
// Single flavour four spinors with colour index
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
template <class S, class Representation = FundamentalRepresentation,class Options = CoeffReal >
|
||||
class TwoSpinWilsonImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
|
||||
public:
|
||||
|
||||
static const int Dimension = Representation::Dimension;
|
||||
static const bool isFundamental = Representation::isFundamental;
|
||||
|
||||
typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl;
|
||||
INHERIT_GIMPL_TYPES(Gimpl);
|
||||
|
||||
//Necessary?
|
||||
constexpr bool is_fundamental() const{return Dimension == Nc ? 1 : 0;}
|
||||
|
||||
typedef typename Options::_Coeff_t Coeff_t;
|
||||
|
||||
template <typename vtype> using iImplSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
|
||||
template <typename vtype> using iImplPropagator = iScalar<iMatrix<iMatrix<vtype, Dimension>, Nhs> >;
|
||||
template <typename vtype> using iImplHalfSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
|
||||
template <typename vtype> using iImplHalfCommSpinor = iScalar<iVector<iVector<vtype, Dimension>, Nhs> >;
|
||||
template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Dimension> >, Nds>;
|
||||
|
||||
typedef iImplSpinor<Simd> SiteSpinor;
|
||||
typedef iImplPropagator<Simd> SitePropagator;
|
||||
typedef iImplHalfSpinor<Simd> SiteHalfSpinor;
|
||||
typedef iImplHalfCommSpinor<Simd> SiteHalfCommSpinor;
|
||||
typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
|
||||
|
||||
typedef Lattice<SiteSpinor> FermionField;
|
||||
typedef Lattice<SitePropagator> PropagatorField;
|
||||
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
|
||||
|
||||
typedef SimpleCompressor<SiteSpinor> Compressor;
|
||||
typedef WilsonImplParams ImplParams;
|
||||
typedef CartesianStencil<SiteSpinor, SiteSpinor, ImplParams> StencilImpl;
|
||||
typedef const typename StencilImpl::View_type StencilView;
|
||||
|
||||
ImplParams Params;
|
||||
|
||||
TwoSpinWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){
|
||||
};
|
||||
|
||||
template<class _Spinor>
|
||||
static accelerator_inline void multLink(_Spinor &phi,
|
||||
const SiteDoubledGaugeField &U,
|
||||
const _Spinor &chi,
|
||||
int mu)
|
||||
{
|
||||
auto UU = coalescedRead(U(mu));
|
||||
mult(&phi(), &UU, &chi());
|
||||
}
|
||||
template<class _Spinor>
|
||||
static accelerator_inline void multLink(_Spinor &phi,
|
||||
const SiteDoubledGaugeField &U,
|
||||
const _Spinor &chi,
|
||||
int mu,
|
||||
StencilEntry *SE,
|
||||
StencilView &St)
|
||||
{
|
||||
multLink(phi,U,chi,mu);
|
||||
}
|
||||
|
||||
template<class _SpinorField>
|
||||
inline void multLinkField(_SpinorField & out,
|
||||
const DoubledGaugeField &Umu,
|
||||
const _SpinorField & phi,
|
||||
int mu)
|
||||
{
|
||||
const int Nsimd = SiteHalfSpinor::Nsimd();
|
||||
autoView( out_v, out, AcceleratorWrite);
|
||||
autoView( phi_v, phi, AcceleratorRead);
|
||||
autoView( Umu_v, Umu, AcceleratorRead);
|
||||
typedef decltype(coalescedRead(out_v[0])) calcSpinor;
|
||||
accelerator_for(sss,out.Grid()->oSites(),Nsimd,{
|
||||
calcSpinor tmp;
|
||||
multLink(tmp,Umu_v[sss],phi_v(sss),mu);
|
||||
coalescedWrite(out_v[sss],tmp);
|
||||
});
|
||||
}
|
||||
|
||||
template <class ref>
|
||||
static accelerator_inline void loadLinkElement(Simd ®, ref &memory)
|
||||
{
|
||||
reg = memory;
|
||||
}
|
||||
|
||||
inline void DoubleStore(GridBase *GaugeGrid,
|
||||
DoubledGaugeField &Uds,
|
||||
const GaugeField &Umu)
|
||||
{
|
||||
typedef typename Simd::scalar_type scalar_type;
|
||||
|
||||
conformable(Uds.Grid(), GaugeGrid);
|
||||
conformable(Umu.Grid(), GaugeGrid);
|
||||
|
||||
GaugeLinkField U(GaugeGrid);
|
||||
GaugeLinkField tmp(GaugeGrid);
|
||||
|
||||
Lattice<iScalar<vInteger> > coor(GaugeGrid);
|
||||
////////////////////////////////////////////////////
|
||||
// apply any boundary phase or twists
|
||||
////////////////////////////////////////////////////
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
|
||||
////////// boundary phase /////////////
|
||||
auto pha = Params.boundary_phases[mu];
|
||||
scalar_type phase( real(pha),imag(pha) );
|
||||
|
||||
int L = GaugeGrid->GlobalDimensions()[mu];
|
||||
int Lmu = L - 1;
|
||||
|
||||
LatticeCoordinate(coor, mu);
|
||||
|
||||
U = PeekIndex<LorentzIndex>(Umu, mu);
|
||||
|
||||
// apply any twists
|
||||
RealD theta = Params.twist_n_2pi_L[mu] * 2*M_PI / L;
|
||||
if ( theta != 0.0) {
|
||||
scalar_type twphase(::cos(theta),::sin(theta));
|
||||
U = twphase*U;
|
||||
std::cout << GridLogMessage << " Twist ["<<mu<<"] "<< Params.twist_n_2pi_L[mu]<< " phase"<<phase <<std::endl;
|
||||
}
|
||||
|
||||
tmp = where(coor == Lmu, phase * U, U);
|
||||
PokeIndex<LorentzIndex>(Uds, tmp, mu);
|
||||
|
||||
U = adj(Cshift(U, mu, -1));
|
||||
U = where(coor == 0, conjugate(phase) * U, U);
|
||||
PokeIndex<LorentzIndex>(Uds, U, mu + Nd);
|
||||
}
|
||||
}
|
||||
|
||||
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A,int mu){
|
||||
GaugeLinkField link(mat.Grid());
|
||||
link = TraceIndex<SpinIndex>(outerProduct(Btilde,A));
|
||||
PokeIndex<LorentzIndex>(mat,link,mu);
|
||||
}
|
||||
|
||||
inline void outerProductImpl(PropagatorField &mat, const FermionField &B, const FermionField &A){
|
||||
mat = outerProduct(B,A);
|
||||
}
|
||||
|
||||
inline void TraceSpinImpl(GaugeLinkField &mat, PropagatorField&P) {
|
||||
mat = TraceIndex<SpinIndex>(P);
|
||||
}
|
||||
|
||||
inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds)
|
||||
{
|
||||
for (int mu = 0; mu < Nd; mu++)
|
||||
mat[mu] = PeekIndex<LorentzIndex>(Uds, mu);
|
||||
}
|
||||
|
||||
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã,int mu)
|
||||
{
|
||||
int Ls=Btilde.Grid()->_fdimensions[0];
|
||||
autoView( mat_v , mat, AcceleratorWrite);
|
||||
{
|
||||
const int Nsimd = SiteSpinor::Nsimd();
|
||||
autoView( Btilde_v , Btilde, AcceleratorRead);
|
||||
autoView( Atilde_v , Atilde, AcceleratorRead);
|
||||
accelerator_for(sss,mat.Grid()->oSites(),Nsimd,{
|
||||
int sU=sss;
|
||||
typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
|
||||
ColorMatrixType sum;
|
||||
zeroit(sum);
|
||||
for(int s=0;s<Ls;s++){
|
||||
int sF = s+Ls*sU;
|
||||
for(int spn=0;spn<Ns;spn++){ //sum over spin
|
||||
auto bb = coalescedRead(Btilde_v[sF]()(spn) ); //color vector
|
||||
auto aa = coalescedRead(Atilde_v[sF]()(spn) );
|
||||
auto op = outerProduct(bb,aa);
|
||||
sum = sum + op;
|
||||
}
|
||||
}
|
||||
coalescedWrite(mat_v[sU](mu)(), sum);
|
||||
});
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
typedef TwoSpinWilsonImpl<vComplex, FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplR; // Real.. whichever prec
|
||||
typedef TwoSpinWilsonImpl<vComplexF, FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplF; // Float
|
||||
typedef TwoSpinWilsonImpl<vComplexD, FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplD; // Double
|
||||
typedef TwoSpinWilsonImpl<vComplexD2, FundamentalRepresentation, CoeffReal > TwoSpinWilsonImplD2; // Double
|
||||
|
||||
NAMESPACE_END(Grid);
|
84
Grid/qcd/action/fermion/TwoSpinWilsonKernels.h
Normal file
84
Grid/qcd/action/fermion/TwoSpinWilsonKernels.h
Normal file
@@ -0,0 +1,84 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/WilsonKernels.h
|
||||
|
||||
Copyright (C) 2015
|
||||
|
||||
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution
|
||||
directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
#pragma once
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Helper routines that implement Wilson stencil for a single site.
|
||||
// Common to both the WilsonFermion and WilsonFermion5D
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template<class Impl> class TwoSpinWilsonKernels : public FermionOperator<Impl> {
|
||||
public:
|
||||
|
||||
INHERIT_IMPL_TYPES(Impl);
|
||||
typedef FermionOperator<Impl> Base;
|
||||
typedef AcceleratorVector<int,STENCIL_MAX> StencilVector;
|
||||
public:
|
||||
|
||||
static void DhopKernel(StencilImpl &st, DoubledGaugeField &U, SiteSpinor * buf,
|
||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||
int interior=1,int exterior=1) ;
|
||||
|
||||
static void DhopKernel(StencilImpl &st, DoubledGaugeField &U, SiteSpinor * buf,
|
||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||
uint64_t *ids);
|
||||
|
||||
static void DhopDagKernel(StencilImpl &st, DoubledGaugeField &U, SiteSpinor * buf,
|
||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||
int interior=1,int exterior=1) ;
|
||||
|
||||
static void DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteSpinor *buf, int Ls,
|
||||
int Nsite, const FermionField &in, std::vector<FermionField> &out) ;
|
||||
|
||||
static void DhopDirKernel(StencilImpl &st, DoubledGaugeField &U,SiteSpinor * buf,
|
||||
int Ls, int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma);
|
||||
|
||||
private:
|
||||
|
||||
static accelerator_inline void DhopDirK(StencilView &st, DoubledGaugeFieldView &U,SiteSpinor * buf,
|
||||
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dirdisp, int gamma);
|
||||
|
||||
static accelerator_inline void DhopDirXp(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
|
||||
static accelerator_inline void DhopDirYp(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
|
||||
static accelerator_inline void DhopDirZp(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
|
||||
static accelerator_inline void DhopDirXm(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
|
||||
static accelerator_inline void DhopDirYm(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
|
||||
static accelerator_inline void DhopDirZm(StencilView &st,DoubledGaugeFieldView &U,SiteSpinor *buf,int sF,int sU,const FermionFieldView &in,FermionFieldView &out,int dirdisp);
|
||||
|
||||
public:
|
||||
TwoSpinWilsonKernels(const ImplParams &p = ImplParams()) : Base(p){};
|
||||
};
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
|
@@ -47,7 +47,7 @@ public:
|
||||
static int PartialCompressionFactor(GridBase *grid) { return 1;}
|
||||
#endif
|
||||
template<class vobj,class cobj,class compressor>
|
||||
static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
|
||||
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
|
||||
const Lattice<vobj> &rhs,
|
||||
cobj *buffer,
|
||||
compressor &compress,
|
||||
@@ -109,7 +109,7 @@ public:
|
||||
// Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2.
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
template<class vobj,class cobj,class compressor>
|
||||
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
||||
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
||||
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
|
||||
compressor &compress,int type,int partial)
|
||||
{
|
||||
@@ -197,7 +197,7 @@ public:
|
||||
#endif
|
||||
|
||||
template<class vobj,class cobj,class compressor>
|
||||
static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
|
||||
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
|
||||
const Lattice<vobj> &rhs,
|
||||
cobj *buffer,
|
||||
compressor &compress,
|
||||
@@ -208,7 +208,7 @@ public:
|
||||
else FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
|
||||
}
|
||||
template<class vobj,class cobj,class compressor>
|
||||
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
||||
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
||||
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
|
||||
compressor &compress,int type,int partial)
|
||||
{
|
||||
@@ -402,7 +402,6 @@ public:
|
||||
|
||||
typedef CartesianStencil<vobj,cobj,Parameters> Base;
|
||||
typedef typename Base::View_type View_type;
|
||||
typedef typename Base::StencilVector StencilVector;
|
||||
|
||||
// Vector<int> surface_list;
|
||||
WilsonStencil(GridBase *grid,
|
||||
@@ -416,29 +415,6 @@ public:
|
||||
this->same_node.resize(npoints);
|
||||
};
|
||||
|
||||
/*
|
||||
void BuildSurfaceList(int Ls,int vol4){
|
||||
|
||||
// find same node for SHM
|
||||
// Here we know the distance is 1 for WilsonStencil
|
||||
for(int point=0;point<this->_npoints;point++){
|
||||
this->same_node[point] = this->SameNode(point);
|
||||
}
|
||||
|
||||
for(int site = 0 ;site< vol4;site++){
|
||||
int local = 1;
|
||||
for(int point=0;point<this->_npoints;point++){
|
||||
if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){
|
||||
local = 0;
|
||||
}
|
||||
}
|
||||
if(local == 0) {
|
||||
surface_list.push_back(site);
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
template < class compressor>
|
||||
void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
|
||||
{
|
||||
@@ -508,6 +484,11 @@ public:
|
||||
this->face_table_computed=1;
|
||||
assert(this->u_comm_offset==this->_unified_buffer_size);
|
||||
accelerator_barrier();
|
||||
#ifdef NVLINK_GET
|
||||
this->_grid->StencilBarrier(); // He can now get mu local gather, I can get his
|
||||
// Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
|
||||
// Or issue barrier AFTER the DMA is running
|
||||
#endif
|
||||
}
|
||||
|
||||
};
|
||||
|
@@ -38,6 +38,8 @@ public:
|
||||
static int MortonOrder;
|
||||
static const std::vector<int> directions;
|
||||
static const std::vector<int> displacements;
|
||||
static std::vector<int> MakeDirections(void);
|
||||
static std::vector<int> MakeDisplacements(void);
|
||||
static const int npoint = 8;
|
||||
};
|
||||
|
||||
@@ -126,13 +128,16 @@ public:
|
||||
void DerivInternal(StencilImpl &st, DoubledGaugeField &U, GaugeField &mat,
|
||||
const FermionField &A, const FermionField &B, int dag);
|
||||
|
||||
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
void DhopInternal(StencilImpl &st,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in, FermionField &out, int dag);
|
||||
|
||||
void DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
void DhopInternalSerial(StencilImpl &st,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in, FermionField &out, int dag);
|
||||
|
||||
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
void DhopInternalOverlappedComms(StencilImpl &st,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in, FermionField &out, int dag);
|
||||
|
||||
// Constructor
|
||||
@@ -168,9 +173,6 @@ public:
|
||||
DoubledGaugeField UmuEven;
|
||||
DoubledGaugeField UmuOdd;
|
||||
|
||||
LebesgueOrder Lebesgue;
|
||||
LebesgueOrder LebesgueEvenOdd;
|
||||
|
||||
WilsonAnisotropyCoefficients anisotropyCoeff;
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
|
@@ -62,6 +62,8 @@ public:
|
||||
static const std::vector<int> directions;
|
||||
static const std::vector<int> displacements;
|
||||
static constexpr int npoint = 8;
|
||||
static std::vector<int> MakeDirections(void);
|
||||
static std::vector<int> MakeDisplacements(void);
|
||||
};
|
||||
|
||||
template<class Impl>
|
||||
@@ -91,13 +93,13 @@ public:
|
||||
virtual void Mdag (const FermionField &in, FermionField &out){assert(0);};
|
||||
|
||||
// half checkerboard operations; leave unimplemented as abstract for now
|
||||
virtual void Meooe (const FermionField &in, FermionField &out){assert(0);};
|
||||
virtual void Mooee (const FermionField &in, FermionField &out){assert(0);};
|
||||
virtual void MooeeInv (const FermionField &in, FermionField &out){assert(0);};
|
||||
virtual void Meooe (const FermionField &in, FermionField &out);
|
||||
virtual void Mooee (const FermionField &in, FermionField &out);
|
||||
virtual void MooeeInv (const FermionField &in, FermionField &out);
|
||||
|
||||
virtual void MeooeDag (const FermionField &in, FermionField &out){assert(0);};
|
||||
virtual void MooeeDag (const FermionField &in, FermionField &out){assert(0);};
|
||||
virtual void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
|
||||
virtual void MeooeDag (const FermionField &in, FermionField &out);
|
||||
virtual void MooeeDag (const FermionField &in, FermionField &out);
|
||||
virtual void MooeeInvDag (const FermionField &in, FermionField &out);
|
||||
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
|
||||
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
|
||||
|
||||
@@ -109,6 +111,8 @@ public:
|
||||
void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
|
||||
void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
|
||||
void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
|
||||
void MomentumSpacePropagatorHwQ(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist,
|
||||
std::vector<double> qmu) ;
|
||||
|
||||
// Implement hopping term non-hermitian hopping term; half cb or both
|
||||
// Implement s-diagonal DW
|
||||
@@ -117,6 +121,9 @@ public:
|
||||
void DhopOE(const FermionField &in, FermionField &out,int dag);
|
||||
void DhopEO(const FermionField &in, FermionField &out,int dag);
|
||||
|
||||
void DhopComms (const FermionField &in, FermionField &out);
|
||||
void DhopCalc (const FermionField &in, FermionField &out,uint64_t *ids);
|
||||
|
||||
// add a DhopComm
|
||||
// -- suboptimal interface will presently trigger multiple comms.
|
||||
void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
|
||||
@@ -135,21 +142,18 @@ public:
|
||||
int dag);
|
||||
|
||||
void DhopInternal(StencilImpl & st,
|
||||
LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out,
|
||||
int dag);
|
||||
|
||||
void DhopInternalOverlappedComms(StencilImpl & st,
|
||||
LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out,
|
||||
int dag);
|
||||
|
||||
void DhopInternalSerialComms(StencilImpl & st,
|
||||
LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out,
|
||||
@@ -203,9 +207,6 @@ public:
|
||||
DoubledGaugeField UmuEven;
|
||||
DoubledGaugeField UmuOdd;
|
||||
|
||||
LebesgueOrder Lebesgue;
|
||||
LebesgueOrder LebesgueEvenOdd;
|
||||
|
||||
// Comms buffer
|
||||
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
||||
|
||||
|
@@ -166,7 +166,7 @@ public:
|
||||
|
||||
U = adj(Cshift(U, mu, -1));
|
||||
U = where(coor == 0, conjugate(phase) * U, U);
|
||||
PokeIndex<LorentzIndex>(Uds, U, mu + 4);
|
||||
PokeIndex<LorentzIndex>(Uds, U, mu + Nd);
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -57,6 +57,10 @@ public:
|
||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||
int interior=1,int exterior=1) ;
|
||||
|
||||
static void DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||
uint64_t *ids);
|
||||
|
||||
static void DhopDagKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||
int interior=1,int exterior=1) ;
|
||||
|
@@ -56,7 +56,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
|
||||
Frbgrid,
|
||||
Ugrid,
|
||||
Urbgrid,
|
||||
4.0,p)
|
||||
Nd*1.0,p)
|
||||
|
||||
{
|
||||
update(_mass,_mu);
|
||||
@@ -83,7 +83,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
//axpibg5x(out,in,a,b); // out = a*in + b*i*G5*in
|
||||
for (int s=0;s<(int)this->mass.size();s++) {
|
||||
ComplexD a = 4.0+this->mass[s];
|
||||
ComplexD a = Nd*1.0+this->mass[s];
|
||||
ComplexD b(0.0,this->mu[s]);
|
||||
axpbg5y_ssp(out,a,in,b,in,s,s);
|
||||
}
|
||||
@@ -92,7 +92,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
|
||||
virtual void MooeeDag(const FermionField &in, FermionField &out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
for (int s=0;s<(int)this->mass.size();s++) {
|
||||
ComplexD a = 4.0+this->mass[s];
|
||||
ComplexD a = Nd*1.0+this->mass[s];
|
||||
ComplexD b(0.0,-this->mu[s]);
|
||||
axpbg5y_ssp(out,a,in,b,in,s,s);
|
||||
}
|
||||
@@ -101,7 +101,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
|
||||
for (int s=0;s<(int)this->mass.size();s++) {
|
||||
RealD m = this->mass[s];
|
||||
RealD tm = this->mu[s];
|
||||
RealD mtil = 4.0+this->mass[s];
|
||||
RealD mtil = Nd*1.0+this->mass[s];
|
||||
RealD sq = mtil*mtil+tm*tm;
|
||||
ComplexD a = mtil/sq;
|
||||
ComplexD b(0.0, -tm /sq);
|
||||
@@ -112,7 +112,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
|
||||
for (int s=0;s<(int)this->mass.size();s++) {
|
||||
RealD m = this->mass[s];
|
||||
RealD tm = this->mu[s];
|
||||
RealD mtil = 4.0+this->mass[s];
|
||||
RealD mtil = Nd*1.0+this->mass[s];
|
||||
RealD sq = mtil*mtil+tm*tm;
|
||||
ComplexD a = mtil/sq;
|
||||
ComplexD b(0.0,tm /sq);
|
||||
@@ -126,7 +126,7 @@ class WilsonTMFermion5D : public WilsonFermion5D<Impl>
|
||||
this->Dhop(in, out, DaggerNo);
|
||||
FermionField tmp(out.Grid());
|
||||
for (int s=0;s<(int)this->mass.size();s++) {
|
||||
ComplexD a = 4.0+this->mass[s];
|
||||
ComplexD a = Nd*1.0+this->mass[s];
|
||||
ComplexD b(0.0,this->mu[s]);
|
||||
axpbg5y_ssp(tmp,a,in,b,in,s,s);
|
||||
}
|
||||
|
@@ -58,7 +58,7 @@ public:
|
||||
{
|
||||
// RealD eps = 1.0;
|
||||
std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
|
||||
Vector<Coeff_t> zgamma(this->Ls);
|
||||
std::vector<Coeff_t> zgamma(this->Ls);
|
||||
for(int s=0;s<this->Ls;s++){
|
||||
zgamma[s] = gamma[s];
|
||||
}
|
||||
|
@@ -1,3 +1,5 @@
|
||||
#if 0
|
||||
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
@@ -818,3 +820,5 @@ CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
#endif
|
@@ -1,3 +1,4 @@
|
||||
#if 0
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
@@ -241,3 +242,4 @@ void LebesgueOrder::ZGraph(void)
|
||||
}
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
#endif
|
@@ -72,7 +72,7 @@ public:
|
||||
void ThreadInterleave(void);
|
||||
|
||||
private:
|
||||
Vector<IndexInteger> _LebesgueReorder;
|
||||
deviceVector<IndexInteger> _LebesgueReorder;
|
||||
|
||||
};
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user