mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-12 20:27:06 +01:00
Compare commits
169 Commits
6efe720fd2
...
develop
Author | SHA1 | Date | |
---|---|---|---|
9203126aa5 | |||
f90ba4712a | |||
3737a24096 | |||
d418f78352 | |||
25163998a0 | |||
dc546aaa4b | |||
5364d580c9 | |||
2a9a6347e3 | |||
cfdb56f314 | |||
b517e88db3 | |||
bb317aba8d | |||
644cc6647e | |||
72397ce23b | |||
d60a80c098 | |||
bb8b6d9d73 | |||
677b4cc5b0 | |||
be565ffab6 | |||
df6120e5f6 | |||
21de6f7da8 | |||
dbe39f9ce0 | |||
ab3de50d5e | |||
c545bd2139 | |||
6a1c64fbdd | |||
b75809ed61 | |||
ecaf228e5c | |||
6d015ae8fc | |||
233150d93f | |||
7af8c77a52 | |||
a957e7bfa1 | |||
cee4c8ce8c | |||
96bf814d8c | |||
7ddc422788 | |||
e652fc2825 | |||
a49fa3f8d0 | |||
cd452a2f91 | |||
4f89f603ae | |||
11dc2c5e1d | |||
6fec3c15ca | |||
938c47480f | |||
3811d19298 | |||
83a3ab6b6f | |||
d66a9af6a3 | |||
adc90d3a86 | |||
ebbd015c5c | |||
4ab73b36b2 | |||
130e07a422 | |||
8f47bb367e | |||
0c3cb60135 | |||
9eae8fca5d | |||
882a217074 | |||
e465fce201 | |||
d41542c64b | |||
199818bd6c | |||
fe66c7ca30 | |||
e9177e4af3 | |||
d15a6c5933 | |||
25ab9325e7 | |||
19f9378b98 | |||
785bc7a14f | |||
1a1fe85428 | |||
0000d2e558 | |||
9ffd1ed4ce | |||
3d014864e2 | |||
1d22841811 | |||
a1cdda833f | |||
ad6db92690 | |||
e8ff9d8e50 | |||
795769c636 | |||
267a39d943 | |||
3624bd3d22 | |||
bc12dbbb38 | |||
eb8a008a8f | |||
c4d9aa1a21 | |||
6ae809ed40 | |||
311e2aab3f | |||
438dfbdb83 | |||
b2ce760cf4 | |||
b1ba209696 | |||
cb3e529b1e | |||
717f647418 | |||
98e7418187 | |||
fe05bf48b1 | |||
d2dd8f54e2 | |||
7726ee4b16 | |||
ba9bbe0221 | |||
4c3dd82d84 | |||
44e911b5b7 | |||
a7a16df9d0 | |||
382e0abefd | |||
6fdefe5b90 | |||
4788dd8e2e | |||
1cc5f221f3 | |||
93251bfba0 | |||
18b79508b8 | |||
4de5ed1613 | |||
0baaddbe98 | |||
8729c46169 | |||
09f81fe7c3 | |||
1876e5b7c0 | |||
355ec76257 | |||
b50fb34e71 | |||
de84d730ff | |||
c74d11e3d7 | |||
84cab5e6e7 | |||
c4fc972fec | |||
8cf809e231 | |||
94019a922e | |||
4f17c8d081 | |||
aaab753982 | |||
d6b2727f86 | |||
74a4f43946 | |||
1caf8b0f86 | |||
570b72a47b | |||
a5798a89ed | |||
3f3661a86f | |||
f7e2f9a401 | |||
2848a9b558 | |||
d4868991af | |||
e99d42404e | |||
3ba019c747 | |||
47429218bb | |||
8fe429346f | |||
5a4f9bf2e3 | |||
b91fc1b6b4 | |||
eafc150034 | |||
2877f1a268 | |||
1e893af775 | |||
d9f430a575 | |||
63abe87f36 | |||
368d649c8a | |||
5603464f39 | |||
655c79f39e | |||
565b231c03 | |||
62a9f180fa | |||
5ae77876a8 | |||
4ed2c2c74f | |||
955da582b6 | |||
11b07b950d | |||
8f70cfeda9 | |||
ce64271048 | |||
5cc4f3241d | |||
6815e138b4 | |||
a78a61d76f | |||
2eff3f34ed | |||
03687c1d62 | |||
febfe4e77f | |||
4d1aa134b5 | |||
5ec879860a | |||
b728af903c | |||
54f1999030 | |||
fd58f0b669 | |||
c5c67b706e | |||
be7a543e2c | |||
68f112d576 | |||
ec1395a304 | |||
beb0e474ee | |||
2b5fdcbbc5 | |||
295127d456 | |||
7dcfb13694 | |||
9fa8bd6438 | |||
02c8178f16 | |||
e637fbacae | |||
066544281f | |||
11be10d2c0 | |||
160969a758 | |||
622f78ebea | |||
8d305df0db | |||
e29b97b3ea | |||
ad2b699d2b |
@ -168,6 +168,7 @@ public:
|
||||
template<class vobj>
|
||||
void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
|
||||
#ifndef HAVE_FFTW
|
||||
std::cerr << "FFTW is not compiled but is called"<<std::endl;
|
||||
assert(0);
|
||||
#else
|
||||
conformable(result.Grid(),vgrid);
|
||||
@ -190,7 +191,8 @@ public:
|
||||
|
||||
Lattice<sobj> pgbuf(&pencil_g);
|
||||
autoView(pgbuf_v , pgbuf, CpuWrite);
|
||||
|
||||
//std::cout << "CPU view" << std::endl;
|
||||
|
||||
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
|
||||
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
|
||||
|
||||
@ -213,6 +215,7 @@ public:
|
||||
else if ( sign == forward ) div = 1.0;
|
||||
else assert(0);
|
||||
|
||||
//std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
|
||||
FFTW_plan p;
|
||||
{
|
||||
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
|
||||
@ -226,6 +229,7 @@ public:
|
||||
}
|
||||
|
||||
// Barrel shift and collect global pencil
|
||||
//std::cout << GridLogPerformance<<"Making pencil" << std::endl;
|
||||
Coordinate lcoor(Nd), gcoor(Nd);
|
||||
result = source;
|
||||
int pc = processor_coor[dim];
|
||||
@ -247,6 +251,7 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
//std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
|
||||
// Loop over orthog coords
|
||||
int NN=pencil_g.lSites();
|
||||
GridStopWatch timer;
|
||||
@ -269,6 +274,7 @@ public:
|
||||
usec += timer.useconds();
|
||||
flops+= flops_call*NN;
|
||||
|
||||
//std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
|
||||
// writing out result
|
||||
{
|
||||
autoView(pgbuf_v,pgbuf,CpuRead);
|
||||
@ -285,6 +291,7 @@ public:
|
||||
}
|
||||
result = result*div;
|
||||
|
||||
//std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
|
||||
// destroying plan
|
||||
FFTW<scalar>::fftw_destroy_plan(p);
|
||||
#endif
|
||||
|
@ -277,6 +277,38 @@ public:
|
||||
assert(0);
|
||||
}
|
||||
};
|
||||
template<class Matrix,class Field>
|
||||
class ShiftedNonHermitianLinearOperator : public LinearOperatorBase<Field> {
|
||||
Matrix &_Mat;
|
||||
RealD shift;
|
||||
public:
|
||||
ShiftedNonHermitianLinearOperator(Matrix &Mat,RealD shft): _Mat(Mat),shift(shft){};
|
||||
// Support for coarsening to a multigrid
|
||||
void OpDiag (const Field &in, Field &out) {
|
||||
_Mat.Mdiag(in,out);
|
||||
out = out + shift*in;
|
||||
}
|
||||
void OpDir (const Field &in, Field &out,int dir,int disp) {
|
||||
_Mat.Mdir(in,out,dir,disp);
|
||||
}
|
||||
void OpDirAll (const Field &in, std::vector<Field> &out){
|
||||
_Mat.MdirAll(in,out);
|
||||
};
|
||||
void Op (const Field &in, Field &out){
|
||||
_Mat.M(in,out);
|
||||
out = out + shift * in;
|
||||
}
|
||||
void AdjOp (const Field &in, Field &out){
|
||||
_Mat.Mdag(in,out);
|
||||
out = out + shift * in;
|
||||
}
|
||||
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
|
||||
assert(0);
|
||||
}
|
||||
void HermOp(const Field &in, Field &out){
|
||||
assert(0);
|
||||
}
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////////
|
||||
// Even Odd Schur decomp operators; there are several
|
||||
|
@ -269,7 +269,9 @@ public:
|
||||
RealD xscale = 2.0/(hi-lo);
|
||||
RealD mscale = -(hi+lo)/(hi-lo);
|
||||
Linop.HermOp(T0,y);
|
||||
grid->Barrier();
|
||||
axpby(T1,xscale,mscale,y,in);
|
||||
grid->Barrier();
|
||||
|
||||
// sum = .5 c[0] T0 + c[1] T1
|
||||
// out = ()*T0 + Coeffs[1]*T1;
|
||||
|
@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
|
||||
typedef cublasHandle_t gridblasHandle_t;
|
||||
#endif
|
||||
#ifdef GRID_SYCL
|
||||
typedef cl::sycl::queue *gridblasHandle_t;
|
||||
typedef sycl::queue *gridblasHandle_t;
|
||||
#endif
|
||||
#ifdef GRID_ONE_MKL
|
||||
typedef cl::sycl::queue *gridblasHandle_t;
|
||||
typedef sycl::queue *gridblasHandle_t;
|
||||
#endif
|
||||
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
|
||||
typedef int32_t gridblasHandle_t;
|
||||
@ -89,9 +89,9 @@ public:
|
||||
gridblasHandle = theGridAccelerator;
|
||||
#endif
|
||||
#ifdef GRID_ONE_MKL
|
||||
cl::sycl::gpu_selector selector;
|
||||
cl::sycl::device selectedDevice { selector };
|
||||
cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()};
|
||||
sycl::gpu_selector selector;
|
||||
sycl::device selectedDevice { selector };
|
||||
sycl::property_list q_prop{sycl::property::queue::in_order()};
|
||||
gridblasHandle =new sycl::queue (selectedDevice,q_prop);
|
||||
#endif
|
||||
gridblasInit=1;
|
||||
@ -208,8 +208,8 @@ public:
|
||||
assert(Bkn.size()==batchCount);
|
||||
assert(Cmn.size()==batchCount);
|
||||
|
||||
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
||||
assert(OpB!=GridBLAS_OP_T);
|
||||
//assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
||||
//assert(OpB!=GridBLAS_OP_T);
|
||||
|
||||
int lda = m; // m x k column major
|
||||
int ldb = k; // k x n column major
|
||||
@ -367,28 +367,67 @@ public:
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk.adjoint() * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn.adjoint() ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||
else
|
||||
eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||
} );
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcd> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcd> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcd> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
} );
|
||||
} else {
|
||||
assert(0);
|
||||
@ -414,8 +453,8 @@ public:
|
||||
RealD t2=usecond();
|
||||
int32_t batchCount = Amk.size();
|
||||
|
||||
assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
||||
assert(OpB!=GridBLAS_OP_T);
|
||||
//assert(OpA!=GridBLAS_OP_T); // Complex case expect no transpose
|
||||
//assert(OpB!=GridBLAS_OP_T);
|
||||
|
||||
int lda = m; // m x k column major
|
||||
int ldb = k; // k x n column major
|
||||
@ -514,28 +553,70 @@ public:
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk.adjoint() * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_C) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.adjoint() ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn.adjoint() ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn.transpose() ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_C ) && (OpB == GridBLAS_OP_C) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||
else
|
||||
eCmn = alpha * eAmk.adjoint() * eBkn.adjoint() ;
|
||||
} );
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXcf> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXcf> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXcf> eCmn(Cmn[p],m,n);
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
} );
|
||||
} else {
|
||||
assert(0);
|
||||
@ -661,29 +742,41 @@ public:
|
||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn.transpose() ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXf> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXf> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXf> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
} );
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
});
|
||||
} else {
|
||||
assert(0);
|
||||
}
|
||||
@ -809,28 +902,40 @@ public:
|
||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_N) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],k,n);
|
||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_N ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],m,k);
|
||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk * eBkn.transpose() ;
|
||||
else
|
||||
eCmn = alpha * eAmk * eBkn.transpose() ;
|
||||
});
|
||||
} else if ( (OpA == GridBLAS_OP_T ) && (OpB == GridBLAS_OP_T) ) {
|
||||
thread_for (p, batchCount, {
|
||||
Eigen::Map<Eigen::MatrixXd> eAmk(Amk[p],k,m);
|
||||
Eigen::Map<Eigen::MatrixXd> eBkn(Bkn[p],n,k);
|
||||
Eigen::Map<Eigen::MatrixXd> eCmn(Cmn[p],m,n);
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
if (std::abs(beta) != 0.0)
|
||||
eCmn = beta * eCmn + alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
else
|
||||
eCmn = alpha * eAmk.transpose() * eBkn.transpose() ;
|
||||
});
|
||||
} else {
|
||||
assert(0);
|
||||
|
@ -116,14 +116,14 @@ NAMESPACE_BEGIN(Grid);
|
||||
//Compute double precision rsd and also new RHS vector.
|
||||
Linop_d.HermOp(sol_d, tmp_d);
|
||||
RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
|
||||
|
||||
std::cout<<GridLogMessage<<" rsd norm "<<norm<<std::endl;
|
||||
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
|
||||
|
||||
if(norm < OuterLoopNormMult * stop){
|
||||
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
|
||||
break;
|
||||
}
|
||||
while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
|
||||
while(norm * inner_tol * inner_tol < stop*1.01) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
|
||||
|
||||
PrecChangeTimer.Start();
|
||||
precisionChange(src_f, src_d, pc_wk_dp_to_sp);
|
||||
|
@ -102,11 +102,11 @@ public:
|
||||
assert(mass.size()==nshift);
|
||||
assert(mresidual.size()==nshift);
|
||||
|
||||
// dynamic sized arrays on stack; 2d is a pain with vector
|
||||
RealD bs[nshift];
|
||||
RealD rsq[nshift];
|
||||
RealD z[nshift][2];
|
||||
int converged[nshift];
|
||||
// remove dynamic sized arrays on stack; 2d is a pain with vector
|
||||
std::vector<RealD> bs(nshift);
|
||||
std::vector<RealD> rsq(nshift);
|
||||
std::vector<std::array<RealD,2> > z(nshift);
|
||||
std::vector<int> converged(nshift);
|
||||
|
||||
const int primary =0;
|
||||
|
||||
|
@ -123,11 +123,11 @@ public:
|
||||
assert(mresidual.size()==nshift);
|
||||
|
||||
// dynamic sized arrays on stack; 2d is a pain with vector
|
||||
RealD bs[nshift];
|
||||
RealD rsq[nshift];
|
||||
RealD rsqf[nshift];
|
||||
RealD z[nshift][2];
|
||||
int converged[nshift];
|
||||
std::vector<RealD> bs(nshift);
|
||||
std::vector<RealD> rsq(nshift);
|
||||
std::vector<RealD> rsqf(nshift);
|
||||
std::vector<std::array<RealD,2> > z(nshift);
|
||||
std::vector<int> converged(nshift);
|
||||
|
||||
const int primary =0;
|
||||
|
||||
|
@ -156,11 +156,11 @@ public:
|
||||
assert(mresidual.size()==nshift);
|
||||
|
||||
// dynamic sized arrays on stack; 2d is a pain with vector
|
||||
RealD bs[nshift];
|
||||
RealD rsq[nshift];
|
||||
RealD rsqf[nshift];
|
||||
RealD z[nshift][2];
|
||||
int converged[nshift];
|
||||
std::vector<RealD> bs(nshift);
|
||||
std::vector<RealD> rsq(nshift);
|
||||
std::vector<RealD> rsqf(nshift);
|
||||
std::vector<std::array<RealD,2> > z(nshift);
|
||||
std::vector<int> converged(nshift);
|
||||
|
||||
const int primary =0;
|
||||
|
||||
|
@ -245,9 +245,10 @@ until convergence
|
||||
_HermOp(src_n,tmp);
|
||||
// std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
|
||||
// std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
|
||||
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
|
||||
// RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
|
||||
RealD vnum = real(innerProduct(tmp,tmp)); // HermOp^2.
|
||||
RealD vden = norm2(src_n);
|
||||
RealD na = vnum/vden;
|
||||
RealD na = std::sqrt(vnum/vden);
|
||||
if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
|
||||
i=_MAX_ITER_IRL_MEVAPP_;
|
||||
evalMaxApprox = na;
|
||||
@ -255,6 +256,7 @@ until convergence
|
||||
src_n = tmp;
|
||||
}
|
||||
}
|
||||
std::cout << GridLogIRL << " Final evalMaxApprox " << evalMaxApprox << std::endl;
|
||||
|
||||
std::vector<RealD> lme(Nm);
|
||||
std::vector<RealD> lme2(Nm);
|
||||
|
@ -74,7 +74,7 @@ public:
|
||||
|
||||
void operator() (const Field &src, Field &psi){
|
||||
|
||||
psi=Zero();
|
||||
// psi=Zero();
|
||||
RealD cp, ssq,rsq;
|
||||
ssq=norm2(src);
|
||||
rsq=Tolerance*Tolerance*ssq;
|
||||
|
@ -30,6 +30,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
/* END LEGAL */
|
||||
#pragma once
|
||||
|
||||
#include <Grid/algorithms/iterative/PrecGeneralisedConjugateResidualNonHermitian.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
inline RealD AggregatePowerLaw(RealD x)
|
||||
@ -95,7 +97,7 @@ public:
|
||||
|
||||
RealD scale;
|
||||
|
||||
ConjugateGradient<FineField> CG(1.0e-2,100,false);
|
||||
ConjugateGradient<FineField> CG(1.0e-3,400,false);
|
||||
FineField noise(FineGrid);
|
||||
FineField Mn(FineGrid);
|
||||
|
||||
@ -108,7 +110,7 @@ public:
|
||||
|
||||
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
|
||||
|
||||
for(int i=0;i<1;i++){
|
||||
for(int i=0;i<4;i++){
|
||||
|
||||
CG(hermop,noise,subspace[b]);
|
||||
|
||||
@ -124,6 +126,53 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
virtual void CreateSubspaceGCR(GridParallelRNG &RNG,LinearOperatorBase<FineField> &DiracOp,int nn=nbasis)
|
||||
{
|
||||
RealD scale;
|
||||
|
||||
TrivialPrecon<FineField> simple_fine;
|
||||
PrecGeneralisedConjugateResidualNonHermitian<FineField> GCR(0.001,30,DiracOp,simple_fine,12,12);
|
||||
FineField noise(FineGrid);
|
||||
FineField src(FineGrid);
|
||||
FineField guess(FineGrid);
|
||||
FineField Mn(FineGrid);
|
||||
|
||||
for(int b=0;b<nn;b++){
|
||||
|
||||
subspace[b] = Zero();
|
||||
gaussian(RNG,noise);
|
||||
scale = std::pow(norm2(noise),-0.5);
|
||||
noise=noise*scale;
|
||||
|
||||
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|Op|n> "<<innerProduct(noise,Mn)<<std::endl;
|
||||
|
||||
for(int i=0;i<2;i++){
|
||||
// void operator() (const Field &src, Field &psi){
|
||||
#if 1
|
||||
std::cout << GridLogMessage << " inverting on noise "<<std::endl;
|
||||
src = noise;
|
||||
guess=Zero();
|
||||
GCR(src,guess);
|
||||
subspace[b] = guess;
|
||||
#else
|
||||
std::cout << GridLogMessage << " inverting on zero "<<std::endl;
|
||||
src=Zero();
|
||||
guess = noise;
|
||||
GCR(src,guess);
|
||||
subspace[b] = guess;
|
||||
#endif
|
||||
noise = subspace[b];
|
||||
scale = std::pow(norm2(noise),-0.5);
|
||||
noise=noise*scale;
|
||||
|
||||
}
|
||||
|
||||
DiracOp.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|Op|f> "<<innerProduct(noise,Mn)<<std::endl;
|
||||
subspace[b] = noise;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
|
||||
// and this is the best I found
|
||||
@ -160,14 +209,21 @@ public:
|
||||
|
||||
int b =0;
|
||||
{
|
||||
ComplexD ip;
|
||||
// Filter
|
||||
Chebyshev<FineField> Cheb(lo,hi,orderfilter);
|
||||
Cheb(hermop,noise,Mn);
|
||||
// normalise
|
||||
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
|
||||
subspace[b] = Mn;
|
||||
hermop.Op(Mn,tmp);
|
||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
|
||||
|
||||
hermop.Op(Mn,tmp);
|
||||
ip= innerProduct(Mn,tmp);
|
||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
|
||||
|
||||
hermop.AdjOp(Mn,tmp);
|
||||
ip = innerProduct(Mn,tmp);
|
||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
|
||||
b++;
|
||||
}
|
||||
|
||||
@ -213,8 +269,18 @@ public:
|
||||
Mn=*Tnp;
|
||||
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
|
||||
subspace[b] = Mn;
|
||||
hermop.Op(Mn,tmp);
|
||||
std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
|
||||
|
||||
|
||||
ComplexD ip;
|
||||
|
||||
hermop.Op(Mn,tmp);
|
||||
ip= innerProduct(Mn,tmp);
|
||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|Op|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
|
||||
|
||||
hermop.AdjOp(Mn,tmp);
|
||||
ip = innerProduct(Mn,tmp);
|
||||
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|AdjOp|n> "<<norm2(tmp)<<" "<<ip<<std::endl;
|
||||
|
||||
b++;
|
||||
}
|
||||
|
||||
|
@ -99,7 +99,7 @@ public:
|
||||
CoarseMatrix AselfInvEven;
|
||||
CoarseMatrix AselfInvOdd;
|
||||
|
||||
Vector<RealD> dag_factor;
|
||||
deviceVector<RealD> dag_factor;
|
||||
|
||||
///////////////////////
|
||||
// Interface
|
||||
@ -124,9 +124,13 @@ public:
|
||||
int npoint = geom.npoint;
|
||||
typedef LatticeView<Cobj> Aview;
|
||||
|
||||
Vector<Aview> AcceleratorViewContainer;
|
||||
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
|
||||
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
|
||||
|
||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
|
||||
for(int p=0;p<geom.npoint;p++) {
|
||||
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
|
||||
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
|
||||
}
|
||||
Aview *Aview_p = & AcceleratorViewContainer[0];
|
||||
|
||||
const int Nsimd = CComplex::Nsimd();
|
||||
@ -161,7 +165,7 @@ public:
|
||||
coalescedWrite(out_v[ss](b),res);
|
||||
});
|
||||
|
||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
|
||||
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
|
||||
};
|
||||
|
||||
void Mdag (const CoarseVector &in, CoarseVector &out)
|
||||
@ -190,9 +194,14 @@ public:
|
||||
int npoint = geom.npoint;
|
||||
typedef LatticeView<Cobj> Aview;
|
||||
|
||||
Vector<Aview> AcceleratorViewContainer;
|
||||
|
||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
|
||||
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
|
||||
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
|
||||
|
||||
for(int p=0;p<geom.npoint;p++) {
|
||||
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
|
||||
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
|
||||
}
|
||||
Aview *Aview_p = & AcceleratorViewContainer[0];
|
||||
|
||||
const int Nsimd = CComplex::Nsimd();
|
||||
@ -201,10 +210,10 @@ public:
|
||||
|
||||
int osites=Grid()->oSites();
|
||||
|
||||
Vector<int> points(geom.npoint, 0);
|
||||
for(int p=0; p<geom.npoint; p++)
|
||||
points[p] = geom.points_dagger[p];
|
||||
|
||||
deviceVector<int> points(geom.npoint);
|
||||
for(int p=0; p<geom.npoint; p++) {
|
||||
acceleratorPut(points[p],geom.points_dagger[p]);
|
||||
}
|
||||
auto points_p = &points[0];
|
||||
|
||||
RealD* dag_factor_p = &dag_factor[0];
|
||||
@ -236,7 +245,7 @@ public:
|
||||
coalescedWrite(out_v[ss](b),res);
|
||||
});
|
||||
|
||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
|
||||
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
|
||||
}
|
||||
|
||||
void MdirComms(const CoarseVector &in)
|
||||
@ -251,8 +260,14 @@ public:
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
|
||||
typedef LatticeView<Cobj> Aview;
|
||||
Vector<Aview> AcceleratorViewContainer;
|
||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
|
||||
|
||||
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
|
||||
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
|
||||
|
||||
for(int p=0;p<geom.npoint;p++) {
|
||||
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
|
||||
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
|
||||
}
|
||||
Aview *Aview_p = & AcceleratorViewContainer[0];
|
||||
|
||||
autoView( out_v , out, AcceleratorWrite);
|
||||
@ -285,7 +300,7 @@ public:
|
||||
}
|
||||
coalescedWrite(out_v[ss](b),res);
|
||||
});
|
||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
|
||||
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
|
||||
}
|
||||
void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
|
||||
{
|
||||
@ -469,14 +484,20 @@ public:
|
||||
|
||||
// determine in what order we need the points
|
||||
int npoint = geom.npoint-1;
|
||||
Vector<int> points(npoint, 0);
|
||||
for(int p=0; p<npoint; p++)
|
||||
points[p] = (dag && !hermitian) ? geom.points_dagger[p] : p;
|
||||
|
||||
deviceVector<int> points(npoint);
|
||||
for(int p=0; p<npoint; p++) {
|
||||
int val = (dag && !hermitian) ? geom.points_dagger[p] : p;
|
||||
acceleratorPut(points[p], val);
|
||||
}
|
||||
auto points_p = &points[0];
|
||||
|
||||
Vector<Aview> AcceleratorViewContainer;
|
||||
for(int p=0;p<npoint;p++) AcceleratorViewContainer.push_back(a[p].View(AcceleratorRead));
|
||||
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
|
||||
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
|
||||
|
||||
for(int p=0;p<geom.npoint;p++) {
|
||||
hAcceleratorViewContainer[p] = a[p].View(AcceleratorRead);
|
||||
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
|
||||
}
|
||||
Aview *Aview_p = & AcceleratorViewContainer[0];
|
||||
|
||||
const int Nsimd = CComplex::Nsimd();
|
||||
@ -539,7 +560,7 @@ public:
|
||||
});
|
||||
}
|
||||
|
||||
for(int p=0;p<npoint;p++) AcceleratorViewContainer[p].ViewClose();
|
||||
for(int p=0;p<npoint;p++) hAcceleratorViewContainer[p].ViewClose();
|
||||
}
|
||||
|
||||
CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) :
|
||||
@ -590,11 +611,13 @@ public:
|
||||
}
|
||||
|
||||
// GPU readable prefactor
|
||||
std::vector<RealD> h_dag_factor(nbasis*nbasis);
|
||||
thread_for(i, nbasis*nbasis, {
|
||||
int j = i/nbasis;
|
||||
int k = i%nbasis;
|
||||
dag_factor[i] = dag_factor_eigen(j, k);
|
||||
h_dag_factor[i] = dag_factor_eigen(j, k);
|
||||
});
|
||||
acceleratorCopyToDevice(&h_dag_factor[0],&dag_factor[0],dag_factor.size()*sizeof(RealD));
|
||||
}
|
||||
|
||||
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
|
||||
|
@ -441,8 +441,20 @@ public:
|
||||
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
|
||||
}
|
||||
#else
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// Galerkin projection of matrix
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
|
||||
Aggregation<Fobj,CComplex,nbasis> & Subspace)
|
||||
{
|
||||
CoarsenOperator(linop,Subspace,Subspace);
|
||||
}
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// Petrov - Galerkin projection of matrix
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
|
||||
Aggregation<Fobj,CComplex,nbasis> & U,
|
||||
Aggregation<Fobj,CComplex,nbasis> & V)
|
||||
{
|
||||
std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
|
||||
GridBase *grid = FineGrid();
|
||||
@ -458,11 +470,9 @@ public:
|
||||
// Orthogonalise the subblocks over the basis
|
||||
/////////////////////////////////////////////////////////////
|
||||
CoarseScalar InnerProd(CoarseGrid());
|
||||
blockOrthogonalise(InnerProd,Subspace.subspace);
|
||||
blockOrthogonalise(InnerProd,V.subspace);
|
||||
blockOrthogonalise(InnerProd,U.subspace);
|
||||
|
||||
// for(int s=0;s<Subspace.subspace.size();s++){
|
||||
// std::cout << " subspace norm "<<norm2(Subspace.subspace[s])<<std::endl;
|
||||
// }
|
||||
const int npoint = geom.npoint;
|
||||
|
||||
Coordinate clatt = CoarseGrid()->GlobalDimensions();
|
||||
@ -542,7 +552,7 @@ public:
|
||||
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
|
||||
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
|
||||
tphaseBZ-=usecond();
|
||||
phaV = phaF[p]*Subspace.subspace[i];
|
||||
phaV = phaF[p]*V.subspace[i];
|
||||
tphaseBZ+=usecond();
|
||||
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
@ -555,7 +565,7 @@ public:
|
||||
// std::cout << i << " " <<p << " MphaV "<<norm2(MphaV)<<" "<<norm2(phaV)<<std::endl;
|
||||
|
||||
tproj-=usecond();
|
||||
blockProject(coarseInner,MphaV,Subspace.subspace);
|
||||
blockProject(coarseInner,MphaV,U.subspace);
|
||||
coarseInner = conjugate(pha[p]) * coarseInner;
|
||||
|
||||
ComputeProj[p] = coarseInner;
|
||||
|
@ -69,7 +69,7 @@ public:
|
||||
}
|
||||
|
||||
// FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
|
||||
void construct(pointer __p, const _Tp& __val) { assert(0);};
|
||||
void construct(pointer __p, const _Tp& __val) { };
|
||||
void construct(pointer __p) { };
|
||||
void destroy(pointer __p) { };
|
||||
};
|
||||
@ -174,19 +174,10 @@ template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const d
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Template typedefs
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
#ifdef ACCELERATOR_CSHIFT
|
||||
// Cshift on device
|
||||
template<class T> using cshiftAllocator = devAllocator<T>;
|
||||
#else
|
||||
// Cshift on host
|
||||
template<class T> using cshiftAllocator = std::allocator<T>;
|
||||
#endif
|
||||
|
||||
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
|
||||
template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;
|
||||
template<class T> using commVector = std::vector<T,devAllocator<T> >;
|
||||
template<class T> using deviceVector = std::vector<T,devAllocator<T> >;
|
||||
template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
|
||||
template<class T> using hostVector = std::vector<T,alignedAllocator<T> >; // Needs autoview
|
||||
template<class T> using Vector = std::vector<T,uvmAllocator<T> >; // Really want to deprecate
|
||||
template<class T> using uvmVector = std::vector<T,uvmAllocator<T> >; // auto migrating page
|
||||
template<class T> using deviceVector = std::vector<T,devAllocator<T> >; // device vector
|
||||
|
||||
/*
|
||||
template<class T> class vecView
|
||||
@ -197,8 +188,9 @@ template<class T> class vecView
|
||||
ViewMode mode;
|
||||
void * cpu_ptr;
|
||||
public:
|
||||
// Rvalue accessor
|
||||
accelerator_inline T & operator[](size_t i) const { return this->data[i]; };
|
||||
vecView(std::vector<T> &refer_to_me,ViewMode _mode)
|
||||
vecView(Vector<T> &refer_to_me,ViewMode _mode)
|
||||
{
|
||||
cpu_ptr = &refer_to_me[0];
|
||||
size = refer_to_me.size();
|
||||
@ -214,22 +206,12 @@ template<class T> class vecView
|
||||
}
|
||||
};
|
||||
|
||||
template<class T> vecView<T> VectorView(std::vector<T> &vec,ViewMode _mode)
|
||||
template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode)
|
||||
{
|
||||
vecView<T> ret(vec,_mode); // does the open
|
||||
return ret; // must be closed
|
||||
}
|
||||
|
||||
// Little autoscope assister
|
||||
template<class View>
|
||||
class VectorViewCloser
|
||||
{
|
||||
View v; // Take a copy of view and call view close when I go out of scope automatically
|
||||
public:
|
||||
VectorViewCloser(View &_v) : v(_v) {};
|
||||
~VectorViewCloser() { auto ptr = v.cpu_ptr; v.ViewClose(); MemoryManager::NotifyDeletion(ptr);}
|
||||
};
|
||||
|
||||
#define autoVecView(v_v,v,mode) \
|
||||
auto v_v = VectorView(v,mode); \
|
||||
ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
|
||||
|
@ -1,16 +1,15 @@
|
||||
#include <Grid/GridCore.h>
|
||||
#ifndef GRID_UVM
|
||||
|
||||
#warning "Using explicit device memory copies"
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
#define MAXLINE 512
|
||||
static char print_buffer [ MAXLINE ];
|
||||
|
||||
#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
|
||||
#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer;
|
||||
#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl;
|
||||
#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer << std::endl;
|
||||
//#define dprintf(...)
|
||||
|
||||
//#define mprintf(...)
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
// For caching copies of data on device
|
||||
@ -111,7 +110,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
|
||||
///////////////////////////////////////////////////////////
|
||||
assert(AccCache.state!=Empty);
|
||||
|
||||
dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
|
||||
dprintf("MemoryManager: Discard(%lx) %lx",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
|
||||
assert(AccCache.accLock==0);
|
||||
assert(AccCache.cpuLock==0);
|
||||
assert(AccCache.CpuPtr!=(uint64_t)NULL);
|
||||
@ -121,7 +120,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
|
||||
DeviceBytes -=AccCache.bytes;
|
||||
LRUremove(AccCache);
|
||||
AccCache.AccPtr=(uint64_t) NULL;
|
||||
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
|
||||
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
|
||||
}
|
||||
uint64_t CpuPtr = AccCache.CpuPtr;
|
||||
EntryErase(CpuPtr);
|
||||
@ -141,7 +140,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
assert(AccCache.state!=Empty);
|
||||
|
||||
mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld\n",
|
||||
mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld",
|
||||
(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
|
||||
(uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock);
|
||||
if (AccCache.accLock!=0) return;
|
||||
@ -155,7 +154,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
|
||||
AccCache.AccPtr=(uint64_t)NULL;
|
||||
AccCache.state=CpuDirty; // CPU primary now
|
||||
DeviceBytes -=AccCache.bytes;
|
||||
dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
|
||||
dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld ",(uint64_t)AccCache.AccPtr,DeviceBytes);
|
||||
}
|
||||
// uint64_t CpuPtr = AccCache.CpuPtr;
|
||||
DeviceEvictions++;
|
||||
@ -169,7 +168,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
|
||||
assert(AccCache.AccPtr!=(uint64_t)NULL);
|
||||
assert(AccCache.CpuPtr!=(uint64_t)NULL);
|
||||
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
|
||||
mprintf("MemoryManager: acceleratorCopyFromDevice Flush AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
||||
mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
||||
DeviceToHostBytes+=AccCache.bytes;
|
||||
DeviceToHostXfer++;
|
||||
AccCache.state=Consistent;
|
||||
@ -184,7 +183,9 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
|
||||
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
|
||||
DeviceBytes+=AccCache.bytes;
|
||||
}
|
||||
mprintf("MemoryManager: acceleratorCopyToDevice Clone AccPtr %lx <- CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
||||
mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx",
|
||||
(uint64_t)AccCache.bytes,
|
||||
(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
|
||||
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
|
||||
HostToDeviceBytes+=AccCache.bytes;
|
||||
HostToDeviceXfer++;
|
||||
@ -210,7 +211,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
|
||||
void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
|
||||
{
|
||||
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
|
||||
dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr);
|
||||
dprintf("AcceleratorViewClose %lx",(uint64_t)Ptr);
|
||||
AcceleratorViewClose((uint64_t)Ptr);
|
||||
} else if( (mode==CpuRead)||(mode==CpuWrite)){
|
||||
CpuViewClose((uint64_t)Ptr);
|
||||
@ -222,7 +223,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
|
||||
{
|
||||
uint64_t CpuPtr = (uint64_t)_CpuPtr;
|
||||
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
|
||||
dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr);
|
||||
dprintf("AcceleratorViewOpen %lx",(uint64_t)CpuPtr);
|
||||
return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
|
||||
} else if( (mode==CpuRead)||(mode==CpuWrite)){
|
||||
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
|
||||
@ -233,6 +234,9 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
|
||||
}
|
||||
void MemoryManager::EvictVictims(uint64_t bytes)
|
||||
{
|
||||
if(bytes>=DeviceMaxBytes) {
|
||||
printf("EvictVictims bytes %ld DeviceMaxBytes %ld\n",bytes,DeviceMaxBytes);
|
||||
}
|
||||
assert(bytes<DeviceMaxBytes);
|
||||
while(bytes+DeviceLRUBytes > DeviceMaxBytes){
|
||||
if ( DeviceLRUBytes > 0){
|
||||
@ -265,7 +269,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
|
||||
assert(AccCache.cpuLock==0); // Programming error
|
||||
|
||||
if(AccCache.state!=Empty) {
|
||||
dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n",
|
||||
dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld",
|
||||
(uint64_t)AccCache.CpuPtr,
|
||||
(uint64_t)CpuPtr,
|
||||
(uint64_t)AccCache.bytes,
|
||||
@ -305,7 +309,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
|
||||
AccCache.state = Consistent; // Empty + AccRead => Consistent
|
||||
}
|
||||
AccCache.accLock= 1;
|
||||
dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock);
|
||||
dprintf("Copied Empty entry into device accLock= %d",AccCache.accLock);
|
||||
} else if(AccCache.state==CpuDirty ){
|
||||
if(mode==AcceleratorWriteDiscard) {
|
||||
CpuDiscard(AccCache);
|
||||
@ -318,21 +322,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
|
||||
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
|
||||
}
|
||||
AccCache.accLock++;
|
||||
dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock);
|
||||
dprintf("CpuDirty entry into device ++accLock= %d",AccCache.accLock);
|
||||
} else if(AccCache.state==Consistent) {
|
||||
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
|
||||
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
|
||||
else
|
||||
AccCache.state = Consistent; // Consistent + AccRead => Consistent
|
||||
AccCache.accLock++;
|
||||
dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock);
|
||||
dprintf("Consistent entry into device ++accLock= %d",AccCache.accLock);
|
||||
} else if(AccCache.state==AccDirty) {
|
||||
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
|
||||
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
|
||||
else
|
||||
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
|
||||
AccCache.accLock++;
|
||||
dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock);
|
||||
dprintf("AccDirty entry ++accLock= %d",AccCache.accLock);
|
||||
} else {
|
||||
assert(0);
|
||||
}
|
||||
@ -341,7 +345,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
|
||||
// If view is opened on device must remove from LRU
|
||||
if(AccCache.LRU_valid==1){
|
||||
// must possibly remove from LRU as now locked on GPU
|
||||
dprintf("AccCache entry removed from LRU \n");
|
||||
dprintf("AccCache entry removed from LRU ");
|
||||
LRUremove(AccCache);
|
||||
}
|
||||
|
||||
@ -364,10 +368,10 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
|
||||
AccCache.accLock--;
|
||||
// Move to LRU queue if not locked and close on device
|
||||
if(AccCache.accLock==0) {
|
||||
dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
|
||||
dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
|
||||
LRUinsert(AccCache);
|
||||
} else {
|
||||
dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
|
||||
dprintf("AccleratorViewClose %lx AccLock decremented to %ld",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
|
||||
}
|
||||
}
|
||||
void MemoryManager::CpuViewClose(uint64_t CpuPtr)
|
||||
|
@ -15,10 +15,10 @@ void check_huge_pages(void *Buf,uint64_t BYTES)
|
||||
uint64_t virt_pfn = (uint64_t)Buf / page_size;
|
||||
off_t offset = sizeof(uint64_t) * virt_pfn;
|
||||
uint64_t npages = (BYTES + page_size-1) / page_size;
|
||||
uint64_t pagedata[npages];
|
||||
std::vector<uint64_t> pagedata(npages);
|
||||
uint64_t ret = lseek(fd, offset, SEEK_SET);
|
||||
assert(ret == offset);
|
||||
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
|
||||
ret = ::read(fd, &pagedata[0], sizeof(uint64_t)*npages);
|
||||
assert(ret == sizeof(uint64_t) * npages);
|
||||
int nhugepages = npages / 512;
|
||||
int n4ktotal, nnothuge;
|
||||
|
@ -57,18 +57,29 @@ int CartesianCommunicator::ProcessorCount(void) { return
|
||||
// very VERY rarely (Log, serial RNG) we need world without a grid
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifdef USE_GRID_REDUCTION
|
||||
void CartesianCommunicator::GlobalSum(ComplexF &c)
|
||||
{
|
||||
GlobalSumP2P(c);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(ComplexD &c)
|
||||
{
|
||||
GlobalSumP2P(c);
|
||||
}
|
||||
#else
|
||||
void CartesianCommunicator::GlobalSum(ComplexF &c)
|
||||
{
|
||||
GlobalSumVector((float *)&c,2);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
|
||||
{
|
||||
GlobalSumVector((float *)c,2*N);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(ComplexD &c)
|
||||
{
|
||||
GlobalSumVector((double *)&c,2);
|
||||
}
|
||||
#endif
|
||||
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
|
||||
{
|
||||
GlobalSumVector((float *)c,2*N);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
|
||||
{
|
||||
GlobalSumVector((double *)c,2*N);
|
||||
|
@ -33,6 +33,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
///////////////////////////////////
|
||||
#include <Grid/communicator/SharedMemory.h>
|
||||
|
||||
#define NVLINK_GET
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
extern bool Stencil_force_mpi ;
|
||||
@ -136,7 +138,7 @@ public:
|
||||
for(int d=0;d<_ndimension;d++){
|
||||
column.resize(_processors[d]);
|
||||
column[0] = accum;
|
||||
std::vector<CommsRequest_t> list;
|
||||
std::vector<MpiCommsRequest_t> list;
|
||||
for(int p=1;p<_processors[d];p++){
|
||||
ShiftedRanks(d,p,source,dest);
|
||||
SendToRecvFromBegin(list,
|
||||
@ -147,7 +149,8 @@ public:
|
||||
sizeof(obj),d*100+p);
|
||||
|
||||
}
|
||||
CommsComplete(list);
|
||||
if (!list.empty()) // avoid triggering assert in comms == none
|
||||
CommsComplete(list);
|
||||
for(int p=1;p<_processors[d];p++){
|
||||
accum = accum + column[p];
|
||||
}
|
||||
@ -166,8 +169,8 @@ public:
|
||||
////////////////////////////////////////////////////////////
|
||||
// Face exchange, buffer swap in translational invariant way
|
||||
////////////////////////////////////////////////////////////
|
||||
void CommsComplete(std::vector<CommsRequest_t> &list);
|
||||
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void CommsComplete(std::vector<MpiCommsRequest_t> &list);
|
||||
void SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,
|
||||
void *recv,
|
||||
@ -186,6 +189,17 @@ public:
|
||||
int recv_from_rank,int do_recv,
|
||||
int bytes,int dir);
|
||||
|
||||
double StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int xmit_to_rank,int do_xmit,
|
||||
void *recv,
|
||||
int recv_from_rank,int do_recv,
|
||||
int xbytes,int rbytes,int dir);
|
||||
|
||||
// Could do a PollHtoD and have a CommsMerge dependence
|
||||
void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);
|
||||
void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
|
||||
|
||||
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int xmit_to_rank,int do_xmit,
|
||||
|
@ -30,6 +30,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
|
||||
Grid_MPI_Comm CartesianCommunicator::communicator_world;
|
||||
|
||||
////////////////////////////////////////////
|
||||
@ -257,15 +258,41 @@ CartesianCommunicator::~CartesianCommunicator()
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef USE_GRID_REDUCTION
|
||||
void CartesianCommunicator::GlobalSum(float &f){
|
||||
FlightRecorder::StepLog("GlobalSumP2P");
|
||||
CartesianCommunicator::GlobalSumP2P(f);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(double &d)
|
||||
{
|
||||
FlightRecorder::StepLog("GlobalSumP2P");
|
||||
CartesianCommunicator::GlobalSumP2P(d);
|
||||
}
|
||||
#else
|
||||
void CartesianCommunicator::GlobalSum(float &f){
|
||||
FlightRecorder::StepLog("AllReduce");
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(double &d)
|
||||
{
|
||||
FlightRecorder::StepLog("AllReduce");
|
||||
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
#endif
|
||||
void CartesianCommunicator::GlobalSum(uint32_t &u){
|
||||
FlightRecorder::StepLog("AllReduce");
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(uint64_t &u){
|
||||
FlightRecorder::StepLog("AllReduce");
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumVector(uint64_t* u,int N){
|
||||
FlightRecorder::StepLog("AllReduceVector");
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,u,N,MPI_UINT64_T,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
@ -287,27 +314,18 @@ void CartesianCommunicator::GlobalMax(double &d)
|
||||
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(float &f){
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumVector(float *f,int N)
|
||||
{
|
||||
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSum(double &d)
|
||||
{
|
||||
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::GlobalSumVector(double *d,int N)
|
||||
{
|
||||
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
|
||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,
|
||||
void *recv,
|
||||
@ -332,7 +350,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
|
||||
assert(ierr==0);
|
||||
list.push_back(xrq);
|
||||
}
|
||||
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list)
|
||||
void CartesianCommunicator::CommsComplete(std::vector<MpiCommsRequest_t> &list)
|
||||
{
|
||||
int nreq=list.size();
|
||||
|
||||
@ -351,9 +369,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||
int from,
|
||||
int bytes)
|
||||
{
|
||||
std::vector<CommsRequest_t> reqs(0);
|
||||
unsigned long xcrc = crc32(0L, Z_NULL, 0);
|
||||
unsigned long rcrc = crc32(0L, Z_NULL, 0);
|
||||
std::vector<MpiCommsRequest_t> reqs(0);
|
||||
|
||||
int myrank = _processor;
|
||||
int ierr;
|
||||
@ -369,9 +385,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||
communicator,MPI_STATUS_IGNORE);
|
||||
assert(ierr==0);
|
||||
|
||||
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
|
||||
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
|
||||
// printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
|
||||
}
|
||||
// Basic Halo comms primitive
|
||||
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
||||
@ -381,12 +394,287 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
||||
int bytes,int dir)
|
||||
{
|
||||
std::vector<CommsRequest_t> list;
|
||||
double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
|
||||
double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
|
||||
offbytes += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
|
||||
StencilSendToRecvFromComplete(list,dir);
|
||||
return offbytes;
|
||||
}
|
||||
|
||||
#undef NVLINK_GET // Define to use get instead of put DMA
|
||||
|
||||
#ifdef ACCELERATOR_AWARE_MPI
|
||||
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
|
||||
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
|
||||
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,int dox,
|
||||
void *recv,
|
||||
int from,int dor,
|
||||
int xbytes,int rbytes,int dir)
|
||||
{
|
||||
return 0.0; // Do nothing -- no preparation required
|
||||
}
|
||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,int dox,
|
||||
void *recv,
|
||||
int from,int dor,
|
||||
int xbytes,int rbytes,int dir)
|
||||
{
|
||||
int ncomm =communicator_halo.size();
|
||||
int commdir=dir%ncomm;
|
||||
|
||||
MPI_Request xrq;
|
||||
MPI_Request rrq;
|
||||
|
||||
int ierr;
|
||||
int gdest = ShmRanks[dest];
|
||||
int gfrom = ShmRanks[from];
|
||||
int gme = ShmRanks[_processor];
|
||||
|
||||
assert(dest != _processor);
|
||||
assert(from != _processor);
|
||||
assert(gme == ShmRank);
|
||||
double off_node_bytes=0.0;
|
||||
int tag;
|
||||
|
||||
if ( dor ) {
|
||||
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
|
||||
tag= dir+from*32;
|
||||
ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
|
||||
assert(ierr==0);
|
||||
list.push_back(rrq);
|
||||
off_node_bytes+=rbytes;
|
||||
}
|
||||
#ifdef NVLINK_GET
|
||||
else {
|
||||
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
|
||||
assert(shm!=NULL);
|
||||
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
// This is a NVLINK PUT
|
||||
if (dox) {
|
||||
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
|
||||
tag= dir+_processor*32;
|
||||
ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
|
||||
assert(ierr==0);
|
||||
list.push_back(xrq);
|
||||
off_node_bytes+=xbytes;
|
||||
} else {
|
||||
#ifndef NVLINK_GET
|
||||
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
|
||||
assert(shm!=NULL);
|
||||
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
return off_node_bytes;
|
||||
}
|
||||
|
||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
|
||||
{
|
||||
int nreq=list.size();
|
||||
/*finishes Get/Put*/
|
||||
acceleratorCopySynchronise();
|
||||
|
||||
if (nreq==0) return;
|
||||
std::vector<MPI_Status> status(nreq);
|
||||
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
|
||||
assert(ierr==0);
|
||||
list.resize(0);
|
||||
this->StencilBarrier();
|
||||
}
|
||||
|
||||
#else /* NOT ... ACCELERATOR_AWARE_MPI */
|
||||
///////////////////////////////////////////
|
||||
// Pipeline mode through host memory
|
||||
///////////////////////////////////////////
|
||||
/*
|
||||
* In prepare (phase 1):
|
||||
* PHASE 1: (prepare)
|
||||
* - post MPI receive buffers asynch
|
||||
* - post device - host send buffer transfer asynch
|
||||
* PHASE 2: (Begin)
|
||||
* - complete all copies
|
||||
* - post MPI send asynch
|
||||
* - post device - device transfers
|
||||
* PHASE 3: (Complete)
|
||||
* - MPI_waitall
|
||||
* - host-device transfers
|
||||
*
|
||||
*********************************
|
||||
* NB could split this further:
|
||||
*--------------------------------
|
||||
* PHASE 1: (Prepare)
|
||||
* - post MPI receive buffers asynch
|
||||
* - post device - host send buffer transfer asynch
|
||||
* PHASE 2: (BeginInterNode)
|
||||
* - complete all copies
|
||||
* - post MPI send asynch
|
||||
* PHASE 3: (BeginIntraNode)
|
||||
* - post device - device transfers
|
||||
* PHASE 4: (Complete)
|
||||
* - MPI_waitall
|
||||
* - host-device transfers asynch
|
||||
* - (complete all copies)
|
||||
*/
|
||||
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,int dox,
|
||||
void *recv,
|
||||
int from,int dor,
|
||||
int xbytes,int rbytes,int dir)
|
||||
{
|
||||
/*
|
||||
* Bring sequence from Stencil.h down to lower level.
|
||||
* Assume using XeLink is ok
|
||||
*/
|
||||
int ncomm =communicator_halo.size();
|
||||
int commdir=dir%ncomm;
|
||||
|
||||
MPI_Request xrq;
|
||||
MPI_Request rrq;
|
||||
|
||||
int ierr;
|
||||
int gdest = ShmRanks[dest];
|
||||
int gfrom = ShmRanks[from];
|
||||
int gme = ShmRanks[_processor];
|
||||
|
||||
assert(dest != _processor);
|
||||
assert(from != _processor);
|
||||
assert(gme == ShmRank);
|
||||
double off_node_bytes=0.0;
|
||||
int tag;
|
||||
|
||||
void * host_recv = NULL;
|
||||
void * host_xmit = NULL;
|
||||
|
||||
/*
|
||||
* PHASE 1: (Prepare)
|
||||
* - post MPI receive buffers asynch
|
||||
* - post device - host send buffer transfer asynch
|
||||
*/
|
||||
|
||||
if ( dor ) {
|
||||
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
|
||||
tag= dir+from*32;
|
||||
host_recv = this->HostBufferMalloc(rbytes);
|
||||
ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
|
||||
assert(ierr==0);
|
||||
CommsRequest_t srq;
|
||||
srq.PacketType = InterNodeRecv;
|
||||
srq.bytes = rbytes;
|
||||
srq.req = rrq;
|
||||
srq.host_buf = host_recv;
|
||||
srq.device_buf = recv;
|
||||
list.push_back(srq);
|
||||
off_node_bytes+=rbytes;
|
||||
}
|
||||
}
|
||||
|
||||
if (dox) {
|
||||
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
|
||||
|
||||
tag= dir+_processor*32;
|
||||
|
||||
host_xmit = this->HostBufferMalloc(xbytes);
|
||||
CommsRequest_t srq;
|
||||
|
||||
srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
|
||||
|
||||
// ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
|
||||
// assert(ierr==0);
|
||||
// off_node_bytes+=xbytes;
|
||||
|
||||
srq.PacketType = InterNodeXmit;
|
||||
srq.bytes = xbytes;
|
||||
// srq.req = xrq;
|
||||
srq.host_buf = host_xmit;
|
||||
srq.device_buf = xmit;
|
||||
srq.tag = tag;
|
||||
srq.dest = dest;
|
||||
srq.commdir = commdir;
|
||||
list.push_back(srq);
|
||||
}
|
||||
}
|
||||
|
||||
return off_node_bytes;
|
||||
}
|
||||
/*
|
||||
* In the interest of better pipelining, poll for completion on each DtoH and
|
||||
* start MPI_ISend in the meantime
|
||||
*/
|
||||
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list)
|
||||
{
|
||||
int pending = 0;
|
||||
do {
|
||||
|
||||
pending = 0;
|
||||
|
||||
for(int idx = 0; idx<list.size();idx++){
|
||||
|
||||
if ( list[idx].PacketType==InterNodeRecv ) {
|
||||
|
||||
int flag = 0;
|
||||
MPI_Status status;
|
||||
int ierr = MPI_Test(&list[idx].req,&flag,&status);
|
||||
assert(ierr==0);
|
||||
|
||||
if ( flag ) {
|
||||
// std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
|
||||
acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
|
||||
list[idx].PacketType=InterNodeReceiveHtoD;
|
||||
} else {
|
||||
pending ++;
|
||||
}
|
||||
}
|
||||
}
|
||||
// std::cout << " PollIrecv "<<pending<<" pending requests"<<std::endl;
|
||||
} while ( pending );
|
||||
|
||||
}
|
||||
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
|
||||
{
|
||||
int pending = 0;
|
||||
do {
|
||||
|
||||
pending = 0;
|
||||
|
||||
for(int idx = 0; idx<list.size();idx++){
|
||||
|
||||
if ( list[idx].PacketType==InterNodeXmit ) {
|
||||
|
||||
if ( acceleratorEventIsComplete(list[idx].ev) ) {
|
||||
|
||||
void *host_xmit = list[idx].host_buf;
|
||||
uint32_t xbytes = list[idx].bytes;
|
||||
int dest = list[idx].dest;
|
||||
int tag = list[idx].tag;
|
||||
int commdir = list[idx].commdir;
|
||||
///////////////////
|
||||
// Send packet
|
||||
///////////////////
|
||||
|
||||
// std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
|
||||
|
||||
MPI_Request xrq;
|
||||
int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
|
||||
assert(ierr==0);
|
||||
|
||||
list[idx].req = xrq; // Update the MPI request in the list
|
||||
|
||||
list[idx].PacketType=InterNodeXmitISend;
|
||||
|
||||
} else {
|
||||
// not done, so return to polling loop
|
||||
pending++;
|
||||
}
|
||||
}
|
||||
}
|
||||
} while (pending);
|
||||
}
|
||||
|
||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,int dox,
|
||||
@ -411,56 +699,109 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
|
||||
double off_node_bytes=0.0;
|
||||
int tag;
|
||||
|
||||
if ( dor ) {
|
||||
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
|
||||
tag= dir+from*32;
|
||||
ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
|
||||
assert(ierr==0);
|
||||
list.push_back(rrq);
|
||||
off_node_bytes+=rbytes;
|
||||
}
|
||||
void * host_xmit = NULL;
|
||||
|
||||
////////////////////////////////
|
||||
// Receives already posted
|
||||
// Copies already started
|
||||
////////////////////////////////
|
||||
/*
|
||||
* PHASE 2: (Begin)
|
||||
* - complete all copies
|
||||
* - post MPI send asynch
|
||||
*/
|
||||
#ifdef NVLINK_GET
|
||||
if ( dor ) {
|
||||
|
||||
if ( ! ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) {
|
||||
// Intranode
|
||||
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
|
||||
assert(shm!=NULL);
|
||||
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
CommsRequest_t srq;
|
||||
|
||||
srq.ev = acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
|
||||
|
||||
srq.PacketType = IntraNodeRecv;
|
||||
srq.bytes = xbytes;
|
||||
// srq.req = xrq;
|
||||
srq.host_buf = NULL;
|
||||
srq.device_buf = xmit;
|
||||
srq.tag = -1;
|
||||
srq.dest = dest;
|
||||
srq.commdir = dir;
|
||||
list.push_back(srq);
|
||||
}
|
||||
}
|
||||
#else
|
||||
if (dox) {
|
||||
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
|
||||
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
|
||||
tag= dir+_processor*32;
|
||||
ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
|
||||
assert(ierr==0);
|
||||
list.push_back(xrq);
|
||||
off_node_bytes+=xbytes;
|
||||
} else {
|
||||
#ifndef NVLINK_GET
|
||||
|
||||
if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) {
|
||||
// Intranode
|
||||
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
|
||||
assert(shm!=NULL);
|
||||
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
|
||||
#endif
|
||||
|
||||
CommsRequest_t srq;
|
||||
|
||||
srq.ev = acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
|
||||
|
||||
srq.PacketType = IntraNodeXmit;
|
||||
srq.bytes = xbytes;
|
||||
// srq.req = xrq;
|
||||
srq.host_buf = NULL;
|
||||
srq.device_buf = xmit;
|
||||
srq.tag = -1;
|
||||
srq.dest = dest;
|
||||
srq.commdir = dir;
|
||||
list.push_back(srq);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
return off_node_bytes;
|
||||
}
|
||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
|
||||
{
|
||||
int nreq=list.size();
|
||||
acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
|
||||
|
||||
acceleratorCopySynchronise();
|
||||
std::vector<MPI_Status> status;
|
||||
std::vector<MPI_Request> MpiRequests;
|
||||
|
||||
for(int r=0;r<list.size();r++){
|
||||
// Must check each Send buf is clear to reuse
|
||||
if ( list[r].PacketType == InterNodeXmitISend ) MpiRequests.push_back(list[r].req);
|
||||
// if ( list[r].PacketType == InterNodeRecv ) MpiRequests.push_back(list[r].req); // Already "Test" passed
|
||||
}
|
||||
|
||||
if (nreq==0) return;
|
||||
int nreq=MpiRequests.size();
|
||||
|
||||
std::vector<MPI_Status> status(nreq);
|
||||
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
|
||||
assert(ierr==0);
|
||||
list.resize(0);
|
||||
if (nreq>0) {
|
||||
status.resize(MpiRequests.size());
|
||||
int ierr = MPI_Waitall(MpiRequests.size(),&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
|
||||
assert(ierr==0);
|
||||
}
|
||||
|
||||
// for(int r=0;r<nreq;r++){
|
||||
// if ( list[r].PacketType==InterNodeRecv ) {
|
||||
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
|
||||
// }
|
||||
// }
|
||||
|
||||
|
||||
list.resize(0); // Delete the list
|
||||
this->HostBufferFreeAll(); // Clean up the buffer allocs
|
||||
#ifndef NVLINK_GET
|
||||
this->StencilBarrier(); // if PUT must check our nbrs have filled our receive buffers.
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
////////////////////////////////////////////
|
||||
// END PIPELINE MODE / NO CUDA AWARE MPI
|
||||
////////////////////////////////////////////
|
||||
|
||||
void CartesianCommunicator::StencilBarrier(void)
|
||||
{
|
||||
FlightRecorder::StepLog("NodeBarrier");
|
||||
MPI_Barrier (ShmComm);
|
||||
}
|
||||
//void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
|
||||
@ -468,11 +809,13 @@ void CartesianCommunicator::StencilBarrier(void)
|
||||
//}
|
||||
void CartesianCommunicator::Barrier(void)
|
||||
{
|
||||
FlightRecorder::StepLog("GridBarrier");
|
||||
int ierr = MPI_Barrier(communicator);
|
||||
assert(ierr==0);
|
||||
}
|
||||
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
|
||||
{
|
||||
FlightRecorder::StepLog("Broadcast");
|
||||
int ierr=MPI_Bcast(data,
|
||||
bytes,
|
||||
MPI_BYTE,
|
||||
@ -491,6 +834,7 @@ void CartesianCommunicator::BarrierWorld(void){
|
||||
}
|
||||
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
|
||||
{
|
||||
FlightRecorder::StepLog("BroadcastWorld");
|
||||
int ierr= MPI_Bcast(data,
|
||||
bytes,
|
||||
MPI_BYTE,
|
||||
@ -513,6 +857,7 @@ void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,
|
||||
}
|
||||
void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
|
||||
{
|
||||
FlightRecorder::StepLog("AllToAll");
|
||||
// MPI is a pain and uses "int" arguments
|
||||
// 64*64*64*128*16 == 500Million elements of data.
|
||||
// When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.
|
||||
|
@ -91,7 +91,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
|
||||
{
|
||||
assert(0);
|
||||
}
|
||||
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);}
|
||||
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(list.size()==0);}
|
||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int dest,
|
||||
@ -132,6 +132,17 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
|
||||
{
|
||||
return 2.0*bytes;
|
||||
}
|
||||
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
|
||||
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
|
||||
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int xmit_to_rank,int dox,
|
||||
void *recv,
|
||||
int recv_from_rank,int dor,
|
||||
int xbytes,int rbytes, int dir)
|
||||
{
|
||||
return 0.0;
|
||||
}
|
||||
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
|
||||
void *xmit,
|
||||
int xmit_to_rank,int dox,
|
||||
|
@ -46,8 +46,40 @@ NAMESPACE_BEGIN(Grid);
|
||||
|
||||
#if defined (GRID_COMMS_MPI3)
|
||||
typedef MPI_Comm Grid_MPI_Comm;
|
||||
typedef MPI_Request MpiCommsRequest_t;
|
||||
#ifdef ACCELERATOR_AWARE_MPI
|
||||
typedef MPI_Request CommsRequest_t;
|
||||
#else
|
||||
/*
|
||||
* Enable state transitions as each packet flows.
|
||||
*/
|
||||
enum PacketType_t {
|
||||
FaceGather,
|
||||
InterNodeXmit,
|
||||
InterNodeRecv,
|
||||
IntraNodeXmit,
|
||||
IntraNodeRecv,
|
||||
InterNodeXmitISend,
|
||||
InterNodeReceiveHtoD
|
||||
};
|
||||
/*
|
||||
*Package arguments needed for various actions along packet flow
|
||||
*/
|
||||
typedef struct {
|
||||
PacketType_t PacketType;
|
||||
void *host_buf;
|
||||
void *device_buf;
|
||||
int dest;
|
||||
int tag;
|
||||
int commdir;
|
||||
unsigned long bytes;
|
||||
acceleratorEvent_t ev;
|
||||
MpiCommsRequest_t req;
|
||||
} CommsRequest_t;
|
||||
#endif
|
||||
|
||||
#else
|
||||
typedef int MpiCommsRequest_t;
|
||||
typedef int CommsRequest_t;
|
||||
typedef int Grid_MPI_Comm;
|
||||
#endif
|
||||
@ -105,7 +137,7 @@ public:
|
||||
///////////////////////////////////////////////////
|
||||
static void SharedMemoryAllocate(uint64_t bytes, int flags);
|
||||
static void SharedMemoryFree(void);
|
||||
static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
|
||||
// static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
|
||||
static void SharedMemoryZero(void *dest,size_t bytes);
|
||||
|
||||
};
|
||||
|
@ -42,6 +42,11 @@ Author: Christoph Lehner <christoph@lhnr.de>
|
||||
#ifdef ACCELERATOR_AWARE_MPI
|
||||
#define GRID_SYCL_LEVEL_ZERO_IPC
|
||||
#define SHM_SOCKETS
|
||||
#else
|
||||
#ifdef HAVE_NUMAIF_H
|
||||
#warning " Using NUMAIF "
|
||||
#include <numaif.h>
|
||||
#endif
|
||||
#endif
|
||||
#include <syscall.h>
|
||||
#endif
|
||||
@ -537,7 +542,38 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
// Each MPI rank should allocate our own buffer
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
#ifndef ACCELERATOR_AWARE_MPI
|
||||
HostCommBuf= malloc(bytes);
|
||||
// printf("Host buffer allocate for GPU non-aware MPI\n");
|
||||
#if 0
|
||||
HostCommBuf= acceleratorAllocHost(bytes);
|
||||
#else
|
||||
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
|
||||
#if 0
|
||||
#warning "Moving host buffers to specific NUMA domain"
|
||||
int numa;
|
||||
char *numa_name=(char *)getenv("MPI_BUF_NUMA");
|
||||
if(numa_name) {
|
||||
unsigned long page_size = sysconf(_SC_PAGESIZE);
|
||||
numa = atoi(numa_name);
|
||||
unsigned long page_count = bytes/page_size;
|
||||
std::vector<void *> pages(page_count);
|
||||
std::vector<int> nodes(page_count,numa);
|
||||
std::vector<int> status(page_count,-1);
|
||||
for(unsigned long p=0;p<page_count;p++){
|
||||
pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
|
||||
}
|
||||
int ret = move_pages(0,
|
||||
page_count,
|
||||
&pages[0],
|
||||
&nodes[0],
|
||||
&status[0],
|
||||
MPOL_MF_MOVE);
|
||||
printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
|
||||
if (ret) perror(" move_pages failed for reason:");
|
||||
}
|
||||
#endif
|
||||
acceleratorPin(HostCommBuf,bytes);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
ShmCommBuf = acceleratorAllocDevice(bytes);
|
||||
if (ShmCommBuf == (void *)NULL ) {
|
||||
@ -569,8 +605,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
|
||||
typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
|
||||
|
||||
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
|
||||
auto zeContext = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
|
||||
auto zeDevice = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
|
||||
auto zeContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
|
||||
|
||||
ze_ipc_mem_handle_t ihandle;
|
||||
clone_mem_t handle;
|
||||
@ -880,14 +916,14 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
|
||||
bzero(dest,bytes);
|
||||
#endif
|
||||
}
|
||||
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
|
||||
{
|
||||
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
|
||||
acceleratorCopyToDevice(src,dest,bytes);
|
||||
#else
|
||||
bcopy(src,dest,bytes);
|
||||
#endif
|
||||
}
|
||||
//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
|
||||
//{
|
||||
//#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
|
||||
// acceleratorCopyToDevice(src,dest,bytes);
|
||||
//#else
|
||||
// bcopy(src,dest,bytes);
|
||||
//#endif
|
||||
//}
|
||||
////////////////////////////////////////////////////////
|
||||
// Global shared functionality finished
|
||||
// Now move to per communicator functionality
|
||||
@ -923,6 +959,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
|
||||
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
|
||||
|
||||
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
|
||||
// std::cerr << " SetCommunicator rank "<<r<<" comm "<<ShmCommBufs[r] <<std::endl;
|
||||
}
|
||||
ShmBufferFreeAll();
|
||||
|
||||
@ -953,7 +990,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
|
||||
}
|
||||
#endif
|
||||
|
||||
//SharedMemoryTest();
|
||||
// SharedMemoryTest();
|
||||
}
|
||||
//////////////////////////////////////////////////////////////////
|
||||
// On node barrier
|
||||
@ -975,19 +1012,18 @@ void SharedMemory::SharedMemoryTest(void)
|
||||
check[0]=GlobalSharedMemory::WorldNode;
|
||||
check[1]=r;
|
||||
check[2]=magic;
|
||||
GlobalSharedMemory::SharedMemoryCopy( ShmCommBufs[r], check, 3*sizeof(uint64_t));
|
||||
acceleratorCopyToDevice(check,ShmCommBufs[r],3*sizeof(uint64_t));
|
||||
}
|
||||
}
|
||||
ShmBarrier();
|
||||
for(uint64_t r=0;r<ShmSize;r++){
|
||||
ShmBarrier();
|
||||
GlobalSharedMemory::SharedMemoryCopy(check,ShmCommBufs[r], 3*sizeof(uint64_t));
|
||||
ShmBarrier();
|
||||
acceleratorCopyFromDevice(ShmCommBufs[r],check,3*sizeof(uint64_t));
|
||||
assert(check[0]==GlobalSharedMemory::WorldNode);
|
||||
assert(check[1]==r);
|
||||
assert(check[2]==magic);
|
||||
ShmBarrier();
|
||||
}
|
||||
ShmBarrier();
|
||||
std::cout << GridLogDebug << " SharedMemoryTest has passed "<<std::endl;
|
||||
}
|
||||
|
||||
void *SharedMemory::ShmBuffer(int rank)
|
||||
|
@ -122,10 +122,10 @@ void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
|
||||
{
|
||||
acceleratorMemSet(dest,0,bytes);
|
||||
}
|
||||
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
|
||||
{
|
||||
acceleratorCopyToDevice(src,dest,bytes);
|
||||
}
|
||||
//void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
|
||||
//{
|
||||
// acceleratorCopyToDevice(src,dest,bytes);
|
||||
//}
|
||||
////////////////////////////////////////////////////////
|
||||
// Global shared functionality finished
|
||||
// Now move to per communicator functionality
|
||||
|
@ -51,7 +51,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
#endif
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
|
||||
auto Cshift(const Expression &expr,int dim,int shift) -> decltype(closure(expr))
|
||||
{
|
||||
|
@ -30,12 +30,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
extern std::vector<std::pair<int,int> > Cshift_table;
|
||||
extern commVector<std::pair<int,int> > Cshift_table_device;
|
||||
extern deviceVector<std::pair<int,int> > Cshift_table_device;
|
||||
|
||||
inline std::pair<int,int> *MapCshiftTable(void)
|
||||
{
|
||||
// GPU version
|
||||
#ifdef ACCELERATOR_CSHIFT
|
||||
uint64_t sz=Cshift_table.size();
|
||||
if (Cshift_table_device.size()!=sz ) {
|
||||
Cshift_table_device.resize(sz);
|
||||
@ -45,16 +44,13 @@ inline std::pair<int,int> *MapCshiftTable(void)
|
||||
sizeof(Cshift_table[0])*sz);
|
||||
|
||||
return &Cshift_table_device[0];
|
||||
#else
|
||||
return &Cshift_table[0];
|
||||
#endif
|
||||
// CPU version use identify map
|
||||
}
|
||||
///////////////////////////////////////////////////////////////////
|
||||
// Gather for when there is no need to SIMD split
|
||||
///////////////////////////////////////////////////////////////////
|
||||
template<class vobj> void
|
||||
Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
|
||||
Gather_plane_simple (const Lattice<vobj> &rhs,deviceVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
|
||||
{
|
||||
int rd = rhs.Grid()->_rdimensions[dimension];
|
||||
|
||||
@ -94,17 +90,10 @@ Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dim
|
||||
{
|
||||
auto buffer_p = & buffer[0];
|
||||
auto table = MapCshiftTable();
|
||||
#ifdef ACCELERATOR_CSHIFT
|
||||
autoView(rhs_v , rhs, AcceleratorRead);
|
||||
accelerator_for(i,ent,vobj::Nsimd(),{
|
||||
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
|
||||
});
|
||||
#else
|
||||
autoView(rhs_v , rhs, CpuRead);
|
||||
thread_for(i,ent,{
|
||||
buffer_p[table[i].first]=rhs_v[table[i].second];
|
||||
});
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@ -129,7 +118,6 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
|
||||
int n1=rhs.Grid()->_slice_stride[dimension];
|
||||
|
||||
if ( cbmask ==0x3){
|
||||
#ifdef ACCELERATOR_CSHIFT
|
||||
autoView(rhs_v , rhs, AcceleratorRead);
|
||||
accelerator_for(nn,e1*e2,1,{
|
||||
int n = nn%e1;
|
||||
@ -140,21 +128,10 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
|
||||
vobj temp =rhs_v[so+o+b];
|
||||
extract<vobj>(temp,pointers,offset);
|
||||
});
|
||||
#else
|
||||
autoView(rhs_v , rhs, CpuRead);
|
||||
thread_for2d(n,e1,b,e2,{
|
||||
int o = n*n1;
|
||||
int offset = b+n*e2;
|
||||
|
||||
vobj temp =rhs_v[so+o+b];
|
||||
extract<vobj>(temp,pointers,offset);
|
||||
});
|
||||
#endif
|
||||
} else {
|
||||
Coordinate rdim=rhs.Grid()->_rdimensions;
|
||||
Coordinate cdm =rhs.Grid()->_checker_dim_mask;
|
||||
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
|
||||
#ifdef ACCELERATOR_CSHIFT
|
||||
autoView(rhs_v , rhs, AcceleratorRead);
|
||||
accelerator_for(nn,e1*e2,1,{
|
||||
int n = nn%e1;
|
||||
@ -175,33 +152,13 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
|
||||
extract<vobj>(temp,pointers,offset);
|
||||
}
|
||||
});
|
||||
#else
|
||||
autoView(rhs_v , rhs, CpuRead);
|
||||
thread_for2d(n,e1,b,e2,{
|
||||
|
||||
Coordinate coor;
|
||||
|
||||
int o=n*n1;
|
||||
int oindex = o+b;
|
||||
|
||||
int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
|
||||
|
||||
int ocb=1<<cb;
|
||||
int offset = b+n*e2;
|
||||
|
||||
if ( ocb & cbmask ) {
|
||||
vobj temp =rhs_v[so+o+b];
|
||||
extract<vobj>(temp,pointers,offset);
|
||||
}
|
||||
});
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// Scatter for when there is no need to SIMD split
|
||||
//////////////////////////////////////////////////////
|
||||
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask)
|
||||
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<vobj> &buffer, int dimension,int plane,int cbmask)
|
||||
{
|
||||
int rd = rhs.Grid()->_rdimensions[dimension];
|
||||
|
||||
@ -245,17 +202,10 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<
|
||||
{
|
||||
auto buffer_p = & buffer[0];
|
||||
auto table = MapCshiftTable();
|
||||
#ifdef ACCELERATOR_CSHIFT
|
||||
autoView( rhs_v, rhs, AcceleratorWrite);
|
||||
accelerator_for(i,ent,vobj::Nsimd(),{
|
||||
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
|
||||
});
|
||||
#else
|
||||
autoView( rhs_v, rhs, CpuWrite);
|
||||
thread_for(i,ent,{
|
||||
rhs_v[table[i].first]=buffer_p[table[i].second];
|
||||
});
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@ -278,7 +228,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
|
||||
if(cbmask ==0x3 ) {
|
||||
int _slice_stride = rhs.Grid()->_slice_stride[dimension];
|
||||
int _slice_block = rhs.Grid()->_slice_block[dimension];
|
||||
#ifdef ACCELERATOR_CSHIFT
|
||||
autoView( rhs_v , rhs, AcceleratorWrite);
|
||||
accelerator_for(nn,e1*e2,1,{
|
||||
int n = nn%e1;
|
||||
@ -287,14 +236,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
|
||||
int offset = b+n*_slice_block;
|
||||
merge(rhs_v[so+o+b],pointers,offset);
|
||||
});
|
||||
#else
|
||||
autoView( rhs_v , rhs, CpuWrite);
|
||||
thread_for2d(n,e1,b,e2,{
|
||||
int o = n*_slice_stride;
|
||||
int offset = b+n*_slice_block;
|
||||
merge(rhs_v[so+o+b],pointers,offset);
|
||||
});
|
||||
#endif
|
||||
} else {
|
||||
|
||||
// Case of SIMD split AND checker dim cannot currently be hit, except in
|
||||
@ -360,19 +301,11 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
|
||||
|
||||
{
|
||||
auto table = MapCshiftTable();
|
||||
#ifdef ACCELERATOR_CSHIFT
|
||||
autoView(rhs_v , rhs, AcceleratorRead);
|
||||
autoView(lhs_v , lhs, AcceleratorWrite);
|
||||
accelerator_for(i,ent,vobj::Nsimd(),{
|
||||
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
|
||||
});
|
||||
#else
|
||||
autoView(rhs_v , rhs, CpuRead);
|
||||
autoView(lhs_v , lhs, CpuWrite);
|
||||
thread_for(i,ent,{
|
||||
lhs_v[table[i].first]=rhs_v[table[i].second];
|
||||
});
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@ -412,19 +345,11 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
|
||||
|
||||
{
|
||||
auto table = MapCshiftTable();
|
||||
#ifdef ACCELERATOR_CSHIFT
|
||||
autoView( rhs_v, rhs, AcceleratorRead);
|
||||
autoView( lhs_v, lhs, AcceleratorWrite);
|
||||
accelerator_for(i,ent,1,{
|
||||
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
|
||||
});
|
||||
#else
|
||||
autoView( rhs_v, rhs, CpuRead);
|
||||
autoView( lhs_v, lhs, CpuWrite);
|
||||
thread_for(i,ent,{
|
||||
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
|
||||
});
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -31,7 +31,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
const int Cshift_verbose=0;
|
||||
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
|
||||
{
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
@ -55,17 +55,17 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
|
||||
RealD t1,t0;
|
||||
t0=usecond();
|
||||
if ( !comm_dim ) {
|
||||
//std::cout << "CSHIFT: Cshift_local" <<std::endl;
|
||||
// std::cout << "CSHIFT: Cshift_local" <<std::endl;
|
||||
Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
|
||||
} else if ( splice_dim ) {
|
||||
//std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
|
||||
// std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
|
||||
Cshift_comms_simd(ret,rhs,dimension,shift);
|
||||
} else {
|
||||
//std::cout << "CSHIFT: Cshift_comms" <<std::endl;
|
||||
// std::cout << "CSHIFT: Cshift_comms" <<std::endl;
|
||||
Cshift_comms(ret,rhs,dimension,shift);
|
||||
}
|
||||
t1=usecond();
|
||||
// std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
|
||||
if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -94,18 +94,16 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
|
||||
sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
|
||||
sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
|
||||
|
||||
//std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
|
||||
// std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
|
||||
if ( sshift[0] == sshift[1] ) {
|
||||
//std::cout << "Single pass Cshift_comms" <<std::endl;
|
||||
// std::cout << "Single pass Cshift_comms" <<std::endl;
|
||||
Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
|
||||
} else {
|
||||
//std::cout << "Two pass Cshift_comms" <<std::endl;
|
||||
// std::cout << "Two pass Cshift_comms" <<std::endl;
|
||||
Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
|
||||
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
|
||||
}
|
||||
}
|
||||
#define ACCELERATOR_CSHIFT_NO_COPY
|
||||
#ifdef ACCELERATOR_CSHIFT_NO_COPY
|
||||
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||
{
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
@ -125,9 +123,13 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
||||
assert(shift<fd);
|
||||
|
||||
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
|
||||
static cshiftVector<vobj> send_buf; send_buf.resize(buffer_size);
|
||||
static cshiftVector<vobj> recv_buf; recv_buf.resize(buffer_size);
|
||||
|
||||
static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
|
||||
static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
|
||||
#ifndef ACCELERATOR_AWARE_MPI
|
||||
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
|
||||
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
|
||||
#endif
|
||||
|
||||
int cb= (cbmask==0x2)? Odd : Even;
|
||||
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
||||
RealD tcopy=0.0;
|
||||
@ -158,18 +160,31 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
||||
// int rank = grid->_processor;
|
||||
int recv_from_rank;
|
||||
int xmit_to_rank;
|
||||
|
||||
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
||||
|
||||
tcomms-=usecond();
|
||||
// grid->Barrier();
|
||||
grid->Barrier();
|
||||
|
||||
#ifdef ACCELERATOR_AWARE_MPI
|
||||
grid->SendToRecvFrom((void *)&send_buf[0],
|
||||
xmit_to_rank,
|
||||
(void *)&recv_buf[0],
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
#else
|
||||
// bouncy bouncy
|
||||
acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
|
||||
grid->SendToRecvFrom((void *)&hsend_buf[0],
|
||||
xmit_to_rank,
|
||||
(void *)&hrecv_buf[0],
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
|
||||
#endif
|
||||
|
||||
xbytes+=bytes;
|
||||
// grid->Barrier();
|
||||
grid->Barrier();
|
||||
tcomms+=usecond();
|
||||
|
||||
tscatter-=usecond();
|
||||
@ -177,13 +192,13 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
|
||||
tscatter+=usecond();
|
||||
}
|
||||
}
|
||||
/*
|
||||
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
||||
*/
|
||||
if (Cshift_verbose){
|
||||
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||
@ -201,9 +216,9 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
||||
int simd_layout = grid->_simd_layout[dimension];
|
||||
int comm_dim = grid->_processors[dimension] >1 ;
|
||||
|
||||
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
|
||||
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
|
||||
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
|
||||
// std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
|
||||
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
|
||||
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
|
||||
|
||||
assert(comm_dim==1);
|
||||
assert(simd_layout==2);
|
||||
@ -224,16 +239,20 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
||||
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
|
||||
// int words = sizeof(vobj)/sizeof(vector_type);
|
||||
|
||||
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
|
||||
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
|
||||
static std::vector<deviceVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
|
||||
static std::vector<deviceVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
|
||||
scalar_object * recv_buf_extract_mpi;
|
||||
scalar_object * send_buf_extract_mpi;
|
||||
|
||||
|
||||
for(int s=0;s<Nsimd;s++){
|
||||
send_buf_extract[s].resize(buffer_size);
|
||||
recv_buf_extract[s].resize(buffer_size);
|
||||
}
|
||||
|
||||
#ifndef ACCELERATOR_AWARE_MPI
|
||||
hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
|
||||
hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
|
||||
#endif
|
||||
|
||||
int bytes = buffer_size*sizeof(scalar_object);
|
||||
|
||||
ExtractPointerArray<scalar_object> pointers(Nsimd); //
|
||||
@ -281,266 +300,50 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
|
||||
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||
|
||||
tcomms-=usecond();
|
||||
// grid->Barrier();
|
||||
grid->Barrier();
|
||||
|
||||
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
|
||||
recv_buf_extract_mpi = &recv_buf_extract[i][0];
|
||||
#ifdef ACCELERATOR_AWARE_MPI
|
||||
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
|
||||
xmit_to_rank,
|
||||
(void *)recv_buf_extract_mpi,
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
|
||||
xbytes+=bytes;
|
||||
// grid->Barrier();
|
||||
tcomms+=usecond();
|
||||
|
||||
rpointers[i] = &recv_buf_extract[i][0];
|
||||
} else {
|
||||
rpointers[i] = &send_buf_extract[nbr_lane][0];
|
||||
}
|
||||
|
||||
}
|
||||
tscatter-=usecond();
|
||||
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
|
||||
tscatter+=usecond();
|
||||
}
|
||||
/*
|
||||
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
||||
*/
|
||||
}
|
||||
#else
|
||||
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||
{
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
|
||||
GridBase *grid=rhs.Grid();
|
||||
Lattice<vobj> temp(rhs.Grid());
|
||||
|
||||
int fd = rhs.Grid()->_fdimensions[dimension];
|
||||
int rd = rhs.Grid()->_rdimensions[dimension];
|
||||
int pd = rhs.Grid()->_processors[dimension];
|
||||
int simd_layout = rhs.Grid()->_simd_layout[dimension];
|
||||
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
|
||||
assert(simd_layout==1);
|
||||
assert(comm_dim==1);
|
||||
assert(shift>=0);
|
||||
assert(shift<fd);
|
||||
RealD tcopy=0.0;
|
||||
RealD tgather=0.0;
|
||||
RealD tscatter=0.0;
|
||||
RealD tcomms=0.0;
|
||||
uint64_t xbytes=0;
|
||||
|
||||
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
|
||||
static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
|
||||
static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
|
||||
vobj *send_buf;
|
||||
vobj *recv_buf;
|
||||
{
|
||||
grid->ShmBufferFreeAll();
|
||||
size_t bytes = buffer_size*sizeof(vobj);
|
||||
send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
|
||||
recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
|
||||
}
|
||||
|
||||
int cb= (cbmask==0x2)? Odd : Even;
|
||||
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
||||
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
int sx = (x+sshift)%rd;
|
||||
int comm_proc = ((x+sshift)/rd)%pd;
|
||||
|
||||
if (comm_proc==0) {
|
||||
|
||||
tcopy-=usecond();
|
||||
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
|
||||
tcopy+=usecond();
|
||||
|
||||
} else {
|
||||
|
||||
int words = buffer_size;
|
||||
if (cbmask != 0x3) words=words>>1;
|
||||
|
||||
int bytes = words * sizeof(vobj);
|
||||
|
||||
tgather-=usecond();
|
||||
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
|
||||
tgather+=usecond();
|
||||
|
||||
// int rank = grid->_processor;
|
||||
int recv_from_rank;
|
||||
int xmit_to_rank;
|
||||
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
||||
|
||||
|
||||
tcomms-=usecond();
|
||||
// grid->Barrier();
|
||||
|
||||
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
|
||||
grid->SendToRecvFrom((void *)&send_buf[0],
|
||||
xmit_to_rank,
|
||||
(void *)&recv_buf[0],
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
xbytes+=bytes;
|
||||
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
|
||||
|
||||
// grid->Barrier();
|
||||
tcomms+=usecond();
|
||||
|
||||
tscatter-=usecond();
|
||||
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
|
||||
tscatter+=usecond();
|
||||
}
|
||||
}
|
||||
/*
|
||||
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
||||
*/
|
||||
}
|
||||
|
||||
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
|
||||
{
|
||||
GridBase *grid=rhs.Grid();
|
||||
const int Nsimd = grid->Nsimd();
|
||||
typedef typename vobj::vector_type vector_type;
|
||||
typedef typename vobj::scalar_object scalar_object;
|
||||
typedef typename vobj::scalar_type scalar_type;
|
||||
|
||||
int fd = grid->_fdimensions[dimension];
|
||||
int rd = grid->_rdimensions[dimension];
|
||||
int ld = grid->_ldimensions[dimension];
|
||||
int pd = grid->_processors[dimension];
|
||||
int simd_layout = grid->_simd_layout[dimension];
|
||||
int comm_dim = grid->_processors[dimension] >1 ;
|
||||
|
||||
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
|
||||
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
|
||||
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
|
||||
|
||||
assert(comm_dim==1);
|
||||
assert(simd_layout==2);
|
||||
assert(shift>=0);
|
||||
assert(shift<fd);
|
||||
RealD tcopy=0.0;
|
||||
RealD tgather=0.0;
|
||||
RealD tscatter=0.0;
|
||||
RealD tcomms=0.0;
|
||||
uint64_t xbytes=0;
|
||||
|
||||
int permute_type=grid->PermuteType(dimension);
|
||||
|
||||
///////////////////////////////////////////////
|
||||
// Simd direction uses an extract/merge pair
|
||||
///////////////////////////////////////////////
|
||||
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
|
||||
// int words = sizeof(vobj)/sizeof(vector_type);
|
||||
|
||||
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
|
||||
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
|
||||
scalar_object * recv_buf_extract_mpi;
|
||||
scalar_object * send_buf_extract_mpi;
|
||||
{
|
||||
size_t bytes = sizeof(scalar_object)*buffer_size;
|
||||
grid->ShmBufferFreeAll();
|
||||
send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
|
||||
recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
|
||||
}
|
||||
for(int s=0;s<Nsimd;s++){
|
||||
send_buf_extract[s].resize(buffer_size);
|
||||
recv_buf_extract[s].resize(buffer_size);
|
||||
}
|
||||
|
||||
int bytes = buffer_size*sizeof(scalar_object);
|
||||
|
||||
ExtractPointerArray<scalar_object> pointers(Nsimd); //
|
||||
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
|
||||
|
||||
///////////////////////////////////////////
|
||||
// Work out what to send where
|
||||
///////////////////////////////////////////
|
||||
int cb = (cbmask==0x2)? Odd : Even;
|
||||
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
|
||||
|
||||
// loop over outer coord planes orthog to dim
|
||||
for(int x=0;x<rd;x++){
|
||||
|
||||
// FIXME call local permute copy if none are offnode.
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
pointers[i] = &send_buf_extract[i][0];
|
||||
}
|
||||
tgather-=usecond();
|
||||
int sx = (x+sshift)%rd;
|
||||
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
|
||||
tgather+=usecond();
|
||||
|
||||
for(int i=0;i<Nsimd;i++){
|
||||
|
||||
int inner_bit = (Nsimd>>(permute_type+1));
|
||||
int ic= (i&inner_bit)? 1:0;
|
||||
|
||||
int my_coor = rd*ic + x;
|
||||
int nbr_coor = my_coor+sshift;
|
||||
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
|
||||
|
||||
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
|
||||
int nbr_ox = (nbr_coor%rd); // outer coord of peer
|
||||
int nbr_lane = (i&(~inner_bit));
|
||||
|
||||
int recv_from_rank;
|
||||
int xmit_to_rank;
|
||||
|
||||
if (nbr_ic) nbr_lane|=inner_bit;
|
||||
|
||||
assert (sx == nbr_ox);
|
||||
|
||||
if(nbr_proc){
|
||||
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||
|
||||
tcomms-=usecond();
|
||||
// grid->Barrier();
|
||||
|
||||
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
|
||||
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
|
||||
#else
|
||||
// bouncy bouncy
|
||||
acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
|
||||
grid->SendToRecvFrom((void *)&hsend_buf[0],
|
||||
xmit_to_rank,
|
||||
(void *)recv_buf_extract_mpi,
|
||||
(void *)&hrecv_buf[0],
|
||||
recv_from_rank,
|
||||
bytes);
|
||||
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
|
||||
xbytes+=bytes;
|
||||
|
||||
// grid->Barrier();
|
||||
tcomms+=usecond();
|
||||
rpointers[i] = &recv_buf_extract[i][0];
|
||||
} else {
|
||||
rpointers[i] = &send_buf_extract[nbr_lane][0];
|
||||
}
|
||||
|
||||
}
|
||||
tscatter-=usecond();
|
||||
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
|
||||
tscatter+=usecond();
|
||||
|
||||
}
|
||||
/*
|
||||
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
|
||||
*/
|
||||
}
|
||||
acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
|
||||
#endif
|
||||
|
||||
xbytes+=bytes;
|
||||
grid->Barrier();
|
||||
tcomms+=usecond();
|
||||
|
||||
rpointers[i] = &recv_buf_extract[i][0];
|
||||
} else {
|
||||
rpointers[i] = &send_buf_extract[nbr_lane][0];
|
||||
}
|
||||
|
||||
}
|
||||
tscatter-=usecond();
|
||||
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
|
||||
tscatter+=usecond();
|
||||
}
|
||||
if(Cshift_verbose){
|
||||
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
|
||||
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
#endif
|
||||
|
@ -1,5 +1,5 @@
|
||||
#include <Grid/GridCore.h>
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
std::vector<std::pair<int,int> > Cshift_table;
|
||||
commVector<std::pair<int,int> > Cshift_table_device;
|
||||
deviceVector<std::pair<int,int> > Cshift_table_device;
|
||||
NAMESPACE_END(Grid);
|
||||
|
@ -257,17 +257,30 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
|
||||
});
|
||||
}
|
||||
|
||||
#define FAST_AXPY_NORM
|
||||
template<class sobj,class vobj> inline
|
||||
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
|
||||
{
|
||||
GRID_TRACE("axpy_norm");
|
||||
return axpy_norm_fast(ret,a,x,y);
|
||||
#ifdef FAST_AXPY_NORM
|
||||
return axpy_norm_fast(ret,a,x,y);
|
||||
#else
|
||||
ret = a*x+y;
|
||||
RealD nn=norm2(ret);
|
||||
return nn;
|
||||
#endif
|
||||
}
|
||||
template<class sobj,class vobj> inline
|
||||
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
|
||||
{
|
||||
GRID_TRACE("axpby_norm");
|
||||
return axpby_norm_fast(ret,a,b,x,y);
|
||||
#ifdef FAST_AXPY_NORM
|
||||
return axpby_norm_fast(ret,a,b,x,y);
|
||||
#else
|
||||
ret = a*x+b*y;
|
||||
RealD nn=norm2(ret);
|
||||
return nn;
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Trace product
|
||||
|
@ -236,10 +236,13 @@ public:
|
||||
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
|
||||
vobj vtmp;
|
||||
vtmp = r;
|
||||
#if 0
|
||||
#if 1
|
||||
deviceVector<vobj> vvtmp(1);
|
||||
acceleratorPut(vvtmp[0],vtmp);
|
||||
vobj *vvtmp_p = & vvtmp[0];
|
||||
auto me = View(AcceleratorWrite);
|
||||
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
||||
auto stmp=coalescedRead(vtmp);
|
||||
auto stmp=coalescedRead(*vvtmp_p);
|
||||
coalescedWrite(me[ss],stmp);
|
||||
});
|
||||
#else
|
||||
|
@ -53,36 +53,19 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
|
||||
typedef decltype(basis[0]) Field;
|
||||
typedef decltype(basis[0].View(AcceleratorRead)) View;
|
||||
|
||||
Vector<View> basis_v; basis_v.reserve(basis.size());
|
||||
typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
|
||||
hostVector<View> h_basis_v(basis.size());
|
||||
deviceVector<View> d_basis_v(basis.size());
|
||||
typedef typename std::remove_reference<decltype(h_basis_v[0][0])>::type vobj;
|
||||
typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
|
||||
|
||||
GridBase* grid = basis[0].Grid();
|
||||
|
||||
for(int k=0;k<basis.size();k++){
|
||||
basis_v.push_back(basis[k].View(AcceleratorWrite));
|
||||
h_basis_v[k] = basis[k].View(AcceleratorWrite);
|
||||
acceleratorPut(d_basis_v[k],h_basis_v[k]);
|
||||
}
|
||||
|
||||
#if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) )
|
||||
int max_threads = thread_max();
|
||||
Vector < vobj > Bt(Nm * max_threads);
|
||||
thread_region
|
||||
{
|
||||
vobj* B = &Bt[Nm * thread_num()];
|
||||
thread_for_in_region(ss, grid->oSites(),{
|
||||
for(int j=j0; j<j1; ++j) B[j]=0.;
|
||||
|
||||
for(int j=j0; j<j1; ++j){
|
||||
for(int k=k0; k<k1; ++k){
|
||||
B[j] +=Qt(j,k) * basis_v[k][ss];
|
||||
}
|
||||
}
|
||||
for(int j=j0; j<j1; ++j){
|
||||
basis_v[j][ss] = B[j];
|
||||
}
|
||||
});
|
||||
}
|
||||
#else
|
||||
View *basis_vp = &basis_v[0];
|
||||
View *basis_vp = &d_basis_v[0];
|
||||
|
||||
int nrot = j1-j0;
|
||||
if (!nrot) // edge case not handled gracefully by Cuda
|
||||
@ -91,17 +74,19 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
|
||||
uint64_t oSites =grid->oSites();
|
||||
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
|
||||
|
||||
Vector <vobj> Bt(siteBlock * nrot);
|
||||
deviceVector <vobj> Bt(siteBlock * nrot);
|
||||
auto Bp=&Bt[0];
|
||||
|
||||
// GPU readable copy of matrix
|
||||
Vector<Coeff_t> Qt_jv(Nm*Nm);
|
||||
hostVector<Coeff_t> h_Qt_jv(Nm*Nm);
|
||||
deviceVector<Coeff_t> Qt_jv(Nm*Nm);
|
||||
Coeff_t *Qt_p = & Qt_jv[0];
|
||||
thread_for(i,Nm*Nm,{
|
||||
int j = i/Nm;
|
||||
int k = i%Nm;
|
||||
Qt_p[i]=Qt(j,k);
|
||||
h_Qt_jv[i]=Qt(j,k);
|
||||
});
|
||||
acceleratorCopyToDevice(&h_Qt_jv[0],Qt_p,Nm*Nm*sizeof(Coeff_t));
|
||||
|
||||
// Block the loop to keep storage footprint down
|
||||
for(uint64_t s=0;s<oSites;s+=siteBlock){
|
||||
@ -137,9 +122,8 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
|
||||
coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
|
||||
});
|
||||
}
|
||||
#endif
|
||||
|
||||
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
|
||||
for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
|
||||
}
|
||||
|
||||
// Extract a single rotated vector
|
||||
@ -152,16 +136,19 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
|
||||
|
||||
result.Checkerboard() = basis[0].Checkerboard();
|
||||
|
||||
Vector<View> basis_v; basis_v.reserve(basis.size());
|
||||
hostVector<View> h_basis_v(basis.size());
|
||||
deviceVector<View> d_basis_v(basis.size());
|
||||
for(int k=0;k<basis.size();k++){
|
||||
basis_v.push_back(basis[k].View(AcceleratorRead));
|
||||
h_basis_v[k]=basis[k].View(AcceleratorRead);
|
||||
acceleratorPut(d_basis_v[k],h_basis_v[k]);
|
||||
}
|
||||
vobj zz=Zero();
|
||||
Vector<double> Qt_jv(Nm);
|
||||
double * Qt_j = & Qt_jv[0];
|
||||
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
|
||||
|
||||
auto basis_vp=& basis_v[0];
|
||||
vobj zz=Zero();
|
||||
deviceVector<double> Qt_jv(Nm);
|
||||
double * Qt_j = & Qt_jv[0];
|
||||
for(int k=0;k<Nm;++k) acceleratorPut(Qt_j[k],Qt(j,k));
|
||||
|
||||
auto basis_vp=& d_basis_v[0];
|
||||
autoView(result_v,result,AcceleratorWrite);
|
||||
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
|
||||
vobj zzz=Zero();
|
||||
@ -171,7 +158,7 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
|
||||
}
|
||||
coalescedWrite(result_v[ss], B);
|
||||
});
|
||||
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
|
||||
for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
|
||||
}
|
||||
|
||||
template<class Field>
|
||||
|
@ -46,7 +46,7 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
|
||||
// const int Nsimd = vobj::Nsimd();
|
||||
const int nthread = GridThread::GetThreads();
|
||||
|
||||
Vector<sobj> sumarray(nthread);
|
||||
std::vector<sobj> sumarray(nthread);
|
||||
for(int i=0;i<nthread;i++){
|
||||
sumarray[i]=Zero();
|
||||
}
|
||||
@ -75,7 +75,7 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
|
||||
|
||||
const int nthread = GridThread::GetThreads();
|
||||
|
||||
Vector<sobj> sumarray(nthread);
|
||||
std::vector<sobj> sumarray(nthread);
|
||||
for(int i=0;i<nthread;i++){
|
||||
sumarray[i]=Zero();
|
||||
}
|
||||
@ -290,8 +290,10 @@ template<class vobj>
|
||||
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
|
||||
GridBase *grid = left.Grid();
|
||||
|
||||
bool ok;
|
||||
#ifdef GRID_SYCL
|
||||
uint64_t csum=0;
|
||||
uint64_t csum2=0;
|
||||
if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
|
||||
{
|
||||
// Hack
|
||||
@ -300,13 +302,33 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
|
||||
Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
|
||||
uint64_t *base= (uint64_t *)&l_v[0];
|
||||
csum=svm_xor(base,words);
|
||||
ok = FlightRecorder::CsumLog(csum);
|
||||
if ( !ok ) {
|
||||
csum2=svm_xor(base,words);
|
||||
std::cerr<< " Bad CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
|
||||
} else {
|
||||
// csum2=svm_xor(base,words);
|
||||
// std::cerr<< " ok CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
|
||||
}
|
||||
assert(ok);
|
||||
}
|
||||
FlightRecorder::CsumLog(csum);
|
||||
#endif
|
||||
FlightRecorder::StepLog("rank inner product");
|
||||
ComplexD nrm = rankInnerProduct(left,right);
|
||||
// ComplexD nrmck=nrm;
|
||||
RealD local = real(nrm);
|
||||
FlightRecorder::NormLog(real(nrm));
|
||||
ok = FlightRecorder::NormLog(real(nrm));
|
||||
if ( !ok ) {
|
||||
ComplexD nrm2 = rankInnerProduct(left,right);
|
||||
RealD local2 = real(nrm2);
|
||||
std::cerr<< " Bad NORM " << local << " recomputed as "<<local2<<std::endl;
|
||||
assert(ok);
|
||||
}
|
||||
FlightRecorder::StepLog("Start global sum");
|
||||
// grid->GlobalSumP2P(nrm);
|
||||
grid->GlobalSum(nrm);
|
||||
FlightRecorder::StepLog("Finished global sum");
|
||||
// std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
|
||||
FlightRecorder::ReductionLog(local,real(nrm));
|
||||
return nrm;
|
||||
}
|
||||
@ -343,18 +365,6 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
|
||||
autoView( x_v, x, AcceleratorRead);
|
||||
autoView( y_v, y, AcceleratorRead);
|
||||
autoView( z_v, z, AcceleratorWrite);
|
||||
#if 0
|
||||
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
|
||||
Vector<inner_t> inner_tmp(sites);
|
||||
auto inner_tmp_v = &inner_tmp[0];
|
||||
|
||||
accelerator_for( ss, sites, nsimd,{
|
||||
auto tmp = a*x_v(ss)+b*y_v(ss);
|
||||
coalescedWrite(inner_tmp_v[ss],innerProductD(tmp,tmp));
|
||||
coalescedWrite(z_v[ss],tmp);
|
||||
});
|
||||
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
|
||||
#else
|
||||
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
|
||||
deviceVector<inner_t> inner_tmp;
|
||||
inner_tmp.resize(sites);
|
||||
@ -365,9 +375,44 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
|
||||
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
|
||||
coalescedWrite(z_v[ss],tmp);
|
||||
});
|
||||
nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
|
||||
bool ok;
|
||||
#ifdef GRID_SYCL
|
||||
uint64_t csum=0;
|
||||
uint64_t csum2=0;
|
||||
if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
|
||||
{
|
||||
// z_v
|
||||
{
|
||||
Integer words = sites*sizeof(vobj)/sizeof(uint64_t);
|
||||
uint64_t *base= (uint64_t *)&z_v[0];
|
||||
csum=svm_xor(base,words);
|
||||
ok = FlightRecorder::CsumLog(csum);
|
||||
if ( !ok ) {
|
||||
csum2=svm_xor(base,words);
|
||||
std::cerr<< " Bad z_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
|
||||
}
|
||||
assert(ok);
|
||||
}
|
||||
// inner_v
|
||||
{
|
||||
Integer words = sites*sizeof(inner_t)/sizeof(uint64_t);
|
||||
uint64_t *base= (uint64_t *)&inner_tmp_v[0];
|
||||
csum=svm_xor(base,words);
|
||||
ok = FlightRecorder::CsumLog(csum);
|
||||
if ( !ok ) {
|
||||
csum2=svm_xor(base,words);
|
||||
std::cerr<< " Bad inner_tmp_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
|
||||
}
|
||||
assert(ok);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
|
||||
ok = FlightRecorder::NormLog(real(nrm));
|
||||
assert(ok);
|
||||
RealD local = real(nrm);
|
||||
grid->GlobalSum(nrm);
|
||||
FlightRecorder::ReductionLog(local,real(nrm));
|
||||
return nrm;
|
||||
}
|
||||
|
||||
@ -377,7 +422,7 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
|
||||
conformable(left,right);
|
||||
|
||||
typedef typename vobj::vector_typeD vector_type;
|
||||
Vector<ComplexD> tmp(2);
|
||||
std::vector<ComplexD> tmp(2);
|
||||
|
||||
GridBase *grid = left.Grid();
|
||||
|
||||
@ -387,8 +432,8 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
|
||||
// GPU
|
||||
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
|
||||
typedef decltype(innerProductD(vobj(),vobj())) norm_t;
|
||||
Vector<inner_t> inner_tmp(sites);
|
||||
Vector<norm_t> norm_tmp(sites);
|
||||
deviceVector<inner_t> inner_tmp(sites);
|
||||
deviceVector<norm_t> norm_tmp(sites);
|
||||
auto inner_tmp_v = &inner_tmp[0];
|
||||
auto norm_tmp_v = &norm_tmp[0];
|
||||
{
|
||||
@ -438,7 +483,9 @@ inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
|
||||
// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
|
||||
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
|
||||
std::vector<typename vobj::scalar_object> &result,
|
||||
int orthogdim)
|
||||
{
|
||||
///////////////////////////////////////////////////////
|
||||
// FIXME precision promoted summation
|
||||
@ -460,8 +507,8 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
|
||||
int ld=grid->_ldimensions[orthogdim];
|
||||
int rd=grid->_rdimensions[orthogdim];
|
||||
|
||||
Vector<vobj> lvSum(rd); // will locally sum vectors first
|
||||
Vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
|
||||
std::vector<vobj> lvSum(rd); // will locally sum vectors first
|
||||
std::vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
|
||||
ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD
|
||||
|
||||
result.resize(fd); // And then global sum to return the same vector to every node
|
||||
@ -509,6 +556,8 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
|
||||
scalar_type * ptr = (scalar_type *) &result[0];
|
||||
int words = fd*sizeof(sobj)/sizeof(scalar_type);
|
||||
grid->GlobalSumVector(ptr, words);
|
||||
// std::cout << GridLogMessage << " sliceSum local"<<t_sum<<" us, host+mpi "<<t_rest<<std::endl;
|
||||
|
||||
}
|
||||
template<class vobj> inline
|
||||
std::vector<typename vobj::scalar_object>
|
||||
@ -552,8 +601,8 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
|
||||
int ld=grid->_ldimensions[orthogdim];
|
||||
int rd=grid->_rdimensions[orthogdim];
|
||||
|
||||
Vector<vector_type> lvSum(rd); // will locally sum vectors first
|
||||
Vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
|
||||
std::vector<vector_type> lvSum(rd); // will locally sum vectors first
|
||||
std::vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
|
||||
ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
|
||||
|
||||
result.resize(fd); // And then global sum to return the same vector to every node for IO to file
|
||||
|
@ -214,22 +214,12 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
|
||||
// Move out of UVM
|
||||
// Turns out I had messed up the synchronise after move to compute stream
|
||||
// as running this on the default stream fools the synchronise
|
||||
#undef UVM_BLOCK_BUFFER
|
||||
#ifndef UVM_BLOCK_BUFFER
|
||||
commVector<sobj> buffer(numBlocks);
|
||||
deviceVector<sobj> buffer(numBlocks);
|
||||
sobj *buffer_v = &buffer[0];
|
||||
sobj result;
|
||||
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
|
||||
accelerator_barrier();
|
||||
acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
|
||||
#else
|
||||
Vector<sobj> buffer(numBlocks);
|
||||
sobj *buffer_v = &buffer[0];
|
||||
sobj result;
|
||||
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
|
||||
accelerator_barrier();
|
||||
result = *buffer_v;
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
@ -244,7 +234,7 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi
|
||||
|
||||
const int words = sizeof(vobj)/sizeof(vector);
|
||||
|
||||
Vector<vector> buffer(osites);
|
||||
deviceVector<vector> buffer(osites);
|
||||
vector *dat = (vector *)lat;
|
||||
vector *buf = &buffer[0];
|
||||
iScalar<vector> *tbuf =(iScalar<vector> *) &buffer[0];
|
||||
|
@ -4,33 +4,28 @@ NAMESPACE_BEGIN(Grid);
|
||||
// Possibly promote to double and sum
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
template <class vobj>
|
||||
inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)
|
||||
{
|
||||
typedef typename vobj::scalar_object sobj;
|
||||
typedef typename vobj::scalar_objectD sobjD;
|
||||
static Vector<sobj> mysum;
|
||||
mysum.resize(1);
|
||||
sobj *mysum_p = & mysum[0];
|
||||
|
||||
sobj identity; zeroit(identity);
|
||||
mysum[0] = identity;
|
||||
sobj ret ;
|
||||
|
||||
sobj ret; zeroit(ret);
|
||||
Integer nsimd= vobj::Nsimd();
|
||||
|
||||
const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() });
|
||||
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
|
||||
auto Reduction = cl::sycl::reduction(mysum_p,identity,std::plus<>(),PropList);
|
||||
cgh.parallel_for(cl::sycl::range<1>{osites},
|
||||
Reduction,
|
||||
[=] (cl::sycl::id<1> item, auto &sum) {
|
||||
auto osite = item[0];
|
||||
sum +=Reduce(lat[osite]);
|
||||
});
|
||||
});
|
||||
theGridAccelerator->wait();
|
||||
ret = mysum[0];
|
||||
// free(mysum,*theGridAccelerator);
|
||||
{
|
||||
sycl::buffer<sobj, 1> abuff(&ret, {1});
|
||||
theGridAccelerator->submit([&](sycl::handler &cgh) {
|
||||
auto Reduction = sycl::reduction(abuff,cgh,identity,std::plus<>());
|
||||
cgh.parallel_for(sycl::range<1>{osites},
|
||||
Reduction,
|
||||
[=] (sycl::id<1> item, auto &sum) {
|
||||
auto osite = item[0];
|
||||
sum +=Reduce(lat[osite]);
|
||||
});
|
||||
});
|
||||
}
|
||||
sobjD dret; convertType(dret,ret);
|
||||
return dret;
|
||||
}
|
||||
@ -76,59 +71,22 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
|
||||
|
||||
template<class Word> Word svm_xor(Word *vec,uint64_t L)
|
||||
{
|
||||
Word xorResult; xorResult = 0;
|
||||
static Vector<Word> d_sum;
|
||||
d_sum.resize(1);
|
||||
Word *d_sum_p=&d_sum[0];
|
||||
Word identity; identity=0;
|
||||
d_sum[0] = identity;
|
||||
const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() });
|
||||
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
|
||||
auto Reduction = cl::sycl::reduction(d_sum_p,identity,std::bit_xor<>(),PropList);
|
||||
cgh.parallel_for(cl::sycl::range<1>{L},
|
||||
Reduction,
|
||||
[=] (cl::sycl::id<1> index, auto &sum) {
|
||||
sum^=vec[index];
|
||||
});
|
||||
});
|
||||
Word ret = 0;
|
||||
{
|
||||
sycl::buffer<Word, 1> abuff(&ret, {1});
|
||||
theGridAccelerator->submit([&](sycl::handler &cgh) {
|
||||
auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
|
||||
cgh.parallel_for(sycl::range<1>{L},
|
||||
Reduction,
|
||||
[=] (sycl::id<1> index, auto &sum) {
|
||||
sum ^=vec[index];
|
||||
});
|
||||
});
|
||||
}
|
||||
theGridAccelerator->wait();
|
||||
Word ret = d_sum[0];
|
||||
// free(d_sum,*theGridAccelerator);
|
||||
return ret;
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
/*
|
||||
|
||||
template <class vobj>
|
||||
inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
|
||||
{
|
||||
typedef typename vobj::vector_type vector;
|
||||
typedef typename vobj::scalar_type scalar;
|
||||
|
||||
typedef typename vobj::scalar_typeD scalarD;
|
||||
typedef typename vobj::scalar_objectD sobjD;
|
||||
|
||||
sobjD ret;
|
||||
scalarD *ret_p = (scalarD *)&ret;
|
||||
|
||||
const int nsimd = vobj::Nsimd();
|
||||
const int words = sizeof(vobj)/sizeof(vector);
|
||||
|
||||
Vector<scalar> buffer(osites*nsimd);
|
||||
scalar *buf = &buffer[0];
|
||||
vector *dat = (vector *)lat;
|
||||
|
||||
for(int w=0;w<words;w++) {
|
||||
|
||||
accelerator_for(ss,osites,nsimd,{
|
||||
int lane = acceleratorSIMTlane(nsimd);
|
||||
buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane);
|
||||
});
|
||||
//Precision change at this point is to late to gain precision
|
||||
ret_p[w] = svm_reduce(buf,nsimd*osites);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
*/
|
||||
|
@ -21,9 +21,18 @@ NAMESPACE_BEGIN(Grid);
|
||||
|
||||
|
||||
#if defined(GRID_CUDA) || defined(GRID_HIP)
|
||||
template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
|
||||
template<class vobj>
|
||||
inline void sliceSumReduction_cub_small(const vobj *Data,
|
||||
std::vector<vobj> &lvSum,
|
||||
const int rd,
|
||||
const int e1,
|
||||
const int e2,
|
||||
const int stride,
|
||||
const int ostride,
|
||||
const int Nsimd)
|
||||
{
|
||||
size_t subvol_size = e1*e2;
|
||||
commVector<vobj> reduction_buffer(rd*subvol_size);
|
||||
deviceVector<vobj> reduction_buffer(rd*subvol_size);
|
||||
auto rb_p = &reduction_buffer[0];
|
||||
vobj zero_init;
|
||||
zeroit(zero_init);
|
||||
@ -46,7 +55,7 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
|
||||
d_offsets = static_cast<int*>(acceleratorAllocDevice((rd+1)*sizeof(int)));
|
||||
|
||||
//copy offsets to device
|
||||
acceleratorCopyToDeviceAsync(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
|
||||
acceleratorCopyToDeviceAsynch(&offsets[0],d_offsets,sizeof(int)*(rd+1),computeStream);
|
||||
|
||||
|
||||
gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
|
||||
@ -79,7 +88,7 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
acceleratorCopyFromDeviceAsync(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
|
||||
acceleratorCopyFromDeviceAsynch(d_out,&lvSum[0],rd*sizeof(vobj),computeStream);
|
||||
|
||||
//sync after copy
|
||||
accelerator_barrier();
|
||||
@ -94,7 +103,15 @@ template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, V
|
||||
|
||||
|
||||
#if defined(GRID_SYCL)
|
||||
template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
|
||||
template<class vobj>
|
||||
inline void sliceSumReduction_sycl_small(const vobj *Data,
|
||||
std::vector <vobj> &lvSum,
|
||||
const int &rd,
|
||||
const int &e1,
|
||||
const int &e2,
|
||||
const int &stride,
|
||||
const int &ostride,
|
||||
const int &Nsimd)
|
||||
{
|
||||
size_t subvol_size = e1*e2;
|
||||
|
||||
@ -105,7 +122,7 @@ template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data,
|
||||
mysum[r] = vobj_zero;
|
||||
}
|
||||
|
||||
commVector<vobj> reduction_buffer(rd*subvol_size);
|
||||
deviceVector<vobj> reduction_buffer(rd*subvol_size);
|
||||
|
||||
auto rb_p = &reduction_buffer[0];
|
||||
|
||||
@ -124,11 +141,11 @@ template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data,
|
||||
});
|
||||
|
||||
for (int r = 0; r < rd; r++) {
|
||||
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
|
||||
auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>());
|
||||
cgh.parallel_for(cl::sycl::range<1>{subvol_size},
|
||||
theGridAccelerator->submit([&](sycl::handler &cgh) {
|
||||
auto Reduction = sycl::reduction(&mysum[r],std::plus<>());
|
||||
cgh.parallel_for(sycl::range<1>{subvol_size},
|
||||
Reduction,
|
||||
[=](cl::sycl::id<1> item, auto &sum) {
|
||||
[=](sycl::id<1> item, auto &sum) {
|
||||
auto s = item[0];
|
||||
sum += rb_p[r*subvol_size+s];
|
||||
});
|
||||
@ -144,14 +161,23 @@ template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data,
|
||||
}
|
||||
#endif
|
||||
|
||||
template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
|
||||
template<class vobj>
|
||||
inline void sliceSumReduction_large(const vobj *Data,
|
||||
std::vector<vobj> &lvSum,
|
||||
const int rd,
|
||||
const int e1,
|
||||
const int e2,
|
||||
const int stride,
|
||||
const int ostride,
|
||||
const int Nsimd)
|
||||
{
|
||||
typedef typename vobj::vector_type vector;
|
||||
const int words = sizeof(vobj)/sizeof(vector);
|
||||
const int osites = rd*e1*e2;
|
||||
commVector<vector>buffer(osites);
|
||||
deviceVector<vector>buffer(osites);
|
||||
vector *dat = (vector *)Data;
|
||||
vector *buf = &buffer[0];
|
||||
Vector<vector> lvSum_small(rd);
|
||||
std::vector<vector> lvSum_small(rd);
|
||||
vector *lvSum_ptr = (vector *)&lvSum[0];
|
||||
|
||||
for (int w = 0; w < words; w++) {
|
||||
@ -168,13 +194,18 @@ template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vecto
|
||||
for (int r = 0; r < rd; r++) {
|
||||
lvSum_ptr[w+words*r]=lvSum_small[r];
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
|
||||
template<class vobj>
|
||||
inline void sliceSumReduction_gpu(const Lattice<vobj> &Data,
|
||||
std::vector<vobj> &lvSum,
|
||||
const int rd,
|
||||
const int e1,
|
||||
const int e2,
|
||||
const int stride,
|
||||
const int ostride,
|
||||
const int Nsimd)
|
||||
{
|
||||
autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case.
|
||||
if constexpr (sizeof(vobj) <= 256) {
|
||||
@ -192,7 +223,15 @@ template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data
|
||||
}
|
||||
|
||||
|
||||
template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
|
||||
template<class vobj>
|
||||
inline void sliceSumReduction_cpu(const Lattice<vobj> &Data,
|
||||
std::vector<vobj> &lvSum,
|
||||
const int &rd,
|
||||
const int &e1,
|
||||
const int &e2,
|
||||
const int &stride,
|
||||
const int &ostride,
|
||||
const int &Nsimd)
|
||||
{
|
||||
// sum over reduced dimension planes, breaking out orthog dir
|
||||
// Parallel over orthog direction
|
||||
@ -208,16 +247,20 @@ template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data
|
||||
});
|
||||
}
|
||||
|
||||
template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
|
||||
template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data,
|
||||
std::vector<vobj> &lvSum,
|
||||
const int &rd,
|
||||
const int &e1,
|
||||
const int &e2,
|
||||
const int &stride,
|
||||
const int &ostride,
|
||||
const int &Nsimd)
|
||||
{
|
||||
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
|
||||
|
||||
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
|
||||
sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
|
||||
|
||||
#else
|
||||
#else
|
||||
sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
|
||||
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
@ -54,7 +54,7 @@ struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::ve
|
||||
*
|
||||
*/
|
||||
|
||||
template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
|
||||
template<class vobj> inline void ScatterSlice(const deviceVector<vobj> &buf,
|
||||
Lattice<vobj> &lat,
|
||||
int x,
|
||||
int dim,
|
||||
@ -140,7 +140,7 @@ template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
|
||||
});
|
||||
}
|
||||
|
||||
template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf,
|
||||
template<class vobj> inline void GatherSlice(deviceVector<vobj> &buf,
|
||||
const Lattice<vobj> &lat,
|
||||
int x,
|
||||
int dim,
|
||||
@ -462,13 +462,19 @@ public:
|
||||
int rNsimd = Nsimd / simd[dimension];
|
||||
assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);
|
||||
|
||||
static cshiftVector<vobj> send_buf;
|
||||
static cshiftVector<vobj> recv_buf;
|
||||
static deviceVector<vobj> send_buf;
|
||||
static deviceVector<vobj> recv_buf;
|
||||
send_buf.resize(buffer_size*2*depth);
|
||||
recv_buf.resize(buffer_size*2*depth);
|
||||
#ifndef ACCELERATOR_AWARE_MPI
|
||||
static hostVector<vobj> hsend_buf;
|
||||
static hostVector<vobj> hrecv_buf;
|
||||
hsend_buf.resize(buffer_size*2*depth);
|
||||
hrecv_buf.resize(buffer_size*2*depth);
|
||||
#endif
|
||||
|
||||
std::vector<CommsRequest_t> fwd_req;
|
||||
std::vector<CommsRequest_t> bwd_req;
|
||||
std::vector<MpiCommsRequest_t> fwd_req;
|
||||
std::vector<MpiCommsRequest_t> bwd_req;
|
||||
|
||||
int words = buffer_size;
|
||||
int bytes = words * sizeof(vobj);
|
||||
@ -495,9 +501,16 @@ public:
|
||||
t_gather+=usecond()-t;
|
||||
|
||||
t=usecond();
|
||||
#ifdef ACCELERATOR_AWARE_MPI
|
||||
grid->SendToRecvFromBegin(fwd_req,
|
||||
(void *)&send_buf[d*buffer_size], xmit_to_rank,
|
||||
(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
|
||||
#else
|
||||
acceleratorCopyFromDevice(&send_buf[d*buffer_size],&hsend_buf[d*buffer_size],bytes);
|
||||
grid->SendToRecvFromBegin(fwd_req,
|
||||
(void *)&hsend_buf[d*buffer_size], xmit_to_rank,
|
||||
(void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag);
|
||||
#endif
|
||||
t_comms+=usecond()-t;
|
||||
}
|
||||
for ( int d=0;d < depth ; d ++ ) {
|
||||
@ -508,9 +521,16 @@ public:
|
||||
t_gather+= usecond() - t;
|
||||
|
||||
t=usecond();
|
||||
#ifdef ACCELERATOR_AWARE_MPI
|
||||
grid->SendToRecvFromBegin(bwd_req,
|
||||
(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
|
||||
(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
|
||||
#else
|
||||
acceleratorCopyFromDevice(&send_buf[(d+depth)*buffer_size],&hsend_buf[(d+depth)*buffer_size],bytes);
|
||||
grid->SendToRecvFromBegin(bwd_req,
|
||||
(void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank,
|
||||
(void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
|
||||
#endif
|
||||
t_comms+=usecond()-t;
|
||||
}
|
||||
|
||||
@ -533,8 +553,13 @@ public:
|
||||
|
||||
t=usecond();
|
||||
grid->CommsComplete(fwd_req);
|
||||
#ifndef ACCELERATOR_AWARE_MPI
|
||||
for ( int d=0;d < depth ; d ++ ) {
|
||||
acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
|
||||
}
|
||||
#endif
|
||||
t_comms+= usecond() - t;
|
||||
|
||||
|
||||
t=usecond();
|
||||
for ( int d=0;d < depth ; d ++ ) {
|
||||
ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++;
|
||||
@ -543,6 +568,11 @@ public:
|
||||
|
||||
t=usecond();
|
||||
grid->CommsComplete(bwd_req);
|
||||
#ifndef ACCELERATOR_AWARE_MPI
|
||||
for ( int d=0;d < depth ; d ++ ) {
|
||||
acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
|
||||
}
|
||||
#endif
|
||||
t_comms+= usecond() - t;
|
||||
|
||||
t=usecond();
|
||||
|
@ -98,7 +98,7 @@ public:
|
||||
virtual RealD S(const GaugeField& U) = 0; // evaluate the action
|
||||
virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ; // if the refresh computes the action, can cache it. Alternately refreshAndAction() ?
|
||||
virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////
|
||||
// virtual smeared interface through configuration container
|
||||
/////////////////////////////////////////////////////////////
|
||||
@ -132,6 +132,10 @@ public:
|
||||
template <class GaugeField >
|
||||
class EmptyAction : public Action <GaugeField>
|
||||
{
|
||||
using Action<GaugeField>::refresh;
|
||||
using Action<GaugeField>::Sinitial;
|
||||
using Action<GaugeField>::deriv;
|
||||
|
||||
virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions
|
||||
virtual RealD S(const GaugeField& U) { return 0.0;}; // evaluate the action
|
||||
virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); }; // evaluate the action derivative
|
||||
|
@ -55,6 +55,11 @@ public:
|
||||
RealD alpha; // Mobius scale
|
||||
RealD k; // EOFA normalization constant
|
||||
|
||||
// Device resident
|
||||
deviceVector<Coeff_t> d_shift_coefficients;
|
||||
deviceVector<Coeff_t> d_MooeeInv_shift_lc;
|
||||
deviceVector<Coeff_t> d_MooeeInv_shift_norm;
|
||||
|
||||
virtual void Instantiatable(void) = 0;
|
||||
|
||||
// EOFA-specific operations
|
||||
@ -92,6 +97,11 @@ public:
|
||||
this->k = this->alpha * (_mq3-_mq2) * std::pow(this->alpha+1.0,2*Ls) /
|
||||
( std::pow(this->alpha+1.0,Ls) + _mq2*std::pow(this->alpha-1.0,Ls) ) /
|
||||
( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) );
|
||||
|
||||
d_shift_coefficients.resize(Ls);
|
||||
d_MooeeInv_shift_lc.resize(Ls);
|
||||
d_MooeeInv_shift_norm.resize(Ls);
|
||||
|
||||
};
|
||||
};
|
||||
|
||||
|
@ -90,16 +90,16 @@ public:
|
||||
void M5D(const FermionField &psi,
|
||||
const FermionField &phi,
|
||||
FermionField &chi,
|
||||
Vector<Coeff_t> &lower,
|
||||
Vector<Coeff_t> &diag,
|
||||
Vector<Coeff_t> &upper);
|
||||
std::vector<Coeff_t> &lower,
|
||||
std::vector<Coeff_t> &diag,
|
||||
std::vector<Coeff_t> &upper);
|
||||
|
||||
void M5Ddag(const FermionField &psi,
|
||||
const FermionField &phi,
|
||||
FermionField &chi,
|
||||
Vector<Coeff_t> &lower,
|
||||
Vector<Coeff_t> &diag,
|
||||
Vector<Coeff_t> &upper);
|
||||
std::vector<Coeff_t> &lower,
|
||||
std::vector<Coeff_t> &diag,
|
||||
std::vector<Coeff_t> &upper);
|
||||
|
||||
virtual void Instantiatable(void)=0;
|
||||
|
||||
@ -119,35 +119,51 @@ public:
|
||||
RealD mass_plus, mass_minus;
|
||||
|
||||
// Save arguments to SetCoefficientsInternal
|
||||
Vector<Coeff_t> _gamma;
|
||||
std::vector<Coeff_t> _gamma;
|
||||
RealD _zolo_hi;
|
||||
RealD _b;
|
||||
RealD _c;
|
||||
|
||||
// possible boost
|
||||
std::vector<ComplexD> qmu;
|
||||
void set_qmu(std::vector<ComplexD> _qmu) { qmu=_qmu; assert(qmu.size()==Nd);};
|
||||
void addQmu(const FermionField &in, FermionField &out, int dag);
|
||||
|
||||
// Cayley form Moebius (tanh and zolotarev)
|
||||
Vector<Coeff_t> omega;
|
||||
Vector<Coeff_t> bs; // S dependent coeffs
|
||||
Vector<Coeff_t> cs;
|
||||
Vector<Coeff_t> as;
|
||||
std::vector<Coeff_t> omega;
|
||||
std::vector<Coeff_t> bs; // S dependent coeffs
|
||||
std::vector<Coeff_t> cs;
|
||||
std::vector<Coeff_t> as;
|
||||
// For preconditioning Cayley form
|
||||
Vector<Coeff_t> bee;
|
||||
Vector<Coeff_t> cee;
|
||||
Vector<Coeff_t> aee;
|
||||
Vector<Coeff_t> beo;
|
||||
Vector<Coeff_t> ceo;
|
||||
Vector<Coeff_t> aeo;
|
||||
std::vector<Coeff_t> bee;
|
||||
std::vector<Coeff_t> cee;
|
||||
std::vector<Coeff_t> aee;
|
||||
std::vector<Coeff_t> beo;
|
||||
std::vector<Coeff_t> ceo;
|
||||
std::vector<Coeff_t> aeo;
|
||||
// LDU factorisation of the eeoo matrix
|
||||
Vector<Coeff_t> lee;
|
||||
Vector<Coeff_t> leem;
|
||||
Vector<Coeff_t> uee;
|
||||
Vector<Coeff_t> ueem;
|
||||
Vector<Coeff_t> dee;
|
||||
std::vector<Coeff_t> lee;
|
||||
std::vector<Coeff_t> leem;
|
||||
std::vector<Coeff_t> uee;
|
||||
std::vector<Coeff_t> ueem;
|
||||
std::vector<Coeff_t> dee;
|
||||
|
||||
// Device memory
|
||||
deviceVector<Coeff_t> d_diag;
|
||||
deviceVector<Coeff_t> d_upper;
|
||||
deviceVector<Coeff_t> d_lower;
|
||||
|
||||
deviceVector<Coeff_t> d_lee;
|
||||
deviceVector<Coeff_t> d_dee;
|
||||
deviceVector<Coeff_t> d_uee;
|
||||
deviceVector<Coeff_t> d_leem;
|
||||
deviceVector<Coeff_t> d_ueem;
|
||||
|
||||
// Matrices of 5d ee inverse params
|
||||
Vector<iSinglet<Simd> > MatpInv;
|
||||
Vector<iSinglet<Simd> > MatmInv;
|
||||
Vector<iSinglet<Simd> > MatpInvDag;
|
||||
Vector<iSinglet<Simd> > MatmInvDag;
|
||||
// std::vector<iSinglet<Simd> > MatpInv;
|
||||
// std::vector<iSinglet<Simd> > MatmInv;
|
||||
// std::vector<iSinglet<Simd> > MatpInvDag;
|
||||
// std::vector<iSinglet<Simd> > MatmInvDag;
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Conserved current utilities
|
||||
@ -187,7 +203,7 @@ public:
|
||||
protected:
|
||||
virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
|
||||
virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
|
||||
virtual void SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t> & gamma,RealD b,RealD c);
|
||||
virtual void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
|
||||
};
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
196
Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h
Normal file
196
Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h
Normal file
@ -0,0 +1,196 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion5D.h
|
||||
|
||||
Copyright (C) 2020 - 2025
|
||||
|
||||
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
|
||||
Author: Nils Meyer <nils.meyer@ur.de>
|
||||
Author: Christoph Lehner <christoph@lhnr.de>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <Grid/qcd/action/fermion/WilsonFermion5D.h>
|
||||
#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
|
||||
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
|
||||
#include <Grid/qcd/action/fermion/CloverHelpers.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
// see Grid/qcd/action/fermion/CompactWilsonCloverFermion.h for description
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
class CompactWilsonCloverFermion5D : public WilsonFermion5D<Impl>,
|
||||
public WilsonCloverHelpers<Impl>,
|
||||
public CompactWilsonCloverHelpers<Impl> {
|
||||
/////////////////////////////////////////////
|
||||
// Sizes
|
||||
/////////////////////////////////////////////
|
||||
|
||||
public:
|
||||
|
||||
INHERIT_COMPACT_CLOVER_SIZES(Impl);
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Type definitions
|
||||
/////////////////////////////////////////////
|
||||
|
||||
public:
|
||||
|
||||
INHERIT_IMPL_TYPES(Impl);
|
||||
INHERIT_CLOVER_TYPES(Impl);
|
||||
INHERIT_COMPACT_CLOVER_TYPES(Impl);
|
||||
|
||||
typedef WilsonFermion5D<Impl> WilsonBase;
|
||||
typedef WilsonCloverHelpers<Impl> Helpers;
|
||||
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Constructors
|
||||
/////////////////////////////////////////////
|
||||
|
||||
public:
|
||||
|
||||
CompactWilsonCloverFermion5D(GaugeField& _Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
const RealD _mass,
|
||||
const RealD _csw_r = 0.0,
|
||||
const RealD _csw_t = 0.0,
|
||||
const RealD _cF = 1.0,
|
||||
const ImplParams& impl_p = ImplParams());
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Member functions (implementing interface)
|
||||
/////////////////////////////////////////////
|
||||
|
||||
public:
|
||||
|
||||
virtual void Instantiatable() {};
|
||||
int ConstEE() override { return 0; };
|
||||
int isTrivialEE() override { return 0; };
|
||||
|
||||
void Dhop(const FermionField& in, FermionField& out, int dag) override;
|
||||
|
||||
void DhopOE(const FermionField& in, FermionField& out, int dag) override;
|
||||
|
||||
void DhopEO(const FermionField& in, FermionField& out, int dag) override;
|
||||
|
||||
void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override;
|
||||
|
||||
void DhopDirAll(const FermionField& in, std::vector<FermionField>& out) /* override */;
|
||||
|
||||
void M(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void Mdag(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void Meooe(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void MeooeDag(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void Mooee(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void MooeeDag(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void MooeeInv(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void MooeeInvDag(const FermionField& in, FermionField& out) override;
|
||||
|
||||
void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override;
|
||||
|
||||
void MdirAll(const FermionField& in, std::vector<FermionField>& out) override;
|
||||
|
||||
void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override;
|
||||
|
||||
void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
|
||||
|
||||
void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Member functions (internals)
|
||||
/////////////////////////////////////////////
|
||||
|
||||
void MooeeInternal(const FermionField& in,
|
||||
FermionField& out,
|
||||
const CloverDiagonalField& diagonal,
|
||||
const CloverTriangleField& triangle);
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Helpers
|
||||
/////////////////////////////////////////////
|
||||
|
||||
void ImportGauge(const GaugeField& _Umu) override;
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Helpers
|
||||
/////////////////////////////////////////////
|
||||
|
||||
private:
|
||||
|
||||
template<class Field>
|
||||
const MaskField* getCorrectMaskField(const Field &in) const {
|
||||
if(in.Grid()->_isCheckerBoarded) {
|
||||
if(in.Checkerboard() == Odd) {
|
||||
return &this->BoundaryMaskOdd;
|
||||
} else {
|
||||
return &this->BoundaryMaskEven;
|
||||
}
|
||||
} else {
|
||||
return &this->BoundaryMask;
|
||||
}
|
||||
}
|
||||
|
||||
template<class Field>
|
||||
void ApplyBoundaryMask(Field& f) {
|
||||
const MaskField* m = getCorrectMaskField(f); assert(m != nullptr);
|
||||
assert(m != nullptr);
|
||||
CompactHelpers::ApplyBoundaryMask(f, *m);
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////
|
||||
// Member Data
|
||||
/////////////////////////////////////////////
|
||||
|
||||
public:
|
||||
|
||||
RealD csw_r;
|
||||
RealD csw_t;
|
||||
RealD cF;
|
||||
int n_rhs;
|
||||
|
||||
bool fixedBoundaries;
|
||||
|
||||
CloverDiagonalField Diagonal, DiagonalEven, DiagonalOdd;
|
||||
CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
|
||||
|
||||
CloverTriangleField Triangle, TriangleEven, TriangleOdd;
|
||||
CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd;
|
||||
|
||||
FermionField Tmp;
|
||||
|
||||
MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd;
|
||||
};
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -60,6 +60,50 @@ public:
|
||||
// virtual void Instantiatable(void)=0;
|
||||
virtual void Instantiatable(void) =0;
|
||||
|
||||
void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist)
|
||||
{
|
||||
std::cout << "Free Propagator for PartialFraction"<<std::endl;
|
||||
FermionField in_k(in.Grid());
|
||||
FermionField prop_k(in.Grid());
|
||||
|
||||
FFT theFFT((GridCartesian *) in.Grid());
|
||||
|
||||
//phase for boundary condition
|
||||
ComplexField coor(in.Grid());
|
||||
ComplexField ph(in.Grid()); ph = Zero();
|
||||
FermionField in_buf(in.Grid()); in_buf = Zero();
|
||||
typedef typename Simd::scalar_type Scalar;
|
||||
Scalar ci(0.0,1.0);
|
||||
assert(twist.size() == Nd);//check that twist is Nd
|
||||
assert(boundary.size() == Nd);//check that boundary conditions is Nd
|
||||
int shift = 0;
|
||||
for(unsigned int nu = 0; nu < Nd; nu++)
|
||||
{
|
||||
// Shift coordinate lattice index by 1 to account for 5th dimension.
|
||||
LatticeCoordinate(coor, nu + shift);
|
||||
double boundary_phase = ::acos(real(boundary[nu]));
|
||||
ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
|
||||
//momenta for propagator shifted by twist+boundary
|
||||
twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
|
||||
}
|
||||
in_buf = exp(ci*ph*(-1.0))*in;
|
||||
|
||||
theFFT.FFT_all_dim(in_k,in,FFT::forward);
|
||||
this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist);
|
||||
theFFT.FFT_all_dim(out,prop_k,FFT::backward);
|
||||
|
||||
//phase for boundary condition
|
||||
out = out * exp(ci*ph);
|
||||
};
|
||||
|
||||
virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
|
||||
std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions
|
||||
std::vector<Complex> boundary;
|
||||
for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions
|
||||
FreePropagator(in,out,mass,boundary,twist);
|
||||
};
|
||||
|
||||
|
||||
// Efficient support for multigrid coarsening
|
||||
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp);
|
||||
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out);
|
||||
@ -90,12 +134,12 @@ protected:
|
||||
RealD mass;
|
||||
RealD R;
|
||||
RealD ZoloHiInv;
|
||||
Vector<double> Beta;
|
||||
Vector<double> cc;;
|
||||
Vector<double> cc_d;;
|
||||
Vector<double> sqrt_cc;
|
||||
Vector<double> See;
|
||||
Vector<double> Aee;
|
||||
std::vector<double> Beta;
|
||||
std::vector<double> cc;;
|
||||
std::vector<double> cc_d;;
|
||||
std::vector<double> sqrt_cc;
|
||||
std::vector<double> See;
|
||||
std::vector<double> Aee;
|
||||
|
||||
};
|
||||
|
||||
|
@ -69,10 +69,10 @@ public:
|
||||
// Instantiate different versions depending on Impl
|
||||
/////////////////////////////////////////////////////
|
||||
void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
|
||||
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
|
||||
|
||||
void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
|
||||
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
|
||||
|
||||
virtual void RefreshShiftCoefficients(RealD new_shift);
|
||||
|
||||
@ -83,7 +83,7 @@ public:
|
||||
RealD _M5, const ImplParams& p=ImplParams());
|
||||
|
||||
protected:
|
||||
void SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c);
|
||||
void SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c);
|
||||
};
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
@ -55,6 +55,7 @@ NAMESPACE_CHECK(Wilson);
|
||||
NAMESPACE_CHECK(WilsonTM);
|
||||
#include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
|
||||
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h> // 4d compact wilson clover fermions
|
||||
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h> // 5d compact wilson clover fermions
|
||||
NAMESPACE_CHECK(WilsonClover);
|
||||
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
|
||||
NAMESPACE_CHECK(Wilson5D);
|
||||
@ -164,12 +165,17 @@ typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiS
|
||||
|
||||
// Compact Clover fermions
|
||||
template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
|
||||
template <typename WImpl> using CompactWilsonClover5D = CompactWilsonCloverFermion5D<WImpl, CompactCloverHelpers<WImpl>>;
|
||||
template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;
|
||||
|
||||
typedef CompactWilsonClover<WilsonImplD2> CompactWilsonCloverFermionD2;
|
||||
typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
|
||||
typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;
|
||||
|
||||
typedef CompactWilsonClover5D<WilsonImplD2> CompactWilsonCloverFermion5DD2;
|
||||
typedef CompactWilsonClover5D<WilsonImplF> CompactWilsonCloverFermion5DF;
|
||||
typedef CompactWilsonClover5D<WilsonImplD> CompactWilsonCloverFermion5DD;
|
||||
|
||||
typedef CompactWilsonExpClover<WilsonImplD2> CompactWilsonExpCloverFermionD2;
|
||||
typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
|
||||
typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;
|
||||
|
@ -102,11 +102,11 @@ public:
|
||||
GaugeField &mat,
|
||||
const FermionField &A, const FermionField &B, int dag);
|
||||
|
||||
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||
void DhopInternal(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||
const FermionField &in, FermionField &out, int dag);
|
||||
void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||
void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||
const FermionField &in, FermionField &out, int dag);
|
||||
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||
void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||
const FermionField &in, FermionField &out, int dag);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
@ -164,8 +164,6 @@ public:
|
||||
DoubledGaugeField UUUmuEven;
|
||||
DoubledGaugeField UUUmuOdd;
|
||||
|
||||
LebesgueOrder Lebesgue;
|
||||
LebesgueOrder LebesgueEvenOdd;
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Conserved current utilities
|
||||
|
@ -100,7 +100,6 @@ public:
|
||||
int dag);
|
||||
|
||||
void DhopInternal(StencilImpl & st,
|
||||
LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
DoubledGaugeField &UUU,
|
||||
const FermionField &in,
|
||||
@ -108,7 +107,6 @@ public:
|
||||
int dag);
|
||||
|
||||
void DhopInternalOverlappedComms(StencilImpl & st,
|
||||
LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
DoubledGaugeField &UUU,
|
||||
const FermionField &in,
|
||||
@ -116,7 +114,6 @@ public:
|
||||
int dag);
|
||||
|
||||
void DhopInternalSerialComms(StencilImpl & st,
|
||||
LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
DoubledGaugeField &UUU,
|
||||
const FermionField &in,
|
||||
@ -192,8 +189,6 @@ public:
|
||||
DoubledGaugeField UUUmuEven;
|
||||
DoubledGaugeField UUUmuOdd;
|
||||
|
||||
LebesgueOrder Lebesgue;
|
||||
LebesgueOrder LebesgueEvenOdd;
|
||||
|
||||
// Comms buffer
|
||||
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
||||
|
@ -42,11 +42,11 @@ public:
|
||||
|
||||
public:
|
||||
// Shift operator coefficients for red-black preconditioned Mobius EOFA
|
||||
Vector<Coeff_t> Mooee_shift;
|
||||
Vector<Coeff_t> MooeeInv_shift_lc;
|
||||
Vector<Coeff_t> MooeeInv_shift_norm;
|
||||
Vector<Coeff_t> MooeeInvDag_shift_lc;
|
||||
Vector<Coeff_t> MooeeInvDag_shift_norm;
|
||||
std::vector<Coeff_t> Mooee_shift;
|
||||
std::vector<Coeff_t> MooeeInv_shift_lc;
|
||||
std::vector<Coeff_t> MooeeInv_shift_norm;
|
||||
std::vector<Coeff_t> MooeeInvDag_shift_lc;
|
||||
std::vector<Coeff_t> MooeeInvDag_shift_norm;
|
||||
|
||||
virtual void Instantiatable(void) {};
|
||||
|
||||
@ -74,18 +74,18 @@ public:
|
||||
// Instantiate different versions depending on Impl
|
||||
/////////////////////////////////////////////////////
|
||||
void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
|
||||
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
|
||||
|
||||
void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
|
||||
Vector<Coeff_t>& shift_coeffs);
|
||||
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
|
||||
std::vector<Coeff_t>& shift_coeffs);
|
||||
|
||||
void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
|
||||
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
|
||||
|
||||
void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
|
||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
|
||||
Vector<Coeff_t>& shift_coeffs);
|
||||
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
|
||||
std::vector<Coeff_t>& shift_coeffs);
|
||||
|
||||
virtual void RefreshShiftCoefficients(RealD new_shift);
|
||||
|
||||
|
@ -102,11 +102,11 @@ public:
|
||||
GaugeField &mat,
|
||||
const FermionField &A, const FermionField &B, int dag);
|
||||
|
||||
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
void DhopInternal(StencilImpl &st, DoubledGaugeField &U,
|
||||
const FermionField &in, FermionField &out, int dag);
|
||||
void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U,
|
||||
const FermionField &in, FermionField &out, int dag);
|
||||
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U,
|
||||
const FermionField &in, FermionField &out, int dag);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
@ -152,9 +152,6 @@ public:
|
||||
DoubledGaugeField UmuEven;
|
||||
DoubledGaugeField UmuOdd;
|
||||
|
||||
LebesgueOrder Lebesgue;
|
||||
LebesgueOrder LebesgueEvenOdd;
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
// Conserved current utilities
|
||||
///////////////////////////////////////////////////////////////
|
||||
|
@ -42,7 +42,7 @@ public:
|
||||
|
||||
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
|
||||
this->MomentumSpacePropagatorHw(out,in,_m,twist);
|
||||
};
|
||||
};
|
||||
|
||||
// Constructors
|
||||
OverlapWilsonCayleyTanhFermion(GaugeField &_Umu,
|
||||
|
@ -41,6 +41,10 @@ public:
|
||||
public:
|
||||
|
||||
// Constructors
|
||||
virtual void Instantiatable(void){};
|
||||
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
|
||||
this->MomentumSpacePropagatorHw(out,in,_m,twist);
|
||||
};
|
||||
|
||||
OverlapWilsonCayleyZolotarevFermion(GaugeField &_Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
|
@ -41,6 +41,9 @@ public:
|
||||
public:
|
||||
|
||||
virtual void Instantiatable(void){};
|
||||
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
|
||||
this->MomentumSpacePropagatorHw(out,in,_m,twist);
|
||||
};
|
||||
// Constructors
|
||||
OverlapWilsonContFracTanhFermion(GaugeField &_Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
|
@ -40,6 +40,9 @@ public:
|
||||
INHERIT_IMPL_TYPES(Impl);
|
||||
|
||||
virtual void Instantiatable(void){};
|
||||
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
|
||||
this->MomentumSpacePropagatorHw(out,in,_m,twist);
|
||||
};
|
||||
// Constructors
|
||||
OverlapWilsonContFracZolotarevFermion(GaugeField &_Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
|
@ -41,6 +41,9 @@ public:
|
||||
public:
|
||||
|
||||
virtual void Instantiatable(void){};
|
||||
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
|
||||
this->MomentumSpacePropagatorHw(out,in,_m,twist);
|
||||
};
|
||||
// Constructors
|
||||
OverlapWilsonPartialFractionTanhFermion(GaugeField &_Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
|
@ -40,6 +40,11 @@ public:
|
||||
INHERIT_IMPL_TYPES(Impl);
|
||||
|
||||
virtual void Instantiatable(void){};
|
||||
|
||||
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
|
||||
this->MomentumSpacePropagatorHw(out,in,_m,twist);
|
||||
};
|
||||
|
||||
// Constructors
|
||||
OverlapWilsonPartialFractionZolotarevFermion(GaugeField &_Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
|
@ -39,7 +39,7 @@ class PartialFractionFermion5D : public WilsonFermion5D<Impl>
|
||||
public:
|
||||
INHERIT_IMPL_TYPES(Impl);
|
||||
|
||||
const int part_frac_chroma_convention=1;
|
||||
const int part_frac_chroma_convention=0;
|
||||
|
||||
void Meooe_internal(const FermionField &in, FermionField &out,int dag);
|
||||
void Mooee_internal(const FermionField &in, FermionField &out,int dag);
|
||||
@ -83,19 +83,78 @@ public:
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
RealD _mass,RealD M5,const ImplParams &p= ImplParams());
|
||||
|
||||
PartialFractionFermion5D(GaugeField &_Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
RealD _mass,RealD M5,std::vector<RealD> &_qmu,const ImplParams &p= ImplParams());
|
||||
|
||||
void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist)
|
||||
{
|
||||
std::cout << "Free Propagator for PartialFraction"<<std::endl;
|
||||
FermionField in_k(in.Grid());
|
||||
FermionField prop_k(in.Grid());
|
||||
|
||||
FFT theFFT((GridCartesian *) in.Grid());
|
||||
|
||||
//phase for boundary condition
|
||||
ComplexField coor(in.Grid());
|
||||
ComplexField ph(in.Grid()); ph = Zero();
|
||||
FermionField in_buf(in.Grid()); in_buf = Zero();
|
||||
typedef typename Simd::scalar_type Scalar;
|
||||
Scalar ci(0.0,1.0);
|
||||
assert(twist.size() == Nd);//check that twist is Nd
|
||||
assert(boundary.size() == Nd);//check that boundary conditions is Nd
|
||||
int shift = 0;
|
||||
for(unsigned int nu = 0; nu < Nd; nu++)
|
||||
{
|
||||
// Shift coordinate lattice index by 1 to account for 5th dimension.
|
||||
LatticeCoordinate(coor, nu + shift);
|
||||
double boundary_phase = ::acos(real(boundary[nu]));
|
||||
ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
|
||||
//momenta for propagator shifted by twist+boundary
|
||||
twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
|
||||
}
|
||||
in_buf = exp(ci*ph*(-1.0))*in;
|
||||
|
||||
theFFT.FFT_all_dim(in_k,in,FFT::forward);
|
||||
if ( this->qmu.size() ){
|
||||
this->MomentumSpacePropagatorHwQ(prop_k,in_k,mass,twist,this->qmu);
|
||||
} else {
|
||||
this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist);
|
||||
}
|
||||
theFFT.FFT_all_dim(out,prop_k,FFT::backward);
|
||||
|
||||
//phase for boundary condition
|
||||
out = out * exp(ci*ph);
|
||||
};
|
||||
|
||||
virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
|
||||
std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions
|
||||
std::vector<Complex> boundary;
|
||||
for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions
|
||||
FreePropagator(in,out,mass,boundary,twist);
|
||||
};
|
||||
|
||||
void set_qmu(std::vector<RealD> _qmu) { qmu=_qmu; assert(qmu.size()==Nd);};
|
||||
void addQmu(const FermionField &in, FermionField &out, int dag);
|
||||
|
||||
protected:
|
||||
|
||||
virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale);
|
||||
virtual void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata);
|
||||
|
||||
std::vector<RealD> qmu;
|
||||
|
||||
// Part frac
|
||||
RealD mass;
|
||||
RealD dw_diag;
|
||||
RealD R;
|
||||
RealD amax;
|
||||
RealD scale;
|
||||
Vector<double> p;
|
||||
Vector<double> q;
|
||||
std::vector<double> p;
|
||||
std::vector<double> q;
|
||||
|
||||
};
|
||||
|
||||
|
@ -35,7 +35,7 @@ template<class Matrix, class Field>
|
||||
class KappaSimilarityTransform {
|
||||
public:
|
||||
INHERIT_IMPL_TYPES(Matrix);
|
||||
Vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag;
|
||||
std::vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag;
|
||||
|
||||
KappaSimilarityTransform (Matrix &zmob) {
|
||||
for (int i=0;i<(int)zmob.bs.size();i++) {
|
||||
|
@ -49,10 +49,10 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
|
||||
|
||||
public:
|
||||
|
||||
void DhopImproved(StencilImpl &st, LebesgueOrder &lo,
|
||||
void DhopImproved(StencilImpl &st,
|
||||
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
|
||||
void DhopNaive(StencilImpl &st, LebesgueOrder &lo,
|
||||
void DhopNaive(StencilImpl &st,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
|
||||
|
||||
|
@ -47,7 +47,7 @@ public:
|
||||
static int PartialCompressionFactor(GridBase *grid) { return 1;}
|
||||
#endif
|
||||
template<class vobj,class cobj,class compressor>
|
||||
static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
|
||||
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
|
||||
const Lattice<vobj> &rhs,
|
||||
cobj *buffer,
|
||||
compressor &compress,
|
||||
@ -109,7 +109,7 @@ public:
|
||||
// Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2.
|
||||
////////////////////////////////////////////////////////////////////////////////////////////
|
||||
template<class vobj,class cobj,class compressor>
|
||||
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
||||
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
||||
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
|
||||
compressor &compress,int type,int partial)
|
||||
{
|
||||
@ -197,7 +197,7 @@ public:
|
||||
#endif
|
||||
|
||||
template<class vobj,class cobj,class compressor>
|
||||
static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
|
||||
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
|
||||
const Lattice<vobj> &rhs,
|
||||
cobj *buffer,
|
||||
compressor &compress,
|
||||
@ -208,7 +208,7 @@ public:
|
||||
else FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
|
||||
}
|
||||
template<class vobj,class cobj,class compressor>
|
||||
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
||||
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
|
||||
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
|
||||
compressor &compress,int type,int partial)
|
||||
{
|
||||
@ -402,7 +402,6 @@ public:
|
||||
|
||||
typedef CartesianStencil<vobj,cobj,Parameters> Base;
|
||||
typedef typename Base::View_type View_type;
|
||||
typedef typename Base::StencilVector StencilVector;
|
||||
|
||||
// Vector<int> surface_list;
|
||||
WilsonStencil(GridBase *grid,
|
||||
@ -415,29 +414,6 @@ public:
|
||||
// surface_list.resize(0);
|
||||
this->same_node.resize(npoints);
|
||||
};
|
||||
|
||||
/*
|
||||
void BuildSurfaceList(int Ls,int vol4){
|
||||
|
||||
// find same node for SHM
|
||||
// Here we know the distance is 1 for WilsonStencil
|
||||
for(int point=0;point<this->_npoints;point++){
|
||||
this->same_node[point] = this->SameNode(point);
|
||||
}
|
||||
|
||||
for(int site = 0 ;site< vol4;site++){
|
||||
int local = 1;
|
||||
for(int point=0;point<this->_npoints;point++){
|
||||
if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){
|
||||
local = 0;
|
||||
}
|
||||
}
|
||||
if(local == 0) {
|
||||
surface_list.push_back(site);
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
template < class compressor>
|
||||
void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
|
||||
@ -508,6 +484,11 @@ public:
|
||||
this->face_table_computed=1;
|
||||
assert(this->u_comm_offset==this->_unified_buffer_size);
|
||||
accelerator_barrier();
|
||||
#ifdef NVLINK_GET
|
||||
this->_grid->StencilBarrier(); // He can now get mu local gather, I can get his
|
||||
// Synch shared memory on a single nodes; could use an asynchronous barrier here and defer check
|
||||
// Or issue barrier AFTER the DMA is running
|
||||
#endif
|
||||
}
|
||||
|
||||
};
|
||||
|
@ -126,14 +126,17 @@ public:
|
||||
void DerivInternal(StencilImpl &st, DoubledGaugeField &U, GaugeField &mat,
|
||||
const FermionField &A, const FermionField &B, int dag);
|
||||
|
||||
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
void DhopInternal(StencilImpl &st,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in, FermionField &out, int dag);
|
||||
|
||||
void DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
const FermionField &in, FermionField &out, int dag);
|
||||
void DhopInternalSerial(StencilImpl &st,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in, FermionField &out, int dag);
|
||||
|
||||
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||
const FermionField &in, FermionField &out, int dag);
|
||||
void DhopInternalOverlappedComms(StencilImpl &st,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in, FermionField &out, int dag);
|
||||
|
||||
// Constructor
|
||||
WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
|
||||
@ -168,9 +171,6 @@ public:
|
||||
DoubledGaugeField UmuEven;
|
||||
DoubledGaugeField UmuOdd;
|
||||
|
||||
LebesgueOrder Lebesgue;
|
||||
LebesgueOrder LebesgueEvenOdd;
|
||||
|
||||
WilsonAnisotropyCoefficients anisotropyCoeff;
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
|
@ -91,13 +91,13 @@ public:
|
||||
virtual void Mdag (const FermionField &in, FermionField &out){assert(0);};
|
||||
|
||||
// half checkerboard operations; leave unimplemented as abstract for now
|
||||
virtual void Meooe (const FermionField &in, FermionField &out){assert(0);};
|
||||
virtual void Mooee (const FermionField &in, FermionField &out){assert(0);};
|
||||
virtual void MooeeInv (const FermionField &in, FermionField &out){assert(0);};
|
||||
virtual void Meooe (const FermionField &in, FermionField &out);
|
||||
virtual void Mooee (const FermionField &in, FermionField &out);
|
||||
virtual void MooeeInv (const FermionField &in, FermionField &out);
|
||||
|
||||
virtual void MeooeDag (const FermionField &in, FermionField &out){assert(0);};
|
||||
virtual void MooeeDag (const FermionField &in, FermionField &out){assert(0);};
|
||||
virtual void MooeeInvDag (const FermionField &in, FermionField &out){assert(0);};
|
||||
virtual void MeooeDag (const FermionField &in, FermionField &out);
|
||||
virtual void MooeeDag (const FermionField &in, FermionField &out);
|
||||
virtual void MooeeInvDag (const FermionField &in, FermionField &out);
|
||||
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
|
||||
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out){assert(0);}; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
|
||||
|
||||
@ -109,6 +109,8 @@ public:
|
||||
void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
|
||||
void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
|
||||
void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
|
||||
void MomentumSpacePropagatorHwQ(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist,
|
||||
std::vector<double> qmu) ;
|
||||
|
||||
// Implement hopping term non-hermitian hopping term; half cb or both
|
||||
// Implement s-diagonal DW
|
||||
@ -117,6 +119,9 @@ public:
|
||||
void DhopOE(const FermionField &in, FermionField &out,int dag);
|
||||
void DhopEO(const FermionField &in, FermionField &out,int dag);
|
||||
|
||||
void DhopComms (const FermionField &in, FermionField &out);
|
||||
void DhopCalc (const FermionField &in, FermionField &out,uint64_t *ids);
|
||||
|
||||
// add a DhopComm
|
||||
// -- suboptimal interface will presently trigger multiple comms.
|
||||
void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
|
||||
@ -135,21 +140,18 @@ public:
|
||||
int dag);
|
||||
|
||||
void DhopInternal(StencilImpl & st,
|
||||
LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out,
|
||||
int dag);
|
||||
|
||||
void DhopInternalOverlappedComms(StencilImpl & st,
|
||||
LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out,
|
||||
int dag);
|
||||
|
||||
void DhopInternalSerialComms(StencilImpl & st,
|
||||
LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out,
|
||||
@ -203,9 +205,6 @@ public:
|
||||
DoubledGaugeField UmuEven;
|
||||
DoubledGaugeField UmuOdd;
|
||||
|
||||
LebesgueOrder Lebesgue;
|
||||
LebesgueOrder LebesgueEvenOdd;
|
||||
|
||||
// Comms buffer
|
||||
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
|
||||
|
||||
|
@ -57,6 +57,10 @@ public:
|
||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||
int interior=1,int exterior=1) ;
|
||||
|
||||
static void DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||
uint64_t *ids);
|
||||
|
||||
static void DhopDagKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||
int interior=1,int exterior=1) ;
|
||||
|
@ -58,7 +58,7 @@ public:
|
||||
{
|
||||
// RealD eps = 1.0;
|
||||
std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
|
||||
Vector<Coeff_t> zgamma(this->Ls);
|
||||
std::vector<Coeff_t> zgamma(this->Ls);
|
||||
for(int s=0;s<this->Ls;s++){
|
||||
zgamma[s] = gamma[s];
|
||||
}
|
||||
|
@ -1,3 +1,5 @@
|
||||
#if 0
|
||||
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
@ -818,3 +820,5 @@ CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
#endif
|
@ -1,3 +1,4 @@
|
||||
#if 0
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
@ -241,3 +242,4 @@ void LebesgueOrder::ZGraph(void)
|
||||
}
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
#endif
|
@ -72,7 +72,7 @@ public:
|
||||
void ThreadInterleave(void);
|
||||
|
||||
private:
|
||||
Vector<IndexInteger> _LebesgueReorder;
|
||||
deviceVector<IndexInteger> _LebesgueReorder;
|
||||
|
||||
};
|
||||
|
@ -48,7 +48,8 @@ CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
|
||||
FourDimGrid,
|
||||
FourDimRedBlackGrid,_M5,p),
|
||||
mass_plus(_mass), mass_minus(_mass)
|
||||
{
|
||||
{
|
||||
// qmu defaults to zero size;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////
|
||||
@ -156,18 +157,18 @@ template<class Impl>
|
||||
void CayleyFermion5D<Impl>::M5D (const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
int Ls=this->Ls;
|
||||
Vector<Coeff_t> diag (Ls,1.0);
|
||||
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus;
|
||||
Vector<Coeff_t> lower(Ls,-1.0); lower[0] =mass_plus;
|
||||
std::vector<Coeff_t> diag (Ls,1.0);
|
||||
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus;
|
||||
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] =mass_plus;
|
||||
M5D(psi,chi,chi,lower,diag,upper);
|
||||
}
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::Meooe5D (const FermionField &psi, FermionField &Din)
|
||||
{
|
||||
int Ls=this->Ls;
|
||||
Vector<Coeff_t> diag = bs;
|
||||
Vector<Coeff_t> upper= cs;
|
||||
Vector<Coeff_t> lower= cs;
|
||||
std::vector<Coeff_t> diag = bs;
|
||||
std::vector<Coeff_t> upper= cs;
|
||||
std::vector<Coeff_t> lower= cs;
|
||||
upper[Ls-1]=-mass_minus*upper[Ls-1];
|
||||
lower[0] =-mass_plus*lower[0];
|
||||
M5D(psi,psi,Din,lower,diag,upper);
|
||||
@ -176,9 +177,9 @@ void CayleyFermion5D<Impl>::Meooe5D (const FermionField &psi, FermionField &D
|
||||
template<class Impl> void CayleyFermion5D<Impl>::Meo5D (const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
int Ls=this->Ls;
|
||||
Vector<Coeff_t> diag = beo;
|
||||
Vector<Coeff_t> upper(Ls);
|
||||
Vector<Coeff_t> lower(Ls);
|
||||
std::vector<Coeff_t> diag = beo;
|
||||
std::vector<Coeff_t> upper(Ls);
|
||||
std::vector<Coeff_t> lower(Ls);
|
||||
for(int i=0;i<Ls;i++) {
|
||||
upper[i]=-ceo[i];
|
||||
lower[i]=-ceo[i];
|
||||
@ -191,9 +192,9 @@ template<class Impl>
|
||||
void CayleyFermion5D<Impl>::Mooee (const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
int Ls=this->Ls;
|
||||
Vector<Coeff_t> diag = bee;
|
||||
Vector<Coeff_t> upper(Ls);
|
||||
Vector<Coeff_t> lower(Ls);
|
||||
std::vector<Coeff_t> diag = bee;
|
||||
std::vector<Coeff_t> upper(Ls);
|
||||
std::vector<Coeff_t> lower(Ls);
|
||||
for(int i=0;i<Ls;i++) {
|
||||
upper[i]=-cee[i];
|
||||
lower[i]=-cee[i];
|
||||
@ -206,9 +207,9 @@ template<class Impl>
|
||||
void CayleyFermion5D<Impl>::MooeeDag (const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
int Ls=this->Ls;
|
||||
Vector<Coeff_t> diag = bee;
|
||||
Vector<Coeff_t> upper(Ls);
|
||||
Vector<Coeff_t> lower(Ls);
|
||||
std::vector<Coeff_t> diag = bee;
|
||||
std::vector<Coeff_t> upper(Ls);
|
||||
std::vector<Coeff_t> lower(Ls);
|
||||
|
||||
for (int s=0;s<Ls;s++){
|
||||
// Assemble the 5d matrix
|
||||
@ -236,9 +237,9 @@ template<class Impl>
|
||||
void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
int Ls=this->Ls;
|
||||
Vector<Coeff_t> diag(Ls,1.0);
|
||||
Vector<Coeff_t> upper(Ls,-1.0);
|
||||
Vector<Coeff_t> lower(Ls,-1.0);
|
||||
std::vector<Coeff_t> diag(Ls,1.0);
|
||||
std::vector<Coeff_t> upper(Ls,-1.0);
|
||||
std::vector<Coeff_t> lower(Ls,-1.0);
|
||||
upper[Ls-1]=-mass_plus*upper[Ls-1];
|
||||
lower[0] =-mass_minus*lower[0];
|
||||
M5Ddag(psi,chi,chi,lower,diag,upper);
|
||||
@ -248,9 +249,9 @@ template<class Impl>
|
||||
void CayleyFermion5D<Impl>::MeooeDag5D (const FermionField &psi, FermionField &Din)
|
||||
{
|
||||
int Ls=this->Ls;
|
||||
Vector<Coeff_t> diag =bs;
|
||||
Vector<Coeff_t> upper=cs;
|
||||
Vector<Coeff_t> lower=cs;
|
||||
std::vector<Coeff_t> diag =bs;
|
||||
std::vector<Coeff_t> upper=cs;
|
||||
std::vector<Coeff_t> lower=cs;
|
||||
|
||||
for (int s=0;s<Ls;s++){
|
||||
if ( s== 0 ) {
|
||||
@ -270,6 +271,34 @@ void CayleyFermion5D<Impl>::MeooeDag5D (const FermionField &psi, FermionField
|
||||
M5Ddag(psi,psi,Din,lower,diag,upper);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::addQmu(const FermionField &psi,FermionField &chi, int dag)
|
||||
{
|
||||
if ( qmu.size() ) {
|
||||
|
||||
Gamma::Algebra Gmu [] = {
|
||||
Gamma::Algebra::GammaX,
|
||||
Gamma::Algebra::GammaY,
|
||||
Gamma::Algebra::GammaZ,
|
||||
Gamma::Algebra::GammaT
|
||||
};
|
||||
std::vector<ComplexD> coeff(Nd);
|
||||
ComplexD ci(0,1);
|
||||
|
||||
assert(qmu.size()==Nd);
|
||||
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
coeff[mu] = ci*qmu[mu];
|
||||
if ( dag ) coeff[mu] = conjugate(coeff[mu]);
|
||||
}
|
||||
|
||||
chi = chi + Gamma(Gmu[0])*psi*coeff[0];
|
||||
for(int mu=1;mu<Nd;mu++){
|
||||
chi = chi + Gamma(Gmu[mu])*psi*coeff[mu];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::M (const FermionField &psi, FermionField &chi)
|
||||
{
|
||||
@ -277,8 +306,12 @@ void CayleyFermion5D<Impl>::M (const FermionField &psi, FermionField &chi)
|
||||
|
||||
// Assemble Din
|
||||
Meooe5D(psi,Din);
|
||||
|
||||
|
||||
this->DW(Din,chi,DaggerNo);
|
||||
|
||||
// add i q_mu gamma_mu here
|
||||
addQmu(Din,chi,DaggerNo);
|
||||
|
||||
// ((b D_W + D_w hop terms +1) on s-diag
|
||||
axpby(chi,1.0,1.0,chi,psi);
|
||||
|
||||
@ -295,6 +328,9 @@ void CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
|
||||
FermionField Din(psi.Grid());
|
||||
// Apply Dw
|
||||
this->DW(psi,Din,DaggerYes);
|
||||
|
||||
// add -i conj(q_mu) gamma_mu here ... if qmu is real, gammm_5 hermitian, otherwise not.
|
||||
addQmu(psi,Din,DaggerYes);
|
||||
|
||||
MeooeDag5D(Din,chi);
|
||||
|
||||
@ -394,7 +430,7 @@ void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
|
||||
{
|
||||
Vector<Coeff_t> gamma(this->Ls);
|
||||
std::vector<Coeff_t> gamma(this->Ls);
|
||||
for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
|
||||
SetCoefficientsInternal(1.0,gamma,b,c);
|
||||
}
|
||||
@ -402,13 +438,13 @@ void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,Re
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
|
||||
{
|
||||
Vector<Coeff_t> gamma(this->Ls);
|
||||
std::vector<Coeff_t> gamma(this->Ls);
|
||||
for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
|
||||
SetCoefficientsInternal(zolo_hi,gamma,b,c);
|
||||
}
|
||||
//Zolo
|
||||
template<class Impl>
|
||||
void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t> & gamma,RealD b,RealD c)
|
||||
void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c)
|
||||
{
|
||||
int Ls=this->Ls;
|
||||
|
||||
@ -488,7 +524,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
|
||||
leem.resize(Ls);
|
||||
uee.resize(Ls);
|
||||
ueem.resize(Ls);
|
||||
|
||||
|
||||
for(int i=0;i<Ls;i++){
|
||||
|
||||
dee[i] = bee[i];
|
||||
@ -529,6 +565,18 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t
|
||||
dee[Ls-1] += delta_d;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////
|
||||
// Device buffers
|
||||
//////////////////////////////////////////
|
||||
d_diag.resize(Ls);
|
||||
d_upper.resize(Ls);
|
||||
d_lower.resize(Ls);
|
||||
|
||||
d_dee.resize(Ls);
|
||||
d_lee.resize(Ls);
|
||||
d_uee.resize(Ls);
|
||||
d_leem.resize(Ls);
|
||||
d_ueem.resize(Ls);
|
||||
// int inv=1;
|
||||
// this->MooeeInternalCompute(0,inv,MatpInv,MatmInv);
|
||||
// this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag);
|
||||
|
@ -43,9 +43,9 @@ void
|
||||
CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
|
||||
const FermionField &phi_i,
|
||||
FermionField &chi_i,
|
||||
Vector<Coeff_t> &lower,
|
||||
Vector<Coeff_t> &diag,
|
||||
Vector<Coeff_t> &upper)
|
||||
std::vector<Coeff_t> &lower,
|
||||
std::vector<Coeff_t> &diag,
|
||||
std::vector<Coeff_t> &upper)
|
||||
{
|
||||
|
||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||
@ -55,12 +55,16 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
|
||||
autoView(chi , chi_i,AcceleratorWrite);
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
auto pdiag = &diag[0];
|
||||
auto pupper = &upper[0];
|
||||
auto plower = &lower[0];
|
||||
|
||||
int Ls =this->Ls;
|
||||
|
||||
acceleratorCopyToDevice(&diag[0] ,&this->d_diag[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&upper[0],&this->d_upper[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&lower[0],&this->d_lower[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
auto pdiag = &d_diag[0];
|
||||
auto pupper = &d_upper[0];
|
||||
auto plower = &d_lower[0];
|
||||
|
||||
// 10 = 3 complex mult + 2 complex add
|
||||
// Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)
|
||||
uint64_t nloop = grid->oSites();
|
||||
@ -82,9 +86,9 @@ void
|
||||
CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
|
||||
const FermionField &phi_i,
|
||||
FermionField &chi_i,
|
||||
Vector<Coeff_t> &lower,
|
||||
Vector<Coeff_t> &diag,
|
||||
Vector<Coeff_t> &upper)
|
||||
std::vector<Coeff_t> &lower,
|
||||
std::vector<Coeff_t> &diag,
|
||||
std::vector<Coeff_t> &upper)
|
||||
{
|
||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||
GridBase *grid=psi_i.Grid();
|
||||
@ -93,12 +97,16 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
|
||||
autoView(chi , chi_i,AcceleratorWrite);
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
auto pdiag = &diag[0];
|
||||
auto pupper = &upper[0];
|
||||
auto plower = &lower[0];
|
||||
|
||||
int Ls=this->Ls;
|
||||
|
||||
acceleratorCopyToDevice(&diag[0] ,&this->d_diag[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&upper[0],&this->d_upper[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&lower[0],&this->d_lower[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
auto pdiag = &d_diag[0];
|
||||
auto pupper = &d_upper[0];
|
||||
auto plower = &d_lower[0];
|
||||
|
||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||
uint64_t nloop = grid->oSites();
|
||||
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
||||
@ -126,11 +134,17 @@ CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi_i, FermionField &chi
|
||||
|
||||
int Ls=this->Ls;
|
||||
|
||||
auto plee = & lee [0];
|
||||
auto pdee = & dee [0];
|
||||
auto puee = & uee [0];
|
||||
auto pleem = & leem[0];
|
||||
auto pueem = & ueem[0];
|
||||
acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
auto plee = & d_lee [0];
|
||||
auto pdee = & d_dee [0];
|
||||
auto puee = & d_uee [0];
|
||||
auto pleem = & d_leem[0];
|
||||
auto pueem = & d_ueem[0];
|
||||
|
||||
uint64_t nloop = grid->oSites()/Ls;
|
||||
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
||||
@ -182,11 +196,17 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
|
||||
autoView(psi , psi_i,AcceleratorRead);
|
||||
autoView(chi , chi_i,AcceleratorWrite);
|
||||
|
||||
auto plee = & lee [0];
|
||||
auto pdee = & dee [0];
|
||||
auto puee = & uee [0];
|
||||
auto pleem = & leem[0];
|
||||
auto pueem = & ueem[0];
|
||||
acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
auto plee = & d_lee [0];
|
||||
auto pdee = & d_dee [0];
|
||||
auto puee = & d_uee [0];
|
||||
auto pleem = & d_leem[0];
|
||||
auto pueem = & d_ueem[0];
|
||||
|
||||
assert(psi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
|
@ -0,0 +1,376 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion5DImplementation.h
|
||||
|
||||
Copyright (C) 2017 - 2025
|
||||
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
||||
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
|
||||
Author: Christoph Lehner <christoph@lhnr.de>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#include <Grid/Grid.h>
|
||||
#include <Grid/qcd/spin/Dirac.h>
|
||||
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h>
|
||||
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
template<class Impl, class CloverHelpers>
|
||||
CompactWilsonCloverFermion5D<Impl, CloverHelpers>::CompactWilsonCloverFermion5D(GaugeField& _Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
const RealD _mass,
|
||||
const RealD _csw_r,
|
||||
const RealD _csw_t,
|
||||
const RealD _cF,
|
||||
const ImplParams& impl_p)
|
||||
: WilsonBase(_Umu, FiveDimGrid, FiveDimRedBlackGrid, FourDimGrid, FourDimRedBlackGrid, _mass, impl_p)
|
||||
, csw_r(_csw_r)
|
||||
, csw_t(_csw_t)
|
||||
, cF(_cF)
|
||||
, fixedBoundaries(impl_p.boundary_phases[Nd-1] == 0.0)
|
||||
, Diagonal(&FourDimGrid), Triangle(&FourDimGrid)
|
||||
, DiagonalEven(&FourDimRedBlackGrid), TriangleEven(&FourDimRedBlackGrid)
|
||||
, DiagonalOdd(&FourDimRedBlackGrid), TriangleOdd(&FourDimRedBlackGrid)
|
||||
, DiagonalInv(&FourDimGrid), TriangleInv(&FourDimGrid)
|
||||
, DiagonalInvEven(&FourDimRedBlackGrid), TriangleInvEven(&FourDimRedBlackGrid)
|
||||
, DiagonalInvOdd(&FourDimRedBlackGrid), TriangleInvOdd(&FourDimRedBlackGrid)
|
||||
, Tmp(&FiveDimGrid)
|
||||
, BoundaryMask(&FiveDimGrid)
|
||||
, BoundaryMaskEven(&FiveDimRedBlackGrid), BoundaryMaskOdd(&FiveDimRedBlackGrid)
|
||||
{
|
||||
assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3);
|
||||
|
||||
csw_r *= 0.5;
|
||||
csw_t *= 0.5;
|
||||
//if (clover_anisotropy.isAnisotropic)
|
||||
// csw_r /= clover_anisotropy.xi_0;
|
||||
|
||||
ImportGauge(_Umu);
|
||||
if (fixedBoundaries) {
|
||||
this->BoundaryMaskEven.Checkerboard() = Even;
|
||||
this->BoundaryMaskOdd.Checkerboard() = Odd;
|
||||
CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
|
||||
}
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
|
||||
WilsonBase::Dhop(in, out, dag);
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
|
||||
WilsonBase::DhopOE(in, out, dag);
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
|
||||
WilsonBase::DhopEO(in, out, dag);
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
|
||||
WilsonBase::DhopDir(in, out, dir, disp);
|
||||
if(this->fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
|
||||
WilsonBase::DhopDirAll(in, out);
|
||||
if(this->fixedBoundaries) {
|
||||
for(auto& o : out) ApplyBoundaryMask(o);
|
||||
}
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::M(const FermionField& in, FermionField& out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
|
||||
Mooee(in, Tmp);
|
||||
axpy(out, 1.0, out, Tmp);
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mdag(const FermionField& in, FermionField& out) {
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
WilsonBase::Dhop(in, out, DaggerYes); // call base to save applying bc
|
||||
MooeeDag(in, Tmp);
|
||||
axpy(out, 1.0, out, Tmp);
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
|
||||
WilsonBase::Meooe(in, out);
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
|
||||
WilsonBase::MeooeDag(in, out);
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mooee(const FermionField& in, FermionField& out) {
|
||||
if(in.Grid()->_isCheckerBoarded) {
|
||||
if(in.Checkerboard() == Odd) {
|
||||
MooeeInternal(in, out, DiagonalOdd, TriangleOdd);
|
||||
} else {
|
||||
MooeeInternal(in, out, DiagonalEven, TriangleEven);
|
||||
}
|
||||
} else {
|
||||
MooeeInternal(in, out, Diagonal, Triangle);
|
||||
}
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeDag(const FermionField& in, FermionField& out) {
|
||||
Mooee(in, out); // blocks are hermitian
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInv(const FermionField& in, FermionField& out) {
|
||||
if(in.Grid()->_isCheckerBoarded) {
|
||||
if(in.Checkerboard() == Odd) {
|
||||
MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd);
|
||||
} else {
|
||||
MooeeInternal(in, out, DiagonalInvEven, TriangleInvEven);
|
||||
}
|
||||
} else {
|
||||
MooeeInternal(in, out, DiagonalInv, TriangleInv);
|
||||
}
|
||||
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInvDag(const FermionField& in, FermionField& out) {
|
||||
MooeeInv(in, out); // blocks are hermitian
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
|
||||
DhopDir(in, out, dir, disp);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
|
||||
DhopDirAll(in, out);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
|
||||
assert(!fixedBoundaries); // TODO check for changes required for open bc
|
||||
|
||||
// NOTE: code copied from original clover term
|
||||
conformable(X.Grid(), Y.Grid());
|
||||
conformable(X.Grid(), force.Grid());
|
||||
GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
|
||||
GaugeField clover_force(force.Grid());
|
||||
PropagatorField Lambda(force.Grid());
|
||||
|
||||
// Guido: Here we are hitting some performance issues:
|
||||
// need to extract the components of the DoubledGaugeField
|
||||
// for each call
|
||||
// Possible solution
|
||||
// Create a vector object to store them? (cons: wasting space)
|
||||
std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
|
||||
|
||||
Impl::extractLinkField(U, this->Umu);
|
||||
|
||||
force = Zero();
|
||||
// Derivative of the Wilson hopping term
|
||||
this->DhopDeriv(force, X, Y, dag);
|
||||
|
||||
///////////////////////////////////////////////////////////
|
||||
// Clover term derivative
|
||||
///////////////////////////////////////////////////////////
|
||||
Impl::outerProductImpl(Lambda, X, Y);
|
||||
//std::cout << "Lambda:" << Lambda << std::endl;
|
||||
|
||||
Gamma::Algebra sigma[] = {
|
||||
Gamma::Algebra::SigmaXY,
|
||||
Gamma::Algebra::SigmaXZ,
|
||||
Gamma::Algebra::SigmaXT,
|
||||
Gamma::Algebra::MinusSigmaXY,
|
||||
Gamma::Algebra::SigmaYZ,
|
||||
Gamma::Algebra::SigmaYT,
|
||||
Gamma::Algebra::MinusSigmaXZ,
|
||||
Gamma::Algebra::MinusSigmaYZ,
|
||||
Gamma::Algebra::SigmaZT,
|
||||
Gamma::Algebra::MinusSigmaXT,
|
||||
Gamma::Algebra::MinusSigmaYT,
|
||||
Gamma::Algebra::MinusSigmaZT};
|
||||
|
||||
/*
|
||||
sigma_{\mu \nu}=
|
||||
| 0 sigma[0] sigma[1] sigma[2] |
|
||||
| sigma[3] 0 sigma[4] sigma[5] |
|
||||
| sigma[6] sigma[7] 0 sigma[8] |
|
||||
| sigma[9] sigma[10] sigma[11] 0 |
|
||||
*/
|
||||
|
||||
int count = 0;
|
||||
clover_force = Zero();
|
||||
for (int mu = 0; mu < 4; mu++)
|
||||
{
|
||||
force_mu = Zero();
|
||||
for (int nu = 0; nu < 4; nu++)
|
||||
{
|
||||
if (mu == nu)
|
||||
continue;
|
||||
|
||||
RealD factor;
|
||||
if (nu == 4 || mu == 4)
|
||||
{
|
||||
factor = 2.0 * csw_t;
|
||||
}
|
||||
else
|
||||
{
|
||||
factor = 2.0 * csw_r;
|
||||
}
|
||||
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
|
||||
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
|
||||
force_mu -= factor*CloverHelpers::Cmunu(U, lambda, mu, nu); // checked
|
||||
count++;
|
||||
}
|
||||
|
||||
pokeLorentz(clover_force, U[mu] * force_mu, mu);
|
||||
}
|
||||
//clover_force *= csw;
|
||||
force += clover_force;
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
|
||||
assert(0);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
|
||||
assert(0);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::MooeeInternal(const FermionField& in,
|
||||
FermionField& out,
|
||||
const CloverDiagonalField& diagonal,
|
||||
const CloverTriangleField& triangle) {
|
||||
assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
conformable(in, out);
|
||||
CompactHelpers::MooeeKernel(diagonal.oSites(), this->Ls, in, out, diagonal, triangle);
|
||||
}
|
||||
|
||||
template<class Impl, class CloverHelpers>
|
||||
void CompactWilsonCloverFermion5D<Impl, CloverHelpers>::ImportGauge(const GaugeField& _Umu) {
|
||||
// NOTE: parts copied from original implementation
|
||||
|
||||
// Import gauge into base class
|
||||
double t0 = usecond();
|
||||
WilsonBase::ImportGauge(_Umu); // NOTE: called here and in wilson constructor -> performed twice, but can't avoid that
|
||||
|
||||
// Initialize temporary variables
|
||||
double t1 = usecond();
|
||||
conformable(_Umu.Grid(), this->GaugeGrid());
|
||||
GridBase* grid = _Umu.Grid();
|
||||
typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
|
||||
CloverField TmpOriginal(grid);
|
||||
CloverField TmpInverse(grid);
|
||||
|
||||
// Compute the field strength terms mu>nu
|
||||
double t2 = usecond();
|
||||
WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
|
||||
WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
|
||||
WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
|
||||
WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
|
||||
WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
|
||||
WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
|
||||
|
||||
// Compute the Clover Operator acting on Colour and Spin
|
||||
// multiply here by the clover coefficients for the anisotropy
|
||||
double t3 = usecond();
|
||||
TmpOriginal = Helpers::fillCloverYZ(Bx) * csw_r;
|
||||
TmpOriginal += Helpers::fillCloverXZ(By) * csw_r;
|
||||
TmpOriginal += Helpers::fillCloverXY(Bz) * csw_r;
|
||||
TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
|
||||
TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
|
||||
TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
|
||||
|
||||
// Instantiate the clover term
|
||||
// - In case of the standard clover the mass term is added
|
||||
// - In case of the exponential clover the clover term is exponentiated
|
||||
double t4 = usecond();
|
||||
CloverHelpers::InstantiateClover(TmpOriginal, TmpInverse, csw_t, 4.0 + this->M5 /*this->diag_mass*/);
|
||||
|
||||
// Convert the data layout of the clover term
|
||||
double t5 = usecond();
|
||||
CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
|
||||
|
||||
// Modify the clover term at the temporal boundaries in case of open boundary conditions
|
||||
double t6 = usecond();
|
||||
if(fixedBoundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, 4.0 + this->M5 /*this->diag_mass*/);
|
||||
|
||||
// Invert the Clover term
|
||||
// In case of the exponential clover with (anti-)periodic boundary conditions exp(-Clover) saved
|
||||
// in TmpInverse can be used. In all other cases the clover term has to be explictly inverted.
|
||||
// TODO: For now this inversion is explictly done on the CPU
|
||||
double t7 = usecond();
|
||||
CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, fixedBoundaries);
|
||||
|
||||
// Fill the remaining clover fields
|
||||
double t8 = usecond();
|
||||
pickCheckerboard(Even, DiagonalEven, Diagonal);
|
||||
pickCheckerboard(Even, TriangleEven, Triangle);
|
||||
pickCheckerboard(Odd, DiagonalOdd, Diagonal);
|
||||
pickCheckerboard(Odd, TriangleOdd, Triangle);
|
||||
pickCheckerboard(Even, DiagonalInvEven, DiagonalInv);
|
||||
pickCheckerboard(Even, TriangleInvEven, TriangleInv);
|
||||
pickCheckerboard(Odd, DiagonalInvOdd, DiagonalInv);
|
||||
pickCheckerboard(Odd, TriangleInvOdd, TriangleInv);
|
||||
|
||||
// Report timings
|
||||
double t9 = usecond();
|
||||
|
||||
std::cout << GridLogDebug << "CompactWilsonCloverFermion5D::ImportGauge timings:" << std::endl;
|
||||
std::cout << GridLogDebug << "WilsonFermion::Importgauge = " << (t1 - t0) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "allocations = " << (t2 - t1) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "field strength = " << (t3 - t2) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "fill clover = " << (t4 - t3) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "instantiate clover = " << (t5 - t4) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "convert layout = " << (t6 - t5) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "modify boundaries = " << (t7 - t6) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "invert clover = " << (t8 - t7) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "pick cbs = " << (t9 - t8) / 1e6 << std::endl;
|
||||
std::cout << GridLogDebug << "total = " << (t9 - t0) / 1e6 << std::endl;
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -42,13 +42,13 @@ template<class Impl>
|
||||
void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata)
|
||||
{
|
||||
// How to check Ls matches??
|
||||
// std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
|
||||
// std::cout<<GridLogMessage << zdata->n << " - n"<<std::endl;
|
||||
// std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
|
||||
// std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
|
||||
// std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
|
||||
// std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
|
||||
std::cout<<GridLogMessage << zdata->n << " - n"<<std::endl;
|
||||
std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
|
||||
std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
|
||||
std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
|
||||
std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
|
||||
int Ls = this->Ls;
|
||||
std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
|
||||
assert(zdata->db==Ls);// Beta has Ls coeffs
|
||||
|
||||
R=(1+this->mass)/(1-this->mass);
|
||||
@ -320,7 +320,7 @@ ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
|
||||
int Ls = this->Ls;
|
||||
conformable(solution5d.Grid(),this->FermionGrid());
|
||||
conformable(exported4d.Grid(),this->GaugeGrid());
|
||||
ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
|
||||
ExtractSlice(exported4d, solution5d, Ls-1, 0);
|
||||
}
|
||||
template<class Impl>
|
||||
void ContinuedFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
|
||||
@ -330,7 +330,7 @@ ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
|
||||
conformable(input4d.Grid() ,this->GaugeGrid());
|
||||
FermionField tmp(this->FermionGrid());
|
||||
tmp=Zero();
|
||||
InsertSlice(input4d, tmp, Ls-1, Ls-1);
|
||||
InsertSlice(input4d, tmp, Ls-1, 0);
|
||||
tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
|
||||
this->Dminus(tmp,imported5d);
|
||||
}
|
||||
|
@ -41,7 +41,7 @@ NAMESPACE_BEGIN(Grid);
|
||||
// Pplus backwards..
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i,
|
||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
|
||||
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
int Ls = this->Ls;
|
||||
@ -50,9 +50,15 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
|
||||
autoView( psi , psi_i, AcceleratorRead);
|
||||
autoView( chi , chi_i, AcceleratorWrite);
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
auto pdiag = &diag[0];
|
||||
auto pupper = &upper[0];
|
||||
auto plower = &lower[0];
|
||||
|
||||
auto pdiag = &this->d_diag[0];
|
||||
auto pupper = &this->d_upper[0];
|
||||
auto plower = &this->d_lower[0];
|
||||
|
||||
acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||
|
||||
auto nloop=grid->oSites()/Ls;
|
||||
@ -73,7 +79,7 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
|
||||
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i,
|
||||
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
|
||||
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase* grid = psi_i.Grid();
|
||||
@ -83,9 +89,14 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
|
||||
autoView( phi , phi_i, AcceleratorRead);
|
||||
autoView( chi , chi_i, AcceleratorWrite);
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
auto pdiag = &diag[0];
|
||||
auto pupper = &upper[0];
|
||||
auto plower = &lower[0];
|
||||
|
||||
auto pdiag = &this->d_diag[0];
|
||||
auto pupper = &this->d_upper[0];
|
||||
auto plower = &this->d_lower[0];
|
||||
|
||||
acceleratorCopyToDevice(&diag[0] ,&pdiag[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||
|
||||
@ -114,12 +125,17 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
|
||||
autoView( chi, chi_i, AcceleratorWrite);
|
||||
int Ls = this->Ls;
|
||||
|
||||
auto plee = & this->lee[0];
|
||||
auto pdee = & this->dee[0];
|
||||
auto puee = & this->uee[0];
|
||||
|
||||
auto pleem = & this->leem[0];
|
||||
auto pueem = & this->ueem[0];
|
||||
auto plee = & this->d_lee [0];
|
||||
auto pdee = & this->d_dee [0];
|
||||
auto puee = & this->d_uee [0];
|
||||
auto pleem = & this->d_leem[0];
|
||||
auto pueem = & this->d_ueem[0];
|
||||
|
||||
acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
uint64_t nloop=grid->oSites()/Ls;
|
||||
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
||||
|
@ -131,9 +131,9 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi
|
||||
else{ shiftm = -shift*(mq3-mq2); }
|
||||
}
|
||||
|
||||
Vector<Coeff_t> diag(Ls,1.0);
|
||||
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm;
|
||||
Vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftp;
|
||||
std::vector<Coeff_t> diag(Ls,1.0);
|
||||
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm;
|
||||
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftp;
|
||||
|
||||
#if(0)
|
||||
std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl;
|
||||
@ -168,9 +168,9 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField&
|
||||
else{ shiftm = -shift*(mq3-mq2); }
|
||||
}
|
||||
|
||||
Vector<Coeff_t> diag(Ls,1.0);
|
||||
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp;
|
||||
Vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftm;
|
||||
std::vector<Coeff_t> diag(Ls,1.0);
|
||||
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp;
|
||||
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftm;
|
||||
|
||||
this->M5Ddag(psi, chi, chi, lower, diag, upper);
|
||||
}
|
||||
@ -181,9 +181,9 @@ void DomainWallEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& c
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
Vector<Coeff_t> diag = this->bee;
|
||||
Vector<Coeff_t> upper(Ls);
|
||||
Vector<Coeff_t> lower(Ls);
|
||||
std::vector<Coeff_t> diag = this->bee;
|
||||
std::vector<Coeff_t> upper(Ls);
|
||||
std::vector<Coeff_t> lower(Ls);
|
||||
|
||||
for(int s=0; s<Ls; s++){
|
||||
upper[s] = -this->cee[s];
|
||||
@ -200,9 +200,9 @@ void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
Vector<Coeff_t> diag = this->bee;
|
||||
Vector<Coeff_t> upper(Ls);
|
||||
Vector<Coeff_t> lower(Ls);
|
||||
std::vector<Coeff_t> diag = this->bee;
|
||||
std::vector<Coeff_t> upper(Ls);
|
||||
std::vector<Coeff_t> lower(Ls);
|
||||
|
||||
for(int s=0; s<Ls; s++){
|
||||
upper[s] = -this->cee[s];
|
||||
@ -218,7 +218,7 @@ void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField
|
||||
|
||||
//Zolo
|
||||
template<class Impl>
|
||||
void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c)
|
||||
void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
int pm = this->pm;
|
||||
|
@ -61,8 +61,6 @@ ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian
|
||||
UUUmu(&FourDimGrid),
|
||||
UUUmuEven(&FourDimRedBlackGrid),
|
||||
UUUmuOdd(&FourDimRedBlackGrid),
|
||||
Lebesgue(&FourDimGrid),
|
||||
LebesgueEvenOdd(&FourDimRedBlackGrid),
|
||||
_tmp(&FiveDimRedBlackGrid)
|
||||
{
|
||||
|
||||
@ -277,18 +275,18 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
|
||||
|
||||
/*CHANGE */
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
||||
void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st,
|
||||
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
||||
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
|
||||
DhopInternalOverlappedComms(st,U,UUU,in,out,dag);
|
||||
else
|
||||
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
|
||||
DhopInternalSerialComms(st,U,UUU,in,out,dag);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
|
||||
void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
|
||||
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
@ -313,7 +311,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
|
||||
{
|
||||
int interior=1;
|
||||
int exterior=0;
|
||||
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
||||
Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
|
||||
}
|
||||
|
||||
st.CommsMerge(compressor);
|
||||
@ -323,12 +321,12 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
|
||||
{
|
||||
int interior=0;
|
||||
int exterior=1;
|
||||
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
||||
Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
|
||||
}
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
|
||||
void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
|
||||
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
@ -341,7 +339,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
|
||||
{
|
||||
int interior=1;
|
||||
int exterior=1;
|
||||
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
||||
Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
|
||||
}
|
||||
}
|
||||
/*CHANGE END*/
|
||||
@ -357,7 +355,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie
|
||||
assert(in.Checkerboard()==Even);
|
||||
out.Checkerboard() = Odd;
|
||||
|
||||
DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,UUUmuOdd,in,out,dag);
|
||||
DhopInternal(StencilEven,UmuOdd,UUUmuOdd,in,out,dag);
|
||||
}
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
|
||||
@ -368,7 +366,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie
|
||||
assert(in.Checkerboard()==Odd);
|
||||
out.Checkerboard() = Even;
|
||||
|
||||
DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,UUUmuEven,in,out,dag);
|
||||
DhopInternal(StencilOdd,UmuEven,UUUmuEven,in,out,dag);
|
||||
}
|
||||
template<class Impl>
|
||||
void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
|
||||
@ -378,7 +376,7 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField
|
||||
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
|
||||
DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
|
||||
DhopInternal(Stencil,Umu,UUUmu,in,out,dag);
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
|
@ -48,8 +48,6 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, G
|
||||
StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even
|
||||
StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd
|
||||
mass(_mass),
|
||||
Lebesgue(_grid),
|
||||
LebesgueEvenOdd(_cbgrid),
|
||||
Umu(&Fgrid),
|
||||
UmuEven(&Hgrid),
|
||||
UmuOdd(&Hgrid),
|
||||
@ -339,7 +337,7 @@ void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &
|
||||
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
|
||||
DhopInternal(Stencil, Lebesgue, Umu, UUUmu, in, out, dag);
|
||||
DhopInternal(Stencil, Umu, UUUmu, in, out, dag);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
@ -351,7 +349,7 @@ void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField
|
||||
assert(in.Checkerboard() == Even);
|
||||
out.Checkerboard() = Odd;
|
||||
|
||||
DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, UUUmuOdd, in, out, dag);
|
||||
DhopInternal(StencilEven, UmuOdd, UUUmuOdd, in, out, dag);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
@ -363,7 +361,7 @@ void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField
|
||||
assert(in.Checkerboard() == Odd);
|
||||
out.Checkerboard() = Even;
|
||||
|
||||
DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, UUUmuEven, in, out, dag);
|
||||
DhopInternal(StencilOdd, UmuEven, UUUmuEven, in, out, dag);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
@ -394,19 +392,19 @@ void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionFiel
|
||||
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
|
||||
void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st,
|
||||
DoubledGaugeField &U,
|
||||
DoubledGaugeField &UUU,
|
||||
const FermionField &in,
|
||||
FermionField &out, int dag)
|
||||
{
|
||||
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
||||
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
|
||||
DhopInternalOverlappedComms(st,U,UUU,in,out,dag);
|
||||
else
|
||||
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
|
||||
DhopInternalSerialComms(st,U,UUU,in,out,dag);
|
||||
}
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
|
||||
void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,
|
||||
DoubledGaugeField &U,
|
||||
DoubledGaugeField &UUU,
|
||||
const FermionField &in,
|
||||
@ -429,7 +427,7 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
|
||||
{
|
||||
int interior=1;
|
||||
int exterior=0;
|
||||
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
||||
Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
|
||||
}
|
||||
|
||||
st.CommunicateComplete(requests);
|
||||
@ -440,13 +438,13 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
|
||||
{
|
||||
int interior=0;
|
||||
int exterior=1;
|
||||
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
||||
Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <class Impl>
|
||||
void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
|
||||
void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st,
|
||||
DoubledGaugeField &U,
|
||||
DoubledGaugeField &UUU,
|
||||
const FermionField &in,
|
||||
@ -460,7 +458,7 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
|
||||
{
|
||||
int interior=1;
|
||||
int exterior=1;
|
||||
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
|
||||
Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -39,7 +39,7 @@ NAMESPACE_BEGIN(Grid);
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
|
||||
Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper)
|
||||
std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase *grid = psi_i.Grid();
|
||||
@ -50,10 +50,14 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
|
||||
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
auto pdiag = &diag[0];
|
||||
auto pupper = &upper[0];
|
||||
auto plower = &lower[0];
|
||||
auto pdiag = &this->d_diag[0];
|
||||
auto pupper = &this->d_upper[0];
|
||||
auto plower = &this->d_lower[0];
|
||||
|
||||
acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||
int nloop = grid->oSites()/Ls;
|
||||
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
||||
@ -74,8 +78,8 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
|
||||
Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper,
|
||||
Vector<Coeff_t> &shift_coeffs)
|
||||
std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
|
||||
std::vector<Coeff_t> &shift_coeffs)
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase *grid = psi_i.Grid();
|
||||
@ -86,13 +90,18 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
|
||||
|
||||
auto pm = this->pm;
|
||||
int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
|
||||
|
||||
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
auto pdiag = &diag[0];
|
||||
auto pupper = &upper[0];
|
||||
auto plower = &lower[0];
|
||||
auto pshift_coeffs = &shift_coeffs[0];
|
||||
auto pdiag = &this->d_diag[0];
|
||||
auto pupper = &this->d_upper[0];
|
||||
auto plower = &this->d_lower[0];
|
||||
auto pshift_coeffs = &this->d_shift_coefficients[0];
|
||||
|
||||
acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&shift_coeffs[0],&pshift_coeffs[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||
int nloop = grid->oSites()/Ls;
|
||||
@ -119,7 +128,7 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
|
||||
Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper)
|
||||
std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase *grid = psi_i.Grid();
|
||||
@ -129,10 +138,14 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
|
||||
autoView(chi , chi_i, AcceleratorWrite);
|
||||
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
auto pdiag = &this->d_diag[0];
|
||||
auto pupper = &this->d_upper[0];
|
||||
auto plower = &this->d_lower[0];
|
||||
|
||||
auto pdiag = &diag[0];
|
||||
auto pupper = &upper[0];
|
||||
auto plower = &lower[0];
|
||||
acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||
int nloop = grid->oSites()/Ls;
|
||||
@ -154,8 +167,8 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
|
||||
|
||||
template<class Impl>
|
||||
void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
|
||||
Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper,
|
||||
Vector<Coeff_t> &shift_coeffs)
|
||||
std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
|
||||
std::vector<Coeff_t> &shift_coeffs)
|
||||
{
|
||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||
GridBase *grid = psi_i.Grid();
|
||||
@ -167,11 +180,16 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
|
||||
|
||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||
|
||||
auto pdiag = &diag[0];
|
||||
auto pupper = &upper[0];
|
||||
auto plower = &lower[0];
|
||||
auto pshift_coeffs = &shift_coeffs[0];
|
||||
auto pdiag = &this->d_diag[0];
|
||||
auto pupper = &this->d_upper[0];
|
||||
auto plower = &this->d_lower[0];
|
||||
auto pshift_coeffs = &this->d_shift_coefficients[0];
|
||||
|
||||
acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&shift_coeffs[0],&pshift_coeffs[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
// Flops = 6.0*(Nc*Ns) *Ls*vol
|
||||
auto pm = this->pm;
|
||||
|
||||
@ -212,11 +230,17 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
|
||||
autoView(psi , psi_i, AcceleratorRead);
|
||||
autoView(chi , chi_i, AcceleratorWrite);
|
||||
|
||||
auto plee = & this->lee [0];
|
||||
auto pdee = & this->dee [0];
|
||||
auto puee = & this->uee [0];
|
||||
auto pleem= & this->leem[0];
|
||||
auto pueem= & this->ueem[0];
|
||||
auto plee = & this->d_lee [0];
|
||||
auto pdee = & this->d_dee [0];
|
||||
auto puee = & this->d_uee [0];
|
||||
auto pleem = & this->d_leem[0];
|
||||
auto pueem = & this->d_ueem[0];
|
||||
|
||||
acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; }
|
||||
|
||||
@ -268,14 +292,23 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
|
||||
autoView(psi , psi_i, AcceleratorRead);
|
||||
autoView(chi , chi_i, AcceleratorWrite);
|
||||
|
||||
// Move into object and constructor
|
||||
auto pm = this->pm;
|
||||
auto plee = & this->lee [0];
|
||||
auto pdee = & this->dee [0];
|
||||
auto puee = & this->uee [0];
|
||||
auto pleem= & this->leem[0];
|
||||
auto pueem= & this->ueem[0];
|
||||
auto pMooeeInv_shift_lc = &MooeeInv_shift_lc[0];
|
||||
auto pMooeeInv_shift_norm = &MooeeInv_shift_norm[0];
|
||||
auto plee = & this->d_lee [0];
|
||||
auto pdee = & this->d_dee [0];
|
||||
auto puee = & this->d_uee [0];
|
||||
auto pleem = & this->d_leem[0];
|
||||
auto pueem = & this->d_ueem[0];
|
||||
auto pMooeeInv_shift_lc = &this->d_MooeeInv_shift_lc[0];
|
||||
auto pMooeeInv_shift_norm = &this->d_MooeeInv_shift_norm[0];
|
||||
|
||||
acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&MooeeInv_shift_lc[0],&pMooeeInv_shift_lc[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&MooeeInv_shift_norm[0],&pMooeeInv_shift_norm[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
int nloop = grid->oSites()/Ls;
|
||||
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
||||
@ -333,11 +366,17 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
|
||||
autoView(psi , psi_i, AcceleratorRead);
|
||||
autoView(chi , chi_i, AcceleratorWrite);
|
||||
|
||||
auto plee = & this->lee [0];
|
||||
auto pdee = & this->dee [0];
|
||||
auto puee = & this->uee [0];
|
||||
auto pleem= & this->leem[0];
|
||||
auto pueem= & this->ueem[0];
|
||||
auto plee = &this->d_lee [0];
|
||||
auto pdee = &this->d_dee [0];
|
||||
auto puee = &this->d_uee [0];
|
||||
auto pleem = &this->d_leem[0];
|
||||
auto pueem = &this->d_ueem[0];
|
||||
|
||||
acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
int nloop = grid->oSites()/Ls;
|
||||
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
||||
@ -387,13 +426,25 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
|
||||
int Ls = this->Ls;
|
||||
|
||||
auto pm = this->pm;
|
||||
auto plee = & this->lee [0];
|
||||
auto pdee = & this->dee [0];
|
||||
auto puee = & this->uee [0];
|
||||
auto pleem= & this->leem[0];
|
||||
auto pueem= & this->ueem[0];
|
||||
auto pMooeeInvDag_shift_lc = &MooeeInvDag_shift_lc[0];
|
||||
auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0];
|
||||
auto plee = & this->d_lee [0];
|
||||
auto pdee = & this->d_dee [0];
|
||||
auto puee = & this->d_uee [0];
|
||||
auto pleem = & this->d_leem[0];
|
||||
auto pueem = & this->d_ueem[0];
|
||||
|
||||
auto pMooeeInvDag_shift_lc = &this->d_MooeeInv_shift_lc[0];
|
||||
auto pMooeeInvDag_shift_norm = &this->d_MooeeInv_shift_norm[0];
|
||||
|
||||
acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&MooeeInvDag_shift_lc[0],&pMooeeInvDag_shift_lc[0],Ls*sizeof(Coeff_t));
|
||||
acceleratorCopyToDevice(&MooeeInvDag_shift_norm[0],&pMooeeInvDag_shift_norm[0],Ls*sizeof(Coeff_t));
|
||||
|
||||
// auto pMooeeInvDag_shift_lc = &MooeeInvDag_shift_lc[0];
|
||||
// auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0];
|
||||
|
||||
int nloop = grid->oSites()/Ls;
|
||||
accelerator_for(sss,nloop,Simd::Nsimd(),{
|
||||
|
@ -196,9 +196,9 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
Vector<Coeff_t> diag(Ls,1.0);
|
||||
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
|
||||
Vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;
|
||||
std::vector<Coeff_t> diag(Ls,1.0);
|
||||
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
|
||||
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;
|
||||
|
||||
// no shift term
|
||||
if(this->shift == 0.0){ this->M5D(psi, chi, chi, lower, diag, upper); }
|
||||
@ -212,9 +212,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
Vector<Coeff_t> diag(Ls,1.0);
|
||||
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
|
||||
Vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;
|
||||
std::vector<Coeff_t> diag(Ls,1.0);
|
||||
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
|
||||
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;
|
||||
|
||||
// no shift term
|
||||
if(this->shift == 0.0){ this->M5Ddag(psi, chi, chi, lower, diag, upper); }
|
||||
@ -230,9 +230,9 @@ void MobiusEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
|
||||
int Ls = this->Ls;
|
||||
|
||||
// coefficients of Mooee
|
||||
Vector<Coeff_t> diag = this->bee;
|
||||
Vector<Coeff_t> upper(Ls);
|
||||
Vector<Coeff_t> lower(Ls);
|
||||
std::vector<Coeff_t> diag = this->bee;
|
||||
std::vector<Coeff_t> upper(Ls);
|
||||
std::vector<Coeff_t> lower(Ls);
|
||||
for(int s=0; s<Ls; s++){
|
||||
upper[s] = -this->cee[s];
|
||||
lower[s] = -this->cee[s];
|
||||
@ -253,9 +253,9 @@ void MobiusEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& ch
|
||||
int Ls = this->Ls;
|
||||
|
||||
// coefficients of MooeeDag
|
||||
Vector<Coeff_t> diag = this->bee;
|
||||
Vector<Coeff_t> upper(Ls);
|
||||
Vector<Coeff_t> lower(Ls);
|
||||
std::vector<Coeff_t> diag = this->bee;
|
||||
std::vector<Coeff_t> upper(Ls);
|
||||
std::vector<Coeff_t> lower(Ls);
|
||||
for(int s=0; s<Ls; s++){
|
||||
if(s==0) {
|
||||
upper[s] = -this->cee[s+1];
|
||||
@ -314,10 +314,10 @@ void MobiusEOFAFermion<Impl>::SetCoefficientsPrecondShiftOps()
|
||||
// Tridiagonal solve for MooeeInvDag_shift_lc
|
||||
{
|
||||
Coeff_t m(0.0);
|
||||
Vector<Coeff_t> d = Mooee_shift;
|
||||
Vector<Coeff_t> u(Ls,0.0);
|
||||
Vector<Coeff_t> y(Ls,0.0);
|
||||
Vector<Coeff_t> q(Ls,0.0);
|
||||
std::vector<Coeff_t> d = Mooee_shift;
|
||||
std::vector<Coeff_t> u(Ls,0.0);
|
||||
std::vector<Coeff_t> y(Ls,0.0);
|
||||
std::vector<Coeff_t> q(Ls,0.0);
|
||||
if(pm == 1){ u[0] = 1.0; }
|
||||
else{ u[Ls-1] = 1.0; }
|
||||
|
||||
|
@ -48,8 +48,6 @@ NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GridCartesian &Fgrid, GridRed
|
||||
StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even
|
||||
StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd
|
||||
mass(_mass),
|
||||
Lebesgue(_grid),
|
||||
LebesgueEvenOdd(_cbgrid),
|
||||
Umu(&Fgrid),
|
||||
UmuEven(&Hgrid),
|
||||
UmuOdd(&Hgrid),
|
||||
@ -268,7 +266,7 @@ void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out
|
||||
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
|
||||
DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
|
||||
DhopInternal(Stencil, Umu, in, out, dag);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
@ -280,7 +278,7 @@ void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &o
|
||||
assert(in.Checkerboard() == Even);
|
||||
out.Checkerboard() = Odd;
|
||||
|
||||
DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
|
||||
DhopInternal(StencilEven, UmuOdd, in, out, dag);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
@ -292,7 +290,7 @@ void NaiveStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &o
|
||||
assert(in.Checkerboard() == Odd);
|
||||
out.Checkerboard() = Even;
|
||||
|
||||
DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
|
||||
DhopInternal(StencilOdd, UmuEven, in, out, dag);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
@ -323,18 +321,18 @@ void NaiveStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &
|
||||
|
||||
|
||||
template <class Impl>
|
||||
void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
|
||||
void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out, int dag)
|
||||
{
|
||||
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
||||
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
|
||||
DhopInternalOverlappedComms(st,U,in,out,dag);
|
||||
else
|
||||
DhopInternalSerialComms(st,lo,U,in,out,dag);
|
||||
DhopInternalSerialComms(st,U,in,out,dag);
|
||||
}
|
||||
template <class Impl>
|
||||
void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
|
||||
void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out, int dag)
|
||||
@ -356,7 +354,7 @@ void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, L
|
||||
{
|
||||
int interior=1;
|
||||
int exterior=0;
|
||||
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
|
||||
Kernels::DhopNaive(st,U,in,out,dag,interior,exterior);
|
||||
}
|
||||
|
||||
st.CommunicateComplete(requests);
|
||||
@ -367,12 +365,12 @@ void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, L
|
||||
{
|
||||
int interior=0;
|
||||
int exterior=1;
|
||||
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
|
||||
Kernels::DhopNaive(st,U,in,out,dag,interior,exterior);
|
||||
}
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
|
||||
void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out, int dag)
|
||||
@ -385,7 +383,7 @@ void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Lebes
|
||||
{
|
||||
int interior=1;
|
||||
int exterior=1;
|
||||
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
|
||||
Kernels::DhopNaive(st,U,in,out,dag,interior,exterior);
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -237,7 +237,32 @@ void PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, Fermi
|
||||
// ( 0 -sqrt(p_i)*amax | 2 R gamma_5 + p0/amax 2H
|
||||
//
|
||||
|
||||
this->DW(psi,D,DaggerNo);
|
||||
this->DW(psi,D,DaggerNo);
|
||||
|
||||
// DW - DW+iqslash
|
||||
// (g5 Dw)^dag = g5 Dw
|
||||
// (iqmu g5 gmu)^dag = (-i qmu gmu^dag g5^dag) = i qmu g5 gmu
|
||||
if ( qmu.size() ) {
|
||||
|
||||
std::cout<< "Mat" << "qmu ("<<qmu[0]<<","<<qmu[1]<<","<<qmu[2]<<","<<qmu[3]<<")"<<std::endl;
|
||||
assert(qmu.size()==Nd);
|
||||
|
||||
FermionField qslash_psi(psi.Grid());
|
||||
|
||||
Gamma::Algebra Gmu [] = {
|
||||
Gamma::Algebra::GammaX,
|
||||
Gamma::Algebra::GammaY,
|
||||
Gamma::Algebra::GammaZ,
|
||||
Gamma::Algebra::GammaT
|
||||
};
|
||||
qslash_psi = qmu[0]*(Gamma(Gmu[0])*psi);
|
||||
for(int mu=1;mu<Nd;mu++){
|
||||
qslash_psi = qslash_psi + qmu[mu]*(Gamma(Gmu[mu])*psi);
|
||||
}
|
||||
ComplexD ci(0.0,1.0);
|
||||
qslash_psi = ci*qslash_psi ; // i qslash
|
||||
D = D + qslash_psi;
|
||||
}
|
||||
|
||||
int nblock=(Ls-1)/2;
|
||||
for(int b=0;b<nblock;b++){
|
||||
@ -255,15 +280,55 @@ void PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, Fermi
|
||||
}
|
||||
|
||||
{
|
||||
// The 'conventional' Cayley overlap operator is
|
||||
//
|
||||
// Dov = (1+m)/2 + (1-m)/2 g5 sgn Hw
|
||||
//
|
||||
//
|
||||
// With massless limit 1/2(1+g5 sgnHw)
|
||||
//
|
||||
// Luscher shows quite neatly that 1+g5 sgn Hw has tree level propagator i qslash +O(a^2)
|
||||
//
|
||||
// However, the conventional normalisation has both a leading order factor of 2 in Zq
|
||||
// at tree level AND a mass dependent (1-m) that are convenient to absorb.
|
||||
//
|
||||
// In WilsonFermion5DImplementation.h, the tree level propagator for Hw is
|
||||
//
|
||||
// num = -i sin kmu gmu
|
||||
//
|
||||
// denom ( sqrt(sk^2 + (2shk^2 - 1)^2
|
||||
// b_k = sk2 - M5;
|
||||
//
|
||||
// w_k = sqrt(sk + b_k*b_k);
|
||||
//
|
||||
// denom= ( w_k + b_k + mass*mass) ;
|
||||
//
|
||||
// denom= one/denom;
|
||||
// out = num*denom;
|
||||
//
|
||||
// Chroma, and Grid define partial fraction via 4d operator
|
||||
//
|
||||
// Dpf = 2/(1-m) x Dov = (1+m)/(1-m) + g5 sgn Hw
|
||||
//
|
||||
// Now since:
|
||||
//
|
||||
// (1+m)/(1-m) = (1-m)/(1-m) + 2m/(1-m) = 1 + 2m/(1-m)
|
||||
//
|
||||
// This corresponds to a modified mass parameter
|
||||
//
|
||||
// It has an annoying
|
||||
//
|
||||
//
|
||||
double R=(1+this->mass)/(1-this->mass);
|
||||
//R g5 psi[Ls] + p[0] H
|
||||
//R g5 psi[Ls] + p[0] Hw
|
||||
ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale/amax,D,Ls-1,Ls-1);
|
||||
|
||||
|
||||
for(int b=0;b<nblock;b++){
|
||||
int s = 2*b+1;
|
||||
double pp = p[nblock-1-b];
|
||||
axpby_ssp(chi,1.0,chi,-sqrt(amax*pp)*scale*sign,psi,Ls-1,s);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -411,17 +476,18 @@ void PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,App
|
||||
int Ls = this->Ls;
|
||||
conformable(solution5d.Grid(),this->FermionGrid());
|
||||
conformable(exported4d.Grid(),this->GaugeGrid());
|
||||
ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
|
||||
ExtractSlice(exported4d, solution5d, Ls-1, 0);
|
||||
}
|
||||
template<class Impl>
|
||||
void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
|
||||
{
|
||||
//void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog)
|
||||
int Ls = this->Ls;
|
||||
conformable(imported5d.Grid(),this->FermionGrid());
|
||||
conformable(input4d.Grid() ,this->GaugeGrid());
|
||||
FermionField tmp(this->FermionGrid());
|
||||
tmp=Zero();
|
||||
InsertSlice(input4d, tmp, Ls-1, Ls-1);
|
||||
InsertSlice(input4d, tmp, Ls-1, 0);
|
||||
tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
|
||||
this->Dminus(tmp,imported5d);
|
||||
}
|
||||
@ -442,7 +508,7 @@ PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
|
||||
|
||||
{
|
||||
int Ls = this->Ls;
|
||||
|
||||
qmu.resize(0);
|
||||
assert((Ls&0x1)==1); // Odd Ls required
|
||||
int nrational=Ls-1;
|
||||
|
||||
@ -460,6 +526,22 @@ PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
|
||||
Approx::zolotarev_free(zdata);
|
||||
|
||||
}
|
||||
template<class Impl>
|
||||
PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
|
||||
GridCartesian &FiveDimGrid,
|
||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||
GridCartesian &FourDimGrid,
|
||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||
RealD _mass,RealD M5,
|
||||
std::vector<RealD> &_qmu,
|
||||
const ImplParams &p)
|
||||
: PartialFractionFermion5D<Impl>(_Umu,
|
||||
FiveDimGrid,FiveDimRedBlackGrid,
|
||||
FourDimGrid,FourDimRedBlackGrid,
|
||||
_mass,M5,p)
|
||||
{
|
||||
qmu=_qmu;
|
||||
}
|
||||
|
||||
NAMESPACE_END(Grid);
|
||||
|
||||
|
@ -375,23 +375,6 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \
|
||||
template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
|
||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
|
||||
SiteSpinor *buf, int LLs, int sU, \
|
||||
const FermionFieldView &in, FermionFieldView &out, int dag); \
|
||||
\
|
||||
template void StaggeredKernels<IMPL>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \
|
||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
|
||||
SiteSpinor *buf, int LLs, int sU, \
|
||||
const FermionFieldView &in, FermionFieldView &out, int dag); \
|
||||
\
|
||||
template void StaggeredKernels<IMPL>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \
|
||||
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
|
||||
SiteSpinor *buf, int LLs, int sU, \
|
||||
const FermionFieldView &in, FermionFieldView &out, int dag); \
|
||||
*/
|
||||
#undef LOAD_CHI
|
||||
#undef HAND_DECLARATIONS
|
||||
|
||||
|
@ -256,7 +256,7 @@ void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldVie
|
||||
});
|
||||
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
|
||||
void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st,
|
||||
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
|
||||
{
|
||||
@ -294,7 +294,7 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
|
||||
assert(0 && " Kernel optimisation case not covered ");
|
||||
}
|
||||
template <class Impl>
|
||||
void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
|
||||
void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
|
||||
{
|
||||
|
@ -14,6 +14,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
||||
Author: Andrew Lawson <andrew.lawson1991@gmail.com>
|
||||
Author: Vera Guelpers <V.M.Guelpers@soton.ac.uk>
|
||||
Author: Christoph Lehner <christoph@lhnr.de>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
@ -58,15 +59,9 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
|
||||
Umu(_FourDimGrid),
|
||||
UmuEven(_FourDimRedBlackGrid),
|
||||
UmuOdd (_FourDimRedBlackGrid),
|
||||
Lebesgue(_FourDimGrid),
|
||||
LebesgueEvenOdd(_FourDimRedBlackGrid),
|
||||
_tmp(&FiveDimRedBlackGrid),
|
||||
Dirichlet(0)
|
||||
{
|
||||
Stencil.lo = &Lebesgue;
|
||||
StencilEven.lo = &LebesgueEvenOdd;
|
||||
StencilOdd.lo = &LebesgueEvenOdd;
|
||||
|
||||
// some assertions
|
||||
assert(FiveDimGrid._ndimension==5);
|
||||
assert(FourDimGrid._ndimension==4);
|
||||
@ -305,19 +300,19 @@ void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
||||
void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st,
|
||||
DoubledGaugeField & U,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
|
||||
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
|
||||
DhopInternalOverlappedComms(st,U,in,out,dag);
|
||||
else
|
||||
DhopInternalSerialComms(st,lo,U,in,out,dag);
|
||||
DhopInternalSerialComms(st,U,in,out,dag);
|
||||
}
|
||||
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
|
||||
void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
|
||||
DoubledGaugeField & U,
|
||||
const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
@ -331,22 +326,22 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
|
||||
// Start comms // Gather intranode and extra node differentiated??
|
||||
/////////////////////////////
|
||||
{
|
||||
// std::cout << " WilsonFermion5D gather " <<std::endl;
|
||||
GRID_TRACE("Gather");
|
||||
st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
|
||||
}
|
||||
|
||||
// std::cout << " WilsonFermion5D Communicate Begin " <<std::endl;
|
||||
std::vector<std::vector<CommsRequest_t> > requests;
|
||||
auto id=traceStart("Communicate overlapped");
|
||||
st.CommunicateBegin(requests);
|
||||
|
||||
#if 1
|
||||
/////////////////////////////
|
||||
// Overlap with comms
|
||||
/////////////////////////////
|
||||
{
|
||||
GRID_TRACE("MergeSHM");
|
||||
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
|
||||
}
|
||||
|
||||
st.CommunicateBegin(requests);
|
||||
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
|
||||
#endif
|
||||
|
||||
/////////////////////////////
|
||||
// do the compute interior
|
||||
/////////////////////////////
|
||||
@ -358,22 +353,35 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
|
||||
GRID_TRACE("DhopInterior");
|
||||
Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
|
||||
}
|
||||
|
||||
|
||||
//ifdef GRID_ACCELERATED
|
||||
#if 0
|
||||
/////////////////////////////
|
||||
// Overlap with comms -- on GPU the interior kernel call is nonblocking
|
||||
/////////////////////////////
|
||||
st.CommunicateBegin(requests);
|
||||
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
|
||||
#endif
|
||||
|
||||
|
||||
/////////////////////////////
|
||||
// Complete comms
|
||||
/////////////////////////////
|
||||
// std::cout << " WilsonFermion5D Comms Complete " <<std::endl;
|
||||
st.CommunicateComplete(requests);
|
||||
traceStop(id);
|
||||
// traceStop(id);
|
||||
|
||||
/////////////////////////////
|
||||
// do the compute exterior
|
||||
/////////////////////////////
|
||||
{
|
||||
// std::cout << " WilsonFermion5D Comms Merge " <<std::endl;
|
||||
GRID_TRACE("Merge");
|
||||
st.CommsMerge(compressor);
|
||||
}
|
||||
|
||||
|
||||
// std::cout << " WilsonFermion5D Exterior " <<std::endl;
|
||||
if (dag == DaggerYes) {
|
||||
GRID_TRACE("DhopDagExterior");
|
||||
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
|
||||
@ -381,11 +389,12 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
|
||||
GRID_TRACE("DhopExterior");
|
||||
Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
|
||||
}
|
||||
// std::cout << " WilsonFermion5D Done " <<std::endl;
|
||||
}
|
||||
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
|
||||
void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
|
||||
DoubledGaugeField & U,
|
||||
const FermionField &in,
|
||||
FermionField &out,int dag)
|
||||
@ -395,11 +404,13 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
|
||||
|
||||
int LLs = in.Grid()->_rdimensions[0];
|
||||
|
||||
// std::cout << " WilsonFermion5D Halo exch " <<std::endl;
|
||||
{
|
||||
GRID_TRACE("HaloExchange");
|
||||
st.HaloExchangeOpt(in,compressor);
|
||||
}
|
||||
|
||||
// std::cout << " WilsonFermion5D Dhop " <<std::endl;
|
||||
int Opt = WilsonKernelsStatic::Opt;
|
||||
if (dag == DaggerYes) {
|
||||
GRID_TRACE("DhopDag");
|
||||
@ -408,6 +419,7 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
|
||||
GRID_TRACE("Dhop");
|
||||
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
|
||||
}
|
||||
// std::cout << " WilsonFermion5D Done " <<std::endl;
|
||||
}
|
||||
|
||||
|
||||
@ -420,7 +432,7 @@ void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int
|
||||
assert(in.Checkerboard()==Even);
|
||||
out.Checkerboard() = Odd;
|
||||
|
||||
DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,in,out,dag);
|
||||
DhopInternal(StencilEven,UmuOdd,in,out,dag);
|
||||
}
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
|
||||
@ -431,8 +443,31 @@ void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int
|
||||
assert(in.Checkerboard()==Odd);
|
||||
out.Checkerboard() = Even;
|
||||
|
||||
DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,in,out,dag);
|
||||
DhopInternal(StencilOdd,UmuEven,in,out,dag);
|
||||
}
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::DhopComms(const FermionField &in, FermionField &out)
|
||||
{
|
||||
int dag =0 ;
|
||||
conformable(in.Grid(),FermionGrid()); // verifies full grid
|
||||
conformable(in.Grid(),out.Grid());
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
Compressor compressor(dag);
|
||||
Stencil.HaloExchangeOpt(in,compressor);
|
||||
}
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::DhopCalc(const FermionField &in, FermionField &out,uint64_t *ids)
|
||||
{
|
||||
conformable(in.Grid(),FermionGrid()); // verifies full grid
|
||||
conformable(in.Grid(),out.Grid());
|
||||
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
|
||||
int LLs = in.Grid()->_rdimensions[0];
|
||||
int Opt = WilsonKernelsStatic::Opt;
|
||||
Kernels::DhopKernel(Opt,Stencil,Umu,Stencil.CommBuf(),LLs,Umu.oSites(),in,out,ids);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
|
||||
{
|
||||
@ -441,7 +476,7 @@ void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int d
|
||||
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
|
||||
DhopInternal(Stencil,Lebesgue,Umu,in,out,dag);
|
||||
DhopInternal(Stencil,Umu,in,out,dag);
|
||||
}
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag)
|
||||
@ -450,6 +485,54 @@ void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag
|
||||
Dhop(in,out,dag); // -0.5 is included
|
||||
axpy(out,4.0-M5,in,out);
|
||||
}
|
||||
template <class Impl>
|
||||
void WilsonFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out)
|
||||
{
|
||||
if (in.Checkerboard() == Odd) {
|
||||
DhopEO(in, out, DaggerNo);
|
||||
} else {
|
||||
DhopOE(in, out, DaggerNo);
|
||||
}
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion5D<Impl>::MeooeDag(const FermionField &in, FermionField &out)
|
||||
{
|
||||
if (in.Checkerboard() == Odd) {
|
||||
DhopEO(in, out, DaggerYes);
|
||||
} else {
|
||||
DhopOE(in, out, DaggerYes);
|
||||
}
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out)
|
||||
{
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
typename FermionField::scalar_type scal(4.0 + M5);
|
||||
out = scal * in;
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion5D<Impl>::MooeeDag(const FermionField &in, FermionField &out)
|
||||
{
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
Mooee(in, out);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out)
|
||||
{
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
out = (1.0/(4.0 + M5))*in;
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
|
||||
{
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
MooeeInv(in,out);
|
||||
}
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in, RealD mass,std::vector<double> twist)
|
||||
@ -735,6 +818,15 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe
|
||||
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist)
|
||||
{
|
||||
std::vector<double> empty_q(Nd,0.0);
|
||||
MomentumSpacePropagatorHwQ(out,in,mass,twist,empty_q);
|
||||
}
|
||||
template<class Impl>
|
||||
void WilsonFermion5D<Impl>::MomentumSpacePropagatorHwQ(FermionField &out,const FermionField &in,
|
||||
RealD mass,
|
||||
std::vector<double> twist,
|
||||
std::vector<double> qmu)
|
||||
{
|
||||
Gamma::Algebra Gmu [] = {
|
||||
Gamma::Algebra::GammaX,
|
||||
@ -750,6 +842,7 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
|
||||
typedef typename FermionField::scalar_type ScalComplex;
|
||||
|
||||
typedef Lattice<iSinglet<vector_type> > LatComplex;
|
||||
typedef iSpinMatrix<ScalComplex> SpinMat;
|
||||
|
||||
|
||||
Coordinate latt_size = _grid->_fdimensions;
|
||||
@ -767,8 +860,10 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
|
||||
LatComplex kmu(_grid);
|
||||
ScalComplex ci(0.0,1.0);
|
||||
|
||||
std::cout<< "Feynman Rule" << "qmu ("<<qmu[0]<<","<<qmu[1]<<","<<qmu[2]<<","<<qmu[3]<<")"<<std::endl;
|
||||
|
||||
for(int mu=0;mu<Nd;mu++) {
|
||||
|
||||
|
||||
LatticeCoordinate(kmu,mu);
|
||||
|
||||
RealD TwoPiL = M_PI * 2.0/ latt_size[mu];
|
||||
@ -777,9 +872,18 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const Fe
|
||||
kmu = kmu + TwoPiL * one * twist[mu];//momentum for twisted boundary conditions
|
||||
|
||||
sk2 = sk2 + 2.0*sin(kmu*0.5)*sin(kmu*0.5);
|
||||
sk = sk + sin(kmu)*sin(kmu);
|
||||
|
||||
num = num - sin(kmu)*ci*(Gamma(Gmu[mu])*in);
|
||||
sk = sk + (sin(kmu)+qmu[mu])*(sin(kmu)+qmu[mu]);
|
||||
|
||||
// Terms for boosted Fermion
|
||||
// 1/2 [ -i gamma.(sin p + q ) ]
|
||||
// [ --------------------- + 1 ]
|
||||
// [ wq + b ]
|
||||
//
|
||||
// wq = sqrt( (sinp+q)^2 + b^2 )
|
||||
//
|
||||
|
||||
num = num - (sin(kmu)+qmu[mu])*ci*(Gamma(Gmu[mu])*in);
|
||||
|
||||
}
|
||||
num = num + mass * in ;
|
||||
|
@ -52,17 +52,12 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
|
||||
StencilEven(&Hgrid, npoint, Even, directions,displacements,p), // source is Even
|
||||
StencilOdd(&Hgrid, npoint, Odd, directions,displacements,p), // source is Odd
|
||||
mass(_mass),
|
||||
Lebesgue(_grid),
|
||||
LebesgueEvenOdd(_cbgrid),
|
||||
Umu(&Fgrid),
|
||||
UmuEven(&Hgrid),
|
||||
UmuOdd(&Hgrid),
|
||||
_tmp(&Hgrid),
|
||||
anisotropyCoeff(anis)
|
||||
{
|
||||
Stencil.lo = &Lebesgue;
|
||||
StencilEven.lo = &LebesgueEvenOdd;
|
||||
StencilOdd.lo = &LebesgueEvenOdd;
|
||||
// Allocate the required comms buffer
|
||||
ImportGauge(_Umu);
|
||||
if (anisotropyCoeff.isAnisotropic){
|
||||
@ -314,7 +309,7 @@ void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int da
|
||||
|
||||
out.Checkerboard() = in.Checkerboard();
|
||||
|
||||
DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
|
||||
DhopInternal(Stencil, Umu, in, out, dag);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
@ -326,7 +321,7 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int
|
||||
assert(in.Checkerboard() == Even);
|
||||
out.Checkerboard() = Odd;
|
||||
|
||||
DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
|
||||
DhopInternal(StencilEven, UmuOdd, in, out, dag);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
@ -338,7 +333,7 @@ void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int d
|
||||
assert(in.Checkerboard() == Odd);
|
||||
out.Checkerboard() = Even;
|
||||
|
||||
DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
|
||||
DhopInternal(StencilOdd, UmuEven, in, out, dag);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
@ -391,21 +386,21 @@ void WilsonFermion<Impl>::DhopDirCalc(const FermionField &in, FermionField &out,
|
||||
};
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
|
||||
void WilsonFermion<Impl>::DhopInternal(StencilImpl &st,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out, int dag)
|
||||
{
|
||||
#ifdef GRID_OMP
|
||||
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
|
||||
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
|
||||
DhopInternalOverlappedComms(st,U,in,out,dag);
|
||||
else
|
||||
#endif
|
||||
DhopInternalSerial(st,lo,U,in,out,dag);
|
||||
DhopInternalSerial(st,U,in,out,dag);
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
|
||||
void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out, int dag)
|
||||
@ -474,10 +469,10 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
|
||||
|
||||
|
||||
template <class Impl>
|
||||
void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out, int dag)
|
||||
void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st,
|
||||
DoubledGaugeField &U,
|
||||
const FermionField &in,
|
||||
FermionField &out, int dag)
|
||||
{
|
||||
GRID_TRACE("DhopSerial");
|
||||
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||
|
@ -40,11 +40,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
/// Switch off the 5d vectorised code optimisations
|
||||
#undef DWFVEC5D
|
||||
|
||||
static Vector<vComplexF> signsF;
|
||||
static std::vector<vComplexF> signsF;
|
||||
|
||||
template<typename vtype>
|
||||
int setupSigns(Vector<vtype>& signs ){
|
||||
Vector<vtype> bother(2);
|
||||
int setupSigns(std::vector<vtype>& signs ){
|
||||
std::vector<vtype> bother(2);
|
||||
signs = bother;
|
||||
vrsign(signs[0]);
|
||||
visign(signs[1]);
|
||||
@ -364,7 +364,7 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilView &st, Doubled
|
||||
|
||||
#include <simd/Intel512double.h>
|
||||
|
||||
static Vector<vComplexD> signsD;
|
||||
static std::vector<vComplexD> signsD;
|
||||
static int signInitD = setupSigns(signsD);
|
||||
|
||||
#define MAYBEPERM(A,perm) if (perm) { A ; }
|
||||
|
@ -63,7 +63,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
|
||||
} else { \
|
||||
chi = coalescedRead(buf[SE->_offset],lane); \
|
||||
} \
|
||||
acceleratorSynchronise(); \
|
||||
acceleratorSynchronise(); \
|
||||
Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
|
||||
Recon(result, Uchi);
|
||||
|
||||
@ -411,6 +411,46 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
|
||||
#undef LoopBody
|
||||
}
|
||||
|
||||
#ifdef GRID_SYCL
|
||||
extern "C" {
|
||||
ulong SYCL_EXTERNAL __attribute__((overloadable)) intel_get_cycle_counter( void );
|
||||
uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_active_channel_mask( void );
|
||||
uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_grf_register( uint reg );
|
||||
uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_flag_register( uint flag );
|
||||
uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_control_register( uint reg );
|
||||
uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_hw_thread_id( void );
|
||||
uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_slice_id( void );
|
||||
uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_subslice_id( void );
|
||||
uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_eu_id( void );
|
||||
uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_eu_thread_id( void );
|
||||
void SYCL_EXTERNAL __attribute__((overloadable)) intel_eu_thread_pause( uint value );
|
||||
}
|
||||
#ifdef GRID_SIMT
|
||||
#define MAKE_ID(A) (intel_get_eu_id()<<16)|(intel_get_slice_id()<<8)|(intel_get_subslice_id())
|
||||
#else
|
||||
#define MAKE_ID(A) (0)
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#define MAKE_ID(A) (0)
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#define KERNEL_CALL_ID(A) \
|
||||
const uint64_t NN = Nsite*Ls; \
|
||||
accelerator_forNB( ss, NN, Simd::Nsimd(), { \
|
||||
int sF = ss; \
|
||||
int sU = ss/Ls; \
|
||||
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
|
||||
const int Nsimd = SiteHalfSpinor::Nsimd(); \
|
||||
const int lane=acceleratorSIMTlane(Nsimd); \
|
||||
int idx=sF*Nsimd+lane; \
|
||||
uint64_t id = MAKE_ID(); \
|
||||
ids[idx]=id; \
|
||||
}); \
|
||||
accelerator_barrier();
|
||||
|
||||
#define KERNEL_CALLNB(A) \
|
||||
const uint64_t NN = Nsite*Ls; \
|
||||
@ -418,7 +458,7 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
|
||||
int sF = ss; \
|
||||
int sU = ss/Ls; \
|
||||
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
|
||||
});
|
||||
});
|
||||
|
||||
#define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();
|
||||
|
||||
@ -434,7 +474,7 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
|
||||
|
||||
#define ASM_CALL(A) \
|
||||
thread_for( sss, Nsite, { \
|
||||
int ss = st.lo->Reorder(sss); \
|
||||
int ss = sss; /*st.lo->Reorder(sss);*/ \
|
||||
int sU = ss; \
|
||||
int sF = ss*Ls; \
|
||||
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \
|
||||
@ -451,6 +491,8 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
|
||||
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \
|
||||
});}
|
||||
|
||||
|
||||
|
||||
template <class Impl>
|
||||
void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||
@ -475,7 +517,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
|
||||
#endif
|
||||
} else if( exterior ) {
|
||||
// dependent on result of merge
|
||||
// // dependent on result of merge
|
||||
acceleratorFenceComputeStream();
|
||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;}
|
||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt); return;}
|
||||
@ -485,6 +527,18 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
||||
}
|
||||
assert(0 && " Kernel optimisation case not covered ");
|
||||
}
|
||||
|
||||
template <class Impl>
|
||||
void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||
uint64_t *ids)
|
||||
{
|
||||
autoView(U_v , U,AcceleratorRead);
|
||||
autoView(in_v , in,AcceleratorRead);
|
||||
autoView(out_v,out,AcceleratorWrite);
|
||||
autoView(st_v , st,AcceleratorRead);
|
||||
KERNEL_CALL_ID(GenericDhopSite);
|
||||
}
|
||||
template <class Impl>
|
||||
void WilsonKernels<Impl>::DhopDagKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||
|
@ -0,0 +1,45 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/ qcd/action/fermion/instantiation/CompactWilsonCloverFermionInstantiation5D.cc.master
|
||||
|
||||
Copyright (C) 2017 - 2025
|
||||
|
||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
||||
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
|
||||
Author: Mattia Bruno <mattia.bruno@cern.ch>
|
||||
Author: Christoph Lehner <christoph@lhnr.de>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
|
||||
#include <Grid/Grid.h>
|
||||
#include <Grid/qcd/spin/Dirac.h>
|
||||
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion5D.h>
|
||||
#include <Grid/qcd/action/fermion/implementation/CompactWilsonCloverFermion5DImplementation.h>
|
||||
#include <Grid/qcd/action/fermion/CloverHelpers.h>
|
||||
|
||||
NAMESPACE_BEGIN(Grid);
|
||||
|
||||
#include "impl.h"
|
||||
template class CompactWilsonCloverFermion5D<IMPLEMENTATION, CompactCloverHelpers<IMPLEMENTATION>>;
|
||||
template class CompactWilsonCloverFermion5D<IMPLEMENTATION, CompactExpCloverHelpers<IMPLEMENTATION>>;
|
||||
|
||||
NAMESPACE_END(Grid);
|
@ -0,0 +1 @@
|
||||
../CompactWilsonCloverFermion5DInstantiation.cc.master
|
@ -0,0 +1 @@
|
||||
../CompactWilsonCloverFermion5DInstantiation.cc.master
|
@ -62,7 +62,7 @@ do
|
||||
done
|
||||
done
|
||||
|
||||
CC_LIST="CompactWilsonCloverFermionInstantiation"
|
||||
CC_LIST="CompactWilsonCloverFermionInstantiation CompactWilsonCloverFermion5DInstantiation"
|
||||
|
||||
for impl in $COMPACT_WILSON_IMPL_LIST
|
||||
do
|
||||
|
@ -40,6 +40,11 @@ public:
|
||||
|
||||
INHERIT_GIMPL_TYPES(Gimpl);
|
||||
|
||||
using Action<GaugeField>::S;
|
||||
using Action<GaugeField>::Sinitial;
|
||||
using Action<GaugeField>::deriv;
|
||||
using Action<GaugeField>::refresh;
|
||||
|
||||
private:
|
||||
RealD c_plaq;
|
||||
RealD c_rect;
|
||||
@ -71,27 +76,27 @@ public:
|
||||
return action;
|
||||
};
|
||||
|
||||
virtual void deriv(const GaugeField &Umu,GaugeField & dSdU) {
|
||||
virtual void deriv(const GaugeField &U, GaugeField &dSdU) {
|
||||
//extend Ta to include Lorentz indexes
|
||||
RealD factor_p = c_plaq/RealD(Nc)*0.5;
|
||||
RealD factor_r = c_rect/RealD(Nc)*0.5;
|
||||
|
||||
GridBase *grid = Umu.Grid();
|
||||
GridBase *grid = U.Grid();
|
||||
|
||||
std::vector<GaugeLinkField> U (Nd,grid);
|
||||
std::vector<GaugeLinkField> Umu (Nd,grid);
|
||||
for(int mu=0;mu<Nd;mu++){
|
||||
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
|
||||
Umu[mu] = PeekIndex<LorentzIndex>(U,mu);
|
||||
}
|
||||
std::vector<GaugeLinkField> RectStaple(Nd,grid), Staple(Nd,grid);
|
||||
WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, U, workspace);
|
||||
WilsonLoops<Gimpl>::StapleAndRectStapleAll(Staple, RectStaple, Umu, workspace);
|
||||
|
||||
GaugeLinkField dSdU_mu(grid);
|
||||
GaugeLinkField staple(grid);
|
||||
|
||||
for (int mu=0; mu < Nd; mu++){
|
||||
dSdU_mu = Ta(U[mu]*Staple[mu])*factor_p;
|
||||
dSdU_mu = dSdU_mu + Ta(U[mu]*RectStaple[mu])*factor_r;
|
||||
|
||||
dSdU_mu = Ta(Umu[mu]*Staple[mu])*factor_p;
|
||||
dSdU_mu = dSdU_mu + Ta(Umu[mu]*RectStaple[mu])*factor_r;
|
||||
|
||||
PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
|
||||
}
|
||||
|
||||
|
@ -43,6 +43,11 @@ class WilsonGaugeAction : public Action<typename Gimpl::GaugeField> {
|
||||
public:
|
||||
INHERIT_GIMPL_TYPES(Gimpl);
|
||||
|
||||
using Action<GaugeField>::S;
|
||||
using Action<GaugeField>::Sinitial;
|
||||
using Action<GaugeField>::deriv;
|
||||
using Action<GaugeField>::refresh;
|
||||
|
||||
/////////////////////////// constructors
|
||||
explicit WilsonGaugeAction(RealD beta_):beta(beta_){};
|
||||
|
||||
@ -68,20 +73,23 @@ public:
|
||||
// extend Ta to include Lorentz indexes
|
||||
|
||||
RealD factor = 0.5 * beta / RealD(Nc);
|
||||
GridBase *grid = U.Grid();
|
||||
|
||||
GaugeLinkField Umu(U.Grid());
|
||||
GaugeLinkField dSdU_mu(U.Grid());
|
||||
GaugeLinkField dSdU_mu(grid);
|
||||
std::vector<GaugeLinkField> Umu(Nd, grid);
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
Umu[mu] = PeekIndex<LorentzIndex>(U, mu);
|
||||
}
|
||||
|
||||
Umu = PeekIndex<LorentzIndex>(U, mu);
|
||||
|
||||
for (int mu = 0; mu < Nd; mu++) {
|
||||
// Staple in direction mu
|
||||
WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
|
||||
dSdU_mu = Ta(Umu * dSdU_mu) * factor;
|
||||
|
||||
WilsonLoops<Gimpl>::Staple(dSdU_mu, Umu, mu);
|
||||
dSdU_mu = Ta(Umu[mu] * dSdU_mu) * factor;
|
||||
|
||||
PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
RealD beta;
|
||||
};
|
||||
|
@ -111,8 +111,8 @@ public:
|
||||
};
|
||||
|
||||
void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
|
||||
std::string config, rng;
|
||||
this->build_filenames(traj, Params, config, rng);
|
||||
std::string config, rng, smr;
|
||||
this->build_filenames(traj, Params, config, smr, rng);
|
||||
this->check_filename(rng);
|
||||
this->check_filename(config);
|
||||
|
||||
|
@ -75,7 +75,7 @@ public:
|
||||
GridParallelRNG &pRNG) {
|
||||
if ((traj % Params.saveInterval) == 0) {
|
||||
std::string config, rng, smr;
|
||||
this->build_filenames(traj, Params, config, rng);
|
||||
this->build_filenames(traj, Params, config, smr, rng);
|
||||
GridBase *grid = SmartConfig.get_U(false).Grid();
|
||||
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
|
||||
BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
|
||||
@ -102,7 +102,7 @@ public:
|
||||
if ( Params.saveSmeared ) {
|
||||
IldgWriter _IldgWriter(grid->IsBoss());
|
||||
_IldgWriter.open(smr);
|
||||
_IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(true), traj, config, config);
|
||||
_IldgWriter.writeConfiguration<GaugeStats>(SmartConfig.get_U(true), traj, smr, smr);
|
||||
_IldgWriter.close();
|
||||
|
||||
std::cout << GridLogMessage << "Written ILDG Configuration on " << smr
|
||||
@ -118,8 +118,8 @@ public:
|
||||
|
||||
void CheckpointRestore(int traj, GaugeField &U, GridSerialRNG &sRNG,
|
||||
GridParallelRNG &pRNG) {
|
||||
std::string config, rng;
|
||||
this->build_filenames(traj, Params, config, rng);
|
||||
std::string config, rng, smr;
|
||||
this->build_filenames(traj, Params, config, smr, rng);
|
||||
this->check_filename(rng);
|
||||
this->check_filename(config);
|
||||
|
||||
|
@ -107,8 +107,8 @@ class ScidacHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
|
||||
|
||||
void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG,
|
||||
GridParallelRNG &pRNG) {
|
||||
std::string config, rng;
|
||||
this->build_filenames(traj, Params, config, rng);
|
||||
std::string config, rng, smr;
|
||||
this->build_filenames(traj, Params, config, smr, rng);
|
||||
this->check_filename(rng);
|
||||
this->check_filename(config);
|
||||
|
||||
|
@ -40,7 +40,7 @@ public:
|
||||
U = Zero();
|
||||
LatticeColourMatrix tmp(Uin.Grid());
|
||||
|
||||
Vector<typename SU<ncolour>::Matrix> ta(Dimension);
|
||||
std::vector<typename SU<ncolour>::Matrix> ta(Dimension);
|
||||
|
||||
// Debug lines
|
||||
// LatticeMatrix uno(Uin.Grid());
|
||||
|
@ -43,7 +43,7 @@ public:
|
||||
U = Zero();
|
||||
LatticeColourMatrix tmp(Uin.Grid());
|
||||
|
||||
Vector<typename GaugeGroup<ncolour,group_name>::Matrix> eij(Dimension);
|
||||
std::vector<typename GaugeGroup<ncolour,group_name>::Matrix> eij(Dimension);
|
||||
|
||||
for (int a = 0; a < Dimension; a++)
|
||||
GaugeGroupTwoIndex<ncolour, S, group_name>::base(a, eij[a]);
|
||||
|
@ -62,15 +62,15 @@ accelerator_inline int stencilIndex(int mu, int nu) {
|
||||
|
||||
|
||||
/*! @brief structure holding the link treatment */
|
||||
struct SmearingParameters{
|
||||
SmearingParameters(){}
|
||||
struct HISQSmearingParameters{
|
||||
HISQSmearingParameters(){}
|
||||
Real c_1; // 1 link
|
||||
Real c_naik; // Naik term
|
||||
Real c_3; // 3 link
|
||||
Real c_5; // 5 link
|
||||
Real c_7; // 7 link
|
||||
Real c_lp; // 5 link Lepage
|
||||
SmearingParameters(Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp)
|
||||
HISQSmearingParameters(Real c1, Real cnaik, Real c3, Real c5, Real c7, Real clp)
|
||||
: c_1(c1),
|
||||
c_naik(cnaik),
|
||||
c_3(c3),
|
||||
@ -86,7 +86,7 @@ class Smear_HISQ : public Gimpl {
|
||||
|
||||
private:
|
||||
GridCartesian* const _grid;
|
||||
SmearingParameters _linkTreatment;
|
||||
HISQSmearingParameters _linkTreatment;
|
||||
|
||||
public:
|
||||
|
||||
@ -117,7 +117,7 @@ public:
|
||||
// IN--u_thin
|
||||
void smear(GF& u_smr, GF& u_naik, GF& u_thin) const {
|
||||
|
||||
SmearingParameters lt = this->_linkTreatment;
|
||||
HISQSmearingParameters lt = this->_linkTreatment;
|
||||
auto grid = this->_grid;
|
||||
|
||||
// Create a padded cell of extra padding depth=1 and fill the padding.
|
||||
|
@ -207,11 +207,14 @@ std::vector<RealD> WilsonFlowBase<Gimpl>::flowMeasureEnergyDensityCloverleaf(con
|
||||
}
|
||||
|
||||
template <class Gimpl>
|
||||
void WilsonFlowBase<Gimpl>::setDefaultMeasurements(int topq_meas_interval){
|
||||
addMeasurement(1, [](int step, RealD t, const typename Gimpl::GaugeField &U){
|
||||
void WilsonFlowBase<Gimpl>::setDefaultMeasurements(int meas_interval){
|
||||
addMeasurement(meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
|
||||
std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : " << step << " " << t << " " << energyDensityPlaquette(t,U) << std::endl;
|
||||
});
|
||||
addMeasurement(topq_meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
|
||||
addMeasurement(meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
|
||||
std::cout << GridLogMessage << "[WilsonFlow] Energy density (cloverleaf) : " << step << " " << t << " " << energyDensityCloverleaf(t,U) << std::endl;
|
||||
});
|
||||
addMeasurement(meas_interval, [](int step, RealD t, const typename Gimpl::GaugeField &U){
|
||||
std::cout << GridLogMessage << "[WilsonFlow] Top. charge : " << step << " " << WilsonLoops<Gimpl>::TopologicalCharge(U) << std::endl;
|
||||
});
|
||||
}
|
||||
@ -249,6 +252,11 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const{
|
||||
|
||||
out = in;
|
||||
RealD taus = 0.;
|
||||
|
||||
// Perform initial t=0 measurements
|
||||
for(auto const &meas : this->functions)
|
||||
meas.second(0,taus,out);
|
||||
|
||||
for (unsigned int step = 1; step <= Nstep; step++) { //step indicates the number of smearing steps applied at the time of measurement
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
evolve_step(out, taus);
|
||||
@ -333,6 +341,11 @@ void WilsonFlowAdaptive<Gimpl>::smear(GaugeField& out, const GaugeField& in) con
|
||||
RealD taus = 0.;
|
||||
RealD eps = init_epsilon;
|
||||
unsigned int step = 0;
|
||||
|
||||
// Perform initial t=0 measurements
|
||||
for(auto const &meas : this->functions)
|
||||
meas.second(step,taus,out);
|
||||
|
||||
do{
|
||||
int step_success = evolve_step_adaptive(out, taus, eps);
|
||||
step += step_success; //step will not be incremented if the integration step fails
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -971,7 +971,9 @@ void BaryonUtils<FImpl>::BaryonGamma3pt(
|
||||
autoView( vq_ti , q_ti , AcceleratorRead);
|
||||
autoView( vq_tf , q_tf , AcceleratorRead);
|
||||
|
||||
Vector<mobj> my_Dq_spec{Dq_spec1,Dq_spec2};
|
||||
deviceVector<mobj> my_Dq_spec(2);
|
||||
acceleratorPut(my_Dq_spec[0],Dq_spec1);
|
||||
acceleratorPut(my_Dq_spec[1],Dq_spec2);
|
||||
mobj * Dq_spec_p = &my_Dq_spec[0];
|
||||
|
||||
if (group == 1) {
|
||||
@ -1300,7 +1302,8 @@ void BaryonUtils<FImpl>::SigmaToNucleonEye(const PropagatorField &qq_loop,
|
||||
autoView( vd_tf , qd_tf , AcceleratorRead);
|
||||
autoView( vs_ti , qs_ti , AcceleratorRead);
|
||||
|
||||
Vector<mobj> my_Dq_spec{Du_spec};
|
||||
deviceVector<mobj> my_Dq_spec(1);
|
||||
acceleratorPut(my_Dq_spec[0],Du_spec);
|
||||
mobj * Dq_spec_p = &my_Dq_spec[0];
|
||||
|
||||
if(op == "Q1"){
|
||||
@ -1353,7 +1356,8 @@ void BaryonUtils<FImpl>::SigmaToNucleonNonEye(const PropagatorField &qq_ti,
|
||||
autoView( vd_tf , qd_tf , AcceleratorRead );
|
||||
autoView( vs_ti , qs_ti , AcceleratorRead );
|
||||
|
||||
Vector<mobj> my_Dq_spec{Du_spec};
|
||||
deviceVector<mobj> my_Dq_spec(1);
|
||||
acceleratorPut(my_Dq_spec[0],Du_spec);
|
||||
mobj * Dq_spec_p = &my_Dq_spec[0];
|
||||
|
||||
if(op == "Q1"){
|
||||
@ -1544,7 +1548,9 @@ void BaryonUtils<FImpl>::XiToSigmaEye(const PropagatorField &qq_loop,
|
||||
autoView( vd_tf , qd_tf , AcceleratorRead);
|
||||
autoView( vs_ti , qs_ti , AcceleratorRead);
|
||||
|
||||
Vector<mobj> my_Dq_spec{Dd_spec,Ds_spec};
|
||||
deviceVector<mobj> my_Dq_spec(2);
|
||||
acceleratorPut(my_Dq_spec[0],Dd_spec);
|
||||
acceleratorPut(my_Dq_spec[0],Ds_spec);
|
||||
mobj * Dq_spec_p = &my_Dq_spec[0];
|
||||
|
||||
if(op == "Q1"){
|
||||
|
@ -118,7 +118,7 @@ static void generatorDiagonal(int diagIndex, iGroupMatrix<cplx> &ta) {
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Map a su2 subgroup number to the pair of rows that are non zero
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) {
|
||||
static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) {
|
||||
assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2));
|
||||
|
||||
int spare = su2_index;
|
||||
|
@ -62,7 +62,7 @@ public:
|
||||
// returns i(T_Adj)^index necessary for the projectors
|
||||
// see definitions above
|
||||
iAdjTa = Zero();
|
||||
Vector<iSUnMatrix<cplx> > ta(ncolour * ncolour - 1);
|
||||
iSUnMatrix<cplx> ta[ncolour * ncolour - 1];
|
||||
iSUnMatrix<cplx> tmp;
|
||||
|
||||
// FIXME not very efficient to get all the generators everytime
|
||||
|
@ -207,7 +207,7 @@ static void generatorZtype(int zIndex, iGroupMatrix<cplx> &ta) {
|
||||
// Map a su2 subgroup number to the pair of rows that are non zero
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
template <ONLY_IF_Sp>
|
||||
static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::Sp) {
|
||||
static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::Sp) {
|
||||
const int nsp=ncolour/2;
|
||||
assert((su2_index >= 0) && (su2_index < (nsp * (nsp - 1)) / 2));
|
||||
|
||||
|
@ -292,19 +292,21 @@ public:
|
||||
//////////////////////////////////////////////////
|
||||
// the sum over all nu-oriented staples for nu != mu on each site
|
||||
//////////////////////////////////////////////////
|
||||
static void Staple(GaugeMat &staple, const GaugeLorentz &Umu, int mu) {
|
||||
static void Staple(GaugeMat &staple, const GaugeLorentz &U, int mu) {
|
||||
|
||||
GridBase *grid = Umu.Grid();
|
||||
|
||||
std::vector<GaugeMat> U(Nd, grid);
|
||||
std::vector<GaugeMat> Umu(Nd, U.Grid());
|
||||
for (int d = 0; d < Nd; d++) {
|
||||
U[d] = PeekIndex<LorentzIndex>(Umu, d);
|
||||
Umu[d] = PeekIndex<LorentzIndex>(U, d);
|
||||
}
|
||||
Staple(staple, U, mu);
|
||||
Staple(staple, Umu, mu);
|
||||
}
|
||||
|
||||
static void Staple(GaugeMat &staple, const std::vector<GaugeMat> &U, int mu) {
|
||||
staple = Zero();
|
||||
static void Staple(GaugeMat &staple, const std::vector<GaugeMat> &Umu, int mu) {
|
||||
|
||||
autoView(staple_v, staple, AcceleratorWrite);
|
||||
accelerator_for(i, staple.Grid()->oSites(), Simd::Nsimd(), {
|
||||
staple_v[i] = Zero();
|
||||
});
|
||||
|
||||
for (int nu = 0; nu < Nd; nu++) {
|
||||
|
||||
@ -318,12 +320,12 @@ public:
|
||||
// |
|
||||
// __|
|
||||
//
|
||||
|
||||
|
||||
staple += Gimpl::ShiftStaple(
|
||||
Gimpl::CovShiftForward(
|
||||
U[nu], nu,
|
||||
Umu[nu], nu,
|
||||
Gimpl::CovShiftBackward(
|
||||
U[mu], mu, Gimpl::CovShiftIdentityBackward(U[nu], nu))),
|
||||
Umu[mu], mu, Gimpl::CovShiftIdentityBackward(Umu[nu], nu))),
|
||||
mu);
|
||||
|
||||
// __
|
||||
@ -333,8 +335,8 @@ public:
|
||||
//
|
||||
|
||||
staple += Gimpl::ShiftStaple(
|
||||
Gimpl::CovShiftBackward(U[nu], nu,
|
||||
Gimpl::CovShiftBackward(U[mu], mu, U[nu])), mu);
|
||||
Gimpl::CovShiftBackward(Umu[nu], nu,
|
||||
Gimpl::CovShiftBackward(Umu[mu], mu, Umu[nu])), mu);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user