mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-14 22:07:05 +01:00
Compare commits
56 Commits
release/0.
...
ISC-freeze
Author | SHA1 | Date | |
---|---|---|---|
251b904a28 | |||
5dfd216a34 | |||
5a112feac3 | |||
c2e8d0aa88 | |||
bf96a4bdbf | |||
84685c9bc3 | |||
013ea4e8d1 | |||
7fbbb31a50 | |||
0e127b1fc7 | |||
68c028b0a6 | |||
a61e0df54b | |||
f871fb0c6d | |||
25d1cadd3b | |||
c24d53bbd1 | |||
3c7a4106ed | |||
6eed167f0c | |||
4ad0df6fde | |||
68a5079f33 | |||
8634e19f1b | |||
9ada378e38 | |||
bfbf2f1fa0 | |||
587bfcc0f4 | |||
8c658de179 | |||
ba37d51ee9 | |||
4f4181c54a | |||
4d4ac2517b | |||
e568c24d1d | |||
b458326744 | |||
6e7d5e2243 | |||
b35169f1dd | |||
441ad7498d | |||
6f6c5c549a | |||
1584e17b54 | |||
12982a4455 | |||
172f412102 | |||
a64497265d | |||
c45f24a1b5 | |||
aaf37ee4d7 | |||
1dddd17e3c | |||
661f1d3e8e | |||
edcf9b9293 | |||
96272f3841 | |||
5c936d88a0 | |||
1c64ee926e | |||
2cbb72a81c | |||
31d83ee046 | |||
a9e8758a01 | |||
3e125c5b61 | |||
eac6ec4b5e | |||
213f8db6a2 | |||
80302e95a8 | |||
b938202081 | |||
0f468e2179 | |||
97b9c6f03d | |||
63982819c6 | |||
24162c9ead |
17
.travis.yml
17
.travis.yml
@ -19,6 +19,8 @@ before_install:
|
|||||||
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi
|
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi
|
||||||
|
|
||||||
install:
|
install:
|
||||||
|
- export CWD=`pwd`
|
||||||
|
- echo $CWD
|
||||||
- export CC=$CC$VERSION
|
- export CC=$CC$VERSION
|
||||||
- export CXX=$CXX$VERSION
|
- export CXX=$CXX$VERSION
|
||||||
- echo $PATH
|
- echo $PATH
|
||||||
@ -36,11 +38,22 @@ script:
|
|||||||
- ./bootstrap.sh
|
- ./bootstrap.sh
|
||||||
- mkdir build
|
- mkdir build
|
||||||
- cd build
|
- cd build
|
||||||
- ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none
|
- mkdir lime
|
||||||
|
- cd lime
|
||||||
|
- mkdir build
|
||||||
|
- cd build
|
||||||
|
- wget http://usqcd-software.github.io/downloads/c-lime/lime-1.3.2.tar.gz
|
||||||
|
- tar xf lime-1.3.2.tar.gz
|
||||||
|
- cd lime-1.3.2
|
||||||
|
- ./configure --prefix=$CWD/build/lime/install
|
||||||
|
- make -j4
|
||||||
|
- make install
|
||||||
|
- cd $CWD/build
|
||||||
|
- ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install
|
||||||
- make -j4
|
- make -j4
|
||||||
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
|
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
|
||||||
- echo make clean
|
- echo make clean
|
||||||
- ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none
|
- ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install
|
||||||
- make -j4
|
- make -j4
|
||||||
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
|
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
|
||||||
- make check
|
- make check
|
||||||
|
@ -158,8 +158,10 @@ public:
|
|||||||
|
|
||||||
dbytes=0;
|
dbytes=0;
|
||||||
ncomm=0;
|
ncomm=0;
|
||||||
|
#ifdef GRID_OMP
|
||||||
parallel_for(int dir=0;dir<8;dir++){
|
#pragma omp parallel for num_threads(Grid::CartesianCommunicator::nCommThreads)
|
||||||
|
#endif
|
||||||
|
for(int dir=0;dir<8;dir++){
|
||||||
|
|
||||||
double tbytes;
|
double tbytes;
|
||||||
int mu =dir % 4;
|
int mu =dir % 4;
|
||||||
@ -175,9 +177,14 @@ public:
|
|||||||
int comm_proc = mpi_layout[mu]-1;
|
int comm_proc = mpi_layout[mu]-1;
|
||||||
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
||||||
}
|
}
|
||||||
|
#ifdef GRID_OMP
|
||||||
|
int tid = omp_get_thread_num();
|
||||||
|
#else
|
||||||
|
int tid = dir;
|
||||||
|
#endif
|
||||||
tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
|
tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
|
||||||
(void *)&rbuf[dir][0], recv_from_rank,
|
(void *)&rbuf[dir][0], recv_from_rank,
|
||||||
bytes,dir);
|
bytes,tid);
|
||||||
|
|
||||||
#ifdef GRID_OMP
|
#ifdef GRID_OMP
|
||||||
#pragma omp atomic
|
#pragma omp atomic
|
||||||
|
@ -169,7 +169,11 @@ int main (int argc, char ** argv)
|
|||||||
for(int lat=4;lat<=maxlat;lat+=4){
|
for(int lat=4;lat<=maxlat;lat+=4){
|
||||||
for(int Ls=8;Ls<=8;Ls*=2){
|
for(int Ls=8;Ls<=8;Ls*=2){
|
||||||
|
|
||||||
std::vector<int> latt_size ({lat,lat,lat,lat});
|
std::vector<int> latt_size ({lat*mpi_layout[0],
|
||||||
|
lat*mpi_layout[1],
|
||||||
|
lat*mpi_layout[2],
|
||||||
|
lat*mpi_layout[3]});
|
||||||
|
|
||||||
|
|
||||||
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
||||||
RealD Nrank = Grid._Nprocessors;
|
RealD Nrank = Grid._Nprocessors;
|
||||||
@ -446,7 +450,7 @@ int main (int argc, char ** argv)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef GRID_OMP
|
||||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
|
std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||||
@ -485,7 +489,8 @@ int main (int argc, char ** argv)
|
|||||||
dbytes=0;
|
dbytes=0;
|
||||||
ncomm=0;
|
ncomm=0;
|
||||||
|
|
||||||
parallel_for(int dir=0;dir<8;dir++){
|
#pragma omp parallel for num_threads(Grid::CartesianCommunicator::nCommThreads)
|
||||||
|
for(int dir=0;dir<8;dir++){
|
||||||
|
|
||||||
double tbytes;
|
double tbytes;
|
||||||
int mu =dir % 4;
|
int mu =dir % 4;
|
||||||
@ -502,9 +507,9 @@ int main (int argc, char ** argv)
|
|||||||
int comm_proc = mpi_layout[mu]-1;
|
int comm_proc = mpi_layout[mu]-1;
|
||||||
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
||||||
}
|
}
|
||||||
|
int tid = omp_get_thread_num();
|
||||||
tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
|
tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
|
||||||
(void *)&rbuf[dir][0], recv_from_rank, bytes,dir);
|
(void *)&rbuf[dir][0], recv_from_rank, bytes,tid);
|
||||||
|
|
||||||
#pragma omp atomic
|
#pragma omp atomic
|
||||||
dbytes+=tbytes;
|
dbytes+=tbytes;
|
||||||
@ -532,7 +537,7 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << "= All done; Bye Bye"<<std::endl;
|
std::cout<<GridLogMessage << "= All done; Bye Bye"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
|
||||||
|
@ -340,7 +340,7 @@ case ${ac_PRECISION} in
|
|||||||
esac
|
esac
|
||||||
|
|
||||||
###################### Shared memory allocation technique under MPI3
|
###################### Shared memory allocation technique under MPI3
|
||||||
AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|hugetlbfs|shmnone],
|
AC_ARG_ENABLE([shm],[AC_HELP_STRING([--enable-shm=shmopen|shmget|hugetlbfs|shmnone],
|
||||||
[Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=shmopen])
|
[Select SHM allocation technique])],[ac_SHM=${enable_shm}],[ac_SHM=shmopen])
|
||||||
|
|
||||||
case ${ac_SHM} in
|
case ${ac_SHM} in
|
||||||
@ -349,6 +349,10 @@ case ${ac_SHM} in
|
|||||||
AC_DEFINE([GRID_MPI3_SHMOPEN],[1],[GRID_MPI3_SHMOPEN] )
|
AC_DEFINE([GRID_MPI3_SHMOPEN],[1],[GRID_MPI3_SHMOPEN] )
|
||||||
;;
|
;;
|
||||||
|
|
||||||
|
shmget)
|
||||||
|
AC_DEFINE([GRID_MPI3_SHMGET],[1],[GRID_MPI3_SHMGET] )
|
||||||
|
;;
|
||||||
|
|
||||||
shmnone)
|
shmnone)
|
||||||
AC_DEFINE([GRID_MPI3_SHM_NONE],[1],[GRID_MPI3_SHM_NONE] )
|
AC_DEFINE([GRID_MPI3_SHM_NONE],[1],[GRID_MPI3_SHM_NONE] )
|
||||||
;;
|
;;
|
||||||
@ -366,7 +370,7 @@ esac
|
|||||||
AC_ARG_ENABLE([shmpath],[AC_HELP_STRING([--enable-shmpath=path],
|
AC_ARG_ENABLE([shmpath],[AC_HELP_STRING([--enable-shmpath=path],
|
||||||
[Select SHM mmap base path for hugetlbfs])],
|
[Select SHM mmap base path for hugetlbfs])],
|
||||||
[ac_SHMPATH=${enable_shmpath}],
|
[ac_SHMPATH=${enable_shmpath}],
|
||||||
[ac_SHMPATH=/var/lib/hugetlbfs/pagesize-2MB/])
|
[ac_SHMPATH=/var/lib/hugetlbfs/global/pagesize-2MB/])
|
||||||
AC_DEFINE_UNQUOTED([GRID_SHM_PATH],["$ac_SHMPATH"],[Path to a hugetlbfs filesystem for MMAPing])
|
AC_DEFINE_UNQUOTED([GRID_SHM_PATH],["$ac_SHMPATH"],[Path to a hugetlbfs filesystem for MMAPing])
|
||||||
|
|
||||||
############### communication type selection
|
############### communication type selection
|
||||||
|
@ -51,7 +51,7 @@ namespace Grid {
|
|||||||
|
|
||||||
virtual void Op (const Field &in, Field &out) = 0; // Abstract base
|
virtual void Op (const Field &in, Field &out) = 0; // Abstract base
|
||||||
virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base
|
virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base
|
||||||
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
|
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2) = 0;
|
||||||
virtual void HermOp(const Field &in, Field &out)=0;
|
virtual void HermOp(const Field &in, Field &out)=0;
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -309,36 +309,59 @@ namespace Grid {
|
|||||||
class SchurStaggeredOperator : public SchurOperatorBase<Field> {
|
class SchurStaggeredOperator : public SchurOperatorBase<Field> {
|
||||||
protected:
|
protected:
|
||||||
Matrix &_Mat;
|
Matrix &_Mat;
|
||||||
|
Field tmp;
|
||||||
|
RealD mass;
|
||||||
|
double tMpc;
|
||||||
|
double tIP;
|
||||||
|
double tMeo;
|
||||||
|
double taxpby_norm;
|
||||||
|
uint64_t ncall;
|
||||||
public:
|
public:
|
||||||
SchurStaggeredOperator (Matrix &Mat): _Mat(Mat){};
|
void Report(void)
|
||||||
|
{
|
||||||
|
std::cout << GridLogMessage << " HermOpAndNorm.Mpc "<< tMpc/ncall<<" usec "<<std::endl;
|
||||||
|
std::cout << GridLogMessage << " HermOpAndNorm.IP "<< tIP /ncall<<" usec "<<std::endl;
|
||||||
|
std::cout << GridLogMessage << " Mpc.MeoMoe "<< tMeo/ncall<<" usec "<<std::endl;
|
||||||
|
std::cout << GridLogMessage << " Mpc.axpby_norm "<< taxpby_norm/ncall<<" usec "<<std::endl;
|
||||||
|
}
|
||||||
|
SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid())
|
||||||
|
{
|
||||||
|
assert( _Mat.isTrivialEE() );
|
||||||
|
mass = _Mat.Mass();
|
||||||
|
tMpc=0;
|
||||||
|
tIP =0;
|
||||||
|
tMeo=0;
|
||||||
|
taxpby_norm=0;
|
||||||
|
ncall=0;
|
||||||
|
}
|
||||||
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
|
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
|
||||||
GridLogIterative.TimingMode(1);
|
ncall++;
|
||||||
std::cout << GridLogIterative << " HermOpAndNorm "<<std::endl;
|
tMpc-=usecond();
|
||||||
n2 = Mpc(in,out);
|
n2 = Mpc(in,out);
|
||||||
std::cout << GridLogIterative << " HermOpAndNorm.Mpc "<<std::endl;
|
tMpc+=usecond();
|
||||||
|
tIP-=usecond();
|
||||||
ComplexD dot= innerProduct(in,out);
|
ComplexD dot= innerProduct(in,out);
|
||||||
std::cout << GridLogIterative << " HermOpAndNorm.innerProduct "<<std::endl;
|
tIP+=usecond();
|
||||||
n1 = real(dot);
|
n1 = real(dot);
|
||||||
}
|
}
|
||||||
virtual void HermOp(const Field &in, Field &out){
|
virtual void HermOp(const Field &in, Field &out){
|
||||||
std::cout << GridLogIterative << " HermOp "<<std::endl;
|
ncall++;
|
||||||
Mpc(in,out);
|
tMpc-=usecond();
|
||||||
|
_Mat.Meooe(in,out);
|
||||||
|
_Mat.Meooe(out,tmp);
|
||||||
|
tMpc+=usecond();
|
||||||
|
taxpby_norm-=usecond();
|
||||||
|
axpby(out,-1.0,mass*mass,tmp,in);
|
||||||
|
taxpby_norm+=usecond();
|
||||||
}
|
}
|
||||||
virtual RealD Mpc (const Field &in, Field &out) {
|
virtual RealD Mpc (const Field &in, Field &out) {
|
||||||
Field tmp(in._grid);
|
tMeo-=usecond();
|
||||||
Field tmp2(in._grid);
|
|
||||||
|
|
||||||
std::cout << GridLogIterative << " HermOp.Mpc "<<std::endl;
|
|
||||||
_Mat.Mooee(in,out);
|
|
||||||
_Mat.Mooee(out,tmp);
|
|
||||||
std::cout << GridLogIterative << " HermOp.MooeeMooee "<<std::endl;
|
|
||||||
|
|
||||||
_Mat.Meooe(in,out);
|
_Mat.Meooe(in,out);
|
||||||
_Mat.Meooe(out,tmp2);
|
_Mat.Meooe(out,tmp);
|
||||||
std::cout << GridLogIterative << " HermOp.MeooeMeooe "<<std::endl;
|
tMeo+=usecond();
|
||||||
|
taxpby_norm-=usecond();
|
||||||
RealD nn=axpy_norm(out,-1.0,tmp2,tmp);
|
RealD nn=axpby_norm(out,-1.0,mass*mass,tmp,in);
|
||||||
std::cout << GridLogIterative << " HermOp.axpy_norm "<<std::endl;
|
taxpby_norm+=usecond();
|
||||||
return nn;
|
return nn;
|
||||||
}
|
}
|
||||||
virtual RealD MpcDag (const Field &in, Field &out){
|
virtual RealD MpcDag (const Field &in, Field &out){
|
||||||
|
@ -54,6 +54,7 @@ class ConjugateGradient : public OperatorFunction<Field> {
|
|||||||
|
|
||||||
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
|
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
|
||||||
|
|
||||||
|
|
||||||
psi.checkerboard = src.checkerboard;
|
psi.checkerboard = src.checkerboard;
|
||||||
conformable(psi, src);
|
conformable(psi, src);
|
||||||
|
|
||||||
@ -69,7 +70,6 @@ class ConjugateGradient : public OperatorFunction<Field> {
|
|||||||
|
|
||||||
|
|
||||||
Linop.HermOpAndNorm(psi, mmp, d, b);
|
Linop.HermOpAndNorm(psi, mmp, d, b);
|
||||||
|
|
||||||
|
|
||||||
r = src - mmp;
|
r = src - mmp;
|
||||||
p = r;
|
p = r;
|
||||||
@ -96,38 +96,44 @@ class ConjugateGradient : public OperatorFunction<Field> {
|
|||||||
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
|
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
|
||||||
|
|
||||||
GridStopWatch LinalgTimer;
|
GridStopWatch LinalgTimer;
|
||||||
|
GridStopWatch InnerTimer;
|
||||||
|
GridStopWatch AxpyNormTimer;
|
||||||
|
GridStopWatch LinearCombTimer;
|
||||||
GridStopWatch MatrixTimer;
|
GridStopWatch MatrixTimer;
|
||||||
GridStopWatch SolverTimer;
|
GridStopWatch SolverTimer;
|
||||||
|
|
||||||
SolverTimer.Start();
|
SolverTimer.Start();
|
||||||
int k;
|
int k;
|
||||||
for (k = 1; k <= MaxIterations; k++) {
|
for (k = 1; k <= MaxIterations*1000; k++) {
|
||||||
c = cp;
|
c = cp;
|
||||||
|
|
||||||
MatrixTimer.Start();
|
MatrixTimer.Start();
|
||||||
Linop.HermOpAndNorm(p, mmp, d, qq);
|
Linop.HermOp(p, mmp);
|
||||||
MatrixTimer.Stop();
|
MatrixTimer.Stop();
|
||||||
|
|
||||||
LinalgTimer.Start();
|
LinalgTimer.Start();
|
||||||
// RealD qqck = norm2(mmp);
|
|
||||||
// ComplexD dck = innerProduct(p,mmp);
|
|
||||||
|
|
||||||
|
InnerTimer.Start();
|
||||||
|
ComplexD dc = innerProduct(p,mmp);
|
||||||
|
InnerTimer.Stop();
|
||||||
|
d = dc.real();
|
||||||
a = c / d;
|
a = c / d;
|
||||||
b_pred = a * (a * qq - d) / c;
|
|
||||||
|
|
||||||
|
AxpyNormTimer.Start();
|
||||||
cp = axpy_norm(r, -a, mmp, r);
|
cp = axpy_norm(r, -a, mmp, r);
|
||||||
|
AxpyNormTimer.Stop();
|
||||||
b = cp / c;
|
b = cp / c;
|
||||||
|
|
||||||
// Fuse these loops ; should be really easy
|
LinearCombTimer.Start();
|
||||||
psi = a * p + psi;
|
parallel_for(int ss=0;ss<src._grid->oSites();ss++){
|
||||||
p = p * b + r;
|
vstream(psi[ss], a * p[ss] + psi[ss]);
|
||||||
|
vstream(p [ss], b * p[ss] + r[ss]);
|
||||||
|
}
|
||||||
|
LinearCombTimer.Stop();
|
||||||
LinalgTimer.Stop();
|
LinalgTimer.Stop();
|
||||||
|
|
||||||
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
|
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
|
||||||
<< " residual " << cp << " target " << rsq << std::endl;
|
<< " residual " << cp << " target " << rsq << std::endl;
|
||||||
std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << " b = "<< b << std::endl;
|
|
||||||
std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << " c = "<< c << std::endl;
|
|
||||||
|
|
||||||
// Stopping condition
|
// Stopping condition
|
||||||
if (cp <= rsq) {
|
if (cp <= rsq) {
|
||||||
@ -148,6 +154,9 @@ class ConjugateGradient : public OperatorFunction<Field> {
|
|||||||
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
|
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
|
||||||
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
|
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
|
||||||
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
|
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
|
||||||
|
std::cout << GridLogMessage << "\tInner " << InnerTimer.Elapsed() <<std::endl;
|
||||||
|
std::cout << GridLogMessage << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
|
||||||
|
std::cout << GridLogMessage << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
|
||||||
|
|
||||||
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
|
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
|
||||||
|
|
||||||
|
@ -43,6 +43,7 @@ namespace Grid {
|
|||||||
public:
|
public:
|
||||||
RealD Tolerance;
|
RealD Tolerance;
|
||||||
Integer MaxIterations;
|
Integer MaxIterations;
|
||||||
|
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
|
||||||
int verbose;
|
int verbose;
|
||||||
MultiShiftFunction shifts;
|
MultiShiftFunction shifts;
|
||||||
|
|
||||||
@ -163,7 +164,16 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
|
|||||||
for(int s=0;s<nshift;s++) {
|
for(int s=0;s<nshift;s++) {
|
||||||
axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
|
axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
///////////////////////////////////////
|
||||||
|
// Timers
|
||||||
|
///////////////////////////////////////
|
||||||
|
GridStopWatch AXPYTimer;
|
||||||
|
GridStopWatch ShiftTimer;
|
||||||
|
GridStopWatch QRTimer;
|
||||||
|
GridStopWatch MatrixTimer;
|
||||||
|
GridStopWatch SolverTimer;
|
||||||
|
SolverTimer.Start();
|
||||||
|
|
||||||
// Iteration loop
|
// Iteration loop
|
||||||
int k;
|
int k;
|
||||||
@ -171,7 +181,9 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
|
|||||||
for (k=1;k<=MaxIterations;k++){
|
for (k=1;k<=MaxIterations;k++){
|
||||||
|
|
||||||
a = c /cp;
|
a = c /cp;
|
||||||
|
AXPYTimer.Start();
|
||||||
axpy(p,a,p,r);
|
axpy(p,a,p,r);
|
||||||
|
AXPYTimer.Stop();
|
||||||
|
|
||||||
// Note to self - direction ps is iterated separately
|
// Note to self - direction ps is iterated separately
|
||||||
// for each shift. Does not appear to have any scope
|
// for each shift. Does not appear to have any scope
|
||||||
@ -180,6 +192,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
|
|||||||
// However SAME r is used. Could load "r" and update
|
// However SAME r is used. Could load "r" and update
|
||||||
// ALL ps[s]. 2/3 Bandwidth saving
|
// ALL ps[s]. 2/3 Bandwidth saving
|
||||||
// New Kernel: Load r, vector of coeffs, vector of pointers ps
|
// New Kernel: Load r, vector of coeffs, vector of pointers ps
|
||||||
|
AXPYTimer.Start();
|
||||||
for(int s=0;s<nshift;s++){
|
for(int s=0;s<nshift;s++){
|
||||||
if ( ! converged[s] ) {
|
if ( ! converged[s] ) {
|
||||||
if (s==0){
|
if (s==0){
|
||||||
@ -190,22 +203,34 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
AXPYTimer.Stop();
|
||||||
|
|
||||||
cp=c;
|
cp=c;
|
||||||
|
MatrixTimer.Start();
|
||||||
|
//Linop.HermOpAndNorm(p,mmp,d,qq); // d is used
|
||||||
|
// The below is faster on KNL
|
||||||
|
Linop.HermOp(p,mmp);
|
||||||
|
d=real(innerProduct(p,mmp));
|
||||||
|
|
||||||
Linop.HermOpAndNorm(p,mmp,d,qq);
|
MatrixTimer.Stop();
|
||||||
|
|
||||||
|
AXPYTimer.Start();
|
||||||
axpy(mmp,mass[0],p,mmp);
|
axpy(mmp,mass[0],p,mmp);
|
||||||
|
AXPYTimer.Stop();
|
||||||
RealD rn = norm2(p);
|
RealD rn = norm2(p);
|
||||||
d += rn*mass[0];
|
d += rn*mass[0];
|
||||||
|
|
||||||
bp=b;
|
bp=b;
|
||||||
b=-cp/d;
|
b=-cp/d;
|
||||||
|
|
||||||
|
AXPYTimer.Start();
|
||||||
c=axpy_norm(r,b,mmp,r);
|
c=axpy_norm(r,b,mmp,r);
|
||||||
|
AXPYTimer.Stop();
|
||||||
|
|
||||||
// Toggle the recurrence history
|
// Toggle the recurrence history
|
||||||
bs[0] = b;
|
bs[0] = b;
|
||||||
iz = 1-iz;
|
iz = 1-iz;
|
||||||
|
ShiftTimer.Start();
|
||||||
for(int s=1;s<nshift;s++){
|
for(int s=1;s<nshift;s++){
|
||||||
if((!converged[s])){
|
if((!converged[s])){
|
||||||
RealD z0 = z[s][1-iz];
|
RealD z0 = z[s][1-iz];
|
||||||
@ -215,6 +240,7 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
|
|||||||
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
|
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
ShiftTimer.Stop();
|
||||||
|
|
||||||
for(int s=0;s<nshift;s++){
|
for(int s=0;s<nshift;s++){
|
||||||
int ss = s;
|
int ss = s;
|
||||||
@ -257,6 +283,9 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
|
|||||||
|
|
||||||
if ( all_converged ){
|
if ( all_converged ){
|
||||||
|
|
||||||
|
SolverTimer.Stop();
|
||||||
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl;
|
std::cout<<GridLogMessage<< "CGMultiShift: All shifts have converged iteration "<<k<<std::endl;
|
||||||
std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl;
|
std::cout<<GridLogMessage<< "CGMultiShift: Checking solutions"<<std::endl;
|
||||||
|
|
||||||
@ -269,8 +298,19 @@ void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector
|
|||||||
RealD cn = norm2(src);
|
RealD cn = norm2(src);
|
||||||
std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
|
std::cout<<GridLogMessage<<"CGMultiShift: shift["<<s<<"] true residual "<<std::sqrt(rn/cn)<<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
|
||||||
|
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
|
||||||
|
std::cout << GridLogMessage << "\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
|
||||||
|
std::cout << GridLogMessage << "\tMarix " << MatrixTimer.Elapsed() <<std::endl;
|
||||||
|
std::cout << GridLogMessage << "\tShift " << ShiftTimer.Elapsed() <<std::endl;
|
||||||
|
|
||||||
|
IterationsToComplete = k;
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
// ugly hack
|
// ugly hack
|
||||||
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
|
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
|
||||||
|
@ -57,8 +57,9 @@ void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, i
|
|||||||
|
|
||||||
parallel_region
|
parallel_region
|
||||||
{
|
{
|
||||||
std::vector < vobj > B(Nm); // Thread private
|
|
||||||
|
std::vector < vobj , commAllocator<vobj> > B(Nm); // Thread private
|
||||||
|
|
||||||
parallel_for_internal(int ss=0;ss < grid->oSites();ss++){
|
parallel_for_internal(int ss=0;ss < grid->oSites();ss++){
|
||||||
for(int j=j0; j<j1; ++j) B[j]=0.;
|
for(int j=j0; j<j1; ++j) B[j]=0.;
|
||||||
|
|
||||||
|
@ -114,19 +114,151 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
|
|||||||
assert(WorldNode!=-1);
|
assert(WorldNode!=-1);
|
||||||
_ShmSetup=1;
|
_ShmSetup=1;
|
||||||
}
|
}
|
||||||
|
// Gray encode support
|
||||||
void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
|
int BinaryToGray (int binary) {
|
||||||
|
int gray = (binary>>1)^binary;
|
||||||
|
return gray;
|
||||||
|
}
|
||||||
|
int Log2Size(int TwoToPower,int MAXLOG2)
|
||||||
{
|
{
|
||||||
////////////////////////////////////////////////////////////////
|
|
||||||
// Assert power of two shm_size.
|
|
||||||
////////////////////////////////////////////////////////////////
|
|
||||||
int log2size = -1;
|
int log2size = -1;
|
||||||
for(int i=0;i<=MAXLOG2RANKSPERNODE;i++){
|
for(int i=0;i<=MAXLOG2;i++){
|
||||||
if ( (0x1<<i) == WorldShmSize ) {
|
if ( (0x1<<i) == TwoToPower ) {
|
||||||
log2size = i;
|
log2size = i;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return log2size;
|
||||||
|
}
|
||||||
|
void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
|
||||||
|
{
|
||||||
|
#undef HYPERCUBE
|
||||||
|
#ifdef HYPERCUBE
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// Assert power of two shm_size.
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
|
||||||
|
assert(log2size != -1);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// Identify the hypercube coordinate of this node using hostname
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// n runs 0...7 9...16 18...25 27...34 (8*4) 5 bits
|
||||||
|
// i runs 0..7 3 bits
|
||||||
|
// r runs 0..3 2 bits
|
||||||
|
// 2^10 = 1024 nodes
|
||||||
|
const int maxhdim = 10;
|
||||||
|
std::vector<int> HyperCubeCoords(maxhdim,0);
|
||||||
|
std::vector<int> RootHyperCubeCoords(maxhdim,0);
|
||||||
|
int R;
|
||||||
|
int I;
|
||||||
|
int N;
|
||||||
|
const int namelen = _POSIX_HOST_NAME_MAX;
|
||||||
|
char name[namelen];
|
||||||
|
|
||||||
|
// Parse ICE-XA hostname to get hypercube location
|
||||||
|
gethostname(name,namelen);
|
||||||
|
int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
|
||||||
|
assert(nscan==3);
|
||||||
|
|
||||||
|
int nlo = N%9;
|
||||||
|
int nhi = N/9;
|
||||||
|
uint32_t hypercoor = (R<<8)|(I<<5)|(nhi<<3)|nlo ;
|
||||||
|
uint32_t rootcoor = hypercoor;
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////
|
||||||
|
// Print debug info
|
||||||
|
//////////////////////////////////////////////////////////////////
|
||||||
|
for(int d=0;d<maxhdim;d++){
|
||||||
|
HyperCubeCoords[d] = (hypercoor>>d)&0x1;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string hname(name);
|
||||||
|
std::cout << "hostname "<<hname<<std::endl;
|
||||||
|
std::cout << "R " << R << " I " << I << " N "<< N<<
|
||||||
|
<< " hypercoor 0x"<<std::hex<<hypercoor<<std::dec<<std::endl;
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////
|
||||||
|
// broadcast node 0's base coordinate for this partition.
|
||||||
|
//////////////////////////////////////////////////////////////////
|
||||||
|
MPI_Bcast(&rootcoor, sizeof(rootcoor), MPI_BYTE, 0, WorldComm);
|
||||||
|
hypercoor=hypercoor-rootcoor;
|
||||||
|
assert(hypercoor<WorldSize);
|
||||||
|
assert(hypercoor>=0);
|
||||||
|
|
||||||
|
//////////////////////////////////////
|
||||||
|
// Printing
|
||||||
|
//////////////////////////////////////
|
||||||
|
for(int d=0;d<maxhdim;d++){
|
||||||
|
HyperCubeCoords[d] = (hypercoor>>d)&0x1;
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// Identify subblock of ranks on node spreading across dims
|
||||||
|
// in a maximally symmetrical way
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
int ndimension = processors.size();
|
||||||
|
std::vector<int> processor_coor(ndimension);
|
||||||
|
std::vector<int> WorldDims = processors; std::vector<int> ShmDims (ndimension,1); std::vector<int> NodeDims (ndimension);
|
||||||
|
std::vector<int> ShmCoor (ndimension); std::vector<int> NodeCoor (ndimension); std::vector<int> WorldCoor(ndimension);
|
||||||
|
std::vector<int> HyperCoor(ndimension);
|
||||||
|
int dim = 0;
|
||||||
|
for(int l2=0;l2<log2size;l2++){
|
||||||
|
while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
|
||||||
|
ShmDims[dim]*=2;
|
||||||
|
dim=(dim+1)%ndimension;
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// Establish torus of processes and nodes with sub-blockings
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
for(int d=0;d<ndimension;d++){
|
||||||
|
NodeDims[d] = WorldDims[d]/ShmDims[d];
|
||||||
|
}
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// Map Hcube according to physical lattice
|
||||||
|
// must partition. Loop over dims and find out who would join.
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
int hcoor = hypercoor;
|
||||||
|
for(int d=0;d<ndimension;d++){
|
||||||
|
int bits = Log2Size(NodeDims[d],MAXLOG2RANKSPERNODE);
|
||||||
|
int msk = (0x1<<bits)-1;
|
||||||
|
HyperCoor[d]=hcoor & msk;
|
||||||
|
HyperCoor[d]=BinaryToGray(HyperCoor[d]); // Space filling curve magic
|
||||||
|
hcoor = hcoor >> bits;
|
||||||
|
}
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// Check processor counts match
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
int Nprocessors=1;
|
||||||
|
for(int i=0;i<ndimension;i++){
|
||||||
|
Nprocessors*=processors[i];
|
||||||
|
}
|
||||||
|
assert(WorldSize==Nprocessors);
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// Establish mapping between lexico physics coord and WorldRank
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
int rank;
|
||||||
|
|
||||||
|
Lexicographic::CoorFromIndexReversed(NodeCoor,WorldNode ,NodeDims);
|
||||||
|
|
||||||
|
for(int d=0;d<ndimension;d++) NodeCoor[d]=HyperCoor[d];
|
||||||
|
|
||||||
|
Lexicographic::CoorFromIndexReversed(ShmCoor ,WorldShmRank,ShmDims);
|
||||||
|
for(int d=0;d<ndimension;d++) WorldCoor[d] = NodeCoor[d]*ShmDims[d]+ShmCoor[d];
|
||||||
|
Lexicographic::IndexFromCoorReversed(WorldCoor,rank,WorldDims);
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
// Build the new communicator
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
|
||||||
|
assert(ierr==0);
|
||||||
|
#else
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// Assert power of two shm_size.
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
|
||||||
assert(log2size != -1);
|
assert(log2size != -1);
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////
|
||||||
@ -175,15 +307,77 @@ void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,
|
|||||||
/////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////
|
||||||
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
|
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
|
||||||
assert(ierr==0);
|
assert(ierr==0);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// SHMGET
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
#ifdef GRID_MPI3_SHMGET
|
||||||
|
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||||
|
{
|
||||||
|
std::cout << "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
|
||||||
|
assert(_ShmSetup==1);
|
||||||
|
assert(_ShmAlloc==0);
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// allocate the shared windows for our group
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
MPI_Barrier(WorldShmComm);
|
||||||
|
WorldShmCommBufs.resize(WorldShmSize);
|
||||||
|
std::vector<int> shmids(WorldShmSize);
|
||||||
|
|
||||||
|
if ( WorldShmRank == 0 ) {
|
||||||
|
for(int r=0;r<WorldShmSize;r++){
|
||||||
|
size_t size = bytes;
|
||||||
|
key_t key = IPC_PRIVATE;
|
||||||
|
int flags = IPC_CREAT | SHM_R | SHM_W;
|
||||||
|
#ifdef SHM_HUGETLB
|
||||||
|
if (Hugepages) flags|=SHM_HUGETLB;
|
||||||
|
#endif
|
||||||
|
if ((shmids[r]= shmget(key,size, flags)) ==-1) {
|
||||||
|
int errsv = errno;
|
||||||
|
printf("Errno %d\n",errsv);
|
||||||
|
printf("key %d\n",key);
|
||||||
|
printf("size %lld\n",size);
|
||||||
|
printf("flags %d\n",flags);
|
||||||
|
perror("shmget");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
MPI_Barrier(WorldShmComm);
|
||||||
|
MPI_Bcast(&shmids[0],WorldShmSize*sizeof(int),MPI_BYTE,0,WorldShmComm);
|
||||||
|
MPI_Barrier(WorldShmComm);
|
||||||
|
|
||||||
|
for(int r=0;r<WorldShmSize;r++){
|
||||||
|
WorldShmCommBufs[r] = (uint64_t *)shmat(shmids[r], NULL,0);
|
||||||
|
if (WorldShmCommBufs[r] == (uint64_t *)-1) {
|
||||||
|
perror("Shared memory attach failure");
|
||||||
|
shmctl(shmids[r], IPC_RMID, NULL);
|
||||||
|
exit(2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
MPI_Barrier(WorldShmComm);
|
||||||
|
///////////////////////////////////
|
||||||
|
// Mark for clean up
|
||||||
|
///////////////////////////////////
|
||||||
|
for(int r=0;r<WorldShmSize;r++){
|
||||||
|
shmctl(shmids[r], IPC_RMID,(struct shmid_ds *)NULL);
|
||||||
|
}
|
||||||
|
MPI_Barrier(WorldShmComm);
|
||||||
|
|
||||||
|
_ShmAlloc=1;
|
||||||
|
_ShmAllocBytes = bytes;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Hugetlbfs mapping intended
|
// Hugetlbfs mapping intended
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
#ifdef GRID_MPI3_SHMMMAP
|
#ifdef GRID_MPI3_SHMMMAP
|
||||||
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||||
{
|
{
|
||||||
std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<<std::endl;
|
std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
|
||||||
assert(_ShmSetup==1);
|
assert(_ShmSetup==1);
|
||||||
assert(_ShmAlloc==0);
|
assert(_ShmAlloc==0);
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
@ -193,7 +387,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
WorldShmCommBufs.resize(WorldShmSize);
|
WorldShmCommBufs.resize(WorldShmSize);
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Hugetlbf and others map filesystems as mappable huge pages
|
// Hugetlbfs and others map filesystems as mappable huge pages
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
char shm_name [NAME_MAX];
|
char shm_name [NAME_MAX];
|
||||||
for(int r=0;r<WorldShmSize;r++){
|
for(int r=0;r<WorldShmSize;r++){
|
||||||
@ -344,6 +538,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////
|
||||||
// Global shared functionality finished
|
// Global shared functionality finished
|
||||||
// Now move to per communicator functionality
|
// Now move to per communicator functionality
|
||||||
|
@ -244,19 +244,11 @@ namespace Grid {
|
|||||||
|
|
||||||
template<class sobj,class vobj> strong_inline
|
template<class sobj,class vobj> strong_inline
|
||||||
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
|
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
|
||||||
ret.checkerboard = x.checkerboard;
|
return axpy_norm_fast(ret,a,x,y);
|
||||||
conformable(ret,x);
|
|
||||||
conformable(x,y);
|
|
||||||
axpy(ret,a,x,y);
|
|
||||||
return norm2(ret);
|
|
||||||
}
|
}
|
||||||
template<class sobj,class vobj> strong_inline
|
template<class sobj,class vobj> strong_inline
|
||||||
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
|
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
|
||||||
ret.checkerboard = x.checkerboard;
|
return axpby_norm_fast(ret,a,b,x,y);
|
||||||
conformable(ret,x);
|
|
||||||
conformable(x,y);
|
|
||||||
axpby(ret,a,b,x,y);
|
|
||||||
return norm2(ret); // FIXME implement parallel norm in ss loop
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -33,7 +33,7 @@ namespace Grid {
|
|||||||
// Deterministic Reduction operations
|
// Deterministic Reduction operations
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
|
template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
|
||||||
ComplexD nrm = innerProduct(arg,arg);
|
auto nrm = innerProduct(arg,arg);
|
||||||
return std::real(nrm);
|
return std::real(nrm);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -43,31 +43,84 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
|
|||||||
{
|
{
|
||||||
typedef typename vobj::scalar_type scalar_type;
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
typedef typename vobj::vector_typeD vector_type;
|
typedef typename vobj::vector_typeD vector_type;
|
||||||
scalar_type nrm;
|
|
||||||
|
|
||||||
GridBase *grid = left._grid;
|
GridBase *grid = left._grid;
|
||||||
|
const int pad = 8;
|
||||||
std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize());
|
|
||||||
|
ComplexD inner;
|
||||||
|
Vector<ComplexD> sumarray(grid->SumArraySize()*pad);
|
||||||
|
|
||||||
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
|
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
|
||||||
int nwork, mywork, myoff;
|
int nwork, mywork, myoff;
|
||||||
GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
|
GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
|
||||||
|
|
||||||
decltype(innerProductD(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation
|
decltype(innerProductD(left._odata[0],right._odata[0])) vinner=zero; // private to thread; sub summation
|
||||||
for(int ss=myoff;ss<mywork+myoff; ss++){
|
for(int ss=myoff;ss<mywork+myoff; ss++){
|
||||||
vnrm = vnrm + innerProductD(left._odata[ss],right._odata[ss]);
|
vinner = vinner + innerProductD(left._odata[ss],right._odata[ss]);
|
||||||
}
|
}
|
||||||
sumarray[thr]=TensorRemove(vnrm) ;
|
// All threads sum across SIMD; reduce serial work at end
|
||||||
|
// one write per cacheline with streaming store
|
||||||
|
ComplexD tmp = Reduce(TensorRemove(vinner)) ;
|
||||||
|
vstream(sumarray[thr*pad],tmp);
|
||||||
}
|
}
|
||||||
|
|
||||||
vector_type vvnrm; vvnrm=zero; // sum across threads
|
inner=0.0;
|
||||||
for(int i=0;i<grid->SumArraySize();i++){
|
for(int i=0;i<grid->SumArraySize();i++){
|
||||||
vvnrm = vvnrm+sumarray[i];
|
inner = inner+sumarray[i*pad];
|
||||||
}
|
}
|
||||||
nrm = Reduce(vvnrm);// sum across simd
|
right._grid->GlobalSum(inner);
|
||||||
right._grid->GlobalSum(nrm);
|
return inner;
|
||||||
return nrm;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/////////////////////////
|
||||||
|
// Fast axpby_norm
|
||||||
|
// z = a x + b y
|
||||||
|
// return norm z
|
||||||
|
/////////////////////////
|
||||||
|
// z = a*x + y. Forwards to axpby_norm_fast with b = 1 and returns whatever that
// routine returns for z.
template<class sobj,class vobj> strong_inline RealD
axpy_norm_fast(Lattice<vobj> &z,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
  return axpby_norm_fast(z,a,sobj(1.0),x,y);
}
|
||||||
|
|
||||||
|
// Fused update and reduction: writes z = a*x + b*y site by site and returns the sum
// over all sites of innerProductD(z,z), globally summed over the grid.
//
// Each iteration of the parallel loop handles the block of outer sites handed out by
// GridThread::GetWork and deposits its partial sum in its own slot of `partials`;
// the slots are then added up serially before the GlobalSum.
template<class sobj,class vobj> strong_inline RealD
axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
  const int pad = 8; // slots are stored `pad` entries apart

  z.checkerboard = x.checkerboard;
  conformable(z,x);
  conformable(x,y);

  GridBase *grid = x._grid;

  Vector<RealD> partials(grid->SumArraySize()*pad);

  parallel_for(int slot=0;slot<grid->SumArraySize();slot++){
    int count, first;
    GridThread::GetWork(grid->oSites(),slot,count,first);

    // accumulator local to this iteration
    decltype(innerProductD(z._odata[0],z._odata[0])) acc=zero;
    for(int site=first;site<count+first; site++){
      // build the result in a temporary, accumulate its norm, then store it
      vobj zsite = a*x._odata[site]+b*y._odata[site];
      acc = acc + innerProductD(zsite,zsite);
      vstream(z._odata[site],zsite);
    }
    vstream(partials[slot*pad],real(Reduce(TensorRemove(acc)))) ;
  }

  // serial sum over the slots; linear in the slot count
  RealD total = 0.0;
  for(int slot=0;slot<grid->SumArraySize();slot++){
    total = total+partials[slot*pad];
  }
  z._grid->GlobalSum(total);
  return total;
}
|
||||||
|
|
||||||
|
|
||||||
template<class Op,class T1>
|
template<class Op,class T1>
|
||||||
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
|
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
|
||||||
|
@ -158,10 +158,19 @@ namespace Grid {
|
|||||||
// tens of seconds per trajectory so this is clean in all reasonable cases,
|
// tens of seconds per trajectory so this is clean in all reasonable cases,
|
||||||
// and margin of safety is orders of magnitude.
|
// and margin of safety is orders of magnitude.
|
||||||
// We could hack Sitmo to skip in the higher order words of state if necessary
|
// We could hack Sitmo to skip in the higher order words of state if necessary
|
||||||
|
//
|
||||||
|
// Replace with 2^30 ; avoid problem on large volumes
|
||||||
|
//
|
||||||
/////////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////////
|
||||||
// uint64_t skip = site+1; // Old init Skipped then drew. Checked compat with faster init
|
// uint64_t skip = site+1; // Old init Skipped then drew. Checked compat with faster init
|
||||||
|
const int shift = 30;
|
||||||
|
|
||||||
uint64_t skip = site;
|
uint64_t skip = site;
|
||||||
skip = skip<<40;
|
|
||||||
|
skip = skip<<shift;
|
||||||
|
|
||||||
|
assert((skip >> shift)==site); // check for overflow
|
||||||
|
|
||||||
eng.discard(skip);
|
eng.discard(skip);
|
||||||
// std::cout << " Engine " <<site << " state " <<eng<<std::endl;
|
// std::cout << " Engine " <<site << " state " <<eng<<std::endl;
|
||||||
}
|
}
|
||||||
|
@ -263,7 +263,7 @@ PARALLEL_CRITICAL
|
|||||||
GridBase *grid,
|
GridBase *grid,
|
||||||
std::vector<fobj> &iodata,
|
std::vector<fobj> &iodata,
|
||||||
std::string file,
|
std::string file,
|
||||||
uint64_t offset,
|
uint64_t& offset,
|
||||||
const std::string &format, int control,
|
const std::string &format, int control,
|
||||||
uint32_t &nersc_csum,
|
uint32_t &nersc_csum,
|
||||||
uint32_t &scidac_csuma,
|
uint32_t &scidac_csuma,
|
||||||
@ -431,14 +431,20 @@ PARALLEL_CRITICAL
|
|||||||
MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0);
|
MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::cout << GridLogDebug << "MPI read I/O set view " << file << std::endl;
|
std::cout << GridLogDebug << "MPI write I/O set view " << file << std::endl;
|
||||||
ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);
|
ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);
|
||||||
assert(ierr == 0);
|
assert(ierr == 0);
|
||||||
|
|
||||||
std::cout << GridLogDebug << "MPI read I/O write all " << file << std::endl;
|
std::cout << GridLogDebug << "MPI write I/O write all " << file << std::endl;
|
||||||
ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
|
ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
|
||||||
assert(ierr == 0);
|
assert(ierr == 0);
|
||||||
|
|
||||||
|
MPI_Offset os;
|
||||||
|
MPI_File_get_position(fh, &os);
|
||||||
|
MPI_File_get_byte_offset(fh, os, &disp);
|
||||||
|
offset = disp;
|
||||||
|
|
||||||
|
|
||||||
MPI_File_close(&fh);
|
MPI_File_close(&fh);
|
||||||
MPI_Type_free(&fileArray);
|
MPI_Type_free(&fileArray);
|
||||||
MPI_Type_free(&localArray);
|
MPI_Type_free(&localArray);
|
||||||
@ -448,7 +454,7 @@ PARALLEL_CRITICAL
|
|||||||
} else {
|
} else {
|
||||||
|
|
||||||
std::cout << GridLogMessage << "IOobject: C++ write I/O " << file << " : "
|
std::cout << GridLogMessage << "IOobject: C++ write I/O " << file << " : "
|
||||||
<< iodata.size() * sizeof(fobj) << " bytes" << std::endl;
|
<< iodata.size() * sizeof(fobj) << " bytes and offset " << offset << std::endl;
|
||||||
|
|
||||||
std::ofstream fout;
|
std::ofstream fout;
|
||||||
fout.exceptions ( std::fstream::failbit | std::fstream::badbit );
|
fout.exceptions ( std::fstream::failbit | std::fstream::badbit );
|
||||||
@ -495,6 +501,7 @@ PARALLEL_CRITICAL
|
|||||||
exit(1);
|
exit(1);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
offset = fout.tellp();
|
||||||
fout.close();
|
fout.close();
|
||||||
}
|
}
|
||||||
timer.Stop();
|
timer.Stop();
|
||||||
@ -699,7 +706,6 @@ PARALLEL_CRITICAL
|
|||||||
|
|
||||||
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
|
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
|
||||||
nersc_csum,scidac_csuma,scidac_csumb);
|
nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
|
|
||||||
iodata.resize(1);
|
iodata.resize(1);
|
||||||
{
|
{
|
||||||
std::vector<RngStateType> tmp(RngStateCount);
|
std::vector<RngStateType> tmp(RngStateCount);
|
||||||
|
@ -182,6 +182,11 @@ class GridLimeReader : public BinaryIO {
|
|||||||
{
|
{
|
||||||
filename= _filename;
|
filename= _filename;
|
||||||
File = fopen(filename.c_str(), "r");
|
File = fopen(filename.c_str(), "r");
|
||||||
|
if (File == nullptr)
|
||||||
|
{
|
||||||
|
std::cerr << "cannot open file '" << filename << "'" << std::endl;
|
||||||
|
abort();
|
||||||
|
}
|
||||||
LimeR = limeCreateReader(File);
|
LimeR = limeCreateReader(File);
|
||||||
}
|
}
|
||||||
/////////////////////////////////////////////
|
/////////////////////////////////////////////
|
||||||
|
@ -49,7 +49,8 @@ inline double usecond(void) {
|
|||||||
|
|
||||||
typedef std::chrono::system_clock GridClock;
|
typedef std::chrono::system_clock GridClock;
|
||||||
typedef std::chrono::time_point<GridClock> GridTimePoint;
|
typedef std::chrono::time_point<GridClock> GridTimePoint;
|
||||||
typedef std::chrono::milliseconds GridTime;
|
typedef std::chrono::milliseconds GridMillisecs;
|
||||||
|
typedef std::chrono::microseconds GridTime;
|
||||||
typedef std::chrono::microseconds GridUsecs;
|
typedef std::chrono::microseconds GridUsecs;
|
||||||
|
|
||||||
inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time)
|
inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milliseconds & time)
|
||||||
@ -57,6 +58,11 @@ inline std::ostream& operator<< (std::ostream & stream, const std::chrono::milli
|
|||||||
stream << time.count()<<" ms";
|
stream << time.count()<<" ms";
|
||||||
return stream;
|
return stream;
|
||||||
}
|
}
|
||||||
|
// Print a microsecond duration as its tick count followed by " usec".
inline std::ostream& operator<< (std::ostream & stream, const std::chrono::microseconds & time)
{
  return stream << time.count() << " usec";
}
|
||||||
|
|
||||||
class GridStopWatch {
|
class GridStopWatch {
|
||||||
private:
|
private:
|
||||||
|
@ -63,9 +63,12 @@ namespace Grid {
|
|||||||
virtual RealD M (const FermionField &in, FermionField &out)=0;
|
virtual RealD M (const FermionField &in, FermionField &out)=0;
|
||||||
virtual RealD Mdag (const FermionField &in, FermionField &out)=0;
|
virtual RealD Mdag (const FermionField &in, FermionField &out)=0;
|
||||||
|
|
||||||
// half checkerboard operaions
|
// Query the even even properties to make algorithmic decisions
|
||||||
virtual int ConstEE(void) { return 1; }; // clover returns zero as EE depends on gauge field
|
virtual int ConstEE(void) { return 1; }; // clover returns zero as EE depends on gauge field
|
||||||
|
virtual int isTrivialEE(void) { return 0; };
|
||||||
|
virtual RealD Mass(void) {return 0.0;};
|
||||||
|
|
||||||
|
// half checkerboard operaions
|
||||||
virtual void Meooe (const FermionField &in, FermionField &out)=0;
|
virtual void Meooe (const FermionField &in, FermionField &out)=0;
|
||||||
virtual void MeooeDag (const FermionField &in, FermionField &out)=0;
|
virtual void MeooeDag (const FermionField &in, FermionField &out)=0;
|
||||||
virtual void Mooee (const FermionField &in, FermionField &out)=0;
|
virtual void Mooee (const FermionField &in, FermionField &out)=0;
|
||||||
|
@ -764,7 +764,12 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
|
|||||||
inline void loadLinkElement(Simd &reg, ref &memory) {
|
inline void loadLinkElement(Simd &reg, ref &memory) {
|
||||||
reg = memory;
|
reg = memory;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Store the link field U as Lorentz component `mu` of the doubled gauge field U_ds.
inline void InsertGaugeField(DoubledGaugeField &U_ds, const GaugeLinkField &U, int mu)
{
  PokeIndex<LorentzIndex>(U_ds, U, mu);
}
|
||||||
inline void DoubleStore(GridBase *GaugeGrid,
|
inline void DoubleStore(GridBase *GaugeGrid,
|
||||||
DoubledGaugeField &UUUds, // for Naik term
|
DoubledGaugeField &UUUds, // for Naik term
|
||||||
DoubledGaugeField &Uds,
|
DoubledGaugeField &Uds,
|
||||||
@ -803,8 +808,10 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
|
|||||||
U = U *phases;
|
U = U *phases;
|
||||||
Udag = Udag *phases;
|
Udag = Udag *phases;
|
||||||
|
|
||||||
PokeIndex<LorentzIndex>(Uds, U, mu);
|
InsertGaugeField(Uds,U,mu);
|
||||||
PokeIndex<LorentzIndex>(Uds, Udag, mu + 4);
|
InsertGaugeField(Uds,Udag,mu+4);
|
||||||
|
// PokeIndex<LorentzIndex>(Uds, U, mu);
|
||||||
|
// PokeIndex<LorentzIndex>(Uds, Udag, mu + 4);
|
||||||
|
|
||||||
// 3 hop based on thin links. Crazy huh ?
|
// 3 hop based on thin links. Crazy huh ?
|
||||||
U = PeekIndex<LorentzIndex>(Uthin, mu);
|
U = PeekIndex<LorentzIndex>(Uthin, mu);
|
||||||
@ -816,8 +823,8 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
|
|||||||
UUU = UUU *phases;
|
UUU = UUU *phases;
|
||||||
UUUdag = UUUdag *phases;
|
UUUdag = UUUdag *phases;
|
||||||
|
|
||||||
PokeIndex<LorentzIndex>(UUUds, UUU, mu);
|
InsertGaugeField(UUUds,UUU,mu);
|
||||||
PokeIndex<LorentzIndex>(UUUds, UUUdag, mu+4);
|
InsertGaugeField(UUUds,UUUdag,mu+4);
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -910,6 +917,23 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
|
|||||||
mac(&phi(), &UU(), &chi());
|
mac(&phi(), &UU(), &chi());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Copy the link field U into component `mu` of the doubled gauge field U_ds,
// one local site at a time.
//
// For every local site the doubled-field site object is read, its `mu` entry is
// replaced by the link read from U at the same site, and the object is written back.
// The write-back was missing before, so U_ds was never modified.
//
// The loop is serial, like the inline code in DoubleStore that this helper replaced:
// whether pokeLocalSite may be called concurrently for different local sites cannot
// be confirmed from here.
inline void InsertGaugeField(DoubledGaugeField &U_ds,const GaugeLinkField &U,int mu)
{
  GridBase *GaugeGrid = U_ds._grid;
  for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {

    SiteScalarGaugeLink   ScalarU;
    SiteDoubledGaugeField ScalarUds;

    std::vector<int> lcoor;
    GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
    peekLocalSite(ScalarUds, U_ds, lcoor);

    peekLocalSite(ScalarU, U, lcoor);
    ScalarUds(mu) = ScalarU();

    // store the updated site back into the doubled field
    pokeLocalSite(ScalarUds, U_ds, lcoor);
  }
}
|
||||||
inline void DoubleStore(GridBase *GaugeGrid,
|
inline void DoubleStore(GridBase *GaugeGrid,
|
||||||
DoubledGaugeField &UUUds, // for Naik term
|
DoubledGaugeField &UUUds, // for Naik term
|
||||||
DoubledGaugeField &Uds,
|
DoubledGaugeField &Uds,
|
||||||
@ -951,23 +975,8 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
|
|||||||
U = U *phases;
|
U = U *phases;
|
||||||
Udag = Udag *phases;
|
Udag = Udag *phases;
|
||||||
|
|
||||||
|
InsertGaugeField(Uds,U,mu);
|
||||||
for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
|
InsertGaugeField(Uds,Udag,mu+4);
|
||||||
SiteScalarGaugeLink ScalarU;
|
|
||||||
SiteDoubledGaugeField ScalarUds;
|
|
||||||
|
|
||||||
std::vector<int> lcoor;
|
|
||||||
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
|
|
||||||
peekLocalSite(ScalarUds, Uds, lcoor);
|
|
||||||
|
|
||||||
peekLocalSite(ScalarU, U, lcoor);
|
|
||||||
ScalarUds(mu) = ScalarU();
|
|
||||||
|
|
||||||
peekLocalSite(ScalarU, Udag, lcoor);
|
|
||||||
ScalarUds(mu + 4) = ScalarU();
|
|
||||||
|
|
||||||
pokeLocalSite(ScalarUds, Uds, lcoor);
|
|
||||||
}
|
|
||||||
|
|
||||||
// 3 hop based on thin links. Crazy huh ?
|
// 3 hop based on thin links. Crazy huh ?
|
||||||
U = PeekIndex<LorentzIndex>(Uthin, mu);
|
U = PeekIndex<LorentzIndex>(Uthin, mu);
|
||||||
@ -979,24 +988,8 @@ class StaggeredImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation:
|
|||||||
UUU = UUU *phases;
|
UUU = UUU *phases;
|
||||||
UUUdag = UUUdag *phases;
|
UUUdag = UUUdag *phases;
|
||||||
|
|
||||||
for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
|
InsertGaugeField(UUUds,UUU,mu);
|
||||||
|
InsertGaugeField(UUUds,UUUdag,mu+4);
|
||||||
SiteScalarGaugeLink ScalarU;
|
|
||||||
SiteDoubledGaugeField ScalarUds;
|
|
||||||
|
|
||||||
std::vector<int> lcoor;
|
|
||||||
GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);
|
|
||||||
|
|
||||||
peekLocalSite(ScalarUds, UUUds, lcoor);
|
|
||||||
|
|
||||||
peekLocalSite(ScalarU, UUU, lcoor);
|
|
||||||
ScalarUds(mu) = ScalarU();
|
|
||||||
|
|
||||||
peekLocalSite(ScalarU, UUUdag, lcoor);
|
|
||||||
ScalarUds(mu + 4) = ScalarU();
|
|
||||||
|
|
||||||
pokeLocalSite(ScalarUds, UUUds, lcoor);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -44,6 +44,7 @@ ImprovedStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3,
|
|||||||
template <class Impl>
|
template <class Impl>
|
||||||
ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid,
|
ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid,
|
||||||
RealD _mass,
|
RealD _mass,
|
||||||
|
RealD _c1, RealD _c2,RealD _u0,
|
||||||
const ImplParams &p)
|
const ImplParams &p)
|
||||||
: Kernels(p),
|
: Kernels(p),
|
||||||
_grid(&Fgrid),
|
_grid(&Fgrid),
|
||||||
@ -62,6 +63,16 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, G
|
|||||||
UUUmuOdd(&Hgrid) ,
|
UUUmuOdd(&Hgrid) ,
|
||||||
_tmp(&Hgrid)
|
_tmp(&Hgrid)
|
||||||
{
|
{
|
||||||
|
int vol4;
|
||||||
|
int LLs=1;
|
||||||
|
c1=_c1;
|
||||||
|
c2=_c2;
|
||||||
|
u0=_u0;
|
||||||
|
vol4= _grid->oSites();
|
||||||
|
Stencil.BuildSurfaceList(LLs,vol4);
|
||||||
|
vol4= _cbgrid->oSites();
|
||||||
|
StencilEven.BuildSurfaceList(LLs,vol4);
|
||||||
|
StencilOdd.BuildSurfaceList(LLs,vol4);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
@ -69,22 +80,10 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin, Gau
|
|||||||
GridRedBlackCartesian &Hgrid, RealD _mass,
|
GridRedBlackCartesian &Hgrid, RealD _mass,
|
||||||
RealD _c1, RealD _c2,RealD _u0,
|
RealD _c1, RealD _c2,RealD _u0,
|
||||||
const ImplParams &p)
|
const ImplParams &p)
|
||||||
: ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,p)
|
: ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,_c1,_c2,_u0,p)
|
||||||
{
|
{
|
||||||
c1=_c1;
|
|
||||||
c2=_c2;
|
|
||||||
u0=_u0;
|
|
||||||
ImportGauge(_Uthin,_Ufat);
|
ImportGauge(_Uthin,_Ufat);
|
||||||
}
|
}
|
||||||
template <class Impl>
|
|
||||||
ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin,GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid,
|
|
||||||
GridRedBlackCartesian &Hgrid, RealD _mass,
|
|
||||||
const ImplParams &p)
|
|
||||||
: ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,p)
|
|
||||||
{
|
|
||||||
ImportGaugeSimple(_Utriple,_Ufat);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////
|
||||||
// Momentum space propagator should be
|
// Momentum space propagator should be
|
||||||
@ -98,11 +97,6 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin,Gaug
|
|||||||
// of above link to implmement fourier based solver.
|
// of above link to implmement fourier based solver.
|
||||||
////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin)
|
|
||||||
{
|
|
||||||
ImportGauge(_Uthin,_Uthin);
|
|
||||||
};
|
|
||||||
template <class Impl>
|
|
||||||
void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat)
|
void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat)
|
||||||
{
|
{
|
||||||
/////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////
|
||||||
@ -125,6 +119,20 @@ void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const GaugeField &_Utripl
|
|||||||
PokeIndex<LorentzIndex>(Umu, -U, mu+4);
|
PokeIndex<LorentzIndex>(Umu, -U, mu+4);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
CopyGaugeCheckerboards();
|
||||||
|
}
|
||||||
|
template <class Impl>
|
||||||
|
void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U)
|
||||||
|
{
|
||||||
|
|
||||||
|
Umu = _U;
|
||||||
|
UUUmu = _UUU;
|
||||||
|
CopyGaugeCheckerboards();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void ImprovedStaggeredFermion<Impl>::CopyGaugeCheckerboards(void)
|
||||||
|
{
|
||||||
pickCheckerboard(Even, UmuEven, Umu);
|
pickCheckerboard(Even, UmuEven, Umu);
|
||||||
pickCheckerboard(Odd, UmuOdd , Umu);
|
pickCheckerboard(Odd, UmuOdd , Umu);
|
||||||
pickCheckerboard(Even, UUUmuEven,UUUmu);
|
pickCheckerboard(Even, UUUmuEven,UUUmu);
|
||||||
@ -160,10 +168,7 @@ void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin,const
|
|||||||
PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4);
|
PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4);
|
||||||
}
|
}
|
||||||
|
|
||||||
pickCheckerboard(Even, UmuEven, Umu);
|
CopyGaugeCheckerboards();
|
||||||
pickCheckerboard(Odd, UmuOdd , Umu);
|
|
||||||
pickCheckerboard(Even, UUUmuEven, UUUmu);
|
|
||||||
pickCheckerboard(Odd, UUUmuOdd, UUUmu);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/////////////////////////////
|
/////////////////////////////
|
||||||
@ -322,6 +327,7 @@ void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionF
|
|||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) {
|
void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) {
|
||||||
|
DhopCalls+=2;
|
||||||
conformable(in._grid, _grid); // verifies full grid
|
conformable(in._grid, _grid); // verifies full grid
|
||||||
conformable(in._grid, out._grid);
|
conformable(in._grid, out._grid);
|
||||||
|
|
||||||
@ -332,6 +338,7 @@ void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &
|
|||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) {
|
void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) {
|
||||||
|
DhopCalls+=1;
|
||||||
conformable(in._grid, _cbgrid); // verifies half grid
|
conformable(in._grid, _cbgrid); // verifies half grid
|
||||||
conformable(in._grid, out._grid); // drops the cb check
|
conformable(in._grid, out._grid); // drops the cb check
|
||||||
|
|
||||||
@ -343,6 +350,7 @@ void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField
|
|||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) {
|
void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) {
|
||||||
|
DhopCalls+=1;
|
||||||
conformable(in._grid, _cbgrid); // verifies half grid
|
conformable(in._grid, _cbgrid); // verifies half grid
|
||||||
conformable(in._grid, out._grid); // drops the cb check
|
conformable(in._grid, out._grid); // drops the cb check
|
||||||
|
|
||||||
@ -374,25 +382,193 @@ void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder
|
|||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
DoubledGaugeField &UUU,
|
DoubledGaugeField &UUU,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out, int dag) {
|
FermionField &out, int dag)
|
||||||
|
{
|
||||||
|
#ifdef GRID_OMP
|
||||||
|
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
||||||
|
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
|
||||||
|
}
|
||||||
|
template <class Impl>
|
||||||
|
void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U,
|
||||||
|
DoubledGaugeField &UUU,
|
||||||
|
const FermionField &in,
|
||||||
|
FermionField &out, int dag)
|
||||||
|
{
|
||||||
|
#ifdef GRID_OMP
|
||||||
|
Compressor compressor;
|
||||||
|
int len = U._grid->oSites();
|
||||||
|
const int LLs = 1;
|
||||||
|
|
||||||
|
DhopTotalTime -= usecond();
|
||||||
|
|
||||||
|
DhopFaceTime -= usecond();
|
||||||
|
st.Prepare();
|
||||||
|
st.HaloGather(in,compressor);
|
||||||
|
st.CommsMergeSHM(compressor);
|
||||||
|
DhopFaceTime += usecond();
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Ugly explicit thread mapping introduced for OPA reasons.
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
DhopComputeTime -= usecond();
|
||||||
|
#pragma omp parallel
|
||||||
|
{
|
||||||
|
int tid = omp_get_thread_num();
|
||||||
|
int nthreads = omp_get_num_threads();
|
||||||
|
int ncomms = CartesianCommunicator::nCommThreads;
|
||||||
|
if (ncomms == -1) ncomms = 1;
|
||||||
|
assert(nthreads > ncomms);
|
||||||
|
|
||||||
|
if (tid >= ncomms) {
|
||||||
|
nthreads -= ncomms;
|
||||||
|
int ttid = tid - ncomms;
|
||||||
|
int n = len;
|
||||||
|
int chunk = n / nthreads;
|
||||||
|
int rem = n % nthreads;
|
||||||
|
int myblock, myn;
|
||||||
|
if (ttid < rem) {
|
||||||
|
myblock = ttid * chunk + ttid;
|
||||||
|
myn = chunk+1;
|
||||||
|
} else {
|
||||||
|
myblock = ttid*chunk + rem;
|
||||||
|
myn = chunk;
|
||||||
|
}
|
||||||
|
|
||||||
|
// do the compute
|
||||||
|
if (dag == DaggerYes) {
|
||||||
|
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||||
|
int sU = ss;
|
||||||
|
// Interior = 1; Exterior = 0; must implement for staggered
|
||||||
|
Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,1,0);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||||
|
// Interior = 1; Exterior = 0;
|
||||||
|
int sU = ss;
|
||||||
|
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,1,0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
st.CommunicateThreaded();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
DhopComputeTime += usecond();
|
||||||
|
|
||||||
|
// First to enter, last to leave timing
|
||||||
|
DhopFaceTime -= usecond();
|
||||||
|
st.CommsMerge(compressor);
|
||||||
|
DhopFaceTime -= usecond();
|
||||||
|
|
||||||
|
DhopComputeTime2 -= usecond();
|
||||||
|
if (dag == DaggerYes) {
|
||||||
|
int sz=st.surface_list.size();
|
||||||
|
parallel_for (int ss = 0; ss < sz; ss++) {
|
||||||
|
int sU = st.surface_list[ss];
|
||||||
|
Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,0,1);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
int sz=st.surface_list.size();
|
||||||
|
parallel_for (int ss = 0; ss < sz; ss++) {
|
||||||
|
int sU = st.surface_list[ss];
|
||||||
|
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,0,1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
DhopComputeTime2 += usecond();
|
||||||
|
#else
|
||||||
|
assert(0);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U,
|
||||||
|
DoubledGaugeField &UUU,
|
||||||
|
const FermionField &in,
|
||||||
|
FermionField &out, int dag)
|
||||||
|
{
|
||||||
assert((dag == DaggerNo) || (dag == DaggerYes));
|
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||||
|
|
||||||
|
DhopTotalTime -= usecond();
|
||||||
|
|
||||||
|
DhopCommTime -= usecond();
|
||||||
Compressor compressor;
|
Compressor compressor;
|
||||||
st.HaloExchange(in, compressor);
|
st.HaloExchange(in, compressor);
|
||||||
|
DhopCommTime += usecond();
|
||||||
|
|
||||||
|
DhopComputeTime -= usecond();
|
||||||
if (dag == DaggerYes) {
|
if (dag == DaggerYes) {
|
||||||
PARALLEL_FOR_LOOP
|
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
|
||||||
for (int sss = 0; sss < in._grid->oSites(); sss++) {
|
|
||||||
Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out);
|
Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
PARALLEL_FOR_LOOP
|
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
|
||||||
for (int sss = 0; sss < in._grid->oSites(); sss++) {
|
|
||||||
Kernels::DhopSite(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out);
|
Kernels::DhopSite(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
DhopComputeTime += usecond();
|
||||||
|
DhopTotalTime += usecond();
|
||||||
};
|
};
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// Reporting
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
template<class Impl>
|
||||||
|
void ImprovedStaggeredFermion<Impl>::Report(void)
|
||||||
|
{
|
||||||
|
std::vector<int> latt = GridDefaultLatt();
|
||||||
|
RealD volume = 1; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
|
||||||
|
RealD NP = _grid->_Nprocessors;
|
||||||
|
RealD NN = _grid->NodeCount();
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "ImprovedStaggeredFermion Number of DhopEO Calls : "
|
||||||
|
<< DhopCalls << std::endl;
|
||||||
|
std::cout << GridLogMessage << "ImprovedStaggeredFermion TotalTime /Calls : "
|
||||||
|
<< DhopTotalTime / DhopCalls << " us" << std::endl;
|
||||||
|
std::cout << GridLogMessage << "ImprovedStaggeredFermion CommTime /Calls : "
|
||||||
|
<< DhopCommTime / DhopCalls << " us" << std::endl;
|
||||||
|
std::cout << GridLogMessage << "ImprovedStaggeredFermion ComputeTime/Calls : "
|
||||||
|
<< DhopComputeTime / DhopCalls << " us" << std::endl;
|
||||||
|
|
||||||
|
// Average the compute time
|
||||||
|
_grid->GlobalSum(DhopComputeTime);
|
||||||
|
DhopComputeTime/=NP;
|
||||||
|
|
||||||
|
RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
|
||||||
|
|
||||||
|
RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
|
||||||
|
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "ImprovedStaggeredFermion Stencil" <<std::endl; Stencil.Report();
|
||||||
|
std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilEven"<<std::endl; StencilEven.Report();
|
||||||
|
std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilOdd" <<std::endl; StencilOdd.Report();
|
||||||
|
}
|
||||||
|
template<class Impl>
|
||||||
|
void ImprovedStaggeredFermion<Impl>::ZeroCounters(void)
|
||||||
|
{
|
||||||
|
DhopCalls = 0;
|
||||||
|
DhopTotalTime = 0;
|
||||||
|
DhopCommTime = 0;
|
||||||
|
DhopComputeTime = 0;
|
||||||
|
DhopFaceTime = 0;
|
||||||
|
|
||||||
|
Stencil.ZeroCounters();
|
||||||
|
StencilEven.ZeroCounters();
|
||||||
|
StencilOdd.ZeroCounters();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////
|
||||||
// Conserved current - not yet implemented.
|
// Conserved current - not yet implemented.
|
||||||
////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////
|
||||||
|
@ -49,6 +49,18 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
|
|||||||
FermionField _tmp;
|
FermionField _tmp;
|
||||||
FermionField &tmp(void) { return _tmp; }
|
FermionField &tmp(void) { return _tmp; }
|
||||||
|
|
||||||
|
////////////////////////////////////////
|
||||||
|
// Performance monitoring
|
||||||
|
////////////////////////////////////////
|
||||||
|
void Report(void);
|
||||||
|
void ZeroCounters(void);
|
||||||
|
double DhopTotalTime;
|
||||||
|
double DhopCalls;
|
||||||
|
double DhopCommTime;
|
||||||
|
double DhopComputeTime;
|
||||||
|
double DhopComputeTime2;
|
||||||
|
double DhopFaceTime;
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Implement the abstract base
|
// Implement the abstract base
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
@ -105,25 +117,34 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
|
|||||||
|
|
||||||
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||||
const FermionField &in, FermionField &out, int dag);
|
const FermionField &in, FermionField &out, int dag);
|
||||||
|
void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||||
|
const FermionField &in, FermionField &out, int dag);
|
||||||
|
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||||
|
const FermionField &in, FermionField &out, int dag);
|
||||||
|
|
||||||
// Constructor
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
// Grid own interface Constructor
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &Fgrid,
|
ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &Fgrid,
|
||||||
GridRedBlackCartesian &Hgrid, RealD _mass,
|
GridRedBlackCartesian &Hgrid, RealD _mass,
|
||||||
RealD _c1=9.0/8.0, RealD _c2=-1.0/24.0,RealD _u0=1.0,
|
RealD _c1, RealD _c2,RealD _u0,
|
||||||
const ImplParams &p = ImplParams());
|
|
||||||
|
|
||||||
ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Utriple, GaugeField &_Ufat, GridCartesian &Fgrid,
|
|
||||||
GridRedBlackCartesian &Hgrid, RealD _mass,
|
|
||||||
const ImplParams &p = ImplParams());
|
const ImplParams &p = ImplParams());
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
// MILC constructor no gauge fields
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass,
|
ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass,
|
||||||
|
RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0,
|
||||||
const ImplParams &p = ImplParams());
|
const ImplParams &p = ImplParams());
|
||||||
|
|
||||||
|
|
||||||
// DoubleStore impl dependent
|
// DoubleStore impl dependent
|
||||||
void ImportGaugeSimple(const GaugeField &_Utriple, const GaugeField &_Ufat);
|
void ImportGauge (const GaugeField &_Uthin ) { assert(0); }
|
||||||
void ImportGauge(const GaugeField &_Uthin, const GaugeField &_Ufat);
|
void ImportGauge (const GaugeField &_Uthin ,const GaugeField &_Ufat);
|
||||||
void ImportGauge(const GaugeField &_Uthin);
|
void ImportGaugeSimple(const GaugeField &_UUU ,const GaugeField &_U);
|
||||||
|
void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
|
||||||
|
DoubledGaugeField &GetU(void) { return Umu ; } ;
|
||||||
|
DoubledGaugeField &GetUUU(void) { return UUUmu; };
|
||||||
|
void CopyGaugeCheckerboards(void);
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Data members require to support the functionality
|
// Data members require to support the functionality
|
||||||
@ -132,7 +153,8 @@ class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedS
|
|||||||
// protected:
|
// protected:
|
||||||
public:
|
public:
|
||||||
// any other parameters of action ???
|
// any other parameters of action ???
|
||||||
|
virtual int isTrivialEE(void) { return 1; };
|
||||||
|
virtual RealD Mass(void) { return mass; }
|
||||||
RealD mass;
|
RealD mass;
|
||||||
RealD u0;
|
RealD u0;
|
||||||
RealD c1;
|
RealD c1;
|
||||||
|
@ -41,8 +41,7 @@ ImprovedStaggeredFermion5DStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3,
|
|||||||
|
|
||||||
// 5d lattice for DWF.
|
// 5d lattice for DWF.
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GaugeField &_Uthin,GaugeField &_Ufat,
|
ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian &FiveDimGrid,
|
||||||
GridCartesian &FiveDimGrid,
|
|
||||||
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||||
GridCartesian &FourDimGrid,
|
GridCartesian &FourDimGrid,
|
||||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||||
@ -121,16 +120,74 @@ ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GaugeField &_Uthin,
|
|||||||
assert(FiveDimGrid._simd_layout[0] ==1);
|
assert(FiveDimGrid._simd_layout[0] ==1);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
int LLs = FiveDimGrid._rdimensions[0];
|
||||||
|
int vol4= FourDimGrid.oSites();
|
||||||
|
Stencil.BuildSurfaceList(LLs,vol4);
|
||||||
|
|
||||||
// Allocate the required comms buffer
|
vol4=FourDimRedBlackGrid.oSites();
|
||||||
|
StencilEven.BuildSurfaceList(LLs,vol4);
|
||||||
|
StencilOdd.BuildSurfaceList(LLs,vol4);
|
||||||
|
}
|
||||||
|
template <class Impl>
|
||||||
|
void ImprovedStaggeredFermion5D<Impl>::CopyGaugeCheckerboards(void)
|
||||||
|
{
|
||||||
|
pickCheckerboard(Even, UmuEven, Umu);
|
||||||
|
pickCheckerboard(Odd, UmuOdd , Umu);
|
||||||
|
pickCheckerboard(Even, UUUmuEven,UUUmu);
|
||||||
|
pickCheckerboard(Odd, UUUmuOdd, UUUmu);
|
||||||
|
}
|
||||||
|
template<class Impl>
|
||||||
|
ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GaugeField &_Uthin,GaugeField &_Ufat,
|
||||||
|
GridCartesian &FiveDimGrid,
|
||||||
|
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||||
|
GridCartesian &FourDimGrid,
|
||||||
|
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||||
|
RealD _mass,
|
||||||
|
RealD _c1,RealD _c2, RealD _u0,
|
||||||
|
const ImplParams &p) :
|
||||||
|
ImprovedStaggeredFermion5D(FiveDimGrid,FiveDimRedBlackGrid,
|
||||||
|
FourDimGrid,FourDimRedBlackGrid,
|
||||||
|
_mass,_c1,_c2,_u0,p)
|
||||||
|
{
|
||||||
ImportGauge(_Uthin,_Ufat);
|
ImportGauge(_Uthin,_Ufat);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////
|
||||||
|
// For MILC use; pass three link U's and 1 link U
|
||||||
|
///////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin)
|
void ImprovedStaggeredFermion5D<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat)
|
||||||
{
|
{
|
||||||
ImportGauge(_Uthin,_Uthin);
|
/////////////////////////////////////////////////////////////////
|
||||||
};
|
// Trivial import; phases and fattening and such like preapplied
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
for (int mu = 0; mu < Nd; mu++) {
|
||||||
|
|
||||||
|
auto U = PeekIndex<LorentzIndex>(_Utriple, mu);
|
||||||
|
Impl::InsertGaugeField(UUUmu,U,mu);
|
||||||
|
|
||||||
|
U = adj( Cshift(U, mu, -3));
|
||||||
|
Impl::InsertGaugeField(UUUmu,-U,mu+4);
|
||||||
|
|
||||||
|
U = PeekIndex<LorentzIndex>(_Ufat, mu);
|
||||||
|
Impl::InsertGaugeField(Umu,U,mu);
|
||||||
|
|
||||||
|
U = adj( Cshift(U, mu, -1));
|
||||||
|
Impl::InsertGaugeField(Umu,-U,mu+4);
|
||||||
|
|
||||||
|
}
|
||||||
|
CopyGaugeCheckerboards();
|
||||||
|
}
|
||||||
|
template <class Impl>
|
||||||
|
void ImprovedStaggeredFermion5D<Impl>::ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U)
|
||||||
|
{
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
// Trivial import; phases and fattening and such like preapplied
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
Umu = _U;
|
||||||
|
UUUmu = _UUU;
|
||||||
|
CopyGaugeCheckerboards();
|
||||||
|
}
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat)
|
void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat)
|
||||||
{
|
{
|
||||||
@ -159,10 +216,7 @@ void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin,cons
|
|||||||
PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4);
|
PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4);
|
||||||
}
|
}
|
||||||
|
|
||||||
pickCheckerboard(Even, UmuEven, Umu);
|
CopyGaugeCheckerboards();
|
||||||
pickCheckerboard(Odd, UmuOdd , Umu);
|
|
||||||
pickCheckerboard(Even, UUUmuEven, UUUmu);
|
|
||||||
pickCheckerboard(Odd, UUUmuOdd, UUUmu);
|
|
||||||
}
|
}
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir5,int disp)
|
void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir5,int disp)
|
||||||
@ -223,6 +277,162 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
|
|||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*CHANGE */
|
||||||
|
template<class Impl>
|
||||||
|
void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||||
|
const FermionField &in, FermionField &out,int dag)
|
||||||
|
{
|
||||||
|
#ifdef GRID_OMP
|
||||||
|
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
|
||||||
|
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||||
|
const FermionField &in, FermionField &out,int dag)
|
||||||
|
{
|
||||||
|
#ifdef GRID_OMP
|
||||||
|
// assert((dag==DaggerNo) ||(dag==DaggerYes));
|
||||||
|
|
||||||
|
Compressor compressor;
|
||||||
|
|
||||||
|
int LLs = in._grid->_rdimensions[0];
|
||||||
|
int len = U._grid->oSites();
|
||||||
|
|
||||||
|
DhopFaceTime-=usecond();
|
||||||
|
st.Prepare();
|
||||||
|
st.HaloGather(in,compressor);
|
||||||
|
// st.HaloExchangeOptGather(in,compressor); // Wilson compressor
|
||||||
|
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
|
||||||
|
DhopFaceTime+=usecond();
|
||||||
|
|
||||||
|
double ctime=0;
|
||||||
|
double ptime=0;
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Ugly explicit thread mapping introduced for OPA reasons.
|
||||||
|
//////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
|
||||||
|
{
|
||||||
|
int tid = omp_get_thread_num();
|
||||||
|
int nthreads = omp_get_num_threads();
|
||||||
|
int ncomms = CartesianCommunicator::nCommThreads;
|
||||||
|
if (ncomms == -1) ncomms = 1;
|
||||||
|
assert(nthreads > ncomms);
|
||||||
|
if (tid >= ncomms) {
|
||||||
|
double start = usecond();
|
||||||
|
nthreads -= ncomms;
|
||||||
|
int ttid = tid - ncomms;
|
||||||
|
int n = U._grid->oSites(); // 4d vol
|
||||||
|
int chunk = n / nthreads;
|
||||||
|
int rem = n % nthreads;
|
||||||
|
int myblock, myn;
|
||||||
|
if (ttid < rem) {
|
||||||
|
myblock = ttid * chunk + ttid;
|
||||||
|
myn = chunk+1;
|
||||||
|
} else {
|
||||||
|
myblock = ttid*chunk + rem;
|
||||||
|
myn = chunk;
|
||||||
|
}
|
||||||
|
|
||||||
|
// do the compute
|
||||||
|
if (dag == DaggerYes) {
|
||||||
|
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||||
|
int sU = ss;
|
||||||
|
// Interior = 1; Exterior = 0; must implement for staggered
|
||||||
|
Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<---------
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (int ss = myblock; ss < myblock+myn; ++ss) {
|
||||||
|
// Interior = 1; Exterior = 0;
|
||||||
|
int sU = ss;
|
||||||
|
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<------------
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ptime = usecond() - start;
|
||||||
|
} else {
|
||||||
|
double start = usecond();
|
||||||
|
st.CommunicateThreaded();
|
||||||
|
ctime = usecond() - start;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
DhopCommTime += ctime;
|
||||||
|
DhopComputeTime+=ptime;
|
||||||
|
|
||||||
|
// First to enter, last to leave timing
|
||||||
|
st.CollateThreads();
|
||||||
|
|
||||||
|
DhopFaceTime-=usecond();
|
||||||
|
st.CommsMerge(compressor);
|
||||||
|
DhopFaceTime+=usecond();
|
||||||
|
|
||||||
|
DhopComputeTime2-=usecond();
|
||||||
|
if (dag == DaggerYes) {
|
||||||
|
int sz=st.surface_list.size();
|
||||||
|
parallel_for (int ss = 0; ss < sz; ss++) {
|
||||||
|
int sU = st.surface_list[ss];
|
||||||
|
Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1); //<----------
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
int sz=st.surface_list.size();
|
||||||
|
parallel_for (int ss = 0; ss < sz; ss++) {
|
||||||
|
int sU = st.surface_list[ss];
|
||||||
|
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1);//<----------
|
||||||
|
}
|
||||||
|
}
|
||||||
|
DhopComputeTime2+=usecond();
|
||||||
|
#else
|
||||||
|
assert(0);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||||
|
const FermionField &in, FermionField &out,int dag)
|
||||||
|
{
|
||||||
|
Compressor compressor;
|
||||||
|
int LLs = in._grid->_rdimensions[0];
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
//double t1=usecond();
|
||||||
|
DhopTotalTime -= usecond();
|
||||||
|
DhopCommTime -= usecond();
|
||||||
|
st.HaloExchange(in,compressor);
|
||||||
|
DhopCommTime += usecond();
|
||||||
|
|
||||||
|
DhopComputeTime -= usecond();
|
||||||
|
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
|
||||||
|
if (dag == DaggerYes) {
|
||||||
|
parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
|
||||||
|
int sU=ss;
|
||||||
|
Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), LLs, sU,in, out);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
|
||||||
|
int sU=ss;
|
||||||
|
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
DhopComputeTime += usecond();
|
||||||
|
DhopTotalTime += usecond();
|
||||||
|
//double t2=usecond();
|
||||||
|
//std::cout << __FILE__ << " " << __func__ << " Total Time " << DhopTotalTime << std::endl;
|
||||||
|
//std::cout << __FILE__ << " " << __func__ << " Total Time Org " << t2-t1 << std::endl;
|
||||||
|
//std::cout << __FILE__ << " " << __func__ << " Comml Time " << DhopCommTime << std::endl;
|
||||||
|
//std::cout << __FILE__ << " " << __func__ << " Compute Time " << DhopComputeTime << std::endl;
|
||||||
|
|
||||||
|
}
|
||||||
|
/*CHANGE END*/
|
||||||
|
|
||||||
|
/* ORG
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
DoubledGaugeField & U,DoubledGaugeField & UUU,
|
||||||
@ -254,6 +464,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
|
|||||||
DhopComputeTime += usecond();
|
DhopComputeTime += usecond();
|
||||||
DhopTotalTime += usecond();
|
DhopTotalTime += usecond();
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -336,6 +547,9 @@ void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void)
|
|||||||
DhopTotalTime = 0;
|
DhopTotalTime = 0;
|
||||||
DhopCommTime = 0;
|
DhopCommTime = 0;
|
||||||
DhopComputeTime = 0;
|
DhopComputeTime = 0;
|
||||||
|
DhopFaceTime = 0;
|
||||||
|
|
||||||
|
|
||||||
Stencil.ZeroCounters();
|
Stencil.ZeroCounters();
|
||||||
StencilEven.ZeroCounters();
|
StencilEven.ZeroCounters();
|
||||||
StencilOdd.ZeroCounters();
|
StencilOdd.ZeroCounters();
|
||||||
|
@ -64,6 +64,8 @@ namespace QCD {
|
|||||||
double DhopCalls;
|
double DhopCalls;
|
||||||
double DhopCommTime;
|
double DhopCommTime;
|
||||||
double DhopComputeTime;
|
double DhopComputeTime;
|
||||||
|
double DhopComputeTime2;
|
||||||
|
double DhopFaceTime;
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Implement the abstract base
|
// Implement the abstract base
|
||||||
@ -119,7 +121,27 @@ namespace QCD {
|
|||||||
FermionField &out,
|
FermionField &out,
|
||||||
int dag);
|
int dag);
|
||||||
|
|
||||||
|
void DhopInternalOverlappedComms(StencilImpl & st,
|
||||||
|
LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U,
|
||||||
|
DoubledGaugeField &UUU,
|
||||||
|
const FermionField &in,
|
||||||
|
FermionField &out,
|
||||||
|
int dag);
|
||||||
|
|
||||||
|
void DhopInternalSerialComms(StencilImpl & st,
|
||||||
|
LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U,
|
||||||
|
DoubledGaugeField &UUU,
|
||||||
|
const FermionField &in,
|
||||||
|
FermionField &out,
|
||||||
|
int dag);
|
||||||
|
|
||||||
|
|
||||||
// Constructors
|
// Constructors
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Grid internal interface -- Thin link and fat link, with coefficients
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
ImprovedStaggeredFermion5D(GaugeField &_Uthin,
|
ImprovedStaggeredFermion5D(GaugeField &_Uthin,
|
||||||
GaugeField &_Ufat,
|
GaugeField &_Ufat,
|
||||||
GridCartesian &FiveDimGrid,
|
GridCartesian &FiveDimGrid,
|
||||||
@ -127,17 +149,37 @@ namespace QCD {
|
|||||||
GridCartesian &FourDimGrid,
|
GridCartesian &FourDimGrid,
|
||||||
GridRedBlackCartesian &FourDimRedBlackGrid,
|
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||||
double _mass,
|
double _mass,
|
||||||
RealD _c1=9.0/8.0, RealD _c2=-1.0/24.0,RealD _u0=1.0,
|
RealD _c1, RealD _c2,RealD _u0,
|
||||||
const ImplParams &p= ImplParams());
|
const ImplParams &p= ImplParams());
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// DoubleStore
|
// MILC constructor ; triple links, no rescale factors; must be externally pre multiplied
|
||||||
void ImportGauge(const GaugeField &_U);
|
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
void ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat);
|
ImprovedStaggeredFermion5D(GridCartesian &FiveDimGrid,
|
||||||
|
GridRedBlackCartesian &FiveDimRedBlackGrid,
|
||||||
|
GridCartesian &FourDimGrid,
|
||||||
|
GridRedBlackCartesian &FourDimRedBlackGrid,
|
||||||
|
double _mass,
|
||||||
|
RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0,
|
||||||
|
const ImplParams &p= ImplParams());
|
||||||
|
|
||||||
|
// DoubleStore gauge field in operator
|
||||||
|
void ImportGauge (const GaugeField &_Uthin ) { assert(0); }
|
||||||
|
void ImportGauge (const GaugeField &_Uthin ,const GaugeField &_Ufat);
|
||||||
|
void ImportGaugeSimple(const GaugeField &_UUU,const GaugeField &_U);
|
||||||
|
void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
|
||||||
|
// Give a reference; can be used to do an assignment or copy back out after import
|
||||||
|
// if Carleton wants to cache them and not use the ImportSimple
|
||||||
|
DoubledGaugeField &GetU(void) { return Umu ; } ;
|
||||||
|
DoubledGaugeField &GetUUU(void) { return UUUmu; };
|
||||||
|
void CopyGaugeCheckerboards(void);
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
// Data members require to support the functionality
|
// Data members require to support the functionality
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
virtual int isTrivialEE(void) { return 1; };
|
||||||
|
virtual RealD Mass(void) { return mass; }
|
||||||
|
|
||||||
GridBase *_FourDimGrid;
|
GridBase *_FourDimGrid;
|
||||||
GridBase *_FourDimRedBlackGrid;
|
GridBase *_FourDimRedBlackGrid;
|
||||||
|
@ -32,223 +32,241 @@ namespace Grid {
|
|||||||
namespace QCD {
|
namespace QCD {
|
||||||
|
|
||||||
int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric;
|
int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric;
|
||||||
|
int StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsAndCompute;
|
||||||
|
|
||||||
|
// One stencil leg, all neighbours handled.
// Looks up the stencil entry for direction Dir+skew at site sF, points chi_p at
//   - the local temporary `chi` filled by permute() when the entry is local and
//     needs a permute,
//   - in._odata[offset] when the entry is local without permute,
//   - buf[offset] otherwise,
// and then calls multLink(Uchi, U._odata[sU], *chi_p, Dir).
// Expects st, SE, ptype, sF, sU, in, buf, chi, chi_p and Uchi in the expanding scope.
#define GENERIC_STENCIL_LEG(U,Dir,skew,multLink) \
  SE = st.GetEntry(ptype, Dir+skew, sF); \
  if (SE->_is_local ) { \
    if (SE->_permute) { \
      chi_p = &chi; \
      permute(chi, in._odata[SE->_offset], ptype); \
    } else { \
      chi_p = &in._odata[SE->_offset]; \
    } \
  } else { \
    chi_p = &buf[SE->_offset]; \
  } \
  multLink(Uchi, U._odata[sU], *chi_p, Dir);
|
||||||
|
|
||||||
|
// One stencil leg, interior pass.
// Same neighbour lookup as GENERIC_STENCIL_LEG, but multLink is only invoked
// when the entry is local or st.same_node[Dir] is set; otherwise the leg is
// skipped and Uchi is left untouched.
// Expects st, SE, ptype, sF, sU, in, buf, chi, chi_p and Uchi in the expanding scope.
#define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink) \
  SE = st.GetEntry(ptype, Dir+skew, sF); \
  if (SE->_is_local ) { \
    if (SE->_permute) { \
      chi_p = &chi; \
      permute(chi, in._odata[SE->_offset], ptype); \
    } else { \
      chi_p = &in._odata[SE->_offset]; \
    } \
  } else if ( st.same_node[Dir] ) { \
    chi_p = &buf[SE->_offset]; \
  } \
  if (SE->_is_local || st.same_node[Dir] ) { \
    multLink(Uchi, U._odata[sU], *chi_p, Dir); \
  }
|
||||||
|
|
||||||
|
// One stencil leg, exterior pass.
// Acts only on entries that are neither local nor flagged in st.same_node[Dir]:
// for those it reads the neighbour from buf[offset], calls multLink and bumps
// the nmu counter. All other entries are skipped.
// Expects st, SE, ptype, sF, sU, buf, chi_p, Uchi and nmu in the expanding scope.
#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \
  SE = st.GetEntry(ptype, Dir+skew, sF); \
  if ( !(SE->_is_local || st.same_node[Dir]) ) { \
    chi_p = &buf[SE->_offset]; \
    nmu++; \
    multLink(Uchi, U._odata[sU], *chi_p, Dir); \
  }
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
|
StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
|
||||||
|
|
||||||
////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Generic implementation; move to different file?
|
// Generic implementation; move to different file?
|
||||||
////////////////////////////////////////////
|
// Int, Ext, Int+Ext cases for comms overlap
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopSiteDepth(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
|
||||||
SiteSpinor *buf, int sF,
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
int sU, const FermionField &in, SiteSpinor &out,int threeLink) {
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out, int dag) {
|
||||||
const SiteSpinor *chi_p;
|
const SiteSpinor *chi_p;
|
||||||
SiteSpinor chi;
|
SiteSpinor chi;
|
||||||
SiteSpinor Uchi;
|
SiteSpinor Uchi;
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
int ptype;
|
int ptype;
|
||||||
int skew = 0;
|
int skew;
|
||||||
if (threeLink) skew=8;
|
|
||||||
///////////////////////////
|
|
||||||
// Xp
|
|
||||||
///////////////////////////
|
|
||||||
|
|
||||||
SE = st.GetEntry(ptype, Xp+skew, sF);
|
for(int s=0;s<LLs;s++){
|
||||||
if (SE->_is_local) {
|
int sF=LLs*sU+s;
|
||||||
if (SE->_permute) {
|
skew = 0;
|
||||||
chi_p = &chi;
|
GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink);
|
||||||
permute(chi, in._odata[SE->_offset], ptype);
|
GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd);
|
||||||
} else {
|
GENERIC_STENCIL_LEG(U,Zp,skew,Impl::multLinkAdd);
|
||||||
chi_p = &in._odata[SE->_offset];
|
GENERIC_STENCIL_LEG(U,Tp,skew,Impl::multLinkAdd);
|
||||||
}
|
GENERIC_STENCIL_LEG(U,Xm,skew,Impl::multLinkAdd);
|
||||||
} else {
|
GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd);
|
||||||
chi_p = &buf[SE->_offset];
|
GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd);
|
||||||
|
skew=8;
|
||||||
|
GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG(UUU,Zp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG(UUU,Tp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG(UUU,Xm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd);
|
||||||
|
if ( dag ) {
|
||||||
|
Uchi = - Uchi;
|
||||||
|
}
|
||||||
|
vstream(out._odata[sF], Uchi);
|
||||||
}
|
}
|
||||||
Impl::multLink(Uchi, U._odata[sU], *chi_p, Xp);
|
|
||||||
|
|
||||||
///////////////////////////
|
|
||||||
// Yp
|
|
||||||
///////////////////////////
|
|
||||||
SE = st.GetEntry(ptype, Yp+skew, sF);
|
|
||||||
if (SE->_is_local) {
|
|
||||||
if (SE->_permute) {
|
|
||||||
chi_p = &chi;
|
|
||||||
permute(chi, in._odata[SE->_offset], ptype);
|
|
||||||
} else {
|
|
||||||
chi_p = &in._odata[SE->_offset];
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
chi_p = &buf[SE->_offset];
|
|
||||||
}
|
|
||||||
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Yp);
|
|
||||||
|
|
||||||
///////////////////////////
|
|
||||||
// Zp
|
|
||||||
///////////////////////////
|
|
||||||
SE = st.GetEntry(ptype, Zp+skew, sF);
|
|
||||||
if (SE->_is_local) {
|
|
||||||
if (SE->_permute) {
|
|
||||||
chi_p = &chi;
|
|
||||||
permute(chi, in._odata[SE->_offset], ptype);
|
|
||||||
} else {
|
|
||||||
chi_p = &in._odata[SE->_offset];
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
chi_p = &buf[SE->_offset];
|
|
||||||
}
|
|
||||||
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Zp);
|
|
||||||
|
|
||||||
///////////////////////////
|
|
||||||
// Tp
|
|
||||||
///////////////////////////
|
|
||||||
SE = st.GetEntry(ptype, Tp+skew, sF);
|
|
||||||
if (SE->_is_local) {
|
|
||||||
if (SE->_permute) {
|
|
||||||
chi_p = &chi;
|
|
||||||
permute(chi, in._odata[SE->_offset], ptype);
|
|
||||||
} else {
|
|
||||||
chi_p = &in._odata[SE->_offset];
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
chi_p = &buf[SE->_offset];
|
|
||||||
}
|
|
||||||
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Tp);
|
|
||||||
|
|
||||||
///////////////////////////
|
|
||||||
// Xm
|
|
||||||
///////////////////////////
|
|
||||||
SE = st.GetEntry(ptype, Xm+skew, sF);
|
|
||||||
if (SE->_is_local) {
|
|
||||||
if (SE->_permute) {
|
|
||||||
chi_p = &chi;
|
|
||||||
permute(chi, in._odata[SE->_offset], ptype);
|
|
||||||
} else {
|
|
||||||
chi_p = &in._odata[SE->_offset];
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
chi_p = &buf[SE->_offset];
|
|
||||||
}
|
|
||||||
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Xm);
|
|
||||||
|
|
||||||
///////////////////////////
|
|
||||||
// Ym
|
|
||||||
///////////////////////////
|
|
||||||
SE = st.GetEntry(ptype, Ym+skew, sF);
|
|
||||||
if (SE->_is_local) {
|
|
||||||
if (SE->_permute) {
|
|
||||||
chi_p = &chi;
|
|
||||||
permute(chi, in._odata[SE->_offset], ptype);
|
|
||||||
} else {
|
|
||||||
chi_p = &in._odata[SE->_offset];
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
chi_p = &buf[SE->_offset];
|
|
||||||
}
|
|
||||||
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Ym);
|
|
||||||
|
|
||||||
///////////////////////////
|
|
||||||
// Zm
|
|
||||||
///////////////////////////
|
|
||||||
SE = st.GetEntry(ptype, Zm+skew, sF);
|
|
||||||
if (SE->_is_local) {
|
|
||||||
if (SE->_permute) {
|
|
||||||
chi_p = &chi;
|
|
||||||
permute(chi, in._odata[SE->_offset], ptype);
|
|
||||||
} else {
|
|
||||||
chi_p = &in._odata[SE->_offset];
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
chi_p = &buf[SE->_offset];
|
|
||||||
}
|
|
||||||
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Zm);
|
|
||||||
|
|
||||||
///////////////////////////
|
|
||||||
// Tm
|
|
||||||
///////////////////////////
|
|
||||||
SE = st.GetEntry(ptype, Tm+skew, sF);
|
|
||||||
if (SE->_is_local) {
|
|
||||||
if (SE->_permute) {
|
|
||||||
chi_p = &chi;
|
|
||||||
permute(chi, in._odata[SE->_offset], ptype);
|
|
||||||
} else {
|
|
||||||
chi_p = &in._odata[SE->_offset];
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
chi_p = &buf[SE->_offset];
|
|
||||||
}
|
|
||||||
Impl::multLinkAdd(Uchi, U._odata[sU], *chi_p, Tm);
|
|
||||||
|
|
||||||
vstream(out, Uchi);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////
|
||||||
|
// Only contributions from interior of our node
|
||||||
|
///////////////////////////////////////////////////
|
||||||
|
template <class Impl>
|
||||||
|
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag) {
|
||||||
|
const SiteSpinor *chi_p;
|
||||||
|
SiteSpinor chi;
|
||||||
|
SiteSpinor Uchi;
|
||||||
|
StencilEntry *SE;
|
||||||
|
int ptype;
|
||||||
|
int skew ;
|
||||||
|
|
||||||
|
for(int s=0;s<LLs;s++){
|
||||||
|
int sF=LLs*sU+s;
|
||||||
|
skew = 0;
|
||||||
|
Uchi=zero;
|
||||||
|
GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(U,Yp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(U,Zp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(U,Tp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(U,Xm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd);
|
||||||
|
skew=8;
|
||||||
|
GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(UUU,Zp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(UUU,Tp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(UUU,Xm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd);
|
||||||
|
if ( dag ) {
|
||||||
|
Uchi = - Uchi;
|
||||||
|
}
|
||||||
|
vstream(out._odata[sF], Uchi);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////
|
||||||
|
// Only contributions from exterior of our node
|
||||||
|
///////////////////////////////////////////////////
|
||||||
|
template <class Impl>
|
||||||
|
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag) {
|
||||||
|
const SiteSpinor *chi_p;
|
||||||
|
SiteSpinor chi;
|
||||||
|
SiteSpinor Uchi;
|
||||||
|
StencilEntry *SE;
|
||||||
|
int ptype;
|
||||||
|
int nmu=0;
|
||||||
|
int skew ;
|
||||||
|
|
||||||
|
for(int s=0;s<LLs;s++){
|
||||||
|
int sF=LLs*sU+s;
|
||||||
|
skew = 0;
|
||||||
|
Uchi=zero;
|
||||||
|
GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(U,Yp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(U,Zp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(U,Tp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(U,Xm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd);
|
||||||
|
skew=8;
|
||||||
|
GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(UUU,Zp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(UUU,Tp,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(UUU,Xm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
|
||||||
|
GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
|
||||||
|
|
||||||
|
if ( nmu ) {
|
||||||
|
if ( dag ) {
|
||||||
|
out._odata[sF] = out._odata[sF] - Uchi;
|
||||||
|
} else {
|
||||||
|
out._odata[sF] = out._odata[sF] + Uchi;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Driving / wrapping routine to select right kernel
|
||||||
|
////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
|
void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
SiteSpinor *buf, int LLs, int sU,
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
const FermionField &in, FermionField &out) {
|
const FermionField &in, FermionField &out,
|
||||||
SiteSpinor naik;
|
int interior,int exterior)
|
||||||
SiteSpinor naive;
|
{
|
||||||
int oneLink =0;
|
|
||||||
int threeLink=1;
|
|
||||||
int dag=1;
|
int dag=1;
|
||||||
switch(Opt) {
|
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
|
||||||
#ifdef AVX512
|
};
|
||||||
//FIXME; move the sign into the Asm routine
|
|
||||||
case OptInlineAsm:
|
template <class Impl>
|
||||||
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out);
|
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
for(int s=0;s<LLs;s++) {
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
int sF=s+LLs*sU;
|
const FermionField &in, FermionField &out,
|
||||||
out._odata[sF]=-out._odata[sF];
|
int interior,int exterior)
|
||||||
}
|
{
|
||||||
break;
|
int dag=0;
|
||||||
#endif
|
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
|
||||||
case OptHandUnroll:
|
|
||||||
DhopSiteHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
|
||||||
break;
|
|
||||||
case OptGeneric:
|
|
||||||
for(int s=0;s<LLs;s++){
|
|
||||||
int sF=s+LLs*sU;
|
|
||||||
DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink);
|
|
||||||
DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
|
|
||||||
out._odata[sF] =-naive-naik;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
std::cout<<"Oops Opt = "<<Opt<<std::endl;
|
|
||||||
assert(0);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
|
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
SiteSpinor *buf, int LLs,
|
SiteSpinor *buf, int LLs,
|
||||||
int sU, const FermionField &in, FermionField &out)
|
int sU, const FermionField &in, FermionField &out,
|
||||||
|
int dag,int interior,int exterior)
|
||||||
{
|
{
|
||||||
int oneLink =0;
|
|
||||||
int threeLink=1;
|
|
||||||
SiteSpinor naik;
|
|
||||||
SiteSpinor naive;
|
|
||||||
int dag=0;
|
|
||||||
switch(Opt) {
|
switch(Opt) {
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
case OptInlineAsm:
|
case OptInlineAsm:
|
||||||
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out);
|
if ( interior && exterior ) {
|
||||||
|
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
|
} else {
|
||||||
|
std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"<<std::endl;
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
#endif
|
#endif
|
||||||
case OptHandUnroll:
|
case OptHandUnroll:
|
||||||
DhopSiteHand(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
if ( interior && exterior ) {
|
||||||
|
DhopSiteHand (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
|
} else if ( interior ) {
|
||||||
|
DhopSiteHandInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
|
} else if ( exterior ) {
|
||||||
|
DhopSiteHandExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case OptGeneric:
|
case OptGeneric:
|
||||||
for(int s=0;s<LLs;s++){
|
if ( interior && exterior ) {
|
||||||
int sF=LLs*sU+s;
|
DhopSiteGeneric (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
// assert(sF<in._odata.size());
|
} else if ( interior ) {
|
||||||
// assert(sU< U._odata.size());
|
DhopSiteGenericInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
// assert(sF>=0); assert(sU>=0);
|
} else if ( exterior ) {
|
||||||
DhopSiteDepth(st,lo,U,buf,sF,sU,in,naive,oneLink);
|
DhopSiteGenericExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
|
||||||
DhopSiteDepth(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
|
|
||||||
out._odata[sF] =naive+naik;
|
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
|
@ -38,8 +38,9 @@ namespace QCD {
|
|||||||
class StaggeredKernelsStatic {
|
class StaggeredKernelsStatic {
|
||||||
public:
|
public:
|
||||||
enum { OptGeneric, OptHandUnroll, OptInlineAsm };
|
enum { OptGeneric, OptHandUnroll, OptInlineAsm };
|
||||||
// S-direction is INNERMOST and takes no part in the parity.
|
enum { CommsAndCompute, CommsThenCompute };
|
||||||
static int Opt; // these are a temporary hack
|
static int Opt;
|
||||||
|
static int Comms;
|
||||||
};
|
};
|
||||||
|
|
||||||
template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , public StaggeredKernelsStatic {
|
template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , public StaggeredKernelsStatic {
|
||||||
@ -53,24 +54,62 @@ public:
|
|||||||
void DhopDir(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf,
|
void DhopDir(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf,
|
||||||
int sF, int sU, const FermionField &in, FermionField &out, int dir,int disp);
|
int sF, int sU, const FermionField &in, FermionField &out, int dir,int disp);
|
||||||
|
|
||||||
void DhopSiteDepth(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf,
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
int sF, int sU, const FermionField &in, SiteSpinor &out,int threeLink);
|
// Generic Nc kernels
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag);
|
||||||
|
void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag);
|
||||||
|
void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag);
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Nc=3 specific kernels
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag);
|
||||||
|
void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag);
|
||||||
|
void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag);
|
||||||
|
|
||||||
void DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteSpinor * buf,
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
int sF, int sU, const FermionField &in, SiteSpinor&out,int threeLink);
|
// Asm Nc=3 specific kernels
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag);
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Generic interface; fan out to right routine
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out, int interior=1,int exterior=1);
|
||||||
|
|
||||||
void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,SiteSpinor * buf,
|
void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo,
|
||||||
int LLs, int sU, const FermionField &in, FermionField &out, int dag);
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out, int interior=1,int exterior=1);
|
||||||
|
|
||||||
void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU, SiteSpinor * buf,
|
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
|
||||||
int LLs, int sU, const FermionField &in, FermionField &out);
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor * buf, int LLs, int sU,
|
||||||
void DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf,
|
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
|
||||||
int sF, int sU, const FermionField &in, FermionField &out);
|
|
||||||
|
|
||||||
void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor *buf,
|
|
||||||
int LLs, int sU, const FermionField &in, FermionField &out);
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
|
@ -560,16 +560,53 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
VSTORE(2,%0,pUChi_02) \
|
VSTORE(2,%0,pUChi_02) \
|
||||||
: : "r" (out) : "memory" );
|
: : "r" (out) : "memory" );
|
||||||
|
|
||||||
|
#define nREDUCE(out) \
|
||||||
|
asm ( \
|
||||||
|
VADD(UChi_00,UChi_10,UChi_00) \
|
||||||
|
VADD(UChi_01,UChi_11,UChi_01) \
|
||||||
|
VADD(UChi_02,UChi_12,UChi_02) \
|
||||||
|
VADD(UChi_30,UChi_20,UChi_30) \
|
||||||
|
VADD(UChi_31,UChi_21,UChi_31) \
|
||||||
|
VADD(UChi_32,UChi_22,UChi_32) \
|
||||||
|
VADD(UChi_00,UChi_30,UChi_00) \
|
||||||
|
VADD(UChi_01,UChi_31,UChi_01) \
|
||||||
|
VADD(UChi_02,UChi_32,UChi_02) ); \
|
||||||
|
asm (VZERO(Chi_00) \
|
||||||
|
VSUB(UChi_00,Chi_00,UChi_00) \
|
||||||
|
VSUB(UChi_01,Chi_00,UChi_01) \
|
||||||
|
VSUB(UChi_02,Chi_00,UChi_02) ); \
|
||||||
|
asm ( \
|
||||||
|
VSTORE(0,%0,pUChi_00) \
|
||||||
|
VSTORE(1,%0,pUChi_01) \
|
||||||
|
VSTORE(2,%0,pUChi_02) \
|
||||||
|
: : "r" (out) : "memory" );
|
||||||
|
|
||||||
#define REDUCEa(out) \
|
#define REDUCEa(out) \
|
||||||
asm ( \
|
asm ( \
|
||||||
VADD(UChi_00,UChi_10,UChi_00) \
|
VADD(UChi_00,UChi_10,UChi_00) \
|
||||||
VADD(UChi_01,UChi_11,UChi_01) \
|
VADD(UChi_01,UChi_11,UChi_01) \
|
||||||
VADD(UChi_02,UChi_12,UChi_02) ); \
|
VADD(UChi_02,UChi_12,UChi_02) ); \
|
||||||
|
asm ( \
|
||||||
|
VSTORE(0,%0,pUChi_00) \
|
||||||
|
VSTORE(1,%0,pUChi_01) \
|
||||||
|
VSTORE(2,%0,pUChi_02) \
|
||||||
|
: : "r" (out) : "memory" );
|
||||||
|
|
||||||
|
// FIXME is sign right in the VSUB ?
|
||||||
|
#define nREDUCEa(out) \
|
||||||
asm ( \
|
asm ( \
|
||||||
VSTORE(0,%0,pUChi_00) \
|
VADD(UChi_00,UChi_10,UChi_00) \
|
||||||
VSTORE(1,%0,pUChi_01) \
|
VADD(UChi_01,UChi_11,UChi_01) \
|
||||||
VSTORE(2,%0,pUChi_02) \
|
VADD(UChi_02,UChi_12,UChi_02) ); \
|
||||||
: : "r" (out) : "memory" );
|
asm (VZERO(Chi_00) \
|
||||||
|
VSUB(UChi_00,Chi_00,UChi_00) \
|
||||||
|
VSUB(UChi_01,Chi_00,UChi_01) \
|
||||||
|
VSUB(UChi_02,Chi_00,UChi_02) ); \
|
||||||
|
asm ( \
|
||||||
|
VSTORE(0,%0,pUChi_00) \
|
||||||
|
VSTORE(1,%0,pUChi_01) \
|
||||||
|
VSTORE(2,%0,pUChi_02) \
|
||||||
|
: : "r" (out) : "memory" );
|
||||||
|
|
||||||
#define PERMUTE_DIR(dir) \
|
#define PERMUTE_DIR(dir) \
|
||||||
permute##dir(Chi_0,Chi_0);\
|
permute##dir(Chi_0,Chi_0);\
|
||||||
@ -581,10 +618,9 @@ namespace QCD {
|
|||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
DoubledGaugeField &UUU,
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
SiteSpinor *buf, int LLs,
|
const FermionField &in, FermionField &out,int dag)
|
||||||
int sU, const FermionField &in, FermionField &out)
|
|
||||||
{
|
{
|
||||||
assert(0);
|
assert(0);
|
||||||
};
|
};
|
||||||
@ -645,10 +681,9 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
// This is the single precision 5th direction vectorised kernel
|
// This is the single precision 5th direction vectorised kernel
|
||||||
#include <simd/Intel512single.h>
|
#include <simd/Intel512single.h>
|
||||||
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
DoubledGaugeField &UUU,
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
SiteSpinor *buf, int LLs,
|
const FermionField &in, FermionField &out,int dag)
|
||||||
int sU, const FermionField &in, FermionField &out)
|
|
||||||
{
|
{
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
uint64_t gauge0,gauge1,gauge2,gauge3;
|
uint64_t gauge0,gauge1,gauge2,gauge3;
|
||||||
@ -685,7 +720,11 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
|
|||||||
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
||||||
|
|
||||||
addr0 = (uint64_t) &out._odata[sF];
|
addr0 = (uint64_t) &out._odata[sF];
|
||||||
REDUCE(addr0);
|
if ( dag ) {
|
||||||
|
nREDUCE(addr0);
|
||||||
|
} else {
|
||||||
|
REDUCE(addr0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
assert(0);
|
assert(0);
|
||||||
@ -695,10 +734,9 @@ template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl
|
|||||||
|
|
||||||
#include <simd/Intel512double.h>
|
#include <simd/Intel512double.h>
|
||||||
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
DoubledGaugeField &UUU,
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
SiteSpinor *buf, int LLs,
|
const FermionField &in, FermionField &out,int dag)
|
||||||
int sU, const FermionField &in, FermionField &out)
|
|
||||||
{
|
{
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
uint64_t gauge0,gauge1,gauge2,gauge3;
|
uint64_t gauge0,gauge1,gauge2,gauge3;
|
||||||
@ -734,7 +772,11 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
|
|||||||
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
|
||||||
|
|
||||||
addr0 = (uint64_t) &out._odata[sF];
|
addr0 = (uint64_t) &out._odata[sF];
|
||||||
REDUCE(addr0);
|
if ( dag ) {
|
||||||
|
nREDUCE(addr0);
|
||||||
|
} else {
|
||||||
|
REDUCE(addr0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
assert(0);
|
assert(0);
|
||||||
@ -776,10 +818,9 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl
|
|||||||
|
|
||||||
#include <simd/Intel512single.h>
|
#include <simd/Intel512single.h>
|
||||||
template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
DoubledGaugeField &UUU,
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
SiteSpinor *buf, int LLs,
|
const FermionField &in, FermionField &out,int dag)
|
||||||
int sU, const FermionField &in, FermionField &out)
|
|
||||||
{
|
{
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
uint64_t gauge0,gauge1,gauge2,gauge3;
|
uint64_t gauge0,gauge1,gauge2,gauge3;
|
||||||
@ -832,7 +873,11 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
|
|||||||
MULT_ADD_XYZT(gauge2,gauge3);
|
MULT_ADD_XYZT(gauge2,gauge3);
|
||||||
|
|
||||||
addr0 = (uint64_t) &out._odata[sF];
|
addr0 = (uint64_t) &out._odata[sF];
|
||||||
REDUCEa(addr0);
|
if ( dag ) {
|
||||||
|
nREDUCEa(addr0);
|
||||||
|
} else {
|
||||||
|
REDUCEa(addr0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
assert(0);
|
assert(0);
|
||||||
@ -841,10 +886,9 @@ template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st,
|
|||||||
|
|
||||||
#include <simd/Intel512double.h>
|
#include <simd/Intel512double.h>
|
||||||
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
DoubledGaugeField &UUU,
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
SiteSpinor *buf, int LLs,
|
const FermionField &in, FermionField &out,int dag)
|
||||||
int sU, const FermionField &in, FermionField &out)
|
|
||||||
{
|
{
|
||||||
#ifdef AVX512
|
#ifdef AVX512
|
||||||
uint64_t gauge0,gauge1,gauge2,gauge3;
|
uint64_t gauge0,gauge1,gauge2,gauge3;
|
||||||
@ -897,7 +941,11 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
|
|||||||
MULT_ADD_XYZT(gauge2,gauge3);
|
MULT_ADD_XYZT(gauge2,gauge3);
|
||||||
|
|
||||||
addr0 = (uint64_t) &out._odata[sF];
|
addr0 = (uint64_t) &out._odata[sF];
|
||||||
REDUCEa(addr0);
|
if ( dag ) {
|
||||||
|
nREDUCEa(addr0);
|
||||||
|
} else {
|
||||||
|
REDUCEa(addr0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
assert(0);
|
assert(0);
|
||||||
@ -909,7 +957,7 @@ template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st,
|
|||||||
DoubledGaugeField &U, \
|
DoubledGaugeField &U, \
|
||||||
DoubledGaugeField &UUU, \
|
DoubledGaugeField &UUU, \
|
||||||
SiteSpinor *buf, int LLs, \
|
SiteSpinor *buf, int LLs, \
|
||||||
int sU, const FermionField &in, FermionField &out);
|
int sU, const FermionField &in, FermionField &out,int dag);
|
||||||
|
|
||||||
KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplD);
|
KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplD);
|
||||||
KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplF);
|
KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplF);
|
||||||
|
@ -28,7 +28,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
/* END LEGAL */
|
/* END LEGAL */
|
||||||
#include <Grid.h>
|
#include <Grid.h>
|
||||||
|
|
||||||
#define REGISTER
|
|
||||||
|
|
||||||
#define LOAD_CHI(b) \
|
#define LOAD_CHI(b) \
|
||||||
const SiteSpinor & ref (b[offset]); \
|
const SiteSpinor & ref (b[offset]); \
|
||||||
@ -59,7 +58,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
UChi ## _1 += U_12*Chi_2;\
|
UChi ## _1 += U_12*Chi_2;\
|
||||||
UChi ## _2 += U_22*Chi_2;
|
UChi ## _2 += U_22*Chi_2;
|
||||||
|
|
||||||
#define MULT_ADD(A,UChi) \
|
#define MULT_ADD(U,A,UChi) \
|
||||||
auto & ref(U._odata[sU](A)); \
|
auto & ref(U._odata[sU](A)); \
|
||||||
Impl::loadLinkElement(U_00,ref()(0,0)); \
|
Impl::loadLinkElement(U_00,ref()(0,0)); \
|
||||||
Impl::loadLinkElement(U_10,ref()(1,0)); \
|
Impl::loadLinkElement(U_10,ref()(1,0)); \
|
||||||
@ -82,241 +81,319 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
|
|
||||||
#define PERMUTE_DIR(dir) \
|
#define PERMUTE_DIR(dir) \
|
||||||
permute##dir(Chi_0,Chi_0);\
|
permute##dir(Chi_0,Chi_0); \
|
||||||
permute##dir(Chi_1,Chi_1);\
|
permute##dir(Chi_1,Chi_1); \
|
||||||
permute##dir(Chi_2,Chi_2);
|
permute##dir(Chi_2,Chi_2);
|
||||||
|
|
||||||
|
|
||||||
|
#define HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
|
||||||
|
SE=st.GetEntry(ptype,Dir+skew,sF); \
|
||||||
|
offset = SE->_offset; \
|
||||||
|
local = SE->_is_local; \
|
||||||
|
perm = SE->_permute; \
|
||||||
|
if ( local ) { \
|
||||||
|
LOAD_CHI(in._odata); \
|
||||||
|
if ( perm) { \
|
||||||
|
PERMUTE_DIR(Perm); \
|
||||||
|
} \
|
||||||
|
} else { \
|
||||||
|
LOAD_CHI(buf); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even) \
|
||||||
|
HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
|
||||||
|
{ \
|
||||||
|
MULT(Dir,even); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define HAND_STENCIL_LEG(U,Dir,Perm,skew,even) \
|
||||||
|
HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
|
||||||
|
{ \
|
||||||
|
MULT_ADD(U,Dir,even); \
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even) \
|
||||||
|
SE=st.GetEntry(ptype,Dir+skew,sF); \
|
||||||
|
offset = SE->_offset; \
|
||||||
|
local = SE->_is_local; \
|
||||||
|
perm = SE->_permute; \
|
||||||
|
if ( local ) { \
|
||||||
|
LOAD_CHI(in._odata); \
|
||||||
|
if ( perm) { \
|
||||||
|
PERMUTE_DIR(Perm); \
|
||||||
|
} \
|
||||||
|
} else if ( st.same_node[Dir] ) { \
|
||||||
|
LOAD_CHI(buf); \
|
||||||
|
} \
|
||||||
|
if (SE->_is_local || st.same_node[Dir] ) { \
|
||||||
|
MULT_ADD(U,Dir,even); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define HAND_STENCIL_LEG_EXT(U,Dir,Perm,skew,even) \
|
||||||
|
SE=st.GetEntry(ptype,Dir+skew,sF); \
|
||||||
|
offset = SE->_offset; \
|
||||||
|
local = SE->_is_local; \
|
||||||
|
perm = SE->_permute; \
|
||||||
|
if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
|
||||||
|
nmu++; \
|
||||||
|
{ LOAD_CHI(buf); } \
|
||||||
|
{ MULT_ADD(U,Dir,even); } \
|
||||||
|
}
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
namespace QCD {
|
namespace QCD {
|
||||||
|
|
||||||
|
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
|
void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
|
||||||
SiteSpinor *buf, int LLs,
|
DoubledGaugeField &U,DoubledGaugeField &UUU,
|
||||||
int sU, const FermionField &in, FermionField &out, int dag)
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
{
|
const FermionField &in, FermionField &out,int dag)
|
||||||
SiteSpinor naik;
|
|
||||||
SiteSpinor naive;
|
|
||||||
int oneLink =0;
|
|
||||||
int threeLink=1;
|
|
||||||
int skew(0);
|
|
||||||
Real scale(1.0);
|
|
||||||
|
|
||||||
if(dag) scale = -1.0;
|
|
||||||
|
|
||||||
for(int s=0;s<LLs;s++){
|
|
||||||
int sF=s+LLs*sU;
|
|
||||||
DhopSiteDepthHand(st,lo,U,buf,sF,sU,in,naive,oneLink);
|
|
||||||
DhopSiteDepthHand(st,lo,UUU,buf,sF,sU,in,naik,threeLink);
|
|
||||||
out._odata[sF] =scale*(naive+naik);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template <class Impl>
|
|
||||||
void StaggeredKernels<Impl>::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
|
||||||
SiteSpinor *buf, int sF,
|
|
||||||
int sU, const FermionField &in, SiteSpinor &out,int threeLink)
|
|
||||||
{
|
{
|
||||||
typedef typename Simd::scalar_type S;
|
typedef typename Simd::scalar_type S;
|
||||||
typedef typename Simd::vector_type V;
|
typedef typename Simd::vector_type V;
|
||||||
|
|
||||||
REGISTER Simd even_0; // 12 regs on knc
|
Simd even_0; // 12 regs on knc
|
||||||
REGISTER Simd even_1;
|
Simd even_1;
|
||||||
REGISTER Simd even_2;
|
Simd even_2;
|
||||||
REGISTER Simd odd_0; // 12 regs on knc
|
Simd odd_0; // 12 regs on knc
|
||||||
REGISTER Simd odd_1;
|
Simd odd_1;
|
||||||
REGISTER Simd odd_2;
|
Simd odd_2;
|
||||||
|
|
||||||
REGISTER Simd Chi_0; // two spinor; 6 regs
|
Simd Chi_0; // two spinor; 6 regs
|
||||||
REGISTER Simd Chi_1;
|
Simd Chi_1;
|
||||||
REGISTER Simd Chi_2;
|
Simd Chi_2;
|
||||||
|
|
||||||
REGISTER Simd U_00; // two rows of U matrix
|
Simd U_00; // two rows of U matrix
|
||||||
REGISTER Simd U_10;
|
Simd U_10;
|
||||||
REGISTER Simd U_20;
|
Simd U_20;
|
||||||
REGISTER Simd U_01;
|
Simd U_01;
|
||||||
REGISTER Simd U_11;
|
Simd U_11;
|
||||||
REGISTER Simd U_21; // 2 reg left.
|
Simd U_21; // 2 reg left.
|
||||||
REGISTER Simd U_02;
|
Simd U_02;
|
||||||
REGISTER Simd U_12;
|
Simd U_12;
|
||||||
REGISTER Simd U_22;
|
Simd U_22;
|
||||||
|
|
||||||
int skew = 0;
|
|
||||||
if (threeLink) skew=8;
|
|
||||||
|
|
||||||
|
SiteSpinor result;
|
||||||
int offset,local,perm, ptype;
|
int offset,local,perm, ptype;
|
||||||
|
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
|
int skew;
|
||||||
|
|
||||||
// Xp
|
for(int s=0;s<LLs;s++){
|
||||||
SE=st.GetEntry(ptype,Xp+skew,sF);
|
int sF=s+LLs*sU;
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
LOAD_CHI(in._odata);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(buf);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT(Xp,even);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Yp
|
|
||||||
SE=st.GetEntry(ptype,Yp+skew,sF);
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
LOAD_CHI(in._odata);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(buf);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT(Yp,odd);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
skew = 0;
|
||||||
// Zp
|
HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even);
|
||||||
SE=st.GetEntry(ptype,Zp+skew,sF);
|
HAND_STENCIL_LEG_BEGIN(Yp,2,skew,odd);
|
||||||
offset = SE->_offset;
|
HAND_STENCIL_LEG (U,Zp,1,skew,even);
|
||||||
local = SE->_is_local;
|
HAND_STENCIL_LEG (U,Tp,0,skew,odd);
|
||||||
perm = SE->_permute;
|
HAND_STENCIL_LEG (U,Xm,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG (U,Ym,2,skew,odd);
|
||||||
if ( local ) {
|
HAND_STENCIL_LEG (U,Zm,1,skew,even);
|
||||||
LOAD_CHI(in._odata);
|
HAND_STENCIL_LEG (U,Tm,0,skew,odd);
|
||||||
if ( perm) {
|
skew = 8;
|
||||||
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
HAND_STENCIL_LEG(UUU,Xp,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG(UUU,Yp,2,skew,odd);
|
||||||
|
HAND_STENCIL_LEG(UUU,Zp,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG(UUU,Tp,0,skew,odd);
|
||||||
|
HAND_STENCIL_LEG(UUU,Xm,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG(UUU,Ym,2,skew,odd);
|
||||||
|
HAND_STENCIL_LEG(UUU,Zm,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG(UUU,Tm,0,skew,odd);
|
||||||
|
|
||||||
|
if ( dag ) {
|
||||||
|
result()()(0) = - even_0 - odd_0;
|
||||||
|
result()()(1) = - even_1 - odd_1;
|
||||||
|
result()()(2) = - even_2 - odd_2;
|
||||||
|
} else {
|
||||||
|
result()()(0) = even_0 + odd_0;
|
||||||
|
result()()(1) = even_1 + odd_1;
|
||||||
|
result()()(2) = even_2 + odd_2;
|
||||||
}
|
}
|
||||||
} else {
|
vstream(out._odata[sF],result);
|
||||||
LOAD_CHI(buf);
|
|
||||||
}
|
}
|
||||||
{
|
|
||||||
MULT_ADD(Zp,even);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Tp
|
|
||||||
SE=st.GetEntry(ptype,Tp+skew,sF);
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
LOAD_CHI(in._odata);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(buf);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_ADD(Tp,odd);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Xm
|
|
||||||
SE=st.GetEntry(ptype,Xm+skew,sF);
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
LOAD_CHI(in._odata);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(buf);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_ADD(Xm,even);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Ym
|
|
||||||
SE=st.GetEntry(ptype,Ym+skew,sF);
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
LOAD_CHI(in._odata);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(buf);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_ADD(Ym,odd);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Zm
|
|
||||||
SE=st.GetEntry(ptype,Zm+skew,sF);
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
LOAD_CHI(in._odata);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(buf);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_ADD(Zm,even);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Tm
|
|
||||||
SE=st.GetEntry(ptype,Tm+skew,sF);
|
|
||||||
offset = SE->_offset;
|
|
||||||
local = SE->_is_local;
|
|
||||||
perm = SE->_permute;
|
|
||||||
|
|
||||||
if ( local ) {
|
|
||||||
LOAD_CHI(in._odata);
|
|
||||||
if ( perm) {
|
|
||||||
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
LOAD_CHI(buf);
|
|
||||||
}
|
|
||||||
{
|
|
||||||
MULT_ADD(Tm,odd);
|
|
||||||
}
|
|
||||||
|
|
||||||
vstream(out()()(0),even_0+odd_0);
|
|
||||||
vstream(out()()(1),even_1+odd_1);
|
|
||||||
vstream(out()()(2),even_2+odd_2);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag)
|
||||||
|
{
|
||||||
|
typedef typename Simd::scalar_type S;
|
||||||
|
typedef typename Simd::vector_type V;
|
||||||
|
|
||||||
|
Simd even_0; // 12 regs on knc
|
||||||
|
Simd even_1;
|
||||||
|
Simd even_2;
|
||||||
|
Simd odd_0; // 12 regs on knc
|
||||||
|
Simd odd_1;
|
||||||
|
Simd odd_2;
|
||||||
|
|
||||||
|
Simd Chi_0; // two spinor; 6 regs
|
||||||
|
Simd Chi_1;
|
||||||
|
Simd Chi_2;
|
||||||
|
|
||||||
|
Simd U_00; // two rows of U matrix
|
||||||
|
Simd U_10;
|
||||||
|
Simd U_20;
|
||||||
|
Simd U_01;
|
||||||
|
Simd U_11;
|
||||||
|
Simd U_21; // 2 reg left.
|
||||||
|
Simd U_02;
|
||||||
|
Simd U_12;
|
||||||
|
Simd U_22;
|
||||||
|
|
||||||
|
SiteSpinor result;
|
||||||
|
int offset,local,perm, ptype;
|
||||||
|
|
||||||
|
StencilEntry *SE;
|
||||||
|
int skew;
|
||||||
|
|
||||||
|
for(int s=0;s<LLs;s++){
|
||||||
|
int sF=s+LLs*sU;
|
||||||
|
|
||||||
|
even_0 = zero; even_1 = zero; even_2 = zero;
|
||||||
|
odd_0 = zero; odd_1 = zero; odd_2 = zero;
|
||||||
|
|
||||||
|
skew = 0;
|
||||||
|
HAND_STENCIL_LEG_INT(U,Xp,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG_INT(U,Yp,2,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_INT(U,Zp,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG_INT(U,Tp,0,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_INT(U,Xm,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_INT(U,Zm,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd);
|
||||||
|
skew = 8;
|
||||||
|
HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_INT(UUU,Zp,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG_INT(UUU,Tp,0,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_INT(UUU,Xm,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd);
|
||||||
|
|
||||||
|
// Assume every site must be connected to at least one interior point. No 1^4 subvols.
|
||||||
|
if ( dag ) {
|
||||||
|
result()()(0) = - even_0 - odd_0;
|
||||||
|
result()()(1) = - even_1 - odd_1;
|
||||||
|
result()()(2) = - even_2 - odd_2;
|
||||||
|
} else {
|
||||||
|
result()()(0) = even_0 + odd_0;
|
||||||
|
result()()(1) = even_1 + odd_1;
|
||||||
|
result()()(2) = even_2 + odd_2;
|
||||||
|
}
|
||||||
|
vstream(out._odata[sF],result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U, DoubledGaugeField &UUU,
|
||||||
|
SiteSpinor *buf, int LLs, int sU,
|
||||||
|
const FermionField &in, FermionField &out,int dag)
|
||||||
|
{
|
||||||
|
typedef typename Simd::scalar_type S;
|
||||||
|
typedef typename Simd::vector_type V;
|
||||||
|
|
||||||
|
Simd even_0; // 12 regs on knc
|
||||||
|
Simd even_1;
|
||||||
|
Simd even_2;
|
||||||
|
Simd odd_0; // 12 regs on knc
|
||||||
|
Simd odd_1;
|
||||||
|
Simd odd_2;
|
||||||
|
|
||||||
|
Simd Chi_0; // two spinor; 6 regs
|
||||||
|
Simd Chi_1;
|
||||||
|
Simd Chi_2;
|
||||||
|
|
||||||
|
Simd U_00; // two rows of U matrix
|
||||||
|
Simd U_10;
|
||||||
|
Simd U_20;
|
||||||
|
Simd U_01;
|
||||||
|
Simd U_11;
|
||||||
|
Simd U_21; // 2 reg left.
|
||||||
|
Simd U_02;
|
||||||
|
Simd U_12;
|
||||||
|
Simd U_22;
|
||||||
|
|
||||||
|
SiteSpinor result;
|
||||||
|
int offset,local,perm, ptype;
|
||||||
|
|
||||||
|
StencilEntry *SE;
|
||||||
|
int skew;
|
||||||
|
|
||||||
|
for(int s=0;s<LLs;s++){
|
||||||
|
int sF=s+LLs*sU;
|
||||||
|
|
||||||
|
even_0 = zero; even_1 = zero; even_2 = zero;
|
||||||
|
odd_0 = zero; odd_1 = zero; odd_2 = zero;
|
||||||
|
int nmu=0;
|
||||||
|
skew = 0;
|
||||||
|
HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG_EXT(U,Yp,2,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_EXT(U,Zp,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG_EXT(U,Tp,0,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_EXT(U,Xm,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd);
|
||||||
|
skew = 8;
|
||||||
|
HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_EXT(UUU,Zp,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG_EXT(UUU,Tp,0,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_EXT(UUU,Xm,3,skew,even);
|
||||||
|
HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd);
|
||||||
|
HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even);
|
||||||
|
HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd);
|
||||||
|
|
||||||
|
// Add sum of all exterior connected stencil legs
|
||||||
|
if ( nmu ) {
|
||||||
|
if ( dag ) {
|
||||||
|
result()()(0) = - even_0 - odd_0;
|
||||||
|
result()()(1) = - even_1 - odd_1;
|
||||||
|
result()()(2) = - even_2 - odd_2;
|
||||||
|
} else {
|
||||||
|
result()()(0) = even_0 + odd_0;
|
||||||
|
result()()(1) = even_1 + odd_1;
|
||||||
|
result()()(2) = even_2 + odd_2;
|
||||||
|
}
|
||||||
|
out._odata[sF] = out._odata[sF] + result;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \
|
#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \
|
||||||
template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
|
template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
|
||||||
DoubledGaugeField &U,DoubledGaugeField &UUU, \
|
DoubledGaugeField &U,DoubledGaugeField &UUU, \
|
||||||
SiteSpinor *buf, int LLs, \
|
SiteSpinor *buf, int LLs, int sU, \
|
||||||
int sU, const FermionField &in, FermionField &out, int dag);
|
const FermionField &in, FermionField &out, int dag); \
|
||||||
|
\
|
||||||
|
template void StaggeredKernels<IMPL>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \
|
||||||
|
DoubledGaugeField &U,DoubledGaugeField &UUU, \
|
||||||
|
SiteSpinor *buf, int LLs, int sU, \
|
||||||
|
const FermionField &in, FermionField &out, int dag); \
|
||||||
|
\
|
||||||
|
template void StaggeredKernels<IMPL>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \
|
||||||
|
DoubledGaugeField &U,DoubledGaugeField &UUU, \
|
||||||
|
SiteSpinor *buf, int LLs, int sU, \
|
||||||
|
const FermionField &in, FermionField &out, int dag); \
|
||||||
|
|
||||||
#define DHOP_SITE_DEPTH_HAND_INSTANTIATE(IMPL) \
|
|
||||||
template void StaggeredKernels<IMPL>::DhopSiteDepthHand(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, \
|
|
||||||
SiteSpinor *buf, int sF, \
|
|
||||||
int sU, const FermionField &in, SiteSpinor &out,int threeLink) ;
|
|
||||||
DHOP_SITE_HAND_INSTANTIATE(StaggeredImplD);
|
DHOP_SITE_HAND_INSTANTIATE(StaggeredImplD);
|
||||||
DHOP_SITE_HAND_INSTANTIATE(StaggeredImplF);
|
DHOP_SITE_HAND_INSTANTIATE(StaggeredImplF);
|
||||||
DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplD);
|
DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplD);
|
||||||
DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplF);
|
DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplF);
|
||||||
|
|
||||||
DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredImplD);
|
|
||||||
DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredImplF);
|
|
||||||
DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredVec5dImplD);
|
|
||||||
DHOP_SITE_DEPTH_HAND_INSTANTIATE(StaggeredVec5dImplF);
|
|
||||||
|
|
||||||
}}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@ -69,39 +69,47 @@ class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
|
|||||||
/*****************************************************/
|
/*****************************************************/
|
||||||
/* Compress includes precision change if mpi data is not same */
|
/* Compress includes precision change if mpi data is not same */
|
||||||
/*****************************************************/
|
/*****************************************************/
|
||||||
inline void Compress(SiteHalfSpinor *buf,Integer o,const SiteSpinor &in) {
|
inline void Compress(SiteHalfSpinor * __restrict__ buf,Integer o,const SiteSpinor &in) {
|
||||||
projector::Proj(buf[o],in,mu,dag);
|
SiteHalfSpinor tmp;
|
||||||
|
projector::Proj(tmp,in,mu,dag);
|
||||||
|
vstream(buf[o],tmp);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*****************************************************/
|
/*****************************************************/
|
||||||
/* Exchange includes precision change if mpi data is not same */
|
/* Exchange includes precision change if mpi data is not same */
|
||||||
/*****************************************************/
|
/*****************************************************/
|
||||||
inline void Exchange(SiteHalfSpinor *mp,
|
inline void Exchange(SiteHalfSpinor * __restrict__ mp,
|
||||||
SiteHalfSpinor *vp0,
|
const SiteHalfSpinor * __restrict__ vp0,
|
||||||
SiteHalfSpinor *vp1,
|
const SiteHalfSpinor * __restrict__ vp1,
|
||||||
Integer type,Integer o){
|
Integer type,Integer o){
|
||||||
exchange(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type);
|
SiteHalfSpinor tmp1;
|
||||||
|
SiteHalfSpinor tmp2;
|
||||||
|
exchange(tmp1,tmp2,vp0[o],vp1[o],type);
|
||||||
|
vstream(mp[2*o ],tmp1);
|
||||||
|
vstream(mp[2*o+1],tmp2);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*****************************************************/
|
/*****************************************************/
|
||||||
/* Have a decompression step if mpi data is not same */
|
/* Have a decompression step if mpi data is not same */
|
||||||
/*****************************************************/
|
/*****************************************************/
|
||||||
inline void Decompress(SiteHalfSpinor *out,
|
inline void Decompress(SiteHalfSpinor * __restrict__ out,
|
||||||
SiteHalfSpinor *in, Integer o) {
|
SiteHalfSpinor * __restrict__ in, Integer o) {
|
||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*****************************************************/
|
/*****************************************************/
|
||||||
/* Compress Exchange */
|
/* Compress Exchange */
|
||||||
/*****************************************************/
|
/*****************************************************/
|
||||||
inline void CompressExchange(SiteHalfSpinor *out0,
|
inline void CompressExchange(SiteHalfSpinor * __restrict__ out0,
|
||||||
SiteHalfSpinor *out1,
|
SiteHalfSpinor * __restrict__ out1,
|
||||||
const SiteSpinor *in,
|
const SiteSpinor * __restrict__ in,
|
||||||
Integer j,Integer k, Integer m,Integer type){
|
Integer j,Integer k, Integer m,Integer type){
|
||||||
SiteHalfSpinor temp1, temp2,temp3,temp4;
|
SiteHalfSpinor temp1, temp2,temp3,temp4;
|
||||||
projector::Proj(temp1,in[k],mu,dag);
|
projector::Proj(temp1,in[k],mu,dag);
|
||||||
projector::Proj(temp2,in[m],mu,dag);
|
projector::Proj(temp2,in[m],mu,dag);
|
||||||
exchange(out0[j],out1[j],temp1,temp2,type);
|
exchange(temp3,temp4,temp1,temp2,type);
|
||||||
|
vstream(out0[j],temp3);
|
||||||
|
vstream(out1[j],temp4);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*****************************************************/
|
/*****************************************************/
|
||||||
@ -266,41 +274,16 @@ public:
|
|||||||
if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
|
if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<int> same_node;
|
|
||||||
std::vector<int> surface_list;
|
|
||||||
|
|
||||||
WilsonStencil(GridBase *grid,
|
WilsonStencil(GridBase *grid,
|
||||||
int npoints,
|
int npoints,
|
||||||
int checkerboard,
|
int checkerboard,
|
||||||
const std::vector<int> &directions,
|
const std::vector<int> &directions,
|
||||||
const std::vector<int> &distances)
|
const std::vector<int> &distances)
|
||||||
: CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) ,
|
: CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances)
|
||||||
same_node(npoints)
|
|
||||||
{
|
{
|
||||||
ZeroCountersi();
|
ZeroCountersi();
|
||||||
surface_list.resize(0);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
void BuildSurfaceList(int Ls,int vol4){
|
|
||||||
|
|
||||||
// find same node for SHM
|
|
||||||
// Here we know the distance is 1 for WilsonStencil
|
|
||||||
for(int point=0;point<this->_npoints;point++){
|
|
||||||
same_node[point] = this->SameNode(point);
|
|
||||||
}
|
|
||||||
|
|
||||||
for(int site = 0 ;site< vol4;site++){
|
|
||||||
int local = 1;
|
|
||||||
for(int point=0;point<this->_npoints;point++){
|
|
||||||
if( (!this->GetNodeLocal(site*Ls,point)) && (!same_node[point]) ){
|
|
||||||
local = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if(local == 0) {
|
|
||||||
surface_list.push_back(site);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template < class compressor>
|
template < class compressor>
|
||||||
void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
|
void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
|
||||||
@ -361,23 +344,23 @@ public:
|
|||||||
int dag = compress.dag;
|
int dag = compress.dag;
|
||||||
int face_idx=0;
|
int face_idx=0;
|
||||||
if ( dag ) {
|
if ( dag ) {
|
||||||
assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
|
assert(this->same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
|
||||||
assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
|
assert(this->same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
|
||||||
assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
|
assert(this->same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
|
||||||
assert(same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx));
|
assert(this->same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx));
|
||||||
assert(same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx));
|
assert(this->same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx));
|
||||||
assert(same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx));
|
assert(this->same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx));
|
||||||
assert(same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
|
assert(this->same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
|
||||||
assert(same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx));
|
assert(this->same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx));
|
||||||
} else {
|
} else {
|
||||||
assert(same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx));
|
assert(this->same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx));
|
||||||
assert(same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx));
|
assert(this->same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx));
|
||||||
assert(same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
|
assert(this->same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
|
||||||
assert(same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx));
|
assert(this->same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx));
|
||||||
assert(same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx));
|
assert(this->same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx));
|
||||||
assert(same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx));
|
assert(this->same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx));
|
||||||
assert(same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
|
assert(this->same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
|
||||||
assert(same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx));
|
assert(this->same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx));
|
||||||
}
|
}
|
||||||
this->face_table_computed=1;
|
this->face_table_computed=1;
|
||||||
assert(this->u_comm_offset==this->_unified_buffer_size);
|
assert(this->u_comm_offset==this->_unified_buffer_size);
|
||||||
|
@ -348,15 +348,98 @@ void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,
|
|||||||
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
|
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
|
||||||
Kernels::DhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma);
|
Kernels::DhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma);
|
||||||
}
|
}
|
||||||
};
|
}
|
||||||
|
/*Change starts*/
|
||||||
template <class Impl>
|
template <class Impl>
|
||||||
void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
|
void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
|
||||||
DoubledGaugeField &U,
|
DoubledGaugeField &U,
|
||||||
const FermionField &in,
|
const FermionField &in,
|
||||||
FermionField &out, int dag) {
|
FermionField &out, int dag) {
|
||||||
assert((dag == DaggerNo) || (dag == DaggerYes));
|
#ifdef GRID_OMP
|
||||||
|
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
|
||||||
|
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
DhopInternalSerial(st,lo,U,in,out,dag);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U,
|
||||||
|
const FermionField &in,
|
||||||
|
FermionField &out, int dag) {
|
||||||
|
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||||
|
#ifdef GRID_OMP
|
||||||
|
Compressor compressor;
|
||||||
|
int len = U._grid->oSites();
|
||||||
|
const int LLs = 1;
|
||||||
|
|
||||||
|
st.Prepare();
|
||||||
|
st.HaloGather(in,compressor);
|
||||||
|
st.CommsMergeSHM(compressor);
|
||||||
|
#pragma omp parallel
|
||||||
|
{
|
||||||
|
int tid = omp_get_thread_num();
|
||||||
|
int nthreads = omp_get_num_threads();
|
||||||
|
int ncomms = CartesianCommunicator::nCommThreads;
|
||||||
|
if (ncomms == -1) ncomms = 1;
|
||||||
|
assert(nthreads > ncomms);
|
||||||
|
if (tid >= ncomms) {
|
||||||
|
nthreads -= ncomms;
|
||||||
|
int ttid = tid - ncomms;
|
||||||
|
int n = len;
|
||||||
|
int chunk = n / nthreads;
|
||||||
|
int rem = n % nthreads;
|
||||||
|
int myblock, myn;
|
||||||
|
if (ttid < rem) {
|
||||||
|
myblock = ttid * chunk + ttid;
|
||||||
|
myn = chunk+1;
|
||||||
|
} else {
|
||||||
|
myblock = ttid*chunk + rem;
|
||||||
|
myn = chunk;
|
||||||
|
}
|
||||||
|
// do the compute
|
||||||
|
if (dag == DaggerYes) {
|
||||||
|
|
||||||
|
for (int sss = myblock; sss < myblock+myn; ++sss) {
|
||||||
|
Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (int sss = myblock; sss < myblock+myn; ++sss) {
|
||||||
|
Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
|
||||||
|
}
|
||||||
|
} //else
|
||||||
|
|
||||||
|
} else {
|
||||||
|
st.CommunicateThreaded();
|
||||||
|
}
|
||||||
|
|
||||||
|
Compressor compressor(dag);
|
||||||
|
|
||||||
|
if (dag == DaggerYes) {
|
||||||
|
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
|
||||||
|
Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
|
||||||
|
Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} //pragma
|
||||||
|
#else
|
||||||
|
assert(0);
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
template <class Impl>
|
||||||
|
void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
|
||||||
|
DoubledGaugeField &U,
|
||||||
|
const FermionField &in,
|
||||||
|
FermionField &out, int dag) {
|
||||||
|
assert((dag == DaggerNo) || (dag == DaggerYes));
|
||||||
Compressor compressor(dag);
|
Compressor compressor(dag);
|
||||||
st.HaloExchange(in, compressor);
|
st.HaloExchange(in, compressor);
|
||||||
|
|
||||||
@ -370,6 +453,7 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
/*Change ends */
|
||||||
|
|
||||||
/*******************************************************************************
|
/*******************************************************************************
|
||||||
* Conserved current utilities for Wilson fermions, for contracting propagators
|
* Conserved current utilities for Wilson fermions, for contracting propagators
|
||||||
|
@ -130,6 +130,12 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
|
|||||||
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||||
const FermionField &in, FermionField &out, int dag);
|
const FermionField &in, FermionField &out, int dag);
|
||||||
|
|
||||||
|
void DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||||
|
const FermionField &in, FermionField &out, int dag);
|
||||||
|
|
||||||
|
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
|
||||||
|
const FermionField &in, FermionField &out, int dag);
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
|
WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
|
||||||
GridRedBlackCartesian &Hgrid, RealD _mass,
|
GridRedBlackCartesian &Hgrid, RealD _mass,
|
||||||
@ -145,6 +151,8 @@ class WilsonFermion : public WilsonKernels<Impl>, public WilsonFermionStatic {
|
|||||||
|
|
||||||
// protected:
|
// protected:
|
||||||
public:
|
public:
|
||||||
|
virtual RealD Mass(void) { return mass; }
|
||||||
|
virtual int isTrivialEE(void) { return 1; };
|
||||||
RealD mass;
|
RealD mass;
|
||||||
RealD diag_mass;
|
RealD diag_mass;
|
||||||
|
|
||||||
|
@ -445,8 +445,7 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
ptime = usecond() - start;
|
ptime = usecond() - start;
|
||||||
}
|
} else {
|
||||||
{
|
|
||||||
double start = usecond();
|
double start = usecond();
|
||||||
st.CommunicateThreaded();
|
st.CommunicateThreaded();
|
||||||
ctime = usecond() - start;
|
ctime = usecond() - start;
|
||||||
|
@ -53,7 +53,7 @@ template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public
|
|||||||
typedef FermionOperator<Impl> Base;
|
typedef FermionOperator<Impl> Base;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
template <bool EnableBool = true>
|
template <bool EnableBool = true>
|
||||||
typename std::enable_if<Impl::isFundamental==true && Nc == 3 &&EnableBool, void>::type
|
typename std::enable_if<Impl::isFundamental==true && Nc == 3 &&EnableBool, void>::type
|
||||||
DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||||
@ -70,27 +70,27 @@ public:
|
|||||||
break;
|
break;
|
||||||
#endif
|
#endif
|
||||||
case OptHandUnroll:
|
case OptHandUnroll:
|
||||||
for (int site = 0; site < Ns; site++) {
|
for (int site = 0; site < Ns; site++) {
|
||||||
for (int s = 0; s < Ls; s++) {
|
for (int s = 0; s < Ls; s++) {
|
||||||
if(interior&&exterior) WilsonKernels<Impl>::HandDhopSite(st,lo,U,buf,sF,sU,in,out);
|
if(interior&&exterior) WilsonKernels<Impl>::HandDhopSite(st,lo,U,buf,sF,sU,in,out);
|
||||||
else if (interior) WilsonKernels<Impl>::HandDhopSiteInt(st,lo,U,buf,sF,sU,in,out);
|
else if (interior) WilsonKernels<Impl>::HandDhopSiteInt(st,lo,U,buf,sF,sU,in,out);
|
||||||
else if (exterior) WilsonKernels<Impl>::HandDhopSiteExt(st,lo,U,buf,sF,sU,in,out);
|
else if (exterior) WilsonKernels<Impl>::HandDhopSiteExt(st,lo,U,buf,sF,sU,in,out);
|
||||||
sF++;
|
sF++;
|
||||||
}
|
}
|
||||||
sU++;
|
sU++;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case OptGeneric:
|
case OptGeneric:
|
||||||
for (int site = 0; site < Ns; site++) {
|
for (int site = 0; site < Ns; site++) {
|
||||||
for (int s = 0; s < Ls; s++) {
|
for (int s = 0; s < Ls; s++) {
|
||||||
if(interior&&exterior) WilsonKernels<Impl>::GenericDhopSite(st,lo,U,buf,sF,sU,in,out);
|
if(interior&&exterior) WilsonKernels<Impl>::GenericDhopSite(st,lo,U,buf,sF,sU,in,out);
|
||||||
else if (interior) WilsonKernels<Impl>::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out);
|
else if (interior) WilsonKernels<Impl>::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out);
|
||||||
else if (exterior) WilsonKernels<Impl>::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out);
|
else if (exterior) WilsonKernels<Impl>::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out);
|
||||||
else assert(0);
|
else assert(0);
|
||||||
sF++;
|
sF++;
|
||||||
}
|
}
|
||||||
sU++;
|
sU++;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
assert(0);
|
assert(0);
|
||||||
@ -232,6 +232,7 @@ private:
|
|||||||
void GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
void GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||||
int sF, int sU, const FermionField &in, FermionField &out);
|
int sF, int sU, const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
|
|
||||||
void AsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
void AsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
|
||||||
int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
|
int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
|
||||||
|
|
||||||
|
@ -30,181 +30,60 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
#define REGISTER
|
#define REGISTER
|
||||||
|
|
||||||
#define LOAD_CHIMU_BODY(F) \
|
#define LOAD_CHIMU \
|
||||||
Chimu_00=ref(F)(0)(0); \
|
{const SiteSpinor & ref (in._odata[offset]); \
|
||||||
Chimu_01=ref(F)(0)(1); \
|
Chimu_00=ref()(0)(0);\
|
||||||
Chimu_02=ref(F)(0)(2); \
|
Chimu_01=ref()(0)(1);\
|
||||||
Chimu_10=ref(F)(1)(0); \
|
Chimu_02=ref()(0)(2);\
|
||||||
Chimu_11=ref(F)(1)(1); \
|
Chimu_10=ref()(1)(0);\
|
||||||
Chimu_12=ref(F)(1)(2); \
|
Chimu_11=ref()(1)(1);\
|
||||||
Chimu_20=ref(F)(2)(0); \
|
Chimu_12=ref()(1)(2);\
|
||||||
Chimu_21=ref(F)(2)(1); \
|
Chimu_20=ref()(2)(0);\
|
||||||
Chimu_22=ref(F)(2)(2); \
|
Chimu_21=ref()(2)(1);\
|
||||||
Chimu_30=ref(F)(3)(0); \
|
Chimu_22=ref()(2)(2);\
|
||||||
Chimu_31=ref(F)(3)(1); \
|
Chimu_30=ref()(3)(0);\
|
||||||
Chimu_32=ref(F)(3)(2)
|
Chimu_31=ref()(3)(1);\
|
||||||
|
Chimu_32=ref()(3)(2);}
|
||||||
|
|
||||||
#define LOAD_CHIMU(DIR,F,PERM) \
|
#define LOAD_CHI\
|
||||||
{ const SiteSpinor & ref (in._odata[offset]); LOAD_CHIMU_BODY(F); }
|
{const SiteHalfSpinor &ref(buf[offset]); \
|
||||||
|
Chi_00 = ref()(0)(0);\
|
||||||
#define LOAD_CHI_BODY(F) \
|
Chi_01 = ref()(0)(1);\
|
||||||
Chi_00 = ref(F)(0)(0);\
|
Chi_02 = ref()(0)(2);\
|
||||||
Chi_01 = ref(F)(0)(1);\
|
Chi_10 = ref()(1)(0);\
|
||||||
Chi_02 = ref(F)(0)(2);\
|
Chi_11 = ref()(1)(1);\
|
||||||
Chi_10 = ref(F)(1)(0);\
|
Chi_12 = ref()(1)(2);}
|
||||||
Chi_11 = ref(F)(1)(1);\
|
|
||||||
Chi_12 = ref(F)(1)(2)
|
|
||||||
|
|
||||||
#define LOAD_CHI(DIR,F,PERM) \
|
|
||||||
{const SiteHalfSpinor &ref(buf[offset]); LOAD_CHI_BODY(F); }
|
|
||||||
|
|
||||||
|
|
||||||
//G-parity implementations using in-place intrinsic ops
|
|
||||||
|
|
||||||
//1l 1h -> 1h 1l
|
|
||||||
//0l 0h , 1h 1l -> 0l 1h 0h,1l
|
|
||||||
//0h,1l -> 1l,0h
|
|
||||||
//if( (distance == 1 && !perm_will_occur) || (distance == -1 && perm_will_occur) )
|
|
||||||
//Pulled fermion through forwards face, GPBC on upper component
|
|
||||||
//Need 0= 0l 1h 1= 1l 0h
|
|
||||||
//else if( (distance == -1 && !perm) || (distance == 1 && perm) )
|
|
||||||
//Pulled fermion through backwards face, GPBC on lower component
|
|
||||||
//Need 0= 1l 0h 1= 0l 1h
|
|
||||||
|
|
||||||
//1l 1h -> 1h 1l
|
|
||||||
//0l 0h , 1h 1l -> 0l 1h 0h,1l
|
|
||||||
#define DO_TWIST_0L_1H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3) \
|
|
||||||
permute##PERM(tmp1, ref(1)(S)(C)); \
|
|
||||||
exchange##PERM(tmp2,tmp3, ref(0)(S)(C), tmp1); \
|
|
||||||
INTO = tmp2;
|
|
||||||
|
|
||||||
//0l 0h -> 0h 0l
|
|
||||||
//1l 1h, 0h 0l -> 1l 0h, 1h 0l
|
|
||||||
#define DO_TWIST_1L_0H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3) \
|
|
||||||
permute##PERM(tmp1, ref(0)(S)(C)); \
|
|
||||||
exchange##PERM(tmp2,tmp3, ref(1)(S)(C), tmp1); \
|
|
||||||
INTO = tmp2;
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_CHI_SETUP(DIR,F) \
|
|
||||||
g = F; \
|
|
||||||
direction = st._directions[DIR]; \
|
|
||||||
distance = st._distances[DIR]; \
|
|
||||||
sl = st._grid->_simd_layout[direction]; \
|
|
||||||
inplace_twist = 0; \
|
|
||||||
if(SE->_around_the_world && this->Params.twists[DIR % 4]){ \
|
|
||||||
if(sl == 1){ \
|
|
||||||
g = (F+1) % 2; \
|
|
||||||
}else{ \
|
|
||||||
inplace_twist = 1; \
|
|
||||||
} \
|
|
||||||
}
|
|
||||||
|
|
||||||
#define LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM) \
|
|
||||||
{ const SiteSpinor &ref(in._odata[offset]); \
|
|
||||||
LOAD_CHI_SETUP(DIR,F); \
|
|
||||||
if(!inplace_twist){ \
|
|
||||||
LOAD_CHIMU_BODY(g); \
|
|
||||||
}else{ \
|
|
||||||
if( ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
|
|
||||||
( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
|
|
||||||
DO_TWIST_0L_1H(Chimu_00,0,0,F,PERM, U_00,U_01,U_10); \
|
|
||||||
DO_TWIST_0L_1H(Chimu_01,0,1,F,PERM, U_11,U_20,U_21); \
|
|
||||||
DO_TWIST_0L_1H(Chimu_02,0,2,F,PERM, U_00,U_01,U_10); \
|
|
||||||
DO_TWIST_0L_1H(Chimu_10,1,0,F,PERM, U_11,U_20,U_21); \
|
|
||||||
DO_TWIST_0L_1H(Chimu_11,1,1,F,PERM, U_00,U_01,U_10); \
|
|
||||||
DO_TWIST_0L_1H(Chimu_12,1,2,F,PERM, U_11,U_20,U_21); \
|
|
||||||
DO_TWIST_0L_1H(Chimu_20,2,0,F,PERM, U_00,U_01,U_10); \
|
|
||||||
DO_TWIST_0L_1H(Chimu_21,2,1,F,PERM, U_11,U_20,U_21); \
|
|
||||||
DO_TWIST_0L_1H(Chimu_22,2,2,F,PERM, U_00,U_01,U_10); \
|
|
||||||
DO_TWIST_0L_1H(Chimu_30,3,0,F,PERM, U_11,U_20,U_21); \
|
|
||||||
DO_TWIST_0L_1H(Chimu_31,3,1,F,PERM, U_00,U_01,U_10); \
|
|
||||||
DO_TWIST_0L_1H(Chimu_32,3,2,F,PERM, U_11,U_20,U_21); \
|
|
||||||
}else{ \
|
|
||||||
DO_TWIST_1L_0H(Chimu_00,0,0,F,PERM, U_00,U_01,U_10); \
|
|
||||||
DO_TWIST_1L_0H(Chimu_01,0,1,F,PERM, U_11,U_20,U_21); \
|
|
||||||
DO_TWIST_1L_0H(Chimu_02,0,2,F,PERM, U_00,U_01,U_10); \
|
|
||||||
DO_TWIST_1L_0H(Chimu_10,1,0,F,PERM, U_11,U_20,U_21); \
|
|
||||||
DO_TWIST_1L_0H(Chimu_11,1,1,F,PERM, U_00,U_01,U_10); \
|
|
||||||
DO_TWIST_1L_0H(Chimu_12,1,2,F,PERM, U_11,U_20,U_21); \
|
|
||||||
DO_TWIST_1L_0H(Chimu_20,2,0,F,PERM, U_00,U_01,U_10); \
|
|
||||||
DO_TWIST_1L_0H(Chimu_21,2,1,F,PERM, U_11,U_20,U_21); \
|
|
||||||
DO_TWIST_1L_0H(Chimu_22,2,2,F,PERM, U_00,U_01,U_10); \
|
|
||||||
DO_TWIST_1L_0H(Chimu_30,3,0,F,PERM, U_11,U_20,U_21); \
|
|
||||||
DO_TWIST_1L_0H(Chimu_31,3,1,F,PERM, U_00,U_01,U_10); \
|
|
||||||
DO_TWIST_1L_0H(Chimu_32,3,2,F,PERM, U_11,U_20,U_21); \
|
|
||||||
} \
|
|
||||||
} \
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM) \
|
|
||||||
{ const SiteHalfSpinor &ref(buf[offset]); \
|
|
||||||
LOAD_CHI_SETUP(DIR,F); \
|
|
||||||
if(!inplace_twist){ \
|
|
||||||
LOAD_CHI_BODY(g); \
|
|
||||||
}else{ \
|
|
||||||
if( ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
|
|
||||||
( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
|
|
||||||
DO_TWIST_0L_1H(Chi_00,0,0,F,PERM, U_00,U_01,U_10); \
|
|
||||||
DO_TWIST_0L_1H(Chi_01,0,1,F,PERM, U_11,U_20,U_21); \
|
|
||||||
DO_TWIST_0L_1H(Chi_02,0,2,F,PERM, UChi_00,UChi_01,UChi_02); \
|
|
||||||
DO_TWIST_0L_1H(Chi_10,1,0,F,PERM, UChi_10,UChi_11,UChi_12); \
|
|
||||||
DO_TWIST_0L_1H(Chi_11,1,1,F,PERM, U_00,U_01,U_10); \
|
|
||||||
DO_TWIST_0L_1H(Chi_12,1,2,F,PERM, U_11,U_20,U_21); \
|
|
||||||
}else{ \
|
|
||||||
DO_TWIST_1L_0H(Chi_00,0,0,F,PERM, U_00,U_01,U_10); \
|
|
||||||
DO_TWIST_1L_0H(Chi_01,0,1,F,PERM, U_11,U_20,U_21); \
|
|
||||||
DO_TWIST_1L_0H(Chi_02,0,2,F,PERM, UChi_00,UChi_01,UChi_02); \
|
|
||||||
DO_TWIST_1L_0H(Chi_10,1,0,F,PERM, UChi_10,UChi_11,UChi_12); \
|
|
||||||
DO_TWIST_1L_0H(Chi_11,1,1,F,PERM, U_00,U_01,U_10); \
|
|
||||||
DO_TWIST_1L_0H(Chi_12,1,2,F,PERM, U_11,U_20,U_21); \
|
|
||||||
} \
|
|
||||||
} \
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
#define LOAD_CHI_GPARITY(DIR,F,PERM) LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM)
|
|
||||||
#define LOAD_CHIMU_GPARITY(DIR,F,PERM) LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)
|
|
||||||
|
|
||||||
// To splat or not to splat depends on the implementation
|
// To splat or not to splat depends on the implementation
|
||||||
#define MULT_2SPIN_BODY \
|
#define MULT_2SPIN(A)\
|
||||||
Impl::loadLinkElement(U_00,ref()(0,0)); \
|
{auto & ref(U._odata[sU](A)); \
|
||||||
Impl::loadLinkElement(U_10,ref()(1,0)); \
|
Impl::loadLinkElement(U_00,ref()(0,0)); \
|
||||||
Impl::loadLinkElement(U_20,ref()(2,0)); \
|
Impl::loadLinkElement(U_10,ref()(1,0)); \
|
||||||
Impl::loadLinkElement(U_01,ref()(0,1)); \
|
Impl::loadLinkElement(U_20,ref()(2,0)); \
|
||||||
Impl::loadLinkElement(U_11,ref()(1,1)); \
|
Impl::loadLinkElement(U_01,ref()(0,1)); \
|
||||||
Impl::loadLinkElement(U_21,ref()(2,1)); \
|
Impl::loadLinkElement(U_11,ref()(1,1)); \
|
||||||
UChi_00 = U_00*Chi_00; \
|
Impl::loadLinkElement(U_21,ref()(2,1)); \
|
||||||
UChi_10 = U_00*Chi_10; \
|
UChi_00 = U_00*Chi_00;\
|
||||||
UChi_01 = U_10*Chi_00; \
|
UChi_10 = U_00*Chi_10;\
|
||||||
UChi_11 = U_10*Chi_10; \
|
UChi_01 = U_10*Chi_00;\
|
||||||
UChi_02 = U_20*Chi_00; \
|
UChi_11 = U_10*Chi_10;\
|
||||||
UChi_12 = U_20*Chi_10; \
|
UChi_02 = U_20*Chi_00;\
|
||||||
UChi_00+= U_01*Chi_01; \
|
UChi_12 = U_20*Chi_10;\
|
||||||
UChi_10+= U_01*Chi_11; \
|
UChi_00+= U_01*Chi_01;\
|
||||||
UChi_01+= U_11*Chi_01; \
|
UChi_10+= U_01*Chi_11;\
|
||||||
UChi_11+= U_11*Chi_11; \
|
UChi_01+= U_11*Chi_01;\
|
||||||
UChi_02+= U_21*Chi_01; \
|
UChi_11+= U_11*Chi_11;\
|
||||||
UChi_12+= U_21*Chi_11; \
|
UChi_02+= U_21*Chi_01;\
|
||||||
Impl::loadLinkElement(U_00,ref()(0,2)); \
|
UChi_12+= U_21*Chi_11;\
|
||||||
Impl::loadLinkElement(U_10,ref()(1,2)); \
|
Impl::loadLinkElement(U_00,ref()(0,2)); \
|
||||||
Impl::loadLinkElement(U_20,ref()(2,2)); \
|
Impl::loadLinkElement(U_10,ref()(1,2)); \
|
||||||
UChi_00+= U_00*Chi_02; \
|
Impl::loadLinkElement(U_20,ref()(2,2)); \
|
||||||
UChi_10+= U_00*Chi_12; \
|
UChi_00+= U_00*Chi_02;\
|
||||||
UChi_01+= U_10*Chi_02; \
|
UChi_10+= U_00*Chi_12;\
|
||||||
UChi_11+= U_10*Chi_12; \
|
UChi_01+= U_10*Chi_02;\
|
||||||
UChi_02+= U_20*Chi_02; \
|
UChi_11+= U_10*Chi_12;\
|
||||||
UChi_12+= U_20*Chi_12
|
UChi_02+= U_20*Chi_02;\
|
||||||
|
UChi_12+= U_20*Chi_12;}
|
||||||
|
|
||||||
#define MULT_2SPIN(A,F) \
|
|
||||||
{auto & ref(U._odata[sU](A)); MULT_2SPIN_BODY; }
|
|
||||||
|
|
||||||
#define MULT_2SPIN_GPARITY(A,F) \
|
|
||||||
{auto & ref(U._odata[sU](F)(A)); MULT_2SPIN_BODY; }
|
|
||||||
|
|
||||||
|
|
||||||
#define PERMUTE_DIR(dir) \
|
#define PERMUTE_DIR(dir) \
|
||||||
@ -428,87 +307,84 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
result_31-= UChi_11; \
|
result_31-= UChi_11; \
|
||||||
result_32-= UChi_12;
|
result_32-= UChi_12;
|
||||||
|
|
||||||
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \
|
||||||
SE=st.GetEntry(ptype,DIR,ss); \
|
SE=st.GetEntry(ptype,DIR,ss); \
|
||||||
offset = SE->_offset; \
|
offset = SE->_offset; \
|
||||||
local = SE->_is_local; \
|
local = SE->_is_local; \
|
||||||
perm = SE->_permute; \
|
perm = SE->_permute; \
|
||||||
if ( local ) { \
|
if ( local ) { \
|
||||||
LOAD_CHIMU_IMPL(DIR,F,PERM); \
|
LOAD_CHIMU; \
|
||||||
PROJ; \
|
PROJ; \
|
||||||
if ( perm) { \
|
if ( perm) { \
|
||||||
PERMUTE_DIR(PERM); \
|
PERMUTE_DIR(PERM); \
|
||||||
} \
|
} \
|
||||||
} else { \
|
} else { \
|
||||||
LOAD_CHI_IMPL(DIR,F,PERM); \
|
LOAD_CHI; \
|
||||||
} \
|
} \
|
||||||
MULT_2SPIN_IMPL(DIR,F); \
|
MULT_2SPIN(DIR); \
|
||||||
RECON;
|
RECON;
|
||||||
|
|
||||||
|
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \
|
||||||
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
|
||||||
SE=st.GetEntry(ptype,DIR,ss); \
|
SE=st.GetEntry(ptype,DIR,ss); \
|
||||||
offset = SE->_offset; \
|
offset = SE->_offset; \
|
||||||
local = SE->_is_local; \
|
local = SE->_is_local; \
|
||||||
perm = SE->_permute; \
|
perm = SE->_permute; \
|
||||||
if ( local ) { \
|
if ( local ) { \
|
||||||
LOAD_CHIMU_IMPL(DIR,F,PERM); \
|
LOAD_CHIMU; \
|
||||||
PROJ; \
|
PROJ; \
|
||||||
if ( perm) { \
|
if ( perm) { \
|
||||||
PERMUTE_DIR(PERM); \
|
PERMUTE_DIR(PERM); \
|
||||||
} \
|
} \
|
||||||
} else if ( st.same_node[DIR] ) { \
|
} else if ( st.same_node[DIR] ) { \
|
||||||
LOAD_CHI_IMPL(DIR,F,PERM); \
|
LOAD_CHI; \
|
||||||
} \
|
} \
|
||||||
if (local || st.same_node[DIR] ) { \
|
if (local || st.same_node[DIR] ) { \
|
||||||
MULT_2SPIN_IMPL(DIR,F); \
|
MULT_2SPIN(DIR); \
|
||||||
RECON; \
|
RECON; \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \
|
||||||
SE=st.GetEntry(ptype,DIR,ss); \
|
SE=st.GetEntry(ptype,DIR,ss); \
|
||||||
offset = SE->_offset; \
|
offset = SE->_offset; \
|
||||||
local = SE->_is_local; \
|
|
||||||
perm = SE->_permute; \
|
|
||||||
if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \
|
if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \
|
||||||
LOAD_CHI_IMPL(DIR,F,PERM); \
|
LOAD_CHI; \
|
||||||
MULT_2SPIN_IMPL(DIR,F); \
|
MULT_2SPIN(DIR); \
|
||||||
RECON; \
|
RECON; \
|
||||||
nmu++; \
|
nmu++; \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define HAND_RESULT(ss,F) \
|
#define HAND_RESULT(ss) \
|
||||||
{ \
|
{ \
|
||||||
SiteSpinor & ref (out._odata[ss]); \
|
SiteSpinor & ref (out._odata[ss]); \
|
||||||
vstream(ref(F)(0)(0),result_00); \
|
vstream(ref()(0)(0),result_00); \
|
||||||
vstream(ref(F)(0)(1),result_01); \
|
vstream(ref()(0)(1),result_01); \
|
||||||
vstream(ref(F)(0)(2),result_02); \
|
vstream(ref()(0)(2),result_02); \
|
||||||
vstream(ref(F)(1)(0),result_10); \
|
vstream(ref()(1)(0),result_10); \
|
||||||
vstream(ref(F)(1)(1),result_11); \
|
vstream(ref()(1)(1),result_11); \
|
||||||
vstream(ref(F)(1)(2),result_12); \
|
vstream(ref()(1)(2),result_12); \
|
||||||
vstream(ref(F)(2)(0),result_20); \
|
vstream(ref()(2)(0),result_20); \
|
||||||
vstream(ref(F)(2)(1),result_21); \
|
vstream(ref()(2)(1),result_21); \
|
||||||
vstream(ref(F)(2)(2),result_22); \
|
vstream(ref()(2)(2),result_22); \
|
||||||
vstream(ref(F)(3)(0),result_30); \
|
vstream(ref()(3)(0),result_30); \
|
||||||
vstream(ref(F)(3)(1),result_31); \
|
vstream(ref()(3)(1),result_31); \
|
||||||
vstream(ref(F)(3)(2),result_32); \
|
vstream(ref()(3)(2),result_32); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define HAND_RESULT_EXT(ss,F) \
|
#define HAND_RESULT_EXT(ss) \
|
||||||
if (nmu){ \
|
if (nmu){ \
|
||||||
SiteSpinor & ref (out._odata[ss]); \
|
SiteSpinor & ref (out._odata[ss]); \
|
||||||
ref(F)(0)(0)+=result_00; \
|
ref()(0)(0)+=result_00; \
|
||||||
ref(F)(0)(1)+=result_01; \
|
ref()(0)(1)+=result_01; \
|
||||||
ref(F)(0)(2)+=result_02; \
|
ref()(0)(2)+=result_02; \
|
||||||
ref(F)(1)(0)+=result_10; \
|
ref()(1)(0)+=result_10; \
|
||||||
ref(F)(1)(1)+=result_11; \
|
ref()(1)(1)+=result_11; \
|
||||||
ref(F)(1)(2)+=result_12; \
|
ref()(1)(2)+=result_12; \
|
||||||
ref(F)(2)(0)+=result_20; \
|
ref()(2)(0)+=result_20; \
|
||||||
ref(F)(2)(1)+=result_21; \
|
ref()(2)(1)+=result_21; \
|
||||||
ref(F)(2)(2)+=result_22; \
|
ref()(2)(2)+=result_22; \
|
||||||
ref(F)(3)(0)+=result_30; \
|
ref()(3)(0)+=result_30; \
|
||||||
ref(F)(3)(1)+=result_31; \
|
ref()(3)(1)+=result_31; \
|
||||||
ref(F)(3)(2)+=result_32; \
|
ref()(3)(2)+=result_32; \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -587,18 +463,15 @@ WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGauge
|
|||||||
int offset,local,perm, ptype;
|
int offset,local,perm, ptype;
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
|
|
||||||
#define HAND_DOP_SITE(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
|
||||||
HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_RESULT(ss);
|
||||||
HAND_RESULT(ss,F)
|
|
||||||
|
|
||||||
HAND_DOP_SITE(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -612,19 +485,16 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,Doub
|
|||||||
|
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
int offset,local,perm, ptype;
|
int offset,local,perm, ptype;
|
||||||
|
|
||||||
#define HAND_DOP_SITE_DAG(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
|
||||||
HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_RESULT(ss);
|
||||||
HAND_RESULT(ss,F)
|
|
||||||
|
|
||||||
HAND_DOP_SITE_DAG(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl> void
|
template<class Impl> void
|
||||||
@ -639,20 +509,16 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGa
|
|||||||
|
|
||||||
int offset,local,perm, ptype;
|
int offset,local,perm, ptype;
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
|
ZERO_RESULT;
|
||||||
#define HAND_DOP_SITE_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
|
||||||
ZERO_RESULT; \
|
HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_RESULT(ss);
|
||||||
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
|
||||||
HAND_RESULT(ss,F)
|
|
||||||
|
|
||||||
HAND_DOP_SITE_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -666,20 +532,16 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,D
|
|||||||
|
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
int offset,local,perm, ptype;
|
int offset,local,perm, ptype;
|
||||||
|
ZERO_RESULT;
|
||||||
#define HAND_DOP_SITE_DAG_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
|
||||||
ZERO_RESULT; \
|
HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_RESULT(ss);
|
||||||
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
|
||||||
HAND_RESULT(ss,F)
|
|
||||||
|
|
||||||
HAND_DOP_SITE_DAG_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl> void
|
template<class Impl> void
|
||||||
@ -695,20 +557,16 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGa
|
|||||||
int offset,local,perm, ptype;
|
int offset,local,perm, ptype;
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
int nmu=0;
|
int nmu=0;
|
||||||
|
ZERO_RESULT;
|
||||||
#define HAND_DOP_SITE_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
|
||||||
ZERO_RESULT; \
|
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_RESULT_EXT(ss);
|
||||||
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
|
||||||
HAND_RESULT_EXT(ss,F)
|
|
||||||
|
|
||||||
HAND_DOP_SITE_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl>
|
template<class Impl>
|
||||||
@ -723,193 +581,18 @@ void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,D
|
|||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
int offset,local,perm, ptype;
|
int offset,local,perm, ptype;
|
||||||
int nmu=0;
|
int nmu=0;
|
||||||
|
ZERO_RESULT;
|
||||||
#define HAND_DOP_SITE_DAG_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
|
||||||
ZERO_RESULT; \
|
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
|
||||||
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
HAND_RESULT_EXT(ss);
|
||||||
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
|
||||||
HAND_RESULT_EXT(ss,F)
|
|
||||||
|
|
||||||
HAND_DOP_SITE_DAG_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////
|
|
||||||
// Specialise Gparity to simple implementation
|
|
||||||
////////////////////////////////////////////////
|
|
||||||
#define HAND_SPECIALISE_EMPTY(IMPL) \
|
|
||||||
template<> void \
|
|
||||||
WilsonKernels<IMPL>::HandDhopSite(StencilImpl &st, \
|
|
||||||
LebesgueOrder &lo, \
|
|
||||||
DoubledGaugeField &U, \
|
|
||||||
SiteHalfSpinor *buf, \
|
|
||||||
int sF,int sU, \
|
|
||||||
const FermionField &in, \
|
|
||||||
FermionField &out){ assert(0); } \
|
|
||||||
template<> void \
|
|
||||||
WilsonKernels<IMPL>::HandDhopSiteDag(StencilImpl &st, \
|
|
||||||
LebesgueOrder &lo, \
|
|
||||||
DoubledGaugeField &U, \
|
|
||||||
SiteHalfSpinor *buf, \
|
|
||||||
int sF,int sU, \
|
|
||||||
const FermionField &in, \
|
|
||||||
FermionField &out){ assert(0); } \
|
|
||||||
template<> void \
|
|
||||||
WilsonKernels<IMPL>::HandDhopSiteInt(StencilImpl &st, \
|
|
||||||
LebesgueOrder &lo, \
|
|
||||||
DoubledGaugeField &U, \
|
|
||||||
SiteHalfSpinor *buf, \
|
|
||||||
int sF,int sU, \
|
|
||||||
const FermionField &in, \
|
|
||||||
FermionField &out){ assert(0); } \
|
|
||||||
template<> void \
|
|
||||||
WilsonKernels<IMPL>::HandDhopSiteExt(StencilImpl &st, \
|
|
||||||
LebesgueOrder &lo, \
|
|
||||||
DoubledGaugeField &U, \
|
|
||||||
SiteHalfSpinor *buf, \
|
|
||||||
int sF,int sU, \
|
|
||||||
const FermionField &in, \
|
|
||||||
FermionField &out){ assert(0); } \
|
|
||||||
template<> void \
|
|
||||||
WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilImpl &st, \
|
|
||||||
LebesgueOrder &lo, \
|
|
||||||
DoubledGaugeField &U, \
|
|
||||||
SiteHalfSpinor *buf, \
|
|
||||||
int sF,int sU, \
|
|
||||||
const FermionField &in, \
|
|
||||||
FermionField &out){ assert(0); } \
|
|
||||||
template<> void \
|
|
||||||
WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilImpl &st, \
|
|
||||||
LebesgueOrder &lo, \
|
|
||||||
DoubledGaugeField &U, \
|
|
||||||
SiteHalfSpinor *buf, \
|
|
||||||
int sF,int sU, \
|
|
||||||
const FermionField &in, \
|
|
||||||
FermionField &out){ assert(0); } \
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#define HAND_SPECIALISE_GPARITY(IMPL) \
|
|
||||||
template<> void \
|
|
||||||
WilsonKernels<IMPL>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
|
|
||||||
int ss,int sU,const FermionField &in, FermionField &out) \
|
|
||||||
{ \
|
|
||||||
typedef IMPL Impl; \
|
|
||||||
typedef typename Simd::scalar_type S; \
|
|
||||||
typedef typename Simd::vector_type V; \
|
|
||||||
\
|
|
||||||
HAND_DECLARATIONS(ignore); \
|
|
||||||
\
|
|
||||||
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
|
|
||||||
StencilEntry *SE; \
|
|
||||||
HAND_DOP_SITE(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
|
||||||
HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
template<> \
|
|
||||||
void WilsonKernels<IMPL>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
|
|
||||||
int ss,int sU,const FermionField &in, FermionField &out) \
|
|
||||||
{ \
|
|
||||||
typedef IMPL Impl; \
|
|
||||||
typedef typename Simd::scalar_type S; \
|
|
||||||
typedef typename Simd::vector_type V; \
|
|
||||||
\
|
|
||||||
HAND_DECLARATIONS(ignore); \
|
|
||||||
\
|
|
||||||
StencilEntry *SE; \
|
|
||||||
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
|
|
||||||
HAND_DOP_SITE_DAG(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
|
||||||
HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
template<> void \
|
|
||||||
WilsonKernels<IMPL>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
|
|
||||||
int ss,int sU,const FermionField &in, FermionField &out) \
|
|
||||||
{ \
|
|
||||||
typedef IMPL Impl; \
|
|
||||||
typedef typename Simd::scalar_type S; \
|
|
||||||
typedef typename Simd::vector_type V; \
|
|
||||||
\
|
|
||||||
HAND_DECLARATIONS(ignore); \
|
|
||||||
\
|
|
||||||
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
|
|
||||||
StencilEntry *SE; \
|
|
||||||
HAND_DOP_SITE_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
|
||||||
HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
template<> \
|
|
||||||
void WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
|
|
||||||
int ss,int sU,const FermionField &in, FermionField &out) \
|
|
||||||
{ \
|
|
||||||
typedef IMPL Impl; \
|
|
||||||
typedef typename Simd::scalar_type S; \
|
|
||||||
typedef typename Simd::vector_type V; \
|
|
||||||
\
|
|
||||||
HAND_DECLARATIONS(ignore); \
|
|
||||||
\
|
|
||||||
StencilEntry *SE; \
|
|
||||||
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
|
|
||||||
HAND_DOP_SITE_DAG_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
|
||||||
HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
template<> void \
|
|
||||||
WilsonKernels<IMPL>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
|
|
||||||
int ss,int sU,const FermionField &in, FermionField &out) \
|
|
||||||
{ \
|
|
||||||
typedef IMPL Impl; \
|
|
||||||
typedef typename Simd::scalar_type S; \
|
|
||||||
typedef typename Simd::vector_type V; \
|
|
||||||
\
|
|
||||||
HAND_DECLARATIONS(ignore); \
|
|
||||||
\
|
|
||||||
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
|
|
||||||
StencilEntry *SE; \
|
|
||||||
int nmu=0; \
|
|
||||||
HAND_DOP_SITE_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
|
||||||
nmu = 0; \
|
|
||||||
HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
|
||||||
} \
|
|
||||||
template<> \
|
|
||||||
void WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
|
|
||||||
int ss,int sU,const FermionField &in, FermionField &out) \
|
|
||||||
{ \
|
|
||||||
typedef IMPL Impl; \
|
|
||||||
typedef typename Simd::scalar_type S; \
|
|
||||||
typedef typename Simd::vector_type V; \
|
|
||||||
\
|
|
||||||
HAND_DECLARATIONS(ignore); \
|
|
||||||
\
|
|
||||||
StencilEntry *SE; \
|
|
||||||
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
|
|
||||||
int nmu=0; \
|
|
||||||
HAND_DOP_SITE_DAG_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
|
||||||
nmu = 0; \
|
|
||||||
HAND_DOP_SITE_DAG_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
HAND_SPECIALISE_GPARITY(GparityWilsonImplF);
|
|
||||||
HAND_SPECIALISE_GPARITY(GparityWilsonImplD);
|
|
||||||
HAND_SPECIALISE_GPARITY(GparityWilsonImplFH);
|
|
||||||
HAND_SPECIALISE_GPARITY(GparityWilsonImplDF);
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
////////////// Wilson ; uses this implementation /////////////////////
|
////////////// Wilson ; uses this implementation /////////////////////
|
||||||
|
|
||||||
#define INSTANTIATE_THEM(A) \
|
#define INSTANTIATE_THEM(A) \
|
||||||
@ -930,8 +613,6 @@ INSTANTIATE_THEM(WilsonImplF);
|
|||||||
INSTANTIATE_THEM(WilsonImplD);
|
INSTANTIATE_THEM(WilsonImplD);
|
||||||
INSTANTIATE_THEM(ZWilsonImplF);
|
INSTANTIATE_THEM(ZWilsonImplF);
|
||||||
INSTANTIATE_THEM(ZWilsonImplD);
|
INSTANTIATE_THEM(ZWilsonImplD);
|
||||||
INSTANTIATE_THEM(GparityWilsonImplF);
|
|
||||||
INSTANTIATE_THEM(GparityWilsonImplD);
|
|
||||||
INSTANTIATE_THEM(DomainWallVec5dImplF);
|
INSTANTIATE_THEM(DomainWallVec5dImplF);
|
||||||
INSTANTIATE_THEM(DomainWallVec5dImplD);
|
INSTANTIATE_THEM(DomainWallVec5dImplD);
|
||||||
INSTANTIATE_THEM(ZDomainWallVec5dImplF);
|
INSTANTIATE_THEM(ZDomainWallVec5dImplF);
|
||||||
@ -940,12 +621,11 @@ INSTANTIATE_THEM(WilsonImplFH);
|
|||||||
INSTANTIATE_THEM(WilsonImplDF);
|
INSTANTIATE_THEM(WilsonImplDF);
|
||||||
INSTANTIATE_THEM(ZWilsonImplFH);
|
INSTANTIATE_THEM(ZWilsonImplFH);
|
||||||
INSTANTIATE_THEM(ZWilsonImplDF);
|
INSTANTIATE_THEM(ZWilsonImplDF);
|
||||||
INSTANTIATE_THEM(GparityWilsonImplFH);
|
|
||||||
INSTANTIATE_THEM(GparityWilsonImplDF);
|
|
||||||
INSTANTIATE_THEM(DomainWallVec5dImplFH);
|
INSTANTIATE_THEM(DomainWallVec5dImplFH);
|
||||||
INSTANTIATE_THEM(DomainWallVec5dImplDF);
|
INSTANTIATE_THEM(DomainWallVec5dImplDF);
|
||||||
INSTANTIATE_THEM(ZDomainWallVec5dImplFH);
|
INSTANTIATE_THEM(ZDomainWallVec5dImplFH);
|
||||||
INSTANTIATE_THEM(ZDomainWallVec5dImplDF);
|
INSTANTIATE_THEM(ZDomainWallVec5dImplDF);
|
||||||
INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplF);
|
INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplF);
|
||||||
INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplD);
|
INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplD);
|
||||||
|
|
||||||
}}
|
}}
|
||||||
|
878
lib/qcd/action/fermion/WilsonKernelsHandGparity.cc
Normal file
878
lib/qcd/action/fermion/WilsonKernelsHandGparity.cc
Normal file
@ -0,0 +1,878 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/action/fermion/WilsonKernelsHandGparity.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid/qcd/action/fermion/FermionCore.h>
|
||||||
|
|
||||||
|
#define REGISTER
|
||||||
|
|
||||||
|
// Copies the twelve components ref(F)(s)(c), s = 0..3, c = 0..2, into the
// Chimu_sc names. `ref` must already be bound by the macro that expands this
// one. The last assignment carries no trailing semicolon; the expansion site
// supplies it.
#define LOAD_CHIMU_BODY(F) \
|
||||||
|
Chimu_00=ref(F)(0)(0); \
|
||||||
|
Chimu_01=ref(F)(0)(1); \
|
||||||
|
Chimu_02=ref(F)(0)(2); \
|
||||||
|
Chimu_10=ref(F)(1)(0); \
|
||||||
|
Chimu_11=ref(F)(1)(1); \
|
||||||
|
Chimu_12=ref(F)(1)(2); \
|
||||||
|
Chimu_20=ref(F)(2)(0); \
|
||||||
|
Chimu_21=ref(F)(2)(1); \
|
||||||
|
Chimu_22=ref(F)(2)(2); \
|
||||||
|
Chimu_30=ref(F)(3)(0); \
|
||||||
|
Chimu_31=ref(F)(3)(1); \
|
||||||
|
Chimu_32=ref(F)(3)(2)
|
||||||
|
|
||||||
|
// Binds `ref` to the spinor in._odata[offset] and loads it into the Chimu_*
// names through LOAD_CHIMU_BODY(F). DIR and PERM are not used; they are
// accepted because the stencil-leg macros invoke every loader as
// LOAD_CHIMU_IMPL(DIR,F,PERM). Expects `in` and `offset` in scope.
#define LOAD_CHIMU(DIR,F,PERM) \
|
||||||
|
{ const SiteSpinor & ref (in._odata[offset]); LOAD_CHIMU_BODY(F); }
|
||||||
|
|
||||||
|
// Copies the six components ref(F)(s)(c), s = 0..1, c = 0..2, into the Chi_sc
// names. `ref` must already be bound by the macro that expands this one. The
// last assignment carries no trailing semicolon; the expansion site supplies it.
#define LOAD_CHI_BODY(F) \
|
||||||
|
Chi_00 = ref(F)(0)(0);\
|
||||||
|
Chi_01 = ref(F)(0)(1);\
|
||||||
|
Chi_02 = ref(F)(0)(2);\
|
||||||
|
Chi_10 = ref(F)(1)(0);\
|
||||||
|
Chi_11 = ref(F)(1)(1);\
|
||||||
|
Chi_12 = ref(F)(1)(2)
|
||||||
|
|
||||||
|
// Binds `ref` to the half spinor buf[offset] and loads it into the Chi_* names
// through LOAD_CHI_BODY(F). DIR and PERM are not used; they are accepted
// because the stencil-leg macros invoke every loader as
// LOAD_CHI_IMPL(DIR,F,PERM). Expects `buf` and `offset` in scope.
#define LOAD_CHI(DIR,F,PERM) \
|
||||||
|
{const SiteHalfSpinor &ref(buf[offset]); LOAD_CHI_BODY(F); }
|
||||||
|
|
||||||
|
|
||||||
|
//G-parity implementations using in-place intrinsic ops
|
||||||
|
|
||||||
|
//1l 1h -> 1h 1l
|
||||||
|
//0l 0h , 1h 1l -> 0l 1h 0h,1l
|
||||||
|
//0h,1l -> 1l,0h
|
||||||
|
//if( (distance == 1 && !perm_will_occur) || (distance == -1 && perm_will_occur) )
|
||||||
|
//Pulled fermion through forwards face, GPBC on upper component
|
||||||
|
//Need 0= 0l 1h 1= 1l 0h
|
||||||
|
//else if( (distance == -1 && !perm) || (distance == 1 && perm) )
|
||||||
|
//Pulled fermion through backwards face, GPBC on lower component
|
||||||
|
//Need 0= 1l 0h 1= 0l 1h
|
||||||
|
|
||||||
|
//1l 1h -> 1h 1l
|
||||||
|
//0l 0h , 1h 1l -> 0l 1h 0h,1l
|
||||||
|
// Builds one twisted component for element (S)(C) and assigns it to INTO:
//   1. permute<PERM> of ref(1)(S)(C) is written to tmp1;
//   2. exchange<PERM> is called with ref(0)(S)(C) and tmp1, filling tmp2 and tmp3;
//   3. INTO receives tmp2 (tmp3 is scratch and is discarded).
// The first index of `ref` is the same F index used by the loaders in this
// file (presumably the G-parity flavour - confirm). The F parameter itself is
// not used by this macro. tmp1..tmp3 are clobbered. The body already ends in
// a semicolon.
#define DO_TWIST_0L_1H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3) \
|
||||||
|
permute##PERM(tmp1, ref(1)(S)(C)); \
|
||||||
|
exchange##PERM(tmp2,tmp3, ref(0)(S)(C), tmp1); \
|
||||||
|
INTO = tmp2;
|
||||||
|
|
||||||
|
//0l 0h -> 0h 0l
|
||||||
|
//1l 1h, 0h 0l -> 1l 0h, 1h 0l
|
||||||
|
// Mirror image of DO_TWIST_0L_1H with the roles of ref(0) and ref(1) swapped:
//   1. permute<PERM> of ref(0)(S)(C) is written to tmp1;
//   2. exchange<PERM> is called with ref(1)(S)(C) and tmp1, filling tmp2 and tmp3;
//   3. INTO receives tmp2 (tmp3 is scratch and is discarded).
// The F parameter is not used by this macro. tmp1..tmp3 are clobbered. The
// body already ends in a semicolon.
#define DO_TWIST_1L_0H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3) \
|
||||||
|
permute##PERM(tmp1, ref(0)(S)(C)); \
|
||||||
|
exchange##PERM(tmp2,tmp3, ref(1)(S)(C), tmp1); \
|
||||||
|
INTO = tmp2;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// Decides how the neighbour data for stencil leg DIR is to be read for index F.
// Sets g = F, looks up direction = st._directions[DIR] and
// distance = st._distances[DIR], reads sl = st._grid->_simd_layout[direction],
// and clears inplace_twist. Then, only if the stencil entry SE is flagged
// _around_the_world and Params.twists[DIR % 4] is set:
//   sl == 1   : g becomes the other index, (F+1) % 2;
//   otherwise : inplace_twist = 1 and g stays F (the G-parity loaders then
//               take their DO_TWIST_* path instead of a plain body load).
// Expects g, direction, distance, sl, inplace_twist, SE and st in scope, and
// dereferences `this`, so it can only be expanded inside a member function.
#define LOAD_CHI_SETUP(DIR,F) \
|
||||||
|
g = F; \
|
||||||
|
direction = st._directions[DIR]; \
|
||||||
|
distance = st._distances[DIR]; \
|
||||||
|
sl = st._grid->_simd_layout[direction]; \
|
||||||
|
inplace_twist = 0; \
|
||||||
|
if(SE->_around_the_world && this->Params.twists[DIR % 4]){ \
|
||||||
|
if(sl == 1){ \
|
||||||
|
g = (F+1) % 2; \
|
||||||
|
}else{ \
|
||||||
|
inplace_twist = 1; \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM) \
|
||||||
|
{ const SiteSpinor &ref(in._odata[offset]); \
|
||||||
|
LOAD_CHI_SETUP(DIR,F); \
|
||||||
|
if(!inplace_twist){ \
|
||||||
|
LOAD_CHIMU_BODY(g); \
|
||||||
|
}else{ \
|
||||||
|
if( ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
|
||||||
|
( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
|
||||||
|
DO_TWIST_0L_1H(Chimu_00,0,0,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_0L_1H(Chimu_01,0,1,F,PERM, U_11,U_20,U_21); \
|
||||||
|
DO_TWIST_0L_1H(Chimu_02,0,2,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_0L_1H(Chimu_10,1,0,F,PERM, U_11,U_20,U_21); \
|
||||||
|
DO_TWIST_0L_1H(Chimu_11,1,1,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_0L_1H(Chimu_12,1,2,F,PERM, U_11,U_20,U_21); \
|
||||||
|
DO_TWIST_0L_1H(Chimu_20,2,0,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_0L_1H(Chimu_21,2,1,F,PERM, U_11,U_20,U_21); \
|
||||||
|
DO_TWIST_0L_1H(Chimu_22,2,2,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_0L_1H(Chimu_30,3,0,F,PERM, U_11,U_20,U_21); \
|
||||||
|
DO_TWIST_0L_1H(Chimu_31,3,1,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_0L_1H(Chimu_32,3,2,F,PERM, U_11,U_20,U_21); \
|
||||||
|
}else{ \
|
||||||
|
DO_TWIST_1L_0H(Chimu_00,0,0,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_1L_0H(Chimu_01,0,1,F,PERM, U_11,U_20,U_21); \
|
||||||
|
DO_TWIST_1L_0H(Chimu_02,0,2,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_1L_0H(Chimu_10,1,0,F,PERM, U_11,U_20,U_21); \
|
||||||
|
DO_TWIST_1L_0H(Chimu_11,1,1,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_1L_0H(Chimu_12,1,2,F,PERM, U_11,U_20,U_21); \
|
||||||
|
DO_TWIST_1L_0H(Chimu_20,2,0,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_1L_0H(Chimu_21,2,1,F,PERM, U_11,U_20,U_21); \
|
||||||
|
DO_TWIST_1L_0H(Chimu_22,2,2,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_1L_0H(Chimu_30,3,0,F,PERM, U_11,U_20,U_21); \
|
||||||
|
DO_TWIST_1L_0H(Chimu_31,3,1,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_1L_0H(Chimu_32,3,2,F,PERM, U_11,U_20,U_21); \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#define LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM) \
|
||||||
|
{ const SiteHalfSpinor &ref(buf[offset]); \
|
||||||
|
LOAD_CHI_SETUP(DIR,F); \
|
||||||
|
if(!inplace_twist){ \
|
||||||
|
LOAD_CHI_BODY(g); \
|
||||||
|
}else{ \
|
||||||
|
if( ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
|
||||||
|
( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
|
||||||
|
DO_TWIST_0L_1H(Chi_00,0,0,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_0L_1H(Chi_01,0,1,F,PERM, U_11,U_20,U_21); \
|
||||||
|
DO_TWIST_0L_1H(Chi_02,0,2,F,PERM, UChi_00,UChi_01,UChi_02); \
|
||||||
|
DO_TWIST_0L_1H(Chi_10,1,0,F,PERM, UChi_10,UChi_11,UChi_12); \
|
||||||
|
DO_TWIST_0L_1H(Chi_11,1,1,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_0L_1H(Chi_12,1,2,F,PERM, U_11,U_20,U_21); \
|
||||||
|
}else{ \
|
||||||
|
DO_TWIST_1L_0H(Chi_00,0,0,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_1L_0H(Chi_01,0,1,F,PERM, U_11,U_20,U_21); \
|
||||||
|
DO_TWIST_1L_0H(Chi_02,0,2,F,PERM, UChi_00,UChi_01,UChi_02); \
|
||||||
|
DO_TWIST_1L_0H(Chi_10,1,0,F,PERM, UChi_10,UChi_11,UChi_12); \
|
||||||
|
DO_TWIST_1L_0H(Chi_11,1,1,F,PERM, U_00,U_01,U_10); \
|
||||||
|
DO_TWIST_1L_0H(Chi_12,1,2,F,PERM, U_11,U_20,U_21); \
|
||||||
|
} \
|
||||||
|
} \
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Public names for the G-parity loaders. Both currently forward, argument for
// argument, to the *_INPLACE_TWIST implementations.
#define LOAD_CHI_GPARITY(DIR,F,PERM) LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM)
|
||||||
|
#define LOAD_CHIMU_GPARITY(DIR,F,PERM) LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)
|
||||||
|
|
||||||
|
// To splat or not to splat depends on the implementation
|
||||||
|
#define MULT_2SPIN_BODY \
|
||||||
|
Impl::loadLinkElement(U_00,ref()(0,0)); \
|
||||||
|
Impl::loadLinkElement(U_10,ref()(1,0)); \
|
||||||
|
Impl::loadLinkElement(U_20,ref()(2,0)); \
|
||||||
|
Impl::loadLinkElement(U_01,ref()(0,1)); \
|
||||||
|
Impl::loadLinkElement(U_11,ref()(1,1)); \
|
||||||
|
Impl::loadLinkElement(U_21,ref()(2,1)); \
|
||||||
|
UChi_00 = U_00*Chi_00; \
|
||||||
|
UChi_10 = U_00*Chi_10; \
|
||||||
|
UChi_01 = U_10*Chi_00; \
|
||||||
|
UChi_11 = U_10*Chi_10; \
|
||||||
|
UChi_02 = U_20*Chi_00; \
|
||||||
|
UChi_12 = U_20*Chi_10; \
|
||||||
|
UChi_00+= U_01*Chi_01; \
|
||||||
|
UChi_10+= U_01*Chi_11; \
|
||||||
|
UChi_01+= U_11*Chi_01; \
|
||||||
|
UChi_11+= U_11*Chi_11; \
|
||||||
|
UChi_02+= U_21*Chi_01; \
|
||||||
|
UChi_12+= U_21*Chi_11; \
|
||||||
|
Impl::loadLinkElement(U_00,ref()(0,2)); \
|
||||||
|
Impl::loadLinkElement(U_10,ref()(1,2)); \
|
||||||
|
Impl::loadLinkElement(U_20,ref()(2,2)); \
|
||||||
|
UChi_00+= U_00*Chi_02; \
|
||||||
|
UChi_10+= U_00*Chi_12; \
|
||||||
|
UChi_01+= U_10*Chi_02; \
|
||||||
|
UChi_11+= U_10*Chi_12; \
|
||||||
|
UChi_02+= U_20*Chi_02; \
|
||||||
|
UChi_12+= U_20*Chi_12
|
||||||
|
|
||||||
|
|
||||||
|
// Binds `ref` to the link U._odata[sU](A) and expands MULT_2SPIN_BODY, which
// fills UChi_* from Chi_*. F is not used here; it is accepted because the
// stencil-leg macros invoke every multiply as MULT_2SPIN_IMPL(DIR,F).
// Expects `U` and `sU` in scope.
#define MULT_2SPIN(A,F) \
|
||||||
|
{auto & ref(U._odata[sU](A)); MULT_2SPIN_BODY; }
|
||||||
|
|
||||||
|
// G-parity variant of MULT_2SPIN: the link is selected with an extra leading
// F index, U._odata[sU](F)(A), before MULT_2SPIN_BODY is expanded.
// Expects `U` and `sU` in scope.
#define MULT_2SPIN_GPARITY(A,F) \
|
||||||
|
{auto & ref(U._odata[sU](F)(A)); MULT_2SPIN_BODY; }
|
||||||
|
|
||||||
|
|
||||||
|
// Applies permute<dir> to each of the six Chi_* variables, passing the same
// variable as both arguments (i.e. in place). `dir` is token-pasted onto
// `permute`, so it must be a literal suffix such as 0..3. The body already
// ends in a semicolon.
#define PERMUTE_DIR(dir) \
|
||||||
|
permute##dir(Chi_00,Chi_00);\
|
||||||
|
permute##dir(Chi_01,Chi_01);\
|
||||||
|
permute##dir(Chi_02,Chi_02);\
|
||||||
|
permute##dir(Chi_10,Chi_10);\
|
||||||
|
permute##dir(Chi_11,Chi_11);\
|
||||||
|
permute##dir(Chi_12,Chi_12);
|
||||||
|
|
||||||
|
// hspin(0)=fspin(0)+timesI(fspin(3));
|
||||||
|
// hspin(1)=fspin(1)+timesI(fspin(2));
|
||||||
|
#define XP_PROJ \
|
||||||
|
Chi_00 = Chimu_00+timesI(Chimu_30);\
|
||||||
|
Chi_01 = Chimu_01+timesI(Chimu_31);\
|
||||||
|
Chi_02 = Chimu_02+timesI(Chimu_32);\
|
||||||
|
Chi_10 = Chimu_10+timesI(Chimu_20);\
|
||||||
|
Chi_11 = Chimu_11+timesI(Chimu_21);\
|
||||||
|
Chi_12 = Chimu_12+timesI(Chimu_22);
|
||||||
|
|
||||||
|
#define YP_PROJ \
|
||||||
|
Chi_00 = Chimu_00-Chimu_30;\
|
||||||
|
Chi_01 = Chimu_01-Chimu_31;\
|
||||||
|
Chi_02 = Chimu_02-Chimu_32;\
|
||||||
|
Chi_10 = Chimu_10+Chimu_20;\
|
||||||
|
Chi_11 = Chimu_11+Chimu_21;\
|
||||||
|
Chi_12 = Chimu_12+Chimu_22;
|
||||||
|
|
||||||
|
#define ZP_PROJ \
|
||||||
|
Chi_00 = Chimu_00+timesI(Chimu_20); \
|
||||||
|
Chi_01 = Chimu_01+timesI(Chimu_21); \
|
||||||
|
Chi_02 = Chimu_02+timesI(Chimu_22); \
|
||||||
|
Chi_10 = Chimu_10-timesI(Chimu_30); \
|
||||||
|
Chi_11 = Chimu_11-timesI(Chimu_31); \
|
||||||
|
Chi_12 = Chimu_12-timesI(Chimu_32);
|
||||||
|
|
||||||
|
#define TP_PROJ \
|
||||||
|
Chi_00 = Chimu_00+Chimu_20; \
|
||||||
|
Chi_01 = Chimu_01+Chimu_21; \
|
||||||
|
Chi_02 = Chimu_02+Chimu_22; \
|
||||||
|
Chi_10 = Chimu_10+Chimu_30; \
|
||||||
|
Chi_11 = Chimu_11+Chimu_31; \
|
||||||
|
Chi_12 = Chimu_12+Chimu_32;
|
||||||
|
|
||||||
|
|
||||||
|
// hspin(0)=fspin(0)-timesI(fspin(3));
|
||||||
|
// hspin(1)=fspin(1)-timesI(fspin(2));
|
||||||
|
#define XM_PROJ \
|
||||||
|
Chi_00 = Chimu_00-timesI(Chimu_30);\
|
||||||
|
Chi_01 = Chimu_01-timesI(Chimu_31);\
|
||||||
|
Chi_02 = Chimu_02-timesI(Chimu_32);\
|
||||||
|
Chi_10 = Chimu_10-timesI(Chimu_20);\
|
||||||
|
Chi_11 = Chimu_11-timesI(Chimu_21);\
|
||||||
|
Chi_12 = Chimu_12-timesI(Chimu_22);
|
||||||
|
|
||||||
|
#define YM_PROJ \
|
||||||
|
Chi_00 = Chimu_00+Chimu_30;\
|
||||||
|
Chi_01 = Chimu_01+Chimu_31;\
|
||||||
|
Chi_02 = Chimu_02+Chimu_32;\
|
||||||
|
Chi_10 = Chimu_10-Chimu_20;\
|
||||||
|
Chi_11 = Chimu_11-Chimu_21;\
|
||||||
|
Chi_12 = Chimu_12-Chimu_22;
|
||||||
|
|
||||||
|
#define ZM_PROJ \
|
||||||
|
Chi_00 = Chimu_00-timesI(Chimu_20); \
|
||||||
|
Chi_01 = Chimu_01-timesI(Chimu_21); \
|
||||||
|
Chi_02 = Chimu_02-timesI(Chimu_22); \
|
||||||
|
Chi_10 = Chimu_10+timesI(Chimu_30); \
|
||||||
|
Chi_11 = Chimu_11+timesI(Chimu_31); \
|
||||||
|
Chi_12 = Chimu_12+timesI(Chimu_32);
|
||||||
|
|
||||||
|
#define TM_PROJ \
|
||||||
|
Chi_00 = Chimu_00-Chimu_20; \
|
||||||
|
Chi_01 = Chimu_01-Chimu_21; \
|
||||||
|
Chi_02 = Chimu_02-Chimu_22; \
|
||||||
|
Chi_10 = Chimu_10-Chimu_30; \
|
||||||
|
Chi_11 = Chimu_11-Chimu_31; \
|
||||||
|
Chi_12 = Chimu_12-Chimu_32;
|
||||||
|
|
||||||
|
// fspin(0)=hspin(0);
|
||||||
|
// fspin(1)=hspin(1);
|
||||||
|
// fspin(2)=timesMinusI(hspin(1));
|
||||||
|
// fspin(3)=timesMinusI(hspin(0));
|
||||||
|
#define XP_RECON\
|
||||||
|
result_00 = UChi_00;\
|
||||||
|
result_01 = UChi_01;\
|
||||||
|
result_02 = UChi_02;\
|
||||||
|
result_10 = UChi_10;\
|
||||||
|
result_11 = UChi_11;\
|
||||||
|
result_12 = UChi_12;\
|
||||||
|
result_20 = timesMinusI(UChi_10);\
|
||||||
|
result_21 = timesMinusI(UChi_11);\
|
||||||
|
result_22 = timesMinusI(UChi_12);\
|
||||||
|
result_30 = timesMinusI(UChi_00);\
|
||||||
|
result_31 = timesMinusI(UChi_01);\
|
||||||
|
result_32 = timesMinusI(UChi_02);
|
||||||
|
|
||||||
|
#define XP_RECON_ACCUM\
|
||||||
|
result_00+=UChi_00;\
|
||||||
|
result_01+=UChi_01;\
|
||||||
|
result_02+=UChi_02;\
|
||||||
|
result_10+=UChi_10;\
|
||||||
|
result_11+=UChi_11;\
|
||||||
|
result_12+=UChi_12;\
|
||||||
|
result_20-=timesI(UChi_10);\
|
||||||
|
result_21-=timesI(UChi_11);\
|
||||||
|
result_22-=timesI(UChi_12);\
|
||||||
|
result_30-=timesI(UChi_00);\
|
||||||
|
result_31-=timesI(UChi_01);\
|
||||||
|
result_32-=timesI(UChi_02);
|
||||||
|
|
||||||
|
#define XM_RECON\
|
||||||
|
result_00 = UChi_00;\
|
||||||
|
result_01 = UChi_01;\
|
||||||
|
result_02 = UChi_02;\
|
||||||
|
result_10 = UChi_10;\
|
||||||
|
result_11 = UChi_11;\
|
||||||
|
result_12 = UChi_12;\
|
||||||
|
result_20 = timesI(UChi_10);\
|
||||||
|
result_21 = timesI(UChi_11);\
|
||||||
|
result_22 = timesI(UChi_12);\
|
||||||
|
result_30 = timesI(UChi_00);\
|
||||||
|
result_31 = timesI(UChi_01);\
|
||||||
|
result_32 = timesI(UChi_02);
|
||||||
|
|
||||||
|
#define XM_RECON_ACCUM\
|
||||||
|
result_00+= UChi_00;\
|
||||||
|
result_01+= UChi_01;\
|
||||||
|
result_02+= UChi_02;\
|
||||||
|
result_10+= UChi_10;\
|
||||||
|
result_11+= UChi_11;\
|
||||||
|
result_12+= UChi_12;\
|
||||||
|
result_20+= timesI(UChi_10);\
|
||||||
|
result_21+= timesI(UChi_11);\
|
||||||
|
result_22+= timesI(UChi_12);\
|
||||||
|
result_30+= timesI(UChi_00);\
|
||||||
|
result_31+= timesI(UChi_01);\
|
||||||
|
result_32+= timesI(UChi_02);
|
||||||
|
|
||||||
|
#define YP_RECON_ACCUM\
|
||||||
|
result_00+= UChi_00;\
|
||||||
|
result_01+= UChi_01;\
|
||||||
|
result_02+= UChi_02;\
|
||||||
|
result_10+= UChi_10;\
|
||||||
|
result_11+= UChi_11;\
|
||||||
|
result_12+= UChi_12;\
|
||||||
|
result_20+= UChi_10;\
|
||||||
|
result_21+= UChi_11;\
|
||||||
|
result_22+= UChi_12;\
|
||||||
|
result_30-= UChi_00;\
|
||||||
|
result_31-= UChi_01;\
|
||||||
|
result_32-= UChi_02;
|
||||||
|
|
||||||
|
#define YM_RECON_ACCUM\
|
||||||
|
result_00+= UChi_00;\
|
||||||
|
result_01+= UChi_01;\
|
||||||
|
result_02+= UChi_02;\
|
||||||
|
result_10+= UChi_10;\
|
||||||
|
result_11+= UChi_11;\
|
||||||
|
result_12+= UChi_12;\
|
||||||
|
result_20-= UChi_10;\
|
||||||
|
result_21-= UChi_11;\
|
||||||
|
result_22-= UChi_12;\
|
||||||
|
result_30+= UChi_00;\
|
||||||
|
result_31+= UChi_01;\
|
||||||
|
result_32+= UChi_02;
|
||||||
|
|
||||||
|
#define ZP_RECON_ACCUM\
|
||||||
|
result_00+= UChi_00;\
|
||||||
|
result_01+= UChi_01;\
|
||||||
|
result_02+= UChi_02;\
|
||||||
|
result_10+= UChi_10;\
|
||||||
|
result_11+= UChi_11;\
|
||||||
|
result_12+= UChi_12;\
|
||||||
|
result_20-= timesI(UChi_00); \
|
||||||
|
result_21-= timesI(UChi_01); \
|
||||||
|
result_22-= timesI(UChi_02); \
|
||||||
|
result_30+= timesI(UChi_10); \
|
||||||
|
result_31+= timesI(UChi_11); \
|
||||||
|
result_32+= timesI(UChi_12);
|
||||||
|
|
||||||
|
#define ZM_RECON_ACCUM\
|
||||||
|
result_00+= UChi_00;\
|
||||||
|
result_01+= UChi_01;\
|
||||||
|
result_02+= UChi_02;\
|
||||||
|
result_10+= UChi_10;\
|
||||||
|
result_11+= UChi_11;\
|
||||||
|
result_12+= UChi_12;\
|
||||||
|
result_20+= timesI(UChi_00); \
|
||||||
|
result_21+= timesI(UChi_01); \
|
||||||
|
result_22+= timesI(UChi_02); \
|
||||||
|
result_30-= timesI(UChi_10); \
|
||||||
|
result_31-= timesI(UChi_11); \
|
||||||
|
result_32-= timesI(UChi_12);
|
||||||
|
|
||||||
|
#define TP_RECON_ACCUM\
|
||||||
|
result_00+= UChi_00;\
|
||||||
|
result_01+= UChi_01;\
|
||||||
|
result_02+= UChi_02;\
|
||||||
|
result_10+= UChi_10;\
|
||||||
|
result_11+= UChi_11;\
|
||||||
|
result_12+= UChi_12;\
|
||||||
|
result_20+= UChi_00; \
|
||||||
|
result_21+= UChi_01; \
|
||||||
|
result_22+= UChi_02; \
|
||||||
|
result_30+= UChi_10; \
|
||||||
|
result_31+= UChi_11; \
|
||||||
|
result_32+= UChi_12;
|
||||||
|
|
||||||
|
#define TM_RECON_ACCUM\
|
||||||
|
result_00+= UChi_00;\
|
||||||
|
result_01+= UChi_01;\
|
||||||
|
result_02+= UChi_02;\
|
||||||
|
result_10+= UChi_10;\
|
||||||
|
result_11+= UChi_11;\
|
||||||
|
result_12+= UChi_12;\
|
||||||
|
result_20-= UChi_00; \
|
||||||
|
result_21-= UChi_01; \
|
||||||
|
result_22-= UChi_02; \
|
||||||
|
result_30-= UChi_10; \
|
||||||
|
result_31-= UChi_11; \
|
||||||
|
result_32-= UChi_12;
|
||||||
|
|
||||||
|
// One stencil leg, handling local and non-local neighbours alike.
//   PROJ            spin-projection macro applied after a full-spinor load
//   PERM            permute suffix forwarded to the loaders and PERMUTE_DIR
//   DIR             stencil leg index (also selects the link in MULT_2SPIN_IMPL)
//   RECON           reconstruction macro run after the link multiply
//   F               index forwarded to the loader / multiply macros
//   LOAD_CHI_IMPL, LOAD_CHIMU_IMPL, MULT_2SPIN_IMPL
//                   macro names invoked as (DIR,F,PERM) / (DIR,F)
// Flow: fetch the entry with st.GetEntry(ptype,DIR,ss) and cache its _offset,
// _is_local and _permute fields. A local entry is loaded with LOAD_CHIMU_IMPL,
// projected with PROJ and, if `perm` is set, permuted; any other entry is
// loaded with LOAD_CHI_IMPL and is neither projected nor permuted here. The
// link multiply and RECON then run unconditionally.
// This is a bare multi-statement expansion (no enclosing braces), so it must
// not be used as the sole body of an unbraced if/else/loop.
// Expects SE, offset, local, perm, ptype, st and ss in scope.
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
||||||
|
SE=st.GetEntry(ptype,DIR,ss); \
|
||||||
|
offset = SE->_offset; \
|
||||||
|
local = SE->_is_local; \
|
||||||
|
perm = SE->_permute; \
|
||||||
|
if ( local ) { \
|
||||||
|
LOAD_CHIMU_IMPL(DIR,F,PERM); \
|
||||||
|
PROJ; \
|
||||||
|
if ( perm) { \
|
||||||
|
PERMUTE_DIR(PERM); \
|
||||||
|
} \
|
||||||
|
} else { \
|
||||||
|
LOAD_CHI_IMPL(DIR,F,PERM); \
|
||||||
|
} \
|
||||||
|
MULT_2SPIN_IMPL(DIR,F); \
|
||||||
|
RECON;
|
||||||
|
|
||||||
|
|
||||||
|
// "Interior" variant of HAND_STENCIL_LEG; parameters have the same meaning.
// The entry is fetched and cached the same way. A local entry is loaded with
// LOAD_CHIMU_IMPL, projected, and permuted if `perm` is set. A non-local
// entry is loaded with LOAD_CHI_IMPL only when st.same_node[DIR] is true.
// The link multiply and RECON run only when (local || st.same_node[DIR]);
// for every other entry this leg contributes nothing. That skipped case is
// exactly the condition handled by HAND_STENCIL_LEG_EXT.
// Bare multi-statement expansion: do not use as an unbraced if/else/loop body.
// Expects SE, offset, local, perm, ptype, st and ss in scope.
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
||||||
|
SE=st.GetEntry(ptype,DIR,ss); \
|
||||||
|
offset = SE->_offset; \
|
||||||
|
local = SE->_is_local; \
|
||||||
|
perm = SE->_permute; \
|
||||||
|
if ( local ) { \
|
||||||
|
LOAD_CHIMU_IMPL(DIR,F,PERM); \
|
||||||
|
PROJ; \
|
||||||
|
if ( perm) { \
|
||||||
|
PERMUTE_DIR(PERM); \
|
||||||
|
} \
|
||||||
|
} else if ( st.same_node[DIR] ) { \
|
||||||
|
LOAD_CHI_IMPL(DIR,F,PERM); \
|
||||||
|
} \
|
||||||
|
if (local || st.same_node[DIR] ) { \
|
||||||
|
MULT_2SPIN_IMPL(DIR,F); \
|
||||||
|
RECON; \
|
||||||
|
}
|
||||||
|
|
||||||
|
// "Exterior" variant of HAND_STENCIL_LEG. It does work only for entries that
// are neither local nor on the same node (!SE->_is_local &&
// !st.same_node[DIR]), which is the complement of the condition under which
// HAND_STENCIL_LEG_INT multiplies and reconstructs. For such an entry it loads
// the half spinor with LOAD_CHI_IMPL, applies the link multiply and RECON, and
// increments nmu so the caller can tell whether any leg contributed.
// PROJ and LOAD_CHIMU_IMPL are accepted but never expanded; `local` and `perm`
// are still assigned because the loader macros may read them.
// Bare multi-statement expansion: do not use as an unbraced if/else/loop body.
// Expects SE, offset, local, perm, ptype, st, ss and nmu in scope.
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
||||||
|
SE=st.GetEntry(ptype,DIR,ss); \
|
||||||
|
offset = SE->_offset; \
|
||||||
|
local = SE->_is_local; \
|
||||||
|
perm = SE->_permute; \
|
||||||
|
if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \
|
||||||
|
LOAD_CHI_IMPL(DIR,F,PERM); \
|
||||||
|
MULT_2SPIN_IMPL(DIR,F); \
|
||||||
|
RECON; \
|
||||||
|
nmu++; \
|
||||||
|
}
|
||||||
|
|
||||||
|
// Hands the twelve result_sc accumulators to vstream, one call per component,
// with ref(F)(s)(c) of out._odata[ss] as the first argument (presumably a
// streaming store that overwrites the destination - confirm against vstream's
// definition). When F is passed empty the accessor becomes ref().
// Expects `out` and the result_* variables in scope.
#define HAND_RESULT(ss,F) \
|
||||||
|
{ \
|
||||||
|
SiteSpinor & ref (out._odata[ss]); \
|
||||||
|
vstream(ref(F)(0)(0),result_00); \
|
||||||
|
vstream(ref(F)(0)(1),result_01); \
|
||||||
|
vstream(ref(F)(0)(2),result_02); \
|
||||||
|
vstream(ref(F)(1)(0),result_10); \
|
||||||
|
vstream(ref(F)(1)(1),result_11); \
|
||||||
|
vstream(ref(F)(1)(2),result_12); \
|
||||||
|
vstream(ref(F)(2)(0),result_20); \
|
||||||
|
vstream(ref(F)(2)(1),result_21); \
|
||||||
|
vstream(ref(F)(2)(2),result_22); \
|
||||||
|
vstream(ref(F)(3)(0),result_30); \
|
||||||
|
vstream(ref(F)(3)(1),result_31); \
|
||||||
|
vstream(ref(F)(3)(2),result_32); \
|
||||||
|
}
|
||||||
|
|
||||||
|
// Accumulating counterpart of HAND_RESULT for the exterior pass: when nmu is
// non-zero, each result_sc is added (+=) to ref(F)(s)(c) of out._odata[ss];
// when nmu is zero the output site is left untouched. nmu is the counter
// incremented by HAND_STENCIL_LEG_EXT for every leg that contributed.
// Expects `out`, nmu and the result_* variables in scope.
#define HAND_RESULT_EXT(ss,F) \
|
||||||
|
if (nmu){ \
|
||||||
|
SiteSpinor & ref (out._odata[ss]); \
|
||||||
|
ref(F)(0)(0)+=result_00; \
|
||||||
|
ref(F)(0)(1)+=result_01; \
|
||||||
|
ref(F)(0)(2)+=result_02; \
|
||||||
|
ref(F)(1)(0)+=result_10; \
|
||||||
|
ref(F)(1)(1)+=result_11; \
|
||||||
|
ref(F)(1)(2)+=result_12; \
|
||||||
|
ref(F)(2)(0)+=result_20; \
|
||||||
|
ref(F)(2)(1)+=result_21; \
|
||||||
|
ref(F)(2)(2)+=result_22; \
|
||||||
|
ref(F)(3)(0)+=result_30; \
|
||||||
|
ref(F)(3)(1)+=result_31; \
|
||||||
|
ref(F)(3)(2)+=result_32; \
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#define HAND_DECLARATIONS(a) \
|
||||||
|
Simd result_00; \
|
||||||
|
Simd result_01; \
|
||||||
|
Simd result_02; \
|
||||||
|
Simd result_10; \
|
||||||
|
Simd result_11; \
|
||||||
|
Simd result_12; \
|
||||||
|
Simd result_20; \
|
||||||
|
Simd result_21; \
|
||||||
|
Simd result_22; \
|
||||||
|
Simd result_30; \
|
||||||
|
Simd result_31; \
|
||||||
|
Simd result_32; \
|
||||||
|
Simd Chi_00; \
|
||||||
|
Simd Chi_01; \
|
||||||
|
Simd Chi_02; \
|
||||||
|
Simd Chi_10; \
|
||||||
|
Simd Chi_11; \
|
||||||
|
Simd Chi_12; \
|
||||||
|
Simd UChi_00; \
|
||||||
|
Simd UChi_01; \
|
||||||
|
Simd UChi_02; \
|
||||||
|
Simd UChi_10; \
|
||||||
|
Simd UChi_11; \
|
||||||
|
Simd UChi_12; \
|
||||||
|
Simd U_00; \
|
||||||
|
Simd U_10; \
|
||||||
|
Simd U_20; \
|
||||||
|
Simd U_01; \
|
||||||
|
Simd U_11; \
|
||||||
|
Simd U_21;
|
||||||
|
|
||||||
|
#define ZERO_RESULT \
|
||||||
|
result_00=zero; \
|
||||||
|
result_01=zero; \
|
||||||
|
result_02=zero; \
|
||||||
|
result_10=zero; \
|
||||||
|
result_11=zero; \
|
||||||
|
result_12=zero; \
|
||||||
|
result_20=zero; \
|
||||||
|
result_21=zero; \
|
||||||
|
result_22=zero; \
|
||||||
|
result_30=zero; \
|
||||||
|
result_31=zero; \
|
||||||
|
result_32=zero;
|
||||||
|
|
||||||
|
// The Chimu_* names are not separate variables: they are aliases onto the
// Chi_* and UChi_* variables declared by HAND_DECLARATIONS.
//   Chimu_0c -> Chi_0c     Chimu_1c -> Chi_1c
//   Chimu_2c -> UChi_0c    Chimu_3c -> UChi_1c
// Consequences visible in this file:
//   - a full-spinor load overwrites all Chi_* and UChi_* variables;
//   - each *_PROJ macro reads Chi_* / UChi_* and writes Chi_* in place;
//   - MULT_2SPIN_BODY then overwrites UChi_*, so the Chimu_2c/3c values are
//     gone after the link multiply;
//   - using UChi_* as scratch while Chimu_2c/3c are live would clobber them
//     (the full-spinor twist loader uses only U_* temporaries).
#define Chimu_00 Chi_00
|
||||||
|
#define Chimu_01 Chi_01
|
||||||
|
#define Chimu_02 Chi_02
|
||||||
|
#define Chimu_10 Chi_10
|
||||||
|
#define Chimu_11 Chi_11
|
||||||
|
#define Chimu_12 Chi_12
|
||||||
|
#define Chimu_20 UChi_00
|
||||||
|
#define Chimu_21 UChi_01
|
||||||
|
#define Chimu_22 UChi_02
|
||||||
|
#define Chimu_30 UChi_10
|
||||||
|
#define Chimu_31 UChi_11
|
||||||
|
#define Chimu_32 UChi_12
|
||||||
|
|
||||||
|
namespace Grid {
|
||||||
|
namespace QCD {
|
||||||
|
|
||||||
|
template<class Impl> void
|
||||||
|
WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
|
// T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
|
||||||
|
typedef typename Simd::scalar_type S;
|
||||||
|
typedef typename Simd::vector_type V;
|
||||||
|
|
||||||
|
HAND_DECLARATIONS(ignore);
|
||||||
|
|
||||||
|
int offset,local,perm, ptype;
|
||||||
|
StencilEntry *SE;
|
||||||
|
|
||||||
|
#define HAND_DOP_SITE(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
||||||
|
HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_RESULT(ss,F)
|
||||||
|
|
||||||
|
HAND_DOP_SITE(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl>
|
||||||
|
void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
|
typedef typename Simd::scalar_type S;
|
||||||
|
typedef typename Simd::vector_type V;
|
||||||
|
|
||||||
|
HAND_DECLARATIONS(ignore);
|
||||||
|
|
||||||
|
StencilEntry *SE;
|
||||||
|
int offset,local,perm, ptype;
|
||||||
|
|
||||||
|
#define HAND_DOP_SITE_DAG(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
||||||
|
HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_RESULT(ss,F)
|
||||||
|
|
||||||
|
HAND_DOP_SITE_DAG(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<class Impl> void
|
||||||
|
WilsonKernels<Impl>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
|
// T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
|
||||||
|
typedef typename Simd::scalar_type S;
|
||||||
|
typedef typename Simd::vector_type V;
|
||||||
|
|
||||||
|
HAND_DECLARATIONS(ignore);
|
||||||
|
|
||||||
|
int offset,local,perm, ptype;
|
||||||
|
StencilEntry *SE;
|
||||||
|
|
||||||
|
#define HAND_DOP_SITE_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
||||||
|
ZERO_RESULT; \
|
||||||
|
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_RESULT(ss,F)
|
||||||
|
|
||||||
|
HAND_DOP_SITE_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
|
||||||
|
}
|
||||||
|
|
||||||
|
// HandDhopSiteDagInt: "Dag" counterpart of HandDhopSiteInt, again written entirely in terms of
// HAND_* macros. The body expands HAND_DECLARATIONS, defines the reusable macro
// HAND_DOP_SITE_DAG_INT and expands it once with an empty F argument.
// HAND_DOP_SITE_DAG_INT is not #undef'd here; HAND_SPECIALISE_GPARITY later in this file expands
// it again with F = 0 and F = 1.
// NOTE(review): apart from ss, none of the parameters is referenced by name in this body;
// presumably the HAND_* macros (defined outside this chunk) use them -- confirm there.
template<class Impl>
|
||||||
|
void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
|
typedef typename Simd::scalar_type S;
|
||||||
|
typedef typename Simd::vector_type V;
|
||||||
|
|
||||||
|
HAND_DECLARATIONS(ignore);
|
||||||
|
|
||||||
|
StencilEntry *SE;
|
||||||
|
int offset,local,perm, ptype;
|
||||||
|
|
||||||
|
// HAND_DOP_SITE_DAG_INT expands ZERO_RESULT, then one HAND_STENCIL_LEG_INT per stencil leg in the
// order Xp,Yp,Zp,Tp,Xm,Ym,Zm,Tm, then HAND_RESULT(ss,F).
// Pairing used here: the +mu legs take the *P_PROJ / *P_RECON_ACCUM macros and the -mu legs take
// the *M_ ones, i.e. the opposite of HandDhopSiteInt.
#define HAND_DOP_SITE_DAG_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
||||||
|
ZERO_RESULT; \
|
||||||
|
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_RESULT(ss,F)
|
||||||
|
|
||||||
|
// Generic implementation: a single expansion with F left empty and the plain load/multiply macros.
HAND_DOP_SITE_DAG_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
|
||||||
|
}
|
||||||
|
|
||||||
|
// HandDhopSiteExt: "Ext" variant of the hand kernel, written entirely in terms of HAND_* macros.
// It differs from HandDhopSiteInt in using HAND_STENCIL_LEG_EXT / HAND_RESULT_EXT and in
// declaring the extra local nmu. The body expands HAND_DECLARATIONS, defines the reusable macro
// HAND_DOP_SITE_EXT and expands it once with an empty F argument.
// HAND_DOP_SITE_EXT is not #undef'd here; HAND_SPECIALISE_GPARITY later in this file expands it
// again with F = 0 and F = 1.
// NOTE(review): apart from ss, none of the parameters is referenced by name in this body;
// presumably the HAND_* macros (defined outside this chunk) use them -- confirm there.
// NOTE(review): "Ext" presumably means the exterior (halo-dependent) part -- confirm.
template<class Impl> void
|
||||||
|
WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
|
// T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
|
||||||
|
typedef typename Simd::scalar_type S;
|
||||||
|
typedef typename Simd::vector_type V;
|
||||||
|
|
||||||
|
HAND_DECLARATIONS(ignore);
|
||||||
|
|
||||||
|
int offset,local,perm, ptype;
|
||||||
|
StencilEntry *SE;
|
||||||
|
// nmu is not referenced by name again in this body.
// NOTE(review): presumably updated by HAND_STENCIL_LEG_EXT and read by HAND_RESULT_EXT -- confirm.
int nmu=0;
|
||||||
|
|
||||||
|
// HAND_DOP_SITE_EXT expands ZERO_RESULT, then one HAND_STENCIL_LEG_EXT per stencil leg in the
// order Xp,Yp,Zp,Tp,Xm,Ym,Zm,Tm, then HAND_RESULT_EXT(ss,F).
// Pairing used here: the +mu legs take the *M_PROJ / *M_RECON_ACCUM macros and the -mu legs take
// the *P_ ones; HandDhopSiteDagExt uses the opposite pairing.
#define HAND_DOP_SITE_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
||||||
|
ZERO_RESULT; \
|
||||||
|
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_RESULT_EXT(ss,F)
|
||||||
|
|
||||||
|
// Generic implementation: a single expansion with F left empty and the plain load/multiply macros.
HAND_DOP_SITE_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
|
||||||
|
}
|
||||||
|
|
||||||
|
// HandDhopSiteDagExt: "Dag" counterpart of HandDhopSiteExt, written entirely in terms of HAND_*
// macros. The body expands HAND_DECLARATIONS, defines the reusable macro HAND_DOP_SITE_DAG_EXT
// and expands it once with an empty F argument.
// HAND_DOP_SITE_DAG_EXT is not #undef'd here; HAND_SPECIALISE_GPARITY later in this file expands
// it again with F = 0 and F = 1.
// NOTE(review): apart from ss, none of the parameters is referenced by name in this body;
// presumably the HAND_* macros (defined outside this chunk) use them -- confirm there.
template<class Impl>
|
||||||
|
void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out)
|
||||||
|
{
|
||||||
|
typedef typename Simd::scalar_type S;
|
||||||
|
typedef typename Simd::vector_type V;
|
||||||
|
|
||||||
|
HAND_DECLARATIONS(ignore);
|
||||||
|
|
||||||
|
StencilEntry *SE;
|
||||||
|
int offset,local,perm, ptype;
|
||||||
|
// nmu is not referenced by name again in this body.
// NOTE(review): presumably updated by HAND_STENCIL_LEG_EXT and read by HAND_RESULT_EXT -- confirm.
int nmu=0;
|
||||||
|
|
||||||
|
// HAND_DOP_SITE_DAG_EXT expands ZERO_RESULT, then one HAND_STENCIL_LEG_EXT per stencil leg in the
// order Xp,Yp,Zp,Tp,Xm,Ym,Zm,Tm, then HAND_RESULT_EXT(ss,F).
// Pairing used here: the +mu legs take the *P_PROJ / *P_RECON_ACCUM macros and the -mu legs take
// the *M_ ones, i.e. the opposite of HandDhopSiteExt.
#define HAND_DOP_SITE_DAG_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
|
||||||
|
ZERO_RESULT; \
|
||||||
|
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
|
||||||
|
HAND_RESULT_EXT(ss,F)
|
||||||
|
|
||||||
|
// Generic implementation: a single expansion with F left empty and the plain load/multiply macros.
HAND_DOP_SITE_DAG_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
|
||||||
|
}
|
||||||
|
|
||||||
|
// HAND_SPECIALISE_GPARITY(IMPL): emits explicit (template<>) specialisations of six
// WilsonKernels<IMPL> members: HandDhopSite, HandDhopSiteDag, HandDhopSiteInt,
// HandDhopSiteDagInt, HandDhopSiteExt and HandDhopSiteDagExt.
// Each specialisation aliases IMPL as Impl, declares the extra locals g, direction, distance, sl
// and inplace_twist next to offset/local/perm/ptype, and expands the matching HAND_DOP_SITE*
// macro twice -- with F = 0 and then F = 1 -- using the *_GPARITY load/multiply macros.
// The two Ext specialisations reset nmu to 0 between the two expansions.
// The HAND_DOP_SITE* macros are the ones #define'd inside the generic kernel bodies earlier in
// this file (HAND_DOP_SITE itself is defined outside this chunk), so this macro must be expanded
// after them.
// NOTE(review): F = 0/1 presumably selects the G-parity flavour -- confirm in the *_GPARITY
// macro definitions.
#define HAND_SPECIALISE_GPARITY(IMPL) \
|
||||||
|
template<> void \
|
||||||
|
WilsonKernels<IMPL>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out) \
|
||||||
|
{ \
|
||||||
|
typedef IMPL Impl; \
|
||||||
|
typedef typename Simd::scalar_type S; \
|
||||||
|
typedef typename Simd::vector_type V; \
|
||||||
|
\
|
||||||
|
HAND_DECLARATIONS(ignore); \
|
||||||
|
\
|
||||||
|
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
|
||||||
|
StencilEntry *SE; \
|
||||||
|
HAND_DOP_SITE(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
|
HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
template<> \
|
||||||
|
void WilsonKernels<IMPL>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out) \
|
||||||
|
{ \
|
||||||
|
typedef IMPL Impl; \
|
||||||
|
typedef typename Simd::scalar_type S; \
|
||||||
|
typedef typename Simd::vector_type V; \
|
||||||
|
\
|
||||||
|
HAND_DECLARATIONS(ignore); \
|
||||||
|
\
|
||||||
|
StencilEntry *SE; \
|
||||||
|
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
|
||||||
|
HAND_DOP_SITE_DAG(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
|
HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
template<> void \
|
||||||
|
WilsonKernels<IMPL>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out) \
|
||||||
|
{ \
|
||||||
|
typedef IMPL Impl; \
|
||||||
|
typedef typename Simd::scalar_type S; \
|
||||||
|
typedef typename Simd::vector_type V; \
|
||||||
|
\
|
||||||
|
HAND_DECLARATIONS(ignore); \
|
||||||
|
\
|
||||||
|
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
|
||||||
|
StencilEntry *SE; \
|
||||||
|
HAND_DOP_SITE_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
|
HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
template<> \
|
||||||
|
void WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out) \
|
||||||
|
{ \
|
||||||
|
typedef IMPL Impl; \
|
||||||
|
typedef typename Simd::scalar_type S; \
|
||||||
|
typedef typename Simd::vector_type V; \
|
||||||
|
\
|
||||||
|
HAND_DECLARATIONS(ignore); \
|
||||||
|
\
|
||||||
|
StencilEntry *SE; \
|
||||||
|
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
|
||||||
|
HAND_DOP_SITE_DAG_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
|
HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
template<> void \
|
||||||
|
WilsonKernels<IMPL>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out) \
|
||||||
|
{ \
|
||||||
|
typedef IMPL Impl; \
|
||||||
|
typedef typename Simd::scalar_type S; \
|
||||||
|
typedef typename Simd::vector_type V; \
|
||||||
|
\
|
||||||
|
HAND_DECLARATIONS(ignore); \
|
||||||
|
\
|
||||||
|
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
|
||||||
|
StencilEntry *SE; \
|
||||||
|
int nmu=0; \
|
||||||
|
HAND_DOP_SITE_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
|
nmu = 0; \
|
||||||
|
HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
|
} \
|
||||||
|
template<> \
|
||||||
|
void WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out) \
|
||||||
|
{ \
|
||||||
|
typedef IMPL Impl; \
|
||||||
|
typedef typename Simd::scalar_type S; \
|
||||||
|
typedef typename Simd::vector_type V; \
|
||||||
|
\
|
||||||
|
HAND_DECLARATIONS(ignore); \
|
||||||
|
\
|
||||||
|
StencilEntry *SE; \
|
||||||
|
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
|
||||||
|
int nmu=0; \
|
||||||
|
HAND_DOP_SITE_DAG_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
|
nmu = 0; \
|
||||||
|
HAND_DOP_SITE_DAG_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Emit the six specialisations for each of the four G-parity Wilson implementation types.
HAND_SPECIALISE_GPARITY(GparityWilsonImplF);
|
||||||
|
HAND_SPECIALISE_GPARITY(GparityWilsonImplD);
|
||||||
|
HAND_SPECIALISE_GPARITY(GparityWilsonImplFH);
|
||||||
|
HAND_SPECIALISE_GPARITY(GparityWilsonImplDF);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
////////////// Wilson ; uses this implementation /////////////////////
|
||||||
|
|
||||||
|
// INSTANTIATE_THEM(A): emits explicit instantiations ("template void ...;") of six
// WilsonKernels<A> members: HandDhopSite, HandDhopSiteDag, HandDhopSiteInt, HandDhopSiteDagInt,
// HandDhopSiteExt and HandDhopSiteDagExt.
// NOTE(review): in the visible code it is only expanded for the four G-parity types, which are
// the same types HAND_SPECIALISE_GPARITY explicitly specialises earlier in this file; an explicit
// instantiation of an already explicitly specialised member has no effect -- confirm whether
// other Impl types are meant to be listed here.
#define INSTANTIATE_THEM(A) \
|
||||||
|
template void WilsonKernels<A>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out); \
|
||||||
|
template void WilsonKernels<A>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out);\
|
||||||
|
template void WilsonKernels<A>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out); \
|
||||||
|
template void WilsonKernels<A>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out); \
|
||||||
|
template void WilsonKernels<A>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out); \
|
||||||
|
template void WilsonKernels<A>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
|
||||||
|
int ss,int sU,const FermionField &in, FermionField &out);
|
||||||
|
|
||||||
|
// Expand for the four G-parity Wilson implementation types.
INSTANTIATE_THEM(GparityWilsonImplF);
|
||||||
|
INSTANTIATE_THEM(GparityWilsonImplD);
|
||||||
|
INSTANTIATE_THEM(GparityWilsonImplFH);
|
||||||
|
INSTANTIATE_THEM(GparityWilsonImplDF);
|
||||||
|
}}
|
@ -48,6 +48,22 @@ with this program; if not, write to the Free Software Foundation, Inc.,
|
|||||||
} \
|
} \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// RegisterLoadCheckPointerMetadataFunction(NAME)
// Generates a member template Load<NAME>Checkpointer(Params_, M_) for checkpointers that carry a
// Metadata object in addition to their CheckpointerParameters.
// The generated function installs a NAME##CPModule<ImplementationPolicy, Metadata> into CP the
// first time it is called; a second call logs an error and terminates with exit(1).
// It relies on the names CP, have_CheckPointer, ImplementationPolicy and CheckpointerBaseModule
// being visible at the point of expansion.
#define RegisterLoadCheckPointerMetadataFunction(NAME)                                       \
  template < class Metadata >                                                                \
  void Load##NAME##Checkpointer(const CheckpointerParameters& Params_, const Metadata& M_) { \
    if (have_CheckPointer) {                                                                 \
      std::cout << GridLogError << "Checkpointer already loaded "                            \
                << std::endl;                                                                \
      exit(1);                                                                               \
    }                                                                                        \
    std::cout << GridLogDebug << "Loading Metadata Checkpointer " << #NAME                   \
              << std::endl;                                                                  \
    CP = std::unique_ptr<CheckpointerBaseModule>(                                            \
        new NAME##CPModule<ImplementationPolicy, Metadata >(Params_, M_));                   \
    have_CheckPointer = true;                                                                \
  }
|
||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
namespace QCD {
|
namespace QCD {
|
||||||
|
|
||||||
@ -77,7 +93,7 @@ class HMCResourceManager {
|
|||||||
bool have_CheckPointer;
|
bool have_CheckPointer;
|
||||||
|
|
||||||
// NOTE: operator << is not overloaded for std::vector<string>
|
// NOTE: operator << is not overloaded for std::vector<string>
|
||||||
// so thsi function is necessary
|
// so this function is necessary
|
||||||
void output_vector_string(const std::vector<std::string> &vs){
|
void output_vector_string(const std::vector<std::string> &vs){
|
||||||
for (auto &i: vs)
|
for (auto &i: vs)
|
||||||
std::cout << i << " ";
|
std::cout << i << " ";
|
||||||
@ -254,6 +270,7 @@ class HMCResourceManager {
|
|||||||
RegisterLoadCheckPointerFunction(Nersc);
|
RegisterLoadCheckPointerFunction(Nersc);
|
||||||
#ifdef HAVE_LIME
|
#ifdef HAVE_LIME
|
||||||
RegisterLoadCheckPointerFunction(ILDG);
|
RegisterLoadCheckPointerFunction(ILDG);
|
||||||
|
RegisterLoadCheckPointerMetadataFunction(Scidac);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////
|
||||||
|
@ -76,6 +76,14 @@ class BaseHmcCheckpointer : public HmcObservable<typename Impl::Field> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void check_filename(const std::string &filename){
|
||||||
|
std::ifstream f(filename.c_str());
|
||||||
|
if(!f.good()){
|
||||||
|
std::cout << GridLogError << "Filename " << filename << " not found. Aborting. " << std::endl;
|
||||||
|
abort();
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
virtual void initialize(const CheckpointerParameters &Params) = 0;
|
virtual void initialize(const CheckpointerParameters &Params) = 0;
|
||||||
|
|
||||||
virtual void CheckpointRestore(int traj, typename Impl::Field &U,
|
virtual void CheckpointRestore(int traj, typename Impl::Field &U,
|
||||||
|
@ -93,6 +93,9 @@ class BinaryHmcCheckpointer : public BaseHmcCheckpointer<Impl> {
|
|||||||
void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
|
void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG, GridParallelRNG &pRNG) {
|
||||||
std::string config, rng;
|
std::string config, rng;
|
||||||
this->build_filenames(traj, Params, config, rng);
|
this->build_filenames(traj, Params, config, rng);
|
||||||
|
this->check_filename(rng);
|
||||||
|
this->check_filename(config);
|
||||||
|
|
||||||
|
|
||||||
BinarySimpleMunger<sobj_double, sobj> munge;
|
BinarySimpleMunger<sobj_double, sobj> munge;
|
||||||
|
|
||||||
|
@ -136,6 +136,20 @@ class ILDGCPModule: public CheckPointerModule< ImplementationPolicy> {
|
|||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template<class ImplementationPolicy, class Metadata>
|
||||||
|
class ScidacCPModule: public CheckPointerModule< ImplementationPolicy> {
|
||||||
|
typedef CheckPointerModule< ImplementationPolicy> CPBase;
|
||||||
|
Metadata M;
|
||||||
|
|
||||||
|
// acquire resource
|
||||||
|
virtual void initialize(){
|
||||||
|
this->CheckPointPtr.reset(new ScidacHmcCheckpointer<ImplementationPolicy, Metadata>(this->Par_, M));
|
||||||
|
}
|
||||||
|
public:
|
||||||
|
ScidacCPModule(typename CPBase::APar Par, Metadata M_):M(M_), CPBase(Par) {}
|
||||||
|
template <class ReaderClass>
|
||||||
|
ScidacCPModule(Reader<ReaderClass>& Reader) : Parametrized<typename CPBase::APar>(Reader), M(Reader){};
|
||||||
|
};
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
@ -34,6 +34,7 @@ directory
|
|||||||
#include <Grid/qcd/hmc/checkpointers/NerscCheckpointer.h>
|
#include <Grid/qcd/hmc/checkpointers/NerscCheckpointer.h>
|
||||||
#include <Grid/qcd/hmc/checkpointers/BinaryCheckpointer.h>
|
#include <Grid/qcd/hmc/checkpointers/BinaryCheckpointer.h>
|
||||||
#include <Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h>
|
#include <Grid/qcd/hmc/checkpointers/ILDGCheckpointer.h>
|
||||||
|
#include <Grid/qcd/hmc/checkpointers/ScidacCheckpointer.h>
|
||||||
//#include <Grid/qcd/hmc/checkpointers/CheckPointerModules.h>
|
//#include <Grid/qcd/hmc/checkpointers/CheckPointerModules.h>
|
||||||
|
|
||||||
|
|
||||||
|
@ -95,6 +95,10 @@ class ILDGHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
|
|||||||
GridParallelRNG &pRNG) {
|
GridParallelRNG &pRNG) {
|
||||||
std::string config, rng;
|
std::string config, rng;
|
||||||
this->build_filenames(traj, Params, config, rng);
|
this->build_filenames(traj, Params, config, rng);
|
||||||
|
this->check_filename(rng);
|
||||||
|
this->check_filename(config);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
|
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
|
||||||
BinaryIO::readRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
|
BinaryIO::readRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
|
@ -69,6 +69,9 @@ class NerscHmcCheckpointer : public BaseHmcCheckpointer<Gimpl> {
|
|||||||
GridParallelRNG &pRNG) {
|
GridParallelRNG &pRNG) {
|
||||||
std::string config, rng;
|
std::string config, rng;
|
||||||
this->build_filenames(traj, Params, config, rng);
|
this->build_filenames(traj, Params, config, rng);
|
||||||
|
this->check_filename(rng);
|
||||||
|
this->check_filename(config);
|
||||||
|
|
||||||
|
|
||||||
FieldMetaData header;
|
FieldMetaData header;
|
||||||
NerscIO::readRNGState(sRNG, pRNG, header, rng);
|
NerscIO::readRNGState(sRNG, pRNG, header, rng);
|
||||||
|
125
lib/qcd/hmc/checkpointers/ScidacCheckpointer.h
Normal file
125
lib/qcd/hmc/checkpointers/ScidacCheckpointer.h
Normal file
@ -0,0 +1,125 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/qcd/hmc/checkpointers/ScidacCheckpointer.h
|
||||||
|
|
||||||
|
Copyright (C) 2018
|
||||||
|
|
||||||
|
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution
|
||||||
|
directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#ifndef SCIDAC_CHECKPOINTER
|
||||||
|
#define SCIDAC_CHECKPOINTER
|
||||||
|
|
||||||
|
#ifdef HAVE_LIME
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
#include <sstream>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
namespace Grid {
|
||||||
|
namespace QCD {
|
||||||
|
|
||||||
|
// For generic fields
|
||||||
|
template <class Implementation, class Metadata>
|
||||||
|
class ScidacHmcCheckpointer : public BaseHmcCheckpointer<Implementation> {
|
||||||
|
private:
|
||||||
|
CheckpointerParameters Params;
|
||||||
|
Metadata MData;
|
||||||
|
|
||||||
|
typedef typename Implementation::Field Field;
|
||||||
|
|
||||||
|
public:
|
||||||
|
//INHERIT_GIMPL_TYPES(Implementation);
|
||||||
|
|
||||||
|
ScidacHmcCheckpointer(const CheckpointerParameters &Params_) { initialize(Params_); }
|
||||||
|
ScidacHmcCheckpointer(const CheckpointerParameters &Params_, const Metadata& M_):MData(M_) { initialize(Params_); }
|
||||||
|
|
||||||
|
void initialize(const CheckpointerParameters &Params_) {
|
||||||
|
Params = Params_;
|
||||||
|
|
||||||
|
// check here that the format is valid
|
||||||
|
int ieee32big = (Params.format == std::string("IEEE32BIG"));
|
||||||
|
int ieee32 = (Params.format == std::string("IEEE32"));
|
||||||
|
int ieee64big = (Params.format == std::string("IEEE64BIG"));
|
||||||
|
int ieee64 = (Params.format == std::string("IEEE64"));
|
||||||
|
|
||||||
|
if (!(ieee64big || ieee32 || ieee32big || ieee64)) {
|
||||||
|
std::cout << GridLogError << "Unrecognized file format " << Params.format
|
||||||
|
<< std::endl;
|
||||||
|
std::cout << GridLogError
|
||||||
|
<< "Allowed: IEEE32BIG | IEEE32 | IEEE64BIG | IEEE64"
|
||||||
|
<< std::endl;
|
||||||
|
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void TrajectoryComplete(int traj, Field &U, GridSerialRNG &sRNG,
|
||||||
|
GridParallelRNG &pRNG) {
|
||||||
|
if ((traj % Params.saveInterval) == 0) {
|
||||||
|
std::string config, rng;
|
||||||
|
this->build_filenames(traj, Params, config, rng);
|
||||||
|
GridBase *grid = U._grid;
|
||||||
|
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
|
||||||
|
BinaryIO::writeRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
|
ScidacWriter _ScidacWriter(grid->IsBoss());
|
||||||
|
_ScidacWriter.open(config);
|
||||||
|
_ScidacWriter.writeScidacFieldRecord(U, MData);
|
||||||
|
_ScidacWriter.close();
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "Written Scidac Configuration on " << config
|
||||||
|
<< " checksum " << std::hex << nersc_csum<<"/"
|
||||||
|
<< scidac_csuma<<"/" << scidac_csumb
|
||||||
|
<< std::dec << std::endl;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
void CheckpointRestore(int traj, Field &U, GridSerialRNG &sRNG,
|
||||||
|
GridParallelRNG &pRNG) {
|
||||||
|
std::string config, rng;
|
||||||
|
this->build_filenames(traj, Params, config, rng);
|
||||||
|
this->check_filename(rng);
|
||||||
|
this->check_filename(config);
|
||||||
|
|
||||||
|
|
||||||
|
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
|
||||||
|
BinaryIO::readRNG(sRNG, pRNG, rng, 0,nersc_csum,scidac_csuma,scidac_csumb);
|
||||||
|
|
||||||
|
Metadata md_content;
|
||||||
|
ScidacReader _ScidacReader;
|
||||||
|
_ScidacReader.open(config);
|
||||||
|
_ScidacReader.readScidacFieldRecord(U,md_content); // format from the header
|
||||||
|
_ScidacReader.close();
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "Read Scidac Configuration from " << config
|
||||||
|
<< " checksum " << std::hex
|
||||||
|
<< nersc_csum<<"/"
|
||||||
|
<< scidac_csuma<<"/"
|
||||||
|
<< scidac_csumb
|
||||||
|
<< std::dec << std::endl;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // HAVE_LIME
|
||||||
|
#endif // SCIDAC_CHECKPOINTER
|
@ -66,6 +66,8 @@ void Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const La
|
|||||||
parallel_for(int i=0;i<num;i++){
|
parallel_for(int i=0;i<num;i++){
|
||||||
compress.Compress(&buffer[off],table[i].first,rhs._odata[so+table[i].second]);
|
compress.Compress(&buffer[off],table[i].first,rhs._odata[so+table[i].second]);
|
||||||
}
|
}
|
||||||
|
// Further optimisation: i) streaming store the result
|
||||||
|
// ii) software prefetch the first element of the next table entry
|
||||||
}
|
}
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////
|
||||||
@ -148,7 +150,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
std::vector<int> _distances;
|
std::vector<int> _distances;
|
||||||
std::vector<int> _comm_buf_size;
|
std::vector<int> _comm_buf_size;
|
||||||
std::vector<int> _permute_type;
|
std::vector<int> _permute_type;
|
||||||
|
std::vector<int> same_node;
|
||||||
|
std::vector<int> surface_list;
|
||||||
|
|
||||||
Vector<StencilEntry> _entries;
|
Vector<StencilEntry> _entries;
|
||||||
std::vector<Packet> Packets;
|
std::vector<Packet> Packets;
|
||||||
std::vector<Merge> Mergers;
|
std::vector<Merge> Mergers;
|
||||||
@ -199,7 +203,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
|
|
||||||
int dimension = _directions[point];
|
int dimension = _directions[point];
|
||||||
int displacement = _distances[point];
|
int displacement = _distances[point];
|
||||||
assert( (displacement==1) || (displacement==-1));
|
|
||||||
|
|
||||||
int pd = _grid->_processors[dimension];
|
int pd = _grid->_processors[dimension];
|
||||||
int fd = _grid->_fdimensions[dimension];
|
int fd = _grid->_fdimensions[dimension];
|
||||||
@ -214,9 +218,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
if ( ! comm_dim ) return 1;
|
if ( ! comm_dim ) return 1;
|
||||||
|
|
||||||
int nbr_proc;
|
int nbr_proc;
|
||||||
if (displacement==1) nbr_proc = 1;
|
if (displacement>0) nbr_proc = 1;
|
||||||
else nbr_proc = pd-1;
|
else nbr_proc = pd-1;
|
||||||
|
|
||||||
|
// FIXME this logic needs to be sorted for three link term
|
||||||
|
// assert( (displacement==1) || (displacement==-1));
|
||||||
|
// Present hack only works for >= 4^4 subvol per node
|
||||||
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,u_recv_buf_p);
|
void *shm = (void *) _grid->ShmBufferTranslate(recv_from_rank,u_recv_buf_p);
|
||||||
@ -505,25 +512,24 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
template<class decompressor>
|
template<class decompressor>
|
||||||
void CommsMerge(decompressor decompress,std::vector<Merge> &mm,std::vector<Decompress> &dd) {
|
void CommsMerge(decompressor decompress,std::vector<Merge> &mm,std::vector<Decompress> &dd) {
|
||||||
|
|
||||||
|
mergetime-=usecond();
|
||||||
for(int i=0;i<mm.size();i++){
|
for(int i=0;i<mm.size();i++){
|
||||||
mergetime-=usecond();
|
|
||||||
parallel_for(int o=0;o<mm[i].buffer_size/2;o++){
|
parallel_for(int o=0;o<mm[i].buffer_size/2;o++){
|
||||||
decompress.Exchange(mm[i].mpointer,
|
decompress.Exchange(mm[i].mpointer,
|
||||||
mm[i].vpointers[0],
|
mm[i].vpointers[0],
|
||||||
mm[i].vpointers[1],
|
mm[i].vpointers[1],
|
||||||
mm[i].type,o);
|
mm[i].type,o);
|
||||||
}
|
}
|
||||||
mergetime+=usecond();
|
|
||||||
}
|
}
|
||||||
|
mergetime+=usecond();
|
||||||
|
|
||||||
|
decompresstime-=usecond();
|
||||||
for(int i=0;i<dd.size();i++){
|
for(int i=0;i<dd.size();i++){
|
||||||
decompresstime-=usecond();
|
|
||||||
parallel_for(int o=0;o<dd[i].buffer_size;o++){
|
parallel_for(int o=0;o<dd[i].buffer_size;o++){
|
||||||
decompress.Decompress(dd[i].kernel_p,dd[i].mpi_p,o);
|
decompress.Decompress(dd[i].kernel_p,dd[i].mpi_p,o);
|
||||||
}
|
}
|
||||||
decompresstime+=usecond();
|
|
||||||
}
|
}
|
||||||
|
decompresstime+=usecond();
|
||||||
}
|
}
|
||||||
////////////////////////////////////////
|
////////////////////////////////////////
|
||||||
// Set up routines
|
// Set up routines
|
||||||
@ -538,6 +544,29 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Move interior/exterior split into the generic stencil
|
||||||
|
// FIXME Explicit Ls in interface is a pain. Should just use a vol
|
||||||
|
void BuildSurfaceList(int Ls,int vol4){
|
||||||
|
|
||||||
|
// find same node for SHM
|
||||||
|
// Here we know the distance is 1 for WilsonStencil
|
||||||
|
for(int point=0;point<this->_npoints;point++){
|
||||||
|
same_node[point] = this->SameNode(point);
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int site = 0 ;site< vol4;site++){
|
||||||
|
int local = 1;
|
||||||
|
for(int point=0;point<this->_npoints;point++){
|
||||||
|
if( (!this->GetNodeLocal(site*Ls,point)) && (!same_node[point]) ){
|
||||||
|
local = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(local == 0) {
|
||||||
|
surface_list.push_back(site);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
CartesianStencil(GridBase *grid,
|
CartesianStencil(GridBase *grid,
|
||||||
int npoints,
|
int npoints,
|
||||||
int checkerboard,
|
int checkerboard,
|
||||||
@ -548,7 +577,8 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
comm_bytes_thr(npoints),
|
comm_bytes_thr(npoints),
|
||||||
comm_enter_thr(npoints),
|
comm_enter_thr(npoints),
|
||||||
comm_leave_thr(npoints),
|
comm_leave_thr(npoints),
|
||||||
comm_time_thr(npoints)
|
comm_time_thr(npoints),
|
||||||
|
same_node(npoints)
|
||||||
{
|
{
|
||||||
face_table_computed=0;
|
face_table_computed=0;
|
||||||
_npoints = npoints;
|
_npoints = npoints;
|
||||||
@ -556,6 +586,7 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
|
|||||||
_directions = directions;
|
_directions = directions;
|
||||||
_distances = distances;
|
_distances = distances;
|
||||||
_unified_buffer_size=0;
|
_unified_buffer_size=0;
|
||||||
|
surface_list.resize(0);
|
||||||
|
|
||||||
int osites = _grid->oSites();
|
int osites = _grid->oSites();
|
||||||
|
|
||||||
|
@ -40,7 +40,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(static)")
|
#define PARALLEL_FOR_LOOP _Pragma("omp parallel for schedule(static)")
|
||||||
#define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(static)")
|
#define PARALLEL_FOR_LOOP_INTERN _Pragma("omp for schedule(static)")
|
||||||
#define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for schedule(static) collapse(2)")
|
#define PARALLEL_NESTED_LOOP2 _Pragma("omp parallel for collapse(2)")
|
||||||
#define PARALLEL_REGION _Pragma("omp parallel")
|
#define PARALLEL_REGION _Pragma("omp parallel")
|
||||||
#define PARALLEL_CRITICAL _Pragma("omp critical")
|
#define PARALLEL_CRITICAL _Pragma("omp critical")
|
||||||
#else
|
#else
|
||||||
|
@ -368,8 +368,10 @@ void Grid_init(int *argc,char ***argv)
|
|||||||
}
|
}
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-overlap") ){
|
||||||
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsAndCompute;
|
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsAndCompute;
|
||||||
|
QCD::StaggeredKernelsStatic::Comms = QCD::StaggeredKernelsStatic::CommsAndCompute;
|
||||||
} else {
|
} else {
|
||||||
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
|
QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
|
||||||
|
QCD::StaggeredKernelsStatic::Comms = QCD::StaggeredKernelsStatic::CommsThenCompute;
|
||||||
}
|
}
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-concurrent") ){
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-concurrent") ){
|
||||||
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyConcurrent);
|
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyConcurrent);
|
||||||
@ -385,6 +387,7 @@ void Grid_init(int *argc,char ***argv)
|
|||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){
|
||||||
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads");
|
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads");
|
||||||
GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads);
|
GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads);
|
||||||
|
assert(CartesianCommunicator::nCommThreads > 0);
|
||||||
}
|
}
|
||||||
if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
|
if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
|
||||||
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
|
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
|
||||||
|
@ -167,7 +167,7 @@ int main (int argc, char ** argv) {
|
|||||||
RealD mass = Params.mass;
|
RealD mass = Params.mass;
|
||||||
RealD M5 = Params.M5;
|
RealD M5 = Params.M5;
|
||||||
std::vector<int> blockSize = Params.blockSize;
|
std::vector<int> blockSize = Params.blockSize;
|
||||||
std::vector<int> latt({16,16,16,16});
|
std::vector<int> latt({32,32,32,32});
|
||||||
uint64_t vol = Ls*latt[0]*latt[1]*latt[2]*latt[3];
|
uint64_t vol = Ls*latt[0]*latt[1]*latt[2]*latt[3];
|
||||||
double mat_flop= 2.0*1320.0*vol;
|
double mat_flop= 2.0*1320.0*vol;
|
||||||
// Grids
|
// Grids
|
||||||
|
@ -141,6 +141,7 @@ int main (int argc, char ** argv)
|
|||||||
t1=usecond();
|
t1=usecond();
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "Called Ds ASM"<<std::endl;
|
std::cout<<GridLogMessage << "Called Ds ASM"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "norm src "<< norm2(src)<<std::endl;
|
||||||
std::cout<<GridLogMessage << "norm result "<< norm2(tmp)<<std::endl;
|
std::cout<<GridLogMessage << "norm result "<< norm2(tmp)<<std::endl;
|
||||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||||
|
|
||||||
@ -160,7 +161,8 @@ int main (int argc, char ** argv)
|
|||||||
localConvert(sresult,tmp);
|
localConvert(sresult,tmp);
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "Called sDs unroll"<<std::endl;
|
std::cout<<GridLogMessage << "Called sDs unroll"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "norm result "<< norm2(sresult)<<std::endl;
|
std::cout<<GridLogMessage << "norm ssrc "<< norm2(ssrc)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "norm sresult "<< norm2(sresult)<<std::endl;
|
||||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||||
|
|
||||||
|
|
||||||
@ -181,6 +183,7 @@ int main (int argc, char ** argv)
|
|||||||
localConvert(sresult,tmp);
|
localConvert(sresult,tmp);
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "Called sDs asm"<<std::endl;
|
std::cout<<GridLogMessage << "Called sDs asm"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "norm ssrc "<< norm2(ssrc)<<std::endl;
|
||||||
std::cout<<GridLogMessage << "norm result "<< norm2(sresult)<<std::endl;
|
std::cout<<GridLogMessage << "norm result "<< norm2(sresult)<<std::endl;
|
||||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)*extra<<std::endl;
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)*extra<<std::endl;
|
||||||
|
|
||||||
|
196
tests/core/Test_staggered5DvecF.cc
Normal file
196
tests/core/Test_staggered5DvecF.cc
Normal file
@ -0,0 +1,196 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./benchmarks/Benchmark_wilson.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid/Grid.h>
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
using namespace Grid;
|
||||||
|
using namespace Grid::QCD;
|
||||||
|
|
||||||
|
int main (int argc, char ** argv)
|
||||||
|
{
|
||||||
|
Grid_init(&argc,&argv);
|
||||||
|
|
||||||
|
std::vector<int> latt_size = GridDefaultLatt();
|
||||||
|
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
|
||||||
|
std::vector<int> mpi_layout = GridDefaultMpi();
|
||||||
|
|
||||||
|
const int Ls=16;
|
||||||
|
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
|
||||||
|
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||||
|
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||||
|
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
||||||
|
|
||||||
|
std::cout << GridLogMessage << "Making s innermost grids"<<std::endl;
|
||||||
|
GridCartesian * sUGrid = SpaceTimeGrid::makeFourDimDWFGrid(GridDefaultLatt(),GridDefaultMpi());
|
||||||
|
GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
|
||||||
|
GridCartesian * sFGrid = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
|
||||||
|
GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
|
||||||
|
|
||||||
|
int threads = GridThread::GetThreads();
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
|
||||||
|
|
||||||
|
std::vector<int> seeds({1,2,3,4});
|
||||||
|
|
||||||
|
GridParallelRNG pRNG4(UGrid);
|
||||||
|
GridParallelRNG pRNG5(FGrid);
|
||||||
|
pRNG4.SeedFixedIntegers(seeds);
|
||||||
|
pRNG5.SeedFixedIntegers(seeds);
|
||||||
|
|
||||||
|
typedef typename ImprovedStaggeredFermion5DF::FermionField FermionField;
|
||||||
|
typedef typename ImprovedStaggeredFermion5DF::ComplexField ComplexField;
|
||||||
|
typename ImprovedStaggeredFermion5DF::ImplParams params;
|
||||||
|
|
||||||
|
FermionField src (FGrid);
|
||||||
|
random(pRNG5,src);
|
||||||
|
/*
|
||||||
|
std::vector<int> site({0,1,2,0,0});
|
||||||
|
ColourVector cv = zero;
|
||||||
|
cv()()(0)=1.0;
|
||||||
|
src = zero;
|
||||||
|
pokeSite(cv,src,site);
|
||||||
|
*/
|
||||||
|
FermionField result(FGrid); result=zero;
|
||||||
|
FermionField tmp(FGrid); tmp=zero;
|
||||||
|
FermionField err(FGrid); tmp=zero;
|
||||||
|
FermionField phi (FGrid); random(pRNG5,phi);
|
||||||
|
FermionField chi (FGrid); random(pRNG5,chi);
|
||||||
|
|
||||||
|
LatticeGaugeFieldF Umu(UGrid);
|
||||||
|
SU3::HotConfiguration(pRNG4,Umu);
|
||||||
|
|
||||||
|
/*
|
||||||
|
for(int mu=1;mu<4;mu++){
|
||||||
|
auto tmp = PeekIndex<LorentzIndex>(Umu,mu);
|
||||||
|
tmp = zero;
|
||||||
|
PokeIndex<LorentzIndex>(Umu,tmp,mu);
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
double volume=Ls;
|
||||||
|
for(int mu=0;mu<Nd;mu++){
|
||||||
|
volume=volume*latt_size[mu];
|
||||||
|
}
|
||||||
|
|
||||||
|
RealD mass=0.1;
|
||||||
|
RealD c1=9.0/8.0;
|
||||||
|
RealD c2=-1.0/24.0;
|
||||||
|
RealD u0=1.0;
|
||||||
|
|
||||||
|
ImprovedStaggeredFermion5DF Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0,params);
|
||||||
|
ImprovedStaggeredFermionVec5dF sDs(Umu,Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,mass,c1,c2,u0,params);
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<"= Testing Dhop against cshift implementation "<<std::endl;
|
||||||
|
std::cout<<GridLogMessage<<"=========================================================="<<std::endl;
|
||||||
|
|
||||||
|
int ncall=1000;
|
||||||
|
int ncall1=1000;
|
||||||
|
double t0(0),t1(0);
|
||||||
|
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "Calling staggered operator"<<std::endl;
|
||||||
|
t0=usecond();
|
||||||
|
for(int i=0;i<ncall1;i++){
|
||||||
|
Ds.Dhop(src,result,0);
|
||||||
|
}
|
||||||
|
t1=usecond();
|
||||||
|
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "Called Ds"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "Calling vectorised staggered operator"<<std::endl;
|
||||||
|
|
||||||
|
#ifdef AVX512
|
||||||
|
QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptInlineAsm;
|
||||||
|
#else
|
||||||
|
QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptGeneric;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
t0=usecond();
|
||||||
|
for(int i=0;i<ncall1;i++){
|
||||||
|
Ds.Dhop(src,tmp,0);
|
||||||
|
}
|
||||||
|
t1=usecond();
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "Called Ds ASM"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "norm src "<< norm2(src)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "norm result "<< norm2(tmp)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||||
|
|
||||||
|
err = tmp-result;
|
||||||
|
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
||||||
|
|
||||||
|
|
||||||
|
FermionField ssrc (sFGrid); localConvert(src,ssrc);
|
||||||
|
FermionField sresult(sFGrid); sresult=zero;
|
||||||
|
|
||||||
|
QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptHandUnroll;
|
||||||
|
t0=usecond();
|
||||||
|
for(int i=0;i<ncall1;i++){
|
||||||
|
sDs.Dhop(ssrc,sresult,0);
|
||||||
|
}
|
||||||
|
t1=usecond();
|
||||||
|
localConvert(sresult,tmp);
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "Called sDs unroll"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "norm ssrc "<< norm2(ssrc)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "norm sresult "<< norm2(sresult)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef AVX512
|
||||||
|
QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptInlineAsm;
|
||||||
|
#else
|
||||||
|
QCD::StaggeredKernelsStatic::Opt=QCD::StaggeredKernelsStatic::OptGeneric;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
err = tmp-result;
|
||||||
|
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
||||||
|
int extra=1;
|
||||||
|
t0=usecond();
|
||||||
|
for(int i=0;i<ncall1*extra;i++){
|
||||||
|
sDs.Dhop(ssrc,sresult,0);
|
||||||
|
}
|
||||||
|
t1=usecond();
|
||||||
|
localConvert(sresult,tmp);
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "Called sDs asm"<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "norm ssrc "<< norm2(ssrc)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "norm result "<< norm2(sresult)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)*extra<<std::endl;
|
||||||
|
|
||||||
|
err = tmp-result;
|
||||||
|
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Grid_finalize();
|
||||||
|
}
|
@ -34,6 +34,8 @@ class ScalarActionParameters : Serializable {
|
|||||||
double, lambda,
|
double, lambda,
|
||||||
double, g);
|
double, g);
|
||||||
|
|
||||||
|
ScalarActionParameters() = default;
|
||||||
|
|
||||||
template <class ReaderClass >
|
template <class ReaderClass >
|
||||||
ScalarActionParameters(Reader<ReaderClass>& Reader){
|
ScalarActionParameters(Reader<ReaderClass>& Reader){
|
||||||
read(Reader, "ScalarAction", *this);
|
read(Reader, "ScalarAction", *this);
|
||||||
@ -124,10 +126,13 @@ int main(int argc, char **argv) {
|
|||||||
ScalarGrid.set_rb(new GridRedBlackCartesian(ScalarGrid.get_full()));
|
ScalarGrid.set_rb(new GridRedBlackCartesian(ScalarGrid.get_full()));
|
||||||
TheHMC.Resources.AddGrid("scalar", ScalarGrid);
|
TheHMC.Resources.AddGrid("scalar", ScalarGrid);
|
||||||
std::cout << "Lattice size : " << GridDefaultLatt() << std::endl;
|
std::cout << "Lattice size : " << GridDefaultLatt() << std::endl;
|
||||||
|
|
||||||
|
ScalarActionParameters SPar(Reader);
|
||||||
|
|
||||||
// Checkpointer definition
|
// Checkpointer definition
|
||||||
CheckpointerParameters CPparams(Reader);
|
CheckpointerParameters CPparams(Reader);
|
||||||
TheHMC.Resources.LoadBinaryCheckpointer(CPparams);
|
//TheHMC.Resources.LoadBinaryCheckpointer(CPparams);
|
||||||
|
TheHMC.Resources.LoadScidacCheckpointer(CPparams, SPar);
|
||||||
|
|
||||||
RNGModuleParameters RNGpar(Reader);
|
RNGModuleParameters RNGpar(Reader);
|
||||||
TheHMC.Resources.SetRNGSeeds(RNGpar);
|
TheHMC.Resources.SetRNGSeeds(RNGpar);
|
||||||
@ -140,7 +145,6 @@ int main(int argc, char **argv) {
|
|||||||
// Collect actions, here use more encapsulation
|
// Collect actions, here use more encapsulation
|
||||||
|
|
||||||
// Scalar action in adjoint representation
|
// Scalar action in adjoint representation
|
||||||
ScalarActionParameters SPar(Reader);
|
|
||||||
ScalarAction Saction(SPar.mass_squared, SPar.lambda, SPar.g);
|
ScalarAction Saction(SPar.mass_squared, SPar.lambda, SPar.g);
|
||||||
|
|
||||||
// Collect actions
|
// Collect actions
|
||||||
|
@ -33,6 +33,7 @@ namespace Grid{
|
|||||||
GRID_SERIALIZABLE_CLASS_MEMBERS(ActionParameters,
|
GRID_SERIALIZABLE_CLASS_MEMBERS(ActionParameters,
|
||||||
double, beta)
|
double, beta)
|
||||||
|
|
||||||
|
ActionParameters() = default;
|
||||||
|
|
||||||
template <class ReaderClass >
|
template <class ReaderClass >
|
||||||
ActionParameters(Reader<ReaderClass>& Reader){
|
ActionParameters(Reader<ReaderClass>& Reader){
|
||||||
@ -68,11 +69,15 @@ int main(int argc, char **argv) {
|
|||||||
}
|
}
|
||||||
Serialiser Reader(TheHMC.ParameterFile);
|
Serialiser Reader(TheHMC.ParameterFile);
|
||||||
|
|
||||||
|
// Read parameters from input file
|
||||||
|
ActionParameters WilsonPar(Reader);
|
||||||
|
|
||||||
// Checkpointer definition
|
// Checkpointer definition
|
||||||
CheckpointerParameters CPparams(Reader);
|
CheckpointerParameters CPparams(Reader);
|
||||||
TheHMC.Resources.LoadNerscCheckpointer(CPparams);
|
//TheHMC.Resources.LoadNerscCheckpointer(CPparams);
|
||||||
|
|
||||||
|
// Store metadata in the Scidac checkpointer
|
||||||
|
TheHMC.Resources.LoadScidacCheckpointer(CPparams, WilsonPar);
|
||||||
|
|
||||||
RNGModuleParameters RNGpar(Reader);
|
RNGModuleParameters RNGpar(Reader);
|
||||||
TheHMC.Resources.SetRNGSeeds(RNGpar);
|
TheHMC.Resources.SetRNGSeeds(RNGpar);
|
||||||
@ -91,8 +96,6 @@ int main(int argc, char **argv) {
|
|||||||
// need wrappers of the fermionic classes
|
// need wrappers of the fermionic classes
|
||||||
// that have a complex construction
|
// that have a complex construction
|
||||||
// standard
|
// standard
|
||||||
ActionParameters WilsonPar(Reader);
|
|
||||||
//RealD beta = 6.4 ;
|
|
||||||
WilsonGaugeActionR Waction(WilsonPar.beta);
|
WilsonGaugeActionR Waction(WilsonPar.beta);
|
||||||
|
|
||||||
ActionLevel<HMCWrapper::Field> Level1(1);
|
ActionLevel<HMCWrapper::Field> Level1(1);
|
||||||
|
@ -74,8 +74,16 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);
|
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);
|
||||||
|
|
||||||
|
double volume=1;
|
||||||
|
for(int mu=0;mu<Nd;mu++){
|
||||||
|
volume=volume*latt_size[mu];
|
||||||
|
}
|
||||||
|
|
||||||
RealD mass=0.003;
|
RealD mass=0.003;
|
||||||
ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass);
|
RealD c1=9.0/8.0;
|
||||||
|
RealD c2=-1.0/24.0;
|
||||||
|
RealD u0=1.0;
|
||||||
|
ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0);
|
||||||
SchurStaggeredOperator<ImprovedStaggeredFermion5DR,FermionField> HermOp(Ds);
|
SchurStaggeredOperator<ImprovedStaggeredFermion5DR,FermionField> HermOp(Ds);
|
||||||
|
|
||||||
ConjugateGradient<FermionField> CG(1.0e-8,10000);
|
ConjugateGradient<FermionField> CG(1.0e-8,10000);
|
||||||
@ -87,14 +95,26 @@ int main (int argc, char ** argv)
|
|||||||
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
|
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
|
||||||
std::cout << GridLogMessage << " Calling 4d CG "<<std::endl;
|
std::cout << GridLogMessage << " Calling 4d CG "<<std::endl;
|
||||||
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
|
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
|
||||||
ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass);
|
ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass,c1,c2,u0);
|
||||||
SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOp4d(Ds4d);
|
SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOp4d(Ds4d);
|
||||||
FermionField src4d(UGrid); random(pRNG,src4d);
|
FermionField src4d(UGrid); random(pRNG,src4d);
|
||||||
FermionField src4d_o(UrbGrid); pickCheckerboard(Odd,src4d_o,src4d);
|
FermionField src4d_o(UrbGrid); pickCheckerboard(Odd,src4d_o,src4d);
|
||||||
FermionField result4d_o(UrbGrid);
|
FermionField result4d_o(UrbGrid);
|
||||||
|
|
||||||
|
double deodoe_flops=(16*(3*(6+8+8)) + 15*3*2)*volume; // == 66*16 + == 1146
|
||||||
result4d_o=zero;
|
result4d_o=zero;
|
||||||
CG(HermOp4d,src4d_o,result4d_o);
|
{
|
||||||
|
double t1=usecond();
|
||||||
|
CG(HermOp4d,src4d_o,result4d_o);
|
||||||
|
double t2=usecond();
|
||||||
|
double ncall=CG.IterationsToComplete;
|
||||||
|
double flops = deodoe_flops * ncall;
|
||||||
|
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
|
||||||
|
HermOp4d.Report();
|
||||||
|
}
|
||||||
|
Ds4d.Report();
|
||||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||||
|
|
||||||
|
|
||||||
@ -103,7 +123,17 @@ int main (int argc, char ** argv)
|
|||||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||||
Ds.ZeroCounters();
|
Ds.ZeroCounters();
|
||||||
result_o=zero;
|
result_o=zero;
|
||||||
CG(HermOp,src_o,result_o);
|
{
|
||||||
|
double t1=usecond();
|
||||||
|
CG(HermOp,src_o,result_o);
|
||||||
|
double t2=usecond();
|
||||||
|
double ncall=CG.IterationsToComplete*Ls;
|
||||||
|
double flops = deodoe_flops * ncall;
|
||||||
|
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
|
||||||
|
HermOp.Report();
|
||||||
|
}
|
||||||
Ds.Report();
|
Ds.Report();
|
||||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||||
|
|
||||||
@ -112,7 +142,18 @@ int main (int argc, char ** argv)
|
|||||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||||
Ds.ZeroCounters();
|
Ds.ZeroCounters();
|
||||||
result_o=zero;
|
result_o=zero;
|
||||||
mCG(HermOp,src_o,result_o);
|
{
|
||||||
|
double t1=usecond();
|
||||||
|
mCG(HermOp,src_o,result_o);
|
||||||
|
double t2=usecond();
|
||||||
|
double ncall=mCG.IterationsToComplete*Ls;
|
||||||
|
double flops = deodoe_flops * ncall;
|
||||||
|
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
|
||||||
|
HermOp.Report();
|
||||||
|
}
|
||||||
|
|
||||||
Ds.Report();
|
Ds.Report();
|
||||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||||
|
|
||||||
@ -121,7 +162,17 @@ int main (int argc, char ** argv)
|
|||||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||||
Ds.ZeroCounters();
|
Ds.ZeroCounters();
|
||||||
result_o=zero;
|
result_o=zero;
|
||||||
BCGrQ(HermOp,src_o,result_o);
|
{
|
||||||
|
double t1=usecond();
|
||||||
|
BCGrQ(HermOp,src_o,result_o);
|
||||||
|
double t2=usecond();
|
||||||
|
double ncall=BCGrQ.IterationsToComplete*Ls;
|
||||||
|
double flops = deodoe_flops * ncall;
|
||||||
|
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
|
||||||
|
HermOp.Report();
|
||||||
|
}
|
||||||
Ds.Report();
|
Ds.Report();
|
||||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||||
|
|
||||||
|
@ -74,7 +74,16 @@ int main (int argc, char ** argv)
|
|||||||
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);
|
LatticeGaugeField Umu(UGrid); SU3::HotConfiguration(pRNG,Umu);
|
||||||
|
|
||||||
RealD mass=0.003;
|
RealD mass=0.003;
|
||||||
ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass);
|
RealD c1=9.0/8.0;
|
||||||
|
RealD c2=-1.0/24.0;
|
||||||
|
RealD u0=1.0;
|
||||||
|
|
||||||
|
double volume=1;
|
||||||
|
for(int mu=0;mu<Nd;mu++){
|
||||||
|
volume=volume*latt_size[mu];
|
||||||
|
}
|
||||||
|
|
||||||
|
ImprovedStaggeredFermion5DR Ds(Umu,Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,c1,c2,u0);
|
||||||
MdagMLinearOperator<ImprovedStaggeredFermion5DR,FermionField> HermOp(Ds);
|
MdagMLinearOperator<ImprovedStaggeredFermion5DR,FermionField> HermOp(Ds);
|
||||||
|
|
||||||
ConjugateGradient<FermionField> CG(1.0e-8,10000);
|
ConjugateGradient<FermionField> CG(1.0e-8,10000);
|
||||||
@ -86,11 +95,23 @@ int main (int argc, char ** argv)
|
|||||||
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
|
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
|
||||||
std::cout << GridLogMessage << " Calling 4d CG "<<std::endl;
|
std::cout << GridLogMessage << " Calling 4d CG "<<std::endl;
|
||||||
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
|
std::cout << GridLogMessage << "****************************************************************** "<<std::endl;
|
||||||
ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass);
|
ImprovedStaggeredFermionR Ds4d(Umu,Umu,*UGrid,*UrbGrid,mass,c1,c2,u0);
|
||||||
MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp4d(Ds4d);
|
MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp4d(Ds4d);
|
||||||
FermionField src4d(UGrid); random(pRNG,src4d);
|
FermionField src4d(UGrid); random(pRNG,src4d);
|
||||||
FermionField result4d(UGrid); result4d=zero;
|
FermionField result4d(UGrid); result4d=zero;
|
||||||
CG(HermOp4d,src4d,result4d);
|
|
||||||
|
double deodoe_flops=(16*(3*(6+8+8)) + 15*3*2)*volume; // == 66*16 + == 1146
|
||||||
|
{
|
||||||
|
double t1=usecond();
|
||||||
|
CG(HermOp4d,src4d,result4d);
|
||||||
|
double t2=usecond();
|
||||||
|
double ncall=CG.IterationsToComplete;
|
||||||
|
double flops = deodoe_flops * ncall;
|
||||||
|
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||||
|
|
||||||
|
|
||||||
@ -98,9 +119,18 @@ int main (int argc, char ** argv)
|
|||||||
std::cout << GridLogMessage << " Calling 5d CG for "<<Ls <<" right hand sides" <<std::endl;
|
std::cout << GridLogMessage << " Calling 5d CG for "<<Ls <<" right hand sides" <<std::endl;
|
||||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||||
result=zero;
|
result=zero;
|
||||||
|
{
|
||||||
Ds.ZeroCounters();
|
Ds.ZeroCounters();
|
||||||
|
double t1=usecond();
|
||||||
CG(HermOp,src,result);
|
CG(HermOp,src,result);
|
||||||
|
double t2=usecond();
|
||||||
|
double ncall=CG.IterationsToComplete;
|
||||||
|
double flops = deodoe_flops * ncall;
|
||||||
|
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
|
||||||
Ds.Report();
|
Ds.Report();
|
||||||
|
}
|
||||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||||
|
|
||||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||||
@ -108,7 +138,16 @@ int main (int argc, char ** argv)
|
|||||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||||
result=zero;
|
result=zero;
|
||||||
Ds.ZeroCounters();
|
Ds.ZeroCounters();
|
||||||
|
{
|
||||||
|
double t1=usecond();
|
||||||
mCG(HermOp,src,result);
|
mCG(HermOp,src,result);
|
||||||
|
double t2=usecond();
|
||||||
|
double ncall=CG.IterationsToComplete;
|
||||||
|
double flops = deodoe_flops * ncall;
|
||||||
|
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
|
||||||
|
}
|
||||||
Ds.Report();
|
Ds.Report();
|
||||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||||
|
|
||||||
@ -117,7 +156,16 @@ int main (int argc, char ** argv)
|
|||||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||||
result=zero;
|
result=zero;
|
||||||
Ds.ZeroCounters();
|
Ds.ZeroCounters();
|
||||||
|
{
|
||||||
|
double t1=usecond();
|
||||||
BCGrQ(HermOp,src,result);
|
BCGrQ(HermOp,src,result);
|
||||||
|
double t2=usecond();
|
||||||
|
double ncall=CG.IterationsToComplete;
|
||||||
|
double flops = deodoe_flops * ncall;
|
||||||
|
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
|
||||||
|
}
|
||||||
Ds.Report();
|
Ds.Report();
|
||||||
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
std::cout << GridLogMessage << "************************************************************************ "<<std::endl;
|
||||||
|
|
||||||
|
@ -71,7 +71,10 @@ int main (int argc, char ** argv)
|
|||||||
}
|
}
|
||||||
|
|
||||||
RealD mass=0.003;
|
RealD mass=0.003;
|
||||||
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass);
|
RealD c1=9.0/8.0;
|
||||||
|
RealD c2=-1.0/24.0;
|
||||||
|
RealD u0=1.0;
|
||||||
|
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
|
||||||
|
|
||||||
FermionField res_o(&RBGrid);
|
FermionField res_o(&RBGrid);
|
||||||
FermionField src_o(&RBGrid);
|
FermionField src_o(&RBGrid);
|
||||||
@ -80,7 +83,19 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOpEO(Ds);
|
SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOpEO(Ds);
|
||||||
ConjugateGradient<FermionField> CG(1.0e-8,10000);
|
ConjugateGradient<FermionField> CG(1.0e-8,10000);
|
||||||
|
double t1=usecond();
|
||||||
CG(HermOpEO,src_o,res_o);
|
CG(HermOpEO,src_o,res_o);
|
||||||
|
double t2=usecond();
|
||||||
|
|
||||||
|
// Schur solver: uses DeoDoe => volume * 1146
|
||||||
|
double ncall=CG.IterationsToComplete;
|
||||||
|
double flops=(16*(3*(6+8+8)) + 15*3*2)*volume*ncall; // == 66*16 + == 1146
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
FermionField tmp(&RBGrid);
|
FermionField tmp(&RBGrid);
|
||||||
|
|
||||||
|
@ -65,7 +65,10 @@ int main (int argc, char ** argv)
|
|||||||
FermionField resid(&Grid);
|
FermionField resid(&Grid);
|
||||||
|
|
||||||
RealD mass=0.1;
|
RealD mass=0.1;
|
||||||
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass);
|
RealD c1=9.0/8.0;
|
||||||
|
RealD c2=-1.0/24.0;
|
||||||
|
RealD u0=1.0;
|
||||||
|
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
|
||||||
|
|
||||||
ConjugateGradient<FermionField> CG(1.0e-8,10000);
|
ConjugateGradient<FermionField> CG(1.0e-8,10000);
|
||||||
SchurRedBlackStaggeredSolve<FermionField> SchurSolver(CG);
|
SchurRedBlackStaggeredSolve<FermionField> SchurSolver(CG);
|
||||||
|
@ -73,7 +73,10 @@ int main (int argc, char ** argv)
|
|||||||
}
|
}
|
||||||
|
|
||||||
RealD mass=0.1;
|
RealD mass=0.1;
|
||||||
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass);
|
RealD c1=9.0/8.0;
|
||||||
|
RealD c2=-1.0/24.0;
|
||||||
|
RealD u0=1.0;
|
||||||
|
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
|
||||||
|
|
||||||
MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp(Ds);
|
MdagMLinearOperator<ImprovedStaggeredFermionR,FermionField> HermOp(Ds);
|
||||||
ConjugateGradient<FermionField> CG(1.0e-6,10000);
|
ConjugateGradient<FermionField> CG(1.0e-6,10000);
|
||||||
|
121
tests/solver/Test_staggered_multishift.cc
Normal file
121
tests/solver/Test_staggered_multishift.cc
Normal file
@ -0,0 +1,121 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./tests/solver/Test_staggered_multishift.cc
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
|
||||||
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
#include <Grid/Grid.h>
|
||||||
|
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
using namespace Grid;
|
||||||
|
using namespace Grid::QCD;
|
||||||
|
|
||||||
|
// Minimal wrapper holding a single member `internal` of type `d`.
// NOTE: not referenced anywhere else in this test file.
template<class d>
|
||||||
|
struct scal {
|
||||||
|
d internal;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Table of the four Gamma::Algebra identifiers GammaX, GammaY, GammaZ, GammaT, in that order.
// NOTE: main() in this file does not reference this table.
Gamma::Algebra Gmu [] = {
|
||||||
|
Gamma::Algebra::GammaX,
|
||||||
|
Gamma::Algebra::GammaY,
|
||||||
|
Gamma::Algebra::GammaZ,
|
||||||
|
Gamma::Algebra::GammaT
|
||||||
|
};
|
||||||
|
|
||||||
|
// Test driver: builds an ImprovedStaggeredFermionR operator on a hot SU3 gauge
// configuration, runs a multi-shift conjugate-gradient solve of the Schur
// staggered operator on the odd checkerboard of a random source, and logs the
// elapsed time together with an estimated flop count and flop rate.
int main (int argc, char ** argv)
|
||||||
|
{
|
||||||
|
typedef typename ImprovedStaggeredFermionR::FermionField FermionField;
|
||||||
|
// NOTE: params is declared here but not referenced again in this function.
typename ImprovedStaggeredFermionR::ImplParams params;
|
||||||
|
|
||||||
|
Grid_init(&argc,&argv);
|
||||||
|
|
||||||
|
// Lattice, SIMD and MPI layouts are taken from the library defaults.
std::vector<int> latt_size = GridDefaultLatt();
|
||||||
|
std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
|
||||||
|
std::vector<int> mpi_layout = GridDefaultMpi();
|
||||||
|
|
||||||
|
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
||||||
|
GridRedBlackCartesian RBGrid(&Grid);
|
||||||
|
|
||||||
|
// Fixed seeds, so the random gauge field and source are the same on every run.
std::vector<int> seeds({1,2,3,4});
|
||||||
|
GridParallelRNG pRNG(&Grid); pRNG.SeedFixedIntegers(seeds);
|
||||||
|
|
||||||
|
|
||||||
|
LatticeGaugeField Umu(&Grid); SU3::HotConfiguration(pRNG,Umu);
|
||||||
|
|
||||||
|
// volume = product of the Nd lattice extents; used below in the flop estimate.
double volume=1;
|
||||||
|
for(int mu=0;mu<Nd;mu++){
|
||||||
|
volume=volume*latt_size[mu];
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////
|
||||||
|
// sqrt: set up the approximation that the log message below calls x^(1/2)
|
||||||
|
////////////////////////////////////////
|
||||||
|
double lo=0.001;
|
||||||
|
double hi=1.0;
|
||||||
|
int precision=64;
|
||||||
|
int degree=10;
|
||||||
|
AlgRemez remez(lo,hi,precision);
|
||||||
|
// NOTE(review): the arguments (degree,1,2) presumably select the power 1/2 — confirm against AlgRemez.
remez.generateApprox(degree,1,2);
|
||||||
|
MultiShiftFunction Sqrt(remez,1.0e-6,false);
|
||||||
|
// This message is logged after generateApprox() has already been called.
std::cout<<GridLogMessage << "Generating degree "<<degree<<" for x^(1/2)"<<std::endl;
|
||||||
|
|
||||||
|
|
||||||
|
////////////////////////////////////////////
|
||||||
|
// Setup staggered
|
||||||
|
////////////////////////////////////////////
|
||||||
|
RealD mass=0.003;
|
||||||
|
RealD c1=9.0/8.0;
|
||||||
|
RealD c2=-1.0/24.0;
|
||||||
|
RealD u0=1.0;
|
||||||
|
|
||||||
|
// The same gauge field Umu is passed for both of the first two constructor arguments.
ImprovedStaggeredFermionR Ds(Umu,Umu,Grid,RBGrid,mass,c1,c2,u0);
|
||||||
|
SchurStaggeredOperator<ImprovedStaggeredFermionR,FermionField> HermOpEO(Ds);
|
||||||
|
|
||||||
|
// Random full-lattice source; only its Odd checkerboard is passed to the solver.
FermionField src(&Grid); random(pRNG,src);
|
||||||
|
FermionField src_o(&RBGrid);
|
||||||
|
pickCheckerboard(Odd,src_o,src);
|
||||||
|
|
||||||
|
|
||||||
|
/////////////////////////////////
|
||||||
|
//Multishift CG
|
||||||
|
/////////////////////////////////
|
||||||
|
// `degree` result fields, each constructed on RBGrid.
std::vector<FermionField> result(degree,&RBGrid);
|
||||||
|
ConjugateGradientMultiShift<FermionField> MSCG(10000,Sqrt);
|
||||||
|
|
||||||
|
double deodoe_flops=(1205+15*degree)*volume; // per-iteration flop estimate: (1205 + 15*degree) per site times volume. NOTE(review): the derivation of the 1205 constant is not shown here — confirm
|
||||||
|
|
||||||
|
// Time only the solver call itself.
double t1=usecond();
|
||||||
|
MSCG(HermOpEO,src_o,result);
|
||||||
|
double t2=usecond();
|
||||||
|
// Total flops = per-iteration estimate times the iteration count reported by the solver.
double ncall=MSCG.IterationsToComplete;
|
||||||
|
double flops = deodoe_flops * ncall;
|
||||||
|
std::cout<<GridLogMessage << "usec = "<< (t2-t1)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "flops = "<< flops<<std::endl;
|
||||||
|
// flops divided by elapsed time is logged as mflop/s; this assumes usecond() returns microseconds — confirm.
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t2-t1)<<std::endl;
|
||||||
|
// HermOpEO.Report();
|
||||||
|
|
||||||
|
Grid_finalize();
|
||||||
|
}
|
Reference in New Issue
Block a user