1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-14 05:07:05 +01:00

Compare commits

..

8 Commits

Author SHA1 Message Date
d68937654b Merge pull request #50 from waterret/develop
use sha256 based splittable rng
2016-10-20 16:40:31 +01:00
70f386f9c6 switch to use Output Feedback Split mode
https://github.com/waterret/RngState-OFS
2016-10-17 14:20:59 -04:00
89cda5971a update rng number algo 2016-10-17 13:31:31 -04:00
c39ec3b607 update license and if guards 2016-10-17 13:31:31 -04:00
8afcc8fb8b fix state size 2016-10-17 13:31:31 -04:00
1abbe2fd0c update rng-state, change output format 2016-10-17 13:31:31 -04:00
4fb37ececd fix sprng-sha256 seed with seq 2016-10-17 13:31:30 -04:00
71eaa7c79e use sha256 based splittable rng 2016-10-17 13:31:30 -04:00
815 changed files with 26878 additions and 141922 deletions

26
.gitignore vendored
View File

@ -9,7 +9,6 @@
################
*~
*#
*.sublime-*
# Precompiled Headers #
#######################
@ -48,9 +47,7 @@ Config.h.in
config.log
config.status
.deps
Make.inc
eigen.inc
Eigen.inc
*.inc
# http://www.gnu.org/software/autoconf #
########################################
@ -83,7 +80,6 @@ ltmain.sh
.Trashes
ehthumbs.db
Thumbs.db
.dirstamp
# build directory #
###################
@ -93,24 +89,16 @@ build*/*
#####################
*.xcodeproj/*
build.sh
.vscode
*.code-workspace
# Eigen source #
################
Grid/Eigen
Eigen/*
lib/Eigen/*
# FFTW source #
################
lib/fftw/*
# libtool macros #
##################
m4/lt*
m4/libtool.m4
# github pages #
################
gh-pages/
# generated sources #
#####################
Grid/qcd/spin/gamma-gen/*.h
Grid/qcd/spin/gamma-gen/*.cc
m4/libtool.m4

View File

@ -7,13 +7,64 @@ cache:
matrix:
include:
- os: osx
osx_image: xcode8.3
osx_image: xcode7.2
compiler: clang
env: PREC=single
- os: osx
osx_image: xcode8.3
compiler: clang
env: PREC=double
- compiler: gcc
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-4.9
- libmpfr-dev
- libgmp-dev
- libmpc-dev
- libopenmpi-dev
- openmpi-bin
- binutils-dev
env: VERSION=-4.9
- compiler: gcc
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-5
- libmpfr-dev
- libgmp-dev
- libmpc-dev
- libopenmpi-dev
- openmpi-bin
- binutils-dev
env: VERSION=-5
- compiler: clang
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-4.8
- libmpfr-dev
- libgmp-dev
- libmpc-dev
- libopenmpi-dev
- openmpi-bin
- binutils-dev
env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
- compiler: clang
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- g++-4.8
- libmpfr-dev
- libgmp-dev
- libmpc-dev
- libopenmpi-dev
- openmpi-bin
- binutils-dev
env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
before_install:
- export GRIDDIR=`pwd`
@ -21,41 +72,35 @@ before_install:
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc openssl; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install openmpi; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]] && [[ "$CC" == "gcc" ]]; then brew install gcc5; fi
install:
- export CWD=`pwd`
- echo $CWD
- export CC=$CC$VERSION
- export CXX=$CXX$VERSION
- echo $PATH
- which autoconf
- autoconf --version
- which automake
- automake --version
- which $CC
- $CC --version
- which $CXX
- $CXX --version
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export EXTRACONF='--with-openssl=/usr/local/opt/openssl'; fi
script:
- ./bootstrap.sh
- mkdir build
- cd build
- mkdir lime
- cd lime
- mkdir build
- cd build
- wget http://usqcd-software.github.io/downloads/c-lime/lime-1.3.2.tar.gz
- tar xf lime-1.3.2.tar.gz
- cd lime-1.3.2
- ./configure --prefix=$CWD/build/lime/install
- make -j4
- make install
- cd $CWD/build
- ../configure --enable-precision=$PREC --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
- ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=none
- make -j4
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
- make check
- ./benchmarks/Benchmark_dwf --threads 1
- echo make clean
- ../configure --enable-precision=double --enable-simd=SSE4 --enable-comms=none
- make -j4
- ./benchmarks/Benchmark_dwf --threads 1
- echo make clean
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export CXXFLAGS='-DMPI_UINT32_T=MPI_UNSIGNED -DMPI_UINT64_T=MPI_UNSIGNED_LONG'; fi
- ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto
- make -j4
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then mpirun -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi

View File

@ -1,37 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/DisableWarnings.h
Copyright (C) 2016
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef DISABLE_WARNINGS_H
#define DISABLE_WARNINGS_H
//disables and intel compiler specific warning (in json.hpp)
#pragma warning disable 488
#endif

View File

@ -1,29 +0,0 @@
#ifndef GRID_STD_H
#define GRID_STD_H
///////////////////
// Std C++ dependencies
///////////////////
#include <cassert>
#include <complex>
#include <vector>
#include <string>
#include <iostream>
#include <iomanip>
#include <random>
#include <functional>
#include <stdio.h>
#include <stdlib.h>
#include <stdio.h>
#include <signal.h>
#include <ctime>
#include <sys/time.h>
#include <chrono>
#include <zlib.h>
///////////////////
// Grid config
///////////////////
#include "Config.h"
#endif /* GRID_STD_H */

View File

@ -1,14 +0,0 @@
#pragma once
// Force Eigen to use MKL if Grid has been configured with --enable-mkl
#ifdef USE_MKL
#define EIGEN_USE_MKL_ALL
#endif
#if defined __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
#include <Grid/Eigen/Dense>
#if defined __GNUC__
#pragma GCC diagnostic pop
#endif

View File

@ -1,63 +0,0 @@
extra_sources=
extra_headers=
if BUILD_COMMS_MPI3
extra_sources+=communicator/Communicator_mpi3.cc
extra_sources+=communicator/Communicator_base.cc
extra_sources+=communicator/SharedMemoryMPI.cc
extra_sources+=communicator/SharedMemory.cc
endif
if BUILD_COMMS_NONE
extra_sources+=communicator/Communicator_none.cc
extra_sources+=communicator/Communicator_base.cc
extra_sources+=communicator/SharedMemoryNone.cc
extra_sources+=communicator/SharedMemory.cc
endif
if BUILD_HDF5
extra_sources+=serialisation/Hdf5IO.cc
extra_headers+=serialisation/Hdf5IO.h
extra_headers+=serialisation/Hdf5Type.h
endif
all: version-cache
version-cache:
@if [ `git status --porcelain | grep -v '??' | wc -l` -gt 0 ]; then\
a="uncommited changes";\
else\
a="clean";\
fi;\
echo "`git log -n 1 --format=format:"#define GITHASH \\"%H:%d $$a\\"%n" HEAD`" > vertmp;\
if [ -e version-cache ]; then\
d=`diff vertmp version-cache`;\
if [ "$${d}" != "" ]; then\
mv vertmp version-cache;\
rm -f Version.h;\
fi;\
else\
mv vertmp version-cache;\
rm -f Version.h;\
fi;\
rm -f vertmp
Version.h:
cp version-cache Version.h
.PHONY: version-cache
#
# Libraries
#
include Make.inc
include Eigen.inc
lib_LIBRARIES = libGrid.a
CCFILES += $(extra_sources)
HFILES += $(extra_headers) Config.h Version.h
libGrid_a_SOURCES = $(CCFILES)
libGrid_adir = $(includedir)/Grid
nobase_dist_pkginclude_HEADERS = $(HFILES) $(eigen_files) $(eigen_unsupp_files)

View File

@ -1,152 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/approx/Forecast.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef INCLUDED_FORECAST_H
#define INCLUDED_FORECAST_H
namespace Grid {
// Abstract base class.
// Takes a matrix (Mat), a source (phi), and a vector of Fields (chi)
// and returns a forecasted solution to the system D*psi = phi (psi).
template<class Matrix, class Field>
class Forecast
{
public:
virtual Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& chi) = 0;
};
// Implementation of Brower et al.'s chronological inverter (arXiv:hep-lat/9509012),
// used to forecast solutions across poles of the EOFA heatbath.
//
// Modified from CPS (cps_pp/src/util/dirac_op/d_op_base/comsrc/minresext.C)
template<class Matrix, class Field>
class ChronoForecast : public Forecast<Matrix,Field>
{
public:
Field operator()(Matrix &Mat, const Field& phi, const std::vector<Field>& prev_solns)
{
int degree = prev_solns.size();
Field chi(phi); // forecasted solution
// Trivial cases
if(degree == 0){ chi = zero; return chi; }
else if(degree == 1){ return prev_solns[0]; }
RealD dot;
ComplexD xp;
Field r(phi); // residual
Field Mv(phi);
std::vector<Field> v(prev_solns); // orthonormalized previous solutions
std::vector<Field> MdagMv(degree,phi);
// Array to hold the matrix elements
std::vector<std::vector<ComplexD>> G(degree, std::vector<ComplexD>(degree));
// Solution and source vectors
std::vector<ComplexD> a(degree);
std::vector<ComplexD> b(degree);
// Orthonormalize the vector basis
for(int i=0; i<degree; i++){
v[i] *= 1.0/std::sqrt(norm2(v[i]));
for(int j=i+1; j<degree; j++){ v[j] -= innerProduct(v[i],v[j]) * v[i]; }
}
// Perform sparse matrix multiplication and construct rhs
for(int i=0; i<degree; i++){
b[i] = innerProduct(v[i],phi);
Mat.M(v[i],Mv);
Mat.Mdag(Mv,MdagMv[i]);
G[i][i] = innerProduct(v[i],MdagMv[i]);
}
// Construct the matrix
for(int j=0; j<degree; j++){
for(int k=j+1; k<degree; k++){
G[j][k] = innerProduct(v[j],MdagMv[k]);
G[k][j] = std::conj(G[j][k]);
}}
// Gauss-Jordan elimination with partial pivoting
for(int i=0; i<degree; i++){
// Perform partial pivoting
int k = i;
for(int j=i+1; j<degree; j++){ if(std::abs(G[j][j]) > std::abs(G[k][k])){ k = j; } }
if(k != i){
xp = b[k];
b[k] = b[i];
b[i] = xp;
for(int j=0; j<degree; j++){
xp = G[k][j];
G[k][j] = G[i][j];
G[i][j] = xp;
}
}
// Convert matrix to upper triangular form
for(int j=i+1; j<degree; j++){
xp = G[j][i]/G[i][i];
b[j] -= xp * b[i];
for(int k=0; k<degree; k++){ G[j][k] -= xp*G[i][k]; }
}
}
// Use Gaussian elimination to solve equations and calculate initial guess
chi = zero;
r = phi;
for(int i=degree-1; i>=0; i--){
a[i] = 0.0;
for(int j=i+1; j<degree; j++){ a[i] += G[i][j] * a[j]; }
a[i] = (b[i]-a[i])/G[i][i];
chi += a[i]*v[i];
r -= a[i]*MdagMv[i];
}
RealD true_r(0.0);
ComplexD tmp;
for(int i=0; i<degree; i++){
tmp = -b[i];
for(int j=0; j<degree; j++){ tmp += G[i][j]*a[j]; }
tmp = std::conj(tmp)*tmp;
true_r += std::sqrt(tmp.real());
}
RealD error = std::sqrt(norm2(r)/norm2(phi));
std::cout << GridLogMessage << "ChronoForecast: |res|/|src| = " << error << std::endl;
return chi;
};
};
}
#endif

View File

@ -1,606 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/BlockConjugateGradient.h
Copyright (C) 2017
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_BLOCK_CONJUGATE_GRADIENT_H
#define GRID_BLOCK_CONJUGATE_GRADIENT_H
namespace Grid {
enum BlockCGtype { BlockCG, BlockCGrQ, CGmultiRHS };
//////////////////////////////////////////////////////////////////////////
// Block conjugate gradient. Dimension zero should be the block direction
//////////////////////////////////////////////////////////////////////////
template <class Field>
class BlockConjugateGradient : public OperatorFunction<Field> {
public:
typedef typename Field::scalar_type scomplex;
int blockDim ;
int Nblock;
BlockCGtype CGtype;
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
BlockConjugateGradient(BlockCGtype cgtype,int _Orthog,RealD tol, Integer maxit, bool err_on_no_conv = true)
: Tolerance(tol), CGtype(cgtype), blockDim(_Orthog), MaxIterations(maxit), ErrorOnNoConverge(err_on_no_conv)
{};
////////////////////////////////////////////////////////////////////////////////////////////////////
// Thin QR factorisation (google it)
////////////////////////////////////////////////////////////////////////////////////////////////////
void ThinQRfact (Eigen::MatrixXcd &m_rr,
Eigen::MatrixXcd &C,
Eigen::MatrixXcd &Cinv,
Field & Q,
const Field & R)
{
int Orthog = blockDim; // First dimension is block dim; this is an assumption
////////////////////////////////////////////////////////////////////////////////////////////////////
//Dimensions
// R_{ferm x Nblock} = Q_{ferm x Nblock} x C_{Nblock x Nblock} -> ferm x Nblock
//
// Rdag R = m_rr = Herm = L L^dag <-- Cholesky decomposition (LLT routine in Eigen)
//
// Q C = R => Q = R C^{-1}
//
// Want Ident = Q^dag Q = C^{-dag} R^dag R C^{-1} = C^{-dag} L L^dag C^{-1} = 1_{Nblock x Nblock}
//
// Set C = L^{dag}, and then Q^dag Q = ident
//
// Checks:
// Cdag C = Rdag R ; passes.
// QdagQ = 1 ; passes
////////////////////////////////////////////////////////////////////////////////////////////////////
sliceInnerProductMatrix(m_rr,R,R,Orthog);
// Force manifest hermitian to avoid rounding related
m_rr = 0.5*(m_rr+m_rr.adjoint());
#if 0
std::cout << " Calling Cholesky ldlt on m_rr " << m_rr <<std::endl;
Eigen::MatrixXcd L_ldlt = m_rr.ldlt().matrixL();
std::cout << " Called Cholesky ldlt on m_rr " << L_ldlt <<std::endl;
auto D_ldlt = m_rr.ldlt().vectorD();
std::cout << " Called Cholesky ldlt on m_rr " << D_ldlt <<std::endl;
#endif
// std::cout << " Calling Cholesky llt on m_rr " <<std::endl;
Eigen::MatrixXcd L = m_rr.llt().matrixL();
// std::cout << " Called Cholesky llt on m_rr " << L <<std::endl;
C = L.adjoint();
Cinv = C.inverse();
////////////////////////////////////////////////////////////////////////////////////////////////////
// Q = R C^{-1}
//
// Q_j = R_i Cinv(i,j)
//
// NB maddMatrix conventions are Right multiplication X[j] a[j,i] already
////////////////////////////////////////////////////////////////////////////////////////////////////
sliceMulMatrix(Q,Cinv,R,Orthog);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Call one of several implementations
////////////////////////////////////////////////////////////////////////////////////////////////////
void operator()(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
{
if ( CGtype == BlockCGrQ ) {
BlockCGrQsolve(Linop,Src,Psi);
} else if (CGtype == BlockCG ) {
BlockCGsolve(Linop,Src,Psi);
} else if (CGtype == CGmultiRHS ) {
CGmultiRHSsolve(Linop,Src,Psi);
} else {
assert(0);
}
}
////////////////////////////////////////////////////////////////////////////
// BlockCGrQ implementation:
//--------------------------
// X is guess/Solution
// B is RHS
// Solve A X_i = B_i ; i refers to Nblock index
////////////////////////////////////////////////////////////////////////////
void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
{
int Orthog = blockDim; // First dimension is block dim; this is an assumption
Nblock = B._grid->_fdimensions[Orthog];
std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
X.checkerboard = B.checkerboard;
conformable(X, B);
Field tmp(B);
Field Q(B);
Field D(B);
Field Z(B);
Field AD(B);
Eigen::MatrixXcd m_DZ = Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_M = Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_C = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_Cinv = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_S = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_Sinv = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_tmp = Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_tmp1 = Eigen::MatrixXcd::Identity(Nblock,Nblock);
// Initial residual computation & set up
std::vector<RealD> residuals(Nblock);
std::vector<RealD> ssq(Nblock);
sliceNorm(ssq,B,Orthog);
RealD sssum=0;
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
sliceNorm(residuals,B,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
sliceNorm(residuals,X,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
/************************************************************************
* Block conjugate gradient rQ (Sebastien Birk Thesis, after Dubrulle 2001)
************************************************************************
* Dimensions:
*
* X,B==(Nferm x Nblock)
* A==(Nferm x Nferm)
*
* Nferm = Nspin x Ncolour x Ncomplex x Nlattice_site
*
* QC = R = B-AX, D = Q ; QC => Thin QR factorisation (google it)
* for k:
* Z = AD
* M = [D^dag Z]^{-1}
* X = X + D MC
* QS = Q - ZM
* D = Q + D S^dag
* C = S C
*/
///////////////////////////////////////
// Initial block: initial search dir is guess
///////////////////////////////////////
std::cout << GridLogMessage<<"BlockCGrQ algorithm initialisation " <<std::endl;
//1. QC = R = B-AX, D = Q ; QC => Thin QR factorisation (google it)
Linop.HermOp(X, AD);
tmp = B - AD;
//std::cout << GridLogMessage << " initial tmp " << norm2(tmp)<< std::endl;
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
//std::cout << GridLogMessage << " initial Q " << norm2(Q)<< std::endl;
//std::cout << GridLogMessage << " m_rr " << m_rr<<std::endl;
//std::cout << GridLogMessage << " m_C " << m_C<<std::endl;
//std::cout << GridLogMessage << " m_Cinv " << m_Cinv<<std::endl;
D=Q;
std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
///////////////////////////////////////
// Timers
///////////////////////////////////////
GridStopWatch sliceInnerTimer;
GridStopWatch sliceMaddTimer;
GridStopWatch QRTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
//3. Z = AD
MatrixTimer.Start();
Linop.HermOp(D, Z);
MatrixTimer.Stop();
//std::cout << GridLogMessage << " norm2 Z " <<norm2(Z)<<std::endl;
//4. M = [D^dag Z]^{-1}
sliceInnerTimer.Start();
sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
sliceInnerTimer.Stop();
m_M = m_DZ.inverse();
//std::cout << GridLogMessage << " m_DZ " <<m_DZ<<std::endl;
//5. X = X + D MC
m_tmp = m_M * m_C;
sliceMaddTimer.Start();
sliceMaddMatrix(X,m_tmp, D,X,Orthog);
sliceMaddTimer.Stop();
//6. QS = Q - ZM
sliceMaddTimer.Start();
sliceMaddMatrix(tmp,m_M,Z,Q,Orthog,-1.0);
sliceMaddTimer.Stop();
QRTimer.Start();
ThinQRfact (m_rr, m_S, m_Sinv, Q, tmp);
QRTimer.Stop();
//7. D = Q + D S^dag
m_tmp = m_S.adjoint();
sliceMaddTimer.Start();
sliceMaddMatrix(D,m_tmp,D,Q,Orthog);
sliceMaddTimer.Stop();
//8. C = S C
m_C = m_S*m_C;
/*********************
* convergence monitor
*********************
*/
m_rr = m_C.adjoint() * m_C;
RealD max_resid=0;
RealD rrsum=0;
RealD rr;
for(int b=0;b<Nblock;b++) {
rrsum+=real(m_rr(b,b));
rr = real(m_rr(b,b))/ssq[b];
if ( rr > max_resid ) max_resid = rr;
}
std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
<<" ave "<<std::sqrt(rrsum/sssum) << " max "<< max_resid <<std::endl;
if ( max_resid < Tolerance*Tolerance ) {
SolverTimer.Stop();
std::cout << GridLogMessage<<"BlockCGrQ converged in "<<k<<" iterations"<<std::endl;
for(int b=0;b<Nblock;b++){
std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
}
std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
Linop.HermOp(X, AD);
AD = AD-B;
std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AD)/norm2(B)) <<std::endl;
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tInnerProd " << sliceInnerTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tThinQRfact " << QRTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
return;
}
}
std::cout << GridLogMessage << "BlockConjugateGradient(rQ) did NOT converge" << std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
}
//////////////////////////////////////////////////////////////////////////
// Block conjugate gradient; Original O'Leary Dimension zero should be the block direction
//////////////////////////////////////////////////////////////////////////
void BlockCGsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
{
int Orthog = blockDim; // First dimension is block dim; this is an assumption
Nblock = Src._grid->_fdimensions[Orthog];
std::cout<<GridLogMessage<<" Block Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
Psi.checkerboard = Src.checkerboard;
conformable(Psi, Src);
Field P(Src);
Field AP(Src);
Field R(Src);
Eigen::MatrixXcd m_pAp = Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_pAp_inv= Eigen::MatrixXcd::Identity(Nblock,Nblock);
Eigen::MatrixXcd m_rr = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_rr_inv = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_alpha = Eigen::MatrixXcd::Zero(Nblock,Nblock);
Eigen::MatrixXcd m_beta = Eigen::MatrixXcd::Zero(Nblock,Nblock);
// Initial residual computation & set up
std::vector<RealD> residuals(Nblock);
std::vector<RealD> ssq(Nblock);
sliceNorm(ssq,Src,Orthog);
RealD sssum=0;
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
sliceNorm(residuals,Src,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
sliceNorm(residuals,Psi,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
// Initial search dir is guess
Linop.HermOp(Psi, AP);
/************************************************************************
* Block conjugate gradient (Stephen Pickles, thesis 1995, pp 71, O Leary 1980)
************************************************************************
* O'Leary : R = B - A X
* O'Leary : P = M R ; preconditioner M = 1
* O'Leary : alpha = PAP^{-1} RMR
* O'Leary : beta = RMR^{-1}_old RMR_new
* O'Leary : X=X+Palpha
* O'Leary : R_new=R_old-AP alpha
* O'Leary : P=MR_new+P beta
*/
R = Src - AP;
P = R;
sliceInnerProductMatrix(m_rr,R,R,Orthog);
GridStopWatch sliceInnerTimer;
GridStopWatch sliceMaddTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
RealD rrsum=0;
for(int b=0;b<Nblock;b++) rrsum+=real(m_rr(b,b));
std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
<<" / "<<std::sqrt(rrsum/sssum) <<std::endl;
MatrixTimer.Start();
Linop.HermOp(P, AP);
MatrixTimer.Stop();
// Alpha
sliceInnerTimer.Start();
sliceInnerProductMatrix(m_pAp,P,AP,Orthog);
sliceInnerTimer.Stop();
m_pAp_inv = m_pAp.inverse();
m_alpha = m_pAp_inv * m_rr ;
// Psi, R update
sliceMaddTimer.Start();
sliceMaddMatrix(Psi,m_alpha, P,Psi,Orthog); // add alpha * P to psi
sliceMaddMatrix(R ,m_alpha,AP, R,Orthog,-1.0);// sub alpha * AP to resid
sliceMaddTimer.Stop();
// Beta
m_rr_inv = m_rr.inverse();
sliceInnerTimer.Start();
sliceInnerProductMatrix(m_rr,R,R,Orthog);
sliceInnerTimer.Stop();
m_beta = m_rr_inv *m_rr;
// Search update
sliceMaddTimer.Start();
sliceMaddMatrix(AP,m_beta,P,R,Orthog);
sliceMaddTimer.Stop();
P= AP;
/*********************
* convergence monitor
*********************
*/
RealD max_resid=0;
RealD rr;
for(int b=0;b<Nblock;b++){
rr = real(m_rr(b,b))/ssq[b];
if ( rr > max_resid ) max_resid = rr;
}
if ( max_resid < Tolerance*Tolerance ) {
SolverTimer.Stop();
std::cout << GridLogMessage<<"BlockCG converged in "<<k<<" iterations"<<std::endl;
for(int b=0;b<Nblock;b++){
std::cout << GridLogMessage<< "\t\tblock "<<b<<" computed resid "
<< std::sqrt(real(m_rr(b,b))/ssq[b])<<std::endl;
}
std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
Linop.HermOp(Psi, AP);
AP = AP-Src;
std::cout << GridLogMessage <<"\t True residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tInnerProd " << sliceInnerTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
return;
}
}
std::cout << GridLogMessage << "BlockConjugateGradient did NOT converge" << std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
}
//////////////////////////////////////////////////////////////////////////
// multiRHS conjugate gradient. Dimension zero should be the block direction
// Use this for spread out across nodes
//////////////////////////////////////////////////////////////////////////
void CGmultiRHSsolve(LinearOperatorBase<Field> &Linop, const Field &Src, Field &Psi)
{
int Orthog = blockDim; // First dimension is block dim
Nblock = Src._grid->_fdimensions[Orthog];
std::cout<<GridLogMessage<<"MultiRHS Conjugate Gradient : Orthog "<<Orthog<<" Nblock "<<Nblock<<std::endl;
Psi.checkerboard = Src.checkerboard;
conformable(Psi, Src);
Field P(Src);
Field AP(Src);
Field R(Src);
std::vector<ComplexD> v_pAp(Nblock);
std::vector<RealD> v_rr (Nblock);
std::vector<RealD> v_rr_inv(Nblock);
std::vector<RealD> v_alpha(Nblock);
std::vector<RealD> v_beta(Nblock);
// Initial residual computation & set up
std::vector<RealD> residuals(Nblock);
std::vector<RealD> ssq(Nblock);
sliceNorm(ssq,Src,Orthog);
RealD sssum=0;
for(int b=0;b<Nblock;b++) sssum+=ssq[b];
sliceNorm(residuals,Src,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
sliceNorm(residuals,Psi,Orthog);
for(int b=0;b<Nblock;b++){ assert(std::isnan(residuals[b])==0); }
// Initial search dir is guess
Linop.HermOp(Psi, AP);
R = Src - AP;
P = R;
sliceNorm(v_rr,R,Orthog);
GridStopWatch sliceInnerTimer;
GridStopWatch sliceMaddTimer;
GridStopWatch sliceNormTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k;
for (k = 1; k <= MaxIterations; k++) {
RealD rrsum=0;
for(int b=0;b<Nblock;b++) rrsum+=real(v_rr[b]);
std::cout << GridLogIterative << "\titeration "<<k<<" rr_sum "<<rrsum<<" ssq_sum "<< sssum
<<" / "<<std::sqrt(rrsum/sssum) <<std::endl;
MatrixTimer.Start();
Linop.HermOp(P, AP);
MatrixTimer.Stop();
// Alpha
sliceInnerTimer.Start();
sliceInnerProductVector(v_pAp,P,AP,Orthog);
sliceInnerTimer.Stop();
for(int b=0;b<Nblock;b++){
v_alpha[b] = v_rr[b]/real(v_pAp[b]);
}
// Psi, R update
sliceMaddTimer.Start();
sliceMaddVector(Psi,v_alpha, P,Psi,Orthog); // add alpha * P to psi
sliceMaddVector(R ,v_alpha,AP, R,Orthog,-1.0);// sub alpha * AP to resid
sliceMaddTimer.Stop();
// Beta
for(int b=0;b<Nblock;b++){
v_rr_inv[b] = 1.0/v_rr[b];
}
sliceNormTimer.Start();
sliceNorm(v_rr,R,Orthog);
sliceNormTimer.Stop();
for(int b=0;b<Nblock;b++){
v_beta[b] = v_rr_inv[b] *v_rr[b];
}
// Search update
sliceMaddTimer.Start();
sliceMaddVector(P,v_beta,P,R,Orthog);
sliceMaddTimer.Stop();
/*********************
* convergence monitor
*********************
*/
RealD max_resid=0;
for(int b=0;b<Nblock;b++){
RealD rr = v_rr[b]/ssq[b];
if ( rr > max_resid ) max_resid = rr;
}
if ( max_resid < Tolerance*Tolerance ) {
SolverTimer.Stop();
std::cout << GridLogMessage<<"MultiRHS solver converged in " <<k<<" iterations"<<std::endl;
for(int b=0;b<Nblock;b++){
std::cout << GridLogMessage<< "\t\tBlock "<<b<<" computed resid "<< std::sqrt(v_rr[b]/ssq[b])<<std::endl;
}
std::cout << GridLogMessage<<"\tMax residual is "<<std::sqrt(max_resid)<<std::endl;
Linop.HermOp(Psi, AP);
AP = AP-Src;
std::cout <<GridLogMessage << "\tTrue residual is " << std::sqrt(norm2(AP)/norm2(Src)) <<std::endl;
std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tInnerProd " << sliceInnerTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tNorm " << sliceNormTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMaddMatrix " << sliceMaddTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
return;
}
}
std::cout << GridLogMessage << "MultiRHSConjugateGradient did NOT converge" << std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
}
};
}
#endif

View File

@ -1,256 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientReliableUpdate.h
Copyright (C) 2015
Author: Christopher Kelly <ckelly@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
#define GRID_CONJUGATE_GRADIENT_RELIABLE_UPDATE_H
namespace Grid {
template<class FieldD,class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class ConjugateGradientReliableUpdate : public LinearFunction<FieldD> {
public:
bool ErrorOnNoConverge; // throw an assert when the CG fails to converge.
// Defaults true.
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
Integer ReliableUpdatesPerformed;
bool DoFinalCleanup; //Final DP cleanup, defaults to true
Integer IterationsToCleanup; //Final DP cleanup step iterations
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
GridBase* SinglePrecGrid;
RealD Delta; //reliable update parameter
//Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
LinearOperatorBase<FieldF> *Linop_fallback;
RealD fallback_transition_tol;
ConjugateGradientReliableUpdate(RealD tol, Integer maxit, RealD _delta, GridBase* _sp_grid, LinearOperatorBase<FieldF> &_Linop_f, LinearOperatorBase<FieldD> &_Linop_d, bool err_on_no_conv = true)
: Tolerance(tol),
MaxIterations(maxit),
Delta(_delta),
Linop_f(_Linop_f),
Linop_d(_Linop_d),
SinglePrecGrid(_sp_grid),
ErrorOnNoConverge(err_on_no_conv),
DoFinalCleanup(true),
Linop_fallback(NULL)
{};
void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
Linop_fallback = &_Linop_fallback;
fallback_transition_tol = _fallback_transition_tol;
}
void operator()(const FieldD &src, FieldD &psi) {
LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f;
bool using_fallback = false;
psi.checkerboard = src.checkerboard;
conformable(psi, src);
RealD cp, c, a, d, b, ssq, qq, b_pred;
FieldD p(src);
FieldD mmp(src);
FieldD r(src);
// Initial residual computation & set up
RealD guess = norm2(psi);
assert(std::isnan(guess) == 0);
Linop_d.HermOpAndNorm(psi, mmp, d, b);
r = src - mmp;
p = r;
a = norm2(p);
cp = a;
ssq = norm2(src);
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: guess " << guess << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: src " << ssq << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: mp " << d << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: mmp " << b << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: cp,r " << cp << std::endl;
std::cout << GridLogIterative << std::setprecision(4) << "ConjugateGradientReliableUpdate: p " << a << std::endl;
RealD rsq = Tolerance * Tolerance * ssq;
// Check if guess is really REALLY good :)
if (cp <= rsq) {
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate guess was REALLY good\n";
std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
return;
}
//Single prec initialization
FieldF r_f(SinglePrecGrid);
r_f.checkerboard = r.checkerboard;
precisionChange(r_f, r);
FieldF psi_f(r_f);
psi_f = zero;
FieldF p_f(r_f);
FieldF mmp_f(r_f);
RealD MaxResidSinceLastRelUp = cp; //initial residual
std::cout << GridLogIterative << std::setprecision(4)
<< "ConjugateGradient: k=0 residual " << cp << " target " << rsq << std::endl;
GridStopWatch LinalgTimer;
GridStopWatch MatrixTimer;
GridStopWatch SolverTimer;
SolverTimer.Start();
int k = 0;
int l = 0;
for (k = 1; k <= MaxIterations; k++) {
c = cp;
MatrixTimer.Start();
Linop_f_use->HermOpAndNorm(p_f, mmp_f, d, qq);
MatrixTimer.Stop();
LinalgTimer.Start();
a = c / d;
b_pred = a * (a * qq - d) / c;
cp = axpy_norm(r_f, -a, mmp_f, r_f);
b = cp / c;
// Fuse these loops ; should be really easy
psi_f = a * p_f + psi_f;
//p_f = p_f * b + r_f;
LinalgTimer.Stop();
std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: Iteration " << k
<< " residual " << cp << " target " << rsq << std::endl;
std::cout << GridLogDebug << "a = "<< a << " b_pred = "<< b_pred << " b = "<< b << std::endl;
std::cout << GridLogDebug << "qq = "<< qq << " d = "<< d << " c = "<< c << std::endl;
if(cp > MaxResidSinceLastRelUp){
std::cout << GridLogIterative << "ConjugateGradientReliableUpdate: updating MaxResidSinceLastRelUp : " << MaxResidSinceLastRelUp << " -> " << cp << std::endl;
MaxResidSinceLastRelUp = cp;
}
// Stopping condition
if (cp <= rsq) {
//Although not written in the paper, I assume that I have to add on the final solution
precisionChange(mmp, psi_f);
psi = psi + mmp;
SolverTimer.Stop();
Linop_d.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src;
RealD srcnorm = sqrt(norm2(src));
RealD resnorm = sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate Converged on iteration " << k << " after " << l << " reliable updates" << std::endl;
std::cout << GridLogMessage << "\tComputed residual " << sqrt(cp / ssq)<<std::endl;
std::cout << GridLogMessage << "\tTrue residual " << true_residual<<std::endl;
std::cout << GridLogMessage << "\tTarget " << Tolerance << std::endl;
std::cout << GridLogMessage << "Time breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
IterationsToComplete = k;
ReliableUpdatesPerformed = l;
if(DoFinalCleanup){
//Do a final CG to cleanup
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate performing final cleanup.\n";
ConjugateGradient<FieldD> CG(Tolerance,MaxIterations);
CG.ErrorOnNoConverge = ErrorOnNoConverge;
CG(Linop_d,src,psi);
IterationsToCleanup = CG.IterationsToComplete;
}
else if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate complete.\n";
return;
}
else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
<< cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
precisionChange(mmp, psi_f);
psi = psi + mmp;
Linop_d.HermOpAndNorm(psi, mmp, d, qq);
r = src - mmp;
psi_f = zero;
precisionChange(r_f, r);
cp = norm2(r);
MaxResidSinceLastRelUp = cp;
b = cp/c;
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate new residual " << cp << std::endl;
l = l+1;
}
p_f = p_f * b + r_f; //update search vector after reliable update appears to help convergence
if(!using_fallback && Linop_fallback != NULL && cp < fallback_transition_tol){
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate switching to fallback linear operator on iteration " << k << " at residual " << cp << std::endl;
Linop_f_use = Linop_fallback;
using_fallback = true;
}
}
std::cout << GridLogMessage << "ConjugateGradientReliableUpdate did NOT converge"
<< std::endl;
if (ErrorOnNoConverge) assert(0);
IterationsToComplete = k;
ReliableUpdatesPerformed = l;
}
};
};
#endif

View File

@ -1,104 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_DEFLATION_H
#define GRID_DEFLATION_H
namespace Grid {
template<class Field>
class ZeroGuesser: public LinearFunction<Field> {
public:
virtual void operator()(const Field &src, Field &guess) { guess = zero; };
};
template<class Field>
class SourceGuesser: public LinearFunction<Field> {
public:
virtual void operator()(const Field &src, Field &guess) { guess = src; };
};
////////////////////////////////
// Fine grid deflation
////////////////////////////////
template<class Field>
class DeflatedGuesser: public LinearFunction<Field> {
private:
const std::vector<Field> &evec;
const std::vector<RealD> &eval;
public:
DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {};
virtual void operator()(const Field &src,Field &guess) {
guess = zero;
assert(evec.size()==eval.size());
auto N = evec.size();
for (int i=0;i<N;i++) {
const Field& tmp = evec[i];
axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
}
guess.checkerboard = src.checkerboard;
}
};
template<class FineField, class CoarseField>
class LocalCoherenceDeflatedGuesser: public LinearFunction<FineField> {
private:
const std::vector<FineField> &subspace;
const std::vector<CoarseField> &evec_coarse;
const std::vector<RealD> &eval_coarse;
public:
LocalCoherenceDeflatedGuesser(const std::vector<FineField> &_subspace,
const std::vector<CoarseField> &_evec_coarse,
const std::vector<RealD> &_eval_coarse)
: subspace(_subspace),
evec_coarse(_evec_coarse),
eval_coarse(_eval_coarse)
{
}
void operator()(const FineField &src,FineField &guess) {
int N = (int)evec_coarse.size();
CoarseField src_coarse(evec_coarse[0]._grid);
CoarseField guess_coarse(evec_coarse[0]._grid); guess_coarse = zero;
blockProject(src_coarse,src,subspace);
for (int i=0;i<N;i++) {
const CoarseField & tmp = evec_coarse[i];
axpy(guess_coarse,TensorRemove(innerProduct(tmp,src_coarse)) / eval_coarse[i],tmp,guess_coarse);
}
blockPromote(guess_coarse,guess,subspace);
guess.checkerboard = src.checkerboard;
};
};
}
#endif

View File

@ -1,842 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ImplicitlyRestartedLanczos.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Chulwoo Jung <chulwoo@bnl.gov>
Author: Christoph Lehner <clehner@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_BIRL_H
#define GRID_BIRL_H
#include <string.h> //memset
//#include <zlib.h>
#include <sys/stat.h>
namespace Grid {
////////////////////////////////////////////////////////
// Move following 100 LOC to lattice/Lattice_basis.h
////////////////////////////////////////////////////////
template<class Field>
void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k)
{
for(int j=0; j<k; ++j){
auto ip = innerProduct(basis[j],w);
w = w - ip*basis[j];
}
}
template<class Field>
void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm)
{
typedef typename Field::vector_object vobj;
GridBase* grid = basis[0]._grid;
parallel_region
{
std::vector < vobj , commAllocator<vobj> > B(Nm); // Thread private
parallel_for_internal(int ss=0;ss < grid->oSites();ss++){
for(int j=j0; j<j1; ++j) B[j]=0.;
for(int j=j0; j<j1; ++j){
for(int k=k0; k<k1; ++k){
B[j] +=Qt(j,k) * basis[k]._odata[ss];
}
}
for(int j=j0; j<j1; ++j){
basis[j]._odata[ss] = B[j];
}
}
}
}
// Extract a single rotated vector
template<class Field>
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
{
typedef typename Field::vector_object vobj;
GridBase* grid = basis[0]._grid;
result.checkerboard = basis[0].checkerboard;
parallel_for(int ss=0;ss < grid->oSites();ss++){
vobj B = zero;
for(int k=k0; k<k1; ++k){
B +=Qt(j,k) * basis[k]._odata[ss];
}
result._odata[ss] = B;
}
}
template<class Field>
void basisReorderInPlace(std::vector<Field> &_v,std::vector<RealD>& sort_vals, std::vector<int>& idx)
{
int vlen = idx.size();
assert(vlen>=1);
assert(vlen<=sort_vals.size());
assert(vlen<=_v.size());
for (size_t i=0;i<vlen;i++) {
if (idx[i] != i) {
//////////////////////////////////////
// idx[i] is a table of desired sources giving a permutation.
// Swap v[i] with v[idx[i]].
// Find j>i for which _vnew[j] = _vold[i],
// track the move idx[j] => idx[i]
// track the move idx[i] => i
//////////////////////////////////////
size_t j;
for (j=i;j<idx.size();j++)
if (idx[j]==i)
break;
assert(idx[i] > i); assert(j!=idx.size()); assert(idx[j]==i);
std::swap(_v[i]._odata,_v[idx[i]]._odata); // should use vector move constructor, no data copy
std::swap(sort_vals[i],sort_vals[idx[i]]);
idx[j] = idx[i];
idx[i] = i;
}
}
}
inline std::vector<int> basisSortGetIndex(std::vector<RealD>& sort_vals)
{
std::vector<int> idx(sort_vals.size());
std::iota(idx.begin(), idx.end(), 0);
// sort indexes based on comparing values in v
std::sort(idx.begin(), idx.end(), [&sort_vals](int i1, int i2) {
return ::fabs(sort_vals[i1]) < ::fabs(sort_vals[i2]);
});
return idx;
}
template<class Field>
void basisSortInPlace(std::vector<Field> & _v,std::vector<RealD>& sort_vals, bool reverse)
{
std::vector<int> idx = basisSortGetIndex(sort_vals);
if (reverse)
std::reverse(idx.begin(), idx.end());
basisReorderInPlace(_v,sort_vals,idx);
}
/////////////////////////////////////////////////////////////
// Implicitly restarted lanczos
/////////////////////////////////////////////////////////////
template<class Field> class ImplicitlyRestartedLanczosTester
{
public:
virtual int TestConvergence(int j,RealD resid,Field &evec, RealD &eval,RealD evalMaxApprox)=0;
virtual int ReconstructEval(int j,RealD resid,Field &evec, RealD &eval,RealD evalMaxApprox)=0;
};
enum IRLdiagonalisation {
IRLdiagonaliseWithDSTEGR,
IRLdiagonaliseWithQR,
IRLdiagonaliseWithEigen
};
template<class Field> class ImplicitlyRestartedLanczosHermOpTester : public ImplicitlyRestartedLanczosTester<Field>
{
public:
LinearFunction<Field> &_HermOp;
ImplicitlyRestartedLanczosHermOpTester(LinearFunction<Field> &HermOp) : _HermOp(HermOp) { };
int ReconstructEval(int j,RealD resid,Field &B, RealD &eval,RealD evalMaxApprox)
{
return TestConvergence(j,resid,B,eval,evalMaxApprox);
}
int TestConvergence(int j,RealD eresid,Field &B, RealD &eval,RealD evalMaxApprox)
{
Field v(B);
RealD eval_poly = eval;
// Apply operator
_HermOp(B,v);
RealD vnum = real(innerProduct(B,v)); // HermOp.
RealD vden = norm2(B);
RealD vv0 = norm2(v);
eval = vnum/vden;
v -= eval*B;
RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0);
std::cout.precision(13);
std::cout<<GridLogIRL << "[" << std::setw(3)<<j<<"] "
<<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
<<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
<<std::endl;
int conv=0;
if( (vv<eresid*eresid) ) conv = 1;
return conv;
}
};
template<class Field>
class ImplicitlyRestartedLanczos {
private:
const RealD small = 1.0e-8;
int MaxIter;
int MinRestart; // Minimum number of restarts; only check for convergence after
int Nstop; // Number of evecs checked for convergence
int Nk; // Number of converged sought
// int Np; // Np -- Number of spare vecs in krylov space // == Nm - Nk
int Nm; // Nm -- total number of vectors
IRLdiagonalisation diagonalisation;
int orth_period;
RealD OrthoTime;
RealD eresid, betastp;
////////////////////////////////
// Embedded objects
////////////////////////////////
LinearFunction<Field> &_PolyOp;
LinearFunction<Field> &_HermOp;
ImplicitlyRestartedLanczosTester<Field> &_Tester;
// Default tester provided (we need a ref to something in default case)
ImplicitlyRestartedLanczosHermOpTester<Field> SimpleTester;
/////////////////////////
// Constructor
/////////////////////////
public:
//////////////////////////////////////////////////////////////////
// PAB:
//////////////////////////////////////////////////////////////////
// Too many options & knobs.
// Eliminate:
// orth_period
// betastp
// MinRestart
//
// Do we really need orth_period
// What is the theoretical basis & guarantees of betastp ?
// Nstop=Nk viable?
// MinRestart avoidable with new convergence test?
// Could cut to PolyOp, HermOp, Tester, Nk, Nm, resid, maxiter (+diagonalisation)
// HermOp could be eliminated if we dropped the Power method for max eval.
// -- also: The eval, eval2, eval2_copy stuff is still unnecessarily unclear
//////////////////////////////////////////////////////////////////
ImplicitlyRestartedLanczos(LinearFunction<Field> & PolyOp,
LinearFunction<Field> & HermOp,
ImplicitlyRestartedLanczosTester<Field> & Tester,
int _Nstop, // sought vecs
int _Nk, // sought vecs
int _Nm, // spare vecs
RealD _eresid, // resid in lmdue deficit
int _MaxIter, // Max iterations
RealD _betastp=0.0, // if beta(k) < betastp: converged
int _MinRestart=1, int _orth_period = 1,
IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
SimpleTester(HermOp), _PolyOp(PolyOp), _HermOp(HermOp), _Tester(Tester),
Nstop(_Nstop) , Nk(_Nk), Nm(_Nm),
eresid(_eresid), betastp(_betastp),
MaxIter(_MaxIter) , MinRestart(_MinRestart),
orth_period(_orth_period), diagonalisation(_diagonalisation) { };
ImplicitlyRestartedLanczos(LinearFunction<Field> & PolyOp,
LinearFunction<Field> & HermOp,
int _Nstop, // sought vecs
int _Nk, // sought vecs
int _Nm, // spare vecs
RealD _eresid, // resid in lmdue deficit
int _MaxIter, // Max iterations
RealD _betastp=0.0, // if beta(k) < betastp: converged
int _MinRestart=1, int _orth_period = 1,
IRLdiagonalisation _diagonalisation= IRLdiagonaliseWithEigen) :
SimpleTester(HermOp), _PolyOp(PolyOp), _HermOp(HermOp), _Tester(SimpleTester),
Nstop(_Nstop) , Nk(_Nk), Nm(_Nm),
eresid(_eresid), betastp(_betastp),
MaxIter(_MaxIter) , MinRestart(_MinRestart),
orth_period(_orth_period), diagonalisation(_diagonalisation) { };
////////////////////////////////
// Helpers
////////////////////////////////
template<typename T> static RealD normalise(T& v)
{
RealD nn = norm2(v);
nn = sqrt(nn);
v = v * (1.0/nn);
return nn;
}
void orthogonalize(Field& w, std::vector<Field>& evec,int k)
{
OrthoTime-=usecond()/1e6;
basisOrthogonalize(evec,w,k);
normalise(w);
OrthoTime+=usecond()/1e6;
}
/* Rudy Arthur's thesis pp.137
------------------------
Require: M > K P = M K †
Compute the factorization AVM = VM HM + fM eM
repeat
Q=I
for i = 1,...,P do
QiRi =HM θiI Q = QQi
H M = Q †i H M Q i
end for
βK =HM(K+1,K) σK =Q(M,K)
r=vK+1βK +rσK
VK =VM(1:M)Q(1:M,1:K)
HK =HM(1:K,1:K)
→AVK =VKHK +fKe†K † Extend to an M = K + P step factorization AVM = VMHM + fMeM
until convergence
*/
void calc(std::vector<RealD>& eval, std::vector<Field>& evec, const Field& src, int& Nconv, bool reverse=false)
{
GridBase *grid = src._grid;
assert(grid == evec[0]._grid);
GridLogIRL.TimingMode(1);
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
std::cout << GridLogIRL <<" ImplicitlyRestartedLanczos::calc() starting iteration 0 / "<< MaxIter<< std::endl;
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
std::cout << GridLogIRL <<" -- seek Nk = " << Nk <<" vectors"<< std::endl;
std::cout << GridLogIRL <<" -- accept Nstop = " << Nstop <<" vectors"<< std::endl;
std::cout << GridLogIRL <<" -- total Nm = " << Nm <<" vectors"<< std::endl;
std::cout << GridLogIRL <<" -- size of eval = " << eval.size() << std::endl;
std::cout << GridLogIRL <<" -- size of evec = " << evec.size() << std::endl;
if ( diagonalisation == IRLdiagonaliseWithDSTEGR ) {
std::cout << GridLogIRL << "Diagonalisation is DSTEGR "<<std::endl;
} else if ( diagonalisation == IRLdiagonaliseWithQR ) {
std::cout << GridLogIRL << "Diagonalisation is QR "<<std::endl;
} else if ( diagonalisation == IRLdiagonaliseWithEigen ) {
std::cout << GridLogIRL << "Diagonalisation is Eigen "<<std::endl;
}
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
assert(Nm <= evec.size() && Nm <= eval.size());
// quickly get an idea of the largest eigenvalue to more properly normalize the residuum
RealD evalMaxApprox = 0.0;
{
auto src_n = src;
auto tmp = src;
const int _MAX_ITER_IRL_MEVAPP_ = 50;
for (int i=0;i<_MAX_ITER_IRL_MEVAPP_;i++) {
normalise(src_n);
_HermOp(src_n,tmp);
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vden = norm2(src_n);
RealD na = vnum/vden;
if (fabs(evalMaxApprox/na - 1.0) < 0.05)
i=_MAX_ITER_IRL_MEVAPP_;
evalMaxApprox = na;
std::cout << GridLogIRL << " Approximation of largest eigenvalue: " << evalMaxApprox << std::endl;
src_n = tmp;
}
}
std::vector<RealD> lme(Nm);
std::vector<RealD> lme2(Nm);
std::vector<RealD> eval2(Nm);
std::vector<RealD> eval2_copy(Nm);
Eigen::MatrixXd Qt = Eigen::MatrixXd::Zero(Nm,Nm);
Field f(grid);
Field v(grid);
int k1 = 1;
int k2 = Nk;
RealD beta_k;
Nconv = 0;
// Set initial vector
evec[0] = src;
normalise(evec[0]);
// Initial Nk steps
OrthoTime=0.;
for(int k=0; k<Nk; ++k) step(eval,lme,evec,f,Nm,k);
std::cout<<GridLogIRL <<"Initial "<< Nk <<"steps done "<<std::endl;
std::cout<<GridLogIRL <<"Initial steps:OrthoTime "<<OrthoTime<< "seconds"<<std::endl;
//////////////////////////////////
// Restarting loop begins
//////////////////////////////////
int iter;
for(iter = 0; iter<MaxIter; ++iter){
OrthoTime=0.;
std::cout<< GridLogMessage <<" **********************"<< std::endl;
std::cout<< GridLogMessage <<" Restart iteration = "<< iter << std::endl;
std::cout<< GridLogMessage <<" **********************"<< std::endl;
std::cout<<GridLogIRL <<" running "<<Nm-Nk <<" steps: "<<std::endl;
for(int k=Nk; k<Nm; ++k) step(eval,lme,evec,f,Nm,k);
f *= lme[Nm-1];
std::cout<<GridLogIRL <<" "<<Nm-Nk <<" steps done "<<std::endl;
std::cout<<GridLogIRL <<"Initial steps:OrthoTime "<<OrthoTime<< "seconds"<<std::endl;
//////////////////////////////////
// getting eigenvalues
//////////////////////////////////
for(int k=0; k<Nm; ++k){
eval2[k] = eval[k+k1-1];
lme2[k] = lme[k+k1-1];
}
Qt = Eigen::MatrixXd::Identity(Nm,Nm);
diagonalize(eval2,lme2,Nm,Nm,Qt,grid);
std::cout<<GridLogIRL <<" diagonalized "<<std::endl;
//////////////////////////////////
// sorting
//////////////////////////////////
eval2_copy = eval2;
std::partial_sort(eval2.begin(),eval2.begin()+Nm,eval2.end(),std::greater<RealD>());
std::cout<<GridLogIRL <<" evals sorted "<<std::endl;
const int chunk=8;
for(int io=0; io<k2;io+=chunk){
std::cout<<GridLogIRL << "eval "<< std::setw(3) << io ;
for(int ii=0;ii<chunk;ii++){
if ( (io+ii)<k2 )
std::cout<< " "<< std::setw(12)<< eval2[io+ii];
}
std::cout << std::endl;
}
//////////////////////////////////
// Implicitly shifted QR transformations
//////////////////////////////////
Qt = Eigen::MatrixXd::Identity(Nm,Nm);
for(int ip=k2; ip<Nm; ++ip){
QR_decomp(eval,lme,Nm,Nm,Qt,eval2[ip],k1,Nm);
}
std::cout<<GridLogIRL <<"QR decomposed "<<std::endl;
assert(k2<Nm); assert(k2<Nm); assert(k1>0);
basisRotate(evec,Qt,k1-1,k2+1,0,Nm,Nm); /// big constraint on the basis
std::cout<<GridLogIRL <<"basisRotated by Qt"<<std::endl;
////////////////////////////////////////////////////
// Compressed vector f and beta(k2)
////////////////////////////////////////////////////
f *= Qt(k2-1,Nm-1);
f += lme[k2-1] * evec[k2];
beta_k = norm2(f);
beta_k = sqrt(beta_k);
std::cout<<GridLogIRL<<" beta(k) = "<<beta_k<<std::endl;
RealD betar = 1.0/beta_k;
evec[k2] = betar * f;
lme[k2-1] = beta_k;
////////////////////////////////////////////////////
// Convergence test
////////////////////////////////////////////////////
for(int k=0; k<Nm; ++k){
eval2[k] = eval[k];
lme2[k] = lme[k];
}
Qt = Eigen::MatrixXd::Identity(Nm,Nm);
diagonalize(eval2,lme2,Nk,Nm,Qt,grid);
std::cout<<GridLogIRL <<" Diagonalized "<<std::endl;
Nconv = 0;
if (iter >= MinRestart) {
std::cout << GridLogIRL << "Test convergence: rotate subset of vectors to test convergence " << std::endl;
Field B(grid); B.checkerboard = evec[0].checkerboard;
// power of two search pattern; not every evalue in eval2 is assessed.
int allconv =1;
for(int jj = 1; jj<=Nstop; jj*=2){
int j = Nstop-jj;
RealD e = eval2_copy[j]; // Discard the evalue
basisRotateJ(B,evec,Qt,j,0,Nk,Nm);
if( !_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) {
allconv=0;
}
}
// Do evec[0] for good measure
{
int j=0;
RealD e = eval2_copy[0];
basisRotateJ(B,evec,Qt,j,0,Nk,Nm);
if( !_Tester.TestConvergence(j,eresid,B,e,evalMaxApprox) ) allconv=0;
}
if ( allconv ) Nconv = Nstop;
// test if we converged, if so, terminate
std::cout<<GridLogIRL<<" #modes converged: >= "<<Nconv<<"/"<<Nstop<<std::endl;
// if( Nconv>=Nstop || beta_k < betastp){
if( Nconv>=Nstop){
goto converged;
}
} else {
std::cout << GridLogIRL << "iter < MinRestart: do not yet test for convergence\n";
} // end of iter loop
}
std::cout<<GridLogError<<"\n NOT converged.\n";
abort();
converged:
{
Field B(grid); B.checkerboard = evec[0].checkerboard;
basisRotate(evec,Qt,0,Nk,0,Nk,Nm);
std::cout << GridLogIRL << " Rotated basis"<<std::endl;
Nconv=0;
//////////////////////////////////////////////////////////////////////
// Full final convergence test; unconditionally applied
//////////////////////////////////////////////////////////////////////
for(int j = 0; j<=Nk; j++){
B=evec[j];
if( _Tester.ReconstructEval(j,eresid,B,eval2[j],evalMaxApprox) ) {
Nconv++;
}
}
if ( Nconv < Nstop )
std::cout << GridLogIRL << "Nconv ("<<Nconv<<") < Nstop ("<<Nstop<<")"<<std::endl;
eval=eval2;
//Keep only converged
eval.resize(Nconv);// Nstop?
evec.resize(Nconv,grid);// Nstop?
basisSortInPlace(evec,eval,reverse);
}
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
std::cout << GridLogIRL << "ImplicitlyRestartedLanczos CONVERGED ; Summary :\n";
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
std::cout << GridLogIRL << " -- Iterations = "<< iter << "\n";
std::cout << GridLogIRL << " -- beta(k) = "<< beta_k << "\n";
std::cout << GridLogIRL << " -- Nconv = "<< Nconv << "\n";
std::cout << GridLogIRL <<"**************************************************************************"<< std::endl;
}
private:
/* Saad PP. 195
1. Choose an initial vector v1 of 2-norm unity. Set β1 ≡ 0, v0 ≡ 0
2. For k = 1,2,...,m Do:
3. wk:=Avkβkv_{k1}
4. αk:=(wk,vk) //
5. wk:=wkαkvk // wk orthog vk
6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
7. vk+1 := wk/βk+1
8. EndDo
*/
void step(std::vector<RealD>& lmd,
std::vector<RealD>& lme,
std::vector<Field>& evec,
Field& w,int Nm,int k)
{
const RealD tiny = 1.0e-20;
assert( k< Nm );
GridStopWatch gsw_op,gsw_o;
Field& evec_k = evec[k];
_PolyOp(evec_k,w); std::cout<<GridLogIRL << "PolyOp" <<std::endl;
if(k>0) w -= lme[k-1] * evec[k-1];
ComplexD zalph = innerProduct(evec_k,w); // 4. αk:=(wk,vk)
RealD alph = real(zalph);
w = w - alph * evec_k;// 5. wk:=wkαkvk
RealD beta = normalise(w); // 6. βk+1 := ∥wk∥2. If βk+1 = 0 then Stop
// 7. vk+1 := wk/βk+1
lmd[k] = alph;
lme[k] = beta;
if (k>0 && k % orth_period == 0) {
orthogonalize(w,evec,k); // orthonormalise
std::cout<<GridLogIRL << "Orthogonalised " <<std::endl;
}
if(k < Nm-1) evec[k+1] = w;
std::cout<<GridLogIRL << "alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
if ( beta < tiny )
std::cout<<GridLogIRL << " beta is tiny "<<beta<<std::endl;
}
void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme,
int Nk, int Nm,
Eigen::MatrixXd & Qt, // Nm x Nm
GridBase *grid)
{
Eigen::MatrixXd TriDiag = Eigen::MatrixXd::Zero(Nk,Nk);
for(int i=0;i<Nk;i++) TriDiag(i,i) = lmd[i];
for(int i=0;i<Nk-1;i++) TriDiag(i,i+1) = lme[i];
for(int i=0;i<Nk-1;i++) TriDiag(i+1,i) = lme[i];
Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> eigensolver(TriDiag);
for (int i = 0; i < Nk; i++) {
lmd[Nk-1-i] = eigensolver.eigenvalues()(i);
}
for (int i = 0; i < Nk; i++) {
for (int j = 0; j < Nk; j++) {
Qt(Nk-1-i,j) = eigensolver.eigenvectors()(j,i);
}
}
}
///////////////////////////////////////////////////////////////////////////
// File could end here if settle on Eigen ??? !!!
///////////////////////////////////////////////////////////////////////////
void QR_decomp(std::vector<RealD>& lmd, // Nm
std::vector<RealD>& lme, // Nm
int Nk, int Nm, // Nk, Nm
Eigen::MatrixXd& Qt, // Nm x Nm matrix
RealD Dsh, int kmin, int kmax)
{
int k = kmin-1;
RealD x;
RealD Fden = 1.0/hypot(lmd[k]-Dsh,lme[k]);
RealD c = ( lmd[k] -Dsh) *Fden;
RealD s = -lme[k] *Fden;
RealD tmpa1 = lmd[k];
RealD tmpa2 = lmd[k+1];
RealD tmpb = lme[k];
lmd[k] = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
lme[k] = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;
x =-s*lme[k+1];
lme[k+1] = c*lme[k+1];
for(int i=0; i<Nk; ++i){
RealD Qtmp1 = Qt(k,i);
RealD Qtmp2 = Qt(k+1,i);
Qt(k,i) = c*Qtmp1 - s*Qtmp2;
Qt(k+1,i)= s*Qtmp1 + c*Qtmp2;
}
// Givens transformations
for(int k = kmin; k < kmax-1; ++k){
RealD Fden = 1.0/hypot(x,lme[k-1]);
RealD c = lme[k-1]*Fden;
RealD s = - x*Fden;
RealD tmpa1 = lmd[k];
RealD tmpa2 = lmd[k+1];
RealD tmpb = lme[k];
lmd[k] = c*c*tmpa1 +s*s*tmpa2 -2.0*c*s*tmpb;
lmd[k+1] = s*s*tmpa1 +c*c*tmpa2 +2.0*c*s*tmpb;
lme[k] = c*s*(tmpa1-tmpa2) +(c*c-s*s)*tmpb;
lme[k-1] = c*lme[k-1] -s*x;
if(k != kmax-2){
x = -s*lme[k+1];
lme[k+1] = c*lme[k+1];
}
for(int i=0; i<Nk; ++i){
RealD Qtmp1 = Qt(k,i);
RealD Qtmp2 = Qt(k+1,i);
Qt(k,i) = c*Qtmp1 -s*Qtmp2;
Qt(k+1,i) = s*Qtmp1 +c*Qtmp2;
}
}
}
void diagonalize(std::vector<RealD>& lmd, std::vector<RealD>& lme,
int Nk, int Nm,
Eigen::MatrixXd & Qt,
GridBase *grid)
{
Qt = Eigen::MatrixXd::Identity(Nm,Nm);
if ( diagonalisation == IRLdiagonaliseWithDSTEGR ) {
diagonalize_lapack(lmd,lme,Nk,Nm,Qt,grid);
} else if ( diagonalisation == IRLdiagonaliseWithQR ) {
diagonalize_QR(lmd,lme,Nk,Nm,Qt,grid);
} else if ( diagonalisation == IRLdiagonaliseWithEigen ) {
diagonalize_Eigen(lmd,lme,Nk,Nm,Qt,grid);
} else {
assert(0);
}
}
#ifdef USE_LAPACK
void LAPACK_dstegr(char *jobz, char *range, int *n, double *d, double *e,
double *vl, double *vu, int *il, int *iu, double *abstol,
int *m, double *w, double *z, int *ldz, int *isuppz,
double *work, int *lwork, int *iwork, int *liwork,
int *info);
#endif
void diagonalize_lapack(std::vector<RealD>& lmd,
std::vector<RealD>& lme,
int Nk, int Nm,
Eigen::MatrixXd& Qt,
GridBase *grid)
{
#ifdef USE_LAPACK
const int size = Nm;
int NN = Nk;
double evals_tmp[NN];
double evec_tmp[NN][NN];
memset(evec_tmp[0],0,sizeof(double)*NN*NN);
double DD[NN];
double EE[NN];
for (int i = 0; i< NN; i++) {
for (int j = i - 1; j <= i + 1; j++) {
if ( j < NN && j >= 0 ) {
if (i==j) DD[i] = lmd[i];
if (i==j) evals_tmp[i] = lmd[i];
if (j==(i-1)) EE[j] = lme[j];
}
}
}
int evals_found;
int lwork = ( (18*NN) > (1+4*NN+NN*NN)? (18*NN):(1+4*NN+NN*NN)) ;
int liwork = 3+NN*10 ;
int iwork[liwork];
double work[lwork];
int isuppz[2*NN];
char jobz = 'V'; // calculate evals & evecs
char range = 'I'; // calculate all evals
// char range = 'A'; // calculate all evals
char uplo = 'U'; // refer to upper half of original matrix
char compz = 'I'; // Compute eigenvectors of tridiagonal matrix
int ifail[NN];
int info;
int total = grid->_Nprocessors;
int node = grid->_processor;
int interval = (NN/total)+1;
double vl = 0.0, vu = 0.0;
int il = interval*node+1 , iu = interval*(node+1);
if (iu > NN) iu=NN;
double tol = 0.0;
if (1) {
memset(evals_tmp,0,sizeof(double)*NN);
if ( il <= NN){
LAPACK_dstegr(&jobz, &range, &NN,
(double*)DD, (double*)EE,
&vl, &vu, &il, &iu, // these four are ignored if second parameteris 'A'
&tol, // tolerance
&evals_found, evals_tmp, (double*)evec_tmp, &NN,
isuppz,
work, &lwork, iwork, &liwork,
&info);
for (int i = iu-1; i>= il-1; i--){
evals_tmp[i] = evals_tmp[i - (il-1)];
if (il>1) evals_tmp[i-(il-1)]=0.;
for (int j = 0; j< NN; j++){
evec_tmp[i][j] = evec_tmp[i - (il-1)][j];
if (il>1) evec_tmp[i-(il-1)][j]=0.;
}
}
}
{
grid->GlobalSumVector(evals_tmp,NN);
grid->GlobalSumVector((double*)evec_tmp,NN*NN);
}
}
// Safer to sort instead of just reversing it,
// but the document of the routine says evals are sorted in increasing order.
// qr gives evals in decreasing order.
for(int i=0;i<NN;i++){
lmd [NN-1-i]=evals_tmp[i];
for(int j=0;j<NN;j++){
Qt((NN-1-i),j)=evec_tmp[i][j];
}
}
#else
assert(0);
#endif
}
void diagonalize_QR(std::vector<RealD>& lmd, std::vector<RealD>& lme,
int Nk, int Nm,
Eigen::MatrixXd & Qt,
GridBase *grid)
{
int QRiter = 100*Nm;
int kmin = 1;
int kmax = Nk;
// (this should be more sophisticated)
for(int iter=0; iter<QRiter; ++iter){
// determination of 2x2 leading submatrix
RealD dsub = lmd[kmax-1]-lmd[kmax-2];
RealD dd = sqrt(dsub*dsub + 4.0*lme[kmax-2]*lme[kmax-2]);
RealD Dsh = 0.5*(lmd[kmax-2]+lmd[kmax-1] +dd*(dsub/fabs(dsub)));
// (Dsh: shift)
// transformation
QR_decomp(lmd,lme,Nk,Nm,Qt,Dsh,kmin,kmax); // Nk, Nm
// Convergence criterion (redef of kmin and kamx)
for(int j=kmax-1; j>= kmin; --j){
RealD dds = fabs(lmd[j-1])+fabs(lmd[j]);
if(fabs(lme[j-1])+dds > dds){
kmax = j+1;
goto continued;
}
}
QRiter = iter;
return;
continued:
for(int j=0; j<kmax-1; ++j){
RealD dds = fabs(lmd[j])+fabs(lmd[j+1]);
if(fabs(lme[j])+dds > dds){
kmin = j+1;
break;
}
}
}
std::cout << GridLogError << "[QL method] Error - Too many iteration: "<<QRiter<<"\n";
abort();
}
};
}
#endif

View File

@ -1,406 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/LocalCoherenceLanczos.h
Copyright (C) 2015
Author: Christoph Lehner <clehner@bnl.gov>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LOCAL_COHERENCE_IRL_H
#define GRID_LOCAL_COHERENCE_IRL_H
namespace Grid {
struct LanczosParams : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParams,
ChebyParams, Cheby,/*Chebyshev*/
int, Nstop, /*Vecs in Lanczos must converge Nstop < Nk < Nm*/
int, Nk, /*Vecs in Lanczos seek converge*/
int, Nm, /*Total vecs in Lanczos include restart*/
RealD, resid, /*residual*/
int, MaxIt,
RealD, betastp, /* ? */
int, MinRes); // Must restart
};
struct LocalCoherenceLanczosParams : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
bool, saveEvecs,
bool, doFine,
bool, doFineRead,
bool, doCoarse,
bool, doCoarseRead,
LanczosParams, FineParams,
LanczosParams, CoarseParams,
ChebyParams, Smoother,
RealD , coarse_relax_tol,
std::vector<int>, blockSize,
std::string, config,
std::vector < std::complex<double> >, omega,
RealD, mass,
RealD, M5);
};
// Duplicate functionality; ProjectedFunctionHermOp could be used with the trivial function
template<class Fobj,class CComplex,int nbasis>
class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
public:
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj> FineField;
LinearOperatorBase<FineField> &_Linop;
std::vector<FineField> &subspace;
ProjectedHermOp(LinearOperatorBase<FineField>& linop, std::vector<FineField> & _subspace) :
_Linop(linop), subspace(_subspace)
{
assert(subspace.size() >0);
};
void operator()(const CoarseField& in, CoarseField& out) {
GridBase *FineGrid = subspace[0]._grid;
int checkerboard = subspace[0].checkerboard;
FineField fin (FineGrid); fin.checkerboard= checkerboard;
FineField fout(FineGrid); fout.checkerboard = checkerboard;
blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedHermop : Promote to fine"<<std::endl;
_Linop.HermOp(fin,fout); std::cout<<GridLogIRL<<"ProjectedHermop : HermOp (fine) "<<std::endl;
blockProject(out,fout,subspace); std::cout<<GridLogIRL<<"ProjectedHermop : Project to coarse "<<std::endl;
}
};
template<class Fobj,class CComplex,int nbasis>
class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
public:
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj> FineField;
OperatorFunction<FineField> & _poly;
LinearOperatorBase<FineField> &_Linop;
std::vector<FineField> &subspace;
ProjectedFunctionHermOp(OperatorFunction<FineField> & poly,
LinearOperatorBase<FineField>& linop,
std::vector<FineField> & _subspace) :
_poly(poly),
_Linop(linop),
subspace(_subspace)
{ };
void operator()(const CoarseField& in, CoarseField& out) {
GridBase *FineGrid = subspace[0]._grid;
int checkerboard = subspace[0].checkerboard;
FineField fin (FineGrid); fin.checkerboard =checkerboard;
FineField fout(FineGrid);fout.checkerboard =checkerboard;
blockPromote(in,fin,subspace); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Promote to fine"<<std::endl;
_poly(_Linop,fin,fout); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Poly "<<std::endl;
blockProject(out,fout,subspace); std::cout<<GridLogIRL<<"ProjectedFunctionHermop : Project to coarse "<<std::endl;
}
};
template<class Fobj,class CComplex,int nbasis>
class ImplicitlyRestartedLanczosSmoothedTester : public ImplicitlyRestartedLanczosTester<Lattice<iVector<CComplex,nbasis > > >
{
public:
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj> FineField;
LinearFunction<CoarseField> & _Poly;
OperatorFunction<FineField> & _smoother;
LinearOperatorBase<FineField> &_Linop;
RealD _coarse_relax_tol;
std::vector<FineField> &_subspace;
ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField> &Poly,
OperatorFunction<FineField> &smoother,
LinearOperatorBase<FineField> &Linop,
std::vector<FineField> &subspace,
RealD coarse_relax_tol=5.0e3)
: _smoother(smoother), _Linop(Linop), _Poly(Poly), _subspace(subspace),
_coarse_relax_tol(coarse_relax_tol)
{ };
int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
{
CoarseField v(B);
RealD eval_poly = eval;
// Apply operator
_Poly(B,v);
RealD vnum = real(innerProduct(B,v)); // HermOp.
RealD vden = norm2(B);
RealD vv0 = norm2(v);
eval = vnum/vden;
v -= eval*B;
RealD vv = norm2(v) / ::pow(evalMaxApprox,2.0);
std::cout.precision(13);
std::cout<<GridLogIRL << "[" << std::setw(3)<<j<<"] "
<<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
<<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
<<std::endl;
int conv=0;
if( (vv<eresid*eresid) ) conv = 1;
return conv;
}
int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
{
GridBase *FineGrid = _subspace[0]._grid;
int checkerboard = _subspace[0].checkerboard;
FineField fB(FineGrid);fB.checkerboard =checkerboard;
FineField fv(FineGrid);fv.checkerboard =checkerboard;
blockPromote(B,fv,_subspace);
_smoother(_Linop,fv,fB);
RealD eval_poly = eval;
_Linop.HermOp(fB,fv);
RealD vnum = real(innerProduct(fB,fv)); // HermOp.
RealD vden = norm2(fB);
RealD vv0 = norm2(fv);
eval = vnum/vden;
fv -= eval*fB;
RealD vv = norm2(fv) / ::pow(evalMaxApprox,2.0);
std::cout.precision(13);
std::cout<<GridLogIRL << "[" << std::setw(3)<<j<<"] "
<<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
<<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
<<std::endl;
if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
if( (vv<eresid*eresid) ) return 1;
return 0;
}
};
////////////////////////////////////////////
// Make serializable Lanczos params
////////////////////////////////////////////
template<class Fobj,class CComplex,int nbasis>
class LocalCoherenceLanczos
{
public:
typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<Fobj> FineField;
protected:
GridBase *_CoarseGrid;
GridBase *_FineGrid;
int _checkerboard;
LinearOperatorBase<FineField> & _FineOp;
std::vector<RealD> &evals_fine;
std::vector<RealD> &evals_coarse;
std::vector<FineField> &subspace;
std::vector<CoarseField> &evec_coarse;
private:
std::vector<RealD> _evals_fine;
std::vector<RealD> _evals_coarse;
std::vector<FineField> _subspace;
std::vector<CoarseField> _evec_coarse;
public:
LocalCoherenceLanczos(GridBase *FineGrid,
GridBase *CoarseGrid,
LinearOperatorBase<FineField> &FineOp,
int checkerboard) :
_CoarseGrid(CoarseGrid),
_FineGrid(FineGrid),
_FineOp(FineOp),
_checkerboard(checkerboard),
evals_fine (_evals_fine),
evals_coarse(_evals_coarse),
subspace (_subspace),
evec_coarse(_evec_coarse)
{
evals_fine.resize(0);
evals_coarse.resize(0);
};
//////////////////////////////////////////////////////////////////////////
// Alternate constructore, external storage for use by Hadrons module
//////////////////////////////////////////////////////////////////////////
LocalCoherenceLanczos(GridBase *FineGrid,
GridBase *CoarseGrid,
LinearOperatorBase<FineField> &FineOp,
int checkerboard,
std::vector<FineField> &ext_subspace,
std::vector<CoarseField> &ext_coarse,
std::vector<RealD> &ext_eval_fine,
std::vector<RealD> &ext_eval_coarse
) :
_CoarseGrid(CoarseGrid),
_FineGrid(FineGrid),
_FineOp(FineOp),
_checkerboard(checkerboard),
evals_fine (ext_eval_fine),
evals_coarse(ext_eval_coarse),
subspace (ext_subspace),
evec_coarse (ext_coarse)
{
evals_fine.resize(0);
evals_coarse.resize(0);
};
void Orthogonalise(void ) {
CoarseScalar InnerProd(_CoarseGrid);
std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
blockOrthogonalise(InnerProd,subspace);
std::cout << GridLogMessage <<" Gramm-Schmidt pass 2"<<std::endl;
blockOrthogonalise(InnerProd,subspace);
};
template<typename T> static RealD normalise(T& v)
{
RealD nn = norm2(v);
nn = ::sqrt(nn);
v = v * (1.0/nn);
return nn;
}
/*
void fakeFine(void)
{
int Nk = nbasis;
subspace.resize(Nk,_FineGrid);
subspace[0]=1.0;
subspace[0].checkerboard=_checkerboard;
normalise(subspace[0]);
PlainHermOp<FineField> Op(_FineOp);
for(int k=1;k<Nk;k++){
subspace[k].checkerboard=_checkerboard;
Op(subspace[k-1],subspace[k]);
normalise(subspace[k]);
}
}
*/
void testFine(RealD resid)
{
assert(evals_fine.size() == nbasis);
assert(subspace.size() == nbasis);
PlainHermOp<FineField> Op(_FineOp);
ImplicitlyRestartedLanczosHermOpTester<FineField> SimpleTester(Op);
for(int k=0;k<nbasis;k++){
assert(SimpleTester.ReconstructEval(k,resid,subspace[k],evals_fine[k],1.0)==1);
}
}
void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax)
{
assert(evals_fine.size() == nbasis);
assert(subspace.size() == nbasis);
//////////////////////////////////////////////////////////////////////////////////////////////////
// create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
//////////////////////////////////////////////////////////////////////////////////////////////////
Chebyshev<FineField> ChebySmooth(cheby_smooth);
ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (ChebySmooth,_FineOp,subspace);
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
for(int k=0;k<evec_coarse.size();k++){
if ( k < nbasis ) {
assert(ChebySmoothTester.ReconstructEval(k,resid,evec_coarse[k],evals_coarse[k],1.0)==1);
} else {
assert(ChebySmoothTester.ReconstructEval(k,resid*relax,evec_coarse[k],evals_coarse[k],1.0)==1);
}
}
}
void calcFine(ChebyParams cheby_parms,int Nstop,int Nk,int Nm,RealD resid,
RealD MaxIt, RealD betastp, int MinRes)
{
assert(nbasis<=Nm);
Chebyshev<FineField> Cheby(cheby_parms);
FunctionHermOp<FineField> ChebyOp(Cheby,_FineOp);
PlainHermOp<FineField> Op(_FineOp);
evals_fine.resize(Nm);
subspace.resize(Nm,_FineGrid);
ImplicitlyRestartedLanczos<FineField> IRL(ChebyOp,Op,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
FineField src(_FineGrid); src=1.0; src.checkerboard = _checkerboard;
int Nconv;
IRL.calc(evals_fine,subspace,src,Nconv,false);
// Shrink down to number saved
assert(Nstop>=nbasis);
assert(Nconv>=nbasis);
evals_fine.resize(nbasis);
subspace.resize(nbasis,_FineGrid);
}
void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
int Nstop, int Nk, int Nm,RealD resid,
RealD MaxIt, RealD betastp, int MinRes)
{
Chebyshev<FineField> Cheby(cheby_op);
ProjectedHermOp<Fobj,CComplex,nbasis> Op(_FineOp,subspace);
ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace);
//////////////////////////////////////////////////////////////////////////////////////////////////
// create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
//////////////////////////////////////////////////////////////////////////////////////////////////
Chebyshev<FineField> ChebySmooth(cheby_smooth);
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
evals_coarse.resize(Nm);
evec_coarse.resize(Nm,_CoarseGrid);
CoarseField src(_CoarseGrid); src=1.0;
ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
int Nconv=0;
IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
assert(Nconv>=Nstop);
evals_coarse.resize(Nstop);
evec_coarse.resize (Nstop,_CoarseGrid);
for (int i=0;i<Nstop;i++){
std::cout << i << " Coarse eval = " << evals_coarse[i] << std::endl;
}
}
};
}
#endif

View File

@ -1,503 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/SchurRedBlack.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_SCHUR_RED_BLACK_H
#define GRID_SCHUR_RED_BLACK_H
/*
* Red black Schur decomposition
*
* M = (Mee Meo) = (1 0 ) (Mee 0 ) (1 Mee^{-1} Meo)
* (Moe Moo) (Moe Mee^-1 1 ) (0 Moo-Moe Mee^-1 Meo) (0 1 )
* = L D U
*
* L^-1 = (1 0 )
* (-MoeMee^{-1} 1 )
* L^{dag} = ( 1 Mee^{-dag} Moe^{dag} )
* ( 0 1 )
* L^{-d} = ( 1 -Mee^{-dag} Moe^{dag} )
* ( 0 1 )
*
* U^-1 = (1 -Mee^{-1} Meo)
* (0 1 )
* U^{dag} = ( 1 0)
* (Meo^dag Mee^{-dag} 1)
* U^{-dag} = ( 1 0)
* (-Meo^dag Mee^{-dag} 1)
***********************
* M psi = eta
***********************
*Odd
* i) D_oo psi_o = L^{-1} eta_o
* eta_o' = (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
*
* Wilson:
* (D_oo)^{\dag} D_oo psi_o = (D_oo)^dag L^{-1} eta_o
* Stag:
* D_oo psi_o = L^{-1} eta = (eta_o - Moe Mee^{-1} eta_e)
*
* L^-1 eta_o= (1 0 ) (e
* (-MoeMee^{-1} 1 )
*
*Even
* ii) Mee psi_e + Meo psi_o = src_e
*
* => sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
*
*
* TODO: Other options:
*
* a) change checkerboards for Schur e<->o
*
* Left precon by Moo^-1
* b) Doo^{dag} M_oo^-dag Moo^-1 Doo psi_0 = (D_oo)^dag M_oo^-dag Moo^-1 L^{-1} eta_o
* eta_o' = (D_oo)^dag M_oo^-dag Moo^-1 (eta_o - Moe Mee^{-1} eta_e)
*
* Right precon by Moo^-1
* c) M_oo^-dag Doo^{dag} Doo Moo^-1 phi_0 = M_oo^-dag (D_oo)^dag L^{-1} eta_o
* eta_o' = M_oo^-dag (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
* psi_o = M_oo^-1 phi_o
* TODO: Deflation
*/
namespace Grid {
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form a Red Black solver calling a Herm solver
// Use of RB info prevents making SchurRedBlackSolve conform to standard interface
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Now make the norm reflect extra factor of Mee
template<class Field> class SchurRedBlackStaggeredSolve {
private:
OperatorFunction<Field> & _HermitianRBSolver;
int CBfactorise;
bool subGuess;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackStaggeredSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false) :
_HermitianRBSolver(HermitianRBSolver)
{
CBfactorise=0;
subtractGuess(initSubGuess);
};
void subtractGuess(const bool initSubGuess)
{
subGuess = initSubGuess;
}
bool isSubtractGuess(void)
{
return subGuess;
}
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser<Field> guess;
(*this)(_Matrix,in,out,guess);
}
template<class Matrix, class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out, Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
SchurStaggeredOperator<Matrix,Field> _HermOpEO(_Matrix);
Field src_e(grid);
Field src_o(grid);
Field sol_e(grid);
Field sol_o(grid);
Field tmp(grid);
Field Mtmp(grid);
Field resid(fgrid);
std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve " <<std::endl;
pickCheckerboard(Even,src_e,in);
pickCheckerboard(Odd ,src_o,in);
pickCheckerboard(Even,sol_e,out);
pickCheckerboard(Odd ,sol_o,out);
std::cout << GridLogMessage << " SchurRedBlackStaggeredSolve checkerboards picked" <<std::endl;
/////////////////////////////////////////////////////
// src_o = (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd);
//src_o = tmp; assert(src_o.checkerboard ==Odd);
_Matrix.Mooee(tmp,src_o); // Extra factor of "m" in source from dumb choice of matrix norm.
//////////////////////////////////////////////////////////////
// Call the red-black solver
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver calling the Mpc solver" <<std::endl;
guess(src_o, sol_o);
Mtmp = sol_o;
_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver called the Mpc solver" <<std::endl;
// Fionn A2A boolean behavioural control
if (subGuess) sol_o = sol_o-Mtmp;
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even);
src_e = src_e-tmp; assert( src_e.checkerboard ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.checkerboard ==Even);
std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver reconstructed other CB" <<std::endl;
setCheckerboard(out,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_o); assert( sol_o.checkerboard ==Odd );
std::cout<<GridLogMessage << "SchurRedBlackStaggeredSolver inserted solution" <<std::endl;
// Verify the unprec residual
if ( ! subGuess ) {
_Matrix.M(out,resid);
resid = resid-in;
RealD ns = norm2(in);
RealD nr = norm2(resid);
std::cout<<GridLogMessage << "SchurRedBlackStaggered solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
} else {
std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
}
}
};
template<class Field> using SchurRedBlackStagSolve = SchurRedBlackStaggeredSolve<Field>;
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form a Red Black solver calling a Herm solver
// Use of RB info prevents making SchurRedBlackSolve conform to standard interface
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackDiagMooeeSolve {
private:
OperatorFunction<Field> & _HermitianRBSolver;
int CBfactorise;
bool subGuess;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackDiagMooeeSolve(OperatorFunction<Field> &HermitianRBSolver,int cb=0, const bool initSubGuess = false) : _HermitianRBSolver(HermitianRBSolver)
{
CBfactorise=cb;
subtractGuess(initSubGuess);
};
void subtractGuess(const bool initSubGuess)
{
subGuess = initSubGuess;
}
bool isSubtractGuess(void)
{
return subGuess;
}
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser<Field> guess;
(*this)(_Matrix,in,out,guess);
}
template<class Matrix, class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
Field src_e(grid);
Field src_o(grid);
Field sol_e(grid);
Field sol_o(grid);
Field tmp(grid);
Field Mtmp(grid);
Field resid(fgrid);
pickCheckerboard(Even,src_e,in);
pickCheckerboard(Odd ,src_o,in);
pickCheckerboard(Even,sol_e,out);
pickCheckerboard(Odd ,sol_o,out);
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd);
// get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.checkerboard ==Odd);
//////////////////////////////////////////////////////////////
// Call the red-black solver
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
guess(src_o,sol_o);
Mtmp = sol_o;
_HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
// Fionn A2A boolean behavioural control
if (subGuess) sol_o = sol_o-Mtmp;
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even);
src_e = src_e-tmp; assert( src_e.checkerboard ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_o); assert( sol_o.checkerboard ==Odd );
// Verify the unprec residual
if ( ! subGuess ) {
_Matrix.M(out,resid);
resid = resid-in;
RealD ns = norm2(in);
RealD nr = norm2(resid);
std::cout<<GridLogMessage << "SchurRedBlackDiagMooee solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
} else {
std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
}
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form a Red Black solver calling a Herm solver
// Use of RB info prevents making SchurRedBlackSolve conform to standard interface
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackDiagTwoSolve {
private:
OperatorFunction<Field> & _HermitianRBSolver;
int CBfactorise;
bool subGuess;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackDiagTwoSolve(OperatorFunction<Field> &HermitianRBSolver, const bool initSubGuess = false) :
_HermitianRBSolver(HermitianRBSolver)
{
CBfactorise = 0;
subtractGuess(initSubGuess);
};
void subtractGuess(const bool initSubGuess)
{
subGuess = initSubGuess;
}
bool isSubtractGuess(void)
{
return subGuess;
}
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser<Field> guess;
(*this)(_Matrix,in,out,guess);
}
template<class Matrix,class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
Field src_e(grid);
Field src_o(grid);
Field sol_e(grid);
Field sol_o(grid);
Field tmp(grid);
Field Mtmp(grid);
Field resid(fgrid);
pickCheckerboard(Even,src_e,in);
pickCheckerboard(Odd ,src_o,in);
pickCheckerboard(Even,sol_e,out);
pickCheckerboard(Odd ,sol_o,out);
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd);
// get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.checkerboard ==Odd);
//////////////////////////////////////////////////////////////
// Call the red-black solver
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
// _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
guess(src_o,tmp);
Mtmp = tmp;
_HermitianRBSolver(_HermOpEO,src_o,tmp); assert(tmp.checkerboard==Odd);
// Fionn A2A boolean behavioural control
if (subGuess) tmp = tmp-Mtmp;
_Matrix.MooeeInv(tmp,sol_o); assert( sol_o.checkerboard ==Odd);
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even);
src_e = src_e-tmp; assert( src_e.checkerboard ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_o); assert( sol_o.checkerboard ==Odd );
// Verify the unprec residual
if ( ! subGuess ) {
_Matrix.M(out,resid);
resid = resid-in;
RealD ns = norm2(in);
RealD nr = norm2(resid);
std::cout<<GridLogMessage << "SchurRedBlackDiagTwo solver true unprec resid "<< std::sqrt(nr/ns) <<" nr "<< nr <<" ns "<<ns << std::endl;
} else {
std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
}
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Take a matrix and form a Red Black solver calling a Herm solver
// Use of RB info prevents making SchurRedBlackSolve conform to standard interface
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackDiagTwoMixed {
private:
LinearFunction<Field> & _HermitianRBSolver;
int CBfactorise;
bool subGuess;
public:
/////////////////////////////////////////////////////
// Wrap the usual normal equations Schur trick
/////////////////////////////////////////////////////
SchurRedBlackDiagTwoMixed(LinearFunction<Field> &HermitianRBSolver, const bool initSubGuess = false) :
_HermitianRBSolver(HermitianRBSolver)
{
CBfactorise=0;
subtractGuess(initSubGuess);
};
void subtractGuess(const bool initSubGuess)
{
subGuess = initSubGuess;
}
bool isSubtractGuess(void)
{
return subGuess;
}
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &in, Field &out){
ZeroGuesser<Field> guess;
(*this)(_Matrix,in,out,guess);
}
template<class Matrix, class Guesser>
void operator() (Matrix & _Matrix,const Field &in, Field &out,Guesser &guess){
// FIXME CGdiagonalMee not implemented virtual function
// FIXME use CBfactorise to control schur decomp
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
SchurDiagTwoOperator<Matrix,Field> _HermOpEO(_Matrix);
Field src_e(grid);
Field src_o(grid);
Field sol_e(grid);
Field sol_o(grid);
Field tmp(grid);
Field Mtmp(grid);
Field resid(fgrid);
pickCheckerboard(Even,src_e,in);
pickCheckerboard(Odd ,src_o,in);
pickCheckerboard(Even,sol_e,out);
pickCheckerboard(Odd ,sol_o,out);
/////////////////////////////////////////////////////
// src_o = Mdag * (source_o - Moe MeeInv source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInv(src_e,tmp); assert( tmp.checkerboard ==Even);
_Matrix.Meooe (tmp,Mtmp); assert( Mtmp.checkerboard ==Odd);
tmp=src_o-Mtmp; assert( tmp.checkerboard ==Odd);
// get the right MpcDag
_HermOpEO.MpcDag(tmp,src_o); assert(src_o.checkerboard ==Odd);
//////////////////////////////////////////////////////////////
// Call the red-black solver
//////////////////////////////////////////////////////////////
std::cout<<GridLogMessage << "SchurRedBlack solver calling the MpcDagMp solver" <<std::endl;
// _HermitianRBSolver(_HermOpEO,src_o,sol_o); assert(sol_o.checkerboard==Odd);
// _HermitianRBSolver(_HermOpEO,src_o,tmp); assert(tmp.checkerboard==Odd);
guess(src_o,tmp);
Mtmp = tmp;
_HermitianRBSolver(_HermOpEO,src_o,tmp); assert(tmp.checkerboard==Odd);
// Fionn A2A boolean behavioural control
if (subGuess) tmp = tmp-Mtmp;
_Matrix.MooeeInv(tmp,sol_o); assert( sol_o.checkerboard ==Odd);
///////////////////////////////////////////////////
// sol_e = M_ee^-1 * ( src_e - Meo sol_o )...
///////////////////////////////////////////////////
_Matrix.Meooe(sol_o,tmp); assert( tmp.checkerboard ==Even);
src_e = src_e-tmp; assert( src_e.checkerboard ==Even);
_Matrix.MooeeInv(src_e,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_e); assert( sol_e.checkerboard ==Even);
setCheckerboard(out,sol_o); assert( sol_o.checkerboard ==Odd );
// Verify the unprec residual
if ( ! subGuess ) {
_Matrix.M(out,resid);
resid = resid-in;
RealD ns = norm2(in);
RealD nr = norm2(resid);
std::cout << GridLogMessage << "SchurRedBlackDiagTwo solver true unprec resid " << std::sqrt(nr / ns) << " nr " << nr << " ns " << ns << std::endl;
} else {
std::cout << GridLogMessage << "Guess subtracted after solve." << std::endl;
}
}
};
}
#endif

View File

@ -1,125 +0,0 @@
#include <Grid/GridCore.h>
#include <fcntl.h>
namespace Grid {
MemoryStats *MemoryProfiler::stats = nullptr;
bool MemoryProfiler::debug = false;
int PointerCache::victim;
PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::Ncache];
void *PointerCache::Insert(void *ptr,size_t bytes) {
if (bytes < 4096 ) return ptr;
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
void * ret = NULL;
int v = -1;
for(int e=0;e<Ncache;e++) {
if ( Entries[e].valid==0 ) {
v=e;
break;
}
}
if ( v==-1 ) {
v=victim;
victim = (victim+1)%Ncache;
}
if ( Entries[v].valid ) {
ret = Entries[v].address;
Entries[v].valid = 0;
Entries[v].address = NULL;
Entries[v].bytes = 0;
}
Entries[v].address=ptr;
Entries[v].bytes =bytes;
Entries[v].valid =1;
return ret;
}
void *PointerCache::Lookup(size_t bytes) {
if (bytes < 4096 ) return NULL;
#ifdef _OPENMP
assert(omp_in_parallel()==0);
#endif
for(int e=0;e<Ncache;e++){
if ( Entries[e].valid && ( Entries[e].bytes == bytes ) ) {
Entries[e].valid = 0;
return Entries[e].address;
}
}
return NULL;
}
void check_huge_pages(void *Buf,uint64_t BYTES)
{
#ifdef __linux__
int fd = open("/proc/self/pagemap", O_RDONLY);
assert(fd >= 0);
const int page_size = 4096;
uint64_t virt_pfn = (uint64_t)Buf / page_size;
off_t offset = sizeof(uint64_t) * virt_pfn;
uint64_t npages = (BYTES + page_size-1) / page_size;
uint64_t pagedata[npages];
uint64_t ret = lseek(fd, offset, SEEK_SET);
assert(ret == offset);
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
assert(ret == sizeof(uint64_t) * npages);
int nhugepages = npages / 512;
int n4ktotal, nnothuge;
n4ktotal = 0;
nnothuge = 0;
for (int i = 0; i < nhugepages; ++i) {
uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
for (int j = 0; j < 512; ++j) {
uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
++n4ktotal;
if (pageaddr != baseaddr + j * page_size)
++nnothuge;
}
}
int rank = CartesianCommunicator::RankWorld();
printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
#endif
}
std::string sizeString(const size_t bytes)
{
constexpr unsigned int bufSize = 256;
const char *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"};
char buf[256];
size_t s = 0;
double count = bytes;
while (count >= 1024 && s < 7)
{
s++;
count /= 1024;
}
if (count - floor(count) == 0.0)
{
snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
}
else
{
snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
}
return std::string(buf);
}
}

View File

@ -1,315 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/AlignedAllocator.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ALIGNED_ALLOCATOR_H
#define GRID_ALIGNED_ALLOCATOR_H
#ifdef HAVE_MALLOC_MALLOC_H
#include <malloc/malloc.h>
#endif
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
#ifdef HAVE_MM_MALLOC_H
#include <mm_malloc.h>
#endif
namespace Grid {
class PointerCache {
private:
static const int Ncache=8;
static int victim;
typedef struct {
void *address;
size_t bytes;
int valid;
} PointerCacheEntry;
static PointerCacheEntry Entries[Ncache];
public:
static void *Insert(void *ptr,size_t bytes) ;
static void *Lookup(size_t bytes) ;
};
std::string sizeString(size_t bytes);
struct MemoryStats
{
size_t totalAllocated{0}, maxAllocated{0},
currentlyAllocated{0}, totalFreed{0};
};
class MemoryProfiler
{
public:
static MemoryStats *stats;
static bool debug;
};
#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
#define profilerDebugPrint \
if (MemoryProfiler::stats)\
{\
auto s = MemoryProfiler::stats;\
std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl;\
std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \
<< std::endl;\
std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \
<< std::endl;\
std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
<< std::endl;\
std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \
<< std::endl;\
}
#define profilerAllocate(bytes)\
if (MemoryProfiler::stats)\
{\
auto s = MemoryProfiler::stats;\
s->totalAllocated += (bytes);\
s->currentlyAllocated += (bytes);\
s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated);\
}\
if (MemoryProfiler::debug)\
{\
std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl;\
profilerDebugPrint;\
}
#define profilerFree(bytes)\
if (MemoryProfiler::stats)\
{\
auto s = MemoryProfiler::stats;\
s->totalFreed += (bytes);\
s->currentlyAllocated -= (bytes);\
}\
if (MemoryProfiler::debug)\
{\
std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl;\
profilerDebugPrint;\
}
void check_huge_pages(void *Buf,uint64_t BYTES);
////////////////////////////////////////////////////////////////////
// A lattice of something, but assume the something is SIMDized.
////////////////////////////////////////////////////////////////////
template<typename _Tp>
class alignedAllocator {
public:
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
typedef _Tp* pointer;
typedef const _Tp* const_pointer;
typedef _Tp& reference;
typedef const _Tp& const_reference;
typedef _Tp value_type;
template<typename _Tp1> struct rebind { typedef alignedAllocator<_Tp1> other; };
alignedAllocator() throw() { }
alignedAllocator(const alignedAllocator&) throw() { }
template<typename _Tp1> alignedAllocator(const alignedAllocator<_Tp1>&) throw() { }
~alignedAllocator() throw() { }
pointer address(reference __x) const { return &__x; }
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
// if ( ptr != NULL )
// std::cout << "alignedAllocator "<<__n << " cache hit "<< std::hex << ptr <<std::dec <<std::endl;
//////////////////
// Hack 2MB align; could make option probably doesn't need configurability
//////////////////
//define GRID_ALLOC_ALIGN (128)
#define GRID_ALLOC_ALIGN (2*1024*1024)
#ifdef HAVE_MM_MALLOC_H
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
#else
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
#endif
// std::cout << "alignedAllocator " << std::hex << ptr <<std::dec <<std::endl;
// First touch optimise in threaded loop
uint8_t *cp = (uint8_t *)ptr;
#ifdef GRID_OMP
#pragma omp parallel for
#endif
for(size_type n=0;n<bytes;n+=4096){
cp[n]=0;
}
return ptr;
}
void deallocate(pointer __p, size_type __n) {
size_type bytes = __n * sizeof(_Tp);
profilerFree(bytes);
pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
#ifdef HAVE_MM_MALLOC_H
if ( __freeme ) _mm_free((void *)__freeme);
#else
if ( __freeme ) free((void *)__freeme);
#endif
}
void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
//////////////////////////////////////////////////////////////////////////////////////////
// MPI3 : comms must use shm region
// SHMEM: comms must use symmetric heap
//////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_COMMS_SHMEM
extern "C" {
#include <mpp/shmem.h>
extern void * shmem_align(size_t, size_t);
extern void shmem_free(void *);
}
#define PARANOID_SYMMETRIC_HEAP
#endif
template<typename _Tp>
class commAllocator {
public:
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
typedef _Tp* pointer;
typedef const _Tp* const_pointer;
typedef _Tp& reference;
typedef const _Tp& const_reference;
typedef _Tp value_type;
template<typename _Tp1> struct rebind { typedef commAllocator<_Tp1> other; };
commAllocator() throw() { }
commAllocator(const commAllocator&) throw() { }
template<typename _Tp1> commAllocator(const commAllocator<_Tp1>&) throw() { }
~commAllocator() throw() { }
pointer address(reference __x) const { return &__x; }
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
#ifdef GRID_COMMS_SHMEM
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
#ifdef CRAY
_Tp *ptr = (_Tp *) shmem_align(bytes,64);
#else
_Tp *ptr = (_Tp *) shmem_align(64,bytes);
#endif
#ifdef PARANOID_SYMMETRIC_HEAP
static void * bcast;
static long psync[_SHMEM_REDUCE_SYNC_SIZE];
bcast = (void *) ptr;
shmem_broadcast32((void *)&bcast,(void *)&bcast,sizeof(void *)/4,0,0,0,shmem_n_pes(),psync);
if ( bcast != ptr ) {
std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout);
// BACKTRACEFILE();
exit(0);
}
assert( bcast == (void *) ptr);
#endif
return ptr;
}
void deallocate(pointer __p, size_type __n) {
size_type bytes = __n*sizeof(_Tp);
profilerFree(bytes);
shmem_free((void *)__p);
}
#else
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
#ifdef HAVE_MM_MALLOC_H
_Tp * ptr = (_Tp *) _mm_malloc(bytes, GRID_ALLOC_ALIGN);
#else
_Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN, bytes);
#endif
uint8_t *cp = (uint8_t *)ptr;
if ( ptr ) {
// One touch per 4k page, static OMP loop to catch same loop order
#ifdef GRID_OMP
#pragma omp parallel for schedule(static)
#endif
for(size_type n=0;n<bytes;n+=4096){
cp[n]=0;
}
}
return ptr;
}
void deallocate(pointer __p, size_type __n) {
size_type bytes = __n*sizeof(_Tp);
profilerFree(bytes);
#ifdef HAVE_MM_MALLOC_H
_mm_free((void *)__p);
#else
free((void *)__p);
#endif
}
#endif
void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
template<typename _Tp> inline bool operator==(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return false; }
////////////////////////////////////////////////////////////////////////////////
// Template typedefs
////////////////////////////////////////////////////////////////////////////////
template<class T> using Vector = std::vector<T,alignedAllocator<T> >;
template<class T> using commVector = std::vector<T,commAllocator<T> >;
template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >;
}; // namespace Grid
#endif

View File

@ -1,174 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/cartesian/Cartesian_full.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CARTESIAN_FULL_H
#define GRID_CARTESIAN_FULL_H
namespace Grid{
/////////////////////////////////////////////////////////////////////////////////////////
// Grid Support.
/////////////////////////////////////////////////////////////////////////////////////////
class GridCartesian: public GridBase {
public:
int dummy;
virtual int CheckerBoardFromOindexTable (int Oindex) {
return 0;
}
virtual int CheckerBoardFromOindex (int Oindex)
{
return 0;
}
virtual int CheckerBoarded(int dim){
return 0;
}
virtual int CheckerBoard(const std::vector<int> &site){
return 0;
}
virtual int CheckerBoardDestination(int cb,int shift,int dim){
return 0;
}
virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift, int ocb){
return shift;
}
virtual int CheckerBoardShift(int source_cb,int dim,int shift, int osite){
return shift;
}
/////////////////////////////////////////////////////////////////////////
// Constructor takes a parent grid and possibly subdivides communicator.
/////////////////////////////////////////////////////////////////////////
GridCartesian(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid,
const GridCartesian &parent) : GridBase(processor_grid,parent,dummy)
{
Init(dimensions,simd_layout,processor_grid);
}
GridCartesian(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid,
const GridCartesian &parent,int &split_rank) : GridBase(processor_grid,parent,split_rank)
{
Init(dimensions,simd_layout,processor_grid);
}
/////////////////////////////////////////////////////////////////////////
// Construct from comm world
/////////////////////////////////////////////////////////////////////////
GridCartesian(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid) : GridBase(processor_grid)
{
Init(dimensions,simd_layout,processor_grid);
}
virtual ~GridCartesian() = default;
void Init(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid)
{
///////////////////////
// Grid information
///////////////////////
_isCheckerBoarded = false;
_ndimension = dimensions.size();
_fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension);
_ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension);
_lstart.resize(_ndimension);
_lend.resize(_ndimension);
_ostride.resize(_ndimension);
_istride.resize(_ndimension);
_fsites = _gsites = _osites = _isites = 1;
for (int d = 0; d < _ndimension; d++)
{
_fdimensions[d] = dimensions[d]; // Global dimensions
_gdimensions[d] = _fdimensions[d]; // Global dimensions
_simd_layout[d] = simd_layout[d];
_fsites = _fsites * _fdimensions[d];
_gsites = _gsites * _gdimensions[d];
// Use a reduced simd grid
_ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions
//std::cout << _ldimensions[d] << " " << _gdimensions[d] << " " << _processors[d] << std::endl;
assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
_rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
_lstart[d] = _processor_coor[d] * _ldimensions[d];
_lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
_osites *= _rdimensions[d];
_isites *= _simd_layout[d];
// Addressing support
if (d == 0)
{
_ostride[d] = 1;
_istride[d] = 1;
}
else
{
_ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
_istride[d] = _istride[d - 1] * _simd_layout[d - 1];
}
}
///////////////////////
// subplane information
///////////////////////
_slice_block.resize(_ndimension);
_slice_stride.resize(_ndimension);
_slice_nblock.resize(_ndimension);
int block = 1;
int nblock = 1;
for (int d = 0; d < _ndimension; d++)
nblock *= _rdimensions[d];
for (int d = 0; d < _ndimension; d++)
{
nblock /= _rdimensions[d];
_slice_block[d] = block;
_slice_stride[d] = _ostride[d] * _rdimensions[d];
_slice_nblock[d] = nblock;
block = block * _rdimensions[d];
}
};
};
}
#endif

View File

@ -1,76 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/Communicator_none.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <fcntl.h>
#include <unistd.h>
#include <limits.h>
#include <sys/mman.h>
namespace Grid {
///////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////
CartesianCommunicator::CommunicatorPolicy_t
CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
int CartesianCommunicator::nCommThreads = -1;
/////////////////////////////////
// Grid information queries
/////////////////////////////////
int CartesianCommunicator::Dimensions(void) { return _ndimension; };
int CartesianCommunicator::IsBoss(void) { return _processor==0; };
int CartesianCommunicator::BossRank(void) { return 0; };
int CartesianCommunicator::ThisRank(void) { return _processor; };
const std::vector<int> & CartesianCommunicator::ThisProcessorCoor(void) { return _processor_coor; };
const std::vector<int> & CartesianCommunicator::ProcessorGrid(void) { return _processors; };
int CartesianCommunicator::ProcessorCount(void) { return _Nprocessors; };
////////////////////////////////////////////////////////////////////////////////
// very VERY rarely (Log, serial RNG) we need world without a grid
////////////////////////////////////////////////////////////////////////////////
void CartesianCommunicator::GlobalSum(ComplexF &c)
{
GlobalSumVector((float *)&c,2);
}
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
{
GlobalSumVector((float *)c,2*N);
}
void CartesianCommunicator::GlobalSum(ComplexD &c)
{
GlobalSumVector((double *)&c,2);
}
void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
{
GlobalSumVector((double *)c,2*N);
}
}

View File

@ -1,207 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/Communicator_base.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_COMMUNICATOR_BASE_H
#define GRID_COMMUNICATOR_BASE_H
///////////////////////////////////
// Processor layout information
///////////////////////////////////
#include <Grid/communicator/SharedMemory.h>
namespace Grid {
class CartesianCommunicator : public SharedMemory {
public:
////////////////////////////////////////////
// Policies
////////////////////////////////////////////
enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
static CommunicatorPolicy_t CommunicatorPolicy;
static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
static int nCommThreads;
////////////////////////////////////////////
// Communicator should know nothing of the physics grid, only processor grid.
////////////////////////////////////////////
int _Nprocessors; // How many in all
std::vector<int> _processors; // Which dimensions get relayed out over processors lanes.
int _processor; // linear processor rank
std::vector<int> _processor_coor; // linear processor coordinate
unsigned long _ndimension;
static Grid_MPI_Comm communicator_world;
Grid_MPI_Comm communicator;
std::vector<Grid_MPI_Comm> communicator_halo;
////////////////////////////////////////////////
// Must call in Grid startup
////////////////////////////////////////////////
static void Init(int *argc, char ***argv);
////////////////////////////////////////////////
// Constructors to sub-divide a parent communicator
// and default to comm world
////////////////////////////////////////////////
CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank);
CartesianCommunicator(const std::vector<int> &pdimensions_in);
virtual ~CartesianCommunicator();
private:
////////////////////////////////////////////////
// Private initialise from an MPI communicator
// Can use after an MPI_Comm_split, but hidden from user so private
////////////////////////////////////////////////
void InitFromMPICommunicator(const std::vector<int> &processors, Grid_MPI_Comm communicator_base);
public:
////////////////////////////////////////////////////////////////////////////////////////
// Wraps MPI_Cart routines, or implements equivalent on other impls
////////////////////////////////////////////////////////////////////////////////////////
void ShiftedRanks(int dim,int shift,int & source, int & dest);
int RankFromProcessorCoor(std::vector<int> &coor);
void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
int Dimensions(void) ;
int IsBoss(void) ;
int BossRank(void) ;
int ThisRank(void) ;
const std::vector<int> & ThisProcessorCoor(void) ;
const std::vector<int> & ProcessorGrid(void) ;
int ProcessorCount(void) ;
////////////////////////////////////////////////////////////////////////////////
// very VERY rarely (Log, serial RNG) we need world without a grid
////////////////////////////////////////////////////////////////////////////////
static int RankWorld(void) ;
static void BroadcastWorld(int root,void* data, int bytes);
////////////////////////////////////////////////////////////
// Reduction
////////////////////////////////////////////////////////////
void GlobalSum(RealF &);
void GlobalSumVector(RealF *,int N);
void GlobalSum(RealD &);
void GlobalSumVector(RealD *,int N);
void GlobalSum(uint32_t &);
void GlobalSum(uint64_t &);
void GlobalSum(ComplexF &c);
void GlobalSumVector(ComplexF *c,int N);
void GlobalSum(ComplexD &c);
void GlobalSumVector(ComplexD *c,int N);
void GlobalXOR(uint32_t &);
void GlobalXOR(uint64_t &);
template<class obj> void GlobalSum(obj &o){
typedef typename obj::scalar_type scalar_type;
int words = sizeof(obj)/sizeof(scalar_type);
scalar_type * ptr = (scalar_type *)& o;
GlobalSumVector(ptr,words);
}
////////////////////////////////////////////////////////////
// Face exchange, buffer swap in translational invariant way
////////////////////////////////////////////////////////////
void SendToRecvFrom(void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);
void SendRecvPacket(void *xmit,
void *recv,
int xmit_to_rank,
int recv_from_rank,
int bytes);
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
double StencilSendToRecvFrom(void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes,int dir);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes,int dir);
void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
void StencilBarrier(void);
////////////////////////////////////////////////////////////
// Barrier
////////////////////////////////////////////////////////////
void Barrier(void);
////////////////////////////////////////////////////////////
// Broadcast a buffer and composite larger
////////////////////////////////////////////////////////////
void Broadcast(int root,void* data, int bytes);
////////////////////////////////////////////////////////////
// All2All down one dimension
////////////////////////////////////////////////////////////
template<class T> void AllToAll(int dim,std::vector<T> &in, std::vector<T> &out){
assert(dim>=0);
assert(dim<_ndimension);
assert(in.size()==out.size());
int numnode = _processors[dim];
uint64_t bytes=sizeof(T);
uint64_t words=in.size()/numnode;
assert(numnode * words == in.size());
assert(words < (1ULL<<31));
AllToAll(dim,(void *)&in[0],(void *)&out[0],words,bytes);
}
void AllToAll(int dim ,void *in,void *out,uint64_t words,uint64_t bytes);
void AllToAll(void *in,void *out,uint64_t words ,uint64_t bytes);
template<class obj> void Broadcast(int root,obj &data)
{
Broadcast(root,(void *)&data,sizeof(data));
};
};
}
#endif

View File

@ -1,514 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/Communicator_mpi.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <Grid/communicator/SharedMemory.h>
namespace Grid {
Grid_MPI_Comm CartesianCommunicator::communicator_world;
////////////////////////////////////////////
// First initialise of comms system
////////////////////////////////////////////
void CartesianCommunicator::Init(int *argc, char ***argv)
{
int flag;
int provided;
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
//If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
if( (nCommThreads == 1 && provided == MPI_THREAD_SINGLE) ||
(nCommThreads > 1 && provided != MPI_THREAD_MULTIPLE) )
assert(0);
}
Grid_quiesce_nodes();
// Never clean up as done once.
MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
GlobalSharedMemory::Init(communicator_world);
GlobalSharedMemory::SharedMemoryAllocate(
GlobalSharedMemory::MAX_MPI_SHM_BYTES,
GlobalSharedMemory::Hugepages);
}
///////////////////////////////////////////////////////////////////////////
// Use cartesian communicators now even in MPI3
///////////////////////////////////////////////////////////////////////////
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
assert(ierr==0);
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
int rank;
int ierr=MPI_Cart_rank (communicator, &coor[0], &rank);
assert(ierr==0);
return rank;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
coor.resize(_ndimension);
int ierr=MPI_Cart_coords (communicator, rank, _ndimension,&coor[0]);
assert(ierr==0);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////
// Initialises from communicator_world
////////////////////////////////////////////////////////////////////////////////////////////////////////
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
MPI_Comm optimal_comm;
////////////////////////////////////////////////////
// Remap using the shared memory optimising routine
// The remap creates a comm which must be freed
////////////////////////////////////////////////////
GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm);
InitFromMPICommunicator(processors,optimal_comm);
SetCommunicator(optimal_comm);
///////////////////////////////////////////////////
// Free the temp communicator
///////////////////////////////////////////////////
MPI_Comm_free(&optimal_comm);
}
//////////////////////////////////
// Try to subdivide communicator
//////////////////////////////////
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors,const CartesianCommunicator &parent,int &srank)
{
_ndimension = processors.size();
int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
std::vector<int> parent_processor_coor(_ndimension,0);
std::vector<int> parent_processors (_ndimension,1);
// Can make 5d grid from 4d etc...
int pad = _ndimension-parent_ndimension;
for(int d=0;d<parent_ndimension;d++){
parent_processor_coor[pad+d]=parent._processor_coor[d];
parent_processors [pad+d]=parent._processors[d];
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// split the communicator
//////////////////////////////////////////////////////////////////////////////////////////////////////
// int Nparent = parent._processors ;
// std::cout << " splitting from communicator "<<parent.communicator <<std::endl;
int Nparent;
MPI_Comm_size(parent.communicator,&Nparent);
// std::cout << " Parent size "<<Nparent <<std::endl;
int childsize=1;
for(int d=0;d<processors.size();d++) {
childsize *= processors[d];
}
int Nchild = Nparent/childsize;
assert (childsize * Nchild == Nparent);
// std::cout << " child size "<<childsize <<std::endl;
std::vector<int> ccoor(_ndimension); // coor within subcommunicator
std::vector<int> scoor(_ndimension); // coor of split within parent
std::vector<int> ssize(_ndimension); // coor of split within parent
for(int d=0;d<_ndimension;d++){
ccoor[d] = parent_processor_coor[d] % processors[d];
scoor[d] = parent_processor_coor[d] / processors[d];
ssize[d] = parent_processors[d] / processors[d];
}
// rank within subcomm ; srank is rank of subcomm within blocks of subcomms
int crank;
// Mpi uses the reverse Lexico convention to us; so reversed routines called
Lexicographic::IndexFromCoorReversed(ccoor,crank,processors); // processors is the split grid dimensions
Lexicographic::IndexFromCoorReversed(scoor,srank,ssize); // ssize is the number of split grids
MPI_Comm comm_split;
if ( Nchild > 1 ) {
if(0){
std::cout << GridLogMessage<<"Child communicator of "<< std::hex << parent.communicator << std::dec<<std::endl;
std::cout << GridLogMessage<<" parent grid["<< parent._ndimension<<"] ";
for(int d=0;d<parent._ndimension;d++) std::cout << parent._processors[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" child grid["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << processors[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" old rank "<< parent._processor<<" coor ["<< parent._ndimension <<"] ";
for(int d=0;d<parent._ndimension;d++) std::cout << parent._processor_coor[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" new split "<< srank<<" scoor ["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << scoor[d] << " ";
std::cout<<std::endl;
std::cout << GridLogMessage<<" new rank "<< crank<<" coor ["<< _ndimension <<"] ";
for(int d=0;d<processors.size();d++) std::cout << ccoor[d] << " ";
std::cout<<std::endl;
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Declare victory
//////////////////////////////////////////////////////////////////////////////////////////////////////
std::cout << GridLogMessage<<"Divided communicator "<< parent._Nprocessors<<" into "
<< Nchild <<" communicators with " << childsize << " ranks"<<std::endl;
std::cout << " Split communicator " <<comm_split <<std::endl;
}
////////////////////////////////////////////////////////////////
// Split the communicator
////////////////////////////////////////////////////////////////
int ierr= MPI_Comm_split(parent.communicator,srank,crank,&comm_split);
assert(ierr==0);
} else {
srank = 0;
int ierr = MPI_Comm_dup (parent.communicator,&comm_split);
assert(ierr==0);
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Set up from the new split communicator
//////////////////////////////////////////////////////////////////////////////////////////////////////
InitFromMPICommunicator(processors,comm_split);
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Take the right SHM buffers
//////////////////////////////////////////////////////////////////////////////////////////////////////
SetCommunicator(comm_split);
///////////////////////////////////////////////
// Free the temp communicator
///////////////////////////////////////////////
MPI_Comm_free(&comm_split);
if(0){
std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
for(int d=0;d<processors.size();d++){
std::cout << d<< " " << _processor_coor[d] <<" " << ccoor[d]<<std::endl;
}
}
for(int d=0;d<processors.size();d++){
assert(_processor_coor[d] == ccoor[d] );
}
}
void CartesianCommunicator::InitFromMPICommunicator(const std::vector<int> &processors, MPI_Comm communicator_base)
{
////////////////////////////////////////////////////
// Creates communicator, and the communicator_halo
////////////////////////////////////////////////////
_ndimension = processors.size();
_processor_coor.resize(_ndimension);
/////////////////////////////////
// Count the requested nodes
/////////////////////////////////
_Nprocessors=1;
_processors = processors;
for(int i=0;i<_ndimension;i++){
_Nprocessors*=_processors[i];
}
std::vector<int> periodic(_ndimension,1);
MPI_Cart_create(communicator_base, _ndimension,&_processors[0],&periodic[0],0,&communicator);
MPI_Comm_rank(communicator,&_processor);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
if ( 0 && (communicator_base != communicator_world) ) {
std::cout << "InitFromMPICommunicator Cartesian communicator created with a non-world communicator"<<std::endl;
std::cout << " new communicator rank "<<_processor<< " coor ["<<_ndimension<<"] ";
for(int d=0;d<_processors.size();d++){
std::cout << _processor_coor[d]<<" ";
}
std::cout << std::endl;
}
int Size;
MPI_Comm_size(communicator,&Size);
communicator_halo.resize (2*_ndimension);
for(int i=0;i<_ndimension*2;i++){
MPI_Comm_dup(communicator,&communicator_halo[i]);
}
assert(Size==_Nprocessors);
}
CartesianCommunicator::~CartesianCommunicator()
{
int MPI_is_finalised;
MPI_Finalized(&MPI_is_finalised);
if (communicator && !MPI_is_finalised) {
MPI_Comm_free(&communicator);
for(int i=0;i<communicator_halo.size();i++){
MPI_Comm_free(&communicator_halo[i]);
}
}
}
void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalXOR(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
std::vector<CommsRequest_t> reqs(0);
// unsigned long xcrc = crc32(0L, Z_NULL, 0);
// unsigned long rcrc = crc32(0L, Z_NULL, 0);
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
SendToRecvFromComplete(reqs);
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
// printf("proc %d SendToRecvFrom %d bytes %lx %lx\n",_processor,bytes,xcrc,rcrc);
}
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int sender,
int receiver,
int bytes)
{
MPI_Status stat;
assert(sender != receiver);
int tag = sender;
if ( _processor == sender ) {
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
}
if ( _processor == receiver ) {
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
}
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
int myrank = _processor;
int ierr;
if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
MPI_Request xrq;
MPI_Request rrq;
ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
assert(ierr==0);
list.push_back(xrq);
list.push_back(rrq);
} else {
// Give the CPU to MPI immediately; can use threads to overlap optionally
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
recv,bytes,MPI_CHAR,from, from,
communicator,MPI_STATUS_IGNORE);
assert(ierr==0);
}
}
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int dest,
void *recv,
int from,
int bytes,int dir)
{
std::vector<CommsRequest_t> list;
double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir);
StencilSendToRecvFromComplete(list,dir);
return offbytes;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes,int dir)
{
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
MPI_Request xrq;
MPI_Request rrq;
int ierr;
int gdest = ShmRanks[dest];
int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor];
assert(dest != _processor);
assert(from != _processor);
assert(gme == ShmRank);
double off_node_bytes=0.0;
if ( gfrom ==MPI_UNDEFINED) {
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[commdir],&rrq);
assert(ierr==0);
list.push_back(rrq);
off_node_bytes+=bytes;
}
if ( gdest == MPI_UNDEFINED ) {
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[commdir],&xrq);
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=bytes;
}
if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
this->StencilSendToRecvFromComplete(list,dir);
}
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{
SendToRecvFromComplete(waitall);
}
void CartesianCommunicator::StencilBarrier(void)
{
MPI_Barrier (ShmComm);
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
int nreq=list.size();
if (nreq==0) return;
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
list.resize(0);
}
void CartesianCommunicator::Barrier(void)
{
int ierr = MPI_Barrier(communicator);
assert(ierr==0);
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
int ierr=MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
communicator);
assert(ierr==0);
}
int CartesianCommunicator::RankWorld(void){
int r;
MPI_Comm_rank(communicator_world,&r);
return r;
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
int ierr= MPI_Bcast(data,
bytes,
MPI_BYTE,
root,
communicator_world);
assert(ierr==0);
}
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{
std::vector<int> row(_ndimension,1);
assert(dim>=0 && dim<_ndimension);
// Split the communicator
row[dim] = _processors[dim];
int me;
CartesianCommunicator Comm(row,*this,me);
Comm.AllToAll(in,out,words,bytes);
}
void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t bytes)
{
// MPI is a pain and uses "int" arguments
// 64*64*64*128*16 == 500Million elements of data.
// When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.
// (Turns up on 32^3 x 64 Gparity too)
MPI_Datatype object;
int iwords;
int ibytes;
iwords = words;
ibytes = bytes;
assert(words == iwords); // safe to cast to int ?
assert(bytes == ibytes); // safe to cast to int ?
MPI_Type_contiguous(ibytes,MPI_BYTE,&object);
MPI_Type_commit(&object);
MPI_Alltoall(in,iwords,object,out,iwords,object,communicator);
MPI_Type_free(&object);
}
}

View File

@ -1,92 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/SharedMemory.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
namespace Grid {
// static data
uint64_t GlobalSharedMemory::MAX_MPI_SHM_BYTES = 1024LL*1024LL*1024LL;
int GlobalSharedMemory::Hugepages = 0;
int GlobalSharedMemory::_ShmSetup;
int GlobalSharedMemory::_ShmAlloc;
uint64_t GlobalSharedMemory::_ShmAllocBytes;
std::vector<void *> GlobalSharedMemory::WorldShmCommBufs;
Grid_MPI_Comm GlobalSharedMemory::WorldShmComm;
int GlobalSharedMemory::WorldShmRank;
int GlobalSharedMemory::WorldShmSize;
std::vector<int> GlobalSharedMemory::WorldShmRanks;
Grid_MPI_Comm GlobalSharedMemory::WorldComm;
int GlobalSharedMemory::WorldSize;
int GlobalSharedMemory::WorldRank;
int GlobalSharedMemory::WorldNodes;
int GlobalSharedMemory::WorldNode;
void GlobalSharedMemory::SharedMemoryFree(void)
{
assert(_ShmAlloc);
assert(_ShmAllocBytes>0);
for(int r=0;r<WorldShmSize;r++){
munmap(WorldShmCommBufs[r],_ShmAllocBytes);
}
_ShmAlloc = 0;
_ShmAllocBytes = 0;
}
/////////////////////////////////
// Alloc, free shmem region
/////////////////////////////////
void *SharedMemory::ShmBufferMalloc(size_t bytes){
// bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes
void *ptr = (void *)heap_top;
heap_top += bytes;
heap_bytes+= bytes;
if (heap_bytes >= heap_size) {
std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
std::cout<< " Current value is " << (heap_size/(1024*1024)) <<std::endl;
assert(heap_bytes<heap_size);
}
return ptr;
}
void SharedMemory::ShmBufferFreeAll(void) {
heap_top =(size_t)ShmBufferSelf();
heap_bytes=0;
}
void *SharedMemory::ShmBufferSelf(void)
{
return ShmCommBufs[ShmRank];
}
}

View File

@ -1,165 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/SharedMemory.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
// TODO
// 1) move includes into SharedMemory.cc
//
// 2) split shared memory into a) optimal communicator creation from comm world
//
// b) shared memory buffers container
// -- static globally shared; init once
// -- per instance set of buffers.
//
#pragma once
#include <Grid/GridCore.h>
#if defined (GRID_COMMS_MPI3)
#include <mpi.h>
#endif
#include <semaphore.h>
#include <fcntl.h>
#include <unistd.h>
#include <limits.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/mman.h>
#include <zlib.h>
#ifdef HAVE_NUMAIF_H
#include <numaif.h>
#endif
namespace Grid {
#if defined (GRID_COMMS_MPI3)
typedef MPI_Comm Grid_MPI_Comm;
typedef MPI_Request CommsRequest_t;
#else
typedef int CommsRequest_t;
typedef int Grid_MPI_Comm;
#endif
class GlobalSharedMemory {
private:
static const int MAXLOG2RANKSPERNODE = 16;
// Init once lock on the buffer allocation
static int _ShmSetup;
static int _ShmAlloc;
static uint64_t _ShmAllocBytes;
public:
static int ShmSetup(void) { return _ShmSetup; }
static int ShmAlloc(void) { return _ShmAlloc; }
static uint64_t ShmAllocBytes(void) { return _ShmAllocBytes; }
static uint64_t MAX_MPI_SHM_BYTES;
static int Hugepages;
static std::vector<void *> WorldShmCommBufs;
static Grid_MPI_Comm WorldComm;
static int WorldRank;
static int WorldSize;
static Grid_MPI_Comm WorldShmComm;
static int WorldShmRank;
static int WorldShmSize;
static int WorldNodes;
static int WorldNode;
static std::vector<int> WorldShmRanks;
//////////////////////////////////////////////////////////////////////////////////////
// Create an optimal reordered communicator that makes MPI_Cart_create get it right
//////////////////////////////////////////////////////////////////////////////////////
static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD
static void OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
///////////////////////////////////////////////////
// Provide shared memory facilities off comm world
///////////////////////////////////////////////////
static void SharedMemoryAllocate(uint64_t bytes, int flags);
static void SharedMemoryFree(void);
};
//////////////////////////////
// one per communicator
//////////////////////////////
class SharedMemory
{
private:
static const int MAXLOG2RANKSPERNODE = 16;
size_t heap_top;
size_t heap_bytes;
size_t heap_size;
protected:
Grid_MPI_Comm ShmComm; // for barriers
int ShmRank;
int ShmSize;
std::vector<void *> ShmCommBufs;
std::vector<int> ShmRanks;// Mapping comm ranks to Shm ranks
public:
SharedMemory() {};
~SharedMemory();
///////////////////////////////////////////////////////////////////////////////////////
// set the buffers & sizes
///////////////////////////////////////////////////////////////////////////////////////
void SetCommunicator(Grid_MPI_Comm comm);
////////////////////////////////////////////////////////////////////////
// For this instance ; disjoint buffer sets between splits if split grid
////////////////////////////////////////////////////////////////////////
void ShmBarrier(void);
///////////////////////////////////////////////////
// Call on any instance
///////////////////////////////////////////////////
void SharedMemoryTest(void);
void *ShmBufferSelf(void);
void *ShmBuffer (int rank);
void *ShmBufferTranslate(int rank,void * local_p);
void *ShmBufferMalloc(size_t bytes);
void ShmBufferFreeAll(void) ;
//////////////////////////////////////////////////////////////////////////
// Make info on Nodes & ranks and Shared memory available
//////////////////////////////////////////////////////////////////////////
int NodeCount(void) { return GlobalSharedMemory::WorldNodes;};
int RankCount(void) { return GlobalSharedMemory::WorldSize;};
};
}

View File

@ -1,651 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/SharedMemory.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
#include <pwd.h>
namespace Grid {
/*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
{
assert(_ShmSetup==0);
WorldComm = comm;
MPI_Comm_rank(WorldComm,&WorldRank);
MPI_Comm_size(WorldComm,&WorldSize);
// WorldComm, WorldSize, WorldRank
/////////////////////////////////////////////////////////////////////
// Split into groups that can share memory
/////////////////////////////////////////////////////////////////////
MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
MPI_Comm_rank(WorldShmComm ,&WorldShmRank);
MPI_Comm_size(WorldShmComm ,&WorldShmSize);
// WorldShmComm, WorldShmSize, WorldShmRank
// WorldNodes
WorldNodes = WorldSize/WorldShmSize;
assert( (WorldNodes * WorldShmSize) == WorldSize );
// FIXME: Check all WorldShmSize are the same ?
/////////////////////////////////////////////////////////////////////
// find world ranks in our SHM group (i.e. which ranks are on our node)
/////////////////////////////////////////////////////////////////////
MPI_Group WorldGroup, ShmGroup;
MPI_Comm_group (WorldComm, &WorldGroup);
MPI_Comm_group (WorldShmComm, &ShmGroup);
std::vector<int> world_ranks(WorldSize); for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
WorldShmRanks.resize(WorldSize);
MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &WorldShmRanks[0]);
///////////////////////////////////////////////////////////////////
// Identify who is in my group and nominate the leader
///////////////////////////////////////////////////////////////////
int g=0;
std::vector<int> MyGroup;
MyGroup.resize(WorldShmSize);
for(int rank=0;rank<WorldSize;rank++){
if(WorldShmRanks[rank]!=MPI_UNDEFINED){
assert(g<WorldShmSize);
MyGroup[g++] = rank;
}
}
std::sort(MyGroup.begin(),MyGroup.end(),std::less<int>());
int myleader = MyGroup[0];
std::vector<int> leaders_1hot(WorldSize,0);
std::vector<int> leaders_group(WorldNodes,0);
leaders_1hot [ myleader ] = 1;
///////////////////////////////////////////////////////////////////
// global sum leaders over comm world
///////////////////////////////////////////////////////////////////
int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,WorldComm);
assert(ierr==0);
///////////////////////////////////////////////////////////////////
// find the group leaders world rank
///////////////////////////////////////////////////////////////////
int group=0;
for(int l=0;l<WorldSize;l++){
if(leaders_1hot[l]){
leaders_group[group++] = l;
}
}
///////////////////////////////////////////////////////////////////
// Identify the node of the group in which I (and my leader) live
///////////////////////////////////////////////////////////////////
WorldNode=-1;
for(int g=0;g<WorldNodes;g++){
if (myleader == leaders_group[g]){
WorldNode=g;
}
}
assert(WorldNode!=-1);
_ShmSetup=1;
}
// Gray encode support
int BinaryToGray (int binary) {
int gray = (binary>>1)^binary;
return gray;
}
int Log2Size(int TwoToPower,int MAXLOG2)
{
int log2size = -1;
for(int i=0;i<=MAXLOG2;i++){
if ( (0x1<<i) == TwoToPower ) {
log2size = i;
break;
}
}
return log2size;
}
void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
{
#ifdef HYPERCUBE
////////////////////////////////////////////////////////////////
// Assert power of two shm_size.
////////////////////////////////////////////////////////////////
int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
assert(log2size != -1);
////////////////////////////////////////////////////////////////
// Identify the hypercube coordinate of this node using hostname
////////////////////////////////////////////////////////////////
// n runs 0...7 9...16 18...25 27...34 (8*4) 5 bits
// i runs 0..7 3 bits
// r runs 0..3 2 bits
// 2^10 = 1024 nodes
const int maxhdim = 10;
std::vector<int> HyperCubeCoords(maxhdim,0);
std::vector<int> RootHyperCubeCoords(maxhdim,0);
int R;
int I;
int N;
const int namelen = _POSIX_HOST_NAME_MAX;
char name[namelen];
// Parse ICE-XA hostname to get hypercube location
gethostname(name,namelen);
int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
assert(nscan==3);
int nlo = N%9;
int nhi = N/9;
uint32_t hypercoor = (R<<8)|(I<<5)|(nhi<<3)|nlo ;
uint32_t rootcoor = hypercoor;
//////////////////////////////////////////////////////////////////
// Print debug info
//////////////////////////////////////////////////////////////////
for(int d=0;d<maxhdim;d++){
HyperCubeCoords[d] = (hypercoor>>d)&0x1;
}
std::string hname(name);
std::cout << "hostname "<<hname<<std::endl;
std::cout << "R " << R << " I " << I << " N "<< N
<< " hypercoor 0x"<<std::hex<<hypercoor<<std::dec<<std::endl;
//////////////////////////////////////////////////////////////////
// broadcast node 0's base coordinate for this partition.
//////////////////////////////////////////////////////////////////
MPI_Bcast(&rootcoor, sizeof(rootcoor), MPI_BYTE, 0, WorldComm);
hypercoor=hypercoor-rootcoor;
assert(hypercoor<WorldSize);
assert(hypercoor>=0);
//////////////////////////////////////
// Printing
//////////////////////////////////////
for(int d=0;d<maxhdim;d++){
HyperCubeCoords[d] = (hypercoor>>d)&0x1;
}
////////////////////////////////////////////////////////////////
// Identify subblock of ranks on node spreading across dims
// in a maximally symmetrical way
////////////////////////////////////////////////////////////////
int ndimension = processors.size();
std::vector<int> processor_coor(ndimension);
std::vector<int> WorldDims = processors; std::vector<int> ShmDims (ndimension,1); std::vector<int> NodeDims (ndimension);
std::vector<int> ShmCoor (ndimension); std::vector<int> NodeCoor (ndimension); std::vector<int> WorldCoor(ndimension);
std::vector<int> HyperCoor(ndimension);
int dim = 0;
for(int l2=0;l2<log2size;l2++){
while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
ShmDims[dim]*=2;
dim=(dim+1)%ndimension;
}
////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings
////////////////////////////////////////////////////////////////
for(int d=0;d<ndimension;d++){
NodeDims[d] = WorldDims[d]/ShmDims[d];
}
////////////////////////////////////////////////////////////////
// Map Hcube according to physical lattice
// must partition. Loop over dims and find out who would join.
////////////////////////////////////////////////////////////////
int hcoor = hypercoor;
for(int d=0;d<ndimension;d++){
int bits = Log2Size(NodeDims[d],MAXLOG2RANKSPERNODE);
int msk = (0x1<<bits)-1;
HyperCoor[d]=hcoor & msk;
HyperCoor[d]=BinaryToGray(HyperCoor[d]); // Space filling curve magic
hcoor = hcoor >> bits;
}
////////////////////////////////////////////////////////////////
// Check processor counts match
////////////////////////////////////////////////////////////////
int Nprocessors=1;
for(int i=0;i<ndimension;i++){
Nprocessors*=processors[i];
}
assert(WorldSize==Nprocessors);
////////////////////////////////////////////////////////////////
// Establish mapping between lexico physics coord and WorldRank
////////////////////////////////////////////////////////////////
int rank;
Lexicographic::CoorFromIndexReversed(NodeCoor,WorldNode ,NodeDims);
for(int d=0;d<ndimension;d++) NodeCoor[d]=HyperCoor[d];
Lexicographic::CoorFromIndexReversed(ShmCoor ,WorldShmRank,ShmDims);
for(int d=0;d<ndimension;d++) WorldCoor[d] = NodeCoor[d]*ShmDims[d]+ShmCoor[d];
Lexicographic::IndexFromCoorReversed(WorldCoor,rank,WorldDims);
/////////////////////////////////////////////////////////////////
// Build the new communicator
/////////////////////////////////////////////////////////////////
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
assert(ierr==0);
#else
////////////////////////////////////////////////////////////////
// Assert power of two shm_size.
////////////////////////////////////////////////////////////////
int log2size = Log2Size(WorldShmSize,MAXLOG2RANKSPERNODE);
assert(log2size != -1);
////////////////////////////////////////////////////////////////
// Identify subblock of ranks on node spreading across dims
// in a maximally symmetrical way
////////////////////////////////////////////////////////////////
int ndimension = processors.size();
std::vector<int> processor_coor(ndimension);
std::vector<int> WorldDims = processors; std::vector<int> ShmDims (ndimension,1); std::vector<int> NodeDims (ndimension);
std::vector<int> ShmCoor (ndimension); std::vector<int> NodeCoor (ndimension); std::vector<int> WorldCoor(ndimension);
int dim = 0;
for(int l2=0;l2<log2size;l2++){
while ( (WorldDims[dim] / ShmDims[dim]) <= 1 ) dim=(dim+1)%ndimension;
ShmDims[dim]*=2;
dim=(dim+1)%ndimension;
}
////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings
////////////////////////////////////////////////////////////////
for(int d=0;d<ndimension;d++){
NodeDims[d] = WorldDims[d]/ShmDims[d];
}
////////////////////////////////////////////////////////////////
// Check processor counts match
////////////////////////////////////////////////////////////////
int Nprocessors=1;
for(int i=0;i<ndimension;i++){
Nprocessors*=processors[i];
}
assert(WorldSize==Nprocessors);
////////////////////////////////////////////////////////////////
// Establish mapping between lexico physics coord and WorldRank
////////////////////////////////////////////////////////////////
int rank;
Lexicographic::CoorFromIndexReversed(NodeCoor,WorldNode ,NodeDims);
Lexicographic::CoorFromIndexReversed(ShmCoor ,WorldShmRank,ShmDims);
for(int d=0;d<ndimension;d++) WorldCoor[d] = NodeCoor[d]*ShmDims[d]+ShmCoor[d];
Lexicographic::IndexFromCoorReversed(WorldCoor,rank,WorldDims);
/////////////////////////////////////////////////////////////////
// Build the new communicator
/////////////////////////////////////////////////////////////////
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
assert(ierr==0);
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////
// SHMGET
////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_MPI3_SHMGET
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared windows for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize);
std::vector<int> shmids(WorldShmSize);
if ( WorldShmRank == 0 ) {
for(int r=0;r<WorldShmSize;r++){
size_t size = bytes;
key_t key = IPC_PRIVATE;
int flags = IPC_CREAT | SHM_R | SHM_W;
#ifdef SHM_HUGETLB
if (Hugepages) flags|=SHM_HUGETLB;
#endif
if ((shmids[r]= shmget(key,size, flags)) ==-1) {
int errsv = errno;
printf("Errno %d\n",errsv);
printf("key %d\n",key);
printf("size %lld\n",size);
printf("flags %d\n",flags);
perror("shmget");
exit(1);
}
}
}
MPI_Barrier(WorldShmComm);
MPI_Bcast(&shmids[0],WorldShmSize*sizeof(int),MPI_BYTE,0,WorldShmComm);
MPI_Barrier(WorldShmComm);
for(int r=0;r<WorldShmSize;r++){
WorldShmCommBufs[r] = (uint64_t *)shmat(shmids[r], NULL,0);
if (WorldShmCommBufs[r] == (uint64_t *)-1) {
perror("Shared memory attach failure");
shmctl(shmids[r], IPC_RMID, NULL);
exit(2);
}
}
MPI_Barrier(WorldShmComm);
///////////////////////////////////
// Mark for clean up
///////////////////////////////////
for(int r=0;r<WorldShmSize;r++){
shmctl(shmids[r], IPC_RMID,(struct shmid_ds *)NULL);
}
MPI_Barrier(WorldShmComm);
_ShmAlloc=1;
_ShmAllocBytes = bytes;
}
#endif
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended
////////////////////////////////////////////////////////////////////////////////////////////
#ifdef GRID_MPI3_SHMMMAP
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared windows for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize);
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs and others map filesystems as mappable huge pages
////////////////////////////////////////////////////////////////////////////////////////////
char shm_name [NAME_MAX];
for(int r=0;r<WorldShmSize;r++){
sprintf(shm_name,GRID_SHM_PATH "/Grid_mpi3_shm_%d_%d",WorldNode,r);
int fd=open(shm_name,O_RDWR|O_CREAT,0666);
if ( fd == -1) {
printf("open %s failed\n",shm_name);
perror("open hugetlbfs");
exit(0);
}
int mmap_flag = MAP_SHARED ;
#ifdef MAP_POPULATE
mmap_flag|=MAP_POPULATE;
#endif
#ifdef MAP_HUGETLB
if ( flags ) mmap_flag |= MAP_HUGETLB;
#endif
void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
if ( ptr == (void *)MAP_FAILED ) {
printf("mmap %s failed\n",shm_name);
perror("failed mmap"); assert(0);
}
assert(((uint64_t)ptr&0x3F)==0);
close(fd);
WorldShmCommBufs[r] =ptr;
std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
}
_ShmAlloc=1;
_ShmAllocBytes = bytes;
};
#endif // MMAP
#ifdef GRID_MPI3_SHM_NONE
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the shared windows for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize);
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbf and others map filesystems as mappable huge pages
////////////////////////////////////////////////////////////////////////////////////////////
char shm_name [NAME_MAX];
assert(WorldShmSize == 1);
for(int r=0;r<WorldShmSize;r++){
int fd=-1;
int mmap_flag = MAP_SHARED |MAP_ANONYMOUS ;
#ifdef MAP_POPULATE
mmap_flag|=MAP_POPULATE;
#endif
#ifdef MAP_HUGETLB
if ( flags ) mmap_flag |= MAP_HUGETLB;
#endif
void *ptr = (void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag,fd, 0);
if ( ptr == (void *)MAP_FAILED ) {
printf("mmap %s failed\n",shm_name);
perror("failed mmap"); assert(0);
}
assert(((uint64_t)ptr&0x3F)==0);
close(fd);
WorldShmCommBufs[r] =ptr;
std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
}
_ShmAlloc=1;
_ShmAllocBytes = bytes;
};
#endif // MMAP
#ifdef GRID_MPI3_SHMOPEN
////////////////////////////////////////////////////////////////////////////////////////////
// POSIX SHMOPEN ; as far as I know Linux does not allow EXPLICIT HugePages with this case
// tmpfs (Larry Meadows says) does not support explicit huge page, and this is used for
// the posix shm virtual file system
////////////////////////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize);
char shm_name [NAME_MAX];
if ( WorldShmRank == 0 ) {
for(int r=0;r<WorldShmSize;r++){
size_t size = bytes;
struct passwd *pw = getpwuid (getuid());
sprintf(shm_name,"/Grid_%s_mpi3_shm_%d_%d",pw->pw_name,WorldNode,r);
shm_unlink(shm_name);
int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666);
if ( fd < 0 ) { perror("failed shm_open"); assert(0); }
ftruncate(fd, size);
int mmap_flag = MAP_SHARED;
#ifdef MAP_POPULATE
mmap_flag |= MAP_POPULATE;
#endif
#ifdef MAP_HUGETLB
if (flags) mmap_flag |= MAP_HUGETLB;
#endif
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
if ( ptr == (void * )MAP_FAILED ) {
perror("failed mmap");
assert(0);
}
assert(((uint64_t)ptr&0x3F)==0);
WorldShmCommBufs[r] =ptr;
close(fd);
}
}
MPI_Barrier(WorldShmComm);
if ( WorldShmRank != 0 ) {
for(int r=0;r<WorldShmSize;r++){
size_t size = bytes ;
struct passwd *pw = getpwuid (getuid());
sprintf(shm_name,"/Grid_%s_mpi3_shm_%d_%d",pw->pw_name,WorldNode,r);
int fd=shm_open(shm_name,O_RDWR,0666);
if ( fd<0 ) { perror("failed shm_open"); assert(0); }
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
assert(((uint64_t)ptr&0x3F)==0);
WorldShmCommBufs[r] =ptr;
close(fd);
}
}
_ShmAlloc=1;
_ShmAllocBytes = bytes;
}
#endif
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality
////////////////////////////////////////////////////////
void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
{
int rank, size;
MPI_Comm_rank(comm,&rank);
MPI_Comm_size(comm,&size);
ShmRanks.resize(size);
/////////////////////////////////////////////////////////////////////
// Split into groups that can share memory
/////////////////////////////////////////////////////////////////////
MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
MPI_Comm_rank(ShmComm ,&ShmRank);
MPI_Comm_size(ShmComm ,&ShmSize);
ShmCommBufs.resize(ShmSize);
//////////////////////////////////////////////////////////////////////
// Map ShmRank to WorldShmRank and use the right buffer
//////////////////////////////////////////////////////////////////////
assert (GlobalSharedMemory::ShmAlloc()==1);
heap_size = GlobalSharedMemory::ShmAllocBytes();
for(int r=0;r<ShmSize;r++){
uint32_t wsr = (r==ShmRank) ? GlobalSharedMemory::WorldShmRank : 0 ;
MPI_Allreduce(MPI_IN_PLACE,&wsr,1,MPI_UINT32_T,MPI_SUM,ShmComm);
ShmCommBufs[r] = GlobalSharedMemory::WorldShmCommBufs[wsr];
// std::cout << "SetCommunicator ShmCommBufs ["<< r<< "] = "<< ShmCommBufs[r]<< " wsr = "<<wsr<<std::endl;
}
ShmBufferFreeAll();
/////////////////////////////////////////////////////////////////////
// find comm ranks in our SHM group (i.e. which ranks are on our node)
/////////////////////////////////////////////////////////////////////
MPI_Group FullGroup, ShmGroup;
MPI_Comm_group (comm , &FullGroup);
MPI_Comm_group (ShmComm, &ShmGroup);
std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r;
MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]);
}
//////////////////////////////////////////////////////////////////
// On node barrier
//////////////////////////////////////////////////////////////////
void SharedMemory::ShmBarrier(void)
{
MPI_Barrier (ShmComm);
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// Test the shared memory is working
//////////////////////////////////////////////////////////////////////////////////////////////////////////
void SharedMemory::SharedMemoryTest(void)
{
ShmBarrier();
if ( ShmRank == 0 ) {
for(int r=0;r<ShmSize;r++){
uint64_t * check = (uint64_t *) ShmCommBufs[r];
check[0] = GlobalSharedMemory::WorldNode;
check[1] = r;
check[2] = 0x5A5A5A;
}
}
ShmBarrier();
for(int r=0;r<ShmSize;r++){
uint64_t * check = (uint64_t *) ShmCommBufs[r];
assert(check[0]==GlobalSharedMemory::WorldNode);
assert(check[1]==r);
assert(check[2]==0x5A5A5A);
}
ShmBarrier();
}
void *SharedMemory::ShmBuffer(int rank)
{
int gpeer = ShmRanks[rank];
if (gpeer == MPI_UNDEFINED){
return NULL;
} else {
return ShmCommBufs[gpeer];
}
}
void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
{
static int count =0;
int gpeer = ShmRanks[rank];
assert(gpeer!=ShmRank); // never send to self
if (gpeer == MPI_UNDEFINED){
return NULL;
} else {
uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank];
uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset;
return (void *) remote;
}
}
SharedMemory::~SharedMemory()
{
int MPI_is_finalised; MPI_Finalized(&MPI_is_finalised);
if ( !MPI_is_finalised ) {
MPI_Comm_free(&ShmComm);
}
};
}

View File

@ -1,128 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/SharedMemory.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/GridCore.h>
namespace Grid {
/*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
{
assert(_ShmSetup==0);
WorldComm = 0;
WorldRank = 0;
WorldSize = 1;
WorldShmComm = 0 ;
WorldShmRank = 0 ;
WorldShmSize = 1 ;
WorldNodes = 1 ;
WorldNode = 0 ;
WorldShmRanks.resize(WorldSize); WorldShmRanks[0] = 0;
WorldShmCommBufs.resize(1);
_ShmSetup=1;
}
void GlobalSharedMemory::OptimalCommunicator(const std::vector<int> &processors,Grid_MPI_Comm & optimal_comm)
{
optimal_comm = WorldComm;
}
////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended, use anonymous mmap
////////////////////////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
void * ShmCommBuf ;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
int mmap_flag =0;
#ifdef MAP_ANONYMOUS
mmap_flag = mmap_flag| MAP_SHARED | MAP_ANONYMOUS;
#endif
#ifdef MAP_ANON
mmap_flag = mmap_flag| MAP_SHARED | MAP_ANON;
#endif
#ifdef MAP_HUGETLB
if ( flags ) mmap_flag |= MAP_HUGETLB;
#endif
ShmCommBuf =(void *) mmap(NULL, bytes, PROT_READ | PROT_WRITE, mmap_flag, -1, 0);
if (ShmCommBuf == (void *)MAP_FAILED) {
perror("mmap failed ");
exit(EXIT_FAILURE);
}
#ifdef MADV_HUGEPAGE
if (!Hugepages ) madvise(ShmCommBuf,bytes,MADV_HUGEPAGE);
#endif
bzero(ShmCommBuf,bytes);
WorldShmCommBufs[0] = ShmCommBuf;
_ShmAllocBytes=bytes;
_ShmAlloc=1;
};
////////////////////////////////////////////////////////
// Global shared functionality finished
// Now move to per communicator functionality
////////////////////////////////////////////////////////
void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
{
assert(GlobalSharedMemory::ShmAlloc()==1);
ShmRanks.resize(1);
ShmCommBufs.resize(1);
ShmRanks[0] = 0;
ShmRank = 0;
ShmSize = 1;
//////////////////////////////////////////////////////////////////////
// Map ShmRank to WorldShmRank and use the right buffer
//////////////////////////////////////////////////////////////////////
ShmCommBufs[0] = GlobalSharedMemory::WorldShmCommBufs[0];
heap_size = GlobalSharedMemory::ShmAllocBytes();
ShmBufferFreeAll();
return;
}
//////////////////////////////////////////////////////////////////
// On node barrier
//////////////////////////////////////////////////////////////////
void SharedMemory::ShmBarrier(void){ return ; }
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// Test the shared memory is working
//////////////////////////////////////////////////////////////////////////////////////////////////////////
void SharedMemory::SharedMemoryTest(void) { return; }
void *SharedMemory::ShmBuffer(int rank)
{
return NULL;
}
void *SharedMemory::ShmBufferTranslate(int rank,void * local_p)
{
return NULL;
}
SharedMemory::~SharedMemory()
{};
}

File diff suppressed because it is too large Load Diff

View File

@ -1,733 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_reduction.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_REDUCTION_H
#define GRID_LATTICE_REDUCTION_H
#include <Grid/Grid_Eigen_Dense.h>
namespace Grid {
#ifdef GRID_WARN_SUBOPTIMAL
#warning "Optimisation alert all these reduction loops are NOT threaded "
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////
// Deterministic Reduction operations
////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
auto nrm = innerProduct(arg,arg);
return std::real(nrm);
}
// Double inner product
template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
{
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type;
GridBase *grid = left._grid;
const int pad = 8;
ComplexD inner;
Vector<ComplexD> sumarray(grid->SumArraySize()*pad);
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
int nwork, mywork, myoff;
GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
decltype(innerProductD(left._odata[0],right._odata[0])) vinner=zero; // private to thread; sub summation
for(int ss=myoff;ss<mywork+myoff; ss++){
vinner = vinner + innerProductD(left._odata[ss],right._odata[ss]);
}
// All threads sum across SIMD; reduce serial work at end
// one write per cacheline with streaming store
ComplexD tmp = Reduce(TensorRemove(vinner)) ;
vstream(sumarray[thr*pad],tmp);
}
inner=0.0;
for(int i=0;i<grid->SumArraySize();i++){
inner = inner+sumarray[i*pad];
}
right._grid->GlobalSum(inner);
return inner;
}
/////////////////////////
// Fast axpby_norm
// z = a x + b y
// return norm z
/////////////////////////
template<class sobj,class vobj> strong_inline RealD
axpy_norm_fast(Lattice<vobj> &z,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
sobj one(1.0);
return axpby_norm_fast(z,a,one,x,y);
}
template<class sobj,class vobj> strong_inline RealD
axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
const int pad = 8;
z.checkerboard = x.checkerboard;
conformable(z,x);
conformable(x,y);
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_typeD vector_type;
RealD nrm;
GridBase *grid = x._grid;
Vector<RealD> sumarray(grid->SumArraySize()*pad);
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
int nwork, mywork, myoff;
GridThread::GetWork(x._grid->oSites(),thr,mywork,myoff);
// private to thread; sub summation
decltype(innerProductD(z._odata[0],z._odata[0])) vnrm=zero;
for(int ss=myoff;ss<mywork+myoff; ss++){
vobj tmp = a*x._odata[ss]+b*y._odata[ss];
vnrm = vnrm + innerProductD(tmp,tmp);
vstream(z._odata[ss],tmp);
}
vstream(sumarray[thr*pad],real(Reduce(TensorRemove(vnrm)))) ;
}
nrm = 0.0; // sum across threads; linear in thread count but fast
for(int i=0;i<grid->SumArraySize();i++){
nrm = nrm+sumarray[i*pad];
}
z._grid->GlobalSum(nrm);
return nrm;
}
template<class Op,class T1>
inline auto sum(const LatticeUnaryExpression<Op,T1> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second))))::scalar_object
{
return sum(closure(expr));
}
template<class Op,class T1,class T2>
inline auto sum(const LatticeBinaryExpression<Op,T1,T2> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),eval(0,std::get<1>(expr.second))))::scalar_object
{
return sum(closure(expr));
}
template<class Op,class T1,class T2,class T3>
inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
->typename decltype(expr.first.func(eval(0,std::get<0>(expr.second)),
eval(0,std::get<1>(expr.second)),
eval(0,std::get<2>(expr.second))
))::scalar_object
{
return sum(closure(expr));
}
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{
GridBase *grid=arg._grid;
int Nsimd = grid->Nsimd();
std::vector<vobj,alignedAllocator<vobj> > sumarray(grid->SumArraySize());
for(int i=0;i<grid->SumArraySize();i++){
sumarray[i]=zero;
}
parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
int nwork, mywork, myoff;
GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
vobj vvsum=zero;
for(int ss=myoff;ss<mywork+myoff; ss++){
vvsum = vvsum + arg._odata[ss];
}
sumarray[thr]=vvsum;
}
vobj vsum=zero; // sum across threads
for(int i=0;i<grid->SumArraySize();i++){
vsum = vsum+sumarray[i];
}
typedef typename vobj::scalar_object sobj;
sobj ssum=zero;
std::vector<sobj> buf(Nsimd);
extract(vsum,buf);
for(int i=0;i<Nsimd;i++) ssum = ssum + buf[i];
arg._grid->GlobalSum(ssum);
return ssum;
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
{
///////////////////////////////////////////////////////
// FIXME precision promoted summation
// may be important for correlation functions
// But easily avoided by using double precision fields
///////////////////////////////////////////////////////
typedef typename vobj::scalar_object sobj;
GridBase *grid = Data._grid;
assert(grid!=NULL);
const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd();
assert(orthogdim >= 0);
assert(orthogdim < Nd);
int fd=grid->_fdimensions[orthogdim];
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
std::vector<vobj,alignedAllocator<vobj> > lvSum(rd); // will locally sum vectors first
std::vector<sobj> lsSum(ld,zero); // sum across these down to scalars
std::vector<sobj> extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node
for(int r=0;r<rd;r++){
lvSum[r]=zero;
}
int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim];
// sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction
parallel_for(int r=0;r<rd;r++){
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss= so+n*stride+b;
lvSum[r]=lvSum[r]+Data._odata[ss];
}
}
}
// Sum across simd lanes in the plane, breaking out orthog dir.
std::vector<int> icoor(Nd);
for(int rt=0;rt<rd;rt++){
extract(lvSum[rt],extracted);
for(int idx=0;idx<Nsimd;idx++){
grid->iCoorFromIindex(icoor,idx);
int ldx =rt+icoor[orthogdim]*rd;
lsSum[ldx]=lsSum[ldx]+extracted[idx];
}
}
// sum over nodes.
sobj gsum;
for(int t=0;t<fd;t++){
int pt = t/ld; // processor plane
int lt = t%ld;
if ( pt == grid->_processor_coor[orthogdim] ) {
gsum=lsSum[lt];
} else {
gsum=zero;
}
grid->GlobalSum(gsum);
result[t]=gsum;
}
}
template<class vobj>
static void mySliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
{
// std::cout << GridLogMessage << "Start mySliceInnerProductVector" << std::endl;
typedef typename vobj::scalar_type scalar_type;
std::vector<scalar_type> lsSum;
localSliceInnerProductVector(result, lhs, rhs, lsSum, orthogdim);
globalSliceInnerProductVector(result, lhs, lsSum, orthogdim);
// std::cout << GridLogMessage << "End mySliceInnerProductVector" << std::endl;
}
template <class vobj>
static void localSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, const Lattice<vobj> &rhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
{
// std::cout << GridLogMessage << "Start prep" << std::endl;
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid = lhs._grid;
assert(grid!=NULL);
conformable(grid,rhs._grid);
const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd();
assert(orthogdim >= 0);
assert(orthogdim < Nd);
int fd=grid->_fdimensions[orthogdim];
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
// std::cout << GridLogMessage << "Start alloc" << std::endl;
std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd); // will locally sum vectors first
lsSum.resize(ld,scalar_type(0.0)); // sum across these down to scalars
std::vector<iScalar<scalar_type>> extracted(Nsimd); // splitting the SIMD
// std::cout << GridLogMessage << "End alloc" << std::endl;
result.resize(fd); // And then global sum to return the same vector to every node for IO to file
for(int r=0;r<rd;r++){
lvSum[r]=zero;
}
int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim];
// std::cout << GridLogMessage << "End prep" << std::endl;
// std::cout << GridLogMessage << "Start parallel inner product, _rd = " << rd << std::endl;
vector_type vv;
parallel_for(int r=0;r<rd;r++)
{
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss = so + n * stride + b;
vv = TensorRemove(innerProduct(lhs._odata[ss], rhs._odata[ss]));
lvSum[r] = lvSum[r] + vv;
}
}
}
// std::cout << GridLogMessage << "End parallel inner product" << std::endl;
// Sum across simd lanes in the plane, breaking out orthog dir.
std::vector<int> icoor(Nd);
for(int rt=0;rt<rd;rt++){
iScalar<vector_type> temp;
temp._internal = lvSum[rt];
extract(temp,extracted);
for(int idx=0;idx<Nsimd;idx++){
grid->iCoorFromIindex(icoor,idx);
int ldx =rt+icoor[orthogdim]*rd;
lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal;
}
}
// std::cout << GridLogMessage << "End sum over simd lanes" << std::endl;
}
template <class vobj>
static void globalSliceInnerProductVector(std::vector<ComplexD> &result, const Lattice<vobj> &lhs, std::vector<typename vobj::scalar_type> &lsSum, int orthogdim)
{
typedef typename vobj::scalar_type scalar_type;
GridBase *grid = lhs._grid;
int fd = result.size();
int ld = lsSum.size();
// sum over nodes.
std::vector<scalar_type> gsum;
gsum.resize(fd, scalar_type(0.0));
// std::cout << GridLogMessage << "Start of gsum[t] creation:" << std::endl;
for(int t=0;t<fd;t++){
int pt = t/ld; // processor plane
int lt = t%ld;
if ( pt == grid->_processor_coor[orthogdim] ) {
gsum[t]=lsSum[lt];
}
}
// std::cout << GridLogMessage << "End of gsum[t] creation:" << std::endl;
// std::cout << GridLogMessage << "Start of GlobalSumVector:" << std::endl;
grid->GlobalSumVector(&gsum[0], fd);
// std::cout << GridLogMessage << "End of GlobalSumVector:" << std::endl;
result = gsum;
}
template<class vobj>
static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid = lhs._grid;
assert(grid!=NULL);
conformable(grid,rhs._grid);
const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd();
assert(orthogdim >= 0);
assert(orthogdim < Nd);
int fd=grid->_fdimensions[orthogdim];
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
std::vector<vector_type,alignedAllocator<vector_type> > lvSum(rd); // will locally sum vectors first
std::vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
std::vector<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node for IO to file
for(int r=0;r<rd;r++){
lvSum[r]=zero;
}
int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim];
parallel_for(int r=0;r<rd;r++){
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss= so+n*stride+b;
vector_type vv = TensorRemove(innerProduct(lhs._odata[ss],rhs._odata[ss]));
lvSum[r]=lvSum[r]+vv;
}
}
}
// Sum across simd lanes in the plane, breaking out orthog dir.
std::vector<int> icoor(Nd);
for(int rt=0;rt<rd;rt++){
iScalar<vector_type> temp;
temp._internal = lvSum[rt];
extract(temp,extracted);
for(int idx=0;idx<Nsimd;idx++){
grid->iCoorFromIindex(icoor,idx);
int ldx =rt+icoor[orthogdim]*rd;
lsSum[ldx]=lsSum[ldx]+extracted[idx]._internal;
}
}
// sum over nodes.
scalar_type gsum;
for(int t=0;t<fd;t++){
int pt = t/ld; // processor plane
int lt = t%ld;
if ( pt == grid->_processor_coor[orthogdim] ) {
gsum=lsSum[lt];
} else {
gsum=scalar_type(0.0);
}
grid->GlobalSum(gsum);
result[t]=gsum;
}
}
template<class vobj>
static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Orthog)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = rhs._grid->GlobalDimensions()[Orthog];
std::vector<ComplexD> ip(Nblock);
sn.resize(Nblock);
sliceInnerProductVector(ip,rhs,rhs,Orthog);
for(int ss=0;ss<Nblock;ss++){
sn[ss] = real(ip[ss]);
}
};
template<class vobj>
static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
int orthogdim,RealD scale=1.0)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
typedef typename vobj::tensor_reduced tensor_reduced;
scalar_type zscale(scale);
GridBase *grid = X._grid;
int Nsimd =grid->Nsimd();
int Nblock =grid->GlobalDimensions()[orthogdim];
int fd =grid->_fdimensions[orthogdim];
int ld =grid->_ldimensions[orthogdim];
int rd =grid->_rdimensions[orthogdim];
int e1 =grid->_slice_nblock[orthogdim];
int e2 =grid->_slice_block [orthogdim];
int stride =grid->_slice_stride[orthogdim];
std::vector<int> icoor;
for(int r=0;r<rd;r++){
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
vector_type av;
for(int l=0;l<Nsimd;l++){
grid->iCoorFromIindex(icoor,l);
int ldx =r+icoor[orthogdim]*rd;
scalar_type *as =(scalar_type *)&av;
as[l] = scalar_type(a[ldx])*zscale;
}
tensor_reduced at; at=av;
parallel_for_nest2(int n=0;n<e1;n++){
for(int b=0;b<e2;b++){
int ss= so+n*stride+b;
R._odata[ss] = at*X._odata[ss]+Y._odata[ss];
}
}
}
};
/*
inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
{
int NN = BlockSolverGrid->_ndimension;
int nsimd = BlockSolverGrid->Nsimd();
std::vector<int> latt_phys(0);
std::vector<int> simd_phys(0);
std::vector<int> mpi_phys(0);
for(int d=0;d<NN;d++){
if( d!=Orthog ) {
latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);
simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);
mpi_phys.push_back(BlockSolverGrid->_processors[d]);
}
}
return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);
}
*/
template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = X._grid->GlobalDimensions()[Orthog];
GridBase *FullGrid = X._grid;
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
#pragma omp parallel
{
std::vector<vobj> s_x(Nblock);
#pragma omp for collapse(2)
for(int n=0;n<nblock;n++){
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X[o+i*ostride];
}
vobj dot;
for(int i=0;i<Nblock;i++){
dot = Y[o+i*ostride];
for(int j=0;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R[o+i*ostride]=dot;
}
}}
}
};
template<class vobj>
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
int Nblock = X._grid->GlobalDimensions()[Orthog];
GridBase *FullGrid = X._grid;
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
int nl=1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
#pragma omp parallel
{
std::vector<vobj> s_x(Nblock);
#pragma omp for collapse(2)
for(int n=0;n<nblock;n++){
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
s_x[i] = X[o+i*ostride];
}
vobj dot;
for(int i=0;i<Nblock;i++){
dot = s_x[0]*(scale*aa(0,i));
for(int j=1;j<Nblock;j++){
dot = dot + s_x[j]*(scale*aa(j,i));
}
R[o+i*ostride]=dot;
}
}}
}
};
template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
GridBase *FullGrid = lhs._grid;
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
int Nblock = FullGrid->GlobalDimensions()[Orthog];
// Lattice<vobj> Lslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog];
typedef typename vobj::vector_typeD vector_typeD;
#pragma omp parallel
{
std::vector<vobj> Left(Nblock);
std::vector<vobj> Right(Nblock);
Eigen::MatrixXcd mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
#pragma omp for collapse(2)
for(int n=0;n<nblock;n++){
for(int b=0;b<block;b++){
int o = n*stride + b;
for(int i=0;i<Nblock;i++){
Left [i] = lhs[o+i*ostride];
Right[i] = rhs[o+i*ostride];
}
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
auto tmp = innerProduct(Left[i],Right[j]);
auto rtmp = TensorRemove(tmp);
mat_thread(i,j) += Reduce(rtmp);
}}
}}
#pragma omp critical
{
mat += mat_thread;
}
}
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
ComplexD sum = mat(i,j);
FullGrid->GlobalSum(sum);
mat(i,j)=sum;
}}
return;
}
} /*END NAMESPACE GRID*/
#endif

View File

@ -1,516 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_rng.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_RNG_H
#define GRID_LATTICE_RNG_H
#include <random>
#ifdef RNG_SITMO
#include <Grid/sitmo_rng/sitmo_prng_engine.hpp>
#endif
#if defined(RNG_SITMO)
#define RNG_FAST_DISCARD
#else
#undef RNG_FAST_DISCARD
#endif
namespace Grid {
//////////////////////////////////////////////////////////////
// Allow the RNG state to be less dense than the fine grid
//////////////////////////////////////////////////////////////
inline int RNGfillable(GridBase *coarse,GridBase *fine)
{
int rngdims = coarse->_ndimension;
// trivially extended in higher dims, with locality guaranteeing RNG state is local to node
int lowerdims = fine->_ndimension - coarse->_ndimension;
assert(lowerdims >= 0);
for(int d=0;d<lowerdims;d++){
assert(fine->_simd_layout[d]==1);
assert(fine->_processors[d]==1);
}
int multiplicity=1;
for(int d=0;d<lowerdims;d++){
multiplicity=multiplicity*fine->_rdimensions[d];
}
// local and global volumes subdivide cleanly after SIMDization
for(int d=0;d<rngdims;d++){
int fd= d+lowerdims;
assert(coarse->_processors[d] == fine->_processors[fd]);
assert(coarse->_simd_layout[d] == fine->_simd_layout[fd]);
assert(((fine->_rdimensions[fd] / coarse->_rdimensions[d])* coarse->_rdimensions[d])==fine->_rdimensions[fd]);
multiplicity = multiplicity *fine->_rdimensions[fd] / coarse->_rdimensions[d];
}
return multiplicity;
}
// merge of April 11 2017
// this function is necessary for the LS vectorised field
inline int RNGfillable_general(GridBase *coarse,GridBase *fine)
{
int rngdims = coarse->_ndimension;
// trivially extended in higher dims, with locality guaranteeing RNG state is local to node
int lowerdims = fine->_ndimension - coarse->_ndimension; assert(lowerdims >= 0);
// assumes that the higher dimensions are not using more processors
// all further divisions are local
for(int d=0;d<lowerdims;d++) assert(fine->_processors[d]==1);
for(int d=0;d<rngdims;d++) assert(coarse->_processors[d] == fine->_processors[d+lowerdims]);
// then divide the number of local sites
// check that the total number of sims agree, meanse the iSites are the same
assert(fine->Nsimd() == coarse->Nsimd());
// check that the two grids divide cleanly
assert( (fine->lSites() / coarse->lSites() ) * coarse->lSites() == fine->lSites() );
return fine->lSites() / coarse->lSites();
}
// real scalars are one component
template<class scalar,class distribution,class generator>
void fillScalar(scalar &s,distribution &dist,generator & gen)
{
s=dist(gen);
}
template<class distribution,class generator>
void fillScalar(ComplexF &s,distribution &dist, generator &gen)
{
s=ComplexF(dist(gen),dist(gen));
}
template<class distribution,class generator>
void fillScalar(ComplexD &s,distribution &dist,generator &gen)
{
s=ComplexD(dist(gen),dist(gen));
}
class GridRNGbase {
public:
// One generator per site.
// Uniform and Gaussian distributions from these generators.
#ifdef RNG_RANLUX
typedef std::ranlux48 RngEngine;
typedef uint64_t RngStateType;
static const int RngStateCount = 15;
#endif
#ifdef RNG_MT19937
typedef std::mt19937 RngEngine;
typedef uint32_t RngStateType;
static const int RngStateCount = std::mt19937::state_size;
#endif
#ifdef RNG_SITMO
typedef sitmo::prng_engine RngEngine;
typedef uint64_t RngStateType;
static const int RngStateCount = 13;
#endif
std::vector<RngEngine> _generators;
std::vector<std::uniform_real_distribution<RealD> > _uniform;
std::vector<std::normal_distribution<RealD> > _gaussian;
std::vector<std::discrete_distribution<int32_t> > _bernoulli;
std::vector<std::uniform_int_distribution<uint32_t> > _uid;
///////////////////////
// support for parallel init
///////////////////////
#ifdef RNG_FAST_DISCARD
static void Skip(RngEngine &eng,uint64_t site)
{
/////////////////////////////////////////////////////////////////////////////////////
// Skip by 2^40 elements between successive lattice sites
// This goes by 10^12.
// Consider quenched updating; likely never exceeding rate of 1000 sweeps
// per second on any machine. This gives us of order 10^9 seconds, or 100 years
// skip ahead.
// For HMC unlikely to go at faster than a solve per second, and
// tens of seconds per trajectory so this is clean in all reasonable cases,
// and margin of safety is orders of magnitude.
// We could hack Sitmo to skip in the higher order words of state if necessary
//
// Replace with 2^30 ; avoid problem on large volumes
//
/////////////////////////////////////////////////////////////////////////////////////
// uint64_t skip = site+1; // Old init Skipped then drew. Checked compat with faster init
const int shift = 30;
uint64_t skip = site;
skip = skip<<shift;
assert((skip >> shift)==site); // check for overflow
eng.discard(skip);
// std::cout << " Engine " <<site << " state " <<eng<<std::endl;
}
#endif
static RngEngine Reseed(RngEngine &eng)
{
std::vector<uint32_t> newseed;
std::uniform_int_distribution<uint32_t> uid;
return Reseed(eng,newseed,uid);
}
static RngEngine Reseed(RngEngine &eng,std::vector<uint32_t> & newseed,
std::uniform_int_distribution<uint32_t> &uid)
{
const int reseeds=4;
newseed.resize(reseeds);
for(int i=0;i<reseeds;i++){
newseed[i] = uid(eng);
}
std::seed_seq sseq(newseed.begin(),newseed.end());
return RngEngine(sseq);
}
void GetState(std::vector<RngStateType> & saved,RngEngine &eng) {
saved.resize(RngStateCount);
std::stringstream ss;
ss<<eng;
ss.seekg(0,ss.beg);
for(int i=0;i<RngStateCount;i++){
ss>>saved[i];
}
}
void GetState(std::vector<RngStateType> & saved,int gen) {
GetState(saved,_generators[gen]);
}
void SetState(std::vector<RngStateType> & saved,RngEngine &eng){
assert(saved.size()==RngStateCount);
std::stringstream ss;
for(int i=0;i<RngStateCount;i++){
ss<< saved[i]<<" ";
}
ss.seekg(0,ss.beg);
ss>>eng;
}
void SetState(std::vector<RngStateType> & saved,int gen){
SetState(saved,_generators[gen]);
}
void SetEngine(RngEngine &Eng, int gen){
_generators[gen]=Eng;
}
void GetEngine(RngEngine &Eng, int gen){
Eng=_generators[gen];
}
template<class source> void Seed(source &src, int gen)
{
_generators[gen] = RngEngine(src);
}
};
class GridSerialRNG : public GridRNGbase {
public:
GridSerialRNG() : GridRNGbase() {
_generators.resize(1);
_uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
_gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
_bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
_uid.resize(1,std::uniform_int_distribution<uint32_t>() );
}
template <class sobj,class distribution> inline void fill(sobj &l,std::vector<distribution> &dist){
typedef typename sobj::scalar_type scalar_type;
int words = sizeof(sobj)/sizeof(scalar_type);
scalar_type *buf = (scalar_type *) & l;
dist[0].reset();
for(int idx=0;idx<words;idx++){
fillScalar(buf[idx],dist[0],_generators[0]);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
};
template <class distribution> inline void fill(ComplexF &l,std::vector<distribution> &dist){
dist[0].reset();
fillScalar(l,dist[0],_generators[0]);
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(ComplexD &l,std::vector<distribution> &dist){
dist[0].reset();
fillScalar(l,dist[0],_generators[0]);
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(RealF &l,std::vector<distribution> &dist){
dist[0].reset();
fillScalar(l,dist[0],_generators[0]);
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(RealD &l,std::vector<distribution> &dist){
dist[0].reset();
fillScalar(l,dist[0],_generators[0]);
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
// vector fill
template <class distribution> inline void fill(vComplexF &l,std::vector<distribution> &dist){
RealF *pointer=(RealF *)&l;
dist[0].reset();
for(int i=0;i<2*vComplexF::Nsimd();i++){
fillScalar(pointer[i],dist[0],_generators[0]);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(vComplexD &l,std::vector<distribution> &dist){
RealD *pointer=(RealD *)&l;
dist[0].reset();
for(int i=0;i<2*vComplexD::Nsimd();i++){
fillScalar(pointer[i],dist[0],_generators[0]);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(vRealF &l,std::vector<distribution> &dist){
RealF *pointer=(RealF *)&l;
dist[0].reset();
for(int i=0;i<vRealF::Nsimd();i++){
fillScalar(pointer[i],dist[0],_generators[0]);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
template <class distribution> inline void fill(vRealD &l,std::vector<distribution> &dist){
RealD *pointer=(RealD *)&l;
dist[0].reset();
for(int i=0;i<vRealD::Nsimd();i++){
fillScalar(pointer[i],dist[0],_generators[0]);
}
CartesianCommunicator::BroadcastWorld(0,(void *)&l,sizeof(l));
}
void SeedFixedIntegers(const std::vector<int> &seeds){
CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
std::seed_seq src(seeds.begin(),seeds.end());
Seed(src,0);
}
void SeedUniqueString(const std::string &s){
std::vector<int> seeds;
std::stringstream sha;
seeds = GridChecksum::sha256_seeds(s);
for(int i=0;i<seeds.size();i++) {
sha << std::hex << seeds[i];
}
std::cout << GridLogMessage << "Intialising serial RNG with unique string '"
<< s << "'" << std::endl;
std::cout << GridLogMessage << "Seed SHA256: " << sha.str() << std::endl;
SeedFixedIntegers(seeds);
}
};
class GridParallelRNG : public GridRNGbase {
double _time_counter;
public:
GridBase *_grid;
unsigned int _vol;
int generator_idx(int os,int is) {
return is*_grid->oSites()+os;
}
GridParallelRNG(GridBase *grid) : GridRNGbase() {
_grid = grid;
_vol =_grid->iSites()*_grid->oSites();
_generators.resize(_vol);
_uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
_gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
_bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
_uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
}
template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist){
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
double inner_time_counter = usecond();
int multiplicity = RNGfillable_general(_grid, l._grid); // l has finer or same grid
int Nsimd = _grid->Nsimd(); // guaranteed to be the same for l._grid too
int osites = _grid->oSites(); // guaranteed to be <= l._grid->oSites() by a factor multiplicity
int words = sizeof(scalar_object) / sizeof(scalar_type);
parallel_for(int ss=0;ss<osites;ss++){
std::vector<scalar_object> buf(Nsimd);
for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times
int sm = multiplicity * ss + m; // Maps the generator site to the fine site
for (int si = 0; si < Nsimd; si++) {
int gdx = generator_idx(ss, si); // index of generator state
scalar_type *pointer = (scalar_type *)&buf[si];
dist[gdx].reset();
for (int idx = 0; idx < words; idx++)
fillScalar(pointer[idx], dist[gdx], _generators[gdx]);
}
// merge into SIMD lanes, FIXME suboptimal implementation
merge(l._odata[sm], buf);
}
}
_time_counter += usecond()- inner_time_counter;
};
void SeedUniqueString(const std::string &s){
std::vector<int> seeds;
seeds = GridChecksum::sha256_seeds(s);
std::cout << GridLogMessage << "Intialising parallel RNG with unique string '"
<< s << "'" << std::endl;
std::cout << GridLogMessage << "Seed SHA256: " << GridChecksum::sha256_string(seeds) << std::endl;
SeedFixedIntegers(seeds);
}
void SeedFixedIntegers(const std::vector<int> &seeds){
// Everyone generates the same seed_seq based on input seeds
CartesianCommunicator::BroadcastWorld(0,(void *)&seeds[0],sizeof(int)*seeds.size());
std::seed_seq source(seeds.begin(),seeds.end());
RngEngine master_engine(source);
#ifdef RNG_FAST_DISCARD
////////////////////////////////////////////////
// Skip ahead through a single stream.
// Applicable to SITMO and other has based/crypto RNGs
// Should be applicable to Mersenne Twister, but the C++11
// MT implementation does not implement fast discard even though
// in principle this is possible
////////////////////////////////////////////////
// Everybody loops over global volume.
parallel_for(int gidx=0;gidx<_grid->_gsites;gidx++){
// Where is it?
int rank,o_idx,i_idx;
std::vector<int> gcoor;
_grid->GlobalIndexToGlobalCoor(gidx,gcoor);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
// If this is one of mine we take it
if( rank == _grid->ThisRank() ){
int l_idx=generator_idx(o_idx,i_idx);
_generators[l_idx] = master_engine;
Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
}
}
#else
////////////////////////////////////////////////////////////////
// Machine and thread decomposition dependent seeding is efficient
// and maximally parallel; but NOT reproducible from machine to machine.
// Not ideal, but fastest way to reseed all nodes.
////////////////////////////////////////////////////////////////
{
// Obtain one Reseed per processor
int Nproc = _grid->ProcessorCount();
std::vector<RngEngine> seeders(Nproc);
int me= _grid->ThisRank();
for(int p=0;p<Nproc;p++){
seeders[p] = Reseed(master_engine);
}
master_engine = seeders[me];
}
{
// Obtain one reseeded generator per thread
int Nthread = GridThread::GetThreads();
std::vector<RngEngine> seeders(Nthread);
for(int t=0;t<Nthread;t++){
seeders[t] = Reseed(master_engine);
}
parallel_for(int t=0;t<Nthread;t++) {
// set up one per local site in threaded fashion
std::vector<uint32_t> newseeds;
std::uniform_int_distribution<uint32_t> uid;
for(int l=0;l<_grid->lSites();l++) {
if ( (l%Nthread)==t ) {
_generators[l] = Reseed(seeders[t],newseeds,uid);
}
}
}
}
#endif
}
void Report(){
std::cout << GridLogMessage << "Time spent in the fill() routine by GridParallelRNG: "<< _time_counter/1e3 << " ms" << std::endl;
}
////////////////////////////////////////////////////////////////////////
// Support for rigorous test of RNG's
// Return uniform random uint32_t from requested site generator
////////////////////////////////////////////////////////////////////////
uint32_t GlobalU01(int gsite){
uint32_t the_number;
// who
std::vector<int> gcoor;
int rank,o_idx,i_idx;
_grid->GlobalIndexToGlobalCoor(gsite,gcoor);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
// draw
int l_idx=generator_idx(o_idx,i_idx);
if( rank == _grid->ThisRank() ){
the_number = _uid[l_idx](_generators[l_idx]);
}
// share & return
_grid->Broadcast(rank,(void *)&the_number,sizeof(the_number));
return the_number;
}
};
template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._uniform); }
template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._gaussian); }
template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);}
template <class sobj> inline void random(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._uniform ); }
template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); }
template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); }
}
#endif

File diff suppressed because it is too large Load Diff

View File

@ -1,729 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/parallelIO/BinaryIO.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu<guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_BINARY_IO_H
#define GRID_BINARY_IO_H
#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
#define USE_MPI_IO
#else
#undef USE_MPI_IO
#endif
#ifdef HAVE_ENDIAN_H
#include <endian.h>
#endif
#include <arpa/inet.h>
#include <algorithm>
namespace Grid {
/////////////////////////////////////////////////////////////////////////////////
// Byte reversal garbage
/////////////////////////////////////////////////////////////////////////////////
inline uint32_t byte_reverse32(uint32_t f) {
f = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;
return f;
}
inline uint64_t byte_reverse64(uint64_t f) {
uint64_t g;
g = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;
g = g << 32;
f = f >> 32;
g|= ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;
return g;
}
#if BYTE_ORDER == BIG_ENDIAN
inline uint64_t Grid_ntohll(uint64_t A) { return A; }
#else
inline uint64_t Grid_ntohll(uint64_t A) {
return byte_reverse64(A);
}
#endif
// A little helper
inline void removeWhitespace(std::string &key)
{
key.erase(std::remove_if(key.begin(), key.end(), ::isspace),key.end());
}
///////////////////////////////////////////////////////////////////////////////////////////////////
// Static class holding the parallel IO code
// Could just use a namespace
///////////////////////////////////////////////////////////////////////////////////////////////////
class BinaryIO {
public:
/////////////////////////////////////////////////////////////////////////////
// more byte manipulation helpers
/////////////////////////////////////////////////////////////////////////////
template<class vobj> static inline void Uint32Checksum(Lattice<vobj> &lat,uint32_t &nersc_csum)
{
typedef typename vobj::scalar_object sobj;
GridBase *grid = lat._grid;
uint64_t lsites = grid->lSites();
std::vector<sobj> scalardata(lsites);
unvectorizeToLexOrdArray(scalardata,lat);
NerscChecksum(grid,scalardata,nersc_csum);
}
template <class fobj>
static inline void NerscChecksum(GridBase *grid, std::vector<fobj> &fbuf, uint32_t &nersc_csum)
{
const uint64_t size32 = sizeof(fobj) / sizeof(uint32_t);
uint64_t lsites = grid->lSites();
if (fbuf.size() == 1)
{
lsites = 1;
}
PARALLEL_REGION
{
uint32_t nersc_csum_thr = 0;
PARALLEL_FOR_LOOP_INTERN
for (uint64_t local_site = 0; local_site < lsites; local_site++)
{
uint32_t *site_buf = (uint32_t *)&fbuf[local_site];
for (uint64_t j = 0; j < size32; j++)
{
nersc_csum_thr = nersc_csum_thr + site_buf[j];
}
}
PARALLEL_CRITICAL
{
nersc_csum += nersc_csum_thr;
}
}
}
template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb)
{
const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
int nd = grid->_ndimension;
uint64_t lsites =grid->lSites();
if (fbuf.size()==1) {
lsites=1;
}
std::vector<int> local_vol =grid->LocalDimensions();
std::vector<int> local_start =grid->LocalStarts();
std::vector<int> global_vol =grid->FullDimensions();
PARALLEL_REGION
{
std::vector<int> coor(nd);
uint32_t scidac_csuma_thr=0;
uint32_t scidac_csumb_thr=0;
uint32_t site_crc=0;
PARALLEL_FOR_LOOP_INTERN
for(uint64_t local_site=0;local_site<lsites;local_site++){
uint32_t * site_buf = (uint32_t *)&fbuf[local_site];
/*
* Scidac csum is rather more heavyweight
* FIXME -- 128^3 x 256 x 16 will overflow.
*/
int global_site;
Lexicographic::CoorFromIndex(coor,local_site,local_vol);
for(int d=0;d<nd;d++) {
coor[d] = coor[d]+local_start[d];
}
Lexicographic::IndexFromCoor(coor,global_site,global_vol);
uint32_t gsite29 = global_site%29;
uint32_t gsite31 = global_site%31;
site_crc = crc32(0,(unsigned char *)site_buf,sizeof(fobj));
// std::cout << "Site "<<local_site << " crc "<<std::hex<<site_crc<<std::dec<<std::endl;
// std::cout << "Site "<<local_site << std::hex<<site_buf[0] <<site_buf[1]<<std::dec <<std::endl;
scidac_csuma_thr ^= site_crc<<gsite29 | site_crc>>(32-gsite29);
scidac_csumb_thr ^= site_crc<<gsite31 | site_crc>>(32-gsite31);
}
PARALLEL_CRITICAL
{
scidac_csuma^= scidac_csuma_thr;
scidac_csumb^= scidac_csumb_thr;
}
}
}
// Network is big endian
static inline void htobe32_v(void *file_object,uint32_t bytes){ be32toh_v(file_object,bytes);}
static inline void htobe64_v(void *file_object,uint32_t bytes){ be64toh_v(file_object,bytes);}
static inline void htole32_v(void *file_object,uint32_t bytes){ le32toh_v(file_object,bytes);}
static inline void htole64_v(void *file_object,uint32_t bytes){ le64toh_v(file_object,bytes);}
static inline void be32toh_v(void *file_object,uint64_t bytes)
{
uint32_t * f = (uint32_t *)file_object;
uint64_t count = bytes/sizeof(uint32_t);
parallel_for(uint64_t i=0;i<count;i++){
f[i] = ntohl(f[i]);
}
}
// LE must Swap and switch to host
static inline void le32toh_v(void *file_object,uint64_t bytes)
{
uint32_t *fp = (uint32_t *)file_object;
uint32_t f;
uint64_t count = bytes/sizeof(uint32_t);
parallel_for(uint64_t i=0;i<count;i++){
f = fp[i];
// got network order and the network to host
f = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;
fp[i] = ntohl(f);
}
}
// BE is same as network
static inline void be64toh_v(void *file_object,uint64_t bytes)
{
uint64_t * f = (uint64_t *)file_object;
uint64_t count = bytes/sizeof(uint64_t);
parallel_for(uint64_t i=0;i<count;i++){
f[i] = Grid_ntohll(f[i]);
}
}
// LE must swap and switch;
static inline void le64toh_v(void *file_object,uint64_t bytes)
{
uint64_t *fp = (uint64_t *)file_object;
uint64_t f,g;
uint64_t count = bytes/sizeof(uint64_t);
parallel_for(uint64_t i=0;i<count;i++){
f = fp[i];
// got network order and the network to host
g = ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;
g = g << 32;
f = f >> 32;
g|= ((f&0xFF)<<24) | ((f&0xFF00)<<8) | ((f&0xFF0000)>>8) | ((f&0xFF000000UL)>>24) ;
fp[i] = Grid_ntohll(g);
}
}
/////////////////////////////////////////////////////////////////////////////
// Real action:
// Read or Write distributed lexico array of ANY object to a specific location in file
//////////////////////////////////////////////////////////////////////////////////////
static const int BINARYIO_MASTER_APPEND = 0x10;
static const int BINARYIO_UNORDERED = 0x08;
static const int BINARYIO_LEXICOGRAPHIC = 0x04;
static const int BINARYIO_READ = 0x02;
static const int BINARYIO_WRITE = 0x01;
template<class word,class fobj>
static inline void IOobject(word w,
GridBase *grid,
std::vector<fobj> &iodata,
std::string file,
uint64_t& offset,
const std::string &format, int control,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
uint32_t &scidac_csumb)
{
grid->Barrier();
GridStopWatch timer;
GridStopWatch bstimer;
nersc_csum=0;
scidac_csuma=0;
scidac_csumb=0;
int ndim = grid->Dimensions();
int nrank = grid->ProcessorCount();
int myrank = grid->ThisRank();
std::vector<int> psizes = grid->ProcessorGrid();
std::vector<int> pcoor = grid->ThisProcessorCoor();
std::vector<int> gLattice= grid->GlobalDimensions();
std::vector<int> lLattice= grid->LocalDimensions();
std::vector<int> lStart(ndim);
std::vector<int> gStart(ndim);
// Flatten the file
uint64_t lsites = grid->lSites();
if ( control & BINARYIO_MASTER_APPEND ) {
assert(iodata.size()==1);
} else {
assert(lsites==iodata.size());
}
for(int d=0;d<ndim;d++){
gStart[d] = lLattice[d]*pcoor[d];
lStart[d] = 0;
}
#ifdef USE_MPI_IO
std::vector<int> distribs(ndim,MPI_DISTRIBUTE_BLOCK);
std::vector<int> dargs (ndim,MPI_DISTRIBUTE_DFLT_DARG);
MPI_Datatype mpiObject;
MPI_Datatype fileArray;
MPI_Datatype localArray;
MPI_Datatype mpiword;
MPI_Offset disp = offset;
MPI_File fh ;
MPI_Status status;
int numword;
if ( sizeof( word ) == sizeof(float ) ) {
numword = sizeof(fobj)/sizeof(float);
mpiword = MPI_FLOAT;
} else {
numword = sizeof(fobj)/sizeof(double);
mpiword = MPI_DOUBLE;
}
//////////////////////////////////////////////////////////////////////////////
// Sobj in MPI phrasing
//////////////////////////////////////////////////////////////////////////////
int ierr;
ierr = MPI_Type_contiguous(numword,mpiword,&mpiObject); assert(ierr==0);
ierr = MPI_Type_commit(&mpiObject);
//////////////////////////////////////////////////////////////////////////////
// File global array data type
//////////////////////////////////////////////////////////////////////////////
ierr=MPI_Type_create_subarray(ndim,&gLattice[0],&lLattice[0],&gStart[0],MPI_ORDER_FORTRAN, mpiObject,&fileArray); assert(ierr==0);
ierr=MPI_Type_commit(&fileArray); assert(ierr==0);
//////////////////////////////////////////////////////////////////////////////
// local lattice array
//////////////////////////////////////////////////////////////////////////////
ierr=MPI_Type_create_subarray(ndim,&lLattice[0],&lLattice[0],&lStart[0],MPI_ORDER_FORTRAN, mpiObject,&localArray); assert(ierr==0);
ierr=MPI_Type_commit(&localArray); assert(ierr==0);
#endif
//////////////////////////////////////////////////////////////////////////////
// Byte order
//////////////////////////////////////////////////////////////////////////////
int ieee32big = (format == std::string("IEEE32BIG"));
int ieee32 = (format == std::string("IEEE32"));
int ieee64big = (format == std::string("IEEE64BIG"));
int ieee64 = (format == std::string("IEEE64"));
//////////////////////////////////////////////////////////////////////////////
// Do the I/O
//////////////////////////////////////////////////////////////////////////////
if ( control & BINARYIO_READ ) {
timer.Start();
if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
#ifdef USE_MPI_IO
std::cout<< GridLogMessage<<"IOobject: MPI read I/O "<< file<< std::endl;
ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDONLY, MPI_INFO_NULL, &fh); assert(ierr==0);
ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr==0);
ierr=MPI_File_read_all(fh, &iodata[0], 1, localArray, &status); assert(ierr==0);
MPI_File_close(&fh);
MPI_Type_free(&fileArray);
MPI_Type_free(&localArray);
#else
assert(0);
#endif
} else {
std::cout << GridLogMessage <<"IOobject: C++ read I/O " << file << " : "
<< iodata.size() * sizeof(fobj) << " bytes" << std::endl;
std::ifstream fin;
fin.open(file, std::ios::binary | std::ios::in);
if (control & BINARYIO_MASTER_APPEND)
{
fin.seekg(-sizeof(fobj), fin.end);
}
else
{
fin.seekg(offset + myrank * lsites * sizeof(fobj));
}
fin.read((char *)&iodata[0], iodata.size() * sizeof(fobj));
assert(fin.fail() == 0);
fin.close();
}
timer.Stop();
grid->Barrier();
bstimer.Start();
ScidacChecksum(grid,iodata,scidac_csuma,scidac_csumb);
if (ieee32big) be32toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
if (ieee32) le32toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
if (ieee64big) be64toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
if (ieee64) le64toh_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
NerscChecksum(grid,iodata,nersc_csum);
bstimer.Stop();
}
if ( control & BINARYIO_WRITE ) {
bstimer.Start();
NerscChecksum(grid,iodata,nersc_csum);
if (ieee32big) htobe32_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
if (ieee32) htole32_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
if (ieee64big) htobe64_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
if (ieee64) htole64_v((void *)&iodata[0], sizeof(fobj)*iodata.size());
ScidacChecksum(grid,iodata,scidac_csuma,scidac_csumb);
bstimer.Stop();
grid->Barrier();
timer.Start();
if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
#ifdef USE_MPI_IO
std::cout << GridLogMessage <<"IOobject: MPI write I/O " << file << std::endl;
ierr = MPI_File_open(grid->communicator, (char *)file.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh);
// std::cout << GridLogMessage << "Checking for errors" << std::endl;
if (ierr != MPI_SUCCESS)
{
char error_string[BUFSIZ];
int length_of_error_string, error_class;
MPI_Error_class(ierr, &error_class);
MPI_Error_string(error_class, error_string, &length_of_error_string);
fprintf(stderr, "%3d: %s\n", myrank, error_string);
MPI_Error_string(ierr, error_string, &length_of_error_string);
fprintf(stderr, "%3d: %s\n", myrank, error_string);
MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0);
}
std::cout << GridLogDebug << "MPI write I/O set view " << file << std::endl;
ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);
assert(ierr == 0);
std::cout << GridLogDebug << "MPI write I/O write all " << file << std::endl;
ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
assert(ierr == 0);
MPI_Offset os;
MPI_File_get_position(fh, &os);
MPI_File_get_byte_offset(fh, os, &disp);
offset = disp;
MPI_File_close(&fh);
MPI_Type_free(&fileArray);
MPI_Type_free(&localArray);
#else
assert(0);
#endif
} else {
std::cout << GridLogMessage << "IOobject: C++ write I/O " << file << " : "
<< iodata.size() * sizeof(fobj) << " bytes and offset " << offset << std::endl;
std::ofstream fout;
fout.exceptions ( std::fstream::failbit | std::fstream::badbit );
try {
if (offset) { // Must already exist and contain data
fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
} else { // Allow create
fout.open(file,std::ios::binary|std::ios::out);
}
} catch (const std::fstream::failure& exc) {
std::cout << GridLogError << "Error in opening the file " << file << " for output" <<std::endl;
std::cout << GridLogError << "Exception description: " << exc.what() << std::endl;
// std::cout << GridLogError << "Probable cause: wrong path, inaccessible location "<< std::endl;
#ifdef USE_MPI_IO
MPI_Abort(MPI_COMM_WORLD,1);
#else
exit(1);
#endif
}
if ( control & BINARYIO_MASTER_APPEND ) {
try {
fout.seekp(0,fout.end);
} catch (const std::fstream::failure& exc) {
std::cout << "Exception in seeking file end " << file << std::endl;
}
} else {
try {
fout.seekp(offset+myrank*lsites*sizeof(fobj));
} catch (const std::fstream::failure& exc) {
std::cout << "Exception in seeking file " << file <<" offset "<< offset << std::endl;
}
}
try {
fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));//assert( fout.fail()==0);
}
catch (const std::fstream::failure& exc) {
std::cout << "Exception in writing file " << file << std::endl;
std::cout << GridLogError << "Exception description: "<< exc.what() << std::endl;
#ifdef USE_MPI_IO
MPI_Abort(MPI_COMM_WORLD,1);
#else
exit(1);
#endif
}
offset = fout.tellp();
fout.close();
}
timer.Stop();
}
std::cout<<GridLogMessage<<"IOobject: ";
if ( control & BINARYIO_READ) std::cout << " read ";
else std::cout << " write ";
uint64_t bytes = sizeof(fobj)*iodata.size()*nrank;
std::cout<< bytes <<" bytes in "<<timer.Elapsed() <<" "
<< (double)bytes/ (double)timer.useconds() <<" MB/s "<<std::endl;
std::cout<<GridLogMessage<<"IOobject: endian and checksum overhead "<<bstimer.Elapsed() <<std::endl;
//////////////////////////////////////////////////////////////////////////////
// Safety check
//////////////////////////////////////////////////////////////////////////////
// if the data size is 1 we do not want to sum over the MPI ranks
if (iodata.size() != 1){
grid->Barrier();
grid->GlobalSum(nersc_csum);
grid->GlobalXOR(scidac_csuma);
grid->GlobalXOR(scidac_csumb);
grid->Barrier();
}
}
/////////////////////////////////////////////////////////////////////////////
// Read a Lattice of object
//////////////////////////////////////////////////////////////////////////////////////
template<class vobj,class fobj,class munger>
static inline void readLatticeObject(Lattice<vobj> &Umu,
std::string file,
munger munge,
uint64_t offset,
const std::string &format,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
uint32_t &scidac_csumb)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::Realified::scalar_type word; word w=0;
GridBase *grid = Umu._grid;
uint64_t lsites = grid->lSites();
std::vector<sobj> scalardata(lsites);
std::vector<fobj> iodata(lsites); // Munge, checksum, byte order in here
IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
nersc_csum,scidac_csuma,scidac_csumb);
GridStopWatch timer;
timer.Start();
parallel_for(uint64_t x=0;x<lsites;x++) munge(iodata[x], scalardata[x]);
vectorizeFromLexOrdArray(scalardata,Umu);
grid->Barrier();
timer.Stop();
std::cout<<GridLogMessage<<"readLatticeObject: vectorize overhead "<<timer.Elapsed() <<std::endl;
}
/////////////////////////////////////////////////////////////////////////////
// Write a Lattice of object
//////////////////////////////////////////////////////////////////////////////////////
template<class vobj,class fobj,class munger>
static inline void writeLatticeObject(Lattice<vobj> &Umu,
std::string file,
munger munge,
uint64_t offset,
const std::string &format,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
uint32_t &scidac_csumb)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::Realified::scalar_type word; word w=0;
GridBase *grid = Umu._grid;
uint64_t lsites = grid->lSites();
std::vector<sobj> scalardata(lsites);
std::vector<fobj> iodata(lsites); // Munge, checksum, byte order in here
//////////////////////////////////////////////////////////////////////////////
// Munge [ .e.g 3rd row recon ]
//////////////////////////////////////////////////////////////////////////////
GridStopWatch timer; timer.Start();
unvectorizeToLexOrdArray(scalardata,Umu);
parallel_for(uint64_t x=0;x<lsites;x++) munge(scalardata[x],iodata[x]);
grid->Barrier();
timer.Stop();
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
nersc_csum,scidac_csuma,scidac_csumb);
std::cout<<GridLogMessage<<"writeLatticeObject: unvectorize overhead "<<timer.Elapsed() <<std::endl;
}
/////////////////////////////////////////////////////////////////////////////
// Read a RNG; use IOobject and lexico map to an array of state
//////////////////////////////////////////////////////////////////////////////////////
static inline void readRNG(GridSerialRNG &serial,
GridParallelRNG &parallel,
std::string file,
uint64_t offset,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
uint32_t &scidac_csumb)
{
typedef typename GridSerialRNG::RngStateType RngStateType;
const int RngStateCount = GridSerialRNG::RngStateCount;
typedef std::array<RngStateType,RngStateCount> RNGstate;
typedef RngStateType word; word w=0;
std::string format = "IEEE32BIG";
GridBase *grid = parallel._grid;
uint64_t gsites = grid->gSites();
uint64_t lsites = grid->lSites();
uint32_t nersc_csum_tmp = 0;
uint32_t scidac_csuma_tmp = 0;
uint32_t scidac_csumb_tmp = 0;
GridStopWatch timer;
std::cout << GridLogMessage << "RNG read I/O on file " << file << std::endl;
std::vector<RNGstate> iodata(lsites);
IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
nersc_csum,scidac_csuma,scidac_csumb);
timer.Start();
parallel_for(uint64_t lidx=0;lidx<lsites;lidx++){
std::vector<RngStateType> tmp(RngStateCount);
std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
parallel.SetState(tmp,lidx);
}
timer.Stop();
iodata.resize(1);
IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_MASTER_APPEND,
nersc_csum_tmp,scidac_csuma_tmp,scidac_csumb_tmp);
{
std::vector<RngStateType> tmp(RngStateCount);
std::copy(iodata[0].begin(),iodata[0].end(),tmp.begin());
serial.SetState(tmp,0);
}
nersc_csum = nersc_csum + nersc_csum_tmp;
scidac_csuma = scidac_csuma ^ scidac_csuma_tmp;
scidac_csumb = scidac_csumb ^ scidac_csumb_tmp;
std::cout << GridLogMessage << "RNG file nersc_checksum " << std::hex << nersc_csum << std::dec << std::endl;
std::cout << GridLogMessage << "RNG file scidac_checksuma " << std::hex << scidac_csuma << std::dec << std::endl;
std::cout << GridLogMessage << "RNG file scidac_checksumb " << std::hex << scidac_csumb << std::dec << std::endl;
std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl;
}
/////////////////////////////////////////////////////////////////////////////
// Write a RNG; lexico map to an array of state and use IOobject
//////////////////////////////////////////////////////////////////////////////////////
static inline void writeRNG(GridSerialRNG &serial,
GridParallelRNG &parallel,
std::string file,
uint64_t offset,
uint32_t &nersc_csum,
uint32_t &scidac_csuma,
uint32_t &scidac_csumb)
{
typedef typename GridSerialRNG::RngStateType RngStateType;
typedef RngStateType word; word w=0;
const int RngStateCount = GridSerialRNG::RngStateCount;
typedef std::array<RngStateType,RngStateCount> RNGstate;
GridBase *grid = parallel._grid;
uint64_t gsites = grid->gSites();
uint64_t lsites = grid->lSites();
uint32_t nersc_csum_tmp;
uint32_t scidac_csuma_tmp;
uint32_t scidac_csumb_tmp;
GridStopWatch timer;
std::string format = "IEEE32BIG";
std::cout << GridLogMessage << "RNG write I/O on file " << file << std::endl;
timer.Start();
std::vector<RNGstate> iodata(lsites);
parallel_for(uint64_t lidx=0;lidx<lsites;lidx++){
std::vector<RngStateType> tmp(RngStateCount);
parallel.GetState(tmp,lidx);
std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
}
timer.Stop();
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
nersc_csum,scidac_csuma,scidac_csumb);
iodata.resize(1);
{
std::vector<RngStateType> tmp(RngStateCount);
serial.GetState(tmp,0);
std::copy(tmp.begin(),tmp.end(),iodata[0].begin());
}
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_MASTER_APPEND,
nersc_csum_tmp,scidac_csuma_tmp,scidac_csumb_tmp);
nersc_csum = nersc_csum + nersc_csum_tmp;
scidac_csuma = scidac_csuma ^ scidac_csuma_tmp;
scidac_csumb = scidac_csumb ^ scidac_csumb_tmp;
std::cout << GridLogMessage << "RNG file checksum " << std::hex << nersc_csum << std::dec << std::endl;
std::cout << GridLogMessage << "RNG file checksuma " << std::hex << scidac_csuma << std::dec << std::endl;
std::cout << GridLogMessage << "RNG file checksumb " << std::hex << scidac_csumb << std::dec << std::endl;
std::cout << GridLogMessage << "RNG state overhead " << timer.Elapsed() << std::endl;
}
};
}
#endif

View File

@ -1,875 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/parallelIO/IldgIO.h
Copyright (C) 2015
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ILDG_IO_H
#define GRID_ILDG_IO_H
#ifdef HAVE_LIME
#include <algorithm>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <pwd.h>
#include <sys/utsname.h>
#include <unistd.h>
//C-Lime is a must have for this functionality
extern "C" {
#include "lime.h"
}
namespace Grid {
namespace QCD {
/////////////////////////////////
// Encode word types as strings
/////////////////////////////////
template<class word> inline std::string ScidacWordMnemonic(void){ return std::string("unknown"); }
template<> inline std::string ScidacWordMnemonic<double> (void){ return std::string("D"); }
template<> inline std::string ScidacWordMnemonic<float> (void){ return std::string("F"); }
template<> inline std::string ScidacWordMnemonic< int32_t>(void){ return std::string("I32_t"); }
template<> inline std::string ScidacWordMnemonic<uint32_t>(void){ return std::string("U32_t"); }
template<> inline std::string ScidacWordMnemonic< int64_t>(void){ return std::string("I64_t"); }
template<> inline std::string ScidacWordMnemonic<uint64_t>(void){ return std::string("U64_t"); }
/////////////////////////////////////////
// Encode a generic tensor as a string
/////////////////////////////////////////
template<class vobj> std::string ScidacRecordTypeString(int &colors, int &spins, int & typesize,int &datacount) {
typedef typename getPrecision<vobj>::real_scalar_type stype;
int _ColourN = indexRank<ColourIndex,vobj>();
int _ColourScalar = isScalar<ColourIndex,vobj>();
int _ColourVector = isVector<ColourIndex,vobj>();
int _ColourMatrix = isMatrix<ColourIndex,vobj>();
int _SpinN = indexRank<SpinIndex,vobj>();
int _SpinScalar = isScalar<SpinIndex,vobj>();
int _SpinVector = isVector<SpinIndex,vobj>();
int _SpinMatrix = isMatrix<SpinIndex,vobj>();
int _LorentzN = indexRank<LorentzIndex,vobj>();
int _LorentzScalar = isScalar<LorentzIndex,vobj>();
int _LorentzVector = isVector<LorentzIndex,vobj>();
int _LorentzMatrix = isMatrix<LorentzIndex,vobj>();
std::stringstream stream;
stream << "GRID_";
stream << ScidacWordMnemonic<stype>();
if ( _LorentzVector ) stream << "_LorentzVector"<<_LorentzN;
if ( _LorentzMatrix ) stream << "_LorentzMatrix"<<_LorentzN;
if ( _SpinVector ) stream << "_SpinVector"<<_SpinN;
if ( _SpinMatrix ) stream << "_SpinMatrix"<<_SpinN;
if ( _ColourVector ) stream << "_ColourVector"<<_ColourN;
if ( _ColourMatrix ) stream << "_ColourMatrix"<<_ColourN;
if ( _ColourScalar && _LorentzScalar && _SpinScalar ) stream << "_Complex";
typesize = sizeof(typename vobj::scalar_type);
if ( _ColourMatrix ) typesize*= _ColourN*_ColourN;
else typesize*= _ColourN;
if ( _SpinMatrix ) typesize*= _SpinN*_SpinN;
else typesize*= _SpinN;
colors = _ColourN;
spins = _SpinN;
datacount = _LorentzN;
return stream.str();
}
template<class vobj> std::string ScidacRecordTypeString(Lattice<vobj> & lat,int &colors, int &spins, int & typesize,int &datacount) {
return ScidacRecordTypeString<vobj>(colors,spins,typesize,datacount);
};
////////////////////////////////////////////////////////////
// Helper to fill out metadata
////////////////////////////////////////////////////////////
template<class vobj> void ScidacMetaData(Lattice<vobj> & field,
FieldMetaData &header,
scidacRecord & _scidacRecord,
scidacFile & _scidacFile)
{
typedef typename getPrecision<vobj>::real_scalar_type stype;
/////////////////////////////////////
// Pull Grid's metadata
/////////////////////////////////////
PrepareMetaData(field,header);
/////////////////////////////////////
// Scidac Private File structure
/////////////////////////////////////
_scidacFile = scidacFile(field._grid);
/////////////////////////////////////
// Scidac Private Record structure
/////////////////////////////////////
scidacRecord sr;
sr.datatype = ScidacRecordTypeString(field,sr.colors,sr.spins,sr.typesize,sr.datacount);
sr.date = header.creation_date;
sr.precision = ScidacWordMnemonic<stype>();
sr.recordtype = GRID_IO_FIELD;
_scidacRecord = sr;
// std::cout << GridLogMessage << "Build SciDAC datatype " <<sr.datatype<<std::endl;
}
///////////////////////////////////////////////////////
// Scidac checksum
///////////////////////////////////////////////////////
static int scidacChecksumVerify(scidacChecksum &scidacChecksum_,uint32_t scidac_csuma,uint32_t scidac_csumb)
{
uint32_t scidac_checksuma = stoull(scidacChecksum_.suma,0,16);
uint32_t scidac_checksumb = stoull(scidacChecksum_.sumb,0,16);
if ( scidac_csuma !=scidac_checksuma) return 0;
if ( scidac_csumb !=scidac_checksumb) return 0;
return 1;
}
////////////////////////////////////////////////////////////////////////////////////
// Lime, ILDG and Scidac I/O classes
////////////////////////////////////////////////////////////////////////////////////
class GridLimeReader : public BinaryIO {
public:
///////////////////////////////////////////////////
// FIXME: format for RNG? Now just binary out instead
///////////////////////////////////////////////////
FILE *File;
LimeReader *LimeR;
std::string filename;
/////////////////////////////////////////////
// Open the file
/////////////////////////////////////////////
void open(const std::string &_filename)
{
filename= _filename;
File = fopen(filename.c_str(), "r");
if (File == nullptr)
{
std::cerr << "cannot open file '" << filename << "'" << std::endl;
abort();
}
LimeR = limeCreateReader(File);
}
/////////////////////////////////////////////
// Close the file
/////////////////////////////////////////////
void close(void){
fclose(File);
// limeDestroyReader(LimeR);
}
////////////////////////////////////////////
// Read a generic lattice field and verify checksum
////////////////////////////////////////////
template<class vobj>
void readLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
{
typedef typename vobj::scalar_object sobj;
scidacChecksum scidacChecksum_;
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
std::string format = getFormatString<vobj>();
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
uint64_t file_bytes =limeReaderBytes(LimeR);
// std::cout << GridLogMessage << limeReaderType(LimeR) << " "<< file_bytes <<" bytes "<<std::endl;
// std::cout << GridLogMessage<< " readLimeObject seeking "<< record_name <<" found record :" <<limeReaderType(LimeR) <<std::endl;
if ( !strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) ) ) {
// std::cout << GridLogMessage<< " readLimeLatticeBinaryObject matches ! " <<std::endl;
uint64_t PayloadSize = sizeof(sobj) * field._grid->_gsites;
// std::cout << "R sizeof(sobj)= " <<sizeof(sobj)<<std::endl;
// std::cout << "R Gsites " <<field._grid->_gsites<<std::endl;
// std::cout << "R Payload expected " <<PayloadSize<<std::endl;
// std::cout << "R file size " <<file_bytes <<std::endl;
assert(PayloadSize == file_bytes);// Must match or user error
uint64_t offset= ftello(File);
// std::cout << " ReadLatticeObject from offset "<<offset << std::endl;
BinarySimpleMunger<sobj,sobj> munge;
BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
/////////////////////////////////////////////
// Insist checksum is next record
/////////////////////////////////////////////
readLimeObject(scidacChecksum_,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
/////////////////////////////////////////////
// Verify checksums
/////////////////////////////////////////////
assert(scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb)==1);
return;
}
}
}
////////////////////////////////////////////
// Read a generic serialisable object
////////////////////////////////////////////
void readLimeObject(std::string &xmlstring,std::string record_name)
{
// should this be a do while; can we miss a first record??
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
// std::cout << GridLogMessage<< " readLimeObject seeking "<< record_name <<" found record :" <<limeReaderType(LimeR) <<std::endl;
uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration)
if ( !strncmp(limeReaderType(LimeR), record_name.c_str(),strlen(record_name.c_str()) ) ) {
// std::cout << GridLogMessage<< " readLimeObject matches ! " << record_name <<std::endl;
std::vector<char> xmlc(nbytes+1,'\0');
limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);
// std::cout << GridLogMessage<< " readLimeObject matches XML " << &xmlc[0] <<std::endl;
xmlstring = std::string(&xmlc[0]);
return;
}
}
assert(0);
}
template<class serialisable_object>
void readLimeObject(serialisable_object &object,std::string object_name,std::string record_name)
{
std::string xmlstring;
readLimeObject(xmlstring, record_name);
XmlReader RD(xmlstring, true, "");
read(RD,object_name,object);
}
};
class GridLimeWriter : public BinaryIO
{
public:
///////////////////////////////////////////////////
// FIXME: format for RNG? Now just binary out instead
// FIXME: collective calls or not ?
// : must know if I am the I/O boss
///////////////////////////////////////////////////
FILE *File;
LimeWriter *LimeW;
std::string filename;
bool boss_node;
GridLimeWriter( bool isboss = true) {
boss_node = isboss;
}
void open(const std::string &_filename) {
filename= _filename;
if ( boss_node ) {
File = fopen(filename.c_str(), "w");
LimeW = limeCreateWriter(File); assert(LimeW != NULL );
}
}
/////////////////////////////////////////////
// Close the file
/////////////////////////////////////////////
void close(void) {
if ( boss_node ) {
fclose(File);
}
// limeDestroyWriter(LimeW);
}
///////////////////////////////////////////////////////
// Lime utility functions
///////////////////////////////////////////////////////
int createLimeRecordHeader(std::string message, int MB, int ME, size_t PayloadSize)
{
if ( boss_node ) {
LimeRecordHeader *h;
h = limeCreateHeader(MB, ME, const_cast<char *>(message.c_str()), PayloadSize);
assert(limeWriteRecordHeader(h, LimeW) >= 0);
limeDestroyHeader(h);
}
return LIME_SUCCESS;
}
////////////////////////////////////////////
// Write a generic serialisable object
////////////////////////////////////////////
void writeLimeObject(int MB,int ME,XmlWriter &writer,std::string object_name,std::string record_name)
{
if ( boss_node ) {
std::string xmlstring = writer.docString();
// std::cout << "WriteLimeObject" << record_name <<std::endl;
uint64_t nbytes = xmlstring.size();
// std::cout << " xmlstring "<< nbytes<< " " << xmlstring <<std::endl;
int err;
LimeRecordHeader *h = limeCreateHeader(MB, ME,const_cast<char *>(record_name.c_str()), nbytes);
assert(h!= NULL);
err=limeWriteRecordHeader(h, LimeW); assert(err>=0);
err=limeWriteRecordData(&xmlstring[0], &nbytes, LimeW); assert(err>=0);
err=limeWriterCloseRecord(LimeW); assert(err>=0);
limeDestroyHeader(h);
}
}
template<class serialisable_object>
void writeLimeObject(int MB,int ME,serialisable_object &object,std::string object_name,std::string record_name, const unsigned int scientificPrec = 0)
{
XmlWriter WR("","");
if (scientificPrec)
{
WR.scientificFormat(true);
WR.setPrecision(scientificPrec);
}
write(WR,object_name,object);
writeLimeObject(MB, ME, WR, object_name, record_name);
}
////////////////////////////////////////////////////
// Write a generic lattice field and csum
// This routine is Collectively called by all nodes
// in communicator used by the field._grid
////////////////////////////////////////////////////
template<class vobj>
void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
{
////////////////////////////////////////////////////////////////////
// NB: FILE and iostream are jointly writing disjoint sequences in the
// the same file through different file handles (integer units).
//
// These are both buffered, so why I think this code is right is as follows.
//
// i) write record header to FILE *File, telegraphing the size; flush
// ii) ftello reads the offset from FILE *File .
// iii) iostream / MPI Open independently seek this offset. Write sequence direct to disk.
// Closes iostream and flushes.
// iv) fseek on FILE * to end of this disjoint section.
// v) Continue writing scidac record.
////////////////////////////////////////////////////////////////////
GridBase *grid = field._grid;
assert(boss_node == field._grid->IsBoss() );
////////////////////////////////////////////
// Create record header
////////////////////////////////////////////
typedef typename vobj::scalar_object sobj;
int err;
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
uint64_t PayloadSize = sizeof(sobj) * grid->_gsites;
if ( boss_node ) {
createLimeRecordHeader(record_name, 0, 0, PayloadSize);
fflush(File);
}
// std::cout << "W sizeof(sobj)" <<sizeof(sobj)<<std::endl;
// std::cout << "W Gsites " <<field._grid->_gsites<<std::endl;
// std::cout << "W Payload expected " <<PayloadSize<<std::endl;
////////////////////////////////////////////////
// Check all nodes agree on file position
////////////////////////////////////////////////
uint64_t offset1;
if ( boss_node ) {
offset1 = ftello(File);
}
grid->Broadcast(0,(void *)&offset1,sizeof(offset1));
///////////////////////////////////////////
// The above is collective. Write by other means into the binary record
///////////////////////////////////////////
std::string format = getFormatString<vobj>();
BinarySimpleMunger<sobj,sobj> munge;
BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb);
///////////////////////////////////////////
// Wind forward and close the record
///////////////////////////////////////////
if ( boss_node ) {
fseek(File,0,SEEK_END);
uint64_t offset2 = ftello(File); // std::cout << " now at offset "<<offset2 << std::endl;
assert( (offset2-offset1) == PayloadSize);
}
/////////////////////////////////////////////////////////////
// Check MPI-2 I/O did what we expect to file
/////////////////////////////////////////////////////////////
if ( boss_node ) {
err=limeWriterCloseRecord(LimeW); assert(err>=0);
}
////////////////////////////////////////
// Write checksum element, propagaing forward from the BinaryIO
// Always pair a checksum with a binary object, and close message
////////////////////////////////////////
scidacChecksum checksum;
std::stringstream streama; streama << std::hex << scidac_csuma;
std::stringstream streamb; streamb << std::hex << scidac_csumb;
checksum.suma= streama.str();
checksum.sumb= streamb.str();
if ( boss_node ) {
writeLimeObject(0,1,checksum,std::string("scidacChecksum"),std::string(SCIDAC_CHECKSUM));
}
}
};
class ScidacWriter : public GridLimeWriter {
public:
ScidacWriter(bool isboss =true ) : GridLimeWriter(isboss) { };
template<class SerialisableUserFile>
void writeScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile)
{
scidacFile _scidacFile(grid);
if ( this->boss_node ) {
writeLimeObject(1,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
writeLimeObject(0,1,_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
}
}
////////////////////////////////////////////////
// Write generic lattice field in scidac format
////////////////////////////////////////////////
template <class vobj, class userRecord>
void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord,
const unsigned int recordScientificPrec = 0)
{
GridBase * grid = field._grid;
////////////////////////////////////////
// fill the Grid header
////////////////////////////////////////
FieldMetaData header;
scidacRecord _scidacRecord;
scidacFile _scidacFile;
ScidacMetaData(field,header,_scidacRecord,_scidacFile);
//////////////////////////////////////////////
// Fill the Lime file record by record
//////////////////////////////////////////////
if ( this->boss_node ) {
writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message
writeLimeObject(0,0,_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML), recordScientificPrec);
writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
}
// Collective call
writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA)); // Closes message with checksum
}
};
class ScidacReader : public GridLimeReader {
public:
template<class SerialisableUserFile>
void readScidacFileRecord(GridBase *grid,SerialisableUserFile &_userFile)
{
scidacFile _scidacFile(grid);
readLimeObject(_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
readLimeObject(_userFile,_userFile.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
}
////////////////////////////////////////////////
// Write generic lattice field in scidac format
////////////////////////////////////////////////
template <class vobj, class userRecord>
void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord)
{
typedef typename vobj::scalar_object sobj;
GridBase * grid = field._grid;
////////////////////////////////////////
// fill the Grid header
////////////////////////////////////////
FieldMetaData header;
scidacRecord _scidacRecord;
scidacFile _scidacFile;
//////////////////////////////////////////////
// Fill the Lime file record by record
//////////////////////////////////////////////
readLimeObject(header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message
readLimeObject(_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
readLimeObject(_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
readLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA));
}
void skipPastBinaryRecord(void) {
std::string rec_name(ILDG_BINARY_DATA);
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) ) ) {
skipPastObjectRecord(std::string(SCIDAC_CHECKSUM));
return;
}
}
}
void skipPastObjectRecord(std::string rec_name) {
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) ) ) {
return;
}
}
}
void skipScidacFieldRecord() {
skipPastObjectRecord(std::string(GRID_FORMAT));
skipPastObjectRecord(std::string(SCIDAC_RECORD_XML));
skipPastObjectRecord(std::string(SCIDAC_PRIVATE_RECORD_XML));
skipPastBinaryRecord();
}
};
class IldgWriter : public ScidacWriter {
public:
IldgWriter(bool isboss) : ScidacWriter(isboss) {};
///////////////////////////////////
// A little helper
///////////////////////////////////
void writeLimeIldgLFN(std::string &LFN)
{
uint64_t PayloadSize = LFN.size();
int err;
createLimeRecordHeader(ILDG_DATA_LFN, 0 , 0, PayloadSize);
err=limeWriteRecordData(const_cast<char*>(LFN.c_str()), &PayloadSize,LimeW); assert(err>=0);
err=limeWriterCloseRecord(LimeW); assert(err>=0);
}
////////////////////////////////////////////////////////////////
// Special ILDG operations ; gauge configs only.
// Don't require scidac records EXCEPT checksum
// Use Grid MetaData object if present.
////////////////////////////////////////////////////////////////
template <class vsimd>
void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,int sequence,std::string LFN,std::string description)
{
GridBase * grid = Umu._grid;
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
typedef iLorentzColourMatrix<vsimd> vobj;
typedef typename vobj::scalar_object sobj;
////////////////////////////////////////
// fill the Grid header
////////////////////////////////////////
FieldMetaData header;
scidacRecord _scidacRecord;
scidacFile _scidacFile;
ScidacMetaData(Umu,header,_scidacRecord,_scidacFile);
std::string format = header.floating_point;
header.ensemble_id = description;
header.ensemble_label = description;
header.sequence_number = sequence;
header.ildg_lfn = LFN;
assert ( (format == std::string("IEEE32BIG"))
||(format == std::string("IEEE64BIG")) );
//////////////////////////////////////////////////////
// Fill ILDG header data struct
//////////////////////////////////////////////////////
ildgFormat ildgfmt ;
ildgfmt.field = std::string("su3gauge");
if ( format == std::string("IEEE32BIG") ) {
ildgfmt.precision = 32;
} else {
ildgfmt.precision = 64;
}
ildgfmt.version = 1.0;
ildgfmt.lx = header.dimension[0];
ildgfmt.ly = header.dimension[1];
ildgfmt.lz = header.dimension[2];
ildgfmt.lt = header.dimension[3];
assert(header.nd==4);
assert(header.nd==header.dimension.size());
//////////////////////////////////////////////////////////////////////////////
// Fill the USQCD info field
//////////////////////////////////////////////////////////////////////////////
usqcdInfo info;
info.version=1.0;
info.plaq = header.plaquette;
info.linktr = header.link_trace;
std::cout << GridLogMessage << " Writing config; IldgIO "<<std::endl;
//////////////////////////////////////////////
// Fill the Lime file record by record
//////////////////////////////////////////////
writeLimeObject(1,0,header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message
writeLimeObject(0,0,_scidacFile,_scidacFile.SerialisableClassName(),std::string(SCIDAC_PRIVATE_FILE_XML));
writeLimeObject(0,1,info,info.SerialisableClassName(),std::string(SCIDAC_FILE_XML));
writeLimeObject(1,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
writeLimeObject(0,0,info,info.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
writeLimeObject(0,0,ildgfmt,std::string("ildgFormat") ,std::string(ILDG_FORMAT)); // rec
writeLimeIldgLFN(header.ildg_lfn); // rec
writeLimeLatticeBinaryObject(Umu,std::string(ILDG_BINARY_DATA)); // Closes message with checksum
// limeDestroyWriter(LimeW);
}
};
class IldgReader : public GridLimeReader {
public:
////////////////////////////////////////////////////////////////
// Read either Grid/SciDAC/ILDG configuration
// Don't require scidac records EXCEPT checksum
// Use Grid MetaData object if present.
// Else use ILDG MetaData object if present.
// Else use SciDAC MetaData object if present.
////////////////////////////////////////////////////////////////
template <class vsimd>
void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu, FieldMetaData &FieldMetaData_) {
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
typedef typename GaugeField::vector_object vobj;
typedef typename vobj::scalar_object sobj;
typedef LorentzColourMatrixF fobj;
typedef LorentzColourMatrixD dobj;
GridBase *grid = Umu._grid;
std::vector<int> dims = Umu._grid->FullDimensions();
assert(dims.size()==4);
// Metadata holders
ildgFormat ildgFormat_ ;
std::string ildgLFN_ ;
scidacChecksum scidacChecksum_;
usqcdInfo usqcdInfo_ ;
// track what we read from file
int found_ildgFormat =0;
int found_ildgLFN =0;
int found_scidacChecksum=0;
int found_usqcdInfo =0;
int found_ildgBinary =0;
int found_FieldMetaData =0;
uint32_t nersc_csum;
uint32_t scidac_csuma;
uint32_t scidac_csumb;
// Binary format
std::string format;
//////////////////////////////////////////////////////////////////////////
// Loop over all records
// -- Order is poorly guaranteed except ILDG header preceeds binary section.
// -- Run like an event loop.
// -- Impose trust hierarchy. Grid takes precedence & look for ILDG, and failing
// that Scidac.
// -- Insist on Scidac checksum record.
//////////////////////////////////////////////////////////////////////////
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
uint64_t nbytes = limeReaderBytes(LimeR);//size of this record (configuration)
//////////////////////////////////////////////////////////////////
// If not BINARY_DATA read a string and parse
//////////////////////////////////////////////////////////////////
if ( strncmp(limeReaderType(LimeR), ILDG_BINARY_DATA,strlen(ILDG_BINARY_DATA) ) ) {
// Copy out the string
std::vector<char> xmlc(nbytes+1,'\0');
limeReaderReadData((void *)&xmlc[0], &nbytes, LimeR);
// std::cout << GridLogMessage<< "Non binary record :" <<limeReaderType(LimeR) <<std::endl; //<<"\n"<<(&xmlc[0])<<std::endl;
//////////////////////////////////
// ILDG format record
std::string xmlstring(&xmlc[0]);
if ( !strncmp(limeReaderType(LimeR), ILDG_FORMAT,strlen(ILDG_FORMAT)) ) {
XmlReader RD(xmlstring, true, "");
read(RD,"ildgFormat",ildgFormat_);
if ( ildgFormat_.precision == 64 ) format = std::string("IEEE64BIG");
if ( ildgFormat_.precision == 32 ) format = std::string("IEEE32BIG");
assert( ildgFormat_.lx == dims[0]);
assert( ildgFormat_.ly == dims[1]);
assert( ildgFormat_.lz == dims[2]);
assert( ildgFormat_.lt == dims[3]);
found_ildgFormat = 1;
}
if ( !strncmp(limeReaderType(LimeR), ILDG_DATA_LFN,strlen(ILDG_DATA_LFN)) ) {
FieldMetaData_.ildg_lfn = xmlstring;
found_ildgLFN = 1;
}
if ( !strncmp(limeReaderType(LimeR), GRID_FORMAT,strlen(ILDG_FORMAT)) ) {
XmlReader RD(xmlstring, true, "");
read(RD,"FieldMetaData",FieldMetaData_);
format = FieldMetaData_.floating_point;
assert(FieldMetaData_.dimension[0] == dims[0]);
assert(FieldMetaData_.dimension[1] == dims[1]);
assert(FieldMetaData_.dimension[2] == dims[2]);
assert(FieldMetaData_.dimension[3] == dims[3]);
found_FieldMetaData = 1;
}
if ( !strncmp(limeReaderType(LimeR), SCIDAC_RECORD_XML,strlen(SCIDAC_RECORD_XML)) ) {
// is it a USQCD info field
if ( xmlstring.find(std::string("usqcdInfo")) != std::string::npos ) {
// std::cout << GridLogMessage<<"...found a usqcdInfo field"<<std::endl;
XmlReader RD(xmlstring, true, "");
read(RD,"usqcdInfo",usqcdInfo_);
found_usqcdInfo = 1;
}
}
if ( !strncmp(limeReaderType(LimeR), SCIDAC_CHECKSUM,strlen(SCIDAC_CHECKSUM)) ) {
XmlReader RD(xmlstring, true, "");
read(RD,"scidacChecksum",scidacChecksum_);
found_scidacChecksum = 1;
}
} else {
/////////////////////////////////
// Binary data
/////////////////////////////////
std::cout << GridLogMessage << "ILDG Binary record found : " ILDG_BINARY_DATA << std::endl;
uint64_t offset= ftello(File);
if ( format == std::string("IEEE64BIG") ) {
GaugeSimpleMunger<dobj, sobj> munge;
BinaryIO::readLatticeObject< vobj, dobj >(Umu, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
} else {
GaugeSimpleMunger<fobj, sobj> munge;
BinaryIO::readLatticeObject< vobj, fobj >(Umu, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
}
found_ildgBinary = 1;
}
}
//////////////////////////////////////////////////////
// Minimally must find binary segment and checksum
// Since this is an ILDG reader require ILDG format
//////////////////////////////////////////////////////
assert(found_ildgBinary);
assert(found_ildgFormat);
assert(found_scidacChecksum);
// Must find something with the lattice dimensions
assert(found_FieldMetaData||found_ildgFormat);
if ( found_FieldMetaData ) {
std::cout << GridLogMessage<<"Grid MetaData was record found: configuration was probably written by Grid ! Yay ! "<<std::endl;
} else {
assert(found_ildgFormat);
assert ( ildgFormat_.field == std::string("su3gauge") );
///////////////////////////////////////////////////////////////////////////////////////
// Populate our Grid metadata as best we can
///////////////////////////////////////////////////////////////////////////////////////
std::ostringstream vers; vers << ildgFormat_.version;
FieldMetaData_.hdr_version = vers.str();
FieldMetaData_.data_type = std::string("4D_SU3_GAUGE_3X3");
FieldMetaData_.nd=4;
FieldMetaData_.dimension.resize(4);
FieldMetaData_.dimension[0] = ildgFormat_.lx ;
FieldMetaData_.dimension[1] = ildgFormat_.ly ;
FieldMetaData_.dimension[2] = ildgFormat_.lz ;
FieldMetaData_.dimension[3] = ildgFormat_.lt ;
if ( found_usqcdInfo ) {
FieldMetaData_.plaquette = usqcdInfo_.plaq;
FieldMetaData_.link_trace= usqcdInfo_.linktr;
std::cout << GridLogMessage <<"This configuration was probably written by USQCD "<<std::endl;
std::cout << GridLogMessage <<"USQCD xml record Plaquette : "<<FieldMetaData_.plaquette<<std::endl;
std::cout << GridLogMessage <<"USQCD xml record LinkTrace : "<<FieldMetaData_.link_trace<<std::endl;
} else {
FieldMetaData_.plaquette = 0.0;
FieldMetaData_.link_trace= 0.0;
std::cout << GridLogWarning << "This configuration is unsafe with no plaquette records that can verify it !!! "<<std::endl;
}
}
////////////////////////////////////////////////////////////
// Really really want to mandate a scidac checksum
////////////////////////////////////////////////////////////
if ( found_scidacChecksum ) {
FieldMetaData_.scidac_checksuma = stoull(scidacChecksum_.suma,0,16);
FieldMetaData_.scidac_checksumb = stoull(scidacChecksum_.sumb,0,16);
scidacChecksumVerify(scidacChecksum_,scidac_csuma,scidac_csumb);
assert( scidac_csuma ==FieldMetaData_.scidac_checksuma);
assert( scidac_csumb ==FieldMetaData_.scidac_checksumb);
std::cout << GridLogMessage<<"SciDAC checksums match " << std::endl;
} else {
std::cout << GridLogWarning<<"SciDAC checksums not found. This is unsafe. " << std::endl;
assert(0); // Can I insist always checksum ?
}
if ( found_FieldMetaData || found_usqcdInfo ) {
FieldMetaData checker;
GaugeStatistics(Umu,checker);
assert(fabs(checker.plaquette - FieldMetaData_.plaquette )<1.0e-5);
assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5);
std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl;
}
}
};
}}
//HAVE_LIME
#endif
#endif

View File

@ -1,237 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/parallelIO/IldgIO.h
Copyright (C) 2015
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_ILDGTYPES_IO_H
#define GRID_ILDGTYPES_IO_H
#ifdef HAVE_LIME
extern "C" { // for linkage
#include "lime.h"
}
namespace Grid {
/////////////////////////////////////////////////////////////////////////////////
// Data representation of records that enter ILDG and SciDac formats
/////////////////////////////////////////////////////////////////////////////////
#define GRID_FORMAT "grid-format"
#define ILDG_FORMAT "ildg-format"
#define ILDG_BINARY_DATA "ildg-binary-data"
#define ILDG_DATA_LFN "ildg-data-lfn"
#define SCIDAC_CHECKSUM "scidac-checksum"
#define SCIDAC_PRIVATE_FILE_XML "scidac-private-file-xml"
#define SCIDAC_FILE_XML "scidac-file-xml"
#define SCIDAC_PRIVATE_RECORD_XML "scidac-private-record-xml"
#define SCIDAC_RECORD_XML "scidac-record-xml"
#define SCIDAC_BINARY_DATA "scidac-binary-data"
// Unused SCIDAC records names; could move to support this functionality
#define SCIDAC_SITELIST "scidac-sitelist"
////////////////////////////////////////////////////////////
const int GRID_IO_SINGLEFILE = 0; // hardcode lift from QIO compat
const int GRID_IO_MULTIFILE = 1; // hardcode lift from QIO compat
const int GRID_IO_FIELD = 0; // hardcode lift from QIO compat
const int GRID_IO_GLOBAL = 1; // hardcode lift from QIO compat
////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////
// QIO uses mandatory "private" records fixed format
// Private is in principle "opaque" however it can't be changed now because that would break existing
// file compatability, so should be correct to assume the undocumented but defacto file structure.
/////////////////////////////////////////////////////////////////////////////////
struct emptyUserRecord : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(emptyUserRecord,int,dummy);
emptyUserRecord() { dummy=0; };
};
////////////////////////
// Scidac private file xml
// <?xml version="1.0" encoding="UTF-8"?><scidacFile><version>1.1</version><spacetime>4</spacetime><dims>16 16 16 32 </dims><volfmt>0</volfmt></scidacFile>
////////////////////////
struct scidacFile : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(scidacFile,
double, version,
int, spacetime,
std::string, dims, // must convert to int
int, volfmt);
std::vector<int> getDimensions(void) {
std::stringstream stream(dims);
std::vector<int> dimensions;
int n;
while(stream >> n){
dimensions.push_back(n);
}
return dimensions;
}
void setDimensions(std::vector<int> dimensions) {
char delimiter = ' ';
std::stringstream stream;
for(int i=0;i<dimensions.size();i++){
stream << dimensions[i];
if ( i != dimensions.size()-1) {
stream << delimiter <<std::endl;
}
}
dims = stream.str();
}
// Constructor provides Grid
scidacFile() =default; // default constructor
scidacFile(GridBase * grid){
version = 1.0;
spacetime = grid->_ndimension;
setDimensions(grid->FullDimensions());
volfmt = GRID_IO_SINGLEFILE;
}
};
///////////////////////////////////////////////////////////////////////
// scidac-private-record-xml : example
// <scidacRecord>
// <version>1.1</version><date>Tue Jul 26 21:14:44 2011 UTC</date><recordtype>0</recordtype>
// <datatype>QDP_D3_ColorMatrix</datatype><precision>D</precision><colors>3</colors><spins>4</spins>
// <typesize>144</typesize><datacount>4</datacount>
// </scidacRecord>
///////////////////////////////////////////////////////////////////////
struct scidacRecord : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(scidacRecord,
double, version,
std::string, date,
int, recordtype,
std::string, datatype,
std::string, precision,
int, colors,
int, spins,
int, typesize,
int, datacount);
scidacRecord()
: version(1.0), recordtype(0), colors(0), spins(0), typesize(0), datacount(0)
{}
};
////////////////////////
// ILDG format
////////////////////////
struct ildgFormat : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(ildgFormat,
double, version,
std::string, field,
int, precision,
int, lx,
int, ly,
int, lz,
int, lt);
ildgFormat() { version=1.0; };
};
////////////////////////
// USQCD info
////////////////////////
struct usqcdInfo : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdInfo,
double, version,
double, plaq,
double, linktr,
std::string, info);
usqcdInfo() {
version=1.0;
};
};
////////////////////////
// Scidac Checksum
////////////////////////
struct scidacChecksum : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(scidacChecksum,
double, version,
std::string, suma,
std::string, sumb);
scidacChecksum() {
version=1.0;
};
};
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Type: scidac-file-xml <title>MILC ILDG archival gauge configuration</title>
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Type:
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////
// Scidac private file xml
// <?xml version="1.0" encoding="UTF-8"?><scidacFile><version>1.1</version><spacetime>4</spacetime><dims>16 16 16 32 </dims><volfmt>0</volfmt></scidacFile>
////////////////////////
#if 0
////////////////////////////////////////////////////////////////////////////////////////
// From http://www.physics.utah.edu/~detar/scidac/qio_2p3.pdf
////////////////////////////////////////////////////////////////////////////////////////
struct usqcdPropFile : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdPropFile,
double, version,
std::string, type,
std::string, info);
usqcdPropFile() {
version=1.0;
};
};
struct usqcdSourceInfo : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdSourceInfo,
double, version,
std::string, info);
usqcdSourceInfo() {
version=1.0;
};
};
struct usqcdPropInfo : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(usqcdPropInfo,
double, version,
int, spin,
int, color,
std::string, info);
usqcdPropInfo() {
version=1.0;
};
};
#endif
}
#endif
#endif

View File

@ -1,327 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/parallelIO/NerscIO.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <algorithm>
#include <iostream>
#include <iomanip>
#include <fstream>
#include <map>
#include <unistd.h>
#include <sys/utsname.h>
#include <pwd.h>
namespace Grid {
///////////////////////////////////////////////////////
// Precision mapping
///////////////////////////////////////////////////////
template<class vobj> static std::string getFormatString (void)
{
std::string format;
typedef typename getPrecision<vobj>::real_scalar_type stype;
if ( sizeof(stype) == sizeof(float) ) {
format = std::string("IEEE32BIG");
}
if ( sizeof(stype) == sizeof(double) ) {
format = std::string("IEEE64BIG");
}
return format;
}
////////////////////////////////////////////////////////////////////////////////
// header specification/interpretation
////////////////////////////////////////////////////////////////////////////////
class FieldMetaData : Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(FieldMetaData,
int, nd,
std::vector<int>, dimension,
std::vector<std::string>, boundary,
int, data_start,
std::string, hdr_version,
std::string, storage_format,
double, link_trace,
double, plaquette,
uint32_t, checksum,
uint32_t, scidac_checksuma,
uint32_t, scidac_checksumb,
unsigned int, sequence_number,
std::string, data_type,
std::string, ensemble_id,
std::string, ensemble_label,
std::string, ildg_lfn,
std::string, creator,
std::string, creator_hardware,
std::string, creation_date,
std::string, archive_date,
std::string, floating_point);
// WARNING: non-initialised values might lead to twisted parallel IO
// issues, std::string are fine because they initliase to size 0
// as per C++ standard.
FieldMetaData(void)
: nd(4), dimension(4,0), boundary(4, ""), data_start(0),
link_trace(0.), plaquette(0.), checksum(0),
scidac_checksuma(0), scidac_checksumb(0), sequence_number(0)
{}
};
namespace QCD {
using namespace Grid;
//////////////////////////////////////////////////////////////////////
// Bit and Physical Checksumming and QA of data
//////////////////////////////////////////////////////////////////////
inline void GridMetaData(GridBase *grid,FieldMetaData &header)
{
int nd = grid->_ndimension;
header.nd = nd;
header.dimension.resize(nd);
header.boundary.resize(nd);
header.data_start = 0;
for(int d=0;d<nd;d++) {
header.dimension[d] = grid->_fdimensions[d];
}
for(int d=0;d<nd;d++) {
header.boundary[d] = std::string("PERIODIC");
}
}
inline void MachineCharacteristics(FieldMetaData &header)
{
// Who
struct passwd *pw = getpwuid (getuid());
if (pw) header.creator = std::string(pw->pw_name);
// When
std::time_t t = std::time(nullptr);
std::tm tm_ = *std::localtime(&t);
std::ostringstream oss;
// oss << std::put_time(&tm_, "%c %Z");
header.creation_date = oss.str();
header.archive_date = header.creation_date;
// What
struct utsname name; uname(&name);
header.creator_hardware = std::string(name.nodename)+"-";
header.creator_hardware+= std::string(name.machine)+"-";
header.creator_hardware+= std::string(name.sysname)+"-";
header.creator_hardware+= std::string(name.release);
}
#define dump_meta_data(field, s) \
s << "BEGIN_HEADER" << std::endl; \
s << "HDR_VERSION = " << field.hdr_version << std::endl; \
s << "DATATYPE = " << field.data_type << std::endl; \
s << "STORAGE_FORMAT = " << field.storage_format << std::endl; \
for(int i=0;i<4;i++){ \
s << "DIMENSION_" << i+1 << " = " << field.dimension[i] << std::endl ; \
} \
s << "LINK_TRACE = " << std::setprecision(10) << field.link_trace << std::endl; \
s << "PLAQUETTE = " << std::setprecision(10) << field.plaquette << std::endl; \
for(int i=0;i<4;i++){ \
s << "BOUNDARY_"<<i+1<<" = " << field.boundary[i] << std::endl; \
} \
\
s << "CHECKSUM = "<< std::hex << std::setw(10) << field.checksum << std::dec<<std::endl; \
s << "SCIDAC_CHECKSUMA = "<< std::hex << std::setw(10) << field.scidac_checksuma << std::dec<<std::endl; \
s << "SCIDAC_CHECKSUMB = "<< std::hex << std::setw(10) << field.scidac_checksumb << std::dec<<std::endl; \
s << "ENSEMBLE_ID = " << field.ensemble_id << std::endl; \
s << "ENSEMBLE_LABEL = " << field.ensemble_label << std::endl; \
s << "SEQUENCE_NUMBER = " << field.sequence_number << std::endl; \
s << "CREATOR = " << field.creator << std::endl; \
s << "CREATOR_HARDWARE = "<< field.creator_hardware << std::endl; \
s << "CREATION_DATE = " << field.creation_date << std::endl; \
s << "ARCHIVE_DATE = " << field.archive_date << std::endl; \
s << "FLOATING_POINT = " << field.floating_point << std::endl; \
s << "END_HEADER" << std::endl;
template<class vobj> inline void PrepareMetaData(Lattice<vobj> & field, FieldMetaData &header)
{
GridBase *grid = field._grid;
std::string format = getFormatString<vobj>();
header.floating_point = format;
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
GridMetaData(grid,header);
MachineCharacteristics(header);
}
inline void GaugeStatistics(Lattice<vLorentzColourMatrixF> & data,FieldMetaData &header)
{
// How to convert data precision etc...
header.link_trace=Grid::QCD::WilsonLoops<PeriodicGimplF>::linkTrace(data);
header.plaquette =Grid::QCD::WilsonLoops<PeriodicGimplF>::avgPlaquette(data);
}
inline void GaugeStatistics(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
{
// How to convert data precision etc...
header.link_trace=Grid::QCD::WilsonLoops<PeriodicGimplD>::linkTrace(data);
header.plaquette =Grid::QCD::WilsonLoops<PeriodicGimplD>::avgPlaquette(data);
}
template<> inline void PrepareMetaData<vLorentzColourMatrixF>(Lattice<vLorentzColourMatrixF> & field, FieldMetaData &header)
{
GridBase *grid = field._grid;
std::string format = getFormatString<vLorentzColourMatrixF>();
header.floating_point = format;
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
GridMetaData(grid,header);
GaugeStatistics(field,header);
MachineCharacteristics(header);
}
template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzColourMatrixD> & field, FieldMetaData &header)
{
GridBase *grid = field._grid;
std::string format = getFormatString<vLorentzColourMatrixD>();
header.floating_point = format;
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
GridMetaData(grid,header);
GaugeStatistics(field,header);
MachineCharacteristics(header);
}
//////////////////////////////////////////////////////////////////////
// Utilities ; these are QCD aware
//////////////////////////////////////////////////////////////////////
inline void reconstruct3(LorentzColourMatrix & cm)
{
const int x=0;
const int y=1;
const int z=2;
for(int mu=0;mu<Nd;mu++){
cm(mu)()(2,x) = adj(cm(mu)()(0,y)*cm(mu)()(1,z)-cm(mu)()(0,z)*cm(mu)()(1,y)); //x= yz-zy
cm(mu)()(2,y) = adj(cm(mu)()(0,z)*cm(mu)()(1,x)-cm(mu)()(0,x)*cm(mu)()(1,z)); //y= zx-xz
cm(mu)()(2,z) = adj(cm(mu)()(0,x)*cm(mu)()(1,y)-cm(mu)()(0,y)*cm(mu)()(1,x)); //z= xy-yx
}
}
////////////////////////////////////////////////////////////////////////////////
// Some data types for intermediate storage
////////////////////////////////////////////////////////////////////////////////
template<typename vtype> using iLorentzColour2x3 = iVector<iVector<iVector<vtype, Nc>, 2>, Nd >;
typedef iLorentzColour2x3<Complex> LorentzColour2x3;
typedef iLorentzColour2x3<ComplexF> LorentzColour2x3F;
typedef iLorentzColour2x3<ComplexD> LorentzColour2x3D;
/////////////////////////////////////////////////////////////////////////////////
// Simple classes for precision conversion
/////////////////////////////////////////////////////////////////////////////////
template <class fobj, class sobj>
struct BinarySimpleUnmunger {
typedef typename getPrecision<fobj>::real_scalar_type fobj_stype;
typedef typename getPrecision<sobj>::real_scalar_type sobj_stype;
void operator()(sobj &in, fobj &out) {
// take word by word and transform accoding to the status
fobj_stype *out_buffer = (fobj_stype *)&out;
sobj_stype *in_buffer = (sobj_stype *)&in;
size_t fobj_words = sizeof(out) / sizeof(fobj_stype);
size_t sobj_words = sizeof(in) / sizeof(sobj_stype);
assert(fobj_words == sobj_words);
for (unsigned int word = 0; word < sobj_words; word++)
out_buffer[word] = in_buffer[word]; // type conversion on the fly
}
};
template <class fobj, class sobj>
struct BinarySimpleMunger {
typedef typename getPrecision<fobj>::real_scalar_type fobj_stype;
typedef typename getPrecision<sobj>::real_scalar_type sobj_stype;
void operator()(fobj &in, sobj &out) {
// take word by word and transform accoding to the status
fobj_stype *in_buffer = (fobj_stype *)&in;
sobj_stype *out_buffer = (sobj_stype *)&out;
size_t fobj_words = sizeof(in) / sizeof(fobj_stype);
size_t sobj_words = sizeof(out) / sizeof(sobj_stype);
assert(fobj_words == sobj_words);
for (unsigned int word = 0; word < sobj_words; word++)
out_buffer[word] = in_buffer[word]; // type conversion on the fly
}
};
template<class fobj,class sobj>
struct GaugeSimpleMunger{
void operator()(fobj &in, sobj &out) {
for (int mu = 0; mu < Nd; mu++) {
for (int i = 0; i < Nc; i++) {
for (int j = 0; j < Nc; j++) {
out(mu)()(i, j) = in(mu)()(i, j);
}}
}
};
};
template <class fobj, class sobj>
struct GaugeSimpleUnmunger {
void operator()(sobj &in, fobj &out) {
for (int mu = 0; mu < Nd; mu++) {
for (int i = 0; i < Nc; i++) {
for (int j = 0; j < Nc; j++) {
out(mu)()(i, j) = in(mu)()(i, j);
}}
}
};
};
template<class fobj,class sobj>
struct Gauge3x2munger{
void operator() (fobj &in,sobj &out){
for(int mu=0;mu<Nd;mu++){
for(int i=0;i<2;i++){
for(int j=0;j<3;j++){
out(mu)()(i,j) = in(mu)(i)(j);
}}
}
reconstruct3(out);
}
};
template<class fobj,class sobj>
struct Gauge3x2unmunger{
void operator() (sobj &in,fobj &out){
for(int mu=0;mu<Nd;mu++){
for(int i=0;i<2;i++){
for(int j=0;j<3;j++){
out(mu)(i)(j) = in(mu)()(i,j);
}}
}
}
};
}
}

View File

@ -1,363 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/parallelIO/NerscIO.h
Copyright (C) 2015
Author: Matt Spraggs <matthew.spraggs@gmail.com>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_NERSC_IO_H
#define GRID_NERSC_IO_H
namespace Grid {
namespace QCD {
using namespace Grid;
////////////////////////////////////////////////////////////////////////////////
// Write and read from fstream; comput header offset for payload
////////////////////////////////////////////////////////////////////////////////
class NerscIO : public BinaryIO {
public:
static inline void truncate(std::string file){
std::ofstream fout(file,std::ios::out);
}
static inline unsigned int writeHeader(FieldMetaData &field,std::string file)
{
std::ofstream fout(file,std::ios::out|std::ios::in);
fout.seekp(0,std::ios::beg);
dump_meta_data(field, fout);
field.data_start = fout.tellp();
return field.data_start;
}
// for the header-reader
static inline int readHeader(std::string file,GridBase *grid, FieldMetaData &field)
{
uint64_t offset=0;
std::map<std::string,std::string> header;
std::string line;
//////////////////////////////////////////////////
// read the header
//////////////////////////////////////////////////
std::ifstream fin(file);
getline(fin,line); // read one line and insist is
removeWhitespace(line);
std::cout << GridLogMessage << "* " << line << std::endl;
assert(line==std::string("BEGIN_HEADER"));
do {
getline(fin,line); // read one line
std::cout << GridLogMessage << "* "<<line<< std::endl;
int eq = line.find("=");
if(eq >0) {
std::string key=line.substr(0,eq);
std::string val=line.substr(eq+1);
removeWhitespace(key);
removeWhitespace(val);
header[key] = val;
}
} while( line.find("END_HEADER") == std::string::npos );
field.data_start = fin.tellg();
//////////////////////////////////////////////////
// chomp the values
//////////////////////////////////////////////////
field.hdr_version = header["HDR_VERSION"];
field.data_type = header["DATATYPE"];
field.storage_format = header["STORAGE_FORMAT"];
field.dimension[0] = std::stol(header["DIMENSION_1"]);
field.dimension[1] = std::stol(header["DIMENSION_2"]);
field.dimension[2] = std::stol(header["DIMENSION_3"]);
field.dimension[3] = std::stol(header["DIMENSION_4"]);
assert(grid->_ndimension == 4);
for(int d=0;d<4;d++){
assert(grid->_fdimensions[d]==field.dimension[d]);
}
field.link_trace = std::stod(header["LINK_TRACE"]);
field.plaquette = std::stod(header["PLAQUETTE"]);
field.boundary[0] = header["BOUNDARY_1"];
field.boundary[1] = header["BOUNDARY_2"];
field.boundary[2] = header["BOUNDARY_3"];
field.boundary[3] = header["BOUNDARY_4"];
field.checksum = std::stoul(header["CHECKSUM"],0,16);
field.ensemble_id = header["ENSEMBLE_ID"];
field.ensemble_label = header["ENSEMBLE_LABEL"];
field.sequence_number = std::stol(header["SEQUENCE_NUMBER"]);
field.creator = header["CREATOR"];
field.creator_hardware = header["CREATOR_HARDWARE"];
field.creation_date = header["CREATION_DATE"];
field.archive_date = header["ARCHIVE_DATE"];
field.floating_point = header["FLOATING_POINT"];
return field.data_start;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Now the meat: the object readers
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vsimd>
static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
FieldMetaData& header,
std::string file)
{
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
GridBase *grid = Umu._grid;
uint64_t offset = readHeader(file,Umu._grid,header);
FieldMetaData clone(header);
std::string format(header.floating_point);
int ieee32big = (format == std::string("IEEE32BIG"));
int ieee32 = (format == std::string("IEEE32"));
int ieee64big = (format == std::string("IEEE64BIG"));
int ieee64 = (format == std::string("IEEE64"));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
// depending on datatype, set up munger;
// munger is a function of <floating point, Real, data_type>
if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3F>
(Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
if ( ieee64 || ieee64big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3D>
(Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
} else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
if ( ieee64 || ieee64big ) {
BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb);
}
} else {
assert(0);
}
GaugeStatistics(Umu,clone);
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<< std::dec
<<" header "<<std::hex<<header.checksum<<std::dec <<std::endl;
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" plaquette "<<clone.plaquette
<<" header "<<header.plaquette<<std::endl;
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" link_trace "<<clone.link_trace
<<" header "<<header.link_trace<<std::endl;
if ( fabs(clone.plaquette -header.plaquette ) >= 1.0e-5 ) {
std::cout << " Plaquette mismatch "<<std::endl;
std::cout << Umu[0]<<std::endl;
std::cout << Umu[1]<<std::endl;
}
if ( nersc_csum != header.checksum ) {
std::cerr << " checksum mismatch " << std::endl;
std::cerr << " plaqs " << clone.plaquette << " " << header.plaquette << std::endl;
std::cerr << " trace " << clone.link_trace<< " " << header.link_trace<< std::endl;
std::cerr << " nersc_csum " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
exit(0);
}
assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
assert(nersc_csum == header.checksum );
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
}
template<class vsimd>
static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
std::string file,
int two_row,
int bits32)
{
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
typedef iLorentzColourMatrix<vsimd> vobj;
typedef typename vobj::scalar_object sobj;
FieldMetaData header;
///////////////////////////////////////////
// Following should become arguments
///////////////////////////////////////////
header.sequence_number = 1;
header.ensemble_id = "UKQCD";
header.ensemble_label = "DWF";
typedef LorentzColourMatrixD fobj3D;
typedef LorentzColour2x3D fobj2D;
GridBase *grid = Umu._grid;
GridMetaData(grid,header);
assert(header.nd==4);
GaugeStatistics(Umu,header);
MachineCharacteristics(header);
uint64_t offset;
// Sod it -- always write 3x3 double
header.floating_point = std::string("IEEE64BIG");
header.data_type = std::string("4D_SU3_GAUGE_3x3");
GaugeSimpleUnmunger<fobj3D,sobj> munge;
if ( grid->IsBoss() ) {
truncate(file);
offset = writeHeader(header,file);
}
grid->Broadcast(0,(void *)&offset,sizeof(offset));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
nersc_csum,scidac_csuma,scidac_csumb);
header.checksum = nersc_csum;
if ( grid->IsBoss() ) {
writeHeader(header,file);
}
std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum "
<<std::hex<<header.checksum
<<std::dec<<" plaq "<< header.plaquette <<std::endl;
}
///////////////////////////////
// RNG state
///////////////////////////////
static inline void writeRNGState(GridSerialRNG &serial,GridParallelRNG &parallel,std::string file)
{
typedef typename GridParallelRNG::RngStateType RngStateType;
// Following should become arguments
FieldMetaData header;
header.sequence_number = 1;
header.ensemble_id = "UKQCD";
header.ensemble_label = "DWF";
GridBase *grid = parallel._grid;
GridMetaData(grid,header);
assert(header.nd==4);
header.link_trace=0.0;
header.plaquette=0.0;
MachineCharacteristics(header);
uint64_t offset;
#ifdef RNG_RANLUX
header.floating_point = std::string("UINT64");
header.data_type = std::string("RANLUX48");
#endif
#ifdef RNG_MT19937
header.floating_point = std::string("UINT32");
header.data_type = std::string("MT19937");
#endif
#ifdef RNG_SITMO
header.floating_point = std::string("UINT64");
header.data_type = std::string("SITMO");
#endif
if ( grid->IsBoss() ) {
truncate(file);
offset = writeHeader(header,file);
}
grid->Broadcast(0,(void *)&offset,sizeof(offset));
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb);
header.checksum = nersc_csum;
if ( grid->IsBoss() ) {
offset = writeHeader(header,file);
}
std::cout<<GridLogMessage
<<"Written NERSC RNG STATE "<<file<< " checksum "
<<std::hex<<header.checksum
<<std::dec<<std::endl;
}
static inline void readRNGState(GridSerialRNG &serial,GridParallelRNG & parallel,FieldMetaData& header,std::string file)
{
typedef typename GridParallelRNG::RngStateType RngStateType;
GridBase *grid = parallel._grid;
uint64_t offset = readHeader(file,grid,header);
FieldMetaData clone(header);
std::string format(header.floating_point);
std::string data_type(header.data_type);
#ifdef RNG_RANLUX
assert(format == std::string("UINT64"));
assert(data_type == std::string("RANLUX48"));
#endif
#ifdef RNG_MT19937
assert(format == std::string("UINT32"));
assert(data_type == std::string("MT19937"));
#endif
#ifdef RNG_SITMO
assert(format == std::string("UINT64"));
assert(data_type == std::string("SITMO"));
#endif
// depending on datatype, set up munger;
// munger is a function of <floating point, Real, data_type>
uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::readRNG(serial,parallel,file,offset,nersc_csum,scidac_csuma,scidac_csumb);
if ( nersc_csum != header.checksum ) {
std::cerr << "checksum mismatch "<<std::hex<< nersc_csum <<" "<<header.checksum<<std::dec<<std::endl;
exit(0);
}
assert(nersc_csum == header.checksum );
std::cout<<GridLogMessage <<"Read NERSC RNG file "<<file<< " format "<< data_type <<std::endl;
}
};
}}
#endif

View File

@ -1,124 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/QCD.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: neo <cossu@post.kek.jp>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LT_H
#define GRID_LT_H
namespace Grid{
// First steps in the complete generalization of the Physics part
// Design not final
namespace LatticeTheories {
template <int Dimensions>
struct LatticeTheory {
static const int Nd = Dimensions;
static const int Nds = Dimensions * 2; // double stored field
template <typename vtype>
using iSinglet = iScalar<iScalar<iScalar<vtype> > >;
};
template <int Dimensions, int Colours>
struct LatticeGaugeTheory : public LatticeTheory<Dimensions> {
static const int Nds = Dimensions * 2;
static const int Nd = Dimensions;
static const int Nc = Colours;
template <typename vtype>
using iColourMatrix = iScalar<iScalar<iMatrix<vtype, Nc> > >;
template <typename vtype>
using iLorentzColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nd>;
template <typename vtype>
using iDoubleStoredColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nds>;
template <typename vtype>
using iColourVector = iScalar<iScalar<iVector<vtype, Nc> > >;
};
template <int Dimensions, int Colours, int Spin>
struct FermionicLatticeGaugeTheory
: public LatticeGaugeTheory<Dimensions, Colours> {
static const int Nd = Dimensions;
static const int Nds = Dimensions * 2;
static const int Nc = Colours;
static const int Ns = Spin;
template <typename vtype>
using iSpinMatrix = iScalar<iMatrix<iScalar<vtype>, Ns> >;
template <typename vtype>
using iSpinColourMatrix = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
template <typename vtype>
using iSpinVector = iScalar<iVector<iScalar<vtype>, Ns> >;
template <typename vtype>
using iSpinColourVector = iScalar<iVector<iVector<vtype, Nc>, Ns> >;
// These 2 only if Spin is a multiple of 2
static const int Nhs = Spin / 2;
template <typename vtype>
using iHalfSpinVector = iScalar<iVector<iScalar<vtype>, Nhs> >;
template <typename vtype>
using iHalfSpinColourVector = iScalar<iVector<iVector<vtype, Nc>, Nhs> >;
//tests
typedef iColourMatrix<Complex> ColourMatrix;
typedef iColourMatrix<ComplexF> ColourMatrixF;
typedef iColourMatrix<ComplexD> ColourMatrixD;
};
// Examples, not complete now.
struct QCD : public FermionicLatticeGaugeTheory<4, 3, 4> {
static const int Xp = 0;
static const int Yp = 1;
static const int Zp = 2;
static const int Tp = 3;
static const int Xm = 4;
static const int Ym = 5;
static const int Zm = 6;
static const int Tm = 7;
typedef FermionicLatticeGaugeTheory FLGT;
typedef FLGT::iSpinMatrix<Complex > SpinMatrix;
typedef FLGT::iSpinMatrix<ComplexF > SpinMatrixF;
typedef FLGT::iSpinMatrix<ComplexD > SpinMatrixD;
};
struct QED : public FermionicLatticeGaugeTheory<4, 1, 4> {//fill
};
template <int Dimensions>
struct Scalar : public LatticeTheory<Dimensions> {};
}; // LatticeTheories
} // Grid
#endif

View File

@ -1,56 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/ActionBase.h
Copyright (C) 2015-2016
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef ACTION_BASE_H
#define ACTION_BASE_H
namespace Grid {
namespace QCD {
template <class GaugeField >
class Action
{
public:
bool is_smeared = false;
// Heatbath?
virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
virtual RealD S(const GaugeField& U) = 0; // evaluate the action
virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative
virtual std::string action_name() = 0; // return the action name
virtual std::string LogParameters() = 0; // prints action parameters
virtual ~Action(){}
};
}
}
#endif // ACTION_BASE_H

View File

@ -1,61 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/ActionCore.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef QCD_ACTION_CORE
#define QCD_ACTION_CORE
#include <Grid/qcd/action/ActionBase.h>
#include <Grid/qcd/action/ActionSet.h>
#include <Grid/qcd/action/ActionParams.h>
////////////////////////////////////////////
// Gauge Actions
////////////////////////////////////////////
#include <Grid/qcd/action/gauge/Gauge.h>
////////////////////////////////////////////
// Fermion prereqs
////////////////////////////////////////////
#include <Grid/qcd/action/fermion/FermionCore.h>
////////////////////////////////////////////
// Scalar Actions
////////////////////////////////////////////
#include <Grid/qcd/action/scalar/Scalar.h>
////////////////////////////////////////////
// Utility functions
////////////////////////////////////////////
#include <Grid/qcd/utils/Metric.h>
#include <Grid/qcd/utils/CovariantLaplacian.h>
#endif

View File

@ -1,92 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/ActionParams.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_ACTION_PARAMS_H
#define GRID_QCD_ACTION_PARAMS_H
namespace Grid {
namespace QCD {
// These can move into a params header and be given MacroMagic serialisation
struct GparityWilsonImplParams {
bool overlapCommsCompute;
std::vector<int> twists;
GparityWilsonImplParams() : twists(Nd, 0), overlapCommsCompute(false){};
};
struct WilsonImplParams {
bool overlapCommsCompute;
std::vector<Complex> boundary_phases;
WilsonImplParams() : overlapCommsCompute(false) {
boundary_phases.resize(Nd, 1.0);
};
WilsonImplParams(const std::vector<Complex> phi)
: boundary_phases(phi), overlapCommsCompute(false) {}
};
struct StaggeredImplParams {
StaggeredImplParams() {};
};
struct OneFlavourRationalParams : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(OneFlavourRationalParams,
RealD, lo,
RealD, hi,
int, MaxIter,
RealD, tolerance,
int, degree,
int, precision);
// MaxIter and tolerance, vectors??
// constructor
OneFlavourRationalParams( RealD _lo = 0.0,
RealD _hi = 1.0,
int _maxit = 1000,
RealD tol = 1.0e-8,
int _degree = 10,
int _precision = 64)
: lo(_lo),
hi(_hi),
MaxIter(_maxit),
tolerance(tol),
degree(_degree),
precision(_precision){};
};
}
}
#endif

View File

@ -1,100 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/AbstractEOFAFermion.h
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_ABSTRACT_EOFA_FERMION_H
#define GRID_QCD_ABSTRACT_EOFA_FERMION_H
#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
namespace Grid {
namespace QCD {
// DJM: Abstract base class for EOFA fermion types.
// Defines layout of additional EOFA-specific parameters and operators.
// Use to construct EOFA pseudofermion actions that are agnostic to
// Shamir / Mobius / etc., and ensure that no one can construct EOFA
// pseudofermion action with non-EOFA fermion type.
template<class Impl>
class AbstractEOFAFermion : public CayleyFermion5D<Impl> {
public:
INHERIT_IMPL_TYPES(Impl);
public:
// Fermion operator: D(mq1) + shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm}
RealD mq1;
RealD mq2;
RealD mq3;
RealD shift;
int pm;
RealD alpha; // Mobius scale
RealD k; // EOFA normalization constant
virtual void Instantiatable(void) = 0;
// EOFA-specific operations
// Force user to implement in derived classes
virtual void Omega (const FermionField& in, FermionField& out, int sign, int dag) = 0;
virtual void Dtilde (const FermionField& in, FermionField& out) = 0;
virtual void DtildeInv(const FermionField& in, FermionField& out) = 0;
// Implement derivatives in base class:
// for EOFA both DWF and Mobius just need d(Dw)/dU
virtual void MDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag){
this->DhopDeriv(mat, U, V, dag);
};
virtual void MoeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag){
this->DhopDerivOE(mat, U, V, dag);
};
virtual void MeoDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag){
this->DhopDerivEO(mat, U, V, dag);
};
// Recompute 5D coefficients for different value of shift constant
// (needed for heatbath loop over poles)
virtual void RefreshShiftCoefficients(RealD new_shift) = 0;
// Constructors
AbstractEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int _pm,
RealD _M5, RealD _b, RealD _c, const ImplParams& p=ImplParams())
: CayleyFermion5D<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid, FourDimGrid, FourDimRedBlackGrid,
_mq1, _M5, p), mq1(_mq1), mq2(_mq2), mq3(_mq3), shift(_shift), pm(_pm)
{
int Ls = this->Ls;
this->alpha = _b + _c;
this->k = this->alpha * (_mq3-_mq2) * std::pow(this->alpha+1.0,2*Ls) /
( std::pow(this->alpha+1.0,Ls) + _mq2*std::pow(this->alpha-1.0,Ls) ) /
( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) );
};
};
}}
#endif

View File

@ -1,828 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
Copyright (C) 2015
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/CayleyFermion5D.h>
namespace Grid {
namespace QCD {
/*
* Dense matrix versions of routines
*/
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi, FermionField &chi)
{
this->MooeeInternal(psi,chi,DaggerYes,InverseYes);
}
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
{
this->MooeeInternal(psi,chi,DaggerNo,InverseYes);
}
template<class Impl>
void CayleyFermion5D<Impl>::M5D(const FermionField &psi,
const FermionField &phi,
FermionField &chi,
std::vector<Coeff_t> &lower,
std::vector<Coeff_t> &diag,
std::vector<Coeff_t> &upper)
{
GridBase *grid=psi._grid;
int Ls = this->Ls;
int LLs = grid->_rdimensions[0];
const int nsimd= Simd::Nsimd();
Vector<iSinglet<Simd> > u(LLs);
Vector<iSinglet<Simd> > l(LLs);
Vector<iSinglet<Simd> > d(LLs);
assert(Ls/LLs==nsimd);
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard=psi.checkerboard;
// just directly address via type pun
typedef typename Simd::scalar_type scalar_type;
scalar_type * u_p = (scalar_type *)&u[0];
scalar_type * l_p = (scalar_type *)&l[0];
scalar_type * d_p = (scalar_type *)&d[0];
for(int o=0;o<LLs;o++){ // outer
for(int i=0;i<nsimd;i++){ //inner
int s = o+i*LLs;
int ss = o*nsimd+i;
u_p[ss] = upper[s];
l_p[ss] = lower[s];
d_p[ss] = diag[s];
}}
M5Dcalls++;
M5Dtime-=usecond();
assert(Nc==3);
parallel_for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
#if 0
alignas(64) SiteHalfSpinor hp;
alignas(64) SiteHalfSpinor hm;
alignas(64) SiteSpinor fp;
alignas(64) SiteSpinor fm;
for(int v=0;v<LLs;v++){
int vp=(v+1)%LLs;
int vm=(v+LLs-1)%LLs;
spProj5m(hp,psi[ss+vp]);
spProj5p(hm,psi[ss+vm]);
if ( vp<=v ) rotate(hp,hp,1);
if ( vm>=v ) rotate(hm,hm,nsimd-1);
hp=0.5*hp;
hm=0.5*hm;
spRecon5m(fp,hp);
spRecon5p(fm,hm);
chi[ss+v] = d[v]*phi[ss+v];
chi[ss+v] = chi[ss+v] +u[v]*fp;
chi[ss+v] = chi[ss+v] +l[v]*fm;
}
#else
for(int v=0;v<LLs;v++){
vprefetch(psi[ss+v+LLs]);
int vp= (v==LLs-1) ? 0 : v+1;
int vm= (v==0 ) ? LLs-1 : v-1;
Simd hp_00 = psi[ss+vp]()(2)(0);
Simd hp_01 = psi[ss+vp]()(2)(1);
Simd hp_02 = psi[ss+vp]()(2)(2);
Simd hp_10 = psi[ss+vp]()(3)(0);
Simd hp_11 = psi[ss+vp]()(3)(1);
Simd hp_12 = psi[ss+vp]()(3)(2);
Simd hm_00 = psi[ss+vm]()(0)(0);
Simd hm_01 = psi[ss+vm]()(0)(1);
Simd hm_02 = psi[ss+vm]()(0)(2);
Simd hm_10 = psi[ss+vm]()(1)(0);
Simd hm_11 = psi[ss+vm]()(1)(1);
Simd hm_12 = psi[ss+vm]()(1)(2);
if ( vp<=v ) {
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
}
if ( vm>=v ) {
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
}
// Can force these to real arithmetic and save 2x.
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00);
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01);
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02);
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10);
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11);
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12);
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00);
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01);
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02);
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10);
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11);
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12);
vstream(chi[ss+v]()(0)(0),p_00);
vstream(chi[ss+v]()(0)(1),p_01);
vstream(chi[ss+v]()(0)(2),p_02);
vstream(chi[ss+v]()(1)(0),p_10);
vstream(chi[ss+v]()(1)(1),p_11);
vstream(chi[ss+v]()(1)(2),p_12);
vstream(chi[ss+v]()(2)(0),p_20);
vstream(chi[ss+v]()(2)(1),p_21);
vstream(chi[ss+v]()(2)(2),p_22);
vstream(chi[ss+v]()(3)(0),p_30);
vstream(chi[ss+v]()(3)(1),p_31);
vstream(chi[ss+v]()(3)(2),p_32);
}
#endif
}
M5Dtime+=usecond();
}
template<class Impl>
void CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi,
const FermionField &phi,
FermionField &chi,
std::vector<Coeff_t> &lower,
std::vector<Coeff_t> &diag,
std::vector<Coeff_t> &upper)
{
GridBase *grid=psi._grid;
int Ls = this->Ls;
int LLs = grid->_rdimensions[0];
int nsimd= Simd::Nsimd();
Vector<iSinglet<Simd> > u(LLs);
Vector<iSinglet<Simd> > l(LLs);
Vector<iSinglet<Simd> > d(LLs);
assert(Ls/LLs==nsimd);
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard=psi.checkerboard;
// just directly address via type pun
typedef typename Simd::scalar_type scalar_type;
scalar_type * u_p = (scalar_type *)&u[0];
scalar_type * l_p = (scalar_type *)&l[0];
scalar_type * d_p = (scalar_type *)&d[0];
for(int o=0;o<LLs;o++){ // outer
for(int i=0;i<nsimd;i++){ //inner
int s = o+i*LLs;
int ss = o*nsimd+i;
u_p[ss] = upper[s];
l_p[ss] = lower[s];
d_p[ss] = diag[s];
}}
M5Dcalls++;
M5Dtime-=usecond();
parallel_for(int ss=0;ss<grid->oSites();ss+=LLs){ // adds LLs
#if 0
alignas(64) SiteHalfSpinor hp;
alignas(64) SiteHalfSpinor hm;
alignas(64) SiteSpinor fp;
alignas(64) SiteSpinor fm;
for(int v=0;v<LLs;v++){
int vp=(v+1)%LLs;
int vm=(v+LLs-1)%LLs;
spProj5p(hp,psi[ss+vp]);
spProj5m(hm,psi[ss+vm]);
if ( vp<=v ) rotate(hp,hp,1);
if ( vm>=v ) rotate(hm,hm,nsimd-1);
hp=hp*0.5;
hm=hm*0.5;
spRecon5p(fp,hp);
spRecon5m(fm,hm);
chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
chi[ss+v] = chi[ss+v] +l[v]*fm;
}
#else
for(int v=0;v<LLs;v++){
vprefetch(psi[ss+v+LLs]);
int vp= (v==LLs-1) ? 0 : v+1;
int vm= (v==0 ) ? LLs-1 : v-1;
Simd hp_00 = psi[ss+vp]()(0)(0);
Simd hp_01 = psi[ss+vp]()(0)(1);
Simd hp_02 = psi[ss+vp]()(0)(2);
Simd hp_10 = psi[ss+vp]()(1)(0);
Simd hp_11 = psi[ss+vp]()(1)(1);
Simd hp_12 = psi[ss+vp]()(1)(2);
Simd hm_00 = psi[ss+vm]()(2)(0);
Simd hm_01 = psi[ss+vm]()(2)(1);
Simd hm_02 = psi[ss+vm]()(2)(2);
Simd hm_10 = psi[ss+vm]()(3)(0);
Simd hm_11 = psi[ss+vm]()(3)(1);
Simd hm_12 = psi[ss+vm]()(3)(2);
if ( vp<=v ) {
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
}
if ( vm>=v ) {
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
}
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00);
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01);
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02);
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10);
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11);
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12);
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00);
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01);
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02);
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10);
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11);
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12);
vstream(chi[ss+v]()(0)(0),p_00);
vstream(chi[ss+v]()(0)(1),p_01);
vstream(chi[ss+v]()(0)(2),p_02);
vstream(chi[ss+v]()(1)(0),p_10);
vstream(chi[ss+v]()(1)(1),p_11);
vstream(chi[ss+v]()(1)(2),p_12);
vstream(chi[ss+v]()(2)(0),p_20);
vstream(chi[ss+v]()(2)(1),p_21);
vstream(chi[ss+v]()(2)(2),p_22);
vstream(chi[ss+v]()(3)(0),p_30);
vstream(chi[ss+v]()(3)(1),p_31);
vstream(chi[ss+v]()(3)(2),p_32);
}
#endif
}
M5Dtime+=usecond();
}
#ifdef AVX512
#include <simd/Intel512common.h>
#include <simd/Intel512avx.h>
#include <simd/Intel512single.h>
#endif
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi, FermionField &chi,
int LLs, int site,
Vector<iSinglet<Simd> > &Matp,
Vector<iSinglet<Simd> > &Matm)
{
#ifndef AVX512
{
SiteHalfSpinor BcastP;
SiteHalfSpinor BcastM;
SiteHalfSpinor SiteChiP;
SiteHalfSpinor SiteChiM;
// Ls*Ls * 2 * 12 * vol flops
for(int s1=0;s1<LLs;s1++){
for(int s2=0;s2<LLs;s2++){
for(int l=0; l<Simd::Nsimd();l++){ // simd lane
int s=s2+l*LLs;
int lex=s2+LLs*site;
if ( s2==0 && l==0) {
SiteChiP=zero;
SiteChiM=zero;
}
for(int sp=0;sp<2;sp++){
for(int co=0;co<Nc;co++){
vbroadcast(BcastP()(sp )(co),psi[lex]()(sp)(co),l);
}}
for(int sp=0;sp<2;sp++){
for(int co=0;co<Nc;co++){
vbroadcast(BcastM()(sp )(co),psi[lex]()(sp+2)(co),l);
}}
for(int sp=0;sp<2;sp++){
for(int co=0;co<Nc;co++){
SiteChiP()(sp)(co)=real_madd(Matp[LLs*s+s1]()()(),BcastP()(sp)(co),SiteChiP()(sp)(co)); // 1100 us.
SiteChiM()(sp)(co)=real_madd(Matm[LLs*s+s1]()()(),BcastM()(sp)(co),SiteChiM()(sp)(co)); // each found by commenting out
}}
}}
{
int lex = s1+LLs*site;
for(int sp=0;sp<2;sp++){
for(int co=0;co<Nc;co++){
vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
}}
}
}
}
#else
{
// pointers
// MASK_REGS;
#define Chi_00 %%zmm1
#define Chi_01 %%zmm2
#define Chi_02 %%zmm3
#define Chi_10 %%zmm4
#define Chi_11 %%zmm5
#define Chi_12 %%zmm6
#define Chi_20 %%zmm7
#define Chi_21 %%zmm8
#define Chi_22 %%zmm9
#define Chi_30 %%zmm10
#define Chi_31 %%zmm11
#define Chi_32 %%zmm12
#define BCAST0 %%zmm13
#define BCAST1 %%zmm14
#define BCAST2 %%zmm15
#define BCAST3 %%zmm16
#define BCAST4 %%zmm17
#define BCAST5 %%zmm18
#define BCAST6 %%zmm19
#define BCAST7 %%zmm20
#define BCAST8 %%zmm21
#define BCAST9 %%zmm22
#define BCAST10 %%zmm23
#define BCAST11 %%zmm24
int incr=LLs*LLs*sizeof(iSinglet<Simd>);
for(int s1=0;s1<LLs;s1++){
for(int s2=0;s2<LLs;s2++){
int lex=s2+LLs*site;
uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
uint64_t a2 = (uint64_t)&psi[lex];
for(int l=0; l<Simd::Nsimd();l++){ // simd lane
if ( (s2+l)==0 ) {
asm (
VPREFETCH1(0,%2) VPREFETCH1(0,%1)
VPREFETCH1(12,%2) VPREFETCH1(13,%2)
VPREFETCH1(14,%2) VPREFETCH1(15,%2)
VBCASTCDUP(0,%2,BCAST0)
VBCASTCDUP(1,%2,BCAST1)
VBCASTCDUP(2,%2,BCAST2)
VBCASTCDUP(3,%2,BCAST3)
VBCASTCDUP(4,%2,BCAST4) VMULMEM (0,%0,BCAST0,Chi_00)
VBCASTCDUP(5,%2,BCAST5) VMULMEM (0,%0,BCAST1,Chi_01)
VBCASTCDUP(6,%2,BCAST6) VMULMEM (0,%0,BCAST2,Chi_02)
VBCASTCDUP(7,%2,BCAST7) VMULMEM (0,%0,BCAST3,Chi_10)
VBCASTCDUP(8,%2,BCAST8) VMULMEM (0,%0,BCAST4,Chi_11)
VBCASTCDUP(9,%2,BCAST9) VMULMEM (0,%0,BCAST5,Chi_12)
VBCASTCDUP(10,%2,BCAST10) VMULMEM (0,%1,BCAST6,Chi_20)
VBCASTCDUP(11,%2,BCAST11) VMULMEM (0,%1,BCAST7,Chi_21)
VMULMEM (0,%1,BCAST8,Chi_22)
VMULMEM (0,%1,BCAST9,Chi_30)
VMULMEM (0,%1,BCAST10,Chi_31)
VMULMEM (0,%1,BCAST11,Chi_32)
: : "r" (a0), "r" (a1), "r" (a2) );
} else {
asm (
VBCASTCDUP(0,%2,BCAST0) VMADDMEM (0,%0,BCAST0,Chi_00)
VBCASTCDUP(1,%2,BCAST1) VMADDMEM (0,%0,BCAST1,Chi_01)
VBCASTCDUP(2,%2,BCAST2) VMADDMEM (0,%0,BCAST2,Chi_02)
VBCASTCDUP(3,%2,BCAST3) VMADDMEM (0,%0,BCAST3,Chi_10)
VBCASTCDUP(4,%2,BCAST4) VMADDMEM (0,%0,BCAST4,Chi_11)
VBCASTCDUP(5,%2,BCAST5) VMADDMEM (0,%0,BCAST5,Chi_12)
VBCASTCDUP(6,%2,BCAST6) VMADDMEM (0,%1,BCAST6,Chi_20)
VBCASTCDUP(7,%2,BCAST7) VMADDMEM (0,%1,BCAST7,Chi_21)
VBCASTCDUP(8,%2,BCAST8) VMADDMEM (0,%1,BCAST8,Chi_22)
VBCASTCDUP(9,%2,BCAST9) VMADDMEM (0,%1,BCAST9,Chi_30)
VBCASTCDUP(10,%2,BCAST10) VMADDMEM (0,%1,BCAST10,Chi_31)
VBCASTCDUP(11,%2,BCAST11) VMADDMEM (0,%1,BCAST11,Chi_32)
: : "r" (a0), "r" (a1), "r" (a2) );
}
a0 = a0+incr;
a1 = a1+incr;
a2 = a2+sizeof(typename Simd::scalar_type);
}}
{
int lexa = s1+LLs*site;
asm (
VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01) VSTORE(2 ,%0,Chi_02)
VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11) VSTORE(5 ,%0,Chi_12)
VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21) VSTORE(8 ,%0,Chi_22)
VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31) VSTORE(11,%0,Chi_32)
: : "r" ((uint64_t)&chi[lexa]) : "memory" );
}
}
}
#undef Chi_00
#undef Chi_01
#undef Chi_02
#undef Chi_10
#undef Chi_11
#undef Chi_12
#undef Chi_20
#undef Chi_21
#undef Chi_22
#undef Chi_30
#undef Chi_31
#undef Chi_32
#undef BCAST0
#undef BCAST1
#undef BCAST2
#undef BCAST3
#undef BCAST4
#undef BCAST5
#undef BCAST6
#undef BCAST7
#undef BCAST8
#undef BCAST9
#undef BCAST10
#undef BCAST11
#endif
};
// Z-mobius version
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi, FermionField &chi,
int LLs, int site, Vector<iSinglet<Simd> > &Matp, Vector<iSinglet<Simd> > &Matm)
{
#ifndef AVX512
{
SiteHalfSpinor BcastP;
SiteHalfSpinor BcastM;
SiteHalfSpinor SiteChiP;
SiteHalfSpinor SiteChiM;
// Ls*Ls * 2 * 12 * vol flops
for(int s1=0;s1<LLs;s1++){
for(int s2=0;s2<LLs;s2++){
for(int l=0; l<Simd::Nsimd();l++){ // simd lane
int s=s2+l*LLs;
int lex=s2+LLs*site;
if ( s2==0 && l==0) {
SiteChiP=zero;
SiteChiM=zero;
}
for(int sp=0;sp<2;sp++){
for(int co=0;co<Nc;co++){
vbroadcast(BcastP()(sp )(co),psi[lex]()(sp)(co),l);
}}
for(int sp=0;sp<2;sp++){
for(int co=0;co<Nc;co++){
vbroadcast(BcastM()(sp )(co),psi[lex]()(sp+2)(co),l);
}}
for(int sp=0;sp<2;sp++){
for(int co=0;co<Nc;co++){
SiteChiP()(sp)(co)=SiteChiP()(sp)(co)+ Matp[LLs*s+s1]()()()*BcastP()(sp)(co);
SiteChiM()(sp)(co)=SiteChiM()(sp)(co)+ Matm[LLs*s+s1]()()()*BcastM()(sp)(co);
}}
}}
{
int lex = s1+LLs*site;
for(int sp=0;sp<2;sp++){
for(int co=0;co<Nc;co++){
vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
}}
}
}
}
#else
{
// pointers
// MASK_REGS;
#define Chi_00 %zmm0
#define Chi_01 %zmm1
#define Chi_02 %zmm2
#define Chi_10 %zmm3
#define Chi_11 %zmm4
#define Chi_12 %zmm5
#define Chi_20 %zmm6
#define Chi_21 %zmm7
#define Chi_22 %zmm8
#define Chi_30 %zmm9
#define Chi_31 %zmm10
#define Chi_32 %zmm11
#define pChi_00 %%zmm0
#define pChi_01 %%zmm1
#define pChi_02 %%zmm2
#define pChi_10 %%zmm3
#define pChi_11 %%zmm4
#define pChi_12 %%zmm5
#define pChi_20 %%zmm6
#define pChi_21 %%zmm7
#define pChi_22 %%zmm8
#define pChi_30 %%zmm9
#define pChi_31 %%zmm10
#define pChi_32 %%zmm11
#define BCAST_00 %zmm12
#define SHUF_00 %zmm13
#define BCAST_01 %zmm14
#define SHUF_01 %zmm15
#define BCAST_02 %zmm16
#define SHUF_02 %zmm17
#define BCAST_10 %zmm18
#define SHUF_10 %zmm19
#define BCAST_11 %zmm20
#define SHUF_11 %zmm21
#define BCAST_12 %zmm22
#define SHUF_12 %zmm23
#define Mp %zmm24
#define Mps %zmm25
#define Mm %zmm26
#define Mms %zmm27
#define N 8
int incr=LLs*LLs*sizeof(iSinglet<Simd>);
for(int s1=0;s1<LLs;s1++){
for(int s2=0;s2<LLs;s2++){
int lex=s2+LLs*site;
uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
uint64_t a2 = (uint64_t)&psi[lex];
for(int l=0; l<Simd::Nsimd();l++){ // simd lane
if ( (s2+l)==0 ) {
LOAD64(%r8,a0);
LOAD64(%r9,a1);
LOAD64(%r10,a2);
asm (
VLOAD(0,%r8,Mp)// i r
VLOAD(0,%r9,Mm)
VSHUF(Mp,Mps) // r i
VSHUF(Mm,Mms)
VPREFETCH1(12,%r10) VPREFETCH1(13,%r10)
VPREFETCH1(14,%r10) VPREFETCH1(15,%r10)
VMULIDUP(0*N,%r10,Mps,Chi_00)
VMULIDUP(1*N,%r10,Mps,Chi_01)
VMULIDUP(2*N,%r10,Mps,Chi_02)
VMULIDUP(3*N,%r10,Mps,Chi_10)
VMULIDUP(4*N,%r10,Mps,Chi_11)
VMULIDUP(5*N,%r10,Mps,Chi_12)
VMULIDUP(6*N ,%r10,Mms,Chi_20)
VMULIDUP(7*N ,%r10,Mms,Chi_21)
VMULIDUP(8*N ,%r10,Mms,Chi_22)
VMULIDUP(9*N ,%r10,Mms,Chi_30)
VMULIDUP(10*N,%r10,Mms,Chi_31)
VMULIDUP(11*N,%r10,Mms,Chi_32)
VMADDSUBRDUP(0*N,%r10,Mp,Chi_00)
VMADDSUBRDUP(1*N,%r10,Mp,Chi_01)
VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
VMADDSUBRDUP(6*N ,%r10,Mm,Chi_20)
VMADDSUBRDUP(7*N ,%r10,Mm,Chi_21)
VMADDSUBRDUP(8*N ,%r10,Mm,Chi_22)
VMADDSUBRDUP(9*N ,%r10,Mm,Chi_30)
VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
);
} else {
LOAD64(%r8,a0);
LOAD64(%r9,a1);
LOAD64(%r10,a2);
asm (
VLOAD(0,%r8,Mp)
VSHUF(Mp,Mps)
VLOAD(0,%r9,Mm)
VSHUF(Mm,Mms)
VMADDSUBIDUP(0*N,%r10,Mps,Chi_00) // Mri * Pii +- Cir
VMADDSUBIDUP(1*N,%r10,Mps,Chi_01)
VMADDSUBIDUP(2*N,%r10,Mps,Chi_02)
VMADDSUBIDUP(3*N,%r10,Mps,Chi_10)
VMADDSUBIDUP(4*N,%r10,Mps,Chi_11)
VMADDSUBIDUP(5*N,%r10,Mps,Chi_12)
VMADDSUBIDUP(6 *N,%r10,Mms,Chi_20)
VMADDSUBIDUP(7 *N,%r10,Mms,Chi_21)
VMADDSUBIDUP(8 *N,%r10,Mms,Chi_22)
VMADDSUBIDUP(9 *N,%r10,Mms,Chi_30)
VMADDSUBIDUP(10*N,%r10,Mms,Chi_31)
VMADDSUBIDUP(11*N,%r10,Mms,Chi_32)
VMADDSUBRDUP(0*N,%r10,Mp,Chi_00) // Cir = Mir * Prr +- ( Mri * Pii +- Cir)
VMADDSUBRDUP(1*N,%r10,Mp,Chi_01) // Ci = MiPr + Ci + MrPi ; Cr = MrPr - ( MiPi - Cr)
VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
VMADDSUBRDUP(6 *N,%r10,Mm,Chi_20)
VMADDSUBRDUP(7 *N,%r10,Mm,Chi_21)
VMADDSUBRDUP(8 *N,%r10,Mm,Chi_22)
VMADDSUBRDUP(9 *N,%r10,Mm,Chi_30)
VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
);
}
a0 = a0+incr;
a1 = a1+incr;
a2 = a2+sizeof(typename Simd::scalar_type);
}}
{
int lexa = s1+LLs*site;
/*
SiteSpinor tmp;
asm (
VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01) VSTORE(2 ,%0,pChi_02)
VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11) VSTORE(5 ,%0,pChi_12)
VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21) VSTORE(8 ,%0,pChi_22)
VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31) VSTORE(11,%0,pChi_32)
: : "r" ((uint64_t)&tmp) : "memory" );
*/
asm (
VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01) VSTORE(2 ,%0,pChi_02)
VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11) VSTORE(5 ,%0,pChi_12)
VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21) VSTORE(8 ,%0,pChi_22)
VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31) VSTORE(11,%0,pChi_32)
: : "r" ((uint64_t)&chi[lexa]) : "memory" );
// if ( 1 || (site==0) ) {
// std::cout<<site << " s1 "<<s1<<"\n\t"<<tmp << "\n't" << chi[lexa] <<"\n\t"<<tmp-chi[lexa]<<std::endl;
// }
}
}
}
#undef Chi_00
#undef Chi_01
#undef Chi_02
#undef Chi_10
#undef Chi_11
#undef Chi_12
#undef Chi_20
#undef Chi_21
#undef Chi_22
#undef Chi_30
#undef Chi_31
#undef Chi_32
#undef BCAST0
#undef BCAST1
#undef BCAST2
#undef BCAST3
#undef BCAST4
#undef BCAST5
#undef BCAST6
#undef BCAST7
#undef BCAST8
#undef BCAST9
#undef BCAST10
#undef BCAST11
#endif
};
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
{
int Ls=this->Ls;
int LLs = psi._grid->_rdimensions[0];
int vol = psi._grid->oSites()/LLs;
chi.checkerboard=psi.checkerboard;
Vector<iSinglet<Simd> > Matp;
Vector<iSinglet<Simd> > Matm;
Vector<iSinglet<Simd> > *_Matp;
Vector<iSinglet<Simd> > *_Matm;
// MooeeInternalCompute(dag,inv,Matp,Matm);
if ( inv && dag ) {
_Matp = &MatpInvDag;
_Matm = &MatmInvDag;
}
if ( inv && (!dag) ) {
_Matp = &MatpInv;
_Matm = &MatmInv;
}
if ( !inv ) {
MooeeInternalCompute(dag,inv,Matp,Matm);
_Matp = &Matp;
_Matm = &Matm;
}
assert(_Matp->size()==Ls*LLs);
MooeeInvCalls++;
MooeeInvTime-=usecond();
if ( switcheroo<Coeff_t>::iscomplex() ) {
parallel_for(auto site=0;site<vol;site++){
MooeeInternalZAsm(psi,chi,LLs,site,*_Matp,*_Matm);
}
} else {
parallel_for(auto site=0;site<vol;site++){
MooeeInternalAsm(psi,chi,LLs,site,*_Matp,*_Matm);
}
}
MooeeInvTime+=usecond();
}
INSTANTIATE_DPERP(DomainWallVec5dImplD);
INSTANTIATE_DPERP(DomainWallVec5dImplF);
INSTANTIATE_DPERP(ZDomainWallVec5dImplD);
INSTANTIATE_DPERP(ZDomainWallVec5dImplF);
INSTANTIATE_DPERP(DomainWallVec5dImplDF);
INSTANTIATE_DPERP(DomainWallVec5dImplFH);
INSTANTIATE_DPERP(ZDomainWallVec5dImplDF);
INSTANTIATE_DPERP(ZDomainWallVec5dImplFH);
template void CayleyFermion5D<DomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
template void CayleyFermion5D<DomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
template void CayleyFermion5D<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
template void CayleyFermion5D<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
template void CayleyFermion5D<DomainWallVec5dImplFH>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
template void CayleyFermion5D<DomainWallVec5dImplDF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
template void CayleyFermion5D<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
template void CayleyFermion5D<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv);
}}

View File

@ -1,438 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.cc
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid_Eigen_Dense.h>
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
namespace Grid {
namespace QCD {
template<class Impl>
DomainWallEOFAFermion<Impl>::DomainWallEOFAFermion(
GaugeField &_Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
RealD _mq1, RealD _mq2, RealD _mq3,
RealD _shift, int _pm, RealD _M5, const ImplParams &p) :
AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
_shift, _pm, _M5, 1.0, 0.0, p)
{
RealD eps = 1.0;
Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);
assert(zdata->n == this->Ls);
std::cout << GridLogMessage << "DomainWallEOFAFermion with Ls=" << this->Ls << std::endl;
this->SetCoefficientsTanh(zdata, 1.0, 0.0);
Approx::zolotarev_free(zdata);
}
/***************************************************************
* Additional EOFA operators only called outside the inverter.
* Since speed is not essential, simple axpby-style
* implementations should be fine.
***************************************************************/
template<class Impl>
void DomainWallEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
{
int Ls = this->Ls;
Din = zero;
if((sign == 1) && (dag == 0)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, Ls-1, 0); }
else if((sign == -1) && (dag == 0)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
else if((sign == 1 ) && (dag == 1)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, Ls-1); }
else if((sign == -1) && (dag == 1)){ axpby_ssp(Din, 0.0, psi, 1.0, psi, 0, 0); }
}
// This is just the identity for DWF
template<class Impl>
void DomainWallEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi){ chi = psi; }
// This is just the identity for DWF
template<class Impl>
void DomainWallEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi){ chi = psi; }
/*****************************************************************************************************/
template<class Impl>
RealD DomainWallEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
FermionField Din(psi._grid);
this->Meooe5D(psi, Din);
this->DW(Din, chi, DaggerNo);
axpby(chi, 1.0, 1.0, chi, psi);
this->M5D(psi, chi);
return(norm2(chi));
}
template<class Impl>
RealD DomainWallEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
FermionField Din(psi._grid);
this->DW(psi, Din, DaggerYes);
this->MeooeDag5D(Din, chi);
this->M5Ddag(psi, chi);
axpby(chi, 1.0, 1.0, chi, psi);
return(norm2(chi));
}
/********************************************************************
* Performance critical fermion operators called inside the inverter
********************************************************************/
template<class Impl>
void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
int pm = this->pm;
RealD shift = this->shift;
RealD mq1 = this->mq1;
RealD mq2 = this->mq2;
RealD mq3 = this->mq3;
// coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
Coeff_t shiftp(0.0), shiftm(0.0);
if(shift != 0.0){
if(pm == 1){ shiftp = shift*(mq3-mq2); }
else{ shiftm = -shift*(mq3-mq2); }
}
std::vector<Coeff_t> diag(Ls,1.0);
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm;
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftp;
#if(0)
std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl;
for(int i=0; i<diag.size(); ++i){
std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
}
for(int i=0; i<upper.size(); ++i){
std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
}
for(int i=0; i<lower.size(); ++i){
std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
}
#endif
this->M5D(psi, chi, chi, lower, diag, upper);
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
int pm = this->pm;
RealD shift = this->shift;
RealD mq1 = this->mq1;
RealD mq2 = this->mq2;
RealD mq3 = this->mq3;
// coefficients for shift operator ( = shift*\gamma_{5}*R_{5}*\Delta_{\pm}(mq2,mq3)*P_{\pm} )
Coeff_t shiftp(0.0), shiftm(0.0);
if(shift != 0.0){
if(pm == 1){ shiftp = shift*(mq3-mq2); }
else{ shiftm = -shift*(mq3-mq2); }
}
std::vector<Coeff_t> diag(Ls,1.0);
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp;
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftm;
#if(0)
std::cout << GridLogMessage << "DomainWallEOFAFermion::M5Ddag(FF&,FF&):" << std::endl;
for(int i=0; i<diag.size(); ++i){
std::cout << GridLogMessage << "diag[" << i << "] =" << diag[i] << std::endl;
}
for(int i=0; i<upper.size(); ++i){
std::cout << GridLogMessage << "upper[" << i << "] =" << upper[i] << std::endl;
}
for(int i=0; i<lower.size(); ++i){
std::cout << GridLogMessage << "lower[" << i << "] =" << lower[i] << std::endl;
}
#endif
this->M5Ddag(psi, chi, chi, lower, diag, upper);
}
// half checkerboard operations
template<class Impl>
void DomainWallEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
std::vector<Coeff_t> diag = this->bee;
std::vector<Coeff_t> upper(Ls);
std::vector<Coeff_t> lower(Ls);
for(int s=0; s<Ls; s++){
upper[s] = -this->cee[s];
lower[s] = -this->cee[s];
}
upper[Ls-1] = this->dm;
lower[0] = this->dp;
this->M5D(psi, psi, chi, lower, diag, upper);
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
std::vector<Coeff_t> diag = this->bee;
std::vector<Coeff_t> upper(Ls);
std::vector<Coeff_t> lower(Ls);
for(int s=0; s<Ls; s++){
upper[s] = -this->cee[s];
lower[s] = -this->cee[s];
}
upper[Ls-1] = this->dp;
lower[0] = this->dm;
this->M5Ddag(psi, psi, chi, lower, diag, upper);
}
/****************************************************************************************/
//Zolo
template<class Impl>
void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c)
{
int Ls = this->Ls;
int pm = this->pm;
RealD mq1 = this->mq1;
RealD mq2 = this->mq2;
RealD mq3 = this->mq3;
RealD shift = this->shift;
////////////////////////////////////////////////////////
// Constants for the preconditioned matrix Cayley form
////////////////////////////////////////////////////////
this->bs.resize(Ls);
this->cs.resize(Ls);
this->aee.resize(Ls);
this->aeo.resize(Ls);
this->bee.resize(Ls);
this->beo.resize(Ls);
this->cee.resize(Ls);
this->ceo.resize(Ls);
for(int i=0; i<Ls; ++i){
this->bee[i] = 4.0 - this->M5 + 1.0;
this->cee[i] = 1.0;
}
for(int i=0; i<Ls; ++i){
this->aee[i] = this->cee[i];
this->bs[i] = this->beo[i] = 1.0;
this->cs[i] = this->ceo[i] = 0.0;
}
//////////////////////////////////////////
// EOFA shift terms
//////////////////////////////////////////
if(pm == 1){
this->dp = mq1*this->cee[0] + shift*(mq3-mq2);
this->dm = mq1*this->cee[Ls-1];
} else if(this->pm == -1) {
this->dp = mq1*this->cee[0];
this->dm = mq1*this->cee[Ls-1] - shift*(mq3-mq2);
} else {
this->dp = mq1*this->cee[0];
this->dm = mq1*this->cee[Ls-1];
}
//////////////////////////////////////////
// LDU decomposition of eeoo
//////////////////////////////////////////
this->dee.resize(Ls+1);
this->lee.resize(Ls);
this->leem.resize(Ls);
this->uee.resize(Ls);
this->ueem.resize(Ls);
for(int i=0; i<Ls; ++i){
if(i < Ls-1){
this->lee[i] = -this->cee[i+1]/this->bee[i]; // sub-diag entry on the ith column
this->leem[i] = this->dm/this->bee[i];
for(int j=0; j<i; j++){ this->leem[i] *= this->aee[j]/this->bee[j]; }
this->dee[i] = this->bee[i];
this->uee[i] = -this->aee[i]/this->bee[i]; // up-diag entry on the ith row
this->ueem[i] = this->dp / this->bee[0];
for(int j=1; j<=i; j++){ this->ueem[i] *= this->cee[j]/this->bee[j]; }
} else {
this->lee[i] = 0.0;
this->leem[i] = 0.0;
this->uee[i] = 0.0;
this->ueem[i] = 0.0;
}
}
{
Coeff_t delta_d = 1.0 / this->bee[0];
for(int j=1; j<Ls-1; j++){ delta_d *= this->cee[j] / this->bee[j]; }
this->dee[Ls-1] = this->bee[Ls-1] + this->cee[0] * this->dm * delta_d;
this->dee[Ls] = this->bee[Ls-1] + this->cee[Ls-1] * this->dp * delta_d;
}
int inv = 1;
this->MooeeInternalCompute(0, inv, this->MatpInv, this->MatmInv);
this->MooeeInternalCompute(1, inv, this->MatpInvDag, this->MatmInvDag);
}
// Recompute Cayley-form coefficients for different shift
template<class Impl>
void DomainWallEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
{
this->shift = new_shift;
Approx::zolotarev_data *zdata = Approx::higham(1.0, this->Ls);
this->SetCoefficientsTanh(zdata, 1.0, 0.0);
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
{
int Ls = this->Ls;
GridBase* grid = this->FermionRedBlackGrid();
int LLs = grid->_rdimensions[0];
if(LLs == Ls){ return; } // Not vectorised in 5th direction
Eigen::MatrixXcd Pplus = Eigen::MatrixXcd::Zero(Ls,Ls);
Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
for(int s=0; s<Ls; s++){
Pplus(s,s) = this->bee[s];
Pminus(s,s) = this->bee[s];
}
for(int s=0; s<Ls-1; s++){
Pminus(s,s+1) = -this->cee[s];
}
for(int s=0; s<Ls-1; s++){
Pplus(s+1,s) = -this->cee[s+1];
}
Pplus (0,Ls-1) = this->dp;
Pminus(Ls-1,0) = this->dm;
Eigen::MatrixXcd PplusMat ;
Eigen::MatrixXcd PminusMat;
#if(0)
std::cout << GridLogMessage << "Pplus:" << std::endl;
for(int s=0; s<Ls; ++s){
for(int ss=0; ss<Ls; ++ss){
std::cout << Pplus(s,ss) << "\t";
}
std::cout << std::endl;
}
std::cout << GridLogMessage << "Pminus:" << std::endl;
for(int s=0; s<Ls; ++s){
for(int ss=0; ss<Ls; ++ss){
std::cout << Pminus(s,ss) << "\t";
}
std::cout << std::endl;
}
#endif
if(inv) {
PplusMat = Pplus.inverse();
PminusMat = Pminus.inverse();
} else {
PplusMat = Pplus;
PminusMat = Pminus;
}
if(dag){
PplusMat.adjointInPlace();
PminusMat.adjointInPlace();
}
typedef typename SiteHalfSpinor::scalar_type scalar_type;
const int Nsimd = Simd::Nsimd();
Matp.resize(Ls*LLs);
Matm.resize(Ls*LLs);
for(int s2=0; s2<Ls; s2++){
for(int s1=0; s1<LLs; s1++){
int istride = LLs;
int ostride = 1;
Simd Vp;
Simd Vm;
scalar_type *sp = (scalar_type*) &Vp;
scalar_type *sm = (scalar_type*) &Vm;
for(int l=0; l<Nsimd; l++){
if(switcheroo<Coeff_t>::iscomplex()) {
sp[l] = PplusMat (l*istride+s1*ostride,s2);
sm[l] = PminusMat(l*istride+s1*ostride,s2);
} else {
// if real
scalar_type tmp;
tmp = PplusMat (l*istride+s1*ostride,s2);
sp[l] = scalar_type(tmp.real(),tmp.real());
tmp = PminusMat(l*istride+s1*ostride,s2);
sm[l] = scalar_type(tmp.real(),tmp.real());
}
}
Matp[LLs*s2+s1] = Vp;
Matm[LLs*s2+s1] = Vm;
}}
}
FermOpTemplateInstantiate(DomainWallEOFAFermion);
GparityFermOpTemplateInstantiate(DomainWallEOFAFermion);
}}

View File

@ -1,115 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermion.h
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_DOMAIN_WALL_EOFA_FERMION_H
#define GRID_QCD_DOMAIN_WALL_EOFA_FERMION_H
#include <Grid/qcd/action/fermion/AbstractEOFAFermion.h>
namespace Grid {
namespace QCD {
template<class Impl>
class DomainWallEOFAFermion : public AbstractEOFAFermion<Impl>
{
public:
INHERIT_IMPL_TYPES(Impl);
public:
// Modified (0,Ls-1) and (Ls-1,0) elements of Mooee
// for red-black preconditioned Shamir EOFA
Coeff_t dm;
Coeff_t dp;
virtual void Instantiatable(void) {};
// EOFA-specific operations
virtual void Omega (const FermionField& in, FermionField& out, int sign, int dag);
virtual void Dtilde (const FermionField& in, FermionField& out);
virtual void DtildeInv (const FermionField& in, FermionField& out);
// override multiply
virtual RealD M (const FermionField& in, FermionField& out);
virtual RealD Mdag (const FermionField& in, FermionField& out);
// half checkerboard operations
virtual void Mooee (const FermionField& in, FermionField& out);
virtual void MooeeDag (const FermionField& in, FermionField& out);
virtual void MooeeInv (const FermionField& in, FermionField& out);
virtual void MooeeInvDag(const FermionField& in, FermionField& out);
virtual void M5D (const FermionField& psi, FermionField& chi);
virtual void M5Ddag (const FermionField& psi, FermionField& chi);
/////////////////////////////////////////////////////
// Instantiate different versions depending on Impl
/////////////////////////////////////////////////////
void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
void MooeeInternal(const FermionField& in, FermionField& out, int dag, int inv);
void MooeeInternalCompute(int dag, int inv, Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
void MooeeInternalAsm(const FermionField& in, FermionField& out, int LLs, int site,
Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
void MooeeInternalZAsm(const FermionField& in, FermionField& out, int LLs, int site,
Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
virtual void RefreshShiftCoefficients(RealD new_shift);
// Constructors
DomainWallEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int pm,
RealD _M5, const ImplParams& p=ImplParams());
protected:
void SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c);
};
}}
#define INSTANTIATE_DPERP_DWF_EOFA(A)\
template void DomainWallEOFAFermion<A>::M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, \
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
template void DomainWallEOFAFermion<A>::M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, \
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
template void DomainWallEOFAFermion<A>::MooeeInv(const FermionField& psi, FermionField& chi); \
template void DomainWallEOFAFermion<A>::MooeeInvDag(const FermionField& psi, FermionField& chi);
#undef DOMAIN_WALL_EOFA_DPERP_DENSE
#define DOMAIN_WALL_EOFA_DPERP_CACHE
#undef DOMAIN_WALL_EOFA_DPERP_LINALG
#define DOMAIN_WALL_EOFA_DPERP_VEC
#endif

View File

@ -1,248 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermioncache.cc
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
namespace Grid {
namespace QCD {
// FIXME -- make a version of these routines with site loop outermost for cache reuse.
// Pminus fowards
// Pplus backwards..
template<class Impl>
void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
{
int Ls = this->Ls;
GridBase* grid = psi._grid;
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard = psi.checkerboard;
// Flops = 6.0*(Nc*Ns) *Ls*vol
this->M5Dcalls++;
this->M5Dtime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
for(int s=0; s<Ls; s++){
auto tmp = psi._odata[0];
if(s==0) {
spProj5m(tmp, psi._odata[ss+s+1]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5p(tmp, psi._odata[ss+Ls-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
} else if(s==(Ls-1)) {
spProj5m(tmp, psi._odata[ss+0]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5p(tmp, psi._odata[ss+s-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
} else {
spProj5m(tmp, psi._odata[ss+s+1]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5p(tmp, psi._odata[ss+s-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
}
}
}
this->M5Dtime += usecond();
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
{
int Ls = this->Ls;
GridBase* grid = psi._grid;
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard=psi.checkerboard;
// Flops = 6.0*(Nc*Ns) *Ls*vol
this->M5Dcalls++;
this->M5Dtime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
auto tmp = psi._odata[0];
for(int s=0; s<Ls; s++){
if(s==0) {
spProj5p(tmp, psi._odata[ss+s+1]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5m(tmp, psi._odata[ss+Ls-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
} else if(s==(Ls-1)) {
spProj5p(tmp, psi._odata[ss+0]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5m(tmp, psi._odata[ss+s-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
} else {
spProj5p(tmp, psi._odata[ss+s+1]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5m(tmp, psi._odata[ss+s-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
}
}
}
this->M5Dtime += usecond();
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
{
GridBase* grid = psi._grid;
int Ls = this->Ls;
chi.checkerboard = psi.checkerboard;
this->MooeeInvCalls++;
this->MooeeInvTime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
auto tmp1 = psi._odata[0];
auto tmp2 = psi._odata[0];
// flops = 12*2*Ls + 12*2*Ls + 3*12*Ls + 12*2*Ls = 12*Ls * (9) = 108*Ls flops
// Apply (L^{\prime})^{-1}
chi[ss] = psi[ss]; // chi[0]=psi[0]
for(int s=1; s<Ls; s++){
spProj5p(tmp1, chi[ss+s-1]);
chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp1;
}
// L_m^{-1}
for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
spProj5m(tmp1, chi[ss+s]);
chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp1;
}
// U_m^{-1} D^{-1}
for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
spProj5p(tmp1, chi[ss+Ls-1]);
chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls])*tmp1;
}
spProj5m(tmp2, chi[ss+Ls-1]);
chi[ss+Ls-1] = (1.0/this->dee[Ls])*tmp1 + (1.0/this->dee[Ls-1])*tmp2;
// Apply U^{-1}
for(int s=Ls-2; s>=0; s--){
spProj5m(tmp1, chi[ss+s+1]);
chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
}
}
this->MooeeInvTime += usecond();
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
{
GridBase* grid = psi._grid;
int Ls = this->Ls;
assert(psi.checkerboard == psi.checkerboard);
chi.checkerboard = psi.checkerboard;
std::vector<Coeff_t> ueec(Ls);
std::vector<Coeff_t> deec(Ls+1);
std::vector<Coeff_t> leec(Ls);
std::vector<Coeff_t> ueemc(Ls);
std::vector<Coeff_t> leemc(Ls);
for(int s=0; s<ueec.size(); s++){
ueec[s] = conjugate(this->uee[s]);
deec[s] = conjugate(this->dee[s]);
leec[s] = conjugate(this->lee[s]);
ueemc[s] = conjugate(this->ueem[s]);
leemc[s] = conjugate(this->leem[s]);
}
deec[Ls] = conjugate(this->dee[Ls]);
this->MooeeInvCalls++;
this->MooeeInvTime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){ // adds Ls
auto tmp1 = psi._odata[0];
auto tmp2 = psi._odata[0];
// Apply (U^{\prime})^{-dagger}
chi[ss] = psi[ss];
for(int s=1; s<Ls; s++){
spProj5m(tmp1, chi[ss+s-1]);
chi[ss+s] = psi[ss+s] - ueec[s-1]*tmp1;
}
// U_m^{-\dagger}
for(int s=0; s<Ls-1; s++){
spProj5p(tmp1, chi[ss+s]);
chi[ss+Ls-1] = chi[ss+Ls-1] - ueemc[s]*tmp1;
}
// L_m^{-\dagger} D^{-dagger}
for(int s=0; s<Ls-1; s++){
spProj5m(tmp1, chi[ss+Ls-1]);
chi[ss+s] = (1.0/deec[s])*chi[ss+s] - (leemc[s]/deec[Ls-1])*tmp1;
}
spProj5p(tmp2, chi[ss+Ls-1]);
chi[ss+Ls-1] = (1.0/deec[Ls-1])*tmp1 + (1.0/deec[Ls])*tmp2;
// Apply L^{-dagger}
for(int s=Ls-2; s>=0; s--){
spProj5p(tmp1, chi[ss+s+1]);
chi[ss+s] = chi[ss+s] - leec[s]*tmp1;
}
}
this->MooeeInvTime += usecond();
}
#ifdef DOMAIN_WALL_EOFA_DPERP_CACHE
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
#endif
}}

View File

@ -1,159 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermiondense.cc
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid_Eigen_Dense.h>
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
namespace Grid {
namespace QCD {
/*
* Dense matrix versions of routines
*/
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
{
this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
{
this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
{
int Ls = this->Ls;
int LLs = psi._grid->_rdimensions[0];
int vol = psi._grid->oSites()/LLs;
chi.checkerboard = psi.checkerboard;
assert(Ls==LLs);
Eigen::MatrixXd Pplus = Eigen::MatrixXd::Zero(Ls,Ls);
Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
for(int s=0;s<Ls;s++){
Pplus(s,s) = this->bee[s];
Pminus(s,s) = this->bee[s];
}
for(int s=0; s<Ls-1; s++){
Pminus(s,s+1) = -this->cee[s];
}
for(int s=0; s<Ls-1; s++){
Pplus(s+1,s) = -this->cee[s+1];
}
Pplus (0,Ls-1) = this->dp;
Pminus(Ls-1,0) = this->dm;
Eigen::MatrixXd PplusMat ;
Eigen::MatrixXd PminusMat;
if(inv) {
PplusMat = Pplus.inverse();
PminusMat = Pminus.inverse();
} else {
PplusMat = Pplus;
PminusMat = Pminus;
}
if(dag){
PplusMat.adjointInPlace();
PminusMat.adjointInPlace();
}
// For the non-vectorised s-direction this is simple
for(auto site=0; site<vol; site++){
SiteSpinor SiteChi;
SiteHalfSpinor SitePplus;
SiteHalfSpinor SitePminus;
for(int s1=0; s1<Ls; s1++){
SiteChi = zero;
for(int s2=0; s2<Ls; s2++){
int lex2 = s2 + Ls*site;
if(PplusMat(s1,s2) != 0.0){
spProj5p(SitePplus,psi[lex2]);
accumRecon5p(SiteChi, PplusMat(s1,s2)*SitePplus);
}
if(PminusMat(s1,s2) != 0.0){
spProj5m(SitePminus, psi[lex2]);
accumRecon5m(SiteChi, PminusMat(s1,s2)*SitePminus);
}
}
chi[s1+Ls*site] = SiteChi*0.5;
}
}
}
#ifdef DOMAIN_WALL_EOFA_DPERP_DENSE
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
template void DomainWallEOFAFermion<GparityWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<GparityWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<WilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<WilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<ZWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<ZWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
template void DomainWallEOFAFermion<GparityWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<GparityWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<WilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<WilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<ZWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<ZWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
#endif
}}

View File

@ -1,168 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermionssp.cc
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
namespace Grid {
namespace QCD {
// FIXME -- make a version of these routines with site loop outermost for cache reuse.
// Pminus fowards
// Pplus backwards
template<class Impl>
void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
{
Coeff_t one(1.0);
int Ls = this->Ls;
for(int s=0; s<Ls; s++){
if(s==0) {
axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, Ls-1);
} else if (s==(Ls-1)) {
axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, 0);
axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, s-1);
} else {
axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
axpby_ssp_pplus(chi, one, chi, lower[s], psi, s, s-1);
}
}
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
{
Coeff_t one(1.0);
int Ls = this->Ls;
for(int s=0; s<Ls; s++){
if(s==0) {
axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, Ls-1);
} else if (s==(Ls-1)) {
axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, 0);
axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
} else {
axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
}
}
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
{
Coeff_t one(1.0);
Coeff_t czero(0.0);
chi.checkerboard = psi.checkerboard;
int Ls = this->Ls;
FermionField tmp(psi._grid);
// Apply (L^{\prime})^{-1}
axpby_ssp(chi, one, psi, czero, psi, 0, 0); // chi[0]=psi[0]
for(int s=1; s<Ls; s++){
axpby_ssp_pplus(chi, one, psi, -this->lee[s-1], chi, s, s-1);// recursion Psi[s] -lee P_+ chi[s-1]
}
// L_m^{-1}
for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
axpby_ssp_pminus(chi, one, chi, -this->leem[s], chi, Ls-1, s);
}
// U_m^{-1} D^{-1}
for(int s=0; s<Ls-1; s++){
axpby_ssp_pplus(chi, one/this->dee[s], chi, -this->ueem[s]/this->dee[Ls], chi, s, Ls-1);
}
axpby_ssp_pminus(tmp, czero, chi, one/this->dee[Ls-1], chi, Ls-1, Ls-1);
axpby_ssp_pplus(chi, one, tmp, one/this->dee[Ls], chi, Ls-1, Ls-1);
// Apply U^{-1}
for(int s=Ls-2; s>=0; s--){
axpby_ssp_pminus(chi, one, chi, -this->uee[s], chi, s, s+1); // chi[Ls]
}
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
{
Coeff_t one(1.0);
Coeff_t czero(0.0);
chi.checkerboard = psi.checkerboard;
int Ls = this->Ls;
FermionField tmp(psi._grid);
// Apply (U^{\prime})^{-dagger}
axpby_ssp(chi, one, psi, czero, psi, 0, 0); // chi[0]=psi[0]
for(int s=1; s<Ls; s++){
axpby_ssp_pminus(chi, one, psi, -conjugate(this->uee[s-1]), chi, s, s-1);
}
// U_m^{-\dagger}
for(int s=0; s<Ls-1; s++){
axpby_ssp_pplus(chi, one, chi, -conjugate(this->ueem[s]), chi, Ls-1, s);
}
// L_m^{-\dagger} D^{-dagger}
for(int s=0; s<Ls-1; s++){
axpby_ssp_pminus(chi, one/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[Ls-1]), chi, s, Ls-1);
}
axpby_ssp_pminus(tmp, czero, chi, one/conjugate(this->dee[Ls-1]), chi, Ls-1, Ls-1);
axpby_ssp_pplus(chi, one, tmp, one/conjugate(this->dee[Ls]), chi, Ls-1, Ls-1);
// Apply L^{-dagger}
for(int s=Ls-2; s>=0; s--){
axpby_ssp_pplus(chi, one, chi, -conjugate(this->lee[s]), chi, s, s+1); // chi[Ls]
}
}
#ifdef DOMAIN_WALL_EOFA_DPERP_LINALG
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplF);
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplD);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplF);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplD);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplF);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplD);
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplFH);
INSTANTIATE_DPERP_DWF_EOFA(WilsonImplDF);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplFH);
INSTANTIATE_DPERP_DWF_EOFA(GparityWilsonImplDF);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplFH);
INSTANTIATE_DPERP_DWF_EOFA(ZWilsonImplDF);
#endif
}}

View File

@ -1,605 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/DomainWallEOFAFermionvec.cc
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
namespace Grid {
namespace QCD {
/*
* Dense matrix versions of routines
*/
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
{
this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
{
this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
{
GridBase* grid = psi._grid;
int Ls = this->Ls;
int LLs = grid->_rdimensions[0];
const int nsimd = Simd::Nsimd();
Vector<iSinglet<Simd> > u(LLs);
Vector<iSinglet<Simd> > l(LLs);
Vector<iSinglet<Simd> > d(LLs);
assert(Ls/LLs == nsimd);
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard = psi.checkerboard;
// just directly address via type pun
typedef typename Simd::scalar_type scalar_type;
scalar_type* u_p = (scalar_type*) &u[0];
scalar_type* l_p = (scalar_type*) &l[0];
scalar_type* d_p = (scalar_type*) &d[0];
for(int o=0;o<LLs;o++){ // outer
for(int i=0;i<nsimd;i++){ //inner
int s = o + i*LLs;
int ss = o*nsimd + i;
u_p[ss] = upper[s];
l_p[ss] = lower[s];
d_p[ss] = diag[s];
}}
this->M5Dcalls++;
this->M5Dtime -= usecond();
assert(Nc == 3);
parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
#if 0
alignas(64) SiteHalfSpinor hp;
alignas(64) SiteHalfSpinor hm;
alignas(64) SiteSpinor fp;
alignas(64) SiteSpinor fm;
for(int v=0; v<LLs; v++){
int vp = (v+1)%LLs;
int vm = (v+LLs-1)%LLs;
spProj5m(hp, psi[ss+vp]);
spProj5p(hm, psi[ss+vm]);
if (vp <= v){ rotate(hp, hp, 1); }
if (vm >= v){ rotate(hm, hm, nsimd-1); }
hp = 0.5*hp;
hm = 0.5*hm;
spRecon5m(fp, hp);
spRecon5p(fm, hm);
chi[ss+v] = d[v]*phi[ss+v];
chi[ss+v] = chi[ss+v] + u[v]*fp;
chi[ss+v] = chi[ss+v] + l[v]*fm;
}
#else
for(int v=0; v<LLs; v++){
vprefetch(psi[ss+v+LLs]);
int vp = (v==LLs-1) ? 0 : v+1;
int vm = (v==0) ? LLs-1 : v-1;
Simd hp_00 = psi[ss+vp]()(2)(0);
Simd hp_01 = psi[ss+vp]()(2)(1);
Simd hp_02 = psi[ss+vp]()(2)(2);
Simd hp_10 = psi[ss+vp]()(3)(0);
Simd hp_11 = psi[ss+vp]()(3)(1);
Simd hp_12 = psi[ss+vp]()(3)(2);
Simd hm_00 = psi[ss+vm]()(0)(0);
Simd hm_01 = psi[ss+vm]()(0)(1);
Simd hm_02 = psi[ss+vm]()(0)(2);
Simd hm_10 = psi[ss+vm]()(1)(0);
Simd hm_11 = psi[ss+vm]()(1)(1);
Simd hm_12 = psi[ss+vm]()(1)(2);
if(vp <= v){
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
}
if(vm >= v){
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
}
// Can force these to real arithmetic and save 2x.
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
vstream(chi[ss+v]()(0)(0), p_00);
vstream(chi[ss+v]()(0)(1), p_01);
vstream(chi[ss+v]()(0)(2), p_02);
vstream(chi[ss+v]()(1)(0), p_10);
vstream(chi[ss+v]()(1)(1), p_11);
vstream(chi[ss+v]()(1)(2), p_12);
vstream(chi[ss+v]()(2)(0), p_20);
vstream(chi[ss+v]()(2)(1), p_21);
vstream(chi[ss+v]()(2)(2), p_22);
vstream(chi[ss+v]()(3)(0), p_30);
vstream(chi[ss+v]()(3)(1), p_31);
vstream(chi[ss+v]()(3)(2), p_32);
}
#endif
}
this->M5Dtime += usecond();
}
template<class Impl>
void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
{
GridBase* grid = psi._grid;
int Ls = this->Ls;
int LLs = grid->_rdimensions[0];
int nsimd = Simd::Nsimd();
Vector<iSinglet<Simd> > u(LLs);
Vector<iSinglet<Simd> > l(LLs);
Vector<iSinglet<Simd> > d(LLs);
assert(Ls/LLs == nsimd);
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard = psi.checkerboard;
// just directly address via type pun
typedef typename Simd::scalar_type scalar_type;
scalar_type* u_p = (scalar_type*) &u[0];
scalar_type* l_p = (scalar_type*) &l[0];
scalar_type* d_p = (scalar_type*) &d[0];
for(int o=0; o<LLs; o++){ // outer
for(int i=0; i<nsimd; i++){ //inner
int s = o + i*LLs;
int ss = o*nsimd + i;
u_p[ss] = upper[s];
l_p[ss] = lower[s];
d_p[ss] = diag[s];
}}
this->M5Dcalls++;
this->M5Dtime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
#if 0
alignas(64) SiteHalfSpinor hp;
alignas(64) SiteHalfSpinor hm;
alignas(64) SiteSpinor fp;
alignas(64) SiteSpinor fm;
for(int v=0; v<LLs; v++){
int vp = (v+1)%LLs;
int vm = (v+LLs-1)%LLs;
spProj5p(hp, psi[ss+vp]);
spProj5m(hm, psi[ss+vm]);
if(vp <= v){ rotate(hp, hp, 1); }
if(vm >= v){ rotate(hm, hm, nsimd-1); }
hp = hp*0.5;
hm = hm*0.5;
spRecon5p(fp, hp);
spRecon5m(fm, hm);
chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
chi[ss+v] = chi[ss+v] +l[v]*fm;
}
#else
for(int v=0; v<LLs; v++){
vprefetch(psi[ss+v+LLs]);
int vp = (v == LLs-1) ? 0 : v+1;
int vm = (v == 0 ) ? LLs-1 : v-1;
Simd hp_00 = psi[ss+vp]()(0)(0);
Simd hp_01 = psi[ss+vp]()(0)(1);
Simd hp_02 = psi[ss+vp]()(0)(2);
Simd hp_10 = psi[ss+vp]()(1)(0);
Simd hp_11 = psi[ss+vp]()(1)(1);
Simd hp_12 = psi[ss+vp]()(1)(2);
Simd hm_00 = psi[ss+vm]()(2)(0);
Simd hm_01 = psi[ss+vm]()(2)(1);
Simd hm_02 = psi[ss+vm]()(2)(2);
Simd hm_10 = psi[ss+vm]()(3)(0);
Simd hm_11 = psi[ss+vm]()(3)(1);
Simd hm_12 = psi[ss+vm]()(3)(2);
if (vp <= v){
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
}
if(vm >= v){
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
}
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
vstream(chi[ss+v]()(0)(0), p_00);
vstream(chi[ss+v]()(0)(1), p_01);
vstream(chi[ss+v]()(0)(2), p_02);
vstream(chi[ss+v]()(1)(0), p_10);
vstream(chi[ss+v]()(1)(1), p_11);
vstream(chi[ss+v]()(1)(2), p_12);
vstream(chi[ss+v]()(2)(0), p_20);
vstream(chi[ss+v]()(2)(1), p_21);
vstream(chi[ss+v]()(2)(2), p_22);
vstream(chi[ss+v]()(3)(0), p_30);
vstream(chi[ss+v]()(3)(1), p_31);
vstream(chi[ss+v]()(3)(2), p_32);
}
#endif
}
this->M5Dtime += usecond();
}
#ifdef AVX512
#include<simd/Intel512common.h>
#include<simd/Intel512avx.h>
#include<simd/Intel512single.h>
#endif
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi, FermionField& chi,
int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
{
#ifndef AVX512
{
SiteHalfSpinor BcastP;
SiteHalfSpinor BcastM;
SiteHalfSpinor SiteChiP;
SiteHalfSpinor SiteChiM;
// Ls*Ls * 2 * 12 * vol flops
for(int s1=0; s1<LLs; s1++){
for(int s2=0; s2<LLs; s2++){
for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
int s = s2 + l*LLs;
int lex = s2 + LLs*site;
if( s2==0 && l==0 ){
SiteChiP=zero;
SiteChiM=zero;
}
for(int sp=0; sp<2; sp++){
for(int co=0; co<Nc; co++){
vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
}}
for(int sp=0; sp<2; sp++){
for(int co=0; co<Nc; co++){
vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
}}
for(int sp=0; sp<2; sp++){
for(int co=0; co<Nc; co++){
SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
}}
}}
{
int lex = s1 + LLs*site;
for(int sp=0; sp<2; sp++){
for(int co=0; co<Nc; co++){
vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
}}
}
}
}
#else
{
// pointers
// MASK_REGS;
#define Chi_00 %%zmm1
#define Chi_01 %%zmm2
#define Chi_02 %%zmm3
#define Chi_10 %%zmm4
#define Chi_11 %%zmm5
#define Chi_12 %%zmm6
#define Chi_20 %%zmm7
#define Chi_21 %%zmm8
#define Chi_22 %%zmm9
#define Chi_30 %%zmm10
#define Chi_31 %%zmm11
#define Chi_32 %%zmm12
#define BCAST0 %%zmm13
#define BCAST1 %%zmm14
#define BCAST2 %%zmm15
#define BCAST3 %%zmm16
#define BCAST4 %%zmm17
#define BCAST5 %%zmm18
#define BCAST6 %%zmm19
#define BCAST7 %%zmm20
#define BCAST8 %%zmm21
#define BCAST9 %%zmm22
#define BCAST10 %%zmm23
#define BCAST11 %%zmm24
int incr = LLs*LLs*sizeof(iSinglet<Simd>);
for(int s1=0; s1<LLs; s1++){
for(int s2=0; s2<LLs; s2++){
int lex = s2 + LLs*site;
uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
uint64_t a2 = (uint64_t) &psi[lex];
for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
if((s2+l)==0) {
asm(
VPREFETCH1(0,%2) VPREFETCH1(0,%1)
VPREFETCH1(12,%2) VPREFETCH1(13,%2)
VPREFETCH1(14,%2) VPREFETCH1(15,%2)
VBCASTCDUP(0,%2,BCAST0)
VBCASTCDUP(1,%2,BCAST1)
VBCASTCDUP(2,%2,BCAST2)
VBCASTCDUP(3,%2,BCAST3)
VBCASTCDUP(4,%2,BCAST4) VMULMEM(0,%0,BCAST0,Chi_00)
VBCASTCDUP(5,%2,BCAST5) VMULMEM(0,%0,BCAST1,Chi_01)
VBCASTCDUP(6,%2,BCAST6) VMULMEM(0,%0,BCAST2,Chi_02)
VBCASTCDUP(7,%2,BCAST7) VMULMEM(0,%0,BCAST3,Chi_10)
VBCASTCDUP(8,%2,BCAST8) VMULMEM(0,%0,BCAST4,Chi_11)
VBCASTCDUP(9,%2,BCAST9) VMULMEM(0,%0,BCAST5,Chi_12)
VBCASTCDUP(10,%2,BCAST10) VMULMEM(0,%1,BCAST6,Chi_20)
VBCASTCDUP(11,%2,BCAST11) VMULMEM(0,%1,BCAST7,Chi_21)
VMULMEM(0,%1,BCAST8,Chi_22)
VMULMEM(0,%1,BCAST9,Chi_30)
VMULMEM(0,%1,BCAST10,Chi_31)
VMULMEM(0,%1,BCAST11,Chi_32)
: : "r" (a0), "r" (a1), "r" (a2) );
} else {
asm(
VBCASTCDUP(0,%2,BCAST0) VMADDMEM(0,%0,BCAST0,Chi_00)
VBCASTCDUP(1,%2,BCAST1) VMADDMEM(0,%0,BCAST1,Chi_01)
VBCASTCDUP(2,%2,BCAST2) VMADDMEM(0,%0,BCAST2,Chi_02)
VBCASTCDUP(3,%2,BCAST3) VMADDMEM(0,%0,BCAST3,Chi_10)
VBCASTCDUP(4,%2,BCAST4) VMADDMEM(0,%0,BCAST4,Chi_11)
VBCASTCDUP(5,%2,BCAST5) VMADDMEM(0,%0,BCAST5,Chi_12)
VBCASTCDUP(6,%2,BCAST6) VMADDMEM(0,%1,BCAST6,Chi_20)
VBCASTCDUP(7,%2,BCAST7) VMADDMEM(0,%1,BCAST7,Chi_21)
VBCASTCDUP(8,%2,BCAST8) VMADDMEM(0,%1,BCAST8,Chi_22)
VBCASTCDUP(9,%2,BCAST9) VMADDMEM(0,%1,BCAST9,Chi_30)
VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
: : "r" (a0), "r" (a1), "r" (a2) );
}
a0 = a0 + incr;
a1 = a1 + incr;
a2 = a2 + sizeof(typename Simd::scalar_type);
}
}
{
int lexa = s1+LLs*site;
asm (
VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01) VSTORE(2 ,%0,Chi_02)
VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11) VSTORE(5 ,%0,Chi_12)
VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21) VSTORE(8 ,%0,Chi_22)
VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31) VSTORE(11,%0,Chi_32)
: : "r" ((uint64_t)&chi[lexa]) : "memory" );
}
}
}
#undef Chi_00
#undef Chi_01
#undef Chi_02
#undef Chi_10
#undef Chi_11
#undef Chi_12
#undef Chi_20
#undef Chi_21
#undef Chi_22
#undef Chi_30
#undef Chi_31
#undef Chi_32
#undef BCAST0
#undef BCAST1
#undef BCAST2
#undef BCAST3
#undef BCAST4
#undef BCAST5
#undef BCAST6
#undef BCAST7
#undef BCAST8
#undef BCAST9
#undef BCAST10
#undef BCAST11
#endif
};
// Z-mobius version
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
{
std::cout << "Error: zMobius not implemented for EOFA" << std::endl;
exit(-1);
};
template<class Impl>
void DomainWallEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
{
int Ls = this->Ls;
int LLs = psi._grid->_rdimensions[0];
int vol = psi._grid->oSites()/LLs;
chi.checkerboard = psi.checkerboard;
Vector<iSinglet<Simd> > Matp;
Vector<iSinglet<Simd> > Matm;
Vector<iSinglet<Simd> > *_Matp;
Vector<iSinglet<Simd> > *_Matm;
// MooeeInternalCompute(dag,inv,Matp,Matm);
if(inv && dag){
_Matp = &this->MatpInvDag;
_Matm = &this->MatmInvDag;
}
if(inv && (!dag)){
_Matp = &this->MatpInv;
_Matm = &this->MatmInv;
}
if(!inv){
MooeeInternalCompute(dag, inv, Matp, Matm);
_Matp = &Matp;
_Matm = &Matm;
}
assert(_Matp->size() == Ls*LLs);
this->MooeeInvCalls++;
this->MooeeInvTime -= usecond();
if(switcheroo<Coeff_t>::iscomplex()){
parallel_for(auto site=0; site<vol; site++){
MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
}
} else {
parallel_for(auto site=0; site<vol; site++){
MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
}
}
this->MooeeInvTime += usecond();
}
#ifdef DOMAIN_WALL_EOFA_DPERP_VEC
INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplD);
INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplF);
INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplD);
INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplF);
INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplDF);
INSTANTIATE_DPERP_DWF_EOFA(DomainWallVec5dImplFH);
INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplDF);
INSTANTIATE_DPERP_DWF_EOFA(ZDomainWallVec5dImplFH);
template void DomainWallEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void DomainWallEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
#endif
}}

View File

@ -1,134 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/DomainWallFermion.h
Copyright (C) 2015
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Vera Guelpers <V.M.Guelpers@soton.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_DOMAIN_WALL_FERMION_H
#define GRID_QCD_DOMAIN_WALL_FERMION_H
#include <Grid/qcd/action/fermion/FermionCore.h>
namespace Grid {
namespace QCD {
template<class Impl>
class DomainWallFermion : public CayleyFermion5D<Impl>
{
public:
INHERIT_IMPL_TYPES(Impl);
public:
void FreePropagator(const FermionField &in,FermionField &out,RealD mass, std::vector<double> twist, bool fiveD) {
FermionField in_k(in._grid);
FermionField prop_k(in._grid);
FFT theFFT((GridCartesian *) in._grid);
//phase for boundary condition
ComplexField coor(in._grid);
ComplexField ph(in._grid); ph = zero;
FermionField in_buf(in._grid); in_buf = zero;
Complex ci(0.0,1.0);
assert(twist.size() == Nd);//check that twist is Nd
int shift = 0;
if(fiveD) shift = 1;
for(unsigned int nu = 0; nu < Nd; nu++)
{
// Shift coordinate lattice index by 1 to account for 5th dimension.
LatticeCoordinate(coor, nu + shift);
ph = ph + twist[nu]*coor*((1./(in._grid->_fdimensions[nu+shift])));
}
in_buf = exp((Real)(2.0*M_PI)*ci*ph*(-1.0))*in;
if(fiveD){//FFT only on temporal and spatial dimensions
std::vector<int> mask(Nd+1,1); mask[0] = 0;
theFFT.FFT_dim_mask(in_k,in_buf,mask,FFT::forward);
this->MomentumSpacePropagatorHt_5d(prop_k,in_k,mass,twist);
theFFT.FFT_dim_mask(out,prop_k,mask,FFT::backward);
}
else{
theFFT.FFT_all_dim(in_k,in,FFT::forward);
this->MomentumSpacePropagatorHt(prop_k,in_k,mass,twist);
theFFT.FFT_all_dim(out,prop_k,FFT::backward);
}
//phase for boundary condition
out = out * exp((Real)(2.0*M_PI)*ci*ph);
};
virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<double> twist) {
bool fiveD = true; //5d propagator by default
FreePropagator(in,out,mass,twist,fiveD);
};
virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass, bool fiveD) {
std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions
FreePropagator(in,out,mass,twist,fiveD);
};
virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
bool fiveD = true; //5d propagator by default
std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions
FreePropagator(in,out,mass,twist,fiveD);
};
virtual void Instantiatable(void) {};
// Constructors
DomainWallFermion(GaugeField &_Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
RealD _mass,RealD _M5,const ImplParams &p= ImplParams()) :
CayleyFermion5D<Impl>(_Umu,
FiveDimGrid,
FiveDimRedBlackGrid,
FourDimGrid,
FourDimRedBlackGrid,_mass,_M5,p)
{
RealD eps = 1.0;
Approx::zolotarev_data *zdata = Approx::higham(eps,this->Ls);// eps is ignored for higham
assert(zdata->n==this->Ls);
std::cout<<GridLogMessage << "DomainWallFermion with Ls="<<this->Ls<<std::endl;
// Call base setter
this->SetCoefficientsTanh(zdata,1.0,0.0);
Approx::zolotarev_free(zdata);
}
};
}
}
#endif

View File

@ -1,335 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/Fermion_base_aggregate.h
Copyright (C) 2015
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_FERMION_H
#define GRID_QCD_FERMION_H
////////////////////////////////////////////////////////////////////////////////////////////////////
// Explicit explicit template instantiation is still required in the .cc files
//
// - CayleyFermion5D.cc
// - PartialFractionFermion5D.cc
// - WilsonFermion5D.cc
// - WilsonKernelsHand.cc
// - ContinuedFractionFermion5D.cc
// - WilsonFermion.cc
// - WilsonKernels.cc
// - DomainWallEOFAFermion.cc
// - MobiusEOFAFermion.cc
//
// The explicit instantiation is only avoidable if we move this source to headers and end up with include/parse/recompile
// for EVERY .cc file. This define centralises the list and restores global push of impl cases
////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////
// Fermion operators / actions
////////////////////////////////////////////
#include <Grid/qcd/action/fermion/WilsonFermion.h> // 4d wilson like
#include <Grid/qcd/action/fermion/WilsonTMFermion.h> // 4d wilson like
#include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h>
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
#include <Grid/qcd/action/fermion/CayleyFermion5D.h> // Cayley types
#include <Grid/qcd/action/fermion/DomainWallFermion.h>
#include <Grid/qcd/action/fermion/DomainWallEOFAFermion.h>
#include <Grid/qcd/action/fermion/MobiusFermion.h>
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
#include <Grid/qcd/action/fermion/ZMobiusFermion.h>
#include <Grid/qcd/action/fermion/SchurDiagTwoKappa.h>
#include <Grid/qcd/action/fermion/ScaledShamirFermion.h>
#include <Grid/qcd/action/fermion/MobiusZolotarevFermion.h>
#include <Grid/qcd/action/fermion/ShamirZolotarevFermion.h>
#include <Grid/qcd/action/fermion/OverlapWilsonCayleyTanhFermion.h>
#include <Grid/qcd/action/fermion/OverlapWilsonCayleyZolotarevFermion.h>
#include <Grid/qcd/action/fermion/ContinuedFractionFermion5D.h> // Continued fraction
#include <Grid/qcd/action/fermion/OverlapWilsonContfracTanhFermion.h>
#include <Grid/qcd/action/fermion/OverlapWilsonContfracZolotarevFermion.h>
#include <Grid/qcd/action/fermion/PartialFractionFermion5D.h> // Partial fraction
#include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionTanhFermion.h>
#include <Grid/qcd/action/fermion/OverlapWilsonPartialFractionZolotarevFermion.h>
///////////////////////////////////////////////////////////////////////////////
// G5 herm -- this has to live in QCD since dirac matrix is not in the broader sector of code
///////////////////////////////////////////////////////////////////////////////
#include <Grid/qcd/action/fermion/g5HermitianLinop.h>
///////////////////////////////////////////////////////////////////////////////
// Fourier accelerated Pauli Villars inverse support
///////////////////////////////////////////////////////////////////////////////
#include <Grid/qcd/action/fermion/WilsonTMFermion5D.h>
////////////////////////////////////////////////////////////////////////////////
// Move this group to a DWF specific tools/algorithms subdir?
////////////////////////////////////////////////////////////////////////////////
#include <Grid/qcd/action/fermion/FourierAcceleratedPV.h>
#include <Grid/qcd/action/fermion/PauliVillarsInverters.h>
#include <Grid/qcd/action/fermion/Reconstruct5Dprop.h>
#include <Grid/qcd/action/fermion/MADWF.h>
////////////////////////////////////////////////////////////////////////////////////////////////////
// More maintainable to maintain the following typedef list centrally, as more "impl" targets
// are added, (e.g. extension for gparity, half precision project in comms etc..)
////////////////////////////////////////////////////////////////////////////////////////////////////
// Cayley 5d
namespace Grid {
namespace QCD {
typedef WilsonFermion<WilsonImplR> WilsonFermionR;
typedef WilsonFermion<WilsonImplF> WilsonFermionF;
typedef WilsonFermion<WilsonImplD> WilsonFermionD;
typedef WilsonFermion<WilsonImplRL> WilsonFermionRL;
typedef WilsonFermion<WilsonImplFH> WilsonFermionFH;
typedef WilsonFermion<WilsonImplDF> WilsonFermionDF;
typedef WilsonFermion<WilsonAdjImplR> WilsonAdjFermionR;
typedef WilsonFermion<WilsonAdjImplF> WilsonAdjFermionF;
typedef WilsonFermion<WilsonAdjImplD> WilsonAdjFermionD;
typedef WilsonFermion<WilsonTwoIndexSymmetricImplR> WilsonTwoIndexSymmetricFermionR;
typedef WilsonFermion<WilsonTwoIndexSymmetricImplF> WilsonTwoIndexSymmetricFermionF;
typedef WilsonFermion<WilsonTwoIndexSymmetricImplD> WilsonTwoIndexSymmetricFermionD;
typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonTwoIndexAntiSymmetricFermionR;
typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonTwoIndexAntiSymmetricFermionF;
typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonTwoIndexAntiSymmetricFermionD;
// Twisted mass fermion
typedef WilsonTMFermion<WilsonImplR> WilsonTMFermionR;
typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;
typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
// Clover fermions
typedef WilsonCloverFermion<WilsonImplR> WilsonCloverFermionR;
typedef WilsonCloverFermion<WilsonImplF> WilsonCloverFermionF;
typedef WilsonCloverFermion<WilsonImplD> WilsonCloverFermionD;
typedef WilsonCloverFermion<WilsonAdjImplR> WilsonCloverAdjFermionR;
typedef WilsonCloverFermion<WilsonAdjImplF> WilsonCloverAdjFermionF;
typedef WilsonCloverFermion<WilsonAdjImplD> WilsonCloverAdjFermionD;
typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
typedef WilsonCloverFermion<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
// Domain Wall fermions
typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
typedef DomainWallFermion<WilsonImplRL> DomainWallFermionRL;
typedef DomainWallFermion<WilsonImplFH> DomainWallFermionFH;
typedef DomainWallFermion<WilsonImplDF> DomainWallFermionDF;
typedef DomainWallEOFAFermion<WilsonImplR> DomainWallEOFAFermionR;
typedef DomainWallEOFAFermion<WilsonImplF> DomainWallEOFAFermionF;
typedef DomainWallEOFAFermion<WilsonImplD> DomainWallEOFAFermionD;
typedef DomainWallEOFAFermion<WilsonImplRL> DomainWallEOFAFermionRL;
typedef DomainWallEOFAFermion<WilsonImplFH> DomainWallEOFAFermionFH;
typedef DomainWallEOFAFermion<WilsonImplDF> DomainWallEOFAFermionDF;
typedef MobiusFermion<WilsonImplR> MobiusFermionR;
typedef MobiusFermion<WilsonImplF> MobiusFermionF;
typedef MobiusFermion<WilsonImplD> MobiusFermionD;
typedef MobiusFermion<WilsonImplRL> MobiusFermionRL;
typedef MobiusFermion<WilsonImplFH> MobiusFermionFH;
typedef MobiusFermion<WilsonImplDF> MobiusFermionDF;
typedef MobiusEOFAFermion<WilsonImplR> MobiusEOFAFermionR;
typedef MobiusEOFAFermion<WilsonImplF> MobiusEOFAFermionF;
typedef MobiusEOFAFermion<WilsonImplD> MobiusEOFAFermionD;
typedef MobiusEOFAFermion<WilsonImplRL> MobiusEOFAFermionRL;
typedef MobiusEOFAFermion<WilsonImplFH> MobiusEOFAFermionFH;
typedef MobiusEOFAFermion<WilsonImplDF> MobiusEOFAFermionDF;
typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
typedef ZMobiusFermion<ZWilsonImplRL> ZMobiusFermionRL;
typedef ZMobiusFermion<ZWilsonImplFH> ZMobiusFermionFH;
typedef ZMobiusFermion<ZWilsonImplDF> ZMobiusFermionDF;
// Ls vectorised
typedef DomainWallFermion<DomainWallVec5dImplR> DomainWallFermionVec5dR;
typedef DomainWallFermion<DomainWallVec5dImplF> DomainWallFermionVec5dF;
typedef DomainWallFermion<DomainWallVec5dImplD> DomainWallFermionVec5dD;
typedef DomainWallFermion<DomainWallVec5dImplRL> DomainWallFermionVec5dRL;
typedef DomainWallFermion<DomainWallVec5dImplFH> DomainWallFermionVec5dFH;
typedef DomainWallFermion<DomainWallVec5dImplDF> DomainWallFermionVec5dDF;
typedef DomainWallEOFAFermion<DomainWallVec5dImplR> DomainWallEOFAFermionVec5dR;
typedef DomainWallEOFAFermion<DomainWallVec5dImplF> DomainWallEOFAFermionVec5dF;
typedef DomainWallEOFAFermion<DomainWallVec5dImplD> DomainWallEOFAFermionVec5dD;
typedef DomainWallEOFAFermion<DomainWallVec5dImplRL> DomainWallEOFAFermionVec5dRL;
typedef DomainWallEOFAFermion<DomainWallVec5dImplFH> DomainWallEOFAFermionVec5dFH;
typedef DomainWallEOFAFermion<DomainWallVec5dImplDF> DomainWallEOFAFermionVec5dDF;
typedef MobiusFermion<DomainWallVec5dImplR> MobiusFermionVec5dR;
typedef MobiusFermion<DomainWallVec5dImplF> MobiusFermionVec5dF;
typedef MobiusFermion<DomainWallVec5dImplD> MobiusFermionVec5dD;
typedef MobiusFermion<DomainWallVec5dImplRL> MobiusFermionVec5dRL;
typedef MobiusFermion<DomainWallVec5dImplFH> MobiusFermionVec5dFH;
typedef MobiusFermion<DomainWallVec5dImplDF> MobiusFermionVec5dDF;
typedef MobiusEOFAFermion<DomainWallVec5dImplR> MobiusEOFAFermionVec5dR;
typedef MobiusEOFAFermion<DomainWallVec5dImplF> MobiusEOFAFermionVec5dF;
typedef MobiusEOFAFermion<DomainWallVec5dImplD> MobiusEOFAFermionVec5dD;
typedef MobiusEOFAFermion<DomainWallVec5dImplRL> MobiusEOFAFermionVec5dRL;
typedef MobiusEOFAFermion<DomainWallVec5dImplFH> MobiusEOFAFermionVec5dFH;
typedef MobiusEOFAFermion<DomainWallVec5dImplDF> MobiusEOFAFermionVec5dDF;
typedef ZMobiusFermion<ZDomainWallVec5dImplR> ZMobiusFermionVec5dR;
typedef ZMobiusFermion<ZDomainWallVec5dImplF> ZMobiusFermionVec5dF;
typedef ZMobiusFermion<ZDomainWallVec5dImplD> ZMobiusFermionVec5dD;
typedef ZMobiusFermion<ZDomainWallVec5dImplRL> ZMobiusFermionVec5dRL;
typedef ZMobiusFermion<ZDomainWallVec5dImplFH> ZMobiusFermionVec5dFH;
typedef ZMobiusFermion<ZDomainWallVec5dImplDF> ZMobiusFermionVec5dDF;
typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF;
typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD;
typedef MobiusZolotarevFermion<WilsonImplR> MobiusZolotarevFermionR;
typedef MobiusZolotarevFermion<WilsonImplF> MobiusZolotarevFermionF;
typedef MobiusZolotarevFermion<WilsonImplD> MobiusZolotarevFermionD;
typedef ShamirZolotarevFermion<WilsonImplR> ShamirZolotarevFermionR;
typedef ShamirZolotarevFermion<WilsonImplF> ShamirZolotarevFermionF;
typedef ShamirZolotarevFermion<WilsonImplD> ShamirZolotarevFermionD;
typedef OverlapWilsonCayleyTanhFermion<WilsonImplR> OverlapWilsonCayleyTanhFermionR;
typedef OverlapWilsonCayleyTanhFermion<WilsonImplF> OverlapWilsonCayleyTanhFermionF;
typedef OverlapWilsonCayleyTanhFermion<WilsonImplD> OverlapWilsonCayleyTanhFermionD;
typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplR> OverlapWilsonCayleyZolotarevFermionR;
typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplF> OverlapWilsonCayleyZolotarevFermionF;
typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplD> OverlapWilsonCayleyZolotarevFermionD;
// Continued fraction
typedef OverlapWilsonContFracTanhFermion<WilsonImplR> OverlapWilsonContFracTanhFermionR;
typedef OverlapWilsonContFracTanhFermion<WilsonImplF> OverlapWilsonContFracTanhFermionF;
typedef OverlapWilsonContFracTanhFermion<WilsonImplD> OverlapWilsonContFracTanhFermionD;
typedef OverlapWilsonContFracZolotarevFermion<WilsonImplR> OverlapWilsonContFracZolotarevFermionR;
typedef OverlapWilsonContFracZolotarevFermion<WilsonImplF> OverlapWilsonContFracZolotarevFermionF;
typedef OverlapWilsonContFracZolotarevFermion<WilsonImplD> OverlapWilsonContFracZolotarevFermionD;
// Partial fraction
typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplR> OverlapWilsonPartialFractionTanhFermionR;
typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplF> OverlapWilsonPartialFractionTanhFermionF;
typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplD> OverlapWilsonPartialFractionTanhFermionD;
typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplR> OverlapWilsonPartialFractionZolotarevFermionR;
typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplF> OverlapWilsonPartialFractionZolotarevFermionF;
typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplD> OverlapWilsonPartialFractionZolotarevFermionD;
// Gparity cases; partial list until tested
typedef WilsonFermion<GparityWilsonImplR> GparityWilsonFermionR;
typedef WilsonFermion<GparityWilsonImplF> GparityWilsonFermionF;
typedef WilsonFermion<GparityWilsonImplD> GparityWilsonFermionD;
typedef WilsonFermion<GparityWilsonImplRL> GparityWilsonFermionRL;
typedef WilsonFermion<GparityWilsonImplFH> GparityWilsonFermionFH;
typedef WilsonFermion<GparityWilsonImplDF> GparityWilsonFermionDF;
typedef DomainWallFermion<GparityWilsonImplR> GparityDomainWallFermionR;
typedef DomainWallFermion<GparityWilsonImplF> GparityDomainWallFermionF;
typedef DomainWallFermion<GparityWilsonImplD> GparityDomainWallFermionD;
typedef DomainWallFermion<GparityWilsonImplRL> GparityDomainWallFermionRL;
typedef DomainWallFermion<GparityWilsonImplFH> GparityDomainWallFermionFH;
typedef DomainWallFermion<GparityWilsonImplDF> GparityDomainWallFermionDF;
typedef DomainWallEOFAFermion<GparityWilsonImplR> GparityDomainWallEOFAFermionR;
typedef DomainWallEOFAFermion<GparityWilsonImplF> GparityDomainWallEOFAFermionF;
typedef DomainWallEOFAFermion<GparityWilsonImplD> GparityDomainWallEOFAFermionD;
typedef DomainWallEOFAFermion<GparityWilsonImplRL> GparityDomainWallEOFAFermionRL;
typedef DomainWallEOFAFermion<GparityWilsonImplFH> GparityDomainWallEOFAFermionFH;
typedef DomainWallEOFAFermion<GparityWilsonImplDF> GparityDomainWallEOFAFermionDF;
typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionR;
typedef WilsonTMFermion<GparityWilsonImplF> GparityWilsonTMFermionF;
typedef WilsonTMFermion<GparityWilsonImplD> GparityWilsonTMFermionD;
typedef WilsonTMFermion<GparityWilsonImplRL> GparityWilsonTMFermionRL;
typedef WilsonTMFermion<GparityWilsonImplFH> GparityWilsonTMFermionFH;
typedef WilsonTMFermion<GparityWilsonImplDF> GparityWilsonTMFermionDF;
typedef MobiusFermion<GparityWilsonImplR> GparityMobiusFermionR;
typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF;
typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;
typedef MobiusFermion<GparityWilsonImplRL> GparityMobiusFermionRL;
typedef MobiusFermion<GparityWilsonImplFH> GparityMobiusFermionFH;
typedef MobiusFermion<GparityWilsonImplDF> GparityMobiusFermionDF;
typedef MobiusEOFAFermion<GparityWilsonImplR> GparityMobiusEOFAFermionR;
typedef MobiusEOFAFermion<GparityWilsonImplF> GparityMobiusEOFAFermionF;
typedef MobiusEOFAFermion<GparityWilsonImplD> GparityMobiusEOFAFermionD;
typedef MobiusEOFAFermion<GparityWilsonImplRL> GparityMobiusEOFAFermionRL;
typedef MobiusEOFAFermion<GparityWilsonImplFH> GparityMobiusEOFAFermionFH;
typedef MobiusEOFAFermion<GparityWilsonImplDF> GparityMobiusEOFAFermionDF;
typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;
typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;
}}
////////////////////
// Scalar QED actions
// TODO: this needs to move to another header after rename to Fermion.h
////////////////////
#include <Grid/qcd/action/scalar/Scalar.h>
#include <Grid/qcd/action/gauge/Photon.h>
#endif

View File

@ -1,93 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/Fermion_base_aggregate.h
Copyright (C) 2015
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_FERMION_CORE_H
#define GRID_QCD_FERMION_CORE_H
#include <Grid/GridCore.h>
#include <Grid/GridQCDcore.h>
#include <Grid/qcd/action/ActionCore.h>
////////////////////////////////////////////
// Fermion prereqs
////////////////////////////////////////////
#include <Grid/qcd/action/fermion/WilsonCompressor.h> //used by all wilson type fermions
#include <Grid/qcd/action/fermion/FermionOperatorImpl.h>
#include <Grid/qcd/action/fermion/FermionOperator.h>
#include <Grid/qcd/action/fermion/WilsonKernels.h> //used by all wilson type fermions
#include <Grid/qcd/action/fermion/StaggeredKernels.h> //used by all wilson type fermions
#define FermOpStaggeredTemplateInstantiate(A) \
template class A<StaggeredImplF>; \
template class A<StaggeredImplD>;
#define FermOpStaggeredVec5dTemplateInstantiate(A) \
template class A<StaggeredVec5dImplF>; \
template class A<StaggeredVec5dImplD>;
#define FermOp4dVecTemplateInstantiate(A) \
template class A<WilsonImplF>; \
template class A<WilsonImplD>; \
template class A<ZWilsonImplF>; \
template class A<ZWilsonImplD>; \
template class A<GparityWilsonImplF>; \
template class A<GparityWilsonImplD>; \
template class A<WilsonImplFH>; \
template class A<WilsonImplDF>; \
template class A<ZWilsonImplFH>; \
template class A<ZWilsonImplDF>; \
template class A<GparityWilsonImplFH>; \
template class A<GparityWilsonImplDF>;
#define AdjointFermOpTemplateInstantiate(A) \
template class A<WilsonAdjImplF>; \
template class A<WilsonAdjImplD>;
#define TwoIndexFermOpTemplateInstantiate(A) \
template class A<WilsonTwoIndexSymmetricImplF>; \
template class A<WilsonTwoIndexSymmetricImplD>; \
template class A<WilsonTwoIndexAntiSymmetricImplF>; \
template class A<WilsonTwoIndexAntiSymmetricImplD>;
#define FermOp5dVecTemplateInstantiate(A) \
template class A<DomainWallVec5dImplF>; \
template class A<DomainWallVec5dImplD>; \
template class A<ZDomainWallVec5dImplF>; \
template class A<ZDomainWallVec5dImplD>; \
template class A<DomainWallVec5dImplFH>; \
template class A<DomainWallVec5dImplDF>; \
template class A<ZDomainWallVec5dImplFH>; \
template class A<ZDomainWallVec5dImplDF>;
#define FermOpTemplateInstantiate(A) \
FermOp4dVecTemplateInstantiate(A) \
FermOp5dVecTemplateInstantiate(A)
#define GparityFermOpTemplateInstantiate(A)
#endif

File diff suppressed because it is too large Load Diff

View File

@ -1,237 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/FourierAcceleratedPV.h
Copyright (C) 2015
Author: Christoph Lehner (lifted with permission by Peter Boyle, brought back to Grid)
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
namespace Grid {
namespace QCD {
template<typename M>
void get_real_const_bc(M& m, RealD& _b, RealD& _c) {
ComplexD b,c;
b=m.bs[0];
c=m.cs[0];
std::cout << GridLogMessage << "b=" << b << ", c=" << c << std::endl;
for (size_t i=1;i<m.bs.size();i++) {
assert(m.bs[i] == b);
assert(m.cs[i] == c);
}
assert(b.imag() == 0.0);
assert(c.imag() == 0.0);
_b = b.real();
_c = c.real();
}
template<typename Vi, typename M, typename G>
class FourierAcceleratedPV {
public:
ConjugateGradient<Vi> &cg;
M& dwfPV;
G& Umu;
GridCartesian* grid5D;
GridRedBlackCartesian* gridRB5D;
int group_in_s;
FourierAcceleratedPV(M& _dwfPV, G& _Umu, ConjugateGradient<Vi> &_cg, int _group_in_s = 2)
: dwfPV(_dwfPV), Umu(_Umu), cg(_cg), group_in_s(_group_in_s)
{
assert( dwfPV.FermionGrid()->_fdimensions[0] % (2*group_in_s) == 0);
grid5D = QCD::SpaceTimeGrid::makeFiveDimGrid(2*group_in_s, (GridCartesian*)Umu._grid);
gridRB5D = QCD::SpaceTimeGrid::makeFiveDimRedBlackGrid(2*group_in_s, (GridCartesian*)Umu._grid);
}
void rotatePV(const Vi& _src, Vi& dst, bool forward) const {
GridStopWatch gsw1, gsw2;
typedef typename Vi::scalar_type Coeff_t;
int Ls = dst._grid->_fdimensions[0];
Vi _tmp(dst._grid);
double phase = M_PI / (double)Ls;
Coeff_t bzero(0.0,0.0);
FFT theFFT((GridCartesian*)dst._grid);
if (!forward) {
gsw1.Start();
for (int s=0;s<Ls;s++) {
Coeff_t a(::cos(phase*s),-::sin(phase*s));
axpby_ssp(_tmp,a,_src,bzero,_src,s,s);
}
gsw1.Stop();
gsw2.Start();
theFFT.FFT_dim(dst,_tmp,0,FFT::forward);
gsw2.Stop();
} else {
gsw2.Start();
theFFT.FFT_dim(_tmp,_src,0,FFT::backward);
gsw2.Stop();
gsw1.Start();
for (int s=0;s<Ls;s++) {
Coeff_t a(::cos(phase*s),::sin(phase*s));
axpby_ssp(dst,a,_tmp,bzero,_tmp,s,s);
}
gsw1.Stop();
}
std::cout << GridLogMessage << "Timing rotatePV: " << gsw1.Elapsed() << ", " << gsw2.Elapsed() << std::endl;
}
void pvInv(const Vi& _src, Vi& _dst) const {
std::cout << GridLogMessage << "Fourier-Accelerated Outer Pauli Villars"<<std::endl;
typedef typename Vi::scalar_type Coeff_t;
int Ls = _dst._grid->_fdimensions[0];
GridStopWatch gswT;
gswT.Start();
RealD b,c;
get_real_const_bc(dwfPV,b,c);
RealD M5 = dwfPV.M5;
// U(true) Rightinv TMinv U(false) = Minv
Vi _src_diag(_dst._grid);
Vi _src_diag_slice(dwfPV.GaugeGrid());
Vi _dst_diag_slice(dwfPV.GaugeGrid());
Vi _src_diag_slices(grid5D);
Vi _dst_diag_slices(grid5D);
Vi _dst_diag(_dst._grid);
rotatePV(_src,_src_diag,false);
// now do TM solves
Gamma G5(Gamma::Algebra::Gamma5);
GridStopWatch gswA, gswB;
gswA.Start();
typedef typename M::Impl_t Impl;
//WilsonTMFermion<Impl> tm(x.Umu,*x.UGridF,*x.UrbGridF,0.0,0.0,solver_outer.parent.par.wparams_f);
std::vector<RealD> vmass(grid5D->_fdimensions[0],0.0);
std::vector<RealD> vmu(grid5D->_fdimensions[0],0.0);
WilsonTMFermion5D<Impl> tm(Umu,*grid5D,*gridRB5D,
*(GridCartesian*)dwfPV.GaugeGrid(),
*(GridRedBlackCartesian*)dwfPV.GaugeRedBlackGrid(),
vmass,vmu);
//SchurRedBlackDiagTwoSolve<Vi> sol(cg);
SchurRedBlackDiagMooeeSolve<Vi> sol(cg); // same performance as DiagTwo
gswA.Stop();
gswB.Start();
for (int sgroup=0;sgroup<Ls/2/group_in_s;sgroup++) {
for (int sidx=0;sidx<group_in_s;sidx++) {
int s = sgroup*group_in_s + sidx;
int sprime = Ls-s-1;
RealD phase = M_PI / (RealD)Ls * (2.0 * s + 1.0);
RealD cosp = ::cos(phase);
RealD sinp = ::sin(phase);
RealD denom = b*b + c*c + 2.0*b*c*cosp;
RealD mass = -(b*b*M5 + c*(1.0 - cosp + c*M5) + b*(-1.0 + cosp + 2.0*c*cosp*M5))/denom;
RealD mu = (b+c)*sinp/denom;
vmass[2*sidx + 0] = mass;
vmass[2*sidx + 1] = mass;
vmu[2*sidx + 0] = mu;
vmu[2*sidx + 1] = -mu;
}
tm.update(vmass,vmu);
for (int sidx=0;sidx<group_in_s;sidx++) {
int s = sgroup*group_in_s + sidx;
int sprime = Ls-s-1;
ExtractSlice(_src_diag_slice,_src_diag,s,0);
InsertSlice(_src_diag_slice,_src_diag_slices,2*sidx + 0,0);
ExtractSlice(_src_diag_slice,_src_diag,sprime,0);
InsertSlice(_src_diag_slice,_src_diag_slices,2*sidx + 1,0);
}
GridStopWatch gsw;
gsw.Start();
_dst_diag_slices = zero; // zero guess
sol(tm,_src_diag_slices,_dst_diag_slices);
gsw.Stop();
std::cout << GridLogMessage << "Solve[sgroup=" << sgroup << "] completed in " << gsw.Elapsed() << ", " << gswA.Elapsed() << std::endl;
for (int sidx=0;sidx<group_in_s;sidx++) {
int s = sgroup*group_in_s + sidx;
int sprime = Ls-s-1;
RealD phase = M_PI / (RealD)Ls * (2.0 * s + 1.0);
RealD cosp = ::cos(phase);
RealD sinp = ::sin(phase);
// now rotate with inverse of
Coeff_t pA = b + c*cosp;
Coeff_t pB = - Coeff_t(0.0,1.0)*c*sinp;
Coeff_t pABden = pA*pA - pB*pB;
// (pA + pB * G5) * (pA - pB*G5) = (pA^2 - pB^2)
ExtractSlice(_dst_diag_slice,_dst_diag_slices,2*sidx + 0,0);
_dst_diag_slice = (pA/pABden) * _dst_diag_slice - (pB/pABden) * (G5 * _dst_diag_slice);
InsertSlice(_dst_diag_slice,_dst_diag,s,0);
ExtractSlice(_dst_diag_slice,_dst_diag_slices,2*sidx + 1,0);
_dst_diag_slice = (pA/pABden) * _dst_diag_slice + (pB/pABden) * (G5 * _dst_diag_slice);
InsertSlice(_dst_diag_slice,_dst_diag,sprime,0);
}
}
gswB.Stop();
rotatePV(_dst_diag,_dst,true);
gswT.Stop();
std::cout << GridLogMessage << "PV completed in " << gswT.Elapsed() << " (Setup: " << gswA.Elapsed() << ", s-loop: " << gswB.Elapsed() << ")" << std::endl;
}
};
}}

View File

@ -1,604 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion.cc
Copyright (C) 2015
Author: Azusa Yamaguchi, Peter Boyle
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
namespace Grid {
namespace QCD {
const std::vector<int>
ImprovedStaggeredFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3});
const std::vector<int>
ImprovedStaggeredFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, 3, 3, -3, -3, -3, -3});
/////////////////////////////////
// Constructor and gauge import
/////////////////////////////////
template <class Impl>
ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid,
RealD _mass,
RealD _c1, RealD _c2,RealD _u0,
const ImplParams &p)
: Kernels(p),
_grid(&Fgrid),
_cbgrid(&Hgrid),
Stencil(&Fgrid, npoint, Even, directions, displacements),
StencilEven(&Hgrid, npoint, Even, directions, displacements), // source is Even
StencilOdd(&Hgrid, npoint, Odd, directions, displacements), // source is Odd
mass(_mass),
Lebesgue(_grid),
LebesgueEvenOdd(_cbgrid),
Umu(&Fgrid),
UmuEven(&Hgrid),
UmuOdd(&Hgrid),
UUUmu(&Fgrid),
UUUmuEven(&Hgrid),
UUUmuOdd(&Hgrid) ,
_tmp(&Hgrid)
{
int vol4;
int LLs=1;
c1=_c1;
c2=_c2;
u0=_u0;
vol4= _grid->oSites();
Stencil.BuildSurfaceList(LLs,vol4);
vol4= _cbgrid->oSites();
StencilEven.BuildSurfaceList(LLs,vol4);
StencilOdd.BuildSurfaceList(LLs,vol4);
}
template <class Impl>
ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass,
RealD _c1, RealD _c2,RealD _u0,
const ImplParams &p)
: ImprovedStaggeredFermion(Fgrid,Hgrid,_mass,_c1,_c2,_u0,p)
{
ImportGauge(_Uthin,_Ufat);
}
////////////////////////////////////////////////////////////
// Momentum space propagator should be
// https://arxiv.org/pdf/hep-lat/9712010.pdf
//
// mom space action.
// gamma_mu i ( c1 sin pmu + c2 sin 3 pmu ) + m
//
// must track through staggered flavour/spin reduction in literature to
// turn to free propagator for the one component chi field, a la page 4/5
// of above link to implmement fourier based solver.
////////////////////////////////////////////////////////////
template <class Impl>
void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat)
{
/////////////////////////////////////////////////////////////////
// Trivial import; phases and fattening and such like preapplied
/////////////////////////////////////////////////////////////////
GaugeLinkField U(GaugeGrid());
for (int mu = 0; mu < Nd; mu++) {
U = PeekIndex<LorentzIndex>(_Utriple, mu);
PokeIndex<LorentzIndex>(UUUmu, U, mu );
U = adj( Cshift(U, mu, -3));
PokeIndex<LorentzIndex>(UUUmu, -U, mu+4 );
U = PeekIndex<LorentzIndex>(_Ufat, mu);
PokeIndex<LorentzIndex>(Umu, U, mu);
U = adj( Cshift(U, mu, -1));
PokeIndex<LorentzIndex>(Umu, -U, mu+4);
}
CopyGaugeCheckerboards();
}
template <class Impl>
void ImprovedStaggeredFermion<Impl>::ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U)
{
Umu = _U;
UUUmu = _UUU;
CopyGaugeCheckerboards();
}
template <class Impl>
void ImprovedStaggeredFermion<Impl>::CopyGaugeCheckerboards(void)
{
pickCheckerboard(Even, UmuEven, Umu);
pickCheckerboard(Odd, UmuOdd , Umu);
pickCheckerboard(Even, UUUmuEven,UUUmu);
pickCheckerboard(Odd, UUUmuOdd, UUUmu);
}
template <class Impl>
void ImprovedStaggeredFermion<Impl>::ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat)
{
GaugeLinkField U(GaugeGrid());
////////////////////////////////////////////////////////
// Double Store should take two fields for Naik and one hop separately.
////////////////////////////////////////////////////////
Impl::DoubleStore(GaugeGrid(), UUUmu, Umu, _Uthin, _Ufat );
////////////////////////////////////////////////////////
// Apply scale factors to get the right fermion Kinetic term
// Could pass coeffs into the double store to save work.
// 0.5 ( U p(x+mu) - Udag(x-mu) p(x-mu) )
////////////////////////////////////////////////////////
for (int mu = 0; mu < Nd; mu++) {
U = PeekIndex<LorentzIndex>(Umu, mu);
PokeIndex<LorentzIndex>(Umu, U*( 0.5*c1/u0), mu );
U = PeekIndex<LorentzIndex>(Umu, mu+4);
PokeIndex<LorentzIndex>(Umu, U*(-0.5*c1/u0), mu+4);
U = PeekIndex<LorentzIndex>(UUUmu, mu);
PokeIndex<LorentzIndex>(UUUmu, U*( 0.5*c2/u0/u0/u0), mu );
U = PeekIndex<LorentzIndex>(UUUmu, mu+4);
PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4);
}
CopyGaugeCheckerboards();
}
/////////////////////////////
// Implement the interface
/////////////////////////////
template <class Impl>
RealD ImprovedStaggeredFermion<Impl>::M(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
Dhop(in, out, DaggerNo);
return axpy_norm(out, mass, in, out);
}
template <class Impl>
RealD ImprovedStaggeredFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
Dhop(in, out, DaggerYes);
return axpy_norm(out, mass, in, out);
}
template <class Impl>
void ImprovedStaggeredFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
if (in.checkerboard == Odd) {
DhopEO(in, out, DaggerNo);
} else {
DhopOE(in, out, DaggerNo);
}
}
template <class Impl>
void ImprovedStaggeredFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
if (in.checkerboard == Odd) {
DhopEO(in, out, DaggerYes);
} else {
DhopOE(in, out, DaggerYes);
}
}
template <class Impl>
void ImprovedStaggeredFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
typename FermionField::scalar_type scal(mass);
out = scal * in;
}
template <class Impl>
void ImprovedStaggeredFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
Mooee(in, out);
}
template <class Impl>
void ImprovedStaggeredFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
out = (1.0 / (mass)) * in;
}
template <class Impl>
void ImprovedStaggeredFermion<Impl>::MooeeInvDag(const FermionField &in,
FermionField &out) {
out.checkerboard = in.checkerboard;
MooeeInv(in, out);
}
///////////////////////////////////
// Internal
///////////////////////////////////
template <class Impl>
void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU,
GaugeField & mat,
const FermionField &A, const FermionField &B, int dag) {
assert((dag == DaggerNo) || (dag == DaggerYes));
Compressor compressor;
FermionField Btilde(B._grid);
FermionField Atilde(B._grid);
Atilde = A;
st.HaloExchange(B, compressor);
for (int mu = 0; mu < Nd; mu++) {
////////////////////////
// Call the single hop
////////////////////////
PARALLEL_FOR_LOOP
for (int sss = 0; sss < B._grid->oSites(); sss++) {
Kernels::DhopDir(st, U, UUU, st.CommBuf(), sss, sss, B, Btilde, mu,1);
}
// Force in three link terms
//
// Impl::InsertForce4D(mat, Btilde, Atilde, mu);
//
// dU_ac(x)/dt = i p_ab U_bc(x)
//
// => dS_f/dt = dS_f/dU_ac(x) . dU_ac(x)/dt = i p_ab U_bc(x) dS_f/dU_ac(x)
//
// One link: form fragments S_f = A U B
//
// write Btilde = U(x) B(x+mu)
//
// mat+= TraceIndex<SpinIndex>(outerProduct(Btilde,A));
//
// Three link: form fragments S_f = A UUU B
//
// mat+= outer ( A, UUUB) <-- Best take DhopDeriv with one linke or identity matrix
// mat+= outer ( AU, UUB) <-- and then use covariant cshift?
// mat+= outer ( AUU, UB) <-- Returned from call to DhopDir
assert(0);// need to figure out the force interface with a blasted three link term.
}
}
template <class Impl>
void ImprovedStaggeredFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
conformable(U._grid, _grid);
conformable(U._grid, V._grid);
conformable(U._grid, mat._grid);
mat.checkerboard = U.checkerboard;
DerivInternal(Stencil, Umu, UUUmu, mat, U, V, dag);
}
template <class Impl>
void ImprovedStaggeredFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
conformable(U._grid, _cbgrid);
conformable(U._grid, V._grid);
conformable(U._grid, mat._grid);
assert(V.checkerboard == Even);
assert(U.checkerboard == Odd);
mat.checkerboard = Odd;
DerivInternal(StencilEven, UmuOdd, UUUmuOdd, mat, U, V, dag);
}
template <class Impl>
void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
conformable(U._grid, _cbgrid);
conformable(U._grid, V._grid);
conformable(U._grid, mat._grid);
assert(V.checkerboard == Odd);
assert(U.checkerboard == Even);
mat.checkerboard = Even;
DerivInternal(StencilOdd, UmuEven, UUUmuEven, mat, U, V, dag);
}
template <class Impl>
void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) {
DhopCalls+=2;
conformable(in._grid, _grid); // verifies full grid
conformable(in._grid, out._grid);
out.checkerboard = in.checkerboard;
DhopInternal(Stencil, Lebesgue, Umu, UUUmu, in, out, dag);
}
template <class Impl>
void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) {
DhopCalls+=1;
conformable(in._grid, _cbgrid); // verifies half grid
conformable(in._grid, out._grid); // drops the cb check
assert(in.checkerboard == Even);
out.checkerboard = Odd;
DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, UUUmuOdd, in, out, dag);
}
template <class Impl>
void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag) {
DhopCalls+=1;
conformable(in._grid, _cbgrid); // verifies half grid
conformable(in._grid, out._grid); // drops the cb check
assert(in.checkerboard == Odd);
out.checkerboard = Even;
DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, UUUmuEven, in, out, dag);
}
template <class Impl>
void ImprovedStaggeredFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) {
DhopDir(in, out, dir, disp);
}
template <class Impl>
void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) {
Compressor compressor;
Stencil.HaloExchange(in, compressor);
PARALLEL_FOR_LOOP
for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DhopDir(Stencil, Umu, UUUmu, Stencil.CommBuf(), sss, sss, in, out, dir, disp);
}
};
template <class Impl>
void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
DoubledGaugeField &UUU,
const FermionField &in,
FermionField &out, int dag)
{
#ifdef GRID_OMP
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
else
#endif
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
}
template <class Impl>
void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
DoubledGaugeField &UUU,
const FermionField &in,
FermionField &out, int dag)
{
#ifdef GRID_OMP
Compressor compressor;
int len = U._grid->oSites();
const int LLs = 1;
DhopTotalTime -= usecond();
DhopFaceTime -= usecond();
st.Prepare();
st.HaloGather(in,compressor);
st.CommsMergeSHM(compressor);
DhopFaceTime += usecond();
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Ugly explicit thread mapping introduced for OPA reasons.
//////////////////////////////////////////////////////////////////////////////////////////////////////
DhopComputeTime -= usecond();
#pragma omp parallel
{
int tid = omp_get_thread_num();
int nthreads = omp_get_num_threads();
int ncomms = CartesianCommunicator::nCommThreads;
if (ncomms == -1) ncomms = 1;
assert(nthreads > ncomms);
if (tid >= ncomms) {
nthreads -= ncomms;
int ttid = tid - ncomms;
int n = len;
int chunk = n / nthreads;
int rem = n % nthreads;
int myblock, myn;
if (ttid < rem) {
myblock = ttid * chunk + ttid;
myn = chunk+1;
} else {
myblock = ttid*chunk + rem;
myn = chunk;
}
// do the compute
if (dag == DaggerYes) {
for (int ss = myblock; ss < myblock+myn; ++ss) {
int sU = ss;
// Interior = 1; Exterior = 0; must implement for staggered
Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,1,0);
}
} else {
for (int ss = myblock; ss < myblock+myn; ++ss) {
// Interior = 1; Exterior = 0;
int sU = ss;
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,1,0);
}
}
} else {
st.CommunicateThreaded();
}
}
DhopComputeTime += usecond();
// First to enter, last to leave timing
DhopFaceTime -= usecond();
st.CommsMerge(compressor);
DhopFaceTime -= usecond();
DhopComputeTime2 -= usecond();
if (dag == DaggerYes) {
int sz=st.surface_list.size();
parallel_for (int ss = 0; ss < sz; ss++) {
int sU = st.surface_list[ss];
Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,0,1);
}
} else {
int sz=st.surface_list.size();
parallel_for (int ss = 0; ss < sz; ss++) {
int sU = st.surface_list[ss];
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),1,sU,in,out,0,1);
}
}
DhopComputeTime2 += usecond();
#else
assert(0);
#endif
}
template <class Impl>
void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
DoubledGaugeField &UUU,
const FermionField &in,
FermionField &out, int dag)
{
assert((dag == DaggerNo) || (dag == DaggerYes));
DhopTotalTime -= usecond();
DhopCommTime -= usecond();
Compressor compressor;
st.HaloExchange(in, compressor);
DhopCommTime += usecond();
DhopComputeTime -= usecond();
if (dag == DaggerYes) {
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out);
}
} else {
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DhopSite(st, lo, U, UUU, st.CommBuf(), 1, sss, in, out);
}
}
DhopComputeTime += usecond();
DhopTotalTime += usecond();
};
////////////////////////////////////////////////////////////////
// Reporting
////////////////////////////////////////////////////////////////
template<class Impl>
void ImprovedStaggeredFermion<Impl>::Report(void)
{
std::vector<int> latt = GridDefaultLatt();
RealD volume = 1; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
RealD NP = _grid->_Nprocessors;
RealD NN = _grid->NodeCount();
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion Number of DhopEO Calls : "
<< DhopCalls << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion TotalTime /Calls : "
<< DhopTotalTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion CommTime /Calls : "
<< DhopCommTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion ComputeTime/Calls : "
<< DhopComputeTime / DhopCalls << " us" << std::endl;
// Average the compute time
_grid->GlobalSum(DhopComputeTime);
DhopComputeTime/=NP;
RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion Stencil" <<std::endl; Stencil.Report();
std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilEven"<<std::endl; StencilEven.Report();
std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilOdd" <<std::endl; StencilOdd.Report();
}
template<class Impl>
void ImprovedStaggeredFermion<Impl>::ZeroCounters(void)
{
DhopCalls = 0;
DhopTotalTime = 0;
DhopCommTime = 0;
DhopComputeTime = 0;
DhopFaceTime = 0;
Stencil.ZeroCounters();
StencilEven.ZeroCounters();
StencilOdd.ZeroCounters();
}
////////////////////////////////////////////////////////
// Conserved current - not yet implemented.
////////////////////////////////////////////////////////
template <class Impl>
void ImprovedStaggeredFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
Current curr_type,
unsigned int mu)
{
assert(0);
}
template <class Impl>
void ImprovedStaggeredFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
Current curr_type,
unsigned int mu,
unsigned int tmin,
unsigned int tmax,
ComplexField &lattice_cmplx)
{
assert(0);
}
FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion);
//AdjointFermOpTemplateInstantiate(ImprovedStaggeredFermion);
//TwoIndexFermOpTemplateInstantiate(ImprovedStaggeredFermion);
}}

View File

@ -1,205 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/ImprovedStaggered.h
Copyright (C) 2015
Author: Azusa Yamaguchi, Peter Boyle
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_IMPR_STAG_FERMION_H
#define GRID_QCD_IMPR_STAG_FERMION_H
namespace Grid {
namespace QCD {
class ImprovedStaggeredFermionStatic {
public:
static const std::vector<int> directions;
static const std::vector<int> displacements;
static const int npoint = 16;
};
template <class Impl>
class ImprovedStaggeredFermion : public StaggeredKernels<Impl>, public ImprovedStaggeredFermionStatic {
public:
INHERIT_IMPL_TYPES(Impl);
typedef StaggeredKernels<Impl> Kernels;
FermionField _tmp;
FermionField &tmp(void) { return _tmp; }
////////////////////////////////////////
// Performance monitoring
////////////////////////////////////////
void Report(void);
void ZeroCounters(void);
double DhopTotalTime;
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
///////////////////////////////////////////////////////////////
// Implement the abstract base
///////////////////////////////////////////////////////////////
GridBase *GaugeGrid(void) { return _grid; }
GridBase *GaugeRedBlackGrid(void) { return _cbgrid; }
GridBase *FermionGrid(void) { return _grid; }
GridBase *FermionRedBlackGrid(void) { return _cbgrid; }
//////////////////////////////////////////////////////////////////
// override multiply; cut number routines if pass dagger argument
// and also make interface more uniformly consistent
//////////////////////////////////////////////////////////////////
RealD M(const FermionField &in, FermionField &out);
RealD Mdag(const FermionField &in, FermionField &out);
/////////////////////////////////////////////////////////
// half checkerboard operations
/////////////////////////////////////////////////////////
void Meooe(const FermionField &in, FermionField &out);
void MeooeDag(const FermionField &in, FermionField &out);
void Mooee(const FermionField &in, FermionField &out);
void MooeeDag(const FermionField &in, FermionField &out);
void MooeeInv(const FermionField &in, FermionField &out);
void MooeeInvDag(const FermionField &in, FermionField &out);
////////////////////////
// Derivative interface
////////////////////////
// Interface calls an internal routine
void DhopDeriv (GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
void DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
void DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
///////////////////////////////////////////////////////////////
// non-hermitian hopping term; half cb or both
///////////////////////////////////////////////////////////////
void Dhop (const FermionField &in, FermionField &out, int dag);
void DhopOE(const FermionField &in, FermionField &out, int dag);
void DhopEO(const FermionField &in, FermionField &out, int dag);
///////////////////////////////////////////////////////////////
// Multigrid assistance; force term uses too
///////////////////////////////////////////////////////////////
void Mdir(const FermionField &in, FermionField &out, int dir, int disp);
void DhopDir(const FermionField &in, FermionField &out, int dir, int disp);
///////////////////////////////////////////////////////////////
// Extra methods added by derived
///////////////////////////////////////////////////////////////
void DerivInternal(StencilImpl &st,
DoubledGaugeField &U,DoubledGaugeField &UUU,
GaugeField &mat,
const FermionField &A, const FermionField &B, int dag);
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag);
void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag);
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag);
//////////////////////////////////////////////////////////////////////////
// Grid own interface Constructor
//////////////////////////////////////////////////////////////////////////
ImprovedStaggeredFermion(GaugeField &_Uthin, GaugeField &_Ufat, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass,
RealD _c1, RealD _c2,RealD _u0,
const ImplParams &p = ImplParams());
//////////////////////////////////////////////////////////////////////////
// MILC constructor no gauge fields
//////////////////////////////////////////////////////////////////////////
ImprovedStaggeredFermion(GridCartesian &Fgrid, GridRedBlackCartesian &Hgrid, RealD _mass,
RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0,
const ImplParams &p = ImplParams());
// DoubleStore impl dependent
void ImportGauge (const GaugeField &_Uthin ) { assert(0); }
void ImportGauge (const GaugeField &_Uthin ,const GaugeField &_Ufat);
void ImportGaugeSimple(const GaugeField &_UUU ,const GaugeField &_U);
void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
DoubledGaugeField &GetU(void) { return Umu ; } ;
DoubledGaugeField &GetUUU(void) { return UUUmu; };
void CopyGaugeCheckerboards(void);
///////////////////////////////////////////////////////////////
// Data members require to support the functionality
///////////////////////////////////////////////////////////////
// protected:
public:
// any other parameters of action ???
virtual int isTrivialEE(void) { return 1; };
virtual RealD Mass(void) { return mass; }
RealD mass;
RealD u0;
RealD c1;
RealD c2;
GridBase *_grid;
GridBase *_cbgrid;
// Defines the stencils for even and odd
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;
DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd;
DoubledGaugeField UUUmu;
DoubledGaugeField UUUmuEven;
DoubledGaugeField UUUmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
///////////////////////////////////////////////////////////////
// Conserved current utilities
///////////////////////////////////////////////////////////////
void ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
Current curr_type,
unsigned int mu);
void SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
Current curr_type,
unsigned int mu,
unsigned int tmin,
unsigned int tmax,
ComplexField &lattice_cmplx);
};
typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;
}
}
#endif

View File

@ -1,654 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
#include <Grid/perfmon/PerfCount.h>
namespace Grid {
namespace QCD {
// S-direction is INNERMOST and takes no part in the parity.
const std::vector<int>
ImprovedStaggeredFermion5DStatic::directions({1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4});
const std::vector<int>
ImprovedStaggeredFermion5DStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1, 3, 3, 3, 3, -3, -3, -3, -3});
// 5d lattice for DWF.
template<class Impl>
ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
RealD _mass,
RealD _c1,RealD _c2, RealD _u0,
const ImplParams &p) :
Kernels(p),
_FiveDimGrid (&FiveDimGrid),
_FiveDimRedBlackGrid(&FiveDimRedBlackGrid),
_FourDimGrid (&FourDimGrid),
_FourDimRedBlackGrid(&FourDimRedBlackGrid),
Stencil (&FiveDimGrid,npoint,Even,directions,displacements),
StencilEven(&FiveDimRedBlackGrid,npoint,Even,directions,displacements), // source is Even
StencilOdd (&FiveDimRedBlackGrid,npoint,Odd ,directions,displacements), // source is Odd
mass(_mass),
c1(_c1),
c2(_c2),
u0(_u0),
Umu(&FourDimGrid),
UmuEven(&FourDimRedBlackGrid),
UmuOdd (&FourDimRedBlackGrid),
UUUmu(&FourDimGrid),
UUUmuEven(&FourDimRedBlackGrid),
UUUmuOdd(&FourDimRedBlackGrid),
Lebesgue(&FourDimGrid),
LebesgueEvenOdd(&FourDimRedBlackGrid),
_tmp(&FiveDimRedBlackGrid)
{
// some assertions
assert(FiveDimGrid._ndimension==5);
assert(FourDimGrid._ndimension==4);
assert(FourDimRedBlackGrid._ndimension==4);
assert(FiveDimRedBlackGrid._ndimension==5);
assert(FiveDimRedBlackGrid._checker_dim==1); // Don't checker the s direction
// extent of fifth dim and not spread out
Ls=FiveDimGrid._fdimensions[0];
assert(FiveDimRedBlackGrid._fdimensions[0]==Ls);
assert(FiveDimGrid._processors[0] ==1);
assert(FiveDimRedBlackGrid._processors[0] ==1);
// Other dimensions must match the decomposition of the four-D fields
for(int d=0;d<4;d++){
assert(FiveDimGrid._processors[d+1] ==FourDimGrid._processors[d]);
assert(FiveDimRedBlackGrid._processors[d+1] ==FourDimGrid._processors[d]);
assert(FourDimRedBlackGrid._processors[d] ==FourDimGrid._processors[d]);
assert(FiveDimGrid._fdimensions[d+1] ==FourDimGrid._fdimensions[d]);
assert(FiveDimRedBlackGrid._fdimensions[d+1]==FourDimGrid._fdimensions[d]);
assert(FourDimRedBlackGrid._fdimensions[d] ==FourDimGrid._fdimensions[d]);
assert(FiveDimGrid._simd_layout[d+1] ==FourDimGrid._simd_layout[d]);
assert(FiveDimRedBlackGrid._simd_layout[d+1]==FourDimGrid._simd_layout[d]);
assert(FourDimRedBlackGrid._simd_layout[d] ==FourDimGrid._simd_layout[d]);
}
if (Impl::LsVectorised) {
int nsimd = Simd::Nsimd();
// Dimension zero of the five-d is the Ls direction
assert(FiveDimGrid._simd_layout[0] ==nsimd);
assert(FiveDimRedBlackGrid._simd_layout[0]==nsimd);
for(int d=0;d<4;d++){
assert(FourDimGrid._simd_layout[d]=1);
assert(FourDimRedBlackGrid._simd_layout[d]=1);
assert(FiveDimRedBlackGrid._simd_layout[d+1]==1);
}
} else {
// Dimension zero of the five-d is the Ls direction
assert(FiveDimRedBlackGrid._simd_layout[0]==1);
assert(FiveDimGrid._simd_layout[0] ==1);
}
int LLs = FiveDimGrid._rdimensions[0];
int vol4= FourDimGrid.oSites();
Stencil.BuildSurfaceList(LLs,vol4);
vol4=FourDimRedBlackGrid.oSites();
StencilEven.BuildSurfaceList(LLs,vol4);
StencilOdd.BuildSurfaceList(LLs,vol4);
}
template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::CopyGaugeCheckerboards(void)
{
pickCheckerboard(Even, UmuEven, Umu);
pickCheckerboard(Odd, UmuOdd , Umu);
pickCheckerboard(Even, UUUmuEven,UUUmu);
pickCheckerboard(Odd, UUUmuOdd, UUUmu);
}
template<class Impl>
ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GaugeField &_Uthin,GaugeField &_Ufat,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
RealD _mass,
RealD _c1,RealD _c2, RealD _u0,
const ImplParams &p) :
ImprovedStaggeredFermion5D(FiveDimGrid,FiveDimRedBlackGrid,
FourDimGrid,FourDimRedBlackGrid,
_mass,_c1,_c2,_u0,p)
{
ImportGauge(_Uthin,_Ufat);
}
///////////////////////////////////////////////////
// For MILC use; pass three link U's and 1 link U
///////////////////////////////////////////////////
template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::ImportGaugeSimple(const GaugeField &_Utriple,const GaugeField &_Ufat)
{
/////////////////////////////////////////////////////////////////
// Trivial import; phases and fattening and such like preapplied
/////////////////////////////////////////////////////////////////
for (int mu = 0; mu < Nd; mu++) {
auto U = PeekIndex<LorentzIndex>(_Utriple, mu);
Impl::InsertGaugeField(UUUmu,U,mu);
U = adj( Cshift(U, mu, -3));
Impl::InsertGaugeField(UUUmu,-U,mu+4);
U = PeekIndex<LorentzIndex>(_Ufat, mu);
Impl::InsertGaugeField(Umu,U,mu);
U = adj( Cshift(U, mu, -1));
Impl::InsertGaugeField(Umu,-U,mu+4);
}
CopyGaugeCheckerboards();
}
template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U)
{
/////////////////////////////////////////////////////////////////
// Trivial import; phases and fattening and such like preapplied
/////////////////////////////////////////////////////////////////
Umu = _U;
UUUmu = _UUU;
CopyGaugeCheckerboards();
}
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat)
{
////////////////////////////////////////////////////////
// Double Store should take two fields for Naik and one hop separately.
////////////////////////////////////////////////////////
Impl::DoubleStore(GaugeGrid(), UUUmu, Umu, _Uthin, _Ufat );
////////////////////////////////////////////////////////
// Apply scale factors to get the right fermion Kinetic term
// Could pass coeffs into the double store to save work.
// 0.5 ( U p(x+mu) - Udag(x-mu) p(x-mu) )
////////////////////////////////////////////////////////
for (int mu = 0; mu < Nd; mu++) {
auto U = PeekIndex<LorentzIndex>(Umu, mu);
PokeIndex<LorentzIndex>(Umu, U*( 0.5*c1/u0), mu );
U = PeekIndex<LorentzIndex>(Umu, mu+4);
PokeIndex<LorentzIndex>(Umu, U*(-0.5*c1/u0), mu+4);
U = PeekIndex<LorentzIndex>(UUUmu, mu);
PokeIndex<LorentzIndex>(UUUmu, U*( 0.5*c2/u0/u0/u0), mu );
U = PeekIndex<LorentzIndex>(UUUmu, mu+4);
PokeIndex<LorentzIndex>(UUUmu, U*(-0.5*c2/u0/u0/u0), mu+4);
}
CopyGaugeCheckerboards();
}
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionField &out,int dir5,int disp)
{
int dir = dir5-1; // Maps to the ordering above in "directions" that is passed to stencil
// we drop off the innermost fifth dimension
Compressor compressor;
Stencil.HaloExchange(in,compressor);
parallel_for(int ss=0;ss<Umu._grid->oSites();ss++){
for(int s=0;s<Ls;s++){
int sU=ss;
int sF = s+Ls*sU;
Kernels::DhopDir(Stencil, Umu, UUUmu, Stencil.CommBuf(), sF, sU, in, out, dir, disp);
}
}
};
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DerivInternal(StencilImpl & st,
DoubledGaugeField & U,
DoubledGaugeField & UUU,
GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag)
{
// No force terms in multi-rhs solver staggered
assert(0);
}
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopDeriv(GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag)
{
assert(0);
}
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopDerivEO(GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag)
{
assert(0);
}
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag)
{
assert(0);
}
/*CHANGE */
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,DoubledGaugeField & UUU,
const FermionField &in, FermionField &out,int dag)
{
#ifdef GRID_OMP
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
else
#endif
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
}
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,DoubledGaugeField & UUU,
const FermionField &in, FermionField &out,int dag)
{
#ifdef GRID_OMP
// assert((dag==DaggerNo) ||(dag==DaggerYes));
Compressor compressor;
int LLs = in._grid->_rdimensions[0];
int len = U._grid->oSites();
DhopFaceTime-=usecond();
st.Prepare();
st.HaloGather(in,compressor);
// st.HaloExchangeOptGather(in,compressor); // Wilson compressor
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
DhopFaceTime+=usecond();
double ctime=0;
double ptime=0;
//////////////////////////////////////////////////////////////////////////////////////////////////////
// Ugly explicit thread mapping introduced for OPA reasons.
//////////////////////////////////////////////////////////////////////////////////////////////////////
#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
{
int tid = omp_get_thread_num();
int nthreads = omp_get_num_threads();
int ncomms = CartesianCommunicator::nCommThreads;
if (ncomms == -1) ncomms = 1;
assert(nthreads > ncomms);
if (tid >= ncomms) {
double start = usecond();
nthreads -= ncomms;
int ttid = tid - ncomms;
int n = U._grid->oSites(); // 4d vol
int chunk = n / nthreads;
int rem = n % nthreads;
int myblock, myn;
if (ttid < rem) {
myblock = ttid * chunk + ttid;
myn = chunk+1;
} else {
myblock = ttid*chunk + rem;
myn = chunk;
}
// do the compute
if (dag == DaggerYes) {
for (int ss = myblock; ss < myblock+myn; ++ss) {
int sU = ss;
// Interior = 1; Exterior = 0; must implement for staggered
Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<---------
}
} else {
for (int ss = myblock; ss < myblock+myn; ++ss) {
// Interior = 1; Exterior = 0;
int sU = ss;
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,1,0); //<------------
}
}
ptime = usecond() - start;
} else {
double start = usecond();
st.CommunicateThreaded();
ctime = usecond() - start;
}
}
DhopCommTime += ctime;
DhopComputeTime+=ptime;
// First to enter, last to leave timing
st.CollateThreads();
DhopFaceTime-=usecond();
st.CommsMerge(compressor);
DhopFaceTime+=usecond();
DhopComputeTime2-=usecond();
if (dag == DaggerYes) {
int sz=st.surface_list.size();
parallel_for (int ss = 0; ss < sz; ss++) {
int sU = st.surface_list[ss];
Kernels::DhopSiteDag(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1); //<----------
}
} else {
int sz=st.surface_list.size();
parallel_for (int ss = 0; ss < sz; ss++) {
int sU = st.surface_list[ss];
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out,0,1);//<----------
}
}
DhopComputeTime2+=usecond();
#else
assert(0);
#endif
}
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,DoubledGaugeField & UUU,
const FermionField &in, FermionField &out,int dag)
{
Compressor compressor;
int LLs = in._grid->_rdimensions[0];
//double t1=usecond();
DhopTotalTime -= usecond();
DhopCommTime -= usecond();
st.HaloExchange(in,compressor);
DhopCommTime += usecond();
DhopComputeTime -= usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
if (dag == DaggerYes) {
parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
int sU=ss;
Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), LLs, sU,in, out);
}
} else {
parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
int sU=ss;
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
}
}
DhopComputeTime += usecond();
DhopTotalTime += usecond();
//double t2=usecond();
//std::cout << __FILE__ << " " << __func__ << " Total Time " << DhopTotalTime << std::endl;
//std::cout << __FILE__ << " " << __func__ << " Total Time Org " << t2-t1 << std::endl;
//std::cout << __FILE__ << " " << __func__ << " Comml Time " << DhopCommTime << std::endl;
//std::cout << __FILE__ << " " << __func__ << " Compute Time " << DhopComputeTime << std::endl;
}
/*CHANGE END*/
/* ORG
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,DoubledGaugeField & UUU,
const FermionField &in, FermionField &out,int dag)
{
Compressor compressor;
int LLs = in._grid->_rdimensions[0];
DhopTotalTime -= usecond();
DhopCommTime -= usecond();
st.HaloExchange(in,compressor);
DhopCommTime += usecond();
DhopComputeTime -= usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
if (dag == DaggerYes) {
parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
int sU=ss;
Kernels::DhopSiteDag(st, lo, U, UUU, st.CommBuf(), LLs, sU,in, out);
}
} else {
parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
int sU=ss;
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
}
}
DhopComputeTime += usecond();
DhopTotalTime += usecond();
}
*/
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
{
DhopCalls+=1;
conformable(in._grid,FermionRedBlackGrid()); // verifies half grid
conformable(in._grid,out._grid); // drops the cb check
assert(in.checkerboard==Even);
out.checkerboard = Odd;
DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,UUUmuOdd,in,out,dag);
}
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
{
DhopCalls+=1;
conformable(in._grid,FermionRedBlackGrid()); // verifies half grid
conformable(in._grid,out._grid); // drops the cb check
assert(in.checkerboard==Odd);
out.checkerboard = Even;
DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,UUUmuEven,in,out,dag);
}
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
{
DhopCalls+=2;
conformable(in._grid,FermionGrid()); // verifies full grid
conformable(in._grid,out._grid);
out.checkerboard = in.checkerboard;
DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
}
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::Report(void)
{
std::vector<int> latt = GridDefaultLatt();
RealD volume = Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
RealD NP = _FourDimGrid->_Nprocessors;
RealD NN = _FourDimGrid->NodeCount();
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls : "
<< DhopCalls << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime /Calls : "
<< DhopTotalTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime /Calls : "
<< DhopCommTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls : "
<< DhopComputeTime / DhopCalls << " us" << std::endl;
// Average the compute time
_FourDimGrid->GlobalSum(DhopComputeTime);
DhopComputeTime/=NP;
RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil" <<std::endl; Stencil.Report();
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilEven"<<std::endl; StencilEven.Report();
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilOdd" <<std::endl; StencilOdd.Report();
}
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void)
{
DhopCalls = 0;
DhopTotalTime = 0;
DhopCommTime = 0;
DhopComputeTime = 0;
DhopFaceTime = 0;
Stencil.ZeroCounters();
StencilEven.ZeroCounters();
StencilOdd.ZeroCounters();
}
/////////////////////////////////////////////////////////////////////////
// Implement the general interface. Here we use SAME mass on all slices
/////////////////////////////////////////////////////////////////////////
template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) {
DhopDir(in, out, dir, disp);
}
template <class Impl>
RealD ImprovedStaggeredFermion5D<Impl>::M(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
Dhop(in, out, DaggerNo);
return axpy_norm(out, mass, in, out);
}
template <class Impl>
RealD ImprovedStaggeredFermion5D<Impl>::Mdag(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
Dhop(in, out, DaggerYes);
return axpy_norm(out, mass, in, out);
}
template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::Meooe(const FermionField &in, FermionField &out) {
if (in.checkerboard == Odd) {
DhopEO(in, out, DaggerNo);
} else {
DhopOE(in, out, DaggerNo);
}
}
template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
if (in.checkerboard == Odd) {
DhopEO(in, out, DaggerYes);
} else {
DhopOE(in, out, DaggerYes);
}
}
template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::Mooee(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
typename FermionField::scalar_type scal(mass);
out = scal * in;
}
template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
Mooee(in, out);
}
template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
out = (1.0 / (mass)) * in;
}
template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::MooeeInvDag(const FermionField &in,
FermionField &out) {
out.checkerboard = in.checkerboard;
MooeeInv(in, out);
}
////////////////////////////////////////////////////////
// Conserved current - not yet implemented.
////////////////////////////////////////////////////////
template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
Current curr_type,
unsigned int mu)
{
assert(0);
}
template <class Impl>
void ImprovedStaggeredFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
Current curr_type,
unsigned int mu,
unsigned int tmin,
unsigned int tmax,
ComplexField &lattice_cmplx)
{
assert(0);
}
FermOpStaggeredTemplateInstantiate(ImprovedStaggeredFermion5D);
FermOpStaggeredVec5dTemplateInstantiate(ImprovedStaggeredFermion5D);
}}

View File

@ -1,234 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: AzusaYamaguchi <ayamaguc@staffmail.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_IMPROVED_STAGGERED_FERMION_5D_H
#define GRID_QCD_IMPROVED_STAGGERED_FERMION_5D_H
namespace Grid {
namespace QCD {
////////////////////////////////////////////////////////////////////////////////
// This is the 4d red black case appropriate to support
////////////////////////////////////////////////////////////////////////////////
class ImprovedStaggeredFermion5DStatic {
public:
// S-direction is INNERMOST and takes no part in the parity.
static const std::vector<int> directions;
static const std::vector<int> displacements;
const int npoint = 16;
};
template<class Impl>
class ImprovedStaggeredFermion5D : public StaggeredKernels<Impl>, public ImprovedStaggeredFermion5DStatic
{
public:
INHERIT_IMPL_TYPES(Impl);
typedef StaggeredKernels<Impl> Kernels;
FermionField _tmp;
FermionField &tmp(void) { return _tmp; }
////////////////////////////////////////
// Performance monitoring
////////////////////////////////////////
void Report(void);
void ZeroCounters(void);
double DhopTotalTime;
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
///////////////////////////////////////////////////////////////
// Implement the abstract base
///////////////////////////////////////////////////////////////
GridBase *GaugeGrid(void) { return _FourDimGrid ;}
GridBase *GaugeRedBlackGrid(void) { return _FourDimRedBlackGrid ;}
GridBase *FermionGrid(void) { return _FiveDimGrid;}
GridBase *FermionRedBlackGrid(void) { return _FiveDimRedBlackGrid;}
// full checkerboard operations; leave unimplemented as abstract for now
RealD M (const FermionField &in, FermionField &out);
RealD Mdag (const FermionField &in, FermionField &out);
// half checkerboard operations
void Meooe (const FermionField &in, FermionField &out);
void Mooee (const FermionField &in, FermionField &out);
void MooeeInv (const FermionField &in, FermionField &out);
void MeooeDag (const FermionField &in, FermionField &out);
void MooeeDag (const FermionField &in, FermionField &out);
void MooeeInvDag (const FermionField &in, FermionField &out);
void Mdir (const FermionField &in, FermionField &out,int dir,int disp);
void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
// These can be overridden by fancy 5d chiral action
void DhopDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag);
// Implement hopping term non-hermitian hopping term; half cb or both
void Dhop (const FermionField &in, FermionField &out,int dag);
void DhopOE(const FermionField &in, FermionField &out,int dag);
void DhopEO(const FermionField &in, FermionField &out,int dag);
///////////////////////////////////////////////////////////////
// New methods added
///////////////////////////////////////////////////////////////
void DerivInternal(StencilImpl & st,
DoubledGaugeField & U,
DoubledGaugeField & UUU,
GaugeField &mat,
const FermionField &A,
const FermionField &B,
int dag);
void DhopInternal(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U,
DoubledGaugeField &UUU,
const FermionField &in,
FermionField &out,
int dag);
void DhopInternalOverlappedComms(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U,
DoubledGaugeField &UUU,
const FermionField &in,
FermionField &out,
int dag);
void DhopInternalSerialComms(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U,
DoubledGaugeField &UUU,
const FermionField &in,
FermionField &out,
int dag);
// Constructors
////////////////////////////////////////////////////////////////////////////////////////////////
// Grid internal interface -- Thin link and fat link, with coefficients
////////////////////////////////////////////////////////////////////////////////////////////////
ImprovedStaggeredFermion5D(GaugeField &_Uthin,
GaugeField &_Ufat,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
double _mass,
RealD _c1, RealD _c2,RealD _u0,
const ImplParams &p= ImplParams());
////////////////////////////////////////////////////////////////////////////////////////////////
// MILC constructor ; triple links, no rescale factors; must be externally pre multiplied
////////////////////////////////////////////////////////////////////////////////////////////////
ImprovedStaggeredFermion5D(GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
double _mass,
RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0,
const ImplParams &p= ImplParams());
// DoubleStore gauge field in operator
void ImportGauge (const GaugeField &_Uthin ) { assert(0); }
void ImportGauge (const GaugeField &_Uthin ,const GaugeField &_Ufat);
void ImportGaugeSimple(const GaugeField &_UUU,const GaugeField &_U);
void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
// Give a reference; can be used to do an assignment or copy back out after import
// if Carleton wants to cache them and not use the ImportSimple
DoubledGaugeField &GetU(void) { return Umu ; } ;
DoubledGaugeField &GetUUU(void) { return UUUmu; };
void CopyGaugeCheckerboards(void);
///////////////////////////////////////////////////////////////
// Data members require to support the functionality
///////////////////////////////////////////////////////////////
public:
virtual int isTrivialEE(void) { return 1; };
virtual RealD Mass(void) { return mass; }
GridBase *_FourDimGrid;
GridBase *_FourDimRedBlackGrid;
GridBase *_FiveDimGrid;
GridBase *_FiveDimRedBlackGrid;
RealD mass;
RealD c1;
RealD c2;
RealD u0;
int Ls;
//Defines the stencils for even and odd
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;
DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd;
DoubledGaugeField UUUmu;
DoubledGaugeField UUUmuEven;
DoubledGaugeField UUUmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
// Comms buffer
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
///////////////////////////////////////////////////////////////
// Conserved current utilities
///////////////////////////////////////////////////////////////
void ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
Current curr_type,
unsigned int mu);
void SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
Current curr_type,
unsigned int mu,
unsigned int tmin,
unsigned int tmax,
ComplexField &lattice_cmplx);
};
}}
#endif

View File

@ -1,193 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/MADWF.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
namespace Grid {
namespace QCD {
template <class Fieldi, class Fieldo,IfNotSame<Fieldi,Fieldo> X=0>
inline void convert(const Fieldi &from,Fieldo &to)
{
precisionChange(to,from);
}
template <class Fieldi, class Fieldo,IfSame<Fieldi,Fieldo> X=0>
inline void convert(const Fieldi &from,Fieldo &to)
{
to=from;
}
template<class Matrixo,class Matrixi,class PVinverter,class SchurSolver, class Guesser>
class MADWF
{
private:
typedef typename Matrixo::FermionField FermionFieldo;
typedef typename Matrixi::FermionField FermionFieldi;
PVinverter & PauliVillarsSolvero;// For the outer field
SchurSolver & SchurSolveri; // For the inner approx field
Guesser & Guesseri; // To deflate the inner approx solves
Matrixo & Mato; // Action object for outer
Matrixi & Mati; // Action object for inner
RealD target_resid;
int maxiter;
public:
MADWF(Matrixo &_Mato,
Matrixi &_Mati,
PVinverter &_PauliVillarsSolvero,
SchurSolver &_SchurSolveri,
Guesser & _Guesseri,
RealD resid,
int _maxiter) :
Mato(_Mato),Mati(_Mati),
SchurSolveri(_SchurSolveri),
PauliVillarsSolvero(_PauliVillarsSolvero),Guesseri(_Guesseri)
{
target_resid=resid;
maxiter =_maxiter;
};
void operator() (const FermionFieldo &src4,FermionFieldo &sol5)
{
std::cout << GridLogMessage<< " ************************************************" << std::endl;
std::cout << GridLogMessage<< " MADWF-like algorithm " << std::endl;
std::cout << GridLogMessage<< " ************************************************" << std::endl;
FermionFieldi c0i(Mati.GaugeGrid()); // 4d
FermionFieldi y0i(Mati.GaugeGrid()); // 4d
FermionFieldo c0 (Mato.GaugeGrid()); // 4d
FermionFieldo y0 (Mato.GaugeGrid()); // 4d
FermionFieldo A(Mato.FermionGrid()); // Temporary outer
FermionFieldo B(Mato.FermionGrid()); // Temporary outer
FermionFieldo b(Mato.FermionGrid()); // 5d source
FermionFieldo c(Mato.FermionGrid()); // PVinv source; reused so store
FermionFieldo defect(Mato.FermionGrid()); // 5d source
FermionFieldi ci(Mati.FermionGrid());
FermionFieldi yi(Mati.FermionGrid());
FermionFieldi xi(Mati.FermionGrid());
FermionFieldi srci(Mati.FermionGrid());
FermionFieldi Ai(Mati.FermionGrid());
RealD m=Mati.Mass();
///////////////////////////////////////
//Import source, include Dminus factors
///////////////////////////////////////
Mato.ImportPhysicalFermionSource(src4,b);
std::cout << GridLogMessage << " src4 " <<norm2(src4)<<std::endl;
std::cout << GridLogMessage << " b " <<norm2(b)<<std::endl;
defect = b;
sol5=zero;
for (int i=0;i<maxiter;i++) {
///////////////////////////////////////
// Set up c0 from current defect
///////////////////////////////////////
PauliVillarsSolvero(Mato,defect,A);
Mato.Pdag(A,c);
ExtractSlice(c0, c, 0 , 0);
////////////////////////////////////////////////
// Solve the inner system with surface term c0
////////////////////////////////////////////////
ci = zero;
convert(c0,c0i); // Possible precison change
InsertSlice(c0i,ci,0, 0);
// Dwm P y = Dwm x = D(1) P (c0,0,0,0)^T
Mati.P(ci,Ai);
Mati.SetMass(1.0); Mati.M(Ai,srci); Mati.SetMass(m);
SchurSolveri(Mati,srci,xi,Guesseri);
Mati.Pdag(xi,yi);
ExtractSlice(y0i, yi, 0 , 0);
convert(y0i,y0); // Possible precision change
//////////////////////////////////////
// Propagate solution back to outer system
// Build Pdag PV^-1 Dm P [-sol4,c2,c3... cL]
//////////////////////////////////////
c0 = - y0;
InsertSlice(c0, c, 0 , 0);
/////////////////////////////
// Reconstruct the bulk solution Pdag PV^-1 Dm P
/////////////////////////////
Mato.P(c,B);
Mato.M(B,A);
PauliVillarsSolvero(Mato,A,B);
Mato.Pdag(B,A);
//////////////////////////////
// Reinsert surface prop
//////////////////////////////
InsertSlice(y0,A,0,0);
//////////////////////////////
// Convert from y back to x
//////////////////////////////
Mato.P(A,B);
// sol5' = sol5 + M^-1 defect
// = sol5 + M^-1 src - M^-1 M sol5 ...
sol5 = sol5 + B;
std::cout << GridLogMessage << "***************************************" <<std::endl;
std::cout << GridLogMessage << " Sol5 update "<<std::endl;
std::cout << GridLogMessage << "***************************************" <<std::endl;
std::cout << GridLogMessage << " Sol5 now "<<norm2(sol5)<<std::endl;
std::cout << GridLogMessage << " delta "<<norm2(B)<<std::endl;
// New defect = b - M sol5
Mato.M(sol5,A);
defect = b - A;
std::cout << GridLogMessage << " defect "<<norm2(defect)<<std::endl;
double resid = ::sqrt(norm2(defect) / norm2(b));
std::cout << GridLogMessage << "Residual " << i << ": " << resid << std::endl;
std::cout << GridLogMessage << "***************************************" <<std::endl;
if (resid < target_resid) {
return;
}
}
std::cout << GridLogMessage << "MADWF : Exceeded maxiter "<<std::endl;
assert(0);
}
};
}}

View File

@ -1,502 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/MobiusEOFAFermion.cc
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid_Eigen_Dense.h>
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
namespace Grid {
namespace QCD {
template<class Impl>
MobiusEOFAFermion<Impl>::MobiusEOFAFermion(
GaugeField &_Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
RealD _mq1, RealD _mq2, RealD _mq3,
RealD _shift, int _pm, RealD _M5,
RealD _b, RealD _c, const ImplParams &p) :
AbstractEOFAFermion<Impl>(_Umu, FiveDimGrid, FiveDimRedBlackGrid,
FourDimGrid, FourDimRedBlackGrid, _mq1, _mq2, _mq3,
_shift, _pm, _M5, _b, _c, p)
{
int Ls = this->Ls;
RealD eps = 1.0;
Approx::zolotarev_data *zdata = Approx::higham(eps, this->Ls);
assert(zdata->n == this->Ls);
std::cout << GridLogMessage << "MobiusEOFAFermion (b=" << _b <<
",c=" << _c << ") with Ls=" << Ls << std::endl;
this->SetCoefficientsTanh(zdata, _b, _c);
std::cout << GridLogMessage << "EOFA parameters: (mq1=" << _mq1 <<
",mq2=" << _mq2 << ",mq3=" << _mq3 << ",shift=" << _shift <<
",pm=" << _pm << ")" << std::endl;
Approx::zolotarev_free(zdata);
if(_shift != 0.0){
SetCoefficientsPrecondShiftOps();
} else {
Mooee_shift.resize(Ls, 0.0);
MooeeInv_shift_lc.resize(Ls, 0.0);
MooeeInv_shift_norm.resize(Ls, 0.0);
MooeeInvDag_shift_lc.resize(Ls, 0.0);
MooeeInvDag_shift_norm.resize(Ls, 0.0);
}
}
/****************************************************************
* Additional EOFA operators only called outside the inverter.
* Since speed is not essential, simple axpby-style
* implementations should be fine.
***************************************************************/
template<class Impl>
void MobiusEOFAFermion<Impl>::Omega(const FermionField& psi, FermionField& Din, int sign, int dag)
{
int Ls = this->Ls;
RealD alpha = this->alpha;
Din = zero;
if((sign == 1) && (dag == 0)) { // \Omega_{+}
for(int s=0; s<Ls; ++s){
axpby_ssp(Din, 0.0, psi, 2.0*std::pow(1.0-alpha,Ls-s-1)/std::pow(1.0+alpha,Ls-s), psi, s, 0);
}
} else if((sign == -1) && (dag == 0)) { // \Omega_{-}
for(int s=0; s<Ls; ++s){
axpby_ssp(Din, 0.0, psi, 2.0*std::pow(1.0-alpha,s)/std::pow(1.0+alpha,s+1), psi, s, 0);
}
} else if((sign == 1 ) && (dag == 1)) { // \Omega_{+}^{\dagger}
for(int sp=0; sp<Ls; ++sp){
axpby_ssp(Din, 1.0, Din, 2.0*std::pow(1.0-alpha,Ls-sp-1)/std::pow(1.0+alpha,Ls-sp), psi, 0, sp);
}
} else if((sign == -1) && (dag == 1)) { // \Omega_{-}^{\dagger}
for(int sp=0; sp<Ls; ++sp){
axpby_ssp(Din, 1.0, Din, 2.0*std::pow(1.0-alpha,sp)/std::pow(1.0+alpha,sp+1), psi, 0, sp);
}
}
}
// This is the operator relating the usual Ddwf to TWQCD's EOFA Dirac operator (arXiv:1706.05843, Eqn. 6).
// It also relates the preconditioned and unpreconditioned systems described in Appendix B.2.
template<class Impl>
void MobiusEOFAFermion<Impl>::Dtilde(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
RealD b = 0.5 * ( 1.0 + this->alpha );
RealD c = 0.5 * ( 1.0 - this->alpha );
RealD mq1 = this->mq1;
for(int s=0; s<Ls; ++s){
if(s == 0) {
axpby_ssp_pminus(chi, b, psi, -c, psi, s, s+1);
axpby_ssp_pplus (chi, 1.0, chi, mq1*c, psi, s, Ls-1);
} else if(s == (Ls-1)) {
axpby_ssp_pminus(chi, b, psi, mq1*c, psi, s, 0);
axpby_ssp_pplus (chi, 1.0, chi, -c, psi, s, s-1);
} else {
axpby_ssp_pminus(chi, b, psi, -c, psi, s, s+1);
axpby_ssp_pplus (chi, 1.0, chi, -c, psi, s, s-1);
}
}
}
template<class Impl>
void MobiusEOFAFermion<Impl>::DtildeInv(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
RealD m = this->mq1;
RealD c = 0.5 * this->alpha;
RealD d = 0.5;
RealD DtInv_p(0.0), DtInv_m(0.0);
RealD N = std::pow(c+d,Ls) + m*std::pow(c-d,Ls);
FermionField tmp(this->FermionGrid());
for(int s=0; s<Ls; ++s){
for(int sp=0; sp<Ls; ++sp){
DtInv_p = m * std::pow(-1.0,s-sp+1) * std::pow(c-d,Ls+s-sp) / std::pow(c+d,s-sp+1) / N;
DtInv_p += (s < sp) ? 0.0 : std::pow(-1.0,s-sp) * std::pow(c-d,s-sp) / std::pow(c+d,s-sp+1);
DtInv_m = m * std::pow(-1.0,sp-s+1) * std::pow(c-d,Ls+sp-s) / std::pow(c+d,sp-s+1) / N;
DtInv_m += (s > sp) ? 0.0 : std::pow(-1.0,sp-s) * std::pow(c-d,sp-s) / std::pow(c+d,sp-s+1);
if(sp == 0){
axpby_ssp_pplus (tmp, 0.0, tmp, DtInv_p, psi, s, sp);
axpby_ssp_pminus(tmp, 0.0, tmp, DtInv_m, psi, s, sp);
} else {
axpby_ssp_pplus (tmp, 1.0, tmp, DtInv_p, psi, s, sp);
axpby_ssp_pminus(tmp, 1.0, tmp, DtInv_m, psi, s, sp);
}
}}
}
/*****************************************************************************************************/
template<class Impl>
RealD MobiusEOFAFermion<Impl>::M(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
FermionField Din(psi._grid);
this->Meooe5D(psi, Din);
this->DW(Din, chi, DaggerNo);
axpby(chi, 1.0, 1.0, chi, psi);
this->M5D(psi, chi);
return(norm2(chi));
}
template<class Impl>
RealD MobiusEOFAFermion<Impl>::Mdag(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
FermionField Din(psi._grid);
this->DW(psi, Din, DaggerYes);
this->MeooeDag5D(Din, chi);
this->M5Ddag(psi, chi);
axpby(chi, 1.0, 1.0, chi, psi);
return(norm2(chi));
}
/********************************************************************
* Performance critical fermion operators called inside the inverter
********************************************************************/
template<class Impl>
void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
std::vector<Coeff_t> diag(Ls,1.0);
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;
// no shift term
if(this->shift == 0.0){ this->M5D(psi, chi, chi, lower, diag, upper); }
// fused M + shift operation
else{ this->M5D_shift(psi, chi, chi, lower, diag, upper, Mooee_shift); }
}
template<class Impl>
void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
std::vector<Coeff_t> diag(Ls,1.0);
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;
// no shift term
if(this->shift == 0.0){ this->M5Ddag(psi, chi, chi, lower, diag, upper); }
// fused M + shift operation
else{ this->M5Ddag_shift(psi, chi, chi, lower, diag, upper, Mooee_shift); }
}
// half checkerboard operations
template<class Impl>
void MobiusEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
// coefficients of Mooee
std::vector<Coeff_t> diag = this->bee;
std::vector<Coeff_t> upper(Ls);
std::vector<Coeff_t> lower(Ls);
for(int s=0; s<Ls; s++){
upper[s] = -this->cee[s];
lower[s] = -this->cee[s];
}
upper[Ls-1] *= -this->mq1;
lower[0] *= -this->mq1;
// no shift term
if(this->shift == 0.0){ this->M5D(psi, psi, chi, lower, diag, upper); }
// fused M + shift operation
else { this->M5D_shift(psi, psi, chi, lower, diag, upper, Mooee_shift); }
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
// coefficients of MooeeDag
std::vector<Coeff_t> diag = this->bee;
std::vector<Coeff_t> upper(Ls);
std::vector<Coeff_t> lower(Ls);
for(int s=0; s<Ls; s++){
if(s==0) {
upper[s] = -this->cee[s+1];
lower[s] = this->mq1*this->cee[Ls-1];
} else if(s==(Ls-1)) {
upper[s] = this->mq1*this->cee[0];
lower[s] = -this->cee[s-1];
} else {
upper[s] = -this->cee[s+1];
lower[s] = -this->cee[s-1];
}
}
// no shift term
if(this->shift == 0.0){ this->M5Ddag(psi, psi, chi, lower, diag, upper); }
// fused M + shift operation
else{ this->M5Ddag_shift(psi, psi, chi, lower, diag, upper, Mooee_shift); }
}
/****************************************************************************************/
// Computes coefficients for applying Cayley preconditioned shift operators
// (Mooee + \Delta) --> Mooee_shift
// (Mooee + \Delta)^{-1} --> MooeeInv_shift_lc, MooeeInv_shift_norm
// (Mooee + \Delta)^{-dag} --> MooeeInvDag_shift_lc, MooeeInvDag_shift_norm
// For the latter two cases, the operation takes the form
// [ (Mooee + \Delta)^{-1} \psi ]_{i} = Mooee_{ij} \psi_{j} +
// ( MooeeInv_shift_norm )_{i} ( \sum_{j} [ MooeeInv_shift_lc ]_{j} P_{pm} \psi_{j} )
template<class Impl>
void MobiusEOFAFermion<Impl>::SetCoefficientsPrecondShiftOps()
{
int Ls = this->Ls;
int pm = this->pm;
RealD alpha = this->alpha;
RealD k = this->k;
RealD mq1 = this->mq1;
RealD shift = this->shift;
// Initialize
Mooee_shift.resize(Ls);
MooeeInv_shift_lc.resize(Ls);
MooeeInv_shift_norm.resize(Ls);
MooeeInvDag_shift_lc.resize(Ls);
MooeeInvDag_shift_norm.resize(Ls);
// Construct Mooee_shift
int idx(0);
Coeff_t N = ( (pm == 1) ? 1.0 : -1.0 ) * (2.0*shift*k) *
( std::pow(alpha+1.0,Ls) + mq1*std::pow(alpha-1.0,Ls) );
for(int s=0; s<Ls; ++s){
idx = (pm == 1) ? (s) : (Ls-1-s);
Mooee_shift[idx] = N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1);
}
// Tridiagonal solve for MooeeInvDag_shift_lc
{
Coeff_t m(0.0);
std::vector<Coeff_t> d = Mooee_shift;
std::vector<Coeff_t> u(Ls,0.0);
std::vector<Coeff_t> y(Ls,0.0);
std::vector<Coeff_t> q(Ls,0.0);
if(pm == 1){ u[0] = 1.0; }
else{ u[Ls-1] = 1.0; }
// Tridiagonal matrix algorithm + Sherman-Morrison formula
//
// We solve
// ( Mooee' + u \otimes v ) MooeeInvDag_shift_lc = Mooee_shift
// where Mooee' is the tridiagonal part of Mooee_{+}, and
// u = (1,0,...,0) and v = (0,...,0,mq1*cee[0]) are chosen
// so that the outer-product u \otimes v gives the (0,Ls-1)
// entry of Mooee_{+}.
//
// We do this as two solves: Mooee'*y = d and Mooee'*q = u,
// and then construct the solution to the original system
// MooeeInvDag_shift_lc = y - <v,y> / ( 1 + <v,q> ) q
if(pm == 1){
for(int s=1; s<Ls; ++s){
m = -this->cee[s] / this->bee[s-1];
d[s] -= m*d[s-1];
u[s] -= m*u[s-1];
}
}
y[Ls-1] = d[Ls-1] / this->bee[Ls-1];
q[Ls-1] = u[Ls-1] / this->bee[Ls-1];
for(int s=Ls-2; s>=0; --s){
if(pm == 1){
y[s] = d[s] / this->bee[s];
q[s] = u[s] / this->bee[s];
} else {
y[s] = ( d[s] + this->cee[s]*y[s+1] ) / this->bee[s];
q[s] = ( u[s] + this->cee[s]*q[s+1] ) / this->bee[s];
}
}
// Construct MooeeInvDag_shift_lc
for(int s=0; s<Ls; ++s){
if(pm == 1){
MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[0]*y[Ls-1] /
(1.0+mq1*this->cee[0]*q[Ls-1]) * q[s];
} else {
MooeeInvDag_shift_lc[s] = y[s] - mq1*this->cee[Ls-1]*y[0] /
(1.0+mq1*this->cee[Ls-1]*q[0]) * q[s];
}
}
// Compute remaining coefficients
N = (pm == 1) ? (1.0 + MooeeInvDag_shift_lc[Ls-1]) : (1.0 + MooeeInvDag_shift_lc[0]);
for(int s=0; s<Ls; ++s){
// MooeeInv_shift_lc
if(pm == 1){ MooeeInv_shift_lc[s] = std::pow(this->bee[s],s) * std::pow(this->cee[s],Ls-1-s); }
else{ MooeeInv_shift_lc[s] = std::pow(this->bee[s],Ls-1-s) * std::pow(this->cee[s],s); }
// MooeeInv_shift_norm
MooeeInv_shift_norm[s] = -MooeeInvDag_shift_lc[s] /
( std::pow(this->bee[s],Ls) + mq1*std::pow(this->cee[s],Ls) ) / N;
// MooeeInvDag_shift_norm
if(pm == 1){ MooeeInvDag_shift_norm[s] = -std::pow(this->bee[s],s) * std::pow(this->cee[s],Ls-1-s) /
( std::pow(this->bee[s],Ls) + mq1*std::pow(this->cee[s],Ls) ) / N; }
else{ MooeeInvDag_shift_norm[s] = -std::pow(this->bee[s],Ls-1-s) * std::pow(this->cee[s],s) /
( std::pow(this->bee[s],Ls) + mq1*std::pow(this->cee[s],Ls) ) / N; }
}
}
}
// Recompute coefficients for a different value of shift constant
template<class Impl>
void MobiusEOFAFermion<Impl>::RefreshShiftCoefficients(RealD new_shift)
{
this->shift = new_shift;
if(new_shift != 0.0){
SetCoefficientsPrecondShiftOps();
} else {
int Ls = this->Ls;
Mooee_shift.resize(Ls,0.0);
MooeeInv_shift_lc.resize(Ls,0.0);
MooeeInv_shift_norm.resize(Ls,0.0);
MooeeInvDag_shift_lc.resize(Ls,0.0);
MooeeInvDag_shift_norm.resize(Ls,0.0);
}
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInternalCompute(int dag, int inv,
Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
{
int Ls = this->Ls;
GridBase* grid = this->FermionRedBlackGrid();
int LLs = grid->_rdimensions[0];
if(LLs == Ls){ return; } // Not vectorised in 5th direction
Eigen::MatrixXcd Pplus = Eigen::MatrixXcd::Zero(Ls,Ls);
Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
for(int s=0; s<Ls; s++){
Pplus(s,s) = this->bee[s];
Pminus(s,s) = this->bee[s];
}
for(int s=0; s<Ls-1; s++){
Pminus(s,s+1) = -this->cee[s];
Pplus(s+1,s) = -this->cee[s+1];
}
Pplus (0,Ls-1) = this->mq1*this->cee[0];
Pminus(Ls-1,0) = this->mq1*this->cee[Ls-1];
if(this->shift != 0.0){
RealD c = 0.5 * this->alpha;
RealD d = 0.5;
RealD N = this->shift * this->k * ( std::pow(c+d,Ls) + this->mq1*std::pow(c-d,Ls) );
if(this->pm == 1) {
for(int s=0; s<Ls; ++s){
Pplus(s,Ls-1) += N * std::pow(-1.0,s) * std::pow(c-d,s) / std::pow(c+d,Ls+s+1);
}
} else {
for(int s=0; s<Ls; ++s){
Pminus(s,0) += N * std::pow(-1.0,s+1) * std::pow(c-d,Ls-1-s) / std::pow(c+d,2*Ls-s);
}
}
}
Eigen::MatrixXcd PplusMat ;
Eigen::MatrixXcd PminusMat;
if(inv) {
PplusMat = Pplus.inverse();
PminusMat = Pminus.inverse();
} else {
PplusMat = Pplus;
PminusMat = Pminus;
}
if(dag){
PplusMat.adjointInPlace();
PminusMat.adjointInPlace();
}
typedef typename SiteHalfSpinor::scalar_type scalar_type;
const int Nsimd = Simd::Nsimd();
Matp.resize(Ls*LLs);
Matm.resize(Ls*LLs);
for(int s2=0; s2<Ls; s2++){
for(int s1=0; s1<LLs; s1++){
int istride = LLs;
int ostride = 1;
Simd Vp;
Simd Vm;
scalar_type *sp = (scalar_type*) &Vp;
scalar_type *sm = (scalar_type*) &Vm;
for(int l=0; l<Nsimd; l++){
if(switcheroo<Coeff_t>::iscomplex()) {
sp[l] = PplusMat (l*istride+s1*ostride,s2);
sm[l] = PminusMat(l*istride+s1*ostride,s2);
} else {
// if real
scalar_type tmp;
tmp = PplusMat (l*istride+s1*ostride,s2);
sp[l] = scalar_type(tmp.real(),tmp.real());
tmp = PminusMat(l*istride+s1*ostride,s2);
sm[l] = scalar_type(tmp.real(),tmp.real());
}
}
Matp[LLs*s2+s1] = Vp;
Matm[LLs*s2+s1] = Vm;
}}
}
FermOpTemplateInstantiate(MobiusEOFAFermion);
GparityFermOpTemplateInstantiate(MobiusEOFAFermion);
}}

View File

@ -1,133 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/MobiusEOFAFermion.h
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_MOBIUS_EOFA_FERMION_H
#define GRID_QCD_MOBIUS_EOFA_FERMION_H
#include <Grid/qcd/action/fermion/AbstractEOFAFermion.h>
namespace Grid {
namespace QCD {
template<class Impl>
class MobiusEOFAFermion : public AbstractEOFAFermion<Impl>
{
public:
INHERIT_IMPL_TYPES(Impl);
public:
// Shift operator coefficients for red-black preconditioned Mobius EOFA
std::vector<Coeff_t> Mooee_shift;
std::vector<Coeff_t> MooeeInv_shift_lc;
std::vector<Coeff_t> MooeeInv_shift_norm;
std::vector<Coeff_t> MooeeInvDag_shift_lc;
std::vector<Coeff_t> MooeeInvDag_shift_norm;
virtual void Instantiatable(void) {};
// EOFA-specific operations
virtual void Omega (const FermionField& in, FermionField& out, int sign, int dag);
virtual void Dtilde (const FermionField& in, FermionField& out);
virtual void DtildeInv (const FermionField& in, FermionField& out);
// override multiply
virtual RealD M (const FermionField& in, FermionField& out);
virtual RealD Mdag (const FermionField& in, FermionField& out);
// half checkerboard operations
virtual void Mooee (const FermionField& in, FermionField& out);
virtual void MooeeDag (const FermionField& in, FermionField& out);
virtual void MooeeInv (const FermionField& in, FermionField& out);
virtual void MooeeInv_shift (const FermionField& in, FermionField& out);
virtual void MooeeInvDag (const FermionField& in, FermionField& out);
virtual void MooeeInvDag_shift(const FermionField& in, FermionField& out);
virtual void M5D (const FermionField& psi, FermionField& chi);
virtual void M5Ddag (const FermionField& psi, FermionField& chi);
/////////////////////////////////////////////////////
// Instantiate different versions depending on Impl
/////////////////////////////////////////////////////
void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
std::vector<Coeff_t>& shift_coeffs);
void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
std::vector<Coeff_t>& shift_coeffs);
void MooeeInternal(const FermionField& in, FermionField& out, int dag, int inv);
void MooeeInternalCompute(int dag, int inv, Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
void MooeeInternalAsm(const FermionField& in, FermionField& out, int LLs, int site,
Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
void MooeeInternalZAsm(const FermionField& in, FermionField& out, int LLs, int site,
Vector<iSinglet<Simd>>& Matp, Vector<iSinglet<Simd>>& Matm);
virtual void RefreshShiftCoefficients(RealD new_shift);
// Constructors
MobiusEOFAFermion(GaugeField& _Umu, GridCartesian& FiveDimGrid, GridRedBlackCartesian& FiveDimRedBlackGrid,
GridCartesian& FourDimGrid, GridRedBlackCartesian& FourDimRedBlackGrid,
RealD _mq1, RealD _mq2, RealD _mq3, RealD _shift, int pm,
RealD _M5, RealD _b, RealD _c, const ImplParams& p=ImplParams());
protected:
void SetCoefficientsPrecondShiftOps(void);
};
}}
#define INSTANTIATE_DPERP_MOBIUS_EOFA(A)\
template void MobiusEOFAFermion<A>::M5D(const FermionField& psi, const FermionField& phi, FermionField& chi, \
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
template void MobiusEOFAFermion<A>::M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, \
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper, std::vector<Coeff_t>& shift_coeffs); \
template void MobiusEOFAFermion<A>::M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi, \
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper); \
template void MobiusEOFAFermion<A>::M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi, \
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper, std::vector<Coeff_t>& shift_coeffs); \
template void MobiusEOFAFermion<A>::MooeeInv(const FermionField& psi, FermionField& chi); \
template void MobiusEOFAFermion<A>::MooeeInv_shift(const FermionField& psi, FermionField& chi); \
template void MobiusEOFAFermion<A>::MooeeInvDag(const FermionField& psi, FermionField& chi); \
template void MobiusEOFAFermion<A>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi);
#undef MOBIUS_EOFA_DPERP_DENSE
#define MOBIUS_EOFA_DPERP_CACHE
#undef MOBIUS_EOFA_DPERP_LINALG
#define MOBIUS_EOFA_DPERP_VEC
#endif

View File

@ -1,429 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/MobiusEOFAFermioncache.cc
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
namespace Grid {
namespace QCD {
// FIXME -- make a version of these routines with site loop outermost for cache reuse.
template<class Impl>
void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi, const FermionField &phi, FermionField &chi,
std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
{
int Ls = this->Ls;
GridBase *grid = psi._grid;
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard = psi.checkerboard;
// Flops = 6.0*(Nc*Ns) *Ls*vol
this->M5Dcalls++;
this->M5Dtime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
for(int s=0; s<Ls; s++){
auto tmp = psi._odata[0];
if(s==0){
spProj5m(tmp, psi._odata[ss+s+1]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5p(tmp, psi._odata[ss+Ls-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
} else if(s==(Ls-1)) {
spProj5m(tmp, psi._odata[ss+0]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5p(tmp, psi._odata[ss+s-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
} else {
spProj5m(tmp, psi._odata[ss+s+1]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5p(tmp, psi._odata[ss+s-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
}
}
}
this->M5Dtime += usecond();
}
template<class Impl>
void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi, const FermionField &phi, FermionField &chi,
std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
std::vector<Coeff_t> &shift_coeffs)
{
int Ls = this->Ls;
int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
GridBase *grid = psi._grid;
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard = psi.checkerboard;
// Flops = 6.0*(Nc*Ns) *Ls*vol
this->M5Dcalls++;
this->M5Dtime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
for(int s=0; s<Ls; s++){
auto tmp = psi._odata[0];
if(s==0){
spProj5m(tmp, psi._odata[ss+s+1]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5p(tmp, psi._odata[ss+Ls-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
} else if(s==(Ls-1)) {
spProj5m(tmp, psi._odata[ss+0]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5p(tmp, psi._odata[ss+s-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
} else {
spProj5m(tmp, psi._odata[ss+s+1]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5p(tmp, psi._odata[ss+s-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
}
if(this->pm == 1){ spProj5p(tmp, psi._odata[ss+shift_s]); }
else{ spProj5m(tmp, psi._odata[ss+shift_s]); }
chi[ss+s] = chi[ss+s] + shift_coeffs[s]*tmp;
}
}
this->M5Dtime += usecond();
}
template<class Impl>
void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi, const FermionField &phi, FermionField &chi,
std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
{
int Ls = this->Ls;
GridBase *grid = psi._grid;
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard = psi.checkerboard;
// Flops = 6.0*(Nc*Ns) *Ls*vol
this->M5Dcalls++;
this->M5Dtime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
auto tmp = psi._odata[0];
for(int s=0; s<Ls; s++){
if(s==0) {
spProj5p(tmp, psi._odata[ss+s+1]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5m(tmp, psi._odata[ss+Ls-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
} else if(s==(Ls-1)) {
spProj5p(tmp, psi._odata[ss+0]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5m(tmp, psi._odata[ss+s-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
} else {
spProj5p(tmp, psi._odata[ss+s+1]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5m(tmp, psi._odata[ss+s-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
}
}
}
this->M5Dtime += usecond();
}
template<class Impl>
void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi, const FermionField &phi, FermionField &chi,
std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
std::vector<Coeff_t> &shift_coeffs)
{
int Ls = this->Ls;
int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
GridBase *grid = psi._grid;
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard = psi.checkerboard;
// Flops = 6.0*(Nc*Ns) *Ls*vol
this->M5Dcalls++;
this->M5Dtime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
chi[ss+Ls-1] = zero;
auto tmp = psi._odata[0];
for(int s=0; s<Ls; s++){
if(s==0) {
spProj5p(tmp, psi._odata[ss+s+1]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5m(tmp, psi._odata[ss+Ls-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
} else if(s==(Ls-1)) {
spProj5p(tmp, psi._odata[ss+0]);
chi[ss+s] = chi[ss+s] + diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5m(tmp, psi._odata[ss+s-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
} else {
spProj5p(tmp, psi._odata[ss+s+1]);
chi[ss+s] = diag[s]*phi[ss+s] + upper[s]*tmp;
spProj5m(tmp, psi._odata[ss+s-1]);
chi[ss+s] = chi[ss+s] + lower[s]*tmp;
}
if(this->pm == 1){ spProj5p(tmp, psi._odata[ss+s]); }
else{ spProj5m(tmp, psi._odata[ss+s]); }
chi[ss+shift_s] = chi[ss+shift_s] + shift_coeffs[s]*tmp;
}
}
this->M5Dtime += usecond();
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
{
if(this->shift != 0.0){ MooeeInv_shift(psi,chi); return; }
GridBase *grid = psi._grid;
int Ls = this->Ls;
chi.checkerboard = psi.checkerboard;
this->MooeeInvCalls++;
this->MooeeInvTime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
auto tmp = psi._odata[0];
// Apply (L^{\prime})^{-1}
chi[ss] = psi[ss]; // chi[0]=psi[0]
for(int s=1; s<Ls; s++){
spProj5p(tmp, chi[ss+s-1]);
chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp;
}
// L_m^{-1}
for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
spProj5m(tmp, chi[ss+s]);
chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp;
}
// U_m^{-1} D^{-1}
for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
spProj5p(tmp, chi[ss+Ls-1]);
chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls-1])*tmp;
}
chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
// Apply U^{-1}
for(int s=Ls-2; s>=0; s--){
spProj5m(tmp, chi[ss+s+1]);
chi[ss+s] = chi[ss+s] - this->uee[s]*tmp;
}
}
this->MooeeInvTime += usecond();
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi, FermionField &chi)
{
GridBase *grid = psi._grid;
int Ls = this->Ls;
chi.checkerboard = psi.checkerboard;
this->MooeeInvCalls++;
this->MooeeInvTime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
auto tmp1 = psi._odata[0];
auto tmp2 = psi._odata[0];
auto tmp2_spProj = psi._odata[0];
// Apply (L^{\prime})^{-1} and accumulate MooeeInv_shift_lc[j]*psi[j] in tmp2
chi[ss] = psi[ss]; // chi[0]=psi[0]
tmp2 = MooeeInv_shift_lc[0]*psi[ss];
for(int s=1; s<Ls; s++){
spProj5p(tmp1, chi[ss+s-1]);
chi[ss+s] = psi[ss+s] - this->lee[s-1]*tmp1;
tmp2 = tmp2 + MooeeInv_shift_lc[s]*psi[ss+s];
}
if(this->pm == 1){ spProj5p(tmp2_spProj, tmp2);}
else{ spProj5m(tmp2_spProj, tmp2); }
// L_m^{-1}
for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
spProj5m(tmp1, chi[ss+s]);
chi[ss+Ls-1] = chi[ss+Ls-1] - this->leem[s]*tmp1;
}
// U_m^{-1} D^{-1}
for(int s=0; s<Ls-1; s++){ // Chi[s] + 1/d chi[s]
spProj5p(tmp1, chi[ss+Ls-1]);
chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->ueem[s]/this->dee[Ls-1])*tmp1;
}
// chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj;
chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
spProj5m(tmp1, chi[ss+Ls-1]);
chi[ss+Ls-1] = chi[ss+Ls-1] + MooeeInv_shift_norm[Ls-1]*tmp2_spProj;
// Apply U^{-1} and add shift term
for(int s=Ls-2; s>=0; s--){
chi[ss+s] = chi[ss+s] - this->uee[s]*tmp1;
spProj5m(tmp1, chi[ss+s]);
chi[ss+s] = chi[ss+s] + MooeeInv_shift_norm[s]*tmp2_spProj;
}
}
this->MooeeInvTime += usecond();
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi, FermionField &chi)
{
if(this->shift != 0.0){ MooeeInvDag_shift(psi,chi); return; }
GridBase *grid = psi._grid;
int Ls = this->Ls;
chi.checkerboard = psi.checkerboard;
this->MooeeInvCalls++;
this->MooeeInvTime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
auto tmp = psi._odata[0];
// Apply (U^{\prime})^{-dag}
chi[ss] = psi[ss];
for(int s=1; s<Ls; s++){
spProj5m(tmp, chi[ss+s-1]);
chi[ss+s] = psi[ss+s] - this->uee[s-1]*tmp;
}
// U_m^{-\dag}
for(int s=0; s<Ls-1; s++){
spProj5p(tmp, chi[ss+s]);
chi[ss+Ls-1] = chi[ss+Ls-1] - this->ueem[s]*tmp;
}
// L_m^{-\dag} D^{-dag}
for(int s=0; s<Ls-1; s++){
spProj5m(tmp, chi[ss+Ls-1]);
chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->leem[s]/this->dee[Ls-1])*tmp;
}
chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
// Apply L^{-dag}
for(int s=Ls-2; s>=0; s--){
spProj5p(tmp, chi[ss+s+1]);
chi[ss+s] = chi[ss+s] - this->lee[s]*tmp;
}
}
this->MooeeInvTime += usecond();
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi, FermionField &chi)
{
GridBase *grid = psi._grid;
int Ls = this->Ls;
chi.checkerboard = psi.checkerboard;
this->MooeeInvCalls++;
this->MooeeInvTime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=Ls){
auto tmp1 = psi._odata[0];
auto tmp2 = psi._odata[0];
auto tmp2_spProj = psi._odata[0];
// Apply (U^{\prime})^{-dag} and accumulate MooeeInvDag_shift_lc[j]*psi[j] in tmp2
chi[ss] = psi[ss];
tmp2 = MooeeInvDag_shift_lc[0]*psi[ss];
for(int s=1; s<Ls; s++){
spProj5m(tmp1, chi[ss+s-1]);
chi[ss+s] = psi[ss+s] - this->uee[s-1]*tmp1;
tmp2 = tmp2 + MooeeInvDag_shift_lc[s]*psi[ss+s];
}
if(this->pm == 1){ spProj5p(tmp2_spProj, tmp2);}
else{ spProj5m(tmp2_spProj, tmp2); }
// U_m^{-\dag}
for(int s=0; s<Ls-1; s++){
spProj5p(tmp1, chi[ss+s]);
chi[ss+Ls-1] = chi[ss+Ls-1] - this->ueem[s]*tmp1;
}
// L_m^{-\dag} D^{-dag}
for(int s=0; s<Ls-1; s++){
spProj5m(tmp1, chi[ss+Ls-1]);
chi[ss+s] = (1.0/this->dee[s])*chi[ss+s] - (this->leem[s]/this->dee[Ls-1])*tmp1;
}
chi[ss+Ls-1] = (1.0/this->dee[Ls-1])*chi[ss+Ls-1];
spProj5p(tmp1, chi[ss+Ls-1]);
chi[ss+Ls-1] = chi[ss+Ls-1] + MooeeInvDag_shift_norm[Ls-1]*tmp2_spProj;
// Apply L^{-dag}
for(int s=Ls-2; s>=0; s--){
chi[ss+s] = chi[ss+s] - this->lee[s]*tmp1;
spProj5p(tmp1, chi[ss+s]);
chi[ss+s] = chi[ss+s] + MooeeInvDag_shift_norm[s]*tmp2_spProj;
}
}
this->MooeeInvTime += usecond();
}
#ifdef MOBIUS_EOFA_DPERP_CACHE
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
#endif
}}

View File

@ -1,184 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/MobiusEOFAFermiondense.cc
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid_Eigen_Dense.h>
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
namespace Grid {
namespace QCD {
/*
* Dense matrix versions of routines
*/
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
{
this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
{
this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
{
this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
{
this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
{
int Ls = this->Ls;
int LLs = psi._grid->_rdimensions[0];
int vol = psi._grid->oSites()/LLs;
int pm = this->pm;
RealD shift = this->shift;
RealD alpha = this->alpha;
RealD k = this->k;
RealD mq1 = this->mq1;
chi.checkerboard = psi.checkerboard;
assert(Ls==LLs);
Eigen::MatrixXd Pplus = Eigen::MatrixXd::Zero(Ls,Ls);
Eigen::MatrixXd Pminus = Eigen::MatrixXd::Zero(Ls,Ls);
for(int s=0;s<Ls;s++){
Pplus(s,s) = this->bee[s];
Pminus(s,s) = this->bee[s];
}
for(int s=0; s<Ls-1; s++){
Pminus(s,s+1) = -this->cee[s];
}
for(int s=0; s<Ls-1; s++){
Pplus(s+1,s) = -this->cee[s+1];
}
Pplus (0,Ls-1) = mq1*this->cee[0];
Pminus(Ls-1,0) = mq1*this->cee[Ls-1];
if(shift != 0.0){
Coeff_t N = 2.0 * ( std::pow(alpha+1.0,Ls) + mq1*std::pow(alpha-1.0,Ls) );
for(int s=0; s<Ls; ++s){
if(pm == 1){ Pplus(s,Ls-1) += shift * k * N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1); }
else{ Pminus(Ls-1-s,Ls-1) -= shift * k * N * std::pow(-1.0,s) * std::pow(alpha-1.0,s) / std::pow(alpha+1.0,Ls+s+1); }
}
}
Eigen::MatrixXd PplusMat ;
Eigen::MatrixXd PminusMat;
if(inv){
PplusMat = Pplus.inverse();
PminusMat = Pminus.inverse();
} else {
PplusMat = Pplus;
PminusMat = Pminus;
}
if(dag){
PplusMat.adjointInPlace();
PminusMat.adjointInPlace();
}
// For the non-vectorised s-direction this is simple
for(auto site=0; site<vol; site++){
SiteSpinor SiteChi;
SiteHalfSpinor SitePplus;
SiteHalfSpinor SitePminus;
for(int s1=0; s1<Ls; s1++){
SiteChi = zero;
for(int s2=0; s2<Ls; s2++){
int lex2 = s2 + Ls*site;
if(PplusMat(s1,s2) != 0.0){
spProj5p(SitePplus,psi[lex2]);
accumRecon5p(SiteChi, PplusMat(s1,s2)*SitePplus);
}
if(PminusMat(s1,s2) != 0.0){
spProj5m(SitePminus, psi[lex2]);
accumRecon5m(SiteChi, PminusMat(s1,s2)*SitePminus);
}
}
chi[s1+Ls*site] = SiteChi*0.5;
}
}
}
#ifdef MOBIUS_EOFA_DPERP_DENSE
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
template void MobiusEOFAFermion<GparityWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<GparityWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<WilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<WilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<ZWilsonImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<ZWilsonImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
template void MobiusEOFAFermion<GparityWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<GparityWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<WilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<WilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<ZWilsonImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<ZWilsonImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
#endif
}}

View File

@ -1,290 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/MobiusEOFAFermionssp.cc
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
namespace Grid {
namespace QCD {
// FIXME -- make a version of these routines with site loop outermost for cache reuse.
// Pminus fowards
// Pplus backwards
template<class Impl>
void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
{
Coeff_t one(1.0);
int Ls = this->Ls;
for(int s=0; s<Ls; s++){
if(s==0) {
axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, Ls-1);
} else if (s==(Ls-1)) {
axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, 0);
axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, s-1);
} else {
axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
axpby_ssp_pplus(chi, one, chi, lower[s], psi, s, s-1);
}
}
}
template<class Impl>
void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
std::vector<Coeff_t>& shift_coeffs)
{
Coeff_t one(1.0);
int Ls = this->Ls;
for(int s=0; s<Ls; s++){
if(s==0) {
axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, Ls-1);
} else if (s==(Ls-1)) {
axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, 0);
axpby_ssp_pplus (chi, one, chi, lower[s], psi, s, s-1);
} else {
axpby_ssp_pminus(chi, diag[s], phi, upper[s], psi, s, s+1);
axpby_ssp_pplus(chi, one, chi, lower[s], psi, s, s-1);
}
if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, s, Ls-1); }
else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, s, 0); }
}
}
template<class Impl>
void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
{
Coeff_t one(1.0);
int Ls = this->Ls;
for(int s=0; s<Ls; s++){
if(s==0) {
axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, Ls-1);
} else if (s==(Ls-1)) {
axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, 0);
axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
} else {
axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
}
}
}
template<class Impl>
void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
std::vector<Coeff_t>& shift_coeffs)
{
Coeff_t one(1.0);
int Ls = this->Ls;
for(int s=0; s<Ls; s++){
if(s==0) {
axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, Ls-1);
} else if (s==(Ls-1)) {
axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, 0);
axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
} else {
axpby_ssp_pplus (chi, diag[s], phi, upper[s], psi, s, s+1);
axpby_ssp_pminus(chi, one, chi, lower[s], psi, s, s-1);
}
if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, Ls-1, s); }
else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, 0, s); }
}
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
{
if(this->shift != 0.0){ MooeeInv_shift(psi,chi); return; }
Coeff_t one(1.0);
Coeff_t czero(0.0);
chi.checkerboard = psi.checkerboard;
int Ls = this->Ls;
// Apply (L^{\prime})^{-1}
axpby_ssp(chi, one, psi, czero, psi, 0, 0); // chi[0]=psi[0]
for(int s=1; s<Ls; s++){
axpby_ssp_pplus(chi, one, psi, -this->lee[s-1], chi, s, s-1);// recursion Psi[s] -lee P_+ chi[s-1]
}
// L_m^{-1}
for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
axpby_ssp_pminus(chi, one, chi, -this->leem[s], chi, Ls-1, s);
}
// U_m^{-1} D^{-1}
for(int s=0; s<Ls-1; s++){
axpby_ssp_pplus(chi, one/this->dee[s], chi, -this->ueem[s]/this->dee[Ls-1], chi, s, Ls-1);
}
axpby_ssp(chi, one/this->dee[Ls-1], chi, czero, chi, Ls-1, Ls-1);
// Apply U^{-1}
for(int s=Ls-2; s>=0; s--){
axpby_ssp_pminus(chi, one, chi, -this->uee[s], chi, s, s+1); // chi[Ls]
}
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
{
Coeff_t one(1.0);
Coeff_t czero(0.0);
chi.checkerboard = psi.checkerboard;
int Ls = this->Ls;
FermionField tmp(psi._grid);
// Apply (L^{\prime})^{-1}
axpby_ssp(chi, one, psi, czero, psi, 0, 0); // chi[0]=psi[0]
axpby_ssp(tmp, czero, tmp, this->MooeeInv_shift_lc[0], psi, 0, 0);
for(int s=1; s<Ls; s++){
axpby_ssp_pplus(chi, one, psi, -this->lee[s-1], chi, s, s-1);// recursion Psi[s] -lee P_+ chi[s-1]
axpby_ssp(tmp, one, tmp, this->MooeeInv_shift_lc[s], psi, 0, s);
}
// L_m^{-1}
for(int s=0; s<Ls-1; s++){ // Chi[ee] = 1 - sum[s<Ls-1] -leem[s]P_- chi
axpby_ssp_pminus(chi, one, chi, -this->leem[s], chi, Ls-1, s);
}
// U_m^{-1} D^{-1}
for(int s=0; s<Ls-1; s++){
axpby_ssp_pplus(chi, one/this->dee[s], chi, -this->ueem[s]/this->dee[Ls-1], chi, s, Ls-1);
}
axpby_ssp(chi, one/this->dee[Ls-1], chi, czero, chi, Ls-1, Ls-1);
// Apply U^{-1} and add shift term
if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInv_shift_norm[Ls-1], tmp, Ls-1, 0); }
else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInv_shift_norm[Ls-1], tmp, Ls-1, 0); }
for(int s=Ls-2; s>=0; s--){
axpby_ssp_pminus(chi, one, chi, -this->uee[s], chi, s, s+1); // chi[Ls]
if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInv_shift_norm[s], tmp, s, 0); }
else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInv_shift_norm[s], tmp, s, 0); }
}
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
{
if(this->shift != 0.0){ MooeeInvDag_shift(psi,chi); return; }
Coeff_t one(1.0);
Coeff_t czero(0.0);
chi.checkerboard = psi.checkerboard;
int Ls = this->Ls;
// Apply (U^{\prime})^{-dagger}
axpby_ssp(chi, one, psi, czero, psi, 0, 0); // chi[0]=psi[0]
for(int s=1; s<Ls; s++){
axpby_ssp_pminus(chi, one, psi, -conjugate(this->uee[s-1]), chi, s, s-1);
}
// U_m^{-\dagger}
for(int s=0; s<Ls-1; s++){
axpby_ssp_pplus(chi, one, chi, -conjugate(this->ueem[s]), chi, Ls-1, s);
}
// L_m^{-\dagger} D^{-dagger}
for(int s=0; s<Ls-1; s++){
axpby_ssp_pminus(chi, one/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[Ls-1]), chi, s, Ls-1);
}
axpby_ssp(chi, one/conjugate(this->dee[Ls-1]), chi, czero, chi, Ls-1, Ls-1);
// Apply L^{-dagger}
for(int s=Ls-2; s>=0; s--){
axpby_ssp_pplus(chi, one, chi, -conjugate(this->lee[s]), chi, s, s+1); // chi[Ls]
}
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
{
Coeff_t one(1.0);
Coeff_t czero(0.0);
chi.checkerboard = psi.checkerboard;
int Ls = this->Ls;
FermionField tmp(psi._grid);
// Apply (U^{\prime})^{-dagger} and accumulate (MooeeInvDag_shift_lc)_{j} \psi_{j} in tmp[0]
axpby_ssp(chi, one, psi, czero, psi, 0, 0); // chi[0]=psi[0]
axpby_ssp(tmp, czero, tmp, this->MooeeInvDag_shift_lc[0], psi, 0, 0);
for(int s=1; s<Ls; s++){
axpby_ssp_pminus(chi, one, psi, -conjugate(this->uee[s-1]), chi, s, s-1);
axpby_ssp(tmp, one, tmp, this->MooeeInvDag_shift_lc[s], psi, 0, s);
}
// U_m^{-\dagger}
for(int s=0; s<Ls-1; s++){
axpby_ssp_pplus(chi, one, chi, -conjugate(this->ueem[s]), chi, Ls-1, s);
}
// L_m^{-\dagger} D^{-dagger}
for(int s=0; s<Ls-1; s++){
axpby_ssp_pminus(chi, one/conjugate(this->dee[s]), chi, -conjugate(this->leem[s]/this->dee[Ls-1]), chi, s, Ls-1);
}
axpby_ssp(chi, one/conjugate(this->dee[Ls-1]), chi, czero, chi, Ls-1, Ls-1);
// Apply L^{-dagger} and add shift
if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInvDag_shift_norm[Ls-1], tmp, Ls-1, 0); }
else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInvDag_shift_norm[Ls-1], tmp, Ls-1, 0); }
for(int s=Ls-2; s>=0; s--){
axpby_ssp_pplus(chi, one, chi, -conjugate(this->lee[s]), chi, s, s+1); // chi[Ls]
if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, this->MooeeInvDag_shift_norm[s], tmp, s, 0); }
else{ axpby_ssp_pminus(chi, one, chi, this->MooeeInvDag_shift_norm[s], tmp, s, 0); }
}
}
#ifdef MOBIUS_EOFA_DPERP_LINALG
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplF);
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplD);
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplF);
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplD);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplF);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplD);
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplFH);
INSTANTIATE_DPERP_MOBIUS_EOFA(WilsonImplDF);
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplFH);
INSTANTIATE_DPERP_MOBIUS_EOFA(GparityWilsonImplDF);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplFH);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZWilsonImplDF);
#endif
}}

View File

@ -1,983 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/MobiusEOFAFermionvec.cc
Copyright (C) 2017
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/MobiusEOFAFermion.h>
namespace Grid {
namespace QCD {
/*
* Dense matrix versions of routines
*/
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField& psi, FermionField& chi)
{
this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField& psi, FermionField& chi)
{
this->MooeeInternal(psi, chi, DaggerNo, InverseYes);
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi, FermionField& chi)
{
this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
}
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField& psi, FermionField& chi)
{
this->MooeeInternal(psi, chi, DaggerYes, InverseYes);
}
template<class Impl>
void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
{
GridBase* grid = psi._grid;
int Ls = this->Ls;
int LLs = grid->_rdimensions[0];
const int nsimd = Simd::Nsimd();
Vector<iSinglet<Simd>> u(LLs);
Vector<iSinglet<Simd>> l(LLs);
Vector<iSinglet<Simd>> d(LLs);
assert(Ls/LLs == nsimd);
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard = psi.checkerboard;
// just directly address via type pun
typedef typename Simd::scalar_type scalar_type;
scalar_type* u_p = (scalar_type*) &u[0];
scalar_type* l_p = (scalar_type*) &l[0];
scalar_type* d_p = (scalar_type*) &d[0];
for(int o=0; o<LLs; o++){ // outer
for(int i=0; i<nsimd; i++){ //inner
int s = o + i*LLs;
int ss = o*nsimd + i;
u_p[ss] = upper[s];
l_p[ss] = lower[s];
d_p[ss] = diag[s];
}}
this->M5Dcalls++;
this->M5Dtime -= usecond();
assert(Nc == 3);
parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
#if 0
alignas(64) SiteHalfSpinor hp;
alignas(64) SiteHalfSpinor hm;
alignas(64) SiteSpinor fp;
alignas(64) SiteSpinor fm;
for(int v=0; v<LLs; v++){
int vp = (v+1)%LLs;
int vm = (v+LLs-1)%LLs;
spProj5m(hp, psi[ss+vp]);
spProj5p(hm, psi[ss+vm]);
if (vp <= v){ rotate(hp, hp, 1); }
if (vm >= v){ rotate(hm, hm, nsimd-1); }
hp = 0.5*hp;
hm = 0.5*hm;
spRecon5m(fp, hp);
spRecon5p(fm, hm);
chi[ss+v] = d[v]*phi[ss+v];
chi[ss+v] = chi[ss+v] + u[v]*fp;
chi[ss+v] = chi[ss+v] + l[v]*fm;
}
#else
for(int v=0; v<LLs; v++){
vprefetch(psi[ss+v+LLs]);
int vp = (v == LLs-1) ? 0 : v+1;
int vm = (v == 0) ? LLs-1 : v-1;
Simd hp_00 = psi[ss+vp]()(2)(0);
Simd hp_01 = psi[ss+vp]()(2)(1);
Simd hp_02 = psi[ss+vp]()(2)(2);
Simd hp_10 = psi[ss+vp]()(3)(0);
Simd hp_11 = psi[ss+vp]()(3)(1);
Simd hp_12 = psi[ss+vp]()(3)(2);
Simd hm_00 = psi[ss+vm]()(0)(0);
Simd hm_01 = psi[ss+vm]()(0)(1);
Simd hm_02 = psi[ss+vm]()(0)(2);
Simd hm_10 = psi[ss+vm]()(1)(0);
Simd hm_11 = psi[ss+vm]()(1)(1);
Simd hm_12 = psi[ss+vm]()(1)(2);
if(vp <= v){
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
}
if(vm >= v){
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
}
// Can force these to real arithmetic and save 2x.
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
vstream(chi[ss+v]()(0)(0), p_00);
vstream(chi[ss+v]()(0)(1), p_01);
vstream(chi[ss+v]()(0)(2), p_02);
vstream(chi[ss+v]()(1)(0), p_10);
vstream(chi[ss+v]()(1)(1), p_11);
vstream(chi[ss+v]()(1)(2), p_12);
vstream(chi[ss+v]()(2)(0), p_20);
vstream(chi[ss+v]()(2)(1), p_21);
vstream(chi[ss+v]()(2)(2), p_22);
vstream(chi[ss+v]()(3)(0), p_30);
vstream(chi[ss+v]()(3)(1), p_31);
vstream(chi[ss+v]()(3)(2), p_32);
}
#endif
}
this->M5Dtime += usecond();
}
template<class Impl>
void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
std::vector<Coeff_t>& shift_coeffs)
{
#if 0
this->M5D(psi, phi, chi, lower, diag, upper);
// FIXME: possible gain from vectorizing shift operation as well?
Coeff_t one(1.0);
int Ls = this->Ls;
for(int s=0; s<Ls; s++){
if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, s, Ls-1); }
else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, s, 0); }
}
#else
GridBase* grid = psi._grid;
int Ls = this->Ls;
int LLs = grid->_rdimensions[0];
const int nsimd = Simd::Nsimd();
Vector<iSinglet<Simd>> u(LLs);
Vector<iSinglet<Simd>> l(LLs);
Vector<iSinglet<Simd>> d(LLs);
Vector<iSinglet<Simd>> s(LLs);
assert(Ls/LLs == nsimd);
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard = psi.checkerboard;
// just directly address via type pun
typedef typename Simd::scalar_type scalar_type;
scalar_type* u_p = (scalar_type*) &u[0];
scalar_type* l_p = (scalar_type*) &l[0];
scalar_type* d_p = (scalar_type*) &d[0];
scalar_type* s_p = (scalar_type*) &s[0];
for(int o=0; o<LLs; o++){ // outer
for(int i=0; i<nsimd; i++){ //inner
int s = o + i*LLs;
int ss = o*nsimd + i;
u_p[ss] = upper[s];
l_p[ss] = lower[s];
d_p[ss] = diag[s];
s_p[ss] = shift_coeffs[s];
}}
this->M5Dcalls++;
this->M5Dtime -= usecond();
assert(Nc == 3);
parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
int vs = (this->pm == 1) ? LLs-1 : 0;
Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(2)(0) : psi[ss+vs]()(0)(0);
Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(2)(1) : psi[ss+vs]()(0)(1);
Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(2)(2) : psi[ss+vs]()(0)(2);
Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(3)(0) : psi[ss+vs]()(1)(0);
Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(3)(1) : psi[ss+vs]()(1)(1);
Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(3)(2) : psi[ss+vs]()(1)(2);
for(int v=0; v<LLs; v++){
vprefetch(psi[ss+v+LLs]);
int vp = (v == LLs-1) ? 0 : v+1;
int vm = (v == 0) ? LLs-1 : v-1;
Simd hp_00 = psi[ss+vp]()(2)(0);
Simd hp_01 = psi[ss+vp]()(2)(1);
Simd hp_02 = psi[ss+vp]()(2)(2);
Simd hp_10 = psi[ss+vp]()(3)(0);
Simd hp_11 = psi[ss+vp]()(3)(1);
Simd hp_12 = psi[ss+vp]()(3)(2);
Simd hm_00 = psi[ss+vm]()(0)(0);
Simd hm_01 = psi[ss+vm]()(0)(1);
Simd hm_02 = psi[ss+vm]()(0)(2);
Simd hm_10 = psi[ss+vm]()(1)(0);
Simd hm_11 = psi[ss+vm]()(1)(1);
Simd hm_12 = psi[ss+vm]()(1)(2);
if(vp <= v){
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
}
if(this->pm == 1 && vs <= v){
hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
}
if(vm >= v){
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
}
if(this->pm == -1 && vs >= v){
hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
}
// Can force these to real arithmetic and save 2x.
Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
vstream(chi[ss+v]()(0)(0), p_00);
vstream(chi[ss+v]()(0)(1), p_01);
vstream(chi[ss+v]()(0)(2), p_02);
vstream(chi[ss+v]()(1)(0), p_10);
vstream(chi[ss+v]()(1)(1), p_11);
vstream(chi[ss+v]()(1)(2), p_12);
vstream(chi[ss+v]()(2)(0), p_20);
vstream(chi[ss+v]()(2)(1), p_21);
vstream(chi[ss+v]()(2)(2), p_22);
vstream(chi[ss+v]()(3)(0), p_30);
vstream(chi[ss+v]()(3)(1), p_31);
vstream(chi[ss+v]()(3)(2), p_32);
}
}
this->M5Dtime += usecond();
#endif
}
template<class Impl>
void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
{
GridBase* grid = psi._grid;
int Ls = this->Ls;
int LLs = grid->_rdimensions[0];
int nsimd = Simd::Nsimd();
Vector<iSinglet<Simd>> u(LLs);
Vector<iSinglet<Simd>> l(LLs);
Vector<iSinglet<Simd>> d(LLs);
assert(Ls/LLs == nsimd);
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard = psi.checkerboard;
// just directly address via type pun
typedef typename Simd::scalar_type scalar_type;
scalar_type* u_p = (scalar_type*) &u[0];
scalar_type* l_p = (scalar_type*) &l[0];
scalar_type* d_p = (scalar_type*) &d[0];
for(int o=0; o<LLs; o++){ // outer
for(int i=0; i<nsimd; i++){ //inner
int s = o + i*LLs;
int ss = o*nsimd + i;
u_p[ss] = upper[s];
l_p[ss] = lower[s];
d_p[ss] = diag[s];
}}
this->M5Dcalls++;
this->M5Dtime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
#if 0
alignas(64) SiteHalfSpinor hp;
alignas(64) SiteHalfSpinor hm;
alignas(64) SiteSpinor fp;
alignas(64) SiteSpinor fm;
for(int v=0; v<LLs; v++){
int vp = (v+1)%LLs;
int vm = (v+LLs-1)%LLs;
spProj5p(hp, psi[ss+vp]);
spProj5m(hm, psi[ss+vm]);
if(vp <= v){ rotate(hp, hp, 1); }
if(vm >= v){ rotate(hm, hm, nsimd-1); }
hp = hp*0.5;
hm = hm*0.5;
spRecon5p(fp, hp);
spRecon5m(fm, hm);
chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
chi[ss+v] = chi[ss+v] +l[v]*fm;
}
#else
for(int v=0; v<LLs; v++){
vprefetch(psi[ss+v+LLs]);
int vp = (v == LLs-1) ? 0 : v+1;
int vm = (v == 0 ) ? LLs-1 : v-1;
Simd hp_00 = psi[ss+vp]()(0)(0);
Simd hp_01 = psi[ss+vp]()(0)(1);
Simd hp_02 = psi[ss+vp]()(0)(2);
Simd hp_10 = psi[ss+vp]()(1)(0);
Simd hp_11 = psi[ss+vp]()(1)(1);
Simd hp_12 = psi[ss+vp]()(1)(2);
Simd hm_00 = psi[ss+vm]()(2)(0);
Simd hm_01 = psi[ss+vm]()(2)(1);
Simd hm_02 = psi[ss+vm]()(2)(2);
Simd hm_10 = psi[ss+vm]()(3)(0);
Simd hm_11 = psi[ss+vm]()(3)(1);
Simd hm_12 = psi[ss+vm]()(3)(2);
if (vp <= v){
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
}
if(vm >= v){
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
}
Simd p_00 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
Simd p_01 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
Simd p_02 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
Simd p_10 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
Simd p_11 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
Simd p_12 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
Simd p_20 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00);
Simd p_21 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01);
Simd p_22 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02);
Simd p_30 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10);
Simd p_31 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11);
Simd p_32 = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12);
vstream(chi[ss+v]()(0)(0), p_00);
vstream(chi[ss+v]()(0)(1), p_01);
vstream(chi[ss+v]()(0)(2), p_02);
vstream(chi[ss+v]()(1)(0), p_10);
vstream(chi[ss+v]()(1)(1), p_11);
vstream(chi[ss+v]()(1)(2), p_12);
vstream(chi[ss+v]()(2)(0), p_20);
vstream(chi[ss+v]()(2)(1), p_21);
vstream(chi[ss+v]()(2)(2), p_22);
vstream(chi[ss+v]()(3)(0), p_30);
vstream(chi[ss+v]()(3)(1), p_31);
vstream(chi[ss+v]()(3)(2), p_32);
}
#endif
}
this->M5Dtime += usecond();
}
template<class Impl>
void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField& psi, const FermionField& phi,
FermionField& chi, std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
std::vector<Coeff_t>& shift_coeffs)
{
#if 0
this->M5Ddag(psi, phi, chi, lower, diag, upper);
// FIXME: possible gain from vectorizing shift operation as well?
Coeff_t one(1.0);
int Ls = this->Ls;
for(int s=0; s<Ls; s++){
if(this->pm == 1){ axpby_ssp_pplus(chi, one, chi, shift_coeffs[s], psi, Ls-1, s); }
else{ axpby_ssp_pminus(chi, one, chi, shift_coeffs[s], psi, 0, s); }
}
#else
GridBase* grid = psi._grid;
int Ls = this->Ls;
int LLs = grid->_rdimensions[0];
int nsimd = Simd::Nsimd();
Vector<iSinglet<Simd>> u(LLs);
Vector<iSinglet<Simd>> l(LLs);
Vector<iSinglet<Simd>> d(LLs);
Vector<iSinglet<Simd>> s(LLs);
assert(Ls/LLs == nsimd);
assert(phi.checkerboard == psi.checkerboard);
chi.checkerboard = psi.checkerboard;
// just directly address via type pun
typedef typename Simd::scalar_type scalar_type;
scalar_type* u_p = (scalar_type*) &u[0];
scalar_type* l_p = (scalar_type*) &l[0];
scalar_type* d_p = (scalar_type*) &d[0];
scalar_type* s_p = (scalar_type*) &s[0];
for(int o=0; o<LLs; o++){ // outer
for(int i=0; i<nsimd; i++){ //inner
int s = o + i*LLs;
int ss = o*nsimd + i;
u_p[ss] = upper[s];
l_p[ss] = lower[s];
d_p[ss] = diag[s];
s_p[ss] = shift_coeffs[s];
}}
this->M5Dcalls++;
this->M5Dtime -= usecond();
parallel_for(int ss=0; ss<grid->oSites(); ss+=LLs){ // adds LLs
int vs = (this->pm == 1) ? LLs-1 : 0;
Simd hs_00 = (this->pm == 1) ? psi[ss+vs]()(0)(0) : psi[ss+vs]()(2)(0);
Simd hs_01 = (this->pm == 1) ? psi[ss+vs]()(0)(1) : psi[ss+vs]()(2)(1);
Simd hs_02 = (this->pm == 1) ? psi[ss+vs]()(0)(2) : psi[ss+vs]()(2)(2);
Simd hs_10 = (this->pm == 1) ? psi[ss+vs]()(1)(0) : psi[ss+vs]()(3)(0);
Simd hs_11 = (this->pm == 1) ? psi[ss+vs]()(1)(1) : psi[ss+vs]()(3)(1);
Simd hs_12 = (this->pm == 1) ? psi[ss+vs]()(1)(2) : psi[ss+vs]()(3)(2);
for(int v=0; v<LLs; v++){
vprefetch(psi[ss+v+LLs]);
int vp = (v == LLs-1) ? 0 : v+1;
int vm = (v == 0 ) ? LLs-1 : v-1;
Simd hp_00 = psi[ss+vp]()(0)(0);
Simd hp_01 = psi[ss+vp]()(0)(1);
Simd hp_02 = psi[ss+vp]()(0)(2);
Simd hp_10 = psi[ss+vp]()(1)(0);
Simd hp_11 = psi[ss+vp]()(1)(1);
Simd hp_12 = psi[ss+vp]()(1)(2);
Simd hm_00 = psi[ss+vm]()(2)(0);
Simd hm_01 = psi[ss+vm]()(2)(1);
Simd hm_02 = psi[ss+vm]()(2)(2);
Simd hm_10 = psi[ss+vm]()(3)(0);
Simd hm_11 = psi[ss+vm]()(3)(1);
Simd hm_12 = psi[ss+vm]()(3)(2);
if (vp <= v){
hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
}
if(this->pm == 1 && vs <= v){
hs_00.v = Optimization::Rotate::tRotate<2>(hs_00.v);
hs_01.v = Optimization::Rotate::tRotate<2>(hs_01.v);
hs_02.v = Optimization::Rotate::tRotate<2>(hs_02.v);
hs_10.v = Optimization::Rotate::tRotate<2>(hs_10.v);
hs_11.v = Optimization::Rotate::tRotate<2>(hs_11.v);
hs_12.v = Optimization::Rotate::tRotate<2>(hs_12.v);
}
if(vm >= v){
hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
}
if(this->pm == -1 && vs >= v){
hs_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_00.v);
hs_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_01.v);
hs_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_02.v);
hs_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_10.v);
hs_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_11.v);
hs_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hs_12.v);
}
Simd p_00 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_00);
Simd p_01 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_01);
Simd p_02 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_02);
Simd p_10 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_10);
Simd p_11 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_11);
Simd p_12 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2)) + switcheroo<Coeff_t>::mult(u[v]()()(), hp_12);
Simd p_20 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_00)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_00);
Simd p_21 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_01)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_01);
Simd p_22 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_02)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_02);
Simd p_30 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_10)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_10);
Simd p_31 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_11)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_11);
Simd p_32 = (this->pm == 1) ? switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
: switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2)) + switcheroo<Coeff_t>::mult(l[v]()()(), hm_12)
+ switcheroo<Coeff_t>::mult(s[v]()()(), hs_12);
vstream(chi[ss+v]()(0)(0), p_00);
vstream(chi[ss+v]()(0)(1), p_01);
vstream(chi[ss+v]()(0)(2), p_02);
vstream(chi[ss+v]()(1)(0), p_10);
vstream(chi[ss+v]()(1)(1), p_11);
vstream(chi[ss+v]()(1)(2), p_12);
vstream(chi[ss+v]()(2)(0), p_20);
vstream(chi[ss+v]()(2)(1), p_21);
vstream(chi[ss+v]()(2)(2), p_22);
vstream(chi[ss+v]()(3)(0), p_30);
vstream(chi[ss+v]()(3)(1), p_31);
vstream(chi[ss+v]()(3)(2), p_32);
}
}
this->M5Dtime += usecond();
#endif
}
#ifdef AVX512
#include<simd/Intel512common.h>
#include<simd/Intel512avx.h>
#include<simd/Intel512single.h>
#endif
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInternalAsm(const FermionField& psi, FermionField& chi,
int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
{
#ifndef AVX512
{
SiteHalfSpinor BcastP;
SiteHalfSpinor BcastM;
SiteHalfSpinor SiteChiP;
SiteHalfSpinor SiteChiM;
// Ls*Ls * 2 * 12 * vol flops
for(int s1=0; s1<LLs; s1++){
for(int s2=0; s2<LLs; s2++){
for(int l=0; l < Simd::Nsimd(); l++){ // simd lane
int s = s2 + l*LLs;
int lex = s2 + LLs*site;
if( s2==0 && l==0 ){
SiteChiP=zero;
SiteChiM=zero;
}
for(int sp=0; sp<2; sp++){
for(int co=0; co<Nc; co++){
vbroadcast(BcastP()(sp)(co), psi[lex]()(sp)(co), l);
}}
for(int sp=0; sp<2; sp++){
for(int co=0; co<Nc; co++){
vbroadcast(BcastM()(sp)(co), psi[lex]()(sp+2)(co), l);
}}
for(int sp=0; sp<2; sp++){
for(int co=0; co<Nc; co++){
SiteChiP()(sp)(co) = real_madd(Matp[LLs*s+s1]()()(), BcastP()(sp)(co), SiteChiP()(sp)(co)); // 1100 us.
SiteChiM()(sp)(co) = real_madd(Matm[LLs*s+s1]()()(), BcastM()(sp)(co), SiteChiM()(sp)(co)); // each found by commenting out
}}
}}
{
int lex = s1 + LLs*site;
for(int sp=0; sp<2; sp++){
for(int co=0; co<Nc; co++){
vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
}}
}
}
}
#else
{
// pointers
// MASK_REGS;
#define Chi_00 %%zmm1
#define Chi_01 %%zmm2
#define Chi_02 %%zmm3
#define Chi_10 %%zmm4
#define Chi_11 %%zmm5
#define Chi_12 %%zmm6
#define Chi_20 %%zmm7
#define Chi_21 %%zmm8
#define Chi_22 %%zmm9
#define Chi_30 %%zmm10
#define Chi_31 %%zmm11
#define Chi_32 %%zmm12
#define BCAST0 %%zmm13
#define BCAST1 %%zmm14
#define BCAST2 %%zmm15
#define BCAST3 %%zmm16
#define BCAST4 %%zmm17
#define BCAST5 %%zmm18
#define BCAST6 %%zmm19
#define BCAST7 %%zmm20
#define BCAST8 %%zmm21
#define BCAST9 %%zmm22
#define BCAST10 %%zmm23
#define BCAST11 %%zmm24
int incr = LLs*LLs*sizeof(iSinglet<Simd>);
for(int s1=0; s1<LLs; s1++){
for(int s2=0; s2<LLs; s2++){
int lex = s2 + LLs*site;
uint64_t a0 = (uint64_t) &Matp[LLs*s2+s1]; // should be cacheable
uint64_t a1 = (uint64_t) &Matm[LLs*s2+s1];
uint64_t a2 = (uint64_t) &psi[lex];
for(int l=0; l<Simd::Nsimd(); l++){ // simd lane
if((s2+l)==0) {
asm(
VPREFETCH1(0,%2) VPREFETCH1(0,%1)
VPREFETCH1(12,%2) VPREFETCH1(13,%2)
VPREFETCH1(14,%2) VPREFETCH1(15,%2)
VBCASTCDUP(0,%2,BCAST0)
VBCASTCDUP(1,%2,BCAST1)
VBCASTCDUP(2,%2,BCAST2)
VBCASTCDUP(3,%2,BCAST3)
VBCASTCDUP(4,%2,BCAST4) VMULMEM(0,%0,BCAST0,Chi_00)
VBCASTCDUP(5,%2,BCAST5) VMULMEM(0,%0,BCAST1,Chi_01)
VBCASTCDUP(6,%2,BCAST6) VMULMEM(0,%0,BCAST2,Chi_02)
VBCASTCDUP(7,%2,BCAST7) VMULMEM(0,%0,BCAST3,Chi_10)
VBCASTCDUP(8,%2,BCAST8) VMULMEM(0,%0,BCAST4,Chi_11)
VBCASTCDUP(9,%2,BCAST9) VMULMEM(0,%0,BCAST5,Chi_12)
VBCASTCDUP(10,%2,BCAST10) VMULMEM(0,%1,BCAST6,Chi_20)
VBCASTCDUP(11,%2,BCAST11) VMULMEM(0,%1,BCAST7,Chi_21)
VMULMEM(0,%1,BCAST8,Chi_22)
VMULMEM(0,%1,BCAST9,Chi_30)
VMULMEM(0,%1,BCAST10,Chi_31)
VMULMEM(0,%1,BCAST11,Chi_32)
: : "r" (a0), "r" (a1), "r" (a2) );
} else {
asm(
VBCASTCDUP(0,%2,BCAST0) VMADDMEM(0,%0,BCAST0,Chi_00)
VBCASTCDUP(1,%2,BCAST1) VMADDMEM(0,%0,BCAST1,Chi_01)
VBCASTCDUP(2,%2,BCAST2) VMADDMEM(0,%0,BCAST2,Chi_02)
VBCASTCDUP(3,%2,BCAST3) VMADDMEM(0,%0,BCAST3,Chi_10)
VBCASTCDUP(4,%2,BCAST4) VMADDMEM(0,%0,BCAST4,Chi_11)
VBCASTCDUP(5,%2,BCAST5) VMADDMEM(0,%0,BCAST5,Chi_12)
VBCASTCDUP(6,%2,BCAST6) VMADDMEM(0,%1,BCAST6,Chi_20)
VBCASTCDUP(7,%2,BCAST7) VMADDMEM(0,%1,BCAST7,Chi_21)
VBCASTCDUP(8,%2,BCAST8) VMADDMEM(0,%1,BCAST8,Chi_22)
VBCASTCDUP(9,%2,BCAST9) VMADDMEM(0,%1,BCAST9,Chi_30)
VBCASTCDUP(10,%2,BCAST10) VMADDMEM(0,%1,BCAST10,Chi_31)
VBCASTCDUP(11,%2,BCAST11) VMADDMEM(0,%1,BCAST11,Chi_32)
: : "r" (a0), "r" (a1), "r" (a2) );
}
a0 = a0 + incr;
a1 = a1 + incr;
a2 = a2 + sizeof(typename Simd::scalar_type);
}
}
{
int lexa = s1+LLs*site;
asm (
VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01) VSTORE(2 ,%0,Chi_02)
VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11) VSTORE(5 ,%0,Chi_12)
VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21) VSTORE(8 ,%0,Chi_22)
VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31) VSTORE(11,%0,Chi_32)
: : "r" ((uint64_t)&chi[lexa]) : "memory" );
}
}
}
#undef Chi_00
#undef Chi_01
#undef Chi_02
#undef Chi_10
#undef Chi_11
#undef Chi_12
#undef Chi_20
#undef Chi_21
#undef Chi_22
#undef Chi_30
#undef Chi_31
#undef Chi_32
#undef BCAST0
#undef BCAST1
#undef BCAST2
#undef BCAST3
#undef BCAST4
#undef BCAST5
#undef BCAST6
#undef BCAST7
#undef BCAST8
#undef BCAST9
#undef BCAST10
#undef BCAST11
#endif
};
// Z-mobius version
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInternalZAsm(const FermionField& psi, FermionField& chi,
int LLs, int site, Vector<iSinglet<Simd> >& Matp, Vector<iSinglet<Simd> >& Matm)
{
std::cout << "Error: zMobius not implemented for EOFA" << std::endl;
exit(-1);
};
template<class Impl>
void MobiusEOFAFermion<Impl>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv)
{
int Ls = this->Ls;
int LLs = psi._grid->_rdimensions[0];
int vol = psi._grid->oSites()/LLs;
chi.checkerboard = psi.checkerboard;
Vector<iSinglet<Simd>> Matp;
Vector<iSinglet<Simd>> Matm;
Vector<iSinglet<Simd>>* _Matp;
Vector<iSinglet<Simd>>* _Matm;
// MooeeInternalCompute(dag,inv,Matp,Matm);
if(inv && dag){
_Matp = &this->MatpInvDag;
_Matm = &this->MatmInvDag;
}
if(inv && (!dag)){
_Matp = &this->MatpInv;
_Matm = &this->MatmInv;
}
if(!inv){
MooeeInternalCompute(dag, inv, Matp, Matm);
_Matp = &Matp;
_Matm = &Matm;
}
assert(_Matp->size() == Ls*LLs);
this->MooeeInvCalls++;
this->MooeeInvTime -= usecond();
if(switcheroo<Coeff_t>::iscomplex()){
parallel_for(auto site=0; site<vol; site++){
MooeeInternalZAsm(psi, chi, LLs, site, *_Matp, *_Matm);
}
} else {
parallel_for(auto site=0; site<vol; site++){
MooeeInternalAsm(psi, chi, LLs, site, *_Matp, *_Matm);
}
}
this->MooeeInvTime += usecond();
}
#ifdef MOBIUS_EOFA_DPERP_VEC
INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplD);
INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplF);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplD);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplF);
INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplDF);
INSTANTIATE_DPERP_MOBIUS_EOFA(DomainWallVec5dImplFH);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplDF);
INSTANTIATE_DPERP_MOBIUS_EOFA(ZDomainWallVec5dImplFH);
template void MobiusEOFAFermion<DomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<DomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<ZDomainWallVec5dImplF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<ZDomainWallVec5dImplD>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<DomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<DomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<ZDomainWallVec5dImplFH>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
template void MobiusEOFAFermion<ZDomainWallVec5dImplDF>::MooeeInternal(const FermionField& psi, FermionField& chi, int dag, int inv);
#endif
}}

View File

@ -1,95 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/SchurRedBlack.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
namespace Grid {
namespace QCD {
template<class Field>
class PauliVillarsSolverUnprec
{
public:
ConjugateGradient<Field> & CG;
PauliVillarsSolverUnprec( ConjugateGradient<Field> &_CG) : CG(_CG){};
template<class Matrix>
void operator() (Matrix &_Matrix,const Field &src,Field &sol)
{
RealD m = _Matrix.Mass();
Field A (_Matrix.FermionGrid());
MdagMLinearOperator<Matrix,Field> HermOp(_Matrix);
_Matrix.SetMass(1.0);
_Matrix.Mdag(src,A);
CG(HermOp,A,sol);
_Matrix.SetMass(m);
};
};
template<class Field,class SchurSolverType>
class PauliVillarsSolverRBprec
{
public:
SchurSolverType & SchurSolver;
PauliVillarsSolverRBprec( SchurSolverType &_SchurSolver) : SchurSolver(_SchurSolver){};
template<class Matrix>
void operator() (Matrix &_Matrix,const Field &src,Field &sol)
{
RealD m = _Matrix.Mass();
Field A (_Matrix.FermionGrid());
_Matrix.SetMass(1.0);
SchurSolver(_Matrix,src,sol);
_Matrix.SetMass(m);
};
};
template<class Field,class GaugeField>
class PauliVillarsSolverFourierAccel
{
public:
GaugeField & Umu;
ConjugateGradient<Field> & CG;
PauliVillarsSolverFourierAccel(GaugeField &_Umu,ConjugateGradient<Field> &_CG) : Umu(_Umu), CG(_CG)
{
};
template<class Matrix>
void operator() (Matrix &_Matrix,const Field &src,Field &sol)
{
FourierAcceleratedPV<Field, Matrix, typename Matrix::GaugeField > faPV(_Matrix,Umu,CG) ;
faPV.pvInv(src,sol);
};
};
}
}

View File

@ -1,135 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/SchurRedBlack.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
namespace Grid {
namespace QCD {
template<class Field,class PVinverter> class Reconstruct5DfromPhysical {
private:
PVinverter & PauliVillarsSolver;
public:
/////////////////////////////////////////////////////
// First cut works, 10 Oct 2018.
//
// Must form a plan to get this into production for Zmobius acceleration
// of the Mobius exact AMA corrections.
//
// TODO : understand absence of contact term in eqns in Hantao's thesis
// sol4 is contact term subtracted, but thesis & Brower's paper suggests not.
//
// Step 1: Localise PV inverse in a routine. [DONE]
// Step 2: Schur based PV inverse [DONE]
// Step 3: Fourier accelerated PV inverse [DONE]
//
/////////////////////////////////////////////////////
Reconstruct5DfromPhysical(PVinverter &_PauliVillarsSolver)
: PauliVillarsSolver(_PauliVillarsSolver)
{
};
template<class Matrix>
void PV(Matrix &_Matrix,const Field &src,Field &sol)
{
RealD m = _Matrix.Mass();
_Matrix.SetMass(1.0);
_Matrix.M(src,sol);
_Matrix.SetMass(m);
}
template<class Matrix>
void PVdag(Matrix &_Matrix,const Field &src,Field &sol)
{
RealD m = _Matrix.Mass();
_Matrix.SetMass(1.0);
_Matrix.Mdag(src,sol);
_Matrix.SetMass(m);
}
template<class Matrix>
void operator() (Matrix & _Matrix,const Field &sol4,const Field &src4, Field &sol5){
int Ls = _Matrix.Ls;
Field psi4(_Matrix.GaugeGrid());
Field psi(_Matrix.FermionGrid());
Field A (_Matrix.FermionGrid());
Field B (_Matrix.FermionGrid());
Field c (_Matrix.FermionGrid());
typedef typename Matrix::Coeff_t Coeff_t;
std::cout << GridLogMessage<< " ************************************************" << std::endl;
std::cout << GridLogMessage<< " Reconstruct5Dprop: c.f. MADWF algorithm " << std::endl;
std::cout << GridLogMessage<< " ************************************************" << std::endl;
///////////////////////////////////////
//Import source, include Dminus factors
///////////////////////////////////////
_Matrix.ImportPhysicalFermionSource(src4,B);
///////////////////////////////////////
// Set up c from src4
///////////////////////////////////////
PauliVillarsSolver(_Matrix,B,A);
_Matrix.Pdag(A,c);
//////////////////////////////////////
// Build Pdag PV^-1 Dm P [-sol4,c2,c3... cL]
//////////////////////////////////////
psi4 = - sol4;
InsertSlice(psi4, psi, 0 , 0);
for (int s=1;s<Ls;s++) {
ExtractSlice(psi4,c,s,0);
InsertSlice(psi4,psi,s,0);
}
/////////////////////////////
// Pdag PV^-1 Dm P
/////////////////////////////
_Matrix.P(psi,B);
_Matrix.M(B,A);
PauliVillarsSolver(_Matrix,A,B);
_Matrix.Pdag(B,A);
//////////////////////////////
// Reinsert surface prop
//////////////////////////////
InsertSlice(sol4,A,0,0);
//////////////////////////////
// Convert from y back to x
//////////////////////////////
_Matrix.P(A,sol5);
}
};
}
}

View File

@ -1,102 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: SchurDiagTwoKappa.h
Copyright (C) 2017
Author: Christoph Lehner
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef _SCHUR_DIAG_TWO_KAPPA_H
#define _SCHUR_DIAG_TWO_KAPPA_H
namespace Grid {
// This is specific to (Z)mobius fermions
template<class Matrix, class Field>
class KappaSimilarityTransform {
public:
INHERIT_IMPL_TYPES(Matrix);
std::vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag;
KappaSimilarityTransform (Matrix &zmob) {
for (int i=0;i<(int)zmob.bs.size();i++) {
Coeff_t k = 1.0 / ( 2.0 * (zmob.bs[i] *(4 - zmob.M5) + 1.0) );
kappa.push_back( k );
kappaDag.push_back( conj(k) );
kappaInv.push_back( 1.0 / k );
kappaInvDag.push_back( 1.0 / conj(k) );
}
}
template<typename vobj>
void sscale(const Lattice<vobj>& in, Lattice<vobj>& out, Coeff_t* s) {
GridBase *grid=out._grid;
out.checkerboard = in.checkerboard;
assert(grid->_simd_layout[0] == 1); // should be fine for ZMobius for now
int Ls = grid->_rdimensions[0];
parallel_for(int ss=0;ss<grid->oSites();ss++){
vobj tmp = s[ss % Ls]*in._odata[ss];
vstream(out._odata[ss],tmp);
}
}
RealD sscale_norm(const Field& in, Field& out, Coeff_t* s) {
sscale(in,out,s);
return norm2(out);
}
virtual RealD M (const Field& in, Field& out) { return sscale_norm(in,out,&kappa[0]); }
virtual RealD MDag (const Field& in, Field& out) { return sscale_norm(in,out,&kappaDag[0]);}
virtual RealD MInv (const Field& in, Field& out) { return sscale_norm(in,out,&kappaInv[0]);}
virtual RealD MInvDag (const Field& in, Field& out) { return sscale_norm(in,out,&kappaInvDag[0]);}
};
template<class Matrix,class Field>
class SchurDiagTwoKappaOperator : public SchurOperatorBase<Field> {
public:
KappaSimilarityTransform<Matrix, Field> _S;
SchurDiagTwoOperator<Matrix, Field> _Mat;
SchurDiagTwoKappaOperator (Matrix &Mat): _S(Mat), _Mat(Mat) {};
virtual RealD Mpc (const Field &in, Field &out) {
Field tmp(in._grid);
_S.MInv(in,out);
_Mat.Mpc(out,tmp);
return _S.M(tmp,out);
}
virtual RealD MpcDag (const Field &in, Field &out){
Field tmp(in._grid);
_S.MDag(in,out);
_Mat.MpcDag(out,tmp);
return _S.MInvDag(tmp,out);
}
};
}
#endif

View File

@ -1,294 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015
Author: Azusa Yamaguchi, Peter Boyle
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
namespace Grid {
namespace QCD {
int StaggeredKernelsStatic::Opt= StaggeredKernelsStatic::OptGeneric;
int StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsAndCompute;
#define GENERIC_STENCIL_LEG(U,Dir,skew,multLink) \
SE = st.GetEntry(ptype, Dir+skew, sF); \
if (SE->_is_local ) { \
if (SE->_permute) { \
chi_p = &chi; \
permute(chi, in._odata[SE->_offset], ptype); \
} else { \
chi_p = &in._odata[SE->_offset]; \
} \
} else { \
chi_p = &buf[SE->_offset]; \
} \
multLink(Uchi, U._odata[sU], *chi_p, Dir);
#define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink) \
SE = st.GetEntry(ptype, Dir+skew, sF); \
if (SE->_is_local ) { \
if (SE->_permute) { \
chi_p = &chi; \
permute(chi, in._odata[SE->_offset], ptype); \
} else { \
chi_p = &in._odata[SE->_offset]; \
} \
} else if ( st.same_node[Dir] ) { \
chi_p = &buf[SE->_offset]; \
} \
if (SE->_is_local || st.same_node[Dir] ) { \
multLink(Uchi, U._odata[sU], *chi_p, Dir); \
}
#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \
SE = st.GetEntry(ptype, Dir+skew, sF); \
if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
nmu++; \
chi_p = &buf[SE->_offset]; \
multLink(Uchi, U._odata[sU], *chi_p, Dir); \
}
template <class Impl>
StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
////////////////////////////////////////////////////////////////////////////////////
// Generic implementation; move to different file?
// Int, Ext, Int+Ext cases for comms overlap
////////////////////////////////////////////////////////////////////////////////////
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out, int dag) {
const SiteSpinor *chi_p;
SiteSpinor chi;
SiteSpinor Uchi;
StencilEntry *SE;
int ptype;
int skew;
for(int s=0;s<LLs;s++){
int sF=LLs*sU+s;
skew = 0;
GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink);
GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(U,Zp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(U,Tp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(U,Xm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd);
skew=8;
GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Zp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Tp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Xm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd);
if ( dag ) {
Uchi = - Uchi;
}
vstream(out._odata[sF], Uchi);
}
};
///////////////////////////////////////////////////
// Only contributions from interior of our node
///////////////////////////////////////////////////
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag) {
const SiteSpinor *chi_p;
SiteSpinor chi;
SiteSpinor Uchi;
StencilEntry *SE;
int ptype;
int skew ;
for(int s=0;s<LLs;s++){
int sF=LLs*sU+s;
skew = 0;
Uchi=zero;
GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Yp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Zp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Tp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Xm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd);
skew=8;
GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Zp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Tp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Xm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd);
if ( dag ) {
Uchi = - Uchi;
}
vstream(out._odata[sF], Uchi);
}
};
///////////////////////////////////////////////////
// Only contributions from exterior of our node
///////////////////////////////////////////////////
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag) {
const SiteSpinor *chi_p;
SiteSpinor chi;
SiteSpinor Uchi;
StencilEntry *SE;
int ptype;
int nmu=0;
int skew ;
for(int s=0;s<LLs;s++){
int sF=LLs*sU+s;
skew = 0;
Uchi=zero;
GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Yp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Zp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Tp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Xm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd);
skew=8;
GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Zp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Tp,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Xm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
if ( nmu ) {
if ( dag ) {
out._odata[sF] = out._odata[sF] - Uchi;
} else {
out._odata[sF] = out._odata[sF] + Uchi;
}
}
}
};
////////////////////////////////////////////////////////////////////////////////////
// Driving / wrapping routine to select right kernel
////////////////////////////////////////////////////////////////////////////////////
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,
int interior,int exterior)
{
int dag=1;
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
};
template <class Impl>
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,
int interior,int exterior)
{
int dag=0;
DhopSite(st,lo,U,UUU,buf,LLs,sU,in,out,dag,interior,exterior);
};
template <class Impl>
void StaggeredKernels<Impl>::DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs,
int sU, const FermionField &in, FermionField &out,
int dag,int interior,int exterior)
{
switch(Opt) {
#ifdef AVX512
case OptInlineAsm:
if ( interior && exterior ) {
DhopSiteAsm(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else {
std::cout << GridLogError << "Cannot overlap comms and compute with Staggered assembly"<<std::endl;
assert(0);
}
break;
#endif
case OptHandUnroll:
if ( interior && exterior ) {
DhopSiteHand (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( interior ) {
DhopSiteHandInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( exterior ) {
DhopSiteHandExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
}
break;
case OptGeneric:
if ( interior && exterior ) {
DhopSiteGeneric (st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( interior ) {
DhopSiteGenericInt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
} else if ( exterior ) {
DhopSiteGenericExt(st,lo,U,UUU,buf,LLs,sU,in,out,dag);
}
break;
default:
std::cout<<"Oops Opt = "<<Opt<<std::endl;
assert(0);
break;
}
};
template <class Impl>
void StaggeredKernels<Impl>::DhopDir( StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor *buf, int sF,
int sU, const FermionField &in, FermionField &out, int dir, int disp)
{
// Disp should be either +1,-1,+3,-3
// What about "dag" ?
// Because we work out pU . dS/dU
// U
assert(0);
}
FermOpStaggeredTemplateInstantiate(StaggeredKernels);
FermOpStaggeredVec5dTemplateInstantiate(StaggeredKernels);
}}

View File

@ -1,122 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/StaggeredKernels.h
Copyright (C) 2015
Author: Azusa Yamaguchi, Peter Boyle
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_STAGGERED_KERNELS_H
#define GRID_QCD_STAGGERED_KERNELS_H
namespace Grid {
namespace QCD {
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Helper routines that implement Staggered stencil for a single site.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
class StaggeredKernelsStatic {
public:
enum { OptGeneric, OptHandUnroll, OptInlineAsm };
enum { CommsAndCompute, CommsThenCompute };
static int Opt;
static int Comms;
};
template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , public StaggeredKernelsStatic {
public:
INHERIT_IMPL_TYPES(Impl);
typedef FermionOperator<Impl> Base;
public:
void DhopDir(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, SiteSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out, int dir,int disp);
///////////////////////////////////////////////////////////////////////////////////////
// Generic Nc kernels
///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteGeneric(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag);
void DhopSiteGenericInt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag);
void DhopSiteGenericExt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag);
///////////////////////////////////////////////////////////////////////////////////////
// Nc=3 specific kernels
///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag);
void DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag);
void DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag);
///////////////////////////////////////////////////////////////////////////////////////
// Asm Nc=3 specific kernels
///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag);
///////////////////////////////////////////////////////////////////////////////////////////////////
// Generic interface; fan out to right routine
///////////////////////////////////////////////////////////////////////////////////////////////////
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out, int interior=1,int exterior=1);
void DhopSiteDag(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out, int interior=1,int exterior=1);
void DhopSite(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor * buf, int LLs, int sU,
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
public:
StaggeredKernels(const ImplParams &p = ImplParams());
};
}}
#endif

View File

@ -1,968 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/StaggerdKernelsHand.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
#ifdef AVX512
#include <simd/Intel512common.h>
#include <simd/Intel512avx.h>
#endif
// Interleave operations from two directions
// This looks just like a 2 spin multiply and reuse same sequence from the Wilson
// Kernel. But the spin index becomes a mu index instead.
#define Chi_00 %zmm0
#define Chi_01 %zmm1
#define Chi_02 %zmm2
#define Chi_10 %zmm3
#define Chi_11 %zmm4
#define Chi_12 %zmm5
#define Chi_20 %zmm6
#define Chi_21 %zmm7
#define Chi_22 %zmm8
#define Chi_30 %zmm9
#define Chi_31 %zmm10
#define Chi_32 %zmm11
#define UChi_00 %zmm12
#define UChi_01 %zmm13
#define UChi_02 %zmm14
#define UChi_10 %zmm15
#define UChi_11 %zmm16
#define UChi_12 %zmm17
#define UChi_20 %zmm18
#define UChi_21 %zmm19
#define UChi_22 %zmm20
#define UChi_30 %zmm21
#define UChi_31 %zmm22
#define UChi_32 %zmm23
#define pChi_00 %%zmm0
#define pChi_01 %%zmm1
#define pChi_02 %%zmm2
#define pChi_10 %%zmm3
#define pChi_11 %%zmm4
#define pChi_12 %%zmm5
#define pChi_20 %%zmm6
#define pChi_21 %%zmm7
#define pChi_22 %%zmm8
#define pChi_30 %%zmm9
#define pChi_31 %%zmm10
#define pChi_32 %%zmm11
#define pUChi_00 %%zmm12
#define pUChi_01 %%zmm13
#define pUChi_02 %%zmm14
#define pUChi_10 %%zmm15
#define pUChi_11 %%zmm16
#define pUChi_12 %%zmm17
#define pUChi_20 %%zmm18
#define pUChi_21 %%zmm19
#define pUChi_22 %%zmm20
#define pUChi_30 %%zmm21
#define pUChi_31 %%zmm22
#define pUChi_32 %%zmm23
#define T0 %zmm24
#define T1 %zmm25
#define T2 %zmm26
#define T3 %zmm27
#define Z00 %zmm26
#define Z10 %zmm27
#define Z0 Z00
#define Z1 %zmm28
#define Z2 %zmm29
#define Z3 %zmm30
#define Z4 %zmm31
#define Z5 Chi_31
#define Z6 Chi_32
#define MULT_ADD_LS(g0,g1,g2,g3) \
asm ( "movq %0, %%r8 \n\t" \
"movq %1, %%r9 \n\t" \
"movq %2, %%r10 \n\t" \
"movq %3, %%r11 \n\t" : : "r"(g0), "r"(g1), "r"(g2), "r"(g3) : "%r8","%r9","%r10","%r11" );\
asm ( \
VSHUF(Chi_00,T0) VSHUF(Chi_10,T1) \
VSHUF(Chi_20,T2) VSHUF(Chi_30,T3) \
VMADDSUBIDUP(0,%r8,T0,UChi_00) VMADDSUBIDUP(0,%r9,T1,UChi_10) \
VMADDSUBIDUP(3,%r8,T0,UChi_01) VMADDSUBIDUP(3,%r9,T1,UChi_11) \
VMADDSUBIDUP(6,%r8,T0,UChi_02) VMADDSUBIDUP(6,%r9,T1,UChi_12) \
VMADDSUBIDUP(0,%r10,T2,UChi_20) VMADDSUBIDUP(0,%r11,T3,UChi_30) \
VMADDSUBIDUP(3,%r10,T2,UChi_21) VMADDSUBIDUP(3,%r11,T3,UChi_31) \
VMADDSUBIDUP(6,%r10,T2,UChi_22) VMADDSUBIDUP(6,%r11,T3,UChi_32) \
VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r9,Chi_10,UChi_10) \
VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r9,Chi_10,UChi_11) \
VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r9,Chi_10,UChi_12) \
VMADDSUBRDUP(0,%r10,Chi_20,UChi_20) VMADDSUBRDUP(0,%r11,Chi_30,UChi_30) \
VMADDSUBRDUP(3,%r10,Chi_20,UChi_21) VMADDSUBRDUP(3,%r11,Chi_30,UChi_31) \
VMADDSUBRDUP(6,%r10,Chi_20,UChi_22) VMADDSUBRDUP(6,%r11,Chi_30,UChi_32) \
VSHUF(Chi_01,T0) VSHUF(Chi_11,T1) \
VSHUF(Chi_21,T2) VSHUF(Chi_31,T3) \
VMADDSUBIDUP(1,%r8,T0,UChi_00) VMADDSUBIDUP(1,%r9,T1,UChi_10) \
VMADDSUBIDUP(4,%r8,T0,UChi_01) VMADDSUBIDUP(4,%r9,T1,UChi_11) \
VMADDSUBIDUP(7,%r8,T0,UChi_02) VMADDSUBIDUP(7,%r9,T1,UChi_12) \
VMADDSUBIDUP(1,%r10,T2,UChi_20) VMADDSUBIDUP(1,%r11,T3,UChi_30) \
VMADDSUBIDUP(4,%r10,T2,UChi_21) VMADDSUBIDUP(4,%r11,T3,UChi_31) \
VMADDSUBIDUP(7,%r10,T2,UChi_22) VMADDSUBIDUP(7,%r11,T3,UChi_32) \
VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r9,Chi_11,UChi_10) \
VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r9,Chi_11,UChi_11) \
VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r9,Chi_11,UChi_12) \
VMADDSUBRDUP(1,%r10,Chi_21,UChi_20) VMADDSUBRDUP(1,%r11,Chi_31,UChi_30) \
VMADDSUBRDUP(4,%r10,Chi_21,UChi_21) VMADDSUBRDUP(4,%r11,Chi_31,UChi_31) \
VMADDSUBRDUP(7,%r10,Chi_21,UChi_22) VMADDSUBRDUP(7,%r11,Chi_31,UChi_32) \
VSHUF(Chi_02,T0) VSHUF(Chi_12,T1) \
VSHUF(Chi_22,T2) VSHUF(Chi_32,T3) \
VMADDSUBIDUP(2,%r8,T0,UChi_00) VMADDSUBIDUP(2,%r9,T1,UChi_10) \
VMADDSUBIDUP(5,%r8,T0,UChi_01) VMADDSUBIDUP(5,%r9,T1,UChi_11) \
VMADDSUBIDUP(8,%r8,T0,UChi_02) VMADDSUBIDUP(8,%r9,T1,UChi_12) \
VMADDSUBIDUP(2,%r10,T2,UChi_20) VMADDSUBIDUP(2,%r11,T3,UChi_30) \
VMADDSUBIDUP(5,%r10,T2,UChi_21) VMADDSUBIDUP(5,%r11,T3,UChi_31) \
VMADDSUBIDUP(8,%r10,T2,UChi_22) VMADDSUBIDUP(8,%r11,T3,UChi_32) \
VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r9,Chi_12,UChi_10) \
VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r9,Chi_12,UChi_11) \
VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r9,Chi_12,UChi_12) \
VMADDSUBRDUP(2,%r10,Chi_22,UChi_20) VMADDSUBRDUP(2,%r11,Chi_32,UChi_30) \
VMADDSUBRDUP(5,%r10,Chi_22,UChi_21) VMADDSUBRDUP(5,%r11,Chi_32,UChi_31) \
VMADDSUBRDUP(8,%r10,Chi_22,UChi_22) VMADDSUBRDUP(8,%r11,Chi_32,UChi_32) );
#define MULT_LS(g0,g1,g2,g3) \
asm ( "movq %0, %%r8 \n\t" \
"movq %1, %%r9 \n\t" \
"movq %2, %%r10 \n\t" \
"movq %3, %%r11 \n\t" : : "r"(g0), "r"(g1), "r"(g2), "r"(g3) : "%r8","%r9","%r10","%r11" );\
asm ( \
VSHUF(Chi_00,T0) VSHUF(Chi_10,T1) \
VSHUF(Chi_20,T2) VSHUF(Chi_30,T3) \
VMULIDUP(0,%r8,T0,UChi_00) VMULIDUP(0,%r9,T1,UChi_10) \
VMULIDUP(3,%r8,T0,UChi_01) VMULIDUP(3,%r9,T1,UChi_11) \
VMULIDUP(6,%r8,T0,UChi_02) VMULIDUP(6,%r9,T1,UChi_12) \
VMULIDUP(0,%r10,T2,UChi_20) VMULIDUP(0,%r11,T3,UChi_30) \
VMULIDUP(3,%r10,T2,UChi_21) VMULIDUP(3,%r11,T3,UChi_31) \
VMULIDUP(6,%r10,T2,UChi_22) VMULIDUP(6,%r11,T3,UChi_32) \
VMADDSUBRDUP(0,%r8,Chi_00,UChi_00) VMADDSUBRDUP(0,%r9,Chi_10,UChi_10) \
VMADDSUBRDUP(3,%r8,Chi_00,UChi_01) VMADDSUBRDUP(3,%r9,Chi_10,UChi_11) \
VMADDSUBRDUP(6,%r8,Chi_00,UChi_02) VMADDSUBRDUP(6,%r9,Chi_10,UChi_12) \
VMADDSUBRDUP(0,%r10,Chi_20,UChi_20) VMADDSUBRDUP(0,%r11,Chi_30,UChi_30) \
VMADDSUBRDUP(3,%r10,Chi_20,UChi_21) VMADDSUBRDUP(3,%r11,Chi_30,UChi_31) \
VMADDSUBRDUP(6,%r10,Chi_20,UChi_22) VMADDSUBRDUP(6,%r11,Chi_30,UChi_32) \
VSHUF(Chi_01,T0) VSHUF(Chi_11,T1) \
VSHUF(Chi_21,T2) VSHUF(Chi_31,T3) \
VMADDSUBIDUP(1,%r8,T0,UChi_00) VMADDSUBIDUP(1,%r9,T1,UChi_10) \
VMADDSUBIDUP(4,%r8,T0,UChi_01) VMADDSUBIDUP(4,%r9,T1,UChi_11) \
VMADDSUBIDUP(7,%r8,T0,UChi_02) VMADDSUBIDUP(7,%r9,T1,UChi_12) \
VMADDSUBIDUP(1,%r10,T2,UChi_20) VMADDSUBIDUP(1,%r11,T3,UChi_30) \
VMADDSUBIDUP(4,%r10,T2,UChi_21) VMADDSUBIDUP(4,%r11,T3,UChi_31) \
VMADDSUBIDUP(7,%r10,T2,UChi_22) VMADDSUBIDUP(7,%r11,T3,UChi_32) \
VMADDSUBRDUP(1,%r8,Chi_01,UChi_00) VMADDSUBRDUP(1,%r9,Chi_11,UChi_10) \
VMADDSUBRDUP(4,%r8,Chi_01,UChi_01) VMADDSUBRDUP(4,%r9,Chi_11,UChi_11) \
VMADDSUBRDUP(7,%r8,Chi_01,UChi_02) VMADDSUBRDUP(7,%r9,Chi_11,UChi_12) \
VMADDSUBRDUP(1,%r10,Chi_21,UChi_20) VMADDSUBRDUP(1,%r11,Chi_31,UChi_30) \
VMADDSUBRDUP(4,%r10,Chi_21,UChi_21) VMADDSUBRDUP(4,%r11,Chi_31,UChi_31) \
VMADDSUBRDUP(7,%r10,Chi_21,UChi_22) VMADDSUBRDUP(7,%r11,Chi_31,UChi_32) \
VSHUF(Chi_02,T0) VSHUF(Chi_12,T1) \
VSHUF(Chi_22,T2) VSHUF(Chi_32,T3) \
VMADDSUBIDUP(2,%r8,T0,UChi_00) VMADDSUBIDUP(2,%r9,T1,UChi_10) \
VMADDSUBIDUP(5,%r8,T0,UChi_01) VMADDSUBIDUP(5,%r9,T1,UChi_11) \
VMADDSUBIDUP(8,%r8,T0,UChi_02) VMADDSUBIDUP(8,%r9,T1,UChi_12) \
VMADDSUBIDUP(2,%r10,T2,UChi_20) VMADDSUBIDUP(2,%r11,T3,UChi_30) \
VMADDSUBIDUP(5,%r10,T2,UChi_21) VMADDSUBIDUP(5,%r11,T3,UChi_31) \
VMADDSUBIDUP(8,%r10,T2,UChi_22) VMADDSUBIDUP(8,%r11,T3,UChi_32) \
VMADDSUBRDUP(2,%r8,Chi_02,UChi_00) VMADDSUBRDUP(2,%r9,Chi_12,UChi_10) \
VMADDSUBRDUP(5,%r8,Chi_02,UChi_01) VMADDSUBRDUP(5,%r9,Chi_12,UChi_11) \
VMADDSUBRDUP(8,%r8,Chi_02,UChi_02) VMADDSUBRDUP(8,%r9,Chi_12,UChi_12) \
VMADDSUBRDUP(2,%r10,Chi_22,UChi_20) VMADDSUBRDUP(2,%r11,Chi_32,UChi_30) \
VMADDSUBRDUP(5,%r10,Chi_22,UChi_21) VMADDSUBRDUP(5,%r11,Chi_32,UChi_31) \
VMADDSUBRDUP(8,%r10,Chi_22,UChi_22) VMADDSUBRDUP(8,%r11,Chi_32,UChi_32) );
#define MULT_ADD_XYZTa(g0,g1) \
asm ( "movq %0, %%r8 \n\t" \
"movq %1, %%r9 \n\t" : : "r"(g0), "r"(g1) : "%r8","%r9");\
__asm__ ( \
VSHUF(Chi_00,T0) \
VSHUF(Chi_10,T1) \
VMOVIDUP(0,%r8,Z0 ) \
VMOVIDUP(3,%r8,Z1 ) \
VMOVIDUP(6,%r8,Z2 ) \
VMADDSUB(Z0,T0,UChi_00) \
VMADDSUB(Z1,T0,UChi_01) \
VMADDSUB(Z2,T0,UChi_02) \
\
VMOVIDUP(0,%r9,Z0 ) \
VMOVIDUP(3,%r9,Z1 ) \
VMOVIDUP(6,%r9,Z2 ) \
VMADDSUB(Z0,T1,UChi_10) \
VMADDSUB(Z1,T1,UChi_11) \
VMADDSUB(Z2,T1,UChi_12) \
\
\
VMOVRDUP(0,%r8,Z3 ) \
VMOVRDUP(3,%r8,Z4 ) \
VMOVRDUP(6,%r8,Z5 ) \
VMADDSUB(Z3,Chi_00,UChi_00)/*rr * ir = ri rr*/ \
VMADDSUB(Z4,Chi_00,UChi_01) \
VMADDSUB(Z5,Chi_00,UChi_02) \
\
VMOVRDUP(0,%r9,Z3 ) \
VMOVRDUP(3,%r9,Z4 ) \
VMOVRDUP(6,%r9,Z5 ) \
VMADDSUB(Z3,Chi_10,UChi_10) \
VMADDSUB(Z4,Chi_10,UChi_11)\
VMADDSUB(Z5,Chi_10,UChi_12) \
\
\
VMOVIDUP(1,%r8,Z0 ) \
VMOVIDUP(4,%r8,Z1 ) \
VMOVIDUP(7,%r8,Z2 ) \
VSHUF(Chi_01,T0) \
VMADDSUB(Z0,T0,UChi_00) \
VMADDSUB(Z1,T0,UChi_01) \
VMADDSUB(Z2,T0,UChi_02) \
\
VMOVIDUP(1,%r9,Z0 ) \
VMOVIDUP(4,%r9,Z1 ) \
VMOVIDUP(7,%r9,Z2 ) \
VSHUF(Chi_11,T1) \
VMADDSUB(Z0,T1,UChi_10) \
VMADDSUB(Z1,T1,UChi_11) \
VMADDSUB(Z2,T1,UChi_12) \
\
VMOVRDUP(1,%r8,Z3 ) \
VMOVRDUP(4,%r8,Z4 ) \
VMOVRDUP(7,%r8,Z5 ) \
VMADDSUB(Z3,Chi_01,UChi_00) \
VMADDSUB(Z4,Chi_01,UChi_01) \
VMADDSUB(Z5,Chi_01,UChi_02) \
\
VMOVRDUP(1,%r9,Z3 ) \
VMOVRDUP(4,%r9,Z4 ) \
VMOVRDUP(7,%r9,Z5 ) \
VMADDSUB(Z3,Chi_11,UChi_10) \
VMADDSUB(Z4,Chi_11,UChi_11) \
VMADDSUB(Z5,Chi_11,UChi_12) \
\
VSHUF(Chi_02,T0) \
VSHUF(Chi_12,T1) \
VMOVIDUP(2,%r8,Z0 ) \
VMOVIDUP(5,%r8,Z1 ) \
VMOVIDUP(8,%r8,Z2 ) \
VMADDSUB(Z0,T0,UChi_00) \
VMADDSUB(Z1,T0,UChi_01) \
VMADDSUB(Z2,T0,UChi_02) \
VMOVIDUP(2,%r9,Z0 ) \
VMOVIDUP(5,%r9,Z1 ) \
VMOVIDUP(8,%r9,Z2 ) \
VMADDSUB(Z0,T1,UChi_10) \
VMADDSUB(Z1,T1,UChi_11) \
VMADDSUB(Z2,T1,UChi_12) \
/*55*/ \
VMOVRDUP(2,%r8,Z3 ) \
VMOVRDUP(5,%r8,Z4 ) \
VMOVRDUP(8,%r8,Z5 ) \
VMADDSUB(Z3,Chi_02,UChi_00) \
VMADDSUB(Z4,Chi_02,UChi_01) \
VMADDSUB(Z5,Chi_02,UChi_02) \
VMOVRDUP(2,%r9,Z3 ) \
VMOVRDUP(5,%r9,Z4 ) \
VMOVRDUP(8,%r9,Z5 ) \
VMADDSUB(Z3,Chi_12,UChi_10) \
VMADDSUB(Z4,Chi_12,UChi_11) \
VMADDSUB(Z5,Chi_12,UChi_12) \
/*61 insns*/ );
#define MULT_ADD_XYZT(g0,g1) \
asm ( "movq %0, %%r8 \n\t" \
"movq %1, %%r9 \n\t" : : "r"(g0), "r"(g1) : "%r8","%r9");\
__asm__ ( \
VSHUFMEM(0,%r8,Z00) VSHUFMEM(0,%r9,Z10) \
VRDUP(Chi_00,T0) VIDUP(Chi_00,Chi_00) \
VRDUP(Chi_10,T1) VIDUP(Chi_10,Chi_10) \
VMUL(Z00,Chi_00,Z1) VMUL(Z10,Chi_10,Z2) \
VSHUFMEM(3,%r8,Z00) VSHUFMEM(3,%r9,Z10) \
VMUL(Z00,Chi_00,Z3) VMUL(Z10,Chi_10,Z4) \
VSHUFMEM(6,%r8,Z00) VSHUFMEM(6,%r9,Z10) \
VMUL(Z00,Chi_00,Z5) VMUL(Z10,Chi_10,Z6) \
VMADDMEM(0,%r8,T0,UChi_00) VMADDMEM(0,%r9,T1,UChi_10) \
VMADDMEM(3,%r8,T0,UChi_01) VMADDMEM(3,%r9,T1,UChi_11) \
VMADDMEM(6,%r8,T0,UChi_02) VMADDMEM(6,%r9,T1,UChi_12) \
VSHUFMEM(1,%r8,Z00) VSHUFMEM(1,%r9,Z10) \
VRDUP(Chi_01,T0) VIDUP(Chi_01,Chi_01) \
VRDUP(Chi_11,T1) VIDUP(Chi_11,Chi_11) \
VMADD(Z00,Chi_01,Z1) VMADD(Z10,Chi_11,Z2) \
VSHUFMEM(4,%r8,Z00) VSHUFMEM(4,%r9,Z10) \
VMADD(Z00,Chi_01,Z3) VMADD(Z10,Chi_11,Z4) \
VSHUFMEM(7,%r8,Z00) VSHUFMEM(7,%r9,Z10) \
VMADD(Z00,Chi_01,Z5) VMADD(Z10,Chi_11,Z6) \
VMADDMEM(1,%r8,T0,UChi_00) VMADDMEM(1,%r9,T1,UChi_10) \
VMADDMEM(4,%r8,T0,UChi_01) VMADDMEM(4,%r9,T1,UChi_11) \
VMADDMEM(7,%r8,T0,UChi_02) VMADDMEM(7,%r9,T1,UChi_12) \
VSHUFMEM(2,%r8,Z00) VSHUFMEM(2,%r9,Z10) \
VRDUP(Chi_02,T0) VIDUP(Chi_02,Chi_02) \
VRDUP(Chi_12,T1) VIDUP(Chi_12,Chi_12) \
VMADD(Z00,Chi_02,Z1) VMADD(Z10,Chi_12,Z2) \
VSHUFMEM(5,%r8,Z00) VSHUFMEM(5,%r9,Z10) \
VMADD(Z00,Chi_02,Z3) VMADD(Z10,Chi_12,Z4) \
VSHUFMEM(8,%r8,Z00) VSHUFMEM(8,%r9,Z10) \
VMADD(Z00,Chi_02,Z5) VMADD(Z10,Chi_12,Z6) \
VMADDSUBMEM(2,%r8,T0,Z1) VMADDSUBMEM(2,%r9,T1,Z2) \
VMADDSUBMEM(5,%r8,T0,Z3) VMADDSUBMEM(5,%r9,T1,Z4) \
VMADDSUBMEM(8,%r8,T0,Z5) VMADDSUBMEM(8,%r9,T1,Z6) \
VADD(Z1,UChi_00,UChi_00) VADD(Z2,UChi_10,UChi_10) \
VADD(Z3,UChi_01,UChi_01) VADD(Z4,UChi_11,UChi_11) \
VADD(Z5,UChi_02,UChi_02) VADD(Z6,UChi_12,UChi_12) );
#define MULT_XYZT(g0,g1) \
asm ( "movq %0, %%r8 \n\t" \
"movq %1, %%r9 \n\t" : : "r"(g0), "r"(g1) : "%r8","%r9" ); \
__asm__ ( \
VSHUF(Chi_00,T0) \
VSHUF(Chi_10,T1) \
VMOVIDUP(0,%r8,Z0 ) \
VMOVIDUP(3,%r8,Z1 ) \
VMOVIDUP(6,%r8,Z2 ) \
/*6*/ \
VMUL(Z0,T0,UChi_00) \
VMUL(Z1,T0,UChi_01) \
VMUL(Z2,T0,UChi_02) \
VMOVIDUP(0,%r9,Z0 ) \
VMOVIDUP(3,%r9,Z1 ) \
VMOVIDUP(6,%r9,Z2 ) \
VMUL(Z0,T1,UChi_10) \
VMUL(Z1,T1,UChi_11) \
VMUL(Z2,T1,UChi_12) \
VMOVRDUP(0,%r8,Z3 ) \
VMOVRDUP(3,%r8,Z4 ) \
VMOVRDUP(6,%r8,Z5 ) \
/*18*/ \
VMADDSUB(Z3,Chi_00,UChi_00) \
VMADDSUB(Z4,Chi_00,UChi_01)\
VMADDSUB(Z5,Chi_00,UChi_02) \
VMOVRDUP(0,%r9,Z3 ) \
VMOVRDUP(3,%r9,Z4 ) \
VMOVRDUP(6,%r9,Z5 ) \
VMADDSUB(Z3,Chi_10,UChi_10) \
VMADDSUB(Z4,Chi_10,UChi_11)\
VMADDSUB(Z5,Chi_10,UChi_12) \
VMOVIDUP(1,%r8,Z0 ) \
VMOVIDUP(4,%r8,Z1 ) \
VMOVIDUP(7,%r8,Z2 ) \
/*28*/ \
VSHUF(Chi_01,T0) \
VMADDSUB(Z0,T0,UChi_00) \
VMADDSUB(Z1,T0,UChi_01) \
VMADDSUB(Z2,T0,UChi_02) \
VMOVIDUP(1,%r9,Z0 ) \
VMOVIDUP(4,%r9,Z1 ) \
VMOVIDUP(7,%r9,Z2 ) \
VSHUF(Chi_11,T1) \
VMADDSUB(Z0,T1,UChi_10) \
VMADDSUB(Z1,T1,UChi_11) \
VMADDSUB(Z2,T1,UChi_12) \
VMOVRDUP(1,%r8,Z3 ) \
VMOVRDUP(4,%r8,Z4 ) \
VMOVRDUP(7,%r8,Z5 ) \
/*38*/ \
VMADDSUB(Z3,Chi_01,UChi_00) \
VMADDSUB(Z4,Chi_01,UChi_01) \
VMADDSUB(Z5,Chi_01,UChi_02) \
VMOVRDUP(1,%r9,Z3 ) \
VMOVRDUP(4,%r9,Z4 ) \
VMOVRDUP(7,%r9,Z5 ) \
VMADDSUB(Z3,Chi_11,UChi_10) \
VMADDSUB(Z4,Chi_11,UChi_11) \
VMADDSUB(Z5,Chi_11,UChi_12) \
/*48*/ \
VSHUF(Chi_02,T0) \
VSHUF(Chi_12,T1) \
VMOVIDUP(2,%r8,Z0 ) \
VMOVIDUP(5,%r8,Z1 ) \
VMOVIDUP(8,%r8,Z2 ) \
VMADDSUB(Z0,T0,UChi_00) \
VMADDSUB(Z1,T0,UChi_01) \
VMADDSUB(Z2,T0,UChi_02) \
VMOVIDUP(2,%r9,Z0 ) \
VMOVIDUP(5,%r9,Z1 ) \
VMOVIDUP(8,%r9,Z2 ) \
VMADDSUB(Z0,T1,UChi_10) \
VMADDSUB(Z1,T1,UChi_11) \
VMADDSUB(Z2,T1,UChi_12) \
/*55*/ \
VMOVRDUP(2,%r8,Z3 ) \
VMOVRDUP(5,%r8,Z4 ) \
VMOVRDUP(8,%r8,Z5 ) \
VMADDSUB(Z3,Chi_02,UChi_00) \
VMADDSUB(Z4,Chi_02,UChi_01) \
VMADDSUB(Z5,Chi_02,UChi_02) \
VMOVRDUP(2,%r9,Z3 ) \
VMOVRDUP(5,%r9,Z4 ) \
VMOVRDUP(8,%r9,Z5 ) \
VMADDSUB(Z3,Chi_12,UChi_10) \
VMADDSUB(Z4,Chi_12,UChi_11) \
VMADDSUB(Z5,Chi_12,UChi_12) \
/*61 insns*/ );
#define MULT_XYZTa(g0,g1) \
asm ( "movq %0, %%r8 \n\t" \
"movq %1, %%r9 \n\t" : : "r"(g0), "r"(g1) : "%r8","%r9" ); \
__asm__ ( \
VSHUFMEM(0,%r8,Z00) VSHUFMEM(0,%r9,Z10) \
VRDUP(Chi_00,T0) VIDUP(Chi_00,Chi_00) \
VRDUP(Chi_10,T1) VIDUP(Chi_10,Chi_10) \
VMUL(Z00,Chi_00,Z1) VMUL(Z10,Chi_10,Z2) \
VSHUFMEM(3,%r8,Z00) VSHUFMEM(3,%r9,Z10) \
VMUL(Z00,Chi_00,Z3) VMUL(Z10,Chi_10,Z4) \
VSHUFMEM(6,%r8,Z00) VSHUFMEM(6,%r9,Z10) \
VMUL(Z00,Chi_00,Z5) VMUL(Z10,Chi_10,Z6) \
VMULMEM(0,%r8,T0,UChi_00) VMULMEM(0,%r9,T1,UChi_10) \
VMULMEM(3,%r8,T0,UChi_01) VMULMEM(3,%r9,T1,UChi_11) \
VMULMEM(6,%r8,T0,UChi_02) VMULMEM(6,%r9,T1,UChi_12) \
VSHUFMEM(1,%r8,Z00) VSHUFMEM(1,%r9,Z10) \
VRDUP(Chi_01,T0) VIDUP(Chi_01,Chi_01) \
VRDUP(Chi_11,T1) VIDUP(Chi_11,Chi_11) \
VMADD(Z00,Chi_01,Z1) VMADD(Z10,Chi_11,Z2) \
VSHUFMEM(4,%r8,Z00) VSHUFMEM(4,%r9,Z10) \
VMADD(Z00,Chi_01,Z3) VMADD(Z10,Chi_11,Z4) \
VSHUFMEM(7,%r8,Z00) VSHUFMEM(7,%r9,Z10) \
VMADD(Z00,Chi_01,Z5) VMADD(Z10,Chi_11,Z6) \
VMADDMEM(1,%r8,T0,UChi_00) VMADDMEM(1,%r9,T1,UChi_10) \
VMADDMEM(4,%r8,T0,UChi_01) VMADDMEM(4,%r9,T1,UChi_11) \
VMADDMEM(7,%r8,T0,UChi_02) VMADDMEM(7,%r9,T1,UChi_12) \
VSHUFMEM(2,%r8,Z00) VSHUFMEM(2,%r9,Z10) \
VRDUP(Chi_02,T0) VIDUP(Chi_02,Chi_02) \
VRDUP(Chi_12,T1) VIDUP(Chi_12,Chi_12) \
VMADD(Z00,Chi_02,Z1) VMADD(Z10,Chi_12,Z2) \
VSHUFMEM(5,%r8,Z00) VSHUFMEM(5,%r9,Z10) \
VMADD(Z00,Chi_02,Z3) VMADD(Z10,Chi_12,Z4) \
VSHUFMEM(8,%r8,Z00) VSHUFMEM(8,%r9,Z10) \
VMADD(Z00,Chi_02,Z5) VMADD(Z10,Chi_12,Z6) \
VMADDSUBMEM(2,%r8,T0,Z1) VMADDSUBMEM(2,%r9,T1,Z2) \
VMADDSUBMEM(5,%r8,T0,Z3) VMADDSUBMEM(5,%r9,T1,Z4) \
VMADDSUBMEM(8,%r8,T0,Z5) VMADDSUBMEM(8,%r9,T1,Z6) \
VADD(Z1,UChi_00,UChi_00) VADD(Z2,UChi_10,UChi_10) \
VADD(Z3,UChi_01,UChi_01) VADD(Z4,UChi_11,UChi_11) \
VADD(Z5,UChi_02,UChi_02) VADD(Z6,UChi_12,UChi_12) );
#define LOAD_CHI(a0,a1,a2,a3) \
asm ( \
"movq %0, %%r8 \n\t" \
VLOAD(0,%%r8,pChi_00) \
VLOAD(1,%%r8,pChi_01) \
VLOAD(2,%%r8,pChi_02) \
: : "r" (a0) : "%r8" ); \
asm ( \
"movq %0, %%r8 \n\t" \
VLOAD(0,%%r8,pChi_10) \
VLOAD(1,%%r8,pChi_11) \
VLOAD(2,%%r8,pChi_12) \
: : "r" (a1) : "%r8" ); \
asm ( \
"movq %0, %%r8 \n\t" \
VLOAD(0,%%r8,pChi_20) \
VLOAD(1,%%r8,pChi_21) \
VLOAD(2,%%r8,pChi_22) \
: : "r" (a2) : "%r8" ); \
asm ( \
"movq %0, %%r8 \n\t" \
VLOAD(0,%%r8,pChi_30) \
VLOAD(1,%%r8,pChi_31) \
VLOAD(2,%%r8,pChi_32) \
: : "r" (a3) : "%r8" );
#define LOAD_CHIa(a0,a1) \
asm ( \
"movq %0, %%r8 \n\t" \
VLOAD(0,%%r8,pChi_00) \
VLOAD(1,%%r8,pChi_01) \
VLOAD(2,%%r8,pChi_02) \
: : "r" (a0) : "%r8" ); \
asm ( \
"movq %0, %%r8 \n\t" \
VLOAD(0,%%r8,pChi_10) \
VLOAD(1,%%r8,pChi_11) \
VLOAD(2,%%r8,pChi_12) \
: : "r" (a1) : "%r8" );
#define PF_CHI(a0)
#define PF_CHIa(a0) \
asm ( \
"movq %0, %%r8 \n\t" \
VPREFETCH1(0,%%r8) \
VPREFETCH1(1,%%r8) \
VPREFETCH1(2,%%r8) \
: : "r" (a0) : "%r8" ); \
#define PF_GAUGE_XYZT(a0)
#define PF_GAUGE_XYZTa(a0) \
asm ( \
"movq %0, %%r8 \n\t" \
VPREFETCH1(0,%%r8) \
VPREFETCH1(1,%%r8) \
VPREFETCH1(2,%%r8) \
VPREFETCH1(3,%%r8) \
VPREFETCH1(4,%%r8) \
VPREFETCH1(5,%%r8) \
VPREFETCH1(6,%%r8) \
VPREFETCH1(7,%%r8) \
VPREFETCH1(8,%%r8) \
: : "r" (a0) : "%r8" ); \
#define PF_GAUGE_LS(a0)
#define PF_GAUGE_LSa(a0) \
asm ( \
"movq %0, %%r8 \n\t" \
VPREFETCH1(0,%%r8) \
VPREFETCH1(1,%%r8) \
: : "r" (a0) : "%r8" ); \
#define REDUCE(out) \
asm ( \
VADD(UChi_00,UChi_10,UChi_00) \
VADD(UChi_01,UChi_11,UChi_01) \
VADD(UChi_02,UChi_12,UChi_02) \
VADD(UChi_30,UChi_20,UChi_30) \
VADD(UChi_31,UChi_21,UChi_31) \
VADD(UChi_32,UChi_22,UChi_32) \
VADD(UChi_00,UChi_30,UChi_00) \
VADD(UChi_01,UChi_31,UChi_01) \
VADD(UChi_02,UChi_32,UChi_02) ); \
asm ( \
VSTORE(0,%0,pUChi_00) \
VSTORE(1,%0,pUChi_01) \
VSTORE(2,%0,pUChi_02) \
: : "r" (out) : "memory" );
#define nREDUCE(out) \
asm ( \
VADD(UChi_00,UChi_10,UChi_00) \
VADD(UChi_01,UChi_11,UChi_01) \
VADD(UChi_02,UChi_12,UChi_02) \
VADD(UChi_30,UChi_20,UChi_30) \
VADD(UChi_31,UChi_21,UChi_31) \
VADD(UChi_32,UChi_22,UChi_32) \
VADD(UChi_00,UChi_30,UChi_00) \
VADD(UChi_01,UChi_31,UChi_01) \
VADD(UChi_02,UChi_32,UChi_02) ); \
asm (VZERO(Chi_00) \
VSUB(UChi_00,Chi_00,UChi_00) \
VSUB(UChi_01,Chi_00,UChi_01) \
VSUB(UChi_02,Chi_00,UChi_02) ); \
asm ( \
VSTORE(0,%0,pUChi_00) \
VSTORE(1,%0,pUChi_01) \
VSTORE(2,%0,pUChi_02) \
: : "r" (out) : "memory" );
#define REDUCEa(out) \
asm ( \
VADD(UChi_00,UChi_10,UChi_00) \
VADD(UChi_01,UChi_11,UChi_01) \
VADD(UChi_02,UChi_12,UChi_02) ); \
asm ( \
VSTORE(0,%0,pUChi_00) \
VSTORE(1,%0,pUChi_01) \
VSTORE(2,%0,pUChi_02) \
: : "r" (out) : "memory" );
// FIXME is sign right in the VSUB ?
#define nREDUCEa(out) \
asm ( \
VADD(UChi_00,UChi_10,UChi_00) \
VADD(UChi_01,UChi_11,UChi_01) \
VADD(UChi_02,UChi_12,UChi_02) ); \
asm (VZERO(Chi_00) \
VSUB(UChi_00,Chi_00,UChi_00) \
VSUB(UChi_01,Chi_00,UChi_01) \
VSUB(UChi_02,Chi_00,UChi_02) ); \
asm ( \
VSTORE(0,%0,pUChi_00) \
VSTORE(1,%0,pUChi_01) \
VSTORE(2,%0,pUChi_02) \
: : "r" (out) : "memory" );
#define PERMUTE_DIR(dir) \
permute##dir(Chi_0,Chi_0);\
permute##dir(Chi_1,Chi_1);\
permute##dir(Chi_2,Chi_2);
namespace Grid {
namespace QCD {
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag)
{
assert(0);
};
//#define CONDITIONAL_MOVE(l,o,out) if ( l ) { out = (uint64_t) &in._odata[o] ; } else { out =(uint64_t) &buf[o]; }
#define CONDITIONAL_MOVE(l,o,out) { const SiteSpinor *ptr = l? in_p : buf; out = (uint64_t) &ptr[o]; }
#define PREPARE_XYZT(X,Y,Z,T,skew,UU) \
PREPARE(X,Y,Z,T,skew,UU); \
PF_GAUGE_XYZT(gauge0); \
PF_GAUGE_XYZT(gauge1); \
PF_GAUGE_XYZT(gauge2); \
PF_GAUGE_XYZT(gauge3);
#define PREPARE_LS(X,Y,Z,T,skew,UU) \
PREPARE(X,Y,Z,T,skew,UU); \
PF_GAUGE_LS(gauge0); \
PF_GAUGE_LS(gauge1); \
PF_GAUGE_LS(gauge2); \
PF_GAUGE_LS(gauge3);
#define PREPARE(X,Y,Z,T,skew,UU) \
SE0=st.GetEntry(ptype,X+skew,sF); \
o0 = SE0->_offset; \
l0 = SE0->_is_local; \
p0 = SE0->_permute; \
CONDITIONAL_MOVE(l0,o0,addr0); \
PF_CHI(addr0); \
\
SE1=st.GetEntry(ptype,Y+skew,sF); \
o1 = SE1->_offset; \
l1 = SE1->_is_local; \
p1 = SE1->_permute; \
CONDITIONAL_MOVE(l1,o1,addr1); \
PF_CHI(addr1); \
\
SE2=st.GetEntry(ptype,Z+skew,sF); \
o2 = SE2->_offset; \
l2 = SE2->_is_local; \
p2 = SE2->_permute; \
CONDITIONAL_MOVE(l2,o2,addr2); \
PF_CHI(addr2); \
\
SE3=st.GetEntry(ptype,T+skew,sF); \
o3 = SE3->_offset; \
l3 = SE3->_is_local; \
p3 = SE3->_permute; \
CONDITIONAL_MOVE(l3,o3,addr3); \
PF_CHI(addr3); \
\
gauge0 =(uint64_t)&UU._odata[sU]( X ); \
gauge1 =(uint64_t)&UU._odata[sU]( Y ); \
gauge2 =(uint64_t)&UU._odata[sU]( Z ); \
gauge3 =(uint64_t)&UU._odata[sU]( T );
// This is the single precision 5th direction vectorised kernel
#include <simd/Intel512single.h>
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag)
{
#ifdef AVX512
uint64_t gauge0,gauge1,gauge2,gauge3;
uint64_t addr0,addr1,addr2,addr3;
const SiteSpinor *in_p; in_p = &in._odata[0];
int o0,o1,o2,o3; // offsets
int l0,l1,l2,l3; // local
int p0,p1,p2,p3; // perm
int ptype;
StencilEntry *SE0;
StencilEntry *SE1;
StencilEntry *SE2;
StencilEntry *SE3;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
// Xp, Yp, Zp, Tp
PREPARE(Xp,Yp,Zp,Tp,0,U);
LOAD_CHI(addr0,addr1,addr2,addr3);
MULT_LS(gauge0,gauge1,gauge2,gauge3);
PREPARE(Xm,Ym,Zm,Tm,0,U);
LOAD_CHI(addr0,addr1,addr2,addr3);
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
PREPARE(Xp,Yp,Zp,Tp,8,UUU);
LOAD_CHI(addr0,addr1,addr2,addr3);
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
PREPARE(Xm,Ym,Zm,Tm,8,UUU);
LOAD_CHI(addr0,addr1,addr2,addr3);
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
addr0 = (uint64_t) &out._odata[sF];
if ( dag ) {
nREDUCE(addr0);
} else {
REDUCE(addr0);
}
}
#else
assert(0);
#endif
}
#include <simd/Intel512double.h>
template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag)
{
#ifdef AVX512
uint64_t gauge0,gauge1,gauge2,gauge3;
uint64_t addr0,addr1,addr2,addr3;
const SiteSpinor *in_p; in_p = &in._odata[0];
int o0,o1,o2,o3; // offsets
int l0,l1,l2,l3; // local
int p0,p1,p2,p3; // perm
int ptype;
StencilEntry *SE0;
StencilEntry *SE1;
StencilEntry *SE2;
StencilEntry *SE3;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
// Xp, Yp, Zp, Tp
PREPARE(Xp,Yp,Zp,Tp,0,U);
LOAD_CHI(addr0,addr1,addr2,addr3);
MULT_LS(gauge0,gauge1,gauge2,gauge3);
PREPARE(Xm,Ym,Zm,Tm,0,U);
LOAD_CHI(addr0,addr1,addr2,addr3);
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
PREPARE(Xp,Yp,Zp,Tp,8,UUU);
LOAD_CHI(addr0,addr1,addr2,addr3);
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
PREPARE(Xm,Ym,Zm,Tm,8,UUU);
LOAD_CHI(addr0,addr1,addr2,addr3);
MULT_ADD_LS(gauge0,gauge1,gauge2,gauge3);
addr0 = (uint64_t) &out._odata[sF];
if ( dag ) {
nREDUCE(addr0);
} else {
REDUCE(addr0);
}
}
#else
assert(0);
#endif
}
#define PERMUTE_DIR3 __asm__ ( \
VPERM3(Chi_00,Chi_00) \
VPERM3(Chi_01,Chi_01) \
VPERM3(Chi_02,Chi_02) );
#define PERMUTE_DIR2 __asm__ ( \
VPERM2(Chi_10,Chi_10) \
VPERM2(Chi_11,Chi_11) \
VPERM2(Chi_12,Chi_12) );
#define PERMUTE_DIR1 __asm__ ( \
VPERM1(Chi_00,Chi_00) \
VPERM1(Chi_01,Chi_01) \
VPERM1(Chi_02,Chi_02) );
#define PERMUTE_DIR0 __asm__ ( \
VPERM0(Chi_10,Chi_10) \
VPERM0(Chi_11,Chi_11) \
VPERM0(Chi_12,Chi_12) );
#define PERMUTE01 \
if ( p0 ) { PERMUTE_DIR3; }\
if ( p1 ) { PERMUTE_DIR2; }
#define PERMUTE23 \
if ( p2 ) { PERMUTE_DIR1; }\
if ( p3 ) { PERMUTE_DIR0; }
// This is the single precision 5th direction vectorised kernel
#include <simd/Intel512single.h>
template <> void StaggeredKernels<StaggeredImplF>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag)
{
#ifdef AVX512
uint64_t gauge0,gauge1,gauge2,gauge3;
uint64_t addr0,addr1,addr2,addr3;
const SiteSpinor *in_p; in_p = &in._odata[0];
int o0,o1,o2,o3; // offsets
int l0,l1,l2,l3; // local
int p0,p1,p2,p3; // perm
int ptype;
StencilEntry *SE0;
StencilEntry *SE1;
StencilEntry *SE2;
StencilEntry *SE3;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
// Xp, Yp, Zp, Tp
PREPARE(Xp,Yp,Zp,Tp,0,U);
LOAD_CHIa(addr0,addr1);
PERMUTE01;
MULT_XYZT(gauge0,gauge1);
LOAD_CHIa(addr2,addr3);
PERMUTE23;
MULT_ADD_XYZT(gauge2,gauge3);
PREPARE(Xm,Ym,Zm,Tm,0,U);
LOAD_CHIa(addr0,addr1);
PERMUTE01;
MULT_ADD_XYZT(gauge0,gauge1);
LOAD_CHIa(addr2,addr3);
PERMUTE23;
MULT_ADD_XYZT(gauge2,gauge3);
PREPARE(Xp,Yp,Zp,Tp,8,UUU);
LOAD_CHIa(addr0,addr1);
PERMUTE01;
MULT_ADD_XYZT(gauge0,gauge1);
LOAD_CHIa(addr2,addr3);
PERMUTE23;
MULT_ADD_XYZT(gauge2,gauge3);
PREPARE(Xm,Ym,Zm,Tm,8,UUU);
LOAD_CHIa(addr0,addr1);
PERMUTE01;
MULT_ADD_XYZT(gauge0,gauge1);
LOAD_CHIa(addr2,addr3);
PERMUTE23;
MULT_ADD_XYZT(gauge2,gauge3);
addr0 = (uint64_t) &out._odata[sF];
if ( dag ) {
nREDUCEa(addr0);
} else {
REDUCEa(addr0);
}
}
#else
assert(0);
#endif
}
#include <simd/Intel512double.h>
template <> void StaggeredKernels<StaggeredImplD>::DhopSiteAsm(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag)
{
#ifdef AVX512
uint64_t gauge0,gauge1,gauge2,gauge3;
uint64_t addr0,addr1,addr2,addr3;
const SiteSpinor *in_p; in_p = &in._odata[0];
int o0,o1,o2,o3; // offsets
int l0,l1,l2,l3; // local
int p0,p1,p2,p3; // perm
int ptype;
StencilEntry *SE0;
StencilEntry *SE1;
StencilEntry *SE2;
StencilEntry *SE3;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
// Xp, Yp, Zp, Tp
PREPARE(Xp,Yp,Zp,Tp,0,U);
LOAD_CHIa(addr0,addr1);
PERMUTE01;
MULT_XYZT(gauge0,gauge1);
LOAD_CHIa(addr2,addr3);
PERMUTE23;
MULT_ADD_XYZT(gauge2,gauge3);
PREPARE(Xm,Ym,Zm,Tm,0,U);
LOAD_CHIa(addr0,addr1);
PERMUTE01;
MULT_ADD_XYZT(gauge0,gauge1);
LOAD_CHIa(addr2,addr3);
PERMUTE23;
MULT_ADD_XYZT(gauge2,gauge3);
PREPARE(Xp,Yp,Zp,Tp,8,UUU);
LOAD_CHIa(addr0,addr1);
PERMUTE01;
MULT_ADD_XYZT(gauge0,gauge1);
LOAD_CHIa(addr2,addr3);
PERMUTE23;
MULT_ADD_XYZT(gauge2,gauge3);
PREPARE(Xm,Ym,Zm,Tm,8,UUU);
LOAD_CHIa(addr0,addr1);
PERMUTE01;
MULT_ADD_XYZT(gauge0,gauge1);
LOAD_CHIa(addr2,addr3);
PERMUTE23;
MULT_ADD_XYZT(gauge2,gauge3);
addr0 = (uint64_t) &out._odata[sF];
if ( dag ) {
nREDUCEa(addr0);
} else {
REDUCEa(addr0);
}
}
#else
assert(0);
#endif
}
#define KERNEL_INSTANTIATE(CLASS,FUNC,IMPL) \
template void CLASS<IMPL>::FUNC(StencilImpl &st, LebesgueOrder &lo, \
DoubledGaugeField &U, \
DoubledGaugeField &UUU, \
SiteSpinor *buf, int LLs, \
int sU, const FermionField &in, FermionField &out,int dag);
KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplD);
KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredImplF);
KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredVec5dImplD);
KERNEL_INSTANTIATE(StaggeredKernels,DhopSiteAsm,StaggeredVec5dImplF);
}}

View File

@ -1,399 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/StaggerdKernelsHand.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid.h>
#define LOAD_CHI(b) \
const SiteSpinor & ref (b[offset]); \
Chi_0=ref()()(0);\
Chi_1=ref()()(1);\
Chi_2=ref()()(2);
// To splat or not to splat depends on the implementation
#define MULT(A,UChi) \
auto & ref(U._odata[sU](A)); \
Impl::loadLinkElement(U_00,ref()(0,0)); \
Impl::loadLinkElement(U_10,ref()(1,0)); \
Impl::loadLinkElement(U_20,ref()(2,0)); \
Impl::loadLinkElement(U_01,ref()(0,1)); \
Impl::loadLinkElement(U_11,ref()(1,1)); \
Impl::loadLinkElement(U_21,ref()(2,1)); \
Impl::loadLinkElement(U_02,ref()(0,2)); \
Impl::loadLinkElement(U_12,ref()(1,2)); \
Impl::loadLinkElement(U_22,ref()(2,2)); \
UChi ## _0 = U_00*Chi_0; \
UChi ## _1 = U_10*Chi_0;\
UChi ## _2 = U_20*Chi_0;\
UChi ## _0 += U_01*Chi_1;\
UChi ## _1 += U_11*Chi_1;\
UChi ## _2 += U_21*Chi_1;\
UChi ## _0 += U_02*Chi_2;\
UChi ## _1 += U_12*Chi_2;\
UChi ## _2 += U_22*Chi_2;
#define MULT_ADD(U,A,UChi) \
auto & ref(U._odata[sU](A)); \
Impl::loadLinkElement(U_00,ref()(0,0)); \
Impl::loadLinkElement(U_10,ref()(1,0)); \
Impl::loadLinkElement(U_20,ref()(2,0)); \
Impl::loadLinkElement(U_01,ref()(0,1)); \
Impl::loadLinkElement(U_11,ref()(1,1)); \
Impl::loadLinkElement(U_21,ref()(2,1)); \
Impl::loadLinkElement(U_02,ref()(0,2)); \
Impl::loadLinkElement(U_12,ref()(1,2)); \
Impl::loadLinkElement(U_22,ref()(2,2)); \
UChi ## _0 += U_00*Chi_0; \
UChi ## _1 += U_10*Chi_0;\
UChi ## _2 += U_20*Chi_0;\
UChi ## _0 += U_01*Chi_1;\
UChi ## _1 += U_11*Chi_1;\
UChi ## _2 += U_21*Chi_1;\
UChi ## _0 += U_02*Chi_2;\
UChi ## _1 += U_12*Chi_2;\
UChi ## _2 += U_22*Chi_2;
#define PERMUTE_DIR(dir) \
permute##dir(Chi_0,Chi_0); \
permute##dir(Chi_1,Chi_1); \
permute##dir(Chi_2,Chi_2);
#define HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
SE=st.GetEntry(ptype,Dir+skew,sF); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHI(in._odata); \
if ( perm) { \
PERMUTE_DIR(Perm); \
} \
} else { \
LOAD_CHI(buf); \
}
#define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even) \
HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
{ \
MULT(Dir,even); \
}
#define HAND_STENCIL_LEG(U,Dir,Perm,skew,even) \
HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
{ \
MULT_ADD(U,Dir,even); \
}
#define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even) \
SE=st.GetEntry(ptype,Dir+skew,sF); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHI(in._odata); \
if ( perm) { \
PERMUTE_DIR(Perm); \
} \
} else if ( st.same_node[Dir] ) { \
LOAD_CHI(buf); \
} \
if (SE->_is_local || st.same_node[Dir] ) { \
MULT_ADD(U,Dir,even); \
}
#define HAND_STENCIL_LEG_EXT(U,Dir,Perm,skew,even) \
SE=st.GetEntry(ptype,Dir+skew,sF); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
nmu++; \
{ LOAD_CHI(buf); } \
{ MULT_ADD(U,Dir,even); } \
}
namespace Grid {
namespace QCD {
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag)
{
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
Simd even_0; // 12 regs on knc
Simd even_1;
Simd even_2;
Simd odd_0; // 12 regs on knc
Simd odd_1;
Simd odd_2;
Simd Chi_0; // two spinor; 6 regs
Simd Chi_1;
Simd Chi_2;
Simd U_00; // two rows of U matrix
Simd U_10;
Simd U_20;
Simd U_01;
Simd U_11;
Simd U_21; // 2 reg left.
Simd U_02;
Simd U_12;
Simd U_22;
SiteSpinor result;
int offset,local,perm, ptype;
StencilEntry *SE;
int skew;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
skew = 0;
HAND_STENCIL_LEG_BEGIN(Xp,3,skew,even);
HAND_STENCIL_LEG_BEGIN(Yp,2,skew,odd);
HAND_STENCIL_LEG (U,Zp,1,skew,even);
HAND_STENCIL_LEG (U,Tp,0,skew,odd);
HAND_STENCIL_LEG (U,Xm,3,skew,even);
HAND_STENCIL_LEG (U,Ym,2,skew,odd);
HAND_STENCIL_LEG (U,Zm,1,skew,even);
HAND_STENCIL_LEG (U,Tm,0,skew,odd);
skew = 8;
HAND_STENCIL_LEG(UUU,Xp,3,skew,even);
HAND_STENCIL_LEG(UUU,Yp,2,skew,odd);
HAND_STENCIL_LEG(UUU,Zp,1,skew,even);
HAND_STENCIL_LEG(UUU,Tp,0,skew,odd);
HAND_STENCIL_LEG(UUU,Xm,3,skew,even);
HAND_STENCIL_LEG(UUU,Ym,2,skew,odd);
HAND_STENCIL_LEG(UUU,Zm,1,skew,even);
HAND_STENCIL_LEG(UUU,Tm,0,skew,odd);
if ( dag ) {
result()()(0) = - even_0 - odd_0;
result()()(1) = - even_1 - odd_1;
result()()(2) = - even_2 - odd_2;
} else {
result()()(0) = even_0 + odd_0;
result()()(1) = even_1 + odd_1;
result()()(2) = even_2 + odd_2;
}
vstream(out._odata[sF],result);
}
}
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag)
{
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
Simd even_0; // 12 regs on knc
Simd even_1;
Simd even_2;
Simd odd_0; // 12 regs on knc
Simd odd_1;
Simd odd_2;
Simd Chi_0; // two spinor; 6 regs
Simd Chi_1;
Simd Chi_2;
Simd U_00; // two rows of U matrix
Simd U_10;
Simd U_20;
Simd U_01;
Simd U_11;
Simd U_21; // 2 reg left.
Simd U_02;
Simd U_12;
Simd U_22;
SiteSpinor result;
int offset,local,perm, ptype;
StencilEntry *SE;
int skew;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
even_0 = zero; even_1 = zero; even_2 = zero;
odd_0 = zero; odd_1 = zero; odd_2 = zero;
skew = 0;
HAND_STENCIL_LEG_INT(U,Xp,3,skew,even);
HAND_STENCIL_LEG_INT(U,Yp,2,skew,odd);
HAND_STENCIL_LEG_INT(U,Zp,1,skew,even);
HAND_STENCIL_LEG_INT(U,Tp,0,skew,odd);
HAND_STENCIL_LEG_INT(U,Xm,3,skew,even);
HAND_STENCIL_LEG_INT(U,Ym,2,skew,odd);
HAND_STENCIL_LEG_INT(U,Zm,1,skew,even);
HAND_STENCIL_LEG_INT(U,Tm,0,skew,odd);
skew = 8;
HAND_STENCIL_LEG_INT(UUU,Xp,3,skew,even);
HAND_STENCIL_LEG_INT(UUU,Yp,2,skew,odd);
HAND_STENCIL_LEG_INT(UUU,Zp,1,skew,even);
HAND_STENCIL_LEG_INT(UUU,Tp,0,skew,odd);
HAND_STENCIL_LEG_INT(UUU,Xm,3,skew,even);
HAND_STENCIL_LEG_INT(UUU,Ym,2,skew,odd);
HAND_STENCIL_LEG_INT(UUU,Zm,1,skew,even);
HAND_STENCIL_LEG_INT(UUU,Tm,0,skew,odd);
// Assume every site must be connected to at least one interior point. No 1^4 subvols.
if ( dag ) {
result()()(0) = - even_0 - odd_0;
result()()(1) = - even_1 - odd_1;
result()()(2) = - even_2 - odd_2;
} else {
result()()(0) = even_0 + odd_0;
result()()(1) = even_1 + odd_1;
result()()(2) = even_2 + odd_2;
}
vstream(out._odata[sF],result);
}
}
template <class Impl>
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
SiteSpinor *buf, int LLs, int sU,
const FermionField &in, FermionField &out,int dag)
{
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
Simd even_0; // 12 regs on knc
Simd even_1;
Simd even_2;
Simd odd_0; // 12 regs on knc
Simd odd_1;
Simd odd_2;
Simd Chi_0; // two spinor; 6 regs
Simd Chi_1;
Simd Chi_2;
Simd U_00; // two rows of U matrix
Simd U_10;
Simd U_20;
Simd U_01;
Simd U_11;
Simd U_21; // 2 reg left.
Simd U_02;
Simd U_12;
Simd U_22;
SiteSpinor result;
int offset,local,perm, ptype;
StencilEntry *SE;
int skew;
for(int s=0;s<LLs;s++){
int sF=s+LLs*sU;
even_0 = zero; even_1 = zero; even_2 = zero;
odd_0 = zero; odd_1 = zero; odd_2 = zero;
int nmu=0;
skew = 0;
HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even);
HAND_STENCIL_LEG_EXT(U,Yp,2,skew,odd);
HAND_STENCIL_LEG_EXT(U,Zp,1,skew,even);
HAND_STENCIL_LEG_EXT(U,Tp,0,skew,odd);
HAND_STENCIL_LEG_EXT(U,Xm,3,skew,even);
HAND_STENCIL_LEG_EXT(U,Ym,2,skew,odd);
HAND_STENCIL_LEG_EXT(U,Zm,1,skew,even);
HAND_STENCIL_LEG_EXT(U,Tm,0,skew,odd);
skew = 8;
HAND_STENCIL_LEG_EXT(UUU,Xp,3,skew,even);
HAND_STENCIL_LEG_EXT(UUU,Yp,2,skew,odd);
HAND_STENCIL_LEG_EXT(UUU,Zp,1,skew,even);
HAND_STENCIL_LEG_EXT(UUU,Tp,0,skew,odd);
HAND_STENCIL_LEG_EXT(UUU,Xm,3,skew,even);
HAND_STENCIL_LEG_EXT(UUU,Ym,2,skew,odd);
HAND_STENCIL_LEG_EXT(UUU,Zm,1,skew,even);
HAND_STENCIL_LEG_EXT(UUU,Tm,0,skew,odd);
// Add sum of all exterior connected stencil legs
if ( nmu ) {
if ( dag ) {
result()()(0) = - even_0 - odd_0;
result()()(1) = - even_1 - odd_1;
result()()(2) = - even_2 - odd_2;
} else {
result()()(0) = even_0 + odd_0;
result()()(1) = even_1 + odd_1;
result()()(2) = even_2 + odd_2;
}
out._odata[sF] = out._odata[sF] + result;
}
}
}
#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \
template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
DoubledGaugeField &U,DoubledGaugeField &UUU, \
SiteSpinor *buf, int LLs, int sU, \
const FermionField &in, FermionField &out, int dag); \
\
template void StaggeredKernels<IMPL>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \
DoubledGaugeField &U,DoubledGaugeField &UUU, \
SiteSpinor *buf, int LLs, int sU, \
const FermionField &in, FermionField &out, int dag); \
\
template void StaggeredKernels<IMPL>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \
DoubledGaugeField &U,DoubledGaugeField &UUU, \
SiteSpinor *buf, int LLs, int sU, \
const FermionField &in, FermionField &out, int dag); \
DHOP_SITE_HAND_INSTANTIATE(StaggeredImplD);
DHOP_SITE_HAND_INSTANTIATE(StaggeredImplF);
DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplD);
DHOP_SITE_HAND_INSTANTIATE(StaggeredVec5dImplF);
}
}

View File

@ -1,243 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
Copyright (C) 2017
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
//#include <Grid/Eigen/Dense>
#include <Grid/qcd/spin/Dirac.h>
namespace Grid
{
namespace QCD
{
// *NOT* EO
template <class Impl>
RealD WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
{
FermionField temp(out._grid);
// Wilson term
out.checkerboard = in.checkerboard;
this->Dhop(in, out, DaggerNo);
// Clover term
Mooee(in, temp);
out += temp;
return norm2(out);
}
template <class Impl>
RealD WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
{
FermionField temp(out._grid);
// Wilson term
out.checkerboard = in.checkerboard;
this->Dhop(in, out, DaggerYes);
// Clover term
MooeeDag(in, temp);
out += temp;
return norm2(out);
}
template <class Impl>
void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
{
WilsonFermion<Impl>::ImportGauge(_Umu);
GridBase *grid = _Umu._grid;
typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
// Compute the field strength terms mu>nu
WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
// Compute the Clover Operator acting on Colour and Spin
// multiply here by the clover coefficients for the anisotropy
CloverTerm = fillCloverYZ(Bx) * csw_r;
CloverTerm += fillCloverXZ(By) * csw_r;
CloverTerm += fillCloverXY(Bz) * csw_r;
CloverTerm += fillCloverXT(Ex) * csw_t;
CloverTerm += fillCloverYT(Ey) * csw_t;
CloverTerm += fillCloverZT(Ez) * csw_t;
CloverTerm += diag_mass;
int lvol = _Umu._grid->lSites();
int DimRep = Impl::Dimension;
Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
std::vector<int> lcoor;
typename SiteCloverType::scalar_object Qx = zero, Qxinv = zero;
for (int site = 0; site < lvol; site++)
{
grid->LocalIndexToLocalCoor(site, lcoor);
EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
peekLocalSite(Qx, CloverTerm, lcoor);
Qxinv = zero;
//if (csw!=0){
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++)
EigenCloverOp(a + j * DimRep, b + k * DimRep) = Qx()(j, k)(a, b);
// if (site==0) std::cout << "site =" << site << "\n" << EigenCloverOp << std::endl;
EigenInvCloverOp = EigenCloverOp.inverse();
//std::cout << EigenInvCloverOp << std::endl;
for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++)
for (int a = 0; a < DimRep; a++)
for (int b = 0; b < DimRep; b++)
Qxinv()(j, k)(a, b) = EigenInvCloverOp(a + j * DimRep, b + k * DimRep);
// if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
// }
pokeLocalSite(Qxinv, CloverTermInv, lcoor);
}
// Separate the even and odd parts
pickCheckerboard(Even, CloverTermEven, CloverTerm);
pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
pickCheckerboard(Even, CloverTermDagEven, adj(CloverTerm));
pickCheckerboard(Odd, CloverTermDagOdd, adj(CloverTerm));
pickCheckerboard(Even, CloverTermInvEven, CloverTermInv);
pickCheckerboard(Odd, CloverTermInvOdd, CloverTermInv);
pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
}
template <class Impl>
void WilsonCloverFermion<Impl>::Mooee(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerNo, InverseNo);
}
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerYes, InverseNo);
}
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerNo, InverseYes);
}
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out)
{
this->MooeeInternal(in, out, DaggerYes, InverseYes);
}
template <class Impl>
void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
{
out.checkerboard = in.checkerboard;
CloverFieldType *Clover;
assert(in.checkerboard == Odd || in.checkerboard == Even);
if (dag)
{
if (in._grid->_isCheckerBoarded)
{
if (in.checkerboard == Odd)
{
Clover = (inv) ? &CloverTermInvDagOdd : &CloverTermDagOdd;
}
else
{
Clover = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven;
}
out = *Clover * in;
}
else
{
Clover = (inv) ? &CloverTermInv : &CloverTerm;
out = adj(*Clover) * in;
}
}
else
{
if (in._grid->_isCheckerBoarded)
{
if (in.checkerboard == Odd)
{
// std::cout << "Calling clover term Odd" << std::endl;
Clover = (inv) ? &CloverTermInvOdd : &CloverTermOdd;
}
else
{
// std::cout << "Calling clover term Even" << std::endl;
Clover = (inv) ? &CloverTermInvEven : &CloverTermEven;
}
out = *Clover * in;
// std::cout << GridLogMessage << "*Clover.checkerboard " << (*Clover).checkerboard << std::endl;
}
else
{
Clover = (inv) ? &CloverTermInv : &CloverTerm;
out = *Clover * in;
}
}
} // MooeeInternal
// Derivative parts
template <class Impl>
void WilsonCloverFermion<Impl>::MooDeriv(GaugeField &mat, const FermionField &X, const FermionField &Y, int dag)
{
assert(0);
}
// Derivative parts
template <class Impl>
void WilsonCloverFermion<Impl>::MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag)
{
assert(0); // not implemented yet
}
FermOpTemplateInstantiate(WilsonCloverFermion);
AdjointFermOpTemplateInstantiate(WilsonCloverFermion);
TwoIndexFermOpTemplateInstantiate(WilsonCloverFermion);
//GparityFermOpTemplateInstantiate(WilsonCloverFermion);
}
}

View File

@ -1,366 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.h
Copyright (C) 2017
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: David Preti <>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_WILSON_CLOVER_FERMION_H
#define GRID_QCD_WILSON_CLOVER_FERMION_H
#include <Grid/Grid.h>
namespace Grid
{
namespace QCD
{
///////////////////////////////////////////////////////////////////
// Wilson Clover
//
// Operator ( with anisotropy coefficients):
//
// Q = 1 + (Nd-1)/xi_0 + m
// + W_t + (nu/xi_0) * W_s
// - 1/2*[ csw_t * sum_s (sigma_ts F_ts) + (csw_s/xi_0) * sum_ss (sigma_ss F_ss) ]
//
// s spatial, t temporal directions.
// where W_t and W_s are the temporal and spatial components of the
// Wilson Dirac operator
//
// csw_r = csw_t to recover the isotropic version
//////////////////////////////////////////////////////////////////
template <class Impl>
class WilsonCloverFermion : public WilsonFermion<Impl>
{
public:
// Types definitions
INHERIT_IMPL_TYPES(Impl);
template <typename vtype>
using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
typedef iImplClover<Simd> SiteCloverType;
typedef Lattice<SiteCloverType> CloverFieldType;
public:
typedef WilsonFermion<Impl> WilsonBase;
virtual void Instantiatable(void){};
// Constructors
WilsonCloverFermion(GaugeField &_Umu, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid,
const RealD _mass,
const RealD _csw_r = 0.0,
const RealD _csw_t = 0.0,
const WilsonAnisotropyCoefficients &clover_anisotropy = WilsonAnisotropyCoefficients(),
const ImplParams &impl_p = ImplParams()) : WilsonFermion<Impl>(_Umu,
Fgrid,
Hgrid,
_mass, impl_p, clover_anisotropy),
CloverTerm(&Fgrid),
CloverTermInv(&Fgrid),
CloverTermEven(&Hgrid),
CloverTermOdd(&Hgrid),
CloverTermInvEven(&Hgrid),
CloverTermInvOdd(&Hgrid),
CloverTermDagEven(&Hgrid),
CloverTermDagOdd(&Hgrid),
CloverTermInvDagEven(&Hgrid),
CloverTermInvDagOdd(&Hgrid)
{
assert(Nd == 4); // require 4 dimensions
if (clover_anisotropy.isAnisotropic)
{
csw_r = _csw_r * 0.5 / clover_anisotropy.xi_0;
diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
}
else
{
csw_r = _csw_r * 0.5;
diag_mass = 4.0 + _mass;
}
csw_t = _csw_t * 0.5;
if (csw_r == 0)
std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
if (csw_t == 0)
std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
ImportGauge(_Umu);
}
virtual RealD M(const FermionField &in, FermionField &out);
virtual RealD Mdag(const FermionField &in, FermionField &out);
virtual void Mooee(const FermionField &in, FermionField &out);
virtual void MooeeDag(const FermionField &in, FermionField &out);
virtual void MooeeInv(const FermionField &in, FermionField &out);
virtual void MooeeInvDag(const FermionField &in, FermionField &out);
virtual void MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv);
//virtual void MDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
virtual void MooDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
virtual void MeeDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
void ImportGauge(const GaugeField &_Umu);
// Derivative parts unpreconditioned pseudofermions
void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
{
conformable(X._grid, Y._grid);
conformable(X._grid, force._grid);
GaugeLinkField force_mu(force._grid), lambda(force._grid);
GaugeField clover_force(force._grid);
PropagatorField Lambda(force._grid);
// Guido: Here we are hitting some performance issues:
// need to extract the components of the DoubledGaugeField
// for each call
// Possible solution
// Create a vector object to store them? (cons: wasting space)
std::vector<GaugeLinkField> U(Nd, this->Umu._grid);
Impl::extractLinkField(U, this->Umu);
force = zero;
// Derivative of the Wilson hopping term
this->DhopDeriv(force, X, Y, dag);
///////////////////////////////////////////////////////////
// Clover term derivative
///////////////////////////////////////////////////////////
Impl::outerProductImpl(Lambda, X, Y);
//std::cout << "Lambda:" << Lambda << std::endl;
Gamma::Algebra sigma[] = {
Gamma::Algebra::SigmaXY,
Gamma::Algebra::SigmaXZ,
Gamma::Algebra::SigmaXT,
Gamma::Algebra::MinusSigmaXY,
Gamma::Algebra::SigmaYZ,
Gamma::Algebra::SigmaYT,
Gamma::Algebra::MinusSigmaXZ,
Gamma::Algebra::MinusSigmaYZ,
Gamma::Algebra::SigmaZT,
Gamma::Algebra::MinusSigmaXT,
Gamma::Algebra::MinusSigmaYT,
Gamma::Algebra::MinusSigmaZT};
/*
sigma_{\mu \nu}=
| 0 sigma[0] sigma[1] sigma[2] |
| sigma[3] 0 sigma[4] sigma[5] |
| sigma[6] sigma[7] 0 sigma[8] |
| sigma[9] sigma[10] sigma[11] 0 |
*/
int count = 0;
clover_force = zero;
for (int mu = 0; mu < 4; mu++)
{
force_mu = zero;
for (int nu = 0; nu < 4; nu++)
{
if (mu == nu)
continue;
RealD factor;
if (nu == 4 || mu == 4)
{
factor = 2.0 * csw_t;
}
else
{
factor = 2.0 * csw_r;
}
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
force_mu -= factor*Cmunu(U, lambda, mu, nu); // checked
count++;
}
pokeLorentz(clover_force, U[mu] * force_mu, mu);
}
//clover_force *= csw;
force += clover_force;
}
// Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
{
conformable(lambda._grid, U[0]._grid);
GaugeLinkField out(lambda._grid), tmp(lambda._grid);
// insertion in upper staple
// please check redundancy of shift operations
// C1+
tmp = lambda * U[nu];
out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
// C2+
tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
// C3+
tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
// C4+
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
// insertion in lower staple
// C1-
out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
// C2-
tmp = adj(lambda) * U[nu];
out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
// C3-
tmp = lambda * U[nu];
out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
// C4-
out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda;
return out;
}
private:
// here fixing the 4 dimensions, make it more general?
RealD csw_r; // Clover coefficient - spatial
RealD csw_t; // Clover coefficient - temporal
RealD diag_mass; // Mass term
CloverFieldType CloverTerm, CloverTermInv; // Clover term
CloverFieldType CloverTermEven, CloverTermOdd; // Clover term EO
CloverFieldType CloverTermInvEven, CloverTermInvOdd; // Clover term Inv EO
CloverFieldType CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO
CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
// eventually these can be compressed into 6x6 blocks instead of the 12x12
// using the DeGrand-Rossi basis for the gamma matrices
CloverFieldType fillCloverYZ(const GaugeLinkField &F)
{
CloverFieldType T(F._grid);
T = zero;
PARALLEL_FOR_LOOP
for (int i = 0; i < CloverTerm._grid->oSites(); i++)
{
T._odata[i]()(0, 1) = timesMinusI(F._odata[i]()());
T._odata[i]()(1, 0) = timesMinusI(F._odata[i]()());
T._odata[i]()(2, 3) = timesMinusI(F._odata[i]()());
T._odata[i]()(3, 2) = timesMinusI(F._odata[i]()());
}
return T;
}
CloverFieldType fillCloverXZ(const GaugeLinkField &F)
{
CloverFieldType T(F._grid);
T = zero;
PARALLEL_FOR_LOOP
for (int i = 0; i < CloverTerm._grid->oSites(); i++)
{
T._odata[i]()(0, 1) = -F._odata[i]()();
T._odata[i]()(1, 0) = F._odata[i]()();
T._odata[i]()(2, 3) = -F._odata[i]()();
T._odata[i]()(3, 2) = F._odata[i]()();
}
return T;
}
CloverFieldType fillCloverXY(const GaugeLinkField &F)
{
CloverFieldType T(F._grid);
T = zero;
PARALLEL_FOR_LOOP
for (int i = 0; i < CloverTerm._grid->oSites(); i++)
{
T._odata[i]()(0, 0) = timesMinusI(F._odata[i]()());
T._odata[i]()(1, 1) = timesI(F._odata[i]()());
T._odata[i]()(2, 2) = timesMinusI(F._odata[i]()());
T._odata[i]()(3, 3) = timesI(F._odata[i]()());
}
return T;
}
CloverFieldType fillCloverXT(const GaugeLinkField &F)
{
CloverFieldType T(F._grid);
T = zero;
PARALLEL_FOR_LOOP
for (int i = 0; i < CloverTerm._grid->oSites(); i++)
{
T._odata[i]()(0, 1) = timesI(F._odata[i]()());
T._odata[i]()(1, 0) = timesI(F._odata[i]()());
T._odata[i]()(2, 3) = timesMinusI(F._odata[i]()());
T._odata[i]()(3, 2) = timesMinusI(F._odata[i]()());
}
return T;
}
CloverFieldType fillCloverYT(const GaugeLinkField &F)
{
CloverFieldType T(F._grid);
T = zero;
PARALLEL_FOR_LOOP
for (int i = 0; i < CloverTerm._grid->oSites(); i++)
{
T._odata[i]()(0, 1) = -(F._odata[i]()());
T._odata[i]()(1, 0) = (F._odata[i]()());
T._odata[i]()(2, 3) = (F._odata[i]()());
T._odata[i]()(3, 2) = -(F._odata[i]()());
}
return T;
}
CloverFieldType fillCloverZT(const GaugeLinkField &F)
{
CloverFieldType T(F._grid);
T = zero;
PARALLEL_FOR_LOOP
for (int i = 0; i < CloverTerm._grid->oSites(); i++)
{
T._odata[i]()(0, 0) = timesI(F._odata[i]()());
T._odata[i]()(1, 1) = timesMinusI(F._odata[i]()());
T._odata[i]()(2, 2) = timesMinusI(F._odata[i]()());
T._odata[i]()(3, 3) = timesI(F._odata[i]()());
}
return T;
}
};
}
}
#endif // GRID_QCD_WILSON_CLOVER_FERMION_H

View File

@ -1,373 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonCompressor.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_WILSON_COMPRESSOR_H
#define GRID_QCD_WILSON_COMPRESSOR_H
namespace Grid {
namespace QCD {
/////////////////////////////////////////////////////////////////////////////////////////////
// optimised versions supporting half precision too
/////////////////////////////////////////////////////////////////////////////////////////////
template<class _HCspinor,class _Hspinor,class _Spinor, class projector,typename SFINAE = void >
class WilsonCompressorTemplate;
template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
typename std::enable_if<std::is_same<_HCspinor,_Hspinor>::value>::type >
{
public:
int mu,dag;
void Point(int p) { mu=p; };
WilsonCompressorTemplate(int _dag=0){
dag = _dag;
}
typedef _Spinor SiteSpinor;
typedef _Hspinor SiteHalfSpinor;
typedef _HCspinor SiteHalfCommSpinor;
typedef typename SiteHalfCommSpinor::vector_type vComplexLow;
typedef typename SiteHalfSpinor::vector_type vComplexHigh;
constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh);
inline int CommDatumSize(void) {
return sizeof(SiteHalfCommSpinor);
}
/*****************************************************/
/* Compress includes precision change if mpi data is not same */
/*****************************************************/
inline void Compress(SiteHalfSpinor * __restrict__ buf,Integer o,const SiteSpinor &in) {
SiteHalfSpinor tmp;
projector::Proj(tmp,in,mu,dag);
vstream(buf[o],tmp);
}
/*****************************************************/
/* Exchange includes precision change if mpi data is not same */
/*****************************************************/
inline void Exchange(SiteHalfSpinor * __restrict__ mp,
const SiteHalfSpinor * __restrict__ vp0,
const SiteHalfSpinor * __restrict__ vp1,
Integer type,Integer o){
SiteHalfSpinor tmp1;
SiteHalfSpinor tmp2;
exchange(tmp1,tmp2,vp0[o],vp1[o],type);
vstream(mp[2*o ],tmp1);
vstream(mp[2*o+1],tmp2);
}
/*****************************************************/
/* Have a decompression step if mpi data is not same */
/*****************************************************/
inline void Decompress(SiteHalfSpinor * __restrict__ out,
SiteHalfSpinor * __restrict__ in, Integer o) {
assert(0);
}
/*****************************************************/
/* Compress Exchange */
/*****************************************************/
inline void CompressExchange(SiteHalfSpinor * __restrict__ out0,
SiteHalfSpinor * __restrict__ out1,
const SiteSpinor * __restrict__ in,
Integer j,Integer k, Integer m,Integer type){
SiteHalfSpinor temp1, temp2,temp3,temp4;
projector::Proj(temp1,in[k],mu,dag);
projector::Proj(temp2,in[m],mu,dag);
exchange(temp3,temp4,temp1,temp2,type);
vstream(out0[j],temp3);
vstream(out1[j],temp4);
}
/*****************************************************/
/* Pass the info to the stencil */
/*****************************************************/
inline bool DecompressionStep(void) { return false; }
};
template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
typename std::enable_if<!std::is_same<_HCspinor,_Hspinor>::value>::type >
{
public:
int mu,dag;
void Point(int p) { mu=p; };
WilsonCompressorTemplate(int _dag=0){
dag = _dag;
}
typedef _Spinor SiteSpinor;
typedef _Hspinor SiteHalfSpinor;
typedef _HCspinor SiteHalfCommSpinor;
typedef typename SiteHalfCommSpinor::vector_type vComplexLow;
typedef typename SiteHalfSpinor::vector_type vComplexHigh;
constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh);
inline int CommDatumSize(void) {
return sizeof(SiteHalfCommSpinor);
}
/*****************************************************/
/* Compress includes precision change if mpi data is not same */
/*****************************************************/
inline void Compress(SiteHalfSpinor *buf,Integer o,const SiteSpinor &in) {
SiteHalfSpinor hsp;
SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf;
projector::Proj(hsp,in,mu,dag);
precisionChange((vComplexLow *)&hbuf[o],(vComplexHigh *)&hsp,Nw);
}
/*****************************************************/
/* Exchange includes precision change if mpi data is not same */
/*****************************************************/
inline void Exchange(SiteHalfSpinor *mp,
SiteHalfSpinor *vp0,
SiteHalfSpinor *vp1,
Integer type,Integer o){
SiteHalfSpinor vt0,vt1;
SiteHalfCommSpinor *vpp0 = (SiteHalfCommSpinor *)vp0;
SiteHalfCommSpinor *vpp1 = (SiteHalfCommSpinor *)vp1;
precisionChange((vComplexHigh *)&vt0,(vComplexLow *)&vpp0[o],Nw);
precisionChange((vComplexHigh *)&vt1,(vComplexLow *)&vpp1[o],Nw);
exchange(mp[2*o],mp[2*o+1],vt0,vt1,type);
}
/*****************************************************/
/* Have a decompression step if mpi data is not same */
/*****************************************************/
inline void Decompress(SiteHalfSpinor *out,
SiteHalfSpinor *in, Integer o){
SiteHalfCommSpinor *hin=(SiteHalfCommSpinor *)in;
precisionChange((vComplexHigh *)&out[o],(vComplexLow *)&hin[o],Nw);
}
/*****************************************************/
/* Compress Exchange */
/*****************************************************/
inline void CompressExchange(SiteHalfSpinor *out0,
SiteHalfSpinor *out1,
const SiteSpinor *in,
Integer j,Integer k, Integer m,Integer type){
SiteHalfSpinor temp1, temp2,temp3,temp4;
SiteHalfCommSpinor *hout0 = (SiteHalfCommSpinor *)out0;
SiteHalfCommSpinor *hout1 = (SiteHalfCommSpinor *)out1;
projector::Proj(temp1,in[k],mu,dag);
projector::Proj(temp2,in[m],mu,dag);
exchange(temp3,temp4,temp1,temp2,type);
precisionChange((vComplexLow *)&hout0[j],(vComplexHigh *)&temp3,Nw);
precisionChange((vComplexLow *)&hout1[j],(vComplexHigh *)&temp4,Nw);
}
/*****************************************************/
/* Pass the info to the stencil */
/*****************************************************/
inline bool DecompressionStep(void) { return true; }
};
#define DECLARE_PROJ(Projector,Compressor,spProj) \
class Projector { \
public: \
template<class hsp,class fsp> \
static void Proj(hsp &result,const fsp &in,int mu,int dag){ \
spProj(result,in); \
} \
}; \
template<typename HCS,typename HS,typename S> using Compressor = WilsonCompressorTemplate<HCS,HS,S,Projector>;
DECLARE_PROJ(WilsonXpProjector,WilsonXpCompressor,spProjXp);
DECLARE_PROJ(WilsonYpProjector,WilsonYpCompressor,spProjYp);
DECLARE_PROJ(WilsonZpProjector,WilsonZpCompressor,spProjZp);
DECLARE_PROJ(WilsonTpProjector,WilsonTpCompressor,spProjTp);
DECLARE_PROJ(WilsonXmProjector,WilsonXmCompressor,spProjXm);
DECLARE_PROJ(WilsonYmProjector,WilsonYmCompressor,spProjYm);
DECLARE_PROJ(WilsonZmProjector,WilsonZmCompressor,spProjZm);
DECLARE_PROJ(WilsonTmProjector,WilsonTmCompressor,spProjTm);
class WilsonProjector {
public:
template<class hsp,class fsp>
static void Proj(hsp &result,const fsp &in,int mu,int dag){
int mudag=dag? mu : (mu+Nd)%(2*Nd);
switch(mudag) {
case Xp: spProjXp(result,in); break;
case Yp: spProjYp(result,in); break;
case Zp: spProjZp(result,in); break;
case Tp: spProjTp(result,in); break;
case Xm: spProjXm(result,in); break;
case Ym: spProjYm(result,in); break;
case Zm: spProjZm(result,in); break;
case Tm: spProjTm(result,in); break;
default: assert(0); break;
}
}
};
template<typename HCS,typename HS,typename S> using WilsonCompressor = WilsonCompressorTemplate<HCS,HS,S,WilsonProjector>;
// Fast comms buffer manipulation which should inline right through (avoid direction
// dependent logic that prevents inlining
template<class vobj,class cobj>
class WilsonStencil : public CartesianStencil<vobj,cobj> {
public:
double timer0;
double timer1;
double timer2;
double timer3;
double timer4;
double timer5;
double timer6;
uint64_t callsi;
void ZeroCountersi(void)
{
timer0=0;
timer1=0;
timer2=0;
timer3=0;
timer4=0;
timer5=0;
timer6=0;
callsi=0;
}
void Reporti(int calls)
{
if ( timer0 ) std::cout << GridLogMessage << " timer0 (HaloGatherOpt) " <<timer0/calls <<std::endl;
if ( timer1 ) std::cout << GridLogMessage << " timer1 (Communicate) " <<timer1/calls <<std::endl;
if ( timer2 ) std::cout << GridLogMessage << " timer2 (CommsMerge ) " <<timer2/calls <<std::endl;
if ( timer3 ) std::cout << GridLogMessage << " timer3 (commsMergeShm) " <<timer3/calls <<std::endl;
if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
}
WilsonStencil(GridBase *grid,
int npoints,
int checkerboard,
const std::vector<int> &directions,
const std::vector<int> &distances)
: CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances)
{
ZeroCountersi();
};
template < class compressor>
void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
{
std::vector<std::vector<CommsRequest_t> > reqs;
this->HaloExchangeOptGather(source,compress);
double t1=usecond();
// Asynchronous MPI calls multidirectional, Isend etc...
// this->CommunicateBegin(reqs);
// this->CommunicateComplete(reqs);
// Non-overlapped directions within a thread. Asynchronous calls except MPI3, threaded up to comm threads ways.
this->Communicate();
double t2=usecond(); timer1 += t2-t1;
this->CommsMerge(compress);
double t3=usecond(); timer2 += t3-t2;
this->CommsMergeSHM(compress);
double t4=usecond(); timer3 += t4-t3;
}
template <class compressor>
void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress)
{
this->Prepare();
double t0=usecond();
this->HaloGatherOpt(source,compress);
double t1=usecond();
timer0 += t1-t0;
callsi++;
}
template <class compressor>
void HaloGatherOpt(const Lattice<vobj> &source,compressor &compress)
{
// Strategy. Inherit types from Compressor.
// Use types to select the write direction by directon compressor
typedef typename compressor::SiteSpinor SiteSpinor;
typedef typename compressor::SiteHalfSpinor SiteHalfSpinor;
typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor;
this->mpi3synctime_g-=usecond();
this->_grid->StencilBarrier();
this->mpi3synctime_g+=usecond();
assert(source._grid==this->_grid);
this->halogtime-=usecond();
this->u_comm_offset=0;
WilsonXpCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> XpCompress;
WilsonYpCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> YpCompress;
WilsonZpCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> ZpCompress;
WilsonTpCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> TpCompress;
WilsonXmCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> XmCompress;
WilsonYmCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> YmCompress;
WilsonZmCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> ZmCompress;
WilsonTmCompressor<SiteHalfCommSpinor,SiteHalfSpinor,SiteSpinor> TmCompress;
int dag = compress.dag;
int face_idx=0;
if ( dag ) {
assert(this->same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
assert(this->same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
assert(this->same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
assert(this->same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx));
assert(this->same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx));
assert(this->same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx));
assert(this->same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
assert(this->same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx));
} else {
assert(this->same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx));
assert(this->same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx));
assert(this->same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
assert(this->same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx));
assert(this->same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx));
assert(this->same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx));
assert(this->same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
assert(this->same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx));
}
this->face_table_computed=1;
assert(this->u_comm_offset==this->_unified_buffer_size);
this->halogtime+=usecond();
}
};
}} // namespace close
#endif

View File

@ -1,562 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonFermion.cc
Copyright (C) 2015
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/WilsonFermion.h>
namespace Grid {
namespace QCD {
const std::vector<int> WilsonFermionStatic::directions({0, 1, 2, 3, 0, 1, 2, 3});
const std::vector<int> WilsonFermionStatic::displacements({1, 1, 1, 1, -1, -1, -1, -1});
int WilsonFermionStatic::HandOptDslash;
/////////////////////////////////
// Constructor and gauge import
/////////////////////////////////
template <class Impl>
WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass,
const ImplParams &p,
const WilsonAnisotropyCoefficients &anis)
: Kernels(p),
_grid(&Fgrid),
_cbgrid(&Hgrid),
Stencil(&Fgrid, npoint, Even, directions, displacements),
StencilEven(&Hgrid, npoint, Even, directions,displacements), // source is Even
StencilOdd(&Hgrid, npoint, Odd, directions,displacements), // source is Odd
mass(_mass),
Lebesgue(_grid),
LebesgueEvenOdd(_cbgrid),
Umu(&Fgrid),
UmuEven(&Hgrid),
UmuOdd(&Hgrid),
_tmp(&Hgrid),
anisotropyCoeff(anis)
{
// Allocate the required comms buffer
ImportGauge(_Umu);
if (anisotropyCoeff.isAnisotropic){
diag_mass = mass + 1.0 + (Nd-1)*(anisotropyCoeff.nu / anisotropyCoeff.xi_0);
} else {
diag_mass = 4.0 + mass;
}
}
template <class Impl>
void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu) {
GaugeField HUmu(_Umu._grid);
//Here multiply the anisotropy coefficients
if (anisotropyCoeff.isAnisotropic)
{
for (int mu = 0; mu < Nd; mu++)
{
GaugeLinkField U_dir = (-0.5)*PeekIndex<LorentzIndex>(_Umu, mu);
if (mu != anisotropyCoeff.t_direction)
U_dir *= (anisotropyCoeff.nu / anisotropyCoeff.xi_0);
PokeIndex<LorentzIndex>(HUmu, U_dir, mu);
}
}
else
{
HUmu = _Umu * (-0.5);
}
Impl::DoubleStore(GaugeGrid(), Umu, HUmu);
pickCheckerboard(Even, UmuEven, Umu);
pickCheckerboard(Odd, UmuOdd, Umu);
}
/////////////////////////////
// Implement the interface
/////////////////////////////
template <class Impl>
RealD WilsonFermion<Impl>::M(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
Dhop(in, out, DaggerNo);
return axpy_norm(out, diag_mass, in, out);
}
template <class Impl>
RealD WilsonFermion<Impl>::Mdag(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
Dhop(in, out, DaggerYes);
return axpy_norm(out, diag_mass, in, out);
}
template <class Impl>
void WilsonFermion<Impl>::Meooe(const FermionField &in, FermionField &out) {
if (in.checkerboard == Odd) {
DhopEO(in, out, DaggerNo);
} else {
DhopOE(in, out, DaggerNo);
}
}
template <class Impl>
void WilsonFermion<Impl>::MeooeDag(const FermionField &in, FermionField &out) {
if (in.checkerboard == Odd) {
DhopEO(in, out, DaggerYes);
} else {
DhopOE(in, out, DaggerYes);
}
}
template <class Impl>
void WilsonFermion<Impl>::Mooee(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
typename FermionField::scalar_type scal(diag_mass);
out = scal * in;
}
template <class Impl>
void WilsonFermion<Impl>::MooeeDag(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
Mooee(in, out);
}
template<class Impl>
void WilsonFermion<Impl>::MooeeInv(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
out = (1.0/(diag_mass))*in;
}
template<class Impl>
void WilsonFermion<Impl>::MooeeInvDag(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
MooeeInv(in,out);
}
template<class Impl>
void WilsonFermion<Impl>::MomentumSpacePropagator(FermionField &out, const FermionField &in,RealD _m,std::vector<double> twist)
{
typedef typename FermionField::vector_type vector_type;
typedef typename FermionField::scalar_type ScalComplex;
typedef Lattice<iSinglet<vector_type> > LatComplex;
// what type LatticeComplex
conformable(_grid,out._grid);
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
std::vector<int> latt_size = _grid->_fdimensions;
FermionField num (_grid); num = zero;
LatComplex wilson(_grid); wilson= zero;
LatComplex one (_grid); one = ScalComplex(1.0,0.0);
LatComplex denom(_grid); denom= zero;
LatComplex kmu(_grid);
ScalComplex ci(0.0,1.0);
// momphase = n * 2pi / L
for(int mu=0;mu<Nd;mu++) {
LatticeCoordinate(kmu,mu);
RealD TwoPiL = M_PI * 2.0/ latt_size[mu];
kmu = TwoPiL * kmu;
kmu = kmu + TwoPiL * one * twist[mu];//momentum for twisted boundary conditions
wilson = wilson + 2.0*sin(kmu*0.5)*sin(kmu*0.5); // Wilson term
num = num - sin(kmu)*ci*(Gamma(Gmu[mu])*in); // derivative term
denom=denom + sin(kmu)*sin(kmu);
}
wilson = wilson + _m; // 2 sin^2 k/2 + m
num = num + wilson*in; // -i gmu sin k + 2 sin^2 k/2 + m
denom= denom+wilson*wilson; // sin^2 k + (2 sin^2 k/2 + m)^2
denom= one/denom;
out = num*denom; // [ -i gmu sin k + 2 sin^2 k/2 + m] / [ sin^2 k + (2 sin^2 k/2 + m)^2 ]
}
///////////////////////////////////
// Internal
///////////////////////////////////
template <class Impl>
void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
GaugeField &mat, const FermionField &A,
const FermionField &B, int dag) {
assert((dag == DaggerNo) || (dag == DaggerYes));
Compressor compressor(dag);
FermionField Btilde(B._grid);
FermionField Atilde(B._grid);
Atilde = A;//redundant
st.HaloExchange(B, compressor);
for (int mu = 0; mu < Nd; mu++) {
////////////////////////////////////////////////////////////////////////
// Flip gamma (1+g)<->(1-g) if dag
////////////////////////////////////////////////////////////////////////
int gamma = mu;
if (!dag) gamma += Nd;
////////////////////////
// Call the single hop
////////////////////////
parallel_for (int sss = 0; sss < B._grid->oSites(); sss++) {
Kernels::DhopDir(st, U, st.CommBuf(), sss, sss, B, Btilde, mu, gamma);
}
//////////////////////////////////////////////////
// spin trace outer product
//////////////////////////////////////////////////
Impl::InsertForce4D(mat, Btilde, Atilde, mu);
}
}
template <class Impl>
void WilsonFermion<Impl>::DhopDeriv(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
conformable(U._grid, _grid);
conformable(U._grid, V._grid);
conformable(U._grid, mat._grid);
mat.checkerboard = U.checkerboard;
DerivInternal(Stencil, Umu, mat, U, V, dag);
}
template <class Impl>
void WilsonFermion<Impl>::DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
conformable(U._grid, _cbgrid);
conformable(U._grid, V._grid);
//conformable(U._grid, mat._grid); not general, leaving as a comment (Guido)
// Motivation: look at the SchurDiff operator
assert(V.checkerboard == Even);
assert(U.checkerboard == Odd);
mat.checkerboard = Odd;
DerivInternal(StencilEven, UmuOdd, mat, U, V, dag);
}
template <class Impl>
void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag) {
conformable(U._grid, _cbgrid);
conformable(U._grid, V._grid);
//conformable(U._grid, mat._grid);
assert(V.checkerboard == Odd);
assert(U.checkerboard == Even);
mat.checkerboard = Even;
DerivInternal(StencilOdd, UmuEven, mat, U, V, dag);
}
template <class Impl>
void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) {
conformable(in._grid, _grid); // verifies full grid
conformable(in._grid, out._grid);
out.checkerboard = in.checkerboard;
DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
}
template <class Impl>
void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) {
conformable(in._grid, _cbgrid); // verifies half grid
conformable(in._grid, out._grid); // drops the cb check
assert(in.checkerboard == Even);
out.checkerboard = Odd;
DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
}
template <class Impl>
void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) {
conformable(in._grid, _cbgrid); // verifies half grid
conformable(in._grid, out._grid); // drops the cb check
assert(in.checkerboard == Odd);
out.checkerboard = Even;
DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
}
template <class Impl>
void WilsonFermion<Impl>::Mdir(const FermionField &in, FermionField &out, int dir, int disp) {
DhopDir(in, out, dir, disp);
}
template <class Impl>
void WilsonFermion<Impl>::DhopDir(const FermionField &in, FermionField &out, int dir, int disp) {
int skip = (disp == 1) ? 0 : 1;
int dirdisp = dir + skip * 4;
int gamma = dir + (1 - skip) * 4;
DhopDirDisp(in, out, dirdisp, gamma, DaggerNo);
};
template <class Impl>
void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out,int dirdisp, int gamma, int dag) {
Compressor compressor(dag);
Stencil.HaloExchange(in, compressor);
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, dirdisp, gamma);
}
}
/*Change starts*/
template <class Impl>
void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag) {
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
else
#endif
DhopInternalSerial(st,lo,U,in,out,dag);
}
template <class Impl>
void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag) {
assert((dag == DaggerNo) || (dag == DaggerYes));
#ifdef GRID_OMP
Compressor compressor;
int len = U._grid->oSites();
const int LLs = 1;
st.Prepare();
st.HaloGather(in,compressor);
st.CommsMergeSHM(compressor);
#pragma omp parallel
{
int tid = omp_get_thread_num();
int nthreads = omp_get_num_threads();
int ncomms = CartesianCommunicator::nCommThreads;
if (ncomms == -1) ncomms = 1;
assert(nthreads > ncomms);
if (tid >= ncomms) {
nthreads -= ncomms;
int ttid = tid - ncomms;
int n = len;
int chunk = n / nthreads;
int rem = n % nthreads;
int myblock, myn;
if (ttid < rem) {
myblock = ttid * chunk + ttid;
myn = chunk+1;
} else {
myblock = ttid*chunk + rem;
myn = chunk;
}
// do the compute
if (dag == DaggerYes) {
for (int sss = myblock; sss < myblock+myn; ++sss) {
Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
}
} else {
for (int sss = myblock; sss < myblock+myn; ++sss) {
Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
}
} //else
} else {
st.CommunicateThreaded();
}
Compressor compressor(dag);
if (dag == DaggerYes) {
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
}
} else {
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
}
}
} //pragma
#else
assert(0);
#endif
};
template <class Impl>
void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag) {
assert((dag == DaggerNo) || (dag == DaggerYes));
Compressor compressor(dag);
st.HaloExchange(in, compressor);
if (dag == DaggerYes) {
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
}
} else {
parallel_for (int sss = 0; sss < in._grid->oSites(); sss++) {
Kernels::DhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, out);
}
}
};
/*Change ends */
/*******************************************************************************
* Conserved current utilities for Wilson fermions, for contracting propagators
* to make a conserved current sink or inserting the conserved current
* sequentially.
******************************************************************************/
template <class Impl>
void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
Current curr_type,
unsigned int mu)
{
Gamma g5(Gamma::Algebra::Gamma5);
conformable(_grid, q_in_1._grid);
conformable(_grid, q_in_2._grid);
conformable(_grid, q_out._grid);
PropagatorField tmp1(_grid), tmp2(_grid);
q_out = zero;
// Forward, need q1(x + mu), q2(x). Backward, need q1(x), q2(x + mu).
// Inefficient comms method but not performance critical.
tmp1 = Cshift(q_in_1, mu, 1);
tmp2 = Cshift(q_in_2, mu, 1);
parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
{
Kernels::ContractConservedCurrentSiteFwd(tmp1._odata[sU],
q_in_2._odata[sU],
q_out._odata[sU],
Umu, sU, mu);
Kernels::ContractConservedCurrentSiteBwd(q_in_1._odata[sU],
tmp2._odata[sU],
q_out._odata[sU],
Umu, sU, mu);
}
}
template <class Impl>
void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
Current curr_type,
unsigned int mu,
unsigned int tmin,
unsigned int tmax,
ComplexField &lattice_cmplx)
{
conformable(_grid, q_in._grid);
conformable(_grid, q_out._grid);
PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid);
unsigned int tshift = (mu == Tp) ? 1 : 0;
unsigned int LLt = GridDefaultLatt()[Tp];
q_out = zero;
LatticeInteger coords(_grid);
LatticeCoordinate(coords, Tp);
// Need q(x + mu) and q(x - mu).
tmp = Cshift(q_in, mu, 1);
tmpFwd = tmp*lattice_cmplx;
tmp = lattice_cmplx*q_in;
tmpBwd = Cshift(tmp, mu, -1);
parallel_for (unsigned int sU = 0; sU < Umu._grid->oSites(); ++sU)
{
// Compute the sequential conserved current insertion only if our simd
// object contains a timeslice we need.
vInteger t_mask = ((coords._odata[sU] >= tmin) &&
(coords._odata[sU] <= tmax));
Integer timeSlices = Reduce(t_mask);
if (timeSlices > 0)
{
Kernels::SeqConservedCurrentSiteFwd(tmpFwd._odata[sU],
q_out._odata[sU],
Umu, sU, mu, t_mask);
}
// Repeat for backward direction.
t_mask = ((coords._odata[sU] >= (tmin + tshift)) &&
(coords._odata[sU] <= (tmax + tshift)));
//if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)
unsigned int t0 = 0;
if((tmax==LLt-1) && (tshift==1)) t_mask = (t_mask || (coords._odata[sU] == t0 ));
timeSlices = Reduce(t_mask);
if (timeSlices > 0)
{
Kernels::SeqConservedCurrentSiteBwd(tmpBwd._odata[sU],
q_out._odata[sU],
Umu, sU, mu, t_mask);
}
}
}
FermOpTemplateInstantiate(WilsonFermion);
AdjointFermOpTemplateInstantiate(WilsonFermion);
TwoIndexFermOpTemplateInstantiate(WilsonFermion);
GparityFermOpTemplateInstantiate(WilsonFermion);
}
}

File diff suppressed because it is too large Load Diff

View File

@ -1,455 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
namespace Grid {
namespace QCD {
int WilsonKernelsStatic::Opt = WilsonKernelsStatic::OptGeneric;
int WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsAndCompute;
template <class Impl>
WilsonKernels<Impl>::WilsonKernels(const ImplParams &p) : Base(p){};
////////////////////////////////////////////
// Generic implementation; move to different file?
////////////////////////////////////////////
#define GENERIC_STENCIL_LEG(Dir,spProj,Recon) \
SE = st.GetEntry(ptype, Dir, sF); \
if (SE->_is_local) { \
chi_p = &chi; \
if (SE->_permute) { \
spProj(tmp, in._odata[SE->_offset]); \
permute(chi, tmp, ptype); \
} else { \
spProj(chi, in._odata[SE->_offset]); \
} \
} else { \
chi_p = &buf[SE->_offset]; \
} \
Impl::multLink(Uchi, U._odata[sU], *chi_p, Dir, SE, st); \
Recon(result, Uchi);
#define GENERIC_STENCIL_LEG_INT(Dir,spProj,Recon) \
SE = st.GetEntry(ptype, Dir, sF); \
if (SE->_is_local) { \
chi_p = &chi; \
if (SE->_permute) { \
spProj(tmp, in._odata[SE->_offset]); \
permute(chi, tmp, ptype); \
} else { \
spProj(chi, in._odata[SE->_offset]); \
} \
} else if ( st.same_node[Dir] ) { \
chi_p = &buf[SE->_offset]; \
} \
if (SE->_is_local || st.same_node[Dir] ) { \
Impl::multLink(Uchi, U._odata[sU], *chi_p, Dir, SE, st); \
Recon(result, Uchi); \
}
#define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon) \
SE = st.GetEntry(ptype, Dir, sF); \
if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
chi_p = &buf[SE->_offset]; \
Impl::multLink(Uchi, U._odata[sU], *chi_p, Dir, SE, st); \
Recon(result, Uchi); \
nmu++; \
}
#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon) \
if (gamma == Dir) { \
if (SE->_is_local && SE->_permute) { \
spProj(tmp, in._odata[SE->_offset]); \
permute(chi, tmp, ptype); \
} else if (SE->_is_local) { \
spProj(chi, in._odata[SE->_offset]); \
} else { \
chi = buf[SE->_offset]; \
} \
Impl::multLink(Uchi, U._odata[sU], chi, dir, SE, st); \
Recon(result, Uchi); \
}
////////////////////////////////////////////////////////////////////
// All legs kernels ; comms then compute
////////////////////////////////////////////////////////////////////
template <class Impl>
void WilsonKernels<Impl>::GenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
SiteHalfSpinor *buf, int sF,
int sU, const FermionField &in, FermionField &out)
{
SiteHalfSpinor tmp;
SiteHalfSpinor chi;
SiteHalfSpinor *chi_p;
SiteHalfSpinor Uchi;
SiteSpinor result;
StencilEntry *SE;
int ptype;
GENERIC_STENCIL_LEG(Xp,spProjXp,spReconXp);
GENERIC_STENCIL_LEG(Yp,spProjYp,accumReconYp);
GENERIC_STENCIL_LEG(Zp,spProjZp,accumReconZp);
GENERIC_STENCIL_LEG(Tp,spProjTp,accumReconTp);
GENERIC_STENCIL_LEG(Xm,spProjXm,accumReconXm);
GENERIC_STENCIL_LEG(Ym,spProjYm,accumReconYm);
GENERIC_STENCIL_LEG(Zm,spProjZm,accumReconZm);
GENERIC_STENCIL_LEG(Tm,spProjTm,accumReconTm);
vstream(out._odata[sF], result);
};
template <class Impl>
void WilsonKernels<Impl>::GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
SiteHalfSpinor *buf, int sF,
int sU, const FermionField &in, FermionField &out)
{
SiteHalfSpinor tmp;
SiteHalfSpinor chi;
SiteHalfSpinor *chi_p;
SiteHalfSpinor Uchi;
SiteSpinor result;
StencilEntry *SE;
int ptype;
GENERIC_STENCIL_LEG(Xm,spProjXp,spReconXp);
GENERIC_STENCIL_LEG(Ym,spProjYp,accumReconYp);
GENERIC_STENCIL_LEG(Zm,spProjZp,accumReconZp);
GENERIC_STENCIL_LEG(Tm,spProjTp,accumReconTp);
GENERIC_STENCIL_LEG(Xp,spProjXm,accumReconXm);
GENERIC_STENCIL_LEG(Yp,spProjYm,accumReconYm);
GENERIC_STENCIL_LEG(Zp,spProjZm,accumReconZm);
GENERIC_STENCIL_LEG(Tp,spProjTm,accumReconTm);
vstream(out._odata[sF], result);
};
////////////////////////////////////////////////////////////////////
// Interior kernels
////////////////////////////////////////////////////////////////////
template <class Impl>
void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
SiteHalfSpinor *buf, int sF,
int sU, const FermionField &in, FermionField &out)
{
SiteHalfSpinor tmp;
SiteHalfSpinor chi;
SiteHalfSpinor *chi_p;
SiteHalfSpinor Uchi;
SiteSpinor result;
StencilEntry *SE;
int ptype;
result=zero;
GENERIC_STENCIL_LEG_INT(Xp,spProjXp,accumReconXp);
GENERIC_STENCIL_LEG_INT(Yp,spProjYp,accumReconYp);
GENERIC_STENCIL_LEG_INT(Zp,spProjZp,accumReconZp);
GENERIC_STENCIL_LEG_INT(Tp,spProjTp,accumReconTp);
GENERIC_STENCIL_LEG_INT(Xm,spProjXm,accumReconXm);
GENERIC_STENCIL_LEG_INT(Ym,spProjYm,accumReconYm);
GENERIC_STENCIL_LEG_INT(Zm,spProjZm,accumReconZm);
GENERIC_STENCIL_LEG_INT(Tm,spProjTm,accumReconTm);
vstream(out._odata[sF], result);
};
template <class Impl>
void WilsonKernels<Impl>::GenericDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
SiteHalfSpinor *buf, int sF,
int sU, const FermionField &in, FermionField &out)
{
SiteHalfSpinor tmp;
SiteHalfSpinor chi;
SiteHalfSpinor *chi_p;
SiteHalfSpinor Uchi;
SiteSpinor result;
StencilEntry *SE;
int ptype;
result=zero;
GENERIC_STENCIL_LEG_INT(Xm,spProjXp,accumReconXp);
GENERIC_STENCIL_LEG_INT(Ym,spProjYp,accumReconYp);
GENERIC_STENCIL_LEG_INT(Zm,spProjZp,accumReconZp);
GENERIC_STENCIL_LEG_INT(Tm,spProjTp,accumReconTp);
GENERIC_STENCIL_LEG_INT(Xp,spProjXm,accumReconXm);
GENERIC_STENCIL_LEG_INT(Yp,spProjYm,accumReconYm);
GENERIC_STENCIL_LEG_INT(Zp,spProjZm,accumReconZm);
GENERIC_STENCIL_LEG_INT(Tp,spProjTm,accumReconTm);
vstream(out._odata[sF], result);
};
////////////////////////////////////////////////////////////////////
// Exterior kernels
////////////////////////////////////////////////////////////////////
template <class Impl>
void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
SiteHalfSpinor *buf, int sF,
int sU, const FermionField &in, FermionField &out)
{
SiteHalfSpinor tmp;
SiteHalfSpinor chi;
SiteHalfSpinor *chi_p;
SiteHalfSpinor Uchi;
SiteSpinor result;
StencilEntry *SE;
int ptype;
int nmu=0;
result=zero;
GENERIC_STENCIL_LEG_EXT(Xp,spProjXp,accumReconXp);
GENERIC_STENCIL_LEG_EXT(Yp,spProjYp,accumReconYp);
GENERIC_STENCIL_LEG_EXT(Zp,spProjZp,accumReconZp);
GENERIC_STENCIL_LEG_EXT(Tp,spProjTp,accumReconTp);
GENERIC_STENCIL_LEG_EXT(Xm,spProjXm,accumReconXm);
GENERIC_STENCIL_LEG_EXT(Ym,spProjYm,accumReconYm);
GENERIC_STENCIL_LEG_EXT(Zm,spProjZm,accumReconZm);
GENERIC_STENCIL_LEG_EXT(Tm,spProjTm,accumReconTm);
if ( nmu ) {
out._odata[sF] = out._odata[sF] + result;
}
};
template <class Impl>
void WilsonKernels<Impl>::GenericDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
SiteHalfSpinor *buf, int sF,
int sU, const FermionField &in, FermionField &out)
{
SiteHalfSpinor tmp;
SiteHalfSpinor chi;
SiteHalfSpinor *chi_p;
SiteHalfSpinor Uchi;
SiteSpinor result;
StencilEntry *SE;
int ptype;
int nmu=0;
result=zero;
GENERIC_STENCIL_LEG_EXT(Xm,spProjXp,accumReconXp);
GENERIC_STENCIL_LEG_EXT(Ym,spProjYp,accumReconYp);
GENERIC_STENCIL_LEG_EXT(Zm,spProjZp,accumReconZp);
GENERIC_STENCIL_LEG_EXT(Tm,spProjTp,accumReconTp);
GENERIC_STENCIL_LEG_EXT(Xp,spProjXm,accumReconXm);
GENERIC_STENCIL_LEG_EXT(Yp,spProjYm,accumReconYm);
GENERIC_STENCIL_LEG_EXT(Zp,spProjZm,accumReconZm);
GENERIC_STENCIL_LEG_EXT(Tp,spProjTm,accumReconTm);
if ( nmu ) {
out._odata[sF] = out._odata[sF] + result;
}
};
template <class Impl>
void WilsonKernels<Impl>::DhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int sF,
int sU, const FermionField &in, FermionField &out, int dir, int gamma) {
SiteHalfSpinor tmp;
SiteHalfSpinor chi;
SiteSpinor result;
SiteHalfSpinor Uchi;
StencilEntry *SE;
int ptype;
SE = st.GetEntry(ptype, dir, sF);
GENERIC_DHOPDIR_LEG(Xp,spProjXp,spReconXp);
GENERIC_DHOPDIR_LEG(Yp,spProjYp,spReconYp);
GENERIC_DHOPDIR_LEG(Zp,spProjZp,spReconZp);
GENERIC_DHOPDIR_LEG(Tp,spProjTp,spReconTp);
GENERIC_DHOPDIR_LEG(Xm,spProjXm,spReconXm);
GENERIC_DHOPDIR_LEG(Ym,spProjYm,spReconYm);
GENERIC_DHOPDIR_LEG(Zm,spProjZm,spReconZm);
GENERIC_DHOPDIR_LEG(Tm,spProjTm,spReconTm);
vstream(out._odata[sF], result);
}
/*******************************************************************************
* Conserved current utilities for Wilson fermions, for contracting propagators
* to make a conserved current sink or inserting the conserved current
* sequentially. Common to both 4D and 5D.
******************************************************************************/
// N.B. Functions below assume a -1/2 factor within U.
#define WilsonCurrentFwd(expr, mu) ((expr - Gamma::gmu[mu]*expr))
#define WilsonCurrentBwd(expr, mu) ((expr + Gamma::gmu[mu]*expr))
/*******************************************************************************
* Name: ContractConservedCurrentSiteFwd
* Operation: (1/2) * q2[x] * U(x) * (g[mu] - 1) * q1[x + mu]
* Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
* - Pass in q_in_1 shifted in +ve mu direction.
******************************************************************************/
template<class Impl>
void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd(
const SitePropagator &q_in_1,
const SitePropagator &q_in_2,
SitePropagator &q_out,
DoubledGaugeField &U,
unsigned int sU,
unsigned int mu,
bool switch_sign)
{
SitePropagator result, tmp;
Gamma g5(Gamma::Algebra::Gamma5);
Impl::multLinkProp(tmp, U._odata[sU], q_in_1, mu);
result = g5 * adj(q_in_2) * g5 * WilsonCurrentFwd(tmp, mu);
if (switch_sign)
{
q_out -= result;
}
else
{
q_out += result;
}
}
/*******************************************************************************
* Name: ContractConservedCurrentSiteBwd
* Operation: (1/2) * q2[x + mu] * U^dag(x) * (g[mu] + 1) * q1[x]
* Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
* - Pass in q_in_2 shifted in +ve mu direction.
******************************************************************************/
template<class Impl>
void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd(
const SitePropagator &q_in_1,
const SitePropagator &q_in_2,
SitePropagator &q_out,
DoubledGaugeField &U,
unsigned int sU,
unsigned int mu,
bool switch_sign)
{
SitePropagator result, tmp;
Gamma g5(Gamma::Algebra::Gamma5);
Impl::multLinkProp(tmp, U._odata[sU], q_in_1, mu + Nd);
result = g5 * adj(q_in_2) * g5 * WilsonCurrentBwd(tmp, mu);
if (switch_sign)
{
q_out += result;
}
else
{
q_out -= result;
}
}
// G-parity requires more specialised implementation.
#define NO_CURR_SITE(Impl) \
template <> \
void WilsonKernels<Impl>::ContractConservedCurrentSiteFwd( \
const SitePropagator &q_in_1, \
const SitePropagator &q_in_2, \
SitePropagator &q_out, \
DoubledGaugeField &U, \
unsigned int sU, \
unsigned int mu, \
bool switch_sign) \
{ \
assert(0); \
} \
template <> \
void WilsonKernels<Impl>::ContractConservedCurrentSiteBwd( \
const SitePropagator &q_in_1, \
const SitePropagator &q_in_2, \
SitePropagator &q_out, \
DoubledGaugeField &U, \
unsigned int mu, \
unsigned int sU, \
bool switch_sign) \
{ \
assert(0); \
}
NO_CURR_SITE(GparityWilsonImplF);
NO_CURR_SITE(GparityWilsonImplD);
NO_CURR_SITE(GparityWilsonImplFH);
NO_CURR_SITE(GparityWilsonImplDF);
/*******************************************************************************
* Name: SeqConservedCurrentSiteFwd
* Operation: (1/2) * U(x) * (g[mu] - 1) * q[x + mu]
* Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
* - Pass in q_in shifted in +ve mu direction.
******************************************************************************/
template<class Impl>
void WilsonKernels<Impl>::SeqConservedCurrentSiteFwd(const SitePropagator &q_in,
SitePropagator &q_out,
DoubledGaugeField &U,
unsigned int sU,
unsigned int mu,
vInteger t_mask,
bool switch_sign)
{
SitePropagator result;
Impl::multLinkProp(result, U._odata[sU], q_in, mu);
result = WilsonCurrentFwd(result, mu);
// Zero any unwanted timeslice entries.
result = predicatedWhere(t_mask, result, 0.*result);
if (switch_sign)
{
q_out -= result;
}
else
{
q_out += result;
}
}
/*******************************************************************************
* Name: SeqConservedCurrentSiteFwd
* Operation: (1/2) * U^dag(x) * (g[mu] + 1) * q[x - mu]
* Notes: - DoubledGaugeField U assumed to contain -1/2 factor.
* - Pass in q_in shifted in -ve mu direction.
******************************************************************************/
template<class Impl>
void WilsonKernels<Impl>::SeqConservedCurrentSiteBwd(const SitePropagator &q_in,
SitePropagator &q_out,
DoubledGaugeField &U,
unsigned int sU,
unsigned int mu,
vInteger t_mask,
bool switch_sign)
{
SitePropagator result;
Impl::multLinkProp(result, U._odata[sU], q_in, mu + Nd);
result = WilsonCurrentBwd(result, mu);
// Zero any unwanted timeslice entries.
result = predicatedWhere(t_mask, result, 0.*result);
if (switch_sign)
{
q_out += result;
}
else
{
q_out -= result;
}
}
FermOpTemplateInstantiate(WilsonKernels);
AdjointFermOpTemplateInstantiate(WilsonKernels);
TwoIndexFermOpTemplateInstantiate(WilsonKernels);
}}

View File

@ -1,281 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernels.h
Copyright (C) 2015
Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_DHOP_H
#define GRID_QCD_DHOP_H
namespace Grid {
namespace QCD {
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Helper routines that implement Wilson stencil for a single site.
// Common to both the WilsonFermion and WilsonFermion5D
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
class WilsonKernelsStatic {
public:
enum { OptGeneric, OptHandUnroll, OptInlineAsm };
enum { CommsAndCompute, CommsThenCompute };
static int Opt;
static int Comms;
};
template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic {
public:
INHERIT_IMPL_TYPES(Impl);
typedef FermionOperator<Impl> Base;
public:
template <bool EnableBool = true>
typename std::enable_if<Impl::isFundamental==true && Nc == 3 &&EnableBool, void>::type
DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1)
{
bgq_l1p_optimisation(1);
switch(Opt) {
#if defined(AVX512) || defined (QPX)
case OptInlineAsm:
if(interior&&exterior) WilsonKernels<Impl>::AsmDhopSite (st,lo,U,buf,sF,sU,Ls,Ns,in,out);
else if (interior) WilsonKernels<Impl>::AsmDhopSiteInt(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
else if (exterior) WilsonKernels<Impl>::AsmDhopSiteExt(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
else assert(0);
break;
#endif
case OptHandUnroll:
for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) {
if(interior&&exterior) WilsonKernels<Impl>::HandDhopSite(st,lo,U,buf,sF,sU,in,out);
else if (interior) WilsonKernels<Impl>::HandDhopSiteInt(st,lo,U,buf,sF,sU,in,out);
else if (exterior) WilsonKernels<Impl>::HandDhopSiteExt(st,lo,U,buf,sF,sU,in,out);
sF++;
}
sU++;
}
break;
case OptGeneric:
for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) {
if(interior&&exterior) WilsonKernels<Impl>::GenericDhopSite(st,lo,U,buf,sF,sU,in,out);
else if (interior) WilsonKernels<Impl>::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out);
else if (exterior) WilsonKernels<Impl>::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out);
else assert(0);
sF++;
}
sU++;
}
break;
default:
assert(0);
}
bgq_l1p_optimisation(0);
}
template <bool EnableBool = true>
typename std::enable_if<(Impl::isFundamental==false || (Impl::isFundamental==true && Nc != 3)) && EnableBool, void>::type
DhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1 ) {
// no kernel choice
for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) {
if(interior&&exterior) WilsonKernels<Impl>::GenericDhopSite(st,lo,U,buf,sF,sU,in,out);
else if (interior) WilsonKernels<Impl>::GenericDhopSiteInt(st,lo,U,buf,sF,sU,in,out);
else if (exterior) WilsonKernels<Impl>::GenericDhopSiteExt(st,lo,U,buf,sF,sU,in,out);
else assert(0);
sF++;
}
sU++;
}
}
template <bool EnableBool = true>
typename std::enable_if<Impl::isFundamental==true && Nc == 3 && EnableBool,void>::type
DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1)
{
bgq_l1p_optimisation(1);
switch(Opt) {
#if defined(AVX512) || defined (QPX)
case OptInlineAsm:
if(interior&&exterior) WilsonKernels<Impl>::AsmDhopSiteDag (st,lo,U,buf,sF,sU,Ls,Ns,in,out);
else if (interior) WilsonKernels<Impl>::AsmDhopSiteDagInt(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
else if (exterior) WilsonKernels<Impl>::AsmDhopSiteDagExt(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
else assert(0);
break;
#endif
case OptHandUnroll:
for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) {
if(interior&&exterior) WilsonKernels<Impl>::HandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
else if (interior) WilsonKernels<Impl>::HandDhopSiteDagInt(st,lo,U,buf,sF,sU,in,out);
else if (exterior) WilsonKernels<Impl>::HandDhopSiteDagExt(st,lo,U,buf,sF,sU,in,out);
else assert(0);
sF++;
}
sU++;
}
break;
case OptGeneric:
for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) {
if(interior&&exterior) WilsonKernels<Impl>::GenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
else if (interior) WilsonKernels<Impl>::GenericDhopSiteDagInt(st,lo,U,buf,sF,sU,in,out);
else if (exterior) WilsonKernels<Impl>::GenericDhopSiteDagExt(st,lo,U,buf,sF,sU,in,out);
else assert(0);
sF++;
}
sU++;
}
break;
default:
assert(0);
}
bgq_l1p_optimisation(0);
}
template <bool EnableBool = true>
typename std::enable_if<(Impl::isFundamental==false || (Impl::isFundamental==true && Nc != 3)) && EnableBool,void>::type
DhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out,int interior=1,int exterior=1) {
for (int site = 0; site < Ns; site++) {
for (int s = 0; s < Ls; s++) {
if(interior&&exterior) WilsonKernels<Impl>::GenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
else if (interior) WilsonKernels<Impl>::GenericDhopSiteDagInt(st,lo,U,buf,sF,sU,in,out);
else if (exterior) WilsonKernels<Impl>::GenericDhopSiteDagExt(st,lo,U,buf,sF,sU,in,out);
else assert(0);
sF++;
}
sU++;
}
}
void DhopDir(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, int gamma);
//////////////////////////////////////////////////////////////////////////////
// Utilities for inserting Wilson conserved current.
//////////////////////////////////////////////////////////////////////////////
void ContractConservedCurrentSiteFwd(const SitePropagator &q_in_1,
const SitePropagator &q_in_2,
SitePropagator &q_out,
DoubledGaugeField &U,
unsigned int sU,
unsigned int mu,
bool switch_sign = false);
void ContractConservedCurrentSiteBwd(const SitePropagator &q_in_1,
const SitePropagator &q_in_2,
SitePropagator &q_out,
DoubledGaugeField &U,
unsigned int sU,
unsigned int mu,
bool switch_sign = false);
void SeqConservedCurrentSiteFwd(const SitePropagator &q_in,
SitePropagator &q_out,
DoubledGaugeField &U,
unsigned int sU,
unsigned int mu,
vInteger t_mask,
bool switch_sign = false);
void SeqConservedCurrentSiteBwd(const SitePropagator &q_in,
SitePropagator &q_out,
DoubledGaugeField &U,
unsigned int sU,
unsigned int mu,
vInteger t_mask,
bool switch_sign = false);
private:
// Specialised variants
void GenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out);
void GenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out);
void GenericDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out);
void GenericDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out);
void GenericDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out);
void GenericDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out);
void AsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
void AsmDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out);
void AsmDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
void AsmDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out);
void AsmDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out);
void AsmDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out);
void HandDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out);
void HandDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out);
void HandDhopSiteInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out);
void HandDhopSiteDagInt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out);
void HandDhopSiteExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out);
void HandDhopSiteDagExt(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
int sF, int sU, const FermionField &in, FermionField &out);
public:
WilsonKernels(const ImplParams &p = ImplParams());
};
}}
#endif

View File

@ -1,127 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernelsAsm.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
namespace Grid {
namespace QCD {
///////////////////////////////////////////////////////////
// Default to no assembler implementation
///////////////////////////////////////////////////////////
template<class Impl> void
WilsonKernels<Impl >::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
{
assert(0);
}
template<class Impl> void
WilsonKernels<Impl >::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
{
assert(0);
}
template<class Impl> void
WilsonKernels<Impl >::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
{
assert(0);
}
template<class Impl> void
WilsonKernels<Impl >::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
{
assert(0);
}
template<class Impl> void
WilsonKernels<Impl >::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
{
assert(0);
}
template<class Impl> void
WilsonKernels<Impl >::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
{
assert(0);
}
#include <qcd/action/fermion/WilsonKernelsAsmAvx512.h>
#include <qcd/action/fermion/WilsonKernelsAsmQPX.h>
#define INSTANTIATE_ASM(A)\
template void WilsonKernels<A>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
\
template void WilsonKernels<A>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
template void WilsonKernels<A>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
\
template void WilsonKernels<A>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
template void WilsonKernels<A>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
\
template void WilsonKernels<A>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\
INSTANTIATE_ASM(WilsonImplF);
INSTANTIATE_ASM(WilsonImplD);
INSTANTIATE_ASM(ZWilsonImplF);
INSTANTIATE_ASM(ZWilsonImplD);
INSTANTIATE_ASM(GparityWilsonImplF);
INSTANTIATE_ASM(GparityWilsonImplD);
INSTANTIATE_ASM(DomainWallVec5dImplF);
INSTANTIATE_ASM(DomainWallVec5dImplD);
INSTANTIATE_ASM(ZDomainWallVec5dImplF);
INSTANTIATE_ASM(ZDomainWallVec5dImplD);
INSTANTIATE_ASM(WilsonImplFH);
INSTANTIATE_ASM(WilsonImplDF);
INSTANTIATE_ASM(ZWilsonImplFH);
INSTANTIATE_ASM(ZWilsonImplDF);
INSTANTIATE_ASM(GparityWilsonImplFH);
INSTANTIATE_ASM(GparityWilsonImplDF);
INSTANTIATE_ASM(DomainWallVec5dImplFH);
INSTANTIATE_ASM(DomainWallVec5dImplDF);
INSTANTIATE_ASM(ZDomainWallVec5dImplFH);
INSTANTIATE_ASM(ZDomainWallVec5dImplDF);
}}

View File

@ -1,650 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernelsAsmAvx512.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#if defined(AVX512)
///////////////////////////////////////////////////////////
// If we are AVX512 specialise the single precision routine
///////////////////////////////////////////////////////////
#include <simd/Intel512wilson.h>
#include <simd/Intel512single.h>
static Vector<vComplexF> signsF;
template<typename vtype>
int setupSigns(Vector<vtype>& signs ){
Vector<vtype> bother(2);
signs = bother;
vrsign(signs[0]);
visign(signs[1]);
return 1;
}
static int signInitF = setupSigns(signsF);
#define MAYBEPERM(A,perm) if (perm) { A ; }
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
#define COMPLEX_SIGNS(isigns) vComplexF *isigns = &signsF[0];
/////////////////////////////////////////////////////////////////
// XYZT vectorised, undag Kernel, single
/////////////////////////////////////////////////////////////////
#undef KERNEL_DAG
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
/////////////////////////////////////////////////////////////////
// XYZT vectorised, dag Kernel, single
/////////////////////////////////////////////////////////////////
#define KERNEL_DAG
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef MAYBEPERM
#undef MULT_2SPIN
#define MAYBEPERM(A,B)
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
/////////////////////////////////////////////////////////////////
// Ls vectorised, undag Kernel, single
/////////////////////////////////////////////////////////////////
#undef KERNEL_DAG
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<DomainWallVec5dImplF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
#undef MULT_2SPIN
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf)
template<> void
WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
/////////////////////////////////////////////////////////////////
// Ls vectorised, dag Kernel, single
/////////////////////////////////////////////////////////////////
#define KERNEL_DAG
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
template<> void
WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef COMPLEX_SIGNS
#undef MAYBEPERM
#undef MULT_2SPIN
///////////////////////////////////////////////////////////
// If we are AVX512 specialise the double precision routine
///////////////////////////////////////////////////////////
#include <simd/Intel512double.h>
static Vector<vComplexD> signsD;
static int signInitD = setupSigns(signsD);
#define MAYBEPERM(A,perm) if (perm) { A ; }
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN(ptr,pf)
#define COMPLEX_SIGNS(isigns) vComplexD *isigns = &signsD[0];
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
/////////////////////////////////////////////////////////////////
// XYZT vectorised, undag Kernel, single
/////////////////////////////////////////////////////////////////
#undef KERNEL_DAG
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
/////////////////////////////////////////////////////////////////
// XYZT vectorised, dag Kernel, single
/////////////////////////////////////////////////////////////////
#define KERNEL_DAG
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef MAYBEPERM
#undef MULT_2SPIN
#define MAYBEPERM(A,B)
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf)
/////////////////////////////////////////////////////////////////
// Ls vectorised, undag Kernel, single
/////////////////////////////////////////////////////////////////
#undef KERNEL_DAG
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<DomainWallVec5dImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
#undef MULT_2SPIN
#define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LSNOPF(ptr,pf)
template<> void
WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
/////////////////////////////////////////////////////////////////
// Ls vectorised, dag Kernel, single
/////////////////////////////////////////////////////////////////
#define KERNEL_DAG
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#define INTERIOR
#undef EXTERIOR
template<> void
WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR
#undef INTERIOR
#define EXTERIOR
template<> void
WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
template<> void
WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef COMPLEX_SIGNS
#undef MAYBEPERM
#undef MULT_2SPIN
#endif //AVX512

View File

@ -1,196 +0,0 @@
#ifdef KERNEL_DAG
#define DIR0_PROJMEM(base) XP_PROJMEM(base);
#define DIR1_PROJMEM(base) YP_PROJMEM(base);
#define DIR2_PROJMEM(base) ZP_PROJMEM(base);
#define DIR3_PROJMEM(base) TP_PROJMEM(base);
#define DIR4_PROJMEM(base) XM_PROJMEM(base);
#define DIR5_PROJMEM(base) YM_PROJMEM(base);
#define DIR6_PROJMEM(base) ZM_PROJMEM(base);
#define DIR7_PROJMEM(base) TM_PROJMEM(base);
#define DIR0_RECON XP_RECON
#define DIR1_RECON YP_RECON_ACCUM
#define DIR2_RECON ZP_RECON_ACCUM
#define DIR3_RECON TP_RECON_ACCUM
#define DIR4_RECON XM_RECON_ACCUM
#define DIR5_RECON YM_RECON_ACCUM
#define DIR6_RECON ZM_RECON_ACCUM
#define DIR7_RECON TM_RECON_ACCUM
#else
#define DIR0_PROJMEM(base) XM_PROJMEM(base);
#define DIR1_PROJMEM(base) YM_PROJMEM(base);
#define DIR2_PROJMEM(base) ZM_PROJMEM(base);
#define DIR3_PROJMEM(base) TM_PROJMEM(base);
#define DIR4_PROJMEM(base) XP_PROJMEM(base);
#define DIR5_PROJMEM(base) YP_PROJMEM(base);
#define DIR6_PROJMEM(base) ZP_PROJMEM(base);
#define DIR7_PROJMEM(base) TP_PROJMEM(base);
#define DIR0_RECON XM_RECON
#define DIR1_RECON YM_RECON_ACCUM
#define DIR2_RECON ZM_RECON_ACCUM
#define DIR3_RECON TM_RECON_ACCUM
#define DIR4_RECON XP_RECON_ACCUM
#define DIR5_RECON YP_RECON_ACCUM
#define DIR6_RECON ZP_RECON_ACCUM
#define DIR7_RECON TP_RECON_ACCUM
#endif
////////////////////////////////////////////////////////////////////////////////
// Comms then compute kernel
////////////////////////////////////////////////////////////////////////////////
#ifdef INTERIOR_AND_EXTERIOR
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
basep = st.GetPFInfo(nent,plocal); nent++; \
if ( local ) { \
LOAD64(%r10,isigns); \
PROJ(base); \
MAYBEPERM(PERMUTE_DIR,perm); \
} else { \
LOAD_CHI(base); \
} \
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
PREFETCH_CHIMU(base); \
MULT_2SPIN_DIR_PF(Dir,basep); \
LOAD64(%r10,isigns); \
RECON; \
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
PF_GAUGE(Xp); \
PREFETCH1_CHIMU(base); \
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
#define RESULT(base,basep) SAVE_RESULT(base,basep);
#endif
////////////////////////////////////////////////////////////////////////////////
// Pre comms kernel -- prefetch like normal because it is mostly right
////////////////////////////////////////////////////////////////////////////////
#ifdef INTERIOR
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
basep = st.GetPFInfo(nent,plocal); nent++; \
if ( local ) { \
LOAD64(%r10,isigns); \
PROJ(base); \
MAYBEPERM(PERMUTE_DIR,perm); \
}else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \
if ( local || st.same_node[Dir] ) { \
MULT_2SPIN_DIR_PF(Dir,basep); \
LOAD64(%r10,isigns); \
RECON; \
} \
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
PREFETCH_CHIMU(base); \
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
PF_GAUGE(Xp); \
PREFETCH1_CHIMU(base); \
{ ZERO_PSI; } \
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
#define RESULT(base,basep) SAVE_RESULT(base,basep);
#endif
////////////////////////////////////////////////////////////////////////////////
// Post comms kernel
////////////////////////////////////////////////////////////////////////////////
#ifdef EXTERIOR
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
if((!local)&&(!st.same_node[Dir]) ) { \
LOAD_CHI(base); \
MULT_2SPIN_DIR_PF(Dir,base); \
LOAD64(%r10,isigns); \
RECON; \
nmu++; \
}
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
nmu=0; \
{ ZERO_PSI;} \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
if((!local)&&(!st.same_node[Dir]) ) { \
LOAD_CHI(base); \
MULT_2SPIN_DIR_PF(Dir,base); \
LOAD64(%r10,isigns); \
RECON; \
nmu++; \
}
#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);}
#endif
{
int nmu;
int local,perm, ptype;
uint64_t base;
uint64_t basep;
const uint64_t plocal =(uint64_t) & in._odata[0];
COMPLEX_SIGNS(isigns);
MASK_REGS;
int nmax=U._grid->oSites();
for(int site=0;site<Ns;site++) {
#ifndef EXTERIOR
int sU =lo.Reorder(ssU);
int ssn=ssU+1; if(ssn>=nmax) ssn=0;
int sUn=lo.Reorder(ssn);
LOCK_GAUGE(0);
#else
int sU =ssU;
int ssn=ssU+1; if(ssn>=nmax) ssn=0;
int sUn=ssn;
#endif
for(int s=0;s<Ls;s++) {
ss =sU*Ls+s;
ssn=sUn*Ls+s;
int ent=ss*8;// 2*Ndim
int nent=ssn*8;
ASM_LEG_XP(Xp,Yp,PERMUTE_DIR3,DIR0_PROJMEM,DIR0_RECON);
ASM_LEG(Yp,Zp,PERMUTE_DIR2,DIR1_PROJMEM,DIR1_RECON);
ASM_LEG(Zp,Tp,PERMUTE_DIR1,DIR2_PROJMEM,DIR2_RECON);
ASM_LEG(Tp,Xm,PERMUTE_DIR0,DIR3_PROJMEM,DIR3_RECON);
ASM_LEG(Xm,Ym,PERMUTE_DIR3,DIR4_PROJMEM,DIR4_RECON);
ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJMEM,DIR5_RECON);
ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJMEM,DIR6_RECON);
ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJMEM,DIR7_RECON);
#ifdef EXTERIOR
if (nmu==0) break;
// if (nmu!=0) std::cout << "EXT "<<sU<<std::endl;
#endif
base = (uint64_t) &out._odata[ss];
basep= st.GetPFInfo(nent,plocal); nent++;
RESULT(base,basep);
}
ssU++;
UNLOCK_GAUGE(0);
}
}
#undef DIR0_PROJMEM
#undef DIR1_PROJMEM
#undef DIR2_PROJMEM
#undef DIR3_PROJMEM
#undef DIR4_PROJMEM
#undef DIR5_PROJMEM
#undef DIR6_PROJMEM
#undef DIR7_PROJMEM
#undef DIR0_RECON
#undef DIR1_RECON
#undef DIR2_RECON
#undef DIR3_RECON
#undef DIR4_RECON
#undef DIR5_RECON
#undef DIR6_RECON
#undef DIR7_RECON
#undef ASM_LEG
#undef ASM_LEG_XP
#undef RESULT

View File

@ -1,150 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernelsAsmQPX.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#if defined(QPX)
///////////////////////////////////////////////////////////
// If we are QPX specialise the single precision routine
///////////////////////////////////////////////////////////
#include <simd/IBM_qpx.h>
#include <simd/IBM_qpx_single.h>
#define MAYBEPERM(A,perm) if (perm) { A ; }
#define MULT_2SPIN(ptr,pf) MULT_2SPIN_QPX(ptr,pf)
#define COMPLEX_SIGNS(isigns)
#define INTERIOR_AND_EXTERIOR
#undef INTERIOR
#undef EXTERIOR
/////////////////////////////////////////////////////////////////
// XYZT vectorised, undag Kernel, single
/////////////////////////////////////////////////////////////////
#undef KERNEL_DAG
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
/////////////////////////////////////////////////////////////////
// XYZT vectorised, dag Kernel, single
/////////////////////////////////////////////////////////////////
#define KERNEL_DAG
template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef MAYBEPERM
#undef MULT_2SPIN
#define MAYBEPERM(A,B)
#define MULT_2SPIN(ptr,pf) MULT_2SPIN_QPX_LS(ptr,pf)
/////////////////////////////////////////////////////////////////
// Ls vectorised, undag Kernel, single
/////////////////////////////////////////////////////////////////
#undef KERNEL_DAG
template<> void
WilsonKernels<DomainWallVec5dImplF>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
/////////////////////////////////////////////////////////////////
// Ls vectorised, dag Kernel, single
/////////////////////////////////////////////////////////////////
#define KERNEL_DAG
template<> void
WilsonKernels<DomainWallVec5dImplF>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
#undef MAYBEPERM
#undef MULT_2SPIN
///////////////////////////////////////////////////////////
// DP routines
///////////////////////////////////////////////////////////
#include <simd/IBM_qpx_double.h>
#define MAYBEPERM(A,perm) if (perm) { A ; }
#define MULT_2SPIN(ptr,pf) MULT_2SPIN_QPX(ptr,pf)
/////////////////////////////////////////////////////////////////
// XYZT Vectorised, undag Kernel, double
/////////////////////////////////////////////////////////////////
#undef KERNEL_DAG
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
// XYZT Vectorised, dag Kernel, double
/////////////////////////////////////////////////////////////////
#define KERNEL_DAG
template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
/////////////////////////////////////////////////////////////////
#undef MAYBEPERM
#undef MULT_2SPIN
#define MAYBEPERM(A,B)
#define MULT_2SPIN(ptr,pf) MULT_2SPIN_QPX_LS(ptr,pf)
/////////////////////////////////////////////////////////////////
// Ls vectorised, undag Kernel, double
/////////////////////////////////////////////////////////////////
#undef KERNEL_DAG
template<> void
WilsonKernels<DomainWallVec5dImplD>::AsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
// Ls vectorised, dag Kernel, double
/////////////////////////////////////////////////////////////////
#define KERNEL_DAG
template<> void
WilsonKernels<DomainWallVec5dImplD>::AsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out)
#include <qcd/action/fermion/WilsonKernelsAsmBody.h>
/////////////////////////////////////////////////////////////////
#undef MAYBEPERM
#undef MULT_2SPIN
#endif

View File

@ -1,631 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernelsHand.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#define REGISTER
#define LOAD_CHIMU \
{const SiteSpinor & ref (in._odata[offset]); \
Chimu_00=ref()(0)(0);\
Chimu_01=ref()(0)(1);\
Chimu_02=ref()(0)(2);\
Chimu_10=ref()(1)(0);\
Chimu_11=ref()(1)(1);\
Chimu_12=ref()(1)(2);\
Chimu_20=ref()(2)(0);\
Chimu_21=ref()(2)(1);\
Chimu_22=ref()(2)(2);\
Chimu_30=ref()(3)(0);\
Chimu_31=ref()(3)(1);\
Chimu_32=ref()(3)(2);}
#define LOAD_CHI\
{const SiteHalfSpinor &ref(buf[offset]); \
Chi_00 = ref()(0)(0);\
Chi_01 = ref()(0)(1);\
Chi_02 = ref()(0)(2);\
Chi_10 = ref()(1)(0);\
Chi_11 = ref()(1)(1);\
Chi_12 = ref()(1)(2);}
// To splat or not to splat depends on the implementation
#define MULT_2SPIN(A)\
{auto & ref(U._odata[sU](A)); \
Impl::loadLinkElement(U_00,ref()(0,0)); \
Impl::loadLinkElement(U_10,ref()(1,0)); \
Impl::loadLinkElement(U_20,ref()(2,0)); \
Impl::loadLinkElement(U_01,ref()(0,1)); \
Impl::loadLinkElement(U_11,ref()(1,1)); \
Impl::loadLinkElement(U_21,ref()(2,1)); \
UChi_00 = U_00*Chi_00;\
UChi_10 = U_00*Chi_10;\
UChi_01 = U_10*Chi_00;\
UChi_11 = U_10*Chi_10;\
UChi_02 = U_20*Chi_00;\
UChi_12 = U_20*Chi_10;\
UChi_00+= U_01*Chi_01;\
UChi_10+= U_01*Chi_11;\
UChi_01+= U_11*Chi_01;\
UChi_11+= U_11*Chi_11;\
UChi_02+= U_21*Chi_01;\
UChi_12+= U_21*Chi_11;\
Impl::loadLinkElement(U_00,ref()(0,2)); \
Impl::loadLinkElement(U_10,ref()(1,2)); \
Impl::loadLinkElement(U_20,ref()(2,2)); \
UChi_00+= U_00*Chi_02;\
UChi_10+= U_00*Chi_12;\
UChi_01+= U_10*Chi_02;\
UChi_11+= U_10*Chi_12;\
UChi_02+= U_20*Chi_02;\
UChi_12+= U_20*Chi_12;}
#define PERMUTE_DIR(dir) \
permute##dir(Chi_00,Chi_00);\
permute##dir(Chi_01,Chi_01);\
permute##dir(Chi_02,Chi_02);\
permute##dir(Chi_10,Chi_10);\
permute##dir(Chi_11,Chi_11);\
permute##dir(Chi_12,Chi_12);
// hspin(0)=fspin(0)+timesI(fspin(3));
// hspin(1)=fspin(1)+timesI(fspin(2));
#define XP_PROJ \
Chi_00 = Chimu_00+timesI(Chimu_30);\
Chi_01 = Chimu_01+timesI(Chimu_31);\
Chi_02 = Chimu_02+timesI(Chimu_32);\
Chi_10 = Chimu_10+timesI(Chimu_20);\
Chi_11 = Chimu_11+timesI(Chimu_21);\
Chi_12 = Chimu_12+timesI(Chimu_22);
#define YP_PROJ \
Chi_00 = Chimu_00-Chimu_30;\
Chi_01 = Chimu_01-Chimu_31;\
Chi_02 = Chimu_02-Chimu_32;\
Chi_10 = Chimu_10+Chimu_20;\
Chi_11 = Chimu_11+Chimu_21;\
Chi_12 = Chimu_12+Chimu_22;
#define ZP_PROJ \
Chi_00 = Chimu_00+timesI(Chimu_20); \
Chi_01 = Chimu_01+timesI(Chimu_21); \
Chi_02 = Chimu_02+timesI(Chimu_22); \
Chi_10 = Chimu_10-timesI(Chimu_30); \
Chi_11 = Chimu_11-timesI(Chimu_31); \
Chi_12 = Chimu_12-timesI(Chimu_32);
#define TP_PROJ \
Chi_00 = Chimu_00+Chimu_20; \
Chi_01 = Chimu_01+Chimu_21; \
Chi_02 = Chimu_02+Chimu_22; \
Chi_10 = Chimu_10+Chimu_30; \
Chi_11 = Chimu_11+Chimu_31; \
Chi_12 = Chimu_12+Chimu_32;
// hspin(0)=fspin(0)-timesI(fspin(3));
// hspin(1)=fspin(1)-timesI(fspin(2));
#define XM_PROJ \
Chi_00 = Chimu_00-timesI(Chimu_30);\
Chi_01 = Chimu_01-timesI(Chimu_31);\
Chi_02 = Chimu_02-timesI(Chimu_32);\
Chi_10 = Chimu_10-timesI(Chimu_20);\
Chi_11 = Chimu_11-timesI(Chimu_21);\
Chi_12 = Chimu_12-timesI(Chimu_22);
#define YM_PROJ \
Chi_00 = Chimu_00+Chimu_30;\
Chi_01 = Chimu_01+Chimu_31;\
Chi_02 = Chimu_02+Chimu_32;\
Chi_10 = Chimu_10-Chimu_20;\
Chi_11 = Chimu_11-Chimu_21;\
Chi_12 = Chimu_12-Chimu_22;
#define ZM_PROJ \
Chi_00 = Chimu_00-timesI(Chimu_20); \
Chi_01 = Chimu_01-timesI(Chimu_21); \
Chi_02 = Chimu_02-timesI(Chimu_22); \
Chi_10 = Chimu_10+timesI(Chimu_30); \
Chi_11 = Chimu_11+timesI(Chimu_31); \
Chi_12 = Chimu_12+timesI(Chimu_32);
#define TM_PROJ \
Chi_00 = Chimu_00-Chimu_20; \
Chi_01 = Chimu_01-Chimu_21; \
Chi_02 = Chimu_02-Chimu_22; \
Chi_10 = Chimu_10-Chimu_30; \
Chi_11 = Chimu_11-Chimu_31; \
Chi_12 = Chimu_12-Chimu_32;
// fspin(0)=hspin(0);
// fspin(1)=hspin(1);
// fspin(2)=timesMinusI(hspin(1));
// fspin(3)=timesMinusI(hspin(0));
#define XP_RECON\
result_00 = UChi_00;\
result_01 = UChi_01;\
result_02 = UChi_02;\
result_10 = UChi_10;\
result_11 = UChi_11;\
result_12 = UChi_12;\
result_20 = timesMinusI(UChi_10);\
result_21 = timesMinusI(UChi_11);\
result_22 = timesMinusI(UChi_12);\
result_30 = timesMinusI(UChi_00);\
result_31 = timesMinusI(UChi_01);\
result_32 = timesMinusI(UChi_02);
#define XP_RECON_ACCUM\
result_00+=UChi_00;\
result_01+=UChi_01;\
result_02+=UChi_02;\
result_10+=UChi_10;\
result_11+=UChi_11;\
result_12+=UChi_12;\
result_20-=timesI(UChi_10);\
result_21-=timesI(UChi_11);\
result_22-=timesI(UChi_12);\
result_30-=timesI(UChi_00);\
result_31-=timesI(UChi_01);\
result_32-=timesI(UChi_02);
#define XM_RECON\
result_00 = UChi_00;\
result_01 = UChi_01;\
result_02 = UChi_02;\
result_10 = UChi_10;\
result_11 = UChi_11;\
result_12 = UChi_12;\
result_20 = timesI(UChi_10);\
result_21 = timesI(UChi_11);\
result_22 = timesI(UChi_12);\
result_30 = timesI(UChi_00);\
result_31 = timesI(UChi_01);\
result_32 = timesI(UChi_02);
#define XM_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20+= timesI(UChi_10);\
result_21+= timesI(UChi_11);\
result_22+= timesI(UChi_12);\
result_30+= timesI(UChi_00);\
result_31+= timesI(UChi_01);\
result_32+= timesI(UChi_02);
#define YP_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20+= UChi_10;\
result_21+= UChi_11;\
result_22+= UChi_12;\
result_30-= UChi_00;\
result_31-= UChi_01;\
result_32-= UChi_02;
#define YM_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20-= UChi_10;\
result_21-= UChi_11;\
result_22-= UChi_12;\
result_30+= UChi_00;\
result_31+= UChi_01;\
result_32+= UChi_02;
#define ZP_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20-= timesI(UChi_00); \
result_21-= timesI(UChi_01); \
result_22-= timesI(UChi_02); \
result_30+= timesI(UChi_10); \
result_31+= timesI(UChi_11); \
result_32+= timesI(UChi_12);
#define ZM_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20+= timesI(UChi_00); \
result_21+= timesI(UChi_01); \
result_22+= timesI(UChi_02); \
result_30-= timesI(UChi_10); \
result_31-= timesI(UChi_11); \
result_32-= timesI(UChi_12);
#define TP_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20+= UChi_00; \
result_21+= UChi_01; \
result_22+= UChi_02; \
result_30+= UChi_10; \
result_31+= UChi_11; \
result_32+= UChi_12;
#define TM_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20-= UChi_00; \
result_21-= UChi_01; \
result_22-= UChi_02; \
result_30-= UChi_10; \
result_31-= UChi_11; \
result_32-= UChi_12;
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU; \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
} \
} else { \
LOAD_CHI; \
} \
MULT_2SPIN(DIR); \
RECON;
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU; \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
} \
} else if ( st.same_node[DIR] ) { \
LOAD_CHI; \
} \
if (local || st.same_node[DIR] ) { \
MULT_2SPIN(DIR); \
RECON; \
}
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \
LOAD_CHI; \
MULT_2SPIN(DIR); \
RECON; \
nmu++; \
}
#define HAND_RESULT(ss) \
{ \
SiteSpinor & ref (out._odata[ss]); \
vstream(ref()(0)(0),result_00); \
vstream(ref()(0)(1),result_01); \
vstream(ref()(0)(2),result_02); \
vstream(ref()(1)(0),result_10); \
vstream(ref()(1)(1),result_11); \
vstream(ref()(1)(2),result_12); \
vstream(ref()(2)(0),result_20); \
vstream(ref()(2)(1),result_21); \
vstream(ref()(2)(2),result_22); \
vstream(ref()(3)(0),result_30); \
vstream(ref()(3)(1),result_31); \
vstream(ref()(3)(2),result_32); \
}
#define HAND_RESULT_EXT(ss) \
if (nmu){ \
SiteSpinor & ref (out._odata[ss]); \
ref()(0)(0)+=result_00; \
ref()(0)(1)+=result_01; \
ref()(0)(2)+=result_02; \
ref()(1)(0)+=result_10; \
ref()(1)(1)+=result_11; \
ref()(1)(2)+=result_12; \
ref()(2)(0)+=result_20; \
ref()(2)(1)+=result_21; \
ref()(2)(2)+=result_22; \
ref()(3)(0)+=result_30; \
ref()(3)(1)+=result_31; \
ref()(3)(2)+=result_32; \
}
#define HAND_DECLARATIONS(a) \
Simd result_00; \
Simd result_01; \
Simd result_02; \
Simd result_10; \
Simd result_11; \
Simd result_12; \
Simd result_20; \
Simd result_21; \
Simd result_22; \
Simd result_30; \
Simd result_31; \
Simd result_32; \
Simd Chi_00; \
Simd Chi_01; \
Simd Chi_02; \
Simd Chi_10; \
Simd Chi_11; \
Simd Chi_12; \
Simd UChi_00; \
Simd UChi_01; \
Simd UChi_02; \
Simd UChi_10; \
Simd UChi_11; \
Simd UChi_12; \
Simd U_00; \
Simd U_10; \
Simd U_20; \
Simd U_01; \
Simd U_11; \
Simd U_21;
#define ZERO_RESULT \
result_00=zero; \
result_01=zero; \
result_02=zero; \
result_10=zero; \
result_11=zero; \
result_12=zero; \
result_20=zero; \
result_21=zero; \
result_22=zero; \
result_30=zero; \
result_31=zero; \
result_32=zero;
#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12
#define Chimu_20 UChi_00
#define Chimu_21 UChi_01
#define Chimu_22 UChi_02
#define Chimu_30 UChi_10
#define Chimu_31 UChi_11
#define Chimu_32 UChi_12
namespace Grid {
namespace QCD {
template<class Impl> void
WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
HAND_DECLARATIONS(ignore);
int offset,local,perm, ptype;
StencilEntry *SE;
HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM);
HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM);
HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM);
HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM);
HAND_RESULT(ss);
}
template<class Impl>
void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
HAND_DECLARATIONS(ignore);
StencilEntry *SE;
int offset,local,perm, ptype;
HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM);
HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM);
HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM);
HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM);
HAND_RESULT(ss);
}
template<class Impl> void
WilsonKernels<Impl>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
HAND_DECLARATIONS(ignore);
int offset,local,perm, ptype;
StencilEntry *SE;
ZERO_RESULT;
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
HAND_RESULT(ss);
}
template<class Impl>
void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
HAND_DECLARATIONS(ignore);
StencilEntry *SE;
int offset,local,perm, ptype;
ZERO_RESULT;
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
HAND_RESULT(ss);
}
template<class Impl> void
WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
HAND_DECLARATIONS(ignore);
int offset,local,perm, ptype;
StencilEntry *SE;
int nmu=0;
ZERO_RESULT;
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM);
HAND_RESULT_EXT(ss);
}
template<class Impl>
void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
HAND_DECLARATIONS(ignore);
StencilEntry *SE;
int offset,local,perm, ptype;
int nmu=0;
ZERO_RESULT;
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM);
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM);
HAND_RESULT_EXT(ss);
}
////////////// Wilson ; uses this implementation /////////////////////
#define INSTANTIATE_THEM(A) \
template void WilsonKernels<A>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
int ss,int sU,const FermionField &in, FermionField &out); \
template void WilsonKernels<A>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out);\
template void WilsonKernels<A>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
int ss,int sU,const FermionField &in, FermionField &out); \
template void WilsonKernels<A>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out); \
template void WilsonKernels<A>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
int ss,int sU,const FermionField &in, FermionField &out); \
template void WilsonKernels<A>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out);
INSTANTIATE_THEM(WilsonImplF);
INSTANTIATE_THEM(WilsonImplD);
INSTANTIATE_THEM(ZWilsonImplF);
INSTANTIATE_THEM(ZWilsonImplD);
INSTANTIATE_THEM(DomainWallVec5dImplF);
INSTANTIATE_THEM(DomainWallVec5dImplD);
INSTANTIATE_THEM(ZDomainWallVec5dImplF);
INSTANTIATE_THEM(ZDomainWallVec5dImplD);
INSTANTIATE_THEM(WilsonImplFH);
INSTANTIATE_THEM(WilsonImplDF);
INSTANTIATE_THEM(ZWilsonImplFH);
INSTANTIATE_THEM(ZWilsonImplDF);
INSTANTIATE_THEM(DomainWallVec5dImplFH);
INSTANTIATE_THEM(DomainWallVec5dImplDF);
INSTANTIATE_THEM(ZDomainWallVec5dImplFH);
INSTANTIATE_THEM(ZDomainWallVec5dImplDF);
INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplF);
INSTANTIATE_THEM(WilsonTwoIndexAntiSymmetricImplD);
}}

View File

@ -1,878 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonKernelsHand.cc
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/qcd/action/fermion/FermionCore.h>
#define REGISTER
#define LOAD_CHIMU_BODY(F) \
Chimu_00=ref(F)(0)(0); \
Chimu_01=ref(F)(0)(1); \
Chimu_02=ref(F)(0)(2); \
Chimu_10=ref(F)(1)(0); \
Chimu_11=ref(F)(1)(1); \
Chimu_12=ref(F)(1)(2); \
Chimu_20=ref(F)(2)(0); \
Chimu_21=ref(F)(2)(1); \
Chimu_22=ref(F)(2)(2); \
Chimu_30=ref(F)(3)(0); \
Chimu_31=ref(F)(3)(1); \
Chimu_32=ref(F)(3)(2)
#define LOAD_CHIMU(DIR,F,PERM) \
{ const SiteSpinor & ref (in._odata[offset]); LOAD_CHIMU_BODY(F); }
#define LOAD_CHI_BODY(F) \
Chi_00 = ref(F)(0)(0);\
Chi_01 = ref(F)(0)(1);\
Chi_02 = ref(F)(0)(2);\
Chi_10 = ref(F)(1)(0);\
Chi_11 = ref(F)(1)(1);\
Chi_12 = ref(F)(1)(2)
#define LOAD_CHI(DIR,F,PERM) \
{const SiteHalfSpinor &ref(buf[offset]); LOAD_CHI_BODY(F); }
//G-parity implementations using in-place intrinsic ops
//1l 1h -> 1h 1l
//0l 0h , 1h 1l -> 0l 1h 0h,1l
//0h,1l -> 1l,0h
//if( (distance == 1 && !perm_will_occur) || (distance == -1 && perm_will_occur) )
//Pulled fermion through forwards face, GPBC on upper component
//Need 0= 0l 1h 1= 1l 0h
//else if( (distance == -1 && !perm) || (distance == 1 && perm) )
//Pulled fermion through backwards face, GPBC on lower component
//Need 0= 1l 0h 1= 0l 1h
//1l 1h -> 1h 1l
//0l 0h , 1h 1l -> 0l 1h 0h,1l
#define DO_TWIST_0L_1H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3) \
permute##PERM(tmp1, ref(1)(S)(C)); \
exchange##PERM(tmp2,tmp3, ref(0)(S)(C), tmp1); \
INTO = tmp2;
//0l 0h -> 0h 0l
//1l 1h, 0h 0l -> 1l 0h, 1h 0l
#define DO_TWIST_1L_0H(INTO,S,C,F, PERM, tmp1, tmp2, tmp3) \
permute##PERM(tmp1, ref(0)(S)(C)); \
exchange##PERM(tmp2,tmp3, ref(1)(S)(C), tmp1); \
INTO = tmp2;
#define LOAD_CHI_SETUP(DIR,F) \
g = F; \
direction = st._directions[DIR]; \
distance = st._distances[DIR]; \
sl = st._grid->_simd_layout[direction]; \
inplace_twist = 0; \
if(SE->_around_the_world && this->Params.twists[DIR % 4]){ \
if(sl == 1){ \
g = (F+1) % 2; \
}else{ \
inplace_twist = 1; \
} \
}
#define LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM) \
{ const SiteSpinor &ref(in._odata[offset]); \
LOAD_CHI_SETUP(DIR,F); \
if(!inplace_twist){ \
LOAD_CHIMU_BODY(g); \
}else{ \
if( ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
DO_TWIST_0L_1H(Chimu_00,0,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_01,0,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chimu_02,0,2,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_10,1,0,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chimu_11,1,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_12,1,2,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chimu_20,2,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_21,2,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chimu_22,2,2,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_30,3,0,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chimu_31,3,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chimu_32,3,2,F,PERM, U_11,U_20,U_21); \
}else{ \
DO_TWIST_1L_0H(Chimu_00,0,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_01,0,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chimu_02,0,2,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_10,1,0,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chimu_11,1,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_12,1,2,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chimu_20,2,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_21,2,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chimu_22,2,2,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_30,3,0,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chimu_31,3,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chimu_32,3,2,F,PERM, U_11,U_20,U_21); \
} \
} \
}
#define LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM) \
{ const SiteHalfSpinor &ref(buf[offset]); \
LOAD_CHI_SETUP(DIR,F); \
if(!inplace_twist){ \
LOAD_CHI_BODY(g); \
}else{ \
if( ( F==0 && ((distance == 1 && !perm) || (distance == -1 && perm)) ) || \
( F==1 && ((distance == -1 && !perm) || (distance == 1 && perm)) ) ){ \
DO_TWIST_0L_1H(Chi_00,0,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chi_01,0,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_0L_1H(Chi_02,0,2,F,PERM, UChi_00,UChi_01,UChi_02); \
DO_TWIST_0L_1H(Chi_10,1,0,F,PERM, UChi_10,UChi_11,UChi_12); \
DO_TWIST_0L_1H(Chi_11,1,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_0L_1H(Chi_12,1,2,F,PERM, U_11,U_20,U_21); \
}else{ \
DO_TWIST_1L_0H(Chi_00,0,0,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chi_01,0,1,F,PERM, U_11,U_20,U_21); \
DO_TWIST_1L_0H(Chi_02,0,2,F,PERM, UChi_00,UChi_01,UChi_02); \
DO_TWIST_1L_0H(Chi_10,1,0,F,PERM, UChi_10,UChi_11,UChi_12); \
DO_TWIST_1L_0H(Chi_11,1,1,F,PERM, U_00,U_01,U_10); \
DO_TWIST_1L_0H(Chi_12,1,2,F,PERM, U_11,U_20,U_21); \
} \
} \
}
#define LOAD_CHI_GPARITY(DIR,F,PERM) LOAD_CHI_GPARITY_INPLACE_TWIST(DIR,F,PERM)
#define LOAD_CHIMU_GPARITY(DIR,F,PERM) LOAD_CHIMU_GPARITY_INPLACE_TWIST(DIR,F,PERM)
// To splat or not to splat depends on the implementation
#define MULT_2SPIN_BODY \
Impl::loadLinkElement(U_00,ref()(0,0)); \
Impl::loadLinkElement(U_10,ref()(1,0)); \
Impl::loadLinkElement(U_20,ref()(2,0)); \
Impl::loadLinkElement(U_01,ref()(0,1)); \
Impl::loadLinkElement(U_11,ref()(1,1)); \
Impl::loadLinkElement(U_21,ref()(2,1)); \
UChi_00 = U_00*Chi_00; \
UChi_10 = U_00*Chi_10; \
UChi_01 = U_10*Chi_00; \
UChi_11 = U_10*Chi_10; \
UChi_02 = U_20*Chi_00; \
UChi_12 = U_20*Chi_10; \
UChi_00+= U_01*Chi_01; \
UChi_10+= U_01*Chi_11; \
UChi_01+= U_11*Chi_01; \
UChi_11+= U_11*Chi_11; \
UChi_02+= U_21*Chi_01; \
UChi_12+= U_21*Chi_11; \
Impl::loadLinkElement(U_00,ref()(0,2)); \
Impl::loadLinkElement(U_10,ref()(1,2)); \
Impl::loadLinkElement(U_20,ref()(2,2)); \
UChi_00+= U_00*Chi_02; \
UChi_10+= U_00*Chi_12; \
UChi_01+= U_10*Chi_02; \
UChi_11+= U_10*Chi_12; \
UChi_02+= U_20*Chi_02; \
UChi_12+= U_20*Chi_12
#define MULT_2SPIN(A,F) \
{auto & ref(U._odata[sU](A)); MULT_2SPIN_BODY; }
#define MULT_2SPIN_GPARITY(A,F) \
{auto & ref(U._odata[sU](F)(A)); MULT_2SPIN_BODY; }
#define PERMUTE_DIR(dir) \
permute##dir(Chi_00,Chi_00);\
permute##dir(Chi_01,Chi_01);\
permute##dir(Chi_02,Chi_02);\
permute##dir(Chi_10,Chi_10);\
permute##dir(Chi_11,Chi_11);\
permute##dir(Chi_12,Chi_12);
// hspin(0)=fspin(0)+timesI(fspin(3));
// hspin(1)=fspin(1)+timesI(fspin(2));
#define XP_PROJ \
Chi_00 = Chimu_00+timesI(Chimu_30);\
Chi_01 = Chimu_01+timesI(Chimu_31);\
Chi_02 = Chimu_02+timesI(Chimu_32);\
Chi_10 = Chimu_10+timesI(Chimu_20);\
Chi_11 = Chimu_11+timesI(Chimu_21);\
Chi_12 = Chimu_12+timesI(Chimu_22);
#define YP_PROJ \
Chi_00 = Chimu_00-Chimu_30;\
Chi_01 = Chimu_01-Chimu_31;\
Chi_02 = Chimu_02-Chimu_32;\
Chi_10 = Chimu_10+Chimu_20;\
Chi_11 = Chimu_11+Chimu_21;\
Chi_12 = Chimu_12+Chimu_22;
#define ZP_PROJ \
Chi_00 = Chimu_00+timesI(Chimu_20); \
Chi_01 = Chimu_01+timesI(Chimu_21); \
Chi_02 = Chimu_02+timesI(Chimu_22); \
Chi_10 = Chimu_10-timesI(Chimu_30); \
Chi_11 = Chimu_11-timesI(Chimu_31); \
Chi_12 = Chimu_12-timesI(Chimu_32);
#define TP_PROJ \
Chi_00 = Chimu_00+Chimu_20; \
Chi_01 = Chimu_01+Chimu_21; \
Chi_02 = Chimu_02+Chimu_22; \
Chi_10 = Chimu_10+Chimu_30; \
Chi_11 = Chimu_11+Chimu_31; \
Chi_12 = Chimu_12+Chimu_32;
// hspin(0)=fspin(0)-timesI(fspin(3));
// hspin(1)=fspin(1)-timesI(fspin(2));
#define XM_PROJ \
Chi_00 = Chimu_00-timesI(Chimu_30);\
Chi_01 = Chimu_01-timesI(Chimu_31);\
Chi_02 = Chimu_02-timesI(Chimu_32);\
Chi_10 = Chimu_10-timesI(Chimu_20);\
Chi_11 = Chimu_11-timesI(Chimu_21);\
Chi_12 = Chimu_12-timesI(Chimu_22);
#define YM_PROJ \
Chi_00 = Chimu_00+Chimu_30;\
Chi_01 = Chimu_01+Chimu_31;\
Chi_02 = Chimu_02+Chimu_32;\
Chi_10 = Chimu_10-Chimu_20;\
Chi_11 = Chimu_11-Chimu_21;\
Chi_12 = Chimu_12-Chimu_22;
#define ZM_PROJ \
Chi_00 = Chimu_00-timesI(Chimu_20); \
Chi_01 = Chimu_01-timesI(Chimu_21); \
Chi_02 = Chimu_02-timesI(Chimu_22); \
Chi_10 = Chimu_10+timesI(Chimu_30); \
Chi_11 = Chimu_11+timesI(Chimu_31); \
Chi_12 = Chimu_12+timesI(Chimu_32);
#define TM_PROJ \
Chi_00 = Chimu_00-Chimu_20; \
Chi_01 = Chimu_01-Chimu_21; \
Chi_02 = Chimu_02-Chimu_22; \
Chi_10 = Chimu_10-Chimu_30; \
Chi_11 = Chimu_11-Chimu_31; \
Chi_12 = Chimu_12-Chimu_32;
// fspin(0)=hspin(0);
// fspin(1)=hspin(1);
// fspin(2)=timesMinusI(hspin(1));
// fspin(3)=timesMinusI(hspin(0));
#define XP_RECON\
result_00 = UChi_00;\
result_01 = UChi_01;\
result_02 = UChi_02;\
result_10 = UChi_10;\
result_11 = UChi_11;\
result_12 = UChi_12;\
result_20 = timesMinusI(UChi_10);\
result_21 = timesMinusI(UChi_11);\
result_22 = timesMinusI(UChi_12);\
result_30 = timesMinusI(UChi_00);\
result_31 = timesMinusI(UChi_01);\
result_32 = timesMinusI(UChi_02);
#define XP_RECON_ACCUM\
result_00+=UChi_00;\
result_01+=UChi_01;\
result_02+=UChi_02;\
result_10+=UChi_10;\
result_11+=UChi_11;\
result_12+=UChi_12;\
result_20-=timesI(UChi_10);\
result_21-=timesI(UChi_11);\
result_22-=timesI(UChi_12);\
result_30-=timesI(UChi_00);\
result_31-=timesI(UChi_01);\
result_32-=timesI(UChi_02);
#define XM_RECON\
result_00 = UChi_00;\
result_01 = UChi_01;\
result_02 = UChi_02;\
result_10 = UChi_10;\
result_11 = UChi_11;\
result_12 = UChi_12;\
result_20 = timesI(UChi_10);\
result_21 = timesI(UChi_11);\
result_22 = timesI(UChi_12);\
result_30 = timesI(UChi_00);\
result_31 = timesI(UChi_01);\
result_32 = timesI(UChi_02);
#define XM_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20+= timesI(UChi_10);\
result_21+= timesI(UChi_11);\
result_22+= timesI(UChi_12);\
result_30+= timesI(UChi_00);\
result_31+= timesI(UChi_01);\
result_32+= timesI(UChi_02);
#define YP_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20+= UChi_10;\
result_21+= UChi_11;\
result_22+= UChi_12;\
result_30-= UChi_00;\
result_31-= UChi_01;\
result_32-= UChi_02;
#define YM_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20-= UChi_10;\
result_21-= UChi_11;\
result_22-= UChi_12;\
result_30+= UChi_00;\
result_31+= UChi_01;\
result_32+= UChi_02;
#define ZP_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20-= timesI(UChi_00); \
result_21-= timesI(UChi_01); \
result_22-= timesI(UChi_02); \
result_30+= timesI(UChi_10); \
result_31+= timesI(UChi_11); \
result_32+= timesI(UChi_12);
#define ZM_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20+= timesI(UChi_00); \
result_21+= timesI(UChi_01); \
result_22+= timesI(UChi_02); \
result_30-= timesI(UChi_10); \
result_31-= timesI(UChi_11); \
result_32-= timesI(UChi_12);
#define TP_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20+= UChi_00; \
result_21+= UChi_01; \
result_22+= UChi_02; \
result_30+= UChi_10; \
result_31+= UChi_11; \
result_32+= UChi_12;
#define TM_RECON_ACCUM\
result_00+= UChi_00;\
result_01+= UChi_01;\
result_02+= UChi_02;\
result_10+= UChi_10;\
result_11+= UChi_11;\
result_12+= UChi_12;\
result_20-= UChi_00; \
result_21-= UChi_01; \
result_22-= UChi_02; \
result_30-= UChi_10; \
result_31-= UChi_11; \
result_32-= UChi_12;
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU_IMPL(DIR,F,PERM); \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
} \
} else { \
LOAD_CHI_IMPL(DIR,F,PERM); \
} \
MULT_2SPIN_IMPL(DIR,F); \
RECON;
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU_IMPL(DIR,F,PERM); \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
} \
} else if ( st.same_node[DIR] ) { \
LOAD_CHI_IMPL(DIR,F,PERM); \
} \
if (local || st.same_node[DIR] ) { \
MULT_2SPIN_IMPL(DIR,F); \
RECON; \
}
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
SE=st.GetEntry(ptype,DIR,ss); \
offset = SE->_offset; \
local = SE->_is_local; \
perm = SE->_permute; \
if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \
LOAD_CHI_IMPL(DIR,F,PERM); \
MULT_2SPIN_IMPL(DIR,F); \
RECON; \
nmu++; \
}
#define HAND_RESULT(ss,F) \
{ \
SiteSpinor & ref (out._odata[ss]); \
vstream(ref(F)(0)(0),result_00); \
vstream(ref(F)(0)(1),result_01); \
vstream(ref(F)(0)(2),result_02); \
vstream(ref(F)(1)(0),result_10); \
vstream(ref(F)(1)(1),result_11); \
vstream(ref(F)(1)(2),result_12); \
vstream(ref(F)(2)(0),result_20); \
vstream(ref(F)(2)(1),result_21); \
vstream(ref(F)(2)(2),result_22); \
vstream(ref(F)(3)(0),result_30); \
vstream(ref(F)(3)(1),result_31); \
vstream(ref(F)(3)(2),result_32); \
}
#define HAND_RESULT_EXT(ss,F) \
if (nmu){ \
SiteSpinor & ref (out._odata[ss]); \
ref(F)(0)(0)+=result_00; \
ref(F)(0)(1)+=result_01; \
ref(F)(0)(2)+=result_02; \
ref(F)(1)(0)+=result_10; \
ref(F)(1)(1)+=result_11; \
ref(F)(1)(2)+=result_12; \
ref(F)(2)(0)+=result_20; \
ref(F)(2)(1)+=result_21; \
ref(F)(2)(2)+=result_22; \
ref(F)(3)(0)+=result_30; \
ref(F)(3)(1)+=result_31; \
ref(F)(3)(2)+=result_32; \
}
#define HAND_DECLARATIONS(a) \
Simd result_00; \
Simd result_01; \
Simd result_02; \
Simd result_10; \
Simd result_11; \
Simd result_12; \
Simd result_20; \
Simd result_21; \
Simd result_22; \
Simd result_30; \
Simd result_31; \
Simd result_32; \
Simd Chi_00; \
Simd Chi_01; \
Simd Chi_02; \
Simd Chi_10; \
Simd Chi_11; \
Simd Chi_12; \
Simd UChi_00; \
Simd UChi_01; \
Simd UChi_02; \
Simd UChi_10; \
Simd UChi_11; \
Simd UChi_12; \
Simd U_00; \
Simd U_10; \
Simd U_20; \
Simd U_01; \
Simd U_11; \
Simd U_21;
#define ZERO_RESULT \
result_00=zero; \
result_01=zero; \
result_02=zero; \
result_10=zero; \
result_11=zero; \
result_12=zero; \
result_20=zero; \
result_21=zero; \
result_22=zero; \
result_30=zero; \
result_31=zero; \
result_32=zero;
#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12
#define Chimu_20 UChi_00
#define Chimu_21 UChi_01
#define Chimu_22 UChi_02
#define Chimu_30 UChi_10
#define Chimu_31 UChi_11
#define Chimu_32 UChi_12
namespace Grid {
namespace QCD {
template<class Impl> void
WilsonKernels<Impl>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
HAND_DECLARATIONS(ignore);
int offset,local,perm, ptype;
StencilEntry *SE;
#define HAND_DOP_SITE(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT(ss,F)
HAND_DOP_SITE(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
}
template<class Impl>
void WilsonKernels<Impl>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
HAND_DECLARATIONS(ignore);
StencilEntry *SE;
int offset,local,perm, ptype;
#define HAND_DOP_SITE_DAG(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT(ss,F)
HAND_DOP_SITE_DAG(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
}
template<class Impl> void
WilsonKernels<Impl>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
HAND_DECLARATIONS(ignore);
int offset,local,perm, ptype;
StencilEntry *SE;
#define HAND_DOP_SITE_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
ZERO_RESULT; \
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT(ss,F)
HAND_DOP_SITE_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
}
template<class Impl>
void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
HAND_DECLARATIONS(ignore);
StencilEntry *SE;
int offset,local,perm, ptype;
#define HAND_DOP_SITE_DAG_INT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
ZERO_RESULT; \
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_INT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT(ss,F)
HAND_DOP_SITE_DAG_INT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
}
template<class Impl> void
WilsonKernels<Impl>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
HAND_DECLARATIONS(ignore);
int offset,local,perm, ptype;
StencilEntry *SE;
int nmu=0;
#define HAND_DOP_SITE_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
ZERO_RESULT; \
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xp,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Yp,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zp,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tp,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xm,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Ym,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zm,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tm,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT_EXT(ss,F)
HAND_DOP_SITE_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
}
template<class Impl>
void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionField &in, FermionField &out)
{
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
HAND_DECLARATIONS(ignore);
StencilEntry *SE;
int offset,local,perm, ptype;
int nmu=0;
#define HAND_DOP_SITE_DAG_EXT(F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL) \
ZERO_RESULT; \
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(YP_PROJ,2,Yp,YP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(ZP_PROJ,1,Zp,ZP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(TP_PROJ,0,Tp,TP_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(XM_PROJ,3,Xm,XM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(YM_PROJ,2,Ym,YM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(ZM_PROJ,1,Zm,ZM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_STENCIL_LEG_EXT(TM_PROJ,0,Tm,TM_RECON_ACCUM,F,LOAD_CHI_IMPL,LOAD_CHIMU_IMPL,MULT_2SPIN_IMPL); \
HAND_RESULT_EXT(ss,F)
HAND_DOP_SITE_DAG_EXT(, LOAD_CHI,LOAD_CHIMU,MULT_2SPIN);
}
#define HAND_SPECIALISE_GPARITY(IMPL) \
template<> void \
WilsonKernels<IMPL>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
StencilEntry *SE; \
HAND_DOP_SITE(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> \
void WilsonKernels<IMPL>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
StencilEntry *SE; \
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
HAND_DOP_SITE_DAG(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> void \
WilsonKernels<IMPL>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
StencilEntry *SE; \
HAND_DOP_SITE_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> \
void WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
StencilEntry *SE; \
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
HAND_DOP_SITE_DAG_INT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
\
template<> void \
WilsonKernels<IMPL>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
StencilEntry *SE; \
int nmu=0; \
HAND_DOP_SITE_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
nmu = 0; \
HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \
template<> \
void WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out) \
{ \
typedef IMPL Impl; \
typedef typename Simd::scalar_type S; \
typedef typename Simd::vector_type V; \
\
HAND_DECLARATIONS(ignore); \
\
StencilEntry *SE; \
int offset,local,perm, ptype, g, direction, distance, sl, inplace_twist; \
int nmu=0; \
HAND_DOP_SITE_DAG_EXT(0, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
nmu = 0; \
HAND_DOP_SITE_DAG_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
}
HAND_SPECIALISE_GPARITY(GparityWilsonImplF);
HAND_SPECIALISE_GPARITY(GparityWilsonImplD);
HAND_SPECIALISE_GPARITY(GparityWilsonImplFH);
HAND_SPECIALISE_GPARITY(GparityWilsonImplDF);
////////////// Wilson ; uses this implementation /////////////////////
#define INSTANTIATE_THEM(A) \
template void WilsonKernels<A>::HandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
int ss,int sU,const FermionField &in, FermionField &out); \
template void WilsonKernels<A>::HandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out);\
template void WilsonKernels<A>::HandDhopSiteInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
int ss,int sU,const FermionField &in, FermionField &out); \
template void WilsonKernels<A>::HandDhopSiteDagInt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out); \
template void WilsonKernels<A>::HandDhopSiteExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\
int ss,int sU,const FermionField &in, FermionField &out); \
template void WilsonKernels<A>::HandDhopSiteDagExt(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionField &in, FermionField &out);
INSTANTIATE_THEM(GparityWilsonImplF);
INSTANTIATE_THEM(GparityWilsonImplD);
INSTANTIATE_THEM(GparityWilsonImplFH);
INSTANTIATE_THEM(GparityWilsonImplDF);
}}

View File

@ -1,155 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonTMFermion5D.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk> ; NB Christoph did similar in GPT
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/qcd/action/fermion/FermionCore.h>
#include <Grid/qcd/action/fermion/WilsonFermion.h>
namespace Grid {
namespace QCD {
template<class Impl>
class WilsonTMFermion5D : public WilsonFermion5D<Impl>
{
public:
INHERIT_IMPL_TYPES(Impl);
public:
virtual void Instantiatable(void) {};
// Constructors
WilsonTMFermion5D(GaugeField &_Umu,
GridCartesian &Fgrid,
GridRedBlackCartesian &Frbgrid,
GridCartesian &Ugrid,
GridRedBlackCartesian &Urbgrid,
const std::vector<RealD> _mass,
const std::vector<RealD> _mu,
const ImplParams &p= ImplParams()
) :
WilsonFermion5D<Impl>(_Umu,
Fgrid,
Frbgrid,
Ugrid,
Urbgrid,
4.0,p)
{
update(_mass,_mu);
}
virtual void Meooe(const FermionField &in, FermionField &out) {
if (in.checkerboard == Odd) {
this->DhopEO(in, out, DaggerNo);
} else {
this->DhopOE(in, out, DaggerNo);
}
}
virtual void MeooeDag(const FermionField &in, FermionField &out) {
if (in.checkerboard == Odd) {
this->DhopEO(in, out, DaggerYes);
} else {
this->DhopOE(in, out, DaggerYes);
}
}
// allow override for twisted mass and clover
virtual void Mooee(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
//axpibg5x(out,in,a,b); // out = a*in + b*i*G5*in
for (int s=0;s<(int)this->mass.size();s++) {
ComplexD a = 4.0+this->mass[s];
ComplexD b(0.0,this->mu[s]);
axpbg5y_ssp(out,a,in,b,in,s,s);
}
}
virtual void MooeeDag(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
for (int s=0;s<(int)this->mass.size();s++) {
ComplexD a = 4.0+this->mass[s];
ComplexD b(0.0,-this->mu[s]);
axpbg5y_ssp(out,a,in,b,in,s,s);
}
}
virtual void MooeeInv(const FermionField &in, FermionField &out) {
for (int s=0;s<(int)this->mass.size();s++) {
RealD m = this->mass[s];
RealD tm = this->mu[s];
RealD mtil = 4.0+this->mass[s];
RealD sq = mtil*mtil+tm*tm;
ComplexD a = mtil/sq;
ComplexD b(0.0, -tm /sq);
axpbg5y_ssp(out,a,in,b,in,s,s);
}
}
virtual void MooeeInvDag(const FermionField &in, FermionField &out) {
for (int s=0;s<(int)this->mass.size();s++) {
RealD m = this->mass[s];
RealD tm = this->mu[s];
RealD mtil = 4.0+this->mass[s];
RealD sq = mtil*mtil+tm*tm;
ComplexD a = mtil/sq;
ComplexD b(0.0,tm /sq);
axpbg5y_ssp(out,a,in,b,in,s,s);
}
}
virtual RealD M(const FermionField &in, FermionField &out) {
out.checkerboard = in.checkerboard;
this->Dhop(in, out, DaggerNo);
FermionField tmp(out._grid);
for (int s=0;s<(int)this->mass.size();s++) {
ComplexD a = 4.0+this->mass[s];
ComplexD b(0.0,this->mu[s]);
axpbg5y_ssp(tmp,a,in,b,in,s,s);
}
return axpy_norm(out, 1.0, tmp, out);
}
// needed for fast PV
void update(const std::vector<RealD>& _mass, const std::vector<RealD>& _mu) {
assert(_mass.size() == _mu.size());
assert(_mass.size() == this->FermionGrid()->_fdimensions[0]);
this->mass = _mass;
this->mu = _mu;
}
private:
std::vector<RealD> mu;
std::vector<RealD> mass;
};
typedef WilsonTMFermion5D<WilsonImplF> WilsonTMFermion5DF;
typedef WilsonTMFermion5D<WilsonImplD> WilsonTMFermion5DD;
}}

View File

@ -1,70 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/gauge/Gauge_aggregate.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_GAUGE_H
#define GRID_QCD_GAUGE_H
#include <Grid/qcd/action/gauge/GaugeImplementations.h>
#include <Grid/qcd/utils/WilsonLoops.h>
#include <Grid/qcd/action/gauge/WilsonGaugeAction.h>
#include <Grid/qcd/action/gauge/PlaqPlusRectangleAction.h>
namespace Grid {
namespace QCD {
typedef WilsonGaugeAction<PeriodicGimplR> WilsonGaugeActionR;
typedef WilsonGaugeAction<PeriodicGimplF> WilsonGaugeActionF;
typedef WilsonGaugeAction<PeriodicGimplD> WilsonGaugeActionD;
typedef PlaqPlusRectangleAction<PeriodicGimplR> PlaqPlusRectangleActionR;
typedef PlaqPlusRectangleAction<PeriodicGimplF> PlaqPlusRectangleActionF;
typedef PlaqPlusRectangleAction<PeriodicGimplD> PlaqPlusRectangleActionD;
typedef IwasakiGaugeAction<PeriodicGimplR> IwasakiGaugeActionR;
typedef IwasakiGaugeAction<PeriodicGimplF> IwasakiGaugeActionF;
typedef IwasakiGaugeAction<PeriodicGimplD> IwasakiGaugeActionD;
typedef SymanzikGaugeAction<PeriodicGimplR> SymanzikGaugeActionR;
typedef SymanzikGaugeAction<PeriodicGimplF> SymanzikGaugeActionF;
typedef SymanzikGaugeAction<PeriodicGimplD> SymanzikGaugeActionD;
typedef WilsonGaugeAction<ConjugateGimplR> ConjugateWilsonGaugeActionR;
typedef WilsonGaugeAction<ConjugateGimplF> ConjugateWilsonGaugeActionF;
typedef WilsonGaugeAction<ConjugateGimplD> ConjugateWilsonGaugeActionD;
typedef PlaqPlusRectangleAction<ConjugateGimplR> ConjugatePlaqPlusRectangleActionR;
typedef PlaqPlusRectangleAction<ConjugateGimplF> ConjugatePlaqPlusRectangleActionF;
typedef PlaqPlusRectangleAction<ConjugateGimplD> ConjugatePlaqPlusRectangleActionD;
typedef IwasakiGaugeAction<ConjugateGimplR> ConjugateIwasakiGaugeActionR;
typedef IwasakiGaugeAction<ConjugateGimplF> ConjugateIwasakiGaugeActionF;
typedef IwasakiGaugeAction<ConjugateGimplD> ConjugateIwasakiGaugeActionD;
typedef SymanzikGaugeAction<ConjugateGimplR> ConjugateSymanzikGaugeActionR;
typedef SymanzikGaugeAction<ConjugateGimplF> ConjugateSymanzikGaugeActionF;
typedef SymanzikGaugeAction<ConjugateGimplD> ConjugateSymanzikGaugeActionD;
}}
#endif

View File

@ -1,153 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/gauge/GaugeImpl.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_GAUGE_IMPL_TYPES_H
#define GRID_GAUGE_IMPL_TYPES_H
namespace Grid {
namespace QCD {
////////////////////////////////////////////////////////////////////////
// Implementation dependent gauge types
////////////////////////////////////////////////////////////////////////
#define INHERIT_GIMPL_TYPES(GImpl) \
typedef typename GImpl::Simd Simd; \
typedef typename GImpl::LinkField GaugeLinkField; \
typedef typename GImpl::Field GaugeField; \
typedef typename GImpl::ComplexField ComplexField;\
typedef typename GImpl::SiteField SiteGaugeField; \
typedef typename GImpl::SiteComplex SiteComplex; \
typedef typename GImpl::SiteLink SiteGaugeLink;
#define INHERIT_FIELD_TYPES(Impl) \
typedef typename Impl::Simd Simd; \
typedef typename Impl::ComplexField ComplexField; \
typedef typename Impl::SiteField SiteField; \
typedef typename Impl::Field Field;
// hardcodes the exponential approximation in the template
template <class S, int Nrepresentation = Nc, int Nexp = 12 > class GaugeImplTypes {
public:
typedef S Simd;
template <typename vtype> using iImplScalar = iScalar<iScalar<iScalar<vtype> > >;
template <typename vtype> using iImplGaugeLink = iScalar<iScalar<iMatrix<vtype, Nrepresentation> > >;
template <typename vtype> using iImplGaugeField = iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nd>;
typedef iImplScalar<Simd> SiteComplex;
typedef iImplGaugeLink<Simd> SiteLink;
typedef iImplGaugeField<Simd> SiteField;
typedef Lattice<SiteComplex> ComplexField;
typedef Lattice<SiteLink> LinkField;
typedef Lattice<SiteField> Field;
// Guido: we can probably separate the types from the HMC functions
// this will create 2 kind of implementations
// probably confusing the users
// Now keeping only one class
// Move this elsewhere? FIXME
static inline void AddLink(Field &U, LinkField &W,
int mu) { // U[mu] += W
PARALLEL_FOR_LOOP
for (auto ss = 0; ss < U._grid->oSites(); ss++) {
U._odata[ss]._internal[mu] =
U._odata[ss]._internal[mu] + W._odata[ss]._internal;
}
}
///////////////////////////////////////////////////////////
// Move these to another class
// HMC auxiliary functions
static inline void generate_momenta(Field &P, GridParallelRNG &pRNG) {
// specific for SU gauge fields
LinkField Pmu(P._grid);
Pmu = zero;
for (int mu = 0; mu < Nd; mu++) {
SU<Nrepresentation>::GaussianFundamentalLieAlgebraMatrix(pRNG, Pmu);
PokeIndex<LorentzIndex>(P, Pmu, mu);
}
}
static inline Field projectForce(Field &P) { return Ta(P); }
static inline void update_field(Field& P, Field& U, double ep){
//static std::chrono::duration<double> diff;
//auto start = std::chrono::high_resolution_clock::now();
parallel_for(int ss=0;ss<P._grid->oSites();ss++){
for (int mu = 0; mu < Nd; mu++)
U[ss]._internal[mu] = ProjectOnGroup(Exponentiate(P[ss]._internal[mu], ep, Nexp) * U[ss]._internal[mu]);
}
//auto end = std::chrono::high_resolution_clock::now();
// diff += end - start;
// std::cout << "Time to exponentiate matrix " << diff.count() << " s\n";
}
static inline RealD FieldSquareNorm(Field& U){
LatticeComplex Hloc(U._grid);
Hloc = zero;
for (int mu = 0; mu < Nd; mu++) {
auto Umu = PeekIndex<LorentzIndex>(U, mu);
Hloc += trace(Umu * Umu);
}
Complex Hsum = sum(Hloc);
return Hsum.real();
}
static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
SU<Nc>::HotConfiguration(pRNG, U);
}
static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) {
SU<Nc>::TepidConfiguration(pRNG, U);
}
static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
SU<Nc>::ColdConfiguration(pRNG, U);
}
};
typedef GaugeImplTypes<vComplex, Nc> GimplTypesR;
typedef GaugeImplTypes<vComplexF, Nc> GimplTypesF;
typedef GaugeImplTypes<vComplexD, Nc> GimplTypesD;
typedef GaugeImplTypes<vComplex, SU<Nc>::AdjointDimension> GimplAdjointTypesR;
typedef GaugeImplTypes<vComplexF, SU<Nc>::AdjointDimension> GimplAdjointTypesF;
typedef GaugeImplTypes<vComplexD, SU<Nc>::AdjointDimension> GimplAdjointTypesD;
} // QCD
} // Grid
#endif // GRID_GAUGE_IMPL_TYPES_H

View File

@ -1,417 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/gauge/Photon.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef QCD_PHOTON_ACTION_H
#define QCD_PHOTON_ACTION_H
namespace Grid{
namespace QCD{
template <class S>
class QedGimpl
{
public:
typedef S Simd;
template <typename vtype>
using iImplGaugeLink = iScalar<iScalar<iScalar<vtype>>>;
template <typename vtype>
using iImplGaugeField = iVector<iScalar<iScalar<vtype>>, Nd>;
typedef iImplGaugeLink<Simd> SiteLink;
typedef iImplGaugeField<Simd> SiteField;
typedef SiteField SiteComplex;
typedef Lattice<SiteLink> LinkField;
typedef Lattice<SiteField> Field;
typedef Field ComplexField;
};
typedef QedGimpl<vComplex> QedGimplR;
template<class Gimpl>
class Photon
{
public:
INHERIT_GIMPL_TYPES(Gimpl);
GRID_SERIALIZABLE_ENUM(Gauge, undef, feynman, 1, coulomb, 2, landau, 3);
GRID_SERIALIZABLE_ENUM(ZmScheme, undef, qedL, 1, qedTL, 2, qedInf, 3);
public:
Photon(Gauge gauge, ZmScheme zmScheme);
Photon(Gauge gauge, ZmScheme zmScheme, std::vector<Real> improvements);
Photon(Gauge gauge, ZmScheme zmScheme, Real G0);
Photon(Gauge gauge, ZmScheme zmScheme, std::vector<Real> improvements, Real G0);
virtual ~Photon(void) = default;
void FreePropagator(const GaugeField &in, GaugeField &out);
void MomentumSpacePropagator(const GaugeField &in, GaugeField &out);
void StochasticWeight(GaugeLinkField &weight);
void StochasticField(GaugeField &out, GridParallelRNG &rng);
void StochasticField(GaugeField &out, GridParallelRNG &rng,
const GaugeLinkField &weight);
void UnitField(GaugeField &out);
private:
void infVolPropagator(GaugeLinkField &out);
void invKHatSquared(GaugeLinkField &out);
void zmSub(GaugeLinkField &out);
private:
Gauge gauge_;
ZmScheme zmScheme_;
std::vector<Real> improvement_;
Real G0_;
};
typedef Photon<QedGimplR> PhotonR;
template<class Gimpl>
Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme)
: gauge_(gauge), zmScheme_(zmScheme), improvement_(std::vector<Real>()),
G0_(0.15493339023106021408483720810737508876916113364521)
{}
template<class Gimpl>
Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme,
std::vector<Real> improvements)
: gauge_(gauge), zmScheme_(zmScheme), improvement_(improvements),
G0_(0.15493339023106021408483720810737508876916113364521)
{}
template<class Gimpl>
Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme, Real G0)
: gauge_(gauge), zmScheme_(zmScheme), improvement_(std::vector<Real>()), G0_(G0)
{}
template<class Gimpl>
Photon<Gimpl>::Photon(Gauge gauge, ZmScheme zmScheme,
std::vector<Real> improvements, Real G0)
: gauge_(gauge), zmScheme_(zmScheme), improvement_(improvements), G0_(G0)
{}
template<class Gimpl>
void Photon<Gimpl>::FreePropagator (const GaugeField &in,GaugeField &out)
{
FFT theFFT(in._grid);
GaugeField in_k(in._grid);
GaugeField prop_k(in._grid);
theFFT.FFT_all_dim(in_k,in,FFT::forward);
MomentumSpacePropagator(prop_k,in_k);
theFFT.FFT_all_dim(out,prop_k,FFT::backward);
}
template<class Gimpl>
void Photon<Gimpl>::infVolPropagator(GaugeLinkField &out)
{
auto *grid = dynamic_cast<GridCartesian *>(out._grid);
LatticeReal xmu(grid);
GaugeLinkField one(grid);
const unsigned int nd = grid->_ndimension;
std::vector<int> &l = grid->_fdimensions;
std::vector<int> x0(nd,0);
TComplex Tone = Complex(1.0,0.0);
TComplex Tzero = Complex(G0_,0.0);
FFT fft(grid);
one = Complex(1.0,0.0);
out = zero;
for(int mu = 0; mu < nd; mu++)
{
LatticeCoordinate(xmu,mu);
Real lo2 = l[mu]/2.0;
xmu = where(xmu < lo2, xmu, xmu-double(l[mu]));
out = out + toComplex(4*M_PI*M_PI*xmu*xmu);
}
pokeSite(Tone, out, x0);
out = one/out;
pokeSite(Tzero, out, x0);
fft.FFT_all_dim(out, out, FFT::forward);
}
template<class Gimpl>
void Photon<Gimpl>::invKHatSquared(GaugeLinkField &out)
{
GridBase *grid = out._grid;
GaugeLinkField kmu(grid), one(grid);
const unsigned int nd = grid->_ndimension;
std::vector<int> &l = grid->_fdimensions;
std::vector<int> zm(nd,0);
TComplex Tone = Complex(1.0,0.0);
TComplex Tzero= Complex(0.0,0.0);
one = Complex(1.0,0.0);
out = zero;
for(int mu = 0; mu < nd; mu++)
{
Real twoPiL = M_PI*2./l[mu];
LatticeCoordinate(kmu,mu);
kmu = 2.*sin(.5*twoPiL*kmu);
out = out + kmu*kmu;
}
pokeSite(Tone, out, zm);
out = one/out;
pokeSite(Tzero, out, zm);
}
template<class Gimpl>
void Photon<Gimpl>::zmSub(GaugeLinkField &out)
{
GridBase *grid = out._grid;
const unsigned int nd = grid->_ndimension;
std::vector<int> &l = grid->_fdimensions;
switch (zmScheme_)
{
case ZmScheme::qedTL:
{
std::vector<int> zm(nd,0);
TComplex Tzero = Complex(0.0,0.0);
pokeSite(Tzero, out, zm);
break;
}
case ZmScheme::qedL:
{
LatticeInteger spNrm(grid), coor(grid);
GaugeLinkField z(grid);
spNrm = zero;
for(int d = 0; d < grid->_ndimension - 1; d++)
{
LatticeCoordinate(coor,d);
coor = where(coor < Integer(l[d]/2), coor, coor-Integer(l[d]));
spNrm = spNrm + coor*coor;
}
out = where(spNrm == Integer(0), 0.*out, out);
// IR improvement
for(int i = 0; i < improvement_.size(); i++)
{
Real f = sqrt(improvement_[i]+1);
out = where(spNrm == Integer(i+1), f*out, out);
}
}
default:
break;
}
}
template<class Gimpl>
void Photon<Gimpl>::MomentumSpacePropagator(const GaugeField &in,
GaugeField &out)
{
GridBase *grid = out._grid;
LatticeComplex momProp(grid);
switch (zmScheme_)
{
case ZmScheme::qedTL:
case ZmScheme::qedL:
{
invKHatSquared(momProp);
zmSub(momProp);
break;
}
case ZmScheme::qedInf:
{
infVolPropagator(momProp);
break;
}
default:
break;
}
out = in*momProp;
}
template<class Gimpl>
void Photon<Gimpl>::StochasticWeight(GaugeLinkField &weight)
{
auto *grid = dynamic_cast<GridCartesian *>(weight._grid);
const unsigned int nd = grid->_ndimension;
std::vector<int> latt_size = grid->_fdimensions;
switch (zmScheme_)
{
case ZmScheme::qedTL:
case ZmScheme::qedL:
{
Integer vol = 1;
for(int d = 0; d < nd; d++)
{
vol = vol * latt_size[d];
}
invKHatSquared(weight);
weight = sqrt(vol)*sqrt(weight);
zmSub(weight);
break;
}
case ZmScheme::qedInf:
{
infVolPropagator(weight);
weight = sqrt(real(weight));
break;
}
default:
break;
}
}
template<class Gimpl>
void Photon<Gimpl>::StochasticField(GaugeField &out, GridParallelRNG &rng)
{
auto *grid = dynamic_cast<GridCartesian *>(out._grid);
GaugeLinkField weight(grid);
StochasticWeight(weight);
StochasticField(out, rng, weight);
}
template<class Gimpl>
void Photon<Gimpl>::StochasticField(GaugeField &out, GridParallelRNG &rng,
const GaugeLinkField &weight)
{
auto *grid = dynamic_cast<GridCartesian *>(out._grid);
const unsigned int nd = grid->_ndimension;
GaugeLinkField r(grid);
GaugeField aTilde(grid);
FFT fft(grid);
switch (zmScheme_)
{
case ZmScheme::qedTL:
case ZmScheme::qedL:
{
for(int mu = 0; mu < nd; mu++)
{
gaussian(rng, r);
r = weight*r;
pokeLorentz(aTilde, r, mu);
}
break;
}
case ZmScheme::qedInf:
{
Complex shift(1., 1.); // This needs to be a GaugeLink element?
for(int mu = 0; mu < nd; mu++)
{
bernoulli(rng, r);
r = weight*(2.*r - shift);
pokeLorentz(aTilde, r, mu);
}
break;
}
default:
break;
}
fft.FFT_all_dim(out, aTilde, FFT::backward);
out = real(out);
}
template<class Gimpl>
void Photon<Gimpl>::UnitField(GaugeField &out)
{
auto *grid = dynamic_cast<GridCartesian *>(out._grid);
const unsigned int nd = grid->_ndimension;
GaugeLinkField r(grid);
r = Complex(1.0,0.0);
for(int mu = 0; mu < nd; mu++)
{
pokeLorentz(out, r, mu);
}
out = real(out);
}
// template<class Gimpl>
// void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_L(GaugeField &out,
// const GaugeField &in)
// {
//
// FeynmanGaugeMomentumSpacePropagator_TL(out,in);
//
// GridBase *grid = out._grid;
// LatticeInteger coor(grid);
// GaugeField zz(grid); zz=zero;
//
// // xyzt
// for(int d = 0; d < grid->_ndimension-1;d++){
// LatticeCoordinate(coor,d);
// out = where(coor==Integer(0),zz,out);
// }
// }
//
// template<class Gimpl>
// void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_TL(GaugeField &out,
// const GaugeField &in)
// {
//
// // what type LatticeComplex
// GridBase *grid = out._grid;
// int nd = grid->_ndimension;
//
// typedef typename GaugeField::vector_type vector_type;
// typedef typename GaugeField::scalar_type ScalComplex;
// typedef Lattice<iSinglet<vector_type> > LatComplex;
//
// std::vector<int> latt_size = grid->_fdimensions;
//
// LatComplex denom(grid); denom= zero;
// LatComplex one(grid); one = ScalComplex(1.0,0.0);
// LatComplex kmu(grid);
//
// ScalComplex ci(0.0,1.0);
// // momphase = n * 2pi / L
// for(int mu=0;mu<Nd;mu++) {
//
// LatticeCoordinate(kmu,mu);
//
// RealD TwoPiL = M_PI * 2.0/ latt_size[mu];
//
// kmu = TwoPiL * kmu ;
//
// denom = denom + 4.0*sin(kmu*0.5)*sin(kmu*0.5); // Wilson term
// }
// std::vector<int> zero_mode(nd,0);
// TComplexD Tone = ComplexD(1.0,0.0);
// TComplexD Tzero= ComplexD(0.0,0.0);
//
// pokeSite(Tone,denom,zero_mode);
//
// denom= one/denom;
//
// pokeSite(Tzero,denom,zero_mode);
//
// out = zero;
// out = in*denom;
// };
}}
#endif

View File

@ -1,95 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/gauge/WilsonGaugeAction.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef QCD_WILSON_GAUGE_ACTION_H
#define QCD_WILSON_GAUGE_ACTION_H
namespace Grid {
namespace QCD {
////////////////////////////////////////////////////////////////////////
// Wilson Gauge Action .. should I template the Nc etc..
////////////////////////////////////////////////////////////////////////
template <class Gimpl>
class WilsonGaugeAction : public Action<typename Gimpl::GaugeField> {
public:
INHERIT_GIMPL_TYPES(Gimpl);
/////////////////////////// constructors
explicit WilsonGaugeAction(RealD beta_):beta(beta_){};
virtual std::string action_name() {return "WilsonGaugeAction";}
virtual std::string LogParameters(){
std::stringstream sstream;
sstream << GridLogMessage << "[WilsonGaugeAction] Beta: " << beta << std::endl;
return sstream.str();
}
virtual void refresh(const GaugeField &U,
GridParallelRNG &pRNG){}; // noop as no pseudoferms
virtual RealD S(const GaugeField &U) {
RealD plaq = WilsonLoops<Gimpl>::avgPlaquette(U);
RealD vol = U._grid->gSites();
RealD action = beta * (1.0 - plaq) * (Nd * (Nd - 1.0)) * vol * 0.5;
return action;
};
virtual void deriv(const GaugeField &U, GaugeField &dSdU) {
// not optimal implementation FIXME
// extend Ta to include Lorentz indexes
RealD factor = 0.5 * beta / RealD(Nc);
GaugeLinkField Umu(U._grid);
GaugeLinkField dSdU_mu(U._grid);
for (int mu = 0; mu < Nd; mu++) {
Umu = PeekIndex<LorentzIndex>(U, mu);
// Staple in direction mu
WilsonLoops<Gimpl>::Staple(dSdU_mu, U, mu);
dSdU_mu = Ta(Umu * dSdU_mu) * factor;
PokeIndex<LorentzIndex>(dSdU, dSdU_mu, mu);
}
}
private:
RealD beta;
};
}
}
#endif

View File

@ -1,145 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef QCD_EVEN_ODD_SCHUR_DIFFERENTIABLE_H
#define QCD_EVEN_ODD_SCHUR_DIFFERENTIABLE_H
namespace Grid{
namespace QCD{
// Base even odd HMC on the normal Mee based schur decomposition.
//
// M = (Mee Meo) = (1 0 ) (Mee 0 ) (1 Mee^{-1} Meo)
// (Moe Moo) (Moe Mee^-1 1 ) (0 Moo-Moe Mee^-1 Meo) (0 1 )
//
// Determinant is det of middle factor
// This assumes Mee is indept of U.
//
template<class Impl>
class SchurDifferentiableOperator : public SchurDiagMooeeOperator<FermionOperator<Impl>,typename Impl::FermionField>
{
public:
INHERIT_IMPL_TYPES(Impl);
typedef FermionOperator<Impl> Matrix;
SchurDifferentiableOperator (Matrix &Mat) : SchurDiagMooeeOperator<Matrix,FermionField>(Mat) {};
void MpcDeriv(GaugeField &Force,const FermionField &U,const FermionField &V) {
GridBase *fgrid = this->_Mat.FermionGrid();
GridBase *fcbgrid = this->_Mat.FermionRedBlackGrid();
FermionField tmp1(fcbgrid);
FermionField tmp2(fcbgrid);
conformable(fcbgrid,U._grid);
conformable(fcbgrid,V._grid);
// Assert the checkerboard?? or code for either
assert(U.checkerboard==Odd);
assert(V.checkerboard==U.checkerboard);
// NOTE Guido: WE DO NOT WANT TO USE THE ucbgrid GRID FOR THE FORCE
// it is not conformable with the HMC force field
// Case: Ls vectorised fields
// INHERIT FROM THE Force field instead
GridRedBlackCartesian* forcecb = new GridRedBlackCartesian(Force._grid);
GaugeField ForceO(forcecb);
GaugeField ForceE(forcecb);
// X^dag Der_oe MeeInv Meo Y
// Use Mooee as nontrivial but gauge field indept
this->_Mat.Meooe (V,tmp1); // odd->even -- implicit -0.5 factor to be applied
this->_Mat.MooeeInv(tmp1,tmp2); // even->even
this->_Mat.MoeDeriv(ForceO,U,tmp2,DaggerNo);
// Accumulate X^dag M_oe MeeInv Der_eo Y
this->_Mat.MeooeDag (U,tmp1); // even->odd -- implicit -0.5 factor to be applied
this->_Mat.MooeeInvDag(tmp1,tmp2); // even->even
this->_Mat.MeoDeriv(ForceE,tmp2,V,DaggerNo);
assert(ForceE.checkerboard==Even);
assert(ForceO.checkerboard==Odd);
setCheckerboard(Force,ForceE);
setCheckerboard(Force,ForceO);
Force=-Force;
delete forcecb;
}
void MpcDagDeriv(GaugeField &Force,const FermionField &U,const FermionField &V) {
GridBase *fgrid = this->_Mat.FermionGrid();
GridBase *fcbgrid = this->_Mat.FermionRedBlackGrid();
FermionField tmp1(fcbgrid);
FermionField tmp2(fcbgrid);
conformable(fcbgrid,U._grid);
conformable(fcbgrid,V._grid);
// Assert the checkerboard?? or code for either
assert(V.checkerboard==Odd);
assert(V.checkerboard==V.checkerboard);
// NOTE Guido: WE DO NOT WANT TO USE THE ucbgrid GRID FOR THE FORCE
// it is not conformable with the HMC force field
// INHERIT FROM THE Force field instead
GridRedBlackCartesian* forcecb = new GridRedBlackCartesian(Force._grid);
GaugeField ForceO(forcecb);
GaugeField ForceE(forcecb);
// X^dag Der_oe MeeInv Meo Y
// Use Mooee as nontrivial but gauge field indept
this->_Mat.MeooeDag (V,tmp1); // odd->even -- implicit -0.5 factor to be applied
this->_Mat.MooeeInvDag(tmp1,tmp2); // even->even
this->_Mat.MoeDeriv(ForceO,U,tmp2,DaggerYes);
// Accumulate X^dag M_oe MeeInv Der_eo Y
this->_Mat.Meooe (U,tmp1); // even->odd -- implicit -0.5 factor to be applied
this->_Mat.MooeeInv(tmp1,tmp2); // even->even
this->_Mat.MeoDeriv(ForceE,tmp2,V,DaggerYes);
assert(ForceE.checkerboard==Even);
assert(ForceO.checkerboard==Odd);
setCheckerboard(Force,ForceE);
setCheckerboard(Force,ForceO);
Force=-Force;
delete forcecb;
}
};
}
}
#endif

View File

@ -1,264 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/pseudofermion/ExactOneFlavourRatio.h
Copyright (C) 2017
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: David Murphy <dmurphy@phys.columbia.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
/////////////////////////////////////////////////////////////////
// Implementation of exact one flavour algorithm (EOFA) //
// using fermion classes defined in: //
// Grid/qcd/action/fermion/DomainWallEOFAFermion.h (Shamir) //
// Grid/qcd/action/fermion/MobiusEOFAFermion.h (Mobius) //
// arXiv: 1403.1683, 1706.05843 //
/////////////////////////////////////////////////////////////////
#ifndef QCD_PSEUDOFERMION_EXACT_ONE_FLAVOUR_RATIO_H
#define QCD_PSEUDOFERMION_EXACT_ONE_FLAVOUR_RATIO_H
namespace Grid{
namespace QCD{
///////////////////////////////////////////////////////////////
// Exact one flavour implementation of DWF determinant ratio //
///////////////////////////////////////////////////////////////
template<class Impl>
class ExactOneFlavourRatioPseudoFermionAction : public Action<typename Impl::GaugeField>
{
public:
INHERIT_IMPL_TYPES(Impl);
typedef OneFlavourRationalParams Params;
Params param;
MultiShiftFunction PowerNegHalf;
private:
bool use_heatbath_forecasting;
AbstractEOFAFermion<Impl>& Lop; // the basic LH operator
AbstractEOFAFermion<Impl>& Rop; // the basic RH operator
SchurRedBlackDiagMooeeSolve<FermionField> Solver;
FermionField Phi; // the pseudofermion field for this trajectory
public:
ExactOneFlavourRatioPseudoFermionAction(AbstractEOFAFermion<Impl>& _Lop, AbstractEOFAFermion<Impl>& _Rop,
OperatorFunction<FermionField>& S, Params& p, bool use_fc=false) : Lop(_Lop), Rop(_Rop), Solver(S),
Phi(_Lop.FermionGrid()), param(p), use_heatbath_forecasting(use_fc)
{
AlgRemez remez(param.lo, param.hi, param.precision);
// MdagM^(+- 1/2)
std::cout << GridLogMessage << "Generating degree " << param.degree << " for x^(-1/2)" << std::endl;
remez.generateApprox(param.degree, 1, 2);
PowerNegHalf.Init(remez, param.tolerance, true);
};
virtual std::string action_name() { return "ExactOneFlavourRatioPseudoFermionAction"; }
virtual std::string LogParameters() {
std::stringstream sstream;
sstream << GridLogMessage << "[" << action_name() << "] Low :" << param.lo << std::endl;
sstream << GridLogMessage << "[" << action_name() << "] High :" << param.hi << std::endl;
sstream << GridLogMessage << "[" << action_name() << "] Max iterations :" << param.MaxIter << std::endl;
sstream << GridLogMessage << "[" << action_name() << "] Tolerance :" << param.tolerance << std::endl;
sstream << GridLogMessage << "[" << action_name() << "] Degree :" << param.degree << std::endl;
sstream << GridLogMessage << "[" << action_name() << "] Precision :" << param.precision << std::endl;
return sstream.str();
}
// Spin projection
void spProj(const FermionField& in, FermionField& out, int sign, int Ls)
{
if(sign == 1){ for(int s=0; s<Ls; ++s){ axpby_ssp_pplus(out, 0.0, in, 1.0, in, s, s); } }
else{ for(int s=0; s<Ls; ++s){ axpby_ssp_pminus(out, 0.0, in, 1.0, in, s, s); } }
}
// EOFA heatbath: see Eqn. (29) of arXiv:1706.05843
// We generate a Gaussian noise vector \eta, and then compute
// \Phi = M_{\rm EOFA}^{-1/2} * \eta
// using a rational approximation to the inverse square root
virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG)
{
Lop.ImportGauge(U);
Rop.ImportGauge(U);
FermionField eta (Lop.FermionGrid());
FermionField CG_src (Lop.FermionGrid());
FermionField CG_soln (Lop.FermionGrid());
FermionField Forecast_src(Lop.FermionGrid());
std::vector<FermionField> tmp(2, Lop.FermionGrid());
// Use chronological inverter to forecast solutions across poles
std::vector<FermionField> prev_solns;
if(use_heatbath_forecasting){ prev_solns.reserve(param.degree); }
ChronoForecast<AbstractEOFAFermion<Impl>, FermionField> Forecast;
// Seed with Gaussian noise vector (var = 0.5)
RealD scale = std::sqrt(0.5);
gaussian(pRNG,eta);
eta = eta * scale;
printf("Heatbath source vector: <\\eta|\\eta> = %1.15e\n", norm2(eta));
// \Phi = ( \alpha_{0} + \sum_{k=1}^{N_{p}} \alpha_{l} * \gamma_{l} ) * \eta
RealD N(PowerNegHalf.norm);
for(int k=0; k<param.degree; ++k){ N += PowerNegHalf.residues[k] / ( 1.0 + PowerNegHalf.poles[k] ); }
Phi = eta * N;
// LH terms:
// \Phi = \Phi + k \sum_{k=1}^{N_{p}} P_{-} \Omega_{-}^{\dagger} ( H(mf)
// - \gamma_{l} \Delta_{-}(mf,mb) P_{-} )^{-1} \Omega_{-} P_{-} \eta
RealD gamma_l(0.0);
spProj(eta, tmp[0], -1, Lop.Ls);
Lop.Omega(tmp[0], tmp[1], -1, 0);
G5R5(CG_src, tmp[1]);
tmp[1] = zero;
for(int k=0; k<param.degree; ++k){
gamma_l = 1.0 / ( 1.0 + PowerNegHalf.poles[k] );
Lop.RefreshShiftCoefficients(-gamma_l);
if(use_heatbath_forecasting){ // Forecast CG guess using solutions from previous poles
Lop.Mdag(CG_src, Forecast_src);
CG_soln = Forecast(Lop, Forecast_src, prev_solns);
Solver(Lop, CG_src, CG_soln);
prev_solns.push_back(CG_soln);
} else {
CG_soln = zero; // Just use zero as the initial guess
Solver(Lop, CG_src, CG_soln);
}
Lop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
tmp[1] = tmp[1] + ( PowerNegHalf.residues[k]*gamma_l*gamma_l*Lop.k ) * tmp[0];
}
Lop.Omega(tmp[1], tmp[0], -1, 1);
spProj(tmp[0], tmp[1], -1, Lop.Ls);
Phi = Phi + tmp[1];
// RH terms:
// \Phi = \Phi - k \sum_{k=1}^{N_{p}} P_{+} \Omega_{+}^{\dagger} ( H(mb)
// + \gamma_{l} \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} \eta
spProj(eta, tmp[0], 1, Rop.Ls);
Rop.Omega(tmp[0], tmp[1], 1, 0);
G5R5(CG_src, tmp[1]);
tmp[1] = zero;
if(use_heatbath_forecasting){ prev_solns.clear(); } // empirically, LH solns don't help for RH solves
for(int k=0; k<param.degree; ++k){
gamma_l = 1.0 / ( 1.0 + PowerNegHalf.poles[k] );
Rop.RefreshShiftCoefficients(-gamma_l*PowerNegHalf.poles[k]);
if(use_heatbath_forecasting){
Rop.Mdag(CG_src, Forecast_src);
CG_soln = Forecast(Rop, Forecast_src, prev_solns);
Solver(Rop, CG_src, CG_soln);
prev_solns.push_back(CG_soln);
} else {
CG_soln = zero;
Solver(Rop, CG_src, CG_soln);
}
Rop.Dtilde(CG_soln, tmp[0]); // We actually solved Cayley preconditioned system: transform back
tmp[1] = tmp[1] - ( PowerNegHalf.residues[k]*gamma_l*gamma_l*Rop.k ) * tmp[0];
}
Rop.Omega(tmp[1], tmp[0], 1, 1);
spProj(tmp[0], tmp[1], 1, Rop.Ls);
Phi = Phi + tmp[1];
// Reset shift coefficients for energy and force evals
Lop.RefreshShiftCoefficients(0.0);
Rop.RefreshShiftCoefficients(-1.0);
};
// EOFA action: see Eqn. (10) of arXiv:1706.05843
virtual RealD S(const GaugeField& U)
{
Lop.ImportGauge(U);
Rop.ImportGauge(U);
FermionField spProj_Phi(Lop.FermionGrid());
std::vector<FermionField> tmp(2, Lop.FermionGrid());
// S = <\Phi|\Phi>
RealD action(norm2(Phi));
// LH term: S = S - k <\Phi| P_{-} \Omega_{-}^{\dagger} H(mf)^{-1} \Omega_{-} P_{-} |\Phi>
spProj(Phi, spProj_Phi, -1, Lop.Ls);
Lop.Omega(spProj_Phi, tmp[0], -1, 0);
G5R5(tmp[1], tmp[0]);
tmp[0] = zero;
Solver(Lop, tmp[1], tmp[0]);
Lop.Dtilde(tmp[0], tmp[1]); // We actually solved Cayley preconditioned system: transform back
Lop.Omega(tmp[1], tmp[0], -1, 1);
action -= Lop.k * innerProduct(spProj_Phi, tmp[0]).real();
// RH term: S = S + k <\Phi| P_{+} \Omega_{+}^{\dagger} ( H(mb)
// - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{-} P_{-} |\Phi>
spProj(Phi, spProj_Phi, 1, Rop.Ls);
Rop.Omega(spProj_Phi, tmp[0], 1, 0);
G5R5(tmp[1], tmp[0]);
tmp[0] = zero;
Solver(Rop, tmp[1], tmp[0]);
Rop.Dtilde(tmp[0], tmp[1]);
Rop.Omega(tmp[1], tmp[0], 1, 1);
action += Rop.k * innerProduct(spProj_Phi, tmp[0]).real();
return action;
};
// EOFA pseudofermion force: see Eqns. (34)-(36) of arXiv:1706.05843
virtual void deriv(const GaugeField& U, GaugeField& dSdU)
{
Lop.ImportGauge(U);
Rop.ImportGauge(U);
FermionField spProj_Phi (Lop.FermionGrid());
FermionField Omega_spProj_Phi(Lop.FermionGrid());
FermionField CG_src (Lop.FermionGrid());
FermionField Chi (Lop.FermionGrid());
FermionField g5_R5_Chi (Lop.FermionGrid());
GaugeField force(Lop.GaugeGrid());
// LH: dSdU = k \chi_{L}^{\dagger} \gamma_{5} R_{5} ( \partial_{x,\mu} D_{w} ) \chi_{L}
// \chi_{L} = H(mf)^{-1} \Omega_{-} P_{-} \Phi
spProj(Phi, spProj_Phi, -1, Lop.Ls);
Lop.Omega(spProj_Phi, Omega_spProj_Phi, -1, 0);
G5R5(CG_src, Omega_spProj_Phi);
spProj_Phi = zero;
Solver(Lop, CG_src, spProj_Phi);
Lop.Dtilde(spProj_Phi, Chi);
G5R5(g5_R5_Chi, Chi);
Lop.MDeriv(force, g5_R5_Chi, Chi, DaggerNo);
dSdU = Lop.k * force;
// RH: dSdU = dSdU - k \chi_{R}^{\dagger} \gamma_{5} R_{5} ( \partial_{x,\mu} D_{w} ) \chi_{}
// \chi_{R} = ( H(mb) - \Delta_{+}(mf,mb) P_{+} )^{-1} \Omega_{+} P_{+} \Phi
spProj(Phi, spProj_Phi, 1, Rop.Ls);
Rop.Omega(spProj_Phi, Omega_spProj_Phi, 1, 0);
G5R5(CG_src, Omega_spProj_Phi);
spProj_Phi = zero;
Solver(Rop, CG_src, spProj_Phi);
Rop.Dtilde(spProj_Phi, Chi);
G5R5(g5_R5_Chi, Chi);
Lop.MDeriv(force, g5_R5_Chi, Chi, DaggerNo);
dSdU = dSdU - Rop.k * force;
};
};
}}
#endif

View File

@ -1,43 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/pseudofermion/PseudoFermion_aggregate.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef QCD_PSEUDOFERMION_AGGREGATE_H
#define QCD_PSEUDOFERMION_AGGREGATE_H
#include <Grid/qcd/action/pseudofermion/EvenOddSchurDifferentiable.h>
#include <Grid/qcd/action/pseudofermion/TwoFlavour.h>
#include <Grid/qcd/action/pseudofermion/TwoFlavourRatio.h>
#include <Grid/qcd/action/pseudofermion/TwoFlavourEvenOdd.h>
#include <Grid/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h>
#include <Grid/qcd/action/pseudofermion/OneFlavourRational.h>
#include <Grid/qcd/action/pseudofermion/OneFlavourRationalRatio.h>
#include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRational.h>
#include <Grid/qcd/action/pseudofermion/OneFlavourEvenOddRationalRatio.h>
#include <Grid/qcd/action/pseudofermion/ExactOneFlavourRatio.h>
#endif

View File

@ -1,209 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/pseudofermion/TwoFlavourEvenOddRatio.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef QCD_PSEUDOFERMION_TWO_FLAVOUR_EVEN_ODD_RATIO_H
#define QCD_PSEUDOFERMION_TWO_FLAVOUR_EVEN_ODD_RATIO_H
namespace Grid{
namespace QCD{
///////////////////////////////////////
// Two flavour ratio
///////////////////////////////////////
template<class Impl>
class TwoFlavourEvenOddRatioPseudoFermionAction : public Action<typename Impl::GaugeField> {
public:
INHERIT_IMPL_TYPES(Impl);
private:
FermionOperator<Impl> & NumOp;// the basic operator
FermionOperator<Impl> & DenOp;// the basic operator
OperatorFunction<FermionField> &DerivativeSolver;
OperatorFunction<FermionField> &ActionSolver;
FermionField PhiOdd; // the pseudo fermion field for this trajectory
FermionField PhiEven; // the pseudo fermion field for this trajectory
public:
TwoFlavourEvenOddRatioPseudoFermionAction(FermionOperator<Impl> &_NumOp,
FermionOperator<Impl> &_DenOp,
OperatorFunction<FermionField> & DS,
OperatorFunction<FermionField> & AS) :
NumOp(_NumOp),
DenOp(_DenOp),
DerivativeSolver(DS),
ActionSolver(AS),
PhiEven(_NumOp.FermionRedBlackGrid()),
PhiOdd(_NumOp.FermionRedBlackGrid())
{
conformable(_NumOp.FermionGrid(), _DenOp.FermionGrid());
conformable(_NumOp.FermionRedBlackGrid(), _DenOp.FermionRedBlackGrid());
conformable(_NumOp.GaugeGrid(), _DenOp.GaugeGrid());
conformable(_NumOp.GaugeRedBlackGrid(), _DenOp.GaugeRedBlackGrid());
};
virtual std::string action_name(){return "TwoFlavourEvenOddRatioPseudoFermionAction";}
virtual std::string LogParameters(){
std::stringstream sstream;
sstream << GridLogMessage << "["<<action_name()<<"] has no parameters" << std::endl;
return sstream.str();
}
virtual void refresh(const GaugeField &U, GridParallelRNG& pRNG) {
// P(phi) = e^{- phi^dag Vpc (MpcdagMpc)^-1 Vpcdag phi}
//
// NumOp == V
// DenOp == M
//
// Take phi_o = Vpcdag^{-1} Mpcdag eta_o ; eta_o = Mpcdag^{-1} Vpcdag Phi
//
// P(eta_o) = e^{- eta_o^dag eta_o}
//
// e^{x^2/2 sig^2} => sig^2 = 0.5.
//
RealD scale = std::sqrt(0.5);
FermionField eta (NumOp.FermionGrid());
FermionField etaOdd (NumOp.FermionRedBlackGrid());
FermionField etaEven(NumOp.FermionRedBlackGrid());
FermionField tmp (NumOp.FermionRedBlackGrid());
gaussian(pRNG,eta);
pickCheckerboard(Even,etaEven,eta);
pickCheckerboard(Odd,etaOdd,eta);
NumOp.ImportGauge(U);
DenOp.ImportGauge(U);
SchurDifferentiableOperator<Impl> Mpc(DenOp);
SchurDifferentiableOperator<Impl> Vpc(NumOp);
// Odd det factors
Mpc.MpcDag(etaOdd,PhiOdd);
tmp=zero;
ActionSolver(Vpc,PhiOdd,tmp);
Vpc.Mpc(tmp,PhiOdd);
// Even det factors
DenOp.MooeeDag(etaEven,tmp);
NumOp.MooeeInvDag(tmp,PhiEven);
PhiOdd =PhiOdd*scale;
PhiEven=PhiEven*scale;
};
//////////////////////////////////////////////////////
// S = phi^dag V (Mdag M)^-1 Vdag phi
//////////////////////////////////////////////////////
virtual RealD S(const GaugeField &U) {
NumOp.ImportGauge(U);
DenOp.ImportGauge(U);
SchurDifferentiableOperator<Impl> Mpc(DenOp);
SchurDifferentiableOperator<Impl> Vpc(NumOp);
FermionField X(NumOp.FermionRedBlackGrid());
FermionField Y(NumOp.FermionRedBlackGrid());
Vpc.MpcDag(PhiOdd,Y); // Y= Vdag phi
X=zero;
ActionSolver(Mpc,Y,X); // X= (MdagM)^-1 Vdag phi
//Mpc.Mpc(X,Y); // Y= Mdag^-1 Vdag phi
// Multiply by Ydag
RealD action = real(innerProduct(Y,X));
//RealD action = norm2(Y);
// The EE factorised block; normally can replace with zero if det is constant (gauge field indept)
// Only really clover term that creates this. Leave the EE portion as a future to do to make most
// rapid progresss on DWF for now.
//
NumOp.MooeeDag(PhiEven,X);
DenOp.MooeeInvDag(X,Y);
action = action + norm2(Y);
return action;
};
//////////////////////////////////////////////////////
// dS/du = phi^dag dV (Mdag M)^-1 V^dag phi
// - phi^dag V (Mdag M)^-1 [ Mdag dM + dMdag M ] (Mdag M)^-1 V^dag phi
// + phi^dag V (Mdag M)^-1 dV^dag phi
//////////////////////////////////////////////////////
virtual void deriv(const GaugeField &U,GaugeField & dSdU) {
NumOp.ImportGauge(U);
DenOp.ImportGauge(U);
SchurDifferentiableOperator<Impl> Mpc(DenOp);
SchurDifferentiableOperator<Impl> Vpc(NumOp);
FermionField X(NumOp.FermionRedBlackGrid());
FermionField Y(NumOp.FermionRedBlackGrid());
// This assignment is necessary to be compliant with the HMC grids
GaugeField force(dSdU._grid);
//Y=Vdag phi
//X = (Mdag M)^-1 V^dag phi
//Y = (Mdag)^-1 V^dag phi
Vpc.MpcDag(PhiOdd,Y); // Y= Vdag phi
X=zero;
DerivativeSolver(Mpc,Y,X); // X= (MdagM)^-1 Vdag phi
Mpc.Mpc(X,Y); // Y= Mdag^-1 Vdag phi
// phi^dag V (Mdag M)^-1 dV^dag phi
Vpc.MpcDagDeriv(force , X, PhiOdd ); dSdU = force;
// phi^dag dV (Mdag M)^-1 V^dag phi
Vpc.MpcDeriv(force , PhiOdd, X ); dSdU = dSdU+force;
// - phi^dag V (Mdag M)^-1 Mdag dM (Mdag M)^-1 V^dag phi
// - phi^dag V (Mdag M)^-1 dMdag M (Mdag M)^-1 V^dag phi
Mpc.MpcDeriv(force,Y,X); dSdU = dSdU-force;
Mpc.MpcDagDeriv(force,X,Y); dSdU = dSdU-force;
// FIXME No force contribution from EvenEven assumed here
// Needs a fix for clover.
assert(NumOp.ConstEE() == 1);
assert(DenOp.ConstEE() == 1);
dSdU = -dSdU;
};
};
}
}
#endif

View File

@ -1,50 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/gauge/Scalar.h
Copyright (C) 2017
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_SCALAR_H
#define GRID_QCD_SCALAR_H
#include <Grid/qcd/action/scalar/ScalarImpl.h>
#include <Grid/qcd/action/scalar/ScalarAction.h>
#include <Grid/qcd/action/scalar/ScalarInteractionAction.h>
namespace Grid {
namespace QCD {
typedef ScalarAction<ScalarImplR> ScalarActionR;
typedef ScalarAction<ScalarImplF> ScalarActionF;
typedef ScalarAction<ScalarImplD> ScalarActionD;
template <int Colours, int Dimensions> using ScalarAdjActionR = ScalarInteractionAction<ScalarNxNAdjImplR<Colours>, Dimensions>;
template <int Colours, int Dimensions> using ScalarAdjActionF = ScalarInteractionAction<ScalarNxNAdjImplF<Colours>, Dimensions>;
template <int Colours, int Dimensions> using ScalarAdjActionD = ScalarInteractionAction<ScalarNxNAdjImplD<Colours>, Dimensions>;
}
}
#endif // GRID_QCD_SCALAR_H

View File

@ -1,83 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/gauge/WilsonGaugeAction.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef SCALAR_ACTION_H
#define SCALAR_ACTION_H
namespace Grid {
// FIXME drop the QCD namespace everywhere here
template <class Impl>
class ScalarAction : public QCD::Action<typename Impl::Field> {
public:
INHERIT_FIELD_TYPES(Impl);
private:
RealD mass_square;
RealD lambda;
public:
ScalarAction(RealD ms, RealD l) : mass_square(ms), lambda(l) {}
virtual std::string LogParameters() {
std::stringstream sstream;
sstream << GridLogMessage << "[ScalarAction] lambda : " << lambda << std::endl;
sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
return sstream.str();
}
virtual std::string action_name() {return "ScalarAction";}
virtual void refresh(const Field &U, GridParallelRNG &pRNG) {} // noop as no pseudoferms
virtual RealD S(const Field &p) {
return (mass_square * 0.5 + QCD::Nd) * ScalarObs<Impl>::sumphisquared(p) +
(lambda / 24.) * ScalarObs<Impl>::sumphifourth(p) +
ScalarObs<Impl>::sumphider(p);
};
virtual void deriv(const Field &p,
Field &force) {
Field tmp(p._grid);
Field p2(p._grid);
ScalarObs<Impl>::phisquared(p2, p);
tmp = -(Cshift(p, 0, -1) + Cshift(p, 0, 1));
for (int mu = 1; mu < QCD::Nd; mu++) tmp -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
force =+(mass_square + 2. * QCD::Nd) * p + (lambda / 6.) * p2 * p + tmp;
}
};
} // namespace Grid
#endif // SCALAR_ACTION_H

View File

@ -1,263 +0,0 @@
#ifndef SCALAR_IMPL
#define SCALAR_IMPL
namespace Grid {
//namespace QCD {
template <class S>
class ScalarImplTypes {
public:
typedef S Simd;
template <typename vtype>
using iImplField = iScalar<iScalar<iScalar<vtype> > >;
typedef iImplField<Simd> SiteField;
typedef SiteField SitePropagator;
typedef SiteField SiteComplex;
typedef Lattice<SiteField> Field;
typedef Field ComplexField;
typedef Field FermionField;
typedef Field PropagatorField;
static inline void generate_momenta(Field& P, GridParallelRNG& pRNG){
gaussian(pRNG, P);
}
static inline Field projectForce(Field& P){return P;}
static inline void update_field(Field& P, Field& U, double ep) {
U += P*ep;
}
static inline RealD FieldSquareNorm(Field& U) {
return (- sum(trace(U*U))/2.0);
}
static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
gaussian(pRNG, U);
}
static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) {
gaussian(pRNG, U);
}
static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
U = 1.0;
}
static void MomentumSpacePropagator(Field &out, RealD m)
{
GridBase *grid = out._grid;
Field kmu(grid), one(grid);
const unsigned int nd = grid->_ndimension;
std::vector<int> &l = grid->_fdimensions;
one = Complex(1.0,0.0);
out = m*m;
for(int mu = 0; mu < nd; mu++)
{
Real twoPiL = M_PI*2./l[mu];
LatticeCoordinate(kmu,mu);
kmu = 2.*sin(.5*twoPiL*kmu);
out = out + kmu*kmu;
}
out = one/out;
}
static void FreePropagator(const Field &in, Field &out,
const Field &momKernel)
{
FFT fft((GridCartesian *)in._grid);
Field inFT(in._grid);
fft.FFT_all_dim(inFT, in, FFT::forward);
inFT = inFT*momKernel;
fft.FFT_all_dim(out, inFT, FFT::backward);
}
static void FreePropagator(const Field &in, Field &out, RealD m)
{
Field momKernel(in._grid);
MomentumSpacePropagator(momKernel, m);
FreePropagator(in, out, momKernel);
}
};
#ifdef USE_FFT_ACCELERATION
#ifndef FFT_MASS
#error "USE_FFT_ACCELERATION is defined but not FFT_MASS"
#endif
#endif
template <class S, unsigned int N>
class ScalarAdjMatrixImplTypes {
public:
typedef S Simd;
typedef QCD::SU<N> Group;
template <typename vtype>
using iImplField = iScalar<iScalar<iMatrix<vtype, N>>>;
template <typename vtype>
using iImplComplex = iScalar<iScalar<iScalar<vtype>>>;
typedef iImplField<Simd> SiteField;
typedef SiteField SitePropagator;
typedef iImplComplex<Simd> SiteComplex;
typedef Lattice<SiteField> Field;
typedef Lattice<SiteComplex> ComplexField;
typedef Field FermionField;
typedef Field PropagatorField;
static void MomentaSquare(ComplexField &out)
{
GridBase *grid = out._grid;
const std::vector<int> &l = grid->FullDimensions();
ComplexField kmu(grid);
for (int mu = 0; mu < grid->Nd(); mu++)
{
Real twoPiL = M_PI * 2.0 / l[mu];
LatticeCoordinate(kmu, mu);
kmu = 2.0 * sin(0.5 * twoPiL * kmu);
out += kmu * kmu;
}
}
static void MomentumSpacePropagator(ComplexField &out, RealD m)
{
GridBase *grid = out._grid;
ComplexField one(grid);
one = Complex(1.0, 0.0);
out = m * m;
MomentaSquare(out);
out = one / out;
}
static inline void generate_momenta(Field &P, GridParallelRNG &pRNG)
{
#ifndef USE_FFT_ACCELERATION
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
#else
Field Pgaussian(P._grid), Pp(P._grid);
ComplexField p2(P._grid); p2 = zero;
RealD M = FFT_MASS;
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, Pgaussian);
FFT theFFT((GridCartesian*)P._grid);
theFFT.FFT_all_dim(Pp, Pgaussian, FFT::forward);
MomentaSquare(p2);
p2 += M * M;
p2 = sqrt(p2);
Pp *= p2;
theFFT.FFT_all_dim(P, Pp, FFT::backward);
#endif //USE_FFT_ACCELERATION
}
static inline Field projectForce(Field& P) {return P;}
static inline void update_field(Field &P, Field &U, double ep)
{
#ifndef USE_FFT_ACCELERATION
double t0=usecond();
U += P * ep;
double t1=usecond();
double total_time = (t1-t0)/1e6;
std::cout << GridLogIntegrator << "Total time for updating field (s) : " << total_time << std::endl;
#else
// FFT transform P(x) -> P(p)
// divide by (M^2+p^2) M external parameter (how to pass?)
// P'(p) = P(p)/(M^2+p^2)
// Transform back -> P'(x)
// U += P'(x)*ep
Field Pp(U._grid), P_FFT(U._grid);
static ComplexField p2(U._grid);
RealD M = FFT_MASS;
FFT theFFT((GridCartesian*)U._grid);
theFFT.FFT_all_dim(Pp, P, FFT::forward);
static bool first_call = true;
if (first_call)
{
// avoid recomputing
MomentumSpacePropagator(p2, M);
first_call = false;
}
Pp *= p2;
theFFT.FFT_all_dim(P_FFT, Pp, FFT::backward);
U += P_FFT * ep;
#endif //USE_FFT_ACCELERATION
}
static inline RealD FieldSquareNorm(Field &U)
{
#ifndef USE_FFT_ACCELERATION
return (TensorRemove(sum(trace(U * U))).real());
#else
// In case of Fourier acceleration we have to:
// compute U(p)*U(p)/(M^2+p^2)) Parseval theorem
// 1 FFT needed U(x) -> U(p)
// M to be passed
FFT theFFT((GridCartesian*)U._grid);
Field Up(U._grid);
theFFT.FFT_all_dim(Up, U, FFT::forward);
RealD M = FFT_MASS;
ComplexField p2(U._grid);
MomentumSpacePropagator(p2, M);
Field Up2 = Up * p2;
// from the definition of the DFT we need to divide by the volume
return (-TensorRemove(sum(trace(adj(Up) * Up2))).real() / U._grid->gSites());
#endif //USE_FFT_ACCELERATION
}
static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U);
}
static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) {
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U, 0.01);
}
static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {
U = zero;
}
};
typedef ScalarImplTypes<vReal> ScalarImplR;
typedef ScalarImplTypes<vRealF> ScalarImplF;
typedef ScalarImplTypes<vRealD> ScalarImplD;
typedef ScalarImplTypes<vComplex> ScalarImplCR;
typedef ScalarImplTypes<vComplexF> ScalarImplCF;
typedef ScalarImplTypes<vComplexD> ScalarImplCD;
// Hardcoding here the size of the matrices
typedef ScalarAdjMatrixImplTypes<vComplex, QCD::Nc> ScalarAdjImplR;
typedef ScalarAdjMatrixImplTypes<vComplexF, QCD::Nc> ScalarAdjImplF;
typedef ScalarAdjMatrixImplTypes<vComplexD, QCD::Nc> ScalarAdjImplD;
template <int Colours > using ScalarNxNAdjImplR = ScalarAdjMatrixImplTypes<vComplex, Colours >;
template <int Colours > using ScalarNxNAdjImplF = ScalarAdjMatrixImplTypes<vComplexF, Colours >;
template <int Colours > using ScalarNxNAdjImplD = ScalarAdjMatrixImplTypes<vComplexD, Colours >;
//}
}
#endif

View File

@ -1,208 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/gauge/WilsonGaugeAction.h
Copyright (C) 2015
Author: Guido Cossu <guido,cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef SCALAR_INT_ACTION_H
#define SCALAR_INT_ACTION_H
// Note: this action can completely absorb the ScalarAction for real float fields
// use the scalarObjs to generalise the structure
namespace Grid
{
// FIXME drop the QCD namespace everywhere here
template <class Impl, int Ndim>
class ScalarInteractionAction : public QCD::Action<typename Impl::Field>
{
public:
INHERIT_FIELD_TYPES(Impl);
private:
RealD mass_square;
RealD lambda;
RealD g;
const unsigned int N = Impl::Group::Dimension;
typedef typename Field::vector_object vobj;
typedef CartesianStencil<vobj, vobj> Stencil;
SimpleCompressor<vobj> compressor;
int npoint = 2 * Ndim;
std::vector<int> directions; //
std::vector<int> displacements; //
public:
ScalarInteractionAction(RealD ms, RealD l, RealD gval) : mass_square(ms), lambda(l), g(gval), displacements(2 * Ndim, 0), directions(2 * Ndim, 0)
{
for (int mu = 0; mu < Ndim; mu++)
{
directions[mu] = mu;
directions[mu + Ndim] = mu;
displacements[mu] = 1;
displacements[mu + Ndim] = -1;
}
}
virtual std::string LogParameters()
{
std::stringstream sstream;
sstream << GridLogMessage << "[ScalarAction] lambda : " << lambda << std::endl;
sstream << GridLogMessage << "[ScalarAction] mass_square : " << mass_square << std::endl;
sstream << GridLogMessage << "[ScalarAction] g : " << g << std::endl;
return sstream.str();
}
virtual std::string action_name() { return "ScalarAction"; }
virtual void refresh(const Field &U, GridParallelRNG &pRNG) {}
virtual RealD S(const Field &p)
{
assert(p._grid->Nd() == Ndim);
static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
phiStencil.HaloExchange(p, compressor);
Field action(p._grid), pshift(p._grid), phisquared(p._grid);
phisquared = p * p;
action = (2.0 * Ndim + mass_square) * phisquared - lambda * phisquared * phisquared;
for (int mu = 0; mu < Ndim; mu++)
{
// pshift = Cshift(p, mu, +1); // not efficient, implement with stencils
parallel_for(int i = 0; i < p._grid->oSites(); i++)
{
int permute_type;
StencilEntry *SE;
vobj temp2;
const vobj *temp, *t_p;
SE = phiStencil.GetEntry(permute_type, mu, i);
t_p = &p._odata[i];
if (SE->_is_local)
{
temp = &p._odata[SE->_offset];
if (SE->_permute)
{
permute(temp2, *temp, permute_type);
action._odata[i] -= temp2 * (*t_p) + (*t_p) * temp2;
}
else
{
action._odata[i] -= (*temp) * (*t_p) + (*t_p) * (*temp);
}
}
else
{
action._odata[i] -= phiStencil.CommBuf()[SE->_offset] * (*t_p) + (*t_p) * phiStencil.CommBuf()[SE->_offset];
}
}
// action -= pshift*p + p*pshift;
}
// NB the trace in the algebra is normalised to 1/2
// minus sign coming from the antihermitian fields
return -(TensorRemove(sum(trace(action)))).real() * N / g;
};
virtual void deriv(const Field &p, Field &force)
{
double t0 = usecond();
assert(p._grid->Nd() == Ndim);
force = (2. * Ndim + mass_square) * p - 2. * lambda * p * p * p;
double interm_t = usecond();
// move this outside
static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
phiStencil.HaloExchange(p, compressor);
double halo_t = usecond();
int chunk = 128;
//for (int mu = 0; mu < QCD::Nd; mu++) force -= Cshift(p, mu, -1) + Cshift(p, mu, 1);
// inverting the order of the loops slows down the code(! g++ 7)
// cannot try to reduce the number of force writes by factor npoint...
// use cache blocking
for (int point = 0; point < npoint; point++)
{
#pragma omp parallel
{
int permute_type;
StencilEntry *SE;
const vobj *temp;
#pragma omp for schedule(static, chunk)
for (int i = 0; i < p._grid->oSites(); i++)
{
SE = phiStencil.GetEntry(permute_type, point, i);
// prefetch next p?
if (SE->_is_local)
{
temp = &p._odata[SE->_offset];
if (SE->_permute)
{
vobj temp2;
permute(temp2, *temp, permute_type);
force._odata[i] -= temp2;
}
else
{
force._odata[i] -= *temp; // slow part. Dominated by this read/write (BW)
}
}
else
{
force._odata[i] -= phiStencil.CommBuf()[SE->_offset];
}
}
}
}
force *= N / g;
double t1 = usecond();
double total_time = (t1 - t0) / 1e6;
double interm_time = (interm_t - t0) / 1e6;
double halo_time = (halo_t - interm_t) / 1e6;
double stencil_time = (t1 - halo_t) / 1e6;
std::cout << GridLogIntegrator << "Total time for force computation (s) : " << total_time << std::endl;
std::cout << GridLogIntegrator << "Intermediate time for force computation (s): " << interm_time << std::endl;
std::cout << GridLogIntegrator << "Halo time in force computation (s) : " << halo_time << std::endl;
std::cout << GridLogIntegrator << "Stencil time in force computation (s) : " << stencil_time << std::endl;
double flops = p._grid->gSites() * (14 * N * N * N + 18 * N * N + 2);
double flops_no_stencil = p._grid->gSites() * (14 * N * N * N + 6 * N * N + 2);
double Gflops = flops / (total_time * 1e9);
double Gflops_no_stencil = flops_no_stencil / (interm_time * 1e9);
std::cout << GridLogIntegrator << "Flops: " << flops << " - Gflop/s : " << Gflops << std::endl;
std::cout << GridLogIntegrator << "Flops NS: " << flops_no_stencil << " - Gflop/s NS: " << Gflops_no_stencil << std::endl;
}
};
} // namespace Grid
#endif // SCALAR_INT_ACTION_H

View File

@ -1,219 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/hmc/GenericHmcRunner.h
Copyright (C) 2015
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_GENERIC_HMC_RUNNER
#define GRID_GENERIC_HMC_RUNNER
#include <unordered_map>
namespace Grid {
namespace QCD {
// very ugly here but possibly resolved if we had a base Reader class
template < class ReaderClass >
class HMCRunnerBase {
public:
virtual void Run() = 0;
virtual void initialize(ReaderClass& ) = 0;
};
template <class Implementation,
template <typename, typename, typename> class Integrator,
class RepresentationsPolicy = NoHirep, class ReaderClass = XmlReader>
class HMCWrapperTemplate: public HMCRunnerBase<ReaderClass> {
public:
INHERIT_FIELD_TYPES(Implementation);
typedef Implementation ImplPolicy; // visible from outside
template <typename S = NoSmearing<Implementation> >
using IntegratorType = Integrator<Implementation, S, RepresentationsPolicy>;
HMCparameters Parameters;
std::string ParameterFile;
HMCResourceManager<Implementation> Resources;
// The set of actions (keep here for lower level users, for now)
ActionSet<Field, RepresentationsPolicy> TheAction;
HMCWrapperTemplate() = default;
HMCWrapperTemplate(HMCparameters Par){
Parameters = Par;
}
void initialize(ReaderClass & TheReader){
std::cout << "Initialization of the HMC" << std::endl;
Resources.initialize(TheReader);
// eventually add smearing
Resources.GetActionSet(TheAction);
}
void ReadCommandLine(int argc, char **argv) {
std::string arg;
if (GridCmdOptionExists(argv, argv + argc, "--StartingType")) {
arg = GridCmdOptionPayload(argv, argv + argc, "--StartingType");
if (arg != "HotStart" && arg != "ColdStart" && arg != "TepidStart" &&
arg != "CheckpointStart") {
std::cout << GridLogError << "Unrecognized option in --StartingType\n";
std::cout
<< GridLogError
<< "Valid [HotStart, ColdStart, TepidStart, CheckpointStart]\n";
exit(1);
}
Parameters.StartingType = arg;
}
if (GridCmdOptionExists(argv, argv + argc, "--StartingTrajectory")) {
arg = GridCmdOptionPayload(argv, argv + argc, "--StartingTrajectory");
std::vector<int> ivec(0);
GridCmdOptionIntVector(arg, ivec);
Parameters.StartTrajectory = ivec[0];
}
if (GridCmdOptionExists(argv, argv + argc, "--Trajectories")) {
arg = GridCmdOptionPayload(argv, argv + argc, "--Trajectories");
std::vector<int> ivec(0);
GridCmdOptionIntVector(arg, ivec);
Parameters.Trajectories = ivec[0];
}
if (GridCmdOptionExists(argv, argv + argc, "--Thermalizations")) {
arg = GridCmdOptionPayload(argv, argv + argc, "--Thermalizations");
std::vector<int> ivec(0);
GridCmdOptionIntVector(arg, ivec);
Parameters.NoMetropolisUntil = ivec[0];
}
if (GridCmdOptionExists(argv, argv + argc, "--ParameterFile")) {
arg = GridCmdOptionPayload(argv, argv + argc, "--ParameterFile");
ParameterFile = arg;
}
}
template <class SmearingPolicy>
void Run(SmearingPolicy &S) {
Runner(S);
}
void Run(){
NoSmearing<Implementation> S;
Runner(S);
}
//////////////////////////////////////////////////////////////////
private:
template <class SmearingPolicy>
void Runner(SmearingPolicy &Smearing) {
auto UGrid = Resources.GetCartesian();
Resources.AddRNGs();
Field U(UGrid);
// Can move this outside?
typedef IntegratorType<SmearingPolicy> TheIntegrator;
TheIntegrator MDynamics(UGrid, Parameters.MD, TheAction, Smearing);
if (Parameters.StartingType == "HotStart") {
// Hot start
Resources.SeedFixedIntegers();
Implementation::HotConfiguration(Resources.GetParallelRNG(), U);
} else if (Parameters.StartingType == "ColdStart") {
// Cold start
Resources.SeedFixedIntegers();
Implementation::ColdConfiguration(Resources.GetParallelRNG(), U);
} else if (Parameters.StartingType == "TepidStart") {
// Tepid start
Resources.SeedFixedIntegers();
Implementation::TepidConfiguration(Resources.GetParallelRNG(), U);
} else if (Parameters.StartingType == "CheckpointStart") {
// CheckpointRestart
Resources.GetCheckPointer()->CheckpointRestore(Parameters.StartTrajectory, U,
Resources.GetSerialRNG(),
Resources.GetParallelRNG());
}
Smearing.set_Field(U);
HybridMonteCarlo<TheIntegrator> HMC(Parameters, MDynamics,
Resources.GetSerialRNG(),
Resources.GetParallelRNG(),
Resources.GetObservables(), U);
// Run it
HMC.evolve();
}
};
// These are for gauge fields, default integrator MinimumNorm2
template <template <typename, typename, typename> class Integrator>
using GenericHMCRunner = HMCWrapperTemplate<PeriodicGimplR, Integrator>;
template <template <typename, typename, typename> class Integrator>
using GenericHMCRunnerF = HMCWrapperTemplate<PeriodicGimplF, Integrator>;
template <template <typename, typename, typename> class Integrator>
using GenericHMCRunnerD = HMCWrapperTemplate<PeriodicGimplD, Integrator>;
// These are for gauge fields, default integrator MinimumNorm2
template <template <typename, typename, typename> class Integrator>
using ConjugateHMCRunner = HMCWrapperTemplate<ConjugateGimplR, Integrator>;
template <template <typename, typename, typename> class Integrator>
using ConjugateHMCRunnerF = HMCWrapperTemplate<ConjugateGimplF, Integrator>;
template <template <typename, typename, typename> class Integrator>
using ConjugateHMCRunnerD = HMCWrapperTemplate<ConjugateGimplD, Integrator>;
template <class RepresentationsPolicy,
template <typename, typename, typename> class Integrator>
using GenericHMCRunnerHirep =
HMCWrapperTemplate<PeriodicGimplR, Integrator, RepresentationsPolicy>;
template <class Implementation, class RepresentationsPolicy,
template <typename, typename, typename> class Integrator>
using GenericHMCRunnerTemplate = HMCWrapperTemplate<Implementation, Integrator, RepresentationsPolicy>;
typedef HMCWrapperTemplate<ScalarImplR, MinimumNorm2, ScalarFields>
ScalarGenericHMCRunner;
typedef HMCWrapperTemplate<ScalarAdjImplR, MinimumNorm2, ScalarMatrixFields>
ScalarAdjGenericHMCRunner;
template <int Colours>
using ScalarNxNAdjGenericHMCRunner = HMCWrapperTemplate < ScalarNxNAdjImplR<Colours>, ForceGradient, ScalarNxNMatrixFields<Colours> >;
} // namespace QCD
} // namespace Grid
#endif // GRID_GENERIC_HMC_RUNNER

View File

@ -1,111 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/hmc/GenericHmcRunner.h
Copyright (C) 2015
Copyright (C) 2016
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_HMC_MODULES
#define GRID_HMC_MODULES
#include "HMC_GridModules.h"
namespace Grid {
namespace QCD {
////////////////////////////////////////////////////////////////////
struct RNGModuleParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(RNGModuleParameters,
std::string, serial_seeds,
std::string, parallel_seeds,);
std::vector<int> getSerialSeeds(){return strToVec<int>(serial_seeds);}
std::vector<int> getParallelSeeds(){return strToVec<int>(parallel_seeds);}
RNGModuleParameters(): serial_seeds("1"), parallel_seeds("1"){}
template <class ReaderClass >
RNGModuleParameters(Reader<ReaderClass>& Reader){
read(Reader, "RandomNumberGenerator", *this);
}
};
// Random number generators module
class RNGModule{
GridSerialRNG sRNG_;
std::unique_ptr<GridParallelRNG> pRNG_;
RNGModuleParameters Params_;
public:
RNGModule(){};
void set_pRNG(GridParallelRNG* pRNG){
pRNG_.reset(pRNG);
}
void set_RNGSeeds(RNGModuleParameters& Params) {
Params_ = Params;
}
GridSerialRNG& get_sRNG() { return sRNG_; }
GridParallelRNG& get_pRNG() { return *pRNG_.get(); }
void seed() {
auto SerialSeeds = Params_.getSerialSeeds();
auto ParallelSeeds = Params_.getParallelSeeds();
if (SerialSeeds.size() == 0 && ParallelSeeds.size() == 0) {
std::cout << GridLogError << "Seeds not initialized" << std::endl;
exit(1);
}
sRNG_.SeedFixedIntegers(SerialSeeds);
pRNG_->SeedFixedIntegers(ParallelSeeds);
}
};
/*
///////////////////////////////////////////////////////////////////
/// Smearing module
template <class ImplementationPolicy>
class SmearingModule{
virtual void get_smearing();
};
template <class ImplementationPolicy>
class StoutSmearingModule: public SmearingModule<ImplementationPolicy>{
SmearedConfiguration<ImplementationPolicy> SmearingPolicy;
};
*/
} // namespace QCD
} // namespace Grid
#endif // GRID_HMC_MODULES

View File

@ -1,328 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/hmc/GenericHmcRunner.h
Copyright (C) 2015
Copyright (C) 2016
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef HMC_RESOURCE_MANAGER_H
#define HMC_RESOURCE_MANAGER_H
#include <unordered_map>
// One function per Checkpointer, use a macro to simplify
#define RegisterLoadCheckPointerFunction(NAME) \
void Load##NAME##Checkpointer(const CheckpointerParameters& Params_) { \
if (!have_CheckPointer) { \
std::cout << GridLogDebug << "Loading Checkpointer " << #NAME \
<< std::endl; \
CP = std::unique_ptr<CheckpointerBaseModule>( \
new NAME##CPModule<ImplementationPolicy>(Params_)); \
have_CheckPointer = true; \
} else { \
std::cout << GridLogError << "Checkpointer already loaded " \
<< std::endl; \
exit(1); \
} \
}
#define RegisterLoadCheckPointerMetadataFunction(NAME) \
template < class Metadata > \
void Load##NAME##Checkpointer(const CheckpointerParameters& Params_, const Metadata& M_) { \
if (!have_CheckPointer) { \
std::cout << GridLogDebug << "Loading Metadata Checkpointer " << #NAME \
<< std::endl; \
CP = std::unique_ptr<CheckpointerBaseModule>( \
new NAME##CPModule<ImplementationPolicy, Metadata >(Params_, M_)); \
have_CheckPointer = true; \
} else { \
std::cout << GridLogError << "Checkpointer already loaded " \
<< std::endl; \
exit(1); \
} \
}
namespace Grid {
namespace QCD {
// HMC Resource manager
template <class ImplementationPolicy>
class HMCResourceManager {
typedef HMCModuleBase< QCD::BaseHmcCheckpointer<ImplementationPolicy> > CheckpointerBaseModule;
typedef HMCModuleBase< QCD::HmcObservable<typename ImplementationPolicy::Field> > ObservableBaseModule;
typedef ActionModuleBase< QCD::Action<typename ImplementationPolicy::Field>, GridModule > ActionBaseModule;
// Named storage for grid pairs (std + red-black)
std::unordered_map<std::string, GridModule> Grids;
RNGModule RNGs;
// SmearingModule<ImplementationPolicy> Smearing;
std::unique_ptr<CheckpointerBaseModule> CP;
// A vector of HmcObservable modules
std::vector<std::unique_ptr<ObservableBaseModule> > ObservablesList;
// A vector of HmcObservable modules
std::multimap<int, std::unique_ptr<ActionBaseModule> > ActionsList;
std::vector<int> multipliers;
bool have_RNG;
bool have_CheckPointer;
// NOTE: operator << is not overloaded for std::vector<string>
// so this function is necessary
void output_vector_string(const std::vector<std::string> &vs){
for (auto &i: vs)
std::cout << i << " ";
std::cout << std::endl;
}
public:
HMCResourceManager() : have_RNG(false), have_CheckPointer(false) {}
template <class ReaderClass, class vector_type = vComplex >
void initialize(ReaderClass &Read){
// assumes we are starting from the main node
// Geometry
GridModuleParameters GridPar(Read);
GridFourDimModule<vector_type> GridMod( GridPar) ;
AddGrid("gauge", GridMod);
// Checkpointer
auto &CPfactory = HMC_CPModuleFactory<cp_string, ImplementationPolicy, ReaderClass >::getInstance();
Read.push("Checkpointer");
std::string cp_type;
read(Read,"name", cp_type);
std::cout << "Registered types " << std::endl;
output_vector_string(CPfactory.getBuilderList());
CP = CPfactory.create(cp_type, Read);
CP->print_parameters();
Read.pop();
have_CheckPointer = true;
RNGModuleParameters RNGpar(Read);
SetRNGSeeds(RNGpar);
// Observables
auto &ObsFactory = HMC_ObservablesModuleFactory<observable_string, typename ImplementationPolicy::Field, ReaderClass>::getInstance();
Read.push(observable_string);// here must check if existing...
do {
std::string obs_type;
read(Read,"name", obs_type);
std::cout << "Registered types " << std::endl;
output_vector_string(ObsFactory.getBuilderList() );
ObservablesList.emplace_back(ObsFactory.create(obs_type, Read));
ObservablesList[ObservablesList.size() - 1]->print_parameters();
} while (Read.nextElement(observable_string));
Read.pop();
// Loop on levels
if(!Read.push("Actions")){
std::cout << "Actions not found" << std::endl;
exit(1);
}
if(!Read.push("Level")){// push must check if the node exist
std::cout << "Level not found" << std::endl;
exit(1);
}
do
{
fill_ActionsLevel(Read);
}
while(Read.push("Level"));
Read.pop();
}
template <class RepresentationPolicy>
void GetActionSet(ActionSet<typename ImplementationPolicy::Field, RepresentationPolicy>& Aset){
Aset.resize(multipliers.size());
for(auto it = ActionsList.begin(); it != ActionsList.end(); it++){
(*it).second->acquireResource(Grids["gauge"]);
Aset[(*it).first-1].push_back((*it).second->getPtr());
}
}
//////////////////////////////////////////////////////////////
// Grids
//////////////////////////////////////////////////////////////
void AddGrid(const std::string s, GridModule& M) {
// Check for name clashes
auto search = Grids.find(s);
if (search != Grids.end()) {
std::cout << GridLogError << "Grid with name \"" << search->first
<< "\" already present. Terminating\n";
exit(1);
}
Grids[s] = std::move(M);
std::cout << GridLogMessage << "::::::::::::::::::::::::::::::::::::::::" <<std::endl;
std::cout << GridLogMessage << "HMCResourceManager:" << std::endl;
std::cout << GridLogMessage << "Created grid set with name '" << s << "' and decomposition for the full cartesian " << std::endl;
Grids[s].show_full_decomposition();
std::cout << GridLogMessage << "::::::::::::::::::::::::::::::::::::::::" <<std::endl;
}
// Add a named grid set, 4d shortcut
void AddFourDimGrid(const std::string s) {
GridFourDimModule<vComplex> Mod;
AddGrid(s, Mod);
}
// Add a named grid set, 4d shortcut + tweak simd lanes
void AddFourDimGrid(const std::string s, const std::vector<int> simd_decomposition) {
GridFourDimModule<vComplex> Mod(simd_decomposition);
AddGrid(s, Mod);
}
GridCartesian* GetCartesian(std::string s = "") {
if (s.empty()) s = Grids.begin()->first;
std::cout << GridLogDebug << "Getting cartesian grid from: " << s
<< std::endl;
return Grids[s].get_full();
}
GridRedBlackCartesian* GetRBCartesian(std::string s = "") {
if (s.empty()) s = Grids.begin()->first;
std::cout << GridLogDebug << "Getting rb-cartesian grid from: " << s
<< std::endl;
return Grids[s].get_rb();
}
//////////////////////////////////////////////////////
// Random number generators
//////////////////////////////////////////////////////
void AddRNGs(std::string s = "") {
// Couple the RNGs to the GridModule tagged by s
// the default is the first grid registered
assert(Grids.size() > 0 && !have_RNG);
if (s.empty()) s = Grids.begin()->first;
std::cout << GridLogDebug << "Adding RNG to grid: " << s << std::endl;
RNGs.set_pRNG(new GridParallelRNG(GetCartesian(s)));
have_RNG = true;
}
void SetRNGSeeds(RNGModuleParameters& Params) { RNGs.set_RNGSeeds(Params); }
GridSerialRNG& GetSerialRNG() { return RNGs.get_sRNG(); }
GridParallelRNG& GetParallelRNG() {
assert(have_RNG);
return RNGs.get_pRNG();
}
void SeedFixedIntegers() {
assert(have_RNG);
RNGs.seed();
}
//////////////////////////////////////////////////////
// Checkpointers
//////////////////////////////////////////////////////
BaseHmcCheckpointer<ImplementationPolicy>* GetCheckPointer() {
if (have_CheckPointer)
return CP->getPtr();
else {
std::cout << GridLogError << "Error: no checkpointer defined"
<< std::endl;
exit(1);
}
}
RegisterLoadCheckPointerFunction(Binary);
RegisterLoadCheckPointerFunction(Nersc);
#ifdef HAVE_LIME
RegisterLoadCheckPointerFunction(ILDG);
RegisterLoadCheckPointerMetadataFunction(Scidac);
#endif
////////////////////////////////////////////////////////
// Observables
////////////////////////////////////////////////////////
template<class T, class... Types>
void AddObservable(Types&&... Args){
ObservablesList.push_back(std::unique_ptr<T>(new T(std::forward<Types>(Args)...)));
ObservablesList.back()->print_parameters();
}
std::vector<HmcObservable<typename ImplementationPolicy::Field>* > GetObservables(){
std::vector<HmcObservable<typename ImplementationPolicy::Field>* > out;
for (auto &i : ObservablesList){
out.push_back(i->getPtr());
}
// Add the checkpointer to the observables
out.push_back(GetCheckPointer());
return out;
}
private:
// this private
template <class ReaderClass >
void fill_ActionsLevel(ReaderClass &Read){
// Actions set
int m;
Read.readDefault("multiplier",m);
multipliers.push_back(m);
std::cout << "Level : " << multipliers.size() << " with multiplier : " << m << std::endl;
// here gauge
Read.push("Action");
do{
auto &ActionFactory = HMC_ActionModuleFactory<gauge_string, typename ImplementationPolicy::Field, ReaderClass>::getInstance();
std::string action_type;
Read.readDefault("name", action_type);
output_vector_string(ActionFactory.getBuilderList() );
ActionsList.emplace(m, ActionFactory.create(action_type, Read));
} while (Read.nextElement("Action"));
ActionsList.find(m)->second->print_parameters();
Read.pop();
}
};
}
}
#endif // HMC_RESOURCE_MANAGER_H

View File

@ -1,137 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/hmc/GenericHmcRunner.h
Copyright (C) 2015
Copyright (C) 2016
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef HMC_RUNNER_MODULE
#define HMC_RUNNER_MODULE
namespace Grid {
// the reader class is necessary here for the automatic initialization of the resources
// if we had a virtual reader would have been unecessary
template <class HMCType, class ReaderClass >
class HMCModule
: public Parametrized< QCD::HMCparameters >,
public HMCModuleBase< QCD::HMCRunnerBase<ReaderClass> > {
public:
typedef HMCModuleBase< QCD::HMCRunnerBase<ReaderClass> > Base;
typedef typename Base::Product Product;
std::unique_ptr<HMCType> HMCPtr;
HMCModule(QCD::HMCparameters Par) : Parametrized<QCD::HMCparameters>(Par) {}
template <class ReaderCl>
HMCModule(Reader<ReaderCl>& R) : Parametrized<QCD::HMCparameters>(R, "HMC"){};
Product* getPtr() {
if (!HMCPtr) initialize();
return HMCPtr.get();
}
private:
virtual void initialize() = 0;
};
// Factory
template <char const *str, class ReaderClass >
class HMCRunnerModuleFactory
: public Factory < HMCModuleBase< QCD::HMCRunnerBase<ReaderClass> > , Reader<ReaderClass> > {
public:
typedef Reader<ReaderClass> TheReader;
// use SINGLETON FUNCTOR MACRO HERE
HMCRunnerModuleFactory(const HMCRunnerModuleFactory& e) = delete;
void operator=(const HMCRunnerModuleFactory& e) = delete;
static HMCRunnerModuleFactory& getInstance(void) {
static HMCRunnerModuleFactory e;
return e;
}
private:
HMCRunnerModuleFactory(void) = default;
std::string obj_type() const {
return std::string(str);
}
};
///////////////
// macro for these
template < class ImplementationPolicy, class RepresentationPolicy, class ReaderClass >
class HMCLeapFrog: public HMCModule< QCD::GenericHMCRunnerTemplate<ImplementationPolicy, RepresentationPolicy, QCD::LeapFrog>, ReaderClass >{
typedef HMCModule< QCD::GenericHMCRunnerTemplate<ImplementationPolicy, RepresentationPolicy, QCD::LeapFrog>, ReaderClass > HMCBaseMod;
using HMCBaseMod::HMCBaseMod;
// aquire resource
virtual void initialize(){
this->HMCPtr.reset(new QCD::GenericHMCRunnerTemplate<ImplementationPolicy, RepresentationPolicy, QCD::LeapFrog>(this->Par_) );
}
};
template < class ImplementationPolicy, class RepresentationPolicy, class ReaderClass >
class HMCMinimumNorm2: public HMCModule< QCD::GenericHMCRunnerTemplate<ImplementationPolicy, RepresentationPolicy, QCD::MinimumNorm2>, ReaderClass >{
typedef HMCModule< QCD::GenericHMCRunnerTemplate<ImplementationPolicy, RepresentationPolicy, QCD::MinimumNorm2>, ReaderClass > HMCBaseMod;
using HMCBaseMod::HMCBaseMod;
// aquire resource
virtual void initialize(){
this->HMCPtr.reset(new QCD::GenericHMCRunnerTemplate<ImplementationPolicy, RepresentationPolicy, QCD::MinimumNorm2>(this->Par_));
}
};
template < class ImplementationPolicy, class RepresentationPolicy, class ReaderClass >
class HMCForceGradient: public HMCModule< QCD::GenericHMCRunnerTemplate<ImplementationPolicy, RepresentationPolicy, QCD::ForceGradient>, ReaderClass >{
typedef HMCModule< QCD::GenericHMCRunnerTemplate<ImplementationPolicy, RepresentationPolicy, QCD::ForceGradient>, ReaderClass > HMCBaseMod;
using HMCBaseMod::HMCBaseMod;
// aquire resource
virtual void initialize(){
this->HMCPtr.reset(new QCD::GenericHMCRunnerTemplate<ImplementationPolicy, RepresentationPolicy, QCD::ForceGradient>(this->Par_) );
}
};
extern char hmc_string[];
//////////////////////////////////////////////////////////////
}
#endif

View File

@ -1,177 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/hmc/HMC_GridModules.h
Copyright (C) 2015
Copyright (C) 2016
Author: Guido Cossu <guido.cossu@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef HMC_GRID_MODULES
#define HMC_GRID_MODULES
namespace Grid {
// Resources
// Modules for grids
// Introduce another namespace HMCModules?
class GridModuleParameters: Serializable{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(GridModuleParameters,
std::string, lattice,
std::string, mpi);
std::vector<int> getLattice() const {return strToVec<int>(lattice);}
std::vector<int> getMpi() const {return strToVec<int>(mpi);}
void check() const {
if (getLattice().size() != getMpi().size() ) {
std::cout << GridLogError
<< "Error in GridModuleParameters: lattice and mpi dimensions "
"do not match"
<< std::endl;
exit(1);
}
}
template <class ReaderClass>
GridModuleParameters(Reader<ReaderClass>& Reader, std::string n = "LatticeGrid"):name(n) {
read(Reader, name, *this);
check();
}
// Save on file
template< class WriterClass>
void save(Writer<WriterClass>& Writer){
check();
write(Writer, name, *this);
}
private:
std::string name;
};
// Lower level class
class GridModule {
public:
GridCartesian* get_full() {
std::cout << GridLogDebug << "Getting cartesian in module"<< std::endl;
return grid_.get(); }
GridRedBlackCartesian* get_rb() {
std::cout << GridLogDebug << "Getting rb-cartesian in module"<< std::endl;
return rbgrid_.get(); }
void set_full(GridCartesian* grid) { grid_.reset(grid); }
void set_rb(GridRedBlackCartesian* rbgrid) { rbgrid_.reset(rbgrid); }
void show_full_decomposition(){ grid_->show_decomposition(); }
void show_rb_decomposition(){ rbgrid_->show_decomposition(); }
protected:
std::unique_ptr<GridCartesian> grid_;
std::unique_ptr<GridRedBlackCartesian> rbgrid_;
};
////////////////////////////////////
// Classes for the user
////////////////////////////////////
// Note: the space time grid should be out of the QCD namespace
template <class vector_type>
class GridFourDimModule : public GridModule
{
public:
GridFourDimModule()
{
using namespace QCD;
set_full(SpaceTimeGrid::makeFourDimGrid(
GridDefaultLatt(),
GridDefaultSimd(4, vector_type::Nsimd()),
GridDefaultMpi()));
set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
}
GridFourDimModule(const std::vector<int> tweak_simd)
{
using namespace QCD;
if (tweak_simd.size() != 4)
{
std::cout << GridLogError
<< "Error in GridFourDimModule: SIMD size different from 4"
<< std::endl;
exit(1);
}
// Checks that the product agrees with the expectation
int simd_sum = 1;
for (auto &n : tweak_simd)
simd_sum *= n;
std::cout << GridLogDebug << "TweakSIMD: " << tweak_simd << " Sum: " << simd_sum << std::endl;
if (simd_sum == vector_type::Nsimd())
{
set_full(SpaceTimeGrid::makeFourDimGrid(
GridDefaultLatt(),
tweak_simd,
GridDefaultMpi()));
set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
}
else
{
std::cout << GridLogError
<< "Error in GridFourDimModule: SIMD lanes must sum to "
<< vector_type::Nsimd()
<< std::endl;
}
}
GridFourDimModule(const GridModuleParameters Params)
{
using namespace QCD;
std::vector<int> lattice_v = Params.getLattice();
std::vector<int> mpi_v = Params.getMpi();
if (lattice_v.size() == 4)
{
set_full(SpaceTimeGrid::makeFourDimGrid(
lattice_v,
GridDefaultSimd(4, vector_type::Nsimd()),
mpi_v));
set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
}
else
{
std::cout << GridLogError
<< "Error in GridFourDimModule: lattice dimension different from 4"
<< std::endl;
exit(1);
}
}
};
typedef GridFourDimModule<vComplex> GridDefaultFourDimModule;
} // namespace Grid
#endif // HMC_GRID_MODULES

View File

@ -1,52 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/hmc/HMC.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
//--------------------------------------------------------------------
//--------------------------------------------------------------------
#ifndef HMC_AGGREGATE_INCLUDED
#define HMC_AGGREGATE_INCLUDED
#include <string>
#include <Grid/qcd/observables/hmc_observable.h>
#include <Grid/qcd/hmc/HMC.h>
// annoying location; should move this ?
#include <Grid/parallelIO/IldgIOtypes.h>
#include <Grid/parallelIO/IldgIO.h>
#include <Grid/parallelIO/NerscIO.h>
#include <Grid/qcd/hmc/checkpointers/CheckPointers.h>
#include <Grid/qcd/hmc/HMCModules.h>
#include <Grid/qcd/modules/mods.h>
#include <Grid/qcd/hmc/HMCResourceManager.h>
#include <Grid/qcd/hmc/GenericHMCrunner.h>
#include <Grid/qcd/hmc/HMCRunnerModule.h>
#endif

View File

@ -1,107 +0,0 @@
Using HMC in Grid version 0.5.1
These are the instructions to use the Generalised HMC on Grid version 0.5.1.
Disclaimer: GRID is still under active development so any information here can be changed in future releases.
Command line options
===================
(relevant file GenericHMCrunner.h)
The initial configuration can be changed at the command line using
--StartType <your choice>
valid choices, one among these
HotStart, ColdStart, TepidStart, CheckpointStart
default: HotStart
example
./My_hmc_exec --StartType HotStart
The CheckpointStart option uses the prefix for the configurations and rng seed files defined in your executable and the initial configuration is specified by
--StartTrajectory <integer>
default: 0
The number of trajectories for a specific run are specified at command line by
--Trajectories <integer>
default: 1
The number of thermalization steps (i.e. steps when the Metropolis acceptance check is turned off) is specified by
--Thermalizations <integer>
default: 10
Any other parameter is defined in the source for the executable.
HMC controls
===========
The lines
std::vector<int> SerSeed({1, 2, 3, 4, 5});
std::vector<int> ParSeed({6, 7, 8, 9, 10});
define the seeds for the serial and the parallel RNG.
The line
TheHMC.MDparameters.set(20, 1.0);// MDsteps, traj length
declares the number of molecular dynamics steps and the total trajectory length.
Actions
======
Action names are defined in the file
lib/qcd/Actions.h
Gauge actions list:
WilsonGaugeActionR;
WilsonGaugeActionF;
WilsonGaugeActionD;
PlaqPlusRectangleActionR;
PlaqPlusRectangleActionF;
PlaqPlusRectangleActionD;
IwasakiGaugeActionR;
IwasakiGaugeActionF;
IwasakiGaugeActionD;
SymanzikGaugeActionR;
SymanzikGaugeActionF;
SymanzikGaugeActionD;
ConjugateWilsonGaugeActionR;
ConjugateWilsonGaugeActionF;
ConjugateWilsonGaugeActionD;
ConjugatePlaqPlusRectangleActionR;
ConjugatePlaqPlusRectangleActionF;
ConjugatePlaqPlusRectangleActionD;
ConjugateIwasakiGaugeActionR;
ConjugateIwasakiGaugeActionF;
ConjugateIwasakiGaugeActionD;
ConjugateSymanzikGaugeActionR;
ConjugateSymanzikGaugeActionF;
ConjugateSymanzikGaugeActionD;
ScalarActionR;
ScalarActionF;
ScalarActionD;
each of these action accept one single parameter at creation time (beta).
Example for creating a Symanzik action with beta=4.0
SymanzikGaugeActionR(4.0)
The suffixes R,F,D in the action names refer to the Real
(the precision is defined at compile time by the --enable-precision flag in the configure),
Float and Double, that force the precision of the action to be 32, 64 bit respectively.

Some files were not shown because too many files have changed in this diff Show More