1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-16 23:07:05 +01:00

Compare commits

..

26 Commits

Author SHA1 Message Date
3064c9a6e2 Improve the matching of stencil coarsening 2020-09-08 15:36:58 -04:00
729882827c Improve the coarse matrix calc 2020-09-08 15:36:33 -04:00
baa668d3ac Merge branch 'develop' into feature/hw-multigrid
Conflicts:
	Grid/allocator/MemoryManager.h
2020-09-03 22:16:50 -04:00
3c82d16ed8 4D multigrid 2020-09-03 22:11:17 -04:00
5c8c0c2d7c BiCG 2020-09-03 22:10:40 -04:00
e5a100846c Useful tthing to preserve 2020-09-03 22:09:57 -04:00
a74e2dc12e Printing mem info 2020-09-03 22:08:44 -04:00
595f512a6e G5 for coarse space too 2020-09-03 22:07:40 -04:00
a6499b22ff Stats printing 2020-09-03 22:00:46 -04:00
b4e42a59c6 Stats traacking improvement 2020-09-03 22:00:14 -04:00
8c913e0edd Clearer UVM ttreatment 2020-09-03 21:59:05 -04:00
fd3f93d8d3 Zero changes 2020-09-03 21:57:11 -04:00
e9543cdacd Time deflation 2020-09-03 21:56:02 -04:00
98f7b3d298 Pcg 2020-09-03 21:55:05 -04:00
b7b164ea24 Test operator and deebug code 2020-09-03 21:54:20 -04:00
77124d99d5 Merge branch 'develop' into feature/hw-multigrid 2020-09-03 21:52:04 -04:00
e1327e7ea0 Optional bounds check debug code 2020-07-16 16:57:46 -04:00
569f78c2cf Stenccil improvement 2020-07-16 16:57:13 -04:00
488c79d5a1 Bound improvement 2020-07-15 19:58:08 -04:00
dc6b0f20b2 Fixed array bounds 2020-07-02 12:20:20 -04:00
c0badc3e16 Summit bounce back to git 2020-07-02 10:48:39 -04:00
58f6529b55 Slowly piecing together 2020-06-30 16:42:03 -04:00
e3f056dfbb Hw multigrid operator 2020-06-30 16:10:16 -04:00
da0ffa7a79 Two spin update defer commit to repository 2020-06-30 16:09:48 -04:00
fcc7640b9c Detect a coarsened vector 2020-06-30 14:17:45 -04:00
0cbe2859e0 Making progress on Hw based 5d coarse matrix 2020-06-30 14:17:20 -04:00
487 changed files with 10020 additions and 30126 deletions

1
.gitignore vendored
View File

@ -88,7 +88,6 @@ Thumbs.db
# build directory # # build directory #
################### ###################
build*/* build*/*
Documentation/_build
# IDE related files # # IDE related files #
##################### #####################

61
.travis.yml Normal file
View File

@ -0,0 +1,61 @@
language: cpp
cache:
directories:
- clang
matrix:
include:
- os: osx
osx_image: xcode8.3
compiler: clang
env: PREC=single
- os: osx
osx_image: xcode8.3
compiler: clang
env: PREC=double
before_install:
- export GRIDDIR=`pwd`
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]] && [ ! -e clang/bin ]; then wget $CLANG_LINK; tar -xf `basename $CLANG_LINK`; mkdir clang; mv clang+*/* clang/; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc openssl; fi
install:
- export CWD=`pwd`
- echo $CWD
- export CC=$CC$VERSION
- export CXX=$CXX$VERSION
- echo $PATH
- which autoconf
- autoconf --version
- which automake
- automake --version
- which $CC
- $CC --version
- which $CXX
- $CXX --version
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export EXTRACONF='--with-openssl=/usr/local/opt/openssl'; fi
script:
- ./bootstrap.sh
- mkdir build
- cd build
- mkdir lime
- cd lime
- mkdir build
- cd build
- wget http://usqcd-software.github.io/downloads/c-lime/lime-1.3.2.tar.gz
- tar xf lime-1.3.2.tar.gz
- cd lime-1.3.2
- ./configure --prefix=$CWD/build/lime/install
- make -j4
- make install
- cd $CWD/build
- ../configure --enable-precision=$PREC --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
- make -j4
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
- make check

View File

@ -34,15 +34,10 @@ directory
#if defined __GNUC__ && __GNUC__>=6 #if defined __GNUC__ && __GNUC__>=6
#pragma GCC diagnostic ignored "-Wignored-attributes" #pragma GCC diagnostic ignored "-Wignored-attributes"
#endif
#if defined __GNUC__
#pragma GCC diagnostic ignored "-Wpsabi"
#endif #endif
//disables and intel compiler specific warning (in json.hpp) //disables and intel compiler specific warning (in json.hpp)
#ifdef __ICC
#pragma warning disable 488 #pragma warning disable 488
#endif
#ifdef __NVCC__ #ifdef __NVCC__
//disables nvcc specific warning in json.hpp //disables nvcc specific warning in json.hpp

View File

@ -36,7 +36,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#include <Grid/qcd/QCD.h> #include <Grid/qcd/QCD.h>
#include <Grid/qcd/spin/Spin.h> #include <Grid/qcd/spin/Spin.h>
#include <Grid/qcd/gparity/Gparity.h>
#include <Grid/qcd/utils/Utils.h> #include <Grid/qcd/utils/Utils.h>
#include <Grid/qcd/representations/Representations.h> #include <Grid/qcd/representations/Representations.h>
NAMESPACE_CHECK(GridQCDCore); NAMESPACE_CHECK(GridQCDCore);

View File

@ -28,7 +28,4 @@
/////////////////// ///////////////////
#include "Config.h" #include "Config.h"
#ifdef TOFU
#undef GRID_COMMS_THREADS
#endif
#endif /* GRID_STD_H */ #endif /* GRID_STD_H */

View File

@ -34,12 +34,6 @@
#define __SYCL__REDEFINE__ #define __SYCL__REDEFINE__
#endif #endif
/* HIP save and restore compile environment*/
#ifdef GRID_HIP
#pragma push
#pragma push_macro("__HIP_DEVICE_COMPILE__")
#endif
#define EIGEN_NO_HIP
#include <Grid/Eigen/Dense> #include <Grid/Eigen/Dense>
#include <Grid/Eigen/unsupported/CXX11/Tensor> #include <Grid/Eigen/unsupported/CXX11/Tensor>
@ -58,12 +52,6 @@
#pragma pop #pragma pop
#endif #endif
/*HIP restore*/
#ifdef __HIP__REDEFINE__
#pragma pop_macro("__HIP_DEVICE_COMPILE__")
#pragma pop
#endif
#if defined __GNUC__ #if defined __GNUC__
#pragma GCC diagnostic pop #pragma GCC diagnostic pop
#endif #endif

View File

@ -21,7 +21,6 @@ if BUILD_HDF5
extra_headers+=serialisation/Hdf5Type.h extra_headers+=serialisation/Hdf5Type.h
endif endif
all: version-cache Version.h all: version-cache Version.h
version-cache: version-cache:
@ -54,19 +53,6 @@ Version.h: version-cache
include Make.inc include Make.inc
include Eigen.inc include Eigen.inc
extra_sources+=$(WILS_FERMION_FILES)
extra_sources+=$(STAG_FERMION_FILES)
if BUILD_ZMOBIUS
extra_sources+=$(ZWILS_FERMION_FILES)
endif
if BUILD_GPARITY
extra_sources+=$(GP_FERMION_FILES)
endif
if BUILD_FERMION_REPS
extra_sources+=$(ADJ_FERMION_FILES)
extra_sources+=$(TWOIND_FERMION_FILES)
endif
lib_LIBRARIES = libGrid.a lib_LIBRARIES = libGrid.a
CCFILES += $(extra_sources) CCFILES += $(extra_sources)

View File

@ -54,7 +54,6 @@ NAMESPACE_CHECK(BiCGSTAB);
#include <Grid/algorithms/iterative/SchurRedBlack.h> #include <Grid/algorithms/iterative/SchurRedBlack.h>
#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h> #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h> #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
#include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
#include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h> #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h> #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
#include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h> #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>

View File

@ -31,7 +31,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_ALGORITHM_COARSENED_MATRIX_H #ifndef GRID_ALGORITHM_COARSENED_MATRIX_H
#define GRID_ALGORITHM_COARSENED_MATRIX_H #define GRID_ALGORITHM_COARSENED_MATRIX_H
#include <Grid/qcd/QCD.h> // needed for Dagger(Yes|No), Inverse(Yes|No)
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
@ -50,24 +49,20 @@ inline void blockMaskedInnerProduct(Lattice<CComplex> &CoarseInner,
Lattice<dotp> fine_inner_msk(fine); Lattice<dotp> fine_inner_msk(fine);
// Multiply could be fused with innerProduct // Multiply could be fused with innerProduct
// Single block sum kernel could do both masks.
fine_inner = localInnerProduct(fineX,fineY); fine_inner = localInnerProduct(fineX,fineY);
mult(fine_inner_msk, fine_inner,FineMask); mult(fine_inner_msk, fine_inner,FineMask);
blockSum(CoarseInner,fine_inner_msk); blockSum(CoarseInner,fine_inner_msk);
} }
class Geometry { class Geometry {
public: public:
int npoint; int npoint;
int base;
std::vector<int> directions ; std::vector<int> directions ;
std::vector<int> displacements; std::vector<int> displacements;
std::vector<int> points_dagger;
Geometry(int _d) { Geometry(int _d) {
base = (_d==5) ? 1:0; int base = (_d==5) ? 1:0;
// make coarse grid stencil for 4d , not 5d // make coarse grid stencil for 4d , not 5d
if ( _d==5 ) _d=4; if ( _d==5 ) _d=4;
@ -75,50 +70,19 @@ public:
npoint = 2*_d+1; npoint = 2*_d+1;
directions.resize(npoint); directions.resize(npoint);
displacements.resize(npoint); displacements.resize(npoint);
points_dagger.resize(npoint);
for(int d=0;d<_d;d++){ for(int d=0;d<_d;d++){
directions[d ] = d+base; directions[d ] = d+base;
directions[d+_d] = d+base; directions[d+_d] = d+base;
displacements[d ] = +1; displacements[d ] = +1;
displacements[d+_d]= -1; displacements[d+_d]= -1;
points_dagger[d ] = d+_d;
points_dagger[d+_d] = d;
} }
directions [2*_d]=0; directions [2*_d]=0;
displacements[2*_d]=0; displacements[2*_d]=0;
points_dagger[2*_d]=2*_d;
}
int point(int dir, int disp) { std::cout <<GridLogMessage << "Geometry "<<std::endl;
assert(disp == -1 || disp == 0 || disp == 1); for(int p=0;p<npoint;p++){
assert(base+0 <= dir && dir < base+4); std::cout <<GridLogMessage << "point " <<p<<" dir "<<directions[p]<<" delta " <<displacements[p]<<std::endl;
}
// directions faster index = new indexing
// 4d (base = 0):
// point 0 1 2 3 4 5 6 7 8
// dir 0 1 2 3 0 1 2 3 0
// disp +1 +1 +1 +1 -1 -1 -1 -1 0
// 5d (base = 1):
// point 0 1 2 3 4 5 6 7 8
// dir 1 2 3 4 1 2 3 4 0
// disp +1 +1 +1 +1 -1 -1 -1 -1 0
// displacements faster index = old indexing
// 4d (base = 0):
// point 0 1 2 3 4 5 6 7 8
// dir 0 0 1 1 2 2 3 3 0
// disp +1 -1 +1 -1 +1 -1 +1 -1 0
// 5d (base = 1):
// point 0 1 2 3 4 5 6 7 8
// dir 1 1 2 2 3 3 4 4 0
// disp +1 -1 +1 -1 +1 -1 +1 -1 0
if(dir == 0 and disp == 0)
return 8;
else // New indexing
return (1 - disp) / 2 * 4 + dir - base;
// else // Old indexing
// return (4 * (dir - base) + 1 - disp) / 2;
} }
}; };
@ -140,8 +104,8 @@ public:
Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) : Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) :
CoarseGrid(_CoarseGrid), CoarseGrid(_CoarseGrid),
FineGrid(_FineGrid), FineGrid(_FineGrid),
subspace(nbasis,_FineGrid), checkerboard(_checkerboard),
checkerboard(_checkerboard) subspace(nbasis,_FineGrid)
{ {
}; };
@ -296,7 +260,7 @@ public:
// Fine Object == (per site) type of fine field // Fine Object == (per site) type of fine field
// nbasis == number of deflation vectors // nbasis == number of deflation vectors
template<class Fobj,class CComplex,int nbasis> template<class Fobj,class CComplex,int nbasis>
class CoarsenedMatrix : public CheckerBoardedSparseMatrixBase<Lattice<iVector<CComplex,nbasis > > > { class CoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > > {
public: public:
typedef iVector<CComplex,nbasis > siteVector; typedef iVector<CComplex,nbasis > siteVector;
@ -306,59 +270,35 @@ public:
typedef iMatrix<CComplex,nbasis > Cobj; typedef iMatrix<CComplex,nbasis > Cobj;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField; typedef Lattice<Fobj > FineField;
typedef CoarseVector FermionField;
// enrich interface, use default implementation as in FermionOperator ///////
void Dminus(CoarseVector const& in, CoarseVector& out) { out = in; }
void DminusDag(CoarseVector const& in, CoarseVector& out) { out = in; }
void ImportPhysicalFermionSource(CoarseVector const& input, CoarseVector& imported) { imported = input; }
void ImportUnphysicalFermion(CoarseVector const& input, CoarseVector& imported) { imported = input; }
void ExportPhysicalFermionSolution(CoarseVector const& solution, CoarseVector& exported) { exported = solution; };
void ExportPhysicalFermionSource(CoarseVector const& solution, CoarseVector& exported) { exported = solution; };
//////////////////// ////////////////////
// Data members // Data members
//////////////////// ////////////////////
Geometry geom; Geometry geom;
GridBase * _grid; GridBase * _grid;
GridBase* _cbgrid;
int hermitian; int hermitian;
CartesianStencil<siteVector,siteVector,int> Stencil; CartesianStencil<siteVector,siteVector,int> Stencil;
CartesianStencil<siteVector,siteVector,int> StencilEven;
CartesianStencil<siteVector,siteVector,int> StencilOdd;
std::vector<CoarseMatrix> A; std::vector<CoarseMatrix> A;
std::vector<CoarseMatrix> Aeven;
std::vector<CoarseMatrix> Aodd;
CoarseMatrix AselfInv;
CoarseMatrix AselfInvEven;
CoarseMatrix AselfInvOdd;
Vector<RealD> dag_factor;
/////////////////////// ///////////////////////
// Interface // Interface
/////////////////////// ///////////////////////
GridBase * Grid(void) { return _grid; }; // this is all the linalg routines need to know GridBase * Grid(void) { return _grid; }; // this is all the linalg routines need to know
GridBase * RedBlackGrid() { return _cbgrid; };
int ConstEE() { return 0; }
virtual std::vector<int> Directions(void) { return geom.directions; };
virtual std::vector<int> Displacements(void){ return geom.displacements; };
void M (const CoarseVector &in, CoarseVector &out) void M (const CoarseVector &in, CoarseVector &out)
{ {
conformable(_grid,in.Grid()); conformable(_grid,in.Grid());
conformable(in.Grid(),out.Grid()); conformable(in.Grid(),out.Grid());
out.Checkerboard() = in.Checkerboard();
SimpleCompressor<siteVector> compressor; SimpleCompressor<siteVector> compressor;
Stencil.HaloExchange(in,compressor); Stencil.HaloExchange(in,compressor);
autoView( in_v , in, AcceleratorRead); autoView( in_v , in, AcceleratorRead);
autoView( out_v , out, AcceleratorWrite); autoView( out_v , out, AcceleratorWrite);
autoView( Stencil_v , Stencil, AcceleratorRead);
int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview; typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer; Vector<Aview> AcceleratorViewContainer;
@ -372,6 +312,9 @@ public:
int osites=Grid()->oSites(); int osites=Grid()->oSites();
autoView(st,Stencil,AcceleratorRead);
siteVector *CBp=Stencil.CommBuf();
accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, { accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
int ss = sss/nbasis; int ss = sss/nbasis;
int b = sss%nbasis; int b = sss%nbasis;
@ -380,14 +323,14 @@ public:
int ptype; int ptype;
StencilEntry *SE; StencilEntry *SE;
for(int point=0;point<npoint;point++){ for(int point=0;point<geom.npoint;point++){
SE=Stencil_v.GetEntry(ptype,point,ss); SE=st.GetEntry(ptype,point,ss);
if(SE->_is_local) { if(SE->_is_local) {
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
} else { } else {
nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]); nbr = coalescedRead(CBp[SE->_offset]);
} }
acceleratorSynchronise(); acceleratorSynchronise();
@ -396,7 +339,7 @@ public:
} }
} }
coalescedWrite(out_v[ss](b),res); coalescedWrite(out_v[ss](b),res);
}); });
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose(); for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
}; };
@ -408,74 +351,12 @@ public:
return M(in,out); return M(in,out);
} else { } else {
// corresponds to Galerkin coarsening // corresponds to Galerkin coarsening
return MdagNonHermitian(in, out); CoarseVector tmp(Grid());
G5C(tmp, in);
M(tmp, out);
G5C(out, out);
} }
}; };
void MdagNonHermitian(const CoarseVector &in, CoarseVector &out)
{
conformable(_grid,in.Grid());
conformable(in.Grid(),out.Grid());
out.Checkerboard() = in.Checkerboard();
SimpleCompressor<siteVector> compressor;
Stencil.HaloExchange(in,compressor);
autoView( in_v , in, AcceleratorRead);
autoView( out_v , out, AcceleratorWrite);
autoView( Stencil_v , Stencil, AcceleratorRead);
int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer;
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd();
typedef decltype(coalescedRead(in_v[0])) calcVector;
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
int osites=Grid()->oSites();
Vector<int> points(geom.npoint, 0);
for(int p=0; p<geom.npoint; p++)
points[p] = geom.points_dagger[p];
auto points_p = &points[0];
RealD* dag_factor_p = &dag_factor[0];
accelerator_for(sss, Grid()->oSites()*nbasis, Nsimd, {
int ss = sss/nbasis;
int b = sss%nbasis;
calcComplex res = Zero();
calcVector nbr;
int ptype;
StencilEntry *SE;
for(int p=0;p<npoint;p++){
int point = points_p[p];
SE=Stencil_v.GetEntry(ptype,point,ss);
if(SE->_is_local) {
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
} else {
nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]);
}
acceleratorSynchronise();
for(int bb=0;bb<nbasis;bb++) {
res = res + dag_factor_p[b*nbasis+bb]*coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
}
}
coalescedWrite(out_v[ss](b),res);
});
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
}
void MdirComms(const CoarseVector &in) void MdirComms(const CoarseVector &in)
{ {
SimpleCompressor<siteVector> compressor; SimpleCompressor<siteVector> compressor;
@ -485,7 +366,6 @@ public:
{ {
conformable(_grid,in.Grid()); conformable(_grid,in.Grid());
conformable(_grid,out.Grid()); conformable(_grid,out.Grid());
out.Checkerboard() = in.Checkerboard();
typedef LatticeView<Cobj> Aview; typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer; Vector<Aview> AcceleratorViewContainer;
@ -494,7 +374,6 @@ public:
autoView( out_v , out, AcceleratorWrite); autoView( out_v , out, AcceleratorWrite);
autoView( in_v , in, AcceleratorRead); autoView( in_v , in, AcceleratorRead);
autoView( Stencil_v , Stencil, AcceleratorRead);
const int Nsimd = CComplex::Nsimd(); const int Nsimd = CComplex::Nsimd();
typedef decltype(coalescedRead(in_v[0])) calcVector; typedef decltype(coalescedRead(in_v[0])) calcVector;
@ -508,12 +387,12 @@ public:
int ptype; int ptype;
StencilEntry *SE; StencilEntry *SE;
SE=Stencil_v.GetEntry(ptype,point,ss); SE=Stencil.GetEntry(ptype,point,ss);
if(SE->_is_local) { if(SE->_is_local) {
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute); nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
} else { } else {
nbr = coalescedRead(Stencil_v.CommBuf()[SE->_offset]); nbr = coalescedRead(Stencil.CommBuf()[SE->_offset]);
} }
acceleratorSynchronise(); acceleratorSynchronise();
@ -537,11 +416,23 @@ public:
MdirCalc(in,out[p],p); MdirCalc(in,out[p],p);
} }
}; };
void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp){ void Mdir(const CoarseVector &in, CoarseVector &out, int dir, int disp)
{
this->MdirComms(in); this->MdirComms(in);
MdirCalc(in,out,geom.point(dir,disp)); int ndim = in.Grid()->Nd();
int point=-1;
for(int p=0;p<geom.npoint;p++){
if( (dir==geom.directions[p])&&(disp==geom.displacements[p])) point=p;
}
assert(point!=-1);// Must find
std::cout <<GridLogMessage << "Mdir point "<<point<<" dir "<<dir<<" disp "<<disp <<std::endl;
for(int p=0;p<geom.npoint;p++){
std::cout <<GridLogMessage << "point " <<p<<" dir "<<geom.directions[p]<<" delta " <<geom.displacements[p]<<std::endl;
}
MdirCalc(in,out,point);
}; };
void Mdiag(const CoarseVector &in, CoarseVector &out) void Mdiag(const CoarseVector &in, CoarseVector &out)
@ -550,288 +441,63 @@ public:
MdirCalc(in, out, point); // No comms MdirCalc(in, out, point); // No comms
}; };
void Mooee(const CoarseVector &in, CoarseVector &out) {
MooeeInternal(in, out, DaggerNo, InverseNo);
}
void MooeeInv(const CoarseVector &in, CoarseVector &out) {
MooeeInternal(in, out, DaggerNo, InverseYes);
}
void MooeeDag(const CoarseVector &in, CoarseVector &out) {
MooeeInternal(in, out, DaggerYes, InverseNo);
}
void MooeeInvDag(const CoarseVector &in, CoarseVector &out) {
MooeeInternal(in, out, DaggerYes, InverseYes);
}
void Meooe(const CoarseVector &in, CoarseVector &out) {
if(in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerNo);
} else {
DhopOE(in, out, DaggerNo);
}
}
void MeooeDag(const CoarseVector &in, CoarseVector &out) {
if(in.Checkerboard() == Odd) {
DhopEO(in, out, DaggerYes);
} else {
DhopOE(in, out, DaggerYes);
}
}
void Dhop(const CoarseVector &in, CoarseVector &out, int dag) {
conformable(in.Grid(), _grid); // verifies full grid
conformable(in.Grid(), out.Grid());
out.Checkerboard() = in.Checkerboard();
DhopInternal(Stencil, A, in, out, dag);
}
void DhopOE(const CoarseVector &in, CoarseVector &out, int dag) {
conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check
assert(in.Checkerboard() == Even);
out.Checkerboard() = Odd;
DhopInternal(StencilEven, Aodd, in, out, dag);
}
void DhopEO(const CoarseVector &in, CoarseVector &out, int dag) {
conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check
assert(in.Checkerboard() == Odd);
out.Checkerboard() = Even;
DhopInternal(StencilOdd, Aeven, in, out, dag);
}
void MooeeInternal(const CoarseVector &in, CoarseVector &out, int dag, int inv) {
out.Checkerboard() = in.Checkerboard();
assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
CoarseMatrix *Aself = nullptr;
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
Aself = (inv) ? &AselfInvOdd : &Aodd[geom.npoint-1];
DselfInternal(StencilOdd, *Aself, in, out, dag);
} else {
Aself = (inv) ? &AselfInvEven : &Aeven[geom.npoint-1];
DselfInternal(StencilEven, *Aself, in, out, dag);
}
} else {
Aself = (inv) ? &AselfInv : &A[geom.npoint-1];
DselfInternal(Stencil, *Aself, in, out, dag);
}
assert(Aself != nullptr);
}
void DselfInternal(CartesianStencil<siteVector,siteVector,int> &st, CoarseMatrix &a,
const CoarseVector &in, CoarseVector &out, int dag) {
int point = geom.npoint-1;
autoView( out_v, out, AcceleratorWrite);
autoView( in_v, in, AcceleratorRead);
autoView( st_v, st, AcceleratorRead);
autoView( a_v, a, AcceleratorRead);
const int Nsimd = CComplex::Nsimd();
typedef decltype(coalescedRead(in_v[0])) calcVector;
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
RealD* dag_factor_p = &dag_factor[0];
if(dag) {
accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
int ss = sss/nbasis;
int b = sss%nbasis;
calcComplex res = Zero();
calcVector nbr;
int ptype;
StencilEntry *SE;
SE=st_v.GetEntry(ptype,point,ss);
if(SE->_is_local) {
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
} else {
nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
}
acceleratorSynchronise();
for(int bb=0;bb<nbasis;bb++) {
res = res + dag_factor_p[b*nbasis+bb]*coalescedRead(a_v[ss](b,bb))*nbr(bb);
}
coalescedWrite(out_v[ss](b),res);
});
} else {
accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
int ss = sss/nbasis;
int b = sss%nbasis;
calcComplex res = Zero();
calcVector nbr;
int ptype;
StencilEntry *SE;
SE=st_v.GetEntry(ptype,point,ss);
if(SE->_is_local) {
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
} else {
nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
}
acceleratorSynchronise();
for(int bb=0;bb<nbasis;bb++) {
res = res + coalescedRead(a_v[ss](b,bb))*nbr(bb);
}
coalescedWrite(out_v[ss](b),res);
});
}
}
void DhopInternal(CartesianStencil<siteVector,siteVector,int> &st, std::vector<CoarseMatrix> &a,
const CoarseVector &in, CoarseVector &out, int dag) {
SimpleCompressor<siteVector> compressor;
st.HaloExchange(in,compressor);
autoView( in_v, in, AcceleratorRead);
autoView( out_v, out, AcceleratorWrite);
autoView( st_v , st, AcceleratorRead);
typedef LatticeView<Cobj> Aview;
// determine in what order we need the points
int npoint = geom.npoint-1;
Vector<int> points(npoint, 0);
for(int p=0; p<npoint; p++)
points[p] = (dag && !hermitian) ? geom.points_dagger[p] : p;
auto points_p = &points[0];
Vector<Aview> AcceleratorViewContainer;
for(int p=0;p<npoint;p++) AcceleratorViewContainer.push_back(a[p].View(AcceleratorRead));
Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd();
typedef decltype(coalescedRead(in_v[0])) calcVector;
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
RealD* dag_factor_p = &dag_factor[0];
if(dag) {
accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
int ss = sss/nbasis;
int b = sss%nbasis;
calcComplex res = Zero();
calcVector nbr;
int ptype;
StencilEntry *SE;
for(int p=0;p<npoint;p++){
int point = points_p[p];
SE=st_v.GetEntry(ptype,point,ss);
if(SE->_is_local) {
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
} else {
nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
}
acceleratorSynchronise();
for(int bb=0;bb<nbasis;bb++) {
res = res + dag_factor_p[b*nbasis+bb]*coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
}
}
coalescedWrite(out_v[ss](b),res);
});
} else {
accelerator_for(sss, in.Grid()->oSites()*nbasis, Nsimd, {
int ss = sss/nbasis;
int b = sss%nbasis;
calcComplex res = Zero();
calcVector nbr;
int ptype;
StencilEntry *SE;
for(int p=0;p<npoint;p++){
int point = points_p[p];
SE=st_v.GetEntry(ptype,point,ss);
if(SE->_is_local) {
nbr = coalescedReadPermute(in_v[SE->_offset],ptype,SE->_permute);
} else {
nbr = coalescedRead(st_v.CommBuf()[SE->_offset]);
}
acceleratorSynchronise();
for(int bb=0;bb<nbasis;bb++) {
res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
}
}
coalescedWrite(out_v[ss](b),res);
});
}
for(int p=0;p<npoint;p++) AcceleratorViewContainer[p].ViewClose();
}
CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) : CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) :
_grid(&CoarseGrid), _grid(&CoarseGrid),
_cbgrid(new GridRedBlackCartesian(&CoarseGrid)),
geom(CoarseGrid._ndimension), geom(CoarseGrid._ndimension),
hermitian(hermitian_), hermitian(hermitian_),
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0), Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
StencilEven(_cbgrid,geom.npoint,Even,geom.directions,geom.displacements,0), A(geom.npoint,&CoarseGrid)
StencilOdd(_cbgrid,geom.npoint,Odd,geom.directions,geom.displacements,0),
A(geom.npoint,&CoarseGrid),
Aeven(geom.npoint,_cbgrid),
Aodd(geom.npoint,_cbgrid),
AselfInv(&CoarseGrid),
AselfInvEven(_cbgrid),
AselfInvOdd(_cbgrid),
dag_factor(nbasis*nbasis)
{ {
fillFactor();
}; };
CoarsenedMatrix(GridCartesian &CoarseGrid, GridRedBlackCartesian &CoarseRBGrid, int hermitian_=0) : void Test(Aggregation<Fobj,CComplex,nbasis> &_Aggregates,GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop)
_grid(&CoarseGrid),
_cbgrid(&CoarseRBGrid),
geom(CoarseGrid._ndimension),
hermitian(hermitian_),
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
StencilEven(&CoarseRBGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
StencilOdd(&CoarseRBGrid,geom.npoint,Odd,geom.directions,geom.displacements,0),
A(geom.npoint,&CoarseGrid),
Aeven(geom.npoint,&CoarseRBGrid),
Aodd(geom.npoint,&CoarseRBGrid),
AselfInv(&CoarseGrid),
AselfInvEven(&CoarseRBGrid),
AselfInvOdd(&CoarseRBGrid),
dag_factor(nbasis*nbasis)
{ {
fillFactor(); typedef Lattice<Fobj> FineField;
}; CoarseVector Cin(_grid);
CoarseVector Cout(_grid);
CoarseVector CFout(_grid);
void fillFactor() { FineField Fin(FineGrid);
Eigen::MatrixXd dag_factor_eigen = Eigen::MatrixXd::Ones(nbasis, nbasis); FineField Fout(FineGrid);
if(!hermitian) {
const int nb = nbasis/2;
dag_factor_eigen.block(0,nb,nb,nb) *= -1.0;
dag_factor_eigen.block(nb,0,nb,nb) *= -1.0;
}
// GPU readable prefactor
thread_for(i, nbasis*nbasis, { std::vector<int> seeds({1,2,3,4,5});
int j = i/nbasis; GridParallelRNG RNG(_grid); RNG.SeedFixedIntegers(seeds);
int k = i%nbasis; gaussian(RNG,Cin);
dag_factor[i] = dag_factor_eigen(j, k);
}); _Aggregates.PromoteFromSubspace(Cin,Fin);
_Aggregates.ProjectToSubspace(Cin,Fin);
std::cout << GridLogMessage<< "************ "<<std::endl;
std::cout << GridLogMessage<< " Testing M "<<std::endl;
std::cout << GridLogMessage<< "************ "<<std::endl;
// Coarse operator
this->M(Cin,Cout);
// Fine projected operator
_Aggregates.PromoteFromSubspace(Cin,Fin);
linop.Op(Fin,Fout);
_Aggregates.ProjectToSubspace(CFout,Fout);
CFout = CFout-Cout;
RealD diff = norm2(CFout);
std::cout << GridLogMessage<< " diff "<<diff<<std::endl;
assert(diff<1.0e-5);
std::cout << GridLogMessage<< "************ "<<std::endl;
std::cout << GridLogMessage<< " Testing Mdag "<<std::endl;
std::cout << GridLogMessage<< "************ "<<std::endl;
// Coarse operator
Mdag(Cin,Cout);
// Fine operator
linop.AdjOp(Fin,Fout);
_Aggregates.ProjectToSubspace(CFout,Fout);
CFout = CFout-Cout;
diff = norm2(CFout);
std::cout << GridLogMessage<< " diff "<<diff<<std::endl;
assert(diff<1.0e-5);
} }
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop, void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,
@ -840,8 +506,6 @@ public:
typedef Lattice<typename Fobj::tensor_reduced> FineComplexField; typedef Lattice<typename Fobj::tensor_reduced> FineComplexField;
typedef typename Fobj::scalar_type scalar_type; typedef typename Fobj::scalar_type scalar_type;
std::cout << GridLogMessage<< "CoarsenMatrix "<< std::endl;
FineComplexField one(FineGrid); one=scalar_type(1.0,0.0); FineComplexField one(FineGrid); one=scalar_type(1.0,0.0);
FineComplexField zero(FineGrid); zero=scalar_type(0.0,0.0); FineComplexField zero(FineGrid); zero=scalar_type(0.0,0.0);
@ -872,13 +536,22 @@ public:
CoarseScalar InnerProd(Grid()); CoarseScalar InnerProd(Grid());
std::cout << GridLogMessage<< "CoarsenMatrix Orthog "<< std::endl; std::cout << GridLogMessage<< "CoarsenMatrix Orthog " << std::endl;
// Orthogonalise the subblocks over the basis // Orthogonalise the subblocks over the basis
blockOrthogonalise(InnerProd,Subspace.subspace); blockOrthogonalise(InnerProd,Subspace.subspace);
std::cout << GridLogMessage<< "CoarsenMatrix Orthog done " << std::endl;
auto OpDirections = linop.Directions();
auto OpDisplacements = linop.Displacements();
std::cout<<" Coarsening an operator with "<< OpDirections.size()<<" terms "<<std::endl;
for(int p=0;p<OpDirections.size();p++) {
assert(OpDirections[p]==geom.directions[p]);
assert(OpDisplacements[p]==geom.displacements[p]);
}
// Compute the matrix elements of linop between this orthonormal // Compute the matrix elements of linop between this orthonormal
// set of vectors. // set of vectors.
std::cout << GridLogMessage<< "CoarsenMatrix masks "<< std::endl;
int self_stencil=-1; int self_stencil=-1;
for(int p=0;p<geom.npoint;p++) for(int p=0;p<geom.npoint;p++)
{ {
@ -911,7 +584,21 @@ public:
evenmask = where(mod(bcb,2)==(Integer)0,one,zero); evenmask = where(mod(bcb,2)==(Integer)0,one,zero);
oddmask = one-evenmask; oddmask = one-evenmask;
/*
{
phi=Subspace.subspace[0];
linop.OpDirAll(phi,Mphi_p);
for(int p=0;p<geom.npoint-1;p++){
int dir=geom.directions[p];
int disp=geom.displacements[p];
linop.OpDir(phi,Mphi,dir,disp);
Mphi=Mphi-Mphi_p[p];
std::cout << GridLogMessage <<" Direction mapping check " <<norm2(Mphi)<<std::endl;
}
}
*/
assert(self_stencil!=-1); assert(self_stencil!=-1);
int lhermitian=hermitian;
for(int i=0;i<nbasis;i++){ for(int i=0;i<nbasis;i++){
@ -928,7 +615,7 @@ public:
int dir = geom.directions[p]; int dir = geom.directions[p];
int disp = geom.displacements[p]; int disp = geom.displacements[p];
if ( (disp==-1) || (!hermitian ) ) { if ( (disp==-1) || (!lhermitian ) ) {
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Pick out contributions coming from this cell and neighbour cell // Pick out contributions coming from this cell and neighbour cell
@ -946,7 +633,7 @@ public:
autoView( A_self , A[self_stencil], AcceleratorWrite); autoView( A_self , A[self_stencil], AcceleratorWrite);
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); }); accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
if ( hermitian && (disp==-1) ) { if ( lhermitian && (disp==-1) ) {
for(int pp=0;pp<geom.npoint;pp++){// Find the opposite link and set <j|A|i> = <i|A|j>* for(int pp=0;pp<geom.npoint;pp++){// Find the opposite link and set <j|A|i> = <i|A|j>*
int dirp = geom.directions[pp]; int dirp = geom.directions[pp];
int dispp = geom.displacements[pp]; int dispp = geom.displacements[pp];
@ -958,11 +645,11 @@ public:
} }
} }
} }
} }
} }
} }
std::cout << GridLogMessage<< "CoarsenMatrix Diag "<<std::endl;
/////////////////////////////////////////// ///////////////////////////////////////////
// Faster alternate self coupling.. use hermiticity to save 2x // Faster alternate self coupling.. use hermiticity to save 2x
/////////////////////////////////////////// ///////////////////////////////////////////
@ -994,57 +681,35 @@ public:
} }
} }
if(hermitian) {
std::cout << GridLogMessage << " ForceHermitian, new code "<<std::endl; MemoryManager::PrintBytes();
// Auto self test
Test( Subspace,FineGrid,linop);
#if 0
///////////////////////////
// test code worth preserving in if block
///////////////////////////
std::cout<<GridLogMessage<< " Computed matrix elements "<< self_stencil <<std::endl;
for(int p=0;p<geom.npoint;p++){
std::cout<<GridLogMessage<< "A["<<p<<"]" << std::endl;
std::cout<<GridLogMessage<< "\n"<<A[p] << std::endl;
} }
std::cout<<GridLogMessage<< " picking by block0 "<< self_stencil <<std::endl;
phi=Subspace.subspace[0];
std::vector<int> bc(FineGrid->_ndimension,0);
blockPick(Grid(),phi,tmp,bc); // Pick out a block
linop.Op(tmp,Mphi); // Apply big dop
blockProject(iProj,Mphi,Subspace.subspace); // project it and print it
std::cout<<GridLogMessage<< " Computed matrix elements from block zero only "<<std::endl;
std::cout<<GridLogMessage<< iProj <<std::endl;
std::cout<<GridLogMessage<<"Computed Coarse Operator"<<std::endl;
#endif
InvertSelfStencilLink(); std::cout << GridLogMessage << "Coarse self link inverted" << std::endl;
FillHalfCbs(); std::cout << GridLogMessage << "Coarse half checkerboards filled" << std::endl;
} }
void InvertSelfStencilLink() {
std::cout << GridLogDebug << "CoarsenedMatrix::InvertSelfStencilLink" << std::endl;
int localVolume = Grid()->lSites();
typedef typename Cobj::scalar_object scalar_object;
autoView(Aself_v, A[geom.npoint-1], CpuRead);
autoView(AselfInv_v, AselfInv, CpuWrite);
thread_for(site, localVolume, { // NOTE: Not able to bring this to GPU because of Eigen + peek/poke
Eigen::MatrixXcd selfLinkEigen = Eigen::MatrixXcd::Zero(nbasis, nbasis);
Eigen::MatrixXcd selfLinkInvEigen = Eigen::MatrixXcd::Zero(nbasis, nbasis);
scalar_object selfLink = Zero();
scalar_object selfLinkInv = Zero();
Coordinate lcoor;
Grid()->LocalIndexToLocalCoor(site, lcoor);
peekLocalSite(selfLink, Aself_v, lcoor);
for (int i = 0; i < nbasis; ++i)
for (int j = 0; j < nbasis; ++j)
selfLinkEigen(i, j) = static_cast<ComplexD>(TensorRemove(selfLink(i, j)));
selfLinkInvEigen = selfLinkEigen.inverse();
for(int i = 0; i < nbasis; ++i)
for(int j = 0; j < nbasis; ++j)
selfLinkInv(i, j) = selfLinkInvEigen(i, j);
pokeLocalSite(selfLinkInv, AselfInv_v, lcoor);
});
}
void FillHalfCbs() {
std::cout << GridLogDebug << "CoarsenedMatrix::FillHalfCbs" << std::endl;
for(int p = 0; p < geom.npoint; ++p) {
pickCheckerboard(Even, Aeven[p], A[p]);
pickCheckerboard(Odd, Aodd[p], A[p]);
}
pickCheckerboard(Even, AselfInvEven, AselfInv);
pickCheckerboard(Odd, AselfInvOdd, AselfInv);
}
}; };
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -136,7 +136,7 @@ public:
flops=0; flops=0;
usec =0; usec =0;
Coordinate layout(Nd,1); Coordinate layout(Nd,1);
sgrid = new GridCartesian(dimensions,layout,processors,*grid); sgrid = new GridCartesian(dimensions,layout,processors);
}; };
~FFT ( void) { ~FFT ( void) {
@ -182,7 +182,7 @@ public:
pencil_gd[dim] = G*processors[dim]; pencil_gd[dim] = G*processors[dim];
// Pencil global vol LxLxGxLxL per node // Pencil global vol LxLxGxLxL per node
GridCartesian pencil_g(pencil_gd,layout,processors,*vgrid); GridCartesian pencil_g(pencil_gd,layout,processors);
// Construct pencils // Construct pencils
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;

View File

@ -52,7 +52,9 @@ public:
virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base virtual void AdjOp (const Field &in, Field &out) = 0; // Abstract base
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0; virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2)=0;
virtual void HermOp(const Field &in, Field &out)=0; virtual void HermOp(const Field &in, Field &out)=0;
virtual ~LinearOperatorBase(){};
virtual std::vector<int> Directions(void) =0;
virtual std::vector<int> Displacements(void)=0;
}; };
@ -77,6 +79,9 @@ class MdagMLinearOperator : public LinearOperatorBase<Field> {
public: public:
MdagMLinearOperator(Matrix &Mat): _Mat(Mat){}; MdagMLinearOperator(Matrix &Mat): _Mat(Mat){};
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
// Support for coarsening to a multigrid // Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) { void OpDiag (const Field &in, Field &out) {
_Mat.Mdiag(in,out); _Mat.Mdiag(in,out);
@ -112,6 +117,8 @@ class ShiftedMdagMLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat; Matrix &_Mat;
RealD _shift; RealD _shift;
public: public:
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
ShiftedMdagMLinearOperator(Matrix &Mat,RealD shift): _Mat(Mat), _shift(shift){}; ShiftedMdagMLinearOperator(Matrix &Mat,RealD shift): _Mat(Mat), _shift(shift){};
// Support for coarsening to a multigrid // Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) { void OpDiag (const Field &in, Field &out) {
@ -152,6 +159,8 @@ template<class Matrix,class Field>
class HermitianLinearOperator : public LinearOperatorBase<Field> { class HermitianLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat; Matrix &_Mat;
public: public:
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
HermitianLinearOperator(Matrix &Mat): _Mat(Mat){}; HermitianLinearOperator(Matrix &Mat): _Mat(Mat){};
// Support for coarsening to a multigrid // Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) { void OpDiag (const Field &in, Field &out) {
@ -183,6 +192,8 @@ template<class Matrix,class Field>
class NonHermitianLinearOperator : public LinearOperatorBase<Field> { class NonHermitianLinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat; Matrix &_Mat;
public: public:
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
NonHermitianLinearOperator(Matrix &Mat): _Mat(Mat){}; NonHermitianLinearOperator(Matrix &Mat): _Mat(Mat){};
// Support for coarsening to a multigrid // Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) { void OpDiag (const Field &in, Field &out) {
@ -256,6 +267,8 @@ template<class Matrix,class Field>
class SchurDiagMooeeOperator : public SchurOperatorBase<Field> { class SchurDiagMooeeOperator : public SchurOperatorBase<Field> {
public: public:
Matrix &_Mat; Matrix &_Mat;
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){}; SchurDiagMooeeOperator (Matrix &Mat): _Mat(Mat){};
virtual void Mpc (const Field &in, Field &out) { virtual void Mpc (const Field &in, Field &out) {
Field tmp(in.Grid()); Field tmp(in.Grid());
@ -282,6 +295,8 @@ template<class Matrix,class Field>
protected: protected:
Matrix &_Mat; Matrix &_Mat;
public: public:
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){}; SchurDiagOneOperator (Matrix &Mat): _Mat(Mat){};
virtual void Mpc (const Field &in, Field &out) { virtual void Mpc (const Field &in, Field &out) {
@ -308,6 +323,8 @@ template<class Matrix,class Field>
protected: protected:
Matrix &_Mat; Matrix &_Mat;
public: public:
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){}; SchurDiagTwoOperator (Matrix &Mat): _Mat(Mat){};
virtual void Mpc (const Field &in, Field &out) { virtual void Mpc (const Field &in, Field &out) {
@ -373,6 +390,8 @@ class NonHermitianSchurDiagMooeeOperator : public NonHermitianSchurOperatorBase
{ {
public: public:
Matrix& _Mat; Matrix& _Mat;
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
NonHermitianSchurDiagMooeeOperator(Matrix& Mat): _Mat(Mat){}; NonHermitianSchurDiagMooeeOperator(Matrix& Mat): _Mat(Mat){};
virtual void Mpc(const Field& in, Field& out) { virtual void Mpc(const Field& in, Field& out) {
Field tmp(in.Grid()); Field tmp(in.Grid());
@ -406,6 +425,8 @@ class NonHermitianSchurDiagOneOperator : public NonHermitianSchurOperatorBase<Fi
Matrix &_Mat; Matrix &_Mat;
public: public:
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
NonHermitianSchurDiagOneOperator (Matrix& Mat): _Mat(Mat){}; NonHermitianSchurDiagOneOperator (Matrix& Mat): _Mat(Mat){};
virtual void Mpc(const Field& in, Field& out) { virtual void Mpc(const Field& in, Field& out) {
Field tmp(in.Grid()); Field tmp(in.Grid());
@ -436,6 +457,8 @@ class NonHermitianSchurDiagTwoOperator : public NonHermitianSchurOperatorBase<Fi
Matrix& _Mat; Matrix& _Mat;
public: public:
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
NonHermitianSchurDiagTwoOperator(Matrix& Mat): _Mat(Mat){}; NonHermitianSchurDiagTwoOperator(Matrix& Mat): _Mat(Mat){};
virtual void Mpc(const Field& in, Field& out) { virtual void Mpc(const Field& in, Field& out) {
@ -476,6 +499,8 @@ class SchurStaggeredOperator : public SchurOperatorBase<Field> {
Field tmp; Field tmp;
RealD mass; RealD mass;
public: public:
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid()) SchurStaggeredOperator (Matrix &Mat): _Mat(Mat), tmp(_Mat.RedBlackGrid())
{ {
assert( _Mat.isTrivialEE() ); assert( _Mat.isTrivialEE() );
@ -508,7 +533,7 @@ class SchurStaggeredOperator : public SchurOperatorBase<Field> {
virtual void MpcDag (const Field &in, Field &out){ virtual void MpcDag (const Field &in, Field &out){
Mpc(in,out); Mpc(in,out);
} }
virtual void MpcDagMpc(const Field &in, Field &out) { virtual void MpcDagMpc(const Field &in, Field &out,RealD &ni,RealD &no) {
assert(0);// Never need with staggered assert(0);// Never need with staggered
} }
}; };
@ -531,16 +556,6 @@ public:
template<class Field> class LinearFunction { template<class Field> class LinearFunction {
public: public:
virtual void operator() (const Field &in, Field &out) = 0; virtual void operator() (const Field &in, Field &out) = 0;
virtual void operator() (const std::vector<Field> &in, std::vector<Field> &out)
{
assert(in.size() == out.size());
for (unsigned int i = 0; i < in.size(); ++i)
{
(*this)(in[i], out[i]);
}
}
}; };
template<class Field> class IdentityLinearFunction : public LinearFunction<Field> { template<class Field> class IdentityLinearFunction : public LinearFunction<Field> {
@ -586,7 +601,6 @@ class HermOpOperatorFunction : public OperatorFunction<Field> {
template<typename Field> template<typename Field>
class PlainHermOp : public LinearFunction<Field> { class PlainHermOp : public LinearFunction<Field> {
public: public:
using LinearFunction<Field>::operator();
LinearOperatorBase<Field> &_Linop; LinearOperatorBase<Field> &_Linop;
PlainHermOp(LinearOperatorBase<Field>& linop) : _Linop(linop) PlainHermOp(LinearOperatorBase<Field>& linop) : _Linop(linop)
@ -600,7 +614,6 @@ public:
template<typename Field> template<typename Field>
class FunctionHermOp : public LinearFunction<Field> { class FunctionHermOp : public LinearFunction<Field> {
public: public:
using LinearFunction<Field>::operator();
OperatorFunction<Field> & _poly; OperatorFunction<Field> & _poly;
LinearOperatorBase<Field> &_Linop; LinearOperatorBase<Field> &_Linop;

View File

@ -30,19 +30,13 @@ Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
template<class Field> using Preconditioner = LinearFunction<Field> ; template<class Field> class Preconditioner : public LinearFunction<Field> {
/*
template<class Field> class Preconditioner : public LinearFunction<Field> {
using LinearFunction<Field>::operator();
virtual void operator()(const Field &src, Field & psi)=0; virtual void operator()(const Field &src, Field & psi)=0;
}; };
*/
template<class Field> class TrivialPrecon : public Preconditioner<Field> { template<class Field> class TrivialPrecon : public Preconditioner<Field> {
public: public:
using Preconditioner<Field>::operator(); void operator()(const Field &src, Field & psi){
virtual void operator()(const Field &src, Field & psi){
psi = src; psi = src;
} }
TrivialPrecon(void){}; TrivialPrecon(void){};

View File

@ -48,7 +48,8 @@ public:
virtual void Mdiag (const Field &in, Field &out)=0; virtual void Mdiag (const Field &in, Field &out)=0;
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0; virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
virtual void MdirAll (const Field &in, std::vector<Field> &out)=0; virtual void MdirAll (const Field &in, std::vector<Field> &out)=0;
virtual ~SparseMatrixBase() {}; virtual std::vector<int> Directions(void) =0;
virtual std::vector<int> Displacements(void)=0;
}; };
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
@ -73,7 +74,9 @@ public:
virtual void MeooeDag (const Field &in, Field &out)=0; virtual void MeooeDag (const Field &in, Field &out)=0;
virtual void MooeeDag (const Field &in, Field &out)=0; virtual void MooeeDag (const Field &in, Field &out)=0;
virtual void MooeeInvDag (const Field &in, Field &out)=0; virtual void MooeeInvDag (const Field &in, Field &out)=0;
virtual ~CheckerBoardedSparseMatrixBase() {};
virtual std::vector<int> Directions(void) =0;
virtual std::vector<int> Displacements(void)=0;
}; };
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -292,7 +292,6 @@ public:
template<class Field> template<class Field>
class ChebyshevLanczos : public Chebyshev<Field> { class ChebyshevLanczos : public Chebyshev<Field> {
private: private:
std::vector<RealD> Coeffs; std::vector<RealD> Coeffs;
int order; int order;
RealD alpha; RealD alpha;

View File

@ -1,4 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -28,6 +28,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_ALGORITHMS_ITERATIVE_GENERIC_PCG #ifndef GRID_ALGORITHMS_ITERATIVE_GENERIC_PCG
#define GRID_ALGORITHMS_ITERATIVE_GENERIC_PCG #define GRID_ALGORITHMS_ITERATIVE_GENERIC_PCG
NAMESPACE_BEGIN(Grid);
/* /*
* Compared to Tang-2009: P=Pleft. P^T = PRight Q=MssInv. * Compared to Tang-2009: P=Pleft. P^T = PRight Q=MssInv.
* Script A = SolverMatrix * Script A = SolverMatrix
@ -50,53 +51,54 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
* Vout = x * Vout = x
*/ */
// abstract base
template<class Field, class CoarseField> template<class Field, class CoarseField, class Aggregates>
class TwoLevelFlexiblePcg : public LinearFunction<Field> class TwoLevelFlexiblePcg : public LinearFunction<Field>
{ {
public: public:
int verbose; int verbose;
RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
const int mmax = 5; const int mmax = 4;
GridBase *grid; GridBase *FineGrid;
GridBase *coarsegrid; GridBase *CoarseGrid;
LinearOperatorBase<Field> *_Linop LinearOperatorBase<Field> &_Linop;
OperatorFunction<Field> *_Smoother, LinearFunction<Field> &_Smoother;
LinearFunction<CoarseField> *_CoarseSolver; LinearFunction<CoarseField> &_CoarseSolver;
Aggregates &_Aggregates;
// Need somthing that knows how to get from Coarse to fine and back again
// more most opertor functions // more most opertor functions
TwoLevelFlexiblePcg(RealD tol, TwoLevelFlexiblePcg(RealD tol,
Integer maxit, Integer maxit,
LinearOperatorBase<Field> *Linop, LinearOperatorBase<Field> *Linop,
LinearOperatorBase<Field> *SmootherLinop, LinearFunction<Field> *Smoother,
OperatorFunction<Field> *Smoother, LinearFunction<CoarseField> *CoarseSolver,
OperatorFunction<CoarseField> CoarseLinop Aggregates *AggP
) : ) :
Tolerance(tol), Tolerance(tol),
MaxIterations(maxit), MaxIterations(maxit),
_Linop(Linop), _Linop(*Linop),
_PreconditionerLinop(PrecLinop), _Smoother(*Smoother),
_Preconditioner(Preconditioner) _CoarseSolver(*CoarseSolver),
_Aggregates(*AggP)
{ {
CoarseGrid=_Aggregates.CoarseGrid;
FineGrid=_Aggregates.FineGrid;
verbose=0; verbose=0;
}; };
// The Pcg routine is common to all, but the various matrices differ from derived // The Pcg routine is common to all, but the various matrices differ from derived
// implementation to derived implmentation // implementation to derived implmentation
void operator() (const Field &src, Field &psi){
void operator() (const Field &src, Field &psi){ void operator() (const Field &src, Field &psi){
psi.Checkerboard() = src.Checkerboard(); psi.Checkerboard() = src.Checkerboard();
grid = src.Grid();
RealD f;
RealD rtzp,rtz,a,d,b; RealD rtzp,rtz,a,d,b;
RealD rptzp; // RealD rptzp;
RealD tn; // RealD tn;
RealD guess = norm2(psi); RealD guess = norm2(psi);
RealD ssq = norm2(src); RealD ssq = norm2(src);
RealD rsq = ssq*Tolerance*Tolerance; RealD rsq = ssq*Tolerance*Tolerance;
@ -104,15 +106,15 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
///////////////////////////// /////////////////////////////
// Set up history vectors // Set up history vectors
///////////////////////////// /////////////////////////////
std::vector<Field> p (mmax,grid); std::vector<Field> p (mmax,FineGrid);
std::vector<Field> mmp(mmax,grid); std::vector<Field> mmp(mmax,FineGrid);
std::vector<RealD> pAp(mmax); std::vector<RealD> pAp(mmax);
Field x (grid); x = psi; Field x (FineGrid); x = psi;
Field z (grid); Field z (FineGrid);
Field tmp(grid); Field tmp(FineGrid);
Field r (grid); Field r (FineGrid);
Field mu (grid); Field mu (FineGrid);
////////////////////////// //////////////////////////
// x0 = Vstart -- possibly modify guess // x0 = Vstart -- possibly modify guess
@ -121,13 +123,13 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
Vstart(x,src); Vstart(x,src);
// r0 = b -A x0 // r0 = b -A x0
HermOp(x,mmp); // Shouldn't this be something else? _Linop.HermOp(x,mmp[0]); // Shouldn't this be something else?
axpy (r, -1.0,mmp[0], src); // Recomputes r=src-Ax0 axpy (r, -1.0,mmp[0], src); // Recomputes r=src-Ax0
////////////////////////////////// //////////////////////////////////
// Compute z = M1 x // Compute z = M1 x
////////////////////////////////// //////////////////////////////////
M1(r,z,tmp,mp,SmootherMirs); M1(r,z);
rtzp =real(innerProduct(r,z)); rtzp =real(innerProduct(r,z));
/////////////////////////////////////// ///////////////////////////////////////
@ -143,7 +145,7 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
int peri_kp = (k+1) % mmax; int peri_kp = (k+1) % mmax;
rtz=rtzp; rtz=rtzp;
d= M3(p[peri_k],mp,mmp[peri_k],tmp); d= M3(p[peri_k],mmp[peri_k]);
a = rtz/d; a = rtz/d;
// Memorise this // Memorise this
@ -153,13 +155,13 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
RealD rn = axpy_norm(r,-a,mmp[peri_k],r); RealD rn = axpy_norm(r,-a,mmp[peri_k],r);
// Compute z = M x // Compute z = M x
M1(r,z,tmp,mp); M1(r,z);
rtzp =real(innerProduct(r,z)); rtzp =real(innerProduct(r,z));
M2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate M2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate
p[peri_kp]=p[peri_k]; p[peri_kp]=mu;
// Standard search direction p -> z + b p ; b = // Standard search direction p -> z + b p ; b =
b = (rtzp)/rtz; b = (rtzp)/rtz;
@ -181,7 +183,7 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
// Stopping condition // Stopping condition
if ( rn <= rsq ) { if ( rn <= rsq ) {
HermOp(x,mmp); // Shouldn't this be something else? _Linop.HermOp(x,mmp[0]); // Shouldn't this be something else?
axpy(tmp,-1.0,src,mmp[0]); axpy(tmp,-1.0,src,mmp[0]);
RealD psinorm = sqrt(norm2(x)); RealD psinorm = sqrt(norm2(x));
@ -190,7 +192,8 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
RealD true_residual = tmpnorm/srcnorm; RealD true_residual = tmpnorm/srcnorm;
std::cout<<GridLogMessage<<"TwoLevelfPcg: true residual is "<<true_residual<<std::endl; std::cout<<GridLogMessage<<"TwoLevelfPcg: true residual is "<<true_residual<<std::endl;
std::cout<<GridLogMessage<<"TwoLevelfPcg: target residual was"<<Tolerance<<std::endl; std::cout<<GridLogMessage<<"TwoLevelfPcg: target residual was"<<Tolerance<<std::endl;
return k;
return;
} }
} }
// Non-convergence // Non-convergence
@ -199,48 +202,40 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
public: public:
virtual void M(Field & in,Field & out,Field & tmp) { virtual void M1(Field & in, Field & out)
{// the smoother
}
virtual void M1(Field & in, Field & out) {// the smoother
// [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min] // [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
Field tmp(grid); Field tmp(FineGrid);
Field Min(grid); Field Min(FineGrid);
PcgM(in,Min); // Smoother call CoarseField PleftProj(CoarseGrid);
CoarseField PleftMss_proj(CoarseGrid);
HermOp(Min,out); _Smoother(in,Min); // Smoother call
_Linop.HermOp(Min,out);
axpy(tmp,-1.0,out,in); // tmp = in - A Min axpy(tmp,-1.0,out,in); // tmp = in - A Min
ProjectToSubspace(tmp,PleftProj); _Aggregates.ProjectToSubspace(PleftProj,tmp);
ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s _CoarseSolver(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min] _Aggregates.PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]
axpy(out,1.0,Min,tmp); // Min+tmp axpy(out,1.0,Min,tmp); // Min+tmp
} }
virtual void M2(const Field & in, Field & out) { virtual void M2(const Field & in, Field & out)
{
out=in; out=in;
// Must override for Def2 only
// case PcgDef2:
// Pright(in,out);
// break;
} }
virtual RealD M3(const Field & p, Field & mmp){ virtual RealD M3(const Field & p, Field & mmp)
{
double d,dd; double d,dd;
HermOpAndNorm(p,mmp,d,dd); _Linop.HermOpAndNorm(p,mmp,d,dd);
return dd; return dd;
// Must override for Def1 only
// case PcgDef1:
// d=linop_d->Mprec(p,mmp,tmp,0,1);// Dag no
// linop_d->Mprec(mmp,mp,tmp,1);// Dag yes
// Pleft(mp,mmp);
// d=real(linop_d->inner(p,mmp));
} }
virtual void VstartDef2(Field & xconst Field & src){ virtual void Vstart(Field & x,const Field & src)
{
//case PcgDef2: //case PcgDef2:
//case PcgAdef2: //case PcgAdef2:
//case PcgAdef2f: //case PcgAdef2f:
@ -256,142 +251,79 @@ class TwoLevelFlexiblePcg : public LinearFunction<Field>
// = src_s - (A guess)_s - src_s + (A guess)_s // = src_s - (A guess)_s - src_s + (A guess)_s
// = 0 // = 0
/////////////////////////////////// ///////////////////////////////////
Field r(grid); Field r(FineGrid);
Field mmp(grid); Field mmp(FineGrid);
CoarseField PleftProj(CoarseGrid);
CoarseField PleftMss_proj(CoarseGrid);
HermOp(x,mmp); _Linop.HermOp(x,mmp);
axpy (r, -1.0, mmp, src); // r_{-1} = src - A x axpy (r, -1.0, mmp, src); // r_{-1} = src - A x
ProjectToSubspace(r,PleftProj); _Aggregates.ProjectToSubspace(PleftProj,r);
ApplyInverseCG(PleftProj,PleftMss_proj); // Ass^{-1} r_s _CoarseSolver(PleftProj,PleftMss_proj); // Ass^{-1} r_s
PromoteFromSubspace(PleftMss_proj,mmp); _Aggregates.PromoteFromSubspace(PleftMss_proj,mmp);
x=x+mmp; x=x+mmp;
} }
virtual void Vstart(Field & x,const Field & src){
return;
}
///////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////
// Only Def1 has non-trivial Vout. Override in Def1 // Only Def1 has non-trivial Vout. Override in Def1
///////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////
virtual void Vout (Field & in, Field & out,Field & src){ virtual void Vout (Field & in, Field & out,Field & src){
out = in; out = in;
//case PcgDef1:
// //Qb + PT x
// ProjectToSubspace(src,PleftProj);
// ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} r_s
// PromoteFromSubspace(PleftMss_proj,tmp);
//
// Pright(in,out);
//
// linop_d->axpy(out,tmp,out,1.0);
// break;
} }
//////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////
// Pright and Pleft are common to all implementations // Pright and Pleft are common to all implementations
//////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////
virtual void Pright(Field & in,Field & out){ virtual void Pright(Field & in,Field & out)
{
// P_R = [ 1 0 ] // P_R = [ 1 0 ]
// [ -Mss^-1 Msb 0 ] // [ -Mss^-1 Msb 0 ]
Field in_sbar(grid); Field in_sbar(FineGrid);
ProjectToSubspace(in,PleftProj); CoarseField PleftProj(CoarseGrid);
PromoteFromSubspace(PleftProj,out); CoarseField PleftMss_proj(CoarseGrid);
_Aggregates.ProjectToSubspace(PleftProj,in);
_Aggregates.PromoteFromSubspace(PleftProj,out);
axpy(in_sbar,-1.0,out,in); // in_sbar = in - in_s axpy(in_sbar,-1.0,out,in); // in_sbar = in - in_s
HermOp(in_sbar,out); _Linop.HermOp(in_sbar,out);
ProjectToSubspace(out,PleftProj); // Mssbar in_sbar (project) _Aggregates.ProjectToSubspace(PleftProj,out); // Mssbar in_sbar (project)
ApplyInverse (PleftProj,PleftMss_proj); // Mss^{-1} Mssbar _CoarseSolver(PleftProj,PleftMss_proj); // Mss^{-1} Mssbar
PromoteFromSubspace(PleftMss_proj,out); // _Aggregates.PromoteFromSubspace(PleftMss_proj,out); //
axpy(out,-1.0,out,in_sbar); // in_sbar - Mss^{-1} Mssbar in_sbar axpy(out,-1.0,out,in_sbar); // in_sbar - Mss^{-1} Mssbar in_sbar
} }
virtual void Pleft (Field & in,Field & out){ virtual void Pleft (Field & in,Field & out)
{
// P_L = [ 1 -Mbs Mss^-1] // P_L = [ 1 -Mbs Mss^-1]
// [ 0 0 ] // [ 0 0 ]
Field in_sbar(grid); Field in_sbar(FineGrid);
Field tmp2(grid); Field tmp2(FineGrid);
Field Mtmp(grid); Field Mtmp(FineGrid);
ProjectToSubspace(in,PleftProj); CoarseField PleftProj(CoarseGrid);
PromoteFromSubspace(PleftProj,out); CoarseField PleftMss_proj(CoarseGrid);
_Aggregates.ProjectToSubspace(PleftProj,in);
_Aggregates.PromoteFromSubspace(PleftProj,out);
axpy(in_sbar,-1.0,out,in); // in_sbar = in - in_s axpy(in_sbar,-1.0,out,in); // in_sbar = in - in_s
ApplyInverse(PleftProj,PleftMss_proj); // Mss^{-1} in_s _CoarseSolver(PleftProj,PleftMss_proj); // Mss^{-1} in_s
PromoteFromSubspace(PleftMss_proj,out); _Aggregates.PromoteFromSubspace(PleftMss_proj,out);
HermOp(out,Mtmp); _Linop.HermOp(out,Mtmp);
ProjectToSubspace(Mtmp,PleftProj); // Msbar s Mss^{-1} _Aggregates.ProjectToSubspace(PleftProj,Mtmp); // Msbar s Mss^{-1}
PromoteFromSubspace(PleftProj,tmp2); _Aggregates.PromoteFromSubspace(PleftProj,tmp2);
axpy(out,-1.0,tmp2,Mtmp); axpy(out,-1.0,tmp2,Mtmp);
axpy(out,-1.0,out,in_sbar); // in_sbar - Msbars Mss^{-1} in_s axpy(out,-1.0,out,in_sbar); // in_sbar - Msbars Mss^{-1} in_s
} }
} };
NAMESPACE_END(Grid);
template<class Field>
class TwoLevelFlexiblePcgADef2 : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp){
}
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp){
}
virtual void M2(Field & in, Field & out){
}
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp){
}
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp){
}
}
/*
template<class Field>
class TwoLevelFlexiblePcgAD : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
}
template<class Field>
class TwoLevelFlexiblePcgDef1 : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
virtual void Vout (Field & in, Field & out,Field & src,Field & tmp);
}
template<class Field>
class TwoLevelFlexiblePcgDef2 : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
}
template<class Field>
class TwoLevelFlexiblePcgV11: public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
}
*/
#endif #endif

View File

@ -36,8 +36,7 @@ NAMESPACE_BEGIN(Grid);
template<class FieldD, class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> template<class FieldD, class FieldF, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionBiCGSTAB : public LinearFunction<FieldD> class MixedPrecisionBiCGSTAB : public LinearFunction<FieldD>
{ {
public: public:
using LinearFunction<FieldD>::operator();
RealD Tolerance; RealD Tolerance;
RealD InnerTolerance; // Initial tolerance for inner CG. Defaults to Tolerance but can be changed RealD InnerTolerance; // Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations; Integer MaxInnerIterations;

View File

@ -35,8 +35,7 @@ NAMESPACE_BEGIN(Grid);
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0, typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0> typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> { class MixedPrecisionConjugateGradient : public LinearFunction<FieldD> {
public: public:
using LinearFunction<FieldD>::operator();
RealD Tolerance; RealD Tolerance;
RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations; Integer MaxInnerIterations;
@ -49,7 +48,6 @@ NAMESPACE_BEGIN(Grid);
Integer TotalInnerIterations; //Number of inner CG iterations Integer TotalInnerIterations; //Number of inner CG iterations
Integer TotalOuterIterations; //Number of restarts Integer TotalOuterIterations; //Number of restarts
Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
RealD TrueResidual;
//Option to speed up *inner single precision* solves using a LinearFunction that produces a guess //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
LinearFunction<FieldF> *guesser; LinearFunction<FieldF> *guesser;
@ -69,7 +67,6 @@ NAMESPACE_BEGIN(Grid);
} }
void operator() (const FieldD &src_d_in, FieldD &sol_d){ void operator() (const FieldD &src_d_in, FieldD &sol_d){
std::cout << GridLogMessage << "MixedPrecisionConjugateGradient: Starting mixed precision CG with outer tolerance " << Tolerance << " and inner tolerance " << InnerTolerance << std::endl;
TotalInnerIterations = 0; TotalInnerIterations = 0;
GridStopWatch TotalTimer; GridStopWatch TotalTimer;
@ -82,11 +79,6 @@ NAMESPACE_BEGIN(Grid);
RealD stop = src_norm * Tolerance*Tolerance; RealD stop = src_norm * Tolerance*Tolerance;
GridBase* DoublePrecGrid = src_d_in.Grid(); GridBase* DoublePrecGrid = src_d_in.Grid();
//Generate precision change workspaces
precisionChangeWorkspace wk_dp_from_sp(DoublePrecGrid, SinglePrecGrid);
precisionChangeWorkspace wk_sp_from_dp(SinglePrecGrid, DoublePrecGrid);
FieldD tmp_d(DoublePrecGrid); FieldD tmp_d(DoublePrecGrid);
tmp_d.Checkerboard() = cb; tmp_d.Checkerboard() = cb;
@ -104,7 +96,6 @@ NAMESPACE_BEGIN(Grid);
FieldF sol_f(SinglePrecGrid); FieldF sol_f(SinglePrecGrid);
sol_f.Checkerboard() = cb; sol_f.Checkerboard() = cb;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting initial inner CG with tolerance " << inner_tol << std::endl;
ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations); ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
CG_f.ErrorOnNoConverge = false; CG_f.ErrorOnNoConverge = false;
@ -128,7 +119,7 @@ NAMESPACE_BEGIN(Grid);
while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ?? while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
PrecChangeTimer.Start(); PrecChangeTimer.Start();
precisionChange(src_f, src_d, wk_sp_from_dp); precisionChange(src_f, src_d);
PrecChangeTimer.Stop(); PrecChangeTimer.Stop();
sol_f = Zero(); sol_f = Zero();
@ -138,7 +129,6 @@ NAMESPACE_BEGIN(Grid);
(*guesser)(src_f, sol_f); (*guesser)(src_f, sol_f);
//Inner CG //Inner CG
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " << outer_iter << " starting inner CG with tolerance " << inner_tol << std::endl;
CG_f.Tolerance = inner_tol; CG_f.Tolerance = inner_tol;
InnerCGtimer.Start(); InnerCGtimer.Start();
CG_f(Linop_f, src_f, sol_f); CG_f(Linop_f, src_f, sol_f);
@ -147,7 +137,7 @@ NAMESPACE_BEGIN(Grid);
//Convert sol back to double and add to double prec solution //Convert sol back to double and add to double prec solution
PrecChangeTimer.Start(); PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f, wk_dp_from_sp); precisionChange(tmp_d, sol_f);
PrecChangeTimer.Stop(); PrecChangeTimer.Stop();
axpy(sol_d, 1.0, tmp_d, sol_d); axpy(sol_d, 1.0, tmp_d, sol_d);
@ -159,7 +149,6 @@ NAMESPACE_BEGIN(Grid);
ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations); ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
CG_d(Linop_d, src_d_in, sol_d); CG_d(Linop_d, src_d_in, sol_d);
TotalFinalStepIterations = CG_d.IterationsToComplete; TotalFinalStepIterations = CG_d.IterationsToComplete;
TrueResidual = CG_d.TrueResidual;
TotalTimer.Stop(); TotalTimer.Stop();
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl; std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;

View File

@ -52,7 +52,7 @@ public:
MultiShiftFunction shifts; MultiShiftFunction shifts;
std::vector<RealD> TrueResidualShift; std::vector<RealD> TrueResidualShift;
ConjugateGradientMultiShift(Integer maxit, const MultiShiftFunction &_shifts) : ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) :
MaxIterations(maxit), MaxIterations(maxit),
shifts(_shifts) shifts(_shifts)
{ {
@ -182,9 +182,6 @@ public:
for(int s=0;s<nshift;s++) { for(int s=0;s<nshift;s++) {
axpby(psi[s],0.,-bs[s]*alpha[s],src,src); axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
} }
std::cout << GridLogIterative << "ConjugateGradientMultiShift: initial rn (|src|^2) =" << rn << " qq (|MdagM src|^2) =" << qq << " d ( dot(src, [MdagM + m_0]src) ) =" << d << " c=" << c << std::endl;
/////////////////////////////////////// ///////////////////////////////////////
// Timers // Timers

View File

@ -1,411 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christopher Kelly <ckelly@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
#define GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
NAMESPACE_BEGIN(Grid);
//CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision.
//The residual is stored in single precision, but the search directions and solution are stored in double precision.
//Every update_freq iterations the residual is corrected in double precision.
//For safety the a final regular CG is applied to clean up if necessary
//Linop to add shift to input linop, used in cleanup CG
namespace ConjugateGradientMultiShiftMixedPrecSupport{
template<typename Field>
class ShiftedLinop: public LinearOperatorBase<Field>{
public:
LinearOperatorBase<Field> &linop_base;
RealD shift;
ShiftedLinop(LinearOperatorBase<Field> &_linop_base, RealD _shift): linop_base(_linop_base), shift(_shift){}
void OpDiag (const Field &in, Field &out){ assert(0); }
void OpDir (const Field &in, Field &out,int dir,int disp){ assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); }
void Op (const Field &in, Field &out){ assert(0); }
void AdjOp (const Field &in, Field &out){ assert(0); }
void HermOp(const Field &in, Field &out){
linop_base.HermOp(in, out);
axpy(out, shift, in, out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
HermOp(in,out);
ComplexD dot = innerProduct(in,out);
n1=real(dot);
n2=norm2(out);
}
};
};
template<class FieldD, class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class ConjugateGradientMultiShiftMixedPrec : public OperatorMultiFunction<FieldD>,
public OperatorFunction<FieldD>
{
public:
using OperatorFunction<FieldD>::operator();
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
std::vector<int> IterationsToCompleteShift; // Iterations for this shift
int verbose;
MultiShiftFunction shifts;
std::vector<RealD> TrueResidualShift;
int ReliableUpdateFreq; //number of iterations between reliable updates
GridBase* SinglePrecGrid; //Grid for single-precision fields
LinearOperatorBase<FieldF> &Linop_f; //single precision
ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
int _ReliableUpdateFreq
) :
MaxIterations(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq)
{
verbose=1;
IterationsToCompleteShift.resize(_shifts.order);
TrueResidualShift.resize(_shifts.order);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
{
GridBase *grid = src.Grid();
int nshift = shifts.order;
std::vector<FieldD> results(nshift,grid);
(*this)(Linop,src,results,psi);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
{
int nshift = shifts.order;
(*this)(Linop,src,results);
psi = shifts.norm*src;
for(int i=0;i<nshift;i++){
psi = psi + shifts.residues[i]*results[i];
}
return;
}
void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
{
GridBase *DoublePrecGrid = src_d.Grid();
precisionChangeWorkspace wk_f_from_d(SinglePrecGrid, DoublePrecGrid);
precisionChangeWorkspace wk_d_from_f(DoublePrecGrid, SinglePrecGrid);
////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction"
////////////////////////////////////////////////////////////////////////
int nshift = shifts.order;
std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
std::vector<RealD> &mresidual(shifts.tolerances);
std::vector<RealD> alpha(nshift,1.0);
//Double precision search directions
FieldD p_d(DoublePrecGrid);
std::vector<FieldD> ps_d(nshift, DoublePrecGrid);// Search directions (double precision)
FieldD tmp_d(DoublePrecGrid);
FieldD r_d(DoublePrecGrid);
FieldD mmp_d(DoublePrecGrid);
assert(psi_d.size()==nshift);
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift];
RealD rsq[nshift];
RealD z[nshift][2];
int converged[nshift];
const int primary =0;
//Primary shift fields CG iteration
RealD a,b,c,d;
RealD cp,bp,qq; //prev
// Matrix mult fields
FieldF r_f(SinglePrecGrid);
FieldF p_f(SinglePrecGrid);
FieldF tmp_f(SinglePrecGrid);
FieldF mmp_f(SinglePrecGrid);
FieldF src_f(SinglePrecGrid);
precisionChange(src_f, src_d, wk_f_from_d);
// Check lightest mass
for(int s=0;s<nshift;s++){
assert( mass[s]>= mass[primary] );
converged[s]=0;
}
// Wire guess to zero
// Residuals "r" are src
// First search direction "p" is also src
cp = norm2(src_d);
// Handle trivial case of zero src.
if( cp == 0. ){
for(int s=0;s<nshift;s++){
psi_d[s] = Zero();
IterationsToCompleteShift[s] = 1;
TrueResidualShift[s] = 0.;
}
return;
}
for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
ps_d[s] = src_d;
}
// r and p for primary
r_f=src_f; //residual maintained in single
p_f=src_f;
p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
//MdagM+m[0]
Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
axpy(mmp_f,mass[0],p_f,mmp_f);
RealD rn = norm2(p_f);
d += rn*mass[0];
b = -cp /d;
// Set up the various shift variables
int iz=0;
z[0][1-iz] = 1.0;
z[0][iz] = 1.0;
bs[0] = b;
for(int s=1;s<nshift;s++){
z[s][1-iz] = 1.0;
z[s][iz] = 1.0/( 1.0 - b*(mass[s]-mass[0]));
bs[s] = b*z[s][iz];
}
// r += b[0] A.p[0]
// c= norm(r)
c=axpy_norm(r_f,b,mmp_f,r_f);
for(int s=0;s<nshift;s++) {
axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
}
///////////////////////////////////////
// Timers
///////////////////////////////////////
GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
SolverTimer.Start();
// Iteration loop
int k;
for (k=1;k<=MaxIterations;k++){
a = c /cp;
//Update double precision search direction by residual
PrecChangeTimer.Start();
precisionChange(r_d, r_f, wk_d_from_f);
PrecChangeTimer.Stop();
AXPYTimer.Start();
axpy(p_d,a,p_d,r_d);
for(int s=0;s<nshift;s++){
if ( ! converged[s] ) {
if (s==0){
axpy(ps_d[s],a,ps_d[s],r_d);
} else{
RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
axpby(ps_d[s],z[s][iz],as,r_d,ps_d[s]);
}
}
}
AXPYTimer.Stop();
PrecChangeTimer.Start();
precisionChange(p_f, p_d, wk_f_from_d); //get back single prec search direction for linop
PrecChangeTimer.Stop();
cp=c;
MatrixTimer.Start();
Linop_f.HermOp(p_f,mmp_f);
d=real(innerProduct(p_f,mmp_f));
MatrixTimer.Stop();
AXPYTimer.Start();
axpy(mmp_f,mass[0],p_f,mmp_f);
AXPYTimer.Stop();
RealD rn = norm2(p_f);
d += rn*mass[0];
bp=b;
b=-cp/d;
// Toggle the recurrence history
bs[0] = b;
iz = 1-iz;
ShiftTimer.Start();
for(int s=1;s<nshift;s++){
if((!converged[s])){
RealD z0 = z[s][1-iz];
RealD z1 = z[s][iz];
z[s][iz] = z0*z1*bp
/ (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
}
}
ShiftTimer.Stop();
//Update double precision solutions
AXPYTimer.Start();
for(int s=0;s<nshift;s++){
int ss = s;
if( (!converged[s]) ) {
axpy(psi_d[ss],-bs[s]*alpha[s],ps_d[s],psi_d[ss]);
}
}
//Perform reliable update if necessary; otherwise update residual from single-prec mmp
RealD c_f = axpy_norm(r_f,b,mmp_f,r_f);
AXPYTimer.Stop();
c = c_f;
if(k % ReliableUpdateFreq == 0){
//Replace r with true residual
MatrixTimer.Start();
Linop_d.HermOp(psi_d[0],mmp_d);
MatrixTimer.Stop();
AXPYTimer.Start();
axpy(mmp_d,mass[0],psi_d[0],mmp_d);
RealD c_d = axpy_norm(r_d, -1.0, mmp_d, src_d);
AXPYTimer.Stop();
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<< ", replaced |r|^2 = "<<c_f <<" with |r|^2 = "<<c_d<<std::endl;
PrecChangeTimer.Start();
precisionChange(r_f, r_d, wk_f_from_d);
PrecChangeTimer.Stop();
c = c_d;
}
// Convergence checks
int all_converged = 1;
for(int s=0;s<nshift;s++){
if ( (!converged[s]) ){
IterationsToCompleteShift[s] = k;
RealD css = c * z[s][iz]* z[s][iz];
if(css<rsq[s]){
if ( ! converged[s] )
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
converged[s]=1;
} else {
all_converged=0;
}
}
}
if ( all_converged ){
SolverTimer.Stop();
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
// Check answers
for(int s=0; s < nshift; s++) {
Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
axpy(tmp_d,mass[s],psi_d[s],mmp_d);
axpy(r_d,-alpha[s],src_d,tmp_d);
RealD rn = norm2(r_d);
RealD cn = norm2(src_d);
TrueResidualShift[s] = std::sqrt(rn/cn);
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
//If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
if(rn >= rsq[s]){
CleanupTimer.Start();
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: performing cleanup step for shift " << s << std::endl;
//Setup linear operators for final cleanup
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d);
cg(src_d, psi_d[s]);
TrueResidualShift[s] = cg.TrueResidual;
CleanupTimer.Stop();
}
}
std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrec: Time Breakdown for body"<<std::endl;
std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tShift " << ShiftTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
IterationsToComplete = k;
return;
}
}
// ugly hack
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
// assert(0);
}
};
NAMESPACE_END(Grid);
#endif

View File

@ -33,19 +33,16 @@ namespace Grid {
template<class Field> template<class Field>
class ZeroGuesser: public LinearFunction<Field> { class ZeroGuesser: public LinearFunction<Field> {
public: public:
using LinearFunction<Field>::operator();
virtual void operator()(const Field &src, Field &guess) { guess = Zero(); }; virtual void operator()(const Field &src, Field &guess) { guess = Zero(); };
}; };
template<class Field> template<class Field>
class DoNothingGuesser: public LinearFunction<Field> { class DoNothingGuesser: public LinearFunction<Field> {
public: public:
using LinearFunction<Field>::operator();
virtual void operator()(const Field &src, Field &guess) { }; virtual void operator()(const Field &src, Field &guess) { };
}; };
template<class Field> template<class Field>
class SourceGuesser: public LinearFunction<Field> { class SourceGuesser: public LinearFunction<Field> {
public: public:
using LinearFunction<Field>::operator();
virtual void operator()(const Field &src, Field &guess) { guess = src; }; virtual void operator()(const Field &src, Field &guess) { guess = src; };
}; };
@ -57,29 +54,24 @@ class DeflatedGuesser: public LinearFunction<Field> {
private: private:
const std::vector<Field> &evec; const std::vector<Field> &evec;
const std::vector<RealD> &eval; const std::vector<RealD> &eval;
const unsigned int N;
public: public:
using LinearFunction<Field>::operator();
DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {};
: DeflatedGuesser(_evec, _eval, _evec.size())
{}
DeflatedGuesser(const std::vector<Field> & _evec, const std::vector<RealD> & _eval, const unsigned int _N)
: evec(_evec), eval(_eval), N(_N)
{
assert(evec.size()==eval.size());
assert(N <= evec.size());
}
virtual void operator()(const Field &src,Field &guess) { virtual void operator()(const Field &src,Field &guess) {
RealD t=-usecond();
guess = Zero(); guess = Zero();
assert(evec.size()==eval.size());
auto N = evec.size();
for (int i=0;i<N;i++) { for (int i=0;i<N;i++) {
const Field& tmp = evec[i]; const Field& tmp = evec[i];
axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess); axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
} }
guess.Checkerboard() = src.Checkerboard(); guess.Checkerboard() = src.Checkerboard();
t+=usecond();
std::cout<<GridLogMessage<<"\t\t\t" << "Deflated guess took "<< t/1000.0<< "ms" <<std::endl;
} }
}; };
@ -91,7 +83,6 @@ private:
const std::vector<RealD> &eval_coarse; const std::vector<RealD> &eval_coarse;
public: public:
using LinearFunction<FineField>::operator();
LocalCoherenceDeflatedGuesser(const std::vector<FineField> &_subspace, LocalCoherenceDeflatedGuesser(const std::vector<FineField> &_subspace,
const std::vector<CoarseField> &_evec_coarse, const std::vector<CoarseField> &_evec_coarse,
const std::vector<RealD> &_eval_coarse) const std::vector<RealD> &_eval_coarse)

View File

@ -44,7 +44,6 @@ public:
int, MinRes); // Must restart int, MinRes); // Must restart
}; };
//This class is the input parameter class for some testing programs
struct LocalCoherenceLanczosParams : Serializable { struct LocalCoherenceLanczosParams : Serializable {
public: public:
GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams, GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
@ -68,7 +67,6 @@ public:
template<class Fobj,class CComplex,int nbasis> template<class Fobj,class CComplex,int nbasis>
class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > { class ProjectedHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
public: public:
using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
typedef iVector<CComplex,nbasis > CoarseSiteVector; typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField; typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
@ -99,7 +97,6 @@ public:
template<class Fobj,class CComplex,int nbasis> template<class Fobj,class CComplex,int nbasis>
class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > { class ProjectedFunctionHermOp : public LinearFunction<Lattice<iVector<CComplex,nbasis > > > {
public: public:
using LinearFunction<Lattice<iVector<CComplex,nbasis > > >::operator();
typedef iVector<CComplex,nbasis > CoarseSiteVector; typedef iVector<CComplex,nbasis > CoarseSiteVector;
typedef Lattice<CoarseSiteVector> CoarseField; typedef Lattice<CoarseSiteVector> CoarseField;
typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field typedef Lattice<CComplex> CoarseScalar; // used for inner products on fine field
@ -156,7 +153,6 @@ public:
_coarse_relax_tol(coarse_relax_tol) _coarse_relax_tol(coarse_relax_tol)
{ }; { };
//evalMaxApprox: approximation of largest eval of the fine Chebyshev operator (suitably wrapped by block projection)
int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox) int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
{ {
CoarseField v(B); CoarseField v(B);
@ -183,16 +179,8 @@ public:
if( (vv<eresid*eresid) ) conv = 1; if( (vv<eresid*eresid) ) conv = 1;
return conv; return conv;
} }
int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
//This function is called at the end of the coarse grid Lanczos. It promotes the coarse eigenvector 'B' to the fine grid,
//applies a smoother to the result then computes the computes the *fine grid* eigenvalue (output as 'eval').
//evalMaxApprox should be the approximation of the largest eval of the fine Hermop. However when this function is called by IRL it actually passes the largest eval of the *Chebyshev* operator (as this is the max approx used for the TestConvergence above)
//As the largest eval of the Chebyshev is typically several orders of magnitude larger this makes the convergence test pass even when it should not.
//We therefore ignore evalMaxApprox here and use a value of 1.0 (note this value is already used by TestCoarse)
int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
{ {
evalMaxApprox = 1.0; //cf above
GridBase *FineGrid = _subspace[0].Grid(); GridBase *FineGrid = _subspace[0].Grid();
int checkerboard = _subspace[0].Checkerboard(); int checkerboard = _subspace[0].Checkerboard();
FineField fB(FineGrid);fB.Checkerboard() =checkerboard; FineField fB(FineGrid);fB.Checkerboard() =checkerboard;
@ -211,13 +199,13 @@ public:
eval = vnum/vden; eval = vnum/vden;
fv -= eval*fB; fv -= eval*fB;
RealD vv = norm2(fv) / ::pow(evalMaxApprox,2.0); RealD vv = norm2(fv) / ::pow(evalMaxApprox,2.0);
if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
std::cout.precision(13); std::cout.precision(13);
std::cout<<GridLogIRL << "[" << std::setw(3)<<j<<"] " std::cout<<GridLogIRL << "[" << std::setw(3)<<j<<"] "
<<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")" <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
<<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv << " target " << eresid*eresid <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
<<std::endl; <<std::endl;
if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
if( (vv<eresid*eresid) ) return 1; if( (vv<eresid*eresid) ) return 1;
return 0; return 0;
} }
@ -295,10 +283,6 @@ public:
evals_coarse.resize(0); evals_coarse.resize(0);
}; };
//The block inner product is the inner product on the fine grid locally summed over the blocks
//to give a Lattice<Scalar> on the coarse grid. This function orthnormalizes the fine-grid subspace
//vectors under the block inner product. This step must be performed after computing the fine grid
//eigenvectors and before computing the coarse grid eigenvectors.
void Orthogonalise(void ) { void Orthogonalise(void ) {
CoarseScalar InnerProd(_CoarseGrid); CoarseScalar InnerProd(_CoarseGrid);
std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl; std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
@ -342,8 +326,6 @@ public:
} }
} }
//While this method serves to check the coarse eigenvectors, it also recomputes the eigenvalues from the smoothed reconstructed eigenvectors
//hence the smoother can be tuned after running the coarse Lanczos by using a different smoother here
void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax) void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax)
{ {
assert(evals_fine.size() == nbasis); assert(evals_fine.size() == nbasis);
@ -392,31 +374,25 @@ public:
evals_fine.resize(nbasis); evals_fine.resize(nbasis);
subspace.resize(nbasis,_FineGrid); subspace.resize(nbasis,_FineGrid);
} }
//cheby_op: Parameters of the fine grid Chebyshev polynomial used for the Lanczos acceleration
//cheby_smooth: Parameters of a separate Chebyshev polynomial used after the Lanczos has completed to smooth out high frequency noise in the reconstructed fine grid eigenvectors prior to computing the eigenvalue
//relax: Reconstructed eigenvectors (post smoothing) are naturally not as precise as true eigenvectors. This factor acts as a multiplier on the stopping condition when determining whether the results satisfy the user provided stopping condition
void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax, void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
int Nstop, int Nk, int Nm,RealD resid, int Nstop, int Nk, int Nm,RealD resid,
RealD MaxIt, RealD betastp, int MinRes) RealD MaxIt, RealD betastp, int MinRes)
{ {
Chebyshev<FineField> Cheby(cheby_op); //Chebyshev of fine operator on fine grid Chebyshev<FineField> Cheby(cheby_op);
ProjectedHermOp<Fobj,CComplex,nbasis> Op(_FineOp,subspace); //Fine operator on coarse grid with intermediate fine grid conversion ProjectedHermOp<Fobj,CComplex,nbasis> Op(_FineOp,subspace);
ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace); //Chebyshev of fine operator on coarse grid with intermediate fine grid conversion ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace);
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
// create a smoother and see if we can get a cheap convergence test and smooth inside the IRL // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
Chebyshev<FineField> ChebySmooth(cheby_smooth); //lower order Chebyshev of fine operator on fine grid used to smooth regenerated eigenvectors Chebyshev<FineField> ChebySmooth(cheby_smooth);
ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax); ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
evals_coarse.resize(Nm); evals_coarse.resize(Nm);
evec_coarse.resize(Nm,_CoarseGrid); evec_coarse.resize(Nm,_CoarseGrid);
CoarseField src(_CoarseGrid); src=1.0; CoarseField src(_CoarseGrid); src=1.0;
//Note the "tester" here is also responsible for generating the fine grid eigenvalues which are output into the "evals_coarse" array
ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes); ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
int Nconv=0; int Nconv=0;
IRL.calc(evals_coarse,evec_coarse,src,Nconv,false); IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
@ -427,14 +403,6 @@ public:
std::cout << i << " Coarse eval = " << evals_coarse[i] << std::endl; std::cout << i << " Coarse eval = " << evals_coarse[i] << std::endl;
} }
} }
//Get the fine eigenvector 'i' by reconstruction
void getFineEvecEval(FineField &evec, RealD &eval, const int i) const{
blockPromote(evec_coarse[i],evec,subspace);
eval = evals_coarse[i];
}
}; };
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -29,8 +29,6 @@ template<class Field> class PowerMethod
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp. RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vden = norm2(src_n); RealD vden = norm2(src_n);
RealD na = vnum/vden; RealD na = vnum/vden;
std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) { if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) {
evalMaxApprox = na; evalMaxApprox = na;

View File

@ -59,7 +59,7 @@ public:
GridBase *grid = src.Grid(); GridBase *grid = src.Grid();
Field r(grid), p(grid), Ap(grid), Ar(grid), z(grid); Field r(grid), p(grid), Ap(grid), Ar(grid), z(grid);
psi=zero; psi=Zero();
r = src; r = src;
Preconditioner(r,p); Preconditioner(r,p);

View File

@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
template<class Field> template<class Field>
class PrecGeneralisedConjugateResidual : public LinearFunction<Field> { class PrecGeneralisedConjugateResidual : public LinearFunction<Field> {
public: public:
using LinearFunction<Field>::operator();
RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
int verbose; int verbose;

View File

@ -43,7 +43,7 @@ NAMESPACE_BEGIN(Grid);
template<class Field> template<class Field>
class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> { class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> {
public: public:
using LinearFunction<Field>::operator();
RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
int verbose; int verbose;
@ -119,8 +119,7 @@ public:
RealD GCRnStep(const Field &src, Field &psi,RealD rsq){ RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
RealD cp; RealD cp;
ComplexD a, b; ComplexD a, b, zAz;
// ComplexD zAz;
RealD zAAz; RealD zAAz;
ComplexD rq; ComplexD rq;
@ -147,7 +146,7 @@ public:
////////////////////////////////// //////////////////////////////////
MatTimer.Start(); MatTimer.Start();
Linop.Op(psi,Az); Linop.Op(psi,Az);
// zAz = innerProduct(Az,psi); zAz = innerProduct(Az,psi);
zAAz= norm2(Az); zAAz= norm2(Az);
MatTimer.Stop(); MatTimer.Stop();
@ -171,7 +170,7 @@ public:
LinalgTimer.Start(); LinalgTimer.Start();
// zAz = innerProduct(Az,psi); zAz = innerProduct(Az,psi);
zAAz= norm2(Az); zAAz= norm2(Az);
//p[0],q[0],qq[0] //p[0],q[0],qq[0]
@ -213,7 +212,7 @@ public:
MatTimer.Start(); MatTimer.Start();
Linop.Op(z,Az); Linop.Op(z,Az);
MatTimer.Stop(); MatTimer.Stop();
// zAz = innerProduct(Az,psi); zAz = innerProduct(Az,psi);
zAAz= norm2(Az); zAAz= norm2(Az);
LinalgTimer.Start(); LinalgTimer.Start();

View File

@ -132,31 +132,6 @@ namespace Grid {
(*this)(_Matrix,in,out,guess); (*this)(_Matrix,in,out,guess);
} }
void RedBlackSource(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &src_o)
{
GridBase *grid = _Matrix.RedBlackGrid();
Field tmp(grid);
int nblock = in.size();
for(int b=0;b<nblock;b++){
RedBlackSource(_Matrix,in[b],tmp,src_o[b]);
}
}
// James can write his own deflated guesser
// with optimised code for the inner products
// RedBlackSolveSplitGrid();
// RedBlackSolve(_Matrix,src_o,sol_o);
void RedBlackSolution(Matrix &_Matrix, const std::vector<Field> &in, const std::vector<Field> &sol_o, std::vector<Field> &out)
{
GridBase *grid = _Matrix.RedBlackGrid();
Field tmp(grid);
int nblock = in.size();
for(int b=0;b<nblock;b++) {
pickCheckerboard(Even,tmp,in[b]);
RedBlackSolution(_Matrix,sol_o[b],tmp,out[b]);
}
}
template<class Guesser> template<class Guesser>
void operator()(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &out,Guesser &guess) void operator()(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &out,Guesser &guess)
{ {
@ -175,29 +150,24 @@ namespace Grid {
//////////////////////////////////////////////// ////////////////////////////////////////////////
// Prepare RedBlack source // Prepare RedBlack source
//////////////////////////////////////////////// ////////////////////////////////////////////////
RedBlackSource(_Matrix,in,src_o); for(int b=0;b<nblock;b++){
// for(int b=0;b<nblock;b++){ RedBlackSource(_Matrix,in[b],tmp,src_o[b]);
// RedBlackSource(_Matrix,in[b],tmp,src_o[b]); }
// }
//////////////////////////////////////////////// ////////////////////////////////////////////////
// Make the guesses // Make the guesses
//////////////////////////////////////////////// ////////////////////////////////////////////////
if ( subGuess ) guess_save.resize(nblock,grid); if ( subGuess ) guess_save.resize(nblock,grid);
for(int b=0;b<nblock;b++){
if(useSolnAsInitGuess) { if(useSolnAsInitGuess) {
for(int b=0;b<nblock;b++){
pickCheckerboard(Odd, sol_o[b], out[b]); pickCheckerboard(Odd, sol_o[b], out[b]);
} else {
guess(src_o[b],sol_o[b]);
} }
} else {
guess(src_o, sol_o);
}
if ( subGuess ) { if ( subGuess ) {
for(int b=0;b<nblock;b++){ guess_save[b] = sol_o[b];
guess_save[b] = sol_o[b]; }
}
} }
////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////
// Call the block solver // Call the block solver

View File

@ -0,0 +1,67 @@
#include <Grid/GridCore.h>
#include <fcntl.h>
NAMESPACE_BEGIN(Grid);
MemoryStats *MemoryProfiler::stats = nullptr;
bool MemoryProfiler::debug = false;
void check_huge_pages(void *Buf,uint64_t BYTES)
{
#ifdef __linux__
int fd = open("/proc/self/pagemap", O_RDONLY);
assert(fd >= 0);
const int page_size = 4096;
uint64_t virt_pfn = (uint64_t)Buf / page_size;
off_t offset = sizeof(uint64_t) * virt_pfn;
uint64_t npages = (BYTES + page_size-1) / page_size;
uint64_t pagedata[npages];
uint64_t ret = lseek(fd, offset, SEEK_SET);
assert(ret == offset);
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
assert(ret == sizeof(uint64_t) * npages);
int nhugepages = npages / 512;
int n4ktotal, nnothuge;
n4ktotal = 0;
nnothuge = 0;
for (int i = 0; i < nhugepages; ++i) {
uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
for (int j = 0; j < 512; ++j) {
uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
++n4ktotal;
if (pageaddr != baseaddr + j * page_size)
++nnothuge;
}
}
int rank = CartesianCommunicator::RankWorld();
printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
#endif
}
std::string sizeString(const size_t bytes)
{
constexpr unsigned int bufSize = 256;
const char *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"};
char buf[256];
size_t s = 0;
double count = bytes;
while (count >= 1024 && s < 7)
{
s++;
count /= 1024;
}
if (count - floor(count) == 0.0)
{
snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
}
else
{
snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
}
return std::string(buf);
}
NAMESPACE_END(Grid);

View File

@ -53,7 +53,11 @@ public:
{ {
size_type bytes = __n*sizeof(_Tp); size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes); profilerAllocate(bytes);
#ifdef GRID_UVM
_Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes);
#else
_Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes); _Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
#endif
assert( ( (_Tp*)ptr != (_Tp *)NULL ) ); assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr; return ptr;
} }
@ -62,7 +66,11 @@ public:
{ {
size_type bytes = __n * sizeof(_Tp); size_type bytes = __n * sizeof(_Tp);
profilerFree(bytes); profilerFree(bytes);
#ifdef GRID_UVM
MemoryManager::SharedFree((void *)__p,bytes);
#else
MemoryManager::CpuFree((void *)__p,bytes); MemoryManager::CpuFree((void *)__p,bytes);
#endif
} }
// FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop // FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
@ -165,18 +173,9 @@ template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const d
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Template typedefs // Template typedefs
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
#ifdef ACCELERATOR_CSHIFT //template<class T> using commAllocator = devAllocator<T>;
// Cshift on device template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
template<class T> using cshiftAllocator = devAllocator<T>;
#else
// Cshift on host
template<class T> using cshiftAllocator = std::allocator<T>;
#endif
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;
template<class T> using commVector = std::vector<T,devAllocator<T> >; template<class T> using commVector = std::vector<T,devAllocator<T> >;
template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -9,30 +9,16 @@ NAMESPACE_BEGIN(Grid);
#define AccSmall (3) #define AccSmall (3)
#define Shared (4) #define Shared (4)
#define SharedSmall (5) #define SharedSmall (5)
#undef GRID_MM_VERBOSE uint64_t total_cache;
uint64_t total_shared; uint64_t total_shared;
uint64_t total_device; uint64_t total_device;
uint64_t total_host;; uint64_t total_host;;
void MemoryManager::PrintBytes(void) void MemoryManager::PrintBytes(void)
{ {
std::cout << " MemoryManager : ------------------------------------ "<<std::endl; std::cout << " MemoryManager : "<<total_cache <<" cache bytes "<<std::endl;
std::cout << " MemoryManager : PrintBytes "<<std::endl; std::cout << " MemoryManager : "<<total_shared<<" shared bytes "<<std::endl;
std::cout << " MemoryManager : ------------------------------------ "<<std::endl; std::cout << " MemoryManager : "<<total_device<<" accelerator bytes "<<std::endl;
std::cout << " MemoryManager : "<<(total_shared>>20)<<" shared Mbytes "<<std::endl; std::cout << " MemoryManager : "<<total_host <<" cpu bytes "<<std::endl;
std::cout << " MemoryManager : "<<(total_device>>20)<<" accelerator Mbytes "<<std::endl;
std::cout << " MemoryManager : "<<(total_host>>20) <<" cpu Mbytes "<<std::endl;
uint64_t cacheBytes;
cacheBytes = CacheBytes[Cpu];
std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" cpu cache Mbytes "<<std::endl;
cacheBytes = CacheBytes[Acc];
std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" acc cache Mbytes "<<std::endl;
cacheBytes = CacheBytes[Shared];
std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" shared cache Mbytes "<<std::endl;
#ifdef GRID_CUDA
cuda_mem();
#endif
} }
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
@ -40,114 +26,93 @@ void MemoryManager::PrintBytes(void)
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax]; MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
int MemoryManager::Victim[MemoryManager::NallocType]; int MemoryManager::Victim[MemoryManager::NallocType];
int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 2, 8, 2, 8 }; int MemoryManager::Ncache[MemoryManager::NallocType] = { 8, 32, 8, 32, 8, 32 };
uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
// Actual allocation and deallocation utils // Actual allocation and deallocation utils
////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////
void *MemoryManager::AcceleratorAllocate(size_t bytes) void *MemoryManager::AcceleratorAllocate(size_t bytes)
{ {
total_device+=bytes;
void *ptr = (void *) Lookup(bytes,Acc); void *ptr = (void *) Lookup(bytes,Acc);
if ( ptr == (void *) NULL ) { if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocDevice(bytes); ptr = (void *) acceleratorAllocDevice(bytes);
total_device+=bytes;
} else {
// std::cout <<"AcceleratorAllocate: cache hit Device pointer "<<std::hex<<ptr<<std::dec<<" "<<bytes<<std::endl;
} }
#ifdef GRID_MM_VERBOSE
std::cout <<"AcceleratorAllocate "<<std::endl;
PrintBytes();
#endif
return ptr; return ptr;
} }
void MemoryManager::AcceleratorFree (void *ptr,size_t bytes) void MemoryManager::AcceleratorFree (void *ptr,size_t bytes)
{ {
total_device-=bytes;
void *__freeme = Insert(ptr,bytes,Acc); void *__freeme = Insert(ptr,bytes,Acc);
if ( __freeme ) { if ( __freeme ) {
acceleratorFreeDevice(__freeme); acceleratorFreeDevice(__freeme);
total_device-=bytes;
// PrintBytes();
} }
#ifdef GRID_MM_VERBOSE
std::cout <<"AcceleratorFree "<<std::endl;
PrintBytes();
#endif
} }
void *MemoryManager::SharedAllocate(size_t bytes) void *MemoryManager::SharedAllocate(size_t bytes)
{ {
total_shared+=bytes;
void *ptr = (void *) Lookup(bytes,Shared); void *ptr = (void *) Lookup(bytes,Shared);
if ( ptr == (void *) NULL ) { if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocShared(bytes); ptr = (void *) acceleratorAllocShared(bytes);
total_shared+=bytes;
// std::cout <<"SharedAllocate: allocated Shared pointer "<<std::hex<<ptr<<std::dec<<std::endl;
// PrintBytes();
} else {
// std::cout <<"SharedAllocate: cache hit Shared pointer "<<std::hex<<ptr<<std::dec<<" "<<bytes<<std::endl;
} }
#ifdef GRID_MM_VERBOSE
std::cout <<"SharedAllocate "<<std::endl;
PrintBytes();
#endif
return ptr; return ptr;
} }
void MemoryManager::SharedFree (void *ptr,size_t bytes) void MemoryManager::SharedFree (void *ptr,size_t bytes)
{ {
total_shared-=bytes;
void *__freeme = Insert(ptr,bytes,Shared); void *__freeme = Insert(ptr,bytes,Shared);
if ( __freeme ) { if ( __freeme ) {
acceleratorFreeShared(__freeme); acceleratorFreeShared(__freeme);
total_shared-=bytes;
// PrintBytes();
} }
#ifdef GRID_MM_VERBOSE
std::cout <<"SharedFree "<<std::endl;
PrintBytes();
#endif
} }
#ifdef GRID_UVM #ifdef GRID_UVM
void *MemoryManager::CpuAllocate(size_t bytes) void *MemoryManager::CpuAllocate(size_t bytes)
{ {
total_host+=bytes;
void *ptr = (void *) Lookup(bytes,Cpu); void *ptr = (void *) Lookup(bytes,Cpu);
if ( ptr == (void *) NULL ) { if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocShared(bytes); ptr = (void *) acceleratorAllocShared(bytes);
total_host+=bytes;
// std::cout <<"CpuAllocate: allocated Cpu pointer "<<std::hex<<ptr<<std::dec<<std::endl;
} else {
// std::cout <<"CpufAllocate: cache hit Cpu pointer "<<std::hex<<ptr<<std::dec<<" "<<bytes<<std::endl;
} }
#ifdef GRID_MM_VERBOSE
std::cout <<"CpuAllocate "<<std::endl;
PrintBytes();
#endif
return ptr; return ptr;
} }
void MemoryManager::CpuFree (void *_ptr,size_t bytes) void MemoryManager::CpuFree (void *_ptr,size_t bytes)
{ {
total_host-=bytes;
NotifyDeletion(_ptr); NotifyDeletion(_ptr);
void *__freeme = Insert(_ptr,bytes,Cpu); void *__freeme = Insert(_ptr,bytes,Cpu);
if ( __freeme ) { if ( __freeme ) {
acceleratorFreeShared(__freeme); acceleratorFreeShared(__freeme);
total_host-=bytes;
} }
#ifdef GRID_MM_VERBOSE
std::cout <<"CpuFree "<<std::endl;
PrintBytes();
#endif
} }
#else #else
void *MemoryManager::CpuAllocate(size_t bytes) void *MemoryManager::CpuAllocate(size_t bytes)
{ {
total_host+=bytes;
void *ptr = (void *) Lookup(bytes,Cpu); void *ptr = (void *) Lookup(bytes,Cpu);
if ( ptr == (void *) NULL ) { if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocCpu(bytes); ptr = (void *) acceleratorAllocCpu(bytes);
total_host+=bytes;
} }
#ifdef GRID_MM_VERBOSE
std::cout <<"CpuAllocate "<<std::endl;
PrintBytes();
#endif
return ptr; return ptr;
} }
void MemoryManager::CpuFree (void *_ptr,size_t bytes) void MemoryManager::CpuFree (void *_ptr,size_t bytes)
{ {
total_host-=bytes;
NotifyDeletion(_ptr); NotifyDeletion(_ptr);
void *__freeme = Insert(_ptr,bytes,Cpu); void *__freeme = Insert(_ptr,bytes,Cpu);
if ( __freeme ) { if ( __freeme ) {
acceleratorFreeCpu(__freeme); acceleratorFreeCpu(__freeme);
total_host-=bytes;
} }
#ifdef GRID_MM_VERBOSE
std::cout <<"CpuFree "<<std::endl;
PrintBytes();
#endif
} }
#endif #endif
@ -159,11 +124,12 @@ void MemoryManager::Init(void)
char * str; char * str;
int Nc; int Nc;
int NcS;
str= getenv("GRID_ALLOC_NCACHE_LARGE"); str= getenv("GRID_ALLOC_NCACHE_LARGE");
if ( str ) { if ( str ) {
Nc = atoi(str); Nc = atoi(str);
if ( (Nc>=0) && (Nc < NallocCacheMax)) { if ( (Nc>=0) && (Nc <= NallocCacheMax)) {
Ncache[Cpu]=Nc; Ncache[Cpu]=Nc;
Ncache[Acc]=Nc; Ncache[Acc]=Nc;
Ncache[Shared]=Nc; Ncache[Shared]=Nc;
@ -173,7 +139,7 @@ void MemoryManager::Init(void)
str= getenv("GRID_ALLOC_NCACHE_SMALL"); str= getenv("GRID_ALLOC_NCACHE_SMALL");
if ( str ) { if ( str ) {
Nc = atoi(str); Nc = atoi(str);
if ( (Nc>=0) && (Nc < NallocCacheMax)) { if ( (Nc>=0) && (Nc <= NallocCacheMax)) {
Ncache[CpuSmall]=Nc; Ncache[CpuSmall]=Nc;
Ncache[AccSmall]=Nc; Ncache[AccSmall]=Nc;
Ncache[SharedSmall]=Nc; Ncache[SharedSmall]=Nc;
@ -224,13 +190,13 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
#ifdef ALLOCATION_CACHE #ifdef ALLOCATION_CACHE
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT); bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type + small; int cache = type + small;
return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]); return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache]);
#else #else
return ptr; return ptr;
#endif #endif
} }
void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes) void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim)
{ {
assert(ncache>0); assert(ncache>0);
#ifdef GRID_OMP #ifdef GRID_OMP
@ -254,7 +220,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
if ( entries[v].valid ) { if ( entries[v].valid ) {
ret = entries[v].address; ret = entries[v].address;
cacheBytes -= entries[v].bytes; total_cache-=entries[v].bytes;
entries[v].valid = 0; entries[v].valid = 0;
entries[v].address = NULL; entries[v].address = NULL;
entries[v].bytes = 0; entries[v].bytes = 0;
@ -263,7 +229,7 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
entries[v].address=ptr; entries[v].address=ptr;
entries[v].bytes =bytes; entries[v].bytes =bytes;
entries[v].valid =1; entries[v].valid =1;
cacheBytes += bytes; total_cache+=entries[v].bytes;
return ret; return ret;
} }
@ -273,13 +239,13 @@ void *MemoryManager::Lookup(size_t bytes,int type)
#ifdef ALLOCATION_CACHE #ifdef ALLOCATION_CACHE
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT); bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type+small; int cache = type+small;
return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]); return Lookup(bytes,Entries[cache],Ncache[cache]);
#else #else
return NULL; return NULL;
#endif #endif
} }
void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes) void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache)
{ {
assert(ncache>0); assert(ncache>0);
#ifdef GRID_OMP #ifdef GRID_OMP
@ -288,7 +254,7 @@ void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncach
for(int e=0;e<ncache;e++){ for(int e=0;e<ncache;e++){
if ( entries[e].valid && ( entries[e].bytes == bytes ) ) { if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
entries[e].valid = 0; entries[e].valid = 0;
cacheBytes -= entries[e].bytes; total_cache-=bytes;
return entries[e].address; return entries[e].address;
} }
} }

View File

@ -34,6 +34,8 @@ NAMESPACE_BEGIN(Grid);
// Move control to configure.ac and Config.h? // Move control to configure.ac and Config.h?
#define ALLOCATION_CACHE
#define GRID_ALLOC_ALIGN (2*1024*1024)
#define GRID_ALLOC_SMALL_LIMIT (4096) #define GRID_ALLOC_SMALL_LIMIT (4096)
/*Pinning pages is costly*/ /*Pinning pages is costly*/
@ -82,18 +84,17 @@ private:
static AllocationCacheEntry Entries[NallocType][NallocCacheMax]; static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
static int Victim[NallocType]; static int Victim[NallocType];
static int Ncache[NallocType]; static int Ncache[NallocType];
static uint64_t CacheBytes[NallocType];
///////////////////////////////////////////////// /////////////////////////////////////////////////
// Free pool // Free pool
///////////////////////////////////////////////// /////////////////////////////////////////////////
static void *Insert(void *ptr,size_t bytes,int type) ; static void *Insert(void *ptr,size_t bytes,int type) ;
static void *Lookup(size_t bytes,int type) ; static void *Lookup(size_t bytes,int type) ;
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ; static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim) ;
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ; static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache) ;
static void PrintBytes(void);
public: public:
static void PrintBytes(void);
static void Init(void); static void Init(void);
static void InitMessage(void); static void InitMessage(void);
static void *AcceleratorAllocate(size_t bytes); static void *AcceleratorAllocate(size_t bytes);
@ -170,7 +171,6 @@ private:
public: public:
static void Print(void); static void Print(void);
static void PrintState( void* CpuPtr);
static int isOpen (void* CpuPtr); static int isOpen (void* CpuPtr);
static void ViewClose(void* CpuPtr,ViewMode mode); static void ViewClose(void* CpuPtr,ViewMode mode);
static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint); static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);

View File

@ -1,12 +1,11 @@
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#ifndef GRID_UVM #ifndef GRID_UVM
#warning "Using explicit device memory copies" #warning "Using explicit device memory copies"
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
//#define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
#define dprintf(...) #define dprintf(...)
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// For caching copies of data on device // For caching copies of data on device
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
@ -104,7 +103,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
/////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////
assert(AccCache.state!=Empty); assert(AccCache.state!=Empty);
dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); // dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0); assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0); assert(AccCache.cpuLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL); assert(AccCache.CpuPtr!=(uint64_t)NULL);
@ -112,7 +111,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
DeviceBytes -=AccCache.bytes; DeviceBytes -=AccCache.bytes;
LRUremove(AccCache); LRUremove(AccCache);
dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes); // dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
} }
uint64_t CpuPtr = AccCache.CpuPtr; uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr); EntryErase(CpuPtr);
@ -126,7 +125,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
assert(AccCache.state!=Empty); assert(AccCache.state!=Empty);
dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); // dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0); assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0); assert(AccCache.cpuLock==0);
if(AccCache.state==AccDirty) { if(AccCache.state==AccDirty) {
@ -137,7 +136,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes); AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
DeviceBytes -=AccCache.bytes; DeviceBytes -=AccCache.bytes;
LRUremove(AccCache); LRUremove(AccCache);
dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes); // dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
} }
uint64_t CpuPtr = AccCache.CpuPtr; uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr); EntryErase(CpuPtr);
@ -150,7 +149,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
assert(AccCache.AccPtr!=(uint64_t)NULL); assert(AccCache.AccPtr!=(uint64_t)NULL);
assert(AccCache.CpuPtr!=(uint64_t)NULL); assert(AccCache.CpuPtr!=(uint64_t)NULL);
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes); acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); // dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
DeviceToHostBytes+=AccCache.bytes; DeviceToHostBytes+=AccCache.bytes;
DeviceToHostXfer++; DeviceToHostXfer++;
AccCache.state=Consistent; AccCache.state=Consistent;
@ -165,7 +164,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes); AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes; DeviceBytes+=AccCache.bytes;
} }
dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout); // dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes); acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
HostToDeviceBytes+=AccCache.bytes; HostToDeviceBytes+=AccCache.bytes;
HostToDeviceXfer++; HostToDeviceXfer++;
@ -228,24 +227,18 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
// Find if present, otherwise get or force an empty // Find if present, otherwise get or force an empty
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
if ( EntryPresent(CpuPtr)==0 ){ if ( EntryPresent(CpuPtr)==0 ){
EvictVictims(bytes);
EntryCreate(CpuPtr,bytes,mode,hint); EntryCreate(CpuPtr,bytes,mode,hint);
} }
auto AccCacheIterator = EntryLookup(CpuPtr); auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second; auto & AccCache = AccCacheIterator->second;
if (!AccCache.AccPtr) {
EvictVictims(bytes);
}
assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)); assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
assert(AccCache.cpuLock==0); // Programming error assert(AccCache.cpuLock==0); // Programming error
if(AccCache.state!=Empty) { if(AccCache.state!=Empty) {
dprintf("ViewOpen found entry %llx %llx : %lld %lld\n",
(uint64_t)AccCache.CpuPtr,
(uint64_t)CpuPtr,
(uint64_t)AccCache.bytes,
(uint64_t)bytes);
assert(AccCache.CpuPtr == CpuPtr); assert(AccCache.CpuPtr == CpuPtr);
assert(AccCache.bytes ==bytes); assert(AccCache.bytes ==bytes);
} }
@ -292,21 +285,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
} }
AccCache.accLock++; AccCache.accLock++;
dprintf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock); // printf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
} else if(AccCache.state==Consistent) { } else if(AccCache.state==Consistent) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
else else
AccCache.state = Consistent; // Consistent + AccRead => Consistent AccCache.state = Consistent; // Consistent + AccRead => Consistent
AccCache.accLock++; AccCache.accLock++;
dprintf("Consistent entry into device accLock %d\n",AccCache.accLock); // printf("Consistent entry into device accLock %d\n",AccCache.accLock);
} else if(AccCache.state==AccDirty) { } else if(AccCache.state==AccDirty) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard)) if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
else else
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
AccCache.accLock++; AccCache.accLock++;
dprintf("AccDirty entry into device accLock %d\n",AccCache.accLock); // printf("AccDirty entry into device accLock %d\n",AccCache.accLock);
} else { } else {
assert(0); assert(0);
} }
@ -368,16 +361,13 @@ uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,V
// Find if present, otherwise get or force an empty // Find if present, otherwise get or force an empty
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
if ( EntryPresent(CpuPtr)==0 ){ if ( EntryPresent(CpuPtr)==0 ){
EvictVictims(bytes);
EntryCreate(CpuPtr,bytes,mode,transient); EntryCreate(CpuPtr,bytes,mode,transient);
} }
auto AccCacheIterator = EntryLookup(CpuPtr); auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second; auto & AccCache = AccCacheIterator->second;
if (!AccCache.AccPtr) {
EvictVictims(bytes);
}
assert((mode==CpuRead)||(mode==CpuWrite)); assert((mode==CpuRead)||(mode==CpuWrite));
assert(AccCache.accLock==0); // Programming error assert(AccCache.accLock==0); // Programming error
@ -429,7 +419,6 @@ void MemoryManager::NotifyDeletion(void *_ptr)
} }
void MemoryManager::Print(void) void MemoryManager::Print(void)
{ {
PrintBytes();
std::cout << GridLogDebug << "--------------------------------------------" << std::endl; std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << "Memory Manager " << std::endl; std::cout << GridLogDebug << "Memory Manager " << std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl; std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
@ -474,32 +463,6 @@ int MemoryManager::isOpen (void* _CpuPtr)
} }
} }
void MemoryManager::PrintState(void* _CpuPtr)
{
uint64_t CpuPtr = (uint64_t)_CpuPtr;
if ( EntryPresent(CpuPtr) ){
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
std::string str;
if ( AccCache.state==Empty ) str = std::string("Empty");
if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
if ( AccCache.state==Consistent)str = std::string("Consistent");
if ( AccCache.state==EvictNext) str = std::string("EvictNext");
std::cout << GridLogMessage << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
std::cout << GridLogMessage << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
<< "\t" << AccCache.cpuLock
<< "\t" << AccCache.accLock
<< "\t" << AccCache.LRU_valid<<std::endl;
} else {
std::cout << GridLogMessage << "No Entry in AccCache table." << std::endl;
}
}
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif #endif

View File

@ -1,6 +1,7 @@
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#ifdef GRID_UVM #ifdef GRID_UVM
#warning "Grid is assuming unified virtual memory address space"
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////
// View management is 1:1 address space mapping // View management is 1:1 address space mapping
@ -16,10 +17,6 @@ uint64_t MemoryManager::DeviceToHostXfer;
void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){}; void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; }; void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
int MemoryManager::isOpen (void* CpuPtr) { return 0;} int MemoryManager::isOpen (void* CpuPtr) { return 0;}
void MemoryManager::PrintState(void* CpuPtr)
{
std::cout << GridLogMessage << "Host<->Device memory movement not currently managed by Grid." << std::endl;
};
void MemoryManager::Print(void){}; void MemoryManager::Print(void){};
void MemoryManager::NotifyDeletion(void *ptr){}; void MemoryManager::NotifyDeletion(void *ptr){};

View File

@ -36,7 +36,7 @@ static const int CbBlack=1;
static const int Even =CbRed; static const int Even =CbRed;
static const int Odd =CbBlack; static const int Odd =CbBlack;
accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex,const Coordinate &rdim,const Coordinate &chk_dim_msk) accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex, Coordinate &rdim, Coordinate &chk_dim_msk)
{ {
int nd=rdim.size(); int nd=rdim.size();
Coordinate coor(nd); Coordinate coor(nd);

View File

@ -33,8 +33,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
bool Stencil_force_mpi = true;
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout // Info that is setup once and indept of cartesian layout
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////

View File

@ -1,3 +1,4 @@
/************************************************************************************* /*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
@ -35,8 +36,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
extern bool Stencil_force_mpi ;
class CartesianCommunicator : public SharedMemory { class CartesianCommunicator : public SharedMemory {
public: public:
@ -109,8 +108,6 @@ public:
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// Reduction // Reduction
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
void GlobalMax(RealD &);
void GlobalMax(RealF &);
void GlobalSum(RealF &); void GlobalSum(RealF &);
void GlobalSumVector(RealF *,int N); void GlobalSumVector(RealF *,int N);
void GlobalSum(RealD &); void GlobalSum(RealD &);
@ -141,6 +138,21 @@ public:
int recv_from_rank, int recv_from_rank,
int bytes); int bytes);
void SendRecvPacket(void *xmit,
void *recv,
int xmit_to_rank,
int recv_from_rank,
int bytes);
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
double StencilSendToRecvFrom(void *xmit, double StencilSendToRecvFrom(void *xmit,
int xmit_to_rank, int xmit_to_rank,
void *recv, void *recv,

View File

@ -44,7 +44,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
MPI_Initialized(&flag); // needed to coexist with other libs apparently MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) { if ( !flag ) {
#ifndef GRID_COMMS_THREADS #if defined (TOFU) // FUGAKU, credits go to Issaku Kanamori
nCommThreads=1; nCommThreads=1;
// wrong results here too // wrong results here too
// For now: comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs // For now: comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs
@ -275,16 +275,6 @@ void CartesianCommunicator::GlobalXOR(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
assert(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::GlobalMax(float &f)
{
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalMax(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){ void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0); assert(ierr==0);
@ -368,28 +358,19 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
assert(from != _processor); assert(from != _processor);
assert(gme == ShmRank); assert(gme == ShmRank);
double off_node_bytes=0.0; double off_node_bytes=0.0;
int tag;
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) { if ( gfrom ==MPI_UNDEFINED) {
tag= dir+from*32; ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[commdir],&rrq);
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0); assert(ierr==0);
list.push_back(rrq); list.push_back(rrq);
off_node_bytes+=bytes; off_node_bytes+=bytes;
} }
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { if ( gdest == MPI_UNDEFINED ) {
tag= dir+_processor*32; ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[commdir],&xrq);
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0); assert(ierr==0);
list.push_back(xrq); list.push_back(xrq);
off_node_bytes+=bytes; off_node_bytes+=bytes;
} else {
// TODO : make a OMP loop on CPU, call threaded bcopy
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
// std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl;
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
} }
if ( CommunicatorPolicy == CommunicatorPolicySequential ) { if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
@ -400,9 +381,6 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
} }
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir) void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{ {
// std::cout << "Copy Synchronised\n"<<std::endl;
acceleratorCopySynchronise();
int nreq=list.size(); int nreq=list.size();
if (nreq==0) return; if (nreq==0) return;

View File

@ -67,8 +67,6 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
CartesianCommunicator::~CartesianCommunicator(){} CartesianCommunicator::~CartesianCommunicator(){}
void CartesianCommunicator::GlobalMax(float &){}
void CartesianCommunicator::GlobalMax(double &){}
void CartesianCommunicator::GlobalSum(float &){} void CartesianCommunicator::GlobalSum(float &){}
void CartesianCommunicator::GlobalSumVector(float *,int N){} void CartesianCommunicator::GlobalSumVector(float *,int N){}
void CartesianCommunicator::GlobalSum(double &){} void CartesianCommunicator::GlobalSum(double &){}
@ -79,6 +77,15 @@ void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){}
void CartesianCommunicator::GlobalXOR(uint32_t &){} void CartesianCommunicator::GlobalXOR(uint32_t &){}
void CartesianCommunicator::GlobalXOR(uint64_t &){} void CartesianCommunicator::GlobalXOR(uint64_t &){}
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int xmit_to_rank,
int recv_from_rank,
int bytes)
{
assert(0);
}
// Basic Halo comms primitive -- should never call in single node // Basic Halo comms primitive -- should never call in single node
void CartesianCommunicator::SendToRecvFrom(void *xmit, void CartesianCommunicator::SendToRecvFrom(void *xmit,
@ -89,6 +96,20 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
{ {
assert(0); assert(0);
} }
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
assert(0);
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
assert(0);
}
void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes) void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
{ {
bcopy(in,out,bytes*words); bcopy(in,out,bytes*words);
@ -116,6 +137,10 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int recv_from_rank, int recv_from_rank,
int bytes, int dir) int bytes, int dir)
{ {
std::vector<CommsRequest_t> list;
// Discard the "dir"
SendToRecvFromBegin (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
SendToRecvFromComplete(list);
return 2.0*bytes; return 2.0*bytes;
} }
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
@ -125,10 +150,13 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
int recv_from_rank, int recv_from_rank,
int bytes, int dir) int bytes, int dir)
{ {
// Discard the "dir"
SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
return 2.0*bytes; return 2.0*bytes;
} }
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir) void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{ {
SendToRecvFromComplete(waitall);
} }
void CartesianCommunicator::StencilBarrier(void){}; void CartesianCommunicator::StencilBarrier(void){};

View File

@ -102,7 +102,7 @@ public:
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
static void SharedMemoryAllocate(uint64_t bytes, int flags); static void SharedMemoryAllocate(uint64_t bytes, int flags);
static void SharedMemoryFree(void); static void SharedMemoryFree(void);
static void SharedMemoryCopy(void *dest,void *src,size_t bytes); static void SharedMemoryCopy(void *dest,const void *src,size_t bytes);
static void SharedMemoryZero(void *dest,size_t bytes); static void SharedMemoryZero(void *dest,size_t bytes);
}; };

View File

@ -7,7 +7,6 @@
Copyright (C) 2015 Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk> Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -32,12 +31,6 @@ Author: Christoph Lehner <christoph@lhnr.de>
#ifdef GRID_CUDA #ifdef GRID_CUDA
#include <cuda_runtime_api.h> #include <cuda_runtime_api.h>
#endif
#ifdef GRID_HIP
#include <hip/hip_runtime_api.h>
#endif
#ifdef GRID_SYCl
#endif #endif
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
@ -73,7 +66,6 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
WorldNodes = WorldSize/WorldShmSize; WorldNodes = WorldSize/WorldShmSize;
assert( (WorldNodes * WorldShmSize) == WorldSize ); assert( (WorldNodes * WorldShmSize) == WorldSize );
// FIXME: Check all WorldShmSize are the same ? // FIXME: Check all WorldShmSize are the same ?
///////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////
@ -174,23 +166,6 @@ static inline int divides(int a,int b)
} }
void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims) void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
{ {
////////////////////////////////////////////////////////////////
// Allow user to configure through environment variable
////////////////////////////////////////////////////////////////
char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
if ( str ) {
std::vector<int> IntShmDims;
GridCmdOptionIntVector(std::string(str),IntShmDims);
assert(IntShmDims.size() == WorldDims.size());
long ShmSize = 1;
for (int dim=0;dim<WorldDims.size();dim++) {
ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
assert(divides(ShmDims[dim],WorldDims[dim]));
}
assert(ShmSize == WorldShmSize);
return;
}
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Powers of 2,3,5 only in prime decomposition for now // Powers of 2,3,5 only in prime decomposition for now
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
@ -450,47 +425,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended // Hugetlbfs mapping intended
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
#if defined(GRID_CUDA) ||defined(GRID_HIP) || defined(GRID_SYCL) #ifdef GRID_CUDA
//if defined(GRID_SYCL)
#if 0
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
void * ShmCommBuf ;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// allocate the pointer array for shared windows for our group
//////////////////////////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(WorldShmComm);
WorldShmCommBufs.resize(WorldShmSize);
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE);
}
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
SharedMemoryZero(ShmCommBuf,bytes);
assert(WorldShmSize == 1);
for(int r=0;r<WorldShmSize;r++){
WorldShmCommBufs[r] = ShmCommBuf;
}
_ShmAllocBytes=bytes;
_ShmAlloc=1;
}
#endif
#if defined(GRID_CUDA) ||defined(GRID_HIP) ||defined(GRID_SYCL)
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
void * ShmCommBuf ; void * ShmCommBuf ;
@ -513,17 +448,24 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer // Each MPI rank should allocate our own buffer
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
ShmCommBuf = acceleratorAllocDevice(bytes); #ifndef GRID_MPI3_SHM_NONE
auto err = cudaMalloc(&ShmCommBuf, bytes);
#else
auto err = cudaMallocManaged(&ShmCommBuf, bytes);
#endif
if ( err != cudaSuccess) {
std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
if (ShmCommBuf == (void *)NULL ) { if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl; std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
if ( WorldRank == 0 ){ if ( WorldRank == 0 ){
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
} }
SharedMemoryZero(ShmCommBuf,bytes); SharedMemoryZero(ShmCommBuf,bytes);
std::cout<< "Setting up IPC"<<std::endl;
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Loop over ranks/gpu's on our node // Loop over ranks/gpu's on our node
/////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -533,50 +475,15 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
////////////////////////////////////////////////// //////////////////////////////////////////////////
// If it is me, pass around the IPC access key // If it is me, pass around the IPC access key
////////////////////////////////////////////////// //////////////////////////////////////////////////
void * thisBuf = ShmCommBuf;
if(!Stencil_force_mpi) {
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
typedef struct { int fd; pid_t pid ; } clone_mem_t;
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
auto zeContext = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
ze_ipc_mem_handle_t ihandle;
clone_mem_t handle;
if ( r==WorldShmRank ) {
auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle);
if ( err != ZE_RESULT_SUCCESS ) {
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
}
memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
handle.pid = getpid();
}
#endif
#ifdef GRID_CUDA
cudaIpcMemHandle_t handle; cudaIpcMemHandle_t handle;
if ( r==WorldShmRank ) { if ( r==WorldShmRank ) {
auto err = cudaIpcGetMemHandle(&handle,ShmCommBuf); err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
if ( err != cudaSuccess) { if ( err != cudaSuccess) {
std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl; std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
} }
#endif
#ifdef GRID_HIP
hipIpcMemHandle_t handle;
if ( r==WorldShmRank ) {
auto err = hipIpcGetMemHandle(&handle,ShmCommBuf);
if ( err != hipSuccess) {
std::cerr << " SharedMemoryMPI.cc hipIpcGetMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
}
#endif
////////////////////////////////////////////////// //////////////////////////////////////////////////
// Share this IPC handle across the Shm Comm // Share this IPC handle across the Shm Comm
////////////////////////////////////////////////// //////////////////////////////////////////////////
@ -592,57 +499,17 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// If I am not the source, overwrite thisBuf with remote buffer // If I am not the source, overwrite thisBuf with remote buffer
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
void * thisBuf = ShmCommBuf;
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
if ( r!=WorldShmRank ) {
thisBuf = nullptr;
std::cout<<"mapping seeking remote pid/fd "
<<handle.pid<<"/"
<<handle.fd<<std::endl;
int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
// int myfd = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
int myfd = syscall(438,pidfd,handle.fd,0);
std::cout<<"Using IpcHandle myfd "<<myfd<<"\n";
memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
auto err = zeMemOpenIpcHandle(zeContext,zeDevice,ihandle,0,&thisBuf);
if ( err != ZE_RESULT_SUCCESS ) {
std::cout << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE);
} else {
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle pointer is "<<std::hex<<thisBuf<<std::dec<<std::endl;
}
assert(thisBuf!=nullptr);
}
#endif
#ifdef GRID_CUDA
if ( r!=WorldShmRank ) { if ( r!=WorldShmRank ) {
auto err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess); err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
if ( err != cudaSuccess) { if ( err != cudaSuccess) {
std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl; std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
} }
#endif
#ifdef GRID_HIP
if ( r!=WorldShmRank ) {
auto err = hipIpcOpenMemHandle(&thisBuf,handle,hipIpcMemLazyEnablePeerAccess);
if ( err != hipSuccess) {
std::cerr << " SharedMemoryMPI.cc hipIpcOpenMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
exit(EXIT_FAILURE);
}
}
#endif
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Save a copy of the device buffers // Save a copy of the device buffers
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
}
WorldShmCommBufs[r] = thisBuf; WorldShmCommBufs[r] = thisBuf;
#else #else
WorldShmCommBufs[r] = ShmCommBuf; WorldShmCommBufs[r] = ShmCommBuf;
@ -652,8 +519,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
_ShmAllocBytes=bytes; _ShmAllocBytes=bytes;
_ShmAlloc=1; _ShmAlloc=1;
} }
#endif
#else #else
#ifdef GRID_MPI3_SHMMMAP #ifdef GRID_MPI3_SHMMMAP
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
@ -781,6 +646,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#endif #endif
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0); void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
// std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
if ( ptr == (void * )MAP_FAILED ) { if ( ptr == (void * )MAP_FAILED ) {
perror("failed mmap"); perror("failed mmap");
assert(0); assert(0);
@ -824,16 +690,16 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes) void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{ {
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) #ifdef GRID_CUDA
acceleratorMemSet(dest,0,bytes); cudaMemset(dest,0,bytes);
#else #else
bzero(dest,bytes); bzero(dest,bytes);
#endif #endif
} }
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes) void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes)
{ {
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL) #ifdef GRID_CUDA
acceleratorCopyToDevice(src,dest,bytes); cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
#else #else
bcopy(src,dest,bytes); bcopy(src,dest,bytes);
#endif #endif
@ -886,18 +752,25 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r; std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r;
MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]); MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]);
#ifdef GRID_SHM_FORCE_MPI #ifdef GRID_IBM_SUMMIT
// Hide the shared memory path between ranks // Hide the shared memory path between sockets
{ // if even number of nodes
if ( (ShmSize & 0x1)==0 ) {
int SocketSize = ShmSize/2;
int mySocket = ShmRank/SocketSize;
for(int r=0;r<size;r++){ for(int r=0;r<size;r++){
if ( r!=rank ) { int hisRank=ShmRanks[r];
ShmRanks[r] = MPI_UNDEFINED; if ( hisRank!= MPI_UNDEFINED ) {
int hisSocket=hisRank/SocketSize;
if ( hisSocket != mySocket ) {
ShmRanks[r] = MPI_UNDEFINED;
}
} }
} }
} }
#endif #endif
//SharedMemoryTest(); SharedMemoryTest();
} }
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////
// On node barrier // On node barrier

View File

@ -29,7 +29,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
#define header "SharedMemoryNone: "
/*Construct from an MPI communicator*/ /*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm) void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
@ -56,38 +55,6 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
// Hugetlbfs mapping intended, use anonymous mmap // Hugetlbfs mapping intended, use anonymous mmap
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
#if 1
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
std::cout << header "SharedMemoryAllocate "<< bytes<< " GPU implementation "<<std::endl;
void * ShmCommBuf ;
assert(_ShmSetup==1);
assert(_ShmAlloc==0);
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryNone.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE);
}
if ( WorldRank == 0 ){
std::cout << WorldRank << header " SharedMemoryNone.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
}
SharedMemoryZero(ShmCommBuf,bytes);
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Loop over ranks/gpu's on our node
///////////////////////////////////////////////////////////////////////////////////////////////////////////
WorldShmCommBufs[0] = ShmCommBuf;
_ShmAllocBytes=bytes;
_ShmAlloc=1;
}
#else
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags) void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{ {
void * ShmCommBuf ; void * ShmCommBuf ;
@ -116,15 +83,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
_ShmAllocBytes=bytes; _ShmAllocBytes=bytes;
_ShmAlloc=1; _ShmAlloc=1;
}; };
#endif
void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
{
acceleratorMemSet(dest,0,bytes);
}
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
{
acceleratorCopyToDevice(src,dest,bytes);
}
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// Global shared functionality finished // Global shared functionality finished
// Now move to per communicator functionality // Now move to per communicator functionality

View File

@ -35,7 +35,7 @@ extern Vector<std::pair<int,int> > Cshift_table;
// Gather for when there is no need to SIMD split // Gather for when there is no need to SIMD split
/////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////
template<class vobj> void template<class vobj> void
Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0) Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
{ {
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
@ -73,19 +73,12 @@ Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dim
} }
} }
{ {
autoView(rhs_v , rhs, AcceleratorRead);
auto buffer_p = & buffer[0]; auto buffer_p = & buffer[0];
auto table = &Cshift_table[0]; auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(i,ent,vobj::Nsimd(),{ accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second])); coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
}); });
#else
autoView(rhs_v , rhs, CpuRead);
thread_for(i,ent,{
buffer_p[table[i].first]=rhs_v[table[i].second];
});
#endif
} }
} }
@ -110,36 +103,21 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
int n1=rhs.Grid()->_slice_stride[dimension]; int n1=rhs.Grid()->_slice_stride[dimension];
if ( cbmask ==0x3){ if ( cbmask ==0x3){
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead); autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(nn,e1*e2,1,{ accelerator_for2d(n,e1,b,e2,1,{
int n = nn%e1;
int b = nn/e1;
int o = n*n1; int o = n*n1;
int offset = b+n*e2; int offset = b+n*e2;
vobj temp =rhs_v[so+o+b]; vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset); extract<vobj>(temp,pointers,offset);
}); });
#else
autoView(rhs_v , rhs, CpuRead);
thread_for2d(n,e1,b,e2,{
int o = n*n1;
int offset = b+n*e2;
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
});
#endif
} else { } else {
autoView(rhs_v , rhs, AcceleratorRead);
Coordinate rdim=rhs.Grid()->_rdimensions; Coordinate rdim=rhs.Grid()->_rdimensions;
Coordinate cdm =rhs.Grid()->_checker_dim_mask; Coordinate cdm =rhs.Grid()->_checker_dim_mask;
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb? std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
#ifdef ACCELERATOR_CSHIFT accelerator_for2d(n,e1,b,e2,1,{
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(nn,e1*e2,1,{
int n = nn%e1;
int b = nn/e1;
Coordinate coor; Coordinate coor;
@ -156,33 +134,13 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
extract<vobj>(temp,pointers,offset); extract<vobj>(temp,pointers,offset);
} }
}); });
#else
autoView(rhs_v , rhs, CpuRead);
thread_for2d(n,e1,b,e2,{
Coordinate coor;
int o=n*n1;
int oindex = o+b;
int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
int ocb=1<<cb;
int offset = b+n*e2;
if ( ocb & cbmask ) {
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
}
});
#endif
} }
} }
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
// Scatter for when there is no need to SIMD split // Scatter for when there is no need to SIMD split
////////////////////////////////////////////////////// //////////////////////////////////////////////////////
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask) template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
{ {
int rd = rhs.Grid()->_rdimensions[dimension]; int rd = rhs.Grid()->_rdimensions[dimension];
@ -224,19 +182,12 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<
} }
{ {
autoView( rhs_v, rhs, AcceleratorWrite);
auto buffer_p = & buffer[0]; auto buffer_p = & buffer[0];
auto table = &Cshift_table[0]; auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v, rhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{ accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second])); coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
}); });
#else
autoView( rhs_v, rhs, CpuWrite);
thread_for(i,ent,{
rhs_v[table[i].first]=buffer_p[table[i].second];
});
#endif
} }
} }
@ -257,30 +208,19 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
int e2=rhs.Grid()->_slice_block[dimension]; int e2=rhs.Grid()->_slice_block[dimension];
if(cbmask ==0x3 ) { if(cbmask ==0x3 ) {
autoView( rhs_v , rhs, AcceleratorWrite);
int _slice_stride = rhs.Grid()->_slice_stride[dimension]; int _slice_stride = rhs.Grid()->_slice_stride[dimension];
int _slice_block = rhs.Grid()->_slice_block[dimension]; int _slice_block = rhs.Grid()->_slice_block[dimension];
#ifdef ACCELERATOR_CSHIFT accelerator_for2d(n,e1,b,e2,1,{
autoView( rhs_v , rhs, AcceleratorWrite);
accelerator_for(nn,e1*e2,1,{
int n = nn%e1;
int b = nn/e1;
int o = n*_slice_stride; int o = n*_slice_stride;
int offset = b+n*_slice_block; int offset = b+n*_slice_block;
merge(rhs_v[so+o+b],pointers,offset); merge(rhs_v[so+o+b],pointers,offset);
}); });
#else
autoView( rhs_v , rhs, CpuWrite);
thread_for2d(n,e1,b,e2,{
int o = n*_slice_stride;
int offset = b+n*_slice_block;
merge(rhs_v[so+o+b],pointers,offset);
});
#endif
} else { } else {
// Case of SIMD split AND checker dim cannot currently be hit, except in // Case of SIMD split AND checker dim cannot currently be hit, except in
// Test_cshift_red_black code. // Test_cshift_red_black code.
std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME // std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl; std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
assert(0); // This will fail if hit on GPU assert(0); // This will fail if hit on GPU
autoView( rhs_v, rhs, CpuWrite); autoView( rhs_v, rhs, CpuWrite);
@ -340,20 +280,12 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
} }
{ {
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead); autoView(rhs_v , rhs, AcceleratorRead);
autoView(lhs_v , lhs, AcceleratorWrite); autoView(lhs_v , lhs, AcceleratorWrite);
auto table = &Cshift_table[0];
accelerator_for(i,ent,vobj::Nsimd(),{ accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second])); coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
}); });
#else
autoView(rhs_v , rhs, CpuRead);
autoView(lhs_v , lhs, CpuWrite);
thread_for(i,ent,{
lhs_v[table[i].first]=rhs_v[table[i].second];
});
#endif
} }
} }
@ -392,20 +324,12 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
} }
{ {
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v, rhs, AcceleratorRead); autoView( rhs_v, rhs, AcceleratorRead);
autoView( lhs_v, lhs, AcceleratorWrite); autoView( lhs_v, lhs, AcceleratorWrite);
auto table = &Cshift_table[0];
accelerator_for(i,ent,1,{ accelerator_for(i,ent,1,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type); permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
}); });
#else
autoView( rhs_v, rhs, CpuRead);
autoView( lhs_v, lhs, CpuWrite);
thread_for(i,ent,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
#endif
} }
} }

View File

@ -101,8 +101,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
} }
} }
#define ACCELERATOR_CSHIFT_NO_COPY
#ifdef ACCELERATOR_CSHIFT_NO_COPY
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{ {
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
@ -122,9 +121,9 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
assert(shift<fd); assert(shift<fd);
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static cshiftVector<vobj> send_buf; send_buf.resize(buffer_size); commVector<vobj> send_buf(buffer_size);
static cshiftVector<vobj> recv_buf; recv_buf.resize(buffer_size); commVector<vobj> recv_buf(buffer_size);
int cb= (cbmask==0x2)? Odd : Even; int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
@ -139,7 +138,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
} else { } else {
int words = buffer_size; int words = send_buf.size();
if (cbmask != 0x3) words=words>>1; if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj); int bytes = words * sizeof(vobj);
@ -151,14 +150,12 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int xmit_to_rank; int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
grid->Barrier();
grid->SendToRecvFrom((void *)&send_buf[0], grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank, xmit_to_rank,
(void *)&recv_buf[0], (void *)&recv_buf[0],
recv_from_rank, recv_from_rank,
bytes); bytes);
grid->Barrier(); grid->Barrier();
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask); Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
@ -198,15 +195,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type); // int words = sizeof(vobj)/sizeof(vector_type);
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd); std::vector<commVector<scalar_object> > send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd); std::vector<commVector<scalar_object> > recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
int bytes = buffer_size*sizeof(scalar_object); int bytes = buffer_size*sizeof(scalar_object);
@ -252,204 +242,11 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
if(nbr_proc){ if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
grid->Barrier(); grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0],
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
recv_buf_extract_mpi = &recv_buf_extract[i][0];
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank, xmit_to_rank,
(void *)recv_buf_extract_mpi, (void *)&recv_buf_extract[i][0],
recv_from_rank, recv_from_rank,
bytes); bytes);
grid->Barrier();
rpointers[i] = &recv_buf_extract[i][0];
} else {
rpointers[i] = &send_buf_extract[nbr_lane][0];
}
}
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
}
}
#else
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid=rhs.Grid();
Lattice<vobj> temp(rhs.Grid());
int fd = rhs.Grid()->_fdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
int pd = rhs.Grid()->_processors[dimension];
int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
vobj *send_buf;
vobj *recv_buf;
{
grid->ShmBufferFreeAll();
size_t bytes = buffer_size*sizeof(vobj);
send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
}
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) {
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
} else {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
// int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
grid->Barrier();
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
}
}
}
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GridBase *grid=rhs.Grid();
const int Nsimd = grid->Nsimd();
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
int pd = grid->_processors[dimension];
int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ;
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
int permute_type=grid->PermuteType(dimension);
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type);
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
{
size_t bytes = sizeof(scalar_object)*buffer_size;
grid->ShmBufferFreeAll();
send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
}
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
int bytes = buffer_size*sizeof(scalar_object);
ExtractPointerArray<scalar_object> pointers(Nsimd); //
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? Odd : Even;
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
// loop over outer coord planes orthog to dim
for(int x=0;x<rd;x++){
// FIXME call local permute copy if none are offnode.
for(int i=0;i<Nsimd;i++){
pointers[i] = &send_buf_extract[i][0];
}
int sx = (x+sshift)%rd;
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
for(int i=0;i<Nsimd;i++){
int inner_bit = (Nsimd>>(permute_type+1));
int ic= (i&inner_bit)? 1:0;
int my_coor = rd*ic + x;
int nbr_coor = my_coor+sshift;
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
int nbr_ox = (nbr_coor%rd); // outer coord of peer
int nbr_lane = (i&(~inner_bit));
int recv_from_rank;
int xmit_to_rank;
if (nbr_ic) nbr_lane|=inner_bit;
assert (sx == nbr_ox);
if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank,
(void *)recv_buf_extract_mpi,
recv_from_rank,
bytes);
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
grid->Barrier(); grid->Barrier();
rpointers[i] = &recv_buf_extract[i][0]; rpointers[i] = &recv_buf_extract[i][0];
} else { } else {
@ -461,7 +258,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
} }
} }
#endif
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif #endif

View File

@ -46,4 +46,3 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_unary.h> #include <Grid/lattice/Lattice_unary.h>
#include <Grid/lattice/Lattice_transfer.h> #include <Grid/lattice/Lattice_transfer.h>
#include <Grid/lattice/Lattice_basis.h> #include <Grid/lattice/Lattice_basis.h>
#include <Grid/lattice/Lattice_crc.h>

View File

@ -342,14 +342,19 @@ inline void ExpressionViewClose(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
GridUnopClass(UnarySub, -a); GridUnopClass(UnarySub, -a);
GridUnopClass(UnaryNot, Not(a)); GridUnopClass(UnaryNot, Not(a));
GridUnopClass(UnaryAdj, adj(a));
GridUnopClass(UnaryConj, conjugate(a));
GridUnopClass(UnaryTrace, trace(a)); GridUnopClass(UnaryTrace, trace(a));
GridUnopClass(UnaryTranspose, transpose(a)); GridUnopClass(UnaryTranspose, transpose(a));
GridUnopClass(UnaryTa, Ta(a)); GridUnopClass(UnaryTa, Ta(a));
GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a)); GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
GridUnopClass(UnaryToReal, toReal(a));
GridUnopClass(UnaryToComplex, toComplex(a));
GridUnopClass(UnaryTimesI, timesI(a)); GridUnopClass(UnaryTimesI, timesI(a));
GridUnopClass(UnaryTimesMinusI, timesMinusI(a)); GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
GridUnopClass(UnaryAbs, abs(a)); GridUnopClass(UnaryAbs, abs(a));
GridUnopClass(UnarySqrt, sqrt(a)); GridUnopClass(UnarySqrt, sqrt(a));
GridUnopClass(UnaryRsqrt, rsqrt(a));
GridUnopClass(UnarySin, sin(a)); GridUnopClass(UnarySin, sin(a));
GridUnopClass(UnaryCos, cos(a)); GridUnopClass(UnaryCos, cos(a));
GridUnopClass(UnaryAsin, asin(a)); GridUnopClass(UnaryAsin, asin(a));
@ -451,17 +456,20 @@ GridTrinOpClass(TrinaryWhere,
GRID_DEF_UNOP(operator-, UnarySub); GRID_DEF_UNOP(operator-, UnarySub);
GRID_DEF_UNOP(Not, UnaryNot); GRID_DEF_UNOP(Not, UnaryNot);
GRID_DEF_UNOP(operator!, UnaryNot); GRID_DEF_UNOP(operator!, UnaryNot);
//GRID_DEF_UNOP(adj, UnaryAdj); GRID_DEF_UNOP(adj, UnaryAdj);
//GRID_DEF_UNOP(conjugate, UnaryConj); GRID_DEF_UNOP(conjugate, UnaryConj);
GRID_DEF_UNOP(trace, UnaryTrace); GRID_DEF_UNOP(trace, UnaryTrace);
GRID_DEF_UNOP(transpose, UnaryTranspose); GRID_DEF_UNOP(transpose, UnaryTranspose);
GRID_DEF_UNOP(Ta, UnaryTa); GRID_DEF_UNOP(Ta, UnaryTa);
GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup); GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
GRID_DEF_UNOP(toReal, UnaryToReal);
GRID_DEF_UNOP(toComplex, UnaryToComplex);
GRID_DEF_UNOP(timesI, UnaryTimesI); GRID_DEF_UNOP(timesI, UnaryTimesI);
GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI); GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
GRID_DEF_UNOP(abs, UnaryAbs); // abs overloaded in cmath C++98; DON'T do the GRID_DEF_UNOP(abs, UnaryAbs); // abs overloaded in cmath C++98; DON'T do the
// abs-fabs-dabs-labs thing // abs-fabs-dabs-labs thing
GRID_DEF_UNOP(sqrt, UnarySqrt); GRID_DEF_UNOP(sqrt, UnarySqrt);
GRID_DEF_UNOP(rsqrt, UnaryRsqrt);
GRID_DEF_UNOP(sin, UnarySin); GRID_DEF_UNOP(sin, UnarySin);
GRID_DEF_UNOP(cos, UnaryCos); GRID_DEF_UNOP(cos, UnaryCos);
GRID_DEF_UNOP(asin, UnaryAsin); GRID_DEF_UNOP(asin, UnaryAsin);
@ -486,27 +494,27 @@ GRID_DEF_TRINOP(where, TrinaryWhere);
///////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////
template <class Op, class T1> template <class Op, class T1>
auto closure(const LatticeUnaryExpression<Op, T1> &expr) auto closure(const LatticeUnaryExpression<Op, T1> &expr)
-> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1)))>::type > -> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1)))>
{ {
Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1)))>::type > ret(expr); Lattice<decltype(expr.op.func(vecEval(0, expr.arg1)))> ret(expr);
return ret; return ret;
} }
template <class Op, class T1, class T2> template <class Op, class T1, class T2>
auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr) auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
-> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))>::type > -> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))>
{ {
Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))>::type > ret(expr); Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))> ret(expr);
return ret; return ret;
} }
template <class Op, class T1, class T2, class T3> template <class Op, class T1, class T2, class T3>
auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr) auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1), -> Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),
vecEval(0, expr.arg2), vecEval(0, expr.arg2),
vecEval(0, expr.arg3)))>::type > vecEval(0, expr.arg3)))>
{ {
Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1), Lattice<decltype(expr.op.func(vecEval(0, expr.arg1),
vecEval(0, expr.arg2), vecEval(0, expr.arg2),
vecEval(0, expr.arg3)))>::type > ret(expr); vecEval(0, expr.arg3)))> ret(expr);
return ret; return ret;
} }
#define EXPRESSION_CLOSURE(function) \ #define EXPRESSION_CLOSURE(function) \

View File

@ -60,9 +60,9 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
autoView( lhs_v , lhs, AcceleratorRead); autoView( lhs_v , lhs, AcceleratorRead);
autoView( rhs_v , rhs, AcceleratorRead); autoView( rhs_v , rhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
auto rhs_t=rhs_v(ss); auto rhs_t=rhs_v(ss);
auto tmp =ret_v(ss);
mac(&tmp,&lhs_t,&rhs_t); mac(&tmp,&lhs_t,&rhs_t);
coalescedWrite(ret_v[ss],tmp); coalescedWrite(ret_v[ss],tmp);
}); });
@ -124,7 +124,7 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
autoView( ret_v , ret, AcceleratorWrite); autoView( ret_v , ret, AcceleratorWrite);
autoView( lhs_v , lhs, AcceleratorRead); autoView( lhs_v , lhs, AcceleratorRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
auto tmp =ret_v(ss); decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss); auto lhs_t=lhs_v(ss);
mac(&tmp,&lhs_t,&rhs); mac(&tmp,&lhs_t,&rhs);
coalescedWrite(ret_v[ss],tmp); coalescedWrite(ret_v[ss],tmp);
@ -182,7 +182,7 @@ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
autoView( ret_v , ret, AcceleratorWrite); autoView( ret_v , ret, AcceleratorWrite);
autoView( rhs_v , lhs, AcceleratorRead); autoView( rhs_v , lhs, AcceleratorRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{ accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
auto tmp =ret_v(ss); decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss); auto rhs_t=rhs_v(ss);
mac(&tmp,&lhs,&rhs_t); mac(&tmp,&lhs,&rhs_t);
coalescedWrite(ret_v[ss],tmp); coalescedWrite(ret_v[ss],tmp);
@ -225,7 +225,7 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
autoView( x_v , x, AcceleratorRead); autoView( x_v , x, AcceleratorRead);
autoView( y_v , y, AcceleratorRead); autoView( y_v , y, AcceleratorRead);
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{ accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
auto tmp = a*coalescedRead(x_v[ss])+coalescedRead(y_v[ss]); auto tmp = a*x_v(ss)+y_v(ss);
coalescedWrite(ret_v[ss],tmp); coalescedWrite(ret_v[ss],tmp);
}); });
} }

View File

@ -88,13 +88,6 @@ public:
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode); LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
accessor.ViewClose(); accessor.ViewClose();
} }
// Helper function to print the state of this object in the AccCache
void PrintCacheState(void)
{
MemoryManager::PrintState(this->_odata);
}
///////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////
// Return a view object that may be dereferenced in site loops. // Return a view object that may be dereferenced in site loops.
// The view is trivially copy constructible and may be copied to an accelerator device // The view is trivially copy constructible and may be copied to an accelerator device

View File

@ -62,7 +62,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
basis_v.push_back(basis[k].View(AcceleratorWrite)); basis_v.push_back(basis[k].View(AcceleratorWrite));
} }
#if ( (!defined(GRID_CUDA)) ) #if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) )
int max_threads = thread_max(); int max_threads = thread_max();
Vector < vobj > Bt(Nm * max_threads); Vector < vobj > Bt(Nm * max_threads);
thread_region thread_region
@ -125,7 +125,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
for(int k=k0; k<k1; ++k){ for(int k=k0; k<k1; ++k){
auto tmp = coalescedRead(Bp[ss*nrot+j]); auto tmp = coalescedRead(Bp[ss*nrot+j]);
coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_vp[k][sss])); coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_v[k][sss]));
} }
}); });
@ -134,7 +134,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
int jj =j0+j; int jj =j0+j;
int ss =sj/nrot; int ss =sj/nrot;
int sss=ss+s; int sss=ss+s;
coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j])); coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
}); });
} }
#endif #endif
@ -161,13 +161,11 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
double * Qt_j = & Qt_jv[0]; double * Qt_j = & Qt_jv[0];
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k); for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
auto basis_vp=& basis_v[0];
autoView(result_v,result,AcceleratorWrite); autoView(result_v,result,AcceleratorWrite);
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{ accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
vobj zzz=Zero(); auto B=coalescedRead(zz);
auto B=coalescedRead(zzz);
for(int k=k0; k<k1; ++k){ for(int k=k0; k<k1; ++k){
B +=Qt_j[k] * coalescedRead(basis_vp[k][ss]); B +=Qt_j[k] * coalescedRead(basis_v[k][ss]);
} }
coalescedWrite(result_v[ss], B); coalescedWrite(result_v[ss], B);
}); });

View File

@ -1,42 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_crc.h
Copyright (C) 2021
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
template<class vobj> uint32_t crc(Lattice<vobj> & buf)
{
autoView( buf_v , buf, CpuRead);
return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites());
}
#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
NAMESPACE_END(Grid);

View File

@ -45,8 +45,8 @@ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
autoView( ret_v, ret, AcceleratorWrite); autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard()=lhs.Checkerboard(); ret.Checkerboard()=lhs.Checkerboard();
accelerator_for( ss, lhs_v.size(), 1, { accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
ret_v[ss] = adj(lhs_v[ss]); coalescedWrite(ret_v[ss], adj(lhs_v(ss)));
}); });
return ret; return ret;
}; };
@ -64,53 +64,6 @@ template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
return ret; return ret;
}; };
template<class vobj> inline Lattice<typename vobj::Complexified> toComplex(const Lattice<vobj> &lhs){
Lattice<typename vobj::Complexified> ret(lhs.Grid());
autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard() = lhs.Checkerboard();
accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] = toComplex(lhs_v[ss]);
});
return ret;
};
template<class vobj> inline Lattice<typename vobj::Realified> toReal(const Lattice<vobj> &lhs){
Lattice<typename vobj::Realified> ret(lhs.Grid());
autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard() = lhs.Checkerboard();
accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] = toReal(lhs_v[ss]);
});
return ret;
};
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto toComplex(const Expression &expr) -> decltype(closure(expr))
{
return toComplex(closure(expr));
}
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto toReal(const Expression &expr) -> decltype(closure(expr))
{
return toReal(closure(expr));
}
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto adj(const Expression &expr) -> decltype(closure(expr))
{
return adj(closure(expr));
}
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto conjugate(const Expression &expr) -> decltype(closure(expr))
{
return conjugate(closure(expr));
}
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif #endif

View File

@ -96,34 +96,8 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
ssobj ret = ssum; ssobj ret = ssum;
return ret; return ret;
} }
/*
Threaded max, don't use for now
template<class Double>
inline Double max(const Double *arg, Integer osites)
{
// const int Nsimd = vobj::Nsimd();
const int nthread = GridThread::GetThreads();
std::vector<Double> maxarray(nthread);
thread_for(thr,nthread, {
int nwork, mywork, myoff;
nwork = osites;
GridThread::GetWork(nwork,thr,mywork,myoff);
Double max=arg[0];
for(int ss=myoff;ss<mywork+myoff; ss++){
if( arg[ss] > max ) max = arg[ss];
}
maxarray[thr]=max;
});
Double tmax=maxarray[0];
for(int i=0;i<nthread;i++){
if (maxarray[i]>tmax) tmax = maxarray[i];
}
return tmax;
}
*/
template<class vobj> template<class vobj>
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites) inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
{ {
@ -167,32 +141,6 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
return real(nrm); return real(nrm);
} }
//The global maximum of the site norm2
template<class vobj> inline RealD maxLocalNorm2(const Lattice<vobj> &arg)
{
typedef typename vobj::tensor_reduced vscalar; //iScalar<iScalar<.... <vPODtype> > >
typedef typename vscalar::scalar_object scalar; //iScalar<iScalar<.... <PODtype> > >
Lattice<vscalar> inner = localNorm2(arg);
auto grid = arg.Grid();
RealD max;
for(int l=0;l<grid->lSites();l++){
Coordinate coor;
scalar val;
RealD r;
grid->LocalIndexToLocalCoor(l,coor);
peekLocalSite(val,inner,coor);
r=real(TensorRemove(val));
if( (l==0) || (r>max)){
max=r;
}
}
grid->GlobalMax(max);
return max;
}
// Double inner product // Double inner product
template<class vobj> template<class vobj>
inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
@ -361,7 +309,6 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
// But easily avoided by using double precision fields // But easily avoided by using double precision fields
/////////////////////////////////////////////////////// ///////////////////////////////////////////////////////
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_object::scalar_type scalar_type;
GridBase *grid = Data.Grid(); GridBase *grid = Data.Grid();
assert(grid!=NULL); assert(grid!=NULL);
@ -420,19 +367,20 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
} }
// sum over nodes. // sum over nodes.
sobj gsum;
for(int t=0;t<fd;t++){ for(int t=0;t<fd;t++){
int pt = t/ld; // processor plane int pt = t/ld; // processor plane
int lt = t%ld; int lt = t%ld;
if ( pt == grid->_processor_coor[orthogdim] ) { if ( pt == grid->_processor_coor[orthogdim] ) {
result[t]=lsSum[lt]; gsum=lsSum[lt];
} else { } else {
result[t]=Zero(); gsum=Zero();
} }
grid->GlobalSum(gsum);
result[t]=gsum;
} }
scalar_type * ptr = (scalar_type *) &result[0];
int words = fd*sizeof(sobj)/sizeof(scalar_type);
grid->GlobalSumVector(ptr, words);
} }
template<class vobj> template<class vobj>

View File

@ -2,13 +2,12 @@ NAMESPACE_BEGIN(Grid);
#ifdef GRID_HIP #ifdef GRID_HIP
extern hipDeviceProp_t *gpu_props; extern hipDeviceProp_t *gpu_props;
#define WARP_SIZE 64
#endif #endif
#ifdef GRID_CUDA #ifdef GRID_CUDA
extern cudaDeviceProp *gpu_props; extern cudaDeviceProp *gpu_props;
#define WARP_SIZE 32
#endif #endif
#define WARP_SIZE 32
__device__ unsigned int retirementCount = 0; __device__ unsigned int retirementCount = 0;
template <class Iterator> template <class Iterator>
@ -42,6 +41,7 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
std::cout << GridLogDebug << "\twarpSize = " << warpSize << std::endl; std::cout << GridLogDebug << "\twarpSize = " << warpSize << std::endl;
std::cout << GridLogDebug << "\tsharedMemPerBlock = " << sharedMemPerBlock << std::endl; std::cout << GridLogDebug << "\tsharedMemPerBlock = " << sharedMemPerBlock << std::endl;
std::cout << GridLogDebug << "\tmaxThreadsPerBlock = " << maxThreadsPerBlock << std::endl; std::cout << GridLogDebug << "\tmaxThreadsPerBlock = " << maxThreadsPerBlock << std::endl;
std::cout << GridLogDebug << "\tmaxThreadsPerBlock = " << warpSize << std::endl;
std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl; std::cout << GridLogDebug << "\tmultiProcessorCount = " << multiProcessorCount << std::endl;
if (warpSize != WARP_SIZE) { if (warpSize != WARP_SIZE) {
@ -51,10 +51,6 @@ void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator
// let the number of threads in a block be a multiple of 2, starting from warpSize // let the number of threads in a block be a multiple of 2, starting from warpSize
threads = warpSize; threads = warpSize;
if ( threads*sizeofsobj > sharedMemPerBlock ) {
std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
exit(EXIT_FAILURE);
}
while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2; while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
// keep all the streaming multiprocessors busy // keep all the streaming multiprocessors busy
blocks = nextPow2(multiProcessorCount); blocks = nextPow2(multiProcessorCount);
@ -68,7 +64,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
// cannot use overloaded operators for sobj as they are not volatile-qualified // cannot use overloaded operators for sobj as they are not volatile-qualified
memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj)); memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj));
acceleratorSynchronise(); __syncwarp();
const Iterator VEC = WARP_SIZE; const Iterator VEC = WARP_SIZE;
const Iterator vid = tid & (VEC-1); const Iterator vid = tid & (VEC-1);
@ -82,9 +78,9 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
beta += temp; beta += temp;
memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj)); memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj));
} }
acceleratorSynchronise(); __syncwarp();
} }
acceleratorSynchroniseAll(); __syncthreads();
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
beta = Zero(); beta = Zero();
@ -94,7 +90,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
} }
memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj)); memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj));
} }
acceleratorSynchroniseAll(); __syncthreads();
} }

View File

@ -32,9 +32,8 @@
#include <random> #include <random>
#ifdef RNG_SITMO #ifdef RNG_SITMO
#include <Grid/random/sitmo_prng_engine.hpp> #include <Grid/sitmo_rng/sitmo_prng_engine.hpp>
#endif #endif
#include <Grid/random/gaussian.h>
#if defined(RNG_SITMO) #if defined(RNG_SITMO)
#define RNG_FAST_DISCARD #define RNG_FAST_DISCARD
@ -143,8 +142,8 @@ public:
std::vector<RngEngine> _generators; std::vector<RngEngine> _generators;
std::vector<std::uniform_real_distribution<RealD> > _uniform; std::vector<std::uniform_real_distribution<RealD> > _uniform;
std::vector<Grid::gaussian_distribution<RealD> > _gaussian; std::vector<std::normal_distribution<RealD> > _gaussian;
// std::vector<std::discrete_distribution<int32_t> > _bernoulli; std::vector<std::discrete_distribution<int32_t> > _bernoulli;
std::vector<std::uniform_int_distribution<uint32_t> > _uid; std::vector<std::uniform_int_distribution<uint32_t> > _uid;
/////////////////////// ///////////////////////
@ -244,8 +243,8 @@ public:
GridSerialRNG() : GridRNGbase() { GridSerialRNG() : GridRNGbase() {
_generators.resize(1); _generators.resize(1);
_uniform.resize(1,std::uniform_real_distribution<RealD>{0,1}); _uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
_gaussian.resize(1,gaussian_distribution<RealD>(0.0,1.0) ); _gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
// _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1}); _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
_uid.resize(1,std::uniform_int_distribution<uint32_t>() ); _uid.resize(1,std::uniform_int_distribution<uint32_t>() );
} }
@ -358,8 +357,8 @@ public:
_generators.resize(_vol); _generators.resize(_vol);
_uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1}); _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
_gaussian.resize(_vol,gaussian_distribution<RealD>(0.0,1.0) ); _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
// _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1}); _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
_uid.resize(_vol,std::uniform_int_distribution<uint32_t>() ); _uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
} }
@ -516,11 +515,11 @@ public:
template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._uniform); } template <class vobj> inline void random(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._uniform); }
template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._gaussian); } template <class vobj> inline void gaussian(GridParallelRNG &rng,Lattice<vobj> &l) { rng.fill(l,rng._gaussian); }
//template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);} template <class vobj> inline void bernoulli(GridParallelRNG &rng,Lattice<vobj> &l){ rng.fill(l,rng._bernoulli);}
template <class sobj> inline void random(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._uniform ); } template <class sobj> inline void random(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._uniform ); }
template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); } template <class sobj> inline void gaussian(GridSerialRNG &rng,sobj &l) { rng.fill(l,rng._gaussian ); }
//template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); } template <class sobj> inline void bernoulli(GridSerialRNG &rng,sobj &l){ rng.fill(l,rng._bernoulli); }
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif #endif

View File

@ -85,76 +85,6 @@ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Latti
}); });
} }
template<class vobj> inline void acceleratorPickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full, int checker_dim_half=0)
{
half.Checkerboard() = cb;
autoView(half_v, half, AcceleratorWrite);
autoView(full_v, full, AcceleratorRead);
Coordinate rdim_full = full.Grid()->_rdimensions;
Coordinate rdim_half = half.Grid()->_rdimensions;
unsigned long ndim_half = half.Grid()->_ndimension;
Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
Coordinate ostride_half = half.Grid()->_ostride;
accelerator_for(ss, full.Grid()->oSites(),full.Grid()->Nsimd(),{
Coordinate coor;
int cbos;
int linear=0;
Lexicographic::CoorFromIndex(coor,ss,rdim_full);
assert(coor.size()==ndim_half);
for(int d=0;d<ndim_half;d++){
if(checker_dim_mask_half[d]) linear += coor[d];
}
cbos = (linear&0x1);
if (cbos==cb) {
int ssh=0;
for(int d=0;d<ndim_half;d++) {
if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
}
coalescedWrite(half_v[ssh],full_v(ss));
}
});
}
template<class vobj> inline void acceleratorSetCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half, int checker_dim_half=0)
{
int cb = half.Checkerboard();
autoView(half_v , half, AcceleratorRead);
autoView(full_v , full, AcceleratorWrite);
Coordinate rdim_full = full.Grid()->_rdimensions;
Coordinate rdim_half = half.Grid()->_rdimensions;
unsigned long ndim_half = half.Grid()->_ndimension;
Coordinate checker_dim_mask_half = half.Grid()->_checker_dim_mask;
Coordinate ostride_half = half.Grid()->_ostride;
accelerator_for(ss,full.Grid()->oSites(),full.Grid()->Nsimd(),{
Coordinate coor;
int cbos;
int linear=0;
Lexicographic::CoorFromIndex(coor,ss,rdim_full);
assert(coor.size()==ndim_half);
for(int d=0;d<ndim_half;d++){
if(checker_dim_mask_half[d]) linear += coor[d];
}
cbos = (linear&0x1);
if (cbos==cb) {
int ssh=0;
for(int d=0;d<ndim_half;d++){
if (d == checker_dim_half) ssh += ostride_half[d] * ((coor[d] / 2) % rdim_half[d]);
else ssh += ostride_half[d] * (coor[d] % rdim_half[d]);
}
coalescedWrite(full_v[ss],half_v(ssh));
}
});
}
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
// Flexible Type Conversion for internal promotion to double as well as graceful // Flexible Type Conversion for internal promotion to double as well as graceful
// treatment of scalar-compatible types // treatment of scalar-compatible types
@ -167,20 +97,6 @@ accelerator_inline void convertType(ComplexF & out, const std::complex<float> &
out = in; out = in;
} }
template<typename T>
accelerator_inline EnableIf<isGridFundamental<T>> convertType(T & out, const T & in) {
out = in;
}
// This would allow for conversions between GridFundamental types, but is not strictly needed as yet
/*template<typename T1, typename T2>
accelerator_inline typename std::enable_if<isGridFundamental<T1>::value && isGridFundamental<T2>::value>::type
// Or to make this very broad, conversions between anything that's not a GridTensor could be allowed
//accelerator_inline typename std::enable_if<!isGridTensor<T1>::value && !isGridTensor<T2>::value>::type
convertType(T1 & out, const T2 & in) {
out = in;
}*/
#ifdef GRID_SIMT #ifdef GRID_SIMT
accelerator_inline void convertType(vComplexF & out, const ComplexF & in) { accelerator_inline void convertType(vComplexF & out, const ComplexF & in) {
((ComplexF*)&out)[acceleratorSIMTlane(vComplexF::Nsimd())] = in; ((ComplexF*)&out)[acceleratorSIMTlane(vComplexF::Nsimd())] = in;
@ -201,18 +117,18 @@ accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) {
Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v); Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v);
} }
template<typename T1,typename T2> template<typename T1,typename T2,int N>
accelerator_inline void convertType(iScalar<T1> & out, const iScalar<T2> & in) { accelerator_inline void convertType(iMatrix<T1,N> & out, const iMatrix<T2,N> & in);
convertType(out._internal,in._internal); template<typename T1,typename T2,int N>
} accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & in);
template<typename T1,typename T2> template<typename T1,typename T2, typename std::enable_if<!isGridScalar<T1>::value, T1>::type* = nullptr>
accelerator_inline NotEnableIf<isGridScalar<T1>> convertType(T1 & out, const iScalar<T2> & in) { accelerator_inline void convertType(T1 & out, const iScalar<T2> & in) {
convertType(out,in._internal); convertType(out,in._internal);
} }
template<typename T1,typename T2> template<typename T1,typename T2>
accelerator_inline NotEnableIf<isGridScalar<T2>> convertType(iScalar<T1> & out, const T2 & in) { accelerator_inline void convertType(iScalar<T1> & out, const T2 & in) {
convertType(out._internal,in); convertType(out._internal,in);
} }
@ -229,6 +145,11 @@ accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & i
convertType(out._internal[i],in._internal[i]); convertType(out._internal[i],in._internal[i]);
} }
template<typename T, typename std::enable_if<isGridFundamental<T>::value, T>::type* = nullptr>
accelerator_inline void convertType(T & out, const T & in) {
out = in;
}
template<typename T1,typename T2> template<typename T1,typename T2>
accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) { accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
autoView( out_v , out,AcceleratorWrite); autoView( out_v , out,AcceleratorWrite);
@ -434,22 +355,16 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
autoView( coarseData_ , coarseData, AcceleratorWrite); autoView( coarseData_ , coarseData, AcceleratorWrite);
autoView( fineData_ , fineData, AcceleratorRead); autoView( fineData_ , fineData, AcceleratorRead);
auto coarseData_p = &coarseData_[0];
auto fineData_p = &fineData_[0];
Coordinate fine_rdimensions = fine->_rdimensions; Coordinate fine_rdimensions = fine->_rdimensions;
Coordinate coarse_rdimensions = coarse->_rdimensions; Coordinate coarse_rdimensions = coarse->_rdimensions;
vobj zz = Zero();
accelerator_for(sc,coarse->oSites(),1,{ accelerator_for(sc,coarse->oSites(),1,{
// One thread per sub block // One thread per sub block
Coordinate coor_c(_ndimension); Coordinate coor_c(_ndimension);
Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate
coarseData_[sc]=Zero();
vobj cd = zz;
for(int sb=0;sb<blockVol;sb++){ for(int sb=0;sb<blockVol;sb++){
int sf; int sf;
@ -459,11 +374,9 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d]; for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions); Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions);
cd=cd+fineData_p[sf]; coarseData_[sc]=coarseData_[sc]+fineData_[sf];
} }
coarseData_p[sc] = cd;
}); });
return; return;
} }
@ -855,7 +768,7 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
template<class vobj> template<class vobj>
void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine) void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
@ -1080,95 +993,53 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
}); });
} }
//The workspace for a precision change operation allowing for the reuse of the mapping to save time on subsequent calls
class precisionChangeWorkspace{
std::pair<Integer,Integer>* fmap_device; //device pointer
public:
precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid){
//Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device
assert(out_grid->Nd() == in_grid->Nd());
for(int d=0;d<out_grid->Nd();d++){
assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]);
}
int Nsimd_out = out_grid->Nsimd();
std::vector<Coordinate> out_icorrs(out_grid->Nsimd()); //reuse these
for(int lane=0; lane < out_grid->Nsimd(); lane++)
out_grid->iCoorFromIindex(out_icorrs[lane], lane);
std::vector<std::pair<Integer,Integer> > fmap_host(out_grid->lSites()); //lsites = osites*Nsimd
thread_for(out_oidx,out_grid->oSites(),{
Coordinate out_ocorr;
out_grid->oCoorFromOindex(out_ocorr, out_oidx);
Coordinate lcorr; //the local coordinate (common to both in and out as full coordinate)
for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
out_grid->InOutCoorToLocalCoor(out_ocorr, out_icorrs[out_lane], lcorr);
//int in_oidx = in_grid->oIndex(lcorr), in_lane = in_grid->iIndex(lcorr);
//Note oIndex and OcorrFromOindex (and same for iIndex) are not inverse for checkerboarded lattice, the former coordinates being defined on the full lattice and the latter on the reduced lattice
//Until this is fixed we need to circumvent the problem locally. Here I will use the coordinates defined on the reduced lattice for simplicity
int in_oidx = 0, in_lane = 0;
for(int d=0;d<in_grid->_ndimension;d++){
in_oidx += in_grid->_ostride[d] * ( lcorr[d] % in_grid->_rdimensions[d] );
in_lane += in_grid->_istride[d] * ( lcorr[d] / in_grid->_rdimensions[d] );
}
fmap_host[out_lane + Nsimd_out*out_oidx] = std::pair<Integer,Integer>( in_oidx, in_lane );
}
});
//Copy the map to the device (if we had a way to tell if an accelerator is in use we could avoid this copy for CPU-only machines)
size_t fmap_bytes = out_grid->lSites() * sizeof(std::pair<Integer,Integer>);
fmap_device = (std::pair<Integer,Integer>*)acceleratorAllocDevice(fmap_bytes);
acceleratorCopyToDevice(fmap_host.data(), fmap_device, fmap_bytes);
}
//Prevent moving or copying
precisionChangeWorkspace(const precisionChangeWorkspace &r) = delete;
precisionChangeWorkspace(precisionChangeWorkspace &&r) = delete;
precisionChangeWorkspace &operator=(const precisionChangeWorkspace &r) = delete;
precisionChangeWorkspace &operator=(precisionChangeWorkspace &&r) = delete;
std::pair<Integer,Integer> const* getMap() const{ return fmap_device; }
~precisionChangeWorkspace(){
acceleratorFreeDevice(fmap_device);
}
};
//Convert a lattice of one precision to another. The input workspace contains the mapping data.
template<class VobjOut, class VobjIn>
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const precisionChangeWorkspace &workspace){
static_assert( std::is_same<typename VobjOut::DoublePrecision, typename VobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
out.Checkerboard() = in.Checkerboard();
constexpr int Nsimd_out = VobjOut::Nsimd();
std::pair<Integer,Integer> const* fmap_device = workspace.getMap();
//Do the copy/precision change
autoView( out_v , out, AcceleratorWrite);
autoView( in_v , in, AcceleratorRead);
accelerator_for(out_oidx, out.Grid()->oSites(), 1,{
std::pair<Integer,Integer> const* fmap_osite = fmap_device + out_oidx*Nsimd_out;
for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
int in_oidx = fmap_osite[out_lane].first;
int in_lane = fmap_osite[out_lane].second;
copyLane(out_v[out_oidx], out_lane, in_v[in_oidx], in_lane);
}
});
}
//Convert a Lattice from one precision to another //Convert a Lattice from one precision to another
//Generate the workspace in place; if multiple calls with the same mapping are performed, consider pregenerating the workspace and reusing
template<class VobjOut, class VobjIn> template<class VobjOut, class VobjIn>
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
precisionChangeWorkspace workspace(out.Grid(), in.Grid()); {
precisionChange(out, in, workspace); assert(out.Grid()->Nd() == in.Grid()->Nd());
} for(int d=0;d<out.Grid()->Nd();d++){
assert(out.Grid()->FullDimensions()[d] == in.Grid()->FullDimensions()[d]);
}
out.Checkerboard() = in.Checkerboard();
GridBase *in_grid=in.Grid();
GridBase *out_grid = out.Grid();
typedef typename VobjOut::scalar_object SobjOut;
typedef typename VobjIn::scalar_object SobjIn;
int ndim = out.Grid()->Nd();
int out_nsimd = out_grid->Nsimd();
std::vector<Coordinate > out_icoor(out_nsimd);
for(int lane=0; lane < out_nsimd; lane++){
out_icoor[lane].resize(ndim);
out_grid->iCoorFromIindex(out_icoor[lane], lane);
}
std::vector<SobjOut> in_slex_conv(in_grid->lSites());
unvectorizeToLexOrdArray(in_slex_conv, in);
autoView( out_v , out, CpuWrite);
thread_for(out_oidx,out_grid->oSites(),{
Coordinate out_ocoor(ndim);
out_grid->oCoorFromOindex(out_ocoor, out_oidx);
ExtractPointerArray<SobjOut> ptrs(out_nsimd);
Coordinate lcoor(out_grid->Nd());
for(int lane=0; lane < out_nsimd; lane++){
for(int mu=0;mu<ndim;mu++)
lcoor[mu] = out_ocoor[mu] + out_grid->_rdimensions[mu]*out_icoor[lane][mu];
int llex; Lexicographic::IndexFromCoor(lcoor, llex, out_grid->_ldimensions);
ptrs[lane] = &in_slex_conv[llex];
}
merge(out_v[out_oidx], ptrs, 0);
});
}
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Communicate between grids // Communicate between grids

View File

@ -52,6 +52,7 @@ public:
// This will be safe to call from accelerator_for and is trivially copy constructible // This will be safe to call from accelerator_for and is trivially copy constructible
// The copy constructor for this will need to be used by device lambda functions // The copy constructor for this will need to be used by device lambda functions
///////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////
#undef LATTICE_BOUNDS_CHECK
template<class vobj> template<class vobj>
class LatticeView : public LatticeAccelerator<vobj> class LatticeView : public LatticeAccelerator<vobj>
{ {
@ -61,20 +62,37 @@ public:
void * cpu_ptr; void * cpu_ptr;
#ifdef GRID_SIMT #ifdef GRID_SIMT
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { accelerator_inline const typename vobj::scalar_object operator()(size_t i) const {
#ifdef LATTICE_BOUNDS_CHECK
assert(i<this->_odata_size);
assert(i>=0);
#endif
return coalescedRead(this->_odata[i]); return coalescedRead(this->_odata[i]);
} }
#else #else
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; } accelerator_inline const vobj & operator()(size_t i) const {
#ifdef LATTICE_BOUNDS_CHECK
assert(i<this->_odata_size);
assert(i>=0);
#endif
return this->_odata[i];
}
#endif #endif
#if 1 accelerator_inline const vobj & operator[](size_t i) const {
// accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; }; #ifdef LATTICE_BOUNDS_CHECK
accelerator_inline vobj & operator[](size_t i) const { return this->_odata[i]; }; assert(i<this->_odata_size);
#else assert(i>=0);
accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
#endif #endif
return this->_odata[i];
};
accelerator_inline vobj & operator[](size_t i) {
#ifdef LATTICE_BOUNDS_CHECK
assert(i<this->_odata_size);
assert(i>=0);
#endif
return this->_odata[i];
};
accelerator_inline uint64_t begin(void) const { return 0;}; accelerator_inline uint64_t begin(void) const { return 0;};
accelerator_inline uint64_t end(void) const { return this->_odata_size; }; accelerator_inline uint64_t end(void) const { return this->_odata_size; };
accelerator_inline uint64_t size(void) const { return this->_odata_size; }; accelerator_inline uint64_t size(void) const { return this->_odata_size; };

View File

@ -43,7 +43,7 @@ inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<
conformable(iftrue,predicate); conformable(iftrue,predicate);
conformable(iftrue,ret); conformable(iftrue,ret);
GridBase *grid=iftrue.Grid(); GridBase *grid=iftrue._grid;
typedef typename vobj::scalar_object scalar_object; typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
@ -52,23 +52,22 @@ inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<
const int Nsimd = grid->Nsimd(); const int Nsimd = grid->Nsimd();
autoView(iftrue_v,iftrue,CpuRead); std::vector<Integer> mask(Nsimd);
autoView(iffalse_v,iffalse,CpuRead); std::vector<scalar_object> truevals (Nsimd);
autoView(predicate_v,predicate,CpuRead); std::vector<scalar_object> falsevals(Nsimd);
autoView(ret_v,ret,CpuWrite);
Integer NN= grid->oSites(); parallel_for(int ss=0;ss<iftrue._grid->oSites(); ss++){
thread_for(ss,NN,{
Integer mask; extract(iftrue._odata[ss] ,truevals);
scalar_object trueval; extract(iffalse._odata[ss] ,falsevals);
scalar_object falseval; extract<vInteger,Integer>(TensorRemove(predicate._odata[ss]),mask);
for(int l=0;l<Nsimd;l++){
trueval =extractLane(l,iftrue_v[ss]); for(int s=0;s<Nsimd;s++){
falseval=extractLane(l,iffalse_v[ss]); if (mask[s]) falsevals[s]=truevals[s];
mask =extractLane(l,predicate_v[ss]);
if (mask) falseval=trueval;
insertLane(l,ret_v[ss],falseval);
} }
});
merge(ret._odata[ss],falsevals);
}
} }
template<class vobj,class iobj> template<class vobj,class iobj>
@ -77,9 +76,9 @@ inline Lattice<vobj> whereWolf(const Lattice<iobj> &predicate,Lattice<vobj> &ift
conformable(iftrue,iffalse); conformable(iftrue,iffalse);
conformable(iftrue,predicate); conformable(iftrue,predicate);
Lattice<vobj> ret(iftrue.Grid()); Lattice<vobj> ret(iftrue._grid);
whereWolf(ret,predicate,iftrue,iffalse); where(ret,predicate,iftrue,iffalse);
return ret; return ret;
} }

View File

@ -69,7 +69,6 @@ GridLogger GridLogDebug (1, "Debug", GridLogColours, "PURPLE");
GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN"); GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
GridLogger GridLogIterative (1, "Iterative", GridLogColours, "BLUE"); GridLogger GridLogIterative (1, "Iterative", GridLogColours, "BLUE");
GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE"); GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
GridLogger GridLogHMC (1, "HMC", GridLogColours, "BLUE");
void GridLogConfigure(std::vector<std::string> &logstreams) { void GridLogConfigure(std::vector<std::string> &logstreams) {
GridLogError.Active(0); GridLogError.Active(0);
@ -80,7 +79,6 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
GridLogPerformance.Active(0); GridLogPerformance.Active(0);
GridLogIntegrator.Active(1); GridLogIntegrator.Active(1);
GridLogColours.Active(0); GridLogColours.Active(0);
GridLogHMC.Active(1);
for (int i = 0; i < logstreams.size(); i++) { for (int i = 0; i < logstreams.size(); i++) {
if (logstreams[i] == std::string("Error")) GridLogError.Active(1); if (logstreams[i] == std::string("Error")) GridLogError.Active(1);
@ -89,8 +87,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1); if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1);
if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1); if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1);
if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1); if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
if (logstreams[i] == std::string("NoIntegrator")) GridLogIntegrator.Active(0); if (logstreams[i] == std::string("Integrator")) GridLogIntegrator.Active(1);
if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0);
if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1); if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1);
} }
} }

View File

@ -130,8 +130,6 @@ public:
friend std::ostream& operator<< (std::ostream& stream, Logger& log){ friend std::ostream& operator<< (std::ostream& stream, Logger& log){
if ( log.active ) { if ( log.active ) {
std::ios_base::fmtflags f(stream.flags());
stream << log.background()<< std::left; stream << log.background()<< std::left;
if (log.topWidth > 0) if (log.topWidth > 0)
{ {
@ -154,8 +152,6 @@ public:
<< now << log.background() << " : " ; << now << log.background() << " : " ;
} }
stream << log.colour(); stream << log.colour();
stream.flags(f);
return stream; return stream;
} else { } else {
return devnull; return devnull;
@ -182,7 +178,6 @@ extern GridLogger GridLogDebug ;
extern GridLogger GridLogPerformance; extern GridLogger GridLogPerformance;
extern GridLogger GridLogIterative ; extern GridLogger GridLogIterative ;
extern GridLogger GridLogIntegrator ; extern GridLogger GridLogIntegrator ;
extern GridLogger GridLogHMC;
extern Colours GridLogColours; extern Colours GridLogColours;
std::string demangle(const char* name) ; std::string demangle(const char* name) ;

View File

@ -1,4 +1,3 @@
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
int Grid::BinaryIO::latticeWriteMaxRetry = -1; int Grid::BinaryIO::latticeWriteMaxRetry = -1;
Grid::BinaryIO::IoPerf Grid::BinaryIO::lastPerf;

View File

@ -79,13 +79,6 @@ inline void removeWhitespace(std::string &key)
/////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////
class BinaryIO { class BinaryIO {
public: public:
struct IoPerf
{
uint64_t size{0},time{0};
double mbytesPerSecond{0.};
};
static IoPerf lastPerf;
static int latticeWriteMaxRetry; static int latticeWriteMaxRetry;
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////
@ -509,15 +502,12 @@ class BinaryIO {
timer.Stop(); timer.Stop();
} }
lastPerf.size = sizeof(fobj)*iodata.size()*nrank;
lastPerf.time = timer.useconds();
lastPerf.mbytesPerSecond = lastPerf.size/1024./1024./(lastPerf.time/1.0e6);
std::cout<<GridLogMessage<<"IOobject: "; std::cout<<GridLogMessage<<"IOobject: ";
if ( control & BINARYIO_READ) std::cout << " read "; if ( control & BINARYIO_READ) std::cout << " read ";
else std::cout << " write "; else std::cout << " write ";
uint64_t bytes = sizeof(fobj)*iodata.size()*nrank; uint64_t bytes = sizeof(fobj)*iodata.size()*nrank;
std::cout<< lastPerf.size <<" bytes in "<< timer.Elapsed() <<" " std::cout<< bytes <<" bytes in "<<timer.Elapsed() <<" "
<< lastPerf.mbytesPerSecond <<" MB/s "<<std::endl; << (double)bytes/ (double)timer.useconds() <<" MB/s "<<std::endl;
std::cout<<GridLogMessage<<"IOobject: endian and checksum overhead "<<bstimer.Elapsed() <<std::endl; std::cout<<GridLogMessage<<"IOobject: endian and checksum overhead "<<bstimer.Elapsed() <<std::endl;
@ -673,15 +663,10 @@ class BinaryIO {
nersc_csum,scidac_csuma,scidac_csumb); nersc_csum,scidac_csuma,scidac_csumb);
timer.Start(); timer.Start();
thread_for(lidx,lsites,{ // FIX ME, suboptimal implementation thread_for(lidx,lsites,{
std::vector<RngStateType> tmp(RngStateCount); std::vector<RngStateType> tmp(RngStateCount);
std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin()); std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
Coordinate lcoor; parallel_rng.SetState(tmp,lidx);
grid->LocalIndexToLocalCoor(lidx, lcoor);
int o_idx=grid->oIndex(lcoor);
int i_idx=grid->iIndex(lcoor);
int gidx=parallel_rng.generator_idx(o_idx,i_idx);
parallel_rng.SetState(tmp,gidx);
}); });
timer.Stop(); timer.Stop();
@ -738,12 +723,7 @@ class BinaryIO {
std::vector<RNGstate> iodata(lsites); std::vector<RNGstate> iodata(lsites);
thread_for(lidx,lsites,{ thread_for(lidx,lsites,{
std::vector<RngStateType> tmp(RngStateCount); std::vector<RngStateType> tmp(RngStateCount);
Coordinate lcoor; parallel_rng.GetState(tmp,lidx);
grid->LocalIndexToLocalCoor(lidx, lcoor);
int o_idx=grid->oIndex(lcoor);
int i_idx=grid->iIndex(lcoor);
int gidx=parallel_rng.generator_idx(o_idx,i_idx);
parallel_rng.GetState(tmp,gidx);
std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin()); std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
}); });
timer.Stop(); timer.Stop();

View File

@ -123,7 +123,7 @@ assert(GRID_FIELD_NORM_CALC(FieldNormMetaData_, n2ck) < 1.0e-5);
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
// Helper to fill out metadata // Helper to fill out metadata
//////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////
template<class vobj> void ScidacMetaData(Lattice<vobj> & field, template<class vobj> void ScidacMetaData(Lattice<vobj> & field,
FieldMetaData &header, FieldMetaData &header,
scidacRecord & _scidacRecord, scidacRecord & _scidacRecord,
scidacFile & _scidacFile) scidacFile & _scidacFile)
@ -576,8 +576,6 @@ class ScidacReader : public GridLimeReader {
std::string rec_name(ILDG_BINARY_DATA); std::string rec_name(ILDG_BINARY_DATA);
while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) { while ( limeReaderNextRecord(LimeR) == LIME_SUCCESS ) {
if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) ) ) { if ( !strncmp(limeReaderType(LimeR), rec_name.c_str(),strlen(rec_name.c_str()) ) ) {
// in principle should do the line below, but that breaks backard compatibility with old data
// skipPastObjectRecord(std::string(GRID_FIELD_NORM));
skipPastObjectRecord(std::string(SCIDAC_CHECKSUM)); skipPastObjectRecord(std::string(SCIDAC_CHECKSUM));
return; return;
} }
@ -621,12 +619,12 @@ class IldgWriter : public ScidacWriter {
// Don't require scidac records EXCEPT checksum // Don't require scidac records EXCEPT checksum
// Use Grid MetaData object if present. // Use Grid MetaData object if present.
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
template <class stats = PeriodicGaugeStatistics> template <class vsimd>
void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,int sequence,std::string LFN,std::string description) void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,int sequence,std::string LFN,std::string description)
{ {
GridBase * grid = Umu.Grid(); GridBase * grid = Umu.Grid();
typedef Lattice<vLorentzColourMatrixD> GaugeField; typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
typedef vLorentzColourMatrixD vobj; typedef iLorentzColourMatrix<vsimd> vobj;
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
//////////////////////////////////////// ////////////////////////////////////////
@ -638,9 +636,6 @@ class IldgWriter : public ScidacWriter {
ScidacMetaData(Umu,header,_scidacRecord,_scidacFile); ScidacMetaData(Umu,header,_scidacRecord,_scidacFile);
stats Stats;
Stats(Umu,header);
std::string format = header.floating_point; std::string format = header.floating_point;
header.ensemble_id = description; header.ensemble_id = description;
header.ensemble_label = description; header.ensemble_label = description;
@ -710,10 +705,10 @@ class IldgReader : public GridLimeReader {
// Else use ILDG MetaData object if present. // Else use ILDG MetaData object if present.
// Else use SciDAC MetaData object if present. // Else use SciDAC MetaData object if present.
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
template <class stats = PeriodicGaugeStatistics> template <class vsimd>
void readConfiguration(Lattice<vLorentzColourMatrixD> &Umu, FieldMetaData &FieldMetaData_) { void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu, FieldMetaData &FieldMetaData_) {
typedef Lattice<vLorentzColourMatrixD > GaugeField; typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
typedef typename GaugeField::vector_object vobj; typedef typename GaugeField::vector_object vobj;
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
@ -926,8 +921,7 @@ class IldgReader : public GridLimeReader {
if ( found_FieldMetaData || found_usqcdInfo ) { if ( found_FieldMetaData || found_usqcdInfo ) {
FieldMetaData checker; FieldMetaData checker;
stats Stats; GaugeStatistics(Umu,checker);
Stats(Umu,checker);
assert(fabs(checker.plaquette - FieldMetaData_.plaquette )<1.0e-5); assert(fabs(checker.plaquette - FieldMetaData_.plaquette )<1.0e-5);
assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5); assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5);
std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl; std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl;

View File

@ -128,7 +128,7 @@ inline void MachineCharacteristics(FieldMetaData &header)
std::time_t t = std::time(nullptr); std::time_t t = std::time(nullptr);
std::tm tm_ = *std::localtime(&t); std::tm tm_ = *std::localtime(&t);
std::ostringstream oss; std::ostringstream oss;
oss << std::put_time(&tm_, "%c %Z"); // oss << std::put_time(&tm_, "%c %Z");
header.creation_date = oss.str(); header.creation_date = oss.str();
header.archive_date = header.creation_date; header.archive_date = header.creation_date;
@ -176,18 +176,29 @@ template<class vobj> inline void PrepareMetaData(Lattice<vobj> & field, FieldMet
GridMetaData(grid,header); GridMetaData(grid,header);
MachineCharacteristics(header); MachineCharacteristics(header);
} }
template<class Impl> inline void GaugeStatistics(Lattice<vLorentzColourMatrixF> & data,FieldMetaData &header)
class GaugeStatistics
{ {
public: // How to convert data precision etc...
void operator()(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header) header.link_trace=WilsonLoops<PeriodicGimplF>::linkTrace(data);
{ header.plaquette =WilsonLoops<PeriodicGimplF>::avgPlaquette(data);
header.link_trace=WilsonLoops<Impl>::linkTrace(data); }
header.plaquette =WilsonLoops<Impl>::avgPlaquette(data); inline void GaugeStatistics(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
} {
}; // How to convert data precision etc...
typedef GaugeStatistics<PeriodicGimplD> PeriodicGaugeStatistics; header.link_trace=WilsonLoops<PeriodicGimplD>::linkTrace(data);
typedef GaugeStatistics<ConjugateGimplD> ConjugateGaugeStatistics; header.plaquette =WilsonLoops<PeriodicGimplD>::avgPlaquette(data);
}
template<> inline void PrepareMetaData<vLorentzColourMatrixF>(Lattice<vLorentzColourMatrixF> & field, FieldMetaData &header)
{
GridBase *grid = field.Grid();
std::string format = getFormatString<vLorentzColourMatrixF>();
header.floating_point = format;
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
GridMetaData(grid,header);
GaugeStatistics(field,header);
MachineCharacteristics(header);
}
template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzColourMatrixD> & field, FieldMetaData &header) template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzColourMatrixD> & field, FieldMetaData &header)
{ {
GridBase *grid = field.Grid(); GridBase *grid = field.Grid();
@ -195,6 +206,7 @@ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzCo
header.floating_point = format; header.floating_point = format;
header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
GridMetaData(grid,header); GridMetaData(grid,header);
GaugeStatistics(field,header);
MachineCharacteristics(header); MachineCharacteristics(header);
} }

View File

@ -39,10 +39,6 @@ using namespace Grid;
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
class NerscIO : public BinaryIO { class NerscIO : public BinaryIO {
public: public:
typedef Lattice<vLorentzColourMatrixD> GaugeField;
// Enable/disable exiting if the plaquette in the header does not match the value computed (default true)
static bool & exitOnReadPlaquetteMismatch(){ static bool v=true; return v; }
static inline void truncate(std::string file){ static inline void truncate(std::string file){
std::ofstream fout(file,std::ios::out); std::ofstream fout(file,std::ios::out);
@ -133,12 +129,12 @@ public:
// Now the meat: the object readers // Now the meat: the object readers
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class GaugeStats=PeriodicGaugeStatistics> template<class vsimd>
static inline void readConfiguration(GaugeField &Umu, static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
FieldMetaData& header, FieldMetaData& header,
std::string file, std::string file)
GaugeStats GaugeStatisticsCalculator=GaugeStats())
{ {
typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
GridBase *grid = Umu.Grid(); GridBase *grid = Umu.Grid();
uint64_t offset = readHeader(file,Umu.Grid(),header); uint64_t offset = readHeader(file,Umu.Grid(),header);
@ -157,23 +153,23 @@ public:
// munger is a function of <floating point, Real, data_type> // munger is a function of <floating point, Real, data_type>
if ( header.data_type == std::string("4D_SU3_GAUGE") ) { if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
if ( ieee32 || ieee32big ) { if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3F> BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3F>
(Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format, (Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
nersc_csum,scidac_csuma,scidac_csumb); nersc_csum,scidac_csuma,scidac_csumb);
} }
if ( ieee64 || ieee64big ) { if ( ieee64 || ieee64big ) {
BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3D> BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3D>
(Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format, (Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb); nersc_csum,scidac_csuma,scidac_csumb);
} }
} else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) { } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
if ( ieee32 || ieee32big ) { if ( ieee32 || ieee32big ) {
BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixF> BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format, (Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb); nersc_csum,scidac_csuma,scidac_csumb);
} }
if ( ieee64 || ieee64big ) { if ( ieee64 || ieee64big ) {
BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixD> BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
(Umu,file,GaugeSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format, (Umu,file,GaugeSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format,
nersc_csum,scidac_csuma,scidac_csumb); nersc_csum,scidac_csuma,scidac_csumb);
} }
@ -181,7 +177,7 @@ public:
assert(0); assert(0);
} }
GaugeStats Stats; Stats(Umu,clone); GaugeStatistics(Umu,clone);
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<< std::dec std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<< std::dec
<<" header "<<std::hex<<header.checksum<<std::dec <<std::endl; <<" header "<<std::hex<<header.checksum<<std::dec <<std::endl;
@ -200,29 +196,22 @@ public:
std::cerr << " nersc_csum " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl; std::cerr << " nersc_csum " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
exit(0); exit(0);
} }
if(exitOnReadPlaquetteMismatch()) assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 ); assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 ); assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
assert(nersc_csum == header.checksum ); assert(nersc_csum == header.checksum );
std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl; std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
} }
// Preferred interface template<class vsimd>
template<class GaugeStats=PeriodicGaugeStatistics> static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
std::string file,
std::string ens_label = std::string("DWF"))
{
writeConfiguration(Umu,file,0,1,ens_label);
}
template<class GaugeStats=PeriodicGaugeStatistics>
static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
std::string file, std::string file,
int two_row, int two_row,
int bits32, int bits32)
std::string ens_label = std::string("DWF"))
{ {
typedef vLorentzColourMatrixD vobj; typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
typedef iLorentzColourMatrix<vsimd> vobj;
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
FieldMetaData header; FieldMetaData header;
@ -230,8 +219,8 @@ public:
// Following should become arguments // Following should become arguments
/////////////////////////////////////////// ///////////////////////////////////////////
header.sequence_number = 1; header.sequence_number = 1;
header.ensemble_id = std::string("UKQCD"); header.ensemble_id = "UKQCD";
header.ensemble_label = ens_label; header.ensemble_label = "DWF";
typedef LorentzColourMatrixD fobj3D; typedef LorentzColourMatrixD fobj3D;
typedef LorentzColour2x3D fobj2D; typedef LorentzColour2x3D fobj2D;
@ -240,28 +229,28 @@ public:
GridMetaData(grid,header); GridMetaData(grid,header);
assert(header.nd==4); assert(header.nd==4);
GaugeStats Stats; Stats(Umu,header); GaugeStatistics(Umu,header);
MachineCharacteristics(header); MachineCharacteristics(header);
uint64_t offset; uint64_t offset;
// Sod it -- always write 3x3 double // Sod it -- always write 3x3 double
header.floating_point = std::string("IEEE64BIG"); header.floating_point = std::string("IEEE64BIG");
header.data_type = std::string("4D_SU3_GAUGE_3x3"); header.data_type = std::string("4D_SU3_GAUGE_3x3");
GaugeSimpleUnmunger<fobj3D,sobj> munge; GaugeSimpleUnmunger<fobj3D,sobj> munge;
if ( grid->IsBoss() ) { if ( grid->IsBoss() ) {
truncate(file); truncate(file);
offset = writeHeader(header,file); offset = writeHeader(header,file);
} }
grid->Broadcast(0,(void *)&offset,sizeof(offset)); grid->Broadcast(0,(void *)&offset,sizeof(offset));
uint32_t nersc_csum,scidac_csuma,scidac_csumb; uint32_t nersc_csum,scidac_csuma,scidac_csumb;
BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point, BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
nersc_csum,scidac_csuma,scidac_csumb); nersc_csum,scidac_csuma,scidac_csumb);
header.checksum = nersc_csum; header.checksum = nersc_csum;
if ( grid->IsBoss() ) { if ( grid->IsBoss() ) {
writeHeader(header,file); writeHeader(header,file);
} }
std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum " std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum "
<<std::hex<<header.checksum <<std::hex<<header.checksum

View File

@ -154,7 +154,7 @@ public:
grid->Barrier(); timer.Stop(); grid->Barrier(); timer.Stop();
std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: redistribute overhead " << timer.Elapsed() << std::endl; std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: redistribute overhead " << timer.Elapsed() << std::endl;
PeriodicGaugeStatistics Stats; Stats(Umu, clone); GaugeStatistics(Umu, clone);
RealD plaq_diff = fabs(clone.plaquette - header.plaquette); RealD plaq_diff = fabs(clone.plaquette - header.plaquette);

View File

@ -208,7 +208,7 @@ public:
FieldMetaData clone(header); FieldMetaData clone(header);
PeriodicGaugeStatistics Stats; Stats(Umu, clone); GaugeStatistics(Umu, clone);
RealD plaq_diff = fabs(clone.plaquette - header.plaquette); RealD plaq_diff = fabs(clone.plaquette - header.plaquette);

View File

@ -47,7 +47,7 @@ static constexpr int Ym = 5;
static constexpr int Zm = 6; static constexpr int Zm = 6;
static constexpr int Tm = 7; static constexpr int Tm = 7;
static constexpr int Nc=Config_Nc; static constexpr int Nc=3;
static constexpr int Ns=4; static constexpr int Ns=4;
static constexpr int Nd=4; static constexpr int Nd=4;
static constexpr int Nhs=2; // half spinor static constexpr int Nhs=2; // half spinor
@ -63,7 +63,6 @@ static constexpr int Ngp=2; // gparity index range
#define ColourIndex (2) #define ColourIndex (2)
#define SpinIndex (1) #define SpinIndex (1)
#define LorentzIndex (0) #define LorentzIndex (0)
#define GparityFlavourIndex (0)
// Also should make these a named enum type // Also should make these a named enum type
static constexpr int DaggerNo=0; static constexpr int DaggerNo=0;
@ -78,18 +77,16 @@ const int SpinorIndex = 2;
template<typename T> struct isSpinor { template<typename T> struct isSpinor {
static constexpr bool value = (SpinorIndex==T::TensorLevel); static constexpr bool value = (SpinorIndex==T::TensorLevel);
}; };
const int CoarseIndex = 4;
template<typename T> struct isCoarsened {
static constexpr bool value = (CoarseIndex<=T::TensorLevel);
};
template <typename T> using IfSpinor = Invoke<std::enable_if< isSpinor<T>::value,int> > ; template <typename T> using IfSpinor = Invoke<std::enable_if< isSpinor<T>::value,int> > ;
template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ; template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ;
const int CoarseIndex = 4;
template<typename T> struct isCoarsened {
static constexpr bool value = (CoarseIndex<=T::TensorLevel);
};
template <typename T> using IfCoarsened = Invoke<std::enable_if< isCoarsened<T>::value,int> > ; template <typename T> using IfCoarsened = Invoke<std::enable_if< isCoarsened<T>::value,int> > ;
template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ; template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ;
const int GparityFlavourTensorIndex = 3; //TensorLevel counts from the bottom!
// ChrisK very keen to add extra space for Gparity doubling. // ChrisK very keen to add extra space for Gparity doubling.
// //
// Also add domain wall index, in a way where Wilson operator // Also add domain wall index, in a way where Wilson operator
@ -113,10 +110,8 @@ template<typename vtype> using iHalfSpinColourVector = iScalar<iVector<iVec
template<typename vtype> using iSpinColourSpinColourMatrix = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >; template<typename vtype> using iSpinColourSpinColourMatrix = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >;
template<typename vtype> using iGparityFlavourVector = iVector<iScalar<iScalar<vtype> >, Ngp>;
template<typename vtype> using iGparitySpinColourVector = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >; template<typename vtype> using iGparitySpinColourVector = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
template<typename vtype> using iGparityHalfSpinColourVector = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >; template<typename vtype> using iGparityHalfSpinColourVector = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
template<typename vtype> using iGparityFlavourMatrix = iMatrix<iScalar<iScalar<vtype> >, Ngp>;
// Spin matrix // Spin matrix
typedef iSpinMatrix<Complex > SpinMatrix; typedef iSpinMatrix<Complex > SpinMatrix;
@ -181,16 +176,6 @@ typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix;
typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF; typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF;
typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD; typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD;
//G-parity flavour matrix
typedef iGparityFlavourMatrix<Complex> GparityFlavourMatrix;
typedef iGparityFlavourMatrix<ComplexF> GparityFlavourMatrixF;
typedef iGparityFlavourMatrix<ComplexD> GparityFlavourMatrixD;
typedef iGparityFlavourMatrix<vComplex> vGparityFlavourMatrix;
typedef iGparityFlavourMatrix<vComplexF> vGparityFlavourMatrixF;
typedef iGparityFlavourMatrix<vComplexD> vGparityFlavourMatrixD;
// Spin vector // Spin vector
typedef iSpinVector<Complex > SpinVector; typedef iSpinVector<Complex > SpinVector;
typedef iSpinVector<ComplexF> SpinVectorF; typedef iSpinVector<ComplexF> SpinVectorF;
@ -235,16 +220,6 @@ typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD;
typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector; typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF; typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF;
typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD; typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;
//G-parity flavour vector
typedef iGparityFlavourVector<Complex > GparityFlavourVector;
typedef iGparityFlavourVector<ComplexF> GparityFlavourVectorF;
typedef iGparityFlavourVector<ComplexD> GparityFlavourVectorD;
typedef iGparityFlavourVector<vComplex > vGparityFlavourVector;
typedef iGparityFlavourVector<vComplexF> vGparityFlavourVectorF;
typedef iGparityFlavourVector<vComplexD> vGparityFlavourVectorD;
// singlets // singlets
typedef iSinglet<Complex > TComplex; // FIXME This is painful. Tensor singlet complex type. typedef iSinglet<Complex > TComplex; // FIXME This is painful. Tensor singlet complex type.

View File

@ -41,7 +41,7 @@ class Action
public: public:
bool is_smeared = false; bool is_smeared = false;
// Heatbath? // Heatbath?
virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
virtual RealD S(const GaugeField& U) = 0; // evaluate the action virtual RealD S(const GaugeField& U) = 0; // evaluate the action
virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative
virtual std::string action_name() = 0; // return the action name virtual std::string action_name() = 0; // return the action name

View File

@ -36,8 +36,7 @@ NAMESPACE_BEGIN(Grid);
// These can move into a params header and be given MacroMagic serialisation // These can move into a params header and be given MacroMagic serialisation
struct GparityWilsonImplParams { struct GparityWilsonImplParams {
Coordinate twists; //Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. Coordinate twists;
//mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
GparityWilsonImplParams() : twists(Nd, 0) {}; GparityWilsonImplParams() : twists(Nd, 0) {};
}; };
@ -66,8 +65,7 @@ struct StaggeredImplParams {
RealD, tolerance, RealD, tolerance,
int, degree, int, degree,
int, precision, int, precision,
int, BoundsCheckFreq, int, BoundsCheckFreq);
RealD, BoundsCheckTol);
// MaxIter and tolerance, vectors?? // MaxIter and tolerance, vectors??
@ -78,61 +76,15 @@ struct StaggeredImplParams {
RealD tol = 1.0e-8, RealD tol = 1.0e-8,
int _degree = 10, int _degree = 10,
int _precision = 64, int _precision = 64,
int _BoundsCheckFreq=20, int _BoundsCheckFreq=20)
double _BoundsCheckTol=1e-6)
: lo(_lo), : lo(_lo),
hi(_hi), hi(_hi),
MaxIter(_maxit), MaxIter(_maxit),
tolerance(tol), tolerance(tol),
degree(_degree), degree(_degree),
precision(_precision), precision(_precision),
BoundsCheckFreq(_BoundsCheckFreq), BoundsCheckFreq(_BoundsCheckFreq){};
BoundsCheckTol(_BoundsCheckTol){};
}; };
/*Action parameters for the generalized rational action
The approximation is for (M^dag M)^{1/inv_pow}
where inv_pow is the denominator of the fractional power.
Default inv_pow=2 for square root, making this equivalent to
the OneFlavourRational action
*/
struct RationalActionParams : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(RationalActionParams,
int, inv_pow,
RealD, lo, //low eigenvalue bound of rational approx
RealD, hi, //high eigenvalue bound of rational approx
int, MaxIter, //maximum iterations in msCG
RealD, action_tolerance, //msCG tolerance in action evaluation
int, action_degree, //rational approx tolerance in action evaluation
RealD, md_tolerance, //msCG tolerance in MD integration
int, md_degree, //rational approx tolerance in MD integration
int, precision, //precision of floating point arithmetic
int, BoundsCheckFreq); //frequency the approximation is tested (with Metropolis degree/tolerance); 0 disables the check
// constructor
RationalActionParams(int _inv_pow = 2,
RealD _lo = 0.0,
RealD _hi = 1.0,
int _maxit = 1000,
RealD _action_tolerance = 1.0e-8,
int _action_degree = 10,
RealD _md_tolerance = 1.0e-8,
int _md_degree = 10,
int _precision = 64,
int _BoundsCheckFreq=20)
: inv_pow(_inv_pow),
lo(_lo),
hi(_hi),
MaxIter(_maxit),
action_tolerance(_action_tolerance),
action_degree(_action_degree),
md_tolerance(_md_tolerance),
md_degree(_md_degree),
precision(_precision),
BoundsCheckFreq(_BoundsCheckFreq){};
};
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -1,240 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermion.h
Copyright (C) 2020 - 2022
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
Author: Nils Meyer <nils.meyer@ur.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/qcd/action/fermion/WilsonCloverTypes.h>
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
NAMESPACE_BEGIN(Grid);
// see Grid/qcd/action/fermion/WilsonCloverFermion.h for description
//
// Modifications done here:
//
// Original: clover term = 12x12 matrix per site
//
// But: Only two diagonal 6x6 hermitian blocks are non-zero (also true for original, verified by running)
// Sufficient to store/transfer only the real parts of the diagonal and one triangular part
// 2 * (6 + 15 * 2) = 72 real or 36 complex words to be stored/transfered
//
// Here: Above but diagonal as complex numbers, i.e., need to store/transfer
// 2 * (6 * 2 + 15 * 2) = 84 real or 42 complex words
//
// Words per site and improvement compared to original (combined with the input and output spinors):
//
// - Original: 2*12 + 12*12 = 168 words -> 1.00 x less
// - Minimal: 2*12 + 36 = 60 words -> 2.80 x less
// - Here: 2*12 + 42 = 66 words -> 2.55 x less
//
// These improvements directly translate to wall-clock time
//
// Data layout:
//
// - diagonal and triangle part as separate lattice fields,
// this was faster than as 1 combined field on all tested machines
// - diagonal: as expected
// - triangle: store upper right triangle in row major order
// - graphical:
// 0 1 2 3 4
// 5 6 7 8
// 9 10 11 = upper right triangle indices
// 12 13
// 14
// 0
// 1
// 2
// 3 = diagonal indices
// 4
// 5
// 0
// 1 5
// 2 6 9 = lower left triangle indices
// 3 7 10 12
// 4 8 11 13 14
//
// Impact on total memory consumption:
// - Original: (2 * 1 + 8 * 1/2) 12x12 matrices = 6 12x12 matrices = 864 complex words per site
// - Here: (2 * 1 + 4 * 1/2) diagonal parts = 4 diagonal parts = 24 complex words per site
// + (2 * 1 + 4 * 1/2) triangle parts = 4 triangle parts = 60 complex words per site
// = 84 complex words per site
template<class Impl>
class CompactWilsonCloverFermion : public WilsonFermion<Impl>,
public WilsonCloverHelpers<Impl>,
public CompactWilsonCloverHelpers<Impl> {
/////////////////////////////////////////////
// Sizes
/////////////////////////////////////////////
public:
INHERIT_COMPACT_CLOVER_SIZES(Impl);
/////////////////////////////////////////////
// Type definitions
/////////////////////////////////////////////
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
INHERIT_COMPACT_CLOVER_TYPES(Impl);
typedef WilsonFermion<Impl> WilsonBase;
typedef WilsonCloverHelpers<Impl> Helpers;
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
/////////////////////////////////////////////
// Constructors
/////////////////////////////////////////////
public:
CompactWilsonCloverFermion(GaugeField& _Umu,
GridCartesian& Fgrid,
GridRedBlackCartesian& Hgrid,
const RealD _mass,
const RealD _csw_r = 0.0,
const RealD _csw_t = 0.0,
const RealD _cF = 1.0,
const WilsonAnisotropyCoefficients& clover_anisotropy = WilsonAnisotropyCoefficients(),
const ImplParams& impl_p = ImplParams());
/////////////////////////////////////////////
// Member functions (implementing interface)
/////////////////////////////////////////////
public:
virtual void Instantiatable() {};
int ConstEE() override { return 0; };
int isTrivialEE() override { return 0; };
void Dhop(const FermionField& in, FermionField& out, int dag) override;
void DhopOE(const FermionField& in, FermionField& out, int dag) override;
void DhopEO(const FermionField& in, FermionField& out, int dag) override;
void DhopDir(const FermionField& in, FermionField& out, int dir, int disp) override;
void DhopDirAll(const FermionField& in, std::vector<FermionField>& out) /* override */;
void M(const FermionField& in, FermionField& out) override;
void Mdag(const FermionField& in, FermionField& out) override;
void Meooe(const FermionField& in, FermionField& out) override;
void MeooeDag(const FermionField& in, FermionField& out) override;
void Mooee(const FermionField& in, FermionField& out) override;
void MooeeDag(const FermionField& in, FermionField& out) override;
void MooeeInv(const FermionField& in, FermionField& out) override;
void MooeeInvDag(const FermionField& in, FermionField& out) override;
void Mdir(const FermionField& in, FermionField& out, int dir, int disp) override;
void MdirAll(const FermionField& in, std::vector<FermionField>& out) override;
void MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) override;
void MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
void MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) override;
/////////////////////////////////////////////
// Member functions (internals)
/////////////////////////////////////////////
void MooeeInternal(const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle);
/////////////////////////////////////////////
// Helpers
/////////////////////////////////////////////
void ImportGauge(const GaugeField& _Umu) override;
/////////////////////////////////////////////
// Helpers
/////////////////////////////////////////////
private:
template<class Field>
const MaskField* getCorrectMaskField(const Field &in) const {
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
return &this->BoundaryMaskOdd;
} else {
return &this->BoundaryMaskEven;
}
} else {
return &this->BoundaryMask;
}
}
template<class Field>
void ApplyBoundaryMask(Field& f) {
const MaskField* m = getCorrectMaskField(f); assert(m != nullptr);
assert(m != nullptr);
CompactHelpers::ApplyBoundaryMask(f, *m);
}
/////////////////////////////////////////////
// Member Data
/////////////////////////////////////////////
public:
RealD csw_r;
RealD csw_t;
RealD cF;
bool open_boundaries;
CloverDiagonalField Diagonal, DiagonalEven, DiagonalOdd;
CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
CloverTriangleField Triangle, TriangleEven, TriangleOdd;
CloverTriangleField TriangleInv, TriangleInvEven, TriangleInvOdd;
FermionField Tmp;
MaskField BoundaryMask, BoundaryMaskEven, BoundaryMaskOdd;
};
NAMESPACE_END(Grid);

View File

@ -53,7 +53,6 @@ NAMESPACE_CHECK(Wilson);
#include <Grid/qcd/action/fermion/WilsonTMFermion.h> // 4d wilson like #include <Grid/qcd/action/fermion/WilsonTMFermion.h> // 4d wilson like
NAMESPACE_CHECK(WilsonTM); NAMESPACE_CHECK(WilsonTM);
#include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions #include <Grid/qcd/action/fermion/WilsonCloverFermion.h> // 4d wilson clover fermions
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h> // 4d compact wilson clover fermions
NAMESPACE_CHECK(WilsonClover); NAMESPACE_CHECK(WilsonClover);
#include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types #include <Grid/qcd/action/fermion/WilsonFermion5D.h> // 5d base used by all 5d overlap types
NAMESPACE_CHECK(Wilson5D); NAMESPACE_CHECK(Wilson5D);
@ -116,9 +115,9 @@ typedef WilsonFermion<WilsonImplR> WilsonFermionR;
typedef WilsonFermion<WilsonImplF> WilsonFermionF; typedef WilsonFermion<WilsonImplF> WilsonFermionF;
typedef WilsonFermion<WilsonImplD> WilsonFermionD; typedef WilsonFermion<WilsonImplD> WilsonFermionD;
//typedef WilsonFermion<WilsonImplRL> WilsonFermionRL; typedef WilsonFermion<WilsonImplRL> WilsonFermionRL;
//typedef WilsonFermion<WilsonImplFH> WilsonFermionFH; typedef WilsonFermion<WilsonImplFH> WilsonFermionFH;
//typedef WilsonFermion<WilsonImplDF> WilsonFermionDF; typedef WilsonFermion<WilsonImplDF> WilsonFermionDF;
typedef WilsonFermion<WilsonAdjImplR> WilsonAdjFermionR; typedef WilsonFermion<WilsonAdjImplR> WilsonAdjFermionR;
typedef WilsonFermion<WilsonAdjImplF> WilsonAdjFermionF; typedef WilsonFermion<WilsonAdjImplF> WilsonAdjFermionF;
@ -154,63 +153,46 @@ typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoInd
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF; typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD; typedef WilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
// Compact Clover fermions
typedef CompactWilsonCloverFermion<WilsonImplR> CompactWilsonCloverFermionR;
typedef CompactWilsonCloverFermion<WilsonImplF> CompactWilsonCloverFermionF;
typedef CompactWilsonCloverFermion<WilsonImplD> CompactWilsonCloverFermionD;
typedef CompactWilsonCloverFermion<WilsonAdjImplR> CompactWilsonCloverAdjFermionR;
typedef CompactWilsonCloverFermion<WilsonAdjImplF> CompactWilsonCloverAdjFermionF;
typedef CompactWilsonCloverFermion<WilsonAdjImplD> CompactWilsonCloverAdjFermionD;
typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplR> CompactWilsonCloverTwoIndexSymmetricFermionR;
typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplF> CompactWilsonCloverTwoIndexSymmetricFermionF;
typedef CompactWilsonCloverFermion<WilsonTwoIndexSymmetricImplD> CompactWilsonCloverTwoIndexSymmetricFermionD;
typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplR> CompactWilsonCloverTwoIndexAntiSymmetricFermionR;
typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplF> CompactWilsonCloverTwoIndexAntiSymmetricFermionF;
typedef CompactWilsonCloverFermion<WilsonTwoIndexAntiSymmetricImplD> CompactWilsonCloverTwoIndexAntiSymmetricFermionD;
// Domain Wall fermions // Domain Wall fermions
typedef DomainWallFermion<WilsonImplR> DomainWallFermionR; typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
typedef DomainWallFermion<WilsonImplF> DomainWallFermionF; typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
typedef DomainWallFermion<WilsonImplD> DomainWallFermionD; typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
//typedef DomainWallFermion<WilsonImplRL> DomainWallFermionRL; typedef DomainWallFermion<WilsonImplRL> DomainWallFermionRL;
//typedef DomainWallFermion<WilsonImplFH> DomainWallFermionFH; typedef DomainWallFermion<WilsonImplFH> DomainWallFermionFH;
//typedef DomainWallFermion<WilsonImplDF> DomainWallFermionDF; typedef DomainWallFermion<WilsonImplDF> DomainWallFermionDF;
typedef DomainWallEOFAFermion<WilsonImplR> DomainWallEOFAFermionR; typedef DomainWallEOFAFermion<WilsonImplR> DomainWallEOFAFermionR;
typedef DomainWallEOFAFermion<WilsonImplF> DomainWallEOFAFermionF; typedef DomainWallEOFAFermion<WilsonImplF> DomainWallEOFAFermionF;
typedef DomainWallEOFAFermion<WilsonImplD> DomainWallEOFAFermionD; typedef DomainWallEOFAFermion<WilsonImplD> DomainWallEOFAFermionD;
//typedef DomainWallEOFAFermion<WilsonImplRL> DomainWallEOFAFermionRL; typedef DomainWallEOFAFermion<WilsonImplRL> DomainWallEOFAFermionRL;
//typedef DomainWallEOFAFermion<WilsonImplFH> DomainWallEOFAFermionFH; typedef DomainWallEOFAFermion<WilsonImplFH> DomainWallEOFAFermionFH;
//typedef DomainWallEOFAFermion<WilsonImplDF> DomainWallEOFAFermionDF; typedef DomainWallEOFAFermion<WilsonImplDF> DomainWallEOFAFermionDF;
typedef MobiusFermion<WilsonImplR> MobiusFermionR; typedef MobiusFermion<WilsonImplR> MobiusFermionR;
typedef MobiusFermion<WilsonImplF> MobiusFermionF; typedef MobiusFermion<WilsonImplF> MobiusFermionF;
typedef MobiusFermion<WilsonImplD> MobiusFermionD; typedef MobiusFermion<WilsonImplD> MobiusFermionD;
//typedef MobiusFermion<WilsonImplRL> MobiusFermionRL; typedef MobiusFermion<WilsonImplRL> MobiusFermionRL;
//typedef MobiusFermion<WilsonImplFH> MobiusFermionFH; typedef MobiusFermion<WilsonImplFH> MobiusFermionFH;
//typedef MobiusFermion<WilsonImplDF> MobiusFermionDF; typedef MobiusFermion<WilsonImplDF> MobiusFermionDF;
typedef MobiusEOFAFermion<WilsonImplR> MobiusEOFAFermionR; typedef MobiusEOFAFermion<WilsonImplR> MobiusEOFAFermionR;
typedef MobiusEOFAFermion<WilsonImplF> MobiusEOFAFermionF; typedef MobiusEOFAFermion<WilsonImplF> MobiusEOFAFermionF;
typedef MobiusEOFAFermion<WilsonImplD> MobiusEOFAFermionD; typedef MobiusEOFAFermion<WilsonImplD> MobiusEOFAFermionD;
//typedef MobiusEOFAFermion<WilsonImplRL> MobiusEOFAFermionRL; typedef MobiusEOFAFermion<WilsonImplRL> MobiusEOFAFermionRL;
//typedef MobiusEOFAFermion<WilsonImplFH> MobiusEOFAFermionFH; typedef MobiusEOFAFermion<WilsonImplFH> MobiusEOFAFermionFH;
//typedef MobiusEOFAFermion<WilsonImplDF> MobiusEOFAFermionDF; typedef MobiusEOFAFermion<WilsonImplDF> MobiusEOFAFermionDF;
typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR; typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF; typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD; typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
//typedef ZMobiusFermion<ZWilsonImplRL> ZMobiusFermionRL; typedef ZMobiusFermion<ZWilsonImplRL> ZMobiusFermionRL;
//typedef ZMobiusFermion<ZWilsonImplFH> ZMobiusFermionFH; typedef ZMobiusFermion<ZWilsonImplFH> ZMobiusFermionFH;
//typedef ZMobiusFermion<ZWilsonImplDF> ZMobiusFermionDF; typedef ZMobiusFermion<ZWilsonImplDF> ZMobiusFermionDF;
// Ls vectorised // Ls vectorised
typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR; typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
@ -253,49 +235,49 @@ typedef WilsonFermion<GparityWilsonImplR> GparityWilsonFermionR;
typedef WilsonFermion<GparityWilsonImplF> GparityWilsonFermionF; typedef WilsonFermion<GparityWilsonImplF> GparityWilsonFermionF;
typedef WilsonFermion<GparityWilsonImplD> GparityWilsonFermionD; typedef WilsonFermion<GparityWilsonImplD> GparityWilsonFermionD;
//typedef WilsonFermion<GparityWilsonImplRL> GparityWilsonFermionRL; typedef WilsonFermion<GparityWilsonImplRL> GparityWilsonFermionRL;
//typedef WilsonFermion<GparityWilsonImplFH> GparityWilsonFermionFH; typedef WilsonFermion<GparityWilsonImplFH> GparityWilsonFermionFH;
//typedef WilsonFermion<GparityWilsonImplDF> GparityWilsonFermionDF; typedef WilsonFermion<GparityWilsonImplDF> GparityWilsonFermionDF;
typedef DomainWallFermion<GparityWilsonImplR> GparityDomainWallFermionR; typedef DomainWallFermion<GparityWilsonImplR> GparityDomainWallFermionR;
typedef DomainWallFermion<GparityWilsonImplF> GparityDomainWallFermionF; typedef DomainWallFermion<GparityWilsonImplF> GparityDomainWallFermionF;
typedef DomainWallFermion<GparityWilsonImplD> GparityDomainWallFermionD; typedef DomainWallFermion<GparityWilsonImplD> GparityDomainWallFermionD;
//typedef DomainWallFermion<GparityWilsonImplRL> GparityDomainWallFermionRL; typedef DomainWallFermion<GparityWilsonImplRL> GparityDomainWallFermionRL;
//typedef DomainWallFermion<GparityWilsonImplFH> GparityDomainWallFermionFH; typedef DomainWallFermion<GparityWilsonImplFH> GparityDomainWallFermionFH;
//typedef DomainWallFermion<GparityWilsonImplDF> GparityDomainWallFermionDF; typedef DomainWallFermion<GparityWilsonImplDF> GparityDomainWallFermionDF;
typedef DomainWallEOFAFermion<GparityWilsonImplR> GparityDomainWallEOFAFermionR; typedef DomainWallEOFAFermion<GparityWilsonImplR> GparityDomainWallEOFAFermionR;
typedef DomainWallEOFAFermion<GparityWilsonImplF> GparityDomainWallEOFAFermionF; typedef DomainWallEOFAFermion<GparityWilsonImplF> GparityDomainWallEOFAFermionF;
typedef DomainWallEOFAFermion<GparityWilsonImplD> GparityDomainWallEOFAFermionD; typedef DomainWallEOFAFermion<GparityWilsonImplD> GparityDomainWallEOFAFermionD;
//typedef DomainWallEOFAFermion<GparityWilsonImplRL> GparityDomainWallEOFAFermionRL; typedef DomainWallEOFAFermion<GparityWilsonImplRL> GparityDomainWallEOFAFermionRL;
//typedef DomainWallEOFAFermion<GparityWilsonImplFH> GparityDomainWallEOFAFermionFH; typedef DomainWallEOFAFermion<GparityWilsonImplFH> GparityDomainWallEOFAFermionFH;
//typedef DomainWallEOFAFermion<GparityWilsonImplDF> GparityDomainWallEOFAFermionDF; typedef DomainWallEOFAFermion<GparityWilsonImplDF> GparityDomainWallEOFAFermionDF;
typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionR; typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionR;
typedef WilsonTMFermion<GparityWilsonImplF> GparityWilsonTMFermionF; typedef WilsonTMFermion<GparityWilsonImplF> GparityWilsonTMFermionF;
typedef WilsonTMFermion<GparityWilsonImplD> GparityWilsonTMFermionD; typedef WilsonTMFermion<GparityWilsonImplD> GparityWilsonTMFermionD;
//typedef WilsonTMFermion<GparityWilsonImplRL> GparityWilsonTMFermionRL; typedef WilsonTMFermion<GparityWilsonImplRL> GparityWilsonTMFermionRL;
//typedef WilsonTMFermion<GparityWilsonImplFH> GparityWilsonTMFermionFH; typedef WilsonTMFermion<GparityWilsonImplFH> GparityWilsonTMFermionFH;
//typedef WilsonTMFermion<GparityWilsonImplDF> GparityWilsonTMFermionDF; typedef WilsonTMFermion<GparityWilsonImplDF> GparityWilsonTMFermionDF;
typedef MobiusFermion<GparityWilsonImplR> GparityMobiusFermionR; typedef MobiusFermion<GparityWilsonImplR> GparityMobiusFermionR;
typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF; typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF;
typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD; typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;
//typedef MobiusFermion<GparityWilsonImplRL> GparityMobiusFermionRL; typedef MobiusFermion<GparityWilsonImplRL> GparityMobiusFermionRL;
//typedef MobiusFermion<GparityWilsonImplFH> GparityMobiusFermionFH; typedef MobiusFermion<GparityWilsonImplFH> GparityMobiusFermionFH;
//typedef MobiusFermion<GparityWilsonImplDF> GparityMobiusFermionDF; typedef MobiusFermion<GparityWilsonImplDF> GparityMobiusFermionDF;
typedef MobiusEOFAFermion<GparityWilsonImplR> GparityMobiusEOFAFermionR; typedef MobiusEOFAFermion<GparityWilsonImplR> GparityMobiusEOFAFermionR;
typedef MobiusEOFAFermion<GparityWilsonImplF> GparityMobiusEOFAFermionF; typedef MobiusEOFAFermion<GparityWilsonImplF> GparityMobiusEOFAFermionF;
typedef MobiusEOFAFermion<GparityWilsonImplD> GparityMobiusEOFAFermionD; typedef MobiusEOFAFermion<GparityWilsonImplD> GparityMobiusEOFAFermionD;
//typedef MobiusEOFAFermion<GparityWilsonImplRL> GparityMobiusEOFAFermionRL; typedef MobiusEOFAFermion<GparityWilsonImplRL> GparityMobiusEOFAFermionRL;
//typedef MobiusEOFAFermion<GparityWilsonImplFH> GparityMobiusEOFAFermionFH; typedef MobiusEOFAFermion<GparityWilsonImplFH> GparityMobiusEOFAFermionFH;
//typedef MobiusEOFAFermion<GparityWilsonImplDF> GparityMobiusEOFAFermionDF; typedef MobiusEOFAFermion<GparityWilsonImplDF> GparityMobiusEOFAFermionDF;
typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR; typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF; typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
@ -309,6 +291,12 @@ typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF; typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD; typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;
#ifndef GRID_CUDA
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;
#endif
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
//////////////////// ////////////////////

View File

@ -89,7 +89,8 @@ public:
virtual void Mdiag (const FermionField &in, FermionField &out) { Mooee(in,out);}; // Same as Mooee applied to both CB's virtual void Mdiag (const FermionField &in, FermionField &out) { Mooee(in,out);}; // Same as Mooee applied to both CB's
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out)=0; // case by case Wilson, Clover, Cayley, ContFrac, PartFrac
virtual std::vector<int> Directions(void) =0;
virtual std::vector<int> Displacements(void)=0;
virtual void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { assert(0);}; virtual void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { assert(0);};

View File

@ -153,8 +153,8 @@ public:
typedef typename Impl::StencilImpl StencilImpl; \ typedef typename Impl::StencilImpl StencilImpl; \
typedef typename Impl::ImplParams ImplParams; \ typedef typename Impl::ImplParams ImplParams; \
typedef typename Impl::StencilImpl::View_type StencilView; \ typedef typename Impl::StencilImpl::View_type StencilView; \
typedef const typename ViewMap<FermionField>::Type FermionFieldView; \ typedef typename ViewMap<FermionField>::Type FermionFieldView; \
typedef const typename ViewMap<DoubledGaugeField>::Type DoubledGaugeFieldView; typedef typename ViewMap<DoubledGaugeField>::Type DoubledGaugeFieldView;
#define INHERIT_IMPL_TYPES(Base) \ #define INHERIT_IMPL_TYPES(Base) \
INHERIT_GIMPL_TYPES(Base) \ INHERIT_GIMPL_TYPES(Base) \
@ -183,8 +183,7 @@ NAMESPACE_CHECK(ImplStaggered);
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////
// Single flavour one component spinors with colour index. 5d vec // Single flavour one component spinors with colour index. 5d vec
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////
// Deprecate Vec5d #include <Grid/qcd/action/fermion/StaggeredVec5dImpl.h>
//#include <Grid/qcd/action/fermion/StaggeredVec5dImpl.h> NAMESPACE_CHECK(ImplStaggered5dVec);
//NAMESPACE_CHECK(ImplStaggered5dVec);

View File

@ -30,18 +30,6 @@ directory
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
/*
Policy implementation for G-parity boundary conditions
Rather than treating the gauge field as a flavored field, the Grid implementation of G-parity treats the gauge field as a regular
field with complex conjugate boundary conditions. In order to ensure the second flavor interacts with the conjugate links and the first
with the regular links we overload the functionality of doubleStore, whose purpose is to store the gauge field and the barrel-shifted gauge field
to avoid communicating links when applying the Dirac operator, such that the double-stored field contains also a flavor index which maps to
either the link or the conjugate link. This flavored field is then used by multLink to apply the correct link to a spinor.
Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction.
mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
*/
template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal> template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal>
class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > { class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > {
public: public:
@ -109,30 +97,42 @@ public:
Coordinate icoor; Coordinate icoor;
#ifdef GRID_SIMT #ifdef GRID_SIMT
_Spinor tmp;
const int Nsimd =SiteDoubledGaugeField::Nsimd(); const int Nsimd =SiteDoubledGaugeField::Nsimd();
int s = acceleratorSIMTlane(Nsimd); int s = acceleratorSIMTlane(Nsimd);
St.iCoorFromIindex(icoor,s); St.iCoorFromIindex(icoor,s);
int mmu = mu % Nd; int mmu = mu % Nd;
if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
int permute_lane = (sl==1)
|| ((distance== 1)&&(icoor[direction]==1))
|| ((distance==-1)&&(icoor[direction]==0));
auto UU0=coalescedRead(U(0)(mu)); if ( permute_lane ) {
auto UU1=coalescedRead(U(1)(mu)); tmp(0) = chi(1);
tmp(1) = chi(0);
//Decide whether we do a G-parity flavor twist } else {
//Note: this assumes (but does not check) that sl==1 || sl==2 i.e. max 2 SIMD lanes in G-parity dir tmp(0) = chi(0);
//It also assumes (but does not check) that abs(distance) == 1 tmp(1) = chi(1);
int permute_lane = (sl==1) }
|| ((distance== 1)&&(icoor[direction]==1))
|| ((distance==-1)&&(icoor[direction]==0));
permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu] && mmu < Nd-1; //only if we are going around the world in a spatial direction auto UU0=coalescedRead(U(0)(mu));
auto UU1=coalescedRead(U(1)(mu));
//Apply the links mult(&phi(0),&UU0,&tmp(0));
int f_upper = permute_lane ? 1 : 0; mult(&phi(1),&UU1,&tmp(1));
int f_lower = !f_upper;
mult(&phi(0),&UU0,&chi(f_upper)); } else {
mult(&phi(1),&UU1,&chi(f_lower));
auto UU0=coalescedRead(U(0)(mu));
auto UU1=coalescedRead(U(1)(mu));
mult(&phi(0),&UU0,&chi(0));
mult(&phi(1),&UU1,&chi(1));
}
#else #else
typedef _Spinor vobj; typedef _Spinor vobj;
@ -151,10 +151,10 @@ public:
assert((distance == 1) || (distance == -1)); // nearest neighbour stencil hard code assert((distance == 1) || (distance == -1)); // nearest neighbour stencil hard code
assert((sl == 1) || (sl == 2)); assert((sl == 1) || (sl == 2));
//If this site is an global boundary site, perform the G-parity flavor twist if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
if ( mmu < Nd-1 && SE->_around_the_world && St.parameters.twists[mmu] ) {
if ( sl == 2 ) { if ( sl == 2 ) {
//Only do the twist for lanes on the edge of the physical node
ExtractBuffer<sobj> vals(Nsimd); ExtractBuffer<sobj> vals(Nsimd);
extract(chi,vals); extract(chi,vals);
@ -209,19 +209,6 @@ public:
reg = memory; reg = memory;
} }
//Poke 'poke_f0' onto flavor 0 and 'poke_f1' onto flavor 1 in direction mu of the doubled gauge field Uds
inline void pokeGparityDoubledGaugeField(DoubledGaugeField &Uds, const GaugeLinkField &poke_f0, const GaugeLinkField &poke_f1, const int mu){
autoView(poke_f0_v, poke_f0, CpuRead);
autoView(poke_f1_v, poke_f1, CpuRead);
autoView(Uds_v, Uds, CpuWrite);
thread_foreach(ss,poke_f0_v,{
Uds_v[ss](0)(mu) = poke_f0_v[ss]();
Uds_v[ss](1)(mu) = poke_f1_v[ss]();
});
}
inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
{ {
conformable(Uds.Grid(),GaugeGrid); conformable(Uds.Grid(),GaugeGrid);
@ -232,19 +219,14 @@ public:
GaugeLinkField Uconj(GaugeGrid); GaugeLinkField Uconj(GaugeGrid);
Lattice<iScalar<vInteger> > coor(GaugeGrid); Lattice<iScalar<vInteger> > coor(GaugeGrid);
//Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. for(int mu=0;mu<Nd;mu++){
//mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
for(int mu=0;mu<Nd-1;mu++){ LatticeCoordinate(coor,mu);
if( Params.twists[mu] ){
LatticeCoordinate(coor,mu);
}
U = PeekIndex<LorentzIndex>(Umu,mu); U = PeekIndex<LorentzIndex>(Umu,mu);
Uconj = conjugate(U); Uconj = conjugate(U);
// Implement the isospin rotation sign on the boundary between f=1 and f=0
// This phase could come from a simple bc 1,1,-1,1 .. // This phase could come from a simple bc 1,1,-1,1 ..
int neglink = GaugeGrid->GlobalDimensions()[mu]-1; int neglink = GaugeGrid->GlobalDimensions()[mu]-1;
if ( Params.twists[mu] ) { if ( Params.twists[mu] ) {
@ -259,7 +241,7 @@ public:
thread_foreach(ss,U_v,{ thread_foreach(ss,U_v,{
Uds_v[ss](0)(mu) = U_v[ss](); Uds_v[ss](0)(mu) = U_v[ss]();
Uds_v[ss](1)(mu) = Uconj_v[ss](); Uds_v[ss](1)(mu) = Uconj_v[ss]();
}); });
} }
U = adj(Cshift(U ,mu,-1)); // correct except for spanning the boundary U = adj(Cshift(U ,mu,-1)); // correct except for spanning the boundary
@ -290,38 +272,6 @@ public:
}); });
} }
} }
{ //periodic / antiperiodic temporal BCs
int mu = Nd-1;
int L = GaugeGrid->GlobalDimensions()[mu];
int Lmu = L - 1;
LatticeCoordinate(coor, mu);
U = PeekIndex<LorentzIndex>(Umu, mu); //Get t-directed links
GaugeLinkField *Upoke = &U;
if(Params.twists[mu]){ //antiperiodic
Utmp = where(coor == Lmu, -U, U);
Upoke = &Utmp;
}
Uconj = conjugate(*Upoke); //second flavor interacts with conjugate links
pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu);
//Get the barrel-shifted field
Utmp = adj(Cshift(U, mu, -1)); //is a forward shift!
Upoke = &Utmp;
if(Params.twists[mu]){
U = where(coor == 0, -Utmp, Utmp); //boundary phase
Upoke = &U;
}
Uconj = conjugate(*Upoke);
pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + 4);
}
} }
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) { inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) {
@ -360,48 +310,28 @@ public:
inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){ inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
assert(0); assert(0);
} }
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
int Ls=Btilde.Grid()->_fdimensions[0];
{
GridBase *GaugeGrid = mat.Grid();
Lattice<iScalar<vInteger> > coor(GaugeGrid);
if( Params.twists[mu] ){
LatticeCoordinate(coor,mu);
}
autoView( mat_v , mat, AcceleratorWrite);
autoView( Btilde_v , Btilde, AcceleratorRead);
autoView( Atilde_v , Atilde, AcceleratorRead);
accelerator_for(sss,mat.Grid()->oSites(), FermionField::vector_type::Nsimd(),{
int sU=sss;
typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
ColorMatrixType sum;
zeroit(sum);
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
for(int spn=0;spn<Ns;spn++){ //sum over spin
//Flavor 0
auto bb = coalescedRead(Btilde_v[sF](0)(spn) ); //color vector
auto aa = coalescedRead(Atilde_v[sF](0)(spn) );
sum = sum + outerProduct(bb,aa);
//Flavor 1
bb = coalescedRead(Btilde_v[sF](1)(spn) );
aa = coalescedRead(Atilde_v[sF](1)(spn) );
sum = sum + conjugate(outerProduct(bb,aa));
}
}
coalescedWrite(mat_v[sU](mu)(), sum);
});
}
}
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
int Ls = Btilde.Grid()->_fdimensions[0];
GaugeLinkField tmp(mat.Grid());
tmp = Zero();
{
autoView( tmp_v , tmp, CpuWrite);
autoView( Atilde_v , Atilde, CpuRead);
autoView( Btilde_v , Btilde, CpuRead);
thread_for(ss,tmp.Grid()->oSites(),{
for (int s = 0; s < Ls; s++) {
int sF = s + Ls * ss;
auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
}
});
}
PokeIndex<LorentzIndex>(mat, tmp, mu);
return;
}
}; };
@ -409,8 +339,8 @@ typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffReal> Gparit
typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffReal> GparityWilsonImplF; // Float typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffReal> GparityWilsonImplF; // Float
typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffReal> GparityWilsonImplD; // Double typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffReal> GparityWilsonImplD; // Double
//typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplRL; // Real.. whichever prec typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplRL; // Real.. whichever prec
//typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplFH; // Float typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplFH; // Float
//typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplDF; // Double typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplDF; // Double
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -44,6 +44,9 @@ public:
INHERIT_IMPL_TYPES(Impl); INHERIT_IMPL_TYPES(Impl);
typedef StaggeredKernels<Impl> Kernels; typedef StaggeredKernels<Impl> Kernels;
virtual std::vector<int> Directions(void) { return this->directions; };
virtual std::vector<int> Displacements(void){ return this->displacements;};
FermionField _tmp; FermionField _tmp;
FermionField &tmp(void) { return _tmp; } FermionField &tmp(void) { return _tmp; }

View File

@ -49,6 +49,9 @@ public:
INHERIT_IMPL_TYPES(Impl); INHERIT_IMPL_TYPES(Impl);
typedef StaggeredKernels<Impl> Kernels; typedef StaggeredKernels<Impl> Kernels;
virtual std::vector<int> Directions(void) { return this->directions; };
virtual std::vector<int> Displacements(void){ return this->displacements;};
FermionField _tmp; FermionField _tmp;
FermionField &tmp(void) { return _tmp; } FermionField &tmp(void) { return _tmp; }

View File

@ -85,7 +85,7 @@ class MADWF
maxiter =_maxiter; maxiter =_maxiter;
}; };
void operator() (const FermionFieldo &src,FermionFieldo &sol5) void operator() (const FermionFieldo &src4,FermionFieldo &sol5)
{ {
std::cout << GridLogMessage<< " ************************************************" << std::endl; std::cout << GridLogMessage<< " ************************************************" << std::endl;
std::cout << GridLogMessage<< " MADWF-like algorithm " << std::endl; std::cout << GridLogMessage<< " MADWF-like algorithm " << std::endl;
@ -114,16 +114,8 @@ class MADWF
/////////////////////////////////////// ///////////////////////////////////////
//Import source, include Dminus factors //Import source, include Dminus factors
/////////////////////////////////////// ///////////////////////////////////////
GridBase *src_grid = src.Grid(); Mato.ImportPhysicalFermionSource(src4,b);
std::cout << GridLogMessage << " src4 " <<norm2(src4)<<std::endl;
assert( (src_grid == Mato.GaugeGrid()) || (src_grid == Mato.FermionGrid()));
if ( src_grid == Mato.GaugeGrid() ) {
Mato.ImportPhysicalFermionSource(src,b);
} else {
b=src;
}
std::cout << GridLogMessage << " src " <<norm2(src)<<std::endl;
std::cout << GridLogMessage << " b " <<norm2(b)<<std::endl; std::cout << GridLogMessage << " b " <<norm2(b)<<std::endl;
defect = b; defect = b;

View File

@ -47,6 +47,9 @@ public:
FermionField _tmp; FermionField _tmp;
FermionField &tmp(void) { return _tmp; } FermionField &tmp(void) { return _tmp; }
virtual std::vector<int> Directions(void) { return this->directions; };
virtual std::vector<int> Displacements(void){ return this->displacements;};
//////////////////////////////////////// ////////////////////////////////////////
// Performance monitoring // Performance monitoring
//////////////////////////////////////// ////////////////////////////////////////

View File

@ -72,23 +72,19 @@ public:
StaggeredImpl(const ImplParams &p = ImplParams()) : Params(p){}; StaggeredImpl(const ImplParams &p = ImplParams()) : Params(p){};
template<class _Spinor> static accelerator_inline void multLink(SiteSpinor &phi,
static accelerator_inline void multLink(_Spinor &phi,
const SiteDoubledGaugeField &U, const SiteDoubledGaugeField &U,
const _Spinor &chi, const SiteSpinor &chi,
int mu) int mu)
{ {
auto UU = coalescedRead(U(mu)); mult(&phi(), &U(mu), &chi());
mult(&phi(), &UU, &chi());
} }
template<class _Spinor> static accelerator_inline void multLinkAdd(SiteSpinor &phi,
static accelerator_inline void multLinkAdd(_Spinor &phi,
const SiteDoubledGaugeField &U, const SiteDoubledGaugeField &U,
const _Spinor &chi, const SiteSpinor &chi,
int mu) int mu)
{ {
auto UU = coalescedRead(U(mu)); mac(&phi(), &U(mu), &chi());
mac(&phi(), &UU, &chi());
} }
template <class ref> template <class ref>

View File

@ -63,20 +63,17 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
// Generic Nc kernels // Generic Nc kernels
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
template<int Naik> template<int Naik> accelerator_inline
static accelerator_inline
void DhopSiteGeneric(StencilView &st, void DhopSiteGeneric(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag); const FermionFieldView &in, FermionFieldView &out,int dag);
template<int Naik> accelerator_inline
template<int Naik> static accelerator_inline
void DhopSiteGenericInt(StencilView &st, void DhopSiteGenericInt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag); const FermionFieldView &in, FermionFieldView &out,int dag);
template<int Naik> accelerator_inline
template<int Naik> static accelerator_inline
void DhopSiteGenericExt(StencilView &st, void DhopSiteGenericExt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
@ -85,20 +82,17 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
// Nc=3 specific kernels // Nc=3 specific kernels
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
template<int Naik> accelerator_inline
template<int Naik> static accelerator_inline
void DhopSiteHand(StencilView &st, void DhopSiteHand(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag); const FermionFieldView &in, FermionFieldView &out,int dag);
template<int Naik> accelerator_inline
template<int Naik> static accelerator_inline
void DhopSiteHandInt(StencilView &st, void DhopSiteHandInt(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag); const FermionFieldView &in, FermionFieldView &out,int dag);
template<int Naik> accelerator_inline
template<int Naik> static accelerator_inline
void DhopSiteHandExt(StencilView &st, void DhopSiteHandExt(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,
@ -107,7 +101,6 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
// Asm Nc=3 specific kernels // Asm Nc=3 specific kernels
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
void DhopSiteAsm(StencilView &st, void DhopSiteAsm(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor * buf, int LLs, int sU, SiteSpinor * buf, int LLs, int sU,

View File

@ -4,11 +4,10 @@
Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.h Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.h
Copyright (C) 2017 - 2022 Copyright (C) 2017
Author: Guido Cossu <guido.cossu@ed.ac.uk> Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: David Preti <> Author: David Preti <>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -30,8 +29,7 @@
#pragma once #pragma once
#include <Grid/qcd/action/fermion/WilsonCloverTypes.h> #include <Grid/Grid.h>
#include <Grid/qcd/action/fermion/WilsonCloverHelpers.h>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
@ -52,15 +50,18 @@ NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////
template <class Impl> template <class Impl>
class WilsonCloverFermion : public WilsonFermion<Impl>, class WilsonCloverFermion : public WilsonFermion<Impl>
public WilsonCloverHelpers<Impl>
{ {
public: public:
// Types definitions
INHERIT_IMPL_TYPES(Impl); INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl); template <typename vtype>
using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
typedef iImplClover<Simd> SiteCloverType;
typedef Lattice<SiteCloverType> CloverFieldType;
typedef WilsonFermion<Impl> WilsonBase; public:
typedef WilsonCloverHelpers<Impl> Helpers; typedef WilsonFermion<Impl> WilsonBase;
virtual int ConstEE(void) { return 0; }; virtual int ConstEE(void) { return 0; };
virtual void Instantiatable(void){}; virtual void Instantiatable(void){};
@ -71,7 +72,42 @@ public:
const RealD _csw_r = 0.0, const RealD _csw_r = 0.0,
const RealD _csw_t = 0.0, const RealD _csw_t = 0.0,
const WilsonAnisotropyCoefficients &clover_anisotropy = WilsonAnisotropyCoefficients(), const WilsonAnisotropyCoefficients &clover_anisotropy = WilsonAnisotropyCoefficients(),
const ImplParams &impl_p = ImplParams()); const ImplParams &impl_p = ImplParams()) : WilsonFermion<Impl>(_Umu,
Fgrid,
Hgrid,
_mass, impl_p, clover_anisotropy),
CloverTerm(&Fgrid),
CloverTermInv(&Fgrid),
CloverTermEven(&Hgrid),
CloverTermOdd(&Hgrid),
CloverTermInvEven(&Hgrid),
CloverTermInvOdd(&Hgrid),
CloverTermDagEven(&Hgrid),
CloverTermDagOdd(&Hgrid),
CloverTermInvDagEven(&Hgrid),
CloverTermInvDagOdd(&Hgrid)
{
assert(Nd == 4); // require 4 dimensions
if (clover_anisotropy.isAnisotropic)
{
csw_r = _csw_r * 0.5 / clover_anisotropy.xi_0;
diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
}
else
{
csw_r = _csw_r * 0.5;
diag_mass = 4.0 + _mass;
}
csw_t = _csw_t * 0.5;
if (csw_r == 0)
std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
if (csw_t == 0)
std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
ImportGauge(_Umu);
}
virtual void M(const FermionField &in, FermionField &out); virtual void M(const FermionField &in, FermionField &out);
virtual void Mdag(const FermionField &in, FermionField &out); virtual void Mdag(const FermionField &in, FermionField &out);
@ -88,21 +124,250 @@ public:
void ImportGauge(const GaugeField &_Umu); void ImportGauge(const GaugeField &_Umu);
// Derivative parts unpreconditioned pseudofermions // Derivative parts unpreconditioned pseudofermions
void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag); void MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
{
conformable(X.Grid(), Y.Grid());
conformable(X.Grid(), force.Grid());
GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
GaugeField clover_force(force.Grid());
PropagatorField Lambda(force.Grid());
public: // Guido: Here we are hitting some performance issues:
// need to extract the components of the DoubledGaugeField
// for each call
// Possible solution
// Create a vector object to store them? (cons: wasting space)
std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
Impl::extractLinkField(U, this->Umu);
force = Zero();
// Derivative of the Wilson hopping term
this->DhopDeriv(force, X, Y, dag);
///////////////////////////////////////////////////////////
// Clover term derivative
///////////////////////////////////////////////////////////
Impl::outerProductImpl(Lambda, X, Y);
//std::cout << "Lambda:" << Lambda << std::endl;
Gamma::Algebra sigma[] = {
Gamma::Algebra::SigmaXY,
Gamma::Algebra::SigmaXZ,
Gamma::Algebra::SigmaXT,
Gamma::Algebra::MinusSigmaXY,
Gamma::Algebra::SigmaYZ,
Gamma::Algebra::SigmaYT,
Gamma::Algebra::MinusSigmaXZ,
Gamma::Algebra::MinusSigmaYZ,
Gamma::Algebra::SigmaZT,
Gamma::Algebra::MinusSigmaXT,
Gamma::Algebra::MinusSigmaYT,
Gamma::Algebra::MinusSigmaZT};
/*
sigma_{\mu \nu}=
| 0 sigma[0] sigma[1] sigma[2] |
| sigma[3] 0 sigma[4] sigma[5] |
| sigma[6] sigma[7] 0 sigma[8] |
| sigma[9] sigma[10] sigma[11] 0 |
*/
int count = 0;
clover_force = Zero();
for (int mu = 0; mu < 4; mu++)
{
force_mu = Zero();
for (int nu = 0; nu < 4; nu++)
{
if (mu == nu)
continue;
RealD factor;
if (nu == 4 || mu == 4)
{
factor = 2.0 * csw_t;
}
else
{
factor = 2.0 * csw_r;
}
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
force_mu -= factor*Cmunu(U, lambda, mu, nu); // checked
count++;
}
pokeLorentz(clover_force, U[mu] * force_mu, mu);
}
//clover_force *= csw;
force += clover_force;
}
// Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
{
conformable(lambda.Grid(), U[0].Grid());
GaugeLinkField out(lambda.Grid()), tmp(lambda.Grid());
// insertion in upper staple
// please check redundancy of shift operations
// C1+
tmp = lambda * U[nu];
out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
// C2+
tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
// C3+
tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
// C4+
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
// insertion in lower staple
// C1-
out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
// C2-
tmp = adj(lambda) * U[nu];
out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
// C3-
tmp = lambda * U[nu];
out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
// C4-
out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda;
return out;
}
private:
// here fixing the 4 dimensions, make it more general? // here fixing the 4 dimensions, make it more general?
RealD csw_r; // Clover coefficient - spatial RealD csw_r; // Clover coefficient - spatial
RealD csw_t; // Clover coefficient - temporal RealD csw_t; // Clover coefficient - temporal
RealD diag_mass; // Mass term RealD diag_mass; // Mass term
CloverField CloverTerm, CloverTermInv; // Clover term CloverFieldType CloverTerm, CloverTermInv; // Clover term
CloverField CloverTermEven, CloverTermOdd; // Clover term EO CloverFieldType CloverTermEven, CloverTermOdd; // Clover term EO
CloverField CloverTermInvEven, CloverTermInvOdd; // Clover term Inv EO CloverFieldType CloverTermInvEven, CloverTermInvOdd; // Clover term Inv EO
CloverField CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO CloverFieldType CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO
CloverField CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
};
public:
// eventually these can be compressed into 6x6 blocks instead of the 12x12
// using the DeGrand-Rossi basis for the gamma matrices
CloverFieldType fillCloverYZ(const GaugeLinkField &F)
{
CloverFieldType T(F.Grid());
T = Zero();
autoView(T_v,T,AcceleratorWrite);
autoView(F_v,F,AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
});
return T;
}
CloverFieldType fillCloverXZ(const GaugeLinkField &F)
{
CloverFieldType T(F.Grid());
T = Zero();
autoView(T_v, T,AcceleratorWrite);
autoView(F_v, F,AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 1) = -F_v[i]()();
T_v[i]()(1, 0) = F_v[i]()();
T_v[i]()(2, 3) = -F_v[i]()();
T_v[i]()(3, 2) = F_v[i]()();
});
return T;
}
CloverFieldType fillCloverXY(const GaugeLinkField &F)
{
CloverFieldType T(F.Grid());
T = Zero();
autoView(T_v,T,AcceleratorWrite);
autoView(F_v,F,AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
T_v[i]()(1, 1) = timesI(F_v[i]()());
T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
T_v[i]()(3, 3) = timesI(F_v[i]()());
});
return T;
}
CloverFieldType fillCloverXT(const GaugeLinkField &F)
{
CloverFieldType T(F.Grid());
T = Zero();
autoView( T_v , T, AcceleratorWrite);
autoView( F_v , F, AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 1) = timesI(F_v[i]()());
T_v[i]()(1, 0) = timesI(F_v[i]()());
T_v[i]()(2, 3) = timesMinusI(F_v[i]()());
T_v[i]()(3, 2) = timesMinusI(F_v[i]()());
});
return T;
}
CloverFieldType fillCloverYT(const GaugeLinkField &F)
{
CloverFieldType T(F.Grid());
T = Zero();
autoView( T_v ,T,AcceleratorWrite);
autoView( F_v ,F,AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 1) = -(F_v[i]()());
T_v[i]()(1, 0) = (F_v[i]()());
T_v[i]()(2, 3) = (F_v[i]()());
T_v[i]()(3, 2) = -(F_v[i]()());
});
return T;
}
CloverFieldType fillCloverZT(const GaugeLinkField &F)
{
CloverFieldType T(F.Grid());
T = Zero();
autoView( T_v , T,AcceleratorWrite);
autoView( F_v , F,AcceleratorRead);
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
{
T_v[i]()(0, 0) = timesI(F_v[i]()());
T_v[i]()(1, 1) = timesMinusI(F_v[i]()());
T_v[i]()(2, 2) = timesMinusI(F_v[i]()());
T_v[i]()(3, 3) = timesI(F_v[i]()());
});
return T;
}
};
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -1,761 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonCloverHelpers.h
Copyright (C) 2021 - 2022
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
// Helper routines that implement common clover functionality
NAMESPACE_BEGIN(Grid);
template<class Impl> class WilsonCloverHelpers {
public:
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
// Computing C_{\mu \nu}(x) as in Eq.(B.39) in Zbigniew Sroczynski's PhD thesis
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu)
{
conformable(lambda.Grid(), U[0].Grid());
GaugeLinkField out(lambda.Grid()), tmp(lambda.Grid());
// insertion in upper staple
// please check redundancy of shift operations
// C1+
tmp = lambda * U[nu];
out = Impl::ShiftStaple(Impl::CovShiftForward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
// C2+
tmp = U[mu] * Impl::ShiftStaple(adj(lambda), mu);
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(tmp, mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu);
// C3+
tmp = U[nu] * Impl::ShiftStaple(adj(lambda), nu);
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(tmp, nu))), mu);
// C4+
out += Impl::ShiftStaple(Impl::CovShiftForward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, Impl::CovShiftIdentityBackward(U[nu], nu))), mu) * lambda;
// insertion in lower staple
// C1-
out -= Impl::ShiftStaple(lambda, mu) * Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
// C2-
tmp = adj(lambda) * U[nu];
out -= Impl::ShiftStaple(Impl::CovShiftBackward(tmp, nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu);
// C3-
tmp = lambda * U[nu];
out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, tmp)), mu);
// C4-
out -= Impl::ShiftStaple(Impl::CovShiftBackward(U[nu], nu, Impl::CovShiftBackward(U[mu], mu, U[nu])), mu) * lambda;
return out;
}
static CloverField fillCloverYZ(const GaugeLinkField &F)
{
CloverField T(F.Grid());
T = Zero();
autoView(T_v,T,AcceleratorWrite);
autoView(F_v,F,AcceleratorRead);
accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
{
coalescedWrite(T_v[i]()(0, 1), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(1, 0), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(2, 3), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(3, 2), coalescedRead(timesMinusI(F_v[i]()())));
});
return T;
}
static CloverField fillCloverXZ(const GaugeLinkField &F)
{
CloverField T(F.Grid());
T = Zero();
autoView(T_v, T,AcceleratorWrite);
autoView(F_v, F,AcceleratorRead);
accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
{
coalescedWrite(T_v[i]()(0, 1), coalescedRead(-F_v[i]()()));
coalescedWrite(T_v[i]()(1, 0), coalescedRead(F_v[i]()()));
coalescedWrite(T_v[i]()(2, 3), coalescedRead(-F_v[i]()()));
coalescedWrite(T_v[i]()(3, 2), coalescedRead(F_v[i]()()));
});
return T;
}
static CloverField fillCloverXY(const GaugeLinkField &F)
{
CloverField T(F.Grid());
T = Zero();
autoView(T_v,T,AcceleratorWrite);
autoView(F_v,F,AcceleratorRead);
accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
{
coalescedWrite(T_v[i]()(0, 0), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(1, 1), coalescedRead(timesI(F_v[i]()())));
coalescedWrite(T_v[i]()(2, 2), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(3, 3), coalescedRead(timesI(F_v[i]()())));
});
return T;
}
static CloverField fillCloverXT(const GaugeLinkField &F)
{
CloverField T(F.Grid());
T = Zero();
autoView( T_v , T, AcceleratorWrite);
autoView( F_v , F, AcceleratorRead);
accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
{
coalescedWrite(T_v[i]()(0, 1), coalescedRead(timesI(F_v[i]()())));
coalescedWrite(T_v[i]()(1, 0), coalescedRead(timesI(F_v[i]()())));
coalescedWrite(T_v[i]()(2, 3), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(3, 2), coalescedRead(timesMinusI(F_v[i]()())));
});
return T;
}
static CloverField fillCloverYT(const GaugeLinkField &F)
{
CloverField T(F.Grid());
T = Zero();
autoView( T_v ,T,AcceleratorWrite);
autoView( F_v ,F,AcceleratorRead);
accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
{
coalescedWrite(T_v[i]()(0, 1), coalescedRead(-(F_v[i]()())));
coalescedWrite(T_v[i]()(1, 0), coalescedRead((F_v[i]()())));
coalescedWrite(T_v[i]()(2, 3), coalescedRead((F_v[i]()())));
coalescedWrite(T_v[i]()(3, 2), coalescedRead(-(F_v[i]()())));
});
return T;
}
static CloverField fillCloverZT(const GaugeLinkField &F)
{
CloverField T(F.Grid());
T = Zero();
autoView( T_v , T,AcceleratorWrite);
autoView( F_v , F,AcceleratorRead);
accelerator_for(i, T.Grid()->oSites(),CloverField::vector_type::Nsimd(),
{
coalescedWrite(T_v[i]()(0, 0), coalescedRead(timesI(F_v[i]()())));
coalescedWrite(T_v[i]()(1, 1), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(2, 2), coalescedRead(timesMinusI(F_v[i]()())));
coalescedWrite(T_v[i]()(3, 3), coalescedRead(timesI(F_v[i]()())));
});
return T;
}
template<class _Spinor>
static accelerator_inline void multClover(_Spinor& phi, const SiteClover& C, const _Spinor& chi) {
auto CC = coalescedRead(C);
mult(&phi, &CC, &chi);
}
template<class _SpinorField>
inline void multCloverField(_SpinorField& out, const CloverField& C, const _SpinorField& phi) {
const int Nsimd = SiteSpinor::Nsimd();
autoView(out_v, out, AcceleratorWrite);
autoView(phi_v, phi, AcceleratorRead);
autoView(C_v, C, AcceleratorRead);
typedef decltype(coalescedRead(out_v[0])) calcSpinor;
accelerator_for(sss,out.Grid()->oSites(),Nsimd,{
calcSpinor tmp;
multClover(tmp,C_v[sss],phi_v(sss));
coalescedWrite(out_v[sss],tmp);
});
}
};
template<class Impl> class CompactWilsonCloverHelpers {
public:
INHERIT_COMPACT_CLOVER_SIZES(Impl);
INHERIT_IMPL_TYPES(Impl);
INHERIT_CLOVER_TYPES(Impl);
INHERIT_COMPACT_CLOVER_TYPES(Impl);
#if 0
static accelerator_inline typename SiteCloverTriangle::vector_type triangle_elem(const SiteCloverTriangle& triangle, int block, int i, int j) {
assert(i != j);
if(i < j) {
return triangle()(block)(triangle_index(i, j));
} else { // i > j
return conjugate(triangle()(block)(triangle_index(i, j)));
}
}
#else
template<typename vobj>
static accelerator_inline vobj triangle_elem(const iImplCloverTriangle<vobj>& triangle, int block, int i, int j) {
assert(i != j);
if(i < j) {
return triangle()(block)(triangle_index(i, j));
} else { // i > j
return conjugate(triangle()(block)(triangle_index(i, j)));
}
}
#endif
static accelerator_inline int triangle_index(int i, int j) {
if(i == j)
return 0;
else if(i < j)
return Nred * (Nred - 1) / 2 - (Nred - i) * (Nred - i - 1) / 2 + j - i - 1;
else // i > j
return Nred * (Nred - 1) / 2 - (Nred - j) * (Nred - j - 1) / 2 + i - j - 1;
}
static void MooeeKernel_gpu(int Nsite,
int Ls,
const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle) {
autoView(diagonal_v, diagonal, AcceleratorRead);
autoView(triangle_v, triangle, AcceleratorRead);
autoView(in_v, in, AcceleratorRead);
autoView(out_v, out, AcceleratorWrite);
typedef decltype(coalescedRead(out_v[0])) CalcSpinor;
const uint64_t NN = Nsite * Ls;
accelerator_for(ss, NN, Simd::Nsimd(), {
int sF = ss;
int sU = ss/Ls;
CalcSpinor res;
CalcSpinor in_t = in_v(sF);
auto diagonal_t = diagonal_v(sU);
auto triangle_t = triangle_v(sU);
for(int block=0; block<Nhs; block++) {
int s_start = block*Nhs;
for(int i=0; i<Nred; i++) {
int si = s_start + i/Nc, ci = i%Nc;
res()(si)(ci) = diagonal_t()(block)(i) * in_t()(si)(ci);
for(int j=0; j<Nred; j++) {
if (j == i) continue;
int sj = s_start + j/Nc, cj = j%Nc;
res()(si)(ci) = res()(si)(ci) + triangle_elem(triangle_t, block, i, j) * in_t()(sj)(cj);
};
};
};
coalescedWrite(out_v[sF], res);
});
}
static void MooeeKernel_cpu(int Nsite,
int Ls,
const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle) {
autoView(diagonal_v, diagonal, CpuRead);
autoView(triangle_v, triangle, CpuRead);
autoView(in_v, in, CpuRead);
autoView(out_v, out, CpuWrite);
typedef SiteSpinor CalcSpinor;
#if defined(A64FX) || defined(A64FXFIXEDSIZE)
#define PREFETCH_CLOVER(BASE) { \
uint64_t base; \
int pf_dist_L1 = 1; \
int pf_dist_L2 = -5; /* -> penalty -> disable */ \
\
if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) { \
base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0); \
svprfd(svptrue_b64(), (int64_t*)(base + 0), SV_PLDL1STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 256), SV_PLDL1STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 512), SV_PLDL1STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 768), SV_PLDL1STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL1STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL1STRM); \
} \
\
if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) { \
base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0); \
svprfd(svptrue_b64(), (int64_t*)(base + 0), SV_PLDL2STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 256), SV_PLDL2STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 512), SV_PLDL2STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 768), SV_PLDL2STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 1024), SV_PLDL2STRM); \
svprfd(svptrue_b64(), (int64_t*)(base + 1280), SV_PLDL2STRM); \
} \
}
// TODO: Implement/generalize this for other architectures
// I played around a bit on KNL (see below) but didn't bring anything
// #elif defined(AVX512)
// #define PREFETCH_CLOVER(BASE) { \
// uint64_t base; \
// int pf_dist_L1 = 1; \
// int pf_dist_L2 = +4; \
// \
// if ((pf_dist_L1 >= 0) && (sU + pf_dist_L1 < Nsite)) { \
// base = (uint64_t)&diag_t()(pf_dist_L1+BASE)(0); \
// _mm_prefetch((const char*)(base + 0), _MM_HINT_T0); \
// _mm_prefetch((const char*)(base + 64), _MM_HINT_T0); \
// _mm_prefetch((const char*)(base + 128), _MM_HINT_T0); \
// _mm_prefetch((const char*)(base + 192), _MM_HINT_T0); \
// _mm_prefetch((const char*)(base + 256), _MM_HINT_T0); \
// _mm_prefetch((const char*)(base + 320), _MM_HINT_T0); \
// } \
// \
// if ((pf_dist_L2 >= 0) && (sU + pf_dist_L2 < Nsite)) { \
// base = (uint64_t)&diag_t()(pf_dist_L2+BASE)(0); \
// _mm_prefetch((const char*)(base + 0), _MM_HINT_T1); \
// _mm_prefetch((const char*)(base + 64), _MM_HINT_T1); \
// _mm_prefetch((const char*)(base + 128), _MM_HINT_T1); \
// _mm_prefetch((const char*)(base + 192), _MM_HINT_T1); \
// _mm_prefetch((const char*)(base + 256), _MM_HINT_T1); \
// _mm_prefetch((const char*)(base + 320), _MM_HINT_T1); \
// } \
// }
#else
#define PREFETCH_CLOVER(BASE)
#endif
const uint64_t NN = Nsite * Ls;
thread_for(ss, NN, {
int sF = ss;
int sU = ss/Ls;
CalcSpinor res;
CalcSpinor in_t = in_v[sF];
auto diag_t = diagonal_v[sU]; // "diag" instead of "diagonal" here to make code below easier to read
auto triangle_t = triangle_v[sU];
// upper half
PREFETCH_CLOVER(0);
auto in_cc_0_0 = conjugate(in_t()(0)(0)); // Nils: reduces number
auto in_cc_0_1 = conjugate(in_t()(0)(1)); // of conjugates from
auto in_cc_0_2 = conjugate(in_t()(0)(2)); // 30 to 20
auto in_cc_1_0 = conjugate(in_t()(1)(0));
auto in_cc_1_1 = conjugate(in_t()(1)(1));
res()(0)(0) = diag_t()(0)( 0) * in_t()(0)(0)
+ triangle_t()(0)( 0) * in_t()(0)(1)
+ triangle_t()(0)( 1) * in_t()(0)(2)
+ triangle_t()(0)( 2) * in_t()(1)(0)
+ triangle_t()(0)( 3) * in_t()(1)(1)
+ triangle_t()(0)( 4) * in_t()(1)(2);
res()(0)(1) = triangle_t()(0)( 0) * in_cc_0_0;
res()(0)(1) = diag_t()(0)( 1) * in_t()(0)(1)
+ triangle_t()(0)( 5) * in_t()(0)(2)
+ triangle_t()(0)( 6) * in_t()(1)(0)
+ triangle_t()(0)( 7) * in_t()(1)(1)
+ triangle_t()(0)( 8) * in_t()(1)(2)
+ conjugate( res()(0)( 1));
res()(0)(2) = triangle_t()(0)( 1) * in_cc_0_0
+ triangle_t()(0)( 5) * in_cc_0_1;
res()(0)(2) = diag_t()(0)( 2) * in_t()(0)(2)
+ triangle_t()(0)( 9) * in_t()(1)(0)
+ triangle_t()(0)(10) * in_t()(1)(1)
+ triangle_t()(0)(11) * in_t()(1)(2)
+ conjugate( res()(0)( 2));
res()(1)(0) = triangle_t()(0)( 2) * in_cc_0_0
+ triangle_t()(0)( 6) * in_cc_0_1
+ triangle_t()(0)( 9) * in_cc_0_2;
res()(1)(0) = diag_t()(0)( 3) * in_t()(1)(0)
+ triangle_t()(0)(12) * in_t()(1)(1)
+ triangle_t()(0)(13) * in_t()(1)(2)
+ conjugate( res()(1)( 0));
res()(1)(1) = triangle_t()(0)( 3) * in_cc_0_0
+ triangle_t()(0)( 7) * in_cc_0_1
+ triangle_t()(0)(10) * in_cc_0_2
+ triangle_t()(0)(12) * in_cc_1_0;
res()(1)(1) = diag_t()(0)( 4) * in_t()(1)(1)
+ triangle_t()(0)(14) * in_t()(1)(2)
+ conjugate( res()(1)( 1));
res()(1)(2) = triangle_t()(0)( 4) * in_cc_0_0
+ triangle_t()(0)( 8) * in_cc_0_1
+ triangle_t()(0)(11) * in_cc_0_2
+ triangle_t()(0)(13) * in_cc_1_0
+ triangle_t()(0)(14) * in_cc_1_1;
res()(1)(2) = diag_t()(0)( 5) * in_t()(1)(2)
+ conjugate( res()(1)( 2));
vstream(out_v[sF]()(0)(0), res()(0)(0));
vstream(out_v[sF]()(0)(1), res()(0)(1));
vstream(out_v[sF]()(0)(2), res()(0)(2));
vstream(out_v[sF]()(1)(0), res()(1)(0));
vstream(out_v[sF]()(1)(1), res()(1)(1));
vstream(out_v[sF]()(1)(2), res()(1)(2));
// lower half
PREFETCH_CLOVER(1);
auto in_cc_2_0 = conjugate(in_t()(2)(0));
auto in_cc_2_1 = conjugate(in_t()(2)(1));
auto in_cc_2_2 = conjugate(in_t()(2)(2));
auto in_cc_3_0 = conjugate(in_t()(3)(0));
auto in_cc_3_1 = conjugate(in_t()(3)(1));
res()(2)(0) = diag_t()(1)( 0) * in_t()(2)(0)
+ triangle_t()(1)( 0) * in_t()(2)(1)
+ triangle_t()(1)( 1) * in_t()(2)(2)
+ triangle_t()(1)( 2) * in_t()(3)(0)
+ triangle_t()(1)( 3) * in_t()(3)(1)
+ triangle_t()(1)( 4) * in_t()(3)(2);
res()(2)(1) = triangle_t()(1)( 0) * in_cc_2_0;
res()(2)(1) = diag_t()(1)( 1) * in_t()(2)(1)
+ triangle_t()(1)( 5) * in_t()(2)(2)
+ triangle_t()(1)( 6) * in_t()(3)(0)
+ triangle_t()(1)( 7) * in_t()(3)(1)
+ triangle_t()(1)( 8) * in_t()(3)(2)
+ conjugate( res()(2)( 1));
res()(2)(2) = triangle_t()(1)( 1) * in_cc_2_0
+ triangle_t()(1)( 5) * in_cc_2_1;
res()(2)(2) = diag_t()(1)( 2) * in_t()(2)(2)
+ triangle_t()(1)( 9) * in_t()(3)(0)
+ triangle_t()(1)(10) * in_t()(3)(1)
+ triangle_t()(1)(11) * in_t()(3)(2)
+ conjugate( res()(2)( 2));
res()(3)(0) = triangle_t()(1)( 2) * in_cc_2_0
+ triangle_t()(1)( 6) * in_cc_2_1
+ triangle_t()(1)( 9) * in_cc_2_2;
res()(3)(0) = diag_t()(1)( 3) * in_t()(3)(0)
+ triangle_t()(1)(12) * in_t()(3)(1)
+ triangle_t()(1)(13) * in_t()(3)(2)
+ conjugate( res()(3)( 0));
res()(3)(1) = triangle_t()(1)( 3) * in_cc_2_0
+ triangle_t()(1)( 7) * in_cc_2_1
+ triangle_t()(1)(10) * in_cc_2_2
+ triangle_t()(1)(12) * in_cc_3_0;
res()(3)(1) = diag_t()(1)( 4) * in_t()(3)(1)
+ triangle_t()(1)(14) * in_t()(3)(2)
+ conjugate( res()(3)( 1));
res()(3)(2) = triangle_t()(1)( 4) * in_cc_2_0
+ triangle_t()(1)( 8) * in_cc_2_1
+ triangle_t()(1)(11) * in_cc_2_2
+ triangle_t()(1)(13) * in_cc_3_0
+ triangle_t()(1)(14) * in_cc_3_1;
res()(3)(2) = diag_t()(1)( 5) * in_t()(3)(2)
+ conjugate( res()(3)( 2));
vstream(out_v[sF]()(2)(0), res()(2)(0));
vstream(out_v[sF]()(2)(1), res()(2)(1));
vstream(out_v[sF]()(2)(2), res()(2)(2));
vstream(out_v[sF]()(3)(0), res()(3)(0));
vstream(out_v[sF]()(3)(1), res()(3)(1));
vstream(out_v[sF]()(3)(2), res()(3)(2));
});
}
static void MooeeKernel(int Nsite,
int Ls,
const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle) {
#if defined(GRID_CUDA) || defined(GRID_HIP)
MooeeKernel_gpu(Nsite, Ls, in, out, diagonal, triangle);
#else
MooeeKernel_cpu(Nsite, Ls, in, out, diagonal, triangle);
#endif
}
static void Invert(const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle,
CloverDiagonalField& diagonalInv,
CloverTriangleField& triangleInv) {
conformable(diagonal, diagonalInv);
conformable(triangle, triangleInv);
conformable(diagonal, triangle);
diagonalInv.Checkerboard() = diagonal.Checkerboard();
triangleInv.Checkerboard() = triangle.Checkerboard();
GridBase* grid = diagonal.Grid();
long lsites = grid->lSites();
typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal;
typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle;
autoView(diagonal_v, diagonal, CpuRead);
autoView(triangle_v, triangle, CpuRead);
autoView(diagonalInv_v, diagonalInv, CpuWrite);
autoView(triangleInv_v, triangleInv, CpuWrite);
thread_for(site, lsites, { // NOTE: Not on GPU because of Eigen & (peek/poke)LocalSite
Eigen::MatrixXcd clover_inv_eigen = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc);
Eigen::MatrixXcd clover_eigen = Eigen::MatrixXcd::Zero(Ns*Nc, Ns*Nc);
scalar_object_diagonal diagonal_tmp = Zero();
scalar_object_diagonal diagonal_inv_tmp = Zero();
scalar_object_triangle triangle_tmp = Zero();
scalar_object_triangle triangle_inv_tmp = Zero();
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
peekLocalSite(diagonal_tmp, diagonal_v, lcoor);
peekLocalSite(triangle_tmp, triangle_v, lcoor);
// TODO: can we save time here by inverting the two 6x6 hermitian matrices separately?
for (long s_row=0;s_row<Ns;s_row++) {
for (long s_col=0;s_col<Ns;s_col++) {
if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
int block = s_row / Nhs;
int s_row_block = s_row % Nhs;
int s_col_block = s_col % Nhs;
for (long c_row=0;c_row<Nc;c_row++) {
for (long c_col=0;c_col<Nc;c_col++) {
int i = s_row_block * Nc + c_row;
int j = s_col_block * Nc + c_col;
if(i == j)
clover_eigen(s_row*Nc+c_row, s_col*Nc+c_col) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
else
clover_eigen(s_row*Nc+c_row, s_col*Nc+c_col) = static_cast<ComplexD>(TensorRemove(triangle_elem(triangle_tmp, block, i, j)));
}
}
}
}
clover_inv_eigen = clover_eigen.inverse();
for (long s_row=0;s_row<Ns;s_row++) {
for (long s_col=0;s_col<Ns;s_col++) {
if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
int block = s_row / Nhs;
int s_row_block = s_row % Nhs;
int s_col_block = s_col % Nhs;
for (long c_row=0;c_row<Nc;c_row++) {
for (long c_col=0;c_col<Nc;c_col++) {
int i = s_row_block * Nc + c_row;
int j = s_col_block * Nc + c_col;
if(i == j)
diagonal_inv_tmp()(block)(i) = clover_inv_eigen(s_row*Nc+c_row, s_col*Nc+c_col);
else if(i < j)
triangle_inv_tmp()(block)(triangle_index(i, j)) = clover_inv_eigen(s_row*Nc+c_row, s_col*Nc+c_col);
else
continue;
}
}
}
}
pokeLocalSite(diagonal_inv_tmp, diagonalInv_v, lcoor);
pokeLocalSite(triangle_inv_tmp, triangleInv_v, lcoor);
});
}
static void ConvertLayout(const CloverField& full,
CloverDiagonalField& diagonal,
CloverTriangleField& triangle) {
conformable(full, diagonal);
conformable(full, triangle);
diagonal.Checkerboard() = full.Checkerboard();
triangle.Checkerboard() = full.Checkerboard();
autoView(full_v, full, AcceleratorRead);
autoView(diagonal_v, diagonal, AcceleratorWrite);
autoView(triangle_v, triangle, AcceleratorWrite);
// NOTE: this function cannot be 'private' since nvcc forbids this for kernels
accelerator_for(ss, full.Grid()->oSites(), 1, {
for(int s_row = 0; s_row < Ns; s_row++) {
for(int s_col = 0; s_col < Ns; s_col++) {
if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
int block = s_row / Nhs;
int s_row_block = s_row % Nhs;
int s_col_block = s_col % Nhs;
for(int c_row = 0; c_row < Nc; c_row++) {
for(int c_col = 0; c_col < Nc; c_col++) {
int i = s_row_block * Nc + c_row;
int j = s_col_block * Nc + c_col;
if(i == j)
diagonal_v[ss]()(block)(i) = full_v[ss]()(s_row, s_col)(c_row, c_col);
else if(i < j)
triangle_v[ss]()(block)(triangle_index(i, j)) = full_v[ss]()(s_row, s_col)(c_row, c_col);
else
continue;
}
}
}
}
});
}
static void ConvertLayout(const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle,
CloverField& full) {
conformable(full, diagonal);
conformable(full, triangle);
full.Checkerboard() = diagonal.Checkerboard();
full = Zero();
autoView(diagonal_v, diagonal, AcceleratorRead);
autoView(triangle_v, triangle, AcceleratorRead);
autoView(full_v, full, AcceleratorWrite);
// NOTE: this function cannot be 'private' since nvcc forbids this for kernels
accelerator_for(ss, full.Grid()->oSites(), 1, {
for(int s_row = 0; s_row < Ns; s_row++) {
for(int s_col = 0; s_col < Ns; s_col++) {
if(abs(s_row - s_col) > 1 || s_row + s_col == 3) continue;
int block = s_row / Nhs;
int s_row_block = s_row % Nhs;
int s_col_block = s_col % Nhs;
for(int c_row = 0; c_row < Nc; c_row++) {
for(int c_col = 0; c_col < Nc; c_col++) {
int i = s_row_block * Nc + c_row;
int j = s_col_block * Nc + c_col;
if(i == j)
full_v[ss]()(s_row, s_col)(c_row, c_col) = diagonal_v[ss]()(block)(i);
else
full_v[ss]()(s_row, s_col)(c_row, c_col) = triangle_elem(triangle_v[ss], block, i, j);
}
}
}
}
});
}
static void ModifyBoundaries(CloverDiagonalField& diagonal, CloverTriangleField& triangle, RealD csw_t, RealD cF, RealD diag_mass) {
// Checks/grid
double t0 = usecond();
conformable(diagonal, triangle);
GridBase* grid = diagonal.Grid();
// Determine the boundary coordinates/sites
double t1 = usecond();
int t_dir = Nd - 1;
Lattice<iScalar<vInteger>> t_coor(grid);
LatticeCoordinate(t_coor, t_dir);
int T = grid->GlobalDimensions()[t_dir];
// Set off-diagonal parts at boundary to zero -- OK
double t2 = usecond();
CloverTriangleField zeroTriangle(grid);
zeroTriangle.Checkerboard() = triangle.Checkerboard();
zeroTriangle = Zero();
triangle = where(t_coor == 0, zeroTriangle, triangle);
triangle = where(t_coor == T-1, zeroTriangle, triangle);
// Set diagonal to unity (scaled correctly) -- OK
double t3 = usecond();
CloverDiagonalField tmp(grid);
tmp.Checkerboard() = diagonal.Checkerboard();
tmp = -1.0 * csw_t + diag_mass;
diagonal = where(t_coor == 0, tmp, diagonal);
diagonal = where(t_coor == T-1, tmp, diagonal);
// Correct values next to boundary
double t4 = usecond();
if(cF != 1.0) {
tmp = cF - 1.0;
tmp += diagonal;
diagonal = where(t_coor == 1, tmp, diagonal);
diagonal = where(t_coor == T-2, tmp, diagonal);
}
// Report timings
double t5 = usecond();
#if 0
std::cout << GridLogMessage << "CompactWilsonCloverHelpers::ModifyBoundaries timings:"
<< " checks = " << (t1 - t0) / 1e6
<< ", coordinate = " << (t2 - t1) / 1e6
<< ", off-diag zero = " << (t3 - t2) / 1e6
<< ", diagonal unity = " << (t4 - t3) / 1e6
<< ", near-boundary = " << (t5 - t4) / 1e6
<< ", total = " << (t5 - t0) / 1e6
<< std::endl;
#endif
}
template<class Field, class Mask>
static strong_inline void ApplyBoundaryMask(Field& f, const Mask& m) {
conformable(f, m);
auto grid = f.Grid();
const uint32_t Nsite = grid->oSites();
const uint32_t Nsimd = grid->Nsimd();
autoView(f_v, f, AcceleratorWrite);
autoView(m_v, m, AcceleratorRead);
// NOTE: this function cannot be 'private' since nvcc forbids this for kernels
accelerator_for(ss, Nsite, Nsimd, {
coalescedWrite(f_v[ss], m_v(ss) * f_v(ss));
});
}
template<class MaskField>
static void SetupMasks(MaskField& full, MaskField& even, MaskField& odd) {
assert(even.Grid()->_isCheckerBoarded && even.Checkerboard() == Even);
assert(odd.Grid()->_isCheckerBoarded && odd.Checkerboard() == Odd);
assert(!full.Grid()->_isCheckerBoarded);
GridBase* grid = full.Grid();
int t_dir = Nd-1;
Lattice<iScalar<vInteger>> t_coor(grid);
LatticeCoordinate(t_coor, t_dir);
int T = grid->GlobalDimensions()[t_dir];
MaskField zeroMask(grid); zeroMask = Zero();
full = 1.0;
full = where(t_coor == 0, zeroMask, full);
full = where(t_coor == T-1, zeroMask, full);
pickCheckerboard(Even, even, full);
pickCheckerboard(Odd, odd, full);
}
};
NAMESPACE_END(Grid);

View File

@ -1,92 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonCloverTypes.h
Copyright (C) 2021 - 2022
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
template<class Impl>
class WilsonCloverTypes {
public:
INHERIT_IMPL_TYPES(Impl);
template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
typedef iImplClover<Simd> SiteClover;
typedef Lattice<SiteClover> CloverField;
};
template<class Impl>
class CompactWilsonCloverTypes {
public:
INHERIT_IMPL_TYPES(Impl);
static_assert(Nd == 4 && Nc == 3 && Ns == 4 && Impl::Dimension == 3, "Wrong dimensions");
static constexpr int Nred = Nc * Nhs; // 6
static constexpr int Nblock = Nhs; // 2
static constexpr int Ndiagonal = Nred; // 6
static constexpr int Ntriangle = (Nred - 1) * Nc; // 15
template<typename vtype> using iImplCloverDiagonal = iScalar<iVector<iVector<vtype, Ndiagonal>, Nblock>>;
template<typename vtype> using iImplCloverTriangle = iScalar<iVector<iVector<vtype, Ntriangle>, Nblock>>;
typedef iImplCloverDiagonal<Simd> SiteCloverDiagonal;
typedef iImplCloverTriangle<Simd> SiteCloverTriangle;
typedef iSinglet<Simd> SiteMask;
typedef Lattice<SiteCloverDiagonal> CloverDiagonalField;
typedef Lattice<SiteCloverTriangle> CloverTriangleField;
typedef Lattice<SiteMask> MaskField;
};
#define INHERIT_CLOVER_TYPES(Impl) \
typedef typename WilsonCloverTypes<Impl>::SiteClover SiteClover; \
typedef typename WilsonCloverTypes<Impl>::CloverField CloverField;
#define INHERIT_COMPACT_CLOVER_TYPES(Impl) \
typedef typename CompactWilsonCloverTypes<Impl>::SiteCloverDiagonal SiteCloverDiagonal; \
typedef typename CompactWilsonCloverTypes<Impl>::SiteCloverTriangle SiteCloverTriangle; \
typedef typename CompactWilsonCloverTypes<Impl>::SiteMask SiteMask; \
typedef typename CompactWilsonCloverTypes<Impl>::CloverDiagonalField CloverDiagonalField; \
typedef typename CompactWilsonCloverTypes<Impl>::CloverTriangleField CloverTriangleField; \
typedef typename CompactWilsonCloverTypes<Impl>::MaskField MaskField; \
/* ugly duplication but needed inside functionality classes */ \
template<typename vtype> using iImplCloverDiagonal = \
iScalar<iVector<iVector<vtype, CompactWilsonCloverTypes<Impl>::Ndiagonal>, CompactWilsonCloverTypes<Impl>::Nblock>>; \
template<typename vtype> using iImplCloverTriangle = \
iScalar<iVector<iVector<vtype, CompactWilsonCloverTypes<Impl>::Ntriangle>, CompactWilsonCloverTypes<Impl>::Nblock>>;
#define INHERIT_COMPACT_CLOVER_SIZES(Impl) \
static constexpr int Nred = CompactWilsonCloverTypes<Impl>::Nred; \
static constexpr int Nblock = CompactWilsonCloverTypes<Impl>::Nblock; \
static constexpr int Ndiagonal = CompactWilsonCloverTypes<Impl>::Ndiagonal; \
static constexpr int Ntriangle = CompactWilsonCloverTypes<Impl>::Ntriangle;
NAMESPACE_END(Grid);

View File

@ -61,19 +61,18 @@ public:
typedef typename SiteHalfSpinor::vector_type vComplexHigh; typedef typename SiteHalfSpinor::vector_type vComplexHigh;
constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh); constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh);
accelerator_inline int CommDatumSize(void) const { accelerator_inline int CommDatumSize(void) {
return sizeof(SiteHalfCommSpinor); return sizeof(SiteHalfCommSpinor);
} }
/*****************************************************/ /*****************************************************/
/* Compress includes precision change if mpi data is not same */ /* Compress includes precision change if mpi data is not same */
/*****************************************************/ /*****************************************************/
accelerator_inline void Compress(SiteHalfSpinor &buf,const SiteSpinor &in) const { template<class _SiteHalfSpinor, class _SiteSpinor>
typedef decltype(coalescedRead(buf)) sobj; accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) {
sobj sp; _SiteHalfSpinor tmp;
auto sin = coalescedRead(in); projector::Proj(tmp,in,mu,dag);
projector::Proj(sp,sin,mu,dag); vstream(buf[o],tmp);
coalescedWrite(buf,sp);
} }
/*****************************************************/ /*****************************************************/
@ -82,24 +81,19 @@ public:
accelerator_inline void Exchange(SiteHalfSpinor *mp, accelerator_inline void Exchange(SiteHalfSpinor *mp,
const SiteHalfSpinor * __restrict__ vp0, const SiteHalfSpinor * __restrict__ vp0,
const SiteHalfSpinor * __restrict__ vp1, const SiteHalfSpinor * __restrict__ vp1,
Integer type,Integer o) const { Integer type,Integer o){
#ifdef GRID_SIMT
exchangeSIMT(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type);
#else
SiteHalfSpinor tmp1; SiteHalfSpinor tmp1;
SiteHalfSpinor tmp2; SiteHalfSpinor tmp2;
exchange(tmp1,tmp2,vp0[o],vp1[o],type); exchange(tmp1,tmp2,vp0[o],vp1[o],type);
vstream(mp[2*o ],tmp1); vstream(mp[2*o ],tmp1);
vstream(mp[2*o+1],tmp2); vstream(mp[2*o+1],tmp2);
#endif
} }
/*****************************************************/ /*****************************************************/
/* Have a decompression step if mpi data is not same */ /* Have a decompression step if mpi data is not same */
/*****************************************************/ /*****************************************************/
accelerator_inline void Decompress(SiteHalfSpinor * __restrict__ out, accelerator_inline void Decompress(SiteHalfSpinor * __restrict__ out,
SiteHalfSpinor * __restrict__ in, Integer o) const { SiteHalfSpinor * __restrict__ in, Integer o) {
assert(0); assert(0);
} }
@ -109,30 +103,8 @@ public:
accelerator_inline void CompressExchange(SiteHalfSpinor * __restrict__ out0, accelerator_inline void CompressExchange(SiteHalfSpinor * __restrict__ out0,
SiteHalfSpinor * __restrict__ out1, SiteHalfSpinor * __restrict__ out1,
const SiteSpinor * __restrict__ in, const SiteSpinor * __restrict__ in,
Integer j,Integer k, Integer m,Integer type) const Integer j,Integer k, Integer m,Integer type)
{ {
#ifdef GRID_SIMT
typedef SiteSpinor vobj;
typedef SiteHalfSpinor hvobj;
typedef decltype(coalescedRead(*in)) sobj;
typedef decltype(coalescedRead(*out0)) hsobj;
unsigned int Nsimd = vobj::Nsimd();
unsigned int mask = Nsimd >> (type + 1);
int lane = acceleratorSIMTlane(Nsimd);
int j0 = lane &(~mask); // inner coor zero
int j1 = lane |(mask) ; // inner coor one
const vobj *vp0 = &in[k];
const vobj *vp1 = &in[m];
const vobj *vp = (lane&mask) ? vp1:vp0;
auto sa = coalescedRead(*vp,j0);
auto sb = coalescedRead(*vp,j1);
hsobj psa, psb;
projector::Proj(psa,sa,mu,dag);
projector::Proj(psb,sb,mu,dag);
coalescedWrite(out0[j],psa);
coalescedWrite(out1[j],psb);
#else
SiteHalfSpinor temp1, temp2; SiteHalfSpinor temp1, temp2;
SiteHalfSpinor temp3, temp4; SiteHalfSpinor temp3, temp4;
projector::Proj(temp1,in[k],mu,dag); projector::Proj(temp1,in[k],mu,dag);
@ -140,17 +112,15 @@ public:
exchange(temp3,temp4,temp1,temp2,type); exchange(temp3,temp4,temp1,temp2,type);
vstream(out0[j],temp3); vstream(out0[j],temp3);
vstream(out1[j],temp4); vstream(out1[j],temp4);
#endif
} }
/*****************************************************/ /*****************************************************/
/* Pass the info to the stencil */ /* Pass the info to the stencil */
/*****************************************************/ /*****************************************************/
accelerator_inline bool DecompressionStep(void) const { return false; } accelerator_inline bool DecompressionStep(void) { return false; }
}; };
#if 0
template<class _HCspinor,class _Hspinor,class _Spinor, class projector> template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector, class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
typename std::enable_if<!std::is_same<_HCspinor,_Hspinor>::value>::type > typename std::enable_if<!std::is_same<_HCspinor,_Hspinor>::value>::type >
@ -172,30 +142,20 @@ public:
typedef typename SiteHalfSpinor::vector_type vComplexHigh; typedef typename SiteHalfSpinor::vector_type vComplexHigh;
constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh); constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh);
accelerator_inline int CommDatumSize(void) const { accelerator_inline int CommDatumSize(void) {
return sizeof(SiteHalfCommSpinor); return sizeof(SiteHalfCommSpinor);
} }
/*****************************************************/ /*****************************************************/
/* Compress includes precision change if mpi data is not same */ /* Compress includes precision change if mpi data is not same */
/*****************************************************/ /*****************************************************/
accelerator_inline void Compress(SiteHalfSpinor &buf,const SiteSpinor &in) const { template<class _SiteHalfSpinor, class _SiteSpinor>
SiteHalfSpinor hsp; accelerator_inline void Compress(_SiteHalfSpinor *buf,Integer o,const _SiteSpinor &in) {
_SiteHalfSpinor hsp;
SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf; SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf;
projector::Proj(hsp,in,mu,dag); projector::Proj(hsp,in,mu,dag);
precisionChange((vComplexLow *)&hbuf[o],(vComplexHigh *)&hsp,Nw); precisionChange((vComplexLow *)&hbuf[o],(vComplexHigh *)&hsp,Nw);
} }
accelerator_inline void Compress(SiteHalfSpinor &buf,const SiteSpinor &in) const {
#ifdef GRID_SIMT
typedef decltype(coalescedRead(buf)) sobj;
sobj sp;
auto sin = coalescedRead(in);
projector::Proj(sp,sin,mu,dag);
coalescedWrite(buf,sp);
#else
projector::Proj(buf,in,mu,dag);
#endif
}
/*****************************************************/ /*****************************************************/
/* Exchange includes precision change if mpi data is not same */ /* Exchange includes precision change if mpi data is not same */
@ -203,7 +163,7 @@ public:
accelerator_inline void Exchange(SiteHalfSpinor *mp, accelerator_inline void Exchange(SiteHalfSpinor *mp,
SiteHalfSpinor *vp0, SiteHalfSpinor *vp0,
SiteHalfSpinor *vp1, SiteHalfSpinor *vp1,
Integer type,Integer o) const { Integer type,Integer o){
SiteHalfSpinor vt0,vt1; SiteHalfSpinor vt0,vt1;
SiteHalfCommSpinor *vpp0 = (SiteHalfCommSpinor *)vp0; SiteHalfCommSpinor *vpp0 = (SiteHalfCommSpinor *)vp0;
SiteHalfCommSpinor *vpp1 = (SiteHalfCommSpinor *)vp1; SiteHalfCommSpinor *vpp1 = (SiteHalfCommSpinor *)vp1;
@ -215,7 +175,7 @@ public:
/*****************************************************/ /*****************************************************/
/* Have a decompression step if mpi data is not same */ /* Have a decompression step if mpi data is not same */
/*****************************************************/ /*****************************************************/
accelerator_inline void Decompress(SiteHalfSpinor *out, SiteHalfSpinor *in, Integer o) const { accelerator_inline void Decompress(SiteHalfSpinor *out, SiteHalfSpinor *in, Integer o){
SiteHalfCommSpinor *hin=(SiteHalfCommSpinor *)in; SiteHalfCommSpinor *hin=(SiteHalfCommSpinor *)in;
precisionChange((vComplexHigh *)&out[o],(vComplexLow *)&hin[o],Nw); precisionChange((vComplexHigh *)&out[o],(vComplexLow *)&hin[o],Nw);
} }
@ -226,7 +186,7 @@ public:
accelerator_inline void CompressExchange(SiteHalfSpinor *out0, accelerator_inline void CompressExchange(SiteHalfSpinor *out0,
SiteHalfSpinor *out1, SiteHalfSpinor *out1,
const SiteSpinor *in, const SiteSpinor *in,
Integer j,Integer k, Integer m,Integer type) const { Integer j,Integer k, Integer m,Integer type){
SiteHalfSpinor temp1, temp2,temp3,temp4; SiteHalfSpinor temp1, temp2,temp3,temp4;
SiteHalfCommSpinor *hout0 = (SiteHalfCommSpinor *)out0; SiteHalfCommSpinor *hout0 = (SiteHalfCommSpinor *)out0;
SiteHalfCommSpinor *hout1 = (SiteHalfCommSpinor *)out1; SiteHalfCommSpinor *hout1 = (SiteHalfCommSpinor *)out1;
@ -240,10 +200,9 @@ public:
/*****************************************************/ /*****************************************************/
/* Pass the info to the stencil */ /* Pass the info to the stencil */
/*****************************************************/ /*****************************************************/
accelerator_inline bool DecompressionStep(void) const { return true; } accelerator_inline bool DecompressionStep(void) { return true; }
}; };
#endif
#define DECLARE_PROJ(Projector,Compressor,spProj) \ #define DECLARE_PROJ(Projector,Compressor,spProj) \
class Projector { \ class Projector { \
@ -294,8 +253,33 @@ public:
typedef typename Base::View_type View_type; typedef typename Base::View_type View_type;
typedef typename Base::StencilVector StencilVector; typedef typename Base::StencilVector StencilVector;
void ZeroCountersi(void) { } double timer0;
void Reporti(int calls) { } double timer1;
double timer2;
double timer3;
double timer4;
double timer5;
double timer6;
uint64_t callsi;
void ZeroCountersi(void)
{
timer0=0;
timer1=0;
timer2=0;
timer3=0;
timer4=0;
timer5=0;
timer6=0;
callsi=0;
}
void Reporti(int calls)
{
if ( timer0 ) std::cout << GridLogMessage << " timer0 (HaloGatherOpt) " <<timer0/calls <<std::endl;
if ( timer1 ) std::cout << GridLogMessage << " timer1 (Communicate) " <<timer1/calls <<std::endl;
if ( timer2 ) std::cout << GridLogMessage << " timer2 (CommsMerge ) " <<timer2/calls <<std::endl;
if ( timer3 ) std::cout << GridLogMessage << " timer3 (commsMergeShm) " <<timer3/calls <<std::endl;
if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
}
std::vector<int> surface_list; std::vector<int> surface_list;
@ -337,18 +321,26 @@ public:
{ {
std::vector<std::vector<CommsRequest_t> > reqs; std::vector<std::vector<CommsRequest_t> > reqs;
this->HaloExchangeOptGather(source,compress); this->HaloExchangeOptGather(source,compress);
double t1=usecond();
// Asynchronous MPI calls multidirectional, Isend etc... // Asynchronous MPI calls multidirectional, Isend etc...
// Non-overlapped directions within a thread. Asynchronous calls except MPI3, threaded up to comm threads ways. // Non-overlapped directions within a thread. Asynchronous calls except MPI3, threaded up to comm threads ways.
this->Communicate(); this->Communicate();
double t2=usecond(); timer1 += t2-t1;
this->CommsMerge(compress); this->CommsMerge(compress);
double t3=usecond(); timer2 += t3-t2;
this->CommsMergeSHM(compress); this->CommsMergeSHM(compress);
double t4=usecond(); timer3 += t4-t3;
} }
template <class compressor> template <class compressor>
void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress) void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress)
{ {
this->Prepare(); this->Prepare();
double t0=usecond();
this->HaloGatherOpt(source,compress); this->HaloGatherOpt(source,compress);
double t1=usecond();
timer0 += t1-t0;
callsi++;
} }
template <class compressor> template <class compressor>
@ -360,9 +352,12 @@ public:
typedef typename compressor::SiteHalfSpinor SiteHalfSpinor; typedef typename compressor::SiteHalfSpinor SiteHalfSpinor;
typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor; typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor;
this->mpi3synctime_g-=usecond();
this->_grid->StencilBarrier(); this->_grid->StencilBarrier();
this->mpi3synctime_g+=usecond();
assert(source.Grid()==this->_grid); assert(source.Grid()==this->_grid);
this->halogtime-=usecond();
this->u_comm_offset=0; this->u_comm_offset=0;
@ -398,6 +393,7 @@ public:
} }
this->face_table_computed=1; this->face_table_computed=1;
assert(this->u_comm_offset==this->_unified_buffer_size); assert(this->u_comm_offset==this->_unified_buffer_size);
this->halogtime+=usecond();
accelerator_barrier(); accelerator_barrier();
} }

View File

@ -63,6 +63,9 @@ public:
INHERIT_IMPL_TYPES(Impl); INHERIT_IMPL_TYPES(Impl);
typedef WilsonKernels<Impl> Kernels; typedef WilsonKernels<Impl> Kernels;
virtual std::vector<int> Directions(void) { return this->directions; };
virtual std::vector<int> Displacements(void){ return this->displacements;};
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Implement the abstract base // Implement the abstract base
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////

View File

@ -72,6 +72,9 @@ public:
typedef WilsonKernels<Impl> Kernels; typedef WilsonKernels<Impl> Kernels;
PmuStat stat; PmuStat stat;
virtual std::vector<int> Directions(void) { return this->directions; };
virtual std::vector<int> Displacements(void){ return this->displacements;};
FermionField _tmp; FermionField _tmp;
FermionField &tmp(void) { return _tmp; } FermionField &tmp(void) { return _tmp; }

View File

@ -72,7 +72,7 @@ public:
typedef WilsonCompressor<SiteHalfCommSpinor,SiteHalfSpinor, SiteSpinor> Compressor; typedef WilsonCompressor<SiteHalfCommSpinor,SiteHalfSpinor, SiteSpinor> Compressor;
typedef WilsonImplParams ImplParams; typedef WilsonImplParams ImplParams;
typedef WilsonStencil<SiteSpinor, SiteHalfSpinor,ImplParams> StencilImpl; typedef WilsonStencil<SiteSpinor, SiteHalfSpinor,ImplParams> StencilImpl;
typedef const typename StencilImpl::View_type StencilView; typedef typename StencilImpl::View_type StencilView;
ImplParams Params; ImplParams Params;
@ -106,15 +106,11 @@ public:
const _SpinorField & phi, const _SpinorField & phi,
int mu) int mu)
{ {
const int Nsimd = SiteHalfSpinor::Nsimd();
autoView( out_v, out, AcceleratorWrite); autoView( out_v, out, AcceleratorWrite);
autoView( phi_v, phi, AcceleratorRead); autoView( phi_v, phi, AcceleratorRead);
autoView( Umu_v, Umu, AcceleratorRead); autoView( Umu_v, Umu, AcceleratorRead);
typedef decltype(coalescedRead(out_v[0])) calcSpinor; accelerator_for(sss,out.Grid()->oSites(),1,{
accelerator_for(sss,out.Grid()->oSites(),Nsimd,{ multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu);
calcSpinor tmp;
multLink(tmp,Umu_v[sss],phi_v(sss),mu);
coalescedWrite(out_v[sss],tmp);
}); });
} }
@ -184,22 +180,18 @@ public:
mat = TraceIndex<SpinIndex>(P); mat = TraceIndex<SpinIndex>(P);
} }
inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds) inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
{
for (int mu = 0; mu < Nd; mu++) for (int mu = 0; mu < Nd; mu++)
mat[mu] = PeekIndex<LorentzIndex>(Uds, mu); mat[mu] = PeekIndex<LorentzIndex>(Uds, mu);
} }
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu)
{ inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde,int mu){
#undef USE_OLD_INSERT_FORCE
int Ls=Btilde.Grid()->_fdimensions[0]; int Ls=Btilde.Grid()->_fdimensions[0];
autoView( mat_v , mat, AcceleratorWrite);
#ifdef USE_OLD_INSERT_FORCE
GaugeLinkField tmp(mat.Grid()); GaugeLinkField tmp(mat.Grid());
tmp = Zero(); tmp = Zero();
{ {
const int Nsimd = SiteSpinor::Nsimd();
autoView( tmp_v , tmp, AcceleratorWrite); autoView( tmp_v , tmp, AcceleratorWrite);
autoView( Btilde_v , Btilde, AcceleratorRead); autoView( Btilde_v , Btilde, AcceleratorRead);
autoView( Atilde_v , Atilde, AcceleratorRead); autoView( Atilde_v , Atilde, AcceleratorRead);
@ -212,29 +204,6 @@ public:
}); });
} }
PokeIndex<LorentzIndex>(mat,tmp,mu); PokeIndex<LorentzIndex>(mat,tmp,mu);
#else
{
const int Nsimd = SiteSpinor::Nsimd();
autoView( Btilde_v , Btilde, AcceleratorRead);
autoView( Atilde_v , Atilde, AcceleratorRead);
accelerator_for(sss,mat.Grid()->oSites(),Nsimd,{
int sU=sss;
typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
ColorMatrixType sum;
zeroit(sum);
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
for(int spn=0;spn<Ns;spn++){ //sum over spin
auto bb = coalescedRead(Btilde_v[sF]()(spn) ); //color vector
auto aa = coalescedRead(Atilde_v[sF]()(spn) );
auto op = outerProduct(bb,aa);
sum = sum + op;
}
}
coalescedWrite(mat_v[sU](mu)(), sum);
});
}
#endif
} }
}; };
@ -243,17 +212,17 @@ typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffReal > WilsonImplR
typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffReal > WilsonImplF; // Float typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffReal > WilsonImplF; // Float
typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffReal > WilsonImplD; // Double typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffReal > WilsonImplD; // Double
//typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplRL; // Real.. whichever prec typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplRL; // Real.. whichever prec
//typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplFH; // Float typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplFH; // Float
//typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplDF; // Double typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplDF; // Double
typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffComplex > ZWilsonImplR; // Real.. whichever prec typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffComplex > ZWilsonImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplex > ZWilsonImplF; // Float typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplex > ZWilsonImplF; // Float
typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplex > ZWilsonImplD; // Double typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplex > ZWilsonImplD; // Double
//typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplRL; // Real.. whichever prec typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplRL; // Real.. whichever prec
//typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplFH; // Float typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplFH; // Float
//typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplDF; // Double typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplDF; // Double
typedef WilsonImpl<vComplex, AdjointRepresentation, CoeffReal > WilsonAdjImplR; // Real.. whichever prec typedef WilsonImpl<vComplex, AdjointRepresentation, CoeffReal > WilsonAdjImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, AdjointRepresentation, CoeffReal > WilsonAdjImplF; // Float typedef WilsonImpl<vComplexF, AdjointRepresentation, CoeffReal > WilsonAdjImplF; // Float

View File

@ -49,17 +49,9 @@ public:
INHERIT_IMPL_TYPES(Impl); INHERIT_IMPL_TYPES(Impl);
typedef FermionOperator<Impl> Base; typedef FermionOperator<Impl> Base;
typedef AcceleratorVector<int,STENCIL_MAX> StencilVector;
public: public:
#ifdef GRID_SYCL
#define SYCL_HACK
#endif
#ifdef SYCL_HACK
static void HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p, SiteDoubledGaugeField *U,SiteHalfSpinor *buf,
int ss,int sU,const SiteSpinor *in, SiteSpinor *out);
#endif
static void DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, static void DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out, int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior=1,int exterior=1) ; int interior=1,int exterior=1) ;

View File

@ -79,6 +79,8 @@ public:
_Mat.M(in,tmp); _Mat.M(in,tmp);
G5R5(out,tmp); G5R5(out,tmp);
} }
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
}; };
@ -127,6 +129,8 @@ public:
_Mat.M(in,tmp); _Mat.M(in,tmp);
out=g5*tmp; out=g5*tmp;
} }
virtual std::vector<int> Directions(void) { return _Mat.Directions();};
virtual std::vector<int> Displacements(void){ return _Mat.Displacements();};
}; };
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -642,7 +642,7 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
Current curr_type, Current curr_type,
unsigned int mu) unsigned int mu)
{ {
#if (!defined(GRID_HIP)) #if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
Gamma::Algebra Gmu [] = { Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX, Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY, Gamma::Algebra::GammaY,
@ -799,7 +799,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
PropagatorField tmp(UGrid); PropagatorField tmp(UGrid);
PropagatorField Utmp(UGrid); PropagatorField Utmp(UGrid);
PropagatorField zz (UGrid); zz=0.0; LatticeInteger zz (UGrid); zz=0.0;
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1); LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
for (int s=0;s<Ls;s++) { for (int s=0;s<Ls;s++) {
@ -826,9 +826,8 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
} }
#endif #endif
#if (!defined(GRID_HIP)) #if (!defined(GRID_CUDA)) && (!defined(GRID_HIP))
int tshift = (mu == Nd-1) ? 1 : 0; int tshift = (mu == Nd-1) ? 1 : 0;
unsigned int LLt = GridDefaultLatt()[Tp];
//////////////////////////////////////////////// ////////////////////////////////////////////////
// GENERAL CAYLEY CASE // GENERAL CAYLEY CASE
//////////////////////////////////////////////// ////////////////////////////////////////////////
@ -851,7 +850,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
PropagatorField tmp(UGrid); PropagatorField tmp(UGrid);
PropagatorField Utmp(UGrid); PropagatorField Utmp(UGrid);
PropagatorField zz (UGrid); zz=0.0; LatticeInteger zz (UGrid); zz=0.0;
LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1); LatticeInteger lcoor(UGrid); LatticeCoordinate(lcoor,Nd-1);
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){
@ -881,29 +880,17 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
} }
std::vector<RealD> G_s(Ls,1.0); std::vector<RealD> G_s(Ls,1.0);
RealD sign = 1.0; // sign flip for vector/tadpole
if ( curr_type == Current::Axial ) { if ( curr_type == Current::Axial ) {
for(int s=0;s<Ls/2;s++){ for(int s=0;s<Ls/2;s++){
G_s[s] = -1.0; G_s[s] = -1.0;
} }
} }
else if ( curr_type == Current::Tadpole ) {
auto b=this->_b;
auto c=this->_c;
if ( b == 1 && c == 0 ) {
sign = -1.0;
}
else {
std::cerr << "Error: Tadpole implementation currently unavailable for non-Shamir actions." << std::endl;
assert(b==1 && c==0);
}
}
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){
int sp = (s+1)%Ls; int sp = (s+1)%Ls;
// int sr = Ls-1-s; int sr = Ls-1-s;
// int srp= (sr+1)%Ls; int srp= (sr+1)%Ls;
// Mobius parameters // Mobius parameters
auto b=this->bs[s]; auto b=this->bs[s];
@ -920,7 +907,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
tmp = Cshift(tmp,mu,1); tmp = Cshift(tmp,mu,1);
Impl::multLinkField(Utmp,this->Umu,tmp,mu); Impl::multLinkField(Utmp,this->Umu,tmp,mu);
tmp = sign*G_s[s]*( Utmp*ph - gmu*Utmp*ph ); // Forward hop tmp = G_s[s]*( Utmp*ph - gmu*Utmp*ph ); // Forward hop
tmp = where((lcoor>=tmin),tmp,zz); // Mask the time tmp = where((lcoor>=tmin),tmp,zz); // Mask the time
L_Q = where((lcoor<=tmax),tmp,zz); // Position of current complicated L_Q = where((lcoor<=tmax),tmp,zz); // Position of current complicated
@ -935,13 +922,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
tmp = Cshift(tmp,mu,-1); tmp = Cshift(tmp,mu,-1);
Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link Impl::multLinkField(Utmp,this->Umu,tmp,mu+Nd); // Adjoint link
tmp = -G_s[s]*( Utmp + gmu*Utmp ); tmp = -G_s[s]*( Utmp + gmu*Utmp );
// Mask the time tmp = where((lcoor>=tmin+tshift),tmp,zz); // Mask the time
if (tmax == LLt - 1 && tshift == 1){ // quick fix to include timeslice 0 if tmax + tshift is over the last timeslice
unsigned int t0 = 0;
tmp = where(((lcoor==t0) || (lcoor>=tmin+tshift)),tmp,zz);
} else {
tmp = where((lcoor>=tmin+tshift),tmp,zz);
}
L_Q += where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated L_Q += where((lcoor<=tmax+tshift),tmp,zz); // Position of current complicated
InsertSlice(L_Q, q_out, s , 0); InsertSlice(L_Q, q_out, s , 0);

View File

@ -1,363 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/CompactWilsonCloverFermionImplementation.h
Copyright (C) 2017 - 2022
Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
#include <Grid/qcd/spin/Dirac.h>
#include <Grid/qcd/action/fermion/CompactWilsonCloverFermion.h>
NAMESPACE_BEGIN(Grid);
template<class Impl>
CompactWilsonCloverFermion<Impl>::CompactWilsonCloverFermion(GaugeField& _Umu,
GridCartesian& Fgrid,
GridRedBlackCartesian& Hgrid,
const RealD _mass,
const RealD _csw_r,
const RealD _csw_t,
const RealD _cF,
const WilsonAnisotropyCoefficients& clover_anisotropy,
const ImplParams& impl_p)
: WilsonBase(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy)
, csw_r(_csw_r)
, csw_t(_csw_t)
, cF(_cF)
, open_boundaries(impl_p.boundary_phases[Nd-1] == 0.0)
, Diagonal(&Fgrid), Triangle(&Fgrid)
, DiagonalEven(&Hgrid), TriangleEven(&Hgrid)
, DiagonalOdd(&Hgrid), TriangleOdd(&Hgrid)
, DiagonalInv(&Fgrid), TriangleInv(&Fgrid)
, DiagonalInvEven(&Hgrid), TriangleInvEven(&Hgrid)
, DiagonalInvOdd(&Hgrid), TriangleInvOdd(&Hgrid)
, Tmp(&Fgrid)
, BoundaryMask(&Fgrid)
, BoundaryMaskEven(&Hgrid), BoundaryMaskOdd(&Hgrid)
{
csw_r *= 0.5;
csw_t *= 0.5;
if (clover_anisotropy.isAnisotropic)
csw_r /= clover_anisotropy.xi_0;
ImportGauge(_Umu);
if (open_boundaries)
CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::Dhop(const FermionField& in, FermionField& out, int dag) {
WilsonBase::Dhop(in, out, dag);
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::DhopOE(const FermionField& in, FermionField& out, int dag) {
WilsonBase::DhopOE(in, out, dag);
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::DhopEO(const FermionField& in, FermionField& out, int dag) {
WilsonBase::DhopEO(in, out, dag);
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
WilsonBase::DhopDir(in, out, dir, disp);
if(this->open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
WilsonBase::DhopDirAll(in, out);
if(this->open_boundaries) {
for(auto& o : out) ApplyBoundaryMask(o);
}
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::M(const FermionField& in, FermionField& out) {
out.Checkerboard() = in.Checkerboard();
WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
Mooee(in, Tmp);
axpy(out, 1.0, out, Tmp);
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::Mdag(const FermionField& in, FermionField& out) {
out.Checkerboard() = in.Checkerboard();
WilsonBase::Dhop(in, out, DaggerYes); // call base to save applying bc
MooeeDag(in, Tmp);
axpy(out, 1.0, out, Tmp);
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::Meooe(const FermionField& in, FermionField& out) {
WilsonBase::Meooe(in, out);
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::MeooeDag(const FermionField& in, FermionField& out) {
WilsonBase::MeooeDag(in, out);
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::Mooee(const FermionField& in, FermionField& out) {
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
MooeeInternal(in, out, DiagonalOdd, TriangleOdd);
} else {
MooeeInternal(in, out, DiagonalEven, TriangleEven);
}
} else {
MooeeInternal(in, out, Diagonal, Triangle);
}
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::MooeeDag(const FermionField& in, FermionField& out) {
Mooee(in, out); // blocks are hermitian
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::MooeeInv(const FermionField& in, FermionField& out) {
if(in.Grid()->_isCheckerBoarded) {
if(in.Checkerboard() == Odd) {
MooeeInternal(in, out, DiagonalInvOdd, TriangleInvOdd);
} else {
MooeeInternal(in, out, DiagonalInvEven, TriangleInvEven);
}
} else {
MooeeInternal(in, out, DiagonalInv, TriangleInv);
}
if(open_boundaries) ApplyBoundaryMask(out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::MooeeInvDag(const FermionField& in, FermionField& out) {
MooeeInv(in, out); // blocks are hermitian
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::Mdir(const FermionField& in, FermionField& out, int dir, int disp) {
DhopDir(in, out, dir, disp);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::MdirAll(const FermionField& in, std::vector<FermionField>& out) {
DhopDirAll(in, out);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
assert(!open_boundaries); // TODO check for changes required for open bc
// NOTE: code copied from original clover term
conformable(X.Grid(), Y.Grid());
conformable(X.Grid(), force.Grid());
GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
GaugeField clover_force(force.Grid());
PropagatorField Lambda(force.Grid());
// Guido: Here we are hitting some performance issues:
// need to extract the components of the DoubledGaugeField
// for each call
// Possible solution
// Create a vector object to store them? (cons: wasting space)
std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
Impl::extractLinkField(U, this->Umu);
force = Zero();
// Derivative of the Wilson hopping term
this->DhopDeriv(force, X, Y, dag);
///////////////////////////////////////////////////////////
// Clover term derivative
///////////////////////////////////////////////////////////
Impl::outerProductImpl(Lambda, X, Y);
//std::cout << "Lambda:" << Lambda << std::endl;
Gamma::Algebra sigma[] = {
Gamma::Algebra::SigmaXY,
Gamma::Algebra::SigmaXZ,
Gamma::Algebra::SigmaXT,
Gamma::Algebra::MinusSigmaXY,
Gamma::Algebra::SigmaYZ,
Gamma::Algebra::SigmaYT,
Gamma::Algebra::MinusSigmaXZ,
Gamma::Algebra::MinusSigmaYZ,
Gamma::Algebra::SigmaZT,
Gamma::Algebra::MinusSigmaXT,
Gamma::Algebra::MinusSigmaYT,
Gamma::Algebra::MinusSigmaZT};
/*
sigma_{\mu \nu}=
| 0 sigma[0] sigma[1] sigma[2] |
| sigma[3] 0 sigma[4] sigma[5] |
| sigma[6] sigma[7] 0 sigma[8] |
| sigma[9] sigma[10] sigma[11] 0 |
*/
int count = 0;
clover_force = Zero();
for (int mu = 0; mu < 4; mu++)
{
force_mu = Zero();
for (int nu = 0; nu < 4; nu++)
{
if (mu == nu)
continue;
RealD factor;
if (nu == 4 || mu == 4)
{
factor = 2.0 * csw_t;
}
else
{
factor = 2.0 * csw_r;
}
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
force_mu -= factor*Helpers::Cmunu(U, lambda, mu, nu); // checked
count++;
}
pokeLorentz(clover_force, U[mu] * force_mu, mu);
}
//clover_force *= csw;
force += clover_force;
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::MooDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
assert(0);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::MeeDeriv(GaugeField& mat, const FermionField& U, const FermionField& V, int dag) {
assert(0);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::MooeeInternal(const FermionField& in,
FermionField& out,
const CloverDiagonalField& diagonal,
const CloverTriangleField& triangle) {
assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
out.Checkerboard() = in.Checkerboard();
conformable(in, out);
conformable(in, diagonal);
conformable(in, triangle);
CompactHelpers::MooeeKernel(diagonal.oSites(), 1, in, out, diagonal, triangle);
}
template<class Impl>
void CompactWilsonCloverFermion<Impl>::ImportGauge(const GaugeField& _Umu) {
// NOTE: parts copied from original implementation
// Import gauge into base class
double t0 = usecond();
WilsonBase::ImportGauge(_Umu); // NOTE: called here and in wilson constructor -> performed twice, but can't avoid that
// Initialize temporary variables
double t1 = usecond();
conformable(_Umu.Grid(), this->GaugeGrid());
GridBase* grid = _Umu.Grid();
typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
CloverField TmpOriginal(grid);
// Compute the field strength terms mu>nu
double t2 = usecond();
WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
WilsonLoops<Impl>::FieldStrength(Bz, _Umu, Ydir, Xdir);
WilsonLoops<Impl>::FieldStrength(Ex, _Umu, Tdir, Xdir);
WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
// Compute the Clover Operator acting on Colour and Spin
// multiply here by the clover coefficients for the anisotropy
double t3 = usecond();
TmpOriginal = Helpers::fillCloverYZ(Bx) * csw_r;
TmpOriginal += Helpers::fillCloverXZ(By) * csw_r;
TmpOriginal += Helpers::fillCloverXY(Bz) * csw_r;
TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
TmpOriginal += this->diag_mass;
// Convert the data layout of the clover term
double t4 = usecond();
CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
// Possible modify the boundary values
double t5 = usecond();
if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
// Invert the clover term in the improved layout
double t6 = usecond();
CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv);
// Fill the remaining clover fields
double t7 = usecond();
pickCheckerboard(Even, DiagonalEven, Diagonal);
pickCheckerboard(Even, TriangleEven, Triangle);
pickCheckerboard(Odd, DiagonalOdd, Diagonal);
pickCheckerboard(Odd, TriangleOdd, Triangle);
pickCheckerboard(Even, DiagonalInvEven, DiagonalInv);
pickCheckerboard(Even, TriangleInvEven, TriangleInv);
pickCheckerboard(Odd, DiagonalInvOdd, DiagonalInv);
pickCheckerboard(Odd, TriangleInvOdd, TriangleInv);
// Report timings
double t8 = usecond();
#if 0
std::cout << GridLogMessage << "CompactWilsonCloverFermion::ImportGauge timings:"
<< " WilsonFermion::Importgauge = " << (t1 - t0) / 1e6
<< ", allocations = " << (t2 - t1) / 1e6
<< ", field strength = " << (t3 - t2) / 1e6
<< ", fill clover = " << (t4 - t3) / 1e6
<< ", convert = " << (t5 - t4) / 1e6
<< ", boundaries = " << (t6 - t5) / 1e6
<< ", inversions = " << (t7 - t6) / 1e6
<< ", pick cbs = " << (t8 - t7) / 1e6
<< ", total = " << (t8 - t0) / 1e6
<< std::endl;
#endif
}
NAMESPACE_END(Grid);

View File

@ -680,8 +680,7 @@ void StaggeredKernels<Impl>::DhopSiteAsm(StencilView &st,
gauge2 =(uint64_t)&UU[sU]( Z ); \ gauge2 =(uint64_t)&UU[sU]( Z ); \
gauge3 =(uint64_t)&UU[sU]( T ); gauge3 =(uint64_t)&UU[sU]( T );
#undef STAG_VEC5D
#ifdef STAG_VEC5D
// This is the single precision 5th direction vectorised kernel // This is the single precision 5th direction vectorised kernel
#include <Grid/simd/Intel512single.h> #include <Grid/simd/Intel512single.h>
template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilView &st, template <> void StaggeredKernels<StaggeredVec5dImplF>::DhopSiteAsm(StencilView &st,
@ -791,7 +790,7 @@ template <> void StaggeredKernels<StaggeredVec5dImplD>::DhopSiteAsm(StencilView
#endif #endif
} }
#endif
#define PERMUTE_DIR3 __asm__ ( \ #define PERMUTE_DIR3 __asm__ ( \

View File

@ -32,50 +32,25 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
#ifdef GRID_SIMT #define LOAD_CHI(b) \
#define LOAD_CHI(ptype,b) \
const SiteSpinor & ref (b[offset]); \
Chi_0=coalescedReadPermute<ptype>(ref()()(0),perm,lane); \
Chi_1=coalescedReadPermute<ptype>(ref()()(1),perm,lane); \
Chi_2=coalescedReadPermute<ptype>(ref()()(2),perm,lane);
#define LOAD_CHI_COMMS(b) \
const SiteSpinor & ref (b[offset]); \ const SiteSpinor & ref (b[offset]); \
Chi_0=coalescedRead(ref()()(0),lane); \ Chi_0=ref()()(0);\
Chi_1=coalescedRead(ref()()(1),lane); \ Chi_1=ref()()(1);\
Chi_2=coalescedRead(ref()()(2),lane); Chi_2=ref()()(2);
#define PERMUTE_DIR(dir) ;
#else
#define LOAD_CHI(ptype,b) LOAD_CHI_COMMS(b)
#define LOAD_CHI_COMMS(b) \
const SiteSpinor & ref (b[offset]); \
Chi_0=ref()()(0); \
Chi_1=ref()()(1); \
Chi_2=ref()()(2);
#define PERMUTE_DIR(dir) \
permute##dir(Chi_0,Chi_0); \
permute##dir(Chi_1,Chi_1); \
permute##dir(Chi_2,Chi_2);
#endif
// To splat or not to splat depends on the implementation // To splat or not to splat depends on the implementation
#define MULT(A,UChi) \ #define MULT(A,UChi) \
auto & ref(U[sU](A)); \ auto & ref(U[sU](A)); \
U_00=coalescedRead(ref()(0,0),lane); \ Impl::loadLinkElement(U_00,ref()(0,0)); \
U_10=coalescedRead(ref()(1,0),lane); \ Impl::loadLinkElement(U_10,ref()(1,0)); \
U_20=coalescedRead(ref()(2,0),lane); \ Impl::loadLinkElement(U_20,ref()(2,0)); \
U_01=coalescedRead(ref()(0,1),lane); \ Impl::loadLinkElement(U_01,ref()(0,1)); \
U_11=coalescedRead(ref()(1,1),lane); \ Impl::loadLinkElement(U_11,ref()(1,1)); \
U_21=coalescedRead(ref()(2,1),lane); \ Impl::loadLinkElement(U_21,ref()(2,1)); \
U_02=coalescedRead(ref()(0,2),lane); \ Impl::loadLinkElement(U_02,ref()(0,2)); \
U_12=coalescedRead(ref()(1,2),lane); \ Impl::loadLinkElement(U_12,ref()(1,2)); \
U_22=coalescedRead(ref()(2,2),lane); \ Impl::loadLinkElement(U_22,ref()(2,2)); \
UChi ## _0 = U_00*Chi_0; \ UChi ## _0 = U_00*Chi_0; \
UChi ## _1 = U_10*Chi_0;\ UChi ## _1 = U_10*Chi_0;\
UChi ## _2 = U_20*Chi_0;\ UChi ## _2 = U_20*Chi_0;\
@ -88,15 +63,15 @@ NAMESPACE_BEGIN(Grid);
#define MULT_ADD(U,A,UChi) \ #define MULT_ADD(U,A,UChi) \
auto & ref(U[sU](A)); \ auto & ref(U[sU](A)); \
U_00=coalescedRead(ref()(0,0),lane); \ Impl::loadLinkElement(U_00,ref()(0,0)); \
U_10=coalescedRead(ref()(1,0),lane); \ Impl::loadLinkElement(U_10,ref()(1,0)); \
U_20=coalescedRead(ref()(2,0),lane); \ Impl::loadLinkElement(U_20,ref()(2,0)); \
U_01=coalescedRead(ref()(0,1),lane); \ Impl::loadLinkElement(U_01,ref()(0,1)); \
U_11=coalescedRead(ref()(1,1),lane); \ Impl::loadLinkElement(U_11,ref()(1,1)); \
U_21=coalescedRead(ref()(2,1),lane); \ Impl::loadLinkElement(U_21,ref()(2,1)); \
U_02=coalescedRead(ref()(0,2),lane); \ Impl::loadLinkElement(U_02,ref()(0,2)); \
U_12=coalescedRead(ref()(1,2),lane); \ Impl::loadLinkElement(U_12,ref()(1,2)); \
U_22=coalescedRead(ref()(2,2),lane); \ Impl::loadLinkElement(U_22,ref()(2,2)); \
UChi ## _0 += U_00*Chi_0; \ UChi ## _0 += U_00*Chi_0; \
UChi ## _1 += U_10*Chi_0;\ UChi ## _1 += U_10*Chi_0;\
UChi ## _2 += U_20*Chi_0;\ UChi ## _2 += U_20*Chi_0;\
@ -108,18 +83,24 @@ NAMESPACE_BEGIN(Grid);
UChi ## _2 += U_22*Chi_2; UChi ## _2 += U_22*Chi_2;
#define PERMUTE_DIR(dir) \
permute##dir(Chi_0,Chi_0); \
permute##dir(Chi_1,Chi_1); \
permute##dir(Chi_2,Chi_2);
#define HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \ #define HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
SE=st.GetEntry(ptype,Dir+skew,sF); \ SE=st.GetEntry(ptype,Dir+skew,sF); \
offset = SE->_offset; \ offset = SE->_offset; \
local = SE->_is_local; \ local = SE->_is_local; \
perm = SE->_permute; \ perm = SE->_permute; \
if ( local ) { \ if ( local ) { \
LOAD_CHI(Perm,in); \ LOAD_CHI(in); \
if ( perm) { \ if ( perm) { \
PERMUTE_DIR(Perm); \ PERMUTE_DIR(Perm); \
} \ } \
} else { \ } else { \
LOAD_CHI_COMMS(buf); \ LOAD_CHI(buf); \
} }
#define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even) \ #define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even) \
@ -135,18 +116,19 @@ NAMESPACE_BEGIN(Grid);
} }
#define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even) \ #define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even) \
SE=st.GetEntry(ptype,Dir+skew,sF); \ SE=st.GetEntry(ptype,Dir+skew,sF); \
offset = SE->_offset; \ offset = SE->_offset; \
local = SE->_is_local; \ local = SE->_is_local; \
perm = SE->_permute; \ perm = SE->_permute; \
if ( local ) { \ if ( local ) { \
LOAD_CHI(Perm,in); \ LOAD_CHI(in); \
if ( perm) { \ if ( perm) { \
PERMUTE_DIR(Perm); \ PERMUTE_DIR(Perm); \
} \ } \
} else if ( st.same_node[Dir] ) { \ } else if ( st.same_node[Dir] ) { \
LOAD_CHI_COMMS(buf); \ LOAD_CHI(buf); \
} \ } \
if (local || st.same_node[Dir] ) { \ if (local || st.same_node[Dir] ) { \
MULT_ADD(U,Dir,even); \ MULT_ADD(U,Dir,even); \
@ -158,35 +140,13 @@ NAMESPACE_BEGIN(Grid);
local = SE->_is_local; \ local = SE->_is_local; \
if ((!local) && (!st.same_node[Dir]) ) { \ if ((!local) && (!st.same_node[Dir]) ) { \
nmu++; \ nmu++; \
{ LOAD_CHI_COMMS(buf); } \ { LOAD_CHI(buf); } \
{ MULT_ADD(U,Dir,even); } \ { MULT_ADD(U,Dir,even); } \
} }
#define HAND_DECLARATIONS(Simd) \
Simd even_0; \
Simd even_1; \
Simd even_2; \
Simd odd_0; \
Simd odd_1; \
Simd odd_2; \
\
Simd Chi_0; \
Simd Chi_1; \
Simd Chi_2; \
\
Simd U_00; \
Simd U_10; \
Simd U_20; \
Simd U_01; \
Simd U_11; \
Simd U_21; \
Simd U_02; \
Simd U_12; \
Simd U_22;
template <class Impl> template <class Impl>
template <int Naik> accelerator_inline template <int Naik>
void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st, void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int sF, int sU, SiteSpinor *buf, int sF, int sU,
@ -195,14 +155,28 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
typedef typename Simd::scalar_type S; typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V; typedef typename Simd::vector_type V;
Simd even_0; // 12 regs on knc
Simd even_1;
Simd even_2;
Simd odd_0; // 12 regs on knc
Simd odd_1;
Simd odd_2;
const int Nsimd = SiteHalfSpinor::Nsimd(); Simd Chi_0; // two spinor; 6 regs
const int lane=acceleratorSIMTlane(Nsimd); Simd Chi_1;
typedef decltype( coalescedRead( in[0]()()(0) )) Simt; Simd Chi_2;
HAND_DECLARATIONS(Simt);
Simd U_00; // two rows of U matrix
Simd U_10;
Simd U_20;
Simd U_01;
Simd U_11;
Simd U_21; // 2 reg left.
Simd U_02;
Simd U_12;
Simd U_22;
typedef decltype( coalescedRead( in[0] )) calcSiteSpinor; SiteSpinor result;
calcSiteSpinor result;
int offset,local,perm, ptype; int offset,local,perm, ptype;
StencilEntry *SE; StencilEntry *SE;
@ -241,13 +215,13 @@ void StaggeredKernels<Impl>::DhopSiteHand(StencilView &st,
result()()(1) = even_1 + odd_1; result()()(1) = even_1 + odd_1;
result()()(2) = even_2 + odd_2; result()()(2) = even_2 + odd_2;
} }
coalescedWrite(out[sF],result); vstream(out[sF],result);
} }
} }
template <class Impl> template <class Impl>
template <int Naik> accelerator_inline template <int Naik>
void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st, void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int sF, int sU, SiteSpinor *buf, int sF, int sU,
@ -256,13 +230,28 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
typedef typename Simd::scalar_type S; typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V; typedef typename Simd::vector_type V;
const int Nsimd = SiteHalfSpinor::Nsimd(); Simd even_0; // 12 regs on knc
const int lane=acceleratorSIMTlane(Nsimd); Simd even_1;
typedef decltype( coalescedRead( in[0]()()(0) )) Simt; Simd even_2;
HAND_DECLARATIONS(Simt); Simd odd_0; // 12 regs on knc
Simd odd_1;
Simd odd_2;
typedef decltype( coalescedRead( in[0] )) calcSiteSpinor; Simd Chi_0; // two spinor; 6 regs
calcSiteSpinor result; Simd Chi_1;
Simd Chi_2;
Simd U_00; // two rows of U matrix
Simd U_10;
Simd U_20;
Simd U_01;
Simd U_11;
Simd U_21; // 2 reg left.
Simd U_02;
Simd U_12;
Simd U_22;
SiteSpinor result;
int offset, ptype, local, perm; int offset, ptype, local, perm;
StencilEntry *SE; StencilEntry *SE;
@ -272,8 +261,8 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
// int sF=s+LLs*sU; // int sF=s+LLs*sU;
{ {
zeroit(even_0); zeroit(even_1); zeroit(even_2); even_0 = Zero(); even_1 = Zero(); even_2 = Zero();
zeroit(odd_0); zeroit(odd_1); zeroit(odd_2); odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero();
skew = 0; skew = 0;
HAND_STENCIL_LEG_INT(U,Xp,3,skew,even); HAND_STENCIL_LEG_INT(U,Xp,3,skew,even);
@ -305,13 +294,13 @@ void StaggeredKernels<Impl>::DhopSiteHandInt(StencilView &st,
result()()(1) = even_1 + odd_1; result()()(1) = even_1 + odd_1;
result()()(2) = even_2 + odd_2; result()()(2) = even_2 + odd_2;
} }
coalescedWrite(out[sF],result); vstream(out[sF],result);
} }
} }
template <class Impl> template <class Impl>
template <int Naik> accelerator_inline template <int Naik>
void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st, void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int sF, int sU, SiteSpinor *buf, int sF, int sU,
@ -320,13 +309,28 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
typedef typename Simd::scalar_type S; typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V; typedef typename Simd::vector_type V;
const int Nsimd = SiteHalfSpinor::Nsimd(); Simd even_0; // 12 regs on knc
const int lane=acceleratorSIMTlane(Nsimd); Simd even_1;
typedef decltype( coalescedRead( in[0]()()(0) )) Simt; Simd even_2;
HAND_DECLARATIONS(Simt); Simd odd_0; // 12 regs on knc
Simd odd_1;
Simd odd_2;
typedef decltype( coalescedRead( in[0] )) calcSiteSpinor; Simd Chi_0; // two spinor; 6 regs
calcSiteSpinor result; Simd Chi_1;
Simd Chi_2;
Simd U_00; // two rows of U matrix
Simd U_10;
Simd U_20;
Simd U_01;
Simd U_11;
Simd U_21; // 2 reg left.
Simd U_02;
Simd U_12;
Simd U_22;
SiteSpinor result;
int offset, ptype, local; int offset, ptype, local;
StencilEntry *SE; StencilEntry *SE;
@ -336,8 +340,8 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
// int sF=s+LLs*sU; // int sF=s+LLs*sU;
{ {
zeroit(even_0); zeroit(even_1); zeroit(even_2); even_0 = Zero(); even_1 = Zero(); even_2 = Zero();
zeroit(odd_0); zeroit(odd_1); zeroit(odd_2); odd_0 = Zero(); odd_1 = Zero(); odd_2 = Zero();
int nmu=0; int nmu=0;
skew = 0; skew = 0;
HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even); HAND_STENCIL_LEG_EXT(U,Xp,3,skew,even);
@ -370,7 +374,7 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
result()()(1) = even_1 + odd_1; result()()(1) = even_1 + odd_1;
result()()(2) = even_2 + odd_2; result()()(2) = even_2 + odd_2;
} }
coalescedWrite(out[sF] , out(sF)+ result); out[sF] = out[sF] + result;
} }
} }
} }
@ -393,7 +397,6 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
const FermionFieldView &in, FermionFieldView &out, int dag); \ const FermionFieldView &in, FermionFieldView &out, int dag); \
*/ */
#undef LOAD_CHI #undef LOAD_CHI
#undef HAND_DECLARATIONS
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -35,32 +35,39 @@ NAMESPACE_BEGIN(Grid);
#define GENERIC_STENCIL_LEG(U,Dir,skew,multLink) \ #define GENERIC_STENCIL_LEG(U,Dir,skew,multLink) \
SE = st.GetEntry(ptype, Dir+skew, sF); \ SE = st.GetEntry(ptype, Dir+skew, sF); \
if (SE->_is_local ) { \ if (SE->_is_local ) { \
int perm= SE->_permute; \ if (SE->_permute) { \
chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);\ chi_p = &chi; \
permute(chi, in[SE->_offset], ptype); \
} else { \
chi_p = &in[SE->_offset]; \
} \
} else { \ } else { \
chi = coalescedRead(buf[SE->_offset],lane); \ chi_p = &buf[SE->_offset]; \
} \ } \
acceleratorSynchronise(); \ multLink(Uchi, U[sU], *chi_p, Dir);
multLink(Uchi, U[sU], chi, Dir);
#define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink) \ #define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink) \
SE = st.GetEntry(ptype, Dir+skew, sF); \ SE = st.GetEntry(ptype, Dir+skew, sF); \
if (SE->_is_local ) { \ if (SE->_is_local ) { \
int perm= SE->_permute; \ if (SE->_permute) { \
chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);\ chi_p = &chi; \
permute(chi, in[SE->_offset], ptype); \
} else { \
chi_p = &in[SE->_offset]; \
} \
} else if ( st.same_node[Dir] ) { \ } else if ( st.same_node[Dir] ) { \
chi = coalescedRead(buf[SE->_offset],lane); \ chi_p = &buf[SE->_offset]; \
} \ } \
if (SE->_is_local || st.same_node[Dir] ) { \ if (SE->_is_local || st.same_node[Dir] ) { \
multLink(Uchi, U[sU], chi, Dir); \ multLink(Uchi, U[sU], *chi_p, Dir); \
} }
#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \ #define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \
SE = st.GetEntry(ptype, Dir+skew, sF); \ SE = st.GetEntry(ptype, Dir+skew, sF); \
if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \ if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
nmu++; \ nmu++; \
chi = coalescedRead(buf[SE->_offset],lane); \ chi_p = &buf[SE->_offset]; \
multLink(Uchi, U[sU], chi, Dir); \ multLink(Uchi, U[sU], *chi_p, Dir); \
} }
template <class Impl> template <class Impl>
@ -71,20 +78,18 @@ StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p) : Base(p){};
// Int, Ext, Int+Ext cases for comms overlap // Int, Ext, Int+Ext cases for comms overlap
//////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////
template <class Impl> template <class Impl>
template <int Naik> accelerator_inline template <int Naik>
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st, void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int sF, int sU, SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out, int dag) const FermionFieldView &in, FermionFieldView &out, int dag)
{ {
typedef decltype(coalescedRead(in[0])) calcSpinor; const SiteSpinor *chi_p;
calcSpinor chi; SiteSpinor chi;
calcSpinor Uchi; SiteSpinor Uchi;
StencilEntry *SE; StencilEntry *SE;
int ptype; int ptype;
int skew; int skew;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
// for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
// //
@ -113,7 +118,7 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
if ( dag ) { if ( dag ) {
Uchi = - Uchi; Uchi = - Uchi;
} }
coalescedWrite(out[sF], Uchi,lane); vstream(out[sF], Uchi);
} }
}; };
@ -121,20 +126,17 @@ void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
// Only contributions from interior of our node // Only contributions from interior of our node
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
template <class Impl> template <class Impl>
template <int Naik> accelerator_inline template <int Naik>
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st, void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int sF, int sU, SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag) const FermionFieldView &in, FermionFieldView &out,int dag) {
{ const SiteSpinor *chi_p;
typedef decltype(coalescedRead(in[0])) calcSpinor; SiteSpinor chi;
calcSpinor chi; SiteSpinor Uchi;
calcSpinor Uchi;
StencilEntry *SE; StencilEntry *SE;
int ptype; int ptype;
int skew ; int skew ;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
// for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
// int sF=LLs*sU+s; // int sF=LLs*sU+s;
@ -163,7 +165,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
if ( dag ) { if ( dag ) {
Uchi = - Uchi; Uchi = - Uchi;
} }
coalescedWrite(out[sF], Uchi,lane); vstream(out[sF], Uchi);
} }
}; };
@ -172,21 +174,18 @@ void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
// Only contributions from exterior of our node // Only contributions from exterior of our node
/////////////////////////////////////////////////// ///////////////////////////////////////////////////
template <class Impl> template <class Impl>
template <int Naik> accelerator_inline template <int Naik>
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st, void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
SiteSpinor *buf, int sF, int sU, SiteSpinor *buf, int sF, int sU,
const FermionFieldView &in, FermionFieldView &out,int dag) const FermionFieldView &in, FermionFieldView &out,int dag) {
{ const SiteSpinor *chi_p;
typedef decltype(coalescedRead(in[0])) calcSpinor; // SiteSpinor chi;
calcSpinor chi; SiteSpinor Uchi;
calcSpinor Uchi;
StencilEntry *SE; StencilEntry *SE;
int ptype; int ptype;
int nmu=0; int nmu=0;
int skew ; int skew ;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
// for(int s=0;s<LLs;s++){ // for(int s=0;s<LLs;s++){
// int sF=LLs*sU+s; // int sF=LLs*sU+s;
@ -212,12 +211,11 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd); GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
} }
if ( nmu ) { if ( nmu ) {
auto _out = coalescedRead(out[sF],lane); if ( dag ) {
if ( dag ) { out[sF] = out[sF] - Uchi;
coalescedWrite(out[sF], _out-Uchi,lane);
} else { } else {
coalescedWrite(out[sF], _out+Uchi,lane); out[sF] = out[sF] + Uchi;
} }
} }
} }
@ -226,7 +224,7 @@ void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
//////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////
// Driving / wrapping routine to select right kernel // Driving / wrapping routine to select right kernel
//////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf, void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp) int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp)
{ {
@ -255,7 +253,7 @@ void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldVie
ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \ ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
}); });
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo, void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU, DoubledGaugeField &U, DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag, int interior,int exterior) const FermionField &in, FermionField &out, int dag, int interior,int exterior)
@ -263,8 +261,6 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
GridBase *FGrid=in.Grid(); GridBase *FGrid=in.Grid();
GridBase *UGrid=U.Grid(); GridBase *UGrid=U.Grid();
typedef StaggeredKernels<Impl> ThisKernel; typedef StaggeredKernels<Impl> ThisKernel;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
autoView( UUU_v , UUU, AcceleratorRead); autoView( UUU_v , UUU, AcceleratorRead);
autoView( U_v , U, AcceleratorRead); autoView( U_v , U, AcceleratorRead);
autoView( in_v , in, AcceleratorRead); autoView( in_v , in, AcceleratorRead);
@ -297,7 +293,7 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
} }
assert(0 && " Kernel optimisation case not covered "); assert(0 && " Kernel optimisation case not covered ");
} }
template <class Impl> template <class Impl>
void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo, void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag, int interior,int exterior) const FermionField &in, FermionField &out, int dag, int interior,int exterior)
@ -305,8 +301,6 @@ void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
GridBase *FGrid=in.Grid(); GridBase *FGrid=in.Grid();
GridBase *UGrid=U.Grid(); GridBase *UGrid=U.Grid();
typedef StaggeredKernels<Impl> ThisKernel; typedef StaggeredKernels<Impl> ThisKernel;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
autoView( UUU_v , U, AcceleratorRead); autoView( UUU_v , U, AcceleratorRead);
autoView( U_v , U, AcceleratorRead); autoView( U_v , U, AcceleratorRead);
autoView( in_v , in, AcceleratorRead); autoView( in_v , in, AcceleratorRead);

View File

@ -2,13 +2,12 @@
Grid physics library, www.github.com/paboyle/Grid Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/WilsonCloverFermionImplementation.h Source file: ./lib/qcd/action/fermion/WilsonCloverFermion.cc
Copyright (C) 2017 - 2022 Copyright (C) 2017
Author: paboyle <paboyle@ph.ed.ac.uk> Author: paboyle <paboyle@ph.ed.ac.uk>
Author: Guido Cossu <guido.cossu@ed.ac.uk> Author: Guido Cossu <guido.cossu@ed.ac.uk>
Author: Daniel Richtmann <daniel.richtmann@gmail.com>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -34,45 +33,6 @@
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
template<class Impl>
WilsonCloverFermion<Impl>::WilsonCloverFermion(GaugeField& _Umu,
GridCartesian& Fgrid,
GridRedBlackCartesian& Hgrid,
const RealD _mass,
const RealD _csw_r,
const RealD _csw_t,
const WilsonAnisotropyCoefficients& clover_anisotropy,
const ImplParams& impl_p)
: WilsonFermion<Impl>(_Umu, Fgrid, Hgrid, _mass, impl_p, clover_anisotropy)
, CloverTerm(&Fgrid)
, CloverTermInv(&Fgrid)
, CloverTermEven(&Hgrid)
, CloverTermOdd(&Hgrid)
, CloverTermInvEven(&Hgrid)
, CloverTermInvOdd(&Hgrid)
, CloverTermDagEven(&Hgrid)
, CloverTermDagOdd(&Hgrid)
, CloverTermInvDagEven(&Hgrid)
, CloverTermInvDagOdd(&Hgrid) {
assert(Nd == 4); // require 4 dimensions
if(clover_anisotropy.isAnisotropic) {
csw_r = _csw_r * 0.5 / clover_anisotropy.xi_0;
diag_mass = _mass + 1.0 + (Nd - 1) * (clover_anisotropy.nu / clover_anisotropy.xi_0);
} else {
csw_r = _csw_r * 0.5;
diag_mass = 4.0 + _mass;
}
csw_t = _csw_t * 0.5;
if(csw_r == 0)
std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_r = 0" << std::endl;
if(csw_t == 0)
std::cout << GridLogWarning << "Initializing WilsonCloverFermion with csw_t = 0" << std::endl;
ImportGauge(_Umu);
}
// *NOT* EO // *NOT* EO
template <class Impl> template <class Impl>
void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out) void WilsonCloverFermion<Impl>::M(const FermionField &in, FermionField &out)
@ -107,13 +67,10 @@ void WilsonCloverFermion<Impl>::Mdag(const FermionField &in, FermionField &out)
template <class Impl> template <class Impl>
void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu) void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
{ {
double t0 = usecond();
WilsonFermion<Impl>::ImportGauge(_Umu); WilsonFermion<Impl>::ImportGauge(_Umu);
double t1 = usecond();
GridBase *grid = _Umu.Grid(); GridBase *grid = _Umu.Grid();
typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid); typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
double t2 = usecond();
// Compute the field strength terms mu>nu // Compute the field strength terms mu>nu
WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir); WilsonLoops<Impl>::FieldStrength(Bx, _Umu, Zdir, Ydir);
WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir); WilsonLoops<Impl>::FieldStrength(By, _Umu, Zdir, Xdir);
@ -122,32 +79,33 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir); WilsonLoops<Impl>::FieldStrength(Ey, _Umu, Tdir, Ydir);
WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir); WilsonLoops<Impl>::FieldStrength(Ez, _Umu, Tdir, Zdir);
double t3 = usecond();
// Compute the Clover Operator acting on Colour and Spin // Compute the Clover Operator acting on Colour and Spin
// multiply here by the clover coefficients for the anisotropy // multiply here by the clover coefficients for the anisotropy
CloverTerm = Helpers::fillCloverYZ(Bx) * csw_r; CloverTerm = fillCloverYZ(Bx) * csw_r;
CloverTerm += Helpers::fillCloverXZ(By) * csw_r; CloverTerm += fillCloverXZ(By) * csw_r;
CloverTerm += Helpers::fillCloverXY(Bz) * csw_r; CloverTerm += fillCloverXY(Bz) * csw_r;
CloverTerm += Helpers::fillCloverXT(Ex) * csw_t; CloverTerm += fillCloverXT(Ex) * csw_t;
CloverTerm += Helpers::fillCloverYT(Ey) * csw_t; CloverTerm += fillCloverYT(Ey) * csw_t;
CloverTerm += Helpers::fillCloverZT(Ez) * csw_t; CloverTerm += fillCloverZT(Ez) * csw_t;
CloverTerm += diag_mass; CloverTerm += diag_mass;
double t4 = usecond();
int lvol = _Umu.Grid()->lSites(); int lvol = _Umu.Grid()->lSites();
int DimRep = Impl::Dimension; int DimRep = Impl::Dimension;
double t5 = usecond(); Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
Coordinate lcoor;
typename SiteCloverType::scalar_object Qx = Zero(), Qxinv = Zero();
{ {
autoView(CTv,CloverTerm,CpuRead); autoView(CTv,CloverTerm,CpuRead);
autoView(CTIv,CloverTermInv,CpuWrite); autoView(CTIv,CloverTermInv,CpuWrite);
thread_for(site, lvol, { for (int site = 0; site < lvol; site++) {
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor); grid->LocalIndexToLocalCoor(site, lcoor);
Eigen::MatrixXcd EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep); EigenCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
Eigen::MatrixXcd EigenInvCloverOp = Eigen::MatrixXcd::Zero(Ns * DimRep, Ns * DimRep);
typename SiteClover::scalar_object Qx = Zero(), Qxinv = Zero();
peekLocalSite(Qx, CTv, lcoor); peekLocalSite(Qx, CTv, lcoor);
Qxinv = Zero();
//if (csw!=0){ //if (csw!=0){
for (int j = 0; j < Ns; j++) for (int j = 0; j < Ns; j++)
for (int k = 0; k < Ns; k++) for (int k = 0; k < Ns; k++)
@ -168,10 +126,9 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
// if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl; // if (site==0) std::cout << "site =" << site << "\n" << EigenInvCloverOp << std::endl;
// } // }
pokeLocalSite(Qxinv, CTIv, lcoor); pokeLocalSite(Qxinv, CTIv, lcoor);
}); }
} }
double t6 = usecond();
// Separate the even and odd parts // Separate the even and odd parts
pickCheckerboard(Even, CloverTermEven, CloverTerm); pickCheckerboard(Even, CloverTermEven, CloverTerm);
pickCheckerboard(Odd, CloverTermOdd, CloverTerm); pickCheckerboard(Odd, CloverTermOdd, CloverTerm);
@ -184,20 +141,6 @@ void WilsonCloverFermion<Impl>::ImportGauge(const GaugeField &_Umu)
pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv)); pickCheckerboard(Even, CloverTermInvDagEven, adj(CloverTermInv));
pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv)); pickCheckerboard(Odd, CloverTermInvDagOdd, adj(CloverTermInv));
double t7 = usecond();
#if 0
std::cout << GridLogMessage << "WilsonCloverFermion::ImportGauge timings:"
<< " WilsonFermion::Importgauge = " << (t1 - t0) / 1e6
<< ", allocations = " << (t2 - t1) / 1e6
<< ", field strength = " << (t3 - t2) / 1e6
<< ", fill clover = " << (t4 - t3) / 1e6
<< ", misc = " << (t5 - t4) / 1e6
<< ", inversions = " << (t6 - t5) / 1e6
<< ", pick cbs = " << (t7 - t6) / 1e6
<< ", total = " << (t7 - t0) / 1e6
<< std::endl;
#endif
} }
template <class Impl> template <class Impl>
@ -228,7 +171,7 @@ template <class Impl>
void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv) void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionField &out, int dag, int inv)
{ {
out.Checkerboard() = in.Checkerboard(); out.Checkerboard() = in.Checkerboard();
CloverField *Clover; CloverFieldType *Clover;
assert(in.Checkerboard() == Odd || in.Checkerboard() == Even); assert(in.Checkerboard() == Odd || in.Checkerboard() == Even);
if (dag) if (dag)
@ -243,12 +186,12 @@ void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionFie
{ {
Clover = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven; Clover = (inv) ? &CloverTermInvDagEven : &CloverTermDagEven;
} }
Helpers::multCloverField(out, *Clover, in); out = *Clover * in;
} }
else else
{ {
Clover = (inv) ? &CloverTermInv : &CloverTerm; Clover = (inv) ? &CloverTermInv : &CloverTerm;
Helpers::multCloverField(out, *Clover, in); // don't bother with adj, hermitian anyway out = adj(*Clover) * in;
} }
} }
else else
@ -266,98 +209,18 @@ void WilsonCloverFermion<Impl>::MooeeInternal(const FermionField &in, FermionFie
// std::cout << "Calling clover term Even" << std::endl; // std::cout << "Calling clover term Even" << std::endl;
Clover = (inv) ? &CloverTermInvEven : &CloverTermEven; Clover = (inv) ? &CloverTermInvEven : &CloverTermEven;
} }
Helpers::multCloverField(out, *Clover, in); out = *Clover * in;
// std::cout << GridLogMessage << "*Clover.Checkerboard() " << (*Clover).Checkerboard() << std::endl; // std::cout << GridLogMessage << "*Clover.Checkerboard() " << (*Clover).Checkerboard() << std::endl;
} }
else else
{ {
Clover = (inv) ? &CloverTermInv : &CloverTerm; Clover = (inv) ? &CloverTermInv : &CloverTerm;
Helpers::multCloverField(out, *Clover, in); out = *Clover * in;
} }
} }
} // MooeeInternal } // MooeeInternal
// Derivative parts unpreconditioned pseudofermions
template <class Impl>
void WilsonCloverFermion<Impl>::MDeriv(GaugeField &force, const FermionField &X, const FermionField &Y, int dag)
{
conformable(X.Grid(), Y.Grid());
conformable(X.Grid(), force.Grid());
GaugeLinkField force_mu(force.Grid()), lambda(force.Grid());
GaugeField clover_force(force.Grid());
PropagatorField Lambda(force.Grid());
// Guido: Here we are hitting some performance issues:
// need to extract the components of the DoubledGaugeField
// for each call
// Possible solution
// Create a vector object to store them? (cons: wasting space)
std::vector<GaugeLinkField> U(Nd, this->Umu.Grid());
Impl::extractLinkField(U, this->Umu);
force = Zero();
// Derivative of the Wilson hopping term
this->DhopDeriv(force, X, Y, dag);
///////////////////////////////////////////////////////////
// Clover term derivative
///////////////////////////////////////////////////////////
Impl::outerProductImpl(Lambda, X, Y);
//std::cout << "Lambda:" << Lambda << std::endl;
Gamma::Algebra sigma[] = {
Gamma::Algebra::SigmaXY,
Gamma::Algebra::SigmaXZ,
Gamma::Algebra::SigmaXT,
Gamma::Algebra::MinusSigmaXY,
Gamma::Algebra::SigmaYZ,
Gamma::Algebra::SigmaYT,
Gamma::Algebra::MinusSigmaXZ,
Gamma::Algebra::MinusSigmaYZ,
Gamma::Algebra::SigmaZT,
Gamma::Algebra::MinusSigmaXT,
Gamma::Algebra::MinusSigmaYT,
Gamma::Algebra::MinusSigmaZT};
/*
sigma_{\mu \nu}=
| 0 sigma[0] sigma[1] sigma[2] |
| sigma[3] 0 sigma[4] sigma[5] |
| sigma[6] sigma[7] 0 sigma[8] |
| sigma[9] sigma[10] sigma[11] 0 |
*/
int count = 0;
clover_force = Zero();
for (int mu = 0; mu < 4; mu++)
{
force_mu = Zero();
for (int nu = 0; nu < 4; nu++)
{
if (mu == nu)
continue;
RealD factor;
if (nu == 4 || mu == 4)
{
factor = 2.0 * csw_t;
}
else
{
factor = 2.0 * csw_r;
}
PropagatorField Slambda = Gamma(sigma[count]) * Lambda; // sigma checked
Impl::TraceSpinImpl(lambda, Slambda); // traceSpin ok
force_mu -= factor*Helpers::Cmunu(U, lambda, mu, nu); // checked
count++;
}
pokeLorentz(clover_force, U[mu] * force_mu, mu);
}
//clover_force *= csw;
force += clover_force;
}
// Derivative parts // Derivative parts
template <class Impl> template <class Impl>

View File

@ -397,7 +397,6 @@ void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, co
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
{ {
DhopCalls+=2;
conformable(in.Grid(), _grid); // verifies full grid conformable(in.Grid(), _grid); // verifies full grid
conformable(in.Grid(), out.Grid()); conformable(in.Grid(), out.Grid());
@ -409,7 +408,6 @@ void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int da
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
{ {
DhopCalls++;
conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check conformable(in.Grid(), out.Grid()); // drops the cb check
@ -422,7 +420,6 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
{ {
DhopCalls++;
conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check conformable(in.Grid(), out.Grid()); // drops the cb check

View File

@ -38,6 +38,9 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
// undefine everything related to kernels // undefine everything related to kernels
#include <simd/Fujitsu_A64FX_undef.h> #include <simd/Fujitsu_A64FX_undef.h>
// enable A64FX body
#define WILSONKERNELSASMBODYA64FX
//#pragma message("A64FX Dslash: WilsonKernelsAsmBodyA64FX.h")
/////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////
// If we are A64FX specialise the single precision routine // If we are A64FX specialise the single precision routine
@ -60,89 +63,119 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
#define INTERIOR_AND_EXTERIOR #define INTERIOR_AND_EXTERIOR
#undef INTERIOR #undef INTERIOR
#undef EXTERIOR #undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void template<> void
WilsonKernels<WilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<WilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void
//template<> void WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
//WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #if defined (WILSONKERNELSASMBODYA64FX)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void #endif
//WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR #undef INTERIOR_AND_EXTERIOR
#define INTERIOR #define INTERIOR
#undef EXTERIOR #undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<WilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void
//template<> void WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
//WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #if defined (WILSONKERNELSASMBODYA64FX)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void #endif
//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR #undef INTERIOR_AND_EXTERIOR
#undef INTERIOR #undef INTERIOR
#define EXTERIOR #define EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<WilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void
//template<> void WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
//WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #if defined (WILSONKERNELSASMBODYA64FX)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void #endif
//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
@ -152,89 +185,119 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldVi
#define INTERIOR_AND_EXTERIOR #define INTERIOR_AND_EXTERIOR
#undef INTERIOR #undef INTERIOR
#undef EXTERIOR #undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<WilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void
//template<> void WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
//WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #if defined (WILSONKERNELSASMBODYA64FX)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void #endif
//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR #undef INTERIOR_AND_EXTERIOR
#define INTERIOR #define INTERIOR
#undef EXTERIOR #undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<WilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void
//template<> void WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
//WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #if defined (WILSONKERNELSASMBODYA64FX)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void #endif
//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR #undef INTERIOR_AND_EXTERIOR
#undef INTERIOR #undef INTERIOR
#define EXTERIOR #define EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void template<> void
WilsonKernels<WilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<WilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void template<> void
WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void
//template<> void WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
//WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #if defined (WILSONKERNELSASMBODYA64FX)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
//#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void #endif
//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
template<> void
WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
// undefine // undefine
@ -267,89 +330,119 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFiel
#define INTERIOR_AND_EXTERIOR #define INTERIOR_AND_EXTERIOR
#undef INTERIOR #undef INTERIOR
#undef EXTERIOR #undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void template<> void
WilsonKernels<WilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<WilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void
// template<> void WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #if defined (WILSONKERNELSASMBODYA64FX)
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
// template<> void #endif
// WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR #undef INTERIOR_AND_EXTERIOR
#define INTERIOR #define INTERIOR
#undef EXTERIOR #undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<WilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void
// template<> void WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #if defined (WILSONKERNELSASMBODYA64FX)
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
// template<> void #endif
// WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR #undef INTERIOR_AND_EXTERIOR
#undef INTERIOR #undef INTERIOR
#define EXTERIOR #define EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<WilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void
// template<> void WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #if defined (WILSONKERNELSASMBODYA64FX)
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
// template<> void #endif
// WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
// XYZT vectorised, dag Kernel, double // XYZT vectorised, dag Kernel, double
@ -358,93 +451,124 @@ WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldVi
#define INTERIOR_AND_EXTERIOR #define INTERIOR_AND_EXTERIOR
#undef INTERIOR #undef INTERIOR
#undef EXTERIOR #undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<WilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void
// template<> void WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #if defined (WILSONKERNELSASMBODYA64FX)
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
// template<> void #endif
// WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR #undef INTERIOR_AND_EXTERIOR
#define INTERIOR #define INTERIOR
#undef EXTERIOR #undef EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<WilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void
// template<> void WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #if defined (WILSONKERNELSASMBODYA64FX)
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
// template<> void #endif
// WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#undef INTERIOR_AND_EXTERIOR #undef INTERIOR_AND_EXTERIOR
#undef INTERIOR #undef INTERIOR
#define EXTERIOR #define EXTERIOR
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void template<> void
WilsonKernels<WilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<WilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
#pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2")
template<> void template<> void
WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") template<> void
// template<> void WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) #if defined (WILSONKERNELSASMBODYA64FX)
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
// #pragma GCC optimize ("-O3", "-fno-schedule-insns", "-fno-schedule-insns2") #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
// template<> void #endif
// WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
// #include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
template<> void
WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#if defined (WILSONKERNELSASMBODYA64FX)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBodyA64FX.h>
#else
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif
// undefs // undefs
#undef WILSONKERNELSASMBODYA64FX
#include <simd/Fujitsu_A64FX_undef.h> #include <simd/Fujitsu_A64FX_undef.h>
#endif //A64FXASM #endif //A64FXASM

View File

@ -74,15 +74,15 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<WilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//
//template<> void template<> void
//WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<ZWilsonImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR #undef INTERIOR_AND_EXTERIOR
#define INTERIOR #define INTERIOR
@ -97,15 +97,15 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldVi
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<WilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//
//template<> void template<> void
//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<ZWilsonImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR #undef INTERIOR_AND_EXTERIOR
@ -121,15 +121,15 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldVi
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<WilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//
//template<> void template<> void
//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<ZWilsonImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
// XYZT vectorised, dag Kernel, single // XYZT vectorised, dag Kernel, single
@ -148,15 +148,15 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldVi
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<WilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//
//template<> void template<> void
//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR #undef INTERIOR_AND_EXTERIOR
#define INTERIOR #define INTERIOR
@ -171,15 +171,15 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFiel
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<WilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//
//template<> void template<> void
//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR #undef INTERIOR_AND_EXTERIOR
#undef INTERIOR #undef INTERIOR
@ -194,15 +194,15 @@ WilsonKernels<ZWilsonImplF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFiel
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<WilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//
//template<> void template<> void
//WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<ZWilsonImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef MAYBEPERM #undef MAYBEPERM
#undef MULT_2SPIN #undef MULT_2SPIN
@ -228,14 +228,14 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSite(StencilView &st, DoubledGaugeF
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR #undef INTERIOR_AND_EXTERIOR
#define INTERIOR #define INTERIOR
@ -249,14 +249,14 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteInt(StencilView &st, DoubledGau
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR #undef INTERIOR_AND_EXTERIOR
#undef INTERIOR #undef INTERIOR
@ -273,15 +273,15 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteExt(StencilView &st, DoubledGau
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//
//template<> void template<> void
//WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
// Ls vectorised, dag Kernel, single // Ls vectorised, dag Kernel, single
@ -299,14 +299,14 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDag(StencilView &st, DoubledGau
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR #undef INTERIOR_AND_EXTERIOR
#define INTERIOR #define INTERIOR
@ -320,14 +320,14 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagInt(StencilView &st, Doubled
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR #undef INTERIOR_AND_EXTERIOR
#undef INTERIOR #undef INTERIOR
@ -341,14 +341,14 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilView &st, Doubled
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, WilsonKernels<DomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, WilsonKernels<ZDomainWallVec5dImplFH>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif // VEC 5D #endif // VEC 5D
@ -392,14 +392,14 @@ WilsonKernels<ZWilsonImplD>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<WilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<ZWilsonImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR #undef INTERIOR_AND_EXTERIOR
#define INTERIOR #define INTERIOR
@ -413,14 +413,14 @@ WilsonKernels<ZWilsonImplD>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldVi
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<WilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<ZWilsonImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR #undef INTERIOR_AND_EXTERIOR
#undef INTERIOR #undef INTERIOR
@ -434,14 +434,14 @@ WilsonKernels<ZWilsonImplD>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldVi
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<WilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<ZWilsonImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
// XYZT vectorised, dag Kernel, single // XYZT vectorised, dag Kernel, single
@ -459,14 +459,14 @@ WilsonKernels<ZWilsonImplD>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldVi
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<WilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR #undef INTERIOR_AND_EXTERIOR
#define INTERIOR #define INTERIOR
@ -480,14 +480,14 @@ WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFiel
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<WilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR #undef INTERIOR_AND_EXTERIOR
#undef INTERIOR #undef INTERIOR
@ -501,14 +501,14 @@ WilsonKernels<ZWilsonImplD>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFiel
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<WilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<ZWilsonImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef MAYBEPERM #undef MAYBEPERM
#undef MULT_2SPIN #undef MULT_2SPIN
@ -533,14 +533,14 @@ WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSite(StencilView &st, DoubledGaugeF
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR #undef INTERIOR_AND_EXTERIOR
#define INTERIOR #define INTERIOR
@ -554,14 +554,14 @@ WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteInt(StencilView &st, DoubledGau
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR #undef INTERIOR_AND_EXTERIOR
#undef INTERIOR #undef INTERIOR
@ -577,14 +577,14 @@ WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteExt(StencilView &st, DoubledGau
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
///////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////
// Ls vectorised, dag Kernel, single // Ls vectorised, dag Kernel, single
@ -602,14 +602,14 @@ WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDag(StencilView &st, DoubledGau
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR #undef INTERIOR_AND_EXTERIOR
#define INTERIOR #define INTERIOR
@ -623,14 +623,14 @@ WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDagInt(StencilView &st, Doubled
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#undef INTERIOR_AND_EXTERIOR #undef INTERIOR_AND_EXTERIOR
#undef INTERIOR #undef INTERIOR
@ -645,14 +645,14 @@ WilsonKernels<ZDomainWallVec5dImplD>::AsmDhopSiteDagExt(StencilView &st, Doubled
int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, WilsonKernels<DomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
//template<> void template<> void
//WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, WilsonKernels<ZDomainWallVec5dImplDF>::AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
// int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out) int ss,int ssU,int Ls,int Ns,const FermionFieldView &in, FermionFieldView &out)
//#include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h> #include <qcd/action/fermion/implementation/WilsonKernelsAsmBody.h>
#endif // VEC 5D #endif // VEC 5D

View File

@ -25,11 +25,6 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
See the full license in the file "LICENSE" in the top level distribution directory See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
// GCC 10 messes up SVE instruction scheduling using -O3, but
// -O3 -fno-schedule-insns -fno-schedule-insns2 does wonders
// performance now is better than armclang 20.2
#ifdef KERNEL_DAG #ifdef KERNEL_DAG
#define DIR0_PROJ XP_PROJ #define DIR0_PROJ XP_PROJ
#define DIR1_PROJ YP_PROJ #define DIR1_PROJ YP_PROJ
@ -102,7 +97,7 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
PROJ; \ PROJ; \
MAYBEPERM(PERMUTE_DIR,perm); \ MAYBEPERM(PERMUTE_DIR,perm); \
} else { \ } else { \
LOAD_CHI(base); \ LOAD_CHI(base); \
} \ } \
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
MULT_2SPIN_1(Dir); \ MULT_2SPIN_1(Dir); \
@ -115,11 +110,6 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
} \ } \
RECON; \ RECON; \
/*
NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty
though I expected that it would improve on performance
*/
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
PREFETCH1_CHIMU(base); \ PREFETCH1_CHIMU(base); \
@ -136,63 +126,73 @@ NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
basep = st.GetPFInfo(nent,plocal); nent++; \ basep = st.GetPFInfo(nent,plocal); nent++; \
if ( local ) { \ if ( local ) { \
LOAD_CHIMU(base); \ LOAD_CHIMU(base); \
LOAD_TABLE(PERMUTE_DIR); \ LOAD_TABLE(PERMUTE_DIR); \
PROJ; \ PROJ; \
MAYBEPERM(PERMUTE_DIR,perm); \ MAYBEPERM(PERMUTE_DIR,perm); \
}else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \ }else if ( st.same_node[Dir] ) {LOAD_CHI(base);} \
if ( local || st.same_node[Dir] ) { \ base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
MULT_2SPIN_1(Dir); \ if ( local || st.same_node[Dir] ) { \
MULT_2SPIN_2; \ MULT_2SPIN_1(Dir); \
RECON; \ PREFETCH_CHIMU(base); \
} \ /* PREFETCH_GAUGE_L1(NxtDir); */ \
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ MULT_2SPIN_2; \
PREFETCH_CHIMU(base); \ if (s == 0) { \
PREFETCH_CHIMU_L2(basep); \ if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
} \
RECON; \
PREFETCH_CHIMU_L2(basep); \
} else { PREFETCH_CHIMU(base); } \
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
PREFETCH1_CHIMU(base); \ PREFETCH1_CHIMU(base); \
{ ZERO_PSI; } \
ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON)
#define RESULT(base,basep) SAVE_RESULT(base,basep); #define RESULT(base,basep) SAVE_RESULT(base,basep);
#endif #endif
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Post comms kernel // Post comms kernel
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
#ifdef EXTERIOR #ifdef EXTERIOR
#define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ #define ASM_LEG(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \
if((!local)&&(!st.same_node[Dir]) ) { \ if((!local)&&(!st.same_node[Dir]) ) { \
LOAD_CHI(base); \ LOAD_CHI(base); \
MULT_2SPIN_1(Dir); \ MULT_2SPIN_1(Dir); \
PREFETCH_CHIMU(base); \
/* PREFETCH_GAUGE_L1(NxtDir); */ \
MULT_2SPIN_2; \ MULT_2SPIN_2; \
RECON; \ if (s == 0) { \
nmu++; \ if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
} \
RECON; \
nmu++; \
} }
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
nmu=0; \ nmu=0; \
{ ZERO_PSI;} \ base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++;\
base = st.GetInfo(ptype,local,perm,Dir,ent,plocal); ent++; \ if((!local)&&(!st.same_node[Dir]) ) { \
if((!local)&&(!st.same_node[Dir]) ) { \ LOAD_CHI(base); \
LOAD_CHI(base); \
MULT_2SPIN_1(Dir); \ MULT_2SPIN_1(Dir); \
PREFETCH_CHIMU(base); \
/* PREFETCH_GAUGE_L1(NxtDir); */ \
MULT_2SPIN_2; \ MULT_2SPIN_2; \
RECON; \ if (s == 0) { \
nmu++; \ if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
} \
RECON; \
nmu++; \
} }
#define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);} #define RESULT(base,basep) if (nmu){ ADD_RESULT(base,base);}
#endif #endif
{ {
int nmu; int nmu;
int local,perm, ptype; int local,perm, ptype;
@ -209,6 +209,7 @@ NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty
int ssn=ssU+1; if(ssn>=nmax) ssn=0; int ssn=ssU+1; if(ssn>=nmax) ssn=0;
// int sUn=lo.Reorder(ssn); // int sUn=lo.Reorder(ssn);
int sUn=ssn; int sUn=ssn;
LOCK_GAUGE(0);
#else #else
int sU =ssU; int sU =ssU;
int ssn=ssU+1; if(ssn>=nmax) ssn=0; int ssn=ssU+1; if(ssn>=nmax) ssn=0;
@ -294,11 +295,6 @@ NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty
std::cout << "----------------------------------------------------" << std::endl; std::cout << "----------------------------------------------------" << std::endl;
#endif #endif
// DC ZVA test
// { uint64_t basestore = (uint64_t)&out[ss];
// PREFETCH_RESULT_L2_STORE(basestore); }
ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON); ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON);
#ifdef SHOW #ifdef SHOW
@ -312,11 +308,6 @@ NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty
std::cout << "----------------------------------------------------" << std::endl; std::cout << "----------------------------------------------------" << std::endl;
#endif #endif
// DC ZVA test
//{ uint64_t basestore = (uint64_t)&out[ss];
// PREFETCH_RESULT_L2_STORE(basestore); }
ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJ,DIR6_RECON); ASM_LEG(Zm,Tm,PERMUTE_DIR1,DIR6_PROJ,DIR6_RECON);
#ifdef SHOW #ifdef SHOW
@ -330,11 +321,6 @@ NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty
std::cout << "----------------------------------------------------" << std::endl; std::cout << "----------------------------------------------------" << std::endl;
#endif #endif
// DC ZVA test
//{ uint64_t basestore = (uint64_t)&out[ss];
// PREFETCH_RESULT_L2_STORE(basestore); }
ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON); ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON);
#ifdef SHOW #ifdef SHOW
@ -355,7 +341,6 @@ NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty
base = (uint64_t) &out[ss]; base = (uint64_t) &out[ss];
basep= st.GetPFInfo(nent,plocal); ent++; basep= st.GetPFInfo(nent,plocal); ent++;
basep = (uint64_t) &out[ssn]; basep = (uint64_t) &out[ssn];
//PREFETCH_RESULT_L1_STORE(base);
RESULT(base,basep); RESULT(base,basep);
#ifdef SHOW #ifdef SHOW

View File

@ -646,7 +646,7 @@ NAMESPACE_BEGIN(Grid);
HAND_RESULT_EXT(ss,F) HAND_RESULT_EXT(ss,F)
#define HAND_SPECIALISE_GPARITY(IMPL) \ #define HAND_SPECIALISE_GPARITY(IMPL) \
template<> accelerator_inline void \ template<> void \
WilsonKernels<IMPL>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ WilsonKernels<IMPL>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
{ \ { \
@ -662,7 +662,7 @@ NAMESPACE_BEGIN(Grid);
HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ HAND_DOP_SITE(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \ } \
\ \
template<> accelerator_inline void \ template<> void \
WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ WilsonKernels<IMPL>::HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
{ \ { \
@ -678,7 +678,7 @@ NAMESPACE_BEGIN(Grid);
HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ HAND_DOP_SITE_DAG(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \ } \
\ \
template<> accelerator_inline void \ template<> void \
WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ WilsonKernels<IMPL>::HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
{ \ { \
@ -694,7 +694,7 @@ NAMESPACE_BEGIN(Grid);
HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ HAND_DOP_SITE_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \ } \
\ \
template<> accelerator_inline void \ template<> void \
WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ WilsonKernels<IMPL>::HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
{ \ { \
@ -710,7 +710,7 @@ NAMESPACE_BEGIN(Grid);
HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ HAND_DOP_SITE_DAG_INT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \ } \
\ \
template<> accelerator_inline void \ template<> void \
WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ WilsonKernels<IMPL>::HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
{ \ { \
@ -727,7 +727,7 @@ NAMESPACE_BEGIN(Grid);
nmu = 0; \ nmu = 0; \
HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \ HAND_DOP_SITE_EXT(1, LOAD_CHI_GPARITY,LOAD_CHIMU_GPARITY,MULT_2SPIN_GPARITY); \
} \ } \
template<> accelerator_inline void \ template<> void \
WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \ WilsonKernels<IMPL>::HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, \
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \ int ss,int sU,const FermionFieldView &in, FermionFieldView &out) \
{ \ { \

View File

@ -76,24 +76,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#define REGISTER #define REGISTER
#ifdef GRID_SIMT #define LOAD_CHIMU \
#define LOAD_CHIMU(Ptype) \
{const SiteSpinor & ref (in[offset]); \
Chimu_00=coalescedReadPermute<Ptype>(ref()(0)(0),perm,lane); \
Chimu_01=coalescedReadPermute<Ptype>(ref()(0)(1),perm,lane); \
Chimu_02=coalescedReadPermute<Ptype>(ref()(0)(2),perm,lane); \
Chimu_10=coalescedReadPermute<Ptype>(ref()(1)(0),perm,lane); \
Chimu_11=coalescedReadPermute<Ptype>(ref()(1)(1),perm,lane); \
Chimu_12=coalescedReadPermute<Ptype>(ref()(1)(2),perm,lane); \
Chimu_20=coalescedReadPermute<Ptype>(ref()(2)(0),perm,lane); \
Chimu_21=coalescedReadPermute<Ptype>(ref()(2)(1),perm,lane); \
Chimu_22=coalescedReadPermute<Ptype>(ref()(2)(2),perm,lane); \
Chimu_30=coalescedReadPermute<Ptype>(ref()(3)(0),perm,lane); \
Chimu_31=coalescedReadPermute<Ptype>(ref()(3)(1),perm,lane); \
Chimu_32=coalescedReadPermute<Ptype>(ref()(3)(2),perm,lane); }
#define PERMUTE_DIR(dir) ;
#else
#define LOAD_CHIMU(Ptype) \
{const SiteSpinor & ref (in[offset]); \ {const SiteSpinor & ref (in[offset]); \
Chimu_00=ref()(0)(0);\ Chimu_00=ref()(0)(0);\
Chimu_01=ref()(0)(1);\ Chimu_01=ref()(0)(1);\
@ -108,54 +91,54 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
Chimu_31=ref()(3)(1);\ Chimu_31=ref()(3)(1);\
Chimu_32=ref()(3)(2);} Chimu_32=ref()(3)(2);}
#define PERMUTE_DIR(dir) \ #define LOAD_CHI\
permute##dir(Chi_00,Chi_00); \ {const SiteHalfSpinor &ref(buf[offset]); \
permute##dir(Chi_01,Chi_01); \ Chi_00 = ref()(0)(0);\
permute##dir(Chi_02,Chi_02); \ Chi_01 = ref()(0)(1);\
permute##dir(Chi_10,Chi_10); \ Chi_02 = ref()(0)(2);\
permute##dir(Chi_11,Chi_11); \ Chi_10 = ref()(1)(0);\
permute##dir(Chi_12,Chi_12); Chi_11 = ref()(1)(1);\
Chi_12 = ref()(1)(2);}
#endif
// To splat or not to splat depends on the implementation
#define MULT_2SPIN(A)\ #define MULT_2SPIN(A)\
{auto & ref(U[sU](A)); \ {auto & ref(U[sU](A)); \
U_00=coalescedRead(ref()(0,0),lane); \ Impl::loadLinkElement(U_00,ref()(0,0)); \
U_10=coalescedRead(ref()(1,0),lane); \ Impl::loadLinkElement(U_10,ref()(1,0)); \
U_20=coalescedRead(ref()(2,0),lane); \ Impl::loadLinkElement(U_20,ref()(2,0)); \
U_01=coalescedRead(ref()(0,1),lane); \ Impl::loadLinkElement(U_01,ref()(0,1)); \
U_11=coalescedRead(ref()(1,1),lane); \ Impl::loadLinkElement(U_11,ref()(1,1)); \
U_21=coalescedRead(ref()(2,1),lane); \ Impl::loadLinkElement(U_21,ref()(2,1)); \
UChi_00 = U_00*Chi_00; \ UChi_00 = U_00*Chi_00;\
UChi_10 = U_00*Chi_10; \ UChi_10 = U_00*Chi_10;\
UChi_01 = U_10*Chi_00; \ UChi_01 = U_10*Chi_00;\
UChi_11 = U_10*Chi_10; \ UChi_11 = U_10*Chi_10;\
UChi_02 = U_20*Chi_00; \ UChi_02 = U_20*Chi_00;\
UChi_12 = U_20*Chi_10; \ UChi_12 = U_20*Chi_10;\
UChi_00+= U_01*Chi_01; \ UChi_00+= U_01*Chi_01;\
UChi_10+= U_01*Chi_11; \ UChi_10+= U_01*Chi_11;\
UChi_01+= U_11*Chi_01; \ UChi_01+= U_11*Chi_01;\
UChi_11+= U_11*Chi_11; \ UChi_11+= U_11*Chi_11;\
UChi_02+= U_21*Chi_01; \ UChi_02+= U_21*Chi_01;\
UChi_12+= U_21*Chi_11; \ UChi_12+= U_21*Chi_11;\
U_00=coalescedRead(ref()(0,2),lane); \ Impl::loadLinkElement(U_00,ref()(0,2)); \
U_10=coalescedRead(ref()(1,2),lane); \ Impl::loadLinkElement(U_10,ref()(1,2)); \
U_20=coalescedRead(ref()(2,2),lane); \ Impl::loadLinkElement(U_20,ref()(2,2)); \
UChi_00+= U_00*Chi_02; \ UChi_00+= U_00*Chi_02;\
UChi_10+= U_00*Chi_12; \ UChi_10+= U_00*Chi_12;\
UChi_01+= U_10*Chi_02; \ UChi_01+= U_10*Chi_02;\
UChi_11+= U_10*Chi_12; \ UChi_11+= U_10*Chi_12;\
UChi_02+= U_20*Chi_02; \ UChi_02+= U_20*Chi_02;\
UChi_12+= U_20*Chi_12;} UChi_12+= U_20*Chi_12;}
#define LOAD_CHI \
{const SiteHalfSpinor &ref(buf[offset]); \ #define PERMUTE_DIR(dir) \
Chi_00 = coalescedRead(ref()(0)(0),lane); \ permute##dir(Chi_00,Chi_00);\
Chi_01 = coalescedRead(ref()(0)(1),lane); \ permute##dir(Chi_01,Chi_01);\
Chi_02 = coalescedRead(ref()(0)(2),lane); \ permute##dir(Chi_02,Chi_02);\
Chi_10 = coalescedRead(ref()(1)(0),lane); \ permute##dir(Chi_10,Chi_10);\
Chi_11 = coalescedRead(ref()(1)(1),lane); \ permute##dir(Chi_11,Chi_11);\
Chi_12 = coalescedRead(ref()(1)(2),lane);} permute##dir(Chi_12,Chi_12);
// hspin(0)=fspin(0)+timesI(fspin(3)); // hspin(0)=fspin(0)+timesI(fspin(3));
// hspin(1)=fspin(1)+timesI(fspin(2)); // hspin(1)=fspin(1)+timesI(fspin(2));
@ -370,124 +353,88 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
result_31-= UChi_11; \ result_31-= UChi_11; \
result_32-= UChi_12; result_32-= UChi_12;
#define HAND_STENCIL_LEGB(PROJ,PERM,DIR,RECON) \ #define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \
{int ptype; \ SE=st.GetEntry(ptype,DIR,ss); \
SE=st.GetEntry(ptype,DIR,ss); \ offset = SE->_offset; \
auto offset = SE->_offset; \ local = SE->_is_local; \
auto local = SE->_is_local; \ perm = SE->_permute; \
auto perm = SE->_permute; \ if ( local ) { \
if ( local ) { \ LOAD_CHIMU; \
LOAD_CHIMU(PERM); \ PROJ; \
PROJ; \ if ( perm) { \
if ( perm) { \ PERMUTE_DIR(PERM); \
PERMUTE_DIR(PERM); \ } \
} \ } else { \
} else { \ LOAD_CHI; \
LOAD_CHI; \ } \
} \ MULT_2SPIN(DIR); \
acceleratorSynchronise(); \ RECON;
MULT_2SPIN(DIR); \
RECON; }
#define HAND_STENCIL_LEG(PROJ,PERM,DIR,RECON) \
{ SE=&st_p[DIR+8*ss]; \
auto ptype=st_perm[DIR]; \
auto offset = SE->_offset; \
auto local = SE->_is_local; \
auto perm = SE->_permute; \
if ( local ) { \
LOAD_CHIMU(PERM); \
PROJ; \
if ( perm) { \
PERMUTE_DIR(PERM); \
} \
} else { \
LOAD_CHI; \
} \
acceleratorSynchronise(); \
MULT_2SPIN(DIR); \
RECON; }
#define HAND_STENCIL_LEGA(PROJ,PERM,DIR,RECON) \
{ SE=&st_p[DIR+8*ss]; \
auto ptype=st_perm[DIR]; \
/*SE=st.GetEntry(ptype,DIR,ss);*/ \
auto offset = SE->_offset; \
auto perm = SE->_permute; \
LOAD_CHIMU(PERM); \
PROJ; \
MULT_2SPIN(DIR); \
RECON; }
#define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \ #define HAND_STENCIL_LEG_INT(PROJ,PERM,DIR,RECON) \
{ int ptype; \ SE=st.GetEntry(ptype,DIR,ss); \
SE=st.GetEntry(ptype,DIR,ss); \ offset = SE->_offset; \
auto offset = SE->_offset; \ local = SE->_is_local; \
auto local = SE->_is_local; \ perm = SE->_permute; \
auto perm = SE->_permute; \ if ( local ) { \
if ( local ) { \ LOAD_CHIMU; \
LOAD_CHIMU(PERM); \ PROJ; \
PROJ; \ if ( perm) { \
if ( perm) { \ PERMUTE_DIR(PERM); \
PERMUTE_DIR(PERM); \ } \
} \ } else if ( st.same_node[DIR] ) { \
} else if ( st.same_node[DIR] ) { \ LOAD_CHI; \
LOAD_CHI; \ } \
} \ if (local || st.same_node[DIR] ) { \
acceleratorSynchronise(); \ MULT_2SPIN(DIR); \
if (local || st.same_node[DIR] ) { \ RECON; \
MULT_2SPIN(DIR); \ }
RECON; \
} \
acceleratorSynchronise(); }
#define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \ #define HAND_STENCIL_LEG_EXT(PROJ,PERM,DIR,RECON) \
{ int ptype; \ SE=st.GetEntry(ptype,DIR,ss); \
SE=st.GetEntry(ptype,DIR,ss); \ offset = SE->_offset; \
auto offset = SE->_offset; \ if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \
if((!SE->_is_local)&&(!st.same_node[DIR]) ) { \ LOAD_CHI; \
LOAD_CHI; \ MULT_2SPIN(DIR); \
MULT_2SPIN(DIR); \ RECON; \
RECON; \ nmu++; \
nmu++; \
} \
acceleratorSynchronise(); }
#define HAND_RESULT(ss) \
{ \
SiteSpinor & ref (out[ss]); \
coalescedWrite(ref()(0)(0),result_00,lane); \
coalescedWrite(ref()(0)(1),result_01,lane); \
coalescedWrite(ref()(0)(2),result_02,lane); \
coalescedWrite(ref()(1)(0),result_10,lane); \
coalescedWrite(ref()(1)(1),result_11,lane); \
coalescedWrite(ref()(1)(2),result_12,lane); \
coalescedWrite(ref()(2)(0),result_20,lane); \
coalescedWrite(ref()(2)(1),result_21,lane); \
coalescedWrite(ref()(2)(2),result_22,lane); \
coalescedWrite(ref()(3)(0),result_30,lane); \
coalescedWrite(ref()(3)(1),result_31,lane); \
coalescedWrite(ref()(3)(2),result_32,lane); \
} }
#define HAND_RESULT_EXT(ss) \ #define HAND_RESULT(ss) \
{ \ { \
SiteSpinor & ref (out[ss]); \ SiteSpinor & ref (out[ss]); \
coalescedWrite(ref()(0)(0),coalescedRead(ref()(0)(0))+result_00,lane); \ vstream(ref()(0)(0),result_00); \
coalescedWrite(ref()(0)(1),coalescedRead(ref()(0)(1))+result_01,lane); \ vstream(ref()(0)(1),result_01); \
coalescedWrite(ref()(0)(2),coalescedRead(ref()(0)(2))+result_02,lane); \ vstream(ref()(0)(2),result_02); \
coalescedWrite(ref()(1)(0),coalescedRead(ref()(1)(0))+result_10,lane); \ vstream(ref()(1)(0),result_10); \
coalescedWrite(ref()(1)(1),coalescedRead(ref()(1)(1))+result_11,lane); \ vstream(ref()(1)(1),result_11); \
coalescedWrite(ref()(1)(2),coalescedRead(ref()(1)(2))+result_12,lane); \ vstream(ref()(1)(2),result_12); \
coalescedWrite(ref()(2)(0),coalescedRead(ref()(2)(0))+result_20,lane); \ vstream(ref()(2)(0),result_20); \
coalescedWrite(ref()(2)(1),coalescedRead(ref()(2)(1))+result_21,lane); \ vstream(ref()(2)(1),result_21); \
coalescedWrite(ref()(2)(2),coalescedRead(ref()(2)(2))+result_22,lane); \ vstream(ref()(2)(2),result_22); \
coalescedWrite(ref()(3)(0),coalescedRead(ref()(3)(0))+result_30,lane); \ vstream(ref()(3)(0),result_30); \
coalescedWrite(ref()(3)(1),coalescedRead(ref()(3)(1))+result_31,lane); \ vstream(ref()(3)(1),result_31); \
coalescedWrite(ref()(3)(2),coalescedRead(ref()(3)(2))+result_32,lane); \ vstream(ref()(3)(2),result_32); \
} }
#define HAND_DECLARATIONS(Simd) \ #define HAND_RESULT_EXT(ss) \
if (nmu){ \
SiteSpinor & ref (out[ss]); \
ref()(0)(0)+=result_00; \
ref()(0)(1)+=result_01; \
ref()(0)(2)+=result_02; \
ref()(1)(0)+=result_10; \
ref()(1)(1)+=result_11; \
ref()(1)(2)+=result_12; \
ref()(2)(0)+=result_20; \
ref()(2)(1)+=result_21; \
ref()(2)(2)+=result_22; \
ref()(3)(0)+=result_30; \
ref()(3)(1)+=result_31; \
ref()(3)(2)+=result_32; \
}
#define HAND_DECLARATIONS(a) \
Simd result_00; \ Simd result_00; \
Simd result_01; \ Simd result_01; \
Simd result_02; \ Simd result_02; \
@ -519,19 +466,19 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
Simd U_11; \ Simd U_11; \
Simd U_21; Simd U_21;
#define ZERO_RESULT \ #define ZERO_RESULT \
zeroit(result_00); \ result_00=Zero(); \
zeroit(result_01); \ result_01=Zero(); \
zeroit(result_02); \ result_02=Zero(); \
zeroit(result_10); \ result_10=Zero(); \
zeroit(result_11); \ result_11=Zero(); \
zeroit(result_12); \ result_12=Zero(); \
zeroit(result_20); \ result_20=Zero(); \
zeroit(result_21); \ result_21=Zero(); \
zeroit(result_22); \ result_22=Zero(); \
zeroit(result_30); \ result_30=Zero(); \
zeroit(result_31); \ result_31=Zero(); \
zeroit(result_32); result_32=Zero();
#define Chimu_00 Chi_00 #define Chimu_00 Chi_00
#define Chimu_01 Chi_01 #define Chimu_01 Chi_01
@ -548,54 +495,19 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
template<class Impl> void
#ifdef SYCL_HACK
template<class Impl> accelerator_inline void
WilsonKernels<Impl>::HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p, SiteDoubledGaugeField *U,SiteHalfSpinor *buf,
int ss,int sU,const SiteSpinor *in, SiteSpinor *out)
{
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
typedef iSinglet<Simd> vCplx;
// typedef decltype( coalescedRead( vCplx()()() )) Simt;
typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
const int Nsimd = SiteHalfSpinor::Nsimd();
const int lane=acceleratorSIMTlane(Nsimd);
HAND_DECLARATIONS(Simt);
StencilEntry *SE;
HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
HAND_STENCIL_LEG(TM_PROJ,0,Tp,TM_RECON_ACCUM);
HAND_STENCIL_LEG(XP_PROJ,3,Xm,XP_RECON_ACCUM);
HAND_STENCIL_LEG(YP_PROJ,2,Ym,YP_RECON_ACCUM);
HAND_STENCIL_LEG(ZP_PROJ,1,Zm,ZP_RECON_ACCUM);
HAND_STENCIL_LEG(TP_PROJ,0,Tm,TP_RECON_ACCUM);
HAND_RESULT(ss);
}
#endif
template<class Impl> accelerator_inline void
WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{ {
auto st_p = st._entries_p;
auto st_perm = st._permute_type;
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
typedef typename Simd::scalar_type S; typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V; typedef typename Simd::vector_type V;
typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
const int Nsimd = SiteHalfSpinor::Nsimd(); HAND_DECLARATIONS(ignore);
const int lane=acceleratorSIMTlane(Nsimd);
HAND_DECLARATIONS(Simt);
int offset,local,perm, ptype;
StencilEntry *SE; StencilEntry *SE;
HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON); HAND_STENCIL_LEG(XM_PROJ,3,Xp,XM_RECON);
HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM); HAND_STENCIL_LEG(YM_PROJ,2,Yp,YM_RECON_ACCUM);
HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM); HAND_STENCIL_LEG(ZM_PROJ,1,Zp,ZM_RECON_ACCUM);
@ -607,22 +519,18 @@ WilsonKernels<Impl>::HandDhopSite(StencilView &st, DoubledGaugeFieldView &U,Site
HAND_RESULT(ss); HAND_RESULT(ss);
} }
template<class Impl> accelerator_inline template<class Impl>
void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{ {
auto st_p = st._entries_p;
auto st_perm = st._permute_type;
typedef typename Simd::scalar_type S; typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V; typedef typename Simd::vector_type V;
typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
const int Nsimd = SiteHalfSpinor::Nsimd(); HAND_DECLARATIONS(ignore);
const int lane=acceleratorSIMTlane(Nsimd);
HAND_DECLARATIONS(Simt);
StencilEntry *SE; StencilEntry *SE;
int offset,local,perm, ptype;
HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON); HAND_STENCIL_LEG(XP_PROJ,3,Xp,XP_RECON);
HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM); HAND_STENCIL_LEG(YP_PROJ,2,Yp,YP_RECON_ACCUM);
HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM); HAND_STENCIL_LEG(ZP_PROJ,1,Zp,ZP_RECON_ACCUM);
@ -634,22 +542,17 @@ void WilsonKernels<Impl>::HandDhopSiteDag(StencilView &st,DoubledGaugeFieldView
HAND_RESULT(ss); HAND_RESULT(ss);
} }
template<class Impl> accelerator_inline void template<class Impl> void
WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{ {
// auto st_p = st._entries_p;
// auto st_perm = st._permute_type;
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
typedef typename Simd::scalar_type S; typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V; typedef typename Simd::vector_type V;
typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
const int Nsimd = SiteHalfSpinor::Nsimd(); HAND_DECLARATIONS(ignore);
const int lane=acceleratorSIMTlane(Nsimd);
HAND_DECLARATIONS(Simt);
int offset,local,perm, ptype;
StencilEntry *SE; StencilEntry *SE;
ZERO_RESULT; ZERO_RESULT;
HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM); HAND_STENCIL_LEG_INT(XM_PROJ,3,Xp,XM_RECON_ACCUM);
@ -663,22 +566,17 @@ WilsonKernels<Impl>::HandDhopSiteInt(StencilView &st,DoubledGaugeFieldView &U,Si
HAND_RESULT(ss); HAND_RESULT(ss);
} }
template<class Impl> accelerator_inline template<class Impl>
void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{ {
// auto st_p = st._entries_p;
// auto st_perm = st._permute_type;
typedef typename Simd::scalar_type S; typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V; typedef typename Simd::vector_type V;
typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
const int Nsimd = SiteHalfSpinor::Nsimd(); HAND_DECLARATIONS(ignore);
const int lane=acceleratorSIMTlane(Nsimd);
HAND_DECLARATIONS(Simt);
StencilEntry *SE; StencilEntry *SE;
int offset,local,perm, ptype;
ZERO_RESULT; ZERO_RESULT;
HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM); HAND_STENCIL_LEG_INT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM); HAND_STENCIL_LEG_INT(YP_PROJ,2,Yp,YP_RECON_ACCUM);
@ -691,23 +589,17 @@ void WilsonKernels<Impl>::HandDhopSiteDagInt(StencilView &st,DoubledGaugeFieldVi
HAND_RESULT(ss); HAND_RESULT(ss);
} }
template<class Impl> accelerator_inline void template<class Impl> void
WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{ {
// auto st_p = st._entries_p;
// auto st_perm = st._permute_type;
// T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
typedef typename Simd::scalar_type S; typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V; typedef typename Simd::vector_type V;
typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
const int Nsimd = SiteHalfSpinor::Nsimd(); HAND_DECLARATIONS(ignore);
const int lane=acceleratorSIMTlane(Nsimd);
HAND_DECLARATIONS(Simt); int offset, ptype;
// int offset, ptype;
StencilEntry *SE; StencilEntry *SE;
int nmu=0; int nmu=0;
ZERO_RESULT; ZERO_RESULT;
@ -722,23 +614,17 @@ WilsonKernels<Impl>::HandDhopSiteExt(StencilView &st,DoubledGaugeFieldView &U,Si
HAND_RESULT_EXT(ss); HAND_RESULT_EXT(ss);
} }
template<class Impl> accelerator_inline template<class Impl>
void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf, void WilsonKernels<Impl>::HandDhopSiteDagExt(StencilView &st,DoubledGaugeFieldView &U,SiteHalfSpinor *buf,
int ss,int sU,const FermionFieldView &in, FermionFieldView &out) int ss,int sU,const FermionFieldView &in, FermionFieldView &out)
{ {
// auto st_p = st._entries_p;
// auto st_perm = st._permute_type;
typedef typename Simd::scalar_type S; typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V; typedef typename Simd::vector_type V;
typedef decltype( coalescedRead( in[0]()(0)(0) )) Simt;
const int Nsimd = SiteHalfSpinor::Nsimd(); HAND_DECLARATIONS(ignore);
const int lane=acceleratorSIMTlane(Nsimd);
HAND_DECLARATIONS(Simt);
StencilEntry *SE; StencilEntry *SE;
// int offset, ptype; int offset, ptype;
int nmu=0; int nmu=0;
ZERO_RESULT; ZERO_RESULT;
HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM); HAND_STENCIL_LEG_EXT(XP_PROJ,3,Xp,XP_RECON_ACCUM);
@ -796,4 +682,3 @@ NAMESPACE_END(Grid);
#undef HAND_RESULT #undef HAND_RESULT
#undef HAND_RESULT_INT #undef HAND_RESULT_INT
#undef HAND_RESULT_EXT #undef HAND_RESULT_EXT
#undef HAND_DECLARATIONS

View File

@ -114,7 +114,7 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
// All legs kernels ; comms then compute // All legs kernels ; comms then compute
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
template <class Impl> accelerator_inline template <class Impl>
void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U,
SiteHalfSpinor *buf, int sF, SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out) int sU, const FermionFieldView &in, FermionFieldView &out)
@ -140,7 +140,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldV
coalescedWrite(out[sF],result,lane); coalescedWrite(out[sF],result,lane);
}; };
template <class Impl> accelerator_inline template <class Impl>
void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U, void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U,
SiteHalfSpinor *buf, int sF, SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out) int sU, const FermionFieldView &in, FermionFieldView &out)
@ -169,7 +169,7 @@ void WilsonKernels<Impl>::GenericDhopSite(StencilView &st, DoubledGaugeFieldView
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
// Interior kernels // Interior kernels
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
template <class Impl> accelerator_inline template <class Impl>
void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U,
SiteHalfSpinor *buf, int sF, SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out) int sU, const FermionFieldView &in, FermionFieldView &out)
@ -197,7 +197,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFi
coalescedWrite(out[sF], result,lane); coalescedWrite(out[sF], result,lane);
}; };
template <class Impl> accelerator_inline template <class Impl>
void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U,
SiteHalfSpinor *buf, int sF, SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out) int sU, const FermionFieldView &in, FermionFieldView &out)
@ -227,7 +227,7 @@ void WilsonKernels<Impl>::GenericDhopSiteInt(StencilView &st, DoubledGaugeField
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
// Exterior kernels // Exterior kernels
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
template <class Impl> accelerator_inline template <class Impl>
void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U,
SiteHalfSpinor *buf, int sF, SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out) int sU, const FermionFieldView &in, FermionFieldView &out)
@ -258,7 +258,7 @@ void WilsonKernels<Impl>::GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFi
} }
}; };
template <class Impl> accelerator_inline template <class Impl>
void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U,
SiteHalfSpinor *buf, int sF, SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out) int sU, const FermionFieldView &in, FermionFieldView &out)
@ -290,7 +290,7 @@ void WilsonKernels<Impl>::GenericDhopSiteExt(StencilView &st, DoubledGaugeField
}; };
#define DhopDirMacro(Dir,spProj,spRecon) \ #define DhopDirMacro(Dir,spProj,spRecon) \
template <class Impl> accelerator_inline \ template <class Impl> \
void WilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \ void WilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \
int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \ int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \
{ \ { \
@ -318,7 +318,7 @@ DhopDirMacro(Ym,spProjYm,spReconYm);
DhopDirMacro(Zm,spProjZm,spReconZm); DhopDirMacro(Zm,spProjZm,spReconZm);
DhopDirMacro(Tm,spProjTm,spReconTm); DhopDirMacro(Tm,spProjTm,spReconTm);
template <class Impl> accelerator_inline template <class Impl>
void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, void WilsonKernels<Impl>::DhopDirK( StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF,
int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma) int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int gamma)
{ {
@ -416,21 +416,7 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
#undef LoopBody #undef LoopBody
} }
#define KERNEL_CALL_TMP(A) \ #define KERNEL_CALLNB(A) \
const uint64_t NN = Nsite*Ls; \
auto U_p = & U_v[0]; \
auto in_p = & in_v[0]; \
auto out_p = & out_v[0]; \
auto st_p = st_v._entries_p; \
auto st_perm = st_v._permute_type; \
accelerator_forNB( ss, NN, Simd::Nsimd(), { \
int sF = ss; \
int sU = ss/Ls; \
WilsonKernels<Impl>::A(st_perm,st_p,U_p,buf,sF,sU,in_p,out_p); \
}); \
accelerator_barrier();
#define KERNEL_CALLNB(A) \
const uint64_t NN = Nsite*Ls; \ const uint64_t NN = Nsite*Ls; \
accelerator_forNB( ss, NN, Simd::Nsimd(), { \ accelerator_forNB( ss, NN, Simd::Nsimd(), { \
int sF = ss; \ int sF = ss; \
@ -459,24 +445,20 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
if( interior && exterior ) { if( interior && exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;} if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
#ifdef SYCL_HACK
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_TMP(HandDhopSiteSycl); return; }
#else
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
#endif
#ifndef GRID_CUDA #ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;}
#endif #endif
} else if( interior ) { } else if( interior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;} if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;}
#ifndef GRID_CUDA #ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
#endif #endif
} else if( exterior ) { } else if( exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;} if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
#ifndef GRID_CUDA #ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;}
#endif #endif
} }
@ -494,20 +476,20 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
if( interior && exterior ) { if( interior && exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;} if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;}
#ifndef GRID_CUDA #ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDag); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;}
#endif #endif
} else if( interior ) { } else if( interior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;} if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;}
#ifndef GRID_CUDA #ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;}
#endif #endif
} else if( exterior ) { } else if( exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;} if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;}
#ifndef GRID_CUDA #ifndef GRID_CUDA
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;} if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;}
#endif #endif
} }

Some files were not shown because too many files have changed in this diff Show More