mirror of https://github.com/paboyle/Grid.git synced 2025-08-02 20:57:06 +01:00

Compare commits


1 commit

542 changed files with 7017 additions and 38786 deletions

.gitignore (vendored): 1 line changed

@@ -88,7 +88,6 @@ Thumbs.db
# build directory #
###################
build*/*
Documentation/_build
# IDE related files #
#####################

.travis.yml (new file): 61 lines added

@@ -0,0 +1,61 @@
language: cpp
cache:
directories:
- clang
matrix:
include:
- os: osx
osx_image: xcode8.3
compiler: clang
env: PREC=single
- os: osx
osx_image: xcode8.3
compiler: clang
env: PREC=double
before_install:
- export GRIDDIR=`pwd`
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]] && [ ! -e clang/bin ]; then wget $CLANG_LINK; tar -xf `basename $CLANG_LINK`; mkdir clang; mv clang+*/* clang/; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export PATH="${GRIDDIR}/clang/bin:${PATH}"; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then export LD_LIBRARY_PATH="${GRIDDIR}/clang/lib:${LD_LIBRARY_PATH}"; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install libmpc openssl; fi
install:
- export CWD=`pwd`
- echo $CWD
- export CC=$CC$VERSION
- export CXX=$CXX$VERSION
- echo $PATH
- which autoconf
- autoconf --version
- which automake
- automake --version
- which $CC
- $CC --version
- which $CXX
- $CXX --version
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export LDFLAGS='-L/usr/local/lib'; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export EXTRACONF='--with-openssl=/usr/local/opt/openssl'; fi
script:
- ./bootstrap.sh
- mkdir build
- cd build
- mkdir lime
- cd lime
- mkdir build
- cd build
- wget http://usqcd-software.github.io/downloads/c-lime/lime-1.3.2.tar.gz
- tar xf lime-1.3.2.tar.gz
- cd lime-1.3.2
- ./configure --prefix=$CWD/build/lime/install
- make -j4
- make install
- cd $CWD/build
- ../configure --enable-precision=$PREC --enable-simd=SSE4 --enable-comms=none --with-lime=$CWD/build/lime/install ${EXTRACONF}
- make -j4
- ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
- make check


@@ -34,15 +34,10 @@ directory
#if defined __GNUC__ && __GNUC__>=6
#pragma GCC diagnostic ignored "-Wignored-attributes"
#endif
#if defined __GNUC__ && __GNUC__>=6
#pragma GCC diagnostic ignored "-Wpsabi"
#endif
//disables and intel compiler specific warning (in json.hpp)
#ifdef __ICC
#pragma warning disable 488
#endif
#ifdef __NVCC__
//disables nvcc specific warning in json.hpp


@@ -47,9 +47,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/perfmon/PerfCount.h>
#include <Grid/util/Util.h>
#include <Grid/log/Log.h>
#include <Grid/allocator/Allocator.h>
#include <Grid/allocator/AlignedAllocator.h>
#include <Grid/simd/Simd.h>
#include <Grid/threads/ThreadReduction.h>
#include <Grid/threads/Threads.h>
#include <Grid/serialisation/Serialisation.h>
#include <Grid/util/Sha.h>
#include <Grid/communicator/Communicator.h>


@@ -36,7 +36,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h>
#include <Grid/qcd/QCD.h>
#include <Grid/qcd/spin/Spin.h>
#include <Grid/qcd/gparity/Gparity.h>
#include <Grid/qcd/utils/Utils.h>
#include <Grid/qcd/representations/Representations.h>
NAMESPACE_CHECK(GridQCDCore);


@@ -6,7 +6,6 @@
///////////////////
#include <cassert>
#include <complex>
#include <memory>
#include <vector>
#include <array>
#include <string>
@@ -28,7 +27,4 @@
///////////////////
#include "Config.h"
#ifdef TOFU
#undef GRID_COMMS_THREADS
#endif
#endif /* GRID_STD_H */


@@ -18,28 +18,21 @@
#pragma push_macro("__CUDA_ARCH__")
#pragma push_macro("__NVCC__")
#pragma push_macro("__CUDACC__")
#undef __CUDA_ARCH__
#undef __NVCC__
#undef __CUDACC__
#undef __CUDA_ARCH__
#define __NVCC__REDEFINE__
#endif
/* SYCL save and restore compile environment*/
#ifdef GRID_SYCL
#ifdef __SYCL_DEVICE_ONLY__
#pragma push
#pragma push_macro("__SYCL_DEVICE_ONLY__")
#undef __SYCL_DEVICE_ONLY__
#undef EIGEN_USE_SYCL
#define EIGEN_DONT_VECTORIZE
//#undef EIGEN_USE_SYCL
#define __SYCL__REDEFINE__
#endif
/* HIP save and restore compile environment*/
#ifdef GRID_HIP
#pragma push
#pragma push_macro("__HIP_DEVICE_COMPILE__")
#endif
#define EIGEN_NO_HIP
#include <Grid/Eigen/Dense>
#include <Grid/Eigen/unsupported/CXX11/Tensor>
@@ -58,12 +51,6 @@
#pragma pop
#endif
/*HIP restore*/
#ifdef __HIP__REDEFINE__
#pragma pop_macro("__HIP_DEVICE_COMPILE__")
#pragma pop
#endif
#if defined __GNUC__
#pragma GCC diagnostic pop
#endif
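The hunks above are the save/restore idiom Grid wraps around the bundled Eigen headers: each compiler-identifying macro is saved, undefined while Eigen is parsed, then restored. A minimal self-contained sketch of the same push_macro/pop_macro idiom (the macro and the included header are illustrative stand-ins):

// Save the current definition of __CUDACC__ (if any), hide it from a
// third-party header, then restore exactly what was saved.
#pragma push_macro("__CUDACC__")
#undef __CUDACC__
#include <vector>              // stand-in for <Grid/Eigen/Dense>
#pragma pop_macro("__CUDACC__")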


@@ -21,8 +21,7 @@ if BUILD_HDF5
extra_headers+=serialisation/Hdf5Type.h
endif
all: version-cache
all: version-cache Version.h
version-cache:
@if [ `git status --porcelain | grep -v '??' | wc -l` -gt 0 ]; then\
@@ -43,7 +42,7 @@ version-cache:
fi;\
rm -f vertmp
Version.h: version-cache
Version.h:
cp version-cache Version.h
.PHONY: version-cache
@@ -54,19 +53,6 @@ Version.h: version-cache
include Make.inc
include Eigen.inc
extra_sources+=$(WILS_FERMION_FILES)
extra_sources+=$(STAG_FERMION_FILES)
if BUILD_ZMOBIUS
extra_sources+=$(ZWILS_FERMION_FILES)
endif
if BUILD_GPARITY
extra_sources+=$(GP_FERMION_FILES)
endif
if BUILD_FERMION_REPS
extra_sources+=$(ADJ_FERMION_FILES)
extra_sources+=$(TWOIND_FERMION_FILES)
endif
lib_LIBRARIES = libGrid.a
CCFILES += $(extra_sources)


@@ -29,11 +29,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#ifndef GRID_ALGORITHMS_H
#define GRID_ALGORITHMS_H
NAMESPACE_CHECK(algorithms);
#include <Grid/algorithms/SparseMatrix.h>
#include <Grid/algorithms/LinearOperator.h>
#include <Grid/algorithms/Preconditioner.h>
NAMESPACE_CHECK(SparseMatrix);
#include <Grid/algorithms/approx/Zolotarev.h>
#include <Grid/algorithms/approx/Chebyshev.h>
@@ -43,18 +41,15 @@ NAMESPACE_CHECK(SparseMatrix);
#include <Grid/algorithms/approx/Forecast.h>
#include <Grid/algorithms/approx/RemezGeneral.h>
#include <Grid/algorithms/approx/ZMobius.h>
NAMESPACE_CHECK(approx);
#include <Grid/algorithms/iterative/Deflation.h>
#include <Grid/algorithms/iterative/ConjugateGradient.h>
NAMESPACE_CHECK(ConjGrad);
#include <Grid/algorithms/iterative/BiCGSTAB.h>
NAMESPACE_CHECK(BiCGSTAB);
#include <Grid/algorithms/iterative/ConjugateResidual.h>
#include <Grid/algorithms/iterative/NormalEquations.h>
#include <Grid/algorithms/iterative/SchurRedBlack.h>
#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
#include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
#include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h>
#include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
@@ -67,9 +62,7 @@ NAMESPACE_CHECK(BiCGSTAB);
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
#include <Grid/algorithms/iterative/PowerMethod.h>
NAMESPACE_CHECK(PowerMethod);
#include <Grid/algorithms/CoarsenedMatrix.h>
NAMESPACE_CHECK(CoarsendMatrix);
#include <Grid/algorithms/FFT.h>
#endif

File diff suppressed because it is too large.


@@ -1,3 +1,4 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -36,6 +37,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#endif
#endif
NAMESPACE_BEGIN(Grid);
template<class scalar> struct FFTW { };
@@ -136,7 +138,7 @@ public:
flops=0;
usec =0;
Coordinate layout(Nd,1);
sgrid = new GridCartesian(dimensions,layout,processors,*grid);
sgrid = new GridCartesian(dimensions,layout,processors);
};
~FFT ( void) {
@@ -182,14 +184,14 @@ public:
pencil_gd[dim] = G*processors[dim];
// Pencil global vol LxLxGxLxL per node
GridCartesian pencil_g(pencil_gd,layout,processors,*vgrid);
GridCartesian pencil_g(pencil_gd,layout,processors);
// Construct pencils
typedef typename vobj::scalar_object sobj;
typedef typename sobj::scalar_type scalar;
Lattice<sobj> pgbuf(&pencil_g);
autoView(pgbuf_v , pgbuf, CpuWrite);
auto pgbuf_v = pgbuf.View();
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
@@ -230,18 +232,15 @@ public:
result = source;
int pc = processor_coor[dim];
for(int p=0;p<processors[dim];p++) {
{
autoView(r_v,result,CpuRead);
autoView(p_v,pgbuf,CpuWrite);
thread_for(idx, sgrid->lSites(),{
thread_for(idx, sgrid->lSites(),{
Coordinate cbuf(Nd);
sobj s;
sgrid->LocalIndexToLocalCoor(idx,cbuf);
peekLocalSite(s,r_v,cbuf);
peekLocalSite(s,result,cbuf);
cbuf[dim]+=((pc+p) % processors[dim])*L;
pokeLocalSite(s,p_v,cbuf);
});
}
// cbuf[dim]+=p*L;
pokeLocalSite(s,pgbuf,cbuf);
});
if (p != processors[dim] - 1) {
result = Cshift(result,dim,L);
}
@@ -270,19 +269,15 @@ public:
flops+= flops_call*NN;
// writing out result
{
autoView(pgbuf_v,pgbuf,CpuRead);
autoView(result_v,result,CpuWrite);
thread_for(idx,sgrid->lSites(),{
thread_for(idx,sgrid->lSites(),{
Coordinate clbuf(Nd), cgbuf(Nd);
sobj s;
sgrid->LocalIndexToLocalCoor(idx,clbuf);
cgbuf = clbuf;
cgbuf[dim] = clbuf[dim]+L*pc;
peekLocalSite(s,pgbuf_v,cgbuf);
peekLocalSite(s,pgbuf,cgbuf);
pokeLocalSite(s,result_v,clbuf);
pokeLocalSite(s,result,clbuf);
});
}
result = result*div;
// destroying plan
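Both loops in this hunk show the same API change: one side opens scoped views of a Lattice (autoView with an explicit CpuRead/CpuWrite mode) and passes the view objects to peekLocalSite/pokeLocalSite, while the other passes the Lattice itself. A hedged sketch of the view-based pattern, reassembled from the lines above (grid and field construction elided; identifiers as in the hunk):

// Scoped host views of two lattices; the views close at end of scope.
autoView(r_v, result, CpuRead);        // read-only view
autoView(p_v, pgbuf,  CpuWrite);       // write view of the pencil buffer
thread_for(idx, sgrid->lSites(), {     // host-threaded loop over local sites
  Coordinate c(Nd);
  sobj s;
  sgrid->LocalIndexToLocalCoor(idx, c);
  peekLocalSite(s, r_v, c);            // read one site through the view
  pokeLocalSite(s, p_v, c);            // write one site through the view
});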


@@ -223,14 +223,9 @@ class SchurOperatorBase : public LinearOperatorBase<Field> {
Mpc(in,tmp);
MpcDag(tmp,out);
}
virtual void MpcMpcDag(const Field &in, Field &out) {
Field tmp(in.Grid());
tmp.Checkerboard() = in.Checkerboard();
MpcDag(in,tmp);
Mpc(tmp,out);
}
virtual void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
HermOp(in,out);
out.Checkerboard() = in.Checkerboard();
MpcDagMpc(in,out);
ComplexD dot= innerProduct(in,out);
n1=real(dot);
n2=norm2(out);
@@ -281,16 +276,6 @@ template<class Matrix,class Field>
axpy(out,-1.0,tmp,out);
}
};
// Mpc MpcDag system presented as the HermOp
template<class Matrix,class Field>
class SchurDiagMooeeDagOperator : public SchurDiagMooeeOperator<Matrix,Field> {
public:
virtual void HermOp(const Field &in, Field &out){
out.Checkerboard() = in.Checkerboard();
this->MpcMpcDag(in,out);
}
SchurDiagMooeeDagOperator (Matrix &Mat): SchurDiagMooeeOperator<Matrix,Field>(Mat){};
};
template<class Matrix,class Field>
class SchurDiagOneOperator : public SchurOperatorBase<Field> {
protected:
@@ -545,16 +530,6 @@ public:
template<class Field> class LinearFunction {
public:
virtual void operator() (const Field &in, Field &out) = 0;
virtual void operator() (const std::vector<Field> &in, std::vector<Field> &out)
{
assert(in.size() == out.size());
for (unsigned int i = 0; i < in.size(); ++i)
{
(*this)(in[i], out[i]);
}
}
};
template<class Field> class IdentityLinearFunction : public LinearFunction<Field> {


@@ -292,7 +292,6 @@ public:
template<class Field>
class ChebyshevLanczos : public Chebyshev<Field> {
private:
std::vector<RealD> Coeffs;
int order;
RealD alpha;


@@ -122,14 +122,12 @@ class BiCGSTAB : public OperatorFunction<Field>
LinearCombTimer.Start();
bo = beta * omega;
{
autoView( p_v , p, AcceleratorWrite);
autoView( r_v , r, AcceleratorRead);
autoView( v_v , v, AcceleratorRead);
accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss));
});
}
auto p_v = p.View();
auto r_v = r.View();
auto v_v = v.View();
accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss));
});
LinearCombTimer.Stop();
LinalgTimer.Stop();
@@ -144,20 +142,16 @@ class BiCGSTAB : public OperatorFunction<Field>
alpha = rho / Calpha.real();
LinearCombTimer.Start();
{
autoView( p_v , p, AcceleratorRead);
autoView( r_v , r, AcceleratorRead);
autoView( v_v , v, AcceleratorRead);
autoView( psi_v,psi, AcceleratorRead);
autoView( h_v , h, AcceleratorWrite);
autoView( s_v , s, AcceleratorWrite);
accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss));
});
accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss));
});
}
auto h_v = h.View();
auto psi_v = psi.View();
accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss));
});
auto s_v = s.View();
accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss));
});
LinearCombTimer.Stop();
LinalgTimer.Stop();
@@ -172,19 +166,13 @@ class BiCGSTAB : public OperatorFunction<Field>
omega = Comega.real() / norm2(t);
LinearCombTimer.Start();
{
autoView( psi_v,psi, AcceleratorWrite);
autoView( r_v , r, AcceleratorWrite);
autoView( h_v , h, AcceleratorRead);
autoView( s_v , s, AcceleratorRead);
autoView( t_v , t, AcceleratorRead);
accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss));
coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss));
});
}
auto t_v = t.View();
accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss));
coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss));
});
LinearCombTimer.Stop();
cp = norm2(r);
LinalgTimer.Stop();
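All three rewritten blocks have the same shape: one side opens accelerator views in a scope and fuses the linear combinations into single accelerator_for kernels, the other uses the unscoped View() accessors. For orientation, the first block implements the standard BiCGSTAB search-direction update; with bo = beta*omega as in the code, it is

\[ p \;\leftarrow\; r + \beta\,(p - \omega v) \;=\; \beta\,p - \beta\omega\,v + r . \]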


@@ -102,7 +102,7 @@ public:
// Check if guess is really REALLY good :)
if (cp <= rsq) {
TrueResidual = std::sqrt(a/ssq);
std::cout << GridLogMessage << "ConjugateGradient guess is converged already "<<TrueResidual<< " tol "<< Tolerance<< std::endl;
std::cout << GridLogMessage << "ConjugateGradient guess is converged already " << std::endl;
IterationsToComplete = 0;
return;
}
@@ -140,15 +140,13 @@ public:
b = cp / c;
LinearCombTimer.Start();
{
autoView( psi_v , psi, AcceleratorWrite);
autoView( p_v , p, AcceleratorWrite);
autoView( r_v , r, AcceleratorWrite);
accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss));
coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss));
});
}
auto psi_v = psi.View();
auto p_v = p.View();
auto r_v = r.View();
accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{
coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss));
coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss));
});
LinearCombTimer.Stop();
LinalgTimer.Stop();


@@ -48,29 +48,19 @@ NAMESPACE_BEGIN(Grid);
Integer TotalInnerIterations; //Number of inner CG iterations
Integer TotalOuterIterations; //Number of restarts
Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
RealD TrueResidual;
//Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
LinearFunction<FieldF> *guesser;
MixedPrecisionConjugateGradient(RealD Tol,
Integer maxinnerit,
Integer maxouterit,
GridBase* _sp_grid,
LinearOperatorBase<FieldF> &_Linop_f,
LinearOperatorBase<FieldD> &_Linop_d) :
MixedPrecisionConjugateGradient(Tol, Tol, maxinnerit, maxouterit, _sp_grid, _Linop_f, _Linop_d) {};
MixedPrecisionConjugateGradient(RealD Tol,
RealD InnerTol,
MixedPrecisionConjugateGradient(RealD tol,
Integer maxinnerit,
Integer maxouterit,
GridBase* _sp_grid,
LinearOperatorBase<FieldF> &_Linop_f,
LinearOperatorBase<FieldD> &_Linop_d) :
Linop_f(_Linop_f), Linop_d(_Linop_d),
Tolerance(Tol), InnerTolerance(InnerTol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), SinglePrecGrid(_sp_grid),
OuterLoopNormMult(100.), guesser(NULL){ assert(InnerTol < 1.0e-1);};
OuterLoopNormMult(100.), guesser(NULL){ };
void useGuesser(LinearFunction<FieldF> &g){
guesser = &g;
@@ -89,11 +79,6 @@ NAMESPACE_BEGIN(Grid);
RealD stop = src_norm * Tolerance*Tolerance;
GridBase* DoublePrecGrid = src_d_in.Grid();
//Generate precision change workspaces
precisionChangeWorkspace wk_dp_from_sp(DoublePrecGrid, SinglePrecGrid);
precisionChangeWorkspace wk_sp_from_dp(SinglePrecGrid, DoublePrecGrid);
FieldD tmp_d(DoublePrecGrid);
tmp_d.Checkerboard() = cb;
@@ -134,7 +119,7 @@ NAMESPACE_BEGIN(Grid);
while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
PrecChangeTimer.Start();
precisionChange(src_f, src_d, wk_sp_from_dp);
precisionChange(src_f, src_d);
PrecChangeTimer.Stop();
sol_f = Zero();
@@ -152,7 +137,7 @@ NAMESPACE_BEGIN(Grid);
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f, wk_dp_from_sp);
precisionChange(tmp_d, sol_f);
PrecChangeTimer.Stop();
axpy(sol_d, 1.0, tmp_d, sol_d);
@@ -164,7 +149,6 @@ NAMESPACE_BEGIN(Grid);
ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
CG_d(Linop_d, src_d_in, sol_d);
TotalFinalStepIterations = CG_d.IterationsToComplete;
TrueResidual = CG_d.TrueResidual;
TotalTimer.Stop();
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;
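For orientation, a hedged usage sketch of the solver as it stands after this change (field types, grids, and operators are illustrative; only the single-tolerance constructor remains, and InnerTolerance is set equal to it):

// Hedged sketch: driving the restarted mixed-precision CG.
MixedPrecisionConjugateGradient<LatticeFermionD,LatticeFermionF>
  mcg(1.0e-8,        // tolerance (also used as the inner tolerance)
      10000,         // max single-precision iterations per restart
      50,            // max restarts
      sp_grid,       // single-precision GridBase*
      Linop_f,       // LinearOperatorBase<LatticeFermionF>
      Linop_d);      // LinearOperatorBase<LatticeFermionD>
mcg(src_d, sol_d);   // double-precision source and solution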


@@ -52,7 +52,7 @@ public:
MultiShiftFunction shifts;
std::vector<RealD> TrueResidualShift;
ConjugateGradientMultiShift(Integer maxit, const MultiShiftFunction &_shifts) :
ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) :
MaxIterations(maxit),
shifts(_shifts)
{
@@ -182,9 +182,6 @@ public:
for(int s=0;s<nshift;s++) {
axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
}
std::cout << GridLogIterative << "ConjugateGradientMultiShift: initial rn (|src|^2) =" << rn << " qq (|MdagM src|^2) =" << qq << " d ( dot(src, [MdagM + m_0]src) ) =" << d << " c=" << c << std::endl;
///////////////////////////////////////
// Timers


@@ -1,411 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christopher Kelly <ckelly@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
#define GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
NAMESPACE_BEGIN(Grid);
//CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision.
//The residual is stored in single precision, but the search directions and solution are stored in double precision.
//Every update_freq iterations the residual is corrected in double precision.
//For safety the a final regular CG is applied to clean up if necessary
//Linop to add shift to input linop, used in cleanup CG
namespace ConjugateGradientMultiShiftMixedPrecSupport{
template<typename Field>
class ShiftedLinop: public LinearOperatorBase<Field>{
public:
LinearOperatorBase<Field> &linop_base;
RealD shift;
ShiftedLinop(LinearOperatorBase<Field> &_linop_base, RealD _shift): linop_base(_linop_base), shift(_shift){}
void OpDiag (const Field &in, Field &out){ assert(0); }
void OpDir (const Field &in, Field &out,int dir,int disp){ assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); }
void Op (const Field &in, Field &out){ assert(0); }
void AdjOp (const Field &in, Field &out){ assert(0); }
void HermOp(const Field &in, Field &out){
linop_base.HermOp(in, out);
axpy(out, shift, in, out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
HermOp(in,out);
ComplexD dot = innerProduct(in,out);
n1=real(dot);
n2=norm2(out);
}
};
};
template<class FieldD, class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class ConjugateGradientMultiShiftMixedPrec : public OperatorMultiFunction<FieldD>,
public OperatorFunction<FieldD>
{
public:
using OperatorFunction<FieldD>::operator();
RealD Tolerance;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
std::vector<int> IterationsToCompleteShift; // Iterations for this shift
int verbose;
MultiShiftFunction shifts;
std::vector<RealD> TrueResidualShift;
int ReliableUpdateFreq; //number of iterations between reliable updates
GridBase* SinglePrecGrid; //Grid for single-precision fields
LinearOperatorBase<FieldF> &Linop_f; //single precision
ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
int _ReliableUpdateFreq
) :
MaxIterations(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq)
{
verbose=1;
IterationsToCompleteShift.resize(_shifts.order);
TrueResidualShift.resize(_shifts.order);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
{
GridBase *grid = src.Grid();
int nshift = shifts.order;
std::vector<FieldD> results(nshift,grid);
(*this)(Linop,src,results,psi);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
{
int nshift = shifts.order;
(*this)(Linop,src,results);
psi = shifts.norm*src;
for(int i=0;i<nshift;i++){
psi = psi + shifts.residues[i]*results[i];
}
return;
}
void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
{
GridBase *DoublePrecGrid = src_d.Grid();
precisionChangeWorkspace wk_f_from_d(SinglePrecGrid, DoublePrecGrid);
precisionChangeWorkspace wk_d_from_f(DoublePrecGrid, SinglePrecGrid);
////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction"
////////////////////////////////////////////////////////////////////////
int nshift = shifts.order;
std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
std::vector<RealD> &mresidual(shifts.tolerances);
std::vector<RealD> alpha(nshift,1.0);
//Double precision search directions
FieldD p_d(DoublePrecGrid);
std::vector<FieldD> ps_d(nshift, DoublePrecGrid);// Search directions (double precision)
FieldD tmp_d(DoublePrecGrid);
FieldD r_d(DoublePrecGrid);
FieldD mmp_d(DoublePrecGrid);
assert(psi_d.size()==nshift);
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift];
RealD rsq[nshift];
RealD z[nshift][2];
int converged[nshift];
const int primary =0;
//Primary shift fields CG iteration
RealD a,b,c,d;
RealD cp,bp,qq; //prev
// Matrix mult fields
FieldF r_f(SinglePrecGrid);
FieldF p_f(SinglePrecGrid);
FieldF tmp_f(SinglePrecGrid);
FieldF mmp_f(SinglePrecGrid);
FieldF src_f(SinglePrecGrid);
precisionChange(src_f, src_d, wk_f_from_d);
// Check lightest mass
for(int s=0;s<nshift;s++){
assert( mass[s]>= mass[primary] );
converged[s]=0;
}
// Wire guess to zero
// Residuals "r" are src
// First search direction "p" is also src
cp = norm2(src_d);
// Handle trivial case of zero src.
if( cp == 0. ){
for(int s=0;s<nshift;s++){
psi_d[s] = Zero();
IterationsToCompleteShift[s] = 1;
TrueResidualShift[s] = 0.;
}
return;
}
for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
ps_d[s] = src_d;
}
// r and p for primary
r_f=src_f; //residual maintained in single
p_f=src_f;
p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
//MdagM+m[0]
Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
axpy(mmp_f,mass[0],p_f,mmp_f);
RealD rn = norm2(p_f);
d += rn*mass[0];
b = -cp /d;
// Set up the various shift variables
int iz=0;
z[0][1-iz] = 1.0;
z[0][iz] = 1.0;
bs[0] = b;
for(int s=1;s<nshift;s++){
z[s][1-iz] = 1.0;
z[s][iz] = 1.0/( 1.0 - b*(mass[s]-mass[0]));
bs[s] = b*z[s][iz];
}
// r += b[0] A.p[0]
// c= norm(r)
c=axpy_norm(r_f,b,mmp_f,r_f);
for(int s=0;s<nshift;s++) {
axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
}
///////////////////////////////////////
// Timers
///////////////////////////////////////
GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
SolverTimer.Start();
// Iteration loop
int k;
for (k=1;k<=MaxIterations;k++){
a = c /cp;
//Update double precision search direction by residual
PrecChangeTimer.Start();
precisionChange(r_d, r_f, wk_d_from_f);
PrecChangeTimer.Stop();
AXPYTimer.Start();
axpy(p_d,a,p_d,r_d);
for(int s=0;s<nshift;s++){
if ( ! converged[s] ) {
if (s==0){
axpy(ps_d[s],a,ps_d[s],r_d);
} else{
RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
axpby(ps_d[s],z[s][iz],as,r_d,ps_d[s]);
}
}
}
AXPYTimer.Stop();
PrecChangeTimer.Start();
precisionChange(p_f, p_d, wk_f_from_d); //get back single prec search direction for linop
PrecChangeTimer.Stop();
cp=c;
MatrixTimer.Start();
Linop_f.HermOp(p_f,mmp_f);
d=real(innerProduct(p_f,mmp_f));
MatrixTimer.Stop();
AXPYTimer.Start();
axpy(mmp_f,mass[0],p_f,mmp_f);
AXPYTimer.Stop();
RealD rn = norm2(p_f);
d += rn*mass[0];
bp=b;
b=-cp/d;
// Toggle the recurrence history
bs[0] = b;
iz = 1-iz;
ShiftTimer.Start();
for(int s=1;s<nshift;s++){
if((!converged[s])){
RealD z0 = z[s][1-iz];
RealD z1 = z[s][iz];
z[s][iz] = z0*z1*bp
/ (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
}
}
ShiftTimer.Stop();
//Update double precision solutions
AXPYTimer.Start();
for(int s=0;s<nshift;s++){
int ss = s;
if( (!converged[s]) ) {
axpy(psi_d[ss],-bs[s]*alpha[s],ps_d[s],psi_d[ss]);
}
}
//Perform reliable update if necessary; otherwise update residual from single-prec mmp
RealD c_f = axpy_norm(r_f,b,mmp_f,r_f);
AXPYTimer.Stop();
c = c_f;
if(k % ReliableUpdateFreq == 0){
//Replace r with true residual
MatrixTimer.Start();
Linop_d.HermOp(psi_d[0],mmp_d);
MatrixTimer.Stop();
AXPYTimer.Start();
axpy(mmp_d,mass[0],psi_d[0],mmp_d);
RealD c_d = axpy_norm(r_d, -1.0, mmp_d, src_d);
AXPYTimer.Stop();
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<< ", replaced |r|^2 = "<<c_f <<" with |r|^2 = "<<c_d<<std::endl;
PrecChangeTimer.Start();
precisionChange(r_f, r_d, wk_f_from_d);
PrecChangeTimer.Stop();
c = c_d;
}
// Convergence checks
int all_converged = 1;
for(int s=0;s<nshift;s++){
if ( (!converged[s]) ){
IterationsToCompleteShift[s] = k;
RealD css = c * z[s][iz]* z[s][iz];
if(css<rsq[s]){
if ( ! converged[s] )
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
converged[s]=1;
} else {
all_converged=0;
}
}
}
if ( all_converged ){
SolverTimer.Stop();
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
// Check answers
for(int s=0; s < nshift; s++) {
Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
axpy(tmp_d,mass[s],psi_d[s],mmp_d);
axpy(r_d,-alpha[s],src_d,tmp_d);
RealD rn = norm2(r_d);
RealD cn = norm2(src_d);
TrueResidualShift[s] = std::sqrt(rn/cn);
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
//If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
if(rn >= rsq[s]){
CleanupTimer.Start();
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: performing cleanup step for shift " << s << std::endl;
//Setup linear operators for final cleanup
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d);
cg(src_d, psi_d[s]);
TrueResidualShift[s] = cg.TrueResidual;
CleanupTimer.Stop();
}
}
std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrec: Time Breakdown for body"<<std::endl;
std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tShift " << ShiftTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
IterationsToComplete = k;
return;
}
}
// ugly hack
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
// assert(0);
}
};
NAMESPACE_END(Grid);
#endif
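For readers tracing the shift recurrence toggled at "iz = 1-iz" above: with z0, z1, b, bp, and a as in the code and \( \sigma_s = \texttt{mass[s]} - \texttt{mass[0]} \), the update transcribes to

\[ z_s^{\mathrm{new}} \;=\; \frac{z_0\, z_1\, b_p}{\,b\,a\,(z_1 - z_0) \;+\; z_1\, b_p\,\bigl(1 - \sigma_s\, b\bigr)\,}\,, \qquad b_s \;=\; b\,\frac{z_s^{\mathrm{new}}}{z_0}\,, \]

which propagates the primary-shift iteration to each shifted system at the cost of a few scalars per shift.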


@@ -54,23 +54,15 @@ class DeflatedGuesser: public LinearFunction<Field> {
private:
const std::vector<Field> &evec;
const std::vector<RealD> &eval;
const unsigned int N;
public:
DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval)
: DeflatedGuesser(_evec, _eval, _evec.size())
{}
DeflatedGuesser(const std::vector<Field> & _evec, const std::vector<RealD> & _eval, const unsigned int _N)
: evec(_evec), eval(_eval), N(_N)
{
assert(evec.size()==eval.size());
assert(N <= evec.size());
}
DeflatedGuesser(const std::vector<Field> & _evec,const std::vector<RealD> & _eval) : evec(_evec), eval(_eval) {};
virtual void operator()(const Field &src,Field &guess) {
guess = Zero();
assert(evec.size()==eval.size());
auto N = evec.size();
for (int i=0;i<N;i++) {
const Field& tmp = evec[i];
axpy(guess,TensorRemove(innerProduct(tmp,src)) / eval[i],tmp,guess);
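The loop in operator() assembles the standard deflated guess; with eigenpairs \((\lambda_i, v_i)\) and source \(b\), the axpy above is a transcription of

\[ x_0 \;=\; \sum_{i=0}^{N-1} \frac{\langle v_i,\, b\rangle}{\lambda_i}\; v_i . \]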


@@ -1,241 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/PrecGeneralisedConjugateResidual.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_PREC_GCR_NON_HERM_H
#define GRID_PREC_GCR_NON_HERM_H
///////////////////////////////////////////////////////////////////////////////////////////////////////
//VPGCR Abe and Zhang, 2005.
//INTERNATIONAL JOURNAL OF NUMERICAL ANALYSIS AND MODELING
//Computing and Information Volume 2, Number 2, Pages 147-161
//NB. Likely not original reference since they are focussing on a preconditioner variant.
// but VPGCR was nicely written up in their paper
///////////////////////////////////////////////////////////////////////////////////////////////////////
NAMESPACE_BEGIN(Grid);
#define GCRLogLevel std::cout << GridLogMessage <<std::string(level,'\t')<< " Level "<<level<<" "
template<class Field>
class PrecGeneralisedConjugateResidualNonHermitian : public LinearFunction<Field> {
public:
RealD Tolerance;
Integer MaxIterations;
int verbose;
int mmax;
int nstep;
int steps;
int level;
GridStopWatch PrecTimer;
GridStopWatch MatTimer;
GridStopWatch LinalgTimer;
LinearFunction<Field> &Preconditioner;
LinearOperatorBase<Field> &Linop;
void Level(int lv) { level=lv; };
PrecGeneralisedConjugateResidualNonHermitian(RealD tol,Integer maxit,LinearOperatorBase<Field> &_Linop,LinearFunction<Field> &Prec,int _mmax,int _nstep) :
Tolerance(tol),
MaxIterations(maxit),
Linop(_Linop),
Preconditioner(Prec),
mmax(_mmax),
nstep(_nstep)
{
level=1;
verbose=1;
};
void operator() (const Field &src, Field &psi){
psi=Zero();
RealD cp, ssq,rsq;
ssq=norm2(src);
rsq=Tolerance*Tolerance*ssq;
Field r(src.Grid());
PrecTimer.Reset();
MatTimer.Reset();
LinalgTimer.Reset();
GridStopWatch SolverTimer;
SolverTimer.Start();
steps=0;
for(int k=0;k<MaxIterations;k++){
cp=GCRnStep(src,psi,rsq);
GCRLogLevel <<"PGCR("<<mmax<<","<<nstep<<") "<< steps <<" steps cp = "<<cp<<" target "<<rsq <<std::endl;
if(cp<rsq) {
SolverTimer.Stop();
Linop.Op(psi,r);
axpy(r,-1.0,src,r);
RealD tr = norm2(r);
GCRLogLevel<<"PGCR: Converged on iteration " <<steps
<< " computed residual "<<sqrt(cp/ssq)
<< " true residual " <<sqrt(tr/ssq)
<< " target " <<Tolerance <<std::endl;
GCRLogLevel<<"PGCR Time elapsed: Total "<< SolverTimer.Elapsed() <<std::endl;
return;
}
}
GCRLogLevel<<"Variable Preconditioned GCR did not converge"<<std::endl;
// assert(0);
}
RealD GCRnStep(const Field &src, Field &psi,RealD rsq){
RealD cp;
ComplexD a, b, zAz;
RealD zAAz;
ComplexD rq;
GridBase *grid = src.Grid();
Field r(grid);
Field z(grid);
Field tmp(grid);
Field ttmp(grid);
Field Az(grid);
////////////////////////////////
// history for flexible orthog
////////////////////////////////
std::vector<Field> q(mmax,grid);
std::vector<Field> p(mmax,grid);
std::vector<RealD> qq(mmax);
GCRLogLevel<< "PGCR nStep("<<nstep<<")"<<std::endl;
//////////////////////////////////
// initial guess x0 is taken as nonzero.
// r0=src-A x0 = src
//////////////////////////////////
MatTimer.Start();
Linop.Op(psi,Az);
zAz = innerProduct(Az,psi);
zAAz= norm2(Az);
MatTimer.Stop();
LinalgTimer.Start();
r=src-Az;
LinalgTimer.Stop();
GCRLogLevel<< "PGCR true residual r = src - A psi "<<norm2(r) <<std::endl;
/////////////////////
// p = Prec(r)
/////////////////////
PrecTimer.Start();
Preconditioner(r,z);
PrecTimer.Stop();
MatTimer.Start();
Linop.Op(z,Az);
MatTimer.Stop();
LinalgTimer.Start();
zAz = innerProduct(Az,psi);
zAAz= norm2(Az);
//p[0],q[0],qq[0]
p[0]= z;
q[0]= Az;
qq[0]= zAAz;
cp =norm2(r);
LinalgTimer.Stop();
for(int k=0;k<nstep;k++){
steps++;
int kp = k+1;
int peri_k = k %mmax;
int peri_kp= kp%mmax;
LinalgTimer.Start();
rq= innerProduct(q[peri_k],r); // what if rAr not real?
a = rq/qq[peri_k];
axpy(psi,a,p[peri_k],psi);
cp = axpy_norm(r,-a,q[peri_k],r);
LinalgTimer.Stop();
GCRLogLevel<< "PGCR step["<<steps<<"] resid " << cp << " target " <<rsq<<std::endl;
if((k==nstep-1)||(cp<rsq)){
return cp;
}
PrecTimer.Start();
Preconditioner(r,z);// solve Az = r
PrecTimer.Stop();
MatTimer.Start();
Linop.Op(z,Az);
MatTimer.Stop();
zAz = innerProduct(Az,psi);
zAAz= norm2(Az);
LinalgTimer.Start();
q[peri_kp]=Az;
p[peri_kp]=z;
int northog = ((kp)>(mmax-1))?(mmax-1):(kp); // if more than mmax done, we orthog all mmax history.
for(int back=0;back<northog;back++){
int peri_back=(k-back)%mmax; assert((k-back)>=0);
b=-real(innerProduct(q[peri_back],Az))/qq[peri_back];
p[peri_kp]=p[peri_kp]+b*p[peri_back];
q[peri_kp]=q[peri_kp]+b*q[peri_back];
}
qq[peri_kp]=norm2(q[peri_kp]); // could use axpy_norm
LinalgTimer.Stop();
}
assert(0); // never reached
return cp;
}
};
NAMESPACE_END(Grid);
#endif
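In GCRnStep above, each new preconditioned direction z (with Az = A z) is orthogonalised against up to mmax-1 stored directions before being saved. Transcribing the back-orthogonalisation loop, with \( q_j = A p_j \) and \( qq_j = \lVert q_j \rVert^2 \):

\[ \beta_j = -\frac{\operatorname{Re}\langle q_j,\, A z\rangle}{\lVert q_j\rVert^2}\,, \qquad p_{\mathrm{new}} = z + \sum_j \beta_j\, p_j\,, \qquad q_{\mathrm{new}} = A z + \sum_j \beta_j\, q_j\,. \]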


@@ -40,7 +40,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
* (-MoeMee^{-1} 1 )
* L^{dag} = ( 1 Mee^{-dag} Moe^{dag} )
* ( 0 1 )
* L^{-dag}= ( 1 -Mee^{-dag} Moe^{dag} )
* L^{-d} = ( 1 -Mee^{-dag} Moe^{dag} )
* ( 0 1 )
*
* U^-1 = (1 -Mee^{-1} Meo)
@@ -82,8 +82,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
* c) M_oo^-dag Doo^{dag} Doo Moo^-1 phi_0 = M_oo^-dag (D_oo)^dag L^{-1} eta_o
* eta_o' = M_oo^-dag (D_oo)^dag (eta_o - Moe Mee^{-1} eta_e)
* psi_o = M_oo^-1 phi_o
*
* TODO: Deflation
*
*/
namespace Grid {
@@ -98,7 +97,6 @@ namespace Grid {
protected:
typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
OperatorFunction<Field> & _HermitianRBSolver;
int CBfactorise;
bool subGuess;
bool useSolnAsInitGuess; // if true user-supplied solution vector is used as initial guess for solver
@@ -134,31 +132,6 @@ namespace Grid {
(*this)(_Matrix,in,out,guess);
}
void RedBlackSource(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &src_o)
{
GridBase *grid = _Matrix.RedBlackGrid();
Field tmp(grid);
int nblock = in.size();
for(int b=0;b<nblock;b++){
RedBlackSource(_Matrix,in[b],tmp,src_o[b]);
}
}
// James can write his own deflated guesser
// with optimised code for the inner products
// RedBlackSolveSplitGrid();
// RedBlackSolve(_Matrix,src_o,sol_o);
void RedBlackSolution(Matrix &_Matrix, const std::vector<Field> &in, const std::vector<Field> &sol_o, std::vector<Field> &out)
{
GridBase *grid = _Matrix.RedBlackGrid();
Field tmp(grid);
int nblock = in.size();
for(int b=0;b<nblock;b++) {
pickCheckerboard(Even,tmp,in[b]);
RedBlackSolution(_Matrix,sol_o[b],tmp,out[b]);
}
}
template<class Guesser>
void operator()(Matrix &_Matrix, const std::vector<Field> &in, std::vector<Field> &out,Guesser &guess)
{
@@ -177,29 +150,24 @@ namespace Grid {
////////////////////////////////////////////////
// Prepare RedBlack source
////////////////////////////////////////////////
RedBlackSource(_Matrix,in,src_o);
// for(int b=0;b<nblock;b++){
// RedBlackSource(_Matrix,in[b],tmp,src_o[b]);
// }
for(int b=0;b<nblock;b++){
RedBlackSource(_Matrix,in[b],tmp,src_o[b]);
}
////////////////////////////////////////////////
// Make the guesses
////////////////////////////////////////////////
if ( subGuess ) guess_save.resize(nblock,grid);
for(int b=0;b<nblock;b++){
if(useSolnAsInitGuess) {
for(int b=0;b<nblock;b++){
pickCheckerboard(Odd, sol_o[b], out[b]);
} else {
guess(src_o[b],sol_o[b]);
}
} else {
guess(src_o, sol_o);
}
if ( subGuess ) {
for(int b=0;b<nblock;b++){
guess_save[b] = sol_o[b];
}
guess_save[b] = sol_o[b];
}
}
//////////////////////////////////////////////////////////////
// Call the block solver
@@ -221,20 +189,13 @@ namespace Grid {
/////////////////////////////////////////////////
// Check unprec residual if possible
/////////////////////////////////////////////////
if ( ! subGuess ) {
_Matrix.M(out[b],resid);
if ( this->adjoint() ) _Matrix.Mdag(out[b],resid);
else _Matrix.M(out[b],resid);
resid = resid-in[b];
RealD ns = norm2(in[b]);
RealD nr = norm2(resid);
std::cout<<GridLogMessage<< "SchurRedBlackBase adjoint "<< this->adjoint() << std::endl;
if ( this->adjoint() )
std::cout<<GridLogMessage<< "SchurRedBlackBase adjoint solver true unprec resid["<<b<<"] "<<std::sqrt(nr/ns) << std::endl;
else
std::cout<<GridLogMessage<< "SchurRedBlackBase solver true unprec resid["<<b<<"] "<<std::sqrt(nr/ns) << std::endl;
std::cout<<GridLogMessage<< "SchurRedBlackBase solver true unprec resid["<<b<<"] "<<std::sqrt(nr/ns) << std::endl;
} else {
std::cout<<GridLogMessage<< "SchurRedBlackBase Guess subtracted after solve["<<b<<"] " << std::endl;
}
@@ -288,21 +249,12 @@ namespace Grid {
// Verify the unprec residual
if ( ! subGuess ) {
_Matrix.M(out,resid);
std::cout<<GridLogMessage<< "SchurRedBlackBase adjoint "<< this->adjoint() << std::endl;
if ( this->adjoint() ) _Matrix.Mdag(out,resid);
else _Matrix.M(out,resid);
resid = resid-in;
RealD ns = norm2(in);
RealD nr = norm2(resid);
if ( this->adjoint() )
std::cout<<GridLogMessage<< "SchurRedBlackBase adjoint solver true unprec resid "<<std::sqrt(nr/ns) << std::endl;
else
std::cout<<GridLogMessage<< "SchurRedBlackBase solver true unprec resid "<<std::sqrt(nr/ns) << std::endl;
std::cout<<GridLogMessage << "SchurRedBlackBase solver true unprec resid "<< std::sqrt(nr/ns) << std::endl;
} else {
std::cout << GridLogMessage << "SchurRedBlackBase Guess subtracted after solve." << std::endl;
}
@@ -311,7 +263,6 @@ namespace Grid {
/////////////////////////////////////////////////////////////
// Override in derived.
/////////////////////////////////////////////////////////////
virtual bool adjoint(void) { return false; }
virtual void RedBlackSource (Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o) =0;
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol) =0;
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o) =0;
@@ -665,127 +616,6 @@ namespace Grid {
this->_HermitianRBSolver(_OpEO, src_o, sol_o);
}
};
/*
* Red black Schur decomposition
*
* M = (Mee Meo) = (1 0 ) (Mee 0 ) (1 Mee^{-1} Meo)
* (Moe Moo) (Moe Mee^-1 1 ) (0 Moo-Moe Mee^-1 Meo) (0 1 )
* = L D U
*
* L^-1 = (1 0 )
* (-MoeMee^{-1} 1 )
* L^{dag} = ( 1 Mee^{-dag} Moe^{dag} )
* ( 0 1 )
*
* U^-1 = (1 -Mee^{-1} Meo)
* (0 1 )
* U^{dag} = ( 1 0)
* (Meo^dag Mee^{-dag} 1)
* U^{-dag} = ( 1 0)
* (-Meo^dag Mee^{-dag} 1)
*
*
***********************
* M^dag psi = eta
***********************
*
* Really for Mobius: (Wilson - easier to just use gamma 5 hermiticity)
*
* Mdag psi = Udag Ddag Ldag psi = eta
*
* U^{-dag} = ( 1 0)
* (-Meo^dag Mee^{-dag} 1)
*
*
* i) D^dag phi = (U^{-dag} eta)
* eta'_e = eta_e
* eta'_o = (eta_o - Meo^dag Mee^{-dag} eta_e)
*
* phi_o = D_oo^-dag eta'_o = D_oo^-dag (eta_o - Meo^dag Mee^{-dag} eta_e)
*
* phi_e = D_ee^-dag eta'_e = D_ee^-dag eta_e
*
* Solve:
*
* D_oo D_oo^dag phi_o = D_oo (eta_o - Meo^dag Mee^{-dag} eta_e)
*
* ii)
* phi = L^dag psi => psi = L^-dag phi.
*
* L^{-dag} = ( 1 -Mee^{-dag} Moe^{dag} )
* ( 0 1 )
*
* => sol_e = M_ee^-dag * ( src_e - Moe^dag phi_o )...
* => sol_o = phi_o
*/
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Site diagonal has Mooee on it, but solve the Adjoint system
///////////////////////////////////////////////////////////////////////////////////////////////////////
template<class Field> class SchurRedBlackDiagMooeeDagSolve : public SchurRedBlackBase<Field> {
public:
typedef CheckerBoardedSparseMatrixBase<Field> Matrix;
virtual bool adjoint(void) { return true; }
SchurRedBlackDiagMooeeDagSolve(OperatorFunction<Field> &HermitianRBSolver,
const bool initSubGuess = false,
const bool _solnAsInitGuess = false)
: SchurRedBlackBase<Field> (HermitianRBSolver,initSubGuess,_solnAsInitGuess) {};
//////////////////////////////////////////////////////
// Override RedBlack specialisation
//////////////////////////////////////////////////////
virtual void RedBlackSource(Matrix & _Matrix,const Field &src, Field &src_e,Field &src_o)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
Field tmp(grid);
Field Mtmp(grid);
pickCheckerboard(Even,src_e,src);
pickCheckerboard(Odd ,src_o,src);
/////////////////////////////////////////////////////
// src_o = (source_o - Moe^dag MeeInvDag source_e)
/////////////////////////////////////////////////////
_Matrix.MooeeInvDag(src_e,tmp); assert( tmp.Checkerboard() ==Even);
_Matrix.MeooeDag (tmp,Mtmp); assert( Mtmp.Checkerboard() ==Odd);
tmp=src_o-Mtmp; assert( tmp.Checkerboard() ==Odd);
// get the right Mpc
SchurDiagMooeeOperator<Matrix,Field> _HermOpEO(_Matrix);
_HermOpEO.Mpc(tmp,src_o); assert(src_o.Checkerboard() ==Odd);
}
virtual void RedBlackSolve (Matrix & _Matrix,const Field &src_o, Field &sol_o)
{
SchurDiagMooeeDagOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
};
virtual void RedBlackSolve (Matrix & _Matrix,const std::vector<Field> &src_o, std::vector<Field> &sol_o)
{
SchurDiagMooeeDagOperator<Matrix,Field> _HermOpEO(_Matrix);
this->_HermitianRBSolver(_HermOpEO,src_o,sol_o);
}
virtual void RedBlackSolution(Matrix & _Matrix,const Field &sol_o, const Field &src_e,Field &sol)
{
GridBase *grid = _Matrix.RedBlackGrid();
GridBase *fgrid= _Matrix.Grid();
Field sol_e(grid);
Field tmp(grid);
///////////////////////////////////////////////////
// sol_e = M_ee^-dag * ( src_e - Moe^dag phi_o )...
// sol_o = phi_o
///////////////////////////////////////////////////
_Matrix.MeooeDag(sol_o,tmp); assert(tmp.Checkerboard()==Even);
tmp = src_e-tmp; assert(tmp.Checkerboard()==Even);
_Matrix.MooeeInvDag(tmp,sol_e); assert(sol_e.Checkerboard()==Even);
setCheckerboard(sol,sol_e); assert( sol_e.Checkerboard() ==Even);
setCheckerboard(sol,sol_o); assert( sol_o.Checkerboard() ==Odd );
}
};
}
#endif
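Compactly, the comment block and the removed SchurRedBlackDiagMooeeDagSolve above solve the adjoint system through the odd-odd Schur complement (a transcription of the derivation in the comments):

\[ D_{oo} = M_{oo} - M_{oe} M_{ee}^{-1} M_{eo}, \qquad D_{oo} D_{oo}^{\dagger}\, \phi_o = D_{oo}\bigl(\eta_o - M_{eo}^{\dagger} M_{ee}^{-\dagger}\, \eta_e\bigr), \]

with sol_o = phi_o and sol_e = Mee^{-dag}(src_e - Moe^dag phi_o), as in RedBlackSolution.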


@@ -0,0 +1,154 @@
#include <Grid/GridCore.h>
#include <fcntl.h>
NAMESPACE_BEGIN(Grid);
MemoryStats *MemoryProfiler::stats = nullptr;
bool MemoryProfiler::debug = false;
int PointerCache::NcacheSmall = PointerCache::NcacheSmallMax;
#ifdef GRID_CUDA
int PointerCache::Ncache = 32;
#else
int PointerCache::Ncache = 8;
#endif
int PointerCache::Victim;
int PointerCache::VictimSmall;
PointerCache::PointerCacheEntry PointerCache::Entries[PointerCache::NcacheMax];
PointerCache::PointerCacheEntry PointerCache::EntriesSmall[PointerCache::NcacheSmallMax];
void PointerCache::Init(void)
{
char * str;
str= getenv("GRID_ALLOC_NCACHE_LARGE");
if ( str ) Ncache = atoi(str);
if ( (Ncache<0) || (Ncache > NcacheMax)) Ncache = NcacheMax;
str= getenv("GRID_ALLOC_NCACHE_SMALL");
if ( str ) NcacheSmall = atoi(str);
if ( (NcacheSmall<0) || (NcacheSmall > NcacheSmallMax)) NcacheSmall = NcacheSmallMax;
// printf("Aligned alloocator cache: large %d/%d small %d/%d\n",Ncache,NcacheMax,NcacheSmall,NcacheSmallMax);
}
void *PointerCache::Insert(void *ptr,size_t bytes)
{
if (bytes < GRID_ALLOC_SMALL_LIMIT )
return Insert(ptr,bytes,EntriesSmall,NcacheSmall,VictimSmall);
return Insert(ptr,bytes,Entries,Ncache,Victim);
}
void *PointerCache::Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim)
{
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
void * ret = NULL;
int v = -1;
for(int e=0;e<ncache;e++) {
if ( entries[e].valid==0 ) {
v=e;
break;
}
}
if ( v==-1 ) {
v=victim;
victim = (victim+1)%ncache;
}
if ( entries[v].valid ) {
ret = entries[v].address;
entries[v].valid = 0;
entries[v].address = NULL;
entries[v].bytes = 0;
}
entries[v].address=ptr;
entries[v].bytes =bytes;
entries[v].valid =1;
return ret;
}
void *PointerCache::Lookup(size_t bytes)
{
if (bytes < GRID_ALLOC_SMALL_LIMIT )
return Lookup(bytes,EntriesSmall,NcacheSmall);
return Lookup(bytes,Entries,Ncache);
}
void *PointerCache::Lookup(size_t bytes,PointerCacheEntry *entries,int ncache)
{
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
for(int e=0;e<ncache;e++){
if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
entries[e].valid = 0;
return entries[e].address;
}
}
return NULL;
}
void check_huge_pages(void *Buf,uint64_t BYTES)
{
#ifdef __linux__
int fd = open("/proc/self/pagemap", O_RDONLY);
assert(fd >= 0);
const int page_size = 4096;
uint64_t virt_pfn = (uint64_t)Buf / page_size;
off_t offset = sizeof(uint64_t) * virt_pfn;
uint64_t npages = (BYTES + page_size-1) / page_size;
uint64_t pagedata[npages];
uint64_t ret = lseek(fd, offset, SEEK_SET);
assert(ret == offset);
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
assert(ret == sizeof(uint64_t) * npages);
int nhugepages = npages / 512;
int n4ktotal, nnothuge;
n4ktotal = 0;
nnothuge = 0;
for (int i = 0; i < nhugepages; ++i) {
uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
for (int j = 0; j < 512; ++j) {
uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
++n4ktotal;
if (pageaddr != baseaddr + j * page_size)
++nnothuge;
}
}
int rank = CartesianCommunicator::RankWorld();
printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
#endif
}
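The 512-page stride above follows from the allocation alignment: GRID_ALLOC_ALIGN is 2 MiB and 2 MiB / 4 KiB = 512, so a properly backed huge page must map 512 consecutive 4 KiB PFNs; nnothuge counts the 4k pages that break that pattern.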
std::string sizeString(const size_t bytes)
{
constexpr unsigned int bufSize = 256;
const char *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"};
char buf[256];
size_t s = 0;
double count = bytes;
while (count >= 1024 && s < 7)
{
s++;
count /= 1024;
}
if (count - floor(count) == 0.0)
{
snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
}
else
{
snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
}
return std::string(buf);
}
NAMESPACE_END(Grid);
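Illustrative outputs of sizeString() as defined above (not a test from the repository):

std::cout << Grid::sizeString(512)        << std::endl; // "512 B"
std::cout << Grid::sizeString(1024)       << std::endl; // "1 KB"
std::cout << Grid::sizeString(3*1024*512) << std::endl; // "1.5 MB" (1572864 bytes)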

View File

@@ -26,10 +26,129 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#ifndef GRID_ALIGNED_ALLOCATOR_H
#define GRID_ALIGNED_ALLOCATOR_H
#ifdef HAVE_MALLOC_MALLOC_H
#include <malloc/malloc.h>
#endif
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
#ifdef HAVE_MM_MALLOC_H
#include <mm_malloc.h>
#endif
#define POINTER_CACHE
#define GRID_ALLOC_ALIGN (2*1024*1024)
#define GRID_ALLOC_SMALL_LIMIT (4096)
NAMESPACE_BEGIN(Grid);
// Move control to configure.ac and Config.h?
class PointerCache {
private:
/*Pinning pages is costly*/
/*Could maintain separate large and small allocation caches*/
/* Could make these configurable, perhaps up to a max size*/
static const int NcacheSmallMax=128;
static const int NcacheMax=16;
static int NcacheSmall;
static int Ncache;
typedef struct {
void *address;
size_t bytes;
int valid;
} PointerCacheEntry;
static PointerCacheEntry Entries[NcacheMax];
static int Victim;
static PointerCacheEntry EntriesSmall[NcacheSmallMax];
static int VictimSmall;
public:
static void Init(void);
static void *Insert(void *ptr,size_t bytes) ;
static void *Insert(void *ptr,size_t bytes,PointerCacheEntry *entries,int ncache,int &victim) ;
static void *Lookup(size_t bytes) ;
static void *Lookup(size_t bytes,PointerCacheEntry *entries,int ncache) ;
};
std::string sizeString(size_t bytes);
struct MemoryStats
{
size_t totalAllocated{0}, maxAllocated{0},
currentlyAllocated{0}, totalFreed{0};
};
class MemoryProfiler
{
public:
static MemoryStats *stats;
static bool debug;
};
#ifdef GRID_NVCC
#define profilerCudaMeminfo \
{ size_t f, t ; cudaMemGetInfo ( &f,&t); std::cout << GridLogDebug << "[Memory debug] Cuda free "<<f<<"/"<<t << std::endl;}
#else
#define profilerCudaMeminfo
#endif
#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
#define profilerDebugPrint \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \
<< std::endl; \
} \
profilerCudaMeminfo;
#define profilerAllocate(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalAllocated += (bytes); \
s->currentlyAllocated += (bytes); \
s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
#define profilerFree(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalFreed += (bytes); \
s->currentlyAllocated -= (bytes); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
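A minimal usage sketch for the profiler hooks above (hypothetical call site; any allocation made through the Grid allocators then updates the counters):

Grid::MemoryStats stats;
Grid::MemoryProfiler::stats = &stats;  // start accounting
Grid::MemoryProfiler::debug = true;    // print on every allocate/free
// ... create and destroy Lattice objects ...
Grid::MemoryProfiler::stats = nullptr; // stop accounting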
void check_huge_pages(void *Buf,uint64_t BYTES);
////////////////////////////////////////////////////////////////////
// A lattice of something, but assume the something is SIMDized.
////////////////////////////////////////////////////////////////////
template<typename _Tp>
class alignedAllocator {
public:
@@ -53,131 +172,89 @@ public:
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::CpuAllocate(bytes);
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
#ifdef POINTER_CACHE
_Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
#else
pointer ptr = nullptr;
#endif
#ifdef GRID_NVCC
////////////////////////////////////
// Unified (managed) memory
////////////////////////////////////
if ( ptr == (_Tp *) NULL ) {
// printf(" alignedAllocater cache miss %ld bytes ",bytes); BACKTRACEFP(stdout);
auto err = cudaMallocManaged((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (_Tp *) NULL;
std::cerr << " cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
assert(0);
}
}
assert( ptr != (_Tp *)NULL);
#else
//////////////////////////////////////////////////////////////////////////////////////////
// 2MB align; could make option probably doesn't need configurability
//////////////////////////////////////////////////////////////////////////////////////////
#ifdef HAVE_MM_MALLOC_H
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
#else
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
#endif
assert( ptr != (_Tp *)NULL);
//////////////////////////////////////////////////
// First touch optimise in threaded loop
//////////////////////////////////////////////////
uint64_t *cp = (uint64_t *)ptr;
thread_for(n,bytes/sizeof(uint64_t), { // need only one touch per page
cp[n]=0;
});
#endif
return ptr;
}
void deallocate(pointer __p, size_type __n)
{
void deallocate(pointer __p, size_type __n) {
size_type bytes = __n * sizeof(_Tp);
profilerFree(bytes);
MemoryManager::CpuFree((void *)__p,bytes);
#ifdef POINTER_CACHE
pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
#else
pointer __freeme = __p;
#endif
#ifdef GRID_NVCC
if ( __freeme ) cudaFree((void *)__freeme);
#else
#ifdef HAVE_MM_MALLOC_H
if ( __freeme ) _mm_free((void *)__freeme);
#else
if ( __freeme ) free((void *)__freeme);
#endif
#endif
}
// FIXME: hack for the copy constructor: it must be avoided to avoid single thread loop
// FIXME: hack for the copy constructor, eventually it must be avoided
void construct(pointer __p, const _Tp& __val) { assert(0);};
void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
//void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
template<typename _Tp> inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
//////////////////////////////////////////////////////////////////////////////////////
// Unified virtual memory
//////////////////////////////////////////////////////////////////////////////////////
template<typename _Tp>
class uvmAllocator {
public:
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
typedef _Tp* pointer;
typedef const _Tp* const_pointer;
typedef _Tp& reference;
typedef const _Tp& const_reference;
typedef _Tp value_type;
template<typename _Tp1> struct rebind { typedef uvmAllocator<_Tp1> other; };
uvmAllocator() throw() { }
uvmAllocator(const uvmAllocator&) throw() { }
template<typename _Tp1> uvmAllocator(const uvmAllocator<_Tp1>&) throw() { }
~uvmAllocator() throw() { }
pointer address(reference __x) const { return &__x; }
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::SharedAllocate(bytes);
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
void deallocate(pointer __p, size_type __n)
{
size_type bytes = __n * sizeof(_Tp);
profilerFree(bytes);
MemoryManager::SharedFree((void *)__p,bytes);
}
void construct(pointer __p, const _Tp& __val) { new((void *)__p) _Tp(__val); };
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
template<typename _Tp> inline bool operator==(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const uvmAllocator<_Tp>&, const uvmAllocator<_Tp>&){ return false; }
////////////////////////////////////////////////////////////////////////////////
// Device memory
////////////////////////////////////////////////////////////////////////////////
template<typename _Tp>
class devAllocator {
public:
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
typedef _Tp* pointer;
typedef const _Tp* const_pointer;
typedef _Tp& reference;
typedef const _Tp& const_reference;
typedef _Tp value_type;
template<typename _Tp1> struct rebind { typedef devAllocator<_Tp1> other; };
devAllocator() throw() { }
devAllocator(const devAllocator&) throw() { }
template<typename _Tp1> devAllocator(const devAllocator<_Tp1>&) throw() { }
~devAllocator() throw() { }
pointer address(reference __x) const { return &__x; }
size_type max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
pointer allocate(size_type __n, const void* _p= 0)
{
size_type bytes = __n*sizeof(_Tp);
profilerAllocate(bytes);
_Tp *ptr = (_Tp*) MemoryManager::AcceleratorAllocate(bytes);
assert( ( (_Tp*)ptr != (_Tp *)NULL ) );
return ptr;
}
void deallocate(pointer __p, size_type __n)
{
size_type bytes = __n * sizeof(_Tp);
profilerFree(bytes);
MemoryManager::AcceleratorFree((void *)__p,bytes);
}
void construct(pointer __p, const _Tp& __val) { };
void construct(pointer __p) { };
void destroy(pointer __p) { };
};
template<typename _Tp> inline bool operator==(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return true; }
template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const devAllocator<_Tp>&){ return false; }
////////////////////////////////////////////////////////////////////////////////
// Template typedefs
////////////////////////////////////////////////////////////////////////////////
#ifdef ACCELERATOR_CSHIFT
// Cshift on device
template<class T> using cshiftAllocator = devAllocator<T>;
#else
// Cshift on host
template<class T> using cshiftAllocator = std::allocator<T>;
#endif
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;
template<class T> using commVector = std::vector<T,devAllocator<T> >;
template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
template<class T> using commAllocator = alignedAllocator<T>;
template<class T> using Vector = std::vector<T,alignedAllocator<T> >;
template<class T> using commVector = std::vector<T,alignedAllocator<T> >;
template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >;
NAMESPACE_END(Grid);
#endif
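For orientation, the residency implied by the two sets of typedefs (base side first, as removed above; a sketch assuming the allocators behave as defined earlier in this file):

Grid::Vector<float>        v(1024); // base: uvmAllocator     -> MemoryManager::SharedAllocate (migratable)
Grid::stencilVector<float> s(1024); // base: alignedAllocator -> MemoryManager::CpuAllocate (host)
Grid::commVector<float>    c(1024); // base: devAllocator     -> device; target side: alignedAllocator (host)
Grid::cshiftVector<float>  x(1024); // base only: devAllocator or std::allocator, per ACCELERATOR_CSHIFT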

View File

@@ -1,4 +0,0 @@
#pragma once
#include <Grid/allocator/MemoryStats.h>
#include <Grid/allocator/MemoryManager.h>
#include <Grid/allocator/AlignedAllocator.h>

View File

@@ -1,301 +0,0 @@
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
/*Allocation types, saying which pointer cache should be used*/
#define Cpu (0)
#define CpuSmall (1)
#define Acc (2)
#define AccSmall (3)
#define Shared (4)
#define SharedSmall (5)
#undef GRID_MM_VERBOSE
uint64_t total_shared;
uint64_t total_device;
uint64_t total_host;
void MemoryManager::PrintBytes(void)
{
std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
std::cout << " MemoryManager : PrintBytes "<<std::endl;
std::cout << " MemoryManager : ------------------------------------ "<<std::endl;
std::cout << " MemoryManager : "<<(total_shared>>20)<<" shared Mbytes "<<std::endl;
std::cout << " MemoryManager : "<<(total_device>>20)<<" accelerator Mbytes "<<std::endl;
std::cout << " MemoryManager : "<<(total_host>>20) <<" cpu Mbytes "<<std::endl;
uint64_t cacheBytes;
cacheBytes = CacheBytes[Cpu];
std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" cpu cache Mbytes "<<std::endl;
cacheBytes = CacheBytes[Acc];
std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" acc cache Mbytes "<<std::endl;
cacheBytes = CacheBytes[Shared];
std::cout << " MemoryManager : "<<(cacheBytes>>20) <<" shared cache Mbytes "<<std::endl;
#ifdef GRID_CUDA
cuda_mem();
#endif
}
//////////////////////////////////////////////////////////////////////
// Data tables for recently freed pointer caches
//////////////////////////////////////////////////////////////////////
MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
int MemoryManager::Victim[MemoryManager::NallocType];
int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 2, 8, 2, 8 };
uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
//////////////////////////////////////////////////////////////////////
// Actual allocation and deallocation utils
//////////////////////////////////////////////////////////////////////
void *MemoryManager::AcceleratorAllocate(size_t bytes)
{
total_device+=bytes;
void *ptr = (void *) Lookup(bytes,Acc);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocDevice(bytes);
}
#ifdef GRID_MM_VERBOSE
std::cout <<"AcceleratorAllocate "<<std::endl;
PrintBytes();
#endif
return ptr;
}
void MemoryManager::AcceleratorFree (void *ptr,size_t bytes)
{
total_device-=bytes;
void *__freeme = Insert(ptr,bytes,Acc);
if ( __freeme ) {
acceleratorFreeDevice(__freeme);
}
#ifdef GRID_MM_VERBOSE
std::cout <<"AcceleratorFree "<<std::endl;
PrintBytes();
#endif
}
void *MemoryManager::SharedAllocate(size_t bytes)
{
total_shared+=bytes;
void *ptr = (void *) Lookup(bytes,Shared);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocShared(bytes);
}
#ifdef GRID_MM_VERBOSE
std::cout <<"SharedAllocate "<<std::endl;
PrintBytes();
#endif
return ptr;
}
void MemoryManager::SharedFree (void *ptr,size_t bytes)
{
total_shared-=bytes;
void *__freeme = Insert(ptr,bytes,Shared);
if ( __freeme ) {
acceleratorFreeShared(__freeme);
}
#ifdef GRID_MM_VERBOSE
std::cout <<"SharedFree "<<std::endl;
PrintBytes();
#endif
}
#ifdef GRID_UVM
void *MemoryManager::CpuAllocate(size_t bytes)
{
total_host+=bytes;
void *ptr = (void *) Lookup(bytes,Cpu);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocShared(bytes);
}
#ifdef GRID_MM_VERBOSE
std::cout <<"CpuAllocate "<<std::endl;
PrintBytes();
#endif
return ptr;
}
void MemoryManager::CpuFree (void *_ptr,size_t bytes)
{
total_host-=bytes;
NotifyDeletion(_ptr);
void *__freeme = Insert(_ptr,bytes,Cpu);
if ( __freeme ) {
acceleratorFreeShared(__freeme);
}
#ifdef GRID_MM_VERBOSE
std::cout <<"CpuFree "<<std::endl;
PrintBytes();
#endif
}
#else
void *MemoryManager::CpuAllocate(size_t bytes)
{
total_host+=bytes;
void *ptr = (void *) Lookup(bytes,Cpu);
if ( ptr == (void *) NULL ) {
ptr = (void *) acceleratorAllocCpu(bytes);
}
#ifdef GRID_MM_VERBOSE
std::cout <<"CpuAllocate "<<std::endl;
PrintBytes();
#endif
return ptr;
}
void MemoryManager::CpuFree (void *_ptr,size_t bytes)
{
total_host-=bytes;
NotifyDeletion(_ptr);
void *__freeme = Insert(_ptr,bytes,Cpu);
if ( __freeme ) {
acceleratorFreeCpu(__freeme);
}
#ifdef GRID_MM_VERBOSE
std::cout <<"CpuFree "<<std::endl;
PrintBytes();
#endif
}
#endif
//////////////////////////////////////////
// call only once
//////////////////////////////////////////
void MemoryManager::Init(void)
{
char * str;
int Nc;
int NcS;
str= getenv("GRID_ALLOC_NCACHE_LARGE");
if ( str ) {
Nc = atoi(str);
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
Ncache[Cpu]=Nc;
Ncache[Acc]=Nc;
Ncache[Shared]=Nc;
}
}
str= getenv("GRID_ALLOC_NCACHE_SMALL");
if ( str ) {
Nc = atoi(str);
if ( (Nc>=0) && (Nc < NallocCacheMax)) {
Ncache[CpuSmall]=Nc;
Ncache[AccSmall]=Nc;
Ncache[SharedSmall]=Nc;
}
}
}
void MemoryManager::InitMessage(void) {
#ifndef GRID_UVM
std::cout << GridLogMessage << "MemoryManager Cache "<< MemoryManager::DeviceMaxBytes <<" bytes "<<std::endl;
#endif
std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
#ifdef ALLOCATION_CACHE
std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
#endif
#ifdef GRID_UVM
std::cout << GridLogMessage<< "MemoryManager::Init() Unified memory space"<<std::endl;
#ifdef GRID_CUDA
std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMallocManaged"<<std::endl;
#endif
#ifdef GRID_HIP
std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMallocManaged"<<std::endl;
#endif
#ifdef GRID_SYCL
std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_shared"<<std::endl;
#endif
#else
std::cout << GridLogMessage<< "MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory"<<std::endl;
#ifdef GRID_CUDA
std::cout << GridLogMessage<< "MemoryManager::Init() Using cudaMalloc"<<std::endl;
#endif
#ifdef GRID_HIP
std::cout << GridLogMessage<< "MemoryManager::Init() Using hipMalloc"<<std::endl;
#endif
#ifdef GRID_SYCL
std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_device"<<std::endl;
#endif
#endif
}
void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type + small;
return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);
#else
return ptr;
#endif
}
void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes)
{
assert(ncache>0);
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
void * ret = NULL;
int v = -1;
for(int e=0;e<ncache;e++) {
if ( entries[e].valid==0 ) {
v=e;
break;
}
}
if ( v==-1 ) {
v=victim;
victim = (victim+1)%ncache;
}
if ( entries[v].valid ) {
ret = entries[v].address;
cacheBytes -= entries[v].bytes;
entries[v].valid = 0;
entries[v].address = NULL;
entries[v].bytes = 0;
}
entries[v].address=ptr;
entries[v].bytes =bytes;
entries[v].valid =1;
cacheBytes += bytes;
return ret;
}
void *MemoryManager::Lookup(size_t bytes,int type)
{
#ifdef ALLOCATION_CACHE
bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
int cache = type+small;
return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
#else
return NULL;
#endif
}
void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes)
{
assert(ncache>0);
#ifdef GRID_OMP
assert(omp_in_parallel()==0);
#endif
for(int e=0;e<ncache;e++){
if ( entries[e].valid && ( entries[e].bytes == bytes ) ) {
entries[e].valid = 0;
cacheBytes -= entries[e].bytes;
return entries[e].address;
}
}
return NULL;
}
NAMESPACE_END(Grid);

View File

@@ -1,181 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/MemoryManager.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <list>
#include <unordered_map>
NAMESPACE_BEGIN(Grid);
// Move control to configure.ac and Config.h?
#define GRID_ALLOC_SMALL_LIMIT (4096)
/*Pinning pages is costly*/
////////////////////////////////////////////////////////////////////////////
// Advise the LatticeAccelerator class
////////////////////////////////////////////////////////////////////////////
enum ViewAdvise {
AdviseDefault = 0x0, // Regular data
AdviseInfrequentUse = 0x1 // Advise that the data is used infrequently. This can
// significantly influence performance of bulk storage.
// AdviseTransient = 0x2, // Data will mostly be read. On some architectures
// enables read-only copies of memory to be kept on
// host and device.
// AdviseAcceleratorWriteDiscard = 0x4 // Field will be written in entirety on device
};
////////////////////////////////////////////////////////////////////////////
// View Access Mode
////////////////////////////////////////////////////////////////////////////
enum ViewMode {
AcceleratorRead = 0x01,
AcceleratorWrite = 0x02,
AcceleratorWriteDiscard = 0x04,
CpuRead = 0x08,
CpuWrite = 0x10,
CpuWriteDiscard = 0x10 // same for now
};
class MemoryManager {
private:
////////////////////////////////////////////////////////////
// For caching recently freed allocations
////////////////////////////////////////////////////////////
typedef struct {
void *address;
size_t bytes;
int valid;
} AllocationCacheEntry;
static const int NallocCacheMax=128;
static const int NallocType=6;
static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
static int Victim[NallocType];
static int Ncache[NallocType];
static uint64_t CacheBytes[NallocType];
/////////////////////////////////////////////////
// Free pool
/////////////////////////////////////////////////
static void *Insert(void *ptr,size_t bytes,int type) ;
static void *Lookup(size_t bytes,int type) ;
static void *Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim,uint64_t &cbytes) ;
static void *Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t &cbytes) ;
static void PrintBytes(void);
public:
static void Init(void);
static void InitMessage(void);
static void *AcceleratorAllocate(size_t bytes);
static void AcceleratorFree (void *ptr,size_t bytes);
static void *SharedAllocate(size_t bytes);
static void SharedFree (void *ptr,size_t bytes);
static void *CpuAllocate(size_t bytes);
static void CpuFree (void *ptr,size_t bytes);
////////////////////////////////////////////////////////
// Footprint tracking
////////////////////////////////////////////////////////
static uint64_t DeviceBytes;
static uint64_t DeviceLRUBytes;
static uint64_t DeviceMaxBytes;
static uint64_t HostToDeviceBytes;
static uint64_t DeviceToHostBytes;
static uint64_t HostToDeviceXfer;
static uint64_t DeviceToHostXfer;
private:
#ifndef GRID_UVM
//////////////////////////////////////////////////////////////////////
// Data tables for ViewCache
//////////////////////////////////////////////////////////////////////
typedef std::list<uint64_t> LRU_t;
typedef typename LRU_t::iterator LRUiterator;
typedef struct {
int LRU_valid;
LRUiterator LRU_entry;
uint64_t CpuPtr;
uint64_t AccPtr;
size_t bytes;
uint32_t transient;
uint32_t state;
uint32_t accLock;
uint32_t cpuLock;
} AcceleratorViewEntry;
typedef std::unordered_map<uint64_t,AcceleratorViewEntry> AccViewTable_t;
typedef typename AccViewTable_t::iterator AccViewTableIterator ;
static AccViewTable_t AccViewTable;
static LRU_t LRU;
/////////////////////////////////////////////////
// Device motion
/////////////////////////////////////////////////
static void Create(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
static void EvictVictims(uint64_t bytes); // Frees up <bytes>
static void Evict(AcceleratorViewEntry &AccCache);
static void Flush(AcceleratorViewEntry &AccCache);
static void Clone(AcceleratorViewEntry &AccCache);
static void AccDiscard(AcceleratorViewEntry &AccCache);
static void CpuDiscard(AcceleratorViewEntry &AccCache);
// static void LRUupdate(AcceleratorViewEntry &AccCache);
static void LRUinsert(AcceleratorViewEntry &AccCache);
static void LRUremove(AcceleratorViewEntry &AccCache);
// manage entries in the table
static int EntryPresent(uint64_t CpuPtr);
static void EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
static void EntryErase (uint64_t CpuPtr);
static AccViewTableIterator EntryLookup(uint64_t CpuPtr);
static void EntrySet (uint64_t CpuPtr,AcceleratorViewEntry &entry);
static void AcceleratorViewClose(uint64_t AccPtr);
static uint64_t AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
static void CpuViewClose(uint64_t Ptr);
static uint64_t CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
#endif
static void NotifyDeletion(void * CpuPtr);
public:
static void Print(void);
static int isOpen (void* CpuPtr);
static void ViewClose(void* CpuPtr,ViewMode mode);
static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
};
NAMESPACE_END(Grid);
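A hedged sketch of the view protocol this class exposes (cpu_ptr and bytes are assumed to come from a Grid lattice object; the real call sites are the Lattice view wrappers):

void *accp = Grid::MemoryManager::ViewOpen(cpu_ptr,bytes,Grid::AcceleratorRead,Grid::AdviseDefault);
// ... device kernel dereferences accp ...
Grid::MemoryManager::ViewClose(cpu_ptr,Grid::AcceleratorRead); // unlock; entry returns to the LRU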

View File

@@ -1,479 +0,0 @@
#include <Grid/GridCore.h>
#ifndef GRID_UVM
#warning "Using explicit device memory copies"
NAMESPACE_BEGIN(Grid);
//#define dprintf(...) printf ( __VA_ARGS__ ); fflush(stdout);
#define dprintf(...)
////////////////////////////////////////////////////////////
// For caching copies of data on device
////////////////////////////////////////////////////////////
MemoryManager::AccViewTable_t MemoryManager::AccViewTable;
MemoryManager::LRU_t MemoryManager::LRU;
////////////////////////////////////////////////////////
// Footprint tracking
////////////////////////////////////////////////////////
uint64_t MemoryManager::DeviceBytes;
uint64_t MemoryManager::DeviceLRUBytes;
uint64_t MemoryManager::DeviceMaxBytes = 1024*1024*128;
uint64_t MemoryManager::HostToDeviceBytes;
uint64_t MemoryManager::DeviceToHostBytes;
uint64_t MemoryManager::HostToDeviceXfer;
uint64_t MemoryManager::DeviceToHostXfer;
////////////////////////////////////
// Priority ordering for unlocked entries
// Empty
// CpuDirty
// Consistent
// AccDirty
////////////////////////////////////
#define Empty (0x0) /*Entry unoccupied */
#define CpuDirty (0x1) /*CPU copy is golden, Acc buffer MAY not be allocated*/
#define Consistent (0x2) /*ACC copy AND CPU copy are valid */
#define AccDirty (0x4) /*ACC copy is golden */
#define EvictNext (0x8) /*Priority for eviction*/
/////////////////////////////////////////////////
// Mechanics of data table maintenance
/////////////////////////////////////////////////
int MemoryManager::EntryPresent(uint64_t CpuPtr)
{
if(AccViewTable.empty()) return 0;
auto count = AccViewTable.count(CpuPtr); assert((count==0)||(count==1));
return count;
}
void MemoryManager::EntryCreate(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{
assert(!EntryPresent(CpuPtr));
AcceleratorViewEntry AccCache;
AccCache.CpuPtr = CpuPtr;
AccCache.AccPtr = (uint64_t)NULL;
AccCache.bytes = bytes;
AccCache.state = CpuDirty;
AccCache.LRU_valid=0;
AccCache.transient=0;
AccCache.accLock=0;
AccCache.cpuLock=0;
AccViewTable[CpuPtr] = AccCache;
}
MemoryManager::AccViewTableIterator MemoryManager::EntryLookup(uint64_t CpuPtr)
{
assert(EntryPresent(CpuPtr));
auto AccCacheIterator = AccViewTable.find(CpuPtr);
assert(AccCacheIterator!=AccViewTable.end());
return AccCacheIterator;
}
void MemoryManager::EntryErase(uint64_t CpuPtr)
{
auto AccCache = EntryLookup(CpuPtr);
AccViewTable.erase(CpuPtr);
}
void MemoryManager::LRUinsert(AcceleratorViewEntry &AccCache)
{
assert(AccCache.LRU_valid==0);
if (AccCache.transient) {
LRU.push_back(AccCache.CpuPtr);
AccCache.LRU_entry = --LRU.end();
} else {
LRU.push_front(AccCache.CpuPtr);
AccCache.LRU_entry = LRU.begin();
}
AccCache.LRU_valid = 1;
DeviceLRUBytes+=AccCache.bytes;
}
void MemoryManager::LRUremove(AcceleratorViewEntry &AccCache)
{
assert(AccCache.LRU_valid==1);
LRU.erase(AccCache.LRU_entry);
AccCache.LRU_valid = 0;
DeviceLRUBytes-=AccCache.bytes;
}
/////////////////////////////////////////////////
// Accelerator cache motion & consistency logic
/////////////////////////////////////////////////
void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
{
///////////////////////////////////////////////////////////
// Remove from Accelerator, remove entry, without flush
// Cannot be locked. If allocated Must be in LRU pool.
///////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
dprintf("MemoryManager: Discard(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr) {
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
DeviceBytes -=AccCache.bytes;
LRUremove(AccCache);
dprintf("MemoryManager: Free(%llx) LRU %lld Total %lld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
}
uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr);
}
void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
{
///////////////////////////////////////////////////////////////////////////
// Make CPU consistent, remove from Accelerator, remove entry
// Cannot be locked. If allocated must be in LRU pool.
///////////////////////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
dprintf("MemoryManager: Evict(%llx) %llx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0);
if(AccCache.state==AccDirty) {
Flush(AccCache);
}
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr) {
AcceleratorFree((void *)AccCache.AccPtr,AccCache.bytes);
DeviceBytes -=AccCache.bytes;
LRUremove(AccCache);
dprintf("MemoryManager: Free(%llx) footprint now %lld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
}
uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr);
}
void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
{
assert(AccCache.state==AccDirty);
assert(AccCache.cpuLock==0);
assert(AccCache.accLock==0);
assert(AccCache.AccPtr!=(uint64_t)NULL);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
dprintf("MemoryManager: Flush %llx -> %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
DeviceToHostBytes+=AccCache.bytes;
DeviceToHostXfer++;
AccCache.state=Consistent;
}
void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
{
assert(AccCache.state==CpuDirty);
assert(AccCache.cpuLock==0);
assert(AccCache.accLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr==(uint64_t)NULL){
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes;
}
dprintf("MemoryManager: Clone %llx <- %llx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
HostToDeviceBytes+=AccCache.bytes;
HostToDeviceXfer++;
AccCache.state=Consistent;
}
void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
{
assert(AccCache.state!=Empty);
assert(AccCache.cpuLock==0);
assert(AccCache.accLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
if(AccCache.AccPtr==(uint64_t)NULL){
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes;
}
AccCache.state=AccDirty;
}
/////////////////////////////////////////////////////////////////////////////////
// View management
/////////////////////////////////////////////////////////////////////////////////
void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
{
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
AcceleratorViewClose((uint64_t)Ptr);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
CpuViewClose((uint64_t)Ptr);
} else {
assert(0);
}
}
void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{
uint64_t CpuPtr = (uint64_t)_CpuPtr;
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
} else {
assert(0);
return NULL;
}
}
void MemoryManager::EvictVictims(uint64_t bytes)
{
while(bytes+DeviceLRUBytes > DeviceMaxBytes){
if ( DeviceLRUBytes > 0){
assert(LRU.size()>0);
uint64_t victim = LRU.back();
auto AccCacheIterator = EntryLookup(victim);
auto & AccCache = AccCacheIterator->second;
Evict(AccCache);
}
}
}
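Since DeviceMaxBytes defaults to 1024*1024*128 = 128 MiB (set above), this loop evicts from the LRU tail until the pending allocation fits; views still open on the device are not in the LRU, so only unlocked entries can be reclaimed.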
uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint)
{
////////////////////////////////////////////////////////////////////////////
// Find if present, otherwise get or force an empty
////////////////////////////////////////////////////////////////////////////
if ( EntryPresent(CpuPtr)==0 ){
EntryCreate(CpuPtr,bytes,mode,hint);
}
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
if (!AccCache.AccPtr) {
EvictVictims(bytes);
}
assert((mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard));
assert(AccCache.cpuLock==0); // Programming error
if(AccCache.state!=Empty) {
dprintf("ViewOpen found entry %llx %llx : %lld %lld\n",
(uint64_t)AccCache.CpuPtr,
(uint64_t)CpuPtr,
(uint64_t)AccCache.bytes,
(uint64_t)bytes);
assert(AccCache.CpuPtr == CpuPtr);
assert(AccCache.bytes ==bytes);
}
/*
* State transitions and actions
*
* Action State StateNext Flush Clone
*
* AccRead Empty Consistent - Y
* AccWrite Empty AccDirty - Y
* AccRead CpuDirty Consistent - Y
* AccWrite CpuDirty AccDirty - Y
* AccRead Consistent Consistent - -
* AccWrite Consistent AccDirty - -
* AccRead AccDirty AccDirty - -
* AccWrite AccDirty AccDirty - -
*/
if(AccCache.state==Empty) {
assert(AccCache.LRU_valid==0);
AccCache.CpuPtr = CpuPtr;
AccCache.AccPtr = (uint64_t)NULL;
AccCache.bytes = bytes;
AccCache.state = CpuDirty; // Cpu starts primary
if(mode==AcceleratorWriteDiscard){
CpuDiscard(AccCache);
AccCache.state = AccDirty; // Empty + AcceleratorWrite=> AccDirty
} else if(mode==AcceleratorWrite){
Clone(AccCache);
AccCache.state = AccDirty; // Empty + AcceleratorWrite=> AccDirty
} else {
Clone(AccCache);
AccCache.state = Consistent; // Empty + AccRead => Consistent
}
AccCache.accLock= 1;
} else if(AccCache.state==CpuDirty ){
if(mode==AcceleratorWriteDiscard) {
CpuDiscard(AccCache);
AccCache.state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty
} else if(mode==AcceleratorWrite) {
Clone(AccCache);
AccCache.state = AccDirty; // CpuDirty + AcceleratorWrite=> AccDirty
} else {
Clone(AccCache);
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
}
AccCache.accLock++;
dprintf("Copied CpuDirty entry into device accLock %d\n",AccCache.accLock);
} else if(AccCache.state==Consistent) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
else
AccCache.state = Consistent; // Consistent + AccRead => Consistent
AccCache.accLock++;
dprintf("Consistent entry into device accLock %d\n",AccCache.accLock);
} else if(AccCache.state==AccDirty) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
else
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
AccCache.accLock++;
dprintf("AccDirty entry into device accLock %d\n",AccCache.accLock);
} else {
assert(0);
}
// If view is opened on device remove from LRU
if(AccCache.LRU_valid==1){
// must possibly remove from LRU as now locked on GPU
LRUremove(AccCache);
}
int transient =hint;
AccCache.transient= transient? EvictNext : 0;
return AccCache.AccPtr;
}
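An illustrative trace of the table above for a single object (not a test from the repository; each step assumes the previous view was closed, since the lock asserts forbid mixing CPU and accelerator views):

// 1. CpuViewOpen(CpuWrite)                : Empty      -> CpuDirty   (no device buffer yet)
// 2. AcceleratorViewOpen(AcceleratorRead) : CpuDirty   -> Consistent (Clone: host-to-device copy)
// 3. AcceleratorViewOpen(AcceleratorWrite): Consistent -> AccDirty   (no copy needed)
// 4. CpuViewOpen(CpuRead)                 : AccDirty   -> Consistent (Flush: device-to-host copy)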
////////////////////////////////////
// look up & decrement lock count
////////////////////////////////////
void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
{
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
assert(AccCache.cpuLock==0);
assert(AccCache.accLock>0);
AccCache.accLock--;
// Move to LRU queue if not locked and close on device
if(AccCache.accLock==0) {
LRUinsert(AccCache);
}
}
void MemoryManager::CpuViewClose(uint64_t CpuPtr)
{
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
assert(AccCache.cpuLock>0);
assert(AccCache.accLock==0);
AccCache.cpuLock--;
}
/*
* Action State StateNext Flush Clone
*
* CpuRead Empty CpuDirty - -
* CpuWrite Empty CpuDirty - -
* CpuRead CpuDirty CpuDirty - -
* CpuWrite CpuDirty CpuDirty - -
* CpuRead Consistent Consistent - -
* CpuWrite Consistent CpuDirty - -
* CpuRead AccDirty Consistent Y -
* CpuWrite AccDirty CpuDirty Y -
*/
uint64_t MemoryManager::CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise transient)
{
////////////////////////////////////////////////////////////////////////////
// Find if present, otherwise get or force an empty
////////////////////////////////////////////////////////////////////////////
if ( EntryPresent(CpuPtr)==0 ){
EntryCreate(CpuPtr,bytes,mode,transient);
}
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
if (!AccCache.AccPtr) {
EvictVictims(bytes);
}
assert((mode==CpuRead)||(mode==CpuWrite));
assert(AccCache.accLock==0); // Programming error
if(AccCache.state!=Empty) {
assert(AccCache.CpuPtr == CpuPtr);
assert(AccCache.bytes==bytes);
}
if(AccCache.state==Empty) {
AccCache.CpuPtr = CpuPtr;
AccCache.AccPtr = (uint64_t)NULL;
AccCache.bytes = bytes;
AccCache.state = CpuDirty; // Empty + CpuRead/CpuWrite => CpuDirty
AccCache.accLock= 0;
AccCache.cpuLock= 1;
} else if(AccCache.state==CpuDirty ){
// AccPtr don't care, deferred allocate
AccCache.state = CpuDirty; // CpuDirty +CpuRead/CpuWrite => CpuDirty
AccCache.cpuLock++;
} else if(AccCache.state==Consistent) {
assert(AccCache.AccPtr != (uint64_t)NULL);
if(mode==CpuWrite)
AccCache.state = CpuDirty; // Consistent +CpuWrite => CpuDirty
else
AccCache.state = Consistent; // Consistent +CpuRead => Consistent
AccCache.cpuLock++;
} else if(AccCache.state==AccDirty) {
assert(AccCache.AccPtr != (uint64_t)NULL);
Flush(AccCache);
if(mode==CpuWrite) AccCache.state = CpuDirty; // AccDirty +CpuWrite => CpuDirty, Flush
else AccCache.state = Consistent; // AccDirty +CpuRead => Consistent, Flush
AccCache.cpuLock++;
} else {
assert(0); // should be unreachable
}
AccCache.transient= transient? EvictNext : 0;
return AccCache.CpuPtr;
}
void MemoryManager::NotifyDeletion(void *_ptr)
{
// Look up in ViewCache
uint64_t ptr = (uint64_t)_ptr;
if(EntryPresent(ptr)) {
auto e = EntryLookup(ptr);
AccDiscard(e->second);
}
}
void MemoryManager::Print(void)
{
PrintBytes();
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << "Memory Manager " << std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << DeviceBytes << " bytes allocated on device " << std::endl;
std::cout << GridLogDebug << DeviceLRUBytes<< " bytes evictable on device " << std::endl;
std::cout << GridLogDebug << DeviceMaxBytes<< " bytes max on device " << std::endl;
std::cout << GridLogDebug << HostToDeviceXfer << " transfers to device " << std::endl;
std::cout << GridLogDebug << DeviceToHostXfer << " transfers from device " << std::endl;
std::cout << GridLogDebug << HostToDeviceBytes<< " bytes transferred to device " << std::endl;
std::cout << GridLogDebug << DeviceToHostBytes<< " bytes transferred from device " << std::endl;
std::cout << GridLogDebug << AccViewTable.size()<< " vectors " << LRU.size()<<" evictable"<< std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
std::cout << GridLogDebug << "CpuAddr\t\tAccAddr\t\tState\t\tcpuLock\taccLock\tLRU_valid "<<std::endl;
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
auto &AccCache = it->second;
std::string str;
if ( AccCache.state==Empty ) str = std::string("Empty");
if ( AccCache.state==CpuDirty ) str = std::string("CpuDirty");
if ( AccCache.state==AccDirty ) str = std::string("AccDirty");
if ( AccCache.state==Consistent)str = std::string("Consistent");
std::cout << GridLogDebug << "0x"<<std::hex<<AccCache.CpuPtr<<std::dec
<< "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
<< "\t" << AccCache.cpuLock
<< "\t" << AccCache.accLock
<< "\t" << AccCache.LRU_valid<<std::endl;
}
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
};
int MemoryManager::isOpen (void* _CpuPtr)
{
uint64_t CpuPtr = (uint64_t)_CpuPtr;
if ( EntryPresent(CpuPtr) ){
auto AccCacheIterator = EntryLookup(CpuPtr);
auto & AccCache = AccCacheIterator->second;
return AccCache.cpuLock+AccCache.accLock;
} else {
return 0;
}
}
NAMESPACE_END(Grid);
#endif

View File

@@ -1,23 +0,0 @@
#include <Grid/GridCore.h>
#ifdef GRID_UVM
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////////////////////
// View management is 1:1 address space mapping
/////////////////////////////////////////////////////////////////////////////////
uint64_t MemoryManager::DeviceBytes;
uint64_t MemoryManager::DeviceLRUBytes;
uint64_t MemoryManager::DeviceMaxBytes = 1024*1024*128;
uint64_t MemoryManager::HostToDeviceBytes;
uint64_t MemoryManager::DeviceToHostBytes;
uint64_t MemoryManager::HostToDeviceXfer;
uint64_t MemoryManager::DeviceToHostXfer;
void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
int MemoryManager::isOpen (void* CpuPtr) { return 0;}
void MemoryManager::Print(void){};
void MemoryManager::NotifyDeletion(void *ptr){};
NAMESPACE_END(Grid);
#endif

View File

@@ -1,67 +0,0 @@
#include <Grid/GridCore.h>
#include <fcntl.h>
NAMESPACE_BEGIN(Grid);
MemoryStats *MemoryProfiler::stats = nullptr;
bool MemoryProfiler::debug = false;
void check_huge_pages(void *Buf,uint64_t BYTES)
{
#ifdef __linux__
int fd = open("/proc/self/pagemap", O_RDONLY);
assert(fd >= 0);
const int page_size = 4096;
uint64_t virt_pfn = (uint64_t)Buf / page_size;
off_t offset = sizeof(uint64_t) * virt_pfn;
uint64_t npages = (BYTES + page_size-1) / page_size;
uint64_t pagedata[npages];
uint64_t ret = lseek(fd, offset, SEEK_SET);
assert(ret == offset);
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
assert(ret == sizeof(uint64_t) * npages);
int nhugepages = npages / 512;
int n4ktotal, nnothuge;
n4ktotal = 0;
nnothuge = 0;
for (int i = 0; i < nhugepages; ++i) {
uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
for (int j = 0; j < 512; ++j) {
uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
++n4ktotal;
if (pageaddr != baseaddr + j * page_size)
++nnothuge;
}
}
int rank = CartesianCommunicator::RankWorld();
printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
#endif
}
std::string sizeString(const size_t bytes)
{
constexpr unsigned int bufSize = 256;
const char *suffixes[7] = {"", "K", "M", "G", "T", "P", "E"};
char buf[256];
size_t s = 0;
double count = bytes;
while (count >= 1024 && s < 7)
{
s++;
count /= 1024;
}
if (count - floor(count) == 0.0)
{
snprintf(buf, bufSize, "%d %sB", (int)count, suffixes[s]);
}
else
{
snprintf(buf, bufSize, "%.1f %sB", count, suffixes[s]);
}
return std::string(buf);
}
NAMESPACE_END(Grid);

View File

@@ -1,95 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/MemoryStats.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
std::string sizeString(size_t bytes);
struct MemoryStats
{
size_t totalAllocated{0}, maxAllocated{0},
currentlyAllocated{0}, totalFreed{0};
};
class MemoryProfiler
{
public:
static MemoryStats *stats;
static bool debug;
};
#define memString(bytes) std::to_string(bytes) + " (" + sizeString(bytes) + ")"
#define profilerDebugPrint \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
std::cout << GridLogDebug << "[Memory debug] Stats " << MemoryProfiler::stats << std::endl; \
std::cout << GridLogDebug << "[Memory debug] total : " << memString(s->totalAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] max : " << memString(s->maxAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] current: " << memString(s->currentlyAllocated) \
<< std::endl; \
std::cout << GridLogDebug << "[Memory debug] freed : " << memString(s->totalFreed) \
<< std::endl; \
}
#define profilerAllocate(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalAllocated += (bytes); \
s->currentlyAllocated += (bytes); \
s->maxAllocated = std::max(s->maxAllocated, s->currentlyAllocated); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] allocating " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
#define profilerFree(bytes) \
if (MemoryProfiler::stats) \
{ \
auto s = MemoryProfiler::stats; \
s->totalFreed += (bytes); \
s->currentlyAllocated -= (bytes); \
} \
if (MemoryProfiler::debug) \
{ \
std::cout << GridLogDebug << "[Memory debug] freeing " << memString(bytes) << std::endl; \
profilerDebugPrint; \
}
void check_huge_pages(void *Buf,uint64_t BYTES);
NAMESPACE_END(Grid);

View File

@@ -81,7 +81,6 @@ public:
bool _isCheckerBoarded;
int LocallyPeriodic;
Coordinate _checker_dim_mask;
public:

View File

@@ -38,7 +38,6 @@ class GridCartesian: public GridBase {
public:
int dummy;
Coordinate _checker_dim_mask;
virtual int CheckerBoardFromOindexTable (int Oindex) {
return 0;
}
@@ -105,7 +104,6 @@ public:
_ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension);
_checker_dim_mask.resize(_ndimension);;
_lstart.resize(_ndimension);
_lend.resize(_ndimension);
@@ -116,8 +114,6 @@ public:
for (int d = 0; d < _ndimension; d++)
{
_checker_dim_mask[d]=0;
_fdimensions[d] = dimensions[d]; // Global dimensions
_gdimensions[d] = _fdimensions[d]; // Global dimensions
_simd_layout[d] = simd_layout[d];

View File

@@ -35,28 +35,12 @@ static const int CbRed =0;
static const int CbBlack=1;
static const int Even =CbRed;
static const int Odd =CbBlack;
accelerator_inline int RedBlackCheckerBoardFromOindex (int oindex,const Coordinate &rdim,const Coordinate &chk_dim_msk)
{
int nd=rdim.size();
Coordinate coor(nd);
Lexicographic::CoorFromIndex(coor,oindex,rdim);
int linear=0;
for(int d=0;d<nd;d++){
if(chk_dim_msk[d])
linear=linear+coor[d];
}
return (linear&0x1);
}
// Specialise this for red black grids storing half the data like a chess board.
class GridRedBlackCartesian : public GridBase
{
public:
// Coordinate _checker_dim_mask;
Coordinate _checker_dim_mask;
int _checker_dim;
std::vector<int> _checker_board;

View File

@@ -33,8 +33,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
bool Stencil_force_mpi = true;
///////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////

View File

@@ -1,3 +1,4 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@@ -35,8 +36,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
extern bool Stencil_force_mpi ;
class CartesianCommunicator : public SharedMemory {
public:
@@ -109,8 +108,6 @@ public:
////////////////////////////////////////////////////////////
// Reduction
////////////////////////////////////////////////////////////
void GlobalMax(RealD &);
void GlobalMax(RealF &);
void GlobalSum(RealF &);
void GlobalSumVector(RealF *,int N);
void GlobalSum(RealD &);
@@ -141,6 +138,21 @@ public:
int recv_from_rank,
int bytes);
void SendRecvPacket(void *xmit,
void *recv,
int xmit_to_rank,
int recv_from_rank,
int bytes);
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,
void *recv,
int recv_from_rank,
int bytes);
void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
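A sketch of the split-phase pattern the restored declarations support (assumed variable names; Begin posts the non-blocking transfers into the request list, Complete waits on them):

std::vector<CommsRequest_t> reqs;
comm.SendToRecvFromBegin(reqs,xmit_buf,to_rank,recv_buf,from_rank,nbytes);
// ... overlap local compute here ...
comm.SendToRecvFromComplete(reqs);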
double StencilSendToRecvFrom(void *xmit,
int xmit_to_rank,
void *recv,

View File

@@ -1,6 +1,6 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/communicator/Communicator_mpi.cc
@@ -35,7 +35,7 @@ Grid_MPI_Comm CartesianCommunicator::communicator_world;
////////////////////////////////////////////
// First initialise of comms system
////////////////////////////////////////////
void CartesianCommunicator::Init(int *argc, char ***argv)
{
int flag;
@@ -43,16 +43,8 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {
#ifndef GRID_COMMS_THREADS
nCommThreads=1;
// wrong results here too
// For now: comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs
// other comms schemes are ok
MPI_Init_thread(argc,argv,MPI_THREAD_SERIALIZED,&provided);
#else
MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
#endif
//If only 1 comms thread we require any threading mode other than SINGLE, but for multiple comms threads we need MULTIPLE
if( (nCommThreads == 1) && (provided == MPI_THREAD_SINGLE) ) {
assert(0);
@@ -99,7 +91,7 @@ void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor)
////////////////////////////////////////////////////////////////////////////////////////////////////////
// Initialises from communicator_world
////////////////////////////////////////////////////////////////////////////////////////////////////////
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
{
MPI_Comm optimal_comm;
////////////////////////////////////////////////////
@@ -118,7 +110,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
//////////////////////////////////
// Try to subdivide communicator
//////////////////////////////////
CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank)
{
_ndimension = processors.size(); assert(_ndimension>=1);
int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
@@ -135,7 +127,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
//////////////////////////////////////////////////////////////////////////////////////////////////////
// split the communicator
//////////////////////////////////////////////////////////////////////////////////////////////////////
// int Nparent = parent._processors ;
int Nparent;
MPI_Comm_size(parent.communicator,&Nparent);
@@ -157,13 +149,13 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
}
// rank within subcomm ; srank is rank of subcomm within blocks of subcomms
int crank;
// Mpi uses the reverse Lexico convention to us; so reversed routines called
Lexicographic::IndexFromCoorReversed(ccoor,crank,processors); // processors is the split grid dimensions
Lexicographic::IndexFromCoorReversed(scoor,srank,ssize); // ssize is the number of split grids
MPI_Comm comm_split;
if ( Nchild > 1 ) {
////////////////////////////////////////////////////////////////
// Split the communicator
@@ -188,11 +180,11 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
SetCommunicator(comm_split); SetCommunicator(comm_split);
/////////////////////////////////////////////// ///////////////////////////////////////////////
// Free the temp communicator // Free the temp communicator
/////////////////////////////////////////////// ///////////////////////////////////////////////
MPI_Comm_free(&comm_split); MPI_Comm_free(&comm_split);
if(0){ if(0){
std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl; std::cout << " ndim " <<_ndimension<<" " << parent._ndimension << std::endl;
for(int d=0;d<processors.size();d++){ for(int d=0;d<processors.size();d++){
std::cout << d<< " " << _processor_coor[d] <<" " << ccoor[d]<<std::endl; std::cout << d<< " " << _processor_coor[d] <<" " << ccoor[d]<<std::endl;
@@ -253,7 +245,7 @@ CartesianCommunicator::~CartesianCommunicator()
for(int i=0;i<communicator_halo.size();i++){ for(int i=0;i<communicator_halo.size();i++){
MPI_Comm_free(&communicator_halo[i]); MPI_Comm_free(&communicator_halo[i]);
} }
} }
} }
void CartesianCommunicator::GlobalSum(uint32_t &u){ void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
@@ -275,16 +267,6 @@ void CartesianCommunicator::GlobalXOR(uint64_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
assert(ierr==0); assert(ierr==0);
} }
void CartesianCommunicator::GlobalMax(float &f)
{
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_MAX,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalMax(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){ void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator); int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0); assert(ierr==0);
@@ -312,28 +294,60 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
int bytes) int bytes)
{ {
std::vector<CommsRequest_t> reqs(0); std::vector<CommsRequest_t> reqs(0);
unsigned long xcrc = crc32(0L, Z_NULL, 0); // unsigned long xcrc = crc32(0L, Z_NULL, 0);
unsigned long rcrc = crc32(0L, Z_NULL, 0); // unsigned long rcrc = crc32(0L, Z_NULL, 0);
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
SendToRecvFromComplete(reqs);
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
// printf("proc %d SendToRecvFrom %d bytes %lx %lx\n",_processor,bytes,xcrc,rcrc);
}
void CartesianCommunicator::SendRecvPacket(void *xmit,
void *recv,
int sender,
int receiver,
int bytes)
{
MPI_Status stat;
assert(sender != receiver);
int tag = sender;
if ( _processor == sender ) {
MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
}
if ( _processor == receiver ) {
MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
}
}
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
int from,
int bytes)
{
int myrank = _processor; int myrank = _processor;
int ierr; int ierr;
// Enforce no UVM in comms, device or host OK if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) {
assert(acceleratorIsCommunicable(xmit)); MPI_Request xrq;
assert(acceleratorIsCommunicable(recv)); MPI_Request rrq;
// Give the CPU to MPI immediately; can use threads to overlap optionally ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
// printf("proc %d SendToRecvFrom %d bytes Sendrecv \n",_processor,bytes); ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
recv,bytes,MPI_CHAR,from, from, assert(ierr==0);
communicator,MPI_STATUS_IGNORE); list.push_back(xrq);
assert(ierr==0); list.push_back(rrq);
} else {
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes); // Give the CPU to MPI immediately; can use threads to overlap optionally
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes); ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
// printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush recv,bytes,MPI_CHAR,from, from,
communicator,MPI_STATUS_IGNORE);
assert(ierr==0);
}
} }
// Basic Halo comms primitive
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit, double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int dest, int dest,
void *recv, void *recv,
@@ -353,7 +367,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
int from, int from,
int bytes,int dir) int bytes,int dir)
{ {
int ncomm =communicator_halo.size(); int ncomm =communicator_halo.size();
int commdir=dir%ncomm; int commdir=dir%ncomm;
MPI_Request xrq; MPI_Request xrq;
@@ -368,37 +382,36 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
assert(from != _processor); assert(from != _processor);
assert(gme == ShmRank); assert(gme == ShmRank);
double off_node_bytes=0.0; double off_node_bytes=0.0;
int tag;
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) { if ( gfrom ==MPI_UNDEFINED) {
tag= dir+from*32; ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[commdir],&rrq);
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0); assert(ierr==0);
list.push_back(rrq); list.push_back(rrq);
off_node_bytes+=bytes; off_node_bytes+=bytes;
} }
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) { if ( gdest == MPI_UNDEFINED ) {
tag= dir+_processor*32; ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[commdir],&xrq);
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0); assert(ierr==0);
list.push_back(xrq); list.push_back(xrq);
off_node_bytes+=bytes; off_node_bytes+=bytes;
} else {
// TODO : make a OMP loop on CPU, call threaded bcopy
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
acceleratorCopySynchronise(); // MPI prob slower
} }
if ( CommunicatorPolicy == CommunicatorPolicySequential ) { if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
this->StencilSendToRecvFromComplete(list,dir); this->StencilSendToRecvFromComplete(list,dir);
} }
return off_node_bytes; return off_node_bytes;
} }
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir) void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
{
SendToRecvFromComplete(waitall);
}
void CartesianCommunicator::StencilBarrier(void)
{
MPI_Barrier (ShmComm);
}
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{ {
int nreq=list.size(); int nreq=list.size();
@@ -409,13 +422,6 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
assert(ierr==0); assert(ierr==0);
list.resize(0); list.resize(0);
} }
void CartesianCommunicator::StencilBarrier(void)
{
MPI_Barrier (ShmComm);
}
//void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
//{
//}
void CartesianCommunicator::Barrier(void) void CartesianCommunicator::Barrier(void)
{ {
int ierr = MPI_Barrier(communicator); int ierr = MPI_Barrier(communicator);
@@ -430,8 +436,8 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
communicator); communicator);
assert(ierr==0); assert(ierr==0);
} }
int CartesianCommunicator::RankWorld(void){ int CartesianCommunicator::RankWorld(void){
int r; int r;
MPI_Comm_rank(communicator_world,&r); MPI_Comm_rank(communicator_world,&r);
return r; return r;
} }
@@ -464,7 +470,7 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t
// When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug. // When 24*4 bytes multiples get 50x 10^9 >>> 2x10^9 Y2K bug.
// (Turns up on 32^3 x 64 Gparity too) // (Turns up on 32^3 x 64 Gparity too)
MPI_Datatype object; MPI_Datatype object;
int iwords; int iwords;
int ibytes; int ibytes;
iwords = words; iwords = words;
ibytes = bytes; ibytes = bytes;
@@ -477,3 +483,5 @@ void CartesianCommunicator::AllToAll(void *in,void *out,uint64_t words,uint64_t
} }
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
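The SendToRecvFrom hunks above trade Grid's two communication policies off against each other: the (+) side routes through SendToRecvFromBegin, which posts a non-blocking MPI_Irecv/MPI_Isend pair under CommunicatorPolicyConcurrent, while the (-) side always issues one blocking MPI_Sendrecv. A minimal standalone sketch of the two patterns, independent of Grid's classes (the function and parameter names here are illustrative, not Grid's API):

```cpp
#include <mpi.h>

// Exchange `bytes` with two neighbours: non-blocking pair (concurrent
// policy) vs one blocking call (sequential policy). Tags follow the
// sender's-rank convention visible in the diff above.
void halo_exchange(MPI_Comm comm, void *xmit, int dest,
                   void *recv, int from, int bytes, bool concurrent)
{
  int me; MPI_Comm_rank(comm, &me);
  if (concurrent) {
    MPI_Request reqs[2];
    MPI_Irecv(recv, bytes, MPI_CHAR, from, from, comm, &reqs[0]);
    MPI_Isend(xmit, bytes, MPI_CHAR, dest, me,   comm, &reqs[1]);
    // The window for compute/comms overlap sits before this wait.
    MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
  } else {
    // Hands the CPU to MPI immediately; no overlap with compute.
    MPI_Sendrecv(xmit, bytes, MPI_CHAR, dest, me,
                 recv, bytes, MPI_CHAR, from, from,
                 comm, MPI_STATUS_IGNORE);
  }
}
```

The non-blocking pair only pays off when the caller has useful work to do before completing the requests; with nothing to overlap, MPI_Sendrecv is simpler and avoids the request bookkeeping.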

View File

@@ -67,8 +67,6 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
 CartesianCommunicator::~CartesianCommunicator(){}
-void CartesianCommunicator::GlobalMax(float &){}
-void CartesianCommunicator::GlobalMax(double &){}
 void CartesianCommunicator::GlobalSum(float &){}
 void CartesianCommunicator::GlobalSumVector(float *,int N){}
 void CartesianCommunicator::GlobalSum(double &){}
@@ -79,6 +77,15 @@ void CartesianCommunicator::GlobalSumVector(uint64_t *,int N){}
 void CartesianCommunicator::GlobalXOR(uint32_t &){}
 void CartesianCommunicator::GlobalXOR(uint64_t &){}
+void CartesianCommunicator::SendRecvPacket(void *xmit,
+                                           void *recv,
+                                           int xmit_to_rank,
+                                           int recv_from_rank,
+                                           int bytes)
+{
+  assert(0);
+}
 // Basic Halo comms primitive -- should never call in single node
 void CartesianCommunicator::SendToRecvFrom(void *xmit,
@@ -89,6 +96,20 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 {
   assert(0);
 }
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+                                                void *xmit,
+                                                int dest,
+                                                void *recv,
+                                                int from,
+                                                int bytes)
+{
+  assert(0);
+}
+void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
+{
+  assert(0);
+}
 void CartesianCommunicator::AllToAll(int dim,void *in,void *out,uint64_t words,uint64_t bytes)
 {
   bcopy(in,out,bytes*words);
@@ -116,6 +137,10 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
                                                      int recv_from_rank,
                                                      int bytes, int dir)
 {
+  std::vector<CommsRequest_t> list;
+  // Discard the "dir"
+  SendToRecvFromBegin   (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
+  SendToRecvFromComplete(list);
   return 2.0*bytes;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
@@ -125,10 +150,13 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
                                                          int recv_from_rank,
                                                          int bytes, int dir)
 {
+  // Discard the "dir"
+  SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
   return 2.0*bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {
+  SendToRecvFromComplete(waitall);
 }
 void CartesianCommunicator::StencilBarrier(void){};
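This whole file is the degenerate single-rank implementation: global reductions are identities, true point-to-point must never be reached, and AllToAll collapses to a local copy, so no-comms builds compile against the same call sites. The shape of such a stub layer, reduced to a few lines (NoComms is an illustrative name, not Grid's):

```cpp
#include <cassert>
#include <cstring>

// Minimal stand-ins for a single-process build: reductions do nothing,
// point-to-point is a hard failure, data "exchange" is a local copy.
struct NoComms {
  void GlobalSum(double &) {}                        // one rank: already summed
  void SendToRecvFrom(void*, int, void*, int, int) { assert(0); }
  void AllToAll(const void *in, void *out, size_t bytes) {
    memcpy(out, in, bytes);                          // self-exchange
  }
};
```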

View File

@@ -102,7 +102,7 @@ public:
   ///////////////////////////////////////////////////
   static void SharedMemoryAllocate(uint64_t bytes, int flags);
   static void SharedMemoryFree(void);
-  static void SharedMemoryCopy(void *dest,void *src,size_t bytes);
+  static void SharedMemoryCopy(void *dest,const void *src,size_t bytes);
   static void SharedMemoryZero(void *dest,size_t bytes);
 };

View File

@@ -7,7 +7,6 @@
Copyright (C) 2015 Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk> Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christoph Lehner <christoph@lhnr.de>
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@@ -30,14 +29,8 @@ Author: Christoph Lehner <christoph@lhnr.de>
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#include <pwd.h> #include <pwd.h>
#ifdef GRID_CUDA #ifdef GRID_NVCC
#include <cuda_runtime_api.h> #include <cuda_runtime_api.h>
#endif
#ifdef GRID_HIP
#include <hip/hip_runtime_api.h>
#endif
#ifdef GRID_SYCl
#endif #endif
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
@@ -54,12 +47,7 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
///////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////
// Split into groups that can share memory // Split into groups that can share memory
///////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////
#ifndef GRID_MPI3_SHM_NONE
MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm); MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&WorldShmComm);
#else
MPI_Comm_split(comm, WorldRank, 0, &WorldShmComm);
#endif
MPI_Comm_rank(WorldShmComm ,&WorldShmRank); MPI_Comm_rank(WorldShmComm ,&WorldShmRank);
MPI_Comm_size(WorldShmComm ,&WorldShmSize); MPI_Comm_size(WorldShmComm ,&WorldShmSize);
@@ -73,7 +61,6 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
WorldNodes = WorldSize/WorldShmSize; WorldNodes = WorldSize/WorldShmSize;
assert( (WorldNodes * WorldShmSize) == WorldSize ); assert( (WorldNodes * WorldShmSize) == WorldSize );
// FIXME: Check all WorldShmSize are the same ? // FIXME: Check all WorldShmSize are the same ?
///////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////
@@ -174,23 +161,6 @@ static inline int divides(int a,int b)
} }
void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims) void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
{ {
////////////////////////////////////////////////////////////////
// Allow user to configure through environment variable
////////////////////////////////////////////////////////////////
char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
if ( str ) {
std::vector<int> IntShmDims;
GridCmdOptionIntVector(std::string(str),IntShmDims);
assert(IntShmDims.size() == WorldDims.size());
long ShmSize = 1;
for (int dim=0;dim<WorldDims.size();dim++) {
ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
assert(divides(ShmDims[dim],WorldDims[dim]));
}
assert(ShmSize == WorldShmSize);
return;
}
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Powers of 2,3,5 only in prime decomposition for now // Powers of 2,3,5 only in prime decomposition for now
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
@@ -200,24 +170,17 @@ void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmD
std::vector<int> primes({2,3,5}); std::vector<int> primes({2,3,5});
int dim = 0; int dim = 0;
int last_dim = ndimension - 1;
int AutoShmSize = 1; int AutoShmSize = 1;
while(AutoShmSize != WorldShmSize) { while(AutoShmSize != WorldShmSize) {
int p; for(int p=0;p<primes.size();p++) {
for(p=0;p<primes.size();p++) {
int prime=primes[p]; int prime=primes[p];
if ( divides(prime,WorldDims[dim]/ShmDims[dim]) if ( divides(prime,WorldDims[dim]/ShmDims[dim])
&& divides(prime,WorldShmSize/AutoShmSize) ) { && divides(prime,WorldShmSize/AutoShmSize) ) {
AutoShmSize*=prime; AutoShmSize*=prime;
ShmDims[dim]*=prime; ShmDims[dim]*=prime;
last_dim = dim;
break; break;
} }
} }
if (p == primes.size() && last_dim == dim) {
std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
exit(EXIT_FAILURE);
}
dim=(dim+1) %ndimension; dim=(dim+1) %ndimension;
} }
} }
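Both sides of the GetShmDims hunk grow a per-node sub-grid by peeling prime factors 2, 3 and 5 off successive lattice dimensions until the product equals the node's rank count; the (-) lines additionally allow an environment override and abort when the node size contains a prime the loop cannot place. A self-contained sketch of that decomposition under simplified names (plain vectors here stand in for Grid's Coordinate):

```cpp
#include <vector>
#include <cstdio>
#include <cstdlib>

// Factor shmSize across the dimensions of worldDims using primes 2,3,5,
// mirroring the auto-decomposition loop above; the failure check mimics
// the deleted (-) lines, bailing out when no dimension can grow further.
std::vector<int> GetShmDims(const std::vector<int> &worldDims, int shmSize)
{
  std::vector<int> shm(worldDims.size(), 1);
  const int primes[] = {2, 3, 5};
  int made = 1, dim = 0, stuck = 0;
  while (made != shmSize) {
    bool grew = false;
    for (int p : primes) {
      // p must still divide both the remaining dim extent and node size.
      if ((worldDims[dim] / shm[dim]) % p == 0 && (shmSize / made) % p == 0) {
        shm[dim] *= p; made *= p; grew = true; break;
      }
    }
    stuck = grew ? 0 : stuck + 1;
    if (stuck > (int)worldDims.size()) {    // cycled all dims with no progress
      fprintf(stderr, "GetShmDims failed\n");
      exit(EXIT_FAILURE);
    }
    dim = (dim + 1) % worldDims.size();
  }
  return shm;
}
```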
@@ -450,47 +413,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Hugetlbfs mapping intended
 ////////////////////////////////////////////////////////////////////////////////////////////
-#if defined(GRID_CUDA) ||defined(GRID_HIP) || defined(GRID_SYCL)
-//if defined(GRID_SYCL)
-#if 0
-void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
-{
-  void * ShmCommBuf ;
-  assert(_ShmSetup==1);
-  assert(_ShmAlloc==0);
-  //////////////////////////////////////////////////////////////////////////////////////////////////////////
-  // allocate the pointer array for shared windows for our group
-  //////////////////////////////////////////////////////////////////////////////////////////////////////////
-  MPI_Barrier(WorldShmComm);
-  WorldShmCommBufs.resize(WorldShmSize);
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Each MPI rank should allocate our own buffer
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
-  ShmCommBuf = acceleratorAllocDevice(bytes);
-  if (ShmCommBuf == (void *)NULL ) {
-    std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
-    exit(EXIT_FAILURE);
-  }
-  std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
-            << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
-  SharedMemoryZero(ShmCommBuf,bytes);
-  assert(WorldShmSize == 1);
-  for(int r=0;r<WorldShmSize;r++){
-    WorldShmCommBufs[r] = ShmCommBuf;
-  }
-  _ShmAllocBytes=bytes;
-  _ShmAlloc=1;
-}
-#endif
-#if defined(GRID_CUDA) ||defined(GRID_HIP) ||defined(GRID_SYCL)
+#ifdef GRID_NVCC
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
   void * ShmCommBuf ;
@@ -510,73 +433,47 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
   //////////////////////////////////////////////////////////////////////////////////////////////////////////
   // cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
-#ifdef GRID_IBM_SUMMIT
-  // IBM Jsrun makes cuda Device numbering screwy and not match rank
-  std::cout << "IBM Summit or similar - NOT setting device to WorldShmRank"<<std::endl;
-#else
-  std::cout << "setting device to WorldShmRank"<<std::endl;
-  cudaSetDevice(WorldShmRank);
-#endif
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
   // Each MPI rank should allocate our own buffer
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
-  ShmCommBuf = acceleratorAllocDevice(bytes);
+  auto err =  cudaMalloc(&ShmCommBuf, bytes);
+  if ( err !=  cudaSuccess) {
+    std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed for " << bytes<<" bytes " <<cudaGetErrorString(err)<< std::endl;
+    exit(EXIT_FAILURE);
+  }
   if (ShmCommBuf == (void *)NULL ) {
-    std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
+    std::cerr << " SharedMemoryMPI.cc cudaMallocManaged failed NULL pointer for " << bytes<<" bytes " << std::endl;
    exit(EXIT_FAILURE);
   }
   if ( WorldRank == 0 ){
-    std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
-              << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
+    std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
   }
   SharedMemoryZero(ShmCommBuf,bytes);
-  std::cout<< "Setting up IPC"<<std::endl;
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
   // Loop over ranks/gpu's on our node
   ///////////////////////////////////////////////////////////////////////////////////////////////////////////
   for(int r=0;r<WorldShmSize;r++){
-#ifndef GRID_MPI3_SHM_NONE
     //////////////////////////////////////////////////
     // If it is me, pass around the IPC access key
     //////////////////////////////////////////////////
-    void * thisBuf = ShmCommBuf;
-    if(!Stencil_force_mpi) {
-#ifdef GRID_SYCL_LEVEL_ZERO_IPC
-    typedef struct { int fd; pid_t pid ; } clone_mem_t;
-    auto zeDevice    = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
-    auto zeContext   = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
-    ze_ipc_mem_handle_t ihandle;
-    clone_mem_t handle;
-    if ( r==WorldShmRank ) {
-      auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle);
-      if ( err != ZE_RESULT_SUCCESS ) {
-        std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
-        exit(EXIT_FAILURE);
-      } else {
-        std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
-      }
-      memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
-      handle.pid = getpid();
-    }
-#endif
-#ifdef GRID_CUDA
     cudaIpcMemHandle_t handle;
     if ( r==WorldShmRank ) {
-      auto err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
+      err = cudaIpcGetMemHandle(&handle,ShmCommBuf);
       if ( err != cudaSuccess) {
        std::cerr << " SharedMemoryMPI.cc cudaIpcGetMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
        exit(EXIT_FAILURE);
       }
     }
-#endif
-#ifdef GRID_HIP
-    hipIpcMemHandle_t handle;
-    if ( r==WorldShmRank ) {
-      auto err = hipIpcGetMemHandle(&handle,ShmCommBuf);
-      if ( err != hipSuccess) {
-        std::cerr << " SharedMemoryMPI.cc hipIpcGetMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
-        exit(EXIT_FAILURE);
-      }
-    }
-#endif
     //////////////////////////////////////////////////
     // Share this IPC handle across the Shm Comm
     //////////////////////////////////////////////////
@@ -592,68 +489,23 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
     ///////////////////////////////////////////////////////////////
     // If I am not the source, overwrite thisBuf with remote buffer
     ///////////////////////////////////////////////////////////////
+    void * thisBuf = ShmCommBuf;
-#ifdef GRID_SYCL_LEVEL_ZERO_IPC
-    if ( r!=WorldShmRank ) {
-      thisBuf = nullptr;
-      std::cout<<"mapping seeking remote pid/fd "
-               <<handle.pid<<"/"
-               <<handle.fd<<std::endl;
-      int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
-      std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
-      //      int myfd  = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
-      int myfd  = syscall(438,pidfd,handle.fd,0);
-      std::cout<<"Using IpcHandle myfd "<<myfd<<"\n";
-      memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
-      auto err = zeMemOpenIpcHandle(zeContext,zeDevice,ihandle,0,&thisBuf);
-      if ( err != ZE_RESULT_SUCCESS ) {
-        std::cout << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
-        std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
-        exit(EXIT_FAILURE);
-      } else {
-        std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
-        std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle pointer is "<<std::hex<<thisBuf<<std::dec<<std::endl;
-      }
-      assert(thisBuf!=nullptr);
-    }
-#endif
-#ifdef GRID_CUDA
     if ( r!=WorldShmRank ) {
-      auto err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
+      err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
      if ( err != cudaSuccess) {
       std::cerr << " SharedMemoryMPI.cc cudaIpcOpenMemHandle failed for rank" << r <<" "<<cudaGetErrorString(err)<< std::endl;
       exit(EXIT_FAILURE);
      }
     }
-#endif
-#ifdef GRID_HIP
-    if ( r!=WorldShmRank ) {
-      auto err = hipIpcOpenMemHandle(&thisBuf,handle,hipIpcMemLazyEnablePeerAccess);
-      if ( err != hipSuccess) {
-        std::cerr << " SharedMemoryMPI.cc hipIpcOpenMemHandle failed for rank" << r <<" "<<hipGetErrorString(err)<< std::endl;
-        exit(EXIT_FAILURE);
-      }
-    }
-#endif
     ///////////////////////////////////////////////////////////////
     // Save a copy of the device buffers
     ///////////////////////////////////////////////////////////////
-    }
     WorldShmCommBufs[r] = thisBuf;
-#else
-    WorldShmCommBufs[r] = ShmCommBuf;
-#endif
   }
   _ShmAllocBytes=bytes;
   _ShmAlloc=1;
 }
-#endif
 #else
 #ifdef GRID_MPI3_SHMMMAP
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
@@ -781,6 +633,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 #endif
     void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
+    //    std::cout << "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< size<< "bytes)"<<std::endl;
     if ( ptr == (void * )MAP_FAILED ) {
      perror("failed mmap");
      assert(0);
@@ -824,16 +677,16 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 /////////////////////////////////////////////////////////////////////////
 void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
 {
-#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
-  acceleratorMemSet(dest,0,bytes);
+#ifdef GRID_NVCC
+  cudaMemset(dest,0,bytes);
 #else
   bzero(dest,bytes);
 #endif
 }
-void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
+void GlobalSharedMemory::SharedMemoryCopy(void *dest,const void *src,size_t bytes)
 {
-#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
-  acceleratorCopyToDevice(src,dest,bytes);
+#ifdef GRID_NVCC
+  cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
 #else
   bcopy(src,dest,bytes);
 #endif
@@ -852,11 +705,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
   /////////////////////////////////////////////////////////////////////
   // Split into groups that can share memory
   /////////////////////////////////////////////////////////////////////
-#ifndef GRID_MPI3_SHM_NONE
   MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm);
-#else
-  MPI_Comm_split(comm, rank, 0, &ShmComm);
-#endif
   MPI_Comm_rank(ShmComm      ,&ShmRank);
   MPI_Comm_size(ShmComm      ,&ShmSize);
   ShmCommBufs.resize(ShmSize);
@@ -886,18 +735,25 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
   std::vector<int> ranks(size);   for(int r=0;r<size;r++) ranks[r]=r;
   MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]);
-#ifdef GRID_SHM_FORCE_MPI
-  // Hide the shared memory path between ranks
-  {
-    for(int r=0;r<size;r++){
-      if ( r!=rank ) {
-        ShmRanks[r] = MPI_UNDEFINED;
-      }
-    }
-  }
+#ifdef GRID_IBM_SUMMIT
+  // Hide the shared memory path between sockets
+  // if even number of nodes
+  if ( (ShmSize & 0x1)==0 ) {
+    int SocketSize = ShmSize/2;
+    int mySocket = ShmRank/SocketSize;
+    for(int r=0;r<size;r++){
+      int hisRank=ShmRanks[r];
+      if ( hisRank!= MPI_UNDEFINED ) {
+        int hisSocket=hisRank/SocketSize;
+        if ( hisSocket != mySocket ) {
+          ShmRanks[r] = MPI_UNDEFINED;
+        }
+      }
+    }
+  }
 #endif
-  //SharedMemoryTest();
+  SharedMemoryTest();
 }
 //////////////////////////////////////////////////////////////////
 // On node barrier
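On the GPU path above, each rank allocates its comms buffer on its own device and then publishes it to node-local peers through CUDA IPC: the owner extracts a cudaIpcMemHandle_t, broadcasts it over the shared-memory communicator, and every other rank opens it to obtain a directly addressable device pointer. A stripped-down sketch of that exchange, assuming one GPU per rank (shmComm and map_peer_buffers are illustrative names, not Grid's):

```cpp
#include <mpi.h>
#include <cuda_runtime_api.h>
#include <vector>
#include <cstdlib>

// Map every node-local rank's device buffer via CUDA IPC handles.
std::vector<void*> map_peer_buffers(MPI_Comm shmComm, void *myBuf)
{
  int shmRank, shmSize;
  MPI_Comm_rank(shmComm, &shmRank);
  MPI_Comm_size(shmComm, &shmSize);
  std::vector<void*> bufs(shmSize);
  for (int r = 0; r < shmSize; r++) {
    cudaIpcMemHandle_t handle;
    if (r == shmRank)                       // owner extracts the access key
      if (cudaIpcGetMemHandle(&handle, myBuf) != cudaSuccess) exit(EXIT_FAILURE);
    // Share rank r's handle with everyone on the node.
    MPI_Bcast(&handle, sizeof(handle), MPI_BYTE, r, shmComm);
    void *thisBuf = myBuf;
    if (r != shmRank)                       // peers map the remote allocation
      if (cudaIpcOpenMemHandle(&thisBuf, handle,
                               cudaIpcMemLazyEnablePeerAccess) != cudaSuccess)
        exit(EXIT_FAILURE);
    bufs[r] = thisBuf;   // locally valid device pointer to rank r's buffer
  }
  return bufs;
}
```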

View File

@@ -29,7 +29,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/GridCore.h>
 NAMESPACE_BEGIN(Grid);
-#define header "SharedMemoryNone: "
 /*Construct from an MPI communicator*/
 void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
@@ -56,38 +55,6 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
 ////////////////////////////////////////////////////////////////////////////////////////////
 // Hugetlbfs mapping intended, use anonymous mmap
 ////////////////////////////////////////////////////////////////////////////////////////////
-#if 1
-void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
-{
-  std::cout << header "SharedMemoryAllocate "<< bytes<< " GPU implementation "<<std::endl;
-  void * ShmCommBuf ;
-  assert(_ShmSetup==1);
-  assert(_ShmAlloc==0);
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Each MPI rank should allocate our own buffer
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
-  ShmCommBuf = acceleratorAllocDevice(bytes);
-  if (ShmCommBuf == (void *)NULL ) {
-    std::cerr << " SharedMemoryNone.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
-    exit(EXIT_FAILURE);
-  }
-  if ( WorldRank == 0 ){
-    std::cout << WorldRank << header " SharedMemoryNone.cc acceleratorAllocDevice "<< bytes
-              << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
-  }
-  SharedMemoryZero(ShmCommBuf,bytes);
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Loop over ranks/gpu's on our node
-  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
-  WorldShmCommBufs[0] = ShmCommBuf;
-  _ShmAllocBytes=bytes;
-  _ShmAlloc=1;
-}
-#else
 void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 {
   void * ShmCommBuf ;
@@ -116,15 +83,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
   _ShmAllocBytes=bytes;
   _ShmAlloc=1;
 };
-#endif
-void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
-{
-  acceleratorMemSet(dest,0,bytes);
-}
-void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
-{
-  acceleratorCopyToDevice(src,dest,bytes);
-}
 ////////////////////////////////////////
 // Global shared functionality finished
 // Now move to per communicator functionality

View File

@@ -52,8 +52,23 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
-template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
-auto Cshift(const Expression &expr,int dim,int shift)  -> decltype(closure(expr))
+template<typename Op, typename T1>
+auto Cshift(const LatticeUnaryExpression<Op,T1> &expr,int dim,int shift)
+  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))>
+{
+  return Cshift(closure(expr),dim,shift);
+}
+template <class Op, class T1, class T2>
+auto Cshift(const LatticeBinaryExpression<Op,T1,T2> &expr,int dim,int shift)
+  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))>
+{
+  return Cshift(closure(expr),dim,shift);
+}
+template <class Op, class T1, class T2, class T3>
+auto Cshift(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr,int dim,int shift)
+  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),
+                                   eval(0, expr.arg2),
+                                   eval(0, expr.arg3)))>
 {
   return Cshift(closure(expr),dim,shift);
 }

View File

@@ -29,13 +29,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 NAMESPACE_BEGIN(Grid);
-extern Vector<std::pair<int,int> > Cshift_table;
 ///////////////////////////////////////////////////////////////////
 // Gather for when there is no need to SIMD split
 ///////////////////////////////////////////////////////////////////
 template<class vobj> void
-Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
+Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
 {
   int rd = rhs.Grid()->_rdimensions[dimension];
@@ -48,16 +46,16 @@ Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dim
   int e2=rhs.Grid()->_slice_block[dimension];
   int ent = 0;
-  if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
+  static Vector<std::pair<int,int> > table; table.resize(e1*e2);
   int stride=rhs.Grid()->_slice_stride[dimension];
+  auto rhs_v = rhs.View();
   if ( cbmask == 0x3 ) {
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
         int o  = n*stride;
         int bo = n*e2;
-        Cshift_table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
+        table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
       }
     }
   } else {
@@ -67,26 +65,14 @@ Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dim
         int o  = n*stride;
         int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
         if ( ocb &cbmask ) {
-          Cshift_table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
+          table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
         }
       }
     }
   }
-  {
-    auto buffer_p = & buffer[0];
-    auto table = &Cshift_table[0];
-#ifdef ACCELERATOR_CSHIFT
-    autoView(rhs_v , rhs, AcceleratorRead);
-    accelerator_for(i,ent,vobj::Nsimd(),{
-        coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
-    });
-#else
-    autoView(rhs_v , rhs, CpuRead);
-    thread_for(i,ent,{
-      buffer_p[table[i].first]=rhs_v[table[i].second];
-    });
-#endif
-  }
+  thread_for(i,ent,{
+    buffer[table[i].first]=rhs_v[table[i].second];
+  });
 }
 ///////////////////////////////////////////////////////////////////
@@ -109,80 +95,43 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
   int e2=rhs.Grid()->_slice_block[dimension];
   int n1=rhs.Grid()->_slice_stride[dimension];
+  auto rhs_v = rhs.View();
   if ( cbmask ==0x3){
-#ifdef ACCELERATOR_CSHIFT
-    autoView(rhs_v , rhs, AcceleratorRead);
-    accelerator_for(nn,e1*e2,1,{
-        int n = nn%e1;
-        int b = nn/e1;
+    thread_for_collapse(2,n,e1,{
+      for(int b=0;b<e2;b++){
        int o      = n*n1;
        int offset = b+n*e2;
        vobj temp =rhs_v[so+o+b];
        extract<vobj>(temp,pointers,offset);
-    });
+      }
+    });
-#else
-    autoView(rhs_v , rhs, CpuRead);
-    thread_for2d(n,e1,b,e2,{
-        int o      = n*n1;
-        int offset = b+n*e2;
-        vobj temp =rhs_v[so+o+b];
-        extract<vobj>(temp,pointers,offset);
-    });
-#endif
   } else {
-    Coordinate rdim=rhs.Grid()->_rdimensions;
-    Coordinate cdm =rhs.Grid()->_checker_dim_mask;
-    std::cout << " Dense packed buffer WARNING " <<std::endl;
+    // Does this get called twice once for each cb?
+    // Case of SIMD split AND checker dim cannot currently be hit, except in
+    // Test_cshift_red_black code.
+    std::cout << " Dense packed buffer WARNING " <<std::endl;
-#ifdef ACCELERATOR_CSHIFT
-    autoView(rhs_v , rhs, AcceleratorRead);
-    accelerator_for(nn,e1*e2,1,{
-        int n = nn%e1;
-        int b = nn/e1;
-        Coordinate coor;
+    thread_for_collapse(2,n,e1,{
+      for(int b=0;b<e2;b++){
        int o=n*n1;
-        int oindex = o+b;
-        int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
-        int ocb=1<<cb;
+       int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);
        int offset = b+n*e2;
        if ( ocb & cbmask ) {
          vobj temp =rhs_v[so+o+b];
          extract<vobj>(temp,pointers,offset);
        }
-    });
+      }
+    });
-#else
-    autoView(rhs_v , rhs, CpuRead);
-    thread_for2d(n,e1,b,e2,{
-        Coordinate coor;
-        int o=n*n1;
-        int oindex = o+b;
-        int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
-        int ocb=1<<cb;
-        int offset = b+n*e2;
-        if ( ocb & cbmask ) {
-          vobj temp =rhs_v[so+o+b];
-          extract<vobj>(temp,pointers,offset);
-        }
-    });
-#endif
   }
 }
 //////////////////////////////////////////////////////
 // Scatter for when there is no need to SIMD split
 //////////////////////////////////////////////////////
-template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask)
+template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
 {
   int rd = rhs.Grid()->_rdimensions[dimension];
@@ -196,8 +145,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<
   int e2=rhs.Grid()->_slice_block[dimension];
   int stride=rhs.Grid()->_slice_stride[dimension];
-  if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
+  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
   int ent   =0;
   if ( cbmask ==0x3 ) {
@@ -206,7 +154,7 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<
     for(int b=0;b<e2;b++){
       int o   =n*rhs.Grid()->_slice_stride[dimension];
       int bo  =n*rhs.Grid()->_slice_block[dimension];
-      Cshift_table[ent++] = std::pair<int,int>(so+o+b,bo+b);
+      table[ent++] = std::pair<int,int>(so+o+b,bo+b);
     }
   }
@@ -217,27 +165,16 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<
       int o   =n*rhs.Grid()->_slice_stride[dimension];
       int ocb=1<<rhs.Grid()->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
       if ( ocb & cbmask ) {
-        Cshift_table[ent++]=std::pair<int,int> (so+o+b,bo++);
+        table[ent++]=std::pair<int,int> (so+o+b,bo++);
      }
    }
   }
 }
-  {
-    auto buffer_p = & buffer[0];
-    auto table = &Cshift_table[0];
-#ifdef ACCELERATOR_CSHIFT
-    autoView( rhs_v, rhs, AcceleratorWrite);
-    accelerator_for(i,ent,vobj::Nsimd(),{
-        coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
-    });
-#else
-    autoView( rhs_v, rhs, CpuWrite);
-    thread_for(i,ent,{
-      rhs_v[table[i].first]=buffer_p[table[i].second];
-    });
-#endif
-  }
+  auto rhs_v = rhs.View();
+  thread_for(i,ent,{
+    rhs_v[table[i].first]=buffer[table[i].second];
+  });
 }
 //////////////////////////////////////////////////////
@@ -257,33 +194,21 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
   int e2=rhs.Grid()->_slice_block[dimension];
   if(cbmask ==0x3 ) {
-    int _slice_stride = rhs.Grid()->_slice_stride[dimension];
-    int _slice_block = rhs.Grid()->_slice_block[dimension];
-#ifdef ACCELERATOR_CSHIFT
-    autoView( rhs_v , rhs, AcceleratorWrite);
-    accelerator_for(nn,e1*e2,1,{
-        int n = nn%e1;
-        int b = nn/e1;
-        int o      = n*_slice_stride;
-        int offset = b+n*_slice_block;
-        merge(rhs_v[so+o+b],pointers,offset);
-    });
-#else
-    autoView( rhs_v , rhs, CpuWrite);
-    thread_for2d(n,e1,b,e2,{
-        int o      = n*_slice_stride;
-        int offset = b+n*_slice_block;
+    auto rhs_v = rhs.View();
+    thread_for_collapse(2,n,e1,{
+      for(int b=0;b<e2;b++){
+        int o      = n*rhs.Grid()->_slice_stride[dimension];
+        int offset = b+n*rhs.Grid()->_slice_block[dimension];
        merge(rhs_v[so+o+b],pointers,offset);
+      }
     });
-#endif
   } else {
     // Case of SIMD split AND checker dim cannot currently be hit, except in
     // Test_cshift_red_black code.
-    std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
+    //    std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
     std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
-    assert(0); // This will fail if hit on GPU
-    autoView( rhs_v, rhs, CpuWrite);
+    auto rhs_v = rhs.View();
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
        int o      = n*rhs.Grid()->_slice_stride[dimension];
@@ -300,7 +225,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
 //////////////////////////////////////////////////////
 // local to node block strided copies
 //////////////////////////////////////////////////////
 template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
 {
   int rd = rhs.Grid()->_rdimensions[dimension];
@@ -315,16 +239,14 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
   int e1=rhs.Grid()->_slice_nblock[dimension]; // clearly loop invariant for icpc
   int e2=rhs.Grid()->_slice_block[dimension];
   int stride = rhs.Grid()->_slice_stride[dimension];
-  if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
+  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
   int ent=0;
   if(cbmask == 0x3 ){
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
        int o =n*stride+b;
-        Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
+        table[ent++] = std::pair<int,int>(lo+o,ro+o);
       }
     }
   } else {
@@ -333,32 +255,23 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
       int o =n*stride+b;
       int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o);
       if ( ocb&cbmask ) {
-        Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
+        table[ent++] = std::pair<int,int>(lo+o,ro+o);
      }
    }
   }
 }
-  {
-    auto table = &Cshift_table[0];
-#ifdef ACCELERATOR_CSHIFT
-    autoView(rhs_v , rhs, AcceleratorRead);
-    autoView(lhs_v , lhs, AcceleratorWrite);
-    accelerator_for(i,ent,vobj::Nsimd(),{
-        coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
-    });
-#else
-    autoView(rhs_v , rhs, CpuRead);
-    autoView(lhs_v , lhs, CpuWrite);
-    thread_for(i,ent,{
-      lhs_v[table[i].first]=rhs_v[table[i].second];
-    });
-#endif
-  }
+  auto rhs_v = rhs.View();
+  auto lhs_v = lhs.View();
+  thread_for(i,ent,{
+    lhs_v[table[i].first]=rhs_v[table[i].second];
+  });
 }
 template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
 {
   int rd = rhs.Grid()->_rdimensions[dimension];
   if ( !rhs.Grid()->CheckerBoarded(dimension) ) {
@@ -372,41 +285,29 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
   int e2=rhs.Grid()->_slice_block [dimension];
   int stride = rhs.Grid()->_slice_stride[dimension];
-  if(Cshift_table.size()<e1*e2) Cshift_table.resize(e1*e2); // Let it grow to biggest
+  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
   int ent=0;
   if ( cbmask == 0x3 ) {
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
        int o  =n*stride;
-        Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
+        table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
       }}
   } else {
     for(int n=0;n<e1;n++){
       for(int b=0;b<e2;b++){
        int o  =n*stride;
        int ocb=1<<lhs.Grid()->CheckerBoardFromOindex(o+b);
-        if ( ocb&cbmask ) Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
+        if ( ocb&cbmask ) table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
       }}
   }
-  {
-    auto table = &Cshift_table[0];
-#ifdef ACCELERATOR_CSHIFT
-    autoView( rhs_v, rhs, AcceleratorRead);
-    autoView( lhs_v, lhs, AcceleratorWrite);
-    accelerator_for(i,ent,1,{
-        permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
-    });
-#else
-    autoView( rhs_v, rhs, CpuRead);
-    autoView( lhs_v, lhs, CpuWrite);
-    thread_for(i,ent,{
-      permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
-    });
-#endif
-  }
+  auto rhs_v = rhs.View();
+  auto lhs_v = lhs.View();
+  thread_for(i,ent,{
+    permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
+  });
 }
 //////////////////////////////////////////////////////
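Every routine in this file follows one idiom that survives on both sides of the diff: build a table of (destination, source) index pairs for the plane once, then replay it with a flat loop whose iterations are independent, so it can run under thread_for or, in the deleted code, accelerator_for on device. A minimal host-only sketch of the idiom (copy_plane and its parameters are illustrative):

```cpp
#include <vector>
#include <utility>

// Build the copy plan once, then replay it as a flat loop whose entries
// are independent -- the property thread_for/accelerator_for rely on.
template<class T>
void copy_plane(std::vector<T> &lhs, const std::vector<T> &rhs,
                int planes, int stride, int lo, int ro)
{
  static std::vector<std::pair<int,int>> table;    // let it grow to biggest
  if (table.size() < (size_t)planes) table.resize(planes);
  int ent = 0;
  for (int n = 0; n < planes; n++)
    table[ent++] = { lo + n*stride, ro + n*stride };  // (dst, src)
  // Replay: a plain loop here, a parallel-for in Grid.
  for (int i = 0; i < ent; i++)
    lhs[table[i].first] = rhs[table[i].second];
}
```

Separating plan construction from execution is what lets the same table drive a host-threaded loop or a coalesced GPU kernel without touching the index logic.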

View File

@@ -101,8 +101,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
} }
} }
#define ACCELERATOR_CSHIFT_NO_COPY
#ifdef ACCELERATOR_CSHIFT_NO_COPY
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask) template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{ {
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
@@ -122,9 +121,9 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
assert(shift<fd); assert(shift<fd);
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension]; int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static cshiftVector<vobj> send_buf; send_buf.resize(buffer_size); commVector<vobj> send_buf(buffer_size);
static cshiftVector<vobj> recv_buf; recv_buf.resize(buffer_size); commVector<vobj> recv_buf(buffer_size);
int cb= (cbmask==0x2)? Odd : Even; int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb); int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
@@ -139,7 +138,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
} else { } else {
int words = buffer_size; int words = send_buf.size();
if (cbmask != 0x3) words=words>>1; if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj); int bytes = words * sizeof(vobj);
@@ -151,14 +150,12 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int xmit_to_rank; int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
grid->Barrier();
grid->SendToRecvFrom((void *)&send_buf[0], grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank, xmit_to_rank,
(void *)&recv_buf[0], (void *)&recv_buf[0],
recv_from_rank, recv_from_rank,
bytes); bytes);
grid->Barrier(); grid->Barrier();
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask); Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
@@ -198,15 +195,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type); // int words = sizeof(vobj)/sizeof(vector_type);
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd); std::vector<commVector<scalar_object> > send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd); std::vector<commVector<scalar_object> > recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
int bytes = buffer_size*sizeof(scalar_object); int bytes = buffer_size*sizeof(scalar_object);
@@ -252,204 +242,11 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
if(nbr_proc){ if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
grid->Barrier(); grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0],
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
recv_buf_extract_mpi = &recv_buf_extract[i][0];
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank, xmit_to_rank,
(void *)recv_buf_extract_mpi, (void *)&recv_buf_extract[i][0],
recv_from_rank, recv_from_rank,
bytes); bytes);
grid->Barrier();
rpointers[i] = &recv_buf_extract[i][0];
} else {
rpointers[i] = &send_buf_extract[nbr_lane][0];
}
}
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
}
}
#else
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid=rhs.Grid();
Lattice<vobj> temp(rhs.Grid());
int fd = rhs.Grid()->_fdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
int pd = rhs.Grid()->_processors[dimension];
int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
vobj *send_buf;
vobj *recv_buf;
{
grid->ShmBufferFreeAll();
size_t bytes = buffer_size*sizeof(vobj);
send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
}
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) {
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
} else {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
// int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
grid->Barrier();
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
}
}
}
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GridBase *grid=rhs.Grid();
const int Nsimd = grid->Nsimd();
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
int pd = grid->_processors[dimension];
int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ;
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
int permute_type=grid->PermuteType(dimension);
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type);
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
{
size_t bytes = sizeof(scalar_object)*buffer_size;
grid->ShmBufferFreeAll();
send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
}
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
int bytes = buffer_size*sizeof(scalar_object);
ExtractPointerArray<scalar_object>  pointers(Nsimd); // input pointers
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? Odd : Even;
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
// loop over outer coord planes orthog to dim
for(int x=0;x<rd;x++){
// FIXME call local permute copy if none are offnode.
for(int i=0;i<Nsimd;i++){
pointers[i] = &send_buf_extract[i][0];
}
int sx = (x+sshift)%rd;
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
for(int i=0;i<Nsimd;i++){
int inner_bit = (Nsimd>>(permute_type+1));
int ic= (i&inner_bit)? 1:0;
int my_coor = rd*ic + x;
int nbr_coor = my_coor+sshift;
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
int nbr_ox = (nbr_coor%rd); // outer coord of peer
int nbr_lane = (i&(~inner_bit));
int recv_from_rank;
int xmit_to_rank;
if (nbr_ic) nbr_lane|=inner_bit;
assert (sx == nbr_ox);
if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank,
(void *)recv_buf_extract_mpi,
recv_from_rank,
bytes);
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
grid->Barrier();
rpointers[i] = &recv_buf_extract[i][0];
} else {
@@ -461,7 +258,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
}
}
-#endif
NAMESPACE_END(Grid);
#endif
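// Illustrative sketch (not part of this diff): both Cshift_comms variants above
// split a shift into an on-node copy plus nearest-rank exchanges. Assuming a
// local extent rd=4 on pd=2 ranks and a shift of 5, each outer plane x computes
//   sx        = (x+5) % 4;        // source plane inside the node
//   comm_proc = ((x+5)/4) % 2;    // how many ranks away the data lives
// Planes with comm_proc==0 are served by Copy_plane; the rest go through
// Gather_plane / SendToRecvFrom / Scatter_plane as shown above.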


@@ -1,4 +0,0 @@
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
Vector<std::pair<int,int> > Cshift_table;
NAMESPACE_END(Grid);


@@ -26,7 +26,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
*************************************************************************************/
/* END LEGAL */
#pragma once
-#include <Grid/lattice/Lattice_view.h>
#include <Grid/lattice/Lattice_base.h>
#include <Grid/lattice/Lattice_conformable.h>
#include <Grid/lattice/Lattice_ET.h>
@@ -36,8 +35,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_local.h>
#include <Grid/lattice/Lattice_reduction.h>
#include <Grid/lattice/Lattice_peekpoke.h>
-#include <Grid/lattice/Lattice_reality.h>
-#include <Grid/lattice/Lattice_real_imag.h>
+//#include <Grid/lattice/Lattice_reality.h>
#include <Grid/lattice/Lattice_comparison_utils.h>
#include <Grid/lattice/Lattice_comparison.h>
#include <Grid/lattice/Lattice_coordinate.h>
@@ -46,4 +44,3 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_unary.h>
#include <Grid/lattice/Lattice_transfer.h>
#include <Grid/lattice/Lattice_basis.h>
-#include <Grid/lattice/Lattice_crc.h>


@@ -42,24 +42,9 @@ NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////
// Predicated where support
////////////////////////////////////////////////////
-#ifdef GRID_SIMT
-// drop to scalar in SIMT; cleaner in fact
template <class iobj, class vobj, class robj>
-accelerator_inline vobj predicatedWhere(const iobj &predicate,
-                                        const vobj &iftrue,
-                                        const robj &iffalse)
-{
-  Integer mask = TensorRemove(predicate);
-  typename std::remove_const<vobj>::type ret= iffalse;
-  if (mask) ret=iftrue;
-  return ret;
-}
-#else
-template <class iobj, class vobj, class robj>
-accelerator_inline vobj predicatedWhere(const iobj &predicate,
-                                        const vobj &iftrue,
-                                        const robj &iffalse)
-{
+accelerator_inline vobj predicatedWhere(const iobj &predicate, const vobj &iftrue,
+                                        const robj &iffalse) {
typename std::remove_const<vobj>::type ret;
typedef typename vobj::scalar_object scalar_object;
@@ -83,7 +68,6 @@ accelerator_inline vobj predicatedWhere(const iobj &predicate,
merge(ret, falsevals);
return ret;
}
-#endif
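// Illustrative sketch (not part of this diff): how predicatedWhere is reached
// from user code via where(). Assumes the comparison and where() overloads that
// appear later in this diff.
//   LatticeComplex a(grid), b(grid), r(grid);
//   LatticeInteger coor(grid);
//   LatticeCoordinate(coor, 0);              // x coordinate at every site
//   r = where(coor < Integer(2), a, b);      // per-site select: a where x<2, else b
// Each site evaluates TrinaryWhere::func, which forwards the vectorised
// predicate and both candidate values to predicatedWhere above.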
/////////////////////////////////////////////////////
//Specialization of getVectorType for lattices
@@ -97,62 +81,26 @@ struct getVectorType<Lattice<T> >{
//-- recursive evaluation of expressions; --
// handle leaves of syntax tree
///////////////////////////////////////////////////
-template<class sobj,
-	 typename std::enable_if<!is_lattice<sobj>::value&&!is_lattice_expr<sobj>::value,sobj>::type * = nullptr>
-accelerator_inline
+template<class sobj> accelerator_inline
sobj eval(const uint64_t ss, const sobj &arg)
{
return arg;
}
template <class lobj> accelerator_inline
auto eval(const uint64_t ss, const LatticeView<lobj> &arg) -> decltype(arg(ss))
{
return arg(ss);
}
////////////////////////////////////////////
//-- recursive evaluation of expressions; --
// whole vector return, used only for expression return type inference
///////////////////////////////////////////////////
template<class sobj> accelerator_inline
sobj vecEval(const uint64_t ss, const sobj &arg)
{
return arg;
}
template <class lobj> accelerator_inline
-const lobj & vecEval(const uint64_t ss, const LatticeView<lobj> &arg)
+const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg)
{
return arg[ss];
}
template <class lobj> accelerator_inline
const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg)
{
auto view = arg.AcceleratorView(ViewRead);
return view[ss];
}
///////////////////////////////////////////////////
// handle nodes in syntax tree- eval one operand
// vecEval is needed (though never called, since all expressions are offloaded) to
// infer the whole-vector return type of closure in SIMT contexts.
///////////////////////////////////////////////////
template <typename Op, typename T1> accelerator_inline
auto vecEval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)
-> decltype(expr.op.func( vecEval(ss, expr.arg1)))
{
return expr.op.func( vecEval(ss, expr.arg1) );
}
// vecEval two operands
template <typename Op, typename T1, typename T2> accelerator_inline
auto vecEval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)
-> decltype(expr.op.func( vecEval(ss,expr.arg1),vecEval(ss,expr.arg2)))
{
return expr.op.func( vecEval(ss,expr.arg1), vecEval(ss,expr.arg2) );
}
// vecEval three operands
template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
auto vecEval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-> decltype(expr.op.func(vecEval(ss, expr.arg1), vecEval(ss, expr.arg2), vecEval(ss, expr.arg3)))
{
return expr.op.func(vecEval(ss, expr.arg1), vecEval(ss, expr.arg2), vecEval(ss, expr.arg3));
}
///////////////////////////////////////////////////
// handle nodes in syntax tree- eval one operand coalesced
///////////////////////////////////////////////////
template <typename Op, typename T1> accelerator_inline
auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)
@@ -160,41 +108,23 @@ auto eval(const uint64_t ss, const LatticeUnaryExpression<Op, T1> &expr)
{
return expr.op.func( eval(ss, expr.arg1) );
}
-///////////////////////
// eval two operands
-///////////////////////
template <typename Op, typename T1, typename T2> accelerator_inline
auto eval(const uint64_t ss, const LatticeBinaryExpression<Op, T1, T2> &expr)
-> decltype(expr.op.func( eval(ss,expr.arg1),eval(ss,expr.arg2)))
{
return expr.op.func( eval(ss,expr.arg1), eval(ss,expr.arg2) );
}
-///////////////////////
// eval three operands
-///////////////////////
template <typename Op, typename T1, typename T2, typename T3> accelerator_inline
auto eval(const uint64_t ss, const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-  -> decltype(expr.op.func(eval(ss, expr.arg1),
-			   eval(ss, expr.arg2),
-			   eval(ss, expr.arg3)))
+  -> decltype(expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3)))
{
-#ifdef GRID_SIMT
-  // Handles Nsimd (vInteger) != Nsimd(ComplexD)
-  typedef decltype(vecEval(ss, expr.arg2)) rvobj;
-  typedef typename std::remove_reference<rvobj>::type vobj;
-  const int Nsimd = vobj::vector_type::Nsimd();
-  auto vpred = vecEval(ss,expr.arg1);
-  ExtractBuffer<Integer> mask(Nsimd);
-  extract<vInteger, Integer>(TensorRemove(vpred), mask);
-  int s = acceleratorSIMTlane(Nsimd);
-  return expr.op.func(mask[s],
-		      eval(ss, expr.arg2),
-		      eval(ss, expr.arg3));
-#else
-  return expr.op.func(eval(ss, expr.arg1),
-		      eval(ss, expr.arg2),
-		      eval(ss, expr.arg3));
-#endif
+  return expr.op.func(eval(ss, expr.arg1), eval(ss, expr.arg2), eval(ss, expr.arg3));
}
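// Illustrative sketch (not part of this diff): how the recursive eval unwinds.
// Assuming LatticeComplex fields a,b,c on the same grid, the expression below
// builds a nested LatticeBinaryExpression AST; eval(ss,...) at each outer site
// then expands to
//   BinaryAdd::func( BinaryMul::func(eval(ss,a), eval(ss,b)), eval(ss,c) )
// so no intermediate lattice temporaries are materialised:
//   LatticeComplex a(grid), b(grid), c(grid), r(grid);
//   r = a*b + c;   // one fused site loop, driven by the eval() overloads above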
//////////////////////////////////////////////////////////////////////////
@@ -250,12 +180,16 @@ inline void CBFromExpression(int &cb, const T1 &lat) // Lattice leaf
cb = lat.Checkerboard();
}
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
-inline void CBFromExpression(int &cb, const T1 &notlat) {} // non-lattice leaf
+inline void CBFromExpression(int &cb, const T1 &notlat) // non-lattice leaf
+{
+}
template <typename Op, typename T1> inline
void CBFromExpression(int &cb,const LatticeUnaryExpression<Op, T1> &expr)
{
CBFromExpression(cb, expr.arg1); // recurse AST
}
template <typename Op, typename T1, typename T2> inline
void CBFromExpression(int &cb,const LatticeBinaryExpression<Op, T1, T2> &expr)
{
@@ -270,86 +204,32 @@ inline void CBFromExpression(int &cb, const LatticeTrinaryExpression<Op, T1, T2,
CBFromExpression(cb, expr.arg3); // recurse AST
}
//////////////////////////////////////////////////////////////////////////
// ViewOpen
//////////////////////////////////////////////////////////////////////////
template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
inline void ExpressionViewOpen(T1 &lat) // Lattice leaf
{
lat.ViewOpen(AcceleratorRead);
}
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void ExpressionViewOpen(T1 &notlat) {}
template <typename Op, typename T1> inline
void ExpressionViewOpen(LatticeUnaryExpression<Op, T1> &expr)
{
ExpressionViewOpen(expr.arg1); // recurse AST
}
template <typename Op, typename T1, typename T2> inline
void ExpressionViewOpen(LatticeBinaryExpression<Op, T1, T2> &expr)
{
ExpressionViewOpen(expr.arg1); // recurse AST
ExpressionViewOpen(expr.arg2); // recurse AST
}
template <typename Op, typename T1, typename T2, typename T3>
inline void ExpressionViewOpen(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
{
ExpressionViewOpen(expr.arg1); // recurse AST
ExpressionViewOpen(expr.arg2); // recurse AST
ExpressionViewOpen(expr.arg3); // recurse AST
}
//////////////////////////////////////////////////////////////////////////
// ViewClose
//////////////////////////////////////////////////////////////////////////
template <class T1,typename std::enable_if<is_lattice<T1>::value, T1>::type * = nullptr>
inline void ExpressionViewClose( T1 &lat) // Lattice leaf
{
lat.ViewClose();
}
template <class T1,typename std::enable_if<!is_lattice<T1>::value, T1>::type * = nullptr>
inline void ExpressionViewClose(T1 &notlat) {}
template <typename Op, typename T1> inline
void ExpressionViewClose(LatticeUnaryExpression<Op, T1> &expr)
{
ExpressionViewClose(expr.arg1); // recurse AST
}
template <typename Op, typename T1, typename T2> inline
void ExpressionViewClose(LatticeBinaryExpression<Op, T1, T2> &expr)
{
ExpressionViewClose(expr.arg1); // recurse AST
ExpressionViewClose(expr.arg2); // recurse AST
}
template <typename Op, typename T1, typename T2, typename T3>
inline void ExpressionViewClose(LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
{
ExpressionViewClose(expr.arg1); // recurse AST
ExpressionViewClose(expr.arg2); // recurse AST
ExpressionViewClose(expr.arg3); // recurse AST
}
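// Illustrative sketch (not part of this diff): the open/close pair brackets the
// lifetime of device views for every lattice leaf in an expression, e.g.
//   auto exprCopy = expr;            // copy the AST so leaves can be opened as views
//   ExpressionViewOpen(exprCopy);    // recursively open AcceleratorRead views
//   /* accelerator_for loop calling eval(ss,exprCopy) */
//   ExpressionViewClose(exprCopy);   // recursively release the views
// This is the same pattern used by the Lattice operator= bodies (old side)
// further down this diff in Lattice_base.h.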
////////////////////////////////////////////
// Unary operators and funcs
////////////////////////////////////////////
#define GridUnopClass(name, ret)					\
+  template <class arg>							\
struct name {								\
-  template<class _arg> static auto accelerator_inline func(const _arg a) -> decltype(ret) { return ret; } \
+  static auto accelerator_inline func(const arg a) -> decltype(ret) { return ret; } \
};
GridUnopClass(UnarySub, -a);
GridUnopClass(UnaryNot, Not(a));
+GridUnopClass(UnaryAdj, adj(a));
+GridUnopClass(UnaryConj, conjugate(a));
GridUnopClass(UnaryTrace, trace(a));
GridUnopClass(UnaryTranspose, transpose(a));
GridUnopClass(UnaryTa, Ta(a));
GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
+GridUnopClass(UnaryReal, real(a));
+GridUnopClass(UnaryImag, imag(a));
+GridUnopClass(UnaryToReal, toReal(a));
+GridUnopClass(UnaryToComplex, toComplex(a));
GridUnopClass(UnaryTimesI, timesI(a));
GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
GridUnopClass(UnaryAbs, abs(a));
GridUnopClass(UnarySqrt, sqrt(a));
+GridUnopClass(UnaryRsqrt, rsqrt(a));
GridUnopClass(UnarySin, sin(a));
GridUnopClass(UnaryCos, cos(a));
GridUnopClass(UnaryAsin, asin(a));
@@ -361,10 +241,10 @@ GridUnopClass(UnaryExp, exp(a));
// Binary operators
////////////////////////////////////////////
#define GridBinOpClass(name, combination)	\
+  template <class left, class right>		\
struct name {					\
-  template <class _left, class _right>		\
static auto accelerator_inline			\
-  func(const _left &lhs, const _right &rhs)	\
+  func(const left &lhs, const right &rhs)	\
-> decltype(combination) const			\
{						\
return combination;				\
@@ -384,10 +264,10 @@ GridBinOpClass(BinaryOrOr, lhs || rhs);
// Trinary conditional op
////////////////////////////////////////////////////
#define GridTrinOpClass(name, combination)			\
+  template <class predicate, class left, class right>		\
struct name {							\
-  template <class _predicate,class _left, class _right>	\
static auto accelerator_inline					\
-  func(const _predicate &pred, const _left &lhs, const _right &rhs) \
+  func(const predicate &pred, const left &lhs, const right &rhs) \
-> decltype(combination) const					\
{								\
return combination;						\
@@ -395,17 +275,17 @@ GridBinOpClass(BinaryOrOr, lhs || rhs);
};
GridTrinOpClass(TrinaryWhere,
-		(predicatedWhere<
-		 typename std::remove_reference<_predicate>::type,
-		 typename std::remove_reference<_left>::type,
-		 typename std::remove_reference<_right>::type>(pred, lhs,rhs)));
+		(predicatedWhere<predicate,
+		 typename std::remove_reference<left>::type,
+		 typename std::remove_reference<right>::type>(pred, lhs,rhs)));
////////////////////////////////////////////
// Operator syntactical glue
////////////////////////////////////////////
-#define GRID_UNOP(name)   name
-#define GRID_BINOP(name)  name
-#define GRID_TRINOP(name) name
+#define GRID_UNOP(name)   name<decltype(eval(0, arg))>
+#define GRID_BINOP(name)  name<decltype(eval(0, lhs)), decltype(eval(0, rhs))>
+#define GRID_TRINOP(name) name<decltype(eval(0, pred)), decltype(eval(0, lhs)), decltype(eval(0, rhs))>
#define GRID_DEF_UNOP(op, name) \
template <typename T1, typename std::enable_if<is_lattice<T1>::value||is_lattice_expr<T1>::value,T1>::type * = nullptr> \
@@ -451,17 +331,22 @@ GridTrinOpClass(TrinaryWhere,
GRID_DEF_UNOP(operator-, UnarySub);
GRID_DEF_UNOP(Not, UnaryNot);
GRID_DEF_UNOP(operator!, UnaryNot);
-//GRID_DEF_UNOP(adj, UnaryAdj);
-//GRID_DEF_UNOP(conjugate, UnaryConj);
+GRID_DEF_UNOP(adj, UnaryAdj);
+GRID_DEF_UNOP(conjugate, UnaryConj);
GRID_DEF_UNOP(trace, UnaryTrace);
GRID_DEF_UNOP(transpose, UnaryTranspose);
GRID_DEF_UNOP(Ta, UnaryTa);
GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
+GRID_DEF_UNOP(real, UnaryReal);
+GRID_DEF_UNOP(imag, UnaryImag);
+GRID_DEF_UNOP(toReal, UnaryToReal);
+GRID_DEF_UNOP(toComplex, UnaryToComplex);
GRID_DEF_UNOP(timesI, UnaryTimesI);
GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
GRID_DEF_UNOP(abs, UnaryAbs); // abs overloaded in cmath C++98; DON'T do the
// abs-fabs-dabs-labs thing
GRID_DEF_UNOP(sqrt, UnarySqrt);
+GRID_DEF_UNOP(rsqrt, UnaryRsqrt);
GRID_DEF_UNOP(sin, UnarySin);
GRID_DEF_UNOP(cos, UnaryCos);
GRID_DEF_UNOP(asin, UnaryAsin);
@@ -486,36 +371,29 @@ GRID_DEF_TRINOP(where, TrinaryWhere);
/////////////////////////////////////////////////////////////
template <class Op, class T1>
auto closure(const LatticeUnaryExpression<Op, T1> &expr)
-  -> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1)))>::type >
+  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1)))>
{
-  Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1)))>::type > ret(expr);
+  Lattice<decltype(expr.op.func(eval(0, expr.arg1)))> ret(expr);
return ret;
}
template <class Op, class T1, class T2>
auto closure(const LatticeBinaryExpression<Op, T1, T2> &expr)
-  -> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))>::type >
+  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))>
{
-  Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),vecEval(0, expr.arg2)))>::type > ret(expr);
+  Lattice<decltype(expr.op.func(eval(0, expr.arg1),eval(0, expr.arg2)))> ret(expr);
return ret;
}
template <class Op, class T1, class T2, class T3>
auto closure(const LatticeTrinaryExpression<Op, T1, T2, T3> &expr)
-  -> Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),
-							       vecEval(0, expr.arg2),
-							       vecEval(0, expr.arg3)))>::type >
+  -> Lattice<decltype(expr.op.func(eval(0, expr.arg1),
+				   eval(0, expr.arg2),
+				   eval(0, expr.arg3)))>
{
-  Lattice<typename std::remove_const<decltype(expr.op.func(vecEval(0, expr.arg1),
-							    vecEval(0, expr.arg2),
-							    vecEval(0, expr.arg3)))>::type > ret(expr);
+  Lattice<decltype(expr.op.func(eval(0, expr.arg1),
+				eval(0, expr.arg2),
+				eval(0, expr.arg3)))> ret(expr);
return ret;
}
-#define EXPRESSION_CLOSURE(function) \
-template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr> \
-auto function(Expression &expr) -> decltype(function(closure(expr))) \
-{ \
-  return function(closure(expr)); \
-}
#undef GRID_UNOP
#undef GRID_BINOP
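// Illustrative sketch (not part of this diff): closure() forces an expression
// template back into a concrete Lattice, needed whenever an ET expression is
// handed to a routine expecting a Lattice argument, e.g.
//   auto expr = a*b + c;              // lightweight AST, no site loop yet
//   auto L    = closure(expr);        // materialises the Lattice and runs the loop
//   RealD n   = norm2(closure(a*b));  // reductions take a Lattice, not an AST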


@@ -37,9 +37,9 @@ NAMESPACE_BEGIN(Grid);
template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard();
-  autoView( ret_v , ret, AcceleratorWrite);
-  autoView( lhs_v , lhs, AcceleratorRead);
-  autoView( rhs_v , rhs, AcceleratorRead);
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
+  auto rhs_v = rhs.AcceleratorView(ViewRead);
conformable(ret,rhs);
conformable(lhs,rhs);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
@@ -56,13 +56,13 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs);
conformable(lhs,rhs);
-  autoView( ret_v , ret, AcceleratorWrite);
-  autoView( lhs_v , lhs, AcceleratorRead);
-  autoView( rhs_v , rhs, AcceleratorRead);
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
+  auto rhs_v = rhs.AcceleratorView(ViewRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
+    decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
auto rhs_t=rhs_v(ss);
-    auto tmp =ret_v(ss);
mac(&tmp,&lhs_t,&rhs_t);
coalescedWrite(ret_v[ss],tmp);
});
@@ -73,9 +73,9 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs);
conformable(lhs,rhs);
-  autoView( ret_v , ret, AcceleratorWrite);
-  autoView( lhs_v , lhs, AcceleratorRead);
-  autoView( rhs_v , rhs, AcceleratorRead);
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
+  auto rhs_v = rhs.AcceleratorView(ViewRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
@@ -89,9 +89,9 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,rhs);
conformable(lhs,rhs);
-  autoView( ret_v , ret, AcceleratorWrite);
-  autoView( lhs_v , lhs, AcceleratorRead);
-  autoView( rhs_v , rhs, AcceleratorRead);
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
+  auto rhs_v = rhs.AcceleratorView(ViewRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
@@ -108,8 +108,8 @@ template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(lhs,ret);
-  autoView( ret_v , ret, AcceleratorWrite);
-  autoView( lhs_v , lhs, AcceleratorRead);
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
mult(&tmp,&lhs_v(ss),&rhs);
@@ -121,10 +121,10 @@ template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,lhs);
-  autoView( ret_v , ret, AcceleratorWrite);
-  autoView( lhs_v , lhs, AcceleratorRead);
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
-    auto tmp =ret_v(ss);
+    decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
mac(&tmp,&lhs_t,&rhs);
coalescedWrite(ret_v[ss],tmp);
@@ -135,8 +135,8 @@ template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(ret,lhs);
-  autoView( ret_v , ret, AcceleratorWrite);
-  autoView( lhs_v , lhs, AcceleratorRead);
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
@@ -148,8 +148,8 @@ template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
ret.Checkerboard() = lhs.Checkerboard();
conformable(lhs,ret);
-  autoView( ret_v , ret, AcceleratorWrite);
-  autoView( lhs_v , lhs, AcceleratorRead);
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto lhs_v = lhs.AcceleratorView(ViewRead);
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto lhs_t=lhs_v(ss);
@@ -165,8 +165,8 @@ template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
-  autoView( ret_v , ret, AcceleratorWrite);
-  autoView( rhs_v , lhs, AcceleratorRead);
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto rhs_v = lhs.AcceleratorView(ViewRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss);
@@ -179,10 +179,10 @@ template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
-  autoView( ret_v , ret, AcceleratorWrite);
-  autoView( rhs_v , lhs, AcceleratorRead);
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto rhs_v = lhs.AcceleratorView(ViewRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
-    auto tmp =ret_v(ss);
+    decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss);
mac(&tmp,&lhs,&rhs_t);
coalescedWrite(ret_v[ss],tmp);
@@ -193,8 +193,8 @@ template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
-  autoView( ret_v , ret, AcceleratorWrite);
-  autoView( rhs_v , lhs, AcceleratorRead);
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto rhs_v = lhs.AcceleratorView(ViewRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss);
@@ -206,8 +206,8 @@ template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
ret.Checkerboard() = rhs.Checkerboard();
conformable(ret,rhs);
-  autoView( ret_v , ret, AcceleratorWrite);
-  autoView( rhs_v , lhs, AcceleratorRead);
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto rhs_v = lhs.AcceleratorView(ViewRead);
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
decltype(coalescedRead(obj1())) tmp;
auto rhs_t=rhs_v(ss);
@@ -221,11 +221,11 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
ret.Checkerboard() = x.Checkerboard();
conformable(ret,x);
conformable(x,y);
-  autoView( ret_v , ret, AcceleratorWrite);
-  autoView( x_v , x, AcceleratorRead);
-  autoView( y_v , y, AcceleratorRead);
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto x_v = x.AcceleratorView(ViewRead);
+  auto y_v = y.AcceleratorView(ViewRead);
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
-    auto tmp = a*coalescedRead(x_v[ss])+coalescedRead(y_v[ss]);
+    auto tmp = a*x_v(ss)+y_v(ss);
coalescedWrite(ret_v[ss],tmp);
});
}
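// Illustrative sketch (not part of this diff): axpy/axpby fuse the
// scale-and-add into a single site loop. With ComplexD a,b and LatticeComplex
// x,y,r on the same grid:
//   axpy(r, a, x, y);        // r = a*x + y
//   axpby(r, a, b, x, y);    // r = a*x + b*y
// equivalent to the ET form r = a*x + y, but bypassing expression closure.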
@@ -234,9 +234,9 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
ret.Checkerboard() = x.Checkerboard();
conformable(ret,x);
conformable(x,y);
-  autoView( ret_v , ret, AcceleratorWrite);
-  autoView( x_v , x, AcceleratorRead);
-  autoView( y_v , y, AcceleratorRead);
+  auto ret_v = ret.AcceleratorView(ViewWrite);
+  auto x_v = x.AcceleratorView(ViewRead);
+  auto y_v = y.AcceleratorView(ViewRead);
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(ret_v[ss],tmp);


@@ -29,7 +29,6 @@ See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#define STREAMING_STORES
@@ -38,6 +37,180 @@ NAMESPACE_BEGIN(Grid);
extern int GridCshiftPermuteMap[4][16];
///////////////////////////////////////////////////////////////////
// Base class which can be used by traits to pick up behaviour
///////////////////////////////////////////////////////////////////
class LatticeBase {};
/////////////////////////////////////////////////////////////////////////////////////////
// Conformable checks; same instance of Grid required
/////////////////////////////////////////////////////////////////////////////////////////
void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
{
assert(lhs == rhs);
}
////////////////////////////////////////////////////////////////////////////
// Advise the LatticeAccelerator class
////////////////////////////////////////////////////////////////////////////
enum LatticeAcceleratorAdvise {
AdviseInfrequentUse = 0x1, // Advise that the data is used infrequently. This can
// significantly influence performance of bulk storage.
AdviseReadMostly = 0x2, // Data will mostly be read. On some architectures
// enables read-only copies of memory to be kept on
// host and device.
};
////////////////////////////////////////////////////////////////////////////
// View Access Mode
////////////////////////////////////////////////////////////////////////////
enum ViewMode {
ViewRead = 0x1,
ViewWrite = 0x2,
ViewReadWrite = 0x3
};
////////////////////////////////////////////////////////////////////////////
// Minimal base class containing only data valid to access from accelerator
// _odata will be a managed pointer in CUDA
////////////////////////////////////////////////////////////////////////////
// Force access to lattice through a view object.
// prevents writing of code that will not offload to GPU, though this is perhaps
// annoyingly strict since the host could in principle access the data directly
// through the lattice object. Need to decide on the programming model.
#define LATTICE_VIEW_STRICT
template<class vobj> class LatticeAccelerator : public LatticeBase
{
protected:
GridBase *_grid;
int checkerboard;
vobj *_odata; // A managed pointer
uint64_t _odata_size;
public:
accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr) { };
accelerator_inline uint64_t oSites(void) const { return _odata_size; };
accelerator_inline int Checkerboard(void) const { return checkerboard; };
accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view
accelerator_inline void Conformable(GridBase * &grid) const
{
if (grid) conformable(grid, _grid);
else grid = _grid;
};
accelerator_inline void Advise(int advise) {
#ifdef GRID_NVCC
#ifndef __CUDA_ARCH__ // only on host
if (advise & AdviseInfrequentUse) {
cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetPreferredLocation,cudaCpuDeviceId);
}
if (advise & AdviseReadMostly) {
cudaMemAdvise(_odata,_odata_size*sizeof(vobj),cudaMemAdviseSetReadMostly,-1);
}
#endif
#endif
};
accelerator_inline void AcceleratorPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future
#ifdef GRID_NVCC
#ifndef __CUDA_ARCH__ // only on host
int target;
cudaGetDevice(&target);
cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),target);
#endif
#endif
};
accelerator_inline void HostPrefetch(int accessMode = ViewReadWrite) { // will use accessMode in future
#ifdef GRID_NVCC
#ifndef __CUDA_ARCH__ // only on host
cudaMemPrefetchAsync(_odata,_odata_size*sizeof(vobj),cudaCpuDeviceId);
#endif
#endif
};
};
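// Illustrative sketch (not part of this diff): intended host-side use of the
// advice/prefetch hooks on a CUDA build (they compile to no-ops elsewhere).
// Field type and grid are assumptions for illustration:
//   LatticeFermion psi(grid);
//   psi.Advise(AdviseInfrequentUse);              // prefer host residency for bulk storage
//   auto a = psi.AcceleratorView(ViewReadWrite);  // prefetches to the current device
//   auto h = psi.HostView(ViewRead);              // prefetches back to host memory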
/////////////////////////////////////////////////////////////////////////////////////////
// A View class which provides accessor to the data.
// This will be safe to call from accelerator_for and is trivially copy constructible
// The copy constructor for this will need to be used by device lambda functions
/////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
class LatticeView : public LatticeAccelerator<vobj>
{
public:
// Rvalue
#ifdef __CUDA_ARCH__
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const { return coalescedRead(this->_odata[i]); }
#else
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
#endif
accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
accelerator_inline uint64_t begin(void) const { return 0;};
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
accelerator_inline uint64_t size(void) const { return this->_odata_size; };
LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me)
{
}
};
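// Illustrative sketch (not part of this diff): the view is what crosses into an
// accelerator lambda; operator() yields a coalesced per-lane scalar on the
// device compile pass, while operator[] exposes the whole SIMD word:
//   auto r_v = r.AcceleratorView(ViewWrite);
//   auto x_v = x.AcceleratorView(ViewRead);
//   accelerator_for(ss, r_v.size(), vobj::Nsimd(), {
//     coalescedWrite(r_v[ss], 2.0*x_v(ss));   // read via (), write via []
//   });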
/////////////////////////////////////////////////////////////////////////////////////////
// Lattice expression types used by ET to assemble the AST
//
// Need to be able to detect code paths according to the whether a lattice object or not
// so introduce some trait type things
/////////////////////////////////////////////////////////////////////////////////////////
class LatticeExpressionBase {};
template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
template<class T> struct ViewMapBase<T,true> { typedef LatticeView<typename T::vector_object> Type; };
template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
template <typename Op, typename _T1>
class LatticeUnaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
Op op;
T1 arg1;
LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {};
};
template <typename Op, typename _T1, typename _T2>
class LatticeBinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
Op op;
T1 arg1;
T2 arg2;
LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {};
};
template <typename Op, typename _T1, typename _T2, typename _T3>
class LatticeTrinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
typedef typename ViewMap<_T3>::Type T3;
Op op;
T1 arg1;
T2 arg2;
T3 arg3;
LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {};
};
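// Illustrative sketch (not part of this diff): ViewMap rewrites each Lattice
// leaf into a LatticeView when an expression node is constructed, so the stored
// AST is trivially copyable into a device lambda. Schematically, a+b yields
//   LatticeBinaryExpression<BinaryAdd,
//                           LatticeView<vobj>,    // from Lattice<vobj> a
//                           LatticeView<vobj> >   // from Lattice<vobj> b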
/////////////////////////////////////////////////////////////////////////////////////////
// The real lattice class, with normal copy and assignment semantics.
// This contains extra (host resident) grid pointer data that may be accessed by host code
@@ -73,33 +246,38 @@ private:
dealloc();
this->_odata_size = size;
if ( size )
this->_odata      = alloc.allocate(this->_odata_size);
else
this->_odata      = nullptr;
}
}
public:
/////////////////////////////////////////////////////////////////////////////////
// Can be used to mark the accelerator copy dirty without a copy from host;
// useful for temporaries whose previous contents are "don't care"
/////////////////////////////////////////////////////////////////////////////////
void SetViewMode(ViewMode mode) {
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
accessor.ViewClose();
}
/////////////////////////////////////////////////////////////////////////////////
// Return a view object that may be dereferenced in site loops.
// The view is trivially copy constructible and may be copied to an accelerator device
// in device lambdas
/////////////////////////////////////////////////////////////////////////////////
-  LatticeView<vobj> View (ViewMode mode) const
-  {
-    LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
+  LatticeView<vobj> View (void) const // deprecated, should pick AcceleratorView for accelerator_for
+                                      // and HostView for thread_for
+  {
+    LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
return accessor;
}
LatticeView<vobj> AcceleratorView(int mode = ViewReadWrite) const
{
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
accessor.AcceleratorPrefetch(mode);
return accessor;
}
LatticeView<vobj> HostView(int mode = ViewReadWrite) const
{
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this));
accessor.HostPrefetch(mode);
return accessor;
}
~Lattice() {
if ( this->_odata_size ) {
dealloc();
@@ -119,16 +297,12 @@ public:
CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb;
-    auto exprCopy = expr;
-    ExpressionViewOpen(exprCopy);
-    auto me  = View(AcceleratorWriteDiscard);
-    accelerator_for(ss,me.size(),vobj::Nsimd(),{
-      auto tmp = eval(ss,exprCopy);
-      coalescedWrite(me[ss],tmp);
-    });
-    me.ViewClose();
-    ExpressionViewClose(exprCopy);
+    auto me  = AcceleratorView(ViewWrite);
+    accelerator_for(ss,me.size(),1,{
+      auto tmp = eval(ss,expr);
+      vstream(me[ss],tmp);
+    });
return *this;
}
template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
@@ -143,15 +317,11 @@ public:
assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb;
-    auto exprCopy = expr;
-    ExpressionViewOpen(exprCopy);
-    auto me  = View(AcceleratorWriteDiscard);
-    accelerator_for(ss,me.size(),vobj::Nsimd(),{
-      auto tmp = eval(ss,exprCopy);
-      coalescedWrite(me[ss],tmp);
-    });
-    me.ViewClose();
-    ExpressionViewClose(exprCopy);
+    auto me  = AcceleratorView(ViewWrite);
+    accelerator_for(ss,me.size(),1,{
+      auto tmp = eval(ss,expr);
+      vstream(me[ss],tmp);
+    });
return *this;
}
template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
@@ -165,15 +335,11 @@ public:
CBFromExpression(cb,expr);
assert( (cb==Odd) || (cb==Even));
this->checkerboard=cb;
-    auto exprCopy = expr;
-    ExpressionViewOpen(exprCopy);
-    auto me  = View(AcceleratorWriteDiscard);
-    accelerator_for(ss,me.size(),vobj::Nsimd(),{
-      auto tmp = eval(ss,exprCopy);
-      coalescedWrite(me[ss],tmp);
-    });
-    me.ViewClose();
-    ExpressionViewClose(exprCopy);
+    auto me  = AcceleratorView(ViewWrite);
+    accelerator_for(ss,me.size(),1,{
+      auto tmp = eval(ss,expr);
+      vstream(me[ss],tmp);
+    });
return *this;
}
//GridFromExpression is tricky to do
@@ -224,11 +390,10 @@ public:
}
template<class sobj> inline Lattice<vobj> & operator = (const sobj & r){
-    auto me  = View(CpuWrite);
+    auto me  = View();
thread_for(ss,me.size(),{
-      me[ss]= r;
+      me[ss] = r;
});
-    me.ViewClose();
return *this;
}
@@ -238,12 +403,11 @@ public:
///////////////////////////////////////////
// user defined constructor
///////////////////////////////////////////
-  Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) {
+  Lattice(GridBase *grid) {
this->_grid = grid;
resize(this->_grid->oSites());
assert((((uint64_t)&this->_odata[0])&0xF) ==0);
this->checkerboard=0;
-    SetViewMode(mode);
}
//  virtual ~Lattice(void) = default;
@@ -281,12 +445,11 @@ public:
typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
conformable(*this,r);
this->checkerboard = r.Checkerboard();
-    auto me  = View(AcceleratorWriteDiscard);
-    auto him = r.View(AcceleratorRead);
+    auto me  = AcceleratorView(ViewWrite);
+    auto him = r.AcceleratorView(ViewRead);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss));
});
-    me.ViewClose(); him.ViewClose();
return *this;
}
@@ -296,12 +459,11 @@ public:
inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
this->checkerboard = r.Checkerboard();
conformable(*this,r);
-    auto me  = View(AcceleratorWriteDiscard);
-    auto him = r.View(AcceleratorRead);
+    auto me  = AcceleratorView(ViewWrite);
+    auto him = r.AcceleratorView(ViewRead);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
coalescedWrite(me[ss],him(ss));
});
-    me.ViewClose(); him.ViewClose();
return *this;
}
///////////////////////////////////////////


@@ -51,39 +51,34 @@ template<class VField, class Matrix>
void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
{
typedef decltype(basis[0]) Field;
-  typedef decltype(basis[0].View(AcceleratorRead)) View;
-  Vector<View> basis_v; basis_v.reserve(basis.size());
-  typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
-  typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
+  typedef decltype(basis[0].View()) View;
+  auto tmp_v = basis[0].AcceleratorView(ViewReadWrite);
+  Vector<View> basis_v(basis.size(),tmp_v);
+  typedef typename std::remove_reference<decltype(tmp_v[0])>::type vobj;
GridBase* grid = basis[0].Grid();
for(int k=0;k<basis.size();k++){
-    basis_v.push_back(basis[k].View(AcceleratorWrite));
+    basis_v[k] = basis[k].AcceleratorView(ViewReadWrite);
}
-#if ( (!defined(GRID_CUDA)) )
-  int max_threads = thread_max();
-  Vector < vobj > Bt(Nm * max_threads);
+#ifndef GRID_NVCC
thread_region
{
-    vobj* B = &Bt[Nm * thread_num()];
+    std::vector < vobj > B(Nm); // Thread private
thread_for_in_region(ss, grid->oSites(),{
for(int j=j0; j<j1; ++j) B[j]=0.;
for(int j=j0; j<j1; ++j){
for(int k=k0; k<k1; ++k){
B[j] +=Qt(j,k) * basis_v[k][ss];
}
}
for(int j=j0; j<j1; ++j){
basis_v[j][ss] = B[j];
}
});
}
#else
-  View *basis_vp = &basis_v[0];
int nrot = j1-j0;
if (!nrot) // edge case not handled gracefully by Cuda
return;
@@ -95,13 +90,13 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
auto Bp=&Bt[0];
// GPU readable copy of matrix
-  Vector<Coeff_t> Qt_jv(Nm*Nm);
-  Coeff_t *Qt_p  = & Qt_jv[0];
+  Vector<double> Qt_jv(Nm*Nm);
+  double *Qt_p  = & Qt_jv[0];
thread_for(i,Nm*Nm,{
int j = i/Nm;
int k = i%Nm;
Qt_p[i]=Qt(j,k);
});
// Block the loop to keep storage footprint down
for(uint64_t s=0;s<oSites;s+=siteBlock){
@@ -125,7 +120,7 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
for(int k=k0; k<k1; ++k){
auto tmp = coalescedRead(Bp[ss*nrot+j]);
-	coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_vp[k][sss]));
+	coalescedWrite(Bp[ss*nrot+j],tmp+ Qt_p[jj*Nm+k] * coalescedRead(basis_v[k][sss]));
}
});
@@ -134,44 +129,37 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
int jj =j0+j;
int ss =sj/nrot;
int sss=ss+s;
-      coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
+      coalescedWrite(basis_v[jj][sss],coalescedRead(Bp[ss*nrot+j]));
});
}
#endif
-  for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
}
// Extract a single rotated vector
template<class Field>
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
{
-  typedef decltype(basis[0].View(AcceleratorRead)) View;
+  typedef decltype(basis[0].AcceleratorView()) View;
typedef typename Field::vector_object vobj;
GridBase* grid = basis[0].Grid();
result.Checkerboard() = basis[0].Checkerboard();
-  Vector<View> basis_v; basis_v.reserve(basis.size());
+  auto result_v=result.AcceleratorView(ViewWrite);
+  Vector<View> basis_v(basis.size(),result_v);
for(int k=0;k<basis.size();k++){
-    basis_v.push_back(basis[k].View(AcceleratorRead));
+    basis_v[k] = basis[k].AcceleratorView(ViewRead);
}
vobj zz=Zero();
Vector<double> Qt_jv(Nm);
double * Qt_j = & Qt_jv[0];
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
-  auto basis_vp=& basis_v[0];
-  autoView(result_v,result,AcceleratorWrite);
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
-    vobj zzz=Zero();
-    auto B=coalescedRead(zzz);
+    auto B=coalescedRead(zz);
for(int k=k0; k<k1; ++k){
-      B +=Qt_j[k] * coalescedRead(basis_vp[k][ss]);
+      B +=Qt_j[k] * coalescedRead(basis_v[k][ss]);
}
coalescedWrite(result_v[ss], B);
});
-  for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
}
template<class Field>


@@ -42,6 +42,34 @@ NAMESPACE_BEGIN(Grid);
typedef iScalar<vInteger> vPredicate ;
/*
template <class iobj, class vobj, class robj> accelerator_inline
vobj predicatedWhere(const iobj &predicate, const vobj &iftrue, const robj &iffalse)
{
typename std::remove_const<vobj>::type ret;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
const int Nsimd = vobj::vector_type::Nsimd();
ExtractBuffer<Integer> mask(Nsimd);
ExtractBuffer<scalar_object> truevals(Nsimd);
ExtractBuffer<scalar_object> falsevals(Nsimd);
extract(iftrue, truevals);
extract(iffalse, falsevals);
extract<vInteger, Integer>(TensorRemove(predicate), mask);
for (int s = 0; s < Nsimd; s++) {
if (mask[s]) falsevals[s] = truevals[s];
}
merge(ret, falsevals);
return ret;
}
*/
//////////////////////////////////////////////////////////////////////////
// compare lattice to lattice
//////////////////////////////////////////////////////////////////////////
@@ -50,9 +78,9 @@ template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
{
Lattice<vPredicate> ret(rhs.Grid());
-  autoView( lhs_v, lhs, CpuRead);
-  autoView( rhs_v, rhs, CpuRead);
-  autoView( ret_v, ret, CpuWrite);
+  auto lhs_v = lhs.View();
+  auto rhs_v = rhs.View();
+  auto ret_v = ret.View();
thread_for( ss, rhs_v.size(), {
ret_v[ss]=op(lhs_v[ss],rhs_v[ss]);
});
@@ -65,8 +93,8 @@ template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
{
Lattice<vPredicate> ret(lhs.Grid());
-  autoView( lhs_v, lhs, CpuRead);
-  autoView( ret_v, ret, CpuWrite);
+  auto lhs_v = lhs.View();
+  auto ret_v = ret.View();
thread_for( ss, lhs_v.size(), {
ret_v[ss]=op(lhs_v[ss],rhs);
});
@@ -79,8 +107,8 @@ template<class vfunctor,class lobj,class robj>
inline Lattice<vPredicate> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs) inline Lattice<vPredicate> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
{ {
Lattice<vPredicate> ret(rhs.Grid()); Lattice<vPredicate> ret(rhs.Grid());
autoView( rhs_v, rhs, CpuRead); auto rhs_v = rhs.View();
autoView( ret_v, ret, CpuWrite); auto ret_v = ret.View();
thread_for( ss, rhs_v.size(), { thread_for( ss, rhs_v.size(), {
ret_v[ss]=op(lhs,rhs_v[ss]); ret_v[ss]=op(lhs,rhs_v[ss]);
}); });
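The vPredicate lattices built by these functors feed Grid's where() selector; the comparison operators on lattices route through LL/LS/SLComparison. A hedged usage sketch (fields illustrative):

  LatticeReal a(grid), b(grid);
  Lattice<vPredicate> mask = (a > b);   // per-site comparison via LLComparison
  auto c = where(mask, a, b);           // take a where the predicate holds, b elsewhere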
View File
@@ -37,7 +37,7 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
GridBase *grid = l.Grid(); GridBase *grid = l.Grid();
int Nsimd = grid->iSites(); int Nsimd = grid->iSites();
autoView(l_v, l, CpuWrite); auto l_v = l.View();
thread_for( o, grid->oSites(), { thread_for( o, grid->oSites(), {
vector_type vI; vector_type vI;
Coordinate gcoor; Coordinate gcoor;
@@ -51,5 +51,23 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
}); });
}; };
// LatticeCoordinate();
// FIXME for debug; deprecate this; made obsolete by
template<class vobj> void lex_sites(Lattice<vobj> &l){
auto l_v = l.View();
Real *v_ptr = (Real *)&l_v[0];
size_t o_len = l.Grid()->oSites();
size_t v_len = sizeof(vobj)/sizeof(vRealF);
size_t vec_len = vRealF::Nsimd();
for(int i=0;i<o_len;i++){
for(int j=0;j<v_len;j++){
for(int vv=0;vv<vec_len;vv+=2){
v_ptr[i*v_len*vec_len+j*vec_len+vv ]= i+vv*500;
v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
}
}}
}
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
View File
@@ -1,55 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_crc.h
Copyright (C) 2021
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1)
{
auto ff = localNorm2(f);
if ( mu==-1 ) mu = f.Grid()->Nd()-1;
typedef typename vobj::tensor_reduced normtype;
typedef typename normtype::scalar_object scalar;
std::vector<scalar> sff;
sliceSum(ff,sff,mu);
for(int t=0;t<sff.size();t++){
std::cout << s<<" "<<t<<" "<<sff[t]<<std::endl;
}
}
template<class vobj> uint32_t crc(Lattice<vobj> & buf)
{
autoView( buf_v , buf, CpuRead);
return ::crc32(0L,(unsigned char *)&buf_v[0],(size_t)sizeof(vobj)*buf.oSites());
}
#define CRC(U) std::cout << "FingerPrint "<<__FILE__ <<" "<< __LINE__ <<" "<< #U <<" "<<crc(U)<<std::endl;
NAMESPACE_END(Grid);
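This header, present on only one side of the compare, is a debugging aid: crc() runs zlib's crc32 over the raw field data through a CPU read view, and DumpSliceNorm prints the norm2 of each slice. A usage sketch, assuming an illustrative field U and a seeded GridParallelRNG pRNG:

  LatticeComplex U(grid);
  gaussian(pRNG, U);
  CRC(U);                  // prints: FingerPrint <file> <line> U <crc32 of the raw data>
  DumpSliceNorm("U", U);   // per-slice norm2, along the last dimension by default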
View File
@@ -43,8 +43,8 @@ template<class vobj>
inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced> inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
{ {
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid()); Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
autoView( rhs_v , rhs, AcceleratorRead); auto rhs_v = rhs.View();
autoView( ret_v , ret, AcceleratorWrite); auto ret_v = ret.View();
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss))); coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss)));
}); });
@@ -56,9 +56,9 @@ template<class vobj>
inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced> inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
{ {
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid()); Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
autoView( lhs_v , lhs, AcceleratorRead); auto lhs_v = lhs.View();
autoView( rhs_v , rhs, AcceleratorRead); auto rhs_v = rhs.View();
autoView( ret_v , ret, AcceleratorWrite); auto ret_v = ret.View();
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{ accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss))); coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss)));
}); });
@@ -73,9 +73,9 @@ inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Latt
typedef decltype(coalescedRead(ll())) sll; typedef decltype(coalescedRead(ll())) sll;
typedef decltype(coalescedRead(rr())) srr; typedef decltype(coalescedRead(rr())) srr;
Lattice<decltype(outerProduct(ll(),rr()))> ret(rhs.Grid()); Lattice<decltype(outerProduct(ll(),rr()))> ret(rhs.Grid());
autoView( lhs_v , lhs, AcceleratorRead); auto lhs_v = lhs.View();
autoView( rhs_v , rhs, AcceleratorRead); auto rhs_v = rhs.View();
autoView( ret_v , ret, AcceleratorWrite); auto ret_v = ret.View();
accelerator_for(ss,rhs_v.size(),1,{ accelerator_for(ss,rhs_v.size(),1,{
// FIXME had issues with scalar version of outer // FIXME had issues with scalar version of outer
// Use vector [] operator and don't read coalesce this loop // Use vector [] operator and don't read coalesce this loop
View File
@@ -51,9 +51,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
int block =FullGrid->_slice_block [Orthog]; int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog]; int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
autoView( X_v , X, CpuRead); auto X_v = X.View();
autoView( Y_v , Y, CpuRead); auto Y_v = Y.View();
autoView( R_v , R, CpuWrite); auto R_v = R.View();
thread_region thread_region
{ {
std::vector<vobj> s_x(Nblock); std::vector<vobj> s_x(Nblock);
@@ -97,8 +97,8 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
int nblock=FullGrid->_slice_nblock[Orthog]; int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
autoView( X_v , X, CpuRead); auto X_v = X.View();
autoView( R_v , R, CpuWrite); auto R_v = R.View();
thread_region thread_region
{ {
@@ -156,8 +156,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
typedef typename vobj::vector_typeD vector_typeD; typedef typename vobj::vector_typeD vector_typeD;
autoView( lhs_v , lhs, CpuRead); auto lhs_v = lhs.View();
autoView( rhs_v , rhs, CpuRead); auto rhs_v = rhs.View();
thread_region { thread_region {
std::vector<vobj> Left(Nblock); std::vector<vobj> Left(Nblock);
std::vector<vobj> Right(Nblock); std::vector<vobj> Right(Nblock);
View File
@@ -46,9 +46,9 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Ind
{ {
Lattice<decltype(peekIndex<Index>(vobj(),i))> ret(lhs.Grid()); Lattice<decltype(peekIndex<Index>(vobj(),i))> ret(lhs.Grid());
ret.Checkerboard()=lhs.Checkerboard(); ret.Checkerboard()=lhs.Checkerboard();
autoView( ret_v, ret, AcceleratorWrite); auto ret_v = ret.View();
autoView( lhs_v, lhs, AcceleratorRead); auto lhs_v = lhs.View();
accelerator_for( ss, lhs_v.size(), 1, { thread_for( ss, lhs_v.size(), {
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i); ret_v[ss] = peekIndex<Index>(lhs_v[ss],i);
}); });
return ret; return ret;
@@ -58,9 +58,9 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekInd
{ {
Lattice<decltype(peekIndex<Index>(vobj(),i,j))> ret(lhs.Grid()); Lattice<decltype(peekIndex<Index>(vobj(),i,j))> ret(lhs.Grid());
ret.Checkerboard()=lhs.Checkerboard(); ret.Checkerboard()=lhs.Checkerboard();
autoView( ret_v, ret, AcceleratorWrite); auto ret_v = ret.View();
autoView( lhs_v, lhs, AcceleratorRead); auto lhs_v = lhs.View();
accelerator_for( ss, lhs_v.size(), 1, { thread_for( ss, lhs_v.size(), {
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i,j); ret_v[ss] = peekIndex<Index>(lhs_v[ss],i,j);
}); });
return ret; return ret;
@@ -72,18 +72,18 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekInd
template<int Index,class vobj> template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0))> & rhs,int i) void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0))> & rhs,int i)
{ {
autoView( rhs_v, rhs, AcceleratorRead); auto rhs_v = rhs.View();
autoView( lhs_v, lhs, AcceleratorWrite); auto lhs_v = lhs.View();
accelerator_for( ss, lhs_v.size(), 1, { thread_for( ss, lhs_v.size(), {
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i); pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i);
}); });
} }
template<int Index,class vobj> template<int Index,class vobj>
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0,0))> & rhs,int i,int j) void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0,0))> & rhs,int i,int j)
{ {
autoView( rhs_v, rhs, AcceleratorRead); auto rhs_v = rhs.View();
autoView( lhs_v, lhs, AcceleratorWrite); auto lhs_v = lhs.View();
accelerator_for( ss, lhs_v.size(), 1, { thread_for( ss, lhs_v.size(), {
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i,j); pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i,j);
}); });
} }
@@ -111,7 +111,7 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
// extract-modify-merge cycle is easiest way and this is not perf critical // extract-modify-merge cycle is easiest way and this is not perf critical
ExtractBuffer<sobj> buf(Nsimd); ExtractBuffer<sobj> buf(Nsimd);
autoView( l_v , l, CpuWrite); auto l_v = l.View();
if ( rank == grid->ThisRank() ) { if ( rank == grid->ThisRank() ) {
extract(l_v[odx],buf); extract(l_v[odx],buf);
buf[idx] = s; buf[idx] = s;
@@ -141,7 +141,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
grid->GlobalCoorToRankIndex(rank,odx,idx,site); grid->GlobalCoorToRankIndex(rank,odx,idx,site);
ExtractBuffer<sobj> buf(Nsimd); ExtractBuffer<sobj> buf(Nsimd);
autoView( l_v , l, CpuWrite); auto l_v = l.View();
extract(l_v[odx],buf); extract(l_v[odx],buf);
s = buf[idx]; s = buf[idx];
@@ -151,21 +151,21 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
return; return;
}; };
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
// Peek a scalar object from the SIMD array // Peek a scalar object from the SIMD array
////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////
// Must be CPU read view
template<class vobj,class sobj> template<class vobj,class sobj>
inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site) inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
{
GridBase *grid = l.getGrid(); GridBase *grid = l.Grid();
assert(l.mode==CpuRead);
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd(); int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== grid->CheckerBoard(site)); assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj)); assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type); static const int words=sizeof(vobj)/sizeof(vector_type);
@@ -173,7 +173,8 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
idx= grid->iIndex(site); idx= grid->iIndex(site);
odx= grid->oIndex(site); odx= grid->oIndex(site);
scalar_type * vp = (scalar_type *)&l[odx]; auto l_v = l.View();
scalar_type * vp = (scalar_type *)&l_v[odx];
scalar_type * pt = (scalar_type *)&s; scalar_type * pt = (scalar_type *)&s;
for(int w=0;w<words;w++){ for(int w=0;w<words;w++){
@@ -182,27 +183,18 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
return; return;
}; };
template<class vobj,class sobj>
inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site)
{
autoView(lv,l,CpuRead);
peekLocalSite(s,lv,site);
return;
};
// Must be CPU write view
template<class vobj,class sobj> template<class vobj,class sobj>
inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site) inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
{
GridBase *grid=l.getGrid(); GridBase *grid=l.Grid();
assert(l.mode==CpuWrite);
typedef typename vobj::scalar_type scalar_type; typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type; typedef typename vobj::vector_type vector_type;
int Nsimd = grid->Nsimd(); int Nsimd = grid->Nsimd();
assert( l.Checkerboard()== grid->CheckerBoard(site)); assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
assert( sizeof(sobj)*Nsimd == sizeof(vobj)); assert( sizeof(sobj)*Nsimd == sizeof(vobj));
static const int words=sizeof(vobj)/sizeof(vector_type); static const int words=sizeof(vobj)/sizeof(vector_type);
@@ -210,19 +202,13 @@ inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
idx= grid->iIndex(site); idx= grid->iIndex(site);
odx= grid->oIndex(site); odx= grid->oIndex(site);
scalar_type * vp = (scalar_type *)&l[odx]; auto l_v = l.View();
scalar_type * vp = (scalar_type *)&l_v[odx];
scalar_type * pt = (scalar_type *)&s; scalar_type * pt = (scalar_type *)&s;
for(int w=0;w<words;w++){ for(int w=0;w<words;w++){
vp[idx+w*Nsimd] = pt[w]; vp[idx+w*Nsimd] = pt[w];
} }
return;
};
template<class vobj,class sobj>
inline void pokeLocalSite(const sobj &s, Lattice<vobj> &l,Coordinate &site)
{
autoView(lv,l,CpuWrite);
pokeLocalSite(s,lv,site);
return; return;
}; };
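The split into LatticeView and Lattice overloads makes the access mode part of the contract: the view overloads assert CpuRead/CpuWrite and let a caller amortise one view open over many sites, while the Lattice overloads open the correct view themselves. A sketch of the bulk pattern (types illustrative):

  LatticeComplexD l(grid);
  autoView( l_v , l, CpuRead);      // one open for the whole traversal
  Coordinate site({0,0,0,0});
  iScalar<ComplexD> s;
  peekLocalSite(s, l_v, site);      // mode verified by the assert above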
View File
@@ -1,79 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/Lattice_reality.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_LATTICE_REAL_IMAG_H
#define GRID_LATTICE_REAL_IMAG_H
// FIXME .. this is the sector of the code
// I am most worried about the directions
// The choice of burying complex in the SIMD
// is making the use of "real" and "imag" very cumbersome
NAMESPACE_BEGIN(Grid);
template<class vobj> inline Lattice<vobj> real(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid());
autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard()=lhs.Checkerboard();
accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] =real(lhs_v[ss]);
});
return ret;
};
template<class vobj> inline Lattice<vobj> imag(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid());
autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard()=lhs.Checkerboard();
accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] =imag(lhs_v[ss]);
});
return ret;
};
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto real(const Expression &expr) -> decltype(real(closure(expr)))
{
return real(closure(expr));
}
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto imag(const Expression &expr) -> decltype(imag(closure(expr)))
{
return imag(closure(expr));
}
NAMESPACE_END(Grid);
#endif
View File
@@ -40,77 +40,26 @@ NAMESPACE_BEGIN(Grid);
template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid()); Lattice<vobj> ret(lhs.Grid());
autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard()=lhs.Checkerboard(); ret.Checkerboard()=lhs.Checkerboard();
accelerator_for( ss, lhs_v.size(), 1, { auto lhs_v = lhs.View();
ret_v[ss] = adj(lhs_v[ss]); auto ret_v = ret.View();
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], adj(lhs_v(ss)));
}); });
return ret; return ret;
}; };
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){ template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
Lattice<vobj> ret(lhs.Grid()); Lattice<vobj> ret(lhs.Grid());
autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard() = lhs.Checkerboard(); ret.Checkerboard() = lhs.Checkerboard();
auto lhs_v = lhs.View();
auto ret_v = ret.View();
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss))); coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss)));
}); });
return ret; return ret;
}; };
template<class vobj> inline Lattice<typename vobj::Complexified> toComplex(const Lattice<vobj> &lhs){
Lattice<typename vobj::Complexified> ret(lhs.Grid());
autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard() = lhs.Checkerboard();
accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] = toComplex(lhs_v[ss]);
});
return ret;
};
template<class vobj> inline Lattice<typename vobj::Realified> toReal(const Lattice<vobj> &lhs){
Lattice<typename vobj::Realified> ret(lhs.Grid());
autoView( lhs_v, lhs, AcceleratorRead);
autoView( ret_v, ret, AcceleratorWrite);
ret.Checkerboard() = lhs.Checkerboard();
accelerator_for( ss, lhs_v.size(), 1, {
ret_v[ss] = toReal(lhs_v[ss]);
});
return ret;
};
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto toComplex(const Expression &expr) -> decltype(closure(expr))
{
return toComplex(closure(expr));
}
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto toReal(const Expression &expr) -> decltype(closure(expr))
{
return toReal(closure(expr));
}
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto adj(const Expression &expr) -> decltype(closure(expr))
{
return adj(closure(expr));
}
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto conjugate(const Expression &expr) -> decltype(closure(expr))
{
return conjugate(closure(expr));
}
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif #endif
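The Expression overloads exist because a*b + b is an unevaluated expression template, not a Lattice; closure() materialises it so adj, conjugate, toComplex and toReal can be applied to expressions directly. Sketch (fields illustrative):

  LatticeComplex a(grid), b(grid);
  auto c = adj(a*b + b);   // closure(a*b + b) is evaluated first, then adj() acts on it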
View File
@@ -25,7 +25,7 @@ Author: Christoph Lehner <christoph@lhnr.de>
#include <Grid/Grid_Eigen_Dense.h> #include <Grid/Grid_Eigen_Dense.h>
#if defined(GRID_CUDA)||defined(GRID_HIP) #ifdef GRID_NVCC
#include <Grid/lattice/Lattice_reduction_gpu.h> #include <Grid/lattice/Lattice_reduction_gpu.h>
#endif #endif
@@ -39,7 +39,7 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
// const int Nsimd = vobj::Nsimd(); const int Nsimd = vobj::Nsimd();
const int nthread = GridThread::GetThreads(); const int nthread = GridThread::GetThreads();
Vector<sobj> sumarray(nthread); Vector<sobj> sumarray(nthread);
@@ -62,99 +62,24 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
for(int i=0;i<nthread;i++){ for(int i=0;i<nthread;i++){
ssum = ssum+sumarray[i]; ssum = ssum+sumarray[i];
} }
return ssum; return ssum;
} }
template<class vobj> template<class vobj>
inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
{
typedef typename vobj::scalar_objectD sobj;
const int nthread = GridThread::GetThreads();
Vector<sobj> sumarray(nthread);
for(int i=0;i<nthread;i++){
sumarray[i]=Zero();
}
thread_for(thr,nthread, {
int nwork, mywork, myoff;
nwork = osites;
GridThread::GetWork(nwork,thr,mywork,myoff);
vobj vvsum=Zero();
for(int ss=myoff;ss<mywork+myoff; ss++){
vvsum = vvsum + arg[ss];
}
sumarray[thr]=Reduce(vvsum);
});
sobj ssum=Zero(); // sum across threads
for(int i=0;i<nthread;i++){
ssum = ssum+sumarray[i];
}
typedef typename vobj::scalar_object ssobj;
ssobj ret = ssum;
return ret;
}
/*
Threaded max, don't use for now
template<class Double>
inline Double max(const Double *arg, Integer osites)
{
// const int Nsimd = vobj::Nsimd();
const int nthread = GridThread::GetThreads();
std::vector<Double> maxarray(nthread);
thread_for(thr,nthread, {
int nwork, mywork, myoff;
nwork = osites;
GridThread::GetWork(nwork,thr,mywork,myoff);
Double max=arg[0];
for(int ss=myoff;ss<mywork+myoff; ss++){
if( arg[ss] > max ) max = arg[ss];
}
maxarray[thr]=max;
});
Double tmax=maxarray[0];
for(int i=0;i<nthread;i++){
if (maxarray[i]>tmax) tmax = maxarray[i];
}
return tmax;
}
*/
template<class vobj>
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites) inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
{ {
#if defined(GRID_CUDA)||defined(GRID_HIP) #ifdef GRID_NVCC
return sum_gpu(arg,osites); return sum_gpu(arg,osites);
#else #else
return sum_cpu(arg,osites); return sum_cpu(arg,osites);
#endif #endif
} }
template<class vobj>
inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)
return sumD_gpu(arg,osites);
#else
return sumD_cpu(arg,osites);
#endif
}
template<class vobj> template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg) inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{ {
#if defined(GRID_CUDA)||defined(GRID_HIP) auto arg_v = arg.View();
autoView( arg_v, arg, AcceleratorRead);
Integer osites = arg.Grid()->oSites(); Integer osites = arg.Grid()->oSites();
auto ssum= sum_gpu(&arg_v[0],osites); auto ssum= sum(&arg_v[0],osites);
#else
autoView(arg_v, arg, CpuRead);
Integer osites = arg.Grid()->oSites();
auto ssum= sum_cpu(&arg_v[0],osites);
#endif
arg.Grid()->GlobalSum(ssum); arg.Grid()->GlobalSum(ssum);
return ssum; return ssum;
} }
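sum() picks the host or device path at compile time, while the sumD variants accumulate in double even for single-precision fields; norm2 and the inner products below lean on that to pass precision tests. A sketch of the raw-pointer entry point, assuming an illustrative field f:

  LatticeComplexF f(grid);
  autoView( f_v , f, AcceleratorRead);
  auto local = sumD(&f_v[0], f.Grid()->oSites());  // scalar_objectD: double-precision partial sum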
@@ -167,32 +92,6 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
return real(nrm); return real(nrm);
} }
//The global maximum of the site norm2
template<class vobj> inline RealD maxLocalNorm2(const Lattice<vobj> &arg)
{
typedef typename vobj::tensor_reduced vscalar; //iScalar<iScalar<.... <vPODtype> > >
typedef typename vscalar::scalar_object scalar; //iScalar<iScalar<.... <PODtype> > >
Lattice<vscalar> inner = localNorm2(arg);
auto grid = arg.Grid();
RealD max;
for(int l=0;l<grid->lSites();l++){
Coordinate coor;
scalar val;
RealD r;
grid->LocalIndexToLocalCoor(l,coor);
peekLocalSite(val,inner,coor);
r=real(TensorRemove(val));
if( (l==0) || (r>max)){
max=r;
}
}
grid->GlobalMax(max);
return max;
}
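maxLocalNorm2 walks the local sites through peekLocalSite and finishes with GlobalMax, so the result is the largest per-site norm anywhere on the machine; a cheap sanity check for blow-ups. Sketch (U illustrative):

  RealD peak = maxLocalNorm2(U);   // max over all sites and all ranks of norm2 at a site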
// Double inner product // Double inner product
template<class vobj> template<class vobj>
inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
@@ -202,30 +101,43 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
ComplexD nrm; ComplexD nrm;
GridBase *grid = left.Grid(); GridBase *grid = left.Grid();
// Might make all code paths go this way.
auto left_v = left.AcceleratorView(ViewRead);
auto right_v=right.AcceleratorView(ViewRead);
const uint64_t nsimd = grid->Nsimd(); const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites(); const uint64_t sites = grid->oSites();
// Might make all code paths go this way. #ifdef GRID_NVCC
typedef decltype(innerProductD(vobj(),vobj())) inner_t; // GPU - SIMT lane compliance...
typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites); Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0]; auto inner_tmp_v = &inner_tmp[0];
{
autoView( left_v , left, AcceleratorRead);
autoView( right_v,right, AcceleratorRead);
// GPU - SIMT lane compliance... accelerator_for( ss, sites, nsimd,{
accelerator_for( ss, sites, 1,{ auto x_l = left_v(ss);
auto x_l = left_v[ss]; auto y_l = right_v(ss);
auto y_l = right_v[ss]; coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
inner_tmp_v[ss]=innerProductD(x_l,y_l); })
});
}
// This is in single precision and fails some tests // This is in single precision and fails some tests
auto anrm = sum(inner_tmp_v,sites); // Need a sumD that sums in double
nrm = anrm; nrm = TensorRemove(sumD_gpu(inner_tmp_v,sites));
#else
// CPU
typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v[ss];
auto y_l = right_v[ss];
inner_tmp_v[ss]=innerProductD(x_l,y_l);
})
nrm = TensorRemove(sum(inner_tmp_v,sites));
#endif
return nrm; return nrm;
} }
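rankInnerProduct deliberately stops before the MPI reduction; global inner products wrap it. A sketch of the two-step use (x, y illustrative):

  ComplexD ip = rankInnerProduct(x, y);   // node-local <x|y>, accumulated in double
  x.Grid()->GlobalSum(ip);                // now the machine-wide inner product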
@@ -263,24 +175,40 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
GridBase *grid = x.Grid(); GridBase *grid = x.Grid();
auto x_v=x.AcceleratorView(ViewRead);
auto y_v=y.AcceleratorView(ViewRead);
auto z_v=z.AcceleratorView(ViewWrite);
const uint64_t nsimd = grid->Nsimd(); const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites(); const uint64_t sites = grid->oSites();
#ifdef GRID_NVCC
// GPU // GPU
autoView( x_v, x, AcceleratorRead); typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
autoView( y_v, y, AcceleratorRead);
autoView( z_v, z, AcceleratorWrite);
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites); Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0]; auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, 1,{ accelerator_for( ss, sites, nsimd,{
auto tmp = a*x_v[ss]+b*y_v[ss]; auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
coalescedWrite(z_v[ss],tmp);
});
nrm = real(TensorRemove(sumD_gpu(inner_tmp_v,sites)));
#else
// CPU
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto tmp = a*x_v(ss)+b*y_v(ss);
inner_tmp_v[ss]=innerProductD(tmp,tmp); inner_tmp_v[ss]=innerProductD(tmp,tmp);
z_v[ss]=tmp; z_v[ss]=tmp;
}); });
// Already promoted to double
nrm = real(TensorRemove(sum(inner_tmp_v,sites))); nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
#endif
grid->GlobalSum(nrm); grid->GlobalSum(nrm);
return nrm; return nrm;
} }
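axpby_norm_fast fuses z = a*x + b*y with the accumulation of |z|^2 so the operands stream through memory once rather than twice; this is the hot update of CG-type solvers. Sketch:

  RealD znorm = axpby_norm_fast(z, a, b, x, y);   // writes z and returns |z|^2 in one pass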
@@ -296,29 +224,47 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
GridBase *grid = left.Grid(); GridBase *grid = left.Grid();
auto left_v=left.AcceleratorView(ViewRead);
auto right_v=right.AcceleratorView(ViewRead);
const uint64_t nsimd = grid->Nsimd(); const uint64_t nsimd = grid->Nsimd();
const uint64_t sites = grid->oSites(); const uint64_t sites = grid->oSites();
#ifdef GRID_NVCC
// GPU // GPU
typedef decltype(innerProductD(vobj(),vobj())) inner_t; typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
typedef decltype(innerProductD(vobj(),vobj())) norm_t; typedef decltype(innerProduct(left_v[0],left_v[0])) norm_t;
Vector<inner_t> inner_tmp(sites); Vector<inner_t> inner_tmp(sites);
Vector<norm_t> norm_tmp(sites); Vector<norm_t> norm_tmp(sites);
auto inner_tmp_v = &inner_tmp[0]; auto inner_tmp_v = &inner_tmp[0];
auto norm_tmp_v = &norm_tmp[0]; auto norm_tmp_v = &norm_tmp[0];
{
autoView(left_v,left, AcceleratorRead);
autoView(right_v,right,AcceleratorRead);
accelerator_for( ss, sites, 1,{
auto left_tmp = left_v[ss];
inner_tmp_v[ss]=innerProductD(left_tmp,right_v[ss]);
norm_tmp_v [ss]=innerProductD(left_tmp,left_tmp);
});
}
accelerator_for( ss, sites, nsimd,{
auto left_tmp = left_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(left_tmp,right_v(ss)));
coalescedWrite(norm_tmp_v[ss],innerProduct(left_tmp,left_tmp));
});
tmp[0] = TensorRemove(sumD_gpu(inner_tmp_v,sites));
tmp[1] = TensorRemove(sumD_gpu(norm_tmp_v,sites));
#else
// CPU
typedef decltype(innerProductD(left_v[0],right_v[0])) inner_t;
typedef decltype(innerProductD(left_v[0],left_v[0])) norm_t;
Vector<inner_t> inner_tmp(sites);
Vector<norm_t> norm_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
auto norm_tmp_v = &norm_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto left_tmp = left_v(ss);
inner_tmp_v[ss] = innerProductD(left_tmp,right_v(ss));
norm_tmp_v[ss] = innerProductD(left_tmp,left_tmp);
});
// Already promoted to double
tmp[0] = TensorRemove(sum(inner_tmp_v,sites)); tmp[0] = TensorRemove(sum(inner_tmp_v,sites));
tmp[1] = TensorRemove(sum(norm_tmp_v,sites)); tmp[1] = TensorRemove(sum(norm_tmp_v,sites));
#endif
grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector grid->GlobalSumVector(&tmp[0],2); // keep norm Complex -> can use GlobalSumVector
ip = tmp[0]; ip = tmp[0];
nrm = real(tmp[1]); nrm = real(tmp[1]);
@@ -361,7 +307,6 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
// But easily avoided by using double precision fields // But easily avoided by using double precision fields
/////////////////////////////////////////////////////// ///////////////////////////////////////////////////////
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_object::scalar_type scalar_type;
GridBase *grid = Data.Grid(); GridBase *grid = Data.Grid();
assert(grid!=NULL); assert(grid!=NULL);
@@ -390,7 +335,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
// sum over reduced dimension planes, breaking out orthog dir // sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction // Parallel over orthog direction
autoView( Data_v, Data, CpuRead); auto Data_v=Data.View();
thread_for( r,rd, { thread_for( r,rd, {
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
for(int n=0;n<e1;n++){ for(int n=0;n<e1;n++){
@@ -420,19 +365,20 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
} }
// sum over nodes. // sum over nodes.
sobj gsum;
for(int t=0;t<fd;t++){ for(int t=0;t<fd;t++){
int pt = t/ld; // processor plane int pt = t/ld; // processor plane
int lt = t%ld; int lt = t%ld;
if ( pt == grid->_processor_coor[orthogdim] ) { if ( pt == grid->_processor_coor[orthogdim] ) {
result[t]=lsSum[lt]; gsum=lsSum[lt];
} else { } else {
result[t]=Zero(); gsum=Zero();
} }
grid->GlobalSum(gsum);
result[t]=gsum;
} }
scalar_type * ptr = (scalar_type *) &result[0];
int words = fd*sizeof(sobj)/sizeof(scalar_type);
grid->GlobalSumVector(ptr, words);
} }
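The GlobalSumVector variant of sliceSum replaces one blocking GlobalSum per slice with a single vector reduction over the flattened results: one communication latency instead of fd of them, identical output. Caller-side usage is unchanged (sketch, c illustrative):

  LatticeComplex c(grid);
  std::vector<TComplex> corr;
  sliceSum(c, corr, 3);   // corr[t] = sum of c over the t-th slice of dimension 3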
template<class vobj> template<class vobj>
@@ -467,8 +413,8 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
int e2= grid->_slice_block [orthogdim]; int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim]; int stride=grid->_slice_stride[orthogdim];
autoView( lhv, lhs, CpuRead); auto lhv=lhs.View();
autoView( rhv, rhs, CpuRead); auto rhv=rhs.View();
thread_for( r,rd,{ thread_for( r,rd,{
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
@@ -575,12 +521,14 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
tensor_reduced at; at=av; tensor_reduced at; at=av;
autoView( Rv, R, CpuWrite); auto Rv=R.View();
autoView( Xv, X, CpuRead); auto Xv=X.View();
autoView( Yv, Y, CpuRead); auto Yv=Y.View();
thread_for2d( n, e1, b,e2, { thread_for_collapse(2, n, e1, {
for(int b=0;b<e2;b++){
int ss= so+n*stride+b; int ss= so+n*stride+b;
Rv[ss] = at*Xv[ss]+Yv[ss]; Rv[ss] = at*Xv[ss]+Yv[ss];
}
}); });
} }
}; };
@@ -633,9 +581,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
int nblock=FullGrid->_slice_nblock[Orthog]; int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
autoView( X_v, X, CpuRead); auto X_v=X.View();
autoView( Y_v, Y, CpuRead); auto Y_v=Y.View();
autoView( R_v, R, CpuWrite); auto R_v=R.View();
thread_region thread_region
{ {
Vector<vobj> s_x(Nblock); Vector<vobj> s_x(Nblock);
@@ -680,14 +628,13 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
// int nl=1; // int nl=1;
//FIXME package in a convenient iterator //FIXME package in a convenient iterator
// thread_for2d_in_region
//Should loop over a plane orthogonal to direction "Orthog" //Should loop over a plane orthogonal to direction "Orthog"
int stride=FullGrid->_slice_stride[Orthog]; int stride=FullGrid->_slice_stride[Orthog];
int block =FullGrid->_slice_block [Orthog]; int block =FullGrid->_slice_block [Orthog];
int nblock=FullGrid->_slice_nblock[Orthog]; int nblock=FullGrid->_slice_nblock[Orthog];
int ostride=FullGrid->_ostride[Orthog]; int ostride=FullGrid->_ostride[Orthog];
autoView( R_v, R, CpuWrite); auto R_v = R.View();
autoView( X_v, X, CpuRead); auto X_v = X.View();
thread_region thread_region
{ {
std::vector<vobj> s_x(Nblock); std::vector<vobj> s_x(Nblock);
@@ -745,8 +692,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
typedef typename vobj::vector_typeD vector_typeD; typedef typename vobj::vector_typeD vector_typeD;
autoView( lhs_v, lhs, CpuRead); auto lhs_v=lhs.View();
autoView( rhs_v, rhs, CpuRead); auto rhs_v=rhs.View();
thread_region thread_region
{ {
std::vector<vobj> Left(Nblock); std::vector<vobj> Left(Nblock);
View File
@@ -1,14 +1,7 @@
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
#ifdef GRID_HIP
extern hipDeviceProp_t *gpu_props;
#define WARP_SIZE 64
#endif
#ifdef GRID_CUDA
extern cudaDeviceProp *gpu_props;
#define WARP_SIZE 32 #define WARP_SIZE 32
#endif extern cudaDeviceProp *gpu_props;
__device__ unsigned int retirementCount = 0; __device__ unsigned int retirementCount = 0;
template <class Iterator> template <class Iterator>
@@ -26,12 +19,7 @@ template <class Iterator>
void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) { void getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {
int device; int device;
#ifdef GRID_CUDA
cudaGetDevice(&device); cudaGetDevice(&device);
#endif
#ifdef GRID_HIP
hipGetDevice(&device);
#endif
Iterator warpSize = gpu_props[device].warpSize; Iterator warpSize = gpu_props[device].warpSize;
Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock; Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock;
@@ -65,7 +53,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
// cannot use overloaded operators for sobj as they are not volatile-qualified // cannot use overloaded operators for sobj as they are not volatile-qualified
memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj)); memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj));
acceleratorSynchronise(); __syncwarp();
const Iterator VEC = WARP_SIZE; const Iterator VEC = WARP_SIZE;
const Iterator vid = tid & (VEC-1); const Iterator vid = tid & (VEC-1);
@@ -79,9 +67,9 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
beta += temp; beta += temp;
memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj)); memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj));
} }
acceleratorSynchronise(); __syncwarp();
} }
acceleratorSynchroniseAll(); __syncthreads();
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
beta = Zero(); beta = Zero();
@@ -91,7 +79,7 @@ __device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid
} }
memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj)); memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj));
} }
acceleratorSynchroniseAll(); __syncthreads();
} }
@@ -159,7 +147,7 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
sobj *smem = (sobj *)shmem_pointer; sobj *smem = (sobj *)shmem_pointer;
// wait until all outstanding memory instructions in this thread are finished // wait until all outstanding memory instructions in this thread are finished
acceleratorFence(); __threadfence();
if (tid==0) { if (tid==0) {
unsigned int ticket = atomicInc(&retirementCount, gridDim.x); unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
@@ -168,8 +156,8 @@ __global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {
} }
// each thread must read the correct value of amLast // each thread must read the correct value of amLast
acceleratorSynchroniseAll(); __syncthreads();
if (amLast) { if (amLast) {
// reduce buffer[0], ..., buffer[gridDim.x-1] // reduce buffer[0], ..., buffer[gridDim.x-1]
Iterator i = tid; Iterator i = tid;
@@ -211,7 +199,13 @@ inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
sobj *buffer_v = &buffer[0]; sobj *buffer_v = &buffer[0];
reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size); reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
accelerator_barrier(); cudaDeviceSynchronize();
cudaError err = cudaGetLastError();
if ( cudaSuccess != err ) {
printf("Cuda error %s\n",cudaGetErrorString( err ));
exit(0);
}
auto result = buffer_v[0]; auto result = buffer_v[0];
return result; return result;
} }
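The kernels above use the "last block standing" pattern: every block writes its partial sum to buffer[], atomically takes a ticket from retirementCount, and the block holding the final ticket re-reduces the buffer. A stripped-down sketch of just that tail step, assuming each block has already written its partial to buffer[blockIdx.x] (CUDA-style, names illustrative):

  __device__ unsigned int tickets = 0;
  __global__ void reduceTail(float *buffer, float *out, unsigned int nblocks) {
    __shared__ bool amLast;
    __threadfence();                              // order prior global writes before taking a ticket
    if (threadIdx.x == 0) {
      unsigned int ticket = atomicInc(&tickets, nblocks);
      amLast = (ticket == nblocks - 1);           // true in exactly one block
    }
    __syncthreads();
    if (amLast && threadIdx.x == 0) {
      float s = 0;
      for (unsigned int i = 0; i < nblocks; i++) s += buffer[i];
      *out = s;
    }
  }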
View File
@@ -32,9 +32,8 @@
#include <random> #include <random>
#ifdef RNG_SITMO #ifdef RNG_SITMO
#include <Grid/random/sitmo_prng_engine.hpp> #include <Grid/sitmo_rng/sitmo_prng_engine.hpp>
#endif #endif
#include <Grid/random/gaussian.h>
#if defined(RNG_SITMO) #if defined(RNG_SITMO)
#define RNG_FAST_DISCARD #define RNG_FAST_DISCARD
@@ -143,7 +142,7 @@ public:
std::vector<RngEngine> _generators; std::vector<RngEngine> _generators;
std::vector<std::uniform_real_distribution<RealD> > _uniform; std::vector<std::uniform_real_distribution<RealD> > _uniform;
std::vector<Grid::gaussian_distribution<RealD> > _gaussian; std::vector<std::normal_distribution<RealD> > _gaussian;
std::vector<std::discrete_distribution<int32_t> > _bernoulli; std::vector<std::discrete_distribution<int32_t> > _bernoulli;
std::vector<std::uniform_int_distribution<uint32_t> > _uid; std::vector<std::uniform_int_distribution<uint32_t> > _uid;
@@ -244,7 +243,7 @@ public:
GridSerialRNG() : GridRNGbase() { GridSerialRNG() : GridRNGbase() {
_generators.resize(1); _generators.resize(1);
_uniform.resize(1,std::uniform_real_distribution<RealD>{0,1}); _uniform.resize(1,std::uniform_real_distribution<RealD>{0,1});
_gaussian.resize(1,gaussian_distribution<RealD>(0.0,1.0) ); _gaussian.resize(1,std::normal_distribution<RealD>(0.0,1.0) );
_bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1}); _bernoulli.resize(1,std::discrete_distribution<int32_t>{1,1});
_uid.resize(1,std::uniform_int_distribution<uint32_t>() ); _uid.resize(1,std::uniform_int_distribution<uint32_t>() );
} }
@@ -358,7 +357,7 @@ public:
_generators.resize(_vol); _generators.resize(_vol);
_uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1}); _uniform.resize(_vol,std::uniform_real_distribution<RealD>{0,1});
_gaussian.resize(_vol,gaussian_distribution<RealD>(0.0,1.0) ); _gaussian.resize(_vol,std::normal_distribution<RealD>(0.0,1.0) );
_bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1}); _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
_uid.resize(_vol,std::uniform_int_distribution<uint32_t>() ); _uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
} }
@@ -376,7 +375,7 @@ public:
int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity
int words = sizeof(scalar_object) / sizeof(scalar_type); int words = sizeof(scalar_object) / sizeof(scalar_type);
autoView(l_v, l, CpuWrite); auto l_v = l.View();
thread_for( ss, osites, { thread_for( ss, osites, {
ExtractBuffer<scalar_object> buf(Nsimd); ExtractBuffer<scalar_object> buf(Nsimd);
for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times
@@ -462,8 +461,8 @@ public:
} }
{ {
// Obtain one reseeded generator per thread // Obtain one reseeded generator per thread
int Nthread = 32; // Hardwire a good level of parallelism int Nthread = GridThread::GetThreads();
std::vector<RngEngine> seeders(Nthread); std::vector<RngEngine> seeders(Nthread);
for(int t=0;t<Nthread;t++){ for(int t=0;t<Nthread;t++){
seeders[t] = Reseed(master_engine); seeders[t] = Reseed(master_engine);
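Hardwiring Nthread decouples the seeding fan-out from the runtime thread count, so the sequence of reseeded engines, and hence the lattice RNG state, no longer varies with OMP_NUM_THREADS. Typical seeding from user code (sketch, U illustrative):

  GridParallelRNG pRNG(grid);
  pRNG.SeedFixedIntegers(std::vector<int>({1, 2, 3, 4}));
  gaussian(pRNG, U);   // draws are now run-to-run reproducible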
View File
@@ -42,8 +42,8 @@ template<class vobj>
inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))> inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))>
{ {
Lattice<decltype(trace(vobj()))> ret(lhs.Grid()); Lattice<decltype(trace(vobj()))> ret(lhs.Grid());
autoView(ret_v , ret, AcceleratorWrite); auto ret_v = ret.View();
autoView(lhs_v , lhs, AcceleratorRead); auto lhs_v = lhs.View();
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], trace(lhs_v(ss))); coalescedWrite(ret_v[ss], trace(lhs_v(ss)));
}); });
@@ -58,8 +58,8 @@ template<int Index,class vobj>
inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(vobj()))> inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(vobj()))>
{ {
Lattice<decltype(traceIndex<Index>(vobj()))> ret(lhs.Grid()); Lattice<decltype(traceIndex<Index>(vobj()))> ret(lhs.Grid());
autoView( ret_v , ret, AcceleratorWrite); auto ret_v = ret.View();
autoView( lhs_v , lhs, AcceleratorRead); auto lhs_v = lhs.View();
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), { accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
coalescedWrite(ret_v[ss], traceIndex<Index>(lhs_v(ss))); coalescedWrite(ret_v[ss], traceIndex<Index>(lhs_v(ss)));
}); });
View File
@@ -47,12 +47,11 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
// remove and insert a half checkerboard // remove and insert a half checkerboard
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full) template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
{
half.Checkerboard() = cb; half.Checkerboard() = cb;
autoView( half_v, half, CpuWrite); auto half_v = half.View();
autoView( full_v, full, CpuRead); auto full_v = full.View();
thread_for(ss, full.Grid()->oSites(),{ thread_for(ss, full.Grid()->oSites(),{
int cbos; int cbos;
Coordinate coor; Coordinate coor;
@@ -65,11 +64,11 @@ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,con
} }
}); });
} }
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half)
{ template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
int cb = half.Checkerboard(); int cb = half.Checkerboard();
autoView( half_v , half, CpuRead); auto half_v = half.View();
autoView( full_v , full, CpuWrite); auto full_v = full.View();
thread_for(ss,full.Grid()->oSites(),{ thread_for(ss,full.Grid()->oSites(),{
Coordinate coor; Coordinate coor;
@@ -97,29 +96,15 @@ accelerator_inline void convertType(ComplexF & out, const std::complex<float> &
out = in; out = in;
} }
template<typename T> #ifdef __CUDA_ARCH__
accelerator_inline EnableIf<isGridFundamental<T>> convertType(T & out, const T & in) {
out = in;
}
// This would allow for conversions between GridFundamental types, but is not strictly needed as yet
/*template<typename T1, typename T2>
accelerator_inline typename std::enable_if<isGridFundamental<T1>::value && isGridFundamental<T2>::value>::type
// Or to make this very broad, conversions between anything that's not a GridTensor could be allowed
//accelerator_inline typename std::enable_if<!isGridTensor<T1>::value && !isGridTensor<T2>::value>::type
convertType(T1 & out, const T2 & in) {
out = in;
}*/
#ifdef GRID_SIMT
accelerator_inline void convertType(vComplexF & out, const ComplexF & in) { accelerator_inline void convertType(vComplexF & out, const ComplexF & in) {
((ComplexF*)&out)[acceleratorSIMTlane(vComplexF::Nsimd())] = in; ((ComplexF*)&out)[SIMTlane(vComplexF::Nsimd())] = in;
} }
accelerator_inline void convertType(vComplexD & out, const ComplexD & in) { accelerator_inline void convertType(vComplexD & out, const ComplexD & in) {
((ComplexD*)&out)[acceleratorSIMTlane(vComplexD::Nsimd())] = in; ((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd())] = in;
} }
accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) { accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) {
((ComplexD*)&out)[acceleratorSIMTlane(vComplexD::Nsimd()*2)] = in; ((ComplexD*)&out)[SIMTlane(vComplexD::Nsimd()*2)] = in;
} }
#endif #endif
@@ -131,18 +116,18 @@ accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) {
Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v); Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v);
} }
template<typename T1,typename T2> template<typename T1,typename T2,int N>
accelerator_inline void convertType(iScalar<T1> & out, const iScalar<T2> & in) { accelerator_inline void convertType(iMatrix<T1,N> & out, const iMatrix<T2,N> & in);
convertType(out._internal,in._internal); template<typename T1,typename T2,int N>
} accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & in);
template<typename T1,typename T2> template<typename T1,typename T2, typename std::enable_if<!isGridScalar<T1>::value, T1>::type* = nullptr>
accelerator_inline NotEnableIf<isGridScalar<T1>> convertType(T1 & out, const iScalar<T2> & in) { accelerator_inline void convertType(T1 & out, const iScalar<T2> & in) {
convertType(out,in._internal); convertType(out,in._internal);
} }
template<typename T1,typename T2> template<typename T1,typename T2>
accelerator_inline NotEnableIf<isGridScalar<T2>> convertType(iScalar<T1> & out, const T2 & in) { accelerator_inline void convertType(iScalar<T1> & out, const T2 & in) {
convertType(out._internal,in); convertType(out._internal,in);
} }
@@ -159,13 +144,19 @@ accelerator_inline void convertType(iVector<T1,N> & out, const iVector<T2,N> & i
convertType(out._internal[i],in._internal[i]); convertType(out._internal[i],in._internal[i]);
} }
template<typename T, typename std::enable_if<isGridFundamental<T>::value, T>::type* = nullptr>
accelerator_inline void convertType(T & out, const T & in) {
out = in;
}
template<typename T1,typename T2> template<typename T1,typename T2>
accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) { accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
autoView( out_v , out,AcceleratorWrite); auto out_v = out.AcceleratorView(ViewWrite);
autoView( in_v , in ,AcceleratorRead); auto in_v = in.AcceleratorView(ViewRead);
accelerator_for(ss,out_v.size(),T1::Nsimd(),{ accelerator_for(ss,out_v.size(),T1::Nsimd(),{
convertType(out_v[ss],in_v(ss)); convertType(out_v[ss],in_v(ss));
}); });
} }
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
@@ -173,20 +164,19 @@ accelerator_inline void convertType(Lattice<T1> & out, const Lattice<T2> & in) {
//////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> template<class vobj>
inline auto localInnerProductD(const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) inline auto localInnerProductD(const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
-> Lattice<iScalar<decltype(TensorRemove(innerProductD2(lhs.View(CpuRead)[0],rhs.View(CpuRead)[0])))>> -> Lattice<iScalar<decltype(TensorRemove(innerProductD2(lhs.View()[0],rhs.View()[0])))>>
{ {
autoView( lhs_v , lhs, AcceleratorRead); auto lhs_v = lhs.AcceleratorView(ViewRead);
autoView( rhs_v , rhs, AcceleratorRead); auto rhs_v = rhs.AcceleratorView(ViewRead);
typedef decltype(TensorRemove(innerProductD2(lhs_v[0],rhs_v[0]))) t_inner; typedef decltype(TensorRemove(innerProductD2(lhs_v[0],rhs_v[0]))) t_inner;
Lattice<iScalar<t_inner>> ret(lhs.Grid()); Lattice<iScalar<t_inner>> ret(lhs.Grid());
auto ret_v = ret.AcceleratorView(ViewWrite);
{ accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
autoView(ret_v, ret,AcceleratorWrite);
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
convertType(ret_v[ss],innerProductD2(lhs_v(ss),rhs_v(ss))); convertType(ret_v[ss],innerProductD2(lhs_v(ss),rhs_v(ss)));
}); });
}
return ret; return ret;
} }
@@ -204,13 +194,14 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
Lattice<iScalar<CComplex>> ip(coarse); Lattice<iScalar<CComplex>> ip(coarse);
Lattice<vobj> fineDataRed = fineData; Lattice<vobj> fineDataRed = fineData;
autoView( coarseData_ , coarseData, AcceleratorWrite); // auto fineData_ = fineData.View();
autoView( ip_ , ip, AcceleratorWrite); auto coarseData_ = coarseData.AcceleratorView(ViewWrite);
auto ip_ = ip.AcceleratorView(ViewReadWrite);
for(int v=0;v<nbasis;v++) { for(int v=0;v<nbasis;v++) {
blockInnerProductD(ip,Basis[v],fineDataRed); // ip = <basis|fine> blockInnerProductD(ip,Basis[v],fineDataRed); // ip = <basis|fine>
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), { accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
convertType(coarseData_[sc](v),ip_[sc]); convertType(coarseData_[sc](v),ip_[sc]);
}); });
// improve numerical stability of projection // improve numerical stability of projection
// |fine> = |fine> - <basis|fine> |basis> // |fine> = |fine> - <basis|fine> |basis>
@@ -219,6 +210,68 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
} }
} }
template<class vobj,class CComplex,int nbasis>
inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
const Lattice<vobj> &fineData,
const std::vector<Lattice<vobj> > &Basis)
{
typedef iVector<CComplex,nbasis > coarseSiteData;
coarseSiteData elide;
typedef decltype(coalescedRead(elide)) ScalarComplex;
GridBase * fine = fineData.Grid();
GridBase * coarse= coarseData.Grid();
int _ndimension = coarse->_ndimension;
// checks
assert( nbasis == Basis.size() );
subdivides(coarse,fine);
for(int i=0;i<nbasis;i++){
conformable(Basis[i],fineData);
}
Coordinate block_r (_ndimension);
for(int d=0 ; d<_ndimension;d++){
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
assert(block_r[d]*coarse->_rdimensions[d] == fine->_rdimensions[d]);
}
int blockVol = fine->oSites()/coarse->oSites();
coarseData=Zero();
auto fineData_ = fineData.View();
auto coarseData_ = coarseData.View();
////////////////////////////////////////////////////////////////////////////////////////////////////////
// To make this lock free, loop over coarse in parallel, and then loop over the fine sites associated with each coarse site.
// Otherwise do fine inner product per site, and make the update atomic
////////////////////////////////////////////////////////////////////////////////////////////////////////
accelerator_for( sci, nbasis*coarse->oSites(), vobj::Nsimd(), {
auto sc=sci/nbasis;
auto i=sci%nbasis;
auto Basis_ = Basis[i].View();
Coordinate coor_c(_ndimension);
Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate
int sf;
decltype(innerProduct(Basis_(sf),fineData_(sf))) reduce=Zero();
for(int sb=0;sb<blockVol;sb++){
Coordinate coor_b(_ndimension);
Coordinate coor_f(_ndimension);
Lexicographic::CoorFromIndex(coor_b,sb,block_r);
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d]+coor_b[d];
Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
reduce=reduce+innerProduct(Basis_(sf),fineData_(sf));
}
coalescedWrite(coarseData_[sc](i),reduce);
});
return;
}
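The lock-free choice in blockProject1: parallelise over (coarse site, basis index) pairs and let each thread privately sweep its blockVol fine sites, so no coarse accumulator is ever shared. The index map is coor_f[d] = coor_c[d]*block_r[d] + coor_b[d]; a one-dimensional worked example:

  int block_r = 2;                        // fine extent / coarse extent
  int coor_c  = 3;                        // which coarse block
  int coor_b  = 1;                        // offset inside the block
  int coor_f  = coor_c*block_r + coor_b;  // = 7: the fine site this thread reads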
template<class vobj,class vobj2,class CComplex> template<class vobj,class vobj2,class CComplex>
inline void blockZAXPY(Lattice<vobj> &fineZ, inline void blockZAXPY(Lattice<vobj> &fineZ,
@@ -245,12 +298,10 @@ template<class vobj,class vobj2,class CComplex>
assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]); assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
} }
autoView( fineZ_ , fineZ, AcceleratorWrite); auto fineZ_ = fineZ.AcceleratorView(ViewWrite);
autoView( fineX_ , fineX, AcceleratorRead); auto fineX_ = fineX.AcceleratorView(ViewRead);
autoView( fineY_ , fineY, AcceleratorRead); auto fineY_ = fineY.AcceleratorView(ViewRead);
autoView( coarseA_, coarseA, AcceleratorRead); auto coarseA_= coarseA.AcceleratorView(ViewRead);
Coordinate fine_rdimensions = fine->_rdimensions;
Coordinate coarse_rdimensions = coarse->_rdimensions;
accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), { accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
@@ -258,12 +309,12 @@ template<class vobj,class vobj2,class CComplex>
Coordinate coor_c(_ndimension); Coordinate coor_c(_ndimension);
Coordinate coor_f(_ndimension); Coordinate coor_f(_ndimension);
Lexicographic::CoorFromIndex(coor_f,sf,fine_rdimensions); Lexicographic::CoorFromIndex(coor_f,sf,fine->_rdimensions);
for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d]; for(int d=0;d<_ndimension;d++) coor_c[d]=coor_f[d]/block_r[d];
Lexicographic::IndexFromCoor(coor_c,sc,coarse_rdimensions); Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
// z = A x + y // z = A x + y
#ifdef GRID_SIMT #ifdef __CUDA_ARCH__
typename vobj2::tensor_reduced::scalar_object cA; typename vobj2::tensor_reduced::scalar_object cA;
typename vobj::scalar_object cAx; typename vobj::scalar_object cAx;
#else #else
@@ -293,16 +344,15 @@ template<class vobj,class CComplex>
Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard(); Lattice<dotp> fine_inner(fine); fine_inner.Checkerboard() = fineX.Checkerboard();
Lattice<dotp> coarse_inner(coarse); Lattice<dotp> coarse_inner(coarse);
auto CoarseInner_ = CoarseInner.AcceleratorView(ViewWrite);
auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite);
// Precision promotion // Precision promotion
fine_inner = localInnerProductD<vobj>(fineX,fineY); fine_inner = localInnerProductD(fineX,fineY);
blockSum(coarse_inner,fine_inner); blockSum(coarse_inner,fine_inner);
{ accelerator_for(ss, coarse->oSites(), 1, {
autoView( CoarseInner_ , CoarseInner,AcceleratorWrite);
autoView( coarse_inner_ , coarse_inner,AcceleratorRead);
accelerator_for(ss, coarse->oSites(), 1, {
convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss])); convertType(CoarseInner_[ss], TensorRemove(coarse_inner_[ss]));
}); });
}
} }
@@ -320,15 +370,14 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
   Lattice<dotp> coarse_inner(coarse);
   // Precision promotion?
+  auto CoarseInner_  = CoarseInner.AcceleratorView(ViewWrite);
+  auto coarse_inner_ = coarse_inner.AcceleratorView(ViewReadWrite);
   fine_inner = localInnerProduct(fineX,fineY);
   blockSum(coarse_inner,fine_inner);
-  {
-    autoView( CoarseInner_  , CoarseInner, AcceleratorWrite);
-    autoView( coarse_inner_ , coarse_inner, AcceleratorRead);
-    accelerator_for(ss, coarse->oSites(), 1, {
-      CoarseInner_[ss] = coarse_inner_[ss];
-    });
-  }
+  accelerator_for(ss, coarse->oSites(), 1, {
+    CoarseInner_[ss] = coarse_inner_[ss];
+  });
 }
 template<class vobj,class CComplex>
@@ -359,27 +408,16 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
   }
   int blockVol = fine->oSites()/coarse->oSites();
-  // Turn this around to loop threaded over sc and interior loop
-  // over sf would thread better
-  autoView( coarseData_ , coarseData, AcceleratorWrite);
-  autoView( fineData_   , fineData,   AcceleratorRead);
-  auto coarseData_p = &coarseData_[0];
-  auto fineData_p   = &fineData_[0];
-  Coordinate fine_rdimensions   = fine->_rdimensions;
-  Coordinate coarse_rdimensions = coarse->_rdimensions;
-  vobj zz = Zero();
+  auto coarseData_ = coarseData.AcceleratorView(ViewReadWrite);
+  auto fineData_   = fineData.AcceleratorView(ViewRead);
   accelerator_for(sc,coarse->oSites(),1,{
       // One thread per sub block
       Coordinate coor_c(_ndimension);
-      Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions);   // Block coordinate
-      vobj cd = zz;
+      Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate
+      coarseData_[sc]=Zero();
       for(int sb=0;sb<blockVol;sb++){
 	int sf;
@@ -387,13 +425,11 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
 	Coordinate coor_f(_ndimension);
 	Lexicographic::CoorFromIndex(coor_b,sb,block_r);               // Block sub coordinate
 	for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
-	Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions);
-	cd=cd+fineData_p[sf];
+	Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
+	coarseData_[sc]=coarseData_[sc]+fineData_[sf];
       }
-      coarseData_p[sc] = cd;
     });
   return;
 }
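The left-hand rewrite of blockSum accumulates each block in a thread-local register and stores once, instead of a global read-modify-write per fine site; schematically (the index helper here is hypothetical):

    vobj cd = Zero();                    // per-thread register accumulator
    for(int sb=0;sb<blockVol;sb++){
      int sf = fineSiteOf(sc,sb);        // hypothetical: block + sub-block -> fine index
      cd = cd + fineData_p[sf];
    }
    coarseData_p[sc] = cd;               // single store per coarse site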
@@ -474,8 +510,8 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
   for(int d=0 ; d<_ndimension;d++){
     block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
   }
-  autoView( fineData_   , fineData,   AcceleratorWrite);
-  autoView( coarseData_ , coarseData, AcceleratorRead);
+  auto fineData_   = fineData.View();
+  auto coarseData_ = coarseData.View();
   // Loop with a cache friendly loop ordering
   accelerator_for(sf,fine->oSites(),1,{
@@ -488,7 +524,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
       Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
       for(int i=0;i<nbasis;i++) {
-	/* auto basis_ = Basis[i]; */
+	auto basis_ = Basis[i].View();
 	if(i==0) fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]*0.+coarseData_[sc](i)*basis_[sf];
 	else     fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf];
       }
@@ -507,14 +543,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
   fineData=Zero();
   for(int i=0;i<nbasis;i++) {
     Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);
-    //Lattice<CComplex> cip(coarse);
-    //autoView( cip_ , cip, AcceleratorWrite);
-    //autoView(  ip_ ,  ip, AcceleratorRead);
-    //accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{
-    //	coalescedWrite(cip_[sc], ip_(sc)());
-    //  });
-    //blockZAXPY<vobj,CComplex >(fineData,cip,Basis[i],fineData);
+    auto ip_ = ip.AcceleratorView(ViewRead);
     blockZAXPY(fineData,ip,Basis[i],fineData);
   }
 }
@@ -542,17 +571,15 @@ void localConvert(const Lattice<vobj> &in,Lattice<vvobj> &out)
     assert(ig->lSites() == og->lSites());
   }
-  autoView(in_v,in,CpuRead);
-  autoView(out_v,out,CpuWrite);
   thread_for(idx, ig->lSites(),{
     sobj s;
     ssobj ss;
     Coordinate lcoor(ni);
     ig->LocalIndexToLocalCoor(idx,lcoor);
-    peekLocalSite(s,in_v,lcoor);
+    peekLocalSite(s,in,lcoor);
     ss=s;
-    pokeLocalSite(ss,out_v,lcoor);
+    pokeLocalSite(ss,out,lcoor);
   });
 }
@@ -587,9 +614,8 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
   Coordinate rdt = Tg->_rdimensions;
   Coordinate ist = Tg->_istride;
   Coordinate ost = Tg->_ostride;
-  autoView( t_v , To,   AcceleratorWrite);
-  autoView( f_v , From, AcceleratorRead);
+  auto t_v = To.AcceleratorView(ViewWrite);
+  auto f_v = From.AcceleratorView(ViewRead);
   accelerator_for(idx,Fg->lSites(),1,{
     sobj s;
     Coordinate Fcoor(nd);
@@ -612,6 +638,8 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
       for(int w=0;w<words;w++){
 	tp[idx_t+w*Nsimd] = fp[idx_f+w*Nsimd]; // FIXME if RRII layout, the type pun will not work
       }
+      // peekLocalSite(s,From,Fcoor);
+      // pokeLocalSite(s,To  ,Tcoor);
     }
   });
 }
@@ -642,8 +670,6 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
   }
   // the above should guarantee that the operations are local
-  autoView(lowDimv,lowDim,CpuRead);
-  autoView(higherDimv,higherDim,CpuWrite);
   thread_for(idx,lg->lSites(),{
     sobj s;
     Coordinate lcoor(nl);
@@ -656,8 +682,8 @@ void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice
 	hcoor[d]=lcoor[ddl++];
       }
     }
-    peekLocalSite(s,lowDimv,lcoor);
-    pokeLocalSite(s,higherDimv,hcoor);
+    peekLocalSite(s,lowDim,lcoor);
+    pokeLocalSite(s,higherDim,hcoor);
   });
 }
@@ -685,8 +711,6 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
     }
   }
   // the above should guarantee that the operations are local
-  autoView(lowDimv,lowDim,CpuWrite);
-  autoView(higherDimv,higherDim,CpuRead);
   thread_for(idx,lg->lSites(),{
     sobj s;
     Coordinate lcoor(nl);
@@ -699,8 +723,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
 	hcoor[d]=lcoor[ddl++];
       }
     }
-    peekLocalSite(s,higherDimv,hcoor);
-    pokeLocalSite(s,lowDimv,lcoor);
+    peekLocalSite(s,higherDim,hcoor);
+    pokeLocalSite(s,lowDim,lcoor);
   });
 }
@@ -728,8 +752,6 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
   }
   // the above should guarantee that the operations are local
-  autoView(lowDimv,lowDim,CpuRead);
-  autoView(higherDimv,higherDim,CpuWrite);
   thread_for(idx,lg->lSites(),{
     sobj s;
     Coordinate lcoor(nl);
@@ -738,8 +760,8 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
     if( lcoor[orthog] == slice_lo ) {
       hcoor=lcoor;
      hcoor[orthog] = slice_hi;
-      peekLocalSite(s,lowDimv,lcoor);
-      pokeLocalSite(s,higherDimv,hcoor);
+      peekLocalSite(s,lowDim,lcoor);
+      pokeLocalSite(s,higherDim,hcoor);
     }
   });
 }
@@ -767,8 +789,6 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
   }
   // the above should guarantee that the operations are local
-  autoView(lowDimv,lowDim,CpuWrite);
-  autoView(higherDimv,higherDim,CpuRead);
   thread_for(idx,lg->lSites(),{
     sobj s;
     Coordinate lcoor(nl);
@@ -777,15 +797,15 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
     if( lcoor[orthog] == slice_lo ) {
       hcoor=lcoor;
       hcoor[orthog] = slice_hi;
-      peekLocalSite(s,higherDimv,hcoor);
-      pokeLocalSite(s,lowDimv,lcoor);
+      peekLocalSite(s,higherDim,hcoor);
+      pokeLocalSite(s,lowDim,lcoor);
     }
   });
 }
 template<class vobj>
-void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
+void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
 {
   typedef typename vobj::scalar_object sobj;
@@ -842,7 +862,7 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
   }
   //loop over outer index
-  autoView( in_v , in, CpuRead);
+  auto in_v = in.View();
   thread_for(in_oidx,in_grid->oSites(),{
     //Assemble vector of pointers to output elements
     ExtractPointerArray<sobj> out_ptrs(in_nsimd);
@@ -935,7 +955,7 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
     icoor[lane].resize(ndim);
     grid->iCoorFromIindex(icoor[lane],lane);
   }
-  autoView( out_v , out, CpuWrite);
+  auto out_v = out.View();
   thread_for(oidx, grid->oSites(),{
     //Assemble vector of pointers to output elements
     ExtractPointerArray<sobj> ptrs(nsimd);
@@ -1010,95 +1030,53 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
   });
 }
-//The workspace for a precision change operation allowing for the reuse of the mapping to save time on subsequent calls
-class precisionChangeWorkspace{
-  std::pair<Integer,Integer>* fmap_device; //device pointer
-public:
-  precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid){
-    //Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device
-    assert(out_grid->Nd() == in_grid->Nd());
-    for(int d=0;d<out_grid->Nd();d++){
-      assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]);
-    }
-    int Nsimd_out = out_grid->Nsimd();
-    std::vector<Coordinate> out_icorrs(out_grid->Nsimd()); //reuse these
-    for(int lane=0; lane < out_grid->Nsimd(); lane++)
-      out_grid->iCoorFromIindex(out_icorrs[lane], lane);
-    std::vector<std::pair<Integer,Integer> > fmap_host(out_grid->lSites()); //lsites = osites*Nsimd
-    thread_for(out_oidx,out_grid->oSites(),{
-      Coordinate out_ocorr;
-      out_grid->oCoorFromOindex(out_ocorr, out_oidx);
-      Coordinate lcorr; //the local coordinate (common to both in and out as full coordinate)
-      for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
-        out_grid->InOutCoorToLocalCoor(out_ocorr, out_icorrs[out_lane], lcorr);
-        //int in_oidx = in_grid->oIndex(lcorr), in_lane = in_grid->iIndex(lcorr);
-        //Note oIndex and OcorrFromOindex (and same for iIndex) are not inverse for checkerboarded lattice, the former coordinates being defined on the full lattice and the latter on the reduced lattice
-        //Until this is fixed we need to circumvent the problem locally. Here I will use the coordinates defined on the reduced lattice for simplicity
-        int in_oidx = 0, in_lane = 0;
-        for(int d=0;d<in_grid->_ndimension;d++){
-          in_oidx += in_grid->_ostride[d] * ( lcorr[d] % in_grid->_rdimensions[d] );
-          in_lane += in_grid->_istride[d] * ( lcorr[d] / in_grid->_rdimensions[d] );
-        }
-        fmap_host[out_lane + Nsimd_out*out_oidx] = std::pair<Integer,Integer>( in_oidx, in_lane );
-      }
-    });
-    //Copy the map to the device (if we had a way to tell if an accelerator is in use we could avoid this copy for CPU-only machines)
-    size_t fmap_bytes = out_grid->lSites() * sizeof(std::pair<Integer,Integer>);
-    fmap_device = (std::pair<Integer,Integer>*)acceleratorAllocDevice(fmap_bytes);
-    acceleratorCopyToDevice(fmap_host.data(), fmap_device, fmap_bytes);
-  }
-  //Prevent moving or copying
-  precisionChangeWorkspace(const precisionChangeWorkspace &r) = delete;
-  precisionChangeWorkspace(precisionChangeWorkspace &&r) = delete;
-  precisionChangeWorkspace &operator=(const precisionChangeWorkspace &r) = delete;
-  precisionChangeWorkspace &operator=(precisionChangeWorkspace &&r) = delete;
-  std::pair<Integer,Integer> const* getMap() const{ return fmap_device; }
-  ~precisionChangeWorkspace(){
-    acceleratorFreeDevice(fmap_device);
-  }
-};
-//Convert a lattice of one precision to another. The input workspace contains the mapping data.
-template<class VobjOut, class VobjIn>
-void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const precisionChangeWorkspace &workspace){
-  static_assert( std::is_same<typename VobjOut::DoublePrecision, typename VobjIn::DoublePrecision>::value == 1, "copyLane: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
-  out.Checkerboard() = in.Checkerboard();
-  constexpr int Nsimd_out = VobjOut::Nsimd();
-  std::pair<Integer,Integer> const* fmap_device = workspace.getMap();
-  //Do the copy/precision change
-  autoView( out_v , out, AcceleratorWrite);
-  autoView( in_v  , in,  AcceleratorRead);
-  accelerator_for(out_oidx, out.Grid()->oSites(), 1,{
-    std::pair<Integer,Integer> const* fmap_osite = fmap_device + out_oidx*Nsimd_out;
-    for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
-      int in_oidx = fmap_osite[out_lane].first;
-      int in_lane = fmap_osite[out_lane].second;
-      copyLane(out_v[out_oidx], out_lane, in_v[in_oidx], in_lane);
-    }
-  });
-}
 //Convert a Lattice from one precision to another
-//Generate the workspace in place; if multiple calls with the same mapping are performed, consider pregenerating the workspace and reusing
 template<class VobjOut, class VobjIn>
-void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
-  precisionChangeWorkspace workspace(out.Grid(), in.Grid());
-  precisionChange(out, in, workspace);
-}
+void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
+{
+  assert(out.Grid()->Nd() == in.Grid()->Nd());
+  for(int d=0;d<out.Grid()->Nd();d++){
+    assert(out.Grid()->FullDimensions()[d] == in.Grid()->FullDimensions()[d]);
+  }
+  out.Checkerboard() = in.Checkerboard();
+  GridBase *in_grid=in.Grid();
+  GridBase *out_grid = out.Grid();
+  typedef typename VobjOut::scalar_object SobjOut;
+  typedef typename VobjIn::scalar_object SobjIn;
+  int ndim = out.Grid()->Nd();
+  int out_nsimd = out_grid->Nsimd();
+  std::vector<Coordinate > out_icoor(out_nsimd);
+  for(int lane=0; lane < out_nsimd; lane++){
+    out_icoor[lane].resize(ndim);
+    out_grid->iCoorFromIindex(out_icoor[lane], lane);
+  }
+  std::vector<SobjOut> in_slex_conv(in_grid->lSites());
+  unvectorizeToLexOrdArray(in_slex_conv, in);
+  auto out_v = out.View();
+  thread_for(out_oidx,out_grid->oSites(),{
+    Coordinate out_ocoor(ndim);
+    out_grid->oCoorFromOindex(out_ocoor, out_oidx);
+    ExtractPointerArray<SobjOut> ptrs(out_nsimd);
+    Coordinate lcoor(out_grid->Nd());
+    for(int lane=0; lane < out_nsimd; lane++){
+      for(int mu=0;mu<ndim;mu++)
+        lcoor[mu] = out_ocoor[mu] + out_grid->_rdimensions[mu]*out_icoor[lane][mu];
+      int llex; Lexicographic::IndexFromCoor(lcoor, llex, out_grid->_ldimensions);
+      ptrs[lane] = &in_slex_conv[llex];
+    }
+    merge(out_v[out_oidx], ptrs, 0);
+  });
+}
 ////////////////////////////////////////////////////////////////////////////////
 // Communicate between grids
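A short usage sketch of the workspace interface removed above, assuming double- and single-precision gauge fields U_d and U_f on grids with matching global dimensions (names are placeholders): the site/lane map is built once and reused across calls.

    precisionChangeWorkspace wk(U_f.Grid(), U_d.Grid());
    for(int k=0;k<Nsteps;k++){
      precisionChange(U_f, U_d, wk);   // no per-call map construction
    }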
View File
@@ -42,8 +42,8 @@ NAMESPACE_BEGIN(Grid);
 template<class vobj>
 inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
   Lattice<vobj> ret(lhs.Grid());
-  autoView( ret_v, ret, AcceleratorWrite);
-  autoView( lhs_v, lhs, AcceleratorRead);
+  auto ret_v = ret.View();
+  auto lhs_v = lhs.View();
   accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
     coalescedWrite(ret_v[ss], transpose(lhs_v(ss)));
   });
@@ -58,8 +58,8 @@ template<int Index,class vobj>
 inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(vobj()))>
 {
   Lattice<decltype(transposeIndex<Index>(vobj()))> ret(lhs.Grid());
-  autoView( ret_v, ret, AcceleratorWrite);
-  autoView( lhs_v, lhs, AcceleratorRead);
+  auto ret_v = ret.View();
+  auto lhs_v = lhs.View();
   accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
     coalescedWrite(ret_v[ss] , transposeIndex<Index>(lhs_v(ss)));
   });
View File
@@ -35,8 +35,8 @@ NAMESPACE_BEGIN(Grid);
 template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
   Lattice<obj> ret_i(rhs_i.Grid());
-  autoView( rhs, rhs_i, AcceleratorRead);
-  autoView( ret, ret_i, AcceleratorWrite);
+  auto rhs = rhs_i.View();
+  auto ret = ret_i.View();
   ret.Checkerboard() = rhs.Checkerboard();
   accelerator_for(ss,rhs.size(),1,{
     ret[ss]=pow(rhs[ss],y);
@@ -45,8 +45,8 @@ template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
 }
 template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
   Lattice<obj> ret_i(rhs_i.Grid());
-  autoView( rhs , rhs_i, AcceleratorRead);
-  autoView( ret , ret_i, AcceleratorWrite);
+  auto rhs = rhs_i.View();
+  auto ret = ret_i.View();
   ret.Checkerboard() = rhs.Checkerboard();
   accelerator_for(ss,rhs.size(),obj::Nsimd(),{
     coalescedWrite(ret[ss],mod(rhs(ss),y));
@@ -56,8 +56,8 @@ template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
 template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
   Lattice<obj> ret_i(rhs_i.Grid());
-  autoView( ret , ret_i, AcceleratorWrite);
-  autoView( rhs , rhs_i, AcceleratorRead);
+  auto ret = ret_i.View();
+  auto rhs = rhs_i.View();
   ret.Checkerboard() = rhs_i.Checkerboard();
   accelerator_for(ss,rhs.size(),obj::Nsimd(),{
     coalescedWrite(ret[ss],div(rhs(ss),y));
@@ -67,8 +67,8 @@ template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
 template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){
   Lattice<obj> ret_i(rhs_i.Grid());
-  autoView( rhs , rhs_i, AcceleratorRead);
-  autoView( ret , ret_i, AcceleratorWrite);
+  auto rhs = rhs_i.View();
+  auto ret = ret_i.View();
   ret.Checkerboard() = rhs.Checkerboard();
   accelerator_for(ss,rhs.size(),obj::Nsimd(),{
     coalescedWrite(ret[ss],Exponentiate(rhs(ss),alpha, Nexp));
View File
@@ -1,173 +0,0 @@
#pragma once
NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////////
// Base class which can be used by traits to pick up behaviour
///////////////////////////////////////////////////////////////////
class LatticeBase {};
/////////////////////////////////////////////////////////////////////////////////////////
// Conformable checks; same instance of Grid required
/////////////////////////////////////////////////////////////////////////////////////////
void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
{
assert(lhs == rhs);
}
////////////////////////////////////////////////////////////////////////////
// Minimal base class containing only data valid to access from accelerator
// _odata will be a managed pointer in CUDA
////////////////////////////////////////////////////////////////////////////
// Force access to lattice through a view object.
// prevents writing of code that will not offload to GPU, but perhaps annoyingly
// strict since host code could in principle access directly through the lattice object
// Need to decide programming model.
#define LATTICE_VIEW_STRICT
template<class vobj> class LatticeAccelerator : public LatticeBase
{
protected:
//public:
GridBase *_grid;
int checkerboard;
vobj *_odata; // A managed pointer
uint64_t _odata_size;
ViewAdvise advise;
public:
accelerator_inline LatticeAccelerator() : checkerboard(0), _odata(nullptr), _odata_size(0), _grid(nullptr), advise(AdviseDefault) { };
accelerator_inline uint64_t oSites(void) const { return _odata_size; };
accelerator_inline int Checkerboard(void) const { return checkerboard; };
accelerator_inline int &Checkerboard(void) { return this->checkerboard; }; // can assign checkerboard on a container, not a view
accelerator_inline ViewAdvise Advise(void) const { return advise; };
accelerator_inline ViewAdvise &Advise(void) { return this->advise; }; // can assign advise on a container, not a view
accelerator_inline void Conformable(GridBase * &grid) const
{
if (grid) conformable(grid, _grid);
else grid = _grid;
};
// Host only
GridBase * getGrid(void) const { return _grid; };
};
/////////////////////////////////////////////////////////////////////////////////////////
// A View class which provides accessor to the data.
// This will be safe to call from accelerator_for and is trivially copy constructible
// The copy constructor for this will need to be used by device lambda functions
/////////////////////////////////////////////////////////////////////////////////////////
template<class vobj>
class LatticeView : public LatticeAccelerator<vobj>
{
public:
// Rvalue
ViewMode mode;
void * cpu_ptr;
#ifdef GRID_SIMT
accelerator_inline const typename vobj::scalar_object operator()(size_t i) const {
return coalescedRead(this->_odata[i]);
}
#else
accelerator_inline const vobj & operator()(size_t i) const { return this->_odata[i]; }
#endif
#if 1
// accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
accelerator_inline vobj & operator[](size_t i) const { return this->_odata[i]; };
#else
accelerator_inline const vobj & operator[](size_t i) const { return this->_odata[i]; };
accelerator_inline vobj & operator[](size_t i) { return this->_odata[i]; };
#endif
accelerator_inline uint64_t begin(void) const { return 0;};
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
accelerator_inline uint64_t size(void) const { return this->_odata_size; };
LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me){}
LatticeView(const LatticeView<vobj> &refer_to_me) = default; // Trivially copyable
LatticeView(const LatticeAccelerator<vobj> &refer_to_me,ViewMode mode) : LatticeAccelerator<vobj> (refer_to_me)
{
this->ViewOpen(mode);
}
// Host functions
void ViewOpen(ViewMode mode)
{ // Translate the pointer, could save a copy. Could use a "Handle" and not save _odata originally in base
// std::cout << "View Open"<<std::hex<<this->_odata<<std::dec <<std::endl;
this->cpu_ptr = (void *)this->_odata;
this->mode = mode;
this->_odata =(vobj *)
MemoryManager::ViewOpen(this->cpu_ptr,
this->_odata_size*sizeof(vobj),
mode,
this->advise);
}
void ViewClose(void)
{ // Inform the manager
// std::cout << "View Close"<<std::hex<<this->cpu_ptr<<std::dec <<std::endl;
MemoryManager::ViewClose(this->cpu_ptr,this->mode);
}
};
// Little autoscope assister
template<class View>
class ViewCloser
{
View v; // Take a copy of view and call view close when I go out of scope automatically
public:
ViewCloser(View &_v) : v(_v) {};
~ViewCloser() { v.ViewClose(); }
};
#define autoView(l_v,l,mode) \
auto l_v = l.View(mode); \
ViewCloser<decltype(l_v)> _autoView##l_v(l_v);
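A minimal sketch of the idiom this macro enables, assuming two conformable fields x and y of vector object type vobj (names illustrative): both views stay open for the kernel and are closed automatically at end of scope by the ViewCloser.

    {
      autoView( x_v , x, AcceleratorRead);
      autoView( y_v , y, AcceleratorWrite);
      accelerator_for(ss, x_v.size(), vobj::Nsimd(), {
        coalescedWrite(y_v[ss], x_v(ss));   // coalesced per-lane access on GPU
      });
    } // x_v and y_v ViewClose here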
/////////////////////////////////////////////////////////////////////////////////////////
// Lattice expression types used by ET to assemble the AST
//
// Need to be able to detect code paths according to the whether a lattice object or not
// so introduce some trait type things
/////////////////////////////////////////////////////////////////////////////////////////
class LatticeExpressionBase {};
template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
template<class T> struct ViewMapBase<T,true> { typedef LatticeView<typename T::vector_object> Type; };
template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
template <typename Op, typename _T1>
class LatticeUnaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
Op op;
T1 arg1;
LatticeUnaryExpression(Op _op,const _T1 &_arg1) : op(_op), arg1(_arg1) {};
};
template <typename Op, typename _T1, typename _T2>
class LatticeBinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
Op op;
T1 arg1;
T2 arg2;
LatticeBinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2) : op(_op), arg1(_arg1), arg2(_arg2) {};
};
template <typename Op, typename _T1, typename _T2, typename _T3>
class LatticeTrinaryExpression : public LatticeExpressionBase
{
public:
typedef typename ViewMap<_T1>::Type T1;
typedef typename ViewMap<_T2>::Type T2;
typedef typename ViewMap<_T3>::Type T3;
Op op;
T1 arg1;
T2 arg2;
T3 arg3;
LatticeTrinaryExpression(Op _op,const _T1 &_arg1,const _T2 &_arg2,const _T3 &_arg3) : op(_op), arg1(_arg1), arg2(_arg2), arg3(_arg3) {};
};
NAMESPACE_END(Grid);
View File
@@ -43,7 +43,7 @@ inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<
   conformable(iftrue,predicate);
   conformable(iftrue,ret);
-  GridBase *grid=iftrue.Grid();
+  GridBase *grid=iftrue._grid;
   typedef typename vobj::scalar_object scalar_object;
   typedef typename vobj::scalar_type scalar_type;
@@ -52,23 +52,22 @@ inline void whereWolf(Lattice<vobj> &ret,const Lattice<iobj> &predicate,Lattice<
   const int Nsimd = grid->Nsimd();
-  autoView(iftrue_v,iftrue,CpuRead);
-  autoView(iffalse_v,iffalse,CpuRead);
-  autoView(predicate_v,predicate,CpuRead);
-  autoView(ret_v,ret,CpuWrite);
-  Integer NN= grid->oSites();
-  thread_for(ss,NN,{
-    Integer mask;
-    scalar_object trueval;
-    scalar_object falseval;
-    for(int l=0;l<Nsimd;l++){
-      trueval =extractLane(l,iftrue_v[ss]);
-      falseval=extractLane(l,iffalse_v[ss]);
-      mask    =extractLane(l,predicate_v[ss]);
-      if (mask) falseval=trueval;
-      insertLane(l,ret_v[ss],falseval);
-    }
-  });
+  std::vector<Integer> mask(Nsimd);
+  std::vector<scalar_object> truevals (Nsimd);
+  std::vector<scalar_object> falsevals(Nsimd);
+  parallel_for(int ss=0;ss<iftrue._grid->oSites(); ss++){
+    extract(iftrue._odata[ss]   ,truevals);
+    extract(iffalse._odata[ss]  ,falsevals);
+    extract<vInteger,Integer>(TensorRemove(predicate._odata[ss]),mask);
+    for(int s=0;s<Nsimd;s++){
+      if (mask[s]) falsevals[s]=truevals[s];
+    }
+    merge(ret._odata[ss],falsevals);
+  }
 }
 template<class vobj,class iobj>
@@ -77,9 +76,9 @@ inline Lattice<vobj> whereWolf(const Lattice<iobj> &predicate,Lattice<vobj> &ift
   conformable(iftrue,iffalse);
   conformable(iftrue,predicate);
-  Lattice<vobj> ret(iftrue.Grid());
-  whereWolf(ret,predicate,iftrue,iffalse);
+  Lattice<vobj> ret(iftrue._grid);
+  where(ret,predicate,iftrue,iffalse);
   return ret;
 }
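A usage sketch of the site-wise selection this implements, with placeholder fields (sel is nonzero at the sites where the first branch should be taken):

    LatticeInteger sel(grid);
    LatticeFermion r(grid), a(grid), b(grid);
    r = where(sel, a, b);   // per site: r = sel ? a : b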
View File
@@ -69,7 +69,6 @@ GridLogger GridLogDebug (1, "Debug", GridLogColours, "PURPLE");
 GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN");
 GridLogger GridLogIterative (1, "Iterative", GridLogColours, "BLUE");
 GridLogger GridLogIntegrator (1, "Integrator", GridLogColours, "BLUE");
-GridLogger GridLogHMC        (1, "HMC", GridLogColours, "BLUE");
 void GridLogConfigure(std::vector<std::string> &logstreams) {
   GridLogError.Active(0);
@@ -80,7 +79,6 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
   GridLogPerformance.Active(0);
   GridLogIntegrator.Active(1);
   GridLogColours.Active(0);
-  GridLogHMC.Active(1);
   for (int i = 0; i < logstreams.size(); i++) {
     if (logstreams[i] == std::string("Error")) GridLogError.Active(1);
@@ -89,8 +87,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
     if (logstreams[i] == std::string("Iterative")) GridLogIterative.Active(1);
     if (logstreams[i] == std::string("Debug")) GridLogDebug.Active(1);
     if (logstreams[i] == std::string("Performance")) GridLogPerformance.Active(1);
-    if (logstreams[i] == std::string("NoIntegrator")) GridLogIntegrator.Active(0);
-    if (logstreams[i] == std::string("NoHMC")) GridLogHMC.Active(0);
+    if (logstreams[i] == std::string("Integrator")) GridLogIntegrator.Active(1);
     if (logstreams[i] == std::string("Colours")) GridLogColours.Active(1);
   }
 }
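On the left-hand side the Integrator and HMC streams default to on and are disabled by opt-out tokens; a configuration sketch (the stream list contents are illustrative):

    std::vector<std::string> streams = { "Error", "Warning", "Message",
                                         "NoIntegrator" };  // mute Integrator
    GridLogConfigure(streams);  // GridLogHMC remains active by default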

View File

@@ -130,8 +130,6 @@ public:
   friend std::ostream& operator<< (std::ostream& stream, Logger& log){
     if ( log.active ) {
-      std::ios_base::fmtflags f(stream.flags());
       stream << log.background()<< std::left;
       if (log.topWidth > 0)
       {
@@ -154,8 +152,6 @@ public:
              << now << log.background() << " : " ;
       }
       stream << log.colour();
-      stream.flags(f);
       return stream;
     } else {
       return devnull;
@@ -182,7 +178,6 @@ extern GridLogger GridLogDebug ;
 extern GridLogger GridLogPerformance;
 extern GridLogger GridLogIterative ;
 extern GridLogger GridLogIntegrator ;
-extern GridLogger GridLogHMC;
 extern Colours GridLogColours;
 std::string demangle(const char* name) ;
View File
@@ -1,4 +1,3 @@
 #include <Grid/GridCore.h>
 int Grid::BinaryIO::latticeWriteMaxRetry = -1;
-Grid::BinaryIO::IoPerf Grid::BinaryIO::lastPerf;
View File
@@ -79,13 +79,6 @@ inline void removeWhitespace(std::string &key)
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 class BinaryIO {
  public:
-  struct IoPerf
-  {
-    uint64_t size{0},time{0};
-    double   mbytesPerSecond{0.};
-  };
-  static IoPerf lastPerf;
   static int latticeWriteMaxRetry;
   /////////////////////////////////////////////////////////////////////////////
@@ -509,15 +502,12 @@ class BinaryIO {
     timer.Stop();
   }
-  lastPerf.size = sizeof(fobj)*iodata.size()*nrank;
-  lastPerf.time = timer.useconds();
-  lastPerf.mbytesPerSecond = lastPerf.size/1024./1024./(lastPerf.time/1.0e6);
   std::cout<<GridLogMessage<<"IOobject: ";
   if ( control & BINARYIO_READ) std::cout << " read  ";
   else                          std::cout << " write ";
   uint64_t bytes = sizeof(fobj)*iodata.size()*nrank;
-  std::cout<< lastPerf.size <<" bytes in "<< timer.Elapsed() <<" "
-           << lastPerf.mbytesPerSecond <<" MB/s "<<std::endl;
+  std::cout<< bytes <<" bytes in "<<timer.Elapsed() <<" "
+           << (double)bytes/ (double)timer.useconds() <<" MB/s "<<std::endl;
   std::cout<<GridLogMessage<<"IOobject: endian and checksum overhead "<<bstimer.Elapsed()  <<std::endl;
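The removed IoPerf record is queryable after any BinaryIO-backed transfer; a sketch (the write call and field names are illustrative). Note the stored rate divides bytes by 1024^2 and microseconds by 1.0e6, i.e. it is really MiB/s:

    NerscIO::writeConfiguration(Umu, file, 0, 1);
    const BinaryIO::IoPerf &p = BinaryIO::lastPerf;
    std::cout << p.size << " bytes in " << p.time << " us, "
              << p.mbytesPerSecond << " MB/s" << std::endl;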
@@ -673,15 +663,10 @@ class BinaryIO {
 	     nersc_csum,scidac_csuma,scidac_csumb);
     timer.Start();
-    thread_for(lidx,lsites,{  // FIX ME, suboptimal implementation
+    thread_for(lidx,lsites,{
       std::vector<RngStateType> tmp(RngStateCount);
       std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
-      Coordinate lcoor;
-      grid->LocalIndexToLocalCoor(lidx, lcoor);
-      int o_idx=grid->oIndex(lcoor);
-      int i_idx=grid->iIndex(lcoor);
-      int gidx=parallel_rng.generator_idx(o_idx,i_idx);
-      parallel_rng.SetState(tmp,gidx);
+      parallel_rng.SetState(tmp,lidx);
     });
     timer.Stop();
@@ -738,12 +723,7 @@ class BinaryIO {
     std::vector<RNGstate> iodata(lsites);
     thread_for(lidx,lsites,{
       std::vector<RngStateType> tmp(RngStateCount);
-      Coordinate lcoor;
-      grid->LocalIndexToLocalCoor(lidx, lcoor);
-      int o_idx=grid->oIndex(lcoor);
-      int i_idx=grid->iIndex(lcoor);
-      int gidx=parallel_rng.generator_idx(o_idx,i_idx);
-      parallel_rng.GetState(tmp,gidx);
+      parallel_rng.GetState(tmp,lidx);
       std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
     });
     timer.Stop();
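The left-hand change routes the lexicographic local site index through the RNG's own generator ordering before get/set; the translation, pulled out as a hypothetical helper for clarity:

    int rngGeneratorIndex(GridBase *grid, GridParallelRNG &prng, int lidx){
      Coordinate lcoor;
      grid->LocalIndexToLocalCoor(lidx, lcoor);        // lexicographic -> coordinate
      return prng.generator_idx(grid->oIndex(lcoor),   // outer (site) index
                                grid->iIndex(lcoor));  // inner (SIMD lane) index
    }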
View File
@@ -123,7 +123,7 @@ assert(GRID_FIELD_NORM_CALC(FieldNormMetaData_, n2ck) < 1.0e-5);
 ////////////////////////////////////////////////////////////
 // Helper to fill out metadata
 ////////////////////////////////////////////////////////////
 template<class vobj> void ScidacMetaData(Lattice<vobj> & field,
                                          FieldMetaData &header,
                                          scidacRecord & _scidacRecord,
                                          scidacFile   & _scidacFile)
@@ -619,12 +619,12 @@ class IldgWriter : public ScidacWriter {
   // Don't require scidac records EXCEPT checksum
   // Use Grid MetaData object if present.
   ////////////////////////////////////////////////////////////////
-  template <class stats = PeriodicGaugeStatistics>
-  void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,int sequence,std::string LFN,std::string description)
+  template <class vsimd>
+  void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,int sequence,std::string LFN,std::string description)
   {
     GridBase * grid = Umu.Grid();
-    typedef Lattice<vLorentzColourMatrixD> GaugeField;
-    typedef vLorentzColourMatrixD vobj;
+    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
+    typedef iLorentzColourMatrix<vsimd> vobj;
     typedef typename vobj::scalar_object sobj;
     ////////////////////////////////////////
@@ -636,9 +636,6 @@ class IldgWriter : public ScidacWriter {
     ScidacMetaData(Umu,header,_scidacRecord,_scidacFile);
-    stats Stats;
-    Stats(Umu,header);
     std::string format = header.floating_point;
     header.ensemble_id    = description;
     header.ensemble_label = description;
@@ -708,10 +705,10 @@ class IldgReader : public GridLimeReader {
   // Else use ILDG MetaData object if present.
   // Else use SciDAC MetaData object if present.
   ////////////////////////////////////////////////////////////////
-  template <class stats = PeriodicGaugeStatistics>
-  void readConfiguration(Lattice<vLorentzColourMatrixD> &Umu, FieldMetaData &FieldMetaData_) {
-    typedef Lattice<vLorentzColourMatrixD > GaugeField;
+  template <class vsimd>
+  void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu, FieldMetaData &FieldMetaData_) {
+    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
     typedef typename GaugeField::vector_object vobj;
     typedef typename vobj::scalar_object sobj;
@@ -924,8 +921,7 @@ class IldgReader : public GridLimeReader {
     if ( found_FieldMetaData || found_usqcdInfo ) {
       FieldMetaData checker;
-      stats Stats;
-      Stats(Umu,checker);
+      GaugeStatistics(Umu,checker);
       assert(fabs(checker.plaquette  - FieldMetaData_.plaquette )<1.0e-5);
       assert(fabs(checker.link_trace - FieldMetaData_.link_trace)<1.0e-5);
       std::cout << GridLogMessage<<"Plaquette and link trace match " << std::endl;
View File
@@ -128,7 +128,7 @@ inline void MachineCharacteristics(FieldMetaData &header)
   std::time_t t = std::time(nullptr);
   std::tm tm_ = *std::localtime(&t);
   std::ostringstream oss;
-  oss << std::put_time(&tm_, "%c %Z");
+  // oss << std::put_time(&tm_, "%c %Z");
   header.creation_date = oss.str();
   header.archive_date  = header.creation_date;
@@ -176,18 +176,29 @@ template<class vobj> inline void PrepareMetaData(Lattice<vobj> & field, FieldMet
   GridMetaData(grid,header);
   MachineCharacteristics(header);
 }
-template<class Impl>
-class GaugeStatistics
-{
-public:
-  void operator()(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
-  {
-    header.link_trace=WilsonLoops<Impl>::linkTrace(data);
-    header.plaquette =WilsonLoops<Impl>::avgPlaquette(data);
-  }
-};
-typedef GaugeStatistics<PeriodicGimplD> PeriodicGaugeStatistics;
-typedef GaugeStatistics<ConjugateGimplD> ConjugateGaugeStatistics;
+inline void GaugeStatistics(Lattice<vLorentzColourMatrixF> & data,FieldMetaData &header)
+{
+  // How to convert data precision etc...
+  header.link_trace=WilsonLoops<PeriodicGimplF>::linkTrace(data);
+  header.plaquette =WilsonLoops<PeriodicGimplF>::avgPlaquette(data);
+}
+inline void GaugeStatistics(Lattice<vLorentzColourMatrixD> & data,FieldMetaData &header)
+{
+  // How to convert data precision etc...
+  header.link_trace=WilsonLoops<PeriodicGimplD>::linkTrace(data);
+  header.plaquette =WilsonLoops<PeriodicGimplD>::avgPlaquette(data);
+}
+template<> inline void PrepareMetaData<vLorentzColourMatrixF>(Lattice<vLorentzColourMatrixF> & field, FieldMetaData &header)
+{
+  GridBase *grid = field.Grid();
+  std::string format = getFormatString<vLorentzColourMatrixF>();
+  header.floating_point = format;
+  header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
+  GridMetaData(grid,header);
+  GaugeStatistics(field,header);
+  MachineCharacteristics(header);
+}
 template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzColourMatrixD> & field, FieldMetaData &header)
 {
@@ -195,6 +206,7 @@ template<> inline void PrepareMetaData<vLorentzColourMatrixD>(Lattice<vLorentzCo
   header.floating_point = format;
   header.checksum = 0x0; // Nersc checksum unused in ILDG, Scidac
   GridMetaData(grid,header);
+  GaugeStatistics(field,header);
   MachineCharacteristics(header);
 }
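Usage sketch of the left-hand functor form (the gauge field name is illustrative):

    FieldMetaData header;
    PeriodicGaugeStatistics Stats;   // or ConjugateGaugeStatistics for C* boundaries
    Stats(Umu, header);              // fills header.link_trace and header.plaquette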
View File
@@ -39,10 +39,6 @@ using namespace Grid;
 ////////////////////////////////////////////////////////////////////////////////
 class NerscIO : public BinaryIO {
 public:
-  typedef Lattice<vLorentzColourMatrixD> GaugeField;
-  // Enable/disable exiting if the plaquette in the header does not match the value computed (default true)
-  static bool & exitOnReadPlaquetteMismatch(){ static bool v=true; return v; }
   static inline void truncate(std::string file){
     std::ofstream fout(file,std::ios::out);
@@ -133,12 +129,12 @@
   // Now the meat: the object readers
   /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-  template<class GaugeStats=PeriodicGaugeStatistics>
-  static inline void readConfiguration(GaugeField &Umu,
+  template<class vsimd>
+  static inline void readConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
                                        FieldMetaData& header,
-                                       std::string file,
-                                       GaugeStats GaugeStatisticsCalculator=GaugeStats())
+                                       std::string file)
   {
+    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
     GridBase *grid = Umu.Grid();
     uint64_t offset = readHeader(file,Umu.Grid(),header);
@@ -157,23 +153,23 @@
     // munger is a function of <floating point, Real, data_type>
     if ( header.data_type == std::string("4D_SU3_GAUGE") ) {
       if ( ieee32 || ieee32big ) {
-	BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3F>
+	BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3F>
 	  (Umu,file,Gauge3x2munger<LorentzColour2x3F,LorentzColourMatrix>(), offset,format,
 	   nersc_csum,scidac_csuma,scidac_csumb);
       }
       if ( ieee64 || ieee64big ) {
-	BinaryIO::readLatticeObject<vLorentzColourMatrixD, LorentzColour2x3D>
+	BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>, LorentzColour2x3D>
 	  (Umu,file,Gauge3x2munger<LorentzColour2x3D,LorentzColourMatrix>(),offset,format,
 	   nersc_csum,scidac_csuma,scidac_csumb);
       }
     } else if ( header.data_type == std::string("4D_SU3_GAUGE_3x3") ) {
       if ( ieee32 || ieee32big ) {
-	BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixF>
+	BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixF>
 	  (Umu,file,GaugeSimpleMunger<LorentzColourMatrixF,LorentzColourMatrix>(),offset,format,
 	   nersc_csum,scidac_csuma,scidac_csumb);
       }
       if ( ieee64 || ieee64big ) {
-	BinaryIO::readLatticeObject<vLorentzColourMatrixD,LorentzColourMatrixD>
+	BinaryIO::readLatticeObject<iLorentzColourMatrix<vsimd>,LorentzColourMatrixD>
 	  (Umu,file,GaugeSimpleMunger<LorentzColourMatrixD,LorentzColourMatrix>(),offset,format,
 	   nersc_csum,scidac_csuma,scidac_csumb);
       }
@@ -181,7 +177,7 @@
       assert(0);
     }
-    GaugeStats Stats; Stats(Umu,clone);
+    GaugeStatistics(Umu,clone);
     std::cout<<GridLogMessage <<"NERSC Configuration "<<file<<" checksum "<<std::hex<<nersc_csum<< std::dec
 	     <<" header   "<<std::hex<<header.checksum<<std::dec <<std::endl;
@@ -200,29 +196,22 @@
       std::cerr << " nersc_csum  " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
       exit(0);
     }
-    if(exitOnReadPlaquetteMismatch()) assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
+    assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
     assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
     assert(nersc_csum == header.checksum );
     std::cout<<GridLogMessage <<"NERSC Configuration "<<file<< " and plaquette, link trace, and checksum agree"<<std::endl;
   }
-  // Preferred interface
-  template<class GaugeStats=PeriodicGaugeStatistics>
-  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
-					std::string file,
-					std::string ens_label = std::string("DWF"))
-  {
-    writeConfiguration(Umu,file,0,1,ens_label);
-  }
-  template<class GaugeStats=PeriodicGaugeStatistics>
-  static inline void writeConfiguration(Lattice<vLorentzColourMatrixD > &Umu,
+  template<class vsimd>
+  static inline void writeConfiguration(Lattice<iLorentzColourMatrix<vsimd> > &Umu,
 					std::string file,
 					int two_row,
-					int bits32,
-					std::string ens_label = std::string("DWF"))
+					int bits32)
   {
-    typedef vLorentzColourMatrixD vobj;
+    typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
+    typedef iLorentzColourMatrix<vsimd> vobj;
     typedef typename vobj::scalar_object sobj;
     FieldMetaData header;
@@ -230,8 +219,8 @@
     // Following should become arguments
     ///////////////////////////////////////////
     header.sequence_number = 1;
-    header.ensemble_id     = std::string("UKQCD");
-    header.ensemble_label  = ens_label;
+    header.ensemble_id     = "UKQCD";
+    header.ensemble_label  = "DWF";
     typedef LorentzColourMatrixD fobj3D;
     typedef LorentzColour2x3D    fobj2D;
@@ -240,28 +229,28 @@
     GridMetaData(grid,header);
     assert(header.nd==4);
-    GaugeStats Stats; Stats(Umu,header);
+    GaugeStatistics(Umu,header);
     MachineCharacteristics(header);
     uint64_t offset;
     // Sod it -- always write 3x3 double
     header.floating_point = std::string("IEEE64BIG");
     header.data_type      = std::string("4D_SU3_GAUGE_3x3");
     GaugeSimpleUnmunger<fobj3D,sobj> munge;
     if ( grid->IsBoss() ) {
       truncate(file);
       offset = writeHeader(header,file);
     }
     grid->Broadcast(0,(void *)&offset,sizeof(offset));
     uint32_t nersc_csum,scidac_csuma,scidac_csumb;
     BinaryIO::writeLatticeObject<vobj,fobj3D>(Umu,file,munge,offset,header.floating_point,
					       nersc_csum,scidac_csuma,scidac_csumb);
     header.checksum = nersc_csum;
     if ( grid->IsBoss() ) {
       writeHeader(header,file);
     }
     std::cout<<GridLogMessage <<"Written NERSC Configuration on "<< file << " checksum "
	      <<std::hex<<header.checksum
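A short sketch of the left-hand NerscIO interface (paths and field are placeholders):

    LatticeGaugeFieldD Umu(grid);
    FieldMetaData header;
    NerscIO::exitOnReadPlaquetteMismatch() = false;  // skip the plaquette assertion on read
    NerscIO::readConfiguration(Umu, header, "ckpoint_lat.1000");
    // preferred writer: forwards with two_row=0, bits32=1, settable ensemble label
    NerscIO::writeConfiguration(Umu, "ckpoint_lat.1001", "DWF");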
View File
@@ -154,7 +154,7 @@ public:
     grid->Barrier(); timer.Stop();
     std::cout << Grid::GridLogMessage << "OpenQcdIO::readConfiguration: redistribute overhead " << timer.Elapsed() << std::endl;
-    PeriodicGaugeStatistics Stats; Stats(Umu, clone);
+    GaugeStatistics(Umu, clone);
     RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
View File
@@ -208,7 +208,7 @@ public:
     FieldMetaData clone(header);
-    PeriodicGaugeStatistics Stats; Stats(Umu, clone);
+    GaugeStatistics(Umu, clone);
     RealD plaq_diff = fabs(clone.plaquette - header.plaquette);
View File
@@ -44,7 +44,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #include <sys/syscall.h>
 #endif
 #ifdef __x86_64__
-#ifdef GRID_CUDA
+#ifdef GRID_NVCC
 accelerator_inline uint64_t __rdtsc(void)  { return 0; }
 accelerator_inline uint64_t __rdpmc(int )  { return 0; }
 #else
@@ -112,6 +112,7 @@ class PerformanceCounter {
 private:
   typedef struct {
+  public:
     uint32_t type;
     uint64_t config;
     const char *name;
View File
@@ -12773,7 +12773,7 @@ namespace pugi
 #undef PUGI__THROW_ERROR
 #undef PUGI__CHECK_ERROR
-#ifdef GRID_CUDA
+#ifdef GRID_NVCC
 #pragma pop
 #endif
View File
@@ -47,7 +47,7 @@ static constexpr int Ym = 5;
 static constexpr int Zm = 6;
 static constexpr int Tm = 7;
-static constexpr int Nc=Config_Nc;
+static constexpr int Nc=3;
 static constexpr int Ns=4;
 static constexpr int Nd=4;
 static constexpr int Nhs=2; // half spinor
@@ -63,7 +63,6 @@ static constexpr int Ngp=2; // gparity index range
 #define ColourIndex  (2)
 #define SpinIndex    (1)
 #define LorentzIndex (0)
-#define GparityFlavourIndex (0)
 // Also should make these a named enum type
 static constexpr int DaggerNo=0;
@@ -81,15 +80,6 @@
 template <typename T> using IfSpinor    = Invoke<std::enable_if< isSpinor<T>::value,int> > ;
 template <typename T> using IfNotSpinor = Invoke<std::enable_if<!isSpinor<T>::value,int> > ;
-const int CoarseIndex = 4;
-template<typename T> struct isCoarsened {
-  static constexpr bool value = (CoarseIndex<=T::TensorLevel);
-};
-template <typename T> using IfCoarsened    = Invoke<std::enable_if< isCoarsened<T>::value,int> > ;
-template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ;
-const int GparityFlavourTensorIndex = 3; //TensorLevel counts from the bottom!
 // ChrisK very keen to add extra space for Gparity doubling.
 //
 // Also add domain wall index, in a way where Wilson operator
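For reference, the kind of overload dispatch the removed coarsening trait enables (function names hypothetical):

    template<class vobj, IfCoarsened<vobj> = 0>
    void projectKernel(const Lattice<vobj> &in);    // coarse-tensor path
    template<class vobj, IfNotCoarsened<vobj> = 0>
    void projectKernel(const Lattice<vobj> &in);    // fine-tensor path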
@@ -104,7 +94,6 @@ template<typename vtype> using iSpinMatrix = iScalar<iMatrix<iSca
 template<typename vtype> using iColourMatrix            = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
 template<typename vtype> using iSpinColourMatrix        = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
 template<typename vtype> using iLorentzColourMatrix     = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
-template<typename vtype> using iLorentzVector           = iVector<iScalar<iScalar<vtype> >, Nd > ;
 template<typename vtype> using iDoubleStoredColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
 template<typename vtype> using iSpinVector              = iScalar<iVector<iScalar<vtype>, Ns> >;
 template<typename vtype> using iColourVector            = iScalar<iScalar<iVector<vtype, Nc> > >;
@@ -114,10 +103,8 @@ template<typename vtype> using iHalfSpinColourVector = iScalar<iVector<iVec
 template<typename vtype> using iSpinColourSpinColourMatrix = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >;
-template<typename vtype> using iGparityFlavourVector     = iVector<iScalar<iScalar<vtype> >, Ngp>;
 template<typename vtype> using iGparitySpinColourVector  = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
 template<typename vtype> using iGparityHalfSpinColourVector = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
-template<typename vtype> using iGparityFlavourMatrix     = iMatrix<iScalar<iScalar<vtype> >, Ngp>;
 // Spin matrix
 typedef iSpinMatrix<Complex >         SpinMatrix;
@@ -164,16 +151,7 @@ typedef iSpinColourSpinColourMatrix<vComplex > vSpinColourSpinColourMatrix;
 typedef iSpinColourSpinColourMatrix<vComplexF> vSpinColourSpinColourMatrixF;
 typedef iSpinColourSpinColourMatrix<vComplexD> vSpinColourSpinColourMatrixD;
-// LorentzVector
-typedef iLorentzVector<Complex  > LorentzVector;
-typedef iLorentzVector<ComplexF > LorentzVectorF;
-typedef iLorentzVector<ComplexD > LorentzVectorD;
-typedef iLorentzVector<vComplex > vLorentzVector;
-typedef iLorentzVector<vComplexF> vLorentzVectorF;
-typedef iLorentzVector<vComplexD> vLorentzVectorD;
-// LorentzColourMatrix
+// LorentzColour
 typedef iLorentzColourMatrix<Complex  > LorentzColourMatrix;
 typedef iLorentzColourMatrix<ComplexF > LorentzColourMatrixF;
 typedef iLorentzColourMatrix<ComplexD > LorentzColourMatrixD;
@@ -191,16 +169,6 @@ typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix;
 typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF;
 typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD;
-//G-parity flavour matrix
-typedef iGparityFlavourMatrix<Complex>   GparityFlavourMatrix;
-typedef iGparityFlavourMatrix<ComplexF>  GparityFlavourMatrixF;
-typedef iGparityFlavourMatrix<ComplexD>  GparityFlavourMatrixD;
-typedef iGparityFlavourMatrix<vComplex>  vGparityFlavourMatrix;
-typedef iGparityFlavourMatrix<vComplexF> vGparityFlavourMatrixF;
-typedef iGparityFlavourMatrix<vComplexD> vGparityFlavourMatrixD;
 // Spin vector
 typedef iSpinVector<Complex > SpinVector;
 typedef iSpinVector<ComplexF> SpinVectorF;
@@ -245,16 +213,6 @@ typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD;
 typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
 typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF;
 typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;
-//G-parity flavour vector
-typedef iGparityFlavourVector<Complex >  GparityFlavourVector;
-typedef iGparityFlavourVector<ComplexF>  GparityFlavourVectorF;
-typedef iGparityFlavourVector<ComplexD>  GparityFlavourVectorD;
-typedef iGparityFlavourVector<vComplex > vGparityFlavourVector;
-typedef iGparityFlavourVector<vComplexF> vGparityFlavourVectorF;
-typedef iGparityFlavourVector<vComplexD> vGparityFlavourVectorD;
 // singlets
 typedef iSinglet<Complex >         TComplex;     // FIXME This is painful. Tensor singlet complex type.
@@ -298,10 +256,6 @@ typedef Lattice<vLorentzColourMatrix>  LatticeLorentzColourMatrix;
 typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF;
 typedef Lattice<vLorentzColourMatrixD> LatticeLorentzColourMatrixD;
-typedef Lattice<vLorentzVector>  LatticeLorentzVector;
-typedef Lattice<vLorentzVectorF> LatticeLorentzVectorF;
-typedef Lattice<vLorentzVectorD> LatticeLorentzVectorD;
 // DoubleStored gauge field
 typedef Lattice<vDoubleStoredColourMatrix>  LatticeDoubleStoredColourMatrix;
 typedef Lattice<vDoubleStoredColourMatrixF> LatticeDoubleStoredColourMatrixF;

@@ -30,7 +30,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
-#pragma once
+#ifndef GRID_QCD_ACTION_H
+#define GRID_QCD_ACTION_H

////////////////////////////////////////////
// Abstract base interface
@@ -50,4 +51,4 @@ NAMESPACE_CHECK(Fermion);
#include <Grid/qcd/action/pseudofermion/PseudoFermion.h>
NAMESPACE_CHECK(PseudoFermion);
+#endif

@@ -40,31 +40,8 @@ class Action
public:

  bool is_smeared = false;
-  RealD deriv_norm_sum;
-  RealD deriv_max_sum;
-  int   deriv_num;
-  RealD deriv_us;
-  RealD S_us;
-  RealD refresh_us;
-  void reset_timer(void) {
-    deriv_us = S_us = refresh_us = 0.0;
-    deriv_num=0;
-    deriv_norm_sum = deriv_max_sum=0.0;
-  }
-  void deriv_log(RealD nrm, RealD max) { deriv_max_sum+=max; deriv_norm_sum+=nrm; deriv_num++;}
-  RealD deriv_max_average(void)  { return deriv_max_sum/deriv_num; };
-  RealD deriv_norm_average(void) { return deriv_norm_sum/deriv_num; };
-  RealD deriv_timer(void)   { return deriv_us; };
-  RealD S_timer(void)       { return S_us; };
-  RealD refresh_timer(void) { return refresh_us; };
-  void deriv_timer_start(void)   { deriv_us-=usecond(); }
-  void deriv_timer_stop(void)    { deriv_us+=usecond(); }
-  void refresh_timer_start(void) { refresh_us-=usecond(); }
-  void refresh_timer_stop(void)  { refresh_us+=usecond(); }
-  void S_timer_start(void)       { S_us-=usecond(); }
-  void S_timer_stop(void)        { S_us+=usecond(); }

  // Heatbath?
-  virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
+  virtual void refresh(const GaugeField& U, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
  virtual RealD S(const GaugeField& U) = 0;                      // evaluate the action
  virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative
  virtual std::string action_name() = 0;                         // return the action name
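
The sign-flip timers removed above are a compact idiom worth a note: each `*_timer_start` subtracts the current time and the matching `*_timer_stop` adds it back, so the counter accumulates total elapsed time across repeated calls with no extra state. A standalone sketch of the same idiom, where `now_us()` is a hypothetical stand-in for Grid's `usecond()`:

#include <chrono>
#include <cstdio>

// Hypothetical stand-in for Grid's usecond(): wall clock in microseconds.
static double now_us() {
  using namespace std::chrono;
  return duration<double, std::micro>(steady_clock::now().time_since_epoch()).count();
}

int main() {
  double work_us = 0.0;
  for (int i = 0; i < 3; i++) {
    work_us -= now_us();                       // "timer_start": subtract the entry time
    volatile double x = 0.0;
    for (int j = 0; j < 1000000; j++) x += j;  // payload being timed
    work_us += now_us();                       // "timer_stop": add the exit time
  }
  printf("accumulated: %.1f us over 3 calls\n", work_us);
  return 0;
}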

@@ -58,8 +58,6 @@ NAMESPACE_CHECK(Scalar);
////////////////////////////////////////////
// Utility functions
////////////////////////////////////////////
-#include <Grid/qcd/action/domains/Domains.h>
#include <Grid/qcd/utils/Metric.h>
NAMESPACE_CHECK(Metric);
#include <Grid/qcd/utils/CovariantLaplacian.h>

@@ -36,34 +36,28 @@ NAMESPACE_BEGIN(Grid);

// These can move into a params header and be given MacroMagic serialisation
struct GparityWilsonImplParams {
-  Coordinate twists; //Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction.
-                     //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
-  bool locally_periodic;
-  GparityWilsonImplParams() : twists(Nd, 0), locally_periodic(false) {};
+  Coordinate twists;
+  GparityWilsonImplParams() : twists(Nd, 0) {};
};

struct WilsonImplParams {
  bool overlapCommsCompute;
-  bool locally_periodic;
  AcceleratorVector<Real,Nd> twist_n_2pi_L;
  AcceleratorVector<Complex,Nd> boundary_phases;
  WilsonImplParams() {
    boundary_phases.resize(Nd, 1.0);
    twist_n_2pi_L.resize(Nd, 0.0);
-    locally_periodic = false;
  };
  WilsonImplParams(const AcceleratorVector<Complex,Nd> phi) : boundary_phases(phi), overlapCommsCompute(false) {
    twist_n_2pi_L.resize(Nd, 0.0);
-    locally_periodic = false;
  }
};
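
`boundary_phases` defaults to 1.0 (periodic) in every direction. A hedged sketch of the common antiperiodic-time choice, using the phase constructor shown above and assuming the usual Grid convention that the last direction is time:

// Sketch: antiperiodic in time, periodic in space.
AcceleratorVector<Complex,Nd> phases(Nd, 1.0);
phases[Nd-1] = -1.0;              // links crossing the time boundary pick up a -1
WilsonImplParams params(phases);  // uses the phi-constructor above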
struct StaggeredImplParams {
-  bool locally_periodic;
-  StaggeredImplParams() : locally_periodic(false) {};
+  StaggeredImplParams() {};
};

struct OneFlavourRationalParams : Serializable {
  GRID_SERIALIZABLE_CLASS_MEMBERS(OneFlavourRationalParams,
                                  RealD, lo,
                                  RealD, hi,
@@ -91,50 +85,6 @@ struct OneFlavourRationalParams : Serializable {
    precision(_precision),
    BoundsCheckFreq(_BoundsCheckFreq){};
};
/*Action parameters for the generalized rational action
The approximation is for (M^dag M)^{1/inv_pow}
where inv_pow is the denominator of the fractional power.
Default inv_pow=2 for square root, making this equivalent to
the OneFlavourRational action
*/
struct RationalActionParams : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(RationalActionParams,
int, inv_pow,
RealD, lo, //low eigenvalue bound of rational approx
RealD, hi, //high eigenvalue bound of rational approx
int, MaxIter, //maximum iterations in msCG
RealD, action_tolerance, //msCG tolerance in action evaluation
int, action_degree, //rational approx tolerance in action evaluation
RealD, md_tolerance, //msCG tolerance in MD integration
int, md_degree, //rational approx tolerance in MD integration
int, precision, //precision of floating point arithmetic
int, BoundsCheckFreq); //frequency the approximation is tested (with Metropolis degree/tolerance); 0 disables the check
// constructor
RationalActionParams(int _inv_pow = 2,
RealD _lo = 0.0,
RealD _hi = 1.0,
int _maxit = 1000,
RealD _action_tolerance = 1.0e-8,
int _action_degree = 10,
RealD _md_tolerance = 1.0e-8,
int _md_degree = 10,
int _precision = 64,
int _BoundsCheckFreq=20)
: inv_pow(_inv_pow),
lo(_lo),
hi(_hi),
MaxIter(_maxit),
action_tolerance(_action_tolerance),
action_degree(_action_degree),
md_tolerance(_md_tolerance),
md_degree(_md_degree),
precision(_precision),
BoundsCheckFreq(_BoundsCheckFreq){};
};
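
The comment block above fixes the convention: the rational approximation targets (M^dag M)^{1/inv_pow}, so the default inv_pow=2 reproduces the square-root OneFlavourRational case. A hedged construction sketch, with illustrative values only, following the constructor order in the listing:

// Sketch: parameters for (M^dag M)^{1/4}, with a looser tolerance in the
// molecular-dynamics integration than in the action evaluation.
RationalActionParams params(
    4,        // inv_pow: approximate (M^dag M)^{1/4}
    1.0e-4,   // lo: lower eigenvalue bound of the approximation window
    64.0,     // hi: upper eigenvalue bound
    10000,    // MaxIter: multi-shift CG iteration cap
    1.0e-10,  // action_tolerance
    14,       // action_degree
    1.0e-8,   // md_tolerance
    10,       // md_degree
    64,       // precision of the Remez arithmetic
    20);      // BoundsCheckFreq: 0 disables the bounds check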
NAMESPACE_END(Grid);

@@ -1,52 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/hmc/DDHMC.h
Copyright (C) 2021
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christopher Kelly
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////
// DDHMC filter with sub-block size B[mu]
////////////////////////////////////////////////////
template<typename MomentaField>
struct DDHMCFilter: public MomentumFilterBase<MomentaField>
{
Coordinate Block;
int Width;
DDHMCFilter(const Coordinate &_Block): Block(_Block) {}
void applyFilter(MomentaField &P) const override
{
DomainDecomposition Domains(Block);
Domains.ProjectDDHMC(P);
}
};
NAMESPACE_END(Grid);
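
DDHMCFilter is the glue between the domain decomposition and the HMC momenta: applyFilter hands the momentum field to ProjectDDHMC, which zeroes or damps the components that would evolve the frozen boundary links. A hedged usage sketch, assuming the momenta live in a LatticeGaugeField as in Grid's gauge HMC:

// Sketch: freeze momenta on the boundaries of 4^4 sub-blocks.
Coordinate Block({4,4,4,4});                  // sub-block extent per direction; 0 disables a direction
DDHMCFilter<LatticeGaugeField> filter(Block);
filter.applyFilter(P);                        // P: conjugate momentum field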

@@ -1,98 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/momentum/DirichletFilter.h
Copyright (C) 2021
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
////////////////////////////////////////////////////
// Dirichlet filter with sub-block size B[mu]
////////////////////////////////////////////////////
#pragma once
#include <Grid/qcd/action/domains/DomainDecomposition.h>
NAMESPACE_BEGIN(Grid);
template<typename MomentaField>
struct DirichletFilter: public MomentumFilterBase<MomentaField>
{
Coordinate Block;
DirichletFilter(const Coordinate &_Block): Block(_Block) {}
// Edge detect using domain projectors
void applyFilter (MomentaField &U) const override
{
DomainDecomposition Domains(Block);
GridBase *grid = U.Grid();
LatticeInteger coor(grid);
LatticeInteger face(grid);
LatticeInteger one(grid); one = 1;
LatticeInteger zero(grid); zero = 0;
LatticeInteger omega(grid);
LatticeInteger omegabar(grid);
LatticeInteger tmp(grid);
omega=one; Domains.ProjectDomain(omega,0);
omegabar=one; Domains.ProjectDomain(omegabar,1);
LatticeInteger nface(grid); nface=Zero();
MomentaField projected(grid); projected=Zero();
typedef decltype(PeekIndex<LorentzIndex>(U,0)) MomentaLinkField;
MomentaLinkField Umu(grid);
MomentaLinkField zz(grid); zz=Zero();
int dims = grid->Nd();
Coordinate Global=grid->GlobalDimensions();
assert(dims==Nd);
for(int mu=0;mu<Nd;mu++){
if ( Block[mu]!=0 ) {
Umu = PeekIndex<LorentzIndex>(U,mu);
// Upper face
tmp = Cshift(omegabar,mu,1);
tmp = tmp + omega;
face = where(tmp == Integer(2),one,zero );
tmp = Cshift(omega,mu,1);
tmp = tmp + omegabar;
face = where(tmp == Integer(2),one,face );
Umu = where(face,zz,Umu);
PokeIndex<LorentzIndex>(U, Umu, mu);
}
}
}
};
NAMESPACE_END(Grid);
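
The face detection above is worth unpacking: `omega` and `omegabar` are 0/1 indicator fields for the two domains, so `Cshift(omegabar,mu,1) + omega == 2` flags exactly those sites whose forward mu-neighbour lies in the other domain, and the link leaving such a site is zeroed. A minimal 1-D standalone illustration (not Grid code; extents assumed):

#include <cstdio>
int main() {
  const int L = 8, B = 4;                    // lattice extent and sub-block size
  int omega[L], omegabar[L];
  for (int x = 0; x < L; x++) {
    int interior = (x % B != 0) && (x % B != B - 1);
    omega[x]    = interior;                  // domain 0 indicator
    omegabar[x] = !interior;                 // domain 1 indicator
  }
  for (int x = 0; x < L; x++) {
    int xp = (x + 1) % L;                    // forward neighbour (periodic)
    int face = (omegabar[xp] + omega[x] == 2) || (omega[xp] + omegabar[x] == 2);
    printf("link at x=%d: %s\n", x, face ? "zeroed" : "kept");
  }
  return 0;
}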

@@ -1,187 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/domains/DomainDecomposition.h
Copyright (C) 2021
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
////////////////////////////////////////////////////
// Domain decomposition with sub-block size B[mu]
////////////////////////////////////////////////////
#pragma once
NAMESPACE_BEGIN(Grid);
struct DomainDecomposition
{
Coordinate Block;
static constexpr RealD factor = 0.6;
DomainDecomposition(const Coordinate &_Block): Block(_Block){ assert(Block.size()==Nd);};
template<class Field>
void ProjectDomain(Field &f,Integer domain)
{
GridBase *grid = f.Grid();
int dims = grid->Nd();
int isDWF= (dims==Nd+1);
assert((dims==Nd)||(dims==Nd+1));
Field zz(grid); zz = Zero();
LatticeInteger coor(grid);
LatticeInteger domaincoor(grid);
LatticeInteger mask(grid); mask = Integer(1);
LatticeInteger zi(grid); zi = Integer(0);
for(int d=0;d<Nd;d++){
Integer B= Block[d];
if ( B ) {
LatticeCoordinate(coor,d+isDWF);
domaincoor = mod(coor,B);
mask = where(domaincoor==Integer(0),zi,mask);
mask = where(domaincoor==Integer(B-1),zi,mask);
}
}
if ( !domain )
f = where(mask==Integer(1),f,zz);
else
f = where(mask==Integer(0),f,zz);
};
template<class GaugeField>
void ProjectDDHMC(GaugeField &U)
{
GridBase *grid = U.Grid();
Coordinate Global=grid->GlobalDimensions();
GaugeField zzz(grid); zzz = Zero();
LatticeInteger coor(grid);
GaugeField Uorg(grid); Uorg = U;
auto zzz_mu = PeekIndex<LorentzIndex>(zzz,0);
////////////////////////////////////////////////////
// Zero BDY layers
////////////////////////////////////////////////////
for(int mu=0;mu<Nd;mu++) {
Integer B1 = Block[mu];
if ( B1 && (B1 <= Global[mu]) ) {
LatticeCoordinate(coor,mu);
////////////////////////////////
// OmegaBar - zero all links contained in slice B-1,0 and
// mu links connecting to Omega
////////////////////////////////
U = where(mod(coor,B1)==Integer(B1-1),zzz,U);
U = where(mod(coor,B1)==Integer(0) ,zzz,U);
auto U_mu = PeekIndex<LorentzIndex>(U,mu);
U_mu = where(mod(coor,B1)==Integer(B1-2),zzz_mu,U_mu);
PokeIndex<LorentzIndex>(U, U_mu, mu);
}
}
////////////////////////////////////////////
// Omega interior slow the evolution
// Tricky as we need to take the smallest of values imposed by each cut
// Do them in order or largest to smallest and smallest writes last
////////////////////////////////////////////
RealD f= factor;
#if 0
for(int mu=0;mu<Nd;mu++) {
Integer B1 = Block[mu];
if ( B1 && (B1 <= Global[mu]) ) {
auto U_mu = PeekIndex<LorentzIndex>(U,mu);
auto Uorg_mu= PeekIndex<LorentzIndex>(Uorg,mu);
// In the plane
U = where(mod(coor,B1)==Integer(B1-5),Uorg*f,U);
U = where(mod(coor,B1)==Integer(4) ,Uorg*f,U);
// Perp links
U_mu = where(mod(coor,B1)==Integer(B1-6),Uorg_mu*f,U_mu);
U_mu = where(mod(coor,B1)==Integer(4) ,Uorg_mu*f,U_mu);
PokeIndex<LorentzIndex>(U, U_mu, mu);
}
}
#endif
for(int mu=0;mu<Nd;mu++) {
Integer B1 = Block[mu];
if ( B1 && (B1 <= Global[mu]) ) {
auto U_mu = PeekIndex<LorentzIndex>(U,mu);
auto Uorg_mu= PeekIndex<LorentzIndex>(Uorg,mu);
// In the plane
U = where(mod(coor,B1)==Integer(B1-4),Uorg*f*f,U);
U = where(mod(coor,B1)==Integer(3) ,Uorg*f*f,U);
// Perp links
U_mu = where(mod(coor,B1)==Integer(B1-5),Uorg_mu*f*f,U_mu);
U_mu = where(mod(coor,B1)==Integer(3) ,Uorg_mu*f*f,U_mu);
PokeIndex<LorentzIndex>(U, U_mu, mu);
}
}
for(int mu=0;mu<Nd;mu++) {
Integer B1 = Block[mu];
if ( B1 && (B1 <= Global[mu]) ) {
auto U_mu = PeekIndex<LorentzIndex>(U,mu);
auto Uorg_mu= PeekIndex<LorentzIndex>(Uorg,mu);
// In the plane
U = where(mod(coor,B1)==Integer(B1-3),Uorg*f*f*f,U);
U = where(mod(coor,B1)==Integer(2) ,Uorg*f*f*f,U);
// Perp links
U_mu = where(mod(coor,B1)==Integer(B1-4),Uorg_mu*f*f*f,U_mu);
U_mu = where(mod(coor,B1)==Integer(2) ,Uorg_mu*f*f*f,U_mu);
PokeIndex<LorentzIndex>(U, U_mu, mu);
}
}
for(int mu=0;mu<Nd;mu++) {
Integer B1 = Block[mu];
if ( B1 && (B1 <= Global[mu]) ) {
auto U_mu = PeekIndex<LorentzIndex>(U,mu);
auto Uorg_mu= PeekIndex<LorentzIndex>(Uorg,mu);
// In the plane
U = where(mod(coor,B1)==Integer(B1-2),zzz,U);
U = where(mod(coor,B1)==Integer(1) ,zzz,U);
// Perp links
U_mu = where(mod(coor,B1)==Integer(B1-3),Uorg_mu*f*f*f*f,U_mu);
U_mu = where(mod(coor,B1)==Integer(1) ,Uorg_mu*f*f*f*f,U_mu);
PokeIndex<LorentzIndex>(U, U_mu, mu);
}
}
}
};
NAMESPACE_END(Grid);
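
Read together, the staged loops in ProjectDDHMC implement a graded slow-down of the link evolution approaching each block cut. Since the loops run from largest to smallest weight and the smallest writes last, one reading of the active code (the #if 0 block excluded), with f = factor = 0.6 and layers counted modulo B, gives the net multipliers:

  layer (mod B):   0    1     2     3    interior   B-5   B-4   B-3   B-2   B-1
  in-plane links:  0    0    f^3   f^2      1        1    f^2   f^3    0     0
  links along mu:  0   f^4   f^3   f^2      1       f^2   f^3   f^4    0     0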

@@ -1,39 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/momentum/Domains.h
Copyright (C) 2021
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
////////////////////////////////////////////////////
// Dirichlet filter with sub-block size B[mu]
////////////////////////////////////////////////////
#pragma once
#include <Grid/qcd/action/domains/DomainDecomposition.h>
#include <Grid/qcd/action/domains/MomentumFilter.h>
#include <Grid/qcd/action/domains/DirichletFilter.h>
#include <Grid/qcd/action/domains/DDHMCFilter.h>

@@ -1,91 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/hmc/integrators/MomentumFilter.h
Copyright (C) 2015
Author: Christopher Kelly <ckelly@bnl.gov>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
//--------------------------------------------------------------------
#pragma once
NAMESPACE_BEGIN(Grid);
//These filter objects allow the user to manipulate the conjugate momentum as part of the update / refresh
template<typename MomentaField>
struct MomentumFilterBase{
virtual void applyFilter(MomentaField &P) const = 0;
};
//Do nothing
template<typename MomentaField>
struct MomentumFilterNone: public MomentumFilterBase<MomentaField>{
void applyFilter(MomentaField &P) const override{}
};
//Multiply each site/direction by a Lorentz vector complex number field
//Can be used to implement a mask, zeroing out sites
template<typename MomentaField>
struct MomentumFilterApplyPhase: public MomentumFilterBase<MomentaField>{
typedef typename MomentaField::vector_type vector_type; //SIMD-vectorized complex type
typedef typename MomentaField::scalar_type scalar_type; //scalar complex type
typedef iVector<iScalar<iScalar<vector_type> >, Nd > LorentzScalarType; //complex phase for each site/direction
typedef Lattice<LorentzScalarType> LatticeLorentzScalarType;
LatticeLorentzScalarType phase;
MomentumFilterApplyPhase(const LatticeLorentzScalarType _phase): phase(_phase){}
//Default to uniform field of (1,0)
MomentumFilterApplyPhase(GridBase* _grid): phase(_grid){
LorentzScalarType one;
for(int mu=0;mu<Nd;mu++)
one(mu)()() = scalar_type(1.);
phase = one;
}
void applyFilter(MomentaField &P) const override{
conformable(P,phase);
autoView( P_v , P, AcceleratorWrite);
autoView( phase_v , phase, AcceleratorRead);
accelerator_for(ss,P_v.size(),MomentaField::vector_type::Nsimd(),{
auto site_mom = P_v(ss);
auto site_phase = phase_v(ss);
for(int mu=0;mu<Nd;mu++)
site_mom(mu) = site_mom(mu) * site_phase(mu);
coalescedWrite(P_v[ss], site_mom);
});
}
};
NAMESPACE_END(Grid);
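
A hedged usage sketch of the phase filter above: build a per-site, per-direction mask that zeroes the conjugate momenta on the t==0 time slice and leaves everything else alone (the grid pointer and momentum field `P` are assumed to exist; Nd-1 is taken as the time direction):

typedef MomentumFilterApplyPhase<LatticeGaugeField> Filter;
Filter::LatticeLorentzScalarType phase(grid), zz(grid);
zz = Zero();
Filter::LorentzScalarType one;
for (int mu = 0; mu < Nd; mu++) one(mu)()() = Filter::scalar_type(1.);
phase = one;                                    // default: multiply by 1 everywhere

LatticeInteger tcoor(grid);
LatticeCoordinate(tcoor, Nd - 1);
phase = where(tcoor == Integer(0), zz, phase);  // mask out the t==0 slice

Filter filter(phase);
filter.applyFilter(P);                          // P: conjugate momentum field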

@@ -60,8 +60,6 @@ public:
///////////////////////////////////////////////////////////////
virtual void Dminus(const FermionField &psi, FermionField &chi);
virtual void DminusDag(const FermionField &psi, FermionField &chi);
-virtual void ImportFourDimPseudoFermion(const FermionField &input,FermionField &imported);
-virtual void ExportFourDimPseudoFermion(const FermionField &solution,FermionField &exported);
virtual void ExportPhysicalFermionSolution(const FermionField &solution5d,FermionField &exported4d);
virtual void ExportPhysicalFermionSource(const FermionField &solution5d, FermionField &exported4d);
virtual void ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d);

@@ -1,185 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/DirichletFermionOperator.h
Copyright (C) 2021
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////////////////
// Wrap a fermion operator in Dirichlet BC's at node boundary
////////////////////////////////////////////////////////////////
template<class Impl>
class DirichletFermionOperator : public FermionOperator<Impl>
{
public:
INHERIT_IMPL_TYPES(Impl);
// Data members
int CommsMode;
Coordinate Block;
DirichletFilter<GaugeField> Filter;
FermionOperator<Impl> & FermOp;
// Constructor / bespoke
DirichletFermionOperator(FermionOperator<Impl> & _FermOp, Coordinate &_Block)
: FermOp(_FermOp), Block(_Block), Filter(Block)
{
// Save what the comms mode should be under normal BCs
CommsMode = WilsonKernelsStatic::Comms;
assert((CommsMode == WilsonKernelsStatic::CommsAndCompute)
||(CommsMode == WilsonKernelsStatic::CommsThenCompute));
// Check the block size divides local lattice
GridBase *grid = FermOp.GaugeGrid();
int blocks_per_rank = 1;
Coordinate LocalDims = grid->LocalDimensions();
Coordinate GlobalDims= grid->GlobalDimensions();
assert(Block.size()==LocalDims.size());
for(int d=0;d<LocalDims.size();d++){
if (Block[d]&&(Block[d]<=GlobalDims[d])){
int r = LocalDims[d] % Block[d];
assert(r == 0);
blocks_per_rank *= (LocalDims[d] / Block[d]);
}
}
// Even blocks per node required // could be relaxed but inefficient use of hardware as idle nodes in boundary operator R
assert( blocks_per_rank != 0);
// Possible checks that SIMD lanes are used with full occupancy???
};
virtual ~DirichletFermionOperator(void) = default;
void DirichletOn(void) {
assert(WilsonKernelsStatic::Comms!= WilsonKernelsStatic::CommsDirichlet);
// WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsDirichlet;
}
void DirichletOff(void) {
// assert(WilsonKernelsStatic::Comms== WilsonKernelsStatic::CommsDirichlet);
// WilsonKernelsStatic::Comms = CommsMode;
}
// Implement the full interface
virtual FermionField &tmp(void) { return FermOp.tmp(); };
virtual GridBase *FermionGrid(void) { return FermOp.FermionGrid(); }
virtual GridBase *FermionRedBlackGrid(void) { return FermOp.FermionRedBlackGrid(); }
virtual GridBase *GaugeGrid(void) { return FermOp.GaugeGrid(); }
virtual GridBase *GaugeRedBlackGrid(void) { return FermOp.GaugeRedBlackGrid(); }
// override multiply
virtual void M (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.M(in,out); DirichletOff(); };
virtual void Mdag (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.Mdag(in,out); DirichletOff(); };
// half checkerboard operations
virtual void Meooe (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.Meooe(in,out); DirichletOff(); };
virtual void MeooeDag (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.MeooeDag(in,out); DirichletOff(); };
virtual void Mooee (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.Mooee(in,out); DirichletOff(); };
virtual void MooeeDag (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.MooeeDag(in,out); DirichletOff(); };
virtual void MooeeInv (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.MooeeInv(in,out); DirichletOff(); };
virtual void MooeeInvDag (const FermionField &in, FermionField &out) { DirichletOn(); FermOp.MooeeInvDag(in,out); DirichletOff(); };
// non-hermitian hopping term; half cb or both
virtual void Dhop (const FermionField &in, FermionField &out,int dag) { DirichletOn(); FermOp.Dhop(in,out,dag); DirichletOff(); };
virtual void DhopOE(const FermionField &in, FermionField &out,int dag) { DirichletOn(); FermOp.DhopOE(in,out,dag); DirichletOff(); };
virtual void DhopEO(const FermionField &in, FermionField &out,int dag) { DirichletOn(); FermOp.DhopEO(in,out,dag); DirichletOff(); };
virtual void DhopDir(const FermionField &in, FermionField &out,int dir,int disp) { DirichletOn(); FermOp.DhopDir(in,out,dir,disp); DirichletOff(); };
// force terms; five routines; default to Dhop on diagonal
virtual void MDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.MDeriv(mat,U,V,dag);};
virtual void MoeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.MoeDeriv(mat,U,V,dag);};
virtual void MeoDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.MeoDeriv(mat,U,V,dag);};
virtual void MooDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.MooDeriv(mat,U,V,dag);};
virtual void MeeDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.MeeDeriv(mat,U,V,dag);};
virtual void DhopDeriv (GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.DhopDeriv(mat,U,V,dag);};
virtual void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.DhopDerivEO(mat,U,V,dag);};
virtual void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){FermOp.DhopDerivOE(mat,U,V,dag);};
virtual void Mdiag (const FermionField &in, FermionField &out) { Mooee(in,out);};
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp){FermOp.Mdir(in,out,dir,disp);};
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out) {FermOp.MdirAll(in,out);};
///////////////////////////////////////////////
// Updates gauge field during HMC
///////////////////////////////////////////////
DoubledGaugeField &GetDoubledGaugeField(void){ return FermOp.GetDoubledGaugeField(); };
DoubledGaugeField &GetDoubledGaugeFieldE(void){ return FermOp.GetDoubledGaugeFieldE(); };
DoubledGaugeField &GetDoubledGaugeFieldO(void){ return FermOp.GetDoubledGaugeFieldO(); };
virtual void ImportGauge(const GaugeField & _U)
{
GaugeField U = _U;
// Filter gauge field to apply Dirichlet
Filter.applyFilter(U);
FermOp.ImportGauge(U);
}
///////////////////////////////////////////////
// Physical field import/export
///////////////////////////////////////////////
virtual void Dminus(const FermionField &psi, FermionField &chi) { FermOp.Dminus(psi,chi); }
virtual void DminusDag(const FermionField &psi, FermionField &chi) { FermOp.DminusDag(psi,chi); }
virtual void ImportFourDimPseudoFermion(const FermionField &input,FermionField &imported) { FermOp.ImportFourDimPseudoFermion(input,imported);}
virtual void ExportFourDimPseudoFermion(const FermionField &solution,FermionField &exported){ FermOp.ExportFourDimPseudoFermion(solution,exported);}
virtual void ImportPhysicalFermionSource(const FermionField &input,FermionField &imported) { FermOp.ImportPhysicalFermionSource(input,imported);}
virtual void ImportUnphysicalFermion(const FermionField &input,FermionField &imported) { FermOp.ImportUnphysicalFermion(input,imported);}
virtual void ExportPhysicalFermionSolution(const FermionField &solution,FermionField &exported) {FermOp.ExportPhysicalFermionSolution(solution,exported);}
virtual void ExportPhysicalFermionSource(const FermionField &solution,FermionField &exported) {FermOp.ExportPhysicalFermionSource(solution,exported);}
//////////////////////////////////////////////////////////////////////
// Should never be used
//////////////////////////////////////////////////////////////////////
virtual void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) { assert(0);};
virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary,std::vector<double> twist) {assert(0);}
virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) { assert(0);}
virtual void ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
PropagatorField &phys_src,
Current curr_type,
unsigned int mu)
{assert(0);};
virtual void SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
PropagatorField &phys_src,
Current curr_type,
unsigned int mu,
unsigned int tmin,
unsigned int tmax,
ComplexField &lattice_cmplx)
{assert(0);};
// Only reimplemented in Wilson5D
// Default to just a zero correlation function
virtual void ContractJ5q(FermionField &q_in ,ComplexField &J5q) { J5q=Zero(); };
virtual void ContractJ5q(PropagatorField &q_in,ComplexField &J5q) { J5q=Zero(); };
};
NAMESPACE_END(Grid);
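
The class above is a decorator: it forwards the whole FermionOperator interface, switching Dirichlet comms on around each application and filtering the gauge field on import. A hedged usage sketch (grids, gauge field U, mass and M5 are assumed to be set up already in the usual way):

// Sketch: wrap an existing operator with node-boundary Dirichlet BCs.
Coordinate Block({4,4,4,4});   // sub-block extents; must divide the local lattice
DomainWallFermionR Dop(U, *FGrid, *FrbGrid, *UGrid, *UrbGrid, mass, M5);
DirichletFermionOperator<WilsonImplR> DdirOp(Dop, Block);
DdirOp.M(src, res);            // M applied with Dirichlet-filtered gauge links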

@@ -114,22 +114,19 @@
      U = adj(Cshift(U, mu, -1));
      PokeIndex<LorentzIndex>(Uadj, U, mu);
    }

-    autoView(Umu_v,Umu,CpuRead);
-    autoView(Uadj_v,Uadj,CpuRead);
-    autoView(Uds_v,Uds,CpuWrite);
-    thread_for( lidx, GaugeGrid->lSites(), {
+    for (int lidx = 0; lidx < GaugeGrid->lSites(); lidx++) {
      Coordinate lcoor;
      GaugeGrid->LocalIndexToLocalCoor(lidx, lcoor);

-      peekLocalSite(ScalarUmu, Umu_v, lcoor);
+      peekLocalSite(ScalarUmu, Umu, lcoor);
      for (int mu = 0; mu < 4; mu++) ScalarUds(mu) = ScalarUmu(mu);

-      peekLocalSite(ScalarUmu, Uadj_v, lcoor);
+      peekLocalSite(ScalarUmu, Uadj, lcoor);
      for (int mu = 0; mu < 4; mu++) ScalarUds(mu + 4) = ScalarUmu(mu);

-      pokeLocalSite(ScalarUds, Uds_v, lcoor);
-    });
+      pokeLocalSite(ScalarUds, Uds, lcoor);
+    }
  }

inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu)
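
The `-` side of this hunk shows Grid's explicit-view idiom: lattice data is pinned through autoView for a declared access mode and then walked in a threaded site loop, while the `+` side peeks lattice objects directly inside a serial loop. A minimal hedged sketch of the view idiom, with a hypothetical field:

// Sketch: double every site of a field through a CPU write view.
LatticeComplex field(grid);                   // any Lattice<...> on an existing grid
{
  autoView(field_v, field, CpuWrite);         // view released at scope exit
  thread_for(ss, grid->oSites(), {            // threaded loop over outer sites
    field_v[ss] = field_v[ss] + field_v[ss];  // touch data only through the view
  });
}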

@@ -57,7 +57,6 @@ NAMESPACE_CHECK(WilsonClover);
#include <Grid/qcd/action/fermion/WilsonFermion5D.h>     // 5d base used by all 5d overlap types
NAMESPACE_CHECK(Wilson5D);

-#include <Grid/qcd/action/fermion/NaiveStaggeredFermion.h>
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion.h>
#include <Grid/qcd/action/fermion/ImprovedStaggeredFermion5D.h>
NAMESPACE_CHECK(Staggered);
@@ -101,12 +100,6 @@ NAMESPACE_CHECK(WilsonTM5);
#include <Grid/qcd/action/fermion/PauliVillarsInverters.h>
#include <Grid/qcd/action/fermion/Reconstruct5Dprop.h>
#include <Grid/qcd/action/fermion/MADWF.h>
-////////////////////////////////////////////////////////////////////
-// DDHMC related
-////////////////////////////////////////////////////////////////////
-#include <Grid/qcd/action/fermion/DirichletFermionOperator.h>
-#include <Grid/qcd/action/fermion/SchurFactoredFermionOperator.h>
NAMESPACE_CHECK(DWFutils);

////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -121,9 +114,9 @@ typedef WilsonFermion<WilsonImplR> WilsonFermionR;
typedef WilsonFermion<WilsonImplF> WilsonFermionF;
typedef WilsonFermion<WilsonImplD> WilsonFermionD;

-//typedef WilsonFermion<WilsonImplRL> WilsonFermionRL;
-//typedef WilsonFermion<WilsonImplFH> WilsonFermionFH;
-//typedef WilsonFermion<WilsonImplDF> WilsonFermionDF;
+typedef WilsonFermion<WilsonImplRL> WilsonFermionRL;
+typedef WilsonFermion<WilsonImplFH> WilsonFermionFH;
+typedef WilsonFermion<WilsonImplDF> WilsonFermionDF;

typedef WilsonFermion<WilsonAdjImplR> WilsonAdjFermionR;
typedef WilsonFermion<WilsonAdjImplF> WilsonAdjFermionF;
@@ -164,41 +157,41 @@ typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;

-//typedef DomainWallFermion<WilsonImplRL> DomainWallFermionRL;
-//typedef DomainWallFermion<WilsonImplFH> DomainWallFermionFH;
-//typedef DomainWallFermion<WilsonImplDF> DomainWallFermionDF;
+typedef DomainWallFermion<WilsonImplRL> DomainWallFermionRL;
+typedef DomainWallFermion<WilsonImplFH> DomainWallFermionFH;
+typedef DomainWallFermion<WilsonImplDF> DomainWallFermionDF;

typedef DomainWallEOFAFermion<WilsonImplR> DomainWallEOFAFermionR;
typedef DomainWallEOFAFermion<WilsonImplF> DomainWallEOFAFermionF;
typedef DomainWallEOFAFermion<WilsonImplD> DomainWallEOFAFermionD;

-//typedef DomainWallEOFAFermion<WilsonImplRL> DomainWallEOFAFermionRL;
-//typedef DomainWallEOFAFermion<WilsonImplFH> DomainWallEOFAFermionFH;
-//typedef DomainWallEOFAFermion<WilsonImplDF> DomainWallEOFAFermionDF;
+typedef DomainWallEOFAFermion<WilsonImplRL> DomainWallEOFAFermionRL;
+typedef DomainWallEOFAFermion<WilsonImplFH> DomainWallEOFAFermionFH;
+typedef DomainWallEOFAFermion<WilsonImplDF> DomainWallEOFAFermionDF;

typedef MobiusFermion<WilsonImplR> MobiusFermionR;
typedef MobiusFermion<WilsonImplF> MobiusFermionF;
typedef MobiusFermion<WilsonImplD> MobiusFermionD;

-//typedef MobiusFermion<WilsonImplRL> MobiusFermionRL;
-//typedef MobiusFermion<WilsonImplFH> MobiusFermionFH;
-//typedef MobiusFermion<WilsonImplDF> MobiusFermionDF;
+typedef MobiusFermion<WilsonImplRL> MobiusFermionRL;
+typedef MobiusFermion<WilsonImplFH> MobiusFermionFH;
+typedef MobiusFermion<WilsonImplDF> MobiusFermionDF;

typedef MobiusEOFAFermion<WilsonImplR> MobiusEOFAFermionR;
typedef MobiusEOFAFermion<WilsonImplF> MobiusEOFAFermionF;
typedef MobiusEOFAFermion<WilsonImplD> MobiusEOFAFermionD;

-//typedef MobiusEOFAFermion<WilsonImplRL> MobiusEOFAFermionRL;
-//typedef MobiusEOFAFermion<WilsonImplFH> MobiusEOFAFermionFH;
-//typedef MobiusEOFAFermion<WilsonImplDF> MobiusEOFAFermionDF;
+typedef MobiusEOFAFermion<WilsonImplRL> MobiusEOFAFermionRL;
+typedef MobiusEOFAFermion<WilsonImplFH> MobiusEOFAFermionFH;
+typedef MobiusEOFAFermion<WilsonImplDF> MobiusEOFAFermionDF;

typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;

-//typedef ZMobiusFermion<ZWilsonImplRL> ZMobiusFermionRL;
-//typedef ZMobiusFermion<ZWilsonImplFH> ZMobiusFermionFH;
-//typedef ZMobiusFermion<ZWilsonImplDF> ZMobiusFermionDF;
+typedef ZMobiusFermion<ZWilsonImplRL> ZMobiusFermionRL;
+typedef ZMobiusFermion<ZWilsonImplFH> ZMobiusFermionFH;
+typedef ZMobiusFermion<ZWilsonImplDF> ZMobiusFermionDF;

// Ls vectorised
typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
@@ -241,62 +234,64 @@ typedef WilsonFermion<GparityWilsonImplR> GparityWilsonFermionR;
typedef WilsonFermion<GparityWilsonImplF> GparityWilsonFermionF;
typedef WilsonFermion<GparityWilsonImplD> GparityWilsonFermionD;

-//typedef WilsonFermion<GparityWilsonImplRL> GparityWilsonFermionRL;
-//typedef WilsonFermion<GparityWilsonImplFH> GparityWilsonFermionFH;
-//typedef WilsonFermion<GparityWilsonImplDF> GparityWilsonFermionDF;
+typedef WilsonFermion<GparityWilsonImplRL> GparityWilsonFermionRL;
+typedef WilsonFermion<GparityWilsonImplFH> GparityWilsonFermionFH;
+typedef WilsonFermion<GparityWilsonImplDF> GparityWilsonFermionDF;

typedef DomainWallFermion<GparityWilsonImplR> GparityDomainWallFermionR;
typedef DomainWallFermion<GparityWilsonImplF> GparityDomainWallFermionF;
typedef DomainWallFermion<GparityWilsonImplD> GparityDomainWallFermionD;

-//typedef DomainWallFermion<GparityWilsonImplRL> GparityDomainWallFermionRL;
-//typedef DomainWallFermion<GparityWilsonImplFH> GparityDomainWallFermionFH;
-//typedef DomainWallFermion<GparityWilsonImplDF> GparityDomainWallFermionDF;
+typedef DomainWallFermion<GparityWilsonImplRL> GparityDomainWallFermionRL;
+typedef DomainWallFermion<GparityWilsonImplFH> GparityDomainWallFermionFH;
+typedef DomainWallFermion<GparityWilsonImplDF> GparityDomainWallFermionDF;

typedef DomainWallEOFAFermion<GparityWilsonImplR> GparityDomainWallEOFAFermionR;
typedef DomainWallEOFAFermion<GparityWilsonImplF> GparityDomainWallEOFAFermionF;
typedef DomainWallEOFAFermion<GparityWilsonImplD> GparityDomainWallEOFAFermionD;

-//typedef DomainWallEOFAFermion<GparityWilsonImplRL> GparityDomainWallEOFAFermionRL;
-//typedef DomainWallEOFAFermion<GparityWilsonImplFH> GparityDomainWallEOFAFermionFH;
-//typedef DomainWallEOFAFermion<GparityWilsonImplDF> GparityDomainWallEOFAFermionDF;
+typedef DomainWallEOFAFermion<GparityWilsonImplRL> GparityDomainWallEOFAFermionRL;
+typedef DomainWallEOFAFermion<GparityWilsonImplFH> GparityDomainWallEOFAFermionFH;
+typedef DomainWallEOFAFermion<GparityWilsonImplDF> GparityDomainWallEOFAFermionDF;

typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionR;
typedef WilsonTMFermion<GparityWilsonImplF> GparityWilsonTMFermionF;
typedef WilsonTMFermion<GparityWilsonImplD> GparityWilsonTMFermionD;

-//typedef WilsonTMFermion<GparityWilsonImplRL> GparityWilsonTMFermionRL;
-//typedef WilsonTMFermion<GparityWilsonImplFH> GparityWilsonTMFermionFH;
-//typedef WilsonTMFermion<GparityWilsonImplDF> GparityWilsonTMFermionDF;
+typedef WilsonTMFermion<GparityWilsonImplRL> GparityWilsonTMFermionRL;
+typedef WilsonTMFermion<GparityWilsonImplFH> GparityWilsonTMFermionFH;
+typedef WilsonTMFermion<GparityWilsonImplDF> GparityWilsonTMFermionDF;

typedef MobiusFermion<GparityWilsonImplR> GparityMobiusFermionR;
typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF;
typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;

-//typedef MobiusFermion<GparityWilsonImplRL> GparityMobiusFermionRL;
-//typedef MobiusFermion<GparityWilsonImplFH> GparityMobiusFermionFH;
-//typedef MobiusFermion<GparityWilsonImplDF> GparityMobiusFermionDF;
+typedef MobiusFermion<GparityWilsonImplRL> GparityMobiusFermionRL;
+typedef MobiusFermion<GparityWilsonImplFH> GparityMobiusFermionFH;
+typedef MobiusFermion<GparityWilsonImplDF> GparityMobiusFermionDF;

typedef MobiusEOFAFermion<GparityWilsonImplR> GparityMobiusEOFAFermionR;
typedef MobiusEOFAFermion<GparityWilsonImplF> GparityMobiusEOFAFermionF;
typedef MobiusEOFAFermion<GparityWilsonImplD> GparityMobiusEOFAFermionD;

-//typedef MobiusEOFAFermion<GparityWilsonImplRL> GparityMobiusEOFAFermionRL;
-//typedef MobiusEOFAFermion<GparityWilsonImplFH> GparityMobiusEOFAFermionFH;
-//typedef MobiusEOFAFermion<GparityWilsonImplDF> GparityMobiusEOFAFermionDF;
+typedef MobiusEOFAFermion<GparityWilsonImplRL> GparityMobiusEOFAFermionRL;
+typedef MobiusEOFAFermion<GparityWilsonImplFH> GparityMobiusEOFAFermionFH;
+typedef MobiusEOFAFermion<GparityWilsonImplDF> GparityMobiusEOFAFermionDF;

typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;

-typedef NaiveStaggeredFermion<StaggeredImplR> NaiveStaggeredFermionR;
-typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
-typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;

typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;

+#ifndef GRID_NVCC
+typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplR> ImprovedStaggeredFermionVec5dR;
+typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplF> ImprovedStaggeredFermionVec5dF;
+typedef ImprovedStaggeredFermion5D<StaggeredVec5dImplD> ImprovedStaggeredFermionVec5dD;
+#endif

NAMESPACE_END(Grid);

////////////////////

@@ -25,7 +25,8 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
-#pragma once
+#ifndef GRID_QCD_FERMION_CORE_H
+#define GRID_QCD_FERMION_CORE_H

#include <Grid/GridCore.h>
#include <Grid/GridQCDcore.h>
@@ -44,3 +45,4 @@ NAMESPACE_CHECK(FermionOperator);
#include <Grid/qcd/action/fermion/StaggeredKernels.h> //used by all wilson type fermions
NAMESPACE_CHECK(Kernels);
+#endif

@@ -140,9 +140,6 @@ public:
  // Updates gauge field during HMC
  ///////////////////////////////////////////////
  virtual void ImportGauge(const GaugeField & _U)=0;
-  virtual DoubledGaugeField &GetDoubledGaugeField(void) =0;
-  virtual DoubledGaugeField &GetDoubledGaugeFieldE(void) =0;
-  virtual DoubledGaugeField &GetDoubledGaugeFieldO(void) =0;

  //////////////////////////////////////////////////////////////////////
  // Conserved currents, either contract at sink or insert sequentially.
@@ -174,16 +171,6 @@ public:
  ///////////////////////////////////////////////
  virtual void Dminus(const FermionField &psi, FermionField &chi)    { chi=psi; }
  virtual void DminusDag(const FermionField &psi, FermionField &chi) { chi=psi; }
-  virtual void ImportFourDimPseudoFermion(const FermionField &input,FermionField &imported)
-  {
-    imported = input;
-  };
-  virtual void ExportFourDimPseudoFermion(const FermionField &solution,FermionField &exported)
-  {
-    exported=solution;
-  };
  virtual void ImportPhysicalFermionSource(const FermionField &input,FermionField &imported)
  {
    imported = input;

@@ -153,8 +153,8 @@ public:
  typedef typename Impl::StencilImpl StencilImpl;               \
  typedef typename Impl::ImplParams ImplParams;                 \
  typedef typename Impl::StencilImpl::View_type StencilView;    \
-  typedef const typename ViewMap<FermionField>::Type      FermionFieldView;      \
-  typedef const typename ViewMap<DoubledGaugeField>::Type DoubledGaugeFieldView;
+  typedef typename ViewMap<FermionField>::Type      FermionFieldView;      \
+  typedef typename ViewMap<DoubledGaugeField>::Type DoubledGaugeFieldView;

#define INHERIT_IMPL_TYPES(Base) \
  INHERIT_GIMPL_TYPES(Base)      \
@@ -183,8 +183,7 @@ NAMESPACE_CHECK(ImplStaggered);
/////////////////////////////////////////////////////////////////////////////
// Single flavour one component spinors with colour index. 5d vec
/////////////////////////////////////////////////////////////////////////////
-// Deprecate Vec5d
-//#include <Grid/qcd/action/fermion/StaggeredVec5dImpl.h>
-//NAMESPACE_CHECK(ImplStaggered5dVec);
+#include <Grid/qcd/action/fermion/StaggeredVec5dImpl.h>
+NAMESPACE_CHECK(ImplStaggered5dVec);

@@ -30,18 +30,6 @@ directory
NAMESPACE_BEGIN(Grid);
/*
Policy implementation for G-parity boundary conditions
Rather than treating the gauge field as a flavored field, the Grid implementation of G-parity treats the gauge field as a regular
field with complex conjugate boundary conditions. In order to ensure the second flavor interacts with the conjugate links and the first
with the regular links we overload the functionality of doubleStore, whose purpose is to store the gauge field and the barrel-shifted gauge field
to avoid communicating links when applying the Dirac operator, such that the double-stored field contains also a flavor index which maps to
either the link or the conjugate link. This flavored field is then used by multLink to apply the correct link to a spinor.
Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction.
mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
*/
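
The comment above is the heart of the implementation. As a minimal standalone illustration (a toy with one colour and no spin, not Grid code), the flavour selection at a G-parity boundary crossing amounts to:

#include <complex>
#include <cstdio>
using cplx = std::complex<double>;

int main() {
  // Two-flavour spinor component psi[f], and the two stored links of the
  // doubled field: U[0] = U_mu(x), U[1] = conj(U_mu(x)).
  cplx psi[2] = {{1, 2}, {3, -1}};
  cplx U[2]   = {{0, 1}, {0, -1}};   // toy 1-colour links, U[1] = conj(U[0])
  bool crossing_gparity_boundary = true;

  // Hopping term: flavour f multiplies link f, except at a G-parity boundary,
  // where the flavours are exchanged before the link is applied.
  cplx phi[2];
  for (int f = 0; f < 2; f++) {
    int fsrc = crossing_gparity_boundary ? 1 - f : f;
    phi[f] = U[f] * psi[fsrc];
  }
  printf("phi = (%g,%g) (%g,%g)\n", phi[0].real(), phi[0].imag(),
         phi[1].real(), phi[1].imag());
  return 0;
}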
template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal>
class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > {
public:
@@ -108,31 +96,43 @@ public:
      int sl = St._simd_layout[direction];
      Coordinate icoor;

-#ifdef GRID_SIMT
+#ifdef __CUDA_ARCH__
+      _Spinor tmp;

      const int Nsimd =SiteDoubledGaugeField::Nsimd();
-      int s = acceleratorSIMTlane(Nsimd);
+      int s = SIMTlane(Nsimd);
      St.iCoorFromIindex(icoor,s);

      int mmu = mu % Nd;

-      auto UU0=coalescedRead(U(0)(mu));
-      auto UU1=coalescedRead(U(1)(mu));
-
-      //Decide whether we do a G-parity flavor twist
-      //Note: this assumes (but does not check) that sl==1 || sl==2 i.e. max 2 SIMD lanes in G-parity dir
-      //It also assumes (but does not check) that abs(distance) == 1
-      int permute_lane = (sl==1)
-      || ((distance== 1)&&(icoor[direction]==1))
-      || ((distance==-1)&&(icoor[direction]==0));
-
-      permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu] && mmu < Nd-1; //only if we are going around the world in a spatial direction
-
-      //Apply the links
-      int f_upper = permute_lane ? 1 : 0;
-      int f_lower = !f_upper;
-
-      mult(&phi(0),&UU0,&chi(f_upper));
-      mult(&phi(1),&UU1,&chi(f_lower));
+      if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
+
+        int permute_lane = (sl==1)
+        || ((distance== 1)&&(icoor[direction]==1))
+        || ((distance==-1)&&(icoor[direction]==0));
+
+        if ( permute_lane ) {
+          tmp(0) = chi(1);
+          tmp(1) = chi(0);
+        } else {
+          tmp(0) = chi(0);
+          tmp(1) = chi(1);
+        }
+
+        auto UU0=coalescedRead(U(0)(mu));
+        auto UU1=coalescedRead(U(1)(mu));
+
+        mult(&phi(0),&UU0,&tmp(0));
+        mult(&phi(1),&UU1,&tmp(1));
+
+      } else {
+
+        auto UU0=coalescedRead(U(0)(mu));
+        auto UU1=coalescedRead(U(1)(mu));
+
+        mult(&phi(0),&UU0,&chi(0));
+        mult(&phi(1),&UU1,&chi(1));
+      }

#else
      typedef _Spinor vobj;
@@ -151,10 +151,10 @@ public:
      assert((distance == 1) || (distance == -1)); // nearest neighbour stencil hard code
      assert((sl == 1) || (sl == 2));

-      //If this site is a global boundary site, perform the G-parity flavor twist
-      if ( mmu < Nd-1 && SE->_around_the_world && St.parameters.twists[mmu] ) {
+      if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
        if ( sl == 2 ) {
-          //Only do the twist for lanes on the edge of the physical node
          ExtractBuffer<sobj> vals(Nsimd);
          extract(chi,vals);
@@ -209,19 +209,6 @@ public:
      reg = memory;
    }
//Poke 'poke_f0' onto flavor 0 and 'poke_f1' onto flavor 1 in direction mu of the doubled gauge field Uds
inline void pokeGparityDoubledGaugeField(DoubledGaugeField &Uds, const GaugeLinkField &poke_f0, const GaugeLinkField &poke_f1, const int mu){
autoView(poke_f0_v, poke_f0, CpuRead);
autoView(poke_f1_v, poke_f1, CpuRead);
autoView(Uds_v, Uds, CpuWrite);
thread_foreach(ss,poke_f0_v,{
Uds_v[ss](0)(mu) = poke_f0_v[ss]();
Uds_v[ss](1)(mu) = poke_f1_v[ss]();
});
}
inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
{ {
conformable(Uds.Grid(),GaugeGrid); conformable(Uds.Grid(),GaugeGrid);
@@ -232,35 +219,28 @@ public:
     GaugeLinkField Uconj(GaugeGrid);
 
     Lattice<iScalar<vInteger> > coor(GaugeGrid);
 
-    //Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction.
-    //mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
-    for(int mu=0;mu<Nd-1;mu++){
-      if( Params.twists[mu] ){
-        LatticeCoordinate(coor,mu);
-      }
+    for(int mu=0;mu<Nd;mu++){
+
+      LatticeCoordinate(coor,mu);
 
       U = PeekIndex<LorentzIndex>(Umu,mu);
       Uconj = conjugate(U);
 
-      // Implement the isospin rotation sign on the boundary between f=1 and f=0
       // This phase could come from a simple bc 1,1,-1,1 ..
       int neglink = GaugeGrid->GlobalDimensions()[mu]-1;
       if ( Params.twists[mu] ) {
         Uconj = where(coor==neglink,-Uconj,Uconj);
       }
 
-      {
-        autoView( U_v , U, CpuRead);
-        autoView( Uconj_v , Uconj, CpuRead);
-        autoView( Uds_v , Uds, CpuWrite);
-        autoView( Utmp_v, Utmp, CpuWrite);
-        thread_foreach(ss,U_v,{
-            Uds_v[ss](0)(mu) = U_v[ss]();
-            Uds_v[ss](1)(mu) = Uconj_v[ss]();
-          });
-      }
+      auto U_v = U.View();
+      auto Uds_v = Uds.View();
+      auto Uconj_v = Uconj.View();
+      auto Utmp_v= Utmp.View();
+      thread_foreach(ss,U_v,{
+          Uds_v[ss](0)(mu) = U_v[ss]();
+          Uds_v[ss](1)(mu) = Uconj_v[ss]();
+        });
 
       U     = adj(Cshift(U    ,mu,-1)); // correct except for spanning the boundary
       Uconj = adj(Cshift(Uconj,mu,-1));
@@ -270,57 +250,19 @@ public:
         Utmp = where(coor==0,Uconj,Utmp);
       }
 
-      {
-        autoView( Uds_v , Uds, CpuWrite);
-        autoView( Utmp_v, Utmp, CpuWrite);
-        thread_foreach(ss,Utmp_v,{
-            Uds_v[ss](0)(mu+4) = Utmp_v[ss]();
-          });
-      }
+      thread_foreach(ss,Utmp_v,{
+          Uds_v[ss](0)(mu+4) = Utmp_v[ss]();
+        });
 
       Utmp = Uconj;
       if ( Params.twists[mu] ) {
         Utmp = where(coor==0,U,Utmp);
       }
 
-      {
-        autoView( Uds_v , Uds, CpuWrite);
-        autoView( Utmp_v, Utmp, CpuWrite);
-        thread_foreach(ss,Utmp_v,{
-            Uds_v[ss](1)(mu+4) = Utmp_v[ss]();
-          });
-      }
-    }
-
-    { //periodic / antiperiodic temporal BCs
-      int mu = Nd-1;
-      int L = GaugeGrid->GlobalDimensions()[mu];
-      int Lmu = L - 1;
-
-      LatticeCoordinate(coor, mu);
-
-      U = PeekIndex<LorentzIndex>(Umu, mu); //Get t-directed links
-
-      GaugeLinkField *Upoke = &U;
-
-      if(Params.twists[mu]){ //antiperiodic
-        Utmp = where(coor == Lmu, -U, U);
-        Upoke = &Utmp;
-      }
-
-      Uconj = conjugate(*Upoke); //second flavor interacts with conjugate links
-      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu);
-
-      //Get the barrel-shifted field
-      Utmp = adj(Cshift(U, mu, -1)); //is a forward shift!
-      Upoke = &Utmp;
-
-      if(Params.twists[mu]){
-        U = where(coor == 0, -Utmp, Utmp); //boundary phase
-        Upoke = &U;
-      }
-
-      Uconj = conjugate(*Upoke);
-      pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + 4);
+      thread_foreach(ss,Utmp_v,{
+          Uds_v[ss](1)(mu+4) = Utmp_v[ss]();
+        });
     }
   }
@@ -330,14 +272,11 @@ public:
     GaugeLinkField link(mat.Grid());
     // use lorentz for flavour as hack.
     auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
 
-    {
-      autoView( link_v , link, CpuWrite);
-      autoView( tmp_v , tmp, CpuRead);
-      thread_foreach(ss,tmp_v,{
-          link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1));
-        });
-    }
+    auto link_v = link.View();
+    auto tmp_v = tmp.View();
+    thread_foreach(ss,tmp_v,{
+        link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1));
+      });
     PokeIndex<LorentzIndex>(mat, link, mu);
     return;
   }
@@ -360,48 +299,26 @@ public:
   inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
     assert(0);
   }
 
-  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
-
-    int Ls=Btilde.Grid()->_fdimensions[0];
-
-    {
-      GridBase *GaugeGrid = mat.Grid();
-      Lattice<iScalar<vInteger> > coor(GaugeGrid);
-
-      if( Params.twists[mu] ){
-        LatticeCoordinate(coor,mu);
-      }
-
-      autoView( mat_v , mat, AcceleratorWrite);
-      autoView( Btilde_v , Btilde, AcceleratorRead);
-      autoView( Atilde_v , Atilde, AcceleratorRead);
-      accelerator_for(sss,mat.Grid()->oSites(), FermionField::vector_type::Nsimd(),{
-          int sU=sss;
-          typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
-          ColorMatrixType sum;
-          zeroit(sum);
-          for(int s=0;s<Ls;s++){
-            int sF = s+Ls*sU;
-            for(int spn=0;spn<Ns;spn++){ //sum over spin
-              //Flavor 0
-              auto bb = coalescedRead(Btilde_v[sF](0)(spn) ); //color vector
-              auto aa = coalescedRead(Atilde_v[sF](0)(spn) );
-              sum = sum + outerProduct(bb,aa);
-
-              //Flavor 1
-              bb = coalescedRead(Btilde_v[sF](1)(spn) );
-              aa = coalescedRead(Atilde_v[sF](1)(spn) );
-              sum = sum + conjugate(outerProduct(bb,aa));
-            }
-          }
-          coalescedWrite(mat_v[sU](mu)(), sum);
-        });
-    }
-  }
+  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
+
+    int Ls = Btilde.Grid()->_fdimensions[0];
+
+    GaugeLinkField tmp(mat.Grid());
+    tmp = Zero();
+    auto tmp_v = tmp.View();
+    auto Atilde_v = Atilde.View();
+    auto Btilde_v = Btilde.View();
+    thread_for(ss,tmp.Grid()->oSites(),{
+        for (int s = 0; s < Ls; s++) {
+          int sF = s + Ls * ss;
+          auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
+          tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
+        }
+      });
+    PokeIndex<LorentzIndex>(mat, tmp, mu);
+
+    return;
+  }
 };
@@ -409,8 +326,8 @@ typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffReal> Gparit
 typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffReal> GparityWilsonImplF; // Float
 typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffReal> GparityWilsonImplD; // Double
 
-//typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplRL; // Real.. whichever prec
-//typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplFH; // Float
-//typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplDF; // Double
+typedef GparityWilsonImpl<vComplex , FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplRL; // Real.. whichever prec
+typedef GparityWilsonImpl<vComplexF, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplFH; // Float
+typedef GparityWilsonImpl<vComplexD, FundamentalRepresentation,CoeffRealHalfComms> GparityWilsonImplDF; // Double
 
 NAMESPACE_END(Grid);
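For orientation: the kernels in this file read the boundary twists from the Impl parameters (St.parameters.twists in multLink, Params.twists in DoubleStore). A minimal sketch of supplying them follows, assuming Grid's usual per-direction integer twist vector and the GparityWilsonImplD typedef kept above; the function name and twist values are illustrative, not part of this diff.

// Hedged usage sketch (illustrative): select G-parity twists per direction.
void setGparityTwists(GparityWilsonImplD::ImplParams &params) {
  // x,y,z twisted (G-parity); t twisted (antiperiodic in the left-hand convention).
  const std::vector<int> twists({1, 1, 1, 1});
  for (int mu = 0; mu < Nd; mu++) params.twists[mu] = twists[mu];
  // params is then handed to the fermion action constructor taking ImplParams.
}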

View File

@@ -141,11 +141,8 @@ public:
   void ImportGauge(const GaugeField &_Uthin, const GaugeField &_Ufat);
   void ImportGaugeSimple(const GaugeField &_UUU ,const GaugeField &_U);
   void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
-  virtual DoubledGaugeField &GetDoubledGaugeField(void) override { return Umu; };
-  virtual DoubledGaugeField &GetDoubledGaugeFieldE(void) override { return UmuEven; };
-  virtual DoubledGaugeField &GetDoubledGaugeFieldO(void) override { return UmuOdd; };
-  virtual DoubledGaugeField &GetU(void) { return Umu ; } ;
-  virtual DoubledGaugeField &GetUUU(void) { return UUUmu; };
+  DoubledGaugeField &GetU(void) { return Umu ; } ;
+  DoubledGaugeField &GetUUU(void) { return UUUmu; };
   void CopyGaugeCheckerboards(void);
 ///////////////////////////////////////////////////////////////

View File

@@ -61,8 +61,8 @@ public:
   double DhopCalls;
   double DhopCommTime;
   double DhopComputeTime;
   double DhopComputeTime2;
   double DhopFaceTime;
 ///////////////////////////////////////////////////////////////
 // Implement the abstract base
@@ -160,20 +160,17 @@ public:
                       RealD _c1=1.0, RealD _c2=1.0,RealD _u0=1.0,
                       const ImplParams &p= ImplParams());
 
   // DoubleStore gauge field in operator
   void ImportGauge (const GaugeField &_Uthin ) { assert(0); }
   void ImportGauge(const GaugeField &_Uthin,const GaugeField &_Ufat);
   void ImportGaugeSimple(const GaugeField &_UUU,const GaugeField &_U);
   void ImportGaugeSimple(const DoubledGaugeField &_UUU,const DoubledGaugeField &_U);
   // Give a reference; can be used to do an assignment or copy back out after import
   // if Carleton wants to cache them and not use the ImportSimple
-  virtual DoubledGaugeField &GetDoubledGaugeField(void) override { return Umu; };
-  virtual DoubledGaugeField &GetDoubledGaugeFieldE(void) override { return UmuEven; };
-  virtual DoubledGaugeField &GetDoubledGaugeFieldO(void) override { return UmuOdd; };
   DoubledGaugeField &GetU(void) { return Umu ; } ;
   DoubledGaugeField &GetUUU(void) { return UUUmu; };
   void CopyGaugeCheckerboards(void);
 ///////////////////////////////////////////////////////////////
 // Data members require to support the functionality
 ///////////////////////////////////////////////////////////////
@@ -211,7 +208,7 @@ public:
   LebesgueOrder LebesgueEvenOdd;
 
   // Comms buffer
-  //  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
+  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;
 
 ///////////////////////////////////////////////////////////////
 // Conserved current utilities
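The comment in the hunk above ("Give a reference; can be used to do an assignment or copy back out after import / if Carleton wants to cache them") describes a cache-and-reuse pattern for the doubled gauge fields. A minimal sketch under that reading; the helper function and the ImprovedStaggeredFermion5DD typedef usage are assumptions for illustration, not part of the diff.

// Hedged sketch: import externally smeared links once, then hold references.
void cacheDoubledLinks(ImprovedStaggeredFermion5DD &Ds,
                       const LatticeGaugeField &UUU,   // caller-smeared Naik (three-hop) links
                       const LatticeGaugeField &U) {   // caller-smeared fat links
  Ds.ImportGaugeSimple(UUU, U);   // no internal smearing performed
  auto &Ucached   = Ds.GetU();    // references into the operator, no copy made
  auto &UUUcached = Ds.GetUUU();
  Ds.CopyGaugeCheckerboards();    // refresh the even/odd halves after import
}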

View File

@@ -85,7 +85,7 @@ class MADWF
     maxiter =_maxiter;
   };
 
-  void operator() (const FermionFieldo &src,FermionFieldo &sol5)
+  void operator() (const FermionFieldo &src4,FermionFieldo &sol5)
   {
     std::cout << GridLogMessage<< " ************************************************" << std::endl;
     std::cout << GridLogMessage<< " MADWF-like algorithm " << std::endl;
@@ -114,16 +114,8 @@ class MADWF
     ///////////////////////////////////////
     //Import source, include Dminus factors
     ///////////////////////////////////////
-    GridBase *src_grid = src.Grid();
-
-    assert( (src_grid == Mato.GaugeGrid()) || (src_grid == Mato.FermionGrid()));
-
-    if ( src_grid == Mato.GaugeGrid() ) {
-      Mato.ImportPhysicalFermionSource(src,b);
-    } else {
-      b=src;
-    }
-    std::cout << GridLogMessage << " src " <<norm2(src)<<std::endl;
+    Mato.ImportPhysicalFermionSource(src4,b);
+    std::cout << GridLogMessage << " src4 " <<norm2(src4)<<std::endl;
     std::cout << GridLogMessage << " b " <<norm2(b)<<std::endl;
     defect = b;
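The removed (left-hand) variant dispatches on the grid of the source: operator() accepts either a 4d source, which is imported with the Dminus factors, or a ready-made 5d source used directly as b. A calling sketch, assuming a MADWF instance madwf built around an outer operator Mato and the class's own FermionFieldo typedef; all names here are illustrative.

FermionFieldo src4(Mato.GaugeGrid());    // 4d source: goes through ImportPhysicalFermionSource
FermionFieldo src5(Mato.FermionGrid());  // 5d source: used as-is (b = src)
FermionFieldo sol5(Mato.FermionGrid());
madwf(src4, sol5);   // accepted by the left-hand (removed) version
madwf(src5, sol5);   // also accepted there; the right-hand version handles only the 4d path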

View File

@@ -1,197 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/ImprovedStaggered.h
Copyright (C) 2015
Author: Azusa Yamaguchi, Peter Boyle
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_QCD_NAIVE_STAG_FERMION_H
#define GRID_QCD_NAIVE_STAG_FERMION_H
NAMESPACE_BEGIN(Grid);
class NaiveStaggeredFermionStatic {
public:
static const std::vector<int> directions;
static const std::vector<int> displacements;
static const int npoint = 8;
};
template <class Impl>
class NaiveStaggeredFermion : public StaggeredKernels<Impl>, public NaiveStaggeredFermionStatic {
public:
INHERIT_IMPL_TYPES(Impl);
typedef StaggeredKernels<Impl> Kernels;
FermionField _tmp;
FermionField &tmp(void) { return _tmp; }
////////////////////////////////////////
// Performance monitoring
////////////////////////////////////////
void Report(void);
void ZeroCounters(void);
double DhopTotalTime;
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
///////////////////////////////////////////////////////////////
// Implement the abstract base
///////////////////////////////////////////////////////////////
GridBase *GaugeGrid(void) { return _grid; }
GridBase *GaugeRedBlackGrid(void) { return _cbgrid; }
GridBase *FermionGrid(void) { return _grid; }
GridBase *FermionRedBlackGrid(void) { return _cbgrid; }
//////////////////////////////////////////////////////////////////
// override multiply; cut number routines if pass dagger argument
// and also make interface more uniformly consistent
//////////////////////////////////////////////////////////////////
void M(const FermionField &in, FermionField &out);
void Mdag(const FermionField &in, FermionField &out);
/////////////////////////////////////////////////////////
// half checkerboard operations
/////////////////////////////////////////////////////////
void Meooe(const FermionField &in, FermionField &out);
void MeooeDag(const FermionField &in, FermionField &out);
void Mooee(const FermionField &in, FermionField &out);
void MooeeDag(const FermionField &in, FermionField &out);
void MooeeInv(const FermionField &in, FermionField &out);
void MooeeInvDag(const FermionField &in, FermionField &out);
////////////////////////
// Derivative interface
////////////////////////
// Interface calls an internal routine
void DhopDeriv (GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
void DhopDerivOE(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
void DhopDerivEO(GaugeField &mat, const FermionField &U, const FermionField &V, int dag);
///////////////////////////////////////////////////////////////
// non-hermitian hopping term; half cb or both
///////////////////////////////////////////////////////////////
void Dhop (const FermionField &in, FermionField &out, int dag);
void DhopOE(const FermionField &in, FermionField &out, int dag);
void DhopEO(const FermionField &in, FermionField &out, int dag);
///////////////////////////////////////////////////////////////
// Multigrid assistance; force term uses too
///////////////////////////////////////////////////////////////
void Mdir(const FermionField &in, FermionField &out, int dir, int disp);
void MdirAll(const FermionField &in, std::vector<FermionField> &out);
void DhopDir(const FermionField &in, FermionField &out, int dir, int disp);
///////////////////////////////////////////////////////////////
// Extra methods added by derived
///////////////////////////////////////////////////////////////
void DerivInternal(StencilImpl &st,
DoubledGaugeField &U,
GaugeField &mat,
const FermionField &A, const FermionField &B, int dag);
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
//////////////////////////////////////////////////////////////////////////
// Grid own interface Constructor
//////////////////////////////////////////////////////////////////////////
NaiveStaggeredFermion(GaugeField &_U, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass,
RealD _c1, RealD _u0,
const ImplParams &p = ImplParams());
NaiveStaggeredFermion(GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass,
RealD _c1, RealD _u0,
const ImplParams &p = ImplParams());
// DoubleStore impl dependent
void ImportGauge (const GaugeField &_U );
DoubledGaugeField &GetDoubledGaugeField(void){ return Umu; };
DoubledGaugeField &GetDoubledGaugeFieldE(void){ return UmuEven; };
DoubledGaugeField &GetDoubledGaugeFieldO(void){ return UmuOdd; };
DoubledGaugeField &GetU(void) { return Umu ; } ;
void CopyGaugeCheckerboards(void);
///////////////////////////////////////////////////////////////
// Data members require to support the functionality
///////////////////////////////////////////////////////////////
// protected:
public:
// any other parameters of action ???
virtual int isTrivialEE(void) { return 1; };
virtual RealD Mass(void) { return mass; }
RealD mass;
RealD u0;
RealD c1;
GridBase *_grid;
GridBase *_cbgrid;
// Defines the stencils for even and odd
StencilImpl Stencil;
StencilImpl StencilEven;
StencilImpl StencilOdd;
// Copy of the gauge field , with even and odd subsets
DoubledGaugeField Umu;
DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
///////////////////////////////////////////////////////////////
// Conserved current utilities
///////////////////////////////////////////////////////////////
void ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
PropagatorField &src,
Current curr_type,
unsigned int mu);
void SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
PropagatorField &srct,
Current curr_type,
unsigned int mu,
unsigned int tmin,
unsigned int tmax,
ComplexField &lattice_cmplx);
};
typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
NAMESPACE_END(Grid);
#endif
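A construction sketch for the header removed above, following its constructor signature and the trailing typedefs; the wrapper function, mass, and coefficients are placeholder assumptions for illustration.

// Hedged sketch: build the naive staggered operator and apply it.
void applyNaiveStaggered(LatticeGaugeField &U,
                         GridCartesian &FGrid, GridRedBlackCartesian &FrbGrid,
                         NaiveStaggeredFermionD::FermionField &src,
                         NaiveStaggeredFermionD::FermionField &out) {
  RealD mass = 0.1, c1 = 1.0, u0 = 1.0;            // placeholder couplings
  NaiveStaggeredFermionD Dns(U, FGrid, FrbGrid, mass, c1, u0);
  Dns.M(src, out);               // full operator
  Dns.Dhop(src, out, DaggerNo);  // hopping term only
}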

View File

@@ -1,534 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/SchurFactoredFermionOperator.h
Copyright (C) 2021
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/qcd/utils/MixedPrecisionOperatorFunction.h>
#include <Grid/qcd/action/domains/Domains.h>
NAMESPACE_BEGIN(Grid);
////////////////////////////////////////////////////////
// Some explanation of class structure for domain decomposition:
//
// Need a dirichlet operator for two flavour determinant - acts on both Omega and OmegaBar.
//
// Possible gain if the global sums and CG are run independently?? Could measure this.
//
// Types of operations
//
// 1) assemble local det dOmega det dOmegaBar pseudofermion
//
// - DirichletFermionOperator - can either do a global solve, or independent/per cell coefficients.
//
// 2) assemble dOmegaInverse and dOmegaBarInverse in R
//
// - DirichletFermionOperator - can also be used to
// - need two or more cells per node. Options
// - a) solve one cell at a time, no new code, CopyRegion and reduced /split Grids
// - b) solve multiple cells in parallel. predicated dslash implementation
//
//      - b) has more parallelism; experience with the block solver suggests it might not be algorithmically inefficient.
//           a) gives more cache-friendly and simpler code.
// b) is easy to implement in a "trial" or inefficient code with projection.
//
// 3) Additional functionality for domain operations
//
// - SchurFactoredFermionOperator - Need a DDHMC utility - whether used in two flavour or one flavour
//
// - dBoundary - needs non-dirichlet operator
// - Contains one Dirichlet Op, and one non-Dirichlet op. Implements dBoundary etc...
// - The Dirichlet ops can be passed to dOmega(Bar) solvers etc...
//
////////////////////////////////////////////////////////
template<class ImplD,class ImplF>
class SchurFactoredFermionOperator : public ImplD
{
INHERIT_IMPL_TYPES(ImplD);
typedef typename ImplF::FermionField FermionFieldF;
typedef typename ImplD::FermionField FermionFieldD;
typedef SchurDiagMooeeOperator<FermionOperator<ImplD>,FermionFieldD> LinearOperatorD;
typedef SchurDiagMooeeOperator<FermionOperator<ImplF>,FermionFieldF> LinearOperatorF;
typedef SchurDiagMooeeDagOperator<FermionOperator<ImplD>,FermionFieldD> LinearOperatorDagD;
typedef SchurDiagMooeeDagOperator<FermionOperator<ImplF>,FermionFieldF> LinearOperatorDagF;
typedef MixedPrecisionConjugateGradientOperatorFunction<FermionOperator<ImplD>,
FermionOperator<ImplF>,
LinearOperatorD,
LinearOperatorF> MxPCG;
typedef MixedPrecisionConjugateGradientOperatorFunction<FermionOperator<ImplD>,
FermionOperator<ImplF>,
LinearOperatorDagD,
LinearOperatorDagF> MxDagPCG;
public:
GridBase *FermionGrid(void) { return PeriodicFermOpD.FermionGrid(); };
GridBase *GaugeGrid(void) { return PeriodicFermOpD.GaugeGrid(); };
FermionOperator<ImplD> & DirichletFermOpD;
FermionOperator<ImplF> & DirichletFermOpF;
FermionOperator<ImplD> & PeriodicFermOpD;
FermionOperator<ImplF> & PeriodicFermOpF;
LinearOperatorD DirichletLinOpD;
LinearOperatorF DirichletLinOpF;
LinearOperatorD PeriodicLinOpD;
LinearOperatorF PeriodicLinOpF;
LinearOperatorDagD DirichletLinOpDagD;
LinearOperatorDagF DirichletLinOpDagF;
LinearOperatorDagD PeriodicLinOpDagD;
LinearOperatorDagF PeriodicLinOpDagF;
// Can tinker with these in the pseudofermion for force vs. action solves
Integer maxinnerit;
Integer maxouterit;
RealD tol;
RealD tolinner;
Coordinate Block;
DomainDecomposition Domains;
SchurFactoredFermionOperator(FermionOperator<ImplD> & _PeriodicFermOpD,
FermionOperator<ImplF> & _PeriodicFermOpF,
FermionOperator<ImplD> & _DirichletFermOpD,
FermionOperator<ImplF> & _DirichletFermOpF,
Coordinate &_Block)
: Block(_Block), Domains(Block),
PeriodicFermOpD(_PeriodicFermOpD),
PeriodicFermOpF(_PeriodicFermOpF),
DirichletFermOpD(_DirichletFermOpD),
DirichletFermOpF(_DirichletFermOpF),
DirichletLinOpD(DirichletFermOpD),
DirichletLinOpF(DirichletFermOpF),
PeriodicLinOpD(PeriodicFermOpD),
PeriodicLinOpF(PeriodicFermOpF),
DirichletLinOpDagD(DirichletFermOpD),
DirichletLinOpDagF(DirichletFermOpF),
PeriodicLinOpDagD(PeriodicFermOpD),
PeriodicLinOpDagF(PeriodicFermOpF)
{
tol=1.0e-10;
tolinner=1.0e-6;
maxinnerit=1000;
maxouterit=10;
assert(PeriodicFermOpD.FermionGrid() == DirichletFermOpD.FermionGrid());
assert(PeriodicFermOpF.FermionGrid() == DirichletFermOpF.FermionGrid());
};
enum Domain { Omega=0, OmegaBar=1 };
void ImportGauge(const GaugeField &Umu)
{
// Single precision will update in the mixed prec CG
PeriodicFermOpD.ImportGauge(Umu);
GaugeField dUmu(Umu.Grid());
dUmu=Umu;
// DirchletBCs(dUmu);
DirichletFilter<GaugeField> Filter(Block);
Filter.applyFilter(dUmu);
DirichletFermOpD.ImportGauge(dUmu);
}
/*
void ProjectBoundaryBothDomains (FermionField &f,int sgn)
{
assert((sgn==1)||(sgn==-1));
Real rsgn = sgn;
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
GridBase *grid = f.Grid();
LatticeInteger coor(grid);
LatticeInteger face(grid);
LatticeInteger one(grid); one = 1;
LatticeInteger zero(grid); zero = 0;
LatticeInteger nface(grid); nface=Zero();
FermionField projected(grid); projected=Zero();
FermionField sp_proj (grid);
int dims = grid->Nd();
int isDWF= (dims==Nd+1);
assert((dims==Nd)||(dims==Nd+1));
Coordinate Global=grid->GlobalDimensions();
for(int mu=0;mu<Nd;mu++){
if ( Block[mu] <= Global[mu+isDWF] ) {
// need to worry about DWF 5th dim first
LatticeCoordinate(coor,mu+isDWF);
face = where(mod(coor,Block[mu]) == Integer(0),one,zero );
nface = nface + face;
Gamma G(Gmu[mu]);
// Lower face receives (1-gamma)/2 in normal forward hopping term
sp_proj = 0.5*(f-G*f*rsgn);
projected= where(face,sp_proj,projected);
//projected= where(face,f,projected);
face = where(mod(coor,Block[mu]) == Integer(Block[mu]-1) ,one,zero );
nface = nface + face;
// Upper face receives (1+gamma)/2 in normal backward hopping term
sp_proj = 0.5*(f+G*f*rsgn);
projected= where(face,sp_proj,projected);
//projected= where(face,f,projected);
}
}
// Initial Zero() where nface==0.
// Keep the spin projected faces where nface==1
// Full spinor where nface>=2
projected = where(nface>Integer(1),f,projected);
f=projected;
}
*/
void ProjectBoundaryBothDomains (FermionField &f,int sgn)
{
assert((sgn==1)||(sgn==-1));
Real rsgn = sgn;
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
GridBase *grid = f.Grid();
LatticeInteger coor(grid);
LatticeInteger face(grid);
LatticeInteger one(grid); one = 1;
LatticeInteger zero(grid); zero = 0;
LatticeInteger omega(grid);
LatticeInteger omegabar(grid);
LatticeInteger tmp(grid);
omega=one; Domains.ProjectDomain(omega,0);
omegabar=one; Domains.ProjectDomain(omegabar,1);
LatticeInteger nface(grid); nface=Zero();
FermionField projected(grid); projected=Zero();
FermionField sp_proj (grid);
int dims = grid->Nd();
int isDWF= (dims==Nd+1);
assert((dims==Nd)||(dims==Nd+1));
Coordinate Global=grid->GlobalDimensions();
for(int mmu=0;mmu<Nd;mmu++){
Gamma G(Gmu[mmu]);
// need to worry about DWF 5th dim first
int mu = mmu+isDWF;
if ( Block[mmu] && (Block[mmu] <= Global[mu]) ) {
// Lower face receives (1-gamma)/2 in normal forward hopping term
tmp = Cshift(omegabar,mu,-1);
tmp = tmp + omega;
face = where(tmp == Integer(2),one,zero );
tmp = Cshift(omega,mu,-1);
tmp = tmp + omegabar;
face = where(tmp == Integer(2),one,face );
nface = nface + face;
sp_proj = 0.5*(f-G*f*rsgn);
projected= where(face,sp_proj,projected);
// Upper face receives (1+gamma)/2 in normal backward hopping term
tmp = Cshift(omegabar,mu,1);
tmp = tmp + omega;
face = where(tmp == Integer(2),one,zero );
tmp = Cshift(omega,mu,1);
tmp = tmp + omegabar;
face = where(tmp == Integer(2),one,face );
nface = nface + face;
sp_proj = 0.5*(f+G*f*rsgn);
projected= where(face,sp_proj,projected);
}
}
// Initial Zero() where nface==0.
// Keep the spin projected faces where nface==1
// Full spinor where nface>=2
projected = where(nface>Integer(1),f,projected);
f=projected;
}
void ProjectDomain(FermionField &f,int domain)
{
/*
GridBase *grid = f.Grid();
int dims = grid->Nd();
int isDWF= (dims==Nd+1);
assert((dims==Nd)||(dims==Nd+1));
FermionField zz(grid); zz=Zero();
LatticeInteger coor(grid);
LatticeInteger domaincb(grid); domaincb=Zero();
for(int d=0;d<Nd;d++){
LatticeCoordinate(coor,d+isDWF);
domaincb = domaincb + div(coor,Block[d]);
}
f = where(mod(domaincb,2)==Integer(domain),f,zz);
*/
Domains.ProjectDomain(f,domain);
};
void ProjectOmegaBar (FermionField &f) {ProjectDomain(f,OmegaBar);}
void ProjectOmega (FermionField &f) {ProjectDomain(f,Omega);}
// See my notes(!).
// Notation: Following Luscher, we introduce projectors $\hPdb$ with both spinor and space structure
// projecting all spinor elements in $\Omega$ connected by $\Ddb$ to $\bar{\Omega}$,
void ProjectBoundaryBar(FermionField &f)
{
ProjectBoundaryBothDomains(f,1);
ProjectOmega(f);
}
// and $\hPd$ projecting all spinor elements in $\bar{\Omega}$ connected by $\Dd$ to $\Omega$.
void ProjectBoundary (FermionField &f)
{
ProjectBoundaryBothDomains(f,1);
ProjectOmegaBar(f);
// DumpSliceNorm("ProjectBoundary",f,f.Grid()->Nd()-1);
};
void dBoundary (FermionField &in,FermionField &out)
{
FermionField tmp(in);
ProjectOmegaBar(tmp);
PeriodicFermOpD.M(tmp,out);
ProjectOmega(out);
};
void dBoundaryDag (FermionField &in,FermionField &out)
{
FermionField tmp(in);
ProjectOmega(tmp);
PeriodicFermOpD.Mdag(tmp,out);
ProjectOmegaBar(out);
};
void dBoundaryBar (FermionField &in,FermionField &out)
{
FermionField tmp(in);
ProjectOmega(tmp);
PeriodicFermOpD.M(tmp,out);
ProjectOmegaBar(out);
};
void dBoundaryBarDag (FermionField &in,FermionField &out)
{
FermionField tmp(in);
ProjectOmegaBar(tmp);
PeriodicFermOpD.Mdag(tmp,out);
ProjectOmega(out);
};
void dOmega (FermionField &in,FermionField &out)
{
FermionField tmp(in);
ProjectOmega(tmp);
DirichletFermOpD.M(tmp,out);
ProjectOmega(out);
};
void dOmegaBar (FermionField &in,FermionField &out)
{
FermionField tmp(in);
ProjectOmegaBar(tmp);
DirichletFermOpD.M(tmp,out);
ProjectOmegaBar(out);
};
void dOmegaDag (FermionField &in,FermionField &out)
{
FermionField tmp(in);
ProjectOmega(tmp);
DirichletFermOpD.Mdag(tmp,out);
ProjectOmega(out);
};
void dOmegaBarDag (FermionField &in,FermionField &out)
{
FermionField tmp(in);
ProjectOmegaBar(tmp);
DirichletFermOpD.Mdag(tmp,out);
ProjectOmegaBar(out);
};
void dOmegaInv (FermionField &in,FermionField &out)
{
FermionField tmp(in);
ProjectOmega(tmp);
dOmegaInvAndOmegaBarInv(tmp,out); // Inefficient warning
ProjectOmega(out);
};
void dOmegaBarInv(FermionField &in,FermionField &out)
{
FermionField tmp(in);
ProjectOmegaBar(tmp);
dOmegaInvAndOmegaBarInv(tmp,out);
ProjectOmegaBar(out);
};
void dOmegaDagInv (FermionField &in,FermionField &out)
{
FermionField tmp(in);
ProjectOmega(tmp);
dOmegaDagInvAndOmegaBarDagInv(tmp,out);
ProjectOmega(out);
};
void dOmegaBarDagInv(FermionField &in,FermionField &out)
{
FermionField tmp(in);
ProjectOmegaBar(tmp);
dOmegaDagInvAndOmegaBarDagInv(tmp,out);
ProjectOmegaBar(out);
};
void dOmegaInvAndOmegaBarInv(FermionField &in,FermionField &out)
{
MxPCG OmegaSolver(tol,
tolinner,
maxinnerit,
maxouterit,
DirichletFermOpF.FermionRedBlackGrid(),
DirichletFermOpF,
DirichletFermOpD,
DirichletLinOpF,
DirichletLinOpD);
SchurRedBlackDiagMooeeSolve<FermionField> PrecSolve(OmegaSolver);
PrecSolve(DirichletFermOpD,in,out);
};
void dOmegaDagInvAndOmegaBarDagInv(FermionField &in,FermionField &out)
{
MxDagPCG OmegaDagSolver(tol,
tolinner,
maxinnerit,
maxouterit,
DirichletFermOpF.FermionRedBlackGrid(),
DirichletFermOpF,
DirichletFermOpD,
DirichletLinOpDagF,
DirichletLinOpDagD);
SchurRedBlackDiagMooeeDagSolve<FermionField> PrecSolve(OmegaDagSolver);
PrecSolve(DirichletFermOpD,in,out);
};
// Rdag = Pdbar - DdbarDag DomegabarDagInv DdDag DomegaDagInv Pdbar
void RDag(FermionField &in,FermionField &out)
{
FermionField tmp1(PeriodicFermOpD.FermionGrid());
FermionField tmp2(PeriodicFermOpD.FermionGrid());
out = in;
ProjectBoundaryBar(out);
dOmegaDagInv(out,tmp1);
dBoundaryDag(tmp1,tmp2);
dOmegaBarDagInv(tmp2,tmp1);
dBoundaryBarDag(tmp1,tmp2);
out = out - tmp2;
};
// R = Pdbar - Pdbar DomegaInv Dd DomegabarInv Ddbar
void R(FermionField &in,FermionField &out)
{
FermionField tmp1(PeriodicFermOpD.FermionGrid());
FermionField tmp2(PeriodicFermOpD.FermionGrid());
out = in;
ProjectBoundaryBar(out);
dBoundaryBar(out,tmp1);
dOmegaBarInv(tmp1,tmp2);
dBoundary(tmp2,tmp1);
dOmegaInv(tmp1,tmp2);
out = in - tmp2 ;
ProjectBoundaryBar(out);
// DumpSliceNorm("R",out,out.Grid()->Nd()-1);
};
// R = Pdbar - Pdbar Dinv Ddbar
void RInv(FermionField &in,FermionField &out)
{
FermionField tmp1(PeriodicFermOpD.FermionGrid());
dBoundaryBar(in,out);
Dinverse(out,tmp1);
out =in -tmp1;
ProjectBoundaryBar(out);
};
// R = Pdbar - DdbarDag DinvDag Pdbar
void RDagInv(FermionField &in,FermionField &out)
{
FermionField tmp(PeriodicFermOpD.FermionGrid());
FermionField Pin(PeriodicFermOpD.FermionGrid());
Pin = in; ProjectBoundaryBar(Pin);
DinverseDag(Pin,out);
dBoundaryBarDag(out,tmp);
out =Pin -tmp;
};
// Non-dirichlet inverter using red-black preconditioning
void Dinverse(FermionField &in,FermionField &out)
{
MxPCG DSolver(tol,
tolinner,
maxinnerit,
maxouterit,
PeriodicFermOpF.FermionRedBlackGrid(),
PeriodicFermOpF,
PeriodicFermOpD,
PeriodicLinOpF,
PeriodicLinOpD);
SchurRedBlackDiagMooeeSolve<FermionField> Solve(DSolver);
Solve(PeriodicFermOpD,in,out);
}
void DinverseDag(FermionField &in,FermionField &out)
{
MxDagPCG DdagSolver(tol,
tolinner,
maxinnerit,
maxouterit,
PeriodicFermOpF.FermionRedBlackGrid(),
PeriodicFermOpF,
PeriodicFermOpD,
PeriodicLinOpDagF,
PeriodicLinOpDagD);
SchurRedBlackDiagMooeeDagSolve<FermionField> Solve(DdagSolver);
Solve(PeriodicFermOpD,in,out);
}
};
NAMESPACE_END(Grid);
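The comments above define R = Pdbar - Pdbar DomegaInv Dd DomegabarInv Ddbar and RInv = Pdbar - Pdbar Dinv Ddbar, which the construction intends to be mutual inverses on the boundary-projected subspace. A round-trip check sketch follows; the operator instance S, the source eta, and the helper name are assumptions for illustration, not part of the removed file.

// Hedged sketch: verify RInv R ~ identity on the projected subspace.
template<class ImplD, class ImplF>
void checkRRoundTrip(SchurFactoredFermionOperator<ImplD, ImplF> &S,
                     typename ImplD::FermionField &eta) {
  typename ImplD::FermionField tmp(eta.Grid()), back(eta.Grid());
  S.ProjectBoundaryBar(eta);    // restrict eta to the subspace R acts on
  S.R(eta, tmp);
  S.RInv(tmp, back);
  back = back - eta;            // expected to vanish up to the solver tolerances
  std::cout << GridLogMessage << " |RInv R eta - eta|^2 = " << norm2(back) << std::endl;
}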

Some files were not shown because too many files have changed in this diff.