mirror of https://github.com/paboyle/Grid.git synced 2025-06-23 10:12:02 +01:00

Compare commits


1 Commit

Author SHA1 Message Date
8b91b61b61 First cut at faster GPU slice sum via atomics 2022-12-22 15:13:45 -05:00
493 changed files with 7184 additions and 36554 deletions

View File

@ -1,54 +0,0 @@
name: Bug report
description: Report a bug.
title: "<insert title>"
labels: [bug]
body:
- type: markdown
attributes:
value: >
Thank you for taking the time to file a bug report.
Please check that the code is pointing to the HEAD of develop
or any commit in master which is tagged with a version number.
- type: textarea
attributes:
label: "Describe the issue:"
description: >
Describe the issue and any previous attempt to solve it.
validations:
required: true
- type: textarea
attributes:
label: "Code example:"
description: >
If relevant, show how to reproduce the issue using a minimal working
example.
placeholder: |
<< your code here >>
render: shell
validations:
required: false
- type: textarea
attributes:
label: "Target platform:"
description: >
Give a description of the target platform (CPU, network, compiler).
Please give the full CPU part description, using for example
`cat /proc/cpuinfo | grep 'model name' | uniq` (Linux)
or `sysctl machdep.cpu.brand_string` (macOS), and the full output of
the `--version` option of your compiler.
validations:
required: true
- type: textarea
attributes:
label: "Configure options:"
description: >
Please give the exact configure command used and attach
`config.log`, `grid.config.summary` and the output of `make V=1`.
render: shell
validations:
required: true

View File

@ -45,7 +45,7 @@ directory
//disables nvcc specific warning in json.hpp //disables nvcc specific warning in json.hpp
#pragma clang diagnostic ignored "-Wdeprecated-register" #pragma clang diagnostic ignored "-Wdeprecated-register"
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ #if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
//disables nvcc specific warning in json.hpp //disables nvcc specific warning in json.hpp
#pragma nv_diag_suppress unsigned_compare_with_zero #pragma nv_diag_suppress unsigned_compare_with_zero
#pragma nv_diag_suppress cast_to_qualified_type #pragma nv_diag_suppress cast_to_qualified_type
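
Aside: the feature-test macro is the more robust guard here. A raw version test such as (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5) is false for CUDA 12.0 (minor version 0), whereas nvcc defines __NVCC_DIAG_PRAGMA_SUPPORT__ itself whenever the nv_diag_* spellings are available. A minimal sketch of the pattern, assuming only standard nvcc predefined macros:

#ifdef __NVCC__
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ // nv_diag_* spelling available
#pragma nv_diag_suppress code_is_unreachable
#else // older nvcc spelling
#pragma diag_suppress code_is_unreachable
#endif
#endif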

View File

@ -44,10 +44,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridStd.h> #include <Grid/GridStd.h>
#include <Grid/threads/Pragmas.h> #include <Grid/threads/Pragmas.h>
#include <Grid/perfmon/Timer.h> #include <Grid/perfmon/Timer.h>
//#include <Grid/perfmon/PerfCount.h> #include <Grid/perfmon/PerfCount.h>
#include <Grid/util/Util.h> #include <Grid/util/Util.h>
#include <Grid/log/Log.h> #include <Grid/log/Log.h>
#include <Grid/perfmon/Tracing.h>
#include <Grid/allocator/Allocator.h> #include <Grid/allocator/Allocator.h>
#include <Grid/simd/Simd.h> #include <Grid/simd/Simd.h>
#include <Grid/threads/ThreadReduction.h> #include <Grid/threads/ThreadReduction.h>

View File

@ -36,7 +36,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#include <Grid/qcd/QCD.h> #include <Grid/qcd/QCD.h>
#include <Grid/qcd/spin/Spin.h> #include <Grid/qcd/spin/Spin.h>
#include <Grid/qcd/gparity/Gparity.h>
#include <Grid/qcd/utils/Utils.h> #include <Grid/qcd/utils/Utils.h>
#include <Grid/qcd/representations/Representations.h> #include <Grid/qcd/representations/Representations.h>
NAMESPACE_CHECK(GridQCDCore); NAMESPACE_CHECK(GridQCDCore);

View File

@ -14,7 +14,7 @@
/* NVCC save and restore compile environment*/ /* NVCC save and restore compile environment*/
#ifdef __NVCC__ #ifdef __NVCC__
#pragma push #pragma push
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ #if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
#pragma nv_diag_suppress code_is_unreachable #pragma nv_diag_suppress code_is_unreachable
#else #else
#pragma diag_suppress code_is_unreachable #pragma diag_suppress code_is_unreachable

View File

@ -66,10 +66,6 @@ if BUILD_FERMION_REPS
extra_sources+=$(ADJ_FERMION_FILES) extra_sources+=$(ADJ_FERMION_FILES)
extra_sources+=$(TWOIND_FERMION_FILES) extra_sources+=$(TWOIND_FERMION_FILES)
endif endif
if BUILD_SP
extra_sources+=$(SP_FERMION_FILES)
extra_sources+=$(SP_TWOIND_FERMION_FILES)
endif
lib_LIBRARIES = libGrid.a lib_LIBRARIES = libGrid.a

View File

@ -54,8 +54,6 @@ NAMESPACE_CHECK(BiCGSTAB);
#include <Grid/algorithms/iterative/SchurRedBlack.h> #include <Grid/algorithms/iterative/SchurRedBlack.h>
#include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h> #include <Grid/algorithms/iterative/ConjugateGradientMultiShift.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h> #include <Grid/algorithms/iterative/ConjugateGradientMixedPrec.h>
#include <Grid/algorithms/iterative/ConjugateGradientMultiShiftMixedPrec.h>
#include <Grid/algorithms/iterative/ConjugateGradientMixedPrecBatched.h>
#include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h> #include <Grid/algorithms/iterative/BiCGSTABMixedPrec.h>
#include <Grid/algorithms/iterative/BlockConjugateGradient.h> #include <Grid/algorithms/iterative/BlockConjugateGradient.h>
#include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h> #include <Grid/algorithms/iterative/ConjugateGradientReliableUpdate.h>
@ -69,8 +67,7 @@ NAMESPACE_CHECK(BiCGSTAB);
#include <Grid/algorithms/iterative/PowerMethod.h> #include <Grid/algorithms/iterative/PowerMethod.h>
NAMESPACE_CHECK(PowerMethod); NAMESPACE_CHECK(PowerMethod);
#include <Grid/algorithms/multigrid/MultiGrid.h> #include <Grid/algorithms/CoarsenedMatrix.h>
NAMESPACE_CHECK(CoarsendMatrix); NAMESPACE_CHECK(CoarsendMatrix);
#include <Grid/algorithms/FFT.h> #include <Grid/algorithms/FFT.h>

View File

@ -56,6 +56,243 @@ inline void blockMaskedInnerProduct(Lattice<CComplex> &CoarseInner,
blockSum(CoarseInner,fine_inner_msk); blockSum(CoarseInner,fine_inner_msk);
} }
class Geometry {
public:
int npoint;
int base;
std::vector<int> directions ;
std::vector<int> displacements;
std::vector<int> points_dagger;
Geometry(int _d) {
base = (_d==5) ? 1:0;
// make coarse grid stencil for 4d , not 5d
if ( _d==5 ) _d=4;
npoint = 2*_d+1;
directions.resize(npoint);
displacements.resize(npoint);
points_dagger.resize(npoint);
for(int d=0;d<_d;d++){
directions[d ] = d+base;
directions[d+_d] = d+base;
displacements[d ] = +1;
displacements[d+_d]= -1;
points_dagger[d ] = d+_d;
points_dagger[d+_d] = d;
}
directions [2*_d]=0;
displacements[2*_d]=0;
points_dagger[2*_d]=2*_d;
}
int point(int dir, int disp) {
assert(disp == -1 || disp == 0 || disp == 1);
assert(base+0 <= dir && dir < base+4);
// directions faster index = new indexing
// 4d (base = 0):
// point 0 1 2 3 4 5 6 7 8
// dir 0 1 2 3 0 1 2 3 0
// disp +1 +1 +1 +1 -1 -1 -1 -1 0
// 5d (base = 1):
// point 0 1 2 3 4 5 6 7 8
// dir 1 2 3 4 1 2 3 4 0
// disp +1 +1 +1 +1 -1 -1 -1 -1 0
// displacements faster index = old indexing
// 4d (base = 0):
// point 0 1 2 3 4 5 6 7 8
// dir 0 0 1 1 2 2 3 3 0
// disp +1 -1 +1 -1 +1 -1 +1 -1 0
// 5d (base = 1):
// point 0 1 2 3 4 5 6 7 8
// dir 1 1 2 2 3 3 4 4 0
// disp +1 -1 +1 -1 +1 -1 +1 -1 0
if(dir == 0 and disp == 0)
return 8;
else // New indexing
return (1 - disp) / 2 * 4 + dir - base;
// else // Old indexing
// return (4 * (dir - base) + 1 - disp) / 2;
}
};
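// Hedged aside (illustrative, not part of the file): sanity check of the
// "new indexing" formula above for the 4d case (base==0), using a
// hypothetical standalone helper.
#include <cassert>
static int point4d(int dir,int disp){
  if(dir==0 && disp==0) return 8; // self-coupling point
  return (1-disp)/2*4 + dir;      // disp=+1 -> points 0..3, disp=-1 -> 4..7
}
static void check_point4d(void){
  assert(point4d(2,+1)==2); // forward hop in direction 2
  assert(point4d(2,-1)==6); // backward hop in direction 2
  assert(point4d(0, 0)==8); // self point
}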
template<class Fobj,class CComplex,int nbasis>
class Aggregation {
public:
typedef iVector<CComplex,nbasis > siteVector;
typedef Lattice<siteVector> CoarseVector;
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField;
GridBase *CoarseGrid;
GridBase *FineGrid;
std::vector<Lattice<Fobj> > subspace;
int checkerboard;
int Checkerboard(void){return checkerboard;}
Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) :
CoarseGrid(_CoarseGrid),
FineGrid(_FineGrid),
subspace(nbasis,_FineGrid),
checkerboard(_checkerboard)
{
};
void Orthogonalise(void){
CoarseScalar InnerProd(CoarseGrid);
std::cout << GridLogMessage <<" Block Gram-Schmidt pass 1"<<std::endl;
blockOrthogonalise(InnerProd,subspace);
}
void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
blockProject(CoarseVec,FineVec,subspace);
}
void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
FineVec.Checkerboard() = subspace[0].Checkerboard();
blockPromote(CoarseVec,FineVec,subspace);
}
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis) {
RealD scale;
ConjugateGradient<FineField> CG(1.0e-2,100,false);
FineField noise(FineGrid);
FineField Mn(FineGrid);
for(int b=0;b<nn;b++){
subspace[b] = Zero();
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
for(int i=0;i<1;i++){
CG(hermop,noise,subspace[b]);
noise = subspace[b];
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
}
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
subspace[b] = noise;
}
}
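// Hedged aside: the loose CG above (tolerance 1.0e-2, single pass) is one
// step of inexact inverse iteration, v_b ~ (MdagM)^{-1} eta_b followed by
// normalisation, so each Gaussian noise vector is enriched in the low modes
// of MdagM before being stored in subspace[b].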
////////////////////////////////////////////////////////////////////////////////////////////////
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
// and this is the best I found
////////////////////////////////////////////////////////////////////////////////////////////////
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo,
int orderfilter,
int ordermin,
int orderstep,
double filterlo
) {
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
// Initial matrix element
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
int b =0;
{
// Filter
Chebyshev<FineField> Cheb(lo,hi,orderfilter);
Cheb(hermop,noise,Mn);
// normalise
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
b++;
}
// Generate a full sequence of Chebyshevs
{
lo=filterlo;
noise=Mn;
FineField T0(FineGrid); T0 = noise;
FineField T1(FineGrid);
FineField T2(FineGrid);
FineField y(FineGrid);
FineField *Tnm = &T0;
FineField *Tn = &T1;
FineField *Tnp = &T2;
// Tn=T1 = (xscale M + mscale)in
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
hermop.HermOp(T0,y);
T1=y*xscale+noise*mscale;
for(int n=2;n<=ordermin+orderstep*(nn-2);n++){
hermop.HermOp(*Tn,y);
autoView( y_v , y, AcceleratorWrite);
autoView( Tn_v , (*Tn), AcceleratorWrite);
autoView( Tnp_v , (*Tnp), AcceleratorWrite);
autoView( Tnm_v , (*Tnm), AcceleratorWrite);
const int Nsimd = CComplex::Nsimd();
accelerator_for(ss, FineGrid->oSites(), Nsimd, {
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
});
// Possibly more fine-grained control is needed than a linear sweep,
// but there is a huge productivity gain if this is a simple algorithm and not a tunable
int m =1;
if ( n>=ordermin ) m=n-ordermin;
if ( (m%orderstep)==0 ) {
Mn=*Tnp;
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
b++;
}
// Cycle pointers to avoid copies
FineField *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
}
}
assert(b==nn);
}
};
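// Hedged aside: the accelerator_for loop in CreateSubspaceChebyshev above is
// the standard Chebyshev three-term recurrence with the spectrum of MdagM
// mapped onto [-1,1]:
//   y    = xscale*(MdagM*Tn) + mscale*Tn,  xscale =  2/(hi-lo),
//                                          mscale = -(hi+lo)/(hi-lo)
//   Tn+1 = 2*y - Tn-1
// Every orderstep-th polynomial beyond ordermin is normalised and kept as a
// subspace vector, giving nn vectors in total (hence the assert(b==nn)).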
// Fine Object == (per site) type of fine field // Fine Object == (per site) type of fine field
// nbasis == number of deflation vectors // nbasis == number of deflation vectors
template<class Fobj,class CComplex,int nbasis> template<class Fobj,class CComplex,int nbasis>
@ -87,9 +324,9 @@ public:
GridBase* _cbgrid; GridBase* _cbgrid;
int hermitian; int hermitian;
CartesianStencil<siteVector,siteVector,DefaultImplParams> Stencil; CartesianStencil<siteVector,siteVector,int> Stencil;
CartesianStencil<siteVector,siteVector,DefaultImplParams> StencilEven; CartesianStencil<siteVector,siteVector,int> StencilEven;
CartesianStencil<siteVector,siteVector,DefaultImplParams> StencilOdd; CartesianStencil<siteVector,siteVector,int> StencilOdd;
std::vector<CoarseMatrix> A; std::vector<CoarseMatrix> A;
std::vector<CoarseMatrix> Aeven; std::vector<CoarseMatrix> Aeven;
@ -394,7 +631,7 @@ public:
assert(Aself != nullptr); assert(Aself != nullptr);
} }
void DselfInternal(CartesianStencil<siteVector,siteVector,DefaultImplParams> &st, CoarseMatrix &a, void DselfInternal(CartesianStencil<siteVector,siteVector,int> &st, CoarseMatrix &a,
const CoarseVector &in, CoarseVector &out, int dag) { const CoarseVector &in, CoarseVector &out, int dag) {
int point = geom.npoint-1; int point = geom.npoint-1;
autoView( out_v, out, AcceleratorWrite); autoView( out_v, out, AcceleratorWrite);
@ -457,7 +694,7 @@ public:
} }
} }
void DhopInternal(CartesianStencil<siteVector,siteVector,DefaultImplParams> &st, std::vector<CoarseMatrix> &a, void DhopInternal(CartesianStencil<siteVector,siteVector,int> &st, std::vector<CoarseMatrix> &a,
const CoarseVector &in, CoarseVector &out, int dag) { const CoarseVector &in, CoarseVector &out, int dag) {
SimpleCompressor<siteVector> compressor; SimpleCompressor<siteVector> compressor;
@ -547,9 +784,9 @@ public:
_cbgrid(new GridRedBlackCartesian(&CoarseGrid)), _cbgrid(new GridRedBlackCartesian(&CoarseGrid)),
geom(CoarseGrid._ndimension), geom(CoarseGrid._ndimension),
hermitian(hermitian_), hermitian(hermitian_),
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements), Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
StencilEven(_cbgrid,geom.npoint,Even,geom.directions,geom.displacements), StencilEven(_cbgrid,geom.npoint,Even,geom.directions,geom.displacements,0),
StencilOdd(_cbgrid,geom.npoint,Odd,geom.directions,geom.displacements), StencilOdd(_cbgrid,geom.npoint,Odd,geom.directions,geom.displacements,0),
A(geom.npoint,&CoarseGrid), A(geom.npoint,&CoarseGrid),
Aeven(geom.npoint,_cbgrid), Aeven(geom.npoint,_cbgrid),
Aodd(geom.npoint,_cbgrid), Aodd(geom.npoint,_cbgrid),
@ -567,9 +804,9 @@ public:
_cbgrid(&CoarseRBGrid), _cbgrid(&CoarseRBGrid),
geom(CoarseGrid._ndimension), geom(CoarseGrid._ndimension),
hermitian(hermitian_), hermitian(hermitian_),
Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements), Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
StencilEven(&CoarseRBGrid,geom.npoint,Even,geom.directions,geom.displacements), StencilEven(&CoarseRBGrid,geom.npoint,Even,geom.directions,geom.displacements,0),
StencilOdd(&CoarseRBGrid,geom.npoint,Odd,geom.directions,geom.displacements), StencilOdd(&CoarseRBGrid,geom.npoint,Odd,geom.directions,geom.displacements,0),
A(geom.npoint,&CoarseGrid), A(geom.npoint,&CoarseGrid),
Aeven(geom.npoint,&CoarseRBGrid), Aeven(geom.npoint,&CoarseRBGrid),
Aodd(geom.npoint,&CoarseRBGrid), Aodd(geom.npoint,&CoarseRBGrid),

View File

@ -526,7 +526,6 @@ public:
(*this)(Linop,in[k],out[k]); (*this)(Linop,in[k],out[k]);
} }
}; };
virtual ~OperatorFunction(){};
}; };
template<class Field> class LinearFunction { template<class Field> class LinearFunction {
@ -542,7 +541,6 @@ public:
(*this)(in[i], out[i]); (*this)(in[i], out[i]);
} }
} }
virtual ~LinearFunction(){};
}; };
template<class Field> class IdentityLinearFunction : public LinearFunction<Field> { template<class Field> class IdentityLinearFunction : public LinearFunction<Field> {

View File

@ -90,8 +90,9 @@ public:
order=_order; order=_order;
if(order < 2) exit(-1); if(order < 2) exit(-1);
Coeffs.resize(order,0.0); Coeffs.resize(order);
Coeffs[order-1] = 1.0; Coeffs.assign(0.,order);
Coeffs[order-1] = 1.;
}; };
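// Hedged aside on the hunk above: std::vector::assign takes (count,value),
// so the old Coeffs.assign(0.,order) requests zero elements (the 0. converts
// to the count) and the subsequent Coeffs[order-1]=1. writes out of bounds.
// The replacement initialises in one step:
//   Coeffs.resize(order, 0.0); // 'order' zero-initialised coefficients
//   Coeffs[order-1] = 1.0;     // select the highest-order Chebyshev term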
// PB - more efficient low pass drops high modes above the low as 1/x uses all Chebyshev's. // PB - more efficient low pass drops high modes above the low as 1/x uses all Chebyshev's.
@ -257,12 +258,26 @@ public:
for(int n=2;n<order;n++){ for(int n=2;n<order;n++){
Linop.HermOp(*Tn,y); Linop.HermOp(*Tn,y);
#if 0
auto y_v = y.View();
auto Tn_v = Tn->View();
auto Tnp_v = Tnp->View();
auto Tnm_v = Tnm->View();
constexpr int Nsimd = vector_type::Nsimd();
accelerator_for(ss, in.Grid()->oSites(), Nsimd, {
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
});
if ( Coeffs[n] != 0.0) {
axpy(out,Coeffs[n],*Tnp,out);
}
#else
axpby(y,xscale,mscale,y,(*Tn)); axpby(y,xscale,mscale,y,(*Tn));
axpby(*Tnp,2.0,-1.0,y,(*Tnm)); axpby(*Tnp,2.0,-1.0,y,(*Tnm));
if ( Coeffs[n] != 0.0) { if ( Coeffs[n] != 0.0) {
axpy(out,Coeffs[n],*Tnp,out); axpy(out,Coeffs[n],*Tnp,out);
} }
#endif
// Cycle pointers to avoid copies // Cycle pointers to avoid copies
Field *swizzle = Tnm; Field *swizzle = Tnm;
Tnm =Tn; Tnm =Tn;

View File

@ -33,254 +33,218 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
* Script A = SolverMatrix * Script A = SolverMatrix
* Script P = Preconditioner * Script P = Preconditioner
* *
* Deflation methods considered
* -- Solve P A x = P b [ like Luscher ]
* DEF-1 M P A x = M P b [i.e. left precon]
* DEF-2 P^T M A x = P^T M b
* ADEF-1 Preconditioner = M P + Q [ Q + M + M A Q]
* ADEF-2 Preconditioner = P^T M + Q
* BNN Preconditioner = P^T M P + Q
* BNN2 Preconditioner = M P + P^TM +Q - M P A M
*
* Implement ADEF-2 * Implement ADEF-2
* *
* Vstart = P^Tx + Qb * Vstart = P^Tx + Qb
* M1 = P^TM + Q * M1 = P^TM + Q
* M2=M3=1 * M2=M3=1
* Vout = x
*/ */
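// Hedged restatement of the ADEF-2 pieces in operator form, writing Q for
// the coarse-space correction and U for the subspace promote/project pair:
//   Q   = U A_ss^{-1} U^dag  (ProjectToSubspace -> coarse solve -> Promote)
//   P^T = 1 - Q A
//   M1  = P^T M + Q = M + Q (1 - A M)
// which is exactly the PcgM1 body below, out = Min + Q [ in - A Min ], and
// Vstart sets x0 = Q b so the initial coarse-space residual vanishes.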
NAMESPACE_BEGIN(Grid);
template<class Field> // abstract base
class TwoLevelCG : public LinearFunction<Field> template<class Field, class CoarseField>
class TwoLevelFlexiblePcg : public LinearFunction<Field>
{ {
public: public:
int verbose;
RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
const int mmax = 5;
GridBase *grid; GridBase *grid;
GridBase *coarsegrid;
// Fine operator, Smoother, CoarseSolver LinearOperatorBase<Field> *_Linop
LinearOperatorBase<Field> &_FineLinop; OperatorFunction<Field> *_Smoother,
LinearFunction<Field> &_Smoother; LinearFunction<CoarseField> *_CoarseSolver;
// Need something that knows how to get from Coarse to fine and back again
// more most operator functions // more most operator functions
TwoLevelCG(RealD tol, TwoLevelFlexiblePcg(RealD tol,
Integer maxit, Integer maxit,
LinearOperatorBase<Field> &FineLinop, LinearOperatorBase<Field> *Linop,
LinearFunction<Field> &Smoother, LinearOperatorBase<Field> *SmootherLinop,
GridBase *fine) : OperatorFunction<Field> *Smoother,
OperatorFunction<CoarseField> CoarseLinop
) :
Tolerance(tol), Tolerance(tol),
MaxIterations(maxit), MaxIterations(maxit),
_FineLinop(FineLinop), _Linop(Linop),
_Smoother(Smoother) _PreconditionerLinop(PrecLinop),
{ _Preconditioner(Preconditioner)
grid = fine; {
verbose=0;
}; };
virtual void operator() (const Field &src, Field &x) // The Pcg routine is common to all, but the various matrices differ from derived
{ // implementation to derived implementation
Field resid(grid); void operator() (const Field &src, Field &psi){
void operator() (const Field &src, Field &psi){
psi.Checkerboard() = src.Checkerboard();
grid = src.Grid();
RealD f; RealD f;
RealD rtzp,rtz,a,d,b; RealD rtzp,rtz,a,d,b;
RealD rptzp; RealD rptzp;
RealD tn;
RealD guess = norm2(psi);
RealD ssq = norm2(src);
RealD rsq = ssq*Tolerance*Tolerance;
Field p(grid); /////////////////////////////
Field z(grid); // Set up history vectors
/////////////////////////////
std::vector<Field> p (mmax,grid);
std::vector<Field> mmp(mmax,grid);
std::vector<RealD> pAp(mmax);
Field x (grid); x = psi;
Field z (grid);
Field tmp(grid); Field tmp(grid);
Field mmp(grid);
Field r (grid); Field r (grid);
Field mu (grid); Field mu (grid);
Field rp (grid);
//Initial residual computation & set up
double tn;
GridStopWatch HDCGTimer;
HDCGTimer.Start();
////////////////////////// //////////////////////////
// x0 = Vstart -- possibly modify guess // x0 = Vstart -- possibly modify guess
////////////////////////// //////////////////////////
x=Zero(); x=src;
Vstart(x,src); Vstart(x,src);
// r0 = b -A x0 // r0 = b -A x0
_FineLinop.HermOp(x,mmp); HermOp(x,mmp); // Shouldn't this be something else?
axpy (r, -1.0,mmp[0], src); // Recomputes r=src-Ax0
axpy(r, -1.0, mmp, src); // Recomputes r=src-x0
rp=r;
////////////////////////////////// //////////////////////////////////
// Compute z = M1 x // Compute z = M1 x
////////////////////////////////// //////////////////////////////////
PcgM1(r,z); M1(r,z,tmp,mp,SmootherMirs);
rtzp =real(innerProduct(r,z)); rtzp =real(innerProduct(r,z));
/////////////////////////////////////// ///////////////////////////////////////
// Except Def2, M2 is trivial // Solve for Mss mu = P A z and set p = z-mu
// Def2: p = 1 - Q Az = Pright z
// Other algos M2 is trivial
/////////////////////////////////////// ///////////////////////////////////////
p=z; M2(z,p[0]);
RealD ssq = norm2(src); for (int k=0;k<=MaxIterations;k++){
RealD rsq = ssq*Tolerance*Tolerance;
std::cout<<GridLogMessage<<"HDCG: k=0 residual "<<rtzp<<" target rsq "<<rsq<<" ssq "<<ssq<<std::endl;
for (int k=1;k<=MaxIterations;k++){ int peri_k = k % mmax;
int peri_kp = (k+1) % mmax;
rtz=rtzp; rtz=rtzp;
d= PcgM3(p,mmp); d= M3(p[peri_k],mp,mmp[peri_k],tmp);
a = rtz/d; a = rtz/d;
// Memorise this
pAp[peri_k] = d;
axpy(x,a,p,x); axpy(x,a,p[peri_k],x);
RealD rn = axpy_norm(r,-a,mmp,r); RealD rn = axpy_norm(r,-a,mmp[peri_k],r);
PcgM1(r,z); // Compute z = M x
M1(r,z,tmp,mp);
rtzp =real(innerProduct(r,z)); rtzp =real(innerProduct(r,z));
int ipcg=1; // almost free inexact preconditioned CG M2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate
if (ipcg) {
rptzp =real(innerProduct(rp,z)); p[peri_kp]=p[peri_k];
} else {
rptzp =0; // Standard search direction p -> z + b p ; b =
b = (rtzp)/rtz;
int northog;
// northog = (peri_kp==0)?1:peri_kp; // This is the fCG(mmax) algorithm
northog = (k>mmax-1)?(mmax-1):k; // This is the fCG-Tr(mmax-1) algorithm
for(int back=0; back < northog; back++){
int peri_back = (k-back)%mmax;
RealD pbApk= real(innerProduct(mmp[peri_back],p[peri_kp]));
RealD beta = -pbApk/pAp[peri_back];
axpy(p[peri_kp],beta,p[peri_back],p[peri_kp]);
} }
b = (rtzp-rptzp)/rtz;
PcgM2(z,mu); // ADEF-2 this is identity. Axpy possible to eliminate
axpy(p,b,p,mu); // mu = A r
RealD rrn=sqrt(rn/ssq); RealD rrn=sqrt(rn/ssq);
RealD rtn=sqrt(rtz/ssq); std::cout<<GridLogMessage<<"TwoLevelfPcg: k= "<<k<<" residual = "<<rrn<<std::endl;
std::cout<<GridLogMessage<<"HDCG: Pcg k= "<<k<<" residual = "<<rrn<<std::endl;
if ( ipcg ) {
axpy(rp,0.0,r,r);
}
// Stopping condition // Stopping condition
if ( rn <= rsq ) { if ( rn <= rsq ) {
HDCGTimer.Stop(); HermOp(x,mmp); // Shouldn't this be something else?
std::cout<<GridLogMessage<<"HDCG: Pcg converged in "<<k<<" iterations and "<<HDCGTimer.Elapsed()<<std::endl;; axpy(tmp,-1.0,src,mmp[0]);
_FineLinop.HermOp(x,mmp); RealD psinorm = sqrt(norm2(x));
axpy(tmp,-1.0,src,mmp); RealD srcnorm = sqrt(norm2(src));
RealD tmpnorm = sqrt(norm2(tmp));
RealD mmpnorm = sqrt(norm2(mmp)); RealD true_residual = tmpnorm/srcnorm;
RealD xnorm = sqrt(norm2(x)); std::cout<<GridLogMessage<<"TwoLevelfPcg: true residual is "<<true_residual<<std::endl;
RealD srcnorm = sqrt(norm2(src)); std::cout<<GridLogMessage<<"TwoLevelfPcg: target residual was"<<Tolerance<<std::endl;
RealD tmpnorm = sqrt(norm2(tmp)); return k;
RealD true_residual = tmpnorm/srcnorm;
std::cout<<GridLogMessage
<<"HDCG: true residual is "<<true_residual
<<" solution "<<xnorm
<<" source "<<srcnorm
<<" mmp "<<mmpnorm
<<std::endl;
return;
} }
} }
std::cout<<GridLogMessage<<"HDCG: not converged"<<std::endl; // Non-convergence
RealD xnorm = sqrt(norm2(x)); assert(0);
RealD srcnorm = sqrt(norm2(src));
std::cout<<GridLogMessage<<"HDCG: non-converged solution "<<xnorm<<" source "<<srcnorm<<std::endl;
return ;
} }
public: public:
virtual void PcgM1(Field & in, Field & out) =0; virtual void M(Field & in,Field & out,Field & tmp) {
virtual void Vstart(Field & x,const Field & src)=0;
virtual void PcgM2(const Field & in, Field & out) {
out=in;
} }
virtual RealD PcgM3(const Field & p, Field & mmp){ virtual void M1(Field & in, Field & out) {// the smoother
RealD dd;
_FineLinop.HermOp(p,mmp);
ComplexD dot = innerProduct(p,mmp);
dd=real(dot);
return dd;
}
/////////////////////////////////////////////////////////////////////
// Only Def1 has non-trivial Vout.
/////////////////////////////////////////////////////////////////////
};
template<class Field, class CoarseField, class Aggregation>
class TwoLevelADEF2 : public TwoLevelCG<Field>
{
public:
///////////////////////////////////////////////////////////////////////////////////
// Need something that knows how to get from Coarse to fine and back again
// void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
// void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
///////////////////////////////////////////////////////////////////////////////////
GridBase *coarsegrid;
Aggregation &_Aggregates;
LinearFunction<CoarseField> &_CoarseSolver;
LinearFunction<CoarseField> &_CoarseSolverPrecise;
///////////////////////////////////////////////////////////////////////////////////
// more most operator functions
TwoLevelADEF2(RealD tol,
Integer maxit,
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
LinearFunction<CoarseField> &CoarseSolver,
LinearFunction<CoarseField> &CoarseSolverPrecise,
Aggregation &Aggregates
) :
TwoLevelCG<Field>(tol,maxit,FineLinop,Smoother,Aggregates.FineGrid),
_CoarseSolver(CoarseSolver),
_CoarseSolverPrecise(CoarseSolverPrecise),
_Aggregates(Aggregates)
{
coarsegrid = Aggregates.CoarseGrid;
};
virtual void PcgM1(Field & in, Field & out)
{
// [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min] // [PTM+Q] in = [1 - Q A] M in + Q in = Min + Q [ in -A Min]
Field tmp(grid);
Field Min(grid);
Field tmp(this->grid); PcgM(in,Min); // Smoother call
Field Min(this->grid);
CoarseField PleftProj(this->coarsegrid);
CoarseField PleftMss_proj(this->coarsegrid);
GridStopWatch SmootherTimer; HermOp(Min,out);
GridStopWatch MatrixTimer;
SmootherTimer.Start();
this->_Smoother(in,Min);
SmootherTimer.Stop();
MatrixTimer.Start();
this->_FineLinop.HermOp(Min,out);
MatrixTimer.Stop();
axpy(tmp,-1.0,out,in); // tmp = in - A Min axpy(tmp,-1.0,out,in); // tmp = in - A Min
GridStopWatch ProjTimer; ProjectToSubspace(tmp,PleftProj);
GridStopWatch CoarseTimer; ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
GridStopWatch PromTimer; PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]
ProjTimer.Start();
this->_Aggregates.ProjectToSubspace(PleftProj,tmp);
ProjTimer.Stop();
CoarseTimer.Start();
this->_CoarseSolver(PleftProj,PleftMss_proj); // Ass^{-1} [in - A Min]_s
CoarseTimer.Stop();
PromTimer.Start();
this->_Aggregates.PromoteFromSubspace(PleftMss_proj,tmp);// tmp = Q[in - A Min]
PromTimer.Stop();
std::cout << GridLogPerformance << "PcgM1 breakdown "<<std::endl;
std::cout << GridLogPerformance << "\tSmoother " << SmootherTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tProj " << ProjTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tCoarse " << CoarseTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tProm " << PromTimer.Elapsed() <<std::endl;
axpy(out,1.0,Min,tmp); // Min+tmp axpy(out,1.0,Min,tmp); // Min+tmp
} }
virtual void Vstart(Field & x,const Field & src) virtual void M2(const Field & in, Field & out) {
{ out=in;
// Must override for Def2 only
// case PcgDef2:
// Pright(in,out);
// break;
}
virtual RealD M3(const Field & p, Field & mmp){
double d,dd;
HermOpAndNorm(p,mmp,d,dd);
return dd;
// Must override for Def1 only
// case PcgDef1:
// d=linop_d->Mprec(p,mmp,tmp,0,1);// Dag no
// linop_d->Mprec(mmp,mp,tmp,1);// Dag yes
// Pleft(mp,mmp);
// d=real(linop_d->inner(p,mmp));
}
virtual void VstartDef2(Field & x,const Field & src){
//case PcgDef2:
//case PcgAdef2:
//case PcgAdef2f:
//case PcgV11f:
/////////////////////////////////// ///////////////////////////////////
// Choose x_0 such that // Choose x_0 such that
// x_0 = guess + (A_ss^inv) r_s = guess + Ass_inv [src -Aguess] // x_0 = guess + (A_ss^inv) r_s = guess + Ass_inv [src -Aguess]
@ -292,73 +256,142 @@ class TwoLevelADEF2 : public TwoLevelCG<Field>
// = src_s - (A guess)_s - src_s + (A guess)_s // = src_s - (A guess)_s - src_s + (A guess)_s
// = 0 // = 0
/////////////////////////////////// ///////////////////////////////////
Field r(this->grid); Field r(grid);
Field mmp(this->grid); Field mmp(grid);
CoarseField PleftProj(this->coarsegrid);
CoarseField PleftMss_proj(this->coarsegrid); HermOp(x,mmp);
axpy (r, -1.0, mmp, src); // r_{-1} = src - A x
this->_Aggregates.ProjectToSubspace(PleftProj,src); ProjectToSubspace(r,PleftProj);
this->_CoarseSolverPrecise(PleftProj,PleftMss_proj); // Ass^{-1} r_s ApplyInverseCG(PleftProj,PleftMss_proj); // Ass^{-1} r_s
this->_Aggregates.PromoteFromSubspace(PleftMss_proj,x); PromoteFromSubspace(PleftMss_proj,mmp);
x=x+mmp;
} }
}; virtual void Vstart(Field & x,const Field & src){
return;
}
/////////////////////////////////////////////////////////////////////
// Only Def1 has non-trivial Vout. Override in Def1
/////////////////////////////////////////////////////////////////////
virtual void Vout (Field & in, Field & out,Field & src){
out = in;
//case PcgDef1:
// //Qb + PT x
// ProjectToSubspace(src,PleftProj);
// ApplyInverse(PleftProj,PleftMss_proj); // Ass^{-1} r_s
// PromoteFromSubspace(PleftMss_proj,tmp);
//
// Pright(in,out);
//
// linop_d->axpy(out,tmp,out,1.0);
// break;
}
////////////////////////////////////////////////////////////////////////////////////////////////
// Pright and Pleft are common to all implementations
////////////////////////////////////////////////////////////////////////////////////////////////
virtual void Pright(Field & in,Field & out){
// P_R = [ 1 0 ]
// [ -Mss^-1 Msb 0 ]
Field in_sbar(grid);
ProjectToSubspace(in,PleftProj);
PromoteFromSubspace(PleftProj,out);
axpy(in_sbar,-1.0,out,in); // in_sbar = in - in_s
HermOp(in_sbar,out);
ProjectToSubspace(out,PleftProj); // Mssbar in_sbar (project)
ApplyInverse (PleftProj,PleftMss_proj); // Mss^{-1} Mssbar
PromoteFromSubspace(PleftMss_proj,out); //
axpy(out,-1.0,out,in_sbar); // in_sbar - Mss^{-1} Mssbar in_sbar
}
virtual void Pleft (Field & in,Field & out){
// P_L = [ 1 -Mbs Mss^-1]
// [ 0 0 ]
Field in_sbar(grid);
Field tmp2(grid);
Field Mtmp(grid);
ProjectToSubspace(in,PleftProj);
PromoteFromSubspace(PleftProj,out);
axpy(in_sbar,-1.0,out,in); // in_sbar = in - in_s
ApplyInverse(PleftProj,PleftMss_proj); // Mss^{-1} in_s
PromoteFromSubspace(PleftMss_proj,out);
HermOp(out,Mtmp);
ProjectToSubspace(Mtmp,PleftProj); // Msbar s Mss^{-1}
PromoteFromSubspace(PleftProj,tmp2);
axpy(out,-1.0,tmp2,Mtmp);
axpy(out,-1.0,out,in_sbar); // in_sbar - Msbars Mss^{-1} in_s
}
}
template<class Field> template<class Field>
class TwoLevelADEF1defl : public TwoLevelCG<Field> class TwoLevelFlexiblePcgADef2 : public TwoLevelFlexiblePcg<Field> {
{ public:
public: virtual void M(Field & in,Field & out,Field & tmp){
const std::vector<Field> &evec;
const std::vector<RealD> &eval;
TwoLevelADEF1defl(RealD tol,
Integer maxit,
LinearOperatorBase<Field> &FineLinop,
LinearFunction<Field> &Smoother,
std::vector<Field> &_evec,
std::vector<RealD> &_eval) :
TwoLevelCG<Field>(tol,maxit,FineLinop,Smoother,_evec[0].Grid()),
evec(_evec),
eval(_eval)
{};
// Can just inherit existing M2 }
// Can just inherit existing M3 virtual void M1(Field & in, Field & out,Field & tmp,Field & mp){
// Simple vstart - do nothing
virtual void Vstart(Field & x,const Field & src){
x=src; // Could apply Q
};
// Override PcgM1
virtual void PcgM1(Field & in, Field & out)
{
int N=evec.size();
Field Pin(this->grid);
Field Qin(this->grid);
//MP + Q = M(1-AQ) + Q = M
// // If we are eigenvector deflating in coarse space
// // Q = Sum_i |phi_i> 1/lambda_i <phi_i|
// // A Q = Sum_i |phi_i> <phi_i|
// // M(1-AQ) = M(1-proj) + Q
Qin.Checkerboard()=in.Checkerboard();
Qin = Zero();
Pin = in;
for (int i=0;i<N;i++) {
const Field& tmp = evec[i];
auto ip = TensorRemove(innerProduct(tmp,in));
axpy(Qin, ip / eval[i],tmp,Qin);
axpy(Pin, -ip ,tmp,Pin);
}
this->_Smoother(Pin,out);
out = out + Qin;
} }
}; virtual void M2(Field & in, Field & out){
NAMESPACE_END(Grid); }
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp){
}
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp){
}
}
/*
template<class Field>
class TwoLevelFlexiblePcgAD : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
}
template<class Field>
class TwoLevelFlexiblePcgDef1 : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
virtual void Vout (Field & in, Field & out,Field & src,Field & tmp);
}
template<class Field>
class TwoLevelFlexiblePcgDef2 : public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
}
template<class Field>
class TwoLevelFlexiblePcgV11: public TwoLevelFlexiblePcg<Field> {
public:
virtual void M(Field & in,Field & out,Field & tmp);
virtual void M1(Field & in, Field & out,Field & tmp,Field & mp);
virtual void M2(Field & in, Field & out);
virtual RealD M3(Field & p, Field & mp,Field & mmp, Field & tmp);
virtual void Vstart(Field & in, Field & src, Field & r, Field & mp, Field & mmp, Field & tmp);
}
*/
#endif #endif
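
Background for the history vectors p[mmax]/mmp[mmax] kept in the right-hand
(flexible Pcg) version: with a window of m stored directions, the truncated
flexible-CG orthogonalisation is, schematically,

  p_{k+1} = z_{k+1} + sum_{j=k-m+1..k} beta_j p_j ,
  beta_j  = - <A p_j, z_{k+1}> / <p_j, A p_j>

so each new direction is A-orthogonalised against the last m directions
(pAp[] caches <p_j, A p_j>), which is what makes a variable preconditioner
M1 admissible in place of a fixed one. This is a hedged schematic of the
standard fCG-Tr(m) update, not a line-by-line transcription of the code.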

View File

@ -58,7 +58,6 @@ public:
void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) { void operator()(LinearOperatorBase<Field> &Linop, const Field &src, Field &psi) {
GRID_TRACE("ConjugateGradient");
psi.Checkerboard() = src.Checkerboard(); psi.Checkerboard() = src.Checkerboard();
conformable(psi, src); conformable(psi, src);
@ -118,13 +117,9 @@ public:
GridStopWatch MatrixTimer; GridStopWatch MatrixTimer;
GridStopWatch SolverTimer; GridStopWatch SolverTimer;
RealD usecs = -usecond();
SolverTimer.Start(); SolverTimer.Start();
int k; int k;
for (k = 1; k <= MaxIterations; k++) { for (k = 1; k <= MaxIterations; k++) {
GridStopWatch IterationTimer;
IterationTimer.Start();
c = cp; c = cp;
MatrixTimer.Start(); MatrixTimer.Start();
@ -157,41 +152,31 @@ public:
LinearCombTimer.Stop(); LinearCombTimer.Stop();
LinalgTimer.Stop(); LinalgTimer.Stop();
IterationTimer.Stop(); std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
if ( (k % 500) == 0 ) {
std::cout << GridLogMessage << "ConjugateGradient: Iteration " << k
<< " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl; << " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
} else {
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
<< " residual " << sqrt(cp/ssq) << " target " << Tolerance << " took " << IterationTimer.Elapsed() << std::endl;
}
// Stopping condition // Stopping condition
if (cp <= rsq) { if (cp <= rsq) {
usecs +=usecond();
SolverTimer.Stop(); SolverTimer.Stop();
Linop.HermOpAndNorm(psi, mmp, d, qq); Linop.HermOpAndNorm(psi, mmp, d, qq);
p = mmp - src; p = mmp - src;
GridBase *grid = src.Grid();
RealD DwfFlops = (1452. )*grid->gSites()*4*k
+ (8+4+8+4+4)*12*grid->gSites()*k; // CG linear algebra
RealD srcnorm = std::sqrt(norm2(src)); RealD srcnorm = std::sqrt(norm2(src));
RealD resnorm = std::sqrt(norm2(p)); RealD resnorm = std::sqrt(norm2(p));
RealD true_residual = resnorm / srcnorm; RealD true_residual = resnorm / srcnorm;
std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k std::cout << GridLogMessage << "ConjugateGradient Converged on iteration " << k
<< "\tComputed residual " << std::sqrt(cp / ssq) << "\tComputed residual " << std::sqrt(cp / ssq)
<< "\tTrue residual " << true_residual << "\tTrue residual " << true_residual
<< "\tTarget " << Tolerance << std::endl; << "\tTarget " << Tolerance << std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl; std::cout << GridLogIterative << "Time breakdown "<<std::endl;
std::cout << GridLogPerformance << "Time breakdown "<<std::endl; std::cout << GridLogIterative << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl; std::cout << GridLogIterative << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl; std::cout << GridLogIterative << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tInner " << InnerTimer.Elapsed() <<std::endl; std::cout << GridLogIterative << "\tInner " << InnerTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl; std::cout << GridLogIterative << "\tAxpyNorm " << AxpyNormTimer.Elapsed() <<std::endl;
std::cout << GridLogPerformance << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl; std::cout << GridLogIterative << "\tLinearComb " << LinearCombTimer.Elapsed() <<std::endl;
std::cout << GridLogDebug << "\tMobius flop rate " << DwfFlops/ usecs<< " Gflops " <<std::endl;
if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0); if (ErrorOnNoConverge) assert(true_residual / Tolerance < 10000.0);

View File

@ -49,7 +49,6 @@ NAMESPACE_BEGIN(Grid);
Integer TotalInnerIterations; //Number of inner CG iterations Integer TotalInnerIterations; //Number of inner CG iterations
Integer TotalOuterIterations; //Number of restarts Integer TotalOuterIterations; //Number of restarts
Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step Integer TotalFinalStepIterations; //Number of CG iterations in final patch-up step
RealD TrueResidual;
//Option to speed up *inner single precision* solves using a LinearFunction that produces a guess //Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
LinearFunction<FieldF> *guesser; LinearFunction<FieldF> *guesser;
@ -69,7 +68,6 @@ NAMESPACE_BEGIN(Grid);
} }
void operator() (const FieldD &src_d_in, FieldD &sol_d){ void operator() (const FieldD &src_d_in, FieldD &sol_d){
std::cout << GridLogMessage << "MixedPrecisionConjugateGradient: Starting mixed precision CG with outer tolerance " << Tolerance << " and inner tolerance " << InnerTolerance << std::endl;
TotalInnerIterations = 0; TotalInnerIterations = 0;
GridStopWatch TotalTimer; GridStopWatch TotalTimer;
@ -99,7 +97,6 @@ NAMESPACE_BEGIN(Grid);
FieldF sol_f(SinglePrecGrid); FieldF sol_f(SinglePrecGrid);
sol_f.Checkerboard() = cb; sol_f.Checkerboard() = cb;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Starting initial inner CG with tolerance " << inner_tol << std::endl;
ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations); ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
CG_f.ErrorOnNoConverge = false; CG_f.ErrorOnNoConverge = false;
@ -108,10 +105,7 @@ NAMESPACE_BEGIN(Grid);
GridStopWatch PrecChangeTimer; GridStopWatch PrecChangeTimer;
Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
precisionChangeWorkspace pc_wk_sp_to_dp(DoublePrecGrid, SinglePrecGrid);
precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, DoublePrecGrid);
for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){ for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
//Compute double precision rsd and also new RHS vector. //Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d, tmp_d); Linop_d.HermOp(sol_d, tmp_d);
@ -126,7 +120,7 @@ NAMESPACE_BEGIN(Grid);
while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ?? while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
PrecChangeTimer.Start(); PrecChangeTimer.Start();
precisionChange(src_f, src_d, pc_wk_dp_to_sp); precisionChange(src_f, src_d);
PrecChangeTimer.Stop(); PrecChangeTimer.Stop();
sol_f = Zero(); sol_f = Zero();
@ -136,7 +130,6 @@ NAMESPACE_BEGIN(Grid);
(*guesser)(src_f, sol_f); (*guesser)(src_f, sol_f);
//Inner CG //Inner CG
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " << outer_iter << " starting inner CG with tolerance " << inner_tol << std::endl;
CG_f.Tolerance = inner_tol; CG_f.Tolerance = inner_tol;
InnerCGtimer.Start(); InnerCGtimer.Start();
CG_f(Linop_f, src_f, sol_f); CG_f(Linop_f, src_f, sol_f);
@ -145,7 +138,7 @@ NAMESPACE_BEGIN(Grid);
//Convert sol back to double and add to double prec solution //Convert sol back to double and add to double prec solution
PrecChangeTimer.Start(); PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f, pc_wk_sp_to_dp); precisionChange(tmp_d, sol_f);
PrecChangeTimer.Stop(); PrecChangeTimer.Stop();
axpy(sol_d, 1.0, tmp_d, sol_d); axpy(sol_d, 1.0, tmp_d, sol_d);
@ -157,7 +150,6 @@ NAMESPACE_BEGIN(Grid);
ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations); ConjugateGradient<FieldD> CG_d(Tolerance, MaxInnerIterations);
CG_d(Linop_d, src_d_in, sol_d); CG_d(Linop_d, src_d_in, sol_d);
TotalFinalStepIterations = CG_d.IterationsToComplete; TotalFinalStepIterations = CG_d.IterationsToComplete;
TrueResidual = CG_d.TrueResidual;
TotalTimer.Stop(); TotalTimer.Stop();
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl; std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Inner CG iterations " << TotalInnerIterations << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations << std::endl;

View File

@ -1,213 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMixedPrecBatched.h
Copyright (C) 2015
Author: Raoul Hodgson <raoul.hodgson@ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
#define GRID_CONJUGATE_GRADIENT_MIXED_PREC_BATCHED_H
NAMESPACE_BEGIN(Grid);
//Mixed precision restarted defect correction CG
template<class FieldD,class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class MixedPrecisionConjugateGradientBatched : public LinearFunction<FieldD> {
public:
using LinearFunction<FieldD>::operator();
RealD Tolerance;
RealD InnerTolerance; //Initial tolerance for inner CG. Defaults to Tolerance but can be changed
Integer MaxInnerIterations;
Integer MaxOuterIterations;
Integer MaxPatchupIterations;
GridBase* SinglePrecGrid; //Grid for single-precision fields
RealD OuterLoopNormMult; //Stop the outer loop and move to a final double prec solve when the residual is OuterLoopNormMult * Tolerance
LinearOperatorBase<FieldF> &Linop_f;
LinearOperatorBase<FieldD> &Linop_d;
//Option to speed up *inner single precision* solves using a LinearFunction that produces a guess
LinearFunction<FieldF> *guesser;
bool updateResidual;
MixedPrecisionConjugateGradientBatched(RealD tol,
Integer maxinnerit,
Integer maxouterit,
Integer maxpatchit,
GridBase* _sp_grid,
LinearOperatorBase<FieldF> &_Linop_f,
LinearOperatorBase<FieldD> &_Linop_d,
bool _updateResidual=true) :
Linop_f(_Linop_f), Linop_d(_Linop_d),
Tolerance(tol), InnerTolerance(tol), MaxInnerIterations(maxinnerit), MaxOuterIterations(maxouterit), MaxPatchupIterations(maxpatchit), SinglePrecGrid(_sp_grid),
OuterLoopNormMult(100.), guesser(NULL), updateResidual(_updateResidual) { };
void useGuesser(LinearFunction<FieldF> &g){
guesser = &g;
}
void operator() (const FieldD &src_d_in, FieldD &sol_d){
std::vector<FieldD> srcs_d_in{src_d_in};
std::vector<FieldD> sols_d{sol_d};
(*this)(srcs_d_in,sols_d);
sol_d = sols_d[0];
}
void operator() (const std::vector<FieldD> &src_d_in, std::vector<FieldD> &sol_d){
assert(src_d_in.size() == sol_d.size());
int NBatch = src_d_in.size();
std::cout << GridLogMessage << "NBatch = " << NBatch << std::endl;
Integer TotalOuterIterations = 0; //Number of restarts
std::vector<Integer> TotalInnerIterations(NBatch,0); //Number of inner CG iterations
std::vector<Integer> TotalFinalStepIterations(NBatch,0); //Number of CG iterations in final patch-up step
GridStopWatch TotalTimer;
TotalTimer.Start();
GridStopWatch InnerCGtimer;
GridStopWatch PrecChangeTimer;
int cb = src_d_in[0].Checkerboard();
std::vector<RealD> src_norm;
std::vector<RealD> norm;
std::vector<RealD> stop;
GridBase* DoublePrecGrid = src_d_in[0].Grid();
FieldD tmp_d(DoublePrecGrid);
tmp_d.Checkerboard() = cb;
FieldD tmp2_d(DoublePrecGrid);
tmp2_d.Checkerboard() = cb;
std::vector<FieldD> src_d;
std::vector<FieldF> src_f;
std::vector<FieldF> sol_f;
for (int i=0; i<NBatch; i++) {
sol_d[i].Checkerboard() = cb;
src_norm.push_back(norm2(src_d_in[i]));
norm.push_back(0.);
stop.push_back(src_norm[i] * Tolerance*Tolerance);
src_d.push_back(src_d_in[i]); //source for next inner iteration, computed from residual during operation
src_f.push_back(SinglePrecGrid);
src_f[i].Checkerboard() = cb;
sol_f.push_back(SinglePrecGrid);
sol_f[i].Checkerboard() = cb;
}
RealD inner_tol = InnerTolerance;
ConjugateGradient<FieldF> CG_f(inner_tol, MaxInnerIterations);
CG_f.ErrorOnNoConverge = false;
Integer &outer_iter = TotalOuterIterations; //so it will be equal to the final iteration count
for(outer_iter = 0; outer_iter < MaxOuterIterations; outer_iter++){
std::cout << GridLogMessage << std::endl;
std::cout << GridLogMessage << "Outer iteration " << outer_iter << std::endl;
bool allConverged = true;
for (int i=0; i<NBatch; i++) {
//Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d[i], tmp_d);
norm[i] = axpy_norm(src_d[i], -1., tmp_d, src_d_in[i]); //src_d is residual vector
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Outer iteration " << outer_iter <<" solve " << i << " residual "<< norm[i] << " target "<< stop[i] <<std::endl;
PrecChangeTimer.Start();
precisionChange(src_f[i], src_d[i]);
PrecChangeTimer.Stop();
sol_f[i] = Zero();
if(norm[i] > OuterLoopNormMult * stop[i]) {
allConverged = false;
}
}
if (allConverged) break;
if (updateResidual) {
RealD normMax = *std::max_element(std::begin(norm), std::end(norm));
RealD stopMax = *std::max_element(std::begin(stop), std::end(stop));
while( normMax * inner_tol * inner_tol < stopMax) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
CG_f.Tolerance = inner_tol;
}
//Optionally improve inner solver guess (eg using known eigenvectors)
if(guesser != NULL) {
(*guesser)(src_f, sol_f);
}
for (int i=0; i<NBatch; i++) {
//Inner CG
InnerCGtimer.Start();
CG_f(Linop_f, src_f[i], sol_f[i]);
InnerCGtimer.Stop();
TotalInnerIterations[i] += CG_f.IterationsToComplete;
//Convert sol back to double and add to double prec solution
PrecChangeTimer.Start();
precisionChange(tmp_d, sol_f[i]);
PrecChangeTimer.Stop();
axpy(sol_d[i], 1.0, tmp_d, sol_d[i]);
}
}
//Final trial CG
std::cout << GridLogMessage << std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Starting final patch-up double-precision solve"<<std::endl;
for (int i=0; i<NBatch; i++) {
ConjugateGradient<FieldD> CG_d(Tolerance, MaxPatchupIterations);
CG_d(Linop_d, src_d_in[i], sol_d[i]);
TotalFinalStepIterations[i] += CG_d.IterationsToComplete;
}
TotalTimer.Stop();
std::cout << GridLogMessage << std::endl;
for (int i=0; i<NBatch; i++) {
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: solve " << i << " Inner CG iterations " << TotalInnerIterations[i] << " Restarts " << TotalOuterIterations << " Final CG iterations " << TotalFinalStepIterations[i] << std::endl;
}
std::cout << GridLogMessage << std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradientBatched: Total time " << TotalTimer.Elapsed() << " Precision change " << PrecChangeTimer.Elapsed() << " Inner CG total " << InnerCGtimer.Elapsed() << std::endl;
}
};
NAMESPACE_END(Grid);
#endif
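
A hedged usage sketch of the batched solver above (the fermion field types,
grids and operator wrappers are illustrative placeholders):

  // HermOpD / HermOpF are assumed to wrap the same matrix in double / single
  MixedPrecisionConjugateGradientBatched<LatticeFermionD,LatticeFermionF>
    mCG(1.0e-8,  // outer tolerance
        30000,   // max inner (single-precision) iterations
        50,      // max outer restarts
        10000,   // max final patch-up iterations
        sp_grid, HermOpF, HermOpD);
  std::vector<LatticeFermionD> src(Nrhs, grid), sol(Nrhs, grid);
  for(auto &s : sol) s = Zero();  // zero initial guesses
  mCG(src, sol);                  // one restart schedule shared across the batch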

View File

@ -44,7 +44,7 @@ public:
using OperatorFunction<Field>::operator(); using OperatorFunction<Field>::operator();
// RealD Tolerance; RealD Tolerance;
Integer MaxIterations; Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
std::vector<int> IterationsToCompleteShift; // Iterations for this shift std::vector<int> IterationsToCompleteShift; // Iterations for this shift
@ -52,7 +52,7 @@ public:
MultiShiftFunction shifts; MultiShiftFunction shifts;
std::vector<RealD> TrueResidualShift; std::vector<RealD> TrueResidualShift;
ConjugateGradientMultiShift(Integer maxit, const MultiShiftFunction &_shifts) : ConjugateGradientMultiShift(Integer maxit,MultiShiftFunction &_shifts) :
MaxIterations(maxit), MaxIterations(maxit),
shifts(_shifts) shifts(_shifts)
{ {
@ -84,7 +84,6 @@ public:
void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &psi) void operator() (LinearOperatorBase<Field> &Linop, const Field &src, std::vector<Field> &psi)
{ {
GRID_TRACE("ConjugateGradientMultiShift");
GridBase *grid = src.Grid(); GridBase *grid = src.Grid();
@ -183,9 +182,6 @@ public:
for(int s=0;s<nshift;s++) { for(int s=0;s<nshift;s++) {
axpby(psi[s],0.,-bs[s]*alpha[s],src,src); axpby(psi[s],0.,-bs[s]*alpha[s],src,src);
} }
std::cout << GridLogIterative << "ConjugateGradientMultiShift: initial rn (|src|^2) =" << rn << " qq (|MdagM src|^2) =" << qq << " d ( dot(src, [MdagM + m_0]src) ) =" << d << " c=" << c << std::endl;
/////////////////////////////////////// ///////////////////////////////////////
// Timers // Timers
@ -325,8 +321,8 @@ public:
std::cout << GridLogMessage << "Time Breakdown "<<std::endl; std::cout << GridLogMessage << "Time Breakdown "<<std::endl;
std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl; std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tAXPY " << AXPYTimer.Elapsed() <<std::endl; std::cout << GridLogMessage << "\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl; std::cout << GridLogMessage << "\tMarix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tShift " << ShiftTimer.Elapsed() <<std::endl; std::cout << GridLogMessage << "\tShift " << ShiftTimer.Elapsed() <<std::endl;
IterationsToComplete = k; IterationsToComplete = k;

View File

@ -1,373 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christopher Kelly <ckelly@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
//CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision.
//The residual is stored in single precision, but the search directions and solution are stored in double precision.
//Every update_freq iterations the residual is corrected in double precision.
//For safety the a final regular CG is applied to clean up if necessary
//PB Pure single, then double fixup
template<class FieldD, class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class ConjugateGradientMultiShiftMixedPrecCleanup : public OperatorMultiFunction<FieldD>,
public OperatorFunction<FieldD>
{
public:
using OperatorFunction<FieldD>::operator();
RealD Tolerance;
Integer MaxIterationsMshift;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
std::vector<int> IterationsToCompleteShift; // Iterations for this shift
int verbose;
MultiShiftFunction shifts;
std::vector<RealD> TrueResidualShift;
int ReliableUpdateFreq; //number of iterations between reliable updates
GridBase* SinglePrecGrid; //Grid for single-precision fields
LinearOperatorBase<FieldF> &Linop_f; //single precision
ConjugateGradientMultiShiftMixedPrecCleanup(Integer maxit, const MultiShiftFunction &_shifts,
GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
int _ReliableUpdateFreq) :
MaxIterationsMshift(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq),
MaxIterations(20000)
{
verbose=1;
IterationsToCompleteShift.resize(_shifts.order);
TrueResidualShift.resize(_shifts.order);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
{
GridBase *grid = src.Grid();
int nshift = shifts.order;
std::vector<FieldD> results(nshift,grid);
(*this)(Linop,src,results,psi);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
{
int nshift = shifts.order;
(*this)(Linop,src,results);
psi = shifts.norm*src;
for(int i=0;i<nshift;i++){
psi = psi + shifts.residues[i]*results[i];
}
return;
}
void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
{
GRID_TRACE("ConjugateGradientMultiShiftMixedPrecCleanup");
GridBase *DoublePrecGrid = src_d.Grid();
////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction"
////////////////////////////////////////////////////////////////////////
int nshift = shifts.order;
std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
std::vector<RealD> &mresidual(shifts.tolerances);
std::vector<RealD> alpha(nshift,1.0);
//Double precision search directions
FieldD p_d(DoublePrecGrid);
std::vector<FieldF> ps_f (nshift, SinglePrecGrid);// Search directions (single precision)
std::vector<FieldF> psi_f(nshift, SinglePrecGrid);// solutions (single precision)
FieldD tmp_d(DoublePrecGrid);
FieldD r_d(DoublePrecGrid);
FieldF r_f(SinglePrecGrid);
FieldD mmp_d(DoublePrecGrid);
assert(psi_d.size()==nshift);
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift];
RealD rsq[nshift];
RealD rsqf[nshift];
RealD z[nshift][2];
int converged[nshift];
const int primary =0;
//Primary shift fields CG iteration
RealD a,b,c,d;
RealD cp,bp,qq; //prev
// Matrix mult fields
FieldF p_f(SinglePrecGrid);
FieldF mmp_f(SinglePrecGrid);
// Check lightest mass
for(int s=0;s<nshift;s++){
assert( mass[s]>= mass[primary] );
converged[s]=0;
}
// Wire guess to zero
// Residuals "r" are src
// First search direction "p" is also src
cp = norm2(src_d);
// Handle trivial case of zero src.
if( cp == 0. ){
for(int s=0;s<nshift;s++){
psi_d[s] = Zero();
psi_f[s] = Zero();
IterationsToCompleteShift[s] = 1;
TrueResidualShift[s] = 0.;
}
return;
}
for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s];
rsqf[s] =rsq[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
// ps_d[s] = src_d;
precisionChange(ps_f[s],src_d);
}
// r and p for primary
p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
r_d = p_d;
//MdagM+m[0]
precisionChange(p_f,p_d);
Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
precisionChange(tmp_d,mmp_f);
Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
tmp_d = tmp_d - mmp_d;
std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
// assert(norm2(tmp_d)< 1.0e-4);
axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d);
d += rn*mass[0];
b = -cp /d;
// Set up the various shift variables
int iz=0;
z[0][1-iz] = 1.0;
z[0][iz] = 1.0;
bs[0] = b;
for(int s=1;s<nshift;s++){
z[s][1-iz] = 1.0;
z[s][iz] = 1.0/( 1.0 - b*(mass[s]-mass[0]));
bs[s] = b*z[s][iz];
}
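  // Bookkeeping sketch (the standard multi-shift CG construction, cf.
  // B. Jegerlehner, hep-lat/9612014): the residuals of all shifted systems
  // stay parallel to the primary residual, r_s = z_s * r, so a single matrix
  // application per iteration serves every shift.  z[s][*] holds the current
  // and previous values of that proportionality factor, and bs[s] is the
  // correspondingly rescaled step length b for shift s.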
// r += b[0] A.p[0]
// c= norm(r)
c=axpy_norm(r_d,b,mmp_d,r_d);
for(int s=0;s<nshift;s++) {
axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
precisionChange(psi_f[s],psi_d[s]);
}
///////////////////////////////////////
// Timers
///////////////////////////////////////
GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
SolverTimer.Start();
// Iteration loop
int k;
for (k=1;k<=MaxIterationsMshift;k++){
a = c /cp;
AXPYTimer.Start();
axpy(p_d,a,p_d,r_d);
AXPYTimer.Stop();
PrecChangeTimer.Start();
precisionChange(r_f, r_d);
PrecChangeTimer.Stop();
AXPYTimer.Start();
for(int s=0;s<nshift;s++){
if ( ! converged[s] ) {
if (s==0){
axpy(ps_f[s],a,ps_f[s],r_f);
} else{
RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
axpby(ps_f[s],z[s][iz],as,r_f,ps_f[s]);
}
}
}
AXPYTimer.Stop();
cp=c;
PrecChangeTimer.Start();
precisionChange(p_f, p_d); //get back single prec search direction for linop
PrecChangeTimer.Stop();
MatrixTimer.Start();
Linop_f.HermOp(p_f,mmp_f);
MatrixTimer.Stop();
PrecChangeTimer.Start();
precisionChange(mmp_d, mmp_f); // From Float to Double
PrecChangeTimer.Stop();
d=real(innerProduct(p_d,mmp_d));
axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d);
d += rn*mass[0];
bp=b;
b=-cp/d;
// Toggle the recurrence history
bs[0] = b;
iz = 1-iz;
ShiftTimer.Start();
for(int s=1;s<nshift;s++){
if((!converged[s])){
RealD z0 = z[s][1-iz];
RealD z1 = z[s][iz];
z[s][iz] = z0*z1*bp
/ (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
}
}
ShiftTimer.Stop();
//Update single precision solutions
AXPYTimer.Start();
for(int s=0;s<nshift;s++){
int ss = s;
if( (!converged[s]) ) {
axpy(psi_f[ss],-bs[s]*alpha[s],ps_f[s],psi_f[ss]);
}
}
c = axpy_norm(r_d,b,mmp_d,r_d);
AXPYTimer.Stop();
// Convergence checks
int all_converged = 1;
for(int s=0;s<nshift;s++){
if ( (!converged[s]) ){
IterationsToCompleteShift[s] = k;
RealD css = c * z[s][iz]* z[s][iz];
if(css<rsqf[s]){
if ( ! converged[s] )
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
converged[s]=1;
} else {
all_converged=0;
}
}
}
if ( all_converged || k == MaxIterationsMshift-1){
SolverTimer.Stop();
for(int s=0;s<nshift;s++){
precisionChange(psi_d[s],psi_f[s]);
}
if ( all_converged ){
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: All shifts have converged iteration "<<k<<std::endl;
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: Checking solutions"<<std::endl;
} else {
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrecCleanup: Not all shifts have converged iteration "<<k<<std::endl;
}
// Check answers
for(int s=0; s < nshift; s++) {
Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
axpy(tmp_d,mass[s],psi_d[s],mmp_d);
axpy(r_d,-alpha[s],src_d,tmp_d);
RealD rn = norm2(r_d);
RealD cn = norm2(src_d);
TrueResidualShift[s] = std::sqrt(rn/cn);
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
//If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
if(rn >= rsq[s]){
CleanupTimer.Start();
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrecCleanup: performing cleanup step for shift " << s << std::endl;
//Setup linear operators for final cleanup
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d);
cg(src_d, psi_d[s]);
TrueResidualShift[s] = cg.TrueResidual;
CleanupTimer.Stop();
}
}
std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrecCleanup: Time Breakdown for body"<<std::endl;
std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tShift " << ShiftTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
IterationsToComplete = k;
return;
}
}
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
assert(0);
}
};
NAMESPACE_END(Grid);
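A minimal usage sketch for the class above (hypothetical operator and grid names; shifts is assumed to come from a Remez-generated MultiShiftFunction with poles, residues and per-pole tolerances already filled in):
// Usage sketch (hypothetical names: HermOpD/HermOpF wrap the same MdagM in
// double/single precision on DoublePrecGrid/SinglePrecGrid):
MultiShiftFunction shifts;   // poles/residues/tolerances filled from a Remez fit
ConjugateGradientMultiShiftMixedPrecCleanup<LatticeFermionD,LatticeFermionF>
  MSCG(10000, shifts, SinglePrecGrid, HermOpF, /*ReliableUpdateFreq*/ 50);
std::vector<LatticeFermionD> sols(shifts.order, DoublePrecGrid);
MSCG(HermOpD, src_d, sols);  // sols[s] ~= (MdagM + shifts.poles[s])^{-1} src_d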

View File

@@ -1,416 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/iterative/ConjugateGradientMultiShift.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Christopher Kelly <ckelly@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#ifndef GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
#define GRID_CONJUGATE_GRADIENT_MULTI_SHIFT_MIXEDPREC_H
NAMESPACE_BEGIN(Grid);
//CK 2020: A variant of the multi-shift conjugate gradient with the matrix multiplication in single precision.
//The residual is stored in single precision, but the search directions and solution are stored in double precision.
//Every update_freq iterations the residual is corrected in double precision.
//For safety a final regular CG is applied to clean up if necessary
//Linop to add shift to input linop, used in cleanup CG
namespace ConjugateGradientMultiShiftMixedPrecSupport{
template<typename Field>
class ShiftedLinop: public LinearOperatorBase<Field>{
public:
LinearOperatorBase<Field> &linop_base;
RealD shift;
ShiftedLinop(LinearOperatorBase<Field> &_linop_base, RealD _shift): linop_base(_linop_base), shift(_shift){}
void OpDiag (const Field &in, Field &out){ assert(0); }
void OpDir (const Field &in, Field &out,int dir,int disp){ assert(0); }
void OpDirAll (const Field &in, std::vector<Field> &out){ assert(0); }
void Op (const Field &in, Field &out){ assert(0); }
void AdjOp (const Field &in, Field &out){ assert(0); }
void HermOp(const Field &in, Field &out){
linop_base.HermOp(in, out);
axpy(out, shift, in, out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
HermOp(in,out);
ComplexD dot = innerProduct(in,out);
n1=real(dot);
n2=norm2(out);
}
};
};
template<class FieldD, class FieldF,
typename std::enable_if< getPrecision<FieldD>::value == 2, int>::type = 0,
typename std::enable_if< getPrecision<FieldF>::value == 1, int>::type = 0>
class ConjugateGradientMultiShiftMixedPrec : public OperatorMultiFunction<FieldD>,
public OperatorFunction<FieldD>
{
public:
using OperatorFunction<FieldD>::operator();
RealD Tolerance;
Integer MaxIterationsMshift;
Integer MaxIterations;
Integer IterationsToComplete; //Number of iterations the CG took to finish. Filled in upon completion
std::vector<int> IterationsToCompleteShift; // Iterations for this shift
int verbose;
MultiShiftFunction shifts;
std::vector<RealD> TrueResidualShift;
int ReliableUpdateFreq; //number of iterations between reliable updates
GridBase* SinglePrecGrid; //Grid for single-precision fields
LinearOperatorBase<FieldF> &Linop_f; //single precision
ConjugateGradientMultiShiftMixedPrec(Integer maxit, const MultiShiftFunction &_shifts,
GridBase* _SinglePrecGrid, LinearOperatorBase<FieldF> &_Linop_f,
int _ReliableUpdateFreq) :
MaxIterationsMshift(maxit), shifts(_shifts), SinglePrecGrid(_SinglePrecGrid), Linop_f(_Linop_f), ReliableUpdateFreq(_ReliableUpdateFreq),
MaxIterations(20000)
{
verbose=1;
IterationsToCompleteShift.resize(_shifts.order);
TrueResidualShift.resize(_shifts.order);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, FieldD &psi)
{
GridBase *grid = src.Grid();
int nshift = shifts.order;
std::vector<FieldD> results(nshift,grid);
(*this)(Linop,src,results,psi);
}
void operator() (LinearOperatorBase<FieldD> &Linop, const FieldD &src, std::vector<FieldD> &results, FieldD &psi)
{
int nshift = shifts.order;
(*this)(Linop,src,results);
psi = shifts.norm*src;
for(int i=0;i<nshift;i++){
psi = psi + shifts.residues[i]*results[i];
}
return;
}
void operator() (LinearOperatorBase<FieldD> &Linop_d, const FieldD &src_d, std::vector<FieldD> &psi_d)
{
GRID_TRACE("ConjugateGradientMultiShiftMixedPrec");
GridBase *DoublePrecGrid = src_d.Grid();
precisionChangeWorkspace pc_wk_s_to_d(DoublePrecGrid,SinglePrecGrid);
precisionChangeWorkspace pc_wk_d_to_s(SinglePrecGrid,DoublePrecGrid);
////////////////////////////////////////////////////////////////////////
// Convenience references to the info stored in "MultiShiftFunction"
////////////////////////////////////////////////////////////////////////
int nshift = shifts.order;
std::vector<RealD> &mass(shifts.poles); // Make references to array in "shifts"
std::vector<RealD> &mresidual(shifts.tolerances);
std::vector<RealD> alpha(nshift,1.0);
//Double precision search directions
FieldD p_d(DoublePrecGrid);
std::vector<FieldD> ps_d(nshift, DoublePrecGrid);// Search directions (double precision)
FieldD tmp_d(DoublePrecGrid);
FieldD r_d(DoublePrecGrid);
FieldD mmp_d(DoublePrecGrid);
assert(psi_d.size()==nshift);
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift];
RealD rsq[nshift];
RealD rsqf[nshift];
RealD z[nshift][2];
int converged[nshift];
const int primary =0;
//Primary shift fields CG iteration
RealD a,b,c,d;
RealD cp,bp,qq; //prev
// Matrix mult fields
FieldF p_f(SinglePrecGrid);
FieldF mmp_f(SinglePrecGrid);
// Check lightest mass
for(int s=0;s<nshift;s++){
assert( mass[s]>= mass[primary] );
converged[s]=0;
}
// Wire guess to zero
// Residuals "r" are src
// First search direction "p" is also src
cp = norm2(src_d);
// Handle trivial case of zero src.
if( cp == 0. ){
for(int s=0;s<nshift;s++){
psi_d[s] = Zero();
IterationsToCompleteShift[s] = 1;
TrueResidualShift[s] = 0.;
}
return;
}
for(int s=0;s<nshift;s++){
rsq[s] = cp * mresidual[s] * mresidual[s];
rsqf[s] =rsq[s];
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift "<< s <<" target resid "<<rsq[s]<<std::endl;
ps_d[s] = src_d;
}
// r and p for primary
p_d = src_d; //primary copy --- make this a reference to ps_d to save axpys
r_d = p_d;
//MdagM+m[0]
precisionChange(p_f, p_d, pc_wk_d_to_s);
Linop_f.HermOpAndNorm(p_f,mmp_f,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
precisionChange(tmp_d, mmp_f, pc_wk_s_to_d);
Linop_d.HermOpAndNorm(p_d,mmp_d,d,qq); // mmp = MdagM p d=real(dot(p, mmp)), qq=norm2(mmp)
tmp_d = tmp_d - mmp_d;
std::cout << " Testing operators match "<<norm2(mmp_d)<<" f "<<norm2(mmp_f)<<" diff "<< norm2(tmp_d)<<std::endl;
assert(norm2(tmp_d)< 1.0);
axpy(mmp_d,mass[0],p_d,mmp_d);
RealD rn = norm2(p_d);
d += rn*mass[0];
b = -cp /d;
// Set up the various shift variables
int iz=0;
z[0][1-iz] = 1.0;
z[0][iz] = 1.0;
bs[0] = b;
for(int s=1;s<nshift;s++){
z[s][1-iz] = 1.0;
z[s][iz] = 1.0/( 1.0 - b*(mass[s]-mass[0]));
bs[s] = b*z[s][iz];
}
// r += b[0] A.p[0]
// c= norm(r)
c=axpy_norm(r_d,b,mmp_d,r_d);
for(int s=0;s<nshift;s++) {
axpby(psi_d[s],0.,-bs[s]*alpha[s],src_d,src_d);
}
///////////////////////////////////////
// Timers
///////////////////////////////////////
GridStopWatch AXPYTimer, ShiftTimer, QRTimer, MatrixTimer, SolverTimer, PrecChangeTimer, CleanupTimer;
SolverTimer.Start();
// Iteration loop
int k;
for (k=1;k<=MaxIterationsMshift;k++){
a = c /cp;
AXPYTimer.Start();
axpy(p_d,a,p_d,r_d);
for(int s=0;s<nshift;s++){
if ( ! converged[s] ) {
if (s==0){
axpy(ps_d[s],a,ps_d[s],r_d);
} else{
RealD as =a *z[s][iz]*bs[s] /(z[s][1-iz]*b);
axpby(ps_d[s],z[s][iz],as,r_d,ps_d[s]);
}
}
}
AXPYTimer.Stop();
PrecChangeTimer.Start();
precisionChange(p_f, p_d, pc_wk_d_to_s); //get back single prec search direction for linop
PrecChangeTimer.Stop();
cp=c;
MatrixTimer.Start();
Linop_f.HermOp(p_f,mmp_f);
MatrixTimer.Stop();
PrecChangeTimer.Start();
precisionChange(mmp_d, mmp_f, pc_wk_s_to_d); // From Float to Double
PrecChangeTimer.Stop();
AXPYTimer.Start();
d=real(innerProduct(p_d,mmp_d));
axpy(mmp_d,mass[0],p_d,mmp_d);
AXPYTimer.Stop();
RealD rn = norm2(p_d);
d += rn*mass[0];
bp=b;
b=-cp/d;
// Toggle the recurrence history
bs[0] = b;
iz = 1-iz;
ShiftTimer.Start();
for(int s=1;s<nshift;s++){
if((!converged[s])){
RealD z0 = z[s][1-iz];
RealD z1 = z[s][iz];
z[s][iz] = z0*z1*bp
/ (b*a*(z1-z0) + z1*bp*(1- (mass[s]-mass[0])*b));
bs[s] = b*z[s][iz]/z0; // NB sign rel to Mike
}
}
ShiftTimer.Stop();
//Update double precision solutions
AXPYTimer.Start();
for(int s=0;s<nshift;s++){
int ss = s;
if( (!converged[s]) ) {
axpy(psi_d[ss],-bs[s]*alpha[s],ps_d[s],psi_d[ss]);
}
}
//Perform reliable update if necessary; otherwise update residual from single-prec mmp
c = axpy_norm(r_d,b,mmp_d,r_d);
AXPYTimer.Stop();
if(k % ReliableUpdateFreq == 0){
RealD c_old = c;
//Replace r with true residual
MatrixTimer.Start();
Linop_d.HermOp(psi_d[0],mmp_d);
MatrixTimer.Stop();
AXPYTimer.Start();
axpy(mmp_d,mass[0],psi_d[0],mmp_d);
c = axpy_norm(r_d, -1.0, mmp_d, src_d);
AXPYTimer.Stop();
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<< ", replaced |r|^2 = "<<c_old <<" with |r|^2 = "<<c<<std::endl;
}
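      // Why this block exists (summary): the cheap update of r_d above mixes in
      // single-precision rounding from mmp_f, so every ReliableUpdateFreq
      // iterations the primary-shift residual is recomputed exactly in double
      // precision as  r = src - (MdagM + mass[0]) psi[0],  resetting the
      // accumulated drift; the shifted residuals then follow via r_s = z_s r.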
// Convergence checks
int all_converged = 1;
for(int s=0;s<nshift;s++){
if ( (!converged[s]) ){
IterationsToCompleteShift[s] = k;
RealD css = c * z[s][iz]* z[s][iz];
if(css<rsqf[s]){
if ( ! converged[s] )
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec k="<<k<<" Shift "<<s<<" has converged"<<std::endl;
converged[s]=1;
} else {
all_converged=0;
}
}
}
if ( all_converged || k == MaxIterationsMshift-1){
SolverTimer.Stop();
if ( all_converged ){
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: All shifts have converged iteration "<<k<<std::endl;
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Checking solutions"<<std::endl;
} else {
std::cout<<GridLogMessage<< "ConjugateGradientMultiShiftMixedPrec: Not all shifts have converged iteration "<<k<<std::endl;
}
// Check answers
for(int s=0; s < nshift; s++) {
Linop_d.HermOpAndNorm(psi_d[s],mmp_d,d,qq);
axpy(tmp_d,mass[s],psi_d[s],mmp_d);
axpy(r_d,-alpha[s],src_d,tmp_d);
RealD rn = norm2(r_d);
RealD cn = norm2(src_d);
TrueResidualShift[s] = std::sqrt(rn/cn);
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: shift["<<s<<"] true residual "<< TrueResidualShift[s] << " target " << mresidual[s] << std::endl;
//If we have not reached the desired tolerance, do a (mixed precision) CG cleanup
if(rn >= rsq[s]){
CleanupTimer.Start();
std::cout<<GridLogMessage<<"ConjugateGradientMultiShiftMixedPrec: performing cleanup step for shift " << s << std::endl;
//Setup linear operators for final cleanup
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldD> Linop_shift_d(Linop_d, mass[s]);
ConjugateGradientMultiShiftMixedPrecSupport::ShiftedLinop<FieldF> Linop_shift_f(Linop_f, mass[s]);
MixedPrecisionConjugateGradient<FieldD,FieldF> cg(mresidual[s], MaxIterations, MaxIterations, SinglePrecGrid, Linop_shift_f, Linop_shift_d);
cg(src_d, psi_d[s]);
TrueResidualShift[s] = cg.TrueResidual;
CleanupTimer.Stop();
}
}
std::cout << GridLogMessage << "ConjugateGradientMultiShiftMixedPrec: Time Breakdown for body"<<std::endl;
std::cout << GridLogMessage << "\tSolver " << SolverTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tAXPY " << AXPYTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tShift " << ShiftTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\t\tPrecision Change " << PrecChangeTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tFinal Cleanup " << CleanupTimer.Elapsed() <<std::endl;
std::cout << GridLogMessage << "\tSolver+Cleanup " << SolverTimer.Elapsed() + CleanupTimer.Elapsed() << std::endl;
IterationsToComplete = k;
return;
}
}
std::cout<<GridLogMessage<<"CG multi shift did not converge"<<std::endl;
assert(0);
}
};
NAMESPACE_END(Grid);
#endif

View File

@@ -48,7 +48,7 @@ public:
   LinearOperatorBase<FieldF> &Linop_f;
   LinearOperatorBase<FieldD> &Linop_d;
   GridBase* SinglePrecGrid;
-  RealD Delta; //reliable update parameter. A reliable update is performed when the residual drops by a factor of Delta relative to its value at the last update
+  RealD Delta; //reliable update parameter
   //Optional ability to switch to a different linear operator once the tolerance reaches a certain point. Useful for single/half -> single/single
   LinearOperatorBase<FieldF> *Linop_fallback;
@@ -65,9 +65,7 @@ public:
     ErrorOnNoConverge(err_on_no_conv),
     DoFinalCleanup(true),
     Linop_fallback(NULL)
-  {
-    assert(Delta > 0. && Delta < 1. && "Expect 0 < Delta < 1");
-  };
+  {};
   void setFallbackLinop(LinearOperatorBase<FieldF> &_Linop_fallback, const RealD _fallback_transition_tol){
     Linop_fallback = &_Linop_fallback;
@@ -75,7 +73,6 @@ public:
   }
   void operator()(const FieldD &src, FieldD &psi) {
-    GRID_TRACE("ConjugateGradientReliableUpdate");
     LinearOperatorBase<FieldF> *Linop_f_use = &Linop_f;
     bool using_fallback = false;
@@ -118,12 +115,9 @@ public:
     }
     //Single prec initialization
-    precisionChangeWorkspace pc_wk_sp_to_dp(src.Grid(), SinglePrecGrid);
-    precisionChangeWorkspace pc_wk_dp_to_sp(SinglePrecGrid, src.Grid());
     FieldF r_f(SinglePrecGrid);
     r_f.Checkerboard() = r.Checkerboard();
-    precisionChange(r_f, r, pc_wk_dp_to_sp);
+    precisionChange(r_f, r);
     FieldF psi_f(r_f);
     psi_f = Zero();
@@ -139,8 +133,7 @@ public:
     GridStopWatch LinalgTimer;
     GridStopWatch MatrixTimer;
     GridStopWatch SolverTimer;
-    GridStopWatch PrecChangeTimer;
     SolverTimer.Start();
     int k = 0;
     int l = 0;
@@ -179,9 +172,7 @@ public:
       // Stopping condition
       if (cp <= rsq) {
 	//Although not written in the paper, I assume that I have to add on the final solution
-	PrecChangeTimer.Start();
-	precisionChange(mmp, psi_f, pc_wk_sp_to_dp);
-	PrecChangeTimer.Stop();
+	precisionChange(mmp, psi_f);
 	psi = psi + mmp;
@@ -202,10 +193,7 @@ public:
 	std::cout << GridLogMessage << "\tElapsed " << SolverTimer.Elapsed() <<std::endl;
 	std::cout << GridLogMessage << "\tMatrix " << MatrixTimer.Elapsed() <<std::endl;
 	std::cout << GridLogMessage << "\tLinalg " << LinalgTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tPrecChange " << PrecChangeTimer.Elapsed() <<std::endl;
-	std::cout << GridLogMessage << "\tPrecChange avg time " << PrecChangeTimer.Elapsed()/(2*l+1) <<std::endl;
 	IterationsToComplete = k;
 	ReliableUpdatesPerformed = l;
@@ -225,21 +213,14 @@ public:
       else if(cp < Delta * MaxResidSinceLastRelUp) { //reliable update
 	std::cout << GridLogMessage << "ConjugateGradientReliableUpdate "
 		  << cp << "(residual) < " << Delta << "(Delta) * " << MaxResidSinceLastRelUp << "(MaxResidSinceLastRelUp) on iteration " << k << " : performing reliable update\n";
-	PrecChangeTimer.Start();
-	precisionChange(mmp, psi_f, pc_wk_sp_to_dp);
-	PrecChangeTimer.Stop();
+	precisionChange(mmp, psi_f);
 	psi = psi + mmp;
-	MatrixTimer.Start();
 	Linop_d.HermOpAndNorm(psi, mmp, d, qq);
-	MatrixTimer.Stop();
 	r = src - mmp;
 	psi_f = Zero();
-	PrecChangeTimer.Start();
-	precisionChange(r_f, r, pc_wk_dp_to_sp);
-	PrecChangeTimer.Stop();
+	precisionChange(r_f, r);
 	cp = norm2(r);
 	MaxResidSinceLastRelUp = cp;
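For orientation, the reliable-update branch visible in this hunk follows the usual single/double defect-correction pattern; a sketch in pseudo-C++ (variable names as in the hunk, with promote/demote standing in for precisionChange, so this is not the Grid API):
// Sketch only: flush accumulated single-precision error into the double fields
if (cp < Delta * MaxResidSinceLastRelUp) {   // single-prec residual fell by Delta
  psi   = psi + promote(psi_f);              // fold the single-prec correction into x
  r     = src - MdagM(psi);                  // recompute the true residual in double
  psi_f = 0;  r_f = demote(r);               // restart the single-prec inner iteration
  cp    = norm2(r);  MaxResidSinceLastRelUp = cp;
}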

File diff suppressed because it is too large

View File

@@ -419,15 +419,14 @@ until convergence
     }
   }
-  if ( Nconv < Nstop ) {
+  if ( Nconv < Nstop )
     std::cout << GridLogIRL << "Nconv ("<<Nconv<<") < Nstop ("<<Nstop<<")"<<std::endl;
-    std::cout << GridLogIRL << "returning Nstop vectors, the last "<< Nstop-Nconv << "of which might meet convergence criterion only approximately" <<std::endl;
-  }
   eval=eval2;
   //Keep only converged
-  eval.resize(Nstop);// was Nconv
-  evec.resize(Nstop,grid);// was Nconv
+  eval.resize(Nconv);// Nstop?
+  evec.resize(Nconv,grid);// Nstop?
   basisSortInPlace(evec,eval,reverse);
 }
@@ -457,7 +456,7 @@ until convergence
			   std::vector<Field>& evec,
			   Field& w,int Nm,int k)
   {
-    std::cout<<GridLogDebug << "Lanczos step " <<k<<std::endl;
+    std::cout<<GridLogIRL << "Lanczos step " <<k<<std::endl;
     const RealD tiny = 1.0e-20;
     assert( k< Nm );
@@ -465,7 +464,7 @@ until convergence
     Field& evec_k = evec[k];
-    _PolyOp(evec_k,w); std::cout<<GridLogDebug << "PolyOp" <<std::endl;
+    _PolyOp(evec_k,w); std::cout<<GridLogIRL << "PolyOp" <<std::endl;
     if(k>0) w -= lme[k-1] * evec[k-1];
@@ -480,18 +479,18 @@ until convergence
     lme[k] = beta;
     if ( (k>0) && ( (k % orth_period) == 0 )) {
-      std::cout<<GridLogDebug << "Orthogonalising " <<k<<std::endl;
+      std::cout<<GridLogIRL << "Orthogonalising " <<k<<std::endl;
       orthogonalize(w,evec,k); // orthonormalise
-      std::cout<<GridLogDebug << "Orthogonalised " <<k<<std::endl;
+      std::cout<<GridLogIRL << "Orthogonalised " <<k<<std::endl;
     }
     if(k < Nm-1) evec[k+1] = w;
-    std::cout<<GridLogIRL << "Lanczos step alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
+    std::cout<<GridLogIRL << "alpha[" << k << "] = " << zalph << " beta[" << k << "] = "<<beta<<std::endl;
     if ( beta < tiny )
       std::cout<<GridLogIRL << " beta is tiny "<<beta<<std::endl;
-    std::cout<<GridLogDebug << "Lanczos step complete " <<k<<std::endl;
+    std::cout<<GridLogIRL << "Lanczos step complete " <<k<<std::endl;
   }
   void diagonalize_Eigen(std::vector<RealD>& lmd, std::vector<RealD>& lme,
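For context, the Lanczos step routine touched by these hunks performs one three-term recurrence on the polynomial-preconditioned operator; in standard notation (with \alpha_k, \beta_k the tridiagonal entries stored in lmd, lme):

    w = P(A)\,v_k - \beta_{k-1} v_{k-1}, \quad \alpha_k = \langle v_k, w \rangle, \quad w \leftarrow w - \alpha_k v_k, \quad \beta_k = \lVert w \rVert, \quad v_{k+1} = w / \beta_k ,

with a full re-orthogonalisation of w against the stored basis every orth_period steps.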

View File

@@ -44,7 +44,6 @@ public:
				   int, MinRes);    // Must restart
 };
-//This class is the input parameter class for some testing programs
 struct LocalCoherenceLanczosParams : Serializable {
  public:
   GRID_SERIALIZABLE_CLASS_MEMBERS(LocalCoherenceLanczosParams,
@@ -146,24 +145,16 @@ public:
   LinearOperatorBase<FineField> &_Linop;
   RealD                          _coarse_relax_tol;
   std::vector<FineField>        &_subspace;
-  int _largestEvalIdxForReport; //The convergence of the LCL is based on the evals of the coarse grid operator, not those of the underlying fine grid operator
-                                //As a result we do not know what the eval range of the fine operator is until the very end, making tuning the Cheby bounds very difficult
-                                //To work around this issue, every restart we separately reconstruct the fine operator eval for the lowest and highest evec and print these
-                                //out alongside the evals of the coarse operator. To do so we need to know the index of the largest eval (i.e. Nstop-1)
-                                //NOTE: If largestEvalIdxForReport=-1 (default) then this is not performed
   ImplicitlyRestartedLanczosSmoothedTester(LinearFunction<CoarseField>   &Poly,
					    OperatorFunction<FineField>   &smoother,
					    LinearOperatorBase<FineField> &Linop,
					    std::vector<FineField>        &subspace,
-					    RealD coarse_relax_tol=5.0e3,
-					    int largestEvalIdxForReport=-1)
+					    RealD coarse_relax_tol=5.0e3)
     : _smoother(smoother), _Linop(Linop), _Poly(Poly), _subspace(subspace),
-      _coarse_relax_tol(coarse_relax_tol), _largestEvalIdxForReport(largestEvalIdxForReport)
+      _coarse_relax_tol(coarse_relax_tol)
   {    };
-  //evalMaxApprox: approximation of largest eval of the fine Chebyshev operator (suitably wrapped by block projection)
   int TestConvergence(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
   {
     CoarseField v(B);
@@ -186,26 +177,12 @@ public:
	      <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
	      <<std::endl;
-    if(_largestEvalIdxForReport != -1 && (j==0 || j==_largestEvalIdxForReport)){
-      std::cout<<GridLogIRL << "Estimating true eval of fine grid operator for eval idx " << j << std::endl;
-      RealD tmp_eval;
-      ReconstructEval(j,eresid,B,tmp_eval,1.0); //don't use evalMaxApprox of coarse operator! (cf below)
-    }
     int conv=0;
     if( (vv<eresid*eresid) ) conv = 1;
     return conv;
   }
-  //This function is called at the end of the coarse grid Lanczos. It promotes the coarse eigenvector 'B' to the fine grid,
-  //applies a smoother to the result then computes the computes the *fine grid* eigenvalue (output as 'eval').
-  //evalMaxApprox should be the approximation of the largest eval of the fine Hermop. However when this function is called by IRL it actually passes the largest eval of the *Chebyshev* operator (as this is the max approx used for the TestConvergence above)
-  //As the largest eval of the Chebyshev is typically several orders of magnitude larger this makes the convergence test pass even when it should not.
-  //We therefore ignore evalMaxApprox here and use a value of 1.0 (note this value is already used by TestCoarse)
   int ReconstructEval(int j,RealD eresid,CoarseField &B, RealD &eval,RealD evalMaxApprox)
   {
-    evalMaxApprox = 1.0; //cf above
     GridBase *FineGrid = _subspace[0].Grid();
     int checkerboard   = _subspace[0].Checkerboard();
     FineField fB(FineGrid);fB.Checkerboard() =checkerboard;
@@ -224,13 +201,13 @@ public:
     eval   = vnum/vden;
     fv -= eval*fB;
     RealD vv = norm2(fv) / ::pow(evalMaxApprox,2.0);
-    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
     std::cout.precision(13);
     std::cout<<GridLogIRL << "[" << std::setw(3)<<j<<"] "
	     <<"eval = "<<std::setw(25)<< eval << " (" << eval_poly << ")"
-	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv << " target " << eresid*eresid
+	     <<" |H B[i] - eval[i]B[i]|^2 / evalMaxApprox^2 " << std::setw(25) << vv
	     <<std::endl;
+    if ( j > nbasis ) eresid = eresid*_coarse_relax_tol;
     if( (vv<eresid*eresid) ) return 1;
     return 0;
   }
@@ -308,10 +285,6 @@ public:
     evals_coarse.resize(0);
   };
-  //The block inner product is the inner product on the fine grid locally summed over the blocks
-  //to give a Lattice<Scalar> on the coarse grid. This function orthnormalizes the fine-grid subspace
-  //vectors under the block inner product. This step must be performed after computing the fine grid
-  //eigenvectors and before computing the coarse grid eigenvectors.
   void Orthogonalise(void ) {
     CoarseScalar InnerProd(_CoarseGrid);
     std::cout << GridLogMessage <<" Gramm-Schmidt pass 1"<<std::endl;
@@ -355,8 +328,6 @@ public:
     }
   }
-  //While this method serves to check the coarse eigenvectors, it also recomputes the eigenvalues from the smoothed reconstructed eigenvectors
-  //hence the smoother can be tuned after running the coarse Lanczos by using a different smoother here
   void testCoarse(RealD resid,ChebyParams cheby_smooth,RealD relax)
   {
     assert(evals_fine.size() == nbasis);
@@ -405,31 +376,25 @@ public:
     evals_fine.resize(nbasis);
     subspace.resize(nbasis,_FineGrid);
   }
-  //cheby_op: Parameters of the fine grid Chebyshev polynomial used for the Lanczos acceleration
-  //cheby_smooth: Parameters of a separate Chebyshev polynomial used after the Lanczos has completed to smooth out high frequency noise in the reconstructed fine grid eigenvectors prior to computing the eigenvalue
-  //relax: Reconstructed eigenvectors (post smoothing) are naturally not as precise as true eigenvectors. This factor acts as a multiplier on the stopping condition when determining whether the results satisfy the user provided stopping condition
   void calcCoarse(ChebyParams cheby_op,ChebyParams cheby_smooth,RealD relax,
		   int Nstop, int Nk, int Nm,RealD resid,
		   RealD MaxIt, RealD betastp, int MinRes)
   {
-    Chebyshev<FineField>                          Cheby(cheby_op); //Chebyshev of fine operator on fine grid
-    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace); //Fine operator on coarse grid with intermediate fine grid conversion
-    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace); //Chebyshev of fine operator on coarse grid with intermediate fine grid conversion
+    Chebyshev<FineField>                          Cheby(cheby_op);
+    ProjectedHermOp<Fobj,CComplex,nbasis>         Op(_FineOp,subspace);
+    ProjectedFunctionHermOp<Fobj,CComplex,nbasis> ChebyOp (Cheby,_FineOp,subspace);
     //////////////////////////////////////////////////////////////////////////////////////////////////
     // create a smoother and see if we can get a cheap convergence test and smooth inside the IRL
     //////////////////////////////////////////////////////////////////////////////////////////////////
-    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth); //lower order Chebyshev of fine operator on fine grid used to smooth regenerated eigenvectors
-    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax,Nstop-1);
+    Chebyshev<FineField>                                           ChebySmooth(cheby_smooth);
+    ImplicitlyRestartedLanczosSmoothedTester<Fobj,CComplex,nbasis> ChebySmoothTester(ChebyOp,ChebySmooth,_FineOp,subspace,relax);
     evals_coarse.resize(Nm);
     evec_coarse.resize(Nm,_CoarseGrid);
     CoarseField src(_CoarseGrid);     src=1.0;
-    //Note the "tester" here is also responsible for generating the fine grid eigenvalues which are output into the "evals_coarse" array
     ImplicitlyRestartedLanczos<CoarseField> IRL(ChebyOp,ChebyOp,ChebySmoothTester,Nstop,Nk,Nm,resid,MaxIt,betastp,MinRes);
     int Nconv=0;
     IRL.calc(evals_coarse,evec_coarse,src,Nconv,false);
@@ -440,14 +405,6 @@ public:
       std::cout << i << " Coarse eval = " << evals_coarse[i]  << std::endl;
     }
   }
-  //Get the fine eigenvector 'i' by reconstruction
-  void getFineEvecEval(FineField &evec, RealD &eval, const int i) const{
-    blockPromote(evec_coarse[i],evec,subspace);
-    eval = evals_coarse[i];
-  }
 };
 NAMESPACE_END(Grid);

View File

@@ -33,7 +33,7 @@ NAMESPACE_BEGIN(Grid);
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // Take a matrix and form an NE solver calling a Herm solver
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
-template<class Field> class NormalEquations : public LinearFunction<Field>{
+template<class Field> class NormalEquations {
 private:
   SparseMatrixBase<Field> & _Matrix;
   OperatorFunction<Field> & _HermitianSolver;
@@ -60,7 +60,7 @@ public:
   }
 };
-template<class Field> class HPDSolver : public LinearFunction<Field> {
+template<class Field> class HPDSolver {
 private:
   LinearOperatorBase<Field> & _Matrix;
   OperatorFunction<Field> & _HermitianSolver;
@@ -78,13 +78,13 @@ public:
   void operator() (const Field &in, Field &out){
     _Guess(in,out);
-    _HermitianSolver(_Matrix,in,out);  //M out = in
+    _HermitianSolver(_Matrix,in,out);  // Mdag M out = Mdag in
   }
 };
-template<class Field> class MdagMSolver : public LinearFunction<Field> {
+template<class Field> class MdagMSolver {
 private:
   SparseMatrixBase<Field> & _Matrix;
   OperatorFunction<Field> & _HermitianSolver;
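As a reminder of what these wrappers compute (standard linear algebra, not affected by the LinearFunction inheritance change shown): NormalEquations and MdagMSolver hand the Hermitian solver the normal equations,

    M x = b \;\Longleftrightarrow\; (M^\dagger M)\, x = M^\dagger b ,

while HPDSolver assumes M itself is Hermitian positive-definite and solves M out = in directly, which is what one of the two comment variants in the last hunk states.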

View File

@@ -20,7 +20,7 @@ template<class Field> class PowerMethod
     RealD evalMaxApprox = 0.0;
     auto src_n = src;
     auto tmp = src;
-    const int _MAX_ITER_EST_ = 100;
+    const int _MAX_ITER_EST_ = 50;
     for (int i=0;i<_MAX_ITER_EST_;i++) {
@@ -29,8 +29,6 @@ template<class Field> class PowerMethod
       RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
       RealD vden = norm2(src_n);
       RealD na = vnum/vden;
-      std::cout << GridLogIterative << "PowerMethod: Current approximation of largest eigenvalue " << na << std::endl;
       if ( (fabs(evalMaxApprox/na - 1.0) < 0.001) || (i==_MAX_ITER_EST_-1) ) {
	 evalMaxApprox = na;
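For reference, a self-contained sketch of the estimator this loop implements (plain C++ on a 2x2 test matrix rather than Grid fields): apply the operator repeatedly and watch the Rayleigh quotient until it stabilises to 0.1%.
#include <array>
#include <cmath>
#include <cstdio>

int main() {
  // 2x2 symmetric test matrix with eigenvalues 3 and 1.
  const double A[2][2] = {{2, 1}, {1, 2}};
  std::array<double, 2> x{1.0, 0.7}, tmp{};
  double evalMax = 0.0;
  for (int i = 0; i < 100; i++) {               // _MAX_ITER_EST_ analogue
    tmp[0] = A[0][0]*x[0] + A[0][1]*x[1];       // tmp = A x
    tmp[1] = A[1][0]*x[0] + A[1][1]*x[1];
    double vnum = x[0]*tmp[0] + x[1]*tmp[1];    // <x, A x>
    double vden = x[0]*x[0] + x[1]*x[1];        // <x, x>
    double na = vnum / vden;                    // Rayleigh quotient
    if (std::fabs(evalMax/na - 1.0) < 0.001) { evalMax = na; break; }
    evalMax = na;
    double nrm = std::sqrt(tmp[0]*tmp[0] + tmp[1]*tmp[1]);
    x = {tmp[0]/nrm, tmp[1]/nrm};               // normalise for the next pass
  }
  std::printf("largest eigenvalue ~ %g\n", evalMax);
}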

View File

@@ -1,262 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/Aggregates.h
Copyright (C) 2015
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
Author: paboyle <paboyle@ph.ed.ac.uk>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
template<class Fobj,class CComplex,int nbasis>
class Aggregation {
public:
typedef iVector<CComplex,nbasis > siteVector;
typedef Lattice<siteVector> CoarseVector;
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField;
GridBase *CoarseGrid;
GridBase *FineGrid;
std::vector<Lattice<Fobj> > subspace;
int checkerboard;
int Checkerboard(void){return checkerboard;}
Aggregation(GridBase *_CoarseGrid,GridBase *_FineGrid,int _checkerboard) :
CoarseGrid(_CoarseGrid),
FineGrid(_FineGrid),
subspace(nbasis,_FineGrid),
checkerboard(_checkerboard)
{
};
void Orthogonalise(void){
CoarseScalar InnerProd(CoarseGrid);
// std::cout << GridLogMessage <<" Block Gramm-Schmidt pass 1"<<std::endl;
blockOrthogonalise(InnerProd,subspace);
}
void ProjectToSubspace(CoarseVector &CoarseVec,const FineField &FineVec){
blockProject(CoarseVec,FineVec,subspace);
}
void PromoteFromSubspace(const CoarseVector &CoarseVec,FineField &FineVec){
FineVec.Checkerboard() = subspace[0].Checkerboard();
blockPromote(CoarseVec,FineVec,subspace);
}
virtual void CreateSubspaceRandom(GridParallelRNG &RNG) {
int nn=nbasis;
RealD scale;
FineField noise(FineGrid);
for(int b=0;b<nn;b++){
subspace[b] = Zero();
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
subspace[b] = noise;
}
}
virtual void CreateSubspace(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,int nn=nbasis)
{
RealD scale;
ConjugateGradient<FineField> CG(1.0e-2,100,false);
FineField noise(FineGrid);
FineField Mn(FineGrid);
for(int b=0;b<nn;b++){
subspace[b] = Zero();
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise ["<<b<<"] <n|MdagM|n> "<<norm2(Mn)<<std::endl;
for(int i=0;i<1;i++){
CG(hermop,noise,subspace[b]);
noise = subspace[b];
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
}
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "filtered["<<b<<"] <f|MdagM|f> "<<norm2(Mn)<<std::endl;
subspace[b] = noise;
}
}
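  // In effect this is one (configurable) pass of inexact inverse iteration per
  // vector: the loosely converged CG (tol 1e-2) applies an approximate
  // (MdagM)^{-1} to the noise, amplifying each eigencomponent by roughly
  // 1/lambda, so the subspace ends up dominated by the low modes that the
  // coarse grid needs to represent.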
////////////////////////////////////////////////////////////////////////////////////////////////
// World of possibilities here. But have tried quite a lot of experiments (250+ jobs run on Summit)
// and this is the best I found
////////////////////////////////////////////////////////////////////////////////////////////////
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo,
int orderfilter,
int ordermin,
int orderstep,
double filterlo
) {
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
std::cout << GridLogMessage<<" Chebyshev subspace pass-1 : ord "<<orderfilter<<" ["<<lo<<","<<hi<<"]"<<std::endl;
std::cout << GridLogMessage<<" Chebyshev subspace pass-2 : nbasis"<<nn<<" min "
<<ordermin<<" step "<<orderstep
<<" lo"<<filterlo<<std::endl;
// Initial matrix element
hermop.Op(noise,Mn); std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
int b =0;
{
// Filter
Chebyshev<FineField> Cheb(lo,hi,orderfilter);
Cheb(hermop,noise,Mn);
// normalise
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
b++;
}
// Generate a full sequence of Chebyshevs
{
lo=filterlo;
noise=Mn;
FineField T0(FineGrid); T0 = noise;
FineField T1(FineGrid);
FineField T2(FineGrid);
FineField y(FineGrid);
FineField *Tnm = &T0;
FineField *Tn = &T1;
FineField *Tnp = &T2;
// Tn=T1 = (xscale M + mscale)in
RealD xscale = 2.0/(hi-lo);
RealD mscale = -(hi+lo)/(hi-lo);
hermop.HermOp(T0,y);
T1=y*xscale+noise*mscale;
for(int n=2;n<=ordermin+orderstep*(nn-2);n++){
hermop.HermOp(*Tn,y);
autoView( y_v , y, AcceleratorWrite);
autoView( Tn_v , (*Tn), AcceleratorWrite);
autoView( Tnp_v , (*Tnp), AcceleratorWrite);
autoView( Tnm_v , (*Tnm), AcceleratorWrite);
const int Nsimd = CComplex::Nsimd();
accelerator_for(ss, FineGrid->oSites(), Nsimd, {
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
coalescedWrite(Tnp_v[ss],2.0*y_v(ss)-Tnm_v(ss));
});
// Possibly more fine-grained control is needed than a linear sweep,
// but there is a huge productivity gain if this is a simple algorithm and not a tunable
int m =1;
if ( n>=ordermin ) m=n-ordermin;
if ( (m%orderstep)==0 ) {
Mn=*Tnp;
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << n<<" filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
b++;
}
// Cycle pointers to avoid copies
FineField *swizzle = Tnm;
Tnm =Tn;
Tn =Tnp;
Tnp =swizzle;
}
}
assert(b==nn);
}
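  // Filter rationale (standard Chebyshev spectral filtering, stated here as a
  // summary): the map x -> (2x - (hi+lo)) / (hi - lo) applied via xscale/mscale
  // sends [lo,hi] to [-1,1], where |T_n| <= 1, while eigenvalues below lo land
  // outside [-1,1], where T_n grows exponentially with n.  Each sampled T_n(M)
  // applied to the noise is therefore increasingly dominated by the low modes,
  // yielding nbasis progressively better filtered subspace vectors.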
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo,
int orderfilter
) {
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
std::cout << GridLogMessage<<" Chebyshev subspace pure noise : ord "<<orderfilter<<" ["<<lo<<","<<hi<<"]"<<std::endl;
std::cout << GridLogMessage<<" Chebyshev subspace pure noise : nbasis "<<nn<<std::endl;
for(int b =0;b<nbasis;b++)
{
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
// Initial matrix element
hermop.Op(noise,Mn);
if(b==0) std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
// Filter
Chebyshev<FineField> Cheb(lo,hi,orderfilter);
Cheb(hermop,noise,Mn);
// normalise
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
}
}
};
NAMESPACE_END(Grid);

View File

@@ -1,449 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/GeneralCoarsenedMatrix.h
Copyright (C) 2015
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/qcd/QCD.h> // needed for Dagger(Yes|No), Inverse(Yes|No)
#include <Grid/lattice/PaddedCell.h>
#include <Grid/stencil/GeneralLocalStencil.h>
NAMESPACE_BEGIN(Grid);
// Fine Object == (per site) type of fine field
// nbasis == number of deflation vectors
template<class Fobj,class CComplex,int nbasis>
class GeneralCoarsenedMatrix : public SparseMatrixBase<Lattice<iVector<CComplex,nbasis > > > {
public:
typedef GeneralCoarsenedMatrix<Fobj,CComplex,nbasis> GeneralCoarseOp;
typedef iVector<CComplex,nbasis > siteVector;
typedef iMatrix<CComplex,nbasis > siteMatrix;
typedef Lattice<iScalar<CComplex> > CoarseComplexField;
typedef Lattice<siteVector> CoarseVector;
typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
typedef iMatrix<CComplex,nbasis > Cobj;
typedef iVector<CComplex,nbasis > Cvec;
typedef Lattice< CComplex > CoarseScalar; // used for inner products on fine field
typedef Lattice<Fobj > FineField;
typedef CoarseVector Field;
////////////////////
// Data members
////////////////////
int hermitian;
GridBase * _FineGrid;
GridCartesian * _CoarseGrid;
NonLocalStencilGeometry &geom;
PaddedCell Cell;
GeneralLocalStencil Stencil;
std::vector<CoarseMatrix> _A;
std::vector<CoarseMatrix> _Adag;
std::vector<CoarseVector> MultTemporaries;
///////////////////////
// Interface
///////////////////////
GridBase * Grid(void) { return _FineGrid; }; // this is all the linalg routines need to know
GridBase * FineGrid(void) { return _FineGrid; }; // this is all the linalg routines need to know
GridCartesian * CoarseGrid(void) { return _CoarseGrid; }; // this is all the linalg routines need to know
void ProjectNearestNeighbour(RealD shift, GeneralCoarseOp &CopyMe)
{
int nfound=0;
std::cout << GridLogMessage <<"GeneralCoarsenedMatrix::ProjectNearestNeighbour "<< CopyMe._A[0].Grid()<<std::endl;
for(int p=0;p<geom.npoint;p++){
for(int pp=0;pp<CopyMe.geom.npoint;pp++){
// Search for the same relative shift
// Avoids brutal handling of Grid pointers
if ( CopyMe.geom.shifts[pp]==geom.shifts[p] ) {
_A[p] = CopyMe.Cell.Extract(CopyMe._A[pp]);
_Adag[p] = CopyMe.Cell.Extract(CopyMe._Adag[pp]);
nfound++;
}
}
}
assert(nfound==geom.npoint);
ExchangeCoarseLinks();
}
GeneralCoarsenedMatrix(NonLocalStencilGeometry &_geom,GridBase *FineGrid, GridCartesian * CoarseGrid)
: geom(_geom),
_FineGrid(FineGrid),
_CoarseGrid(CoarseGrid),
hermitian(1),
Cell(_geom.Depth(),_CoarseGrid),
Stencil(Cell.grids.back(),geom.shifts)
{
{
int npoint = _geom.npoint;
autoView( Stencil_v , Stencil, AcceleratorRead);
int osites=Stencil.Grid()->oSites();
for(int ss=0;ss<osites;ss++){
for(int point=0;point<npoint;point++){
auto SE = Stencil_v.GetEntry(point,ss);
int o = SE->_offset;
assert( o< osites);
}
}
}
_A.resize(geom.npoint,CoarseGrid);
_Adag.resize(geom.npoint,CoarseGrid);
}
void M (const CoarseVector &in, CoarseVector &out)
{
Mult(_A,in,out);
}
void Mdag (const CoarseVector &in, CoarseVector &out)
{
if ( hermitian ) M(in,out);
else Mult(_Adag,in,out);
}
void Mult (std::vector<CoarseMatrix> &A,const CoarseVector &in, CoarseVector &out)
{
RealD tviews=0; RealD ttot=0; RealD tmult=0; RealD texch=0; RealD text=0; RealD ttemps=0; RealD tcopy=0;
ttot=-usecond();
conformable(CoarseGrid(),in.Grid());
conformable(in.Grid(),out.Grid());
out.Checkerboard() = in.Checkerboard();
CoarseVector tin=in;
texch-=usecond();
CoarseVector pin = Cell.ExchangePeriodic(tin);
texch+=usecond();
CoarseVector pout(pin.Grid());
int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview;
typedef LatticeView<Cvec> Vview;
const int Nsimd = CComplex::Nsimd();
int64_t osites=pin.Grid()->oSites();
RealD flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
RealD bytes = 1.0*osites*sizeof(siteMatrix)*npoint
+ 2.0*osites*sizeof(siteVector)*npoint;
{
tviews-=usecond();
autoView( in_v , pin, AcceleratorRead);
autoView( out_v , pout, AcceleratorWriteDiscard);
autoView( Stencil_v , Stencil, AcceleratorRead);
tviews+=usecond();
// Static and prereserve to keep UVM region live and not resized across multiple calls
ttemps-=usecond();
MultTemporaries.resize(npoint,pin.Grid());
ttemps+=usecond();
std::vector<Aview> AcceleratorViewContainer_h;
std::vector<Vview> AcceleratorVecViewContainer_h;
tviews-=usecond();
for(int p=0;p<npoint;p++) {
AcceleratorViewContainer_h.push_back( A[p].View(AcceleratorRead));
AcceleratorVecViewContainer_h.push_back(MultTemporaries[p].View(AcceleratorWrite));
}
tviews+=usecond();
static deviceVector<Aview> AcceleratorViewContainer; AcceleratorViewContainer.resize(npoint);
static deviceVector<Vview> AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(npoint);
auto Aview_p = &AcceleratorViewContainer[0];
auto Vview_p = &AcceleratorVecViewContainer[0];
tcopy-=usecond();
acceleratorCopyToDevice(&AcceleratorViewContainer_h[0],&AcceleratorViewContainer[0],npoint *sizeof(Aview));
acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],npoint *sizeof(Vview));
tcopy+=usecond();
tmult-=usecond();
accelerator_for(spb, osites*nbasis*npoint, Nsimd, {
typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
int32_t ss = spb/(nbasis*npoint);
int32_t bp = spb%(nbasis*npoint);
int32_t b = bp/npoint;
int32_t point= bp%npoint;
auto SE = Stencil_v.GetEntry(point,ss);
auto nbr = coalescedReadGeneralPermute(in_v[SE->_offset],SE->_permute,Nd);
auto res = coalescedRead(Aview_p[point][ss](b,0))*nbr(0);
for(int bb=1;bb<nbasis;bb++) {
res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
}
coalescedWrite(Vview_p[point][ss](b),res);
});
accelerator_for(sb, osites*nbasis, Nsimd, {
int ss = sb/nbasis;
int b = sb%nbasis;
auto res = coalescedRead(Vview_p[0][ss](b));
for(int point=1;point<npoint;point++){
res = res + coalescedRead(Vview_p[point][ss](b));
}
coalescedWrite(out_v[ss](b),res);
});
tmult+=usecond();
for(int p=0;p<npoint;p++) {
AcceleratorViewContainer_h[p].ViewClose();
AcceleratorVecViewContainer_h[p].ViewClose();
}
}
text-=usecond();
out = Cell.Extract(pout);
text+=usecond();
ttot+=usecond();
std::cout << GridLogPerformance<<"Coarse Mult Aviews "<<tviews<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult ext "<<text<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult temps "<<ttemps<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult copy "<<tcopy<<" us"<<std::endl;
std::cout << GridLogPerformance<<"Coarse Mult tot "<<ttot<<" us"<<std::endl;
// std::cout << GridLogPerformance<<std::endl;
// std::cout << GridLogPerformance<<"Coarse Kernel flop/s "<< flops/tmult<<" mflop/s"<<std::endl;
// std::cout << GridLogPerformance<<"Coarse Kernel bytes/s"<< bytes/tmult<<" MB/s"<<std::endl;
// std::cout << GridLogPerformance<<"Coarse overall flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
// std::cout << GridLogPerformance<<"Coarse total bytes "<< bytes/1e6<<" MB"<<std::endl;
};
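  // Summary of the two kernels above:
  //   out(x) = \sum_p A_p(x) * in(x + delta_p)
  // The first accelerator_for performs, for every (site, basis row, point),
  // the nbasis-wide row-times-vector product into MultTemporaries[point];
  // the second reduces the npoint partial results per site into out.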
void PopulateAdag(void)
{
for(int64_t bidx=0;bidx<CoarseGrid()->gSites() ;bidx++){
Coordinate bcoor;
CoarseGrid()->GlobalIndexToGlobalCoor(bidx,bcoor);
for(int p=0;p<geom.npoint;p++){
Coordinate scoor = bcoor;
for(int mu=0;mu<bcoor.size();mu++){
int L = CoarseGrid()->GlobalDimensions()[mu];
scoor[mu] = (bcoor[mu] - geom.shifts[p][mu] + L) % L; // Modulo arithmetic
}
// Flip to poke/peekLocalSite and not too bad
auto link = peekSite(_A[p],scoor);
int pp = geom.Reverse(p);
pokeSite(adj(link),_Adag[pp],bcoor);
}
}
}
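  // Identity enforced above (assuming geom.Reverse(p) indexes the opposite
  // shift -delta_p):  Adag_{-delta_p}(x) = [ A_{delta_p}(x - delta_p) ]^dagger,
  // i.e. the dagger stencil link pointing back along -delta_p is the adjoint
  // of the forward link that reaches x from x - delta_p, as required for Mdag
  // to be the true adjoint of M on the coarse grid.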
/////////////////////////////////////////////////////////////
//
// A) Only reduced flops option is to use a padded cell of depth 4
// and apply MpcDagMpc in the padded cell.
//
// Makes for ONE application of MpcDagMpc per vector instead of 30 or 80.
// With the effective cell size around (B+8)^4 perhaps 12^4/4^4 ratio
// Cost is 81x more, same as stencil size.
//
// But: can eliminate comms and do as local dirichlet.
//
// Local exchange gauge field once.
// Apply to all vectors, local only computation.
// Must exchange ghost subcells in reverse process of PaddedCell to take inner products
//
// B) Can reduce cost: pad by 1, apply Deo (4^4+6^4+8^4+8^4 )/ (4x 4^4)
// pad by 2, apply Doe
// pad by 3, apply Deo
// then break out 8x directions; cost is ~10x MpcDagMpc per vector
//
// => almost factor of 10 in setup cost, excluding data rearrangement
//
// Intermediates -- ignore the corner terms, leave approximate and force Hermitian
// Intermediates -- pad by 2 and apply 1+8+24 = 33 times.
/////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////
// BFM HDCG style approach: Solve a system of equations to get Aij
//////////////////////////////////////////////////////////
/*
* Here, k,l index which possible shift within the 3^Nd "ball" connected by MdagM.
*
* conj(phases[block]) proj[k][ block*Nvec+j ] = \sum_ball e^{i q_k . delta} < phi_{block,j} | MdagM | phi_{(block+delta),i} >
* = \sum_ball e^{iqk.delta} A_ji
*
* Must invert matrix M_k,l = e^[i q_k . delta_l]
*
* Where q_k = delta_k . (2*M_PI/global_nb[mu])
*/
void CoarsenOperator(LinearOperatorBase<Lattice<Fobj> > &linop,
Aggregation<Fobj,CComplex,nbasis> & Subspace)
{
std::cout << GridLogMessage<< "GeneralCoarsenMatrix "<< std::endl;
GridBase *grid = FineGrid();
RealD tproj=0.0;
RealD teigen=0.0;
RealD tmat=0.0;
RealD tphase=0.0;
RealD tinv=0.0;
/////////////////////////////////////////////////////////////
// Orthogonalise the subblocks over the basis
/////////////////////////////////////////////////////////////
CoarseScalar InnerProd(CoarseGrid());
blockOrthogonalise(InnerProd,Subspace.subspace);
const int npoint = geom.npoint;
Coordinate clatt = CoarseGrid()->GlobalDimensions();
int Nd = CoarseGrid()->Nd();
/*
* Here, k,l index which possible momentum/shift within the N-points connected by MdagM.
* Matrix index i is mapped to this shift via
* geom.shifts[i]
*
* conj(pha[block]) proj[k (which mom)][j (basis vec cpt)][block]
* = \sum_{l in ball} e^{i q_k . delta_l} < phi_{block,j} | MdagM | phi_{(block+delta_l),i} >
* = \sum_{l in ball} e^{iqk.delta_l} A_ji^{b.b+l}
* = M_{kl} A_ji^{b.b+l}
*
* Must assemble and invert matrix M_k,l = e^[i q_k . delta_l]
*
* Where q_k = delta_k . (2*M_PI/global_nb[mu])
*
* Then A{ji}^{b,b+l} = M^{-1}_{lm} ComputeProj_{m,b,i,j}
*/
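  /*
   * Editor's sketch of the same contraction on plain Eigen objects
   * (illustrative only; Mkl is symmetric here because shifts[k].shifts[l]
   * is symmetric in k and l, so invMkl(l,k) and invMkl(k,l) agree):
   *   Eigen::VectorXcd A(npoint);          // couplings for fixed i,j,block
   *   Eigen::VectorXcd proj = Mkl * A;     // what the phased projections measure
   *   Eigen::VectorXcd rec  = invMkl*proj; // recovers A to rounding error
   * which is the FT[k] += invMkl(l,k)*ComputeProj[l] loop further down.
   */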
teigen-=usecond();
Eigen::MatrixXcd Mkl = Eigen::MatrixXcd::Zero(npoint,npoint);
Eigen::MatrixXcd invMkl = Eigen::MatrixXcd::Zero(npoint,npoint);
ComplexD ci(0.0,1.0);
for(int k=0;k<npoint;k++){ // Loop over momenta
for(int l=0;l<npoint;l++){ // Loop over nbr relative
ComplexD phase(0.0,0.0);
for(int mu=0;mu<Nd;mu++){
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
phase=phase+TwoPiL*geom.shifts[k][mu]*geom.shifts[l][mu];
}
phase=exp(phase*ci);
Mkl(k,l) = phase;
}
}
invMkl = Mkl.inverse();
teigen+=usecond();
///////////////////////////////////////////////////////////////////////
// Now compute the matrix elements of linop between the orthonormal
// set of vectors.
///////////////////////////////////////////////////////////////////////
FineField phaV(grid); // Phased block basis vector
FineField MphaV(grid);// Matrix applied
CoarseVector coarseInner(CoarseGrid());
std::vector<CoarseVector> ComputeProj(npoint,CoarseGrid());
std::vector<CoarseVector> FT(npoint,CoarseGrid());
for(int i=0;i<nbasis;i++){// Loop over basis vectors
std::cout << GridLogMessage<< "CoarsenMatrixColoured vec "<<i<<"/"<<nbasis<< std::endl;
for(int p=0;p<npoint;p++){ // Loop over momenta in npoint
/////////////////////////////////////////////////////
// Stick a phase on every block
/////////////////////////////////////////////////////
tphase-=usecond();
CoarseComplexField coor(CoarseGrid());
CoarseComplexField pha(CoarseGrid()); pha=Zero();
for(int mu=0;mu<Nd;mu++){
LatticeCoordinate(coor,mu);
RealD TwoPiL = M_PI * 2.0/ clatt[mu];
pha = pha + (TwoPiL * geom.shifts[p][mu]) * coor;
}
pha =exp(pha*ci);
phaV=Zero();
blockZAXPY(phaV,pha,Subspace.subspace[i],phaV);
tphase+=usecond();
/////////////////////////////////////////////////////////////////////
// Multiple phased subspace vector by matrix and project to subspace
// Remove local bulk phase to leave relative phases
/////////////////////////////////////////////////////////////////////
tmat-=usecond();
linop.Op(phaV,MphaV);
tmat+=usecond();
tproj-=usecond();
blockProject(coarseInner,MphaV,Subspace.subspace);
coarseInner = conjugate(pha) * coarseInner;
ComputeProj[p] = coarseInner;
tproj+=usecond();
}
tinv-=usecond();
for(int k=0;k<npoint;k++){
FT[k] = Zero();
for(int l=0;l<npoint;l++){
FT[k]= FT[k]+ invMkl(l,k)*ComputeProj[l];
}
int osites=CoarseGrid()->oSites();
autoView( A_v , _A[k], AcceleratorWrite);
autoView( FT_v , FT[k], AcceleratorRead);
accelerator_for(sss, osites, 1, {
for(int j=0;j<nbasis;j++){
A_v[sss](j,i) = FT_v[sss](j);
}
});
}
tinv+=usecond();
}
for(int p=0;p<geom.npoint;p++){
Coordinate coor({0,0,0,0,0});
auto sval = peekSite(_A[p],coor);
}
// Only needed if nonhermitian
if ( ! hermitian ) {
std::cout << GridLogMessage<<"PopulateAdag "<<std::endl;
PopulateAdag();
}
    // Adag is populated from A by PopulateAdag() above in the non-hermitian case
ExchangeCoarseLinks();
std::cout << GridLogMessage<<"CoarsenOperator eigen "<<teigen<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator phase "<<tphase<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator mat "<<tmat <<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator proj "<<tproj<<" us"<<std::endl;
std::cout << GridLogMessage<<"CoarsenOperator inv "<<tinv<<" us"<<std::endl;
}
void ExchangeCoarseLinks(void){
for(int p=0;p<geom.npoint;p++){
_A[p] = Cell.Exchange(_A[p]);
_Adag[p]= Cell.Exchange(_Adag[p]);
}
}
virtual void Mdiag (const Field &in, Field &out){ assert(0);};
virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);};
virtual void MdirAll (const Field &in, std::vector<Field> &out){assert(0);};
};
NAMESPACE_END(Grid);
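// Standalone sketch (editor's illustration, not part of the header above) of
// the momentum-probe idea used in CoarsenOperator: build M_kl = exp(i q_k.delta_l),
// invert it, and recover the couplings from phased projections. Assumes Eigen
// is available, as it already is in this header; the 1d toy "shifts" stand in
// for geom.shifts.
#include <Eigen/Dense>
#include <cmath>
#include <complex>
#include <vector>

int main(void)
{
  const int npoint = 3;                    // 1d toy ball: shifts -1,0,+1
  const int L      = 8;                    // coarse extent
  std::vector<int> shifts = {-1,0,+1};
  std::complex<double> ci(0.0,1.0);

  // M_kl = e^{i q_k . delta_l} with q_k = shifts[k] * 2*pi/L
  Eigen::MatrixXcd Mkl(npoint,npoint);
  for(int k=0;k<npoint;k++)
    for(int l=0;l<npoint;l++)
      Mkl(k,l) = std::exp(ci * (2.0*M_PI/L) * double(shifts[k]*shifts[l]));

  // Fake couplings A_l; proj is what the phased block projection would see.
  Eigen::VectorXcd A(npoint); A << 0.1, 1.0, 0.2;
  Eigen::VectorXcd proj = Mkl * A;
  Eigen::VectorXcd Arec = Mkl.inverse() * proj;  // recovers A to rounding

  return ((A - Arec).norm() < 1e-10) ? 0 : 1;
}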


@@ -1,243 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/algorithms/GeneralCoarsenedMatrix.h
Copyright (C) 2015
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
/////////////////////////////////////////////////////////////////
// Geometry class in cartesian case
/////////////////////////////////////////////////////////////////
class Geometry {
public:
int npoint;
int base;
std::vector<int> directions ;
std::vector<int> displacements;
std::vector<int> points_dagger;
Geometry(int _d) {
base = (_d==5) ? 1:0;
// make coarse grid stencil for 4d , not 5d
if ( _d==5 ) _d=4;
npoint = 2*_d+1;
directions.resize(npoint);
displacements.resize(npoint);
points_dagger.resize(npoint);
for(int d=0;d<_d;d++){
directions[d ] = d+base;
directions[d+_d] = d+base;
displacements[d ] = +1;
displacements[d+_d]= -1;
points_dagger[d ] = d+_d;
points_dagger[d+_d] = d;
}
directions [2*_d]=0;
displacements[2*_d]=0;
points_dagger[2*_d]=2*_d;
}
int point(int dir, int disp) {
assert(disp == -1 || disp == 0 || disp == 1);
assert(base+0 <= dir && dir < base+4);
// directions faster index = new indexing
// 4d (base = 0):
// point 0 1 2 3 4 5 6 7 8
// dir 0 1 2 3 0 1 2 3 0
// disp +1 +1 +1 +1 -1 -1 -1 -1 0
// 5d (base = 1):
// point 0 1 2 3 4 5 6 7 8
// dir 1 2 3 4 1 2 3 4 0
// disp +1 +1 +1 +1 -1 -1 -1 -1 0
// displacements faster index = old indexing
// 4d (base = 0):
// point 0 1 2 3 4 5 6 7 8
// dir 0 0 1 1 2 2 3 3 0
// disp +1 -1 +1 -1 +1 -1 +1 -1 0
// 5d (base = 1):
// point 0 1 2 3 4 5 6 7 8
// dir 1 1 2 2 3 3 4 4 0
// disp +1 -1 +1 -1 +1 -1 +1 -1 0
if(dir == 0 and disp == 0)
return 8;
else // New indexing
return (1 - disp) / 2 * 4 + dir - base;
// else // Old indexing
// return (4 * (dir - base) + 1 - disp) / 2;
}
};
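// Quick check of the new indexing (editor's illustration, comments only):
//   Geometry g(4);      // 4d, base=0
//   g.point(2,+1);      // -> 2   (+mu group, dir is the faster index)
//   g.point(2,-1);      // -> 6   (-mu group)
//   g.point(0, 0);      // -> 8   (central/self point)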
/////////////////////////////////////////////////////////////////
// Less local equivalent of Geometry class in cartesian case
/////////////////////////////////////////////////////////////////
class NonLocalStencilGeometry {
public:
int depth;
int hops;
int npoint;
std::vector<Coordinate> shifts;
Coordinate stencil_size;
Coordinate stencil_lo;
Coordinate stencil_hi;
GridCartesian *grid;
GridCartesian *Grid() {return grid;};
int Depth(void){return 1;}; // Ghost zone depth
int Hops(void){return hops;}; // # of hops=> level of corner fill in in stencil
virtual int DimSkip(void) =0;
virtual ~NonLocalStencilGeometry() {};
int Reverse(int point)
{
int Nd = Grid()->Nd();
Coordinate shft = shifts[point];
Coordinate rev(Nd);
for(int mu=0;mu<Nd;mu++) rev[mu]= -shft[mu];
for(int p=0;p<npoint;p++){
if(rev==shifts[p]){
return p;
}
}
assert(0);
return -1;
}
void BuildShifts(void)
{
this->shifts.resize(0);
int Nd = this->grid->Nd();
int dd = this->DimSkip();
for(int s0=this->stencil_lo[dd+0];s0<=this->stencil_hi[dd+0];s0++){
for(int s1=this->stencil_lo[dd+1];s1<=this->stencil_hi[dd+1];s1++){
for(int s2=this->stencil_lo[dd+2];s2<=this->stencil_hi[dd+2];s2++){
for(int s3=this->stencil_lo[dd+3];s3<=this->stencil_hi[dd+3];s3++){
Coordinate sft(Nd,0);
sft[dd+0] = s0;
sft[dd+1] = s1;
sft[dd+2] = s2;
sft[dd+3] = s3;
int nhops = abs(s0)+abs(s1)+abs(s2)+abs(s3);
if(nhops<=this->hops) this->shifts.push_back(sft);
}}}}
this->npoint = this->shifts.size();
std::cout << GridLogMessage << "NonLocalStencilGeometry has "<< this->npoint << " terms in stencil "<<std::endl;
}
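  // Counting check (editor's note): with the stencil range [-1,0,1]^4 set up
  // in the constructor below,
  //   hops=1 keeps 1+8    = 9  shifts (centre plus faces),
  //   hops=2 keeps 1+8+24 = 33 shifts (adds the C(4,2)*2*2 edge terms),
  // matching the "1+8+24 = 33" applications quoted in
  // GeneralCoarsenedMatrix.h above.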
NonLocalStencilGeometry(GridCartesian *_coarse_grid,int _hops) : grid(_coarse_grid), hops(_hops)
{
Coordinate latt = grid->GlobalDimensions();
stencil_size.resize(grid->Nd());
stencil_lo.resize(grid->Nd());
stencil_hi.resize(grid->Nd());
for(int d=0;d<grid->Nd();d++){
if ( latt[d] == 1 ) {
stencil_lo[d] = 0;
stencil_hi[d] = 0;
stencil_size[d]= 1;
} else if ( latt[d] == 2 ) {
stencil_lo[d] = -1;
stencil_hi[d] = 0;
stencil_size[d]= 2;
} else if ( latt[d] > 2 ) {
stencil_lo[d] = -1;
stencil_hi[d] = 1;
stencil_size[d]= 3;
}
}
};
};
// Need to worry about red-black now
class NonLocalStencilGeometry4D : public NonLocalStencilGeometry {
public:
virtual int DimSkip(void) { return 0;};
NonLocalStencilGeometry4D(GridCartesian *Coarse,int _hops) : NonLocalStencilGeometry(Coarse,_hops) { };
virtual ~NonLocalStencilGeometry4D() {};
};
class NonLocalStencilGeometry5D : public NonLocalStencilGeometry {
public:
virtual int DimSkip(void) { return 1; };
NonLocalStencilGeometry5D(GridCartesian *Coarse,int _hops) : NonLocalStencilGeometry(Coarse,_hops) { };
virtual ~NonLocalStencilGeometry5D() {};
};
/*
* Bunch of different options classes
*/
class NextToNextToNextToNearestStencilGeometry4D : public NonLocalStencilGeometry4D {
public:
NextToNextToNextToNearestStencilGeometry4D(GridCartesian *Coarse) : NonLocalStencilGeometry4D(Coarse,4)
{
this->BuildShifts();
};
};
class NextToNextToNextToNearestStencilGeometry5D : public NonLocalStencilGeometry5D {
public:
NextToNextToNextToNearestStencilGeometry5D(GridCartesian *Coarse) : NonLocalStencilGeometry5D(Coarse,4)
{
this->BuildShifts();
};
};
class NextToNearestStencilGeometry4D : public NonLocalStencilGeometry4D {
public:
NextToNearestStencilGeometry4D(GridCartesian *Coarse) : NonLocalStencilGeometry4D(Coarse,2)
{
this->BuildShifts();
};
};
class NextToNearestStencilGeometry5D : public NonLocalStencilGeometry5D {
public:
NextToNearestStencilGeometry5D(GridCartesian *Coarse) : NonLocalStencilGeometry5D(Coarse,2)
{
this->BuildShifts();
};
};
class NearestStencilGeometry4D : public NonLocalStencilGeometry4D {
public:
NearestStencilGeometry4D(GridCartesian *Coarse) : NonLocalStencilGeometry4D(Coarse,1)
{
this->BuildShifts();
};
};
class NearestStencilGeometry5D : public NonLocalStencilGeometry5D {
public:
NearestStencilGeometry5D(GridCartesian *Coarse) : NonLocalStencilGeometry5D(Coarse,1)
{
this->BuildShifts();
};
};
NAMESPACE_END(Grid);
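// Construction sketch (editor's illustration, not part of this removed header;
// assumes the usual Grid grid factory):
//   GridCartesian *Coarse = SpaceTimeGrid::makeFourDimGrid(clatt,simd,mpi);
//   NextToNearestStencilGeometry4D geom(Coarse);  // hops=2 -> 33 shifts
//   int p    = 5;
//   int pdag = geom.Reverse(p);  // index of the opposite shift, used for Adag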


@@ -1,33 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: Grid/algorithms/multigrid/MultiGrid.h
Copyright (C) 2023
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include <Grid/algorithms/multigrid/Aggregates.h>
#include <Grid/algorithms/multigrid/Geometry.h>
#include <Grid/algorithms/multigrid/CoarsenedMatrix.h>
#include <Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h>


@@ -4,14 +4,11 @@ NAMESPACE_BEGIN(Grid);
 /*Allocation types, saying which pointer cache should be used*/
 #define Cpu      (0)
-#define CpuHuge  (1)
-#define CpuSmall (2)
-#define Acc      (3)
-#define AccHuge  (4)
-#define AccSmall (5)
-#define Shared   (6)
-#define SharedHuge (7)
-#define SharedSmall (8)
+#define CpuSmall (1)
+#define Acc      (2)
+#define AccSmall (3)
+#define Shared   (4)
+#define SharedSmall (5)
 #undef GRID_MM_VERBOSE
 uint64_t total_shared;
 uint64_t total_device;
@@ -38,15 +35,12 @@ void MemoryManager::PrintBytes(void)
 }
-uint64_t MemoryManager::DeviceCacheBytes() { return CacheBytes[Acc] + CacheBytes[AccHuge] + CacheBytes[AccSmall]; }
-uint64_t MemoryManager::HostCacheBytes()   { return CacheBytes[Cpu] + CacheBytes[CpuHuge] + CacheBytes[CpuSmall]; }
 //////////////////////////////////////////////////////////////////////
 // Data tables for recently freed pooiniter caches
 //////////////////////////////////////////////////////////////////////
 MemoryManager::AllocationCacheEntry MemoryManager::Entries[MemoryManager::NallocType][MemoryManager::NallocCacheMax];
 int MemoryManager::Victim[MemoryManager::NallocType];
-int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 0, 8, 8, 0, 16, 8, 0, 16 };
+int MemoryManager::Ncache[MemoryManager::NallocType] = { 2, 8, 8, 16, 8, 16 };
 uint64_t MemoryManager::CacheBytes[MemoryManager::NallocType];
 //////////////////////////////////////////////////////////////////////
 // Actual allocation and deallocation utils
@@ -176,16 +170,6 @@ void MemoryManager::Init(void)
     }
   }
-  str= getenv("GRID_ALLOC_NCACHE_HUGE");
-  if ( str ) {
-    Nc = atoi(str);
-    if ( (Nc>=0) && (Nc < NallocCacheMax)) {
-      Ncache[CpuHuge]=Nc;
-      Ncache[AccHuge]=Nc;
-      Ncache[SharedHuge]=Nc;
-    }
-  }
   str= getenv("GRID_ALLOC_NCACHE_SMALL");
   if ( str ) {
     Nc = atoi(str);
@@ -206,9 +190,7 @@ void MemoryManager::InitMessage(void) {
   std::cout << GridLogMessage<< "MemoryManager::Init() setting up"<<std::endl;
 #ifdef ALLOCATION_CACHE
-  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent host allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<" HUGE "<<Ncache[CpuHuge]<<std::endl;
-  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent device allocations: SMALL "<<Ncache[AccSmall]<<" LARGE "<<Ncache[Acc]<<" Huge "<<Ncache[AccHuge]<<std::endl;
-  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent shared allocations: SMALL "<<Ncache[SharedSmall]<<" LARGE "<<Ncache[Shared]<<" Huge "<<Ncache[SharedHuge]<<std::endl;
+  std::cout << GridLogMessage<< "MemoryManager::Init() cache pool for recent allocations: SMALL "<<Ncache[CpuSmall]<<" LARGE "<<Ncache[Cpu]<<std::endl;
 #endif
 #ifdef GRID_UVM
@@ -240,11 +222,8 @@ void MemoryManager::InitMessage(void) {
 void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
 {
 #ifdef ALLOCATION_CACHE
-  int cache;
-  if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
-  else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
-  else cache = type;
+  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
+  int cache = type + small;
   return Insert(ptr,bytes,Entries[cache],Ncache[cache],Victim[cache],CacheBytes[cache]);
 #else
   return ptr;
@@ -253,12 +232,11 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,int type)
 void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries,int ncache,int &victim, uint64_t &cacheBytes)
 {
+  assert(ncache>0);
 #ifdef GRID_OMP
   assert(omp_in_parallel()==0);
 #endif
-  if (ncache == 0) return ptr;
   void * ret = NULL;
   int v = -1;
@@ -293,11 +271,8 @@ void *MemoryManager::Insert(void *ptr,size_t bytes,AllocationCacheEntry *entries
 void *MemoryManager::Lookup(size_t bytes,int type)
 {
 #ifdef ALLOCATION_CACHE
-  int cache;
-  if (bytes < GRID_ALLOC_SMALL_LIMIT) cache = type + 2;
-  else if (bytes >= GRID_ALLOC_HUGE_LIMIT) cache = type + 1;
-  else cache = type;
+  bool small = (bytes < GRID_ALLOC_SMALL_LIMIT);
+  int cache = type+small;
   return Lookup(bytes,Entries[cache],Ncache[cache],CacheBytes[cache]);
 #else
   return NULL;
@@ -306,6 +281,7 @@ void *MemoryManager::Lookup(size_t bytes,int type)
 void *MemoryManager::Lookup(size_t bytes,AllocationCacheEntry *entries,int ncache,uint64_t & cacheBytes)
 {
+  assert(ncache>0);
 #ifdef GRID_OMP
   assert(omp_in_parallel()==0);
 #endif
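// Bucket-mapping example for the hunks above (editor's note, comments only):
// on the develop side a request is routed as
//   if      (bytes <  GRID_ALLOC_SMALL_LIMIT) cache = type + 2; // *Small pool
//   else if (bytes >= GRID_ALLOC_HUGE_LIMIT)  cache = type + 1; // *Huge  pool
//   else                                      cache = type;     // base  pool
// so with Acc==3 a 64kB device block lands in cache 3 (Acc) while a 4GB block
// lands in cache 4 (AccHuge); the older side collapses this to type+small.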


@@ -35,7 +35,6 @@ NAMESPACE_BEGIN(Grid);
 // Move control to configure.ac and Config.h?
 #define GRID_ALLOC_SMALL_LIMIT (4096)
-#define GRID_ALLOC_HUGE_LIMIT (2147483648)
 #define STRINGIFY(x) #x
 #define TOSTRING(x) STRINGIFY(x)
@@ -71,21 +70,6 @@ enum ViewMode {
   CpuWriteDiscard = 0x10 // same for now
 };
-struct MemoryStatus {
-  uint64_t DeviceBytes;
-  uint64_t DeviceLRUBytes;
-  uint64_t DeviceMaxBytes;
-  uint64_t HostToDeviceBytes;
-  uint64_t DeviceToHostBytes;
-  uint64_t HostToDeviceXfer;
-  uint64_t DeviceToHostXfer;
-  uint64_t DeviceEvictions;
-  uint64_t DeviceDestroy;
-  uint64_t DeviceAllocCacheBytes;
-  uint64_t HostAllocCacheBytes;
-};
 class MemoryManager {
 private:
@@ -99,7 +83,7 @@ private:
   } AllocationCacheEntry;
   static const int NallocCacheMax=128;
-  static const int NallocType=9;
+  static const int NallocType=6;
   static AllocationCacheEntry Entries[NallocType][NallocCacheMax];
   static int Victim[NallocType];
   static int Ncache[NallocType];
@@ -137,26 +121,7 @@ private:
   static uint64_t DeviceToHostXfer;
   static uint64_t DeviceEvictions;
   static uint64_t DeviceDestroy;
-  static uint64_t DeviceCacheBytes();
-  static uint64_t HostCacheBytes();
-  static MemoryStatus GetFootprint(void) {
-    MemoryStatus stat;
-    stat.DeviceBytes = DeviceBytes;
-    stat.DeviceLRUBytes = DeviceLRUBytes;
-    stat.DeviceMaxBytes = DeviceMaxBytes;
-    stat.HostToDeviceBytes = HostToDeviceBytes;
-    stat.DeviceToHostBytes = DeviceToHostBytes;
-    stat.HostToDeviceXfer = HostToDeviceXfer;
-    stat.DeviceToHostXfer = DeviceToHostXfer;
-    stat.DeviceEvictions = DeviceEvictions;
-    stat.DeviceDestroy = DeviceDestroy;
-    stat.DeviceAllocCacheBytes = DeviceCacheBytes();
-    stat.HostAllocCacheBytes = HostCacheBytes();
-    return stat;
-  };
 private:
 #ifndef GRID_UVM
   //////////////////////////////////////////////////////////////////////
@@ -209,9 +174,9 @@ private:
   static void CpuViewClose(uint64_t Ptr);
   static uint64_t CpuViewOpen(uint64_t CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
 #endif
-  static void NotifyDeletion(void * CpuPtr);
 public:
+  static void NotifyDeletion(void * CpuPtr);
   static void Print(void);
   static void PrintAll(void);
   static void PrintState( void* CpuPtr);
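// Usage sketch for the develop-side footprint API removed here (editor's
// illustration, assuming GetFootprint() is reachable from client code):
//   MemoryStatus s = MemoryManager::GetFootprint();
//   std::cout << s.DeviceBytes           << " bytes resident on device, "
//             << s.DeviceAllocCacheBytes << " bytes parked in the free-list cache\n";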


@@ -8,7 +8,7 @@ NAMESPACE_BEGIN(Grid);
 static char print_buffer [ MAXLINE ];
 #define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
-#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer;
+#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
 //#define dprintf(...)
@@ -111,7 +111,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
   ///////////////////////////////////////////////////////////
   assert(AccCache.state!=Empty);
-  dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
+  mprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
   assert(AccCache.accLock==0);
   assert(AccCache.cpuLock==0);
   assert(AccCache.CpuPtr!=(uint64_t)NULL);
@@ -141,11 +141,11 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
   ///////////////////////////////////////////////////////////////////////////
   assert(AccCache.state!=Empty);
-  mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld\n",
+  mprintf("MemoryManager: Evict cpu %lx acc %lx cpuLock %ld accLock %ld\n",
          (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
          (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock);
-  if (AccCache.accLock!=0) return;
-  if (AccCache.cpuLock!=0) return;
+  assert(AccCache.accLock==0); // Cannot evict so logic bomb
+  assert(AccCache.CpuPtr!=(uint64_t)NULL);
   if(AccCache.state==AccDirty) {
     Flush(AccCache);
   }
@@ -155,7 +155,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
     AccCache.AccPtr=(uint64_t)NULL;
     AccCache.state=CpuDirty; // CPU primary now
     DeviceBytes   -=AccCache.bytes;
-    dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
+    dprintf("MemoryManager: Free(%lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
   }
   //  uint64_t CpuPtr = AccCache.CpuPtr;
   DeviceEvictions++;
@@ -169,7 +169,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
   assert(AccCache.AccPtr!=(uint64_t)NULL);
   assert(AccCache.CpuPtr!=(uint64_t)NULL);
   acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
-  mprintf("MemoryManager: acceleratorCopyFromDevice Flush AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: Flush %lx -> %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
   DeviceToHostBytes+=AccCache.bytes;
   DeviceToHostXfer++;
   AccCache.state=Consistent;
@@ -184,7 +184,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
     AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
     DeviceBytes+=AccCache.bytes;
   }
-  mprintf("MemoryManager: acceleratorCopyToDevice Clone AccPtr %lx <- CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: Clone %lx <- %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
   acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
   HostToDeviceBytes+=AccCache.bytes;
   HostToDeviceXfer++;
@@ -519,6 +519,7 @@ void MemoryManager::Audit(std::string s)
   uint64_t LruBytes1=0;
   uint64_t LruBytes2=0;
   uint64_t LruCnt=0;
+  uint64_t LockedBytes=0;
   std::cout << " Memory Manager::Audit() from "<<s<<std::endl;
   for(auto it=LRU.begin();it!=LRU.end();it++){
@@ -531,7 +532,6 @@ void MemoryManager::Audit(std::string s)
     assert(AccCache.LRU_entry==it);
   }
   std::cout << " Memory Manager::Audit() LRU queue matches table entries "<<std::endl;
-
   for(auto it=AccViewTable.begin();it!=AccViewTable.end();it++){
     auto &AccCache = it->second;
@@ -548,7 +548,6 @@ void MemoryManager::Audit(std::string s)
     if ( AccCache.cpuLock || AccCache.accLock ) {
       assert(AccCache.LRU_valid==0);
-
       std::cout << GridLogError << s<< "\n\t 0x"<<std::hex<<AccCache.CpuPtr<<std::dec
                << "\t0x"<<std::hex<<AccCache.AccPtr<<std::dec<<"\t" <<str
                << "\t cpuLock  " << AccCache.cpuLock
@@ -567,7 +566,6 @@ void MemoryManager::Audit(std::string s)
   std::cout << " Memory Manager::Audit() device bytes matches sum over table "<<std::endl;
   assert(LruCnt == LRU.size());
   std::cout << " Memory Manager::Audit() LRU entry count matches "<<std::endl;
-
 }
 void MemoryManager::PrintState(void* _CpuPtr)


@@ -70,8 +70,8 @@ public:
   Coordinate _istride; // Inner stride i.e. within simd lane
   int _osites;         // _isites*_osites = product(dimensions).
   int _isites;
-  int64_t _fsites;     // _isites*_osites = product(dimensions).
-  int64_t _gsites;
+  int _fsites;         // _isites*_osites = product(dimensions).
+  int _gsites;
   Coordinate _slice_block;// subslice information
   Coordinate _slice_stride;
   Coordinate _slice_nblock;
@@ -183,7 +183,7 @@ public:
   inline int Nsimd(void)  const { return _isites; };// Synonymous with iSites
   inline int oSites(void) const { return _osites; };
   inline int lSites(void) const { return _isites*_osites; };
-  inline int64_t gSites(void) const { return (int64_t)_isites*(int64_t)_osites*(int64_t)_Nprocessors; };
+  inline int gSites(void) const { return _isites*_osites*_Nprocessors; };
   inline int Nd    (void) const { return _ndimension;};
   inline const Coordinate LocalStarts(void)             { return _lstart;    };
@@ -214,7 +214,7 @@ public:
   ////////////////////////////////////////////////////////////////
   // Global addressing
   ////////////////////////////////////////////////////////////////
-  void GlobalIndexToGlobalCoor(int64_t gidx,Coordinate &gcoor){
+  void GlobalIndexToGlobalCoor(int gidx,Coordinate &gcoor){
     assert(gidx< gSites());
     Lexicographic::CoorFromIndex(gcoor,gidx,_gdimensions);
   }
@@ -222,7 +222,7 @@ public:
     assert(lidx<lSites());
     Lexicographic::CoorFromIndex(lcoor,lidx,_ldimensions);
   }
-  void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int64_t & gidx){
+  void GlobalCoorToGlobalIndex(const Coordinate & gcoor,int & gidx){
     gidx=0;
     int mult=1;
     for(int mu=0;mu<_ndimension;mu++) {
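// Overflow check (editor's note, comments only): the int64_t promotion on the
// develop side of this hunk matters once product(dimensions) exceeds 2^31-1;
// a 128^4 global volume is 268,435,456 sites and still fits in 32 bits, but
// 256^4 = 4,294,967,296 already wraps a 32-bit int, so gSites() and the
// global index conversions need int64_t.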


@@ -53,11 +53,10 @@ public:
   // Communicator should know nothing of the physics grid, only processor grid.
   ////////////////////////////////////////////
   int _Nprocessors;              // How many in all
-  int _processor;                // linear processor rank
-  unsigned long _ndimension;
-  Coordinate _shm_processors;    // Which dimensions get relayed out over processors lanes.
   Coordinate _processors;        // Which dimensions get relayed out over processors lanes.
+  int _processor;                // linear processor rank
   Coordinate _processor_coor;    // linear processor coordinate
+  unsigned long _ndimension;
   static Grid_MPI_Comm communicator_world;
   Grid_MPI_Comm communicator;
   std::vector<Grid_MPI_Comm> communicator_halo;
@@ -98,16 +97,14 @@ public:
   int  BossRank(void) ;
   int  ThisRank(void) ;
   const Coordinate & ThisProcessorCoor(void) ;
-  const Coordinate & ShmGrid(void)  { return _shm_processors; } ;
   const Coordinate & ProcessorGrid(void) ;
   int  ProcessorCount(void) ;
   ////////////////////////////////////////////////////////////////////////////////
   // very VERY rarely (Log, serial RNG) we need world without a grid
   ////////////////////////////////////////////////////////////////////////////////
   static int  RankWorld(void) ;
   static void BroadcastWorld(int root,void* data, int bytes);
-  static void BarrierWorld(void);
   ////////////////////////////////////////
   // Reduction
@@ -131,21 +128,13 @@ public:
   template<class obj> void GlobalSum(obj &o){
     typedef typename obj::scalar_type scalar_type;
     int words = sizeof(obj)/sizeof(scalar_type);
-    scalar_type * ptr = (scalar_type *)& o; // Safe alias
+    scalar_type * ptr = (scalar_type *)& o;
     GlobalSumVector(ptr,words);
   }
   ////////////////////////////////////////////////////////////
   // Face exchange, buffer swap in translational invariant way
   ////////////////////////////////////////////////////////////
-  void CommsComplete(std::vector<CommsRequest_t> &list);
-  void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-                           void *xmit,
-                           int dest,
-                           void *recv,
-                           int from,
-                           int bytes,int dir);
   void SendToRecvFrom(void *xmit,
                       int xmit_to_rank,
                       void *recv,
@@ -153,17 +142,17 @@ public:
                       int bytes);
   double StencilSendToRecvFrom(void *xmit,
-                               int xmit_to_rank,int do_xmit,
+                               int xmit_to_rank,
                               void *recv,
-                               int recv_from_rank,int do_recv,
+                               int recv_from_rank,
                               int bytes,int dir);
   double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                    void *xmit,
-                                    int xmit_to_rank,int do_xmit,
+                                    int xmit_to_rank,
                                    void *recv,
-                                    int recv_from_rank,int do_recv,
-                                    int xbytes,int rbytes,int dir);
+                                    int recv_from_rank,
+                                    int bytes,int dir);
   void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
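// GlobalSum sketch (editor's illustration, comments only): the template above
// reinterprets any object as a flat array of its scalar_type, so for a 3x3
// colour matrix of ComplexD it issues a single GlobalSumVector over the 9
// complex words:
//   ColourMatrix m = ...;        // this rank's local contribution
//   communicator.GlobalSum(m);   // element-wise all-reduce across ranks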


@@ -106,7 +106,7 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
   // Remap using the shared memory optimising routine
   // The remap creates a comm which must be freed
   ////////////////////////////////////////////////////
-  GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm,_shm_processors);
+  GlobalSharedMemory::OptimalCommunicator (processors,optimal_comm);
   InitFromMPICommunicator(processors,optimal_comm);
   SetCommunicator(optimal_comm);
   ///////////////////////////////////////////////////
@@ -124,13 +124,12 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
   int parent_ndimension = parent._ndimension; assert(_ndimension >= parent._ndimension);
   Coordinate parent_processor_coor(_ndimension,0);
   Coordinate parent_processors    (_ndimension,1);
-  Coordinate shm_processors       (_ndimension,1);
   // Can make 5d grid from 4d etc...
   int pad = _ndimension-parent_ndimension;
   for(int d=0;d<parent_ndimension;d++){
     parent_processor_coor[pad+d]=parent._processor_coor[d];
     parent_processors    [pad+d]=parent._processors[d];
-    shm_processors       [pad+d]=parent._shm_processors[d];
   }
   //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -155,7 +154,6 @@ CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const
       ccoor[d] = parent_processor_coor[d] % processors[d];
       scoor[d] = parent_processor_coor[d] / processors[d];
       ssize[d] = parent_processors[d]     / processors[d];
-      if ( processors[d] < shm_processors[d] ) shm_processors[d] = processors[d]; // subnode splitting.
     }
     // rank within subcomm ; srank is rank of subcomm within blocks of subcomms
@@ -306,44 +304,6 @@ void CartesianCommunicator::GlobalSumVector(double *d,int N)
   int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
   assert(ierr==0);
 }
-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-                                                void *xmit,
-                                                int dest,
-                                                void *recv,
-                                                int from,
-                                                int bytes,int dir)
-{
-  MPI_Request xrq;
-  MPI_Request rrq;
-  assert(dest != _processor);
-  assert(from != _processor);
-  int tag;
-  tag= dir+from*32;
-  int ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator,&rrq);
-  assert(ierr==0);
-  list.push_back(rrq);
-  tag= dir+_processor*32;
-  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator,&xrq);
-  assert(ierr==0);
-  list.push_back(xrq);
-}
-void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list)
-{
-  int nreq=list.size();
-  if (nreq==0) return;
-  std::vector<MPI_Status> status(nreq);
-  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
-  assert(ierr==0);
-  list.resize(0);
-}
 // Basic Halo comms primitive
 void CartesianCommunicator::SendToRecvFrom(void *xmit,
                                           int dest,
@@ -375,23 +335,23 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 }
 // Basic Halo comms primitive
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
-                                                     int dest, int dox,
+                                                     int dest,
                                                     void *recv,
-                                                     int from, int dor,
+                                                     int from,
                                                     int bytes,int dir)
 {
   std::vector<CommsRequest_t> list;
-  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
+  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir);
   StencilSendToRecvFromComplete(list,dir);
   return offbytes;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                          void *xmit,
-                                                         int dest,int dox,
+                                                         int dest,
                                                          void *recv,
-                                                         int from,int dor,
-                                                         int xbytes,int rbytes,int dir)
+                                                         int from,
+                                                         int bytes,int dir)
 {
   int ncomm  =communicator_halo.size();
   int commdir=dir%ncomm;
@@ -410,34 +370,39 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
   double off_node_bytes=0.0;
   int tag;
-  if ( dor ) {
-    if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
-      tag= dir+from*32;
-      ierr=MPI_Irecv(recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
-      assert(ierr==0);
-      list.push_back(rrq);
-      off_node_bytes+=rbytes;
-    }
-  }
-  if (dox) {
-    if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
-      tag= dir+_processor*32;
-      ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
-      assert(ierr==0);
-      list.push_back(xrq);
-      off_node_bytes+=xbytes;
-    } else {
-      void *shm = (void *) this->ShmBufferTranslate(dest,recv);
-      assert(shm!=NULL);
-      acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
-    }
-  }
+  if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
+    tag= dir+from*32;
+    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
+    assert(ierr==0);
+    list.push_back(rrq);
+    off_node_bytes+=bytes;
+  }
+  if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
+    tag= dir+_processor*32;
+    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
+    assert(ierr==0);
+    list.push_back(xrq);
+    off_node_bytes+=bytes;
+  } else {
+    // TODO : make a OMP loop on CPU, call threaded bcopy
+    void *shm = (void *) this->ShmBufferTranslate(dest,recv);
+    assert(shm!=NULL);
+    // std::cout <<"acceleratorCopyDeviceToDeviceAsynch"<< std::endl;
+    acceleratorCopyDeviceToDeviceAsynch(xmit,shm,bytes);
+  }
+  //  if ( CommunicatorPolicy == CommunicatorPolicySequential ) {
+  //    this->StencilSendToRecvFromComplete(list,dir);
+  //  }
   return off_node_bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
 {
+  //  std::cout << "Copy Synchronised\n"<<std::endl;
+  acceleratorCopySynchronise();
   int nreq=list.size();
   if (nreq==0) return;
@@ -473,10 +438,6 @@ int CartesianCommunicator::RankWorld(void){
   MPI_Comm_rank(communicator_world,&r);
   return r;
 }
-void CartesianCommunicator::BarrierWorld(void){
-  int ierr = MPI_Barrier(communicator_world);
-  assert(ierr==0);
-}
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 {
   int ierr= MPI_Bcast(data,
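// Why the dox/dor flags dropped in this hunk (editor's note, comments only):
// on the develop side a rank may elide one leg of the halo exchange, e.g.
//   StencilSendToRecvFrom(xmit,dest,/*dox=*/0, recv,from,/*dor=*/1, bytes,dir);
// receives without posting a send, which the partitioned (Dirichlet) boundary
// support on develop appears to rely on; the older side always posts both legs.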


@@ -45,14 +45,12 @@ void CartesianCommunicator::Init(int *argc, char *** arv)
 CartesianCommunicator::CartesianCommunicator(const Coordinate &processors,const CartesianCommunicator &parent,int &srank)
   : CartesianCommunicator(processors)
 {
-  _shm_processors = Coordinate(processors.size(),1);
   srank=0;
   SetCommunicator(communicator_world);
 }
 CartesianCommunicator::CartesianCommunicator(const Coordinate &processors)
 {
-  _shm_processors = Coordinate(processors.size(),1);
   _processors = processors;
   _ndimension = processors.size();  assert(_ndimension>=1);
   _processor_coor.resize(_ndimension);
@@ -91,17 +89,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 {
   assert(0);
 }
-void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);}
-void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-                                                void *xmit,
-                                                int dest,
-                                                void *recv,
-                                                int from,
-                                                int bytes,int dir)
-{
-  assert(0);
-}
 void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,uint64_t bytes)
 {
   bcopy(in,out,bytes*words);
@@ -115,7 +102,6 @@ int  CartesianCommunicator::RankWorld(void){return 0;}
 void CartesianCommunicator::Barrier(void){}
 void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
-void CartesianCommunicator::BarrierWorld(void) { }
 int  CartesianCommunicator::RankFromProcessorCoor(Coordinate &coor) {  return 0;}
 void CartesianCommunicator::ProcessorCoorFromRank(int rank, Coordinate &coor){  coor = _processor_coor; }
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
@@ -125,21 +111,21 @@ void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest
 }
 double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
-                                                     int xmit_to_rank,int dox,
+                                                     int xmit_to_rank,
                                                     void *recv,
-                                                     int recv_from_rank,int dor,
+                                                     int recv_from_rank,
                                                     int bytes, int dir)
 {
   return 2.0*bytes;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                          void *xmit,
-                                                         int xmit_to_rank,int dox,
+                                                         int xmit_to_rank,
                                                          void *recv,
-                                                         int recv_from_rank,int dor,
-                                                         int xbytes,int rbytes, int dir)
+                                                         int recv_from_rank,
+                                                         int bytes, int dir)
 {
-  return xbytes+rbytes;
+  return 2.0*bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {


@@ -91,59 +91,6 @@ void *SharedMemory::ShmBufferSelf(void)
   //std::cerr << "ShmBufferSelf "<<ShmRank<<" "<<std::hex<< ShmCommBufs[ShmRank] <<std::dec<<std::endl;
   return ShmCommBufs[ShmRank];
 }
-static inline int divides(int a,int b)
-{
-  return ( b == ( (b/a)*a ) );
-}
-void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
-{
-  ////////////////////////////////////////////////////////////////
-  // Allow user to configure through environment variable
-  ////////////////////////////////////////////////////////////////
-  char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
-  if ( str ) {
-    std::vector<int> IntShmDims;
-    GridCmdOptionIntVector(std::string(str),IntShmDims);
-    assert(IntShmDims.size() == WorldDims.size());
-    long ShmSize = 1;
-    for (int dim=0;dim<WorldDims.size();dim++) {
-      ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
-      assert(divides(ShmDims[dim],WorldDims[dim]));
-    }
-    assert(ShmSize == WorldShmSize);
-    return;
-  }
-  ////////////////////////////////////////////////////////////////
-  // Powers of 2,3,5 only in prime decomposition for now
-  ////////////////////////////////////////////////////////////////
-  int ndimension = WorldDims.size();
-  ShmDims=Coordinate(ndimension,1);
-  std::vector<int> primes({2,3,5});
-  int dim = 0;
-  int last_dim = ndimension - 1;
-  int AutoShmSize = 1;
-  while(AutoShmSize != WorldShmSize) {
-    int p;
-    for(p=0;p<primes.size();p++) {
-      int prime=primes[p];
-      if ( divides(prime,WorldDims[dim]/ShmDims[dim])
-           && divides(prime,WorldShmSize/AutoShmSize)  ) {
-        AutoShmSize*=prime;
-        ShmDims[dim]*=prime;
-        last_dim = dim;
-        break;
-      }
-    }
-    if (p == primes.size() && last_dim == dim) {
-      std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
-      exit(EXIT_FAILURE);
-    }
-    dim=(dim+1) %ndimension;
-  }
-}
 NAMESPACE_END(Grid);
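// Usage note (editor's sketch): the GRID_SHM_DIMS_<Nd> variable parsed above
// pins the intra-node decomposition by hand; assuming the same dotted
// integer-list format Grid uses elsewhere on the command line (e.g. --mpi),
// a 16-rank node on a 4d grid might set the hypothetical value
//   GRID_SHM_DIMS_4=1.2.2.4
// Each entry must divide the matching world dimension and the product must
// equal the ranks per node, or the asserts above abort the job.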


@@ -93,10 +93,9 @@ public:
   // Create an optimal reordered communicator that makes MPI_Cart_create get it right
   //////////////////////////////////////////////////////////////////////////////////////
   static void Init(Grid_MPI_Comm comm); // Typically MPI_COMM_WORLD
-  // Turns MPI_COMM_WORLD into right layout for Cartesian
-  static void OptimalCommunicator            (const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims);
-  static void OptimalCommunicatorHypercube   (const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims);
-  static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &ShmDims);
+  static void OptimalCommunicator            (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
+  static void OptimalCommunicatorHypercube   (const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
+  static void OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm); // Turns MPI_COMM_WORLD into right layout for Cartesian
   static void GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims);
   ///////////////////////////////////////////////////
   // Provide shared memory facilities off comm world
View File

@ -27,8 +27,6 @@ Author: Christoph Lehner <christoph@lhnr.de>
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
#define Mheader "SharedMemoryMpi: "
#include <Grid/GridCore.h> #include <Grid/GridCore.h>
#include <pwd.h> #include <pwd.h>
@ -38,120 +36,12 @@ Author: Christoph Lehner <christoph@lhnr.de>
#ifdef GRID_HIP #ifdef GRID_HIP
#include <hip/hip_runtime_api.h> #include <hip/hip_runtime_api.h>
#endif #endif
#ifdef GRID_SYCL #ifdef GRID_SYCl
#define GRID_SYCL_LEVEL_ZERO_IPC
#include <syscall.h>
#define SHM_SOCKETS
#endif
#include <sys/socket.h> #endif
#include <sys/un.h>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
#define header "SharedMemoryMpi: "
#ifdef SHM_SOCKETS
/*
* Barbaric extra intranode communication route in case we need sockets to pass FDs
* Forced by level_zero not being nicely designed
*/
static int sock;
static const char *sock_path_fmt = "/tmp/GridUnixSocket.%d";
static char sock_path[256];
class UnixSockets {
public:
static void Open(int rank)
{
int errnum;
sock = socket(AF_UNIX, SOCK_DGRAM, 0); assert(sock>0);
struct sockaddr_un sa_un = { 0 };
sa_un.sun_family = AF_UNIX;
snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,rank);
unlink(sa_un.sun_path);
if (bind(sock, (struct sockaddr *)&sa_un, sizeof(sa_un))) {
perror("bind failure");
exit(EXIT_FAILURE);
}
}
static int RecvFileDescriptor(void)
{
int n;
int fd;
char buf[1];
struct iovec iov;
struct msghdr msg;
struct cmsghdr *cmsg;
char cms[CMSG_SPACE(sizeof(int))];
iov.iov_base = buf;
iov.iov_len = 1;
memset(&msg, 0, sizeof msg);
msg.msg_name = 0;
msg.msg_namelen = 0;
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_control = (caddr_t)cms;
msg.msg_controllen = sizeof cms;
if((n=recvmsg(sock, &msg, 0)) < 0) {
perror("recvmsg failed");
return -1;
}
if(n == 0){
perror("recvmsg returned 0");
return -1;
}
cmsg = CMSG_FIRSTHDR(&msg);
memmove(&fd, CMSG_DATA(cmsg), sizeof(int));
return fd;
}
static void SendFileDescriptor(int fildes,int xmit_to_rank)
{
struct msghdr msg;
struct iovec iov;
struct cmsghdr *cmsg = NULL;
char ctrl[CMSG_SPACE(sizeof(int))];
char data = ' ';
memset(&msg, 0, sizeof(struct msghdr));
memset(ctrl, 0, CMSG_SPACE(sizeof(int)));
iov.iov_base = &data;
iov.iov_len = sizeof(data);
sprintf(sock_path,sock_path_fmt,xmit_to_rank);
struct sockaddr_un sa_un = { 0 };
sa_un.sun_family = AF_UNIX;
snprintf(sa_un.sun_path, sizeof(sa_un.sun_path),sock_path_fmt,xmit_to_rank);
msg.msg_name = (void *)&sa_un;
msg.msg_namelen = sizeof(sa_un);
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_controllen = CMSG_SPACE(sizeof(int));
msg.msg_control = ctrl;
cmsg = CMSG_FIRSTHDR(&msg);
cmsg->cmsg_level = SOL_SOCKET;
cmsg->cmsg_type = SCM_RIGHTS;
cmsg->cmsg_len = CMSG_LEN(sizeof(int));
*((int *) CMSG_DATA(cmsg)) = fildes;
sendmsg(sock, &msg, 0);
};
};
#endif
/*Construct from an MPI communicator*/ /*Construct from an MPI communicator*/
void GlobalSharedMemory::Init(Grid_MPI_Comm comm) void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
{ {
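// Flow sketch for the socket helper removed above (editor's note, comments
// only): each rank binds its own datagram socket with UnixSockets::Open(rank),
// giving /tmp/GridUnixSocket.<rank>; the rank owning a level-zero IPC handle
// then calls
//   UnixSockets::SendFileDescriptor(fd, peer_rank);
// and the peer recovers a process-local duplicate via
//   int fd = UnixSockets::RecvFileDescriptor();
// The SCM_RIGHTS control message asks the kernel to re-install the descriptor
// in the receiving process, something plain MPI byte transport cannot do.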
@@ -174,8 +64,8 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
   MPI_Comm_size(WorldShmComm     ,&WorldShmSize);
   if ( WorldRank == 0) {
-    std::cout << Mheader " World communicator of size " <<WorldSize << std::endl;
-    std::cout << Mheader " Node  communicator of size " <<WorldShmSize << std::endl;
+    std::cout << header " World communicator of size " <<WorldSize << std::endl;
+    std::cout << header " Node  communicator of size " <<WorldShmSize << std::endl;
   }
   // WorldShmComm, WorldShmSize, WorldShmRank
@@ -262,7 +152,7 @@ int Log2Size(int TwoToPower,int MAXLOG2)
   }
   return log2size;
 }
-void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
+void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
 {
   //////////////////////////////////////////////////////////////////////////////
   // Look and see if it looks like an HPE 8600 based on hostname conventions
@@ -275,11 +165,63 @@ void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_M
   gethostname(name,namelen);
   int nscan = sscanf(name,"r%di%dn%d",&R,&I,&N) ;
-  if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm,SHM);
-  else                          OptimalCommunicatorSharedMemory(processors,optimal_comm,SHM);
+  if(nscan==3 && HPEhypercube ) OptimalCommunicatorHypercube(processors,optimal_comm);
+  else                          OptimalCommunicatorSharedMemory(processors,optimal_comm);
 }
+static inline int divides(int a,int b)
+{
+  return ( b == ( (b/a)*a ) );
+}
+void GlobalSharedMemory::GetShmDims(const Coordinate &WorldDims,Coordinate &ShmDims)
+{
+  ////////////////////////////////////////////////////////////////
+  // Allow user to configure through environment variable
+  ////////////////////////////////////////////////////////////////
+  char* str = getenv(("GRID_SHM_DIMS_" + std::to_string(ShmDims.size())).c_str());
+  if ( str ) {
+    std::vector<int> IntShmDims;
+    GridCmdOptionIntVector(std::string(str),IntShmDims);
+    assert(IntShmDims.size() == WorldDims.size());
+    long ShmSize = 1;
+    for (int dim=0;dim<WorldDims.size();dim++) {
+      ShmSize *= (ShmDims[dim] = IntShmDims[dim]);
+      assert(divides(ShmDims[dim],WorldDims[dim]));
+    }
+    assert(ShmSize == WorldShmSize);
+    return;
+  }
+  ////////////////////////////////////////////////////////////////
+  // Powers of 2,3,5 only in prime decomposition for now
+  ////////////////////////////////////////////////////////////////
+  int ndimension = WorldDims.size();
+  ShmDims=Coordinate(ndimension,1);
+  std::vector<int> primes({2,3,5});
+  int dim = 0;
+  int last_dim = ndimension - 1;
+  int AutoShmSize = 1;
+  while(AutoShmSize != WorldShmSize) {
+    int p;
+    for(p=0;p<primes.size();p++) {
+      int prime=primes[p];
+      if ( divides(prime,WorldDims[dim]/ShmDims[dim])
+           && divides(prime,WorldShmSize/AutoShmSize)  ) {
+        AutoShmSize*=prime;
+        ShmDims[dim]*=prime;
+        last_dim = dim;
+        break;
+      }
+    }
+    if (p == primes.size() && last_dim == dim) {
+      std::cerr << "GlobalSharedMemory::GetShmDims failed" << std::endl;
+      exit(EXIT_FAILURE);
+    }
+    dim=(dim+1) %ndimension;
+  }
+}
-void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
+void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
 {
   ////////////////////////////////////////////////////////////////
   // Assert power of two shm_size.
@@ -352,8 +294,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
   Coordinate HyperCoor(ndimension);
   GetShmDims(WorldDims,ShmDims);
-  SHM = ShmDims;
-
   ////////////////////////////////////////////////////////////////
   // Establish torus of processes and nodes with sub-blockings
   ////////////////////////////////////////////////////////////////
@ -400,7 +341,7 @@ void GlobalSharedMemory::OptimalCommunicatorHypercube(const Coordinate &processo
int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm); int ierr= MPI_Comm_split(WorldComm,0,rank,&optimal_comm);
assert(ierr==0); assert(ierr==0);
} }
void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM) void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{ {
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Identify subblock of ranks on node spreading across dims // Identify subblock of ranks on node spreading across dims
@ -412,8 +353,6 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
Coordinate ShmCoor(ndimension); Coordinate NodeCoor(ndimension); Coordinate WorldCoor(ndimension); Coordinate ShmCoor(ndimension); Coordinate NodeCoor(ndimension); Coordinate WorldCoor(ndimension);
GetShmDims(WorldDims,ShmDims); GetShmDims(WorldDims,ShmDims);
SHM=ShmDims;
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
// Establish torus of processes and nodes with sub-blockings // Establish torus of processes and nodes with sub-blockings
//////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////
@ -452,7 +391,7 @@ void GlobalSharedMemory::OptimalCommunicatorSharedMemory(const Coordinate &proce
#ifdef GRID_MPI3_SHMGET
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
  std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
  std::cout << header "SharedMemoryAllocate "<< bytes<< " shmget implementation "<<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);
@ -537,7 +476,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    exit(EXIT_FAILURE);
  }
  std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
  std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
	    << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
  SharedMemoryZero(ShmCommBuf,bytes);
@ -580,21 +519,16 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    exit(EXIT_FAILURE);
  }
  if ( WorldRank == 0 ){
    std::cout << WorldRank << Mheader " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
    std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
	      << "bytes at "<< std::hex<< ShmCommBuf << " - "<<(bytes-1+(uint64_t)ShmCommBuf) <<std::dec<<" for comms buffers " <<std::endl;
	      << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
  }
  SharedMemoryZero(ShmCommBuf,bytes);
  std::cout<< "Setting up IPC"<<std::endl;
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
  // Loop over ranks/gpu's on our node
  ///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef SHM_SOCKETS
UnixSockets::Open(WorldShmRank);
#endif
  for(int r=0;r<WorldShmSize;r++){
MPI_Barrier(WorldShmComm);
#ifndef GRID_MPI3_SHM_NONE
    //////////////////////////////////////////////////
    // If it is me, pass around the IPC access key
@ -602,32 +536,24 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
void * thisBuf = ShmCommBuf; void * thisBuf = ShmCommBuf;
if(!Stencil_force_mpi) { if(!Stencil_force_mpi) {
#ifdef GRID_SYCL_LEVEL_ZERO_IPC #ifdef GRID_SYCL_LEVEL_ZERO_IPC
typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t; typedef struct { int fd; pid_t pid ; } clone_mem_t;
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device()); auto zeDevice = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
auto zeContext = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context()); auto zeContext = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
ze_ipc_mem_handle_t ihandle; ze_ipc_mem_handle_t ihandle;
clone_mem_t handle; clone_mem_t handle;
if ( r==WorldShmRank ) { if ( r==WorldShmRank ) {
auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle); auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&ihandle);
if ( err != ZE_RESULT_SUCCESS ) { if ( err != ZE_RESULT_SUCCESS ) {
std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} else { } else {
std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl; std::cout << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
} }
memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int)); memcpy((void *)&handle.fd,(void *)&ihandle,sizeof(int));
handle.pid = getpid(); handle.pid = getpid();
memcpy((void *)&handle.ze,(void *)&ihandle,sizeof(ihandle));
#ifdef SHM_SOCKETS
for(int rr=0;rr<WorldShmSize;rr++){
if(rr!=r){
UnixSockets::SendFileDescriptor(handle.fd,rr);
}
}
#endif
    }
#endif
#ifdef GRID_CUDA
@ -655,7 +581,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    // Share this IPC handle across the Shm Comm
    //////////////////////////////////////////////////
    {
MPI_Barrier(WorldShmComm);
      int ierr=MPI_Bcast(&handle,
			 sizeof(handle),
			 MPI_BYTE,
@ -671,10 +596,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
      if ( r!=WorldShmRank ) {
	thisBuf = nullptr;
int myfd;
#ifdef SHM_SOCKETS
myfd=UnixSockets::RecvFileDescriptor();
#else
	std::cout<<"mapping seeking remote pid/fd "
		 <<handle.pid<<"/"
		 <<handle.fd<<std::endl;
@ -682,22 +603,16 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
	int pidfd = syscall(SYS_pidfd_open,handle.pid,0);
	std::cout<<"Using IpcHandle pidfd "<<pidfd<<"\n";
	//      int myfd  = syscall(SYS_pidfd_getfd,pidfd,handle.fd,0);
	myfd  = syscall(438,pidfd,handle.fd,0);
	int myfd = syscall(438,pidfd,handle.fd,0);
int err_t = errno;
	if (myfd < 0) {
	std::cout<<"Using IpcHandle myfd "<<myfd<<"\n";
fprintf(stderr,"pidfd_getfd returned %d errno was %d\n", myfd,err_t); fflush(stderr);
perror("pidfd_getfd failed ");
assert(0);
}
#endif
std::cout<<"Using IpcHandle mapped remote pid "<<handle.pid <<" FD "<<handle.fd <<" to myfd "<<myfd<<"\n";
memcpy((void *)&ihandle,(void *)&handle.ze,sizeof(ihandle));
	memcpy((void *)&ihandle,(void *)&myfd,sizeof(int));
	auto err = zeMemOpenIpcHandle(zeContext,zeDevice,ihandle,0,&thisBuf);
	if ( err != ZE_RESULT_SUCCESS ) {
	  std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
	  std::cout << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
	  std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
	  std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
	  exit(EXIT_FAILURE);
	} else {
	  std::cout << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<std::endl;
@ -732,7 +647,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#else
    WorldShmCommBufs[r] = ShmCommBuf;
#endif
MPI_Barrier(WorldShmComm);
  }
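Note: the remote-map path above pulls a peer rank's device file descriptor with the pidfd_open/pidfd_getfd pairing. A hedged minimal sketch of that mechanism on its own (Linux >= 5.6; raw syscall number 438 as in the code above; `steal_fd` is an illustrative name, and real code needs ptrace permission and error handling):

    // Sketch: duplicate file descriptor `theirfd`, owned by process `pid`,
    // into the calling process -- the trick the IPC exchange above relies on.
    #include <sys/syscall.h>
    #include <unistd.h>
    int steal_fd(pid_t pid, int theirfd)
    {
      int pidfd = syscall(SYS_pidfd_open, pid, 0);
      if (pidfd < 0) return -1;
      int myfd = syscall(438 /* pidfd_getfd */, pidfd, theirfd, 0);
      close(pidfd);
      return myfd;   // local fd referring to the peer's open description
    }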
  _ShmAllocBytes=bytes;
@ -744,7 +658,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_MPI3_SHMMMAP
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
  std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
  std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP implementation "<< GRID_SHM_PATH <<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -781,7 +695,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
    assert(((uint64_t)ptr&0x3F)==0);
    close(fd);
    WorldShmCommBufs[r] =ptr;
    // std::cout << Mheader "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
    // std::cout << header "Set WorldShmCommBufs["<<r<<"]="<<ptr<< "("<< bytes<< "bytes)"<<std::endl;
  }
  _ShmAlloc=1;
  _ShmAllocBytes = bytes;
@ -791,7 +705,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_MPI3_SHM_NONE
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
  std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
  std::cout << header "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -838,7 +752,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
  ////////////////////////////////////////////////////////////////////////////////////////////
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
{
  std::cout << Mheader "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
  std::cout << header "SharedMemoryAllocate "<< bytes<< " SHMOPEN implementation "<<std::endl;
  assert(_ShmSetup==1);
  assert(_ShmAlloc==0);
  MPI_Barrier(WorldShmComm);
@ -48,10 +48,9 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
  _ShmSetup=1;
}
void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm,Coordinate &SHM)
void GlobalSharedMemory::OptimalCommunicator(const Coordinate &processors,Grid_MPI_Comm & optimal_comm)
{
  optimal_comm = WorldComm;
  SHM = Coordinate(processors.size(),1);
}
////////////////////////////////////////////////////////////////////////////////////////////
@ -297,30 +297,6 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
    }
  }
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
template <typename T>
T iDivUp(T a, T b) // Round a / b to nearest higher integer value
{ return (a % b != 0) ? (a / b + 1) : (a / b); }
template <typename T>
__global__ void populate_Cshift_table(T* vector, T lo, T ro, T e1, T e2, T stride)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx >= e1*e2) return;
int n, b, o;
n = idx / e2;
b = idx % e2;
o = n*stride + b;
vector[2*idx + 0] = lo + o;
vector[2*idx + 1] = ro + o;
}
#endif
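Note: the kernel above fills the same offset table that the scalar loops below build. A host-side sketch of what each entry holds (`make_table` is a hypothetical free function mirroring the kernel's index arithmetic):

    // Sketch: entry idx of the table holds the (lo,ro) plane offsets for
    // block (n,b), with n = idx/e2, b = idx%e2, o = n*stride + b.
    #include <utility>
    #include <vector>
    std::vector<std::pair<int,int>> make_table(int lo,int ro,int e1,int e2,int stride)
    {
      std::vector<std::pair<int,int>> tab(e1*e2);
      for(int idx=0; idx<e1*e2; idx++){
        int n = idx / e2, b = idx % e2;
        int o = n*stride + b;
        tab[idx] = { lo+o, ro+o };
      }
      return tab;
    }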
//////////////////////////////////////////////////////
// local to node block strided copies
//////////////////////////////////////////////////////
@ -345,20 +321,12 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
  int ent=0;
  if(cbmask == 0x3 ){
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
ent = e1*e2;
dim3 blockSize(acceleratorThreads());
dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
accelerator_barrier();
#else
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
	int o   =n*stride+b;
	Cshift_table[ent++] = std::pair<int,int>(lo+o,ro+o);
      }
    }
#endif
  } else {
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
@ -409,19 +377,11 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
  int ent=0;
  if ( cbmask == 0x3 ) {
#if (defined(GRID_CUDA) || defined(GRID_HIP)) && defined(ACCELERATOR_CSHIFT)
ent = e1*e2;
dim3 blockSize(acceleratorThreads());
dim3 gridSize(iDivUp((unsigned int)ent, blockSize.x));
populate_Cshift_table<<<gridSize, blockSize>>>(&Cshift_table[0].first, lo, ro, e1, e2, stride);
accelerator_barrier();
#else
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
	int o  =n*stride;
	Cshift_table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
    }}
#endif
  } else {
    for(int n=0;n<e1;n++){
      for(int b=0;b<e2;b++){
@ -47,4 +47,3 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/lattice/Lattice_transfer.h>
#include <Grid/lattice/Lattice_basis.h>
#include <Grid/lattice/Lattice_crc.h>
#include <Grid/lattice/PaddedCell.h>
@ -63,7 +63,7 @@ accelerator_inline vobj predicatedWhere(const iobj &predicate,
  typename std::remove_const<vobj>::type ret;
  typedef typename vobj::scalar_object scalar_object;
  //  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  const int Nsimd = vobj::vector_type::Nsimd();
@ -345,9 +345,7 @@ GridUnopClass(UnaryNot, Not(a));
GridUnopClass(UnaryTrace, trace(a));
GridUnopClass(UnaryTranspose, transpose(a));
GridUnopClass(UnaryTa, Ta(a));
GridUnopClass(UnarySpTa, SpTa(a));
GridUnopClass(UnaryProjectOnGroup, ProjectOnGroup(a));
GridUnopClass(UnaryProjectOnSpGroup, ProjectOnSpGroup(a));
GridUnopClass(UnaryTimesI, timesI(a));
GridUnopClass(UnaryTimesMinusI, timesMinusI(a));
GridUnopClass(UnaryAbs, abs(a));
@ -458,9 +456,7 @@ GRID_DEF_UNOP(operator!, UnaryNot);
GRID_DEF_UNOP(trace, UnaryTrace);
GRID_DEF_UNOP(transpose, UnaryTranspose);
GRID_DEF_UNOP(Ta, UnaryTa);
GRID_DEF_UNOP(SpTa, UnarySpTa);
GRID_DEF_UNOP(ProjectOnGroup, UnaryProjectOnGroup);
GRID_DEF_UNOP(ProjectOnSpGroup, UnaryProjectOnSpGroup);
GRID_DEF_UNOP(timesI, UnaryTimesI);
GRID_DEF_UNOP(timesMinusI, UnaryTimesMinusI);
GRID_DEF_UNOP(abs, UnaryAbs);  // abs overloaded in cmath C++98; DON'T do the
@ -36,7 +36,6 @@ NAMESPACE_BEGIN(Grid);
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("mult");
  ret.Checkerboard() = lhs.Checkerboard();
  autoView( ret_v , ret, AcceleratorWrite);
  autoView( lhs_v , lhs, AcceleratorRead);
@ -54,7 +53,6 @@ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("mac");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
@ -72,7 +70,6 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("sub");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
@ -89,7 +86,6 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
}
template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("add");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,rhs);
  conformable(lhs,rhs);
@ -110,7 +106,6 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  GRID_TRACE("mult");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(lhs,ret);
  autoView( ret_v , ret, AcceleratorWrite);
@ -124,7 +119,6 @@ void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  GRID_TRACE("mac");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,lhs);
  autoView( ret_v , ret, AcceleratorWrite);
@ -139,7 +133,6 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  GRID_TRACE("sub");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(ret,lhs);
  autoView( ret_v , ret, AcceleratorWrite);
@ -153,7 +146,6 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
}
template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
  GRID_TRACE("add");
  ret.Checkerboard() = lhs.Checkerboard();
  conformable(lhs,ret);
  autoView( ret_v , ret, AcceleratorWrite);
@ -171,7 +163,6 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
//////////////////////////////////////////////////////////////////////////////////////////////////////
template<class obj1,class obj2,class obj3> inline
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("mult");
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  autoView( ret_v , ret, AcceleratorWrite);
@ -186,7 +177,6 @@ void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
template<class obj1,class obj2,class obj3> inline
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("mac");
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  autoView( ret_v , ret, AcceleratorWrite);
@ -201,7 +191,6 @@ void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
template<class obj1,class obj2,class obj3> inline
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("sub");
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  autoView( ret_v , ret, AcceleratorWrite);
@ -215,7 +204,6 @@ void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
}
template<class obj1,class obj2,class obj3> inline
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
  GRID_TRACE("add");
  ret.Checkerboard() = rhs.Checkerboard();
  conformable(ret,rhs);
  autoView( ret_v , ret, AcceleratorWrite);
@ -230,7 +218,6 @@ void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
template<class sobj,class vobj> inline
void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y){
  GRID_TRACE("axpy");
  ret.Checkerboard() = x.Checkerboard();
  conformable(ret,x);
  conformable(x,y);
@ -244,7 +231,6 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
}
template<class sobj,class vobj> inline
void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y){
  GRID_TRACE("axpby");
  ret.Checkerboard() = x.Checkerboard();
  conformable(ret,x);
  conformable(x,y);
@ -260,13 +246,11 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
template<class sobj,class vobj> inline
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
  GRID_TRACE("axpy_norm");
  return axpy_norm_fast(ret,a,x,y);
}
template<class sobj,class vobj> inline
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
  GRID_TRACE("axpby_norm");
  return axpby_norm_fast(ret,a,b,x,y);
}
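Note: the GRID_TRACE lines carried on one side of this diff mark scoped profiler regions around each arithmetic entry point. A sketch of the usual RAII shape such a macro expands to -- illustrative only, not Grid's actual definition:

    // Sketch: a scoped trace region -- enters a named range on construction
    // and leaves it on destruction, so one macro line covers every return path.
    #include <cstdio>
    struct TraceRegion {
      const char *name;
      TraceRegion(const char *n) : name(n) { printf("enter %s\n", name); }
      ~TraceRegion()                       { printf("leave %s\n", name); }
    };
    #define MY_TRACE(n) TraceRegion _trace_region_(n)
    void axpy_like(void){
      MY_TRACE("axpy");
      /* ... body; the region closes automatically ... */
    }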
@ -117,7 +117,6 @@ public:
  ////////////////////////////////////////////////////////////////////////////////
  template <typename Op, typename T1> inline Lattice<vobj> & operator=(const LatticeUnaryExpression<Op,T1> &expr)
  {
    GRID_TRACE("ExpressionTemplateEval");
    GridBase *egrid(nullptr);
    GridFromExpression(egrid,expr);
    assert(egrid!=nullptr);
@ -130,7 +129,7 @@ public:
    auto exprCopy = expr;
    ExpressionViewOpen(exprCopy);
    auto me  = View(AcceleratorWriteDiscard);
    auto me  = View(AcceleratorWrite);
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
      auto tmp = eval(ss,exprCopy);
      coalescedWrite(me[ss],tmp);
@ -141,7 +140,6 @@ public:
  }
  template <typename Op, typename T1,typename T2> inline Lattice<vobj> & operator=(const LatticeBinaryExpression<Op,T1,T2> &expr)
  {
    GRID_TRACE("ExpressionTemplateEval");
    GridBase *egrid(nullptr);
    GridFromExpression(egrid,expr);
    assert(egrid!=nullptr);
@ -154,7 +152,7 @@ public:
    auto exprCopy = expr;
    ExpressionViewOpen(exprCopy);
    auto me  = View(AcceleratorWriteDiscard);
    auto me  = View(AcceleratorWrite);
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
      auto tmp = eval(ss,exprCopy);
      coalescedWrite(me[ss],tmp);
@ -165,7 +163,6 @@ public:
  }
  template <typename Op, typename T1,typename T2,typename T3> inline Lattice<vobj> & operator=(const LatticeTrinaryExpression<Op,T1,T2,T3> &expr)
  {
    GRID_TRACE("ExpressionTemplateEval");
    GridBase *egrid(nullptr);
    GridFromExpression(egrid,expr);
    assert(egrid!=nullptr);
@ -177,7 +174,7 @@ public:
    this->checkerboard=cb;
    auto exprCopy = expr;
    ExpressionViewOpen(exprCopy);
    auto me  = View(AcceleratorWriteDiscard);
    auto me  = View(AcceleratorWrite);
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
      auto tmp = eval(ss,exprCopy);
      coalescedWrite(me[ss],tmp);
@ -248,7 +245,7 @@ public:
  ///////////////////////////////////////////
  // user defined constructor
  ///////////////////////////////////////////
  Lattice(GridBase *grid,ViewMode mode=AcceleratorWriteDiscard) {
  Lattice(GridBase *grid,ViewMode mode=AcceleratorWrite) {
    this->_grid = grid;
    resize(this->_grid->oSites());
    assert((((uint64_t)&this->_odata[0])&0xF) ==0);
@ -291,8 +288,8 @@ public:
    typename std::enable_if<!std::is_same<robj,vobj>::value,int>::type i=0;
    conformable(*this,r);
    this->checkerboard = r.Checkerboard();
    auto me =   View(AcceleratorWrite);
    auto him= r.View(AcceleratorRead);
    auto me =   View(AcceleratorWriteDiscard);
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
      coalescedWrite(me[ss],him(ss));
    });
@ -306,8 +303,8 @@ public:
  inline Lattice<vobj> & operator = (const Lattice<vobj> & r){
    this->checkerboard = r.Checkerboard();
    conformable(*this,r);
    auto me =  View(AcceleratorWrite);
    auto him= r.View(AcceleratorRead);
    auto me =  View(AcceleratorWriteDiscard);
    accelerator_for(ss,me.size(),vobj::Nsimd(),{
      coalescedWrite(me[ss],him(ss));
    });
@ -360,7 +357,7 @@ public:
template<class vobj> std::ostream& operator<< (std::ostream& stream, const Lattice<vobj> &o){
  typedef typename vobj::scalar_object sobj;
  for(int64_t g=0;g<o.Grid()->_gsites;g++){
  for(int g=0;g<o.Grid()->_gsites;g++){
    Coordinate gcoor;
    o.Grid()->GlobalIndexToGlobalCoor(g,gcoor);
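Note: the operator= overloads above walk a lazily built expression tree once per site via eval(ss,expr). A minimal self-contained analogue of that pattern, with toy types that are not Grid's:

    // Sketch: a two-operand expression template. operator+ only builds a node;
    // assignment walks every site once, fusing the loop exactly as above.
    #include <cstdio>
    #include <vector>
    template<class L,class R> struct AddExpr {
      const L &l; const R &r;
      double eval(size_t ss) const { return l.eval(ss) + r.eval(ss); }
    };
    struct Field {
      std::vector<double> v;
      Field(size_t n,double x=0):v(n,x){}
      double eval(size_t ss) const { return v[ss]; }
      template<class E> Field &operator=(const E &e){
        for(size_t ss=0; ss<v.size(); ss++) v[ss]=e.eval(ss); // one fused pass
        return *this;
      }
    };
    template<class L,class R> AddExpr<L,R> operator+(const L&l,const R&r){ return {l,r}; }
    int main(void){
      Field a(4,1.0), b(4,2.0), c(4);
      c = a + b;              // no temporaries, a single loop
      printf("%g\n", c.v[0]); // 3
    }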
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
template<class vobj> void DumpSliceNorm(std::string s,const Lattice<vobj> &f,int mu=-1)
template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1)
{
  auto ff = localNorm2(f);
  if ( mu==-1 ) mu = f.Grid()->Nd()-1;
@ -32,6 +32,7 @@ template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
{
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@ -81,6 +82,7 @@ template<class vobj>
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
{
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@ -128,6 +130,7 @@ template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
{
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  GridBase *FullGrid = lhs.Grid();
@ -96,6 +96,9 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
  GridBase *grid=l.Grid();
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  int Nsimd = grid->Nsimd();
  assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
@ -122,17 +125,14 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
//////////////////////////////////////////////////////////
// Peek a scalar object from the SIMD array
//////////////////////////////////////////////////////////
template<class vobj>
typename vobj::scalar_object peekSite(const Lattice<vobj> &l,const Coordinate &site){
  typename vobj::scalar_object s;
  peekSite(s,l,site);
  return s;
}
template<class vobj,class sobj>
void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
  GridBase *grid=l.Grid();
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  int Nsimd = grid->Nsimd();
  assert( l.Checkerboard() == l.Grid()->CheckerBoard(site));
@ -173,11 +173,11 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
  idx= grid->iIndex(site);
  odx= grid->oIndex(site);
  const vector_type *vp = (const vector_type *) &l[odx];
  scalar_type * vp = (scalar_type *)&l[odx];
  scalar_type * pt = (scalar_type *)&s;
  for(int w=0;w<words;w++){
    pt[w] = getlane(vp[w],idx);
    pt[w] = vp[idx+w*Nsimd];
  }
  return;
@ -210,10 +210,10 @@ inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
  idx= grid->iIndex(site);
  odx= grid->oIndex(site);
  vector_type * vp = (vector_type *)&l[odx];
  scalar_type * vp = (scalar_type *)&l[odx];
  scalar_type * pt = (scalar_type *)&s;
  for(int w=0;w<words;w++){
    putlane(vp[w],pt[w],idx);
    vp[idx+w*Nsimd] = pt[w];
  }
  return;
};
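Note: both variants above pull out the same scalars; one hides the stride behind getlane/putlane, the other indexes the flattened lane array directly. A toy illustration of why `vp[idx + w*Nsimd]` selects lane `idx` of SIMD word `w` (plain doubles standing in for Grid's vector types):

    // Sketch: memory of one vector word is [lane0 lane1 ... laneN-1], so an
    // object made of `words` vector words stores scalar (w,lane) at lane + w*Nsimd.
    #include <cstdio>
    int main(void){
      const int Nsimd=4, words=2;
      double obj[words*Nsimd];
      for(int w=0;w<words;w++)
        for(int l=0;l<Nsimd;l++)
          obj[w*Nsimd+l] = 10*w + l;        // value encodes (word,lane)
      int idx=2;                            // the SIMD lane of this site
      for(int w=0;w<words;w++)
        printf("%g ", obj[idx + w*Nsimd]);  // prints: 2 12  (lane 2 of each word)
      printf("\n");
    }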
@ -94,7 +94,10 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
  for(int i=0;i<nthread;i++){
    ssum = ssum+sumarray[i];
  }
  return ssum;
  typedef typename vobj::scalar_object ssobj;
  ssobj ret = ssum;
  return ret;
}
/*
Threaded max, don't use for now
@ -153,44 +156,33 @@ inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
}
template<class vobj>
inline typename vobj::scalar_object rankSum(const Lattice<vobj> &arg)
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{
  Integer osites = arg.Grid()->oSites();
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
  typename vobj::scalar_object ssum;
  autoView( arg_v, arg, AcceleratorRead);
  return sum_gpu(&arg_v[0],osites);
  ssum= sum_gpu(&arg_v[0],osites);
#else
  autoView(arg_v, arg, CpuRead);
  return sum_cpu(&arg_v[0],osites);
  auto ssum= sum_cpu(&arg_v[0],osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
{
auto ssum = rankSum(arg);
  arg.Grid()->GlobalSum(ssum);
  return ssum;
}
template<class vobj>
inline typename vobj::scalar_object rankSumLarge(const Lattice<vobj> &arg)
inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
{
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
  autoView( arg_v, arg, AcceleratorRead);
  Integer osites = arg.Grid()->oSites();
  return sum_gpu_large(&arg_v[0],osites);
  auto ssum= sum_gpu_large(&arg_v[0],osites);
#else
  autoView(arg_v, arg, CpuRead);
  Integer osites = arg.Grid()->oSites();
  return sum_cpu(&arg_v[0],osites);
  auto ssum= sum_cpu(&arg_v[0],osites);
#endif
}
template<class vobj>
inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
{
auto ssum = rankSumLarge(arg);
  arg.Grid()->GlobalSum(ssum);
  return ssum;
}
@ -233,6 +225,7 @@ template<class vobj> inline RealD maxLocalNorm2(const Lattice<vobj> &arg)
template<class vobj>
inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
{
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_typeD vector_type;
  ComplexD  nrm;
@ -242,7 +235,6 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
  const uint64_t sites = grid->oSites();
  // Might make all code paths go this way.
#if 0
  typedef decltype(innerProductD(vobj(),vobj())) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
@ -251,31 +243,15 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
    autoView( left_v , left, AcceleratorRead);
    autoView( right_v,right, AcceleratorRead);
    // This code could read coalesce
    // GPU - SIMT lane compliance...
    accelerator_for( ss, sites, nsimd,{
    accelerator_for( ss, sites, 1,{
      auto x_l = left_v(ss);
      auto x_l = left_v[ss];
      auto y_l = right_v(ss);
      auto y_l = right_v[ss];
      coalescedWrite(inner_tmp_v[ss],innerProductD(x_l,y_l));
      inner_tmp_v[ss]=innerProductD(x_l,y_l);
    });
  }
#else
typedef decltype(innerProduct(vobj(),vobj())) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
{
autoView( left_v , left, AcceleratorRead);
autoView( right_v,right, AcceleratorRead);
// GPU - SIMT lane compliance...
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v(ss);
auto y_l = right_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
});
}
#endif
// This is in single precision and fails some tests // This is in single precision and fails some tests
  auto anrm = sumD(inner_tmp_v,sites);
  auto anrm = sum(inner_tmp_v,sites);
  nrm = anrm;
  return nrm;
}
@ -308,7 +284,8 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
  conformable(z,x);
  conformable(x,y);
  //  typedef typename vobj::vector_typeD vector_type;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_typeD vector_type;
  RealD  nrm;
  GridBase *grid = x.Grid();
@ -320,29 +297,17 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
  autoView( x_v, x, AcceleratorRead);
  autoView( y_v, y, AcceleratorRead);
  autoView( z_v, z, AcceleratorWrite);
#if 0
  typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
  Vector<inner_t> inner_tmp(sites);
  auto inner_tmp_v = &inner_tmp[0];
  accelerator_for( ss, sites, nsimd,{
  accelerator_for( ss, sites, 1,{
    auto tmp = a*x_v(ss)+b*y_v(ss);
    auto tmp = a*x_v[ss]+b*y_v[ss];
    coalescedWrite(inner_tmp_v[ss],innerProductD(tmp,tmp));
    inner_tmp_v[ss]=innerProductD(tmp,tmp);
    coalescedWrite(z_v[ss],tmp);
    z_v[ss]=tmp;
  });
  nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
#else
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
coalescedWrite(z_v[ss],tmp);
});
nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
#endif
  grid->GlobalSum(nrm);
  return nrm;
}
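Note: the two code paths in this hunk differ mainly in whether per-site inner products are accumulated in single or double precision before the global sum. A tiny self-contained demonstration of why the accumulator width matters:

    // Sketch: summing ten million O(1) terms. The float accumulator saturates
    // once it reaches 2^24, the double accumulator stays exact.
    #include <cstdio>
    int main(void){
      float  fsum=0.f;
      double dsum=0.0;
      for(int i=0;i<10000000;i++){ fsum += 1.0f; dsum += 1.0; }
      printf("float  sum = %.1f\n", fsum);  // 16777216.0 -- stuck at 2^24
      printf("double sum = %.1f\n", dsum);  // 10000000.0 -- exact
    }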
@ -352,6 +317,7 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
{
  conformable(left,right);
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_typeD vector_type;
  Vector<ComplexD> tmp(2);
@ -495,14 +461,6 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
  int words = fd*sizeof(sobj)/sizeof(scalar_type);
  grid->GlobalSumVector(ptr, words);
}
template<class vobj> inline
std::vector<typename vobj::scalar_object>
sliceSum(const Lattice<vobj> &Data,int orthogdim)
{
std::vector<typename vobj::scalar_object> result;
sliceSum(Data,result,orthogdim);
return result;
}
template<class vobj>
static void sliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
@ -607,8 +565,7 @@ static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Ortho
template<class vobj>
static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
			    int orthogdim,RealD scale=1.0)
{
  // perhaps easier to just promote A to a field and use regular madd
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
@ -639,7 +596,8 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
    for(int l=0;l<Nsimd;l++){
      grid->iCoorFromIindex(icoor,l);
      int ldx =r+icoor[orthogdim]*rd;
      av.putlane(scalar_type(a[ldx])*zscale,l);
      scalar_type *as =(scalar_type *)&av;
      as[l] = scalar_type(a[ldx])*zscale;
    }
    tensor_reduced at; at=av;
@ -679,6 +637,7 @@ template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
{
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@ -732,6 +691,7 @@ template<class vobj>
static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
{
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@ -785,6 +745,7 @@ template<class vobj>
static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
{
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  GridBase *FullGrid = lhs.Grid();
@ -211,25 +211,13 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
  assert(ok);
  Integer smemSize = numThreads * sizeof(sobj);
// Move out of UVM
// Turns out I had messed up the synchronise after move to compute stream
// as running this on the default stream fools the synchronise
#undef UVM_BLOCK_BUFFER
#ifndef UVM_BLOCK_BUFFER
commVector<sobj> buffer(numBlocks);
sobj *buffer_v = &buffer[0];
sobj result;
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
accelerator_barrier();
acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
#else
  Vector<sobj> buffer(numBlocks);
  sobj *buffer_v = &buffer[0];
  sobj result;
  reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
  reduceKernel<<< numBlocks, numThreads, smemSize >>>(lat, buffer_v, size);
  accelerator_barrier();
  result = *buffer_v;
  auto result = buffer_v[0];
#endif
  return result;
}
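Note: both variants of sumD_gpu_small are two-stage reductions -- stage one produces one partial sum per thread block, stage two folds the partials. The same shape in plain C++ (illustrative sizes; a serial stand-in for the GPU blocks):

    // Sketch: per-"block" partial sums, then a final fold of the partials.
    #include <cstdio>
    #include <numeric>
    #include <vector>
    int main(void){
      const int size=1000, numBlocks=8;
      std::vector<double> lat(size,1.0), partial(numBlocks,0.0);
      for(int b=0;b<numBlocks;b++)             // stage 1: one sum per block
        for(int i=b;i<size;i+=numBlocks)
          partial[b]+=lat[i];
      double result=std::accumulate(partial.begin(),partial.end(),0.0); // stage 2
      printf("%g\n",result);                   // 1000
    }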
@ -262,6 +250,8 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
{
  typedef typename vobj::vector_type  vector;
  typedef typename vobj::scalar_typeD scalarD;
  typedef typename vobj::scalar_objectD sobj;
  sobj ret;
@ -361,14 +361,9 @@ public:
    _bernoulli.resize(_vol,std::discrete_distribution<int32_t>{1,1});
    _uid.resize(_vol,std::uniform_int_distribution<uint32_t>() );
  }
  template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist)
  {
  template <class vobj,class distribution> inline void fill(Lattice<vobj> &l,std::vector<distribution> &dist){
if ( l.Grid()->_isCheckerBoarded ) {
Lattice<vobj> tmp(_grid);
fill(tmp,dist);
pickCheckerboard(l.Checkerboard(),l,tmp);
return;
}
    typedef typename vobj::scalar_object scalar_object;
    typedef typename vobj::scalar_type scalar_type;
    typedef typename vobj::vector_type vector_type;
@ -429,33 +424,9 @@ public:
    // MT implementation does not implement fast discard even though
    // in principle this is possible
    ////////////////////////////////////////////////
#if 1
thread_for( lidx, _grid->lSites(), {
int64_t gidx;
int o_idx;
int i_idx;
int rank;
Coordinate pcoor;
Coordinate lcoor;
Coordinate gcoor;
_grid->LocalIndexToLocalCoor(lidx,lcoor);
pcoor=_grid->ThisProcessorCoor();
_grid->ProcessorCoorLocalCoorToGlobalCoor(pcoor,lcoor,gcoor);
_grid->GlobalCoorToGlobalIndex(gcoor,gidx);
_grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor);
assert(rank == _grid->ThisRank() );
int l_idx=generator_idx(o_idx,i_idx);
_generators[l_idx] = master_engine;
Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
});
#else
    // Everybody loops over global volume.
    thread_for( gidx, _grid->_gsites, {
      // Where is it?
      int rank;
      int o_idx;
@ -472,7 +443,6 @@ public:
	  Skip(_generators[l_idx],gidx); // Skip to next RNG sequence
	}
      });
#endif
#else
    ////////////////////////////////////////////////////////////////
    // Machine and thread decomposition dependent seeding is efficient
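Note: both seeding variants above give every site the master engine advanced to a site-dependent offset, so all sites draw from one logical stream without overlap. A toy version of the idea with std::mt19937 (its discard is O(n); Grid's Skip is a true fast skip-ahead, so this is purely illustrative):

    // Sketch: generator for site i = copy of the master engine, skipped
    // ahead by i*stride draws -- the Skip(_generators[l_idx],gidx) analogue.
    #include <cstdio>
    #include <random>
    int main(void){
      const unsigned long stride = 16;   // draws reserved per site
      std::mt19937 master(42);
      for(int site=0; site<4; site++){
        std::mt19937 g = master;         // copy the master state
        g.discard(site*stride);          // advance to this site's sub-stream
        printf("site %d first draw %u\n", site, g());
      }
    }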
@ -0,0 +1,126 @@
NAMESPACE_BEGIN(Grid);
// If NOT CUDA or HIP -- we should provide
// -- atomicAdd(float *,float)
// -- atomicAdd(double *,double)
//
// Augment CUDA with complex atomics
#if !defined(GRID_HIP) && !defined(GRID_CUDA)
inline void atomicAdd(float *acc,float elem)
{
*acc += elem;
}
inline void atomicAdd(double *acc,double elem)
{
*acc += elem;
}
#endif
inline void atomicAdd(ComplexD *accum,ComplexD & elem)
{
double *a_p = (double *)accum;
double *e_p = (double *)&elem;
for(int w=0;w<2;w++){
atomicAdd(&a_p[w],e_p[w]);
}
}
inline void atomicAdd(ComplexF *accum,ComplexF & elem)
{
float *a_p = (float *)accum;
float *e_p = (float *)&elem;
for(int w=0;w<2;w++){
atomicAdd(&a_p[w],e_p[w]);
}
}
// Augment CUDA with vobj atomics
template<class vobj> accelerator_inline void atomicAdd(vobj *accum, vobj & elem)
{
  typedef typename vobj::scalar_type scalar_type;
  // One scalar atomic per underlying scalar element. The element count is
  // sizeof(vobj)/sizeof(scalar_type), not Nsimd(), once the object carries
  // more than one SIMD word (e.g. matrix-valued objects).
  const int words = sizeof(vobj)/sizeof(scalar_type);
  scalar_type *a_p= (scalar_type *)accum;
  scalar_type *e_p= (scalar_type *)& elem;
  for(int w=0;w<words;w++){
    atomicAdd(&a_p[w],e_p[w]);
  }
}
// Atomics based slice sum
template<class vobj> inline void sliceSumGpu(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_object::scalar_type scalar_type;
GridBase *grid = Data.Grid();
assert(grid!=NULL);
const int Nd = grid->_ndimension;
const int Nsimd = grid->Nsimd();
assert(orthogdim >= 0);
assert(orthogdim < Nd);
int fd=grid->_fdimensions[orthogdim];
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
// Move to device memory and copy in / out
Vector<vobj> lvSum(rd); // will locally sum vectors first
Vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node
for(int r=0;r<rd;r++){
lvSum[r]=Zero();
}
int e1= grid->_slice_nblock[orthogdim];
int e2= grid->_slice_block [orthogdim];
int stride=grid->_slice_stride[orthogdim];
// sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction
autoView( Data_v, Data, AcceleratorRead);
auto lvSum_p=&lvSum[0];
int ostride = grid->_ostride[orthogdim];
accelerator_for( ree,rd*e1*e2,1, {
int b = ree%e2;
int re= ree/e2;
int n=re%e1;
int r=re/e1;
int so=r*ostride;
int ss=so+n*stride+b;
atomicAdd(&lvSum_p[r],Data_v[ss]);
});
// Sum across simd lanes in the plane, breaking out orthog dir.
Coordinate icoor(Nd);
for(int rt=0;rt<rd;rt++){
extract(lvSum[rt],extracted);
for(int idx=0;idx<Nsimd;idx++){
grid->iCoorFromIindex(icoor,idx);
int ldx =rt+icoor[orthogdim]*rd;
lsSum[ldx]=lsSum[ldx]+extracted[idx];
}
}
// sum over nodes.
for(int t=0;t<fd;t++){
int pt = t/ld; // processor plane
int lt = t%ld;
if ( pt == grid->_processor_coor[orthogdim] ) {
result[t]=lsSum[lt];
} else {
result[t]=Zero();
}
}
scalar_type * ptr = (scalar_type *) &result[0];
int words = fd*sizeof(sobj)/sizeof(scalar_type);
grid->GlobalSumVector(ptr, words);
}
NAMESPACE_END(Grid);
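Note: the essential trick in sliceSumGpu is one shared accumulator per reduced-dimension coordinate, updated with an atomic add from every site, instead of per-thread scratch plus a second reduction pass. A self-contained analogue of that pattern in plain C++ threads (illustrative sizes; std::atomic<double>::fetch_add needs C++20):

    // Sketch: every site adds its value into the accumulator of its slice.
    #include <atomic>
    #include <cstdio>
    #include <thread>
    #include <vector>
    int main(void){
      const int Lt=8, Lx=1024;
      std::vector<double> data(Lt*Lx, 1.0);
      std::vector<std::atomic<double>> slice(Lt);
      for(auto &s : slice) s.store(0.0);
      auto work = [&](int t0,int t1){
        for(int t=t0;t<t1;t++)
          for(int x=0;x<Lx;x++)
            slice[t].fetch_add(data[t*Lx+x]); // atomicAdd(&lvSum_p[r],...) analogue
      };
      std::thread a(work,0,Lt/2), b(work,Lt/2,Lt);
      a.join(); b.join();
      for(int t=0;t<Lt;t++) printf("slice %d = %g\n", t, slice[t].load());
    }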
@ -66,65 +66,6 @@ inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<
  return ret;
};
template<int N, class Vec>
Lattice<iScalar<iScalar<iScalar<Vec> > > > Determinant(const Lattice<iScalar<iScalar<iMatrix<Vec, N> > > > &Umu)
{
GridBase *grid=Umu.Grid();
auto lvol = grid->lSites();
Lattice<iScalar<iScalar<iScalar<Vec> > > > ret(grid);
typedef typename Vec::scalar_type scalar;
autoView(Umu_v,Umu,CpuRead);
autoView(ret_v,ret,CpuWrite);
thread_for(site,lvol,{
Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N);
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
iScalar<iScalar<iMatrix<scalar, N> > > Us;
peekLocalSite(Us, Umu_v, lcoor);
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
scalar tmp= Us()()(i,j);
ComplexD ztmp(real(tmp),imag(tmp));
EigenU(i,j)=ztmp;
}}
ComplexD detD = EigenU.determinant();
typename Vec::scalar_type det(detD.real(),detD.imag());
pokeLocalSite(det,ret_v,lcoor);
});
return ret;
}
template<int N>
Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > Inverse(const Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > &Umu)
{
GridBase *grid=Umu.Grid();
auto lvol = grid->lSites();
Lattice<iScalar<iScalar<iMatrix<vComplexD, N> > > > ret(grid);
autoView(Umu_v,Umu,CpuRead);
autoView(ret_v,ret,CpuWrite);
thread_for(site,lvol,{
Eigen::MatrixXcd EigenU = Eigen::MatrixXcd::Zero(N,N);
Coordinate lcoor;
grid->LocalIndexToLocalCoor(site, lcoor);
iScalar<iScalar<iMatrix<ComplexD, N> > > Us;
iScalar<iScalar<iMatrix<ComplexD, N> > > Ui;
peekLocalSite(Us, Umu_v, lcoor);
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
EigenU(i,j) = Us()()(i,j);
}}
Eigen::MatrixXcd EigenUinv = EigenU.inverse();
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
Ui()()(i,j) = EigenUinv(i,j);
}}
pokeLocalSite(Ui,ret_v,lcoor);
});
return ret;
}
NAMESPACE_END(Grid);
#endif

View File

@ -194,11 +194,11 @@ accelerator_inline void convertType(vComplexD2 & out, const ComplexD & in) {
#endif #endif
accelerator_inline void convertType(vComplexF & out, const vComplexD2 & in) { accelerator_inline void convertType(vComplexF & out, const vComplexD2 & in) {
precisionChange(out,in); out.v = Optimization::PrecisionChange::DtoS(in._internal[0].v,in._internal[1].v);
} }
accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) { accelerator_inline void convertType(vComplexD2 & out, const vComplexF & in) {
precisionChange(out,in); Optimization::PrecisionChange::StoD(in.v,out._internal[0].v,out._internal[1].v);
} }
template<typename T1,typename T2> template<typename T1,typename T2>
@ -288,36 +288,7 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed); blockZAXPY(fineDataRed,ip,Basis[v],fineDataRed);
} }
} }
template<class vobj,class CComplex,int nbasis,class VLattice>
inline void batchBlockProject(std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
const std::vector<Lattice<vobj>> &fineData,
const VLattice &Basis)
{
int NBatch = fineData.size();
assert(coarseData.size() == NBatch);
GridBase * fine = fineData[0].Grid();
GridBase * coarse= coarseData[0].Grid();
Lattice<iScalar<CComplex>> ip(coarse);
std::vector<Lattice<vobj>> fineDataCopy = fineData;
autoView(ip_, ip, AcceleratorWrite);
for(int v=0;v<nbasis;v++) {
for (int k=0; k<NBatch; k++) {
autoView( coarseData_ , coarseData[k], AcceleratorWrite);
blockInnerProductD(ip,Basis[v],fineDataCopy[k]); // ip = <basis|fine>
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
convertType(coarseData_[sc](v),ip_[sc]);
});
// improve numerical stability of projection
// |fine> = |fine> - <basis|fine> |basis>
ip=-ip;
blockZAXPY(fineDataCopy[k],ip,Basis[v],fineDataCopy[k]);
}
}
}
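batchBlockProject amortises the view setup and kernel launches over a whole batch of fine fields, and the working copy plus the ip=-ip subtraction gives modified-Gram-Schmidt-style stability, exactly as in the single-field blockProject. A sketch of the intended call pattern, assuming Grid is initialised and that vTComplex is the scalar-in-SIMD singlet type; the grids, fermion fields and nbasis value are illustrative:

#include <Grid/Grid.h>
using namespace Grid;

template<int nbasis>
void projectBatch(GridBase *CoarseGrid,
                  const std::vector<LatticeFermion> &fine,   // NBatch fine fields
                  const std::vector<LatticeFermion> &basis)  // nbasis subspace vectors
{
  typedef Lattice<iVector<vTComplex,nbasis>> CoarseVector;
  std::vector<CoarseVector> coarse(fine.size(), CoarseVector(CoarseGrid));
  batchBlockProject(coarse, fine, basis);   // one sweep over the whole batch
}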
template<class vobj,class vobj2,class CComplex> template<class vobj,class vobj2,class CComplex>
inline void blockZAXPY(Lattice<vobj> &fineZ, inline void blockZAXPY(Lattice<vobj> &fineZ,
@ -471,13 +442,13 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
vobj zz = Zero(); vobj zz = Zero();
accelerator_for(sc,coarse->oSites(),vobj::Nsimd(),{ accelerator_for(sc,coarse->oSites(),1,{
// One thread per sub block // One thread per sub block
Coordinate coor_c(_ndimension); Coordinate coor_c(_ndimension);
Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate Lexicographic::CoorFromIndex(coor_c,sc,coarse_rdimensions); // Block coordinate
auto cd = coalescedRead(zz); vobj cd = zz;
for(int sb=0;sb<blockVol;sb++){ for(int sb=0;sb<blockVol;sb++){
@ -488,10 +459,10 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d]; for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d] + coor_b[d];
Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions); Lexicographic::IndexFromCoor(coor_f,sf,fine_rdimensions);
cd=cd+coalescedRead(fineData_p[sf]); cd=cd+fineData_p[sf];
} }
coalescedWrite(coarseData_p[sc],cd); coarseData_p[sc] = cd;
}); });
return; return;
@ -619,26 +590,6 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
} }
#endif #endif
template<class vobj,class CComplex,int nbasis,class VLattice>
inline void batchBlockPromote(const std::vector<Lattice<iVector<CComplex,nbasis>>> &coarseData,
std::vector<Lattice<vobj>> &fineData,
const VLattice &Basis)
{
int NBatch = coarseData.size();
assert(fineData.size() == NBatch);
GridBase * fine = fineData[0].Grid();
GridBase * coarse = coarseData[0].Grid();
for (int k=0; k<NBatch; k++)
fineData[k]=Zero();
for (int i=0;i<nbasis;i++) {
for (int k=0; k<NBatch; k++) {
Lattice<iScalar<CComplex>> ip = PeekIndex<0>(coarseData[k],i);
blockZAXPY(fineData[k],ip,Basis[i],fineData[k]);
}
}
}
// Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars. // Useful for precision conversion, or indeed anything where an operator= does a conversion on scalars.
// Simd layouts need not match since we use peek/poke Local // Simd layouts need not match since we use peek/poke Local
template<class vobj,class vvobj> template<class vobj,class vvobj>
@ -697,68 +648,8 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
for(int d=0;d<nd;d++){ for(int d=0;d<nd;d++){
assert(Fg->_processors[d] == Tg->_processors[d]); assert(Fg->_processors[d] == Tg->_processors[d]);
} }
// the above should guarantee that the operations are local // the above should guarantee that the operations are local
#if 1
size_t nsite = 1;
for(int i=0;i<nd;i++) nsite *= RegionSize[i];
size_t tbytes = 4*nsite*sizeof(int);
int *table = (int*)malloc(tbytes);
thread_for(idx, nsite, {
Coordinate from_coor, to_coor;
size_t rem = idx;
for(int i=0;i<nd;i++){
size_t base_i = rem % RegionSize[i]; rem /= RegionSize[i];
from_coor[i] = base_i + FromLowerLeft[i];
to_coor[i] = base_i + ToLowerLeft[i];
}
int foidx = Fg->oIndex(from_coor);
int fiidx = Fg->iIndex(from_coor);
int toidx = Tg->oIndex(to_coor);
int tiidx = Tg->iIndex(to_coor);
int* tt = table + 4*idx;
tt[0] = foidx;
tt[1] = fiidx;
tt[2] = toidx;
tt[3] = tiidx;
});
int* table_d = (int*)acceleratorAllocDevice(tbytes);
acceleratorCopyToDevice(table,table_d,tbytes);
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
autoView(from_v,From,AcceleratorRead);
autoView(to_v,To,AcceleratorWrite);
accelerator_for(idx,nsite,1,{
static const int words=sizeof(vobj)/sizeof(vector_type);
int* tt = table_d + 4*idx;
int from_oidx = *tt++;
int from_lane = *tt++;
int to_oidx = *tt++;
int to_lane = *tt;
const vector_type* from = (const vector_type *)&from_v[from_oidx];
vector_type* to = (vector_type *)&to_v[to_oidx];
scalar_type stmp;
for(int w=0;w<words;w++){
stmp = getlane(from[w], from_lane);
putlane(to[w], stmp, to_lane);
}
});
acceleratorFreeDevice(table_d);
free(table);
#else
Coordinate ldf = Fg->_ldimensions; Coordinate ldf = Fg->_ldimensions;
Coordinate rdf = Fg->_rdimensions; Coordinate rdf = Fg->_rdimensions;
Coordinate isf = Fg->_istride; Coordinate isf = Fg->_istride;
@ -767,9 +658,9 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
Coordinate ist = Tg->_istride; Coordinate ist = Tg->_istride;
Coordinate ost = Tg->_ostride; Coordinate ost = Tg->_ostride;
autoView( t_v , To, CpuWrite); autoView( t_v , To, AcceleratorWrite);
autoView( f_v , From, CpuRead); autoView( f_v , From, AcceleratorRead);
thread_for(idx,Fg->lSites(),{ accelerator_for(idx,Fg->lSites(),1,{
sobj s; sobj s;
Coordinate Fcoor(nd); Coordinate Fcoor(nd);
Coordinate Tcoor(nd); Coordinate Tcoor(nd);
@ -782,24 +673,17 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d]; Tcoor[d] = ToLowerLeft[d]+ Fcoor[d]-FromLowerLeft[d];
} }
if (in_region) { if (in_region) {
#if 0 Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]);
Integer idx_f = 0; for(int d=0;d<nd;d++) idx_f+=isf[d]*(Fcoor[d]/rdf[d]); // inner index from Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]);
Integer idx_t = 0; for(int d=0;d<nd;d++) idx_t+=ist[d]*(Tcoor[d]/rdt[d]); // inner index to Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]);
Integer odx_f = 0; for(int d=0;d<nd;d++) odx_f+=osf[d]*(Fcoor[d]%rdf[d]); // outer index from Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]);
Integer odx_t = 0; for(int d=0;d<nd;d++) odx_t+=ost[d]*(Tcoor[d]%rdt[d]); // outer index to
scalar_type * fp = (scalar_type *)&f_v[odx_f]; scalar_type * fp = (scalar_type *)&f_v[odx_f];
scalar_type * tp = (scalar_type *)&t_v[odx_t]; scalar_type * tp = (scalar_type *)&t_v[odx_t];
for(int w=0;w<words;w++){ for(int w=0;w<words;w++){
tp[w].putlane(fp[w].getlane(idx_f),idx_t); tp[idx_t+w*Nsimd] = fp[idx_f+w*Nsimd]; // FIXME IF RRII layout, type pun no worke
} }
#else
peekLocalSite(s,f_v,Fcoor);
pokeLocalSite(s,t_v,Tcoor);
#endif
} }
}); });
#endif
} }
@ -892,8 +776,6 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
} }
//Insert subvolume orthogonal to direction 'orthog' with slice index 'slice_lo' from 'lowDim' onto slice index 'slice_hi' of higherDim
//The local dimensions of both 'lowDim' and 'higherDim' orthogonal to 'orthog' should be the same
template<class vobj> template<class vobj>
void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog) void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
{ {
@ -910,70 +792,11 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
for(int d=0;d<nh;d++){ for(int d=0;d<nh;d++){
if ( d!=orthog ) { if ( d!=orthog ) {
assert(lg->_processors[d] == hg->_processors[d]); assert(lg->_processors[d] == hg->_processors[d]);
assert(lg->_ldimensions[d] == hg->_ldimensions[d]); assert(lg->_ldimensions[d] == hg->_ldimensions[d]);
} }
} }
#if 1
size_t nsite = lg->lSites()/lg->LocalDimensions()[orthog];
size_t tbytes = 4*nsite*sizeof(int);
int *table = (int*)malloc(tbytes);
thread_for(idx,nsite,{
Coordinate lcoor(nl);
Coordinate hcoor(nh);
lcoor[orthog] = slice_lo;
hcoor[orthog] = slice_hi;
size_t rem = idx;
for(int mu=0;mu<nl;mu++){
if(mu != orthog){
int xmu = rem % lg->LocalDimensions()[mu]; rem /= lg->LocalDimensions()[mu];
lcoor[mu] = hcoor[mu] = xmu;
}
}
int loidx = lg->oIndex(lcoor);
int liidx = lg->iIndex(lcoor);
int hoidx = hg->oIndex(hcoor);
int hiidx = hg->iIndex(hcoor);
int* tt = table + 4*idx;
tt[0] = loidx;
tt[1] = liidx;
tt[2] = hoidx;
tt[3] = hiidx;
});
int* table_d = (int*)acceleratorAllocDevice(tbytes);
acceleratorCopyToDevice(table,table_d,tbytes);
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
autoView(lowDim_v,lowDim,AcceleratorRead);
autoView(higherDim_v,higherDim,AcceleratorWrite);
accelerator_for(idx,nsite,1,{
static const int words=sizeof(vobj)/sizeof(vector_type);
int* tt = table_d + 4*idx;
int from_oidx = *tt++;
int from_lane = *tt++;
int to_oidx = *tt++;
int to_lane = *tt;
const vector_type* from = (const vector_type *)&lowDim_v[from_oidx];
vector_type* to = (vector_type *)&higherDim_v[to_oidx];
scalar_type stmp;
for(int w=0;w<words;w++){
stmp = getlane(from[w], from_lane);
putlane(to[w], stmp, to_lane);
}
});
acceleratorFreeDevice(table_d);
free(table);
#else
// the above should guarantee that the operations are local // the above should guarantee that the operations are local
autoView(lowDimv,lowDim,CpuRead); autoView(lowDimv,lowDim,CpuRead);
autoView(higherDimv,higherDim,CpuWrite); autoView(higherDimv,higherDim,CpuWrite);
@ -989,7 +812,6 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
pokeLocalSite(s,higherDimv,hcoor); pokeLocalSite(s,higherDimv,hcoor);
} }
}); });
#endif
} }
@ -1033,7 +855,7 @@ void ExtractSliceLocal(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int
template<class vobj> template<class vobj>
void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine) void Replicate(Lattice<vobj> &coarse,Lattice<vobj> & fine)
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
@ -1054,7 +876,7 @@ void Replicate(const Lattice<vobj> &coarse,Lattice<vobj> & fine)
Coordinate fcoor(nd); Coordinate fcoor(nd);
Coordinate ccoor(nd); Coordinate ccoor(nd);
for(int64_t g=0;g<fg->gSites();g++){ for(int g=0;g<fg->gSites();g++){
fg->GlobalIndexToGlobalCoor(g,fcoor); fg->GlobalIndexToGlobalCoor(g,fcoor);
for(int d=0;d<nd;d++){ for(int d=0;d<nd;d++){
@ -1258,27 +1080,9 @@ vectorizeFromRevLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
}); });
} }
//Very fast precision change. Requires in/out objects to reside on same Grid (e.g. by using double2 for the double-precision field) //Convert a Lattice from one precision to another
template<class VobjOut, class VobjIn> template<class VobjOut, class VobjIn>
void precisionChangeFast(Lattice<VobjOut> &out, const Lattice<VobjIn> &in) void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
{
typedef typename VobjOut::vector_type Vout;
typedef typename VobjIn::vector_type Vin;
const int N = sizeof(VobjOut)/sizeof(Vout);
conformable(out.Grid(),in.Grid());
out.Checkerboard() = in.Checkerboard();
int nsimd = out.Grid()->Nsimd();
autoView( out_v , out, AcceleratorWrite);
autoView( in_v , in, AcceleratorRead);
accelerator_for(idx,out.Grid()->oSites(),1,{
Vout *vout = (Vout *)&out_v[idx];
Vin *vin = (Vin *)&in_v[idx];
precisionChange(vout,vin,N);
});
}
//Convert a Lattice from one precision to another (original, slow implementation)
template<class VobjOut, class VobjIn>
void precisionChangeOrig(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
{ {
assert(out.Grid()->Nd() == in.Grid()->Nd()); assert(out.Grid()->Nd() == in.Grid()->Nd());
for(int d=0;d<out.Grid()->Nd();d++){ for(int d=0;d<out.Grid()->Nd();d++){
@ -1293,7 +1097,7 @@ void precisionChangeOrig(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
int ndim = out.Grid()->Nd(); int ndim = out.Grid()->Nd();
int out_nsimd = out_grid->Nsimd(); int out_nsimd = out_grid->Nsimd();
int in_nsimd = in_grid->Nsimd();
std::vector<Coordinate > out_icoor(out_nsimd); std::vector<Coordinate > out_icoor(out_nsimd);
for(int lane=0; lane < out_nsimd; lane++){ for(int lane=0; lane < out_nsimd; lane++){
@ -1324,128 +1128,6 @@ void precisionChangeOrig(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
}); });
} }
//The workspace for a precision change operation, allowing reuse of the site/lane mapping to save time on subsequent calls
class precisionChangeWorkspace{
std::pair<Integer,Integer>* fmap_device; //device pointer
//maintain grids for checking
GridBase* _out_grid;
GridBase* _in_grid;
public:
precisionChangeWorkspace(GridBase *out_grid, GridBase *in_grid): _out_grid(out_grid), _in_grid(in_grid){
//Build a map between the sites and lanes of the output field and the input field as we cannot use the Grids on the device
assert(out_grid->Nd() == in_grid->Nd());
for(int d=0;d<out_grid->Nd();d++){
assert(out_grid->FullDimensions()[d] == in_grid->FullDimensions()[d]);
}
int Nsimd_out = out_grid->Nsimd();
std::vector<Coordinate> out_icorrs(out_grid->Nsimd()); //reuse these
for(int lane=0; lane < out_grid->Nsimd(); lane++)
out_grid->iCoorFromIindex(out_icorrs[lane], lane);
std::vector<std::pair<Integer,Integer> > fmap_host(out_grid->lSites()); //lsites = osites*Nsimd
thread_for(out_oidx,out_grid->oSites(),{
Coordinate out_ocorr;
out_grid->oCoorFromOindex(out_ocorr, out_oidx);
Coordinate lcorr; //the local coordinate (common to both in and out as full coordinate)
for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
out_grid->InOutCoorToLocalCoor(out_ocorr, out_icorrs[out_lane], lcorr);
//int in_oidx = in_grid->oIndex(lcorr), in_lane = in_grid->iIndex(lcorr);
//Note oIndex and OcorrFromOindex (and likewise iIndex) are not inverses of each other for a checkerboarded lattice: the former coordinates are defined on the full lattice and the latter on the reduced lattice
//Until this is fixed we need to circumvent the problem locally. Here I use the coordinates defined on the reduced lattice for simplicity
int in_oidx = 0, in_lane = 0;
for(int d=0;d<in_grid->_ndimension;d++){
in_oidx += in_grid->_ostride[d] * ( lcorr[d] % in_grid->_rdimensions[d] );
in_lane += in_grid->_istride[d] * ( lcorr[d] / in_grid->_rdimensions[d] );
}
fmap_host[out_lane + Nsimd_out*out_oidx] = std::pair<Integer,Integer>( in_oidx, in_lane );
}
});
//Copy the map to the device (if we had a way to tell if an accelerator is in use we could avoid this copy for CPU-only machines)
size_t fmap_bytes = out_grid->lSites() * sizeof(std::pair<Integer,Integer>);
fmap_device = (std::pair<Integer,Integer>*)acceleratorAllocDevice(fmap_bytes);
acceleratorCopyToDevice(fmap_host.data(), fmap_device, fmap_bytes);
}
//Prevent moving or copying
precisionChangeWorkspace(const precisionChangeWorkspace &r) = delete;
precisionChangeWorkspace(precisionChangeWorkspace &&r) = delete;
precisionChangeWorkspace &operator=(const precisionChangeWorkspace &r) = delete;
precisionChangeWorkspace &operator=(precisionChangeWorkspace &&r) = delete;
std::pair<Integer,Integer> const* getMap() const{ return fmap_device; }
void checkGrids(GridBase* out, GridBase* in) const{
conformable(out, _out_grid);
conformable(in, _in_grid);
}
~precisionChangeWorkspace(){
acceleratorFreeDevice(fmap_device);
}
};
//We would like to use precisionChangeFast when possible. However usage of this requires the Grids to be the same (runtime check)
//*and* the precisionChange(VobjOut::vector_type, VobjIn, int) function to be defined for the types; this requires an extra compile-time check which we do using some SFINAE trickery
template<class VobjOut, class VobjIn>
auto _precisionChangeFastWrap(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, int dummy)->decltype( precisionChange( ((typename VobjOut::vector_type*)0), ((typename VobjIn::vector_type*)0), 1), int()){
if(out.Grid() == in.Grid()){
precisionChangeFast(out,in);
return 1;
}else{
return 0;
}
}
template<class VobjOut, class VobjIn>
int _precisionChangeFastWrap(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, long dummy){ //note long here is intentional; it means the above is preferred if available
return 0;
}
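The int/long dummy argument is the classic overload-ranking trick: a literal 0 prefers the int overload, but the expression SFINAE in the trailing return type removes that overload whenever the fast-path precisionChange for the raw vector types does not exist, so the call silently falls back. The same mechanism in isolation (standalone sketch; Fast, Slow and fastPath are illustrative stand-ins):

#include <cstdio>
struct Fast {}; struct Slow {};
void fastPath(Fast){}
template<class T>
auto tryFast(T t, int) -> decltype(fastPath(t), int()) { fastPath(t); return 1; } // (A)
template<class T>
int tryFast(T, long) { return 0; }                                                // (B)
// A literal 0 is int, so (A) wins when fastPath(T) compiles; otherwise SFINAE
// deletes (A) and (B) is the only candidate.
int main(){ printf("%d %d\n", tryFast(Fast{},0), tryFast(Slow{},0)); return 0; }  // prints: 1 0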
//Convert a lattice of one precision to another. Much faster than original implementation but requires a pregenerated workspace
//which contains the mapping data.
template<class VobjOut, class VobjIn>
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in, const precisionChangeWorkspace &workspace){
if(_precisionChangeFastWrap(out,in,0)) return;
static_assert( std::is_same<typename VobjOut::scalar_typeD, typename VobjIn::scalar_typeD>::value == 1, "precisionChange: tensor types must be the same" ); //if tensor types are same the DoublePrecision type must be the same
out.Checkerboard() = in.Checkerboard();
constexpr int Nsimd_out = VobjOut::Nsimd();
workspace.checkGrids(out.Grid(),in.Grid());
std::pair<Integer,Integer> const* fmap_device = workspace.getMap();
//Do the copy/precision change
autoView( out_v , out, AcceleratorWrite);
autoView( in_v , in, AcceleratorRead);
accelerator_for(out_oidx, out.Grid()->oSites(), 1,{
std::pair<Integer,Integer> const* fmap_osite = fmap_device + out_oidx*Nsimd_out;
for(int out_lane=0; out_lane < Nsimd_out; out_lane++){
int in_oidx = fmap_osite[out_lane].first;
int in_lane = fmap_osite[out_lane].second;
copyLane(out_v[out_oidx], out_lane, in_v[in_oidx], in_lane);
}
});
}
//Convert a Lattice from one precision to another. Much faster than original implementation but slower than precisionChangeFast
//or precisionChange called with pregenerated workspace, as it needs to internally generate the workspace on the host and copy to device
template<class VobjOut, class VobjIn>
void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in){
if(_precisionChangeFastWrap(out,in,0)) return;
precisionChangeWorkspace workspace(out.Grid(), in.Grid());
precisionChange(out, in, workspace);
}
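Since the map depends only on the pair of grids, it pays to hoist the workspace out of any loop that converts many fields between the same grids. A minimal sketch, assuming double- and single-precision fermion fields on grids of the same global volume; the function and grid names are illustrative:

#include <Grid/Grid.h>
using namespace Grid;

void convertBatch(GridBase *FGrid_f, GridBase *FGrid_d,
                  std::vector<LatticeFermionF> &out,
                  const std::vector<LatticeFermionD> &in)
{
  // Build the site/lane map once: (out_grid, in_grid), matching the ctor above
  precisionChangeWorkspace wk(FGrid_f, FGrid_d);
  for(size_t i=0;i<in.size();i++)
    precisionChange(out[i], in[i], wk);   // reuses the device-resident map
}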
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Communicate between grids // Communicate between grids
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////

View File

@ -1,565 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/lattice/PaddedCell.h
Copyright (C) 2019
Author: Peter Boyle pboyle@bnl.gov
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once
#include<Grid/cshift/Cshift.h>
NAMESPACE_BEGIN(Grid);
//Allow the user to specify how the C-shift is performed, e.g. to respect the appropriate boundary conditions
template<typename vobj>
struct CshiftImplBase{
virtual Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const = 0;
virtual ~CshiftImplBase(){}
};
template<typename vobj>
struct CshiftImplDefault: public CshiftImplBase<vobj>{
Lattice<vobj> Cshift(const Lattice<vobj> &in, int dir, int shift) const override{ return Grid::Cshift(in,dir,shift); }
};
template<typename Gimpl>
struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::vector_object>{
typename Gimpl::GaugeLinkField Cshift(const typename Gimpl::GaugeLinkField &in, int dir, int shift) const override{ return Gimpl::CshiftLink(in,dir,shift); }
};
/*
*
* TODO:
* -- address elements of vobj via thread block in Scatter/Gather
* -- overlap comms with motion in Face_exchange
*
*/
template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
Lattice<vobj> &lat,
int x,
int dim,
int offset=0)
{
const int Nsimd=vobj::Nsimd();
typedef typename vobj::scalar_object sobj;
GridBase *grid = lat.Grid();
Coordinate simd = grid->_simd_layout;
int Nd = grid->Nd();
int block = grid->_slice_block[dim];
int stride = grid->_slice_stride[dim];
int nblock = grid->_slice_nblock[dim];
int rd = grid->_rdimensions[dim];
int ox = x%rd;
int ix = x/rd;
int isites = 1; for(int d=0;d<Nd;d++) if( d!=dim) isites*=simd[d];
Coordinate rsimd= simd; rsimd[dim]=1; // maybe reduce Nsimd
int rNsimd = 1; for(int d=0;d<Nd;d++) rNsimd*=rsimd[d];
int rNsimda= Nsimd/simd[dim]; // should equal rNsimd (checked below)
assert(rNsimda==rNsimd);
int face_ovol=block*nblock;
// assert(buf.size()==face_ovol*rNsimd);
/*This will work on GPU ONLY unless rNsimd is put in the lexicographic index*/
//Let's make it work on GPU first, then write a special accelerator_for that
//doesn't hide the SIMD direction and keeps it explicit in the threadIdx,
//for cross-platform use
// FIXME -- can put internal indices into thread loop
auto buf_p = & buf[0];
autoView(lat_v, lat, AcceleratorRead);
accelerator_for(ss, face_ovol/simd[dim],Nsimd,{
// scalar layout won't coalesce
int blane=acceleratorSIMTlane(Nsimd); // buffer lane
int olane=blane%rNsimd; // reduced lattice lane
int obit =blane/rNsimd;
///////////////////////////////////////////////////////////////
// osite -- potentially one bit from simd in the buffer: (ss<<1)|obit
///////////////////////////////////////////////////////////////
int ssp = ss*simd[dim]+obit;
int b = ssp%block;
int n = ssp/block;
int osite= b+n*stride + ox*block;
////////////////////////////////////////////
// isite -- map lane within buffer to lane within lattice
////////////////////////////////////////////
Coordinate icoor;
int lane;
Lexicographic::CoorFromIndex(icoor,olane,rsimd);
icoor[dim]=ix;
Lexicographic::IndexFromCoor(icoor,lane,simd);
///////////////////////////////////////////
// Transfer into lattice - will coalesce
///////////////////////////////////////////
sobj obj = extractLane(blane,buf_p[ss+offset]);
insertLane(lane,lat_v[osite],obj);
});
}
template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf,
const Lattice<vobj> &lat,
int x,
int dim,
int offset=0)
{
const int Nsimd=vobj::Nsimd();
typedef typename vobj::scalar_object sobj;
autoView(lat_v, lat, AcceleratorRead);
GridBase *grid = lat.Grid();
Coordinate simd = grid->_simd_layout;
int Nd = grid->Nd();
int block = grid->_slice_block[dim];
int stride = grid->_slice_stride[dim];
int nblock = grid->_slice_nblock[dim];
int rd = grid->_rdimensions[dim];
int ox = x%rd;
int ix = x/rd;
int isites = 1; for(int d=0;d<Nd;d++) if( d!=dim) isites*=simd[d];
Coordinate rsimd= simd; rsimd[dim]=1; // maybe reduce Nsimd
int rNsimd = 1; for(int d=0;d<Nd;d++) rNsimd*=rsimd[d];
int face_ovol=block*nblock;
// assert(buf.size()==face_ovol*rNsimd);
/*This will work on GPU ONLY unless rNsimd is put in the lexicographic index*/
//Let's make it work on GPU first, then write a special accelerator_for that
//doesn't hide the SIMD direction and keeps it explicit in the threadIdx,
//for cross-platform use
//For CPU, perhaps just run a loop over Nsimd
auto buf_p = & buf[0];
accelerator_for(ss, face_ovol/simd[dim],Nsimd,{
// scalar layout won't coalesce
int blane=acceleratorSIMTlane(Nsimd); // buffer lane
int olane=blane%rNsimd; // reduced lattice lane
int obit =blane/rNsimd;
////////////////////////////////////////////
// osite
////////////////////////////////////////////
int ssp = ss*simd[dim]+obit;
int b = ssp%block;
int n = ssp/block;
int osite= b+n*stride + ox*block;
////////////////////////////////////////////
// isite -- map lane within buffer to lane within lattice
////////////////////////////////////////////
Coordinate icoor;
int lane;
Lexicographic::CoorFromIndex(icoor,olane,rsimd);
icoor[dim]=ix;
Lexicographic::IndexFromCoor(icoor,lane,simd);
///////////////////////////////////////////
// Take out of lattice
///////////////////////////////////////////
sobj obj = extractLane(lane,lat_v[osite]);
insertLane(blane,buf_p[ss+offset],obj);
});
/*
int words =block*nblock/simd[dim];
std::vector<vobj> tbuf(words);
acceleratorCopyFromDevice((void *)&buf[offset],(void *)&tbuf[0],words*sizeof(vobj));
typedef typename vobj::scalar_type scalar;
scalar *sbuf = (scalar *)&tbuf[0];
scalar tmp=0.0;
for(int w=0;w<words*sizeof(vobj)/sizeof(scalar);w++){
tmp=tmp+conjugate(sbuf[w])*sbuf[w];
}
std::cout << " Gathered buffer norm "<<tmp<<std::endl;
*/
}
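The buffer-lane decomposition above is the subtle part: a buffer lane blane splits into olane (the position within the reduced SIMD grid, with the sliced dimension removed) and obit (which of the simd[dim] sub-slices it came from), and obit is folded back into the outer-site index via ssp = ss*simd[dim]+obit. The arithmetic in isolation (standalone sketch with illustrative sizes):

#include <cstdio>
int main(void)
{
  const int Nsimd=8, simd_dim=2;        // e.g. simd layout {2,2,2,1}, sliced dim has simd 2
  const int rNsimd=Nsimd/simd_dim;      // lanes of the reduced SIMD grid
  for(int blane=0; blane<Nsimd; blane++){
    int olane=blane%rNsimd;             // reduced-lattice lane
    int obit =blane/rNsimd;             // which sub-slice of the split dimension
    printf("blane %d -> olane %d obit %d\n", blane, olane, obit);
  }
  return 0;
}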
class PaddedCell {
public:
GridCartesian * unpadded_grid;
int dims;
int depth;
std::vector<GridCartesian *> grids;
~PaddedCell()
{
DeleteGrids();
}
PaddedCell(int _depth,GridCartesian *_grid)
{
unpadded_grid = _grid;
depth=_depth;
dims=_grid->Nd();
AllocateGrids();
Coordinate local =unpadded_grid->LocalDimensions();
Coordinate procs =unpadded_grid->ProcessorGrid();
for(int d=0;d<dims;d++){
if ( procs[d] > 1 ) assert(local[d]>=depth);
}
}
void DeleteGrids(void)
{
for(int d=0;d<grids.size();d++){
delete grids[d];
}
grids.resize(0);
};
void AllocateGrids(void)
{
Coordinate local =unpadded_grid->LocalDimensions();
Coordinate simd =unpadded_grid->_simd_layout;
Coordinate processors=unpadded_grid->_processors;
Coordinate plocal =unpadded_grid->LocalDimensions();
Coordinate global(dims);
GridCartesian *old_grid = unpadded_grid;
// expand up one dim at a time
for(int d=0;d<dims;d++){
if ( processors[d] > 1 ) {
plocal[d] += 2*depth;
for(int d=0;d<dims;d++){
global[d] = plocal[d]*processors[d];
}
old_grid = new GridCartesian(global,simd,processors);
}
grids.push_back(old_grid);
}
};
template<class vobj>
inline Lattice<vobj> Extract(const Lattice<vobj> &in) const
{
Coordinate processors=unpadded_grid->_processors;
Lattice<vobj> out(unpadded_grid);
Coordinate local =unpadded_grid->LocalDimensions();
// depends on the MPI spread
Coordinate fll(dims,depth);
Coordinate tll(dims,0); // depends on the MPI spread
for(int d=0;d<dims;d++){
if( processors[d]==1 ) fll[d]=0;
}
localCopyRegion(in,out,fll,tll,local);
return out;
}
template<class vobj>
inline Lattice<vobj> Exchange(const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
{
GridBase *old_grid = in.Grid();
int dims = old_grid->Nd();
Lattice<vobj> tmp = in;
for(int d=0;d<dims;d++){
tmp = Expand(d,tmp,cshift); // rvalue && assignment
}
return tmp;
}
template<class vobj>
inline Lattice<vobj> ExchangePeriodic(const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
{
GridBase *old_grid = in.Grid();
int dims = old_grid->Nd();
Lattice<vobj> tmp = in;
for(int d=0;d<dims;d++){
tmp = ExpandPeriodic(d,tmp,cshift); // rvalue && assignment
}
return tmp;
}
// expand up one dim at a time
template<class vobj>
inline Lattice<vobj> Expand(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
{
Coordinate processors=unpadded_grid->_processors;
GridBase *old_grid = in.Grid();
GridCartesian *new_grid = grids[dim];//These are new grids
Lattice<vobj> padded(new_grid);
Lattice<vobj> shifted(old_grid);
Coordinate local =old_grid->LocalDimensions();
Coordinate plocal =new_grid->LocalDimensions();
if(dim==0) conformable(old_grid,unpadded_grid);
else conformable(old_grid,grids[dim-1]);
// std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
double tins=0, tshift=0;
int islocal = 0 ;
if ( processors[dim] == 1 ) islocal = 1;
if ( islocal ) {
// replace with a copy and maybe grid swizzle
double t = usecond();
padded = in;
tins += usecond() - t;
} else {
//////////////////////////////////////////////
// Replace sequence with
// ---------------------
// (i) Gather high face(s); start comms
// (ii) Gather low face(s); start comms
// (iii) Copy middle bit with localCopyRegion
// (iv) Complete high face(s), insert slice(s)
// (v)  Complete low face(s), insert slice(s)
//////////////////////////////////////////////
// Middle bit
double t = usecond();
for(int x=0;x<local[dim];x++){
InsertSliceLocal(in,padded,x,depth+x,dim);
}
tins += usecond() - t;
// High bit
t = usecond();
shifted = cshift.Cshift(in,dim,depth);
tshift += usecond() - t;
t=usecond();
for(int x=0;x<depth;x++){
InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim);
}
tins += usecond() - t;
// Low bit
t = usecond();
shifted = cshift.Cshift(in,dim,-depth);
tshift += usecond() - t;
t = usecond();
for(int x=0;x<depth;x++){
InsertSliceLocal(shifted,padded,x,x,dim);
}
tins += usecond() - t;
}
std::cout << GridLogPerformance << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl;
return padded;
}
template<class vobj>
inline Lattice<vobj> ExpandPeriodic(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
{
Coordinate processors=unpadded_grid->_processors;
GridBase *old_grid = in.Grid();
GridCartesian *new_grid = grids[dim];//These are new grids
Lattice<vobj> padded(new_grid);
Lattice<vobj> shifted(old_grid);
Coordinate local =old_grid->LocalDimensions();
Coordinate plocal =new_grid->LocalDimensions();
if(dim==0) conformable(old_grid,unpadded_grid);
else conformable(old_grid,grids[dim-1]);
// std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
double tins=0, tshift=0;
int islocal = 0 ;
if ( processors[dim] == 1 ) islocal = 1;
if ( islocal ) {
// replace with a copy and maybe grid swizzle
double t = usecond();
padded = in;
tins += usecond() - t;
} else {
//////////////////////////////////////////////
// Replace sequence with
// ---------------------
// (i) Gather high face(s); start comms
// (ii) Gather low face(s); start comms
// (iii) Copy middle bit with localCopyRegion
// (iv) Complete high face(s), insert slice(s)
// (v)  Complete low face(s), insert slice(s)
//////////////////////////////////////////////
Face_exchange(in,padded,dim,depth);
}
return padded;
}
template<class vobj>
void Face_exchange(const Lattice<vobj> &from,
Lattice<vobj> &to,
int dimension,int depth) const
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::scalar_object sobj;
RealD t_gather=0.0;
RealD t_scatter=0.0;
RealD t_comms=0.0;
RealD t_copy=0.0;
// std::cout << GridLogMessage << "dimension " <<dimension<<std::endl;
// DumpSliceNorm(std::string("Face_exchange from"),from,dimension);
GridBase *grid=from.Grid();
GridBase *new_grid=to.Grid();
Coordinate lds = from.Grid()->_ldimensions;
Coordinate nlds= to.Grid()->_ldimensions;
Coordinate simd= from.Grid()->_simd_layout;
int ld = lds[dimension];
int nld = to.Grid()->_ldimensions[dimension];
const int Nsimd = vobj::Nsimd();
assert(depth<=lds[dimension]); // the halo must fit within the neighbouring node's local extent
assert(depth>0); // a depth of zero is a caller bug
assert(ld+2*depth==nld);
////////////////////////////////////////////////////////////////////////////
// Face size and byte calculations
////////////////////////////////////////////////////////////////////////////
int buffer_size = 1;
for(int d=0;d<lds.size();d++){
if ( d!= dimension) buffer_size=buffer_size*lds[d];
}
buffer_size = buffer_size / Nsimd;
int rNsimd = Nsimd / simd[dimension];
assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);
static cshiftVector<vobj> send_buf;
static cshiftVector<vobj> recv_buf;
send_buf.resize(buffer_size*2*depth);
recv_buf.resize(buffer_size*2*depth);
std::vector<CommsRequest_t> fwd_req;
std::vector<CommsRequest_t> bwd_req;
int words = buffer_size;
int bytes = words * sizeof(vobj);
////////////////////////////////////////////////////////////////////////////
// Communication coords
////////////////////////////////////////////////////////////////////////////
int comm_proc = 1;
int xmit_to_rank;
int recv_from_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
////////////////////////////////////////////////////////////////////////////
// Gather all surface terms up to depth "d"
////////////////////////////////////////////////////////////////////////////
RealD t;
int plane=0;
for ( int d=0;d < depth ; d ++ ) {
int tag = d*1024 + dimension*2+0;
t=usecond();
GatherSlice(send_buf,from,d,dimension,plane*buffer_size); plane++;
t_gather+=usecond()-t;
t=usecond();
grid->SendToRecvFromBegin(fwd_req,
(void *)&send_buf[d*buffer_size], xmit_to_rank,
(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
t_comms+=usecond()-t;
}
for ( int d=0;d < depth ; d ++ ) {
int tag = d*1024 + dimension*2+1;
t=usecond();
GatherSlice(send_buf,from,ld-depth+d,dimension,plane*buffer_size); plane++;
t_gather+= usecond() - t;
t=usecond();
grid->SendToRecvFromBegin(bwd_req,
(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
t_comms+=usecond()-t;
}
////////////////////////////////////////////////////////////////////////////
// Copy interior -- overlap this with comms
////////////////////////////////////////////////////////////////////////////
int Nd = new_grid->Nd();
Coordinate LL(Nd,0);
Coordinate sz = grid->_ldimensions;
Coordinate toLL(Nd,0);
toLL[dimension]=depth;
t=usecond();
localCopyRegion(from,to,LL,toLL,sz);
t_copy= usecond() - t;
////////////////////////////////////////////////////////////////////////////
// Scatter all faces
////////////////////////////////////////////////////////////////////////////
// DumpSliceNorm(std::string("Face_exchange to before scatter"),to,dimension);
plane=0;
t=usecond();
grid->CommsComplete(fwd_req);
t_comms+= usecond() - t;
t=usecond();
for ( int d=0;d < depth ; d ++ ) {
ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++;
}
t_scatter= usecond() - t;
t=usecond();
grid->CommsComplete(bwd_req);
t_comms+= usecond() - t;
t=usecond();
for ( int d=0;d < depth ; d ++ ) {
ScatterSlice(recv_buf,to,d,dimension,plane*buffer_size); plane++;
}
t_scatter+= usecond() - t;
// DumpSliceNorm(std::string("Face_exchange to scatter 1st "),to,dimension);
//DumpSliceNorm(std::string("Face_exchange to done"),to,dimension);
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << t_gather/1000 << "ms"<<std::endl;
// std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << 2.0*bytes/t_gather << "MB/s"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << t_scatter/1000 << "ms"<<std::endl;
// std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << 2.0*bytes/t_scatter<< "MB/s"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: copy :" << t_copy/1000 << "ms"<<std::endl;
std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << t_comms/1000 << "ms"<<std::endl;
// std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms :" << (RealD)4.0*bytes/t_comms << "MB/s"<<std::endl;
}
};
NAMESPACE_END(Grid);
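Putting the class to use is a three-step pattern: build the padded grids once, exchange halos, operate locally, then extract. A minimal sketch assuming a build where the class above is available and a periodic gauge-link field; the grid and field names are illustrative:

#include <Grid/Grid.h>
using namespace Grid;

void stencilWithHalo(GridCartesian *UGrid, const LatticeColourMatrix &U)
{
  const int depth = 1;                      // one-site halo in every dimension
  PaddedCell cell(depth, UGrid);            // allocates the enlarged grids
  LatticeColourMatrix Upad = cell.ExchangePeriodic(U);  // fill halos via comms
  // ... apply purely local stencils of range <= depth on Upad ...
  LatticeColourMatrix Uout = cell.Extract(Upad);        // drop the halo, back on UGrid
}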

View File

@ -165,7 +165,7 @@ class BinaryIO {
* FIXME -- 128^3 x 256 x 16 will overflow. * FIXME -- 128^3 x 256 x 16 will overflow.
*/ */
int64_t global_site; int global_site;
Lexicographic::CoorFromIndex(coor,local_site,local_vol); Lexicographic::CoorFromIndex(coor,local_site,local_vol);
@ -175,8 +175,8 @@ class BinaryIO {
Lexicographic::IndexFromCoor(coor,global_site,global_vol); Lexicographic::IndexFromCoor(coor,global_site,global_vol);
uint64_t gsite29 = global_site%29; uint32_t gsite29 = global_site%29;
uint64_t gsite31 = global_site%31; uint32_t gsite31 = global_site%31;
site_crc = crc32(0,(unsigned char *)site_buf,sizeof(fobj)); site_crc = crc32(0,(unsigned char *)site_buf,sizeof(fobj));
// std::cout << "Site "<<local_site << " crc "<<std::hex<<site_crc<<std::dec<<std::endl; // std::cout << "Site "<<local_site << " crc "<<std::hex<<site_crc<<std::dec<<std::endl;
@ -545,9 +545,7 @@ class BinaryIO {
const std::string &format, const std::string &format,
uint32_t &nersc_csum, uint32_t &nersc_csum,
uint32_t &scidac_csuma, uint32_t &scidac_csuma,
uint32_t &scidac_csumb, uint32_t &scidac_csumb)
int control=BINARYIO_LEXICOGRAPHIC
)
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::Realified::scalar_type word; word w=0; typedef typename vobj::Realified::scalar_type word; word w=0;
@ -558,7 +556,7 @@ class BinaryIO {
std::vector<sobj> scalardata(lsites); std::vector<sobj> scalardata(lsites);
std::vector<fobj> iodata(lsites); // Munge, checksum, byte order in here std::vector<fobj> iodata(lsites); // Munge, checksum, byte order in here
IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|control, IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
nersc_csum,scidac_csuma,scidac_csumb); nersc_csum,scidac_csuma,scidac_csumb);
GridStopWatch timer; GridStopWatch timer;
@ -584,8 +582,7 @@ class BinaryIO {
const std::string &format, const std::string &format,
uint32_t &nersc_csum, uint32_t &nersc_csum,
uint32_t &scidac_csuma, uint32_t &scidac_csuma,
uint32_t &scidac_csumb, uint32_t &scidac_csumb)
int control=BINARYIO_LEXICOGRAPHIC)
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
typedef typename vobj::Realified::scalar_type word; word w=0; typedef typename vobj::Realified::scalar_type word; word w=0;
@ -610,7 +607,7 @@ class BinaryIO {
while (attemptsLeft >= 0) while (attemptsLeft >= 0)
{ {
grid->Barrier(); grid->Barrier();
IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|control, IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
nersc_csum,scidac_csuma,scidac_csumb); nersc_csum,scidac_csuma,scidac_csumb);
if (checkWrite) if (checkWrite)
{ {
@ -620,7 +617,7 @@ class BinaryIO {
std::cout << GridLogMessage << "writeLatticeObject: read back object" << std::endl; std::cout << GridLogMessage << "writeLatticeObject: read back object" << std::endl;
grid->Barrier(); grid->Barrier();
IOobject(w,grid,ckiodata,file,ckoffset,format,BINARYIO_READ|control, IOobject(w,grid,ckiodata,file,ckoffset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
cknersc_csum,ckscidac_csuma,ckscidac_csumb); cknersc_csum,ckscidac_csuma,ckscidac_csumb);
if ((cknersc_csum != nersc_csum) or (ckscidac_csuma != scidac_csuma) or (ckscidac_csumb != scidac_csumb)) if ((cknersc_csum != nersc_csum) or (ckscidac_csuma != scidac_csuma) or (ckscidac_csumb != scidac_csumb))
{ {

View File

@ -206,7 +206,7 @@ class GridLimeReader : public BinaryIO {
// Read a generic lattice field and verify checksum // Read a generic lattice field and verify checksum
//////////////////////////////////////////// ////////////////////////////////////////////
template<class vobj> template<class vobj>
void readLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name,int control=BINARYIO_LEXICOGRAPHIC) void readLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
scidacChecksum scidacChecksum_; scidacChecksum scidacChecksum_;
@ -238,7 +238,7 @@ class GridLimeReader : public BinaryIO {
uint64_t offset= ftello(File); uint64_t offset= ftello(File);
// std::cout << " ReadLatticeObject from offset "<<offset << std::endl; // std::cout << " ReadLatticeObject from offset "<<offset << std::endl;
BinarySimpleMunger<sobj,sobj> munge; BinarySimpleMunger<sobj,sobj> munge;
BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb,control); BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
std::cout << GridLogMessage << "SciDAC checksum A " << std::hex << scidac_csuma << std::dec << std::endl; std::cout << GridLogMessage << "SciDAC checksum A " << std::hex << scidac_csuma << std::dec << std::endl;
std::cout << GridLogMessage << "SciDAC checksum B " << std::hex << scidac_csumb << std::dec << std::endl; std::cout << GridLogMessage << "SciDAC checksum B " << std::hex << scidac_csumb << std::dec << std::endl;
///////////////////////////////////////////// /////////////////////////////////////////////
@ -408,7 +408,7 @@ class GridLimeWriter : public BinaryIO
// in communicator used by the field.Grid() // in communicator used by the field.Grid()
//////////////////////////////////////////////////// ////////////////////////////////////////////////////
template<class vobj> template<class vobj>
void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name,int control=BINARYIO_LEXICOGRAPHIC) void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
{ {
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
// NB: FILE and iostream are jointly writing disjoint sequences in the // NB: FILE and iostream are jointly writing disjoint sequences in the
@ -459,7 +459,7 @@ class GridLimeWriter : public BinaryIO
/////////////////////////////////////////// ///////////////////////////////////////////
std::string format = getFormatString<vobj>(); std::string format = getFormatString<vobj>();
BinarySimpleMunger<sobj,sobj> munge; BinarySimpleMunger<sobj,sobj> munge;
BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb,control); BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb);
/////////////////////////////////////////// ///////////////////////////////////////////
// Wind forward and close the record // Wind forward and close the record
@ -512,8 +512,7 @@ class ScidacWriter : public GridLimeWriter {
//////////////////////////////////////////////// ////////////////////////////////////////////////
template <class vobj, class userRecord> template <class vobj, class userRecord>
void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord, void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord,
const unsigned int recordScientificPrec = 0, const unsigned int recordScientificPrec = 0)
int control=BINARYIO_LEXICOGRAPHIC)
{ {
GridBase * grid = field.Grid(); GridBase * grid = field.Grid();
@ -535,7 +534,7 @@ class ScidacWriter : public GridLimeWriter {
writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML)); writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
} }
// Collective call // Collective call
writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA),control); // Closes message with checksum writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA)); // Closes message with checksum
} }
}; };
@ -554,8 +553,7 @@ class ScidacReader : public GridLimeReader {
// Write generic lattice field in scidac format // Write generic lattice field in scidac format
//////////////////////////////////////////////// ////////////////////////////////////////////////
template <class vobj, class userRecord> template <class vobj, class userRecord>
void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord, void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord)
int control=BINARYIO_LEXICOGRAPHIC)
{ {
typedef typename vobj::scalar_object sobj; typedef typename vobj::scalar_object sobj;
GridBase * grid = field.Grid(); GridBase * grid = field.Grid();
@ -573,7 +571,7 @@ class ScidacReader : public GridLimeReader {
readLimeObject(header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message readLimeObject(header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message
readLimeObject(_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML)); readLimeObject(_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
readLimeObject(_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML)); readLimeObject(_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
readLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA),control); readLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA));
} }
void skipPastBinaryRecord(void) { void skipPastBinaryRecord(void) {
std::string rec_name(ILDG_BINARY_DATA); std::string rec_name(ILDG_BINARY_DATA);

View File

@ -42,10 +42,8 @@ using namespace Grid;
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
class NerscIO : public BinaryIO { class NerscIO : public BinaryIO {
public: public:
typedef Lattice<vLorentzColourMatrixD> GaugeField; typedef Lattice<vLorentzColourMatrixD> GaugeField;
// Enable/disable exiting if the plaquette in the header does not match the value computed (default true)
static bool & exitOnReadPlaquetteMismatch(){ static bool v=true; return v; }
static inline void truncate(std::string file){ static inline void truncate(std::string file){
std::ofstream fout(file,std::ios::out); std::ofstream fout(file,std::ios::out);
@ -205,7 +203,7 @@ public:
std::cerr << " nersc_csum " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl; std::cerr << " nersc_csum " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
exit(0); exit(0);
} }
if(exitOnReadPlaquetteMismatch()) assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 ); assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 ); assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
assert(nersc_csum == header.checksum ); assert(nersc_csum == header.checksum );

View File

@ -16,7 +16,7 @@
#ifdef __NVCC__ #ifdef __NVCC__
#pragma push #pragma push
#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ #if (__CUDACC_VER_MAJOR__ >= 11) && (__CUDACC_VER_MINOR__ >= 5)
#pragma nv_diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning" #pragma nv_diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"
#else #else
#pragma diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning" #pragma diag_suppress declared_but_not_referenced // suppress "function was declared but never referenced warning"

View File

@ -63,7 +63,6 @@ static constexpr int Ngp=2; // gparity index range
#define ColourIndex (2) #define ColourIndex (2)
#define SpinIndex (1) #define SpinIndex (1)
#define LorentzIndex (0) #define LorentzIndex (0)
#define GparityFlavourIndex (0)
// Also should make these a named enum type // Also should make these a named enum type
static constexpr int DaggerNo=0; static constexpr int DaggerNo=0;
@ -88,8 +87,6 @@ template<typename T> struct isCoarsened {
template <typename T> using IfCoarsened = Invoke<std::enable_if< isCoarsened<T>::value,int> > ; template <typename T> using IfCoarsened = Invoke<std::enable_if< isCoarsened<T>::value,int> > ;
template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ; template <typename T> using IfNotCoarsened = Invoke<std::enable_if<!isCoarsened<T>::value,int> > ;
const int GparityFlavourTensorIndex = 3; //TensorLevel counts from the bottom!
// ChrisK very keen to add extra space for Gparity doubling. // ChrisK very keen to add extra space for Gparity doubling.
// //
// Also add domain wall index, in a way where Wilson operator // Also add domain wall index, in a way where Wilson operator
@ -104,7 +101,6 @@ template<typename vtype> using iSpinMatrix = iScalar<iMatrix<iSca
template<typename vtype> using iColourMatrix = iScalar<iScalar<iMatrix<vtype, Nc> > > ; template<typename vtype> using iColourMatrix = iScalar<iScalar<iMatrix<vtype, Nc> > > ;
template<typename vtype> using iSpinColourMatrix = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >; template<typename vtype> using iSpinColourMatrix = iScalar<iMatrix<iMatrix<vtype, Nc>, Ns> >;
template<typename vtype> using iLorentzColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ; template<typename vtype> using iLorentzColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nd > ;
template<typename vtype> using iLorentzComplex = iVector<iScalar<iScalar<vtype> >, Nd > ;
template<typename vtype> using iDoubleStoredColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ; template<typename vtype> using iDoubleStoredColourMatrix = iVector<iScalar<iMatrix<vtype, Nc> >, Nds > ;
template<typename vtype> using iSpinVector = iScalar<iVector<iScalar<vtype>, Ns> >; template<typename vtype> using iSpinVector = iScalar<iVector<iScalar<vtype>, Ns> >;
template<typename vtype> using iColourVector = iScalar<iScalar<iVector<vtype, Nc> > >; template<typename vtype> using iColourVector = iScalar<iScalar<iVector<vtype, Nc> > >;
@ -114,10 +110,8 @@ template<typename vtype> using iHalfSpinColourVector = iScalar<iVector<iVec
template<typename vtype> using iSpinColourSpinColourMatrix = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >; template<typename vtype> using iSpinColourSpinColourMatrix = iScalar<iMatrix<iMatrix<iMatrix<iMatrix<vtype, Nc>, Ns>, Nc>, Ns> >;
template<typename vtype> using iGparityFlavourVector = iVector<iScalar<iScalar<vtype> >, Ngp>;
template<typename vtype> using iGparitySpinColourVector = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >; template<typename vtype> using iGparitySpinColourVector = iVector<iVector<iVector<vtype, Nc>, Ns>, Ngp >;
template<typename vtype> using iGparityHalfSpinColourVector = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >; template<typename vtype> using iGparityHalfSpinColourVector = iVector<iVector<iVector<vtype, Nc>, Nhs>, Ngp >;
template<typename vtype> using iGparityFlavourMatrix = iMatrix<iScalar<iScalar<vtype> >, Ngp>;
// Spin matrix // Spin matrix
typedef iSpinMatrix<Complex > SpinMatrix; typedef iSpinMatrix<Complex > SpinMatrix;
@ -127,7 +121,6 @@ typedef iSpinMatrix<ComplexD > SpinMatrixD;
typedef iSpinMatrix<vComplex > vSpinMatrix; typedef iSpinMatrix<vComplex > vSpinMatrix;
typedef iSpinMatrix<vComplexF> vSpinMatrixF; typedef iSpinMatrix<vComplexF> vSpinMatrixF;
typedef iSpinMatrix<vComplexD> vSpinMatrixD; typedef iSpinMatrix<vComplexD> vSpinMatrixD;
typedef iSpinMatrix<vComplexD2> vSpinMatrixD2;
// Colour Matrix // Colour Matrix
typedef iColourMatrix<Complex > ColourMatrix; typedef iColourMatrix<Complex > ColourMatrix;
@ -137,7 +130,6 @@ typedef iColourMatrix<ComplexD > ColourMatrixD;
typedef iColourMatrix<vComplex > vColourMatrix; typedef iColourMatrix<vComplex > vColourMatrix;
typedef iColourMatrix<vComplexF> vColourMatrixF; typedef iColourMatrix<vComplexF> vColourMatrixF;
typedef iColourMatrix<vComplexD> vColourMatrixD; typedef iColourMatrix<vComplexD> vColourMatrixD;
typedef iColourMatrix<vComplexD2> vColourMatrixD2;
// SpinColour matrix // SpinColour matrix
typedef iSpinColourMatrix<Complex > SpinColourMatrix; typedef iSpinColourMatrix<Complex > SpinColourMatrix;
@ -147,7 +139,6 @@ typedef iSpinColourMatrix<ComplexD > SpinColourMatrixD;
typedef iSpinColourMatrix<vComplex > vSpinColourMatrix; typedef iSpinColourMatrix<vComplex > vSpinColourMatrix;
typedef iSpinColourMatrix<vComplexF> vSpinColourMatrixF; typedef iSpinColourMatrix<vComplexF> vSpinColourMatrixF;
typedef iSpinColourMatrix<vComplexD> vSpinColourMatrixD; typedef iSpinColourMatrix<vComplexD> vSpinColourMatrixD;
typedef iSpinColourMatrix<vComplexD2> vSpinColourMatrixD2;
// SpinColourSpinColour matrix // SpinColourSpinColour matrix
typedef iSpinColourSpinColourMatrix<Complex > SpinColourSpinColourMatrix; typedef iSpinColourSpinColourMatrix<Complex > SpinColourSpinColourMatrix;
@ -157,7 +148,6 @@ typedef iSpinColourSpinColourMatrix<ComplexD > SpinColourSpinColourMatrixD;
typedef iSpinColourSpinColourMatrix<vComplex > vSpinColourSpinColourMatrix; typedef iSpinColourSpinColourMatrix<vComplex > vSpinColourSpinColourMatrix;
typedef iSpinColourSpinColourMatrix<vComplexF> vSpinColourSpinColourMatrixF; typedef iSpinColourSpinColourMatrix<vComplexF> vSpinColourSpinColourMatrixF;
typedef iSpinColourSpinColourMatrix<vComplexD> vSpinColourSpinColourMatrixD; typedef iSpinColourSpinColourMatrix<vComplexD> vSpinColourSpinColourMatrixD;
typedef iSpinColourSpinColourMatrix<vComplexD2> vSpinColourSpinColourMatrixD2;
// SpinColourSpinColour matrix // SpinColourSpinColour matrix
typedef iSpinColourSpinColourMatrix<Complex > SpinColourSpinColourMatrix; typedef iSpinColourSpinColourMatrix<Complex > SpinColourSpinColourMatrix;
@ -167,47 +157,24 @@ typedef iSpinColourSpinColourMatrix<ComplexD > SpinColourSpinColourMatrixD;
typedef iSpinColourSpinColourMatrix<vComplex > vSpinColourSpinColourMatrix; typedef iSpinColourSpinColourMatrix<vComplex > vSpinColourSpinColourMatrix;
typedef iSpinColourSpinColourMatrix<vComplexF> vSpinColourSpinColourMatrixF; typedef iSpinColourSpinColourMatrix<vComplexF> vSpinColourSpinColourMatrixF;
typedef iSpinColourSpinColourMatrix<vComplexD> vSpinColourSpinColourMatrixD; typedef iSpinColourSpinColourMatrix<vComplexD> vSpinColourSpinColourMatrixD;
typedef iSpinColourSpinColourMatrix<vComplexD2> vSpinColourSpinColourMatrixD2;
// LorentzColour // LorentzColour
typedef iLorentzColourMatrix<Complex > LorentzColourMatrix; typedef iLorentzColourMatrix<Complex > LorentzColourMatrix;
typedef iLorentzColourMatrix<ComplexF > LorentzColourMatrixF; typedef iLorentzColourMatrix<ComplexF > LorentzColourMatrixF;
typedef iLorentzColourMatrix<ComplexD > LorentzColourMatrixD; typedef iLorentzColourMatrix<ComplexD > LorentzColourMatrixD;
typedef iLorentzColourMatrix<vComplex > vLorentzColourMatrix; typedef iLorentzColourMatrix<vComplex > vLorentzColourMatrix;
typedef iLorentzColourMatrix<vComplexF> vLorentzColourMatrixF; typedef iLorentzColourMatrix<vComplexF> vLorentzColourMatrixF;
typedef iLorentzColourMatrix<vComplexD> vLorentzColourMatrixD; typedef iLorentzColourMatrix<vComplexD> vLorentzColourMatrixD;
typedef iLorentzColourMatrix<vComplexD2> vLorentzColourMatrixD2;
// LorentzComplex
typedef iLorentzComplex<Complex > LorentzComplex;
typedef iLorentzComplex<ComplexF > LorentzComplexF;
typedef iLorentzComplex<ComplexD > LorentzComplexD;
typedef iLorentzComplex<vComplex > vLorentzComplex;
typedef iLorentzComplex<vComplexF> vLorentzComplexF;
typedef iLorentzComplex<vComplexD> vLorentzComplexD;
// DoubleStored gauge field // DoubleStored gauge field
typedef iDoubleStoredColourMatrix<Complex > DoubleStoredColourMatrix; typedef iDoubleStoredColourMatrix<Complex > DoubleStoredColourMatrix;
typedef iDoubleStoredColourMatrix<ComplexF > DoubleStoredColourMatrixF; typedef iDoubleStoredColourMatrix<ComplexF > DoubleStoredColourMatrixF;
typedef iDoubleStoredColourMatrix<ComplexD > DoubleStoredColourMatrixD; typedef iDoubleStoredColourMatrix<ComplexD > DoubleStoredColourMatrixD;
typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix; typedef iDoubleStoredColourMatrix<vComplex > vDoubleStoredColourMatrix;
typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF; typedef iDoubleStoredColourMatrix<vComplexF> vDoubleStoredColourMatrixF;
typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD; typedef iDoubleStoredColourMatrix<vComplexD> vDoubleStoredColourMatrixD;
typedef iDoubleStoredColourMatrix<vComplexD2> vDoubleStoredColourMatrixD2;
//G-parity flavour matrix
typedef iGparityFlavourMatrix<Complex> GparityFlavourMatrix;
typedef iGparityFlavourMatrix<ComplexF> GparityFlavourMatrixF;
typedef iGparityFlavourMatrix<ComplexD> GparityFlavourMatrixD;
typedef iGparityFlavourMatrix<vComplex> vGparityFlavourMatrix;
typedef iGparityFlavourMatrix<vComplexF> vGparityFlavourMatrixF;
typedef iGparityFlavourMatrix<vComplexD> vGparityFlavourMatrixD;
typedef iGparityFlavourMatrix<vComplexD2> vGparityFlavourMatrixD2;
// Spin vector // Spin vector
typedef iSpinVector<Complex > SpinVector; typedef iSpinVector<Complex > SpinVector;
@ -217,7 +184,6 @@ typedef iSpinVector<ComplexD> SpinVectorD;
typedef iSpinVector<vComplex > vSpinVector; typedef iSpinVector<vComplex > vSpinVector;
typedef iSpinVector<vComplexF> vSpinVectorF; typedef iSpinVector<vComplexF> vSpinVectorF;
typedef iSpinVector<vComplexD> vSpinVectorD; typedef iSpinVector<vComplexD> vSpinVectorD;
typedef iSpinVector<vComplexD2> vSpinVectorD2;
// Colour vector // Colour vector
typedef iColourVector<Complex > ColourVector; typedef iColourVector<Complex > ColourVector;
@ -227,7 +193,6 @@ typedef iColourVector<ComplexD> ColourVectorD;
typedef iColourVector<vComplex > vColourVector; typedef iColourVector<vComplex > vColourVector;
typedef iColourVector<vComplexF> vColourVectorF; typedef iColourVector<vComplexF> vColourVectorF;
typedef iColourVector<vComplexD> vColourVectorD; typedef iColourVector<vComplexD> vColourVectorD;
typedef iColourVector<vComplexD2> vColourVectorD2;
// SpinColourVector // SpinColourVector
typedef iSpinColourVector<Complex > SpinColourVector; typedef iSpinColourVector<Complex > SpinColourVector;
@ -237,7 +202,6 @@ typedef iSpinColourVector<ComplexD> SpinColourVectorD;
typedef iSpinColourVector<vComplex > vSpinColourVector; typedef iSpinColourVector<vComplex > vSpinColourVector;
typedef iSpinColourVector<vComplexF> vSpinColourVectorF; typedef iSpinColourVector<vComplexF> vSpinColourVectorF;
typedef iSpinColourVector<vComplexD> vSpinColourVectorD; typedef iSpinColourVector<vComplexD> vSpinColourVectorD;
typedef iSpinColourVector<vComplexD2> vSpinColourVectorD2;
// HalfSpin vector // HalfSpin vector
typedef iHalfSpinVector<Complex > HalfSpinVector; typedef iHalfSpinVector<Complex > HalfSpinVector;
@ -247,27 +211,15 @@ typedef iHalfSpinVector<ComplexD> HalfSpinVectorD;
typedef iHalfSpinVector<vComplex > vHalfSpinVector; typedef iHalfSpinVector<vComplex > vHalfSpinVector;
typedef iHalfSpinVector<vComplexF> vHalfSpinVectorF; typedef iHalfSpinVector<vComplexF> vHalfSpinVectorF;
typedef iHalfSpinVector<vComplexD> vHalfSpinVectorD; typedef iHalfSpinVector<vComplexD> vHalfSpinVectorD;
typedef iHalfSpinVector<vComplexD2> vHalfSpinVectorD2;
// HalfSpinColour vector // HalfSpinColour vector
typedef iHalfSpinColourVector<Complex > HalfSpinColourVector; typedef iHalfSpinColourVector<Complex > HalfSpinColourVector;
typedef iHalfSpinColourVector<ComplexF> HalfSpinColourVectorF; typedef iHalfSpinColourVector<ComplexF> HalfSpinColourVectorF;
typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD; typedef iHalfSpinColourVector<ComplexD> HalfSpinColourVectorD;
typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector; typedef iHalfSpinColourVector<vComplex > vHalfSpinColourVector;
typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF; typedef iHalfSpinColourVector<vComplexF> vHalfSpinColourVectorF;
typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD; typedef iHalfSpinColourVector<vComplexD> vHalfSpinColourVectorD;
typedef iHalfSpinColourVector<vComplexD2> vHalfSpinColourVectorD2;
//G-parity flavour vector
typedef iGparityFlavourVector<Complex > GparityFlavourVector;
typedef iGparityFlavourVector<ComplexF> GparityFlavourVectorF;
typedef iGparityFlavourVector<ComplexD> GparityFlavourVectorD;
typedef iGparityFlavourVector<vComplex > vGparityFlavourVector;
typedef iGparityFlavourVector<vComplexF> vGparityFlavourVectorF;
typedef iGparityFlavourVector<vComplexD> vGparityFlavourVectorD;
typedef iGparityFlavourVector<vComplexD2> vGparityFlavourVectorD2;
// singlets // singlets
typedef iSinglet<Complex > TComplex; // FIXME This is painful. Tensor singlet complex type. typedef iSinglet<Complex > TComplex; // FIXME This is painful. Tensor singlet complex type.
@ -277,7 +229,6 @@ typedef iSinglet<ComplexD> TComplexD; // FIXME This is painful. Tenso
typedef iSinglet<vComplex > vTComplex ; // what if we don't know the tensor structure typedef iSinglet<vComplex > vTComplex ; // what if we don't know the tensor structure
typedef iSinglet<vComplexF> vTComplexF; // what if we don't know the tensor structure typedef iSinglet<vComplexF> vTComplexF; // what if we don't know the tensor structure
typedef iSinglet<vComplexD> vTComplexD; // what if we don't know the tensor structure typedef iSinglet<vComplexD> vTComplexD; // what if we don't know the tensor structure
typedef iSinglet<vComplexD2> vTComplexD2; // what if we don't know the tensor structure
typedef iSinglet<Real > TReal; // Shouldn't need these; can I make it work without? typedef iSinglet<Real > TReal; // Shouldn't need these; can I make it work without?
typedef iSinglet<RealF> TRealF; // Shouldn't need these; can I make it work without? typedef iSinglet<RealF> TRealF; // Shouldn't need these; can I make it work without?
@ -295,62 +246,47 @@ typedef iSinglet<Integer > TInteger;
typedef Lattice<vColourMatrix> LatticeColourMatrix; typedef Lattice<vColourMatrix> LatticeColourMatrix;
typedef Lattice<vColourMatrixF> LatticeColourMatrixF; typedef Lattice<vColourMatrixF> LatticeColourMatrixF;
typedef Lattice<vColourMatrixD> LatticeColourMatrixD; typedef Lattice<vColourMatrixD> LatticeColourMatrixD;
typedef Lattice<vColourMatrixD2> LatticeColourMatrixD2;
typedef Lattice<vSpinMatrix> LatticeSpinMatrix; typedef Lattice<vSpinMatrix> LatticeSpinMatrix;
typedef Lattice<vSpinMatrixF> LatticeSpinMatrixF; typedef Lattice<vSpinMatrixF> LatticeSpinMatrixF;
typedef Lattice<vSpinMatrixD> LatticeSpinMatrixD; typedef Lattice<vSpinMatrixD> LatticeSpinMatrixD;
typedef Lattice<vSpinMatrixD2> LatticeSpinMatrixD2;
typedef Lattice<vSpinColourMatrix> LatticeSpinColourMatrix; typedef Lattice<vSpinColourMatrix> LatticeSpinColourMatrix;
typedef Lattice<vSpinColourMatrixF> LatticeSpinColourMatrixF; typedef Lattice<vSpinColourMatrixF> LatticeSpinColourMatrixF;
typedef Lattice<vSpinColourMatrixD> LatticeSpinColourMatrixD; typedef Lattice<vSpinColourMatrixD> LatticeSpinColourMatrixD;
typedef Lattice<vSpinColourMatrixD2> LatticeSpinColourMatrixD2;
typedef Lattice<vSpinColourSpinColourMatrix> LatticeSpinColourSpinColourMatrix; typedef Lattice<vSpinColourSpinColourMatrix> LatticeSpinColourSpinColourMatrix;
typedef Lattice<vSpinColourSpinColourMatrixF> LatticeSpinColourSpinColourMatrixF; typedef Lattice<vSpinColourSpinColourMatrixF> LatticeSpinColourSpinColourMatrixF;
typedef Lattice<vSpinColourSpinColourMatrixD> LatticeSpinColourSpinColourMatrixD; typedef Lattice<vSpinColourSpinColourMatrixD> LatticeSpinColourSpinColourMatrixD;
typedef Lattice<vSpinColourSpinColourMatrixD2> LatticeSpinColourSpinColourMatrixD2;
typedef Lattice<vLorentzColourMatrix> LatticeLorentzColourMatrix; typedef Lattice<vLorentzColourMatrix> LatticeLorentzColourMatrix;
typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF; typedef Lattice<vLorentzColourMatrixF> LatticeLorentzColourMatrixF;
typedef Lattice<vLorentzColourMatrixD> LatticeLorentzColourMatrixD; typedef Lattice<vLorentzColourMatrixD> LatticeLorentzColourMatrixD;
typedef Lattice<vLorentzColourMatrixD2> LatticeLorentzColourMatrixD2;
typedef Lattice<vLorentzComplex> LatticeLorentzComplex;
typedef Lattice<vLorentzComplexF> LatticeLorentzComplexF;
typedef Lattice<vLorentzComplexD> LatticeLorentzComplexD;
// DoubleStored gauge field // DoubleStored gauge field
typedef Lattice<vDoubleStoredColourMatrix> LatticeDoubleStoredColourMatrix; typedef Lattice<vDoubleStoredColourMatrix> LatticeDoubleStoredColourMatrix;
typedef Lattice<vDoubleStoredColourMatrixF> LatticeDoubleStoredColourMatrixF; typedef Lattice<vDoubleStoredColourMatrixF> LatticeDoubleStoredColourMatrixF;
typedef Lattice<vDoubleStoredColourMatrixD> LatticeDoubleStoredColourMatrixD; typedef Lattice<vDoubleStoredColourMatrixD> LatticeDoubleStoredColourMatrixD;
typedef Lattice<vDoubleStoredColourMatrixD2> LatticeDoubleStoredColourMatrixD2;
typedef Lattice<vSpinVector> LatticeSpinVector; typedef Lattice<vSpinVector> LatticeSpinVector;
typedef Lattice<vSpinVectorF> LatticeSpinVectorF; typedef Lattice<vSpinVectorF> LatticeSpinVectorF;
typedef Lattice<vSpinVectorD> LatticeSpinVectorD; typedef Lattice<vSpinVectorD> LatticeSpinVectorD;
typedef Lattice<vSpinVectorD2> LatticeSpinVectorD2;
typedef Lattice<vColourVector> LatticeColourVector; typedef Lattice<vColourVector> LatticeColourVector;
typedef Lattice<vColourVectorF> LatticeColourVectorF; typedef Lattice<vColourVectorF> LatticeColourVectorF;
typedef Lattice<vColourVectorD> LatticeColourVectorD; typedef Lattice<vColourVectorD> LatticeColourVectorD;
typedef Lattice<vColourVectorD2> LatticeColourVectorD2;
typedef Lattice<vSpinColourVector> LatticeSpinColourVector; typedef Lattice<vSpinColourVector> LatticeSpinColourVector;
typedef Lattice<vSpinColourVectorF> LatticeSpinColourVectorF; typedef Lattice<vSpinColourVectorF> LatticeSpinColourVectorF;
typedef Lattice<vSpinColourVectorD> LatticeSpinColourVectorD; typedef Lattice<vSpinColourVectorD> LatticeSpinColourVectorD;
typedef Lattice<vSpinColourVectorD2> LatticeSpinColourVectorD2;
typedef Lattice<vHalfSpinVector> LatticeHalfSpinVector; typedef Lattice<vHalfSpinVector> LatticeHalfSpinVector;
typedef Lattice<vHalfSpinVectorF> LatticeHalfSpinVectorF; typedef Lattice<vHalfSpinVectorF> LatticeHalfSpinVectorF;
typedef Lattice<vHalfSpinVectorD> LatticeHalfSpinVectorD; typedef Lattice<vHalfSpinVectorD> LatticeHalfSpinVectorD;
typedef Lattice<vHalfSpinVectorD2> LatticeHalfSpinVectorD2;
typedef Lattice<vHalfSpinColourVector> LatticeHalfSpinColourVector; typedef Lattice<vHalfSpinColourVector> LatticeHalfSpinColourVector;
typedef Lattice<vHalfSpinColourVectorF> LatticeHalfSpinColourVectorF; typedef Lattice<vHalfSpinColourVectorF> LatticeHalfSpinColourVectorF;
typedef Lattice<vHalfSpinColourVectorD> LatticeHalfSpinColourVectorD; typedef Lattice<vHalfSpinColourVectorD> LatticeHalfSpinColourVectorD;
typedef Lattice<vHalfSpinColourVectorD2> LatticeHalfSpinColourVectorD2;
typedef Lattice<vTReal> LatticeReal; typedef Lattice<vTReal> LatticeReal;
typedef Lattice<vTRealF> LatticeRealF; typedef Lattice<vTRealF> LatticeRealF;
@ -359,7 +295,6 @@ typedef Lattice<vTRealD> LatticeRealD;
typedef Lattice<vTComplex> LatticeComplex; typedef Lattice<vTComplex> LatticeComplex;
typedef Lattice<vTComplexF> LatticeComplexF; typedef Lattice<vTComplexF> LatticeComplexF;
typedef Lattice<vTComplexD> LatticeComplexD; typedef Lattice<vTComplexD> LatticeComplexD;
typedef Lattice<vTComplexD2> LatticeComplexD2;
typedef Lattice<vTInteger> LatticeInteger; // Predicates for "where" typedef Lattice<vTInteger> LatticeInteger; // Predicates for "where"
@ -367,42 +302,37 @@ typedef Lattice<vTInteger> LatticeInteger; // Predicates for "where"
/////////////////////////////////////////// ///////////////////////////////////////////
// Physical names for things // Physical names for things
/////////////////////////////////////////// ///////////////////////////////////////////
typedef LatticeHalfSpinColourVector LatticeHalfFermion; typedef LatticeHalfSpinColourVector LatticeHalfFermion;
typedef LatticeHalfSpinColourVectorF LatticeHalfFermionF; typedef LatticeHalfSpinColourVectorF LatticeHalfFermionF;
typedef LatticeHalfSpinColourVectorD LatticeHalfFermionD; typedef LatticeHalfSpinColourVectorF LatticeHalfFermionD;
typedef LatticeHalfSpinColourVectorD2 LatticeHalfFermionD2;
typedef LatticeSpinColourVector LatticeFermion; typedef LatticeSpinColourVector LatticeFermion;
typedef LatticeSpinColourVectorF LatticeFermionF; typedef LatticeSpinColourVectorF LatticeFermionF;
typedef LatticeSpinColourVectorD LatticeFermionD; typedef LatticeSpinColourVectorD LatticeFermionD;
typedef LatticeSpinColourVectorD2 LatticeFermionD2;
typedef LatticeSpinColourMatrix LatticePropagator; typedef LatticeSpinColourMatrix LatticePropagator;
typedef LatticeSpinColourMatrixF LatticePropagatorF; typedef LatticeSpinColourMatrixF LatticePropagatorF;
typedef LatticeSpinColourMatrixD LatticePropagatorD; typedef LatticeSpinColourMatrixD LatticePropagatorD;
typedef LatticeSpinColourMatrixD2 LatticePropagatorD2;
typedef LatticeLorentzColourMatrix LatticeGaugeField; typedef LatticeLorentzColourMatrix LatticeGaugeField;
typedef LatticeLorentzColourMatrixF LatticeGaugeFieldF; typedef LatticeLorentzColourMatrixF LatticeGaugeFieldF;
typedef LatticeLorentzColourMatrixD LatticeGaugeFieldD; typedef LatticeLorentzColourMatrixD LatticeGaugeFieldD;
typedef LatticeLorentzColourMatrixD2 LatticeGaugeFieldD2;
typedef LatticeDoubleStoredColourMatrix LatticeDoubledGaugeField; typedef LatticeDoubleStoredColourMatrix LatticeDoubledGaugeField;
typedef LatticeDoubleStoredColourMatrixF LatticeDoubledGaugeFieldF; typedef LatticeDoubleStoredColourMatrixF LatticeDoubledGaugeFieldF;
typedef LatticeDoubleStoredColourMatrixD LatticeDoubledGaugeFieldD; typedef LatticeDoubleStoredColourMatrixD LatticeDoubledGaugeFieldD;
typedef LatticeDoubleStoredColourMatrixD2 LatticeDoubledGaugeFieldD2;
template<class GF> using LorentzScalar = Lattice<iScalar<typename GF::vector_object::element> >; template<class GF> using LorentzScalar = Lattice<iScalar<typename GF::vector_object::element> >;
// Uhgg... typing this hurt ;)
// (my keyboard got burning hot when I typed this, must be the anti-Fermion)
typedef Lattice<vColourVector> LatticeStaggeredFermion; typedef Lattice<vColourVector> LatticeStaggeredFermion;
typedef Lattice<vColourVectorF> LatticeStaggeredFermionF; typedef Lattice<vColourVectorF> LatticeStaggeredFermionF;
typedef Lattice<vColourVectorD> LatticeStaggeredFermionD; typedef Lattice<vColourVectorD> LatticeStaggeredFermionD;
typedef Lattice<vColourVectorD2> LatticeStaggeredFermionD2;
typedef Lattice<vColourMatrix> LatticeStaggeredPropagator; typedef Lattice<vColourMatrix> LatticeStaggeredPropagator;
typedef Lattice<vColourMatrixF> LatticeStaggeredPropagatorF; typedef Lattice<vColourMatrixF> LatticeStaggeredPropagatorF;
typedef Lattice<vColourMatrixD> LatticeStaggeredPropagatorD; typedef Lattice<vColourMatrixD> LatticeStaggeredPropagatorD;
typedef Lattice<vColourMatrixD2> LatticeStaggeredPropagatorD2;
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
// Peek and Poke named after physics attributes // Peek and Poke named after physics attributes

@ -34,96 +34,16 @@ directory
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
///////////////////////////////////
// Smart configuration base class
///////////////////////////////////
template< class Field >
class ConfigurationBase
{
public:
ConfigurationBase() {}
virtual ~ConfigurationBase() {}
virtual void set_Field(Field& U) =0;
virtual void smeared_force(Field&) = 0;
virtual Field& get_SmearedU() =0;
virtual Field &get_U(bool smeared = false) = 0;
};
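// A minimal sketch of a concrete container satisfying the interface above,
// assuming no smearing is wanted; PlainConfiguration is a hypothetical name.
template <class Field>
class PlainConfiguration : public ConfigurationBase<Field> {
  Field Uthin;
public:
  PlainConfiguration(GridBase *grid) : Uthin(grid) {}
  virtual void set_Field(Field &U) { Uthin = U; }
  virtual void smeared_force(Field &) {}            // nothing to chain-rule
  virtual Field &get_SmearedU() { return Uthin; }   // smeared == thin here
  virtual Field &get_U(bool smeared = false) { return Uthin; }
};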
template <class GaugeField > template <class GaugeField >
class Action class Action
{ {
public: public:
bool is_smeared = false; bool is_smeared = false;
RealD deriv_norm_sum;
RealD deriv_max_sum;
RealD Fdt_norm_sum;
RealD Fdt_max_sum;
int deriv_num;
RealD deriv_us;
RealD S_us;
RealD refresh_us;
void reset_timer(void) {
deriv_us = S_us = refresh_us = 0.0;
deriv_norm_sum = deriv_max_sum=0.0;
Fdt_max_sum = Fdt_norm_sum = 0.0;
deriv_num=0;
}
void deriv_log(RealD nrm, RealD max,RealD Fdt_nrm,RealD Fdt_max) {
if ( max > deriv_max_sum ) {
deriv_max_sum=max;
}
deriv_norm_sum+=nrm;
if ( Fdt_max > Fdt_max_sum ) {
Fdt_max_sum=Fdt_max;
}
Fdt_norm_sum+=Fdt_nrm; deriv_num++;
}
RealD deriv_max_average(void) { return deriv_max_sum; };
RealD deriv_norm_average(void) { return deriv_norm_sum/deriv_num; };
RealD Fdt_max_average(void) { return Fdt_max_sum; };
RealD Fdt_norm_average(void) { return Fdt_norm_sum/deriv_num; };
RealD deriv_timer(void) { return deriv_us; };
RealD S_timer(void) { return S_us; };
RealD refresh_timer(void) { return refresh_us; };
void deriv_timer_start(void) { deriv_us-=usecond(); }
void deriv_timer_stop(void) { deriv_us+=usecond(); }
void refresh_timer_start(void) { refresh_us-=usecond(); }
void refresh_timer_stop(void) { refresh_us+=usecond(); }
void S_timer_start(void) { S_us-=usecond(); }
void S_timer_stop(void) { S_us+=usecond(); }
/////////////////////////////
// Heatbath? // Heatbath?
/////////////////////////////
virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) = 0; // refresh pseudofermions
virtual RealD S(const GaugeField& U) = 0; // evaluate the action virtual RealD S(const GaugeField& U) = 0; // evaluate the action
virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ; // if the refresh computes the action, can cache it. Alternately refreshAndAction() ?
virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative
/////////////////////////////////////////////////////////////
// virtual smeared interface through configuration container
/////////////////////////////////////////////////////////////
virtual void refresh(ConfigurationBase<GaugeField> & U, GridSerialRNG &sRNG, GridParallelRNG& pRNG)
{
refresh(U.get_U(is_smeared),sRNG,pRNG);
}
virtual RealD S(ConfigurationBase<GaugeField>& U)
{
return S(U.get_U(is_smeared));
}
virtual RealD Sinitial(ConfigurationBase<GaugeField>& U)
{
return Sinitial(U.get_U(is_smeared));
}
virtual void deriv(ConfigurationBase<GaugeField>& U, GaugeField& dSdU)
{
deriv(U.get_U(is_smeared),dSdU);
if ( is_smeared ) {
U.smeared_force(dSdU);
}
}
///////////////////////////////
// Logging
///////////////////////////////
virtual std::string action_name() = 0; // return the action name virtual std::string action_name() = 0; // return the action name
virtual std::string LogParameters() = 0; // prints action parameters virtual std::string LogParameters() = 0; // prints action parameters
virtual ~Action(){} virtual ~Action(){}
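// A hedged sketch of driving the accounting hooks above from an integrator
// step; report_force is a hypothetical free function, not code from this
// source, and the force norms are supplied by the caller.
void report_force(Action<LatticeGaugeFieldD> &act,
                  LatticeGaugeFieldD &U, LatticeGaugeFieldD &dSdU,
                  RealD f_nrm, RealD f_max, RealD Fdt_nrm, RealD Fdt_max) {
  act.deriv_timer_start();
  act.deriv(U, dSdU);                       // timed force evaluation
  act.deriv_timer_stop();
  act.deriv_log(f_nrm, f_max, Fdt_nrm, Fdt_max);
  std::cout << act.action_name()
            << " <|dSdU|> "  << act.deriv_norm_average()
            << " max|dSdU| " << act.deriv_max_average() << std::endl;
}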

@ -30,8 +30,6 @@ directory
#ifndef QCD_ACTION_CORE #ifndef QCD_ACTION_CORE
#define QCD_ACTION_CORE #define QCD_ACTION_CORE
#include <Grid/qcd/action/gauge/GaugeImplementations.h>
#include <Grid/qcd/action/ActionBase.h> #include <Grid/qcd/action/ActionBase.h>
NAMESPACE_CHECK(ActionBase); NAMESPACE_CHECK(ActionBase);
#include <Grid/qcd/action/ActionSet.h> #include <Grid/qcd/action/ActionSet.h>
@ -39,10 +37,6 @@ NAMESPACE_CHECK(ActionSet);
#include <Grid/qcd/action/ActionParams.h> #include <Grid/qcd/action/ActionParams.h>
NAMESPACE_CHECK(ActionParams); NAMESPACE_CHECK(ActionParams);
#include <Grid/qcd/action/filters/MomentumFilter.h>
#include <Grid/qcd/action/filters/DirichletFilter.h>
#include <Grid/qcd/action/filters/DDHMCFilter.h>
//////////////////////////////////////////// ////////////////////////////////////////////
// Gauge Actions // Gauge Actions
//////////////////////////////////////////// ////////////////////////////////////////////

@ -34,45 +34,27 @@ directory
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
// These can move into a params header and be given MacroMagic serialisation
struct GparityWilsonImplParams { struct GparityWilsonImplParams {
Coordinate twists; Coordinate twists;
//mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs GparityWilsonImplParams() : twists(Nd, 0) {};
Coordinate dirichlet; // Blocksize of dirichlet BCs
int partialDirichlet;
GparityWilsonImplParams() : twists(Nd, 0) {
dirichlet.resize(0);
partialDirichlet=0;
};
}; };
struct WilsonImplParams { struct WilsonImplParams {
bool overlapCommsCompute; bool overlapCommsCompute;
Coordinate dirichlet; // Blocksize of dirichlet BCs
int partialDirichlet;
AcceleratorVector<Real,Nd> twist_n_2pi_L; AcceleratorVector<Real,Nd> twist_n_2pi_L;
AcceleratorVector<Complex,Nd> boundary_phases; AcceleratorVector<Complex,Nd> boundary_phases;
WilsonImplParams() { WilsonImplParams() {
dirichlet.resize(0);
partialDirichlet=0;
boundary_phases.resize(Nd, 1.0); boundary_phases.resize(Nd, 1.0);
twist_n_2pi_L.resize(Nd, 0.0); twist_n_2pi_L.resize(Nd, 0.0);
}; };
WilsonImplParams(const AcceleratorVector<Complex,Nd> phi) : boundary_phases(phi), overlapCommsCompute(false) { WilsonImplParams(const AcceleratorVector<Complex,Nd> phi) : boundary_phases(phi), overlapCommsCompute(false) {
twist_n_2pi_L.resize(Nd, 0.0); twist_n_2pi_L.resize(Nd, 0.0);
partialDirichlet=0;
dirichlet.resize(0);
} }
}; };
struct StaggeredImplParams { struct StaggeredImplParams {
Coordinate dirichlet; // Blocksize of dirichlet BCs StaggeredImplParams() {};
int partialDirichlet;
StaggeredImplParams()
{
partialDirichlet=0;
dirichlet.resize(0);
};
}; };
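// A hedged example of filling these parameter structs; makeAntiperiodicParams
// is a hypothetical helper and the Dirichlet values are illustrative.
WilsonImplParams makeAntiperiodicParams(void) {
  WilsonImplParams wp;
  for (int mu = 0; mu < Nd; mu++) wp.boundary_phases[mu] = 1.0;
  wp.boundary_phases[Nd-1] = -1.0;          // antiperiodic in time
  wp.dirichlet = Coordinate({0, 0, 0, 16}); // block extent 16 in time (assumed semantics)
  wp.partialDirichlet = 0;                  // 0 disables the partial variant (assumed)
  return wp;
}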
struct OneFlavourRationalParams : Serializable { struct OneFlavourRationalParams : Serializable {
@ -81,11 +63,9 @@ struct StaggeredImplParams {
RealD, hi, RealD, hi,
int, MaxIter, int, MaxIter,
RealD, tolerance, RealD, tolerance,
RealD, mdtolerance,
int, degree, int, degree,
int, precision, int, precision,
int, BoundsCheckFreq, int, BoundsCheckFreq);
RealD, BoundsCheckTol);
// MaxIter and tolerance, vectors?? // MaxIter and tolerance, vectors??
@ -96,62 +76,16 @@ struct StaggeredImplParams {
RealD tol = 1.0e-8, RealD tol = 1.0e-8,
int _degree = 10, int _degree = 10,
int _precision = 64, int _precision = 64,
int _BoundsCheckFreq=20, int _BoundsCheckFreq=20)
RealD mdtol = 1.0e-6,
double _BoundsCheckTol=1e-6)
: lo(_lo), : lo(_lo),
hi(_hi), hi(_hi),
MaxIter(_maxit), MaxIter(_maxit),
tolerance(tol), tolerance(tol),
mdtolerance(mdtol),
degree(_degree), degree(_degree),
precision(_precision), precision(_precision),
BoundsCheckFreq(_BoundsCheckFreq), BoundsCheckFreq(_BoundsCheckFreq){};
BoundsCheckTol(_BoundsCheckTol){};
}; };
/*Action parameters for the generalized rational action
The approximation is for (M^dag M)^{1/inv_pow}
where inv_pow is the denominator of the fractional power.
Default inv_pow=2 for square root, making this equivalent to
the OneFlavourRational action
*/
struct RationalActionParams : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(RationalActionParams,
int, inv_pow,
RealD, lo, //low eigenvalue bound of rational approx
RealD, hi, //high eigenvalue bound of rational approx
int, MaxIter, //maximum iterations in msCG
RealD, action_tolerance, //msCG tolerance in action evaluation
int, action_degree, //rational approx tolerance in action evaluation
RealD, md_tolerance, //msCG tolerance in MD integration
int, md_degree, //rational approx tolerance in MD integration
int, precision, //precision of floating point arithmetic
int, BoundsCheckFreq); //frequency the approximation is tested (with Metropolis degree/tolerance); 0 disables the check
// constructor
RationalActionParams(int _inv_pow = 2,
RealD _lo = 0.0,
RealD _hi = 1.0,
int _maxit = 1000,
RealD _action_tolerance = 1.0e-8,
int _action_degree = 10,
RealD _md_tolerance = 1.0e-8,
int _md_degree = 10,
int _precision = 64,
int _BoundsCheckFreq=20)
: inv_pow(_inv_pow),
lo(_lo),
hi(_hi),
MaxIter(_maxit),
action_tolerance(_action_tolerance),
action_degree(_action_degree),
md_tolerance(_md_tolerance),
md_degree(_md_degree),
precision(_precision),
BoundsCheckFreq(_BoundsCheckFreq){};
};
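// A hedged example: request an approximation to (M^dag M)^{1/4} by setting
// inv_pow = 4; the bounds, degrees and tolerances here are illustrative.
RationalActionParams quarterRoot(/*inv_pow*/ 4,
                                 /*lo*/ 1.0e-4, /*hi*/ 64.0,
                                 /*MaxIter*/ 10000,
                                 /*action_tolerance*/ 1.0e-10, /*action_degree*/ 14,
                                 /*md_tolerance*/ 1.0e-8, /*md_degree*/ 12);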
NAMESPACE_END(Grid); NAMESPACE_END(Grid);
#endif #endif

@ -71,7 +71,6 @@ public:
RealD Mass(void) { return (mass_plus + mass_minus) / 2.0; }; RealD Mass(void) { return (mass_plus + mass_minus) / 2.0; };
RealD MassPlus(void) { return mass_plus; }; RealD MassPlus(void) { return mass_plus; };
RealD MassMinus(void) { return mass_minus; }; RealD MassMinus(void) { return mass_minus; };
void SetMass(RealD _mass) { void SetMass(RealD _mass) {
mass_plus=mass_minus=_mass; mass_plus=mass_minus=_mass;
SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c); // Reset coeffs SetCoefficientsInternal(_zolo_hi,_gamma,_b,_c); // Reset coeffs
@ -183,6 +182,16 @@ public:
GridRedBlackCartesian &FourDimRedBlackGrid, GridRedBlackCartesian &FourDimRedBlackGrid,
RealD _mass,RealD _M5,const ImplParams &p= ImplParams()); RealD _mass,RealD _M5,const ImplParams &p= ImplParams());
void CayleyReport(void);
void CayleyZeroCounters(void);
double M5Dflops;
double M5Dcalls;
double M5Dtime;
double MooeeInvFlops;
double MooeeInvCalls;
double MooeeInvTime;
protected: protected:
virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c); virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);

@ -140,7 +140,6 @@ public:
return NMAX; return NMAX;
} }
static int getNMAX(Lattice<iImplClover<vComplexD2>> &t, RealD R) {return getNMAX(1e-12,R);}
static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);} static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);} static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
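// A hedged reading of the overloads above: getNMAX(prec, R) presumably
// returns the lowest Taylor order N whose truncation error on the clover
// exponential, of order R^(N+1)/(N+1)!, falls below prec, so single
// precision (1e-6) truncates earlier than double precision (1e-12).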

@ -1,291 +0,0 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/qcd/action/fermion/DWFSlow.h
Copyright (C) 2022
Author: Peter Boyle <pboyle@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#pragma once
NAMESPACE_BEGIN(Grid);
template <class Impl>
class DWFSlowFermion : public FermionOperator<Impl>
{
public:
INHERIT_IMPL_TYPES(Impl);
///////////////////////////////////////////////////////////////
// Implement the abstract base
///////////////////////////////////////////////////////////////
GridBase *GaugeGrid(void) { return _grid4; }
GridBase *GaugeRedBlackGrid(void) { return _cbgrid4; }
GridBase *FermionGrid(void) { return _grid; }
GridBase *FermionRedBlackGrid(void) { return _cbgrid; }
FermionField _tmp;
FermionField &tmp(void) { return _tmp; }
//////////////////////////////////////////////////////////////////
// override multiply; passing a dagger argument cuts the number of routines
// and also makes the interface more uniformly consistent
//////////////////////////////////////////////////////////////////
virtual void M(const FermionField &in, FermionField &out)
{
FermionField tmp(_grid);
out = (5.0 - M5) * in;
Dhop(in,tmp,DaggerNo);
out = out + tmp;
}
virtual void Mdag(const FermionField &in, FermionField &out)
{
FermionField tmp(_grid);
out = (5.0 - M5) * in;
Dhop(in,tmp,DaggerYes);
out = out + tmp;
};
/////////////////////////////////////////////////////////
// half checkerboard operations; 5D redblack so just site identity
/////////////////////////////////////////////////////////
void Meooe(const FermionField &in, FermionField &out)
{
if ( in.Checkerboard() == Odd ) {
this->DhopEO(in,out,DaggerNo);
} else {
this->DhopOE(in,out,DaggerNo);
}
}
void MeooeDag(const FermionField &in, FermionField &out)
{
if ( in.Checkerboard() == Odd ) {
this->DhopEO(in,out,DaggerYes);
} else {
this->DhopOE(in,out,DaggerYes);
}
};
// allow override for twisted mass and clover
virtual void Mooee(const FermionField &in, FermionField &out)
{
out = (5.0 - M5) * in;
}
virtual void MooeeDag(const FermionField &in, FermionField &out)
{
out = (5.0 - M5) * in;
}
virtual void MooeeInv(const FermionField &in, FermionField &out)
{
out = (1.0/(5.0 - M5)) * in;
};
virtual void MooeeInvDag(const FermionField &in, FermionField &out)
{
out = (1.0/(5.0 - M5)) * in;
};
virtual void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _mass,std::vector<double> twist) {} ;
////////////////////////
// Derivative interface
////////////////////////
// Interface calls an internal routine
void DhopDeriv(GaugeField &mat,const FermionField &U,const FermionField &V,int dag) { assert(0);};
void DhopDerivOE(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){ assert(0);};
void DhopDerivEO(GaugeField &mat,const FermionField &U,const FermionField &V,int dag){ assert(0);};
///////////////////////////////////////////////////////////////
// non-hermitian hopping term; half cb or both
///////////////////////////////////////////////////////////////
void Dhop(const FermionField &in, FermionField &out, int dag)
{
FermionField tmp(in.Grid());
Dhop5(in,out,MassField,MassField,dag );
for(int mu=0;mu<4;mu++){
DhopDirU(in,Umu[mu],Umu[mu],tmp,mu,dag ); out = out + tmp;
}
};
void DhopOE(const FermionField &in, FermionField &out, int dag)
{
FermionField tmp(in.Grid());
assert(in.Checkerboard()==Even);
Dhop5(in,out,MassFieldOdd,MassFieldEven,dag);
for(int mu=0;mu<4;mu++){
DhopDirU(in,UmuOdd[mu],UmuEven[mu],tmp,mu,dag ); out = out + tmp;
}
};
void DhopEO(const FermionField &in, FermionField &out, int dag)
{
FermionField tmp(in.Grid());
assert(in.Checkerboard()==Odd);
Dhop5(in,out, MassFieldEven,MassFieldOdd ,dag );
for(int mu=0;mu<4;mu++){
DhopDirU(in,UmuEven[mu],UmuOdd[mu],tmp,mu,dag ); out = out + tmp;
}
};
///////////////////////////////////////////////////////////////
// Multigrid assistance; force term uses too
///////////////////////////////////////////////////////////////
void Mdir(const FermionField &in, FermionField &out, int dir, int disp){ assert(0);};
void MdirAll(const FermionField &in, std::vector<FermionField> &out) { assert(0);};
void DhopDir(const FermionField &in, FermionField &out, int dir, int disp) { assert(0);};
void DhopDirAll(const FermionField &in, std::vector<FermionField> &out) { assert(0);};
void DhopDirCalc(const FermionField &in, FermionField &out, int dirdisp,int gamma, int dag) { assert(0);};
void DhopDirU(const FermionField &in, const GaugeLinkField &U5e, const GaugeLinkField &U5o, FermionField &out, int mu, int dag)
{
RealD sgn= 1.0;
if (dag ) sgn=-1.0;
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
// mass is 1,1,1,1,-m and has to multiply the around-the-world term
FermionField tmp (in.Grid());
tmp = U5e * Cshift(in,mu+1,1);
out = tmp - Gamma(Gmu[mu])*tmp*sgn;
tmp = Cshift(adj(U5o)*in,mu+1,-1);
out = out + tmp + Gamma(Gmu[mu])*tmp*sgn;
out = -0.5*out;
};
void Dhop5(const FermionField &in, FermionField &out, ComplexField &massE, ComplexField &massO, int dag)
{
// Mass term.... must multiply the around-the-world hop with mass = 1,1,1,1, -m
RealD sgn= 1.0;
if (dag ) sgn=-1.0;
Gamma G5(Gamma::Algebra::Gamma5);
FermionField tmp (in.Grid());
tmp = massE*Cshift(in,0,1);
out = tmp - G5*tmp*sgn;
tmp = Cshift(massO*in,0,-1);
out = out + tmp + G5*tmp*sgn;
out = -0.5*out;
};
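// In formulae (a reading of the loop above, with sgn = +1 for DaggerNo,
// sgn = -1 for DaggerYes, and P(-/+) = (1 -/+ gamma5)/2):
//
//   Dhop5 psi(s) = - [ massE(s) P- psi(s+1) + massO(s-1) P+ psi(s-1) ]
//
// with the projectors swapped under the dagger; massE/massO are 1 on bulk
// s-slices and -mass on the hop that wraps the s = Ls-1 boundary.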
// Constructor
DWFSlowFermion(GaugeField &_Umu, GridCartesian &Fgrid,
GridRedBlackCartesian &Hgrid, RealD _mass, RealD _M5)
:
_grid(&Fgrid),
_cbgrid(&Hgrid),
_grid4(_Umu.Grid()),
Umu(Nd,&Fgrid),
UmuEven(Nd,&Hgrid),
UmuOdd(Nd,&Hgrid),
MassField(&Fgrid),
MassFieldEven(&Hgrid),
MassFieldOdd(&Hgrid),
M5(_M5),
mass(_mass),
_tmp(&Hgrid)
{
Ls=Fgrid._fdimensions[0];
ImportGauge(_Umu);
typedef typename FermionField::scalar_type scalar;
Lattice<iScalar<vInteger> > coor(&Fgrid);
LatticeCoordinate(coor, 0); // Scoor
ComplexField one(&Fgrid);
MassField =scalar(-mass);
one =scalar(1.0);
MassField =where(coor==Integer(Ls-1),MassField,one);
for(int mu=0;mu<Nd;mu++){
pickCheckerboard(Even,UmuEven[mu],Umu[mu]);
pickCheckerboard(Odd ,UmuOdd[mu],Umu[mu]);
}
pickCheckerboard(Even,MassFieldEven,MassField);
pickCheckerboard(Odd ,MassFieldOdd,MassField);
}
// DoubleStore impl dependent
void ImportGauge(const GaugeField &_Umu4)
{
GaugeLinkField U4(_grid4);
for(int mu=0;mu<Nd;mu++){
U4 = PeekIndex<LorentzIndex>(_Umu4, mu);
for(int s=0;s<this->Ls;s++){
InsertSlice(U4,Umu[mu],s,0);
}
}
}
///////////////////////////////////////////////////////////////
// Data members require to support the functionality
///////////////////////////////////////////////////////////////
public:
virtual RealD Mass(void) { return mass; }
virtual int isTrivialEE(void) { return 1; };
RealD mass;
RealD M5;
int Ls;
GridBase *_grid4;
GridBase *_grid;
GridBase *_cbgrid4;
GridBase *_cbgrid;
// Copy of the gauge field , with even and odd subsets
std::vector<GaugeLinkField> Umu;
std::vector<GaugeLinkField> UmuEven;
std::vector<GaugeLinkField> UmuOdd;
ComplexField MassField;
ComplexField MassFieldEven;
ComplexField MassFieldOdd;
///////////////////////////////////////////////////////////////
// Conserved current utilities
///////////////////////////////////////////////////////////////
void ContractConservedCurrent(PropagatorField &q_in_1,
PropagatorField &q_in_2,
PropagatorField &q_out,
PropagatorField &phys_src,
Current curr_type,
unsigned int mu){}
void SeqConservedCurrent(PropagatorField &q_in,
PropagatorField &q_out,
PropagatorField &phys_src,
Current curr_type,
unsigned int mu,
unsigned int tmin,
unsigned int tmax,
ComplexField &lattice_cmplx){}
};
typedef DWFSlowFermion<WilsonImplF> DWFSlowFermionF;
typedef DWFSlowFermion<WilsonImplD> DWFSlowFermionD;
NAMESPACE_END(Grid);
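// A hedged, self-contained driver sketch for the reference operator above;
// demo_dwf_slow is a hypothetical helper and all numbers are illustrative.
void demo_dwf_slow(void) {
  const int Ls = 8;
  GridCartesian *UGrid = SpaceTimeGrid::makeFourDimGrid(
      Coordinate({8,8,8,8}), GridDefaultSimd(Nd, vComplexD::Nsimd()), GridDefaultMpi());
  GridCartesian *FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls, UGrid);
  GridRedBlackCartesian *FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls, UGrid);

  GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(std::vector<int>{1,2,3,4});
  GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(std::vector<int>{5,6,7,8});

  LatticeGaugeFieldD Umu(UGrid);
  SU<Nc>::HotConfiguration(RNG4, Umu);               // random thin links

  DWFSlowFermionD Ddwf(Umu, *FGrid, *FrbGrid, 0.01 /*mass*/, 1.8 /*M5*/);
  LatticeFermionD src(FGrid), res(FGrid);
  gaussian(RNG5, src);
  Ddwf.M(src, res);                                  // apply the full operator
}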

@ -47,7 +47,6 @@ Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
//////////////////////////////////////////// ////////////////////////////////////////////
// Fermion operators / actions // Fermion operators / actions
//////////////////////////////////////////// ////////////////////////////////////////////
#include <Grid/qcd/action/fermion/DWFSlow.h> // Slow DWF
#include <Grid/qcd/action/fermion/WilsonFermion.h> // 4d wilson like #include <Grid/qcd/action/fermion/WilsonFermion.h> // 4d wilson like
NAMESPACE_CHECK(Wilson); NAMESPACE_CHECK(Wilson);
@ -113,31 +112,28 @@ NAMESPACE_CHECK(DWFutils);
// Cayley 5d // Cayley 5d
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
typedef WilsonFermion<WilsonImplD2> WilsonFermionD2; typedef WilsonFermion<WilsonImplR> WilsonFermionR;
typedef WilsonFermion<WilsonImplF> WilsonFermionF; typedef WilsonFermion<WilsonImplF> WilsonFermionF;
typedef WilsonFermion<WilsonImplD> WilsonFermionD; typedef WilsonFermion<WilsonImplD> WilsonFermionD;
//typedef WilsonFermion<WilsonImplRL> WilsonFermionRL;
//typedef WilsonFermion<WilsonImplFH> WilsonFermionFH;
//typedef WilsonFermion<WilsonImplDF> WilsonFermionDF;
typedef WilsonFermion<WilsonAdjImplR> WilsonAdjFermionR;
typedef WilsonFermion<WilsonAdjImplF> WilsonAdjFermionF; typedef WilsonFermion<WilsonAdjImplF> WilsonAdjFermionF;
typedef WilsonFermion<WilsonAdjImplD> WilsonAdjFermionD; typedef WilsonFermion<WilsonAdjImplD> WilsonAdjFermionD;
typedef WilsonFermion<WilsonTwoIndexSymmetricImplR> WilsonTwoIndexSymmetricFermionR;
typedef WilsonFermion<WilsonTwoIndexSymmetricImplF> WilsonTwoIndexSymmetricFermionF; typedef WilsonFermion<WilsonTwoIndexSymmetricImplF> WilsonTwoIndexSymmetricFermionF;
typedef WilsonFermion<WilsonTwoIndexSymmetricImplD> WilsonTwoIndexSymmetricFermionD; typedef WilsonFermion<WilsonTwoIndexSymmetricImplD> WilsonTwoIndexSymmetricFermionD;
typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplR> WilsonTwoIndexAntiSymmetricFermionR;
typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonTwoIndexAntiSymmetricFermionF; typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplF> WilsonTwoIndexAntiSymmetricFermionF;
typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonTwoIndexAntiSymmetricFermionD; typedef WilsonFermion<WilsonTwoIndexAntiSymmetricImplD> WilsonTwoIndexAntiSymmetricFermionD;
// Sp(2n)
typedef WilsonFermion<SpWilsonImplF> SpWilsonFermionF;
typedef WilsonFermion<SpWilsonImplD> SpWilsonFermionD;
typedef WilsonFermion<SpWilsonTwoIndexAntiSymmetricImplF> SpWilsonTwoIndexAntiSymmetricFermionF;
typedef WilsonFermion<SpWilsonTwoIndexAntiSymmetricImplD> SpWilsonTwoIndexAntiSymmetricFermionD;
typedef WilsonFermion<SpWilsonTwoIndexSymmetricImplF> SpWilsonTwoIndexSymmetricFermionF;
typedef WilsonFermion<SpWilsonTwoIndexSymmetricImplD> SpWilsonTwoIndexSymmetricFermionD;
// Twisted mass fermion // Twisted mass fermion
typedef WilsonTMFermion<WilsonImplD2> WilsonTMFermionD2; typedef WilsonTMFermion<WilsonImplR> WilsonTMFermionR;
typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF; typedef WilsonTMFermion<WilsonImplF> WilsonTMFermionF;
typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD; typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
@ -145,20 +141,23 @@ typedef WilsonTMFermion<WilsonImplD> WilsonTMFermionD;
template <typename WImpl> using WilsonClover = WilsonCloverFermion<WImpl, CloverHelpers<WImpl>>; template <typename WImpl> using WilsonClover = WilsonCloverFermion<WImpl, CloverHelpers<WImpl>>;
template <typename WImpl> using WilsonExpClover = WilsonCloverFermion<WImpl, ExpCloverHelpers<WImpl>>; template <typename WImpl> using WilsonExpClover = WilsonCloverFermion<WImpl, ExpCloverHelpers<WImpl>>;
typedef WilsonClover<WilsonImplD2> WilsonCloverFermionD2; typedef WilsonClover<WilsonImplR> WilsonCloverFermionR;
typedef WilsonClover<WilsonImplF> WilsonCloverFermionF; typedef WilsonClover<WilsonImplF> WilsonCloverFermionF;
typedef WilsonClover<WilsonImplD> WilsonCloverFermionD; typedef WilsonClover<WilsonImplD> WilsonCloverFermionD;
typedef WilsonExpClover<WilsonImplD2> WilsonExpCloverFermionD2; typedef WilsonExpClover<WilsonImplR> WilsonExpCloverFermionR;
typedef WilsonExpClover<WilsonImplF> WilsonExpCloverFermionF; typedef WilsonExpClover<WilsonImplF> WilsonExpCloverFermionF;
typedef WilsonExpClover<WilsonImplD> WilsonExpCloverFermionD; typedef WilsonExpClover<WilsonImplD> WilsonExpCloverFermionD;
typedef WilsonClover<WilsonAdjImplR> WilsonCloverAdjFermionR;
typedef WilsonClover<WilsonAdjImplF> WilsonCloverAdjFermionF; typedef WilsonClover<WilsonAdjImplF> WilsonCloverAdjFermionF;
typedef WilsonClover<WilsonAdjImplD> WilsonCloverAdjFermionD; typedef WilsonClover<WilsonAdjImplD> WilsonCloverAdjFermionD;
typedef WilsonClover<WilsonTwoIndexSymmetricImplR> WilsonCloverTwoIndexSymmetricFermionR;
typedef WilsonClover<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF; typedef WilsonClover<WilsonTwoIndexSymmetricImplF> WilsonCloverTwoIndexSymmetricFermionF;
typedef WilsonClover<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD; typedef WilsonClover<WilsonTwoIndexSymmetricImplD> WilsonCloverTwoIndexSymmetricFermionD;
typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplR> WilsonCloverTwoIndexAntiSymmetricFermionR;
typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF; typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplF> WilsonCloverTwoIndexAntiSymmetricFermionF;
typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD; typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiSymmetricFermionD;
@ -166,108 +165,161 @@ typedef WilsonClover<WilsonTwoIndexAntiSymmetricImplD> WilsonCloverTwoIndexAntiS
template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>; template <typename WImpl> using CompactWilsonClover = CompactWilsonCloverFermion<WImpl, CompactCloverHelpers<WImpl>>;
template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>; template <typename WImpl> using CompactWilsonExpClover = CompactWilsonCloverFermion<WImpl, CompactExpCloverHelpers<WImpl>>;
typedef CompactWilsonClover<WilsonImplD2> CompactWilsonCloverFermionD2; typedef CompactWilsonClover<WilsonImplR> CompactWilsonCloverFermionR;
typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF; typedef CompactWilsonClover<WilsonImplF> CompactWilsonCloverFermionF;
typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD; typedef CompactWilsonClover<WilsonImplD> CompactWilsonCloverFermionD;
typedef CompactWilsonExpClover<WilsonImplD2> CompactWilsonExpCloverFermionD2; typedef CompactWilsonExpClover<WilsonImplR> CompactWilsonExpCloverFermionR;
typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF; typedef CompactWilsonExpClover<WilsonImplF> CompactWilsonExpCloverFermionF;
typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD; typedef CompactWilsonExpClover<WilsonImplD> CompactWilsonExpCloverFermionD;
typedef CompactWilsonClover<WilsonAdjImplR> CompactWilsonCloverAdjFermionR;
typedef CompactWilsonClover<WilsonAdjImplF> CompactWilsonCloverAdjFermionF; typedef CompactWilsonClover<WilsonAdjImplF> CompactWilsonCloverAdjFermionF;
typedef CompactWilsonClover<WilsonAdjImplD> CompactWilsonCloverAdjFermionD; typedef CompactWilsonClover<WilsonAdjImplD> CompactWilsonCloverAdjFermionD;
typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplR> CompactWilsonCloverTwoIndexSymmetricFermionR;
typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplF> CompactWilsonCloverTwoIndexSymmetricFermionF; typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplF> CompactWilsonCloverTwoIndexSymmetricFermionF;
typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplD> CompactWilsonCloverTwoIndexSymmetricFermionD; typedef CompactWilsonClover<WilsonTwoIndexSymmetricImplD> CompactWilsonCloverTwoIndexSymmetricFermionD;
typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplR> CompactWilsonCloverTwoIndexAntiSymmetricFermionR;
typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplF> CompactWilsonCloverTwoIndexAntiSymmetricFermionF; typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplF> CompactWilsonCloverTwoIndexAntiSymmetricFermionF;
typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplD> CompactWilsonCloverTwoIndexAntiSymmetricFermionD; typedef CompactWilsonClover<WilsonTwoIndexAntiSymmetricImplD> CompactWilsonCloverTwoIndexAntiSymmetricFermionD;
// Domain Wall fermions // Domain Wall fermions
typedef DomainWallFermion<WilsonImplR> DomainWallFermionR;
typedef DomainWallFermion<WilsonImplF> DomainWallFermionF; typedef DomainWallFermion<WilsonImplF> DomainWallFermionF;
typedef DomainWallFermion<WilsonImplD> DomainWallFermionD; typedef DomainWallFermion<WilsonImplD> DomainWallFermionD;
typedef DomainWallFermion<WilsonImplD2> DomainWallFermionD2;
typedef DomainWallEOFAFermion<WilsonImplD2> DomainWallEOFAFermionD2; //typedef DomainWallFermion<WilsonImplRL> DomainWallFermionRL;
//typedef DomainWallFermion<WilsonImplFH> DomainWallFermionFH;
//typedef DomainWallFermion<WilsonImplDF> DomainWallFermionDF;
typedef DomainWallEOFAFermion<WilsonImplR> DomainWallEOFAFermionR;
typedef DomainWallEOFAFermion<WilsonImplF> DomainWallEOFAFermionF; typedef DomainWallEOFAFermion<WilsonImplF> DomainWallEOFAFermionF;
typedef DomainWallEOFAFermion<WilsonImplD> DomainWallEOFAFermionD; typedef DomainWallEOFAFermion<WilsonImplD> DomainWallEOFAFermionD;
typedef MobiusFermion<WilsonImplD2> MobiusFermionD2; //typedef DomainWallEOFAFermion<WilsonImplRL> DomainWallEOFAFermionRL;
//typedef DomainWallEOFAFermion<WilsonImplFH> DomainWallEOFAFermionFH;
//typedef DomainWallEOFAFermion<WilsonImplDF> DomainWallEOFAFermionDF;
typedef MobiusFermion<WilsonImplR> MobiusFermionR;
typedef MobiusFermion<WilsonImplF> MobiusFermionF; typedef MobiusFermion<WilsonImplF> MobiusFermionF;
typedef MobiusFermion<WilsonImplD> MobiusFermionD; typedef MobiusFermion<WilsonImplD> MobiusFermionD;
typedef MobiusEOFAFermion<WilsonImplD2> MobiusEOFAFermionD2; //typedef MobiusFermion<WilsonImplRL> MobiusFermionRL;
//typedef MobiusFermion<WilsonImplFH> MobiusFermionFH;
//typedef MobiusFermion<WilsonImplDF> MobiusFermionDF;
typedef MobiusEOFAFermion<WilsonImplR> MobiusEOFAFermionR;
typedef MobiusEOFAFermion<WilsonImplF> MobiusEOFAFermionF; typedef MobiusEOFAFermion<WilsonImplF> MobiusEOFAFermionF;
typedef MobiusEOFAFermion<WilsonImplD> MobiusEOFAFermionD; typedef MobiusEOFAFermion<WilsonImplD> MobiusEOFAFermionD;
typedef ZMobiusFermion<ZWilsonImplD2> ZMobiusFermionD2; //typedef MobiusEOFAFermion<WilsonImplRL> MobiusEOFAFermionRL;
//typedef MobiusEOFAFermion<WilsonImplFH> MobiusEOFAFermionFH;
//typedef MobiusEOFAFermion<WilsonImplDF> MobiusEOFAFermionDF;
typedef ZMobiusFermion<ZWilsonImplR> ZMobiusFermionR;
typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF; typedef ZMobiusFermion<ZWilsonImplF> ZMobiusFermionF;
typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD; typedef ZMobiusFermion<ZWilsonImplD> ZMobiusFermionD;
typedef ScaledShamirFermion<WilsonImplD2> ScaledShamirFermionD2; //typedef ZMobiusFermion<ZWilsonImplRL> ZMobiusFermionRL;
//typedef ZMobiusFermion<ZWilsonImplFH> ZMobiusFermionFH;
//typedef ZMobiusFermion<ZWilsonImplDF> ZMobiusFermionDF;
// Ls vectorised
typedef ScaledShamirFermion<WilsonImplR> ScaledShamirFermionR;
typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF; typedef ScaledShamirFermion<WilsonImplF> ScaledShamirFermionF;
typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD; typedef ScaledShamirFermion<WilsonImplD> ScaledShamirFermionD;
typedef MobiusZolotarevFermion<WilsonImplD2> MobiusZolotarevFermionD2; typedef MobiusZolotarevFermion<WilsonImplR> MobiusZolotarevFermionR;
typedef MobiusZolotarevFermion<WilsonImplF> MobiusZolotarevFermionF; typedef MobiusZolotarevFermion<WilsonImplF> MobiusZolotarevFermionF;
typedef MobiusZolotarevFermion<WilsonImplD> MobiusZolotarevFermionD; typedef MobiusZolotarevFermion<WilsonImplD> MobiusZolotarevFermionD;
typedef ShamirZolotarevFermion<WilsonImplD2> ShamirZolotarevFermionD2; typedef ShamirZolotarevFermion<WilsonImplR> ShamirZolotarevFermionR;
typedef ShamirZolotarevFermion<WilsonImplF> ShamirZolotarevFermionF; typedef ShamirZolotarevFermion<WilsonImplF> ShamirZolotarevFermionF;
typedef ShamirZolotarevFermion<WilsonImplD> ShamirZolotarevFermionD; typedef ShamirZolotarevFermion<WilsonImplD> ShamirZolotarevFermionD;
typedef OverlapWilsonCayleyTanhFermion<WilsonImplD2> OverlapWilsonCayleyTanhFermionD2; typedef OverlapWilsonCayleyTanhFermion<WilsonImplR> OverlapWilsonCayleyTanhFermionR;
typedef OverlapWilsonCayleyTanhFermion<WilsonImplF> OverlapWilsonCayleyTanhFermionF; typedef OverlapWilsonCayleyTanhFermion<WilsonImplF> OverlapWilsonCayleyTanhFermionF;
typedef OverlapWilsonCayleyTanhFermion<WilsonImplD> OverlapWilsonCayleyTanhFermionD; typedef OverlapWilsonCayleyTanhFermion<WilsonImplD> OverlapWilsonCayleyTanhFermionD;
typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplD2> OverlapWilsonCayleyZolotarevFermionD2; typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplR> OverlapWilsonCayleyZolotarevFermionR;
typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplF> OverlapWilsonCayleyZolotarevFermionF; typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplF> OverlapWilsonCayleyZolotarevFermionF;
typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplD> OverlapWilsonCayleyZolotarevFermionD; typedef OverlapWilsonCayleyZolotarevFermion<WilsonImplD> OverlapWilsonCayleyZolotarevFermionD;
// Continued fraction // Continued fraction
typedef OverlapWilsonContFracTanhFermion<WilsonImplD2> OverlapWilsonContFracTanhFermionD2; typedef OverlapWilsonContFracTanhFermion<WilsonImplR> OverlapWilsonContFracTanhFermionR;
typedef OverlapWilsonContFracTanhFermion<WilsonImplF> OverlapWilsonContFracTanhFermionF; typedef OverlapWilsonContFracTanhFermion<WilsonImplF> OverlapWilsonContFracTanhFermionF;
typedef OverlapWilsonContFracTanhFermion<WilsonImplD> OverlapWilsonContFracTanhFermionD; typedef OverlapWilsonContFracTanhFermion<WilsonImplD> OverlapWilsonContFracTanhFermionD;
typedef OverlapWilsonContFracZolotarevFermion<WilsonImplD2> OverlapWilsonContFracZolotarevFermionD2; typedef OverlapWilsonContFracZolotarevFermion<WilsonImplR> OverlapWilsonContFracZolotarevFermionR;
typedef OverlapWilsonContFracZolotarevFermion<WilsonImplF> OverlapWilsonContFracZolotarevFermionF; typedef OverlapWilsonContFracZolotarevFermion<WilsonImplF> OverlapWilsonContFracZolotarevFermionF;
typedef OverlapWilsonContFracZolotarevFermion<WilsonImplD> OverlapWilsonContFracZolotarevFermionD; typedef OverlapWilsonContFracZolotarevFermion<WilsonImplD> OverlapWilsonContFracZolotarevFermionD;
// Partial fraction // Partial fraction
typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplD2> OverlapWilsonPartialFractionTanhFermionD2; typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplR> OverlapWilsonPartialFractionTanhFermionR;
typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplF> OverlapWilsonPartialFractionTanhFermionF; typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplF> OverlapWilsonPartialFractionTanhFermionF;
typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplD> OverlapWilsonPartialFractionTanhFermionD; typedef OverlapWilsonPartialFractionTanhFermion<WilsonImplD> OverlapWilsonPartialFractionTanhFermionD;
typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplD2> OverlapWilsonPartialFractionZolotarevFermionD2; typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplR> OverlapWilsonPartialFractionZolotarevFermionR;
typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplF> OverlapWilsonPartialFractionZolotarevFermionF; typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplF> OverlapWilsonPartialFractionZolotarevFermionF;
typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplD> OverlapWilsonPartialFractionZolotarevFermionD; typedef OverlapWilsonPartialFractionZolotarevFermion<WilsonImplD> OverlapWilsonPartialFractionZolotarevFermionD;
// Gparity cases; partial list until tested // Gparity cases; partial list until tested
typedef WilsonFermion<GparityWilsonImplR> GparityWilsonFermionR;
typedef WilsonFermion<GparityWilsonImplF> GparityWilsonFermionF; typedef WilsonFermion<GparityWilsonImplF> GparityWilsonFermionF;
typedef WilsonFermion<GparityWilsonImplD> GparityWilsonFermionD; typedef WilsonFermion<GparityWilsonImplD> GparityWilsonFermionD;
//typedef WilsonFermion<GparityWilsonImplRL> GparityWilsonFermionRL;
//typedef WilsonFermion<GparityWilsonImplFH> GparityWilsonFermionFH;
//typedef WilsonFermion<GparityWilsonImplDF> GparityWilsonFermionDF;
typedef DomainWallFermion<GparityWilsonImplR> GparityDomainWallFermionR;
typedef DomainWallFermion<GparityWilsonImplF> GparityDomainWallFermionF; typedef DomainWallFermion<GparityWilsonImplF> GparityDomainWallFermionF;
typedef DomainWallFermion<GparityWilsonImplD> GparityDomainWallFermionD; typedef DomainWallFermion<GparityWilsonImplD> GparityDomainWallFermionD;
typedef DomainWallEOFAFermion<GparityWilsonImplR> GparityDomainWallEOFAFermionD2; //typedef DomainWallFermion<GparityWilsonImplRL> GparityDomainWallFermionRL;
//typedef DomainWallFermion<GparityWilsonImplFH> GparityDomainWallFermionFH;
//typedef DomainWallFermion<GparityWilsonImplDF> GparityDomainWallFermionDF;
typedef DomainWallEOFAFermion<GparityWilsonImplR> GparityDomainWallEOFAFermionR;
typedef DomainWallEOFAFermion<GparityWilsonImplF> GparityDomainWallEOFAFermionF; typedef DomainWallEOFAFermion<GparityWilsonImplF> GparityDomainWallEOFAFermionF;
typedef DomainWallEOFAFermion<GparityWilsonImplD> GparityDomainWallEOFAFermionD; typedef DomainWallEOFAFermion<GparityWilsonImplD> GparityDomainWallEOFAFermionD;
typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionD2; //typedef DomainWallEOFAFermion<GparityWilsonImplRL> GparityDomainWallEOFAFermionRL;
//typedef DomainWallEOFAFermion<GparityWilsonImplFH> GparityDomainWallEOFAFermionFH;
//typedef DomainWallEOFAFermion<GparityWilsonImplDF> GparityDomainWallEOFAFermionDF;
typedef WilsonTMFermion<GparityWilsonImplR> GparityWilsonTMFermionR;
typedef WilsonTMFermion<GparityWilsonImplF> GparityWilsonTMFermionF; typedef WilsonTMFermion<GparityWilsonImplF> GparityWilsonTMFermionF;
typedef WilsonTMFermion<GparityWilsonImplD> GparityWilsonTMFermionD; typedef WilsonTMFermion<GparityWilsonImplD> GparityWilsonTMFermionD;
typedef MobiusFermion<GparityWilsonImplR> GparityMobiusFermionD2; //typedef WilsonTMFermion<GparityWilsonImplRL> GparityWilsonTMFermionRL;
//typedef WilsonTMFermion<GparityWilsonImplFH> GparityWilsonTMFermionFH;
//typedef WilsonTMFermion<GparityWilsonImplDF> GparityWilsonTMFermionDF;
typedef MobiusFermion<GparityWilsonImplR> GparityMobiusFermionR;
typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF; typedef MobiusFermion<GparityWilsonImplF> GparityMobiusFermionF;
typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD; typedef MobiusFermion<GparityWilsonImplD> GparityMobiusFermionD;
typedef MobiusEOFAFermion<GparityWilsonImplR> GparityMobiusEOFAFermionD2; //typedef MobiusFermion<GparityWilsonImplRL> GparityMobiusFermionRL;
//typedef MobiusFermion<GparityWilsonImplFH> GparityMobiusFermionFH;
//typedef MobiusFermion<GparityWilsonImplDF> GparityMobiusFermionDF;
typedef MobiusEOFAFermion<GparityWilsonImplR> GparityMobiusEOFAFermionR;
typedef MobiusEOFAFermion<GparityWilsonImplF> GparityMobiusEOFAFermionF; typedef MobiusEOFAFermion<GparityWilsonImplF> GparityMobiusEOFAFermionF;
typedef MobiusEOFAFermion<GparityWilsonImplD> GparityMobiusEOFAFermionD; typedef MobiusEOFAFermion<GparityWilsonImplD> GparityMobiusEOFAFermionD;
//typedef MobiusEOFAFermion<GparityWilsonImplRL> GparityMobiusEOFAFermionRL;
//typedef MobiusEOFAFermion<GparityWilsonImplFH> GparityMobiusEOFAFermionFH;
//typedef MobiusEOFAFermion<GparityWilsonImplDF> GparityMobiusEOFAFermionDF;
typedef ImprovedStaggeredFermion<StaggeredImplR> ImprovedStaggeredFermionR;
typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF; typedef ImprovedStaggeredFermion<StaggeredImplF> ImprovedStaggeredFermionF;
typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD; typedef ImprovedStaggeredFermion<StaggeredImplD> ImprovedStaggeredFermionD;
typedef NaiveStaggeredFermion<StaggeredImplR> NaiveStaggeredFermionR;
typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF; typedef NaiveStaggeredFermion<StaggeredImplF> NaiveStaggeredFermionF;
typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD; typedef NaiveStaggeredFermion<StaggeredImplD> NaiveStaggeredFermionD;
typedef ImprovedStaggeredFermion5D<StaggeredImplR> ImprovedStaggeredFermion5DR;
typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF; typedef ImprovedStaggeredFermion5D<StaggeredImplF> ImprovedStaggeredFermion5DF;
typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD; typedef ImprovedStaggeredFermion5D<StaggeredImplD> ImprovedStaggeredFermion5DD;

@ -49,8 +49,6 @@ public:
virtual FermionField &tmp(void) = 0; virtual FermionField &tmp(void) = 0;
virtual void DirichletBlock(const Coordinate & _Block) { assert(0); };
GridBase * Grid(void) { return FermionGrid(); }; // this is all the linalg routines need to know GridBase * Grid(void) { return FermionGrid(); }; // this is all the linalg routines need to know
GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); }; GridBase * RedBlackGrid(void) { return FermionRedBlackGrid(); };

@ -30,18 +30,6 @@ directory
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
/*
Policy implementation for G-parity boundary conditions
Rather than treating the gauge field as a flavored field, the Grid implementation of G-parity treats the gauge field as a regular
field with complex conjugate boundary conditions. In order to ensure the second flavor interacts with the conjugate links and the first
with the regular links we overload the functionality of doubleStore, whose purpose is to store the gauge field and the barrel-shifted gauge field
to avoid communicating links when applying the Dirac operator, such that the double-stored field contains also a flavor index which maps to
either the link or the conjugate link. This flavored field is then used by multLink to apply the correct link to a spinor.
Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction.
mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
*/
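/* In the usual conventions (stated for orientation; assumed, not taken from
   this source), crossing a twisted boundary of extent L_mu acts as
       psi(x + L_mu) = C psibar^T(x),   U_mu(x + L_mu) = U_mu^*(x),
   which is why the second flavour of the doubled field must be multiplied
   by conjugated links. */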
template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal> template <class S, class Representation = FundamentalRepresentation, class Options=CoeffReal>
class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > { class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Representation::Dimension> > {
public: public:
@ -125,7 +113,7 @@ public:
|| ((distance== 1)&&(icoor[direction]==1)) || ((distance== 1)&&(icoor[direction]==1))
|| ((distance==-1)&&(icoor[direction]==0)); || ((distance==-1)&&(icoor[direction]==0));
permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu] && mmu < Nd-1; //only if we are going around the world in a spatial direction permute_lane = permute_lane && SE->_around_the_world && St.parameters.twists[mmu]; //only if we are going around the world
//Apply the links //Apply the links
int f_upper = permute_lane ? 1 : 0; int f_upper = permute_lane ? 1 : 0;
@ -151,10 +139,10 @@ public:
assert((distance == 1) || (distance == -1)); // nearest neighbour stencil hard code assert((distance == 1) || (distance == -1)); // nearest neighbour stencil hard code
assert((sl == 1) || (sl == 2)); assert((sl == 1) || (sl == 2));
//If this site is a global boundary site, perform the G-parity flavor twist if ( SE->_around_the_world && St.parameters.twists[mmu] ) {
if ( mmu < Nd-1 && SE->_around_the_world && St.parameters.twists[mmu] ) {
if ( sl == 2 ) { if ( sl == 2 ) {
//Only do the twist for lanes on the edge of the physical node
ExtractBuffer<sobj> vals(Nsimd); ExtractBuffer<sobj> vals(Nsimd);
extract(chi,vals); extract(chi,vals);
@ -209,19 +197,6 @@ public:
reg = memory; reg = memory;
} }
//Poke 'poke_f0' onto flavor 0 and 'poke_f1' onto flavor 1 in direction mu of the doubled gauge field Uds
inline void pokeGparityDoubledGaugeField(DoubledGaugeField &Uds, const GaugeLinkField &poke_f0, const GaugeLinkField &poke_f1, const int mu){
autoView(poke_f0_v, poke_f0, CpuRead);
autoView(poke_f1_v, poke_f1, CpuRead);
autoView(Uds_v, Uds, CpuWrite);
thread_foreach(ss,poke_f0_v,{
Uds_v[ss](0)(mu) = poke_f0_v[ss]();
Uds_v[ss](1)(mu) = poke_f1_v[ss]();
});
}
inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu)
{ {
conformable(Uds.Grid(),GaugeGrid); conformable(Uds.Grid(),GaugeGrid);
@ -232,19 +207,14 @@ public:
GaugeLinkField Uconj(GaugeGrid); GaugeLinkField Uconj(GaugeGrid);
Lattice<iScalar<vInteger> > coor(GaugeGrid); Lattice<iScalar<vInteger> > coor(GaugeGrid);
//Here the first Nd-1 directions are treated as "spatial", and a twist value of 1 indicates G-parity BCs in that direction. for(int mu=0;mu<Nd;mu++){
//mu=Nd-1 is assumed to be the time direction and a twist value of 1 indicates antiperiodic BCs
for(int mu=0;mu<Nd-1;mu++){ LatticeCoordinate(coor,mu);
if( Params.twists[mu] ){
LatticeCoordinate(coor,mu);
}
U = PeekIndex<LorentzIndex>(Umu,mu); U = PeekIndex<LorentzIndex>(Umu,mu);
Uconj = conjugate(U); Uconj = conjugate(U);
// Implement the isospin rotation sign on the boundary between f=1 and f=0
// This phase could come from a simple bc 1,1,-1,1 .. // This phase could come from a simple bc 1,1,-1,1 ..
int neglink = GaugeGrid->GlobalDimensions()[mu]-1; int neglink = GaugeGrid->GlobalDimensions()[mu]-1;
if ( Params.twists[mu] ) { if ( Params.twists[mu] ) {
@ -259,7 +229,7 @@ public:
thread_foreach(ss,U_v,{ thread_foreach(ss,U_v,{
Uds_v[ss](0)(mu) = U_v[ss](); Uds_v[ss](0)(mu) = U_v[ss]();
Uds_v[ss](1)(mu) = Uconj_v[ss](); Uds_v[ss](1)(mu) = Uconj_v[ss]();
}); });
} }
U = adj(Cshift(U ,mu,-1)); // correct except for spanning the boundary U = adj(Cshift(U ,mu,-1)); // correct except for spanning the boundary
@ -290,38 +260,6 @@ public:
}); });
} }
} }
{ //periodic / antiperiodic temporal BCs
int mu = Nd-1;
int L = GaugeGrid->GlobalDimensions()[mu];
int Lmu = L - 1;
LatticeCoordinate(coor, mu);
U = PeekIndex<LorentzIndex>(Umu, mu); //Get t-directed links
GaugeLinkField *Upoke = &U;
if(Params.twists[mu]){ //antiperiodic
Utmp = where(coor == Lmu, -U, U);
Upoke = &Utmp;
}
Uconj = conjugate(*Upoke); //second flavor interacts with conjugate links
pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu);
//Get the barrel-shifted field
Utmp = adj(Cshift(U, mu, -1)); //is a forward shift!
Upoke = &Utmp;
if(Params.twists[mu]){
U = where(coor == 0, -Utmp, Utmp); //boundary phase
Upoke = &U;
}
Uconj = conjugate(*Upoke);
pokeGparityDoubledGaugeField(Uds, *Upoke, Uconj, mu + 4);
}
} }
inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) { inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) {
@ -360,48 +298,28 @@ public:
inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){ inline void extractLinkField(std::vector<GaugeLinkField> &mat, DoubledGaugeField &Uds){
assert(0); assert(0);
} }
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
int Ls=Btilde.Grid()->_fdimensions[0];
{
GridBase *GaugeGrid = mat.Grid();
Lattice<iScalar<vInteger> > coor(GaugeGrid);
if( Params.twists[mu] ){
LatticeCoordinate(coor,mu);
}
autoView( mat_v , mat, AcceleratorWrite);
autoView( Btilde_v , Btilde, AcceleratorRead);
autoView( Atilde_v , Atilde, AcceleratorRead);
accelerator_for(sss,mat.Grid()->oSites(), FermionField::vector_type::Nsimd(),{
int sU=sss;
typedef decltype(coalescedRead(mat_v[sU](mu)() )) ColorMatrixType;
ColorMatrixType sum;
zeroit(sum);
for(int s=0;s<Ls;s++){
int sF = s+Ls*sU;
for(int spn=0;spn<Ns;spn++){ //sum over spin
//Flavor 0
auto bb = coalescedRead(Btilde_v[sF](0)(spn) ); //color vector
auto aa = coalescedRead(Atilde_v[sF](0)(spn) );
sum = sum + outerProduct(bb,aa);
//Flavor 1
bb = coalescedRead(Btilde_v[sF](1)(spn) );
aa = coalescedRead(Atilde_v[sF](1)(spn) );
sum = sum + conjugate(outerProduct(bb,aa));
}
}
coalescedWrite(mat_v[sU](mu)(), sum);
});
}
}
inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField &Atilde, int mu) {
int Ls = Btilde.Grid()->_fdimensions[0];
GaugeLinkField tmp(mat.Grid());
tmp = Zero();
{
autoView( tmp_v , tmp, CpuWrite);
autoView( Atilde_v , Atilde, CpuRead);
autoView( Btilde_v , Btilde, CpuRead);
thread_for(ss,tmp.Grid()->oSites(),{
for (int s = 0; s < Ls; s++) {
int sF = s + Ls * ss;
auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
}
});
}
PokeIndex<LorentzIndex>(mat, tmp, mu);
return;
}
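  // In both variants above the per-link force accumulates, schematically,
  //   sum_{s,spin} [ outer(Btilde_f0, Atilde_f0) + conj( outer(Btilde_f1, Atilde_f1) ) ]
  // i.e. the second flavor contributes the complex conjugate, consistent
  // with it propagating on the conjugated links.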
}; };

View File

@ -47,6 +47,18 @@ public:
FermionField _tmp; FermionField _tmp;
FermionField &tmp(void) { return _tmp; } FermionField &tmp(void) { return _tmp; }
////////////////////////////////////////
// Performance monitoring
////////////////////////////////////////
void Report(void);
void ZeroCounters(void);
double DhopTotalTime;
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Implement the abstract base // Implement the abstract base
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////

View File

@ -52,6 +52,18 @@ public:
FermionField _tmp; FermionField _tmp;
FermionField &tmp(void) { return _tmp; } FermionField &tmp(void) { return _tmp; }
////////////////////////////////////////
// Performance monitoring
////////////////////////////////////////
void Report(void);
void ZeroCounters(void);
double DhopTotalTime;
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Implement the abstract base // Implement the abstract base
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////

View File

@ -47,6 +47,18 @@ public:
FermionField _tmp; FermionField _tmp;
FermionField &tmp(void) { return _tmp; } FermionField &tmp(void) { return _tmp; }
////////////////////////////////////////
// Performance monitoring
////////////////////////////////////////
void Report(void);
void ZeroCounters(void);
double DhopTotalTime;
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Implement the abstract base // Implement the abstract base
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////

View File

@ -32,218 +32,17 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid); NAMESPACE_BEGIN(Grid);
///////////////////////////////////////////////////////////////
// Wilson compressor will need FaceGather policies for:
// Periodic, Dirichlet, and partial Dirichlet for DWF
///////////////////////////////////////////////////////////////
const int dwf_compressor_depth=2;
#define DWF_COMPRESS
class FaceGatherPartialDWF
{
public:
#ifdef DWF_COMPRESS
static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
#else
static int PartialCompressionFactor(GridBase *grid) { return 1;}
#endif
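  // Worked example of the factor above, assuming Ls=16 and the default
  // depth of 2: only the outer 2*depth = 4 s-slices of each face are
  // communicated, so PartialCompressionFactor = Ls/(2*depth) = 16/4 = 4,
  // i.e. the off-node comms buffer shrinks by a factor of 4.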
template<class vobj,class cobj,class compressor>
static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs,
cobj *buffer,
compressor &compress,
int off,int so,int partial)
{
//DWF-only hack: if a direction is off-node we use partial Dirichlet
// Shrinks local and remote comms buffers
GridBase *Grid = rhs.Grid();
int Ls = Grid->_rdimensions[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth=Ls/2;
#endif
std::pair<int,int> *table_v = & table[0];
auto rhs_v = rhs.View(AcceleratorRead);
int vol=table.size()/Ls;
accelerator_forNB( idx,table.size(), vobj::Nsimd(), {
Integer i=idx/Ls;
Integer s=idx%Ls;
Integer sc=depth+s-(Ls-depth);
if(s<depth) compress.Compress(buffer[off+i+s*vol],rhs_v[so+table_v[idx].second]);
if(s>=Ls-depth) compress.Compress(buffer[off+i+sc*vol],rhs_v[so+table_v[idx].second]);
});
rhs_v.ViewClose();
}
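  // Index-mapping sketch for the gather above, assuming Ls=8, depth=2:
  // only s in {0,1,6,7} is gathered; s<depth keeps its slot while the
  // upper boundary is repacked via sc = depth + s - (Ls-depth), giving
  //   s=0 -> slot 0, s=1 -> slot 1, s=6 -> slot 2, s=7 -> slot 3.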
template<class decompressor,class Decompression>
static void DecompressFace(decompressor decompress,Decompression &dd)
{
auto Ls = dd.dims[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth=Ls/2;
#endif
// Just pass in the Grid
auto kp = dd.kernel_p;
auto mp = dd.mpi_p;
int size= dd.buffer_size;
int vol= size/Ls;
accelerator_forNB(o,size,1,{
int idx=o/Ls;
int s=o%Ls;
if ( s < depth ) {
int oo=s*vol+idx;
kp[o]=mp[oo];
} else if ( s >= Ls-depth ) {
int sc = depth + s - (Ls-depth);
int oo=sc*vol+idx;
kp[o]=mp[oo];
} else {
kp[o] = Zero();//fill rest with zero if partial dirichlet
}
});
}
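  // Receive-side counterpart of the mapping above (Ls=8, depth=2): slots
  // 0..1 of the compressed buffer fill s=0,1, slots 2..3 fill s=6,7, and
  // the interior slices s=2..5 are zero-filled -- this zero padding is
  // what makes the boundary partially Dirichlet.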
////////////////////////////////////////////////////////////////////////////////////////////
// Need to gather *interior portions* for ALL s-slices in SIMD directions.
// The gather is done here because SIMD lanes must be treated differently, with zeroes inserted on the receive side.
// Reorder the fifth dim to be s=Ls-1, s=0, s=1, ..., Ls-2.
////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial)
{
GridBase *Grid = rhs.Grid();
int Ls = Grid->_rdimensions[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth = Ls/2;
#endif
// insertion of zeroes...
assert( (table.size()&0x1)==0);
int num=table.size()/2;
int so = plane*rhs.Grid()->_ostride[dimension]; // base offset for start of plane
auto rhs_v = rhs.View(AcceleratorRead);
auto p0=&pointers[0][0];
auto p1=&pointers[1][0];
auto tp=&table[0];
int nnum=num/Ls;
accelerator_forNB(j, num, vobj::Nsimd(), {
// Reorders both local and remote comms buffers
//
int s = j % Ls;
int sp1 = (s+depth)%Ls; // peri incremented s slice
int hxyz= j/Ls;
int xyz0= hxyz*2; // xyzt part of coor
int xyz1= hxyz*2+1;
int jj= hxyz + sp1*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
int kk0= xyz0*Ls + s ; // s=0 goes to s=1
int kk1= xyz1*Ls + s ; // s=Ls-1 -> s=0
compress.CompressExchange(p0[jj],p1[jj],
rhs_v[so+tp[kk0 ].second], // Same s, consecutive xyz sites
rhs_v[so+tp[kk1 ].second],
type);
});
rhs_v.ViewClose();
}
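  // Reordering sketch for the exchange gather above, assuming Ls=8, depth=2:
  // slice s is written to slot sp1 = (s+depth)%Ls, so the stored order is
  //   s=6, s=7, s=0, s=1, ..., s=5
  // with the upper-boundary slices first, matching the layout expected by
  // MergeFace below.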
// Merge routine is for SIMD faces
template<class decompressor,class Merger>
static void MergeFace(decompressor decompress,Merger &mm)
{
auto Ls = mm.dims[0];
#ifdef DWF_COMPRESS
int depth=dwf_compressor_depth;
#else
int depth = Ls/2;
#endif
int num= mm.buffer_size/2; // relate vol and Ls to buffer size
auto mp = &mm.mpointer[0];
auto vp0= &mm.vpointers[0][0]; // First arg is exchange first
auto vp1= &mm.vpointers[1][0];
auto type= mm.type;
int nnum = num/Ls;
accelerator_forNB(o,num,Merger::Nsimd,{
int s=o%Ls;
int hxyz=o/Ls; // xyzt related component
int xyz0=hxyz*2;
int xyz1=hxyz*2+1;
int sp = (s+depth)%Ls;
int jj= hxyz + sp*nnum ; // 0,1,2,3 -> Ls-1 slice , 0-slice, 1-slice ....
int oo0= s+xyz0*Ls;
int oo1= s+xyz1*Ls;
// same ss0, ss1 pair goes to new layout
decompress.Exchange(mp[oo0],mp[oo1],vp0[jj],vp1[jj],type);
});
}
};
class FaceGatherDWFMixedBCs
{
public:
#ifdef DWF_COMPRESS
static int PartialCompressionFactor(GridBase *grid) {return grid->_fdimensions[0]/(2*dwf_compressor_depth);};
#else
static int PartialCompressionFactor(GridBase *grid) {return 1;}
#endif
template<class vobj,class cobj,class compressor>
static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs,
cobj *buffer,
compressor &compress,
int off,int so,int partial)
{
// std::cout << " face gather simple DWF partial "<<partial <<std::endl;
if(partial) FaceGatherPartialDWF::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
else FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
}
template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial)
{
// std::cout << " face gather exch DWF partial "<<partial <<std::endl;
if(partial) FaceGatherPartialDWF::Gather_plane_exchange(table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
else FaceGatherSimple::Gather_plane_exchange (table,rhs,pointers,dimension, plane,cbmask,compress,type,partial);
}
template<class decompressor,class Merger>
static void MergeFace(decompressor decompress,Merger &mm)
{
int partial = mm.partial;
// std::cout << " merge DWF partial "<<partial <<std::endl;
if ( partial ) FaceGatherPartialDWF::MergeFace(decompress,mm);
else FaceGatherSimple::MergeFace(decompress,mm);
}
template<class decompressor,class Decompression>
static void DecompressFace(decompressor decompress,Decompression &dd)
{
int partial = dd.partial;
// std::cout << " decompress DWF partial "<<partial <<std::endl;
if ( partial ) FaceGatherPartialDWF::DecompressFace(decompress,dd);
else FaceGatherSimple::DecompressFace(decompress,dd);
}
};
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
// optimised versions supporting half precision too??? Deprecate // optimised versions supporting half precision too
///////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////
template<class _HCspinor,class _Hspinor,class _Spinor, class projector,typename SFINAE = void >
class WilsonCompressorTemplate;
//Could make FaceGather a template param, but then behaviour is runtime not compile time
template<class _HCspinor,class _Hspinor,class _Spinor, class projector> template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
class WilsonCompressorTemplate : public FaceGatherDWFMixedBCs class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
// : public FaceGatherSimple typename std::enable_if<std::is_same<_HCspinor,_Hspinor>::value>::type >
{ {
public: public:
@ -280,81 +79,172 @@ public:
/*****************************************************/ /*****************************************************/
/* Exchange includes precision change if mpi data is not same */ /* Exchange includes precision change if mpi data is not same */
/*****************************************************/ /*****************************************************/
accelerator_inline void Exchange(SiteHalfSpinor &mp0, accelerator_inline void Exchange(SiteHalfSpinor *mp,
SiteHalfSpinor &mp1, const SiteHalfSpinor * __restrict__ vp0,
const SiteHalfSpinor & vp0, const SiteHalfSpinor * __restrict__ vp1,
const SiteHalfSpinor & vp1, Integer type,Integer o) const {
Integer type) const {
#ifdef GRID_SIMT #ifdef GRID_SIMT
exchangeSIMT(mp0,mp1,vp0,vp1,type); exchangeSIMT(mp[2*o],mp[2*o+1],vp0[o],vp1[o],type);
#else #else
SiteHalfSpinor tmp1; SiteHalfSpinor tmp1;
SiteHalfSpinor tmp2; SiteHalfSpinor tmp2;
exchange(tmp1,tmp2,vp0,vp1,type); exchange(tmp1,tmp2,vp0[o],vp1[o],type);
vstream(mp0,tmp1); vstream(mp[2*o ],tmp1);
vstream(mp1,tmp2); vstream(mp[2*o+1],tmp2);
#endif #endif
} }
/*****************************************************/ /*****************************************************/
/* Have a decompression step if mpi data is not same */ /* Have a decompression step if mpi data is not same */
/*****************************************************/ /*****************************************************/
accelerator_inline void Decompress(SiteHalfSpinor &out, accelerator_inline void Decompress(SiteHalfSpinor * __restrict__ out,
SiteHalfSpinor &in) const { SiteHalfSpinor * __restrict__ in, Integer o) const {
out = in; assert(0);
} }
/*****************************************************/ /*****************************************************/
/* Compress Exchange */ /* Compress Exchange */
/*****************************************************/ /*****************************************************/
accelerator_inline void CompressExchange(SiteHalfSpinor &out0, accelerator_inline void CompressExchange(SiteHalfSpinor * __restrict__ out0,
SiteHalfSpinor &out1, SiteHalfSpinor * __restrict__ out1,
const SiteSpinor &in0, const SiteSpinor * __restrict__ in,
const SiteSpinor &in1, Integer j,Integer k, Integer m,Integer type) const
Integer type) const
{ {
#ifdef GRID_SIMT #ifdef GRID_SIMT
typedef SiteSpinor vobj; typedef SiteSpinor vobj;
typedef SiteHalfSpinor hvobj; typedef SiteHalfSpinor hvobj;
typedef decltype(coalescedRead(in0)) sobj; typedef decltype(coalescedRead(*in)) sobj;
typedef decltype(coalescedRead(out0)) hsobj; typedef decltype(coalescedRead(*out0)) hsobj;
constexpr unsigned int Nsimd = vobj::Nsimd(); constexpr unsigned int Nsimd = vobj::Nsimd();
unsigned int mask = Nsimd >> (type + 1); unsigned int mask = Nsimd >> (type + 1);
int lane = acceleratorSIMTlane(Nsimd); int lane = acceleratorSIMTlane(Nsimd);
int j0 = lane &(~mask); // inner coor zero int j0 = lane &(~mask); // inner coor zero
int j1 = lane |(mask) ; // inner coor one int j1 = lane |(mask) ; // inner coor one
const vobj *vp0 = &in0; const vobj *vp0 = &in[k]; // out0[j] = merge low bit of type from in[k] and in[m]
const vobj *vp1 = &in1; const vobj *vp1 = &in[m]; // out1[j] = merge hi bit of type from in[k] and in[m]
const vobj *vp = (lane&mask) ? vp1:vp0; const vobj *vp = (lane&mask) ? vp1:vp0;// if my lane has high bit take vp1, low bit take vp0
auto sa = coalescedRead(*vp,j0); auto sa = coalescedRead(*vp,j0); // lane to read for out 0, NB 50% read coalescing
auto sb = coalescedRead(*vp,j1); auto sb = coalescedRead(*vp,j1); // lane to read for out 1
hsobj psa, psb; hsobj psa, psb;
projector::Proj(psa,sa,mu,dag); projector::Proj(psa,sa,mu,dag); // spin project the result0
projector::Proj(psb,sb,mu,dag); projector::Proj(psb,sb,mu,dag); // spin project the result1
coalescedWrite(out0,psa); coalescedWrite(out0[j],psa);
coalescedWrite(out1,psb); coalescedWrite(out1[j],psb);
#else #else
SiteHalfSpinor temp1, temp2; SiteHalfSpinor temp1, temp2;
SiteHalfSpinor temp3, temp4; SiteHalfSpinor temp3, temp4;
projector::Proj(temp1,in0,mu,dag); projector::Proj(temp1,in[k],mu,dag);
projector::Proj(temp2,in1,mu,dag); projector::Proj(temp2,in[m],mu,dag);
exchange(temp3,temp4,temp1,temp2,type); exchange(temp3,temp4,temp1,temp2,type);
vstream(out0,temp3); vstream(out0[j],temp3);
vstream(out1,temp4); vstream(out1[j],temp4);
#endif #endif
} }
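  /* Lane-selection sketch for the SIMT branch above, assuming Nsimd=8 and
     type=0: mask = 8>>1 = 4, so lane 3 computes j0 = 3 & ~4 = 3 and
     j1 = 3 | 4 = 7; each lane therefore pairs with the partner lane four
     apart, the exchange granularity of the coarsest SIMD direction. */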
/*****************************************************/ /*****************************************************/
/* Pass the info to the stencil */ /* Pass the info to the stencil */
/*****************************************************/ /*****************************************************/
accelerator_inline bool DecompressionStep(void) const { accelerator_inline bool DecompressionStep(void) const { return false; }
return false;
}
}; };
#if 0
template<class _HCspinor,class _Hspinor,class _Spinor, class projector>
class WilsonCompressorTemplate< _HCspinor, _Hspinor, _Spinor, projector,
typename std::enable_if<!std::is_same<_HCspinor,_Hspinor>::value>::type >
{
public:
int mu,dag;
void Point(int p) { mu=p; };
WilsonCompressorTemplate(int _dag=0){
dag = _dag;
}
typedef _Spinor SiteSpinor;
typedef _Hspinor SiteHalfSpinor;
typedef _HCspinor SiteHalfCommSpinor;
typedef typename SiteHalfCommSpinor::vector_type vComplexLow;
typedef typename SiteHalfSpinor::vector_type vComplexHigh;
constexpr static int Nw=sizeof(SiteHalfSpinor)/sizeof(vComplexHigh);
accelerator_inline int CommDatumSize(void) const {
return sizeof(SiteHalfCommSpinor);
}
/*****************************************************/
/* Compress includes precision change if mpi data is not same */
/*****************************************************/
accelerator_inline void Compress(SiteHalfSpinor &buf,const SiteSpinor &in) const {
SiteHalfSpinor hsp;
SiteHalfCommSpinor *hbuf = (SiteHalfCommSpinor *)buf;
projector::Proj(hsp,in,mu,dag);
precisionChange((vComplexLow *)&hbuf[o],(vComplexHigh *)&hsp,Nw);
}
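  // NB: the Compress overload above is stale -- it references an undefined
  // 'o' and duplicates the signature below -- but the whole template is
  // compiled out by the enclosing #if 0.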
accelerator_inline void Compress(SiteHalfSpinor &buf,const SiteSpinor &in) const {
#ifdef GRID_SIMT
typedef decltype(coalescedRead(buf)) sobj;
sobj sp;
auto sin = coalescedRead(in);
projector::Proj(sp,sin,mu,dag);
coalescedWrite(buf,sp);
#else
projector::Proj(buf,in,mu,dag);
#endif
}
/*****************************************************/
/* Exchange includes precision change if mpi data is not same */
/*****************************************************/
accelerator_inline void Exchange(SiteHalfSpinor *mp,
SiteHalfSpinor *vp0,
SiteHalfSpinor *vp1,
Integer type,Integer o) const {
SiteHalfSpinor vt0,vt1;
SiteHalfCommSpinor *vpp0 = (SiteHalfCommSpinor *)vp0;
SiteHalfCommSpinor *vpp1 = (SiteHalfCommSpinor *)vp1;
precisionChange((vComplexHigh *)&vt0,(vComplexLow *)&vpp0[o],Nw);
precisionChange((vComplexHigh *)&vt1,(vComplexLow *)&vpp1[o],Nw);
exchange(mp[2*o],mp[2*o+1],vt0,vt1,type);
}
/*****************************************************/
/* Have a decompression step if mpi data is not same */
/*****************************************************/
accelerator_inline void Decompress(SiteHalfSpinor *out, SiteHalfSpinor *in, Integer o) const {
SiteHalfCommSpinor *hin=(SiteHalfCommSpinor *)in;
precisionChange((vComplexHigh *)&out[o],(vComplexLow *)&hin[o],Nw);
}
/*****************************************************/
/* Compress Exchange */
/*****************************************************/
accelerator_inline void CompressExchange(SiteHalfSpinor *out0,
SiteHalfSpinor *out1,
const SiteSpinor *in,
Integer j,Integer k, Integer m,Integer type) const {
SiteHalfSpinor temp1, temp2,temp3,temp4;
SiteHalfCommSpinor *hout0 = (SiteHalfCommSpinor *)out0;
SiteHalfCommSpinor *hout1 = (SiteHalfCommSpinor *)out1;
projector::Proj(temp1,in[k],mu,dag);
projector::Proj(temp2,in[m],mu,dag);
exchange(temp3,temp4,temp1,temp2,type);
precisionChange((vComplexLow *)&hout0[j],(vComplexHigh *)&temp3,Nw);
precisionChange((vComplexLow *)&hout1[j],(vComplexHigh *)&temp4,Nw);
}
/*****************************************************/
/* Pass the info to the stencil */
/*****************************************************/
accelerator_inline bool DecompressionStep(void) const { return true; }
};
#endif
#define DECLARE_PROJ(Projector,Compressor,spProj) \ #define DECLARE_PROJ(Projector,Compressor,spProj) \
class Projector { \ class Projector { \
public: \ public: \
@ -404,7 +294,11 @@ public:
typedef typename Base::View_type View_type; typedef typename Base::View_type View_type;
typedef typename Base::StencilVector StencilVector; typedef typename Base::StencilVector StencilVector;
// Vector<int> surface_list; void ZeroCountersi(void) { }
void Reporti(int calls) { }
std::vector<int> surface_list;
WilsonStencil(GridBase *grid, WilsonStencil(GridBase *grid,
int npoints, int npoints,
int checkerboard, int checkerboard,
@ -412,11 +306,11 @@ public:
const std::vector<int> &distances,Parameters p) const std::vector<int> &distances,Parameters p)
: CartesianStencil<vobj,cobj,Parameters> (grid,npoints,checkerboard,directions,distances,p) : CartesianStencil<vobj,cobj,Parameters> (grid,npoints,checkerboard,directions,distances,p)
{ {
// surface_list.resize(0); ZeroCountersi();
surface_list.resize(0);
this->same_node.resize(npoints); this->same_node.resize(npoints);
}; };
/*
void BuildSurfaceList(int Ls,int vol4){ void BuildSurfaceList(int Ls,int vol4){
// find same node for SHM // find same node for SHM
@ -437,8 +331,7 @@ public:
} }
} }
} }
*/
template < class compressor> template < class compressor>
void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress) void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)
{ {
@ -484,26 +377,24 @@ public:
int dag = compress.dag; int dag = compress.dag;
int face_idx=0; int face_idx=0;
#define vet_same_node(a,b) \
{ auto tmp = b; }
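      // The gather still has to be issued for its side effects, so the macro
      // above evaluates and discards its argument instead of asserting, as the
      // old code did, that the same-node classification is unchanged.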
if ( dag ) { if ( dag ) {
vet_same_node(this->same_node[Xp],this->HaloGatherDir(source,XpCompress,Xp,face_idx)); assert(this->same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
vet_same_node(this->same_node[Yp],this->HaloGatherDir(source,YpCompress,Yp,face_idx)); assert(this->same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
vet_same_node(this->same_node[Zp],this->HaloGatherDir(source,ZpCompress,Zp,face_idx)); assert(this->same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
vet_same_node(this->same_node[Tp],this->HaloGatherDir(source,TpCompress,Tp,face_idx)); assert(this->same_node[Tp]==this->HaloGatherDir(source,TpCompress,Tp,face_idx));
vet_same_node(this->same_node[Xm],this->HaloGatherDir(source,XmCompress,Xm,face_idx)); assert(this->same_node[Xm]==this->HaloGatherDir(source,XmCompress,Xm,face_idx));
vet_same_node(this->same_node[Ym],this->HaloGatherDir(source,YmCompress,Ym,face_idx)); assert(this->same_node[Ym]==this->HaloGatherDir(source,YmCompress,Ym,face_idx));
vet_same_node(this->same_node[Zm],this->HaloGatherDir(source,ZmCompress,Zm,face_idx)); assert(this->same_node[Zm]==this->HaloGatherDir(source,ZmCompress,Zm,face_idx));
vet_same_node(this->same_node[Tm],this->HaloGatherDir(source,TmCompress,Tm,face_idx)); assert(this->same_node[Tm]==this->HaloGatherDir(source,TmCompress,Tm,face_idx));
} else { } else {
vet_same_node(this->same_node[Xp],this->HaloGatherDir(source,XmCompress,Xp,face_idx)); assert(this->same_node[Xp]==this->HaloGatherDir(source,XmCompress,Xp,face_idx));
vet_same_node(this->same_node[Yp],this->HaloGatherDir(source,YmCompress,Yp,face_idx)); assert(this->same_node[Yp]==this->HaloGatherDir(source,YmCompress,Yp,face_idx));
vet_same_node(this->same_node[Zp],this->HaloGatherDir(source,ZmCompress,Zp,face_idx)); assert(this->same_node[Zp]==this->HaloGatherDir(source,ZmCompress,Zp,face_idx));
vet_same_node(this->same_node[Tp],this->HaloGatherDir(source,TmCompress,Tp,face_idx)); assert(this->same_node[Tp]==this->HaloGatherDir(source,TmCompress,Tp,face_idx));
vet_same_node(this->same_node[Xm],this->HaloGatherDir(source,XpCompress,Xm,face_idx)); assert(this->same_node[Xm]==this->HaloGatherDir(source,XpCompress,Xm,face_idx));
vet_same_node(this->same_node[Ym],this->HaloGatherDir(source,YpCompress,Ym,face_idx)); assert(this->same_node[Ym]==this->HaloGatherDir(source,YpCompress,Ym,face_idx));
vet_same_node(this->same_node[Zm],this->HaloGatherDir(source,ZpCompress,Zm,face_idx)); assert(this->same_node[Zm]==this->HaloGatherDir(source,ZpCompress,Zm,face_idx));
vet_same_node(this->same_node[Tm],this->HaloGatherDir(source,TpCompress,Tm,face_idx)); assert(this->same_node[Tm]==this->HaloGatherDir(source,TpCompress,Tm,face_idx));
} }
this->face_table_computed=1; this->face_table_computed=1;
assert(this->u_comm_offset==this->_unified_buffer_size); assert(this->u_comm_offset==this->_unified_buffer_size);

View File

@ -74,6 +74,20 @@ public:
FermionField _tmp; FermionField _tmp;
FermionField &tmp(void) { return _tmp; } FermionField &tmp(void) { return _tmp; }
void Report(void);
void ZeroCounters(void);
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
double DhopTotalTime;
double DerivCalls;
double DerivCommTime;
double DerivComputeTime;
double DerivDhopComputeTime;
////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////
// override multiply; cut number routines if pass dagger argument // override multiply; cut number routines if pass dagger argument
// and also make interface more uniformly consistent // and also make interface more uniformly consistent

View File

@ -75,8 +75,19 @@ public:
FermionField _tmp; FermionField _tmp;
FermionField &tmp(void) { return _tmp; } FermionField &tmp(void) { return _tmp; }
int Dirichlet; void Report(void);
Coordinate Block; void ZeroCounters(void);
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
double DhopComputeTime2;
double DhopFaceTime;
double DhopTotalTime;
double DerivCalls;
double DerivCommTime;
double DerivComputeTime;
double DerivDhopComputeTime;
/////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////
// Implement the abstract base // Implement the abstract base
@ -162,10 +173,7 @@ public:
GridCartesian &FourDimGrid, GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid, GridRedBlackCartesian &FourDimRedBlackGrid,
double _M5,const ImplParams &p= ImplParams()); double _M5,const ImplParams &p= ImplParams());
virtual void DirichletBlock(const Coordinate & block)
{
}
// Constructors // Constructors
/* /*
WilsonFermion5D(int simd, WilsonFermion5D(int simd,

View File

@ -37,7 +37,7 @@ NAMESPACE_BEGIN(Grid);
template <class S, class Representation = FundamentalRepresentation,class Options = CoeffReal > template <class S, class Representation = FundamentalRepresentation,class Options = CoeffReal >
class WilsonImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > { class WilsonImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > {
public: public:
static const int Dimension = Representation::Dimension; static const int Dimension = Representation::Dimension;
static const bool isFundamental = Representation::isFundamental; static const bool isFundamental = Representation::isFundamental;
static const bool LsVectorised=false; static const bool LsVectorised=false;
@ -242,13 +242,19 @@ public:
typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffReal > WilsonImplR; // Real.. whichever prec typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffReal > WilsonImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffReal > WilsonImplF; // Float typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffReal > WilsonImplF; // Float
typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffReal > WilsonImplD; // Double typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffReal > WilsonImplD; // Double
typedef WilsonImpl<vComplexD2, FundamentalRepresentation, CoeffReal > WilsonImplD2; // Double
//typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplRL; // Real.. whichever prec
//typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplFH; // Float
//typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffRealHalfComms > WilsonImplDF; // Double
typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffComplex > ZWilsonImplR; // Real.. whichever prec typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffComplex > ZWilsonImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplex > ZWilsonImplF; // Float typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplex > ZWilsonImplF; // Float
typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplex > ZWilsonImplD; // Double typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplex > ZWilsonImplD; // Double
typedef WilsonImpl<vComplexD2, FundamentalRepresentation, CoeffComplex > ZWilsonImplD2; // Double
//typedef WilsonImpl<vComplex, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplRL; // Real.. whichever prec
//typedef WilsonImpl<vComplexF, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplFH; // Float
//typedef WilsonImpl<vComplexD, FundamentalRepresentation, CoeffComplexHalfComms > ZWilsonImplDF; // Double
typedef WilsonImpl<vComplex, AdjointRepresentation, CoeffReal > WilsonAdjImplR; // Real.. whichever prec typedef WilsonImpl<vComplex, AdjointRepresentation, CoeffReal > WilsonAdjImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, AdjointRepresentation, CoeffReal > WilsonAdjImplF; // Float typedef WilsonImpl<vComplexF, AdjointRepresentation, CoeffReal > WilsonAdjImplF; // Float
typedef WilsonImpl<vComplexD, AdjointRepresentation, CoeffReal > WilsonAdjImplD; // Double typedef WilsonImpl<vComplexD, AdjointRepresentation, CoeffReal > WilsonAdjImplD; // Double
@ -261,22 +267,6 @@ typedef WilsonImpl<vComplex, TwoIndexAntiSymmetricRepresentation, CoeffReal > W
typedef WilsonImpl<vComplexF, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplF; // Float typedef WilsonImpl<vComplexF, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplF; // Float
typedef WilsonImpl<vComplexD, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplD; // Double typedef WilsonImpl<vComplexD, TwoIndexAntiSymmetricRepresentation, CoeffReal > WilsonTwoIndexAntiSymmetricImplD; // Double
//sp 2n
typedef WilsonImpl<vComplex, SpFundamentalRepresentation, CoeffReal > SpWilsonImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, SpFundamentalRepresentation, CoeffReal > SpWilsonImplF; // Float
typedef WilsonImpl<vComplexD, SpFundamentalRepresentation, CoeffReal > SpWilsonImplD; // Double
typedef WilsonImpl<vComplex, SpTwoIndexAntiSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexAntiSymmetricImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, SpTwoIndexAntiSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexAntiSymmetricImplF; // Float
typedef WilsonImpl<vComplexD, SpTwoIndexAntiSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexAntiSymmetricImplD; // Double
typedef WilsonImpl<vComplex, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexSymmetricImplR; // Real.. whichever prec
typedef WilsonImpl<vComplexF, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexSymmetricImplF; // Float
typedef WilsonImpl<vComplexD, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonTwoIndexSymmetricImplD; // Double
typedef WilsonImpl<vComplex, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonAdjImplR; // Real.. whichever prec // adj = 2indx symmetric for Sp(2N)
typedef WilsonImpl<vComplexF, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonAdjImplF; // Float // adj = 2indx symmetric for Sp(2N)
typedef WilsonImpl<vComplexD, SpTwoIndexSymmetricRepresentation, CoeffReal > SpWilsonAdjImplD; // Double // adj = 2indx symmetric for Sp(2N)
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -52,6 +52,13 @@ public:
typedef AcceleratorVector<int,STENCIL_MAX> StencilVector; typedef AcceleratorVector<int,STENCIL_MAX> StencilVector;
public: public:
#ifdef GRID_SYCL
#define SYCL_HACK
#endif
#ifdef SYCL_HACK
static void HandDhopSiteSycl(StencilVector st_perm,StencilEntry *st_p, SiteDoubledGaugeField *U,SiteHalfSpinor *buf,
int ss,int sU,const SiteSpinor *in, SiteSpinor *out);
#endif
static void DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf, static void DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out, int Ls, int Nsite, const FermionField &in, FermionField &out,

View File

@ -152,6 +152,58 @@ void CayleyFermion5D<Impl>::DminusDag(const FermionField &psi, FermionField &chi
} }
} }
template<class Impl> void CayleyFermion5D<Impl>::CayleyReport(void)
{
this->Report();
Coordinate latt = GridDefaultLatt();
RealD volume = this->Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
RealD NP = this->_FourDimGrid->_Nprocessors;
if ( M5Dcalls > 0 ) {
std::cout << GridLogMessage << "#### M5D calls report " << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D Number of M5D Calls : " << M5Dcalls << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << M5Dtime / M5Dcalls << " us" << std::endl;
// Flops = 10.0*(Nc*Ns) *Ls*vol
RealD mflops = 10.0*(Nc*Ns)*volume*M5Dcalls/M5Dtime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
// Bytes = sizeof(Real) * (Nc*Ns*Nreim) * Ls * vol * (read+write) (/2 for red black counting)
// read = 2 ( psi[ss+s+1] and psi[ss+s-1] count as 1 )
// write = 1
RealD Gbytes = sizeof(Real) * (Nc*Ns*2) * volume * 3 /2. * 1.e-9;
std::cout << GridLogMessage << "Average bandwidth (GB/s) : " << Gbytes/M5Dtime*M5Dcalls*1.e6 << std::endl;
}
if ( MooeeInvCalls > 0 ) {
std::cout << GridLogMessage << "#### MooeeInv calls report " << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D Number of MooeeInv Calls : " << MooeeInvCalls << std::endl;
std::cout << GridLogMessage << "CayleyFermion5D ComputeTime/Calls : " << MooeeInvTime / MooeeInvCalls << " us" << std::endl;
#ifdef GRID_CUDA
RealD mflops = ( -16.*Nc*Ns+this->Ls*(1.+18.*Nc*Ns) )*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
#else
// Flops = MADD * Ls *Ls *4dvol * spin/colour/complex
RealD mflops = 2.0*24*this->Ls*volume*MooeeInvCalls/MooeeInvTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
#endif
}
}
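// Worked example for the M5D report above, assuming Nc=3, Ns=4, Ls=12 and a
// 16^4 lattice: volume = Ls*16^4 = 786432, so flops/call =
// 10*(Nc*Ns)*volume/2 = 120*786432/2 ~ 47.2 Mflop, while the bandwidth
// figure counts 2*Nc*Ns = 24 reals per site moved 3 times (2 reads + 1
// write), again halved for red-black.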
template<class Impl> void CayleyFermion5D<Impl>::CayleyZeroCounters(void)
{
this->ZeroCounters();
M5Dflops=0;
M5Dcalls=0;
M5Dtime=0;
MooeeInvFlops=0;
MooeeInvCalls=0;
MooeeInvTime=0;
}
template<class Impl> template<class Impl>
void CayleyFermion5D<Impl>::M5D (const FermionField &psi, FermionField &chi) void CayleyFermion5D<Impl>::M5D (const FermionField &psi, FermionField &chi)
{ {
@ -594,6 +646,7 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
assert(mass_plus == mass_minus); assert(mass_plus == mass_minus);
RealD mass = mass_plus; RealD mass = mass_plus;
#if (!defined(GRID_HIP))
Gamma::Algebra Gmu [] = { Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX, Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY, Gamma::Algebra::GammaY,
@ -712,7 +765,7 @@ void CayleyFermion5D<Impl>::ContractConservedCurrent( PropagatorField &q_in_1,
else q_out += C; else q_out += C;
} }
#endif
} }
template <class Impl> template <class Impl>
@ -779,6 +832,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
} }
#endif #endif
#if (!defined(GRID_HIP))
int tshift = (mu == Nd-1) ? 1 : 0; int tshift = (mu == Nd-1) ? 1 : 0;
unsigned int LLt = GridDefaultLatt()[Tp]; unsigned int LLt = GridDefaultLatt()[Tp];
//////////////////////////////////////////////// ////////////////////////////////////////////////
@ -898,6 +952,7 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
InsertSlice(L_Q, q_out, s , 0); InsertSlice(L_Q, q_out, s , 0);
} }
#endif
} }
#undef Pp #undef Pp
#undef Pm #undef Pm
@ -905,6 +960,88 @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,
#undef TopRowWithSource #undef TopRowWithSource
#if 0
template<class Impl>
void CayleyFermion5D<Impl>::MooeeInternalCompute(int dag, int inv,
Vector<iSinglet<Simd> > & Matp,
Vector<iSinglet<Simd> > & Matm)
{
int Ls=this->Ls;
GridBase *grid = this->FermionRedBlackGrid();
int LLs = grid->_rdimensions[0];
if ( LLs == Ls ) {
return; // Not vectorised in 5th direction
}
Eigen::MatrixXcd Pplus = Eigen::MatrixXcd::Zero(Ls,Ls);
Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
for(int s=0;s<Ls;s++){
Pplus(s,s) = bee[s];
Pminus(s,s)= bee[s];
}
for(int s=0;s<Ls-1;s++){
Pminus(s,s+1) = -cee[s];
}
for(int s=0;s<Ls-1;s++){
Pplus(s+1,s) = -cee[s+1];
}
Pplus (0,Ls-1) = mass*cee[0];
Pminus(Ls-1,0) = mass*cee[Ls-1];
Eigen::MatrixXcd PplusMat ;
Eigen::MatrixXcd PminusMat;
if ( inv ) {
PplusMat =Pplus.inverse();
PminusMat=Pminus.inverse();
} else {
PplusMat =Pplus;
PminusMat=Pminus;
}
if(dag){
PplusMat.adjointInPlace();
PminusMat.adjointInPlace();
}
typedef typename SiteHalfSpinor::scalar_type scalar_type;
const int Nsimd=Simd::Nsimd();
Matp.resize(Ls*LLs);
Matm.resize(Ls*LLs);
for(int s2=0;s2<Ls;s2++){
for(int s1=0;s1<LLs;s1++){
int istride = LLs;
int ostride = 1;
Simd Vp;
Simd Vm;
scalar_type *sp = (scalar_type *)&Vp;
scalar_type *sm = (scalar_type *)&Vm;
for(int l=0;l<Nsimd;l++){
if ( switcheroo<Coeff_t>::iscomplex() ) {
sp[l] = PplusMat (l*istride+s1*ostride,s2);
sm[l] = PminusMat(l*istride+s1*ostride,s2);
} else {
// if real
scalar_type tmp;
tmp = PplusMat (l*istride+s1*ostride,s2);
sp[l] = scalar_type(tmp.real(),tmp.real());
tmp = PminusMat(l*istride+s1*ostride,s2);
sm[l] = scalar_type(tmp.real(),tmp.real());
}
}
Matp[LLs*s2+s1] = Vp;
Matm[LLs*s2+s1] = Vm;
}}
}
#endif
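/* Structure of the matrices built in the disabled routine above, written out
   for Ls=4 (b=bee, c=cee, m=mass):

     P+ = [  b0    0    0   m*c0 ]      P- = [  b0  -c0    0    0  ]
          [ -c1   b1    0    0   ]           [   0   b1  -c1    0  ]
          [   0  -c2   b2    0   ]           [   0    0   b2  -c2  ]
          [   0    0  -c3   b3   ]           [ m*c3   0    0   b3  ]

   i.e. bidiagonal in the fifth dimension with a mass term closing the
   boundary; the inv/dag flags select the inverse and/or adjoint. */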
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -63,18 +63,23 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
// 10 = 3 complex mult + 2 complex add // 10 = 3 complex mult + 2 complex add
// Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting) // Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)
uint64_t nloop = grid->oSites(); M5Dcalls++;
M5Dtime-=usecond();
uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t s = sss%Ls; uint64_t ss= sss*Ls;
uint64_t ss= sss-s;
typedef decltype(coalescedRead(psi[0])) spinor; typedef decltype(coalescedRead(psi[0])) spinor;
spinor tmp1, tmp2; spinor tmp1, tmp2;
uint64_t idx_u = ss+((s+1)%Ls); for(int s=0;s<Ls;s++){
uint64_t idx_l = ss+((s+Ls-1)%Ls); uint64_t idx_u = ss+((s+1)%Ls);
spProj5m(tmp1,psi(idx_u)); uint64_t idx_l = ss+((s+Ls-1)%Ls);
spProj5p(tmp2,psi(idx_l)); spProj5m(tmp1,psi(idx_u));
coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2); spProj5p(tmp2,psi(idx_l));
coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
}
}); });
M5Dtime+=usecond();
} }
template<class Impl> template<class Impl>
@ -100,18 +105,23 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
int Ls=this->Ls; int Ls=this->Ls;
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
uint64_t nloop = grid->oSites(); M5Dcalls++;
M5Dtime-=usecond();
uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t s = sss%Ls; uint64_t ss=sss*Ls;
uint64_t ss= sss-s;
typedef decltype(coalescedRead(psi[0])) spinor; typedef decltype(coalescedRead(psi[0])) spinor;
spinor tmp1,tmp2; spinor tmp1,tmp2;
uint64_t idx_u = ss+((s+1)%Ls); for(int s=0;s<Ls;s++){
uint64_t idx_l = ss+((s+Ls-1)%Ls); uint64_t idx_u = ss+((s+1)%Ls);
spProj5p(tmp1,psi(idx_u)); uint64_t idx_l = ss+((s+Ls-1)%Ls);
spProj5m(tmp2,psi(idx_l)); spProj5p(tmp1,psi(idx_u));
coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2); spProj5m(tmp2,psi(idx_l));
coalescedWrite(chi[ss+s],pdiag[s]*phi(ss+s)+pupper[s]*tmp1+plower[s]*tmp2);
}
}); });
M5Dtime+=usecond();
} }
template<class Impl> template<class Impl>
@ -132,6 +142,8 @@ CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi_i, FermionField &chi
auto pleem = & leem[0]; auto pleem = & leem[0];
auto pueem = & ueem[0]; auto pueem = & ueem[0];
MooeeInvCalls++;
MooeeInvTime-=usecond();
uint64_t nloop = grid->oSites()/Ls; uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss=sss*Ls; uint64_t ss=sss*Ls;
@ -168,6 +180,8 @@ CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi_i, FermionField &chi
coalescedWrite(chi[ss+s],res); coalescedWrite(chi[ss+s],res);
} }
}); });
MooeeInvTime+=usecond();
} }
@ -190,6 +204,10 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
assert(psi.Checkerboard() == psi.Checkerboard()); assert(psi.Checkerboard() == psi.Checkerboard());
MooeeInvCalls++;
MooeeInvTime-=usecond();
uint64_t nloop = grid->oSites()/Ls; uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss=sss*Ls; uint64_t ss=sss*Ls;
@ -226,6 +244,7 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
coalescedWrite(chi[ss+s],res); coalescedWrite(chi[ss+s],res);
} }
}); });
MooeeInvTime+=usecond();
} }

View File

@ -94,6 +94,10 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
d_p[ss] = diag[s]; d_p[ss] = diag[s];
}} }}
M5Dcalls++;
M5Dtime-=usecond();
assert(Nc==3); assert(Nc==3);
thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs
@ -194,6 +198,7 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
} }
#endif #endif
}); });
M5Dtime+=usecond();
} }
template<class Impl> template<class Impl>
@ -237,6 +242,8 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
d_p[ss] = diag[s]; d_p[ss] = diag[s];
}} }}
M5Dcalls++;
M5Dtime-=usecond();
thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs
#if 0 #if 0
alignas(64) SiteHalfSpinor hp; alignas(64) SiteHalfSpinor hp;
@ -332,6 +339,7 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
} }
#endif #endif
}); });
M5Dtime+=usecond();
} }
@ -805,6 +813,9 @@ CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,
} }
assert(_Matp->size()==Ls*LLs); assert(_Matp->size()==Ls*LLs);
MooeeInvCalls++;
MooeeInvTime-=usecond();
if ( switcheroo<Coeff_t>::iscomplex() ) { if ( switcheroo<Coeff_t>::iscomplex() ) {
thread_loop( (auto site=0;site<vol;site++),{ thread_loop( (auto site=0;site<vol;site++),{
MooeeInternalZAsm(psi,chi,LLs,site,*_Matp,*_Matm); MooeeInternalZAsm(psi,chi,LLs,site,*_Matp,*_Matm);
@ -814,7 +825,7 @@ CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,
MooeeInternalAsm(psi,chi,LLs,site,*_Matp,*_Matm); MooeeInternalAsm(psi,chi,LLs,site,*_Matp,*_Matm);
}); });
} }
MooeeInvTime+=usecond();
} }
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -54,6 +54,8 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
auto pupper = &upper[0]; auto pupper = &upper[0];
auto plower = &lower[0]; auto plower = &lower[0];
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
this->M5Dcalls++;
this->M5Dtime -= usecond();
auto nloop=grid->oSites()/Ls; auto nloop=grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -69,6 +71,7 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
} }
}); });
this->M5Dtime += usecond();
} }
template<class Impl> template<class Impl>
@ -88,6 +91,8 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
auto plower = &lower[0]; auto plower = &lower[0];
// Flops = 6.0*(Nc*Ns) *Ls*vol // Flops = 6.0*(Nc*Ns) *Ls*vol
this->M5Dcalls++;
this->M5Dtime -= usecond();
auto nloop=grid->oSites()/Ls; auto nloop=grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -103,6 +108,7 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
} }
}); });
this->M5Dtime += usecond();
} }
template<class Impl> template<class Impl>
@ -121,6 +127,8 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
auto pleem = & this->leem[0]; auto pleem = & this->leem[0];
auto pueem = & this->ueem[0]; auto pueem = & this->ueem[0];
this->MooeeInvCalls++;
this->MooeeInvTime -= usecond();
uint64_t nloop=grid->oSites()/Ls; uint64_t nloop=grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss=sss*Ls; uint64_t ss=sss*Ls;
@ -156,6 +164,7 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
coalescedWrite(chi[ss+s],res); coalescedWrite(chi[ss+s],res);
} }
}); });
this->MooeeInvTime += usecond();
} }
template<class Impl> template<class Impl>
@ -176,6 +185,8 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion
assert(psi.Checkerboard() == psi.Checkerboard()); assert(psi.Checkerboard() == psi.Checkerboard());
this->MooeeInvCalls++;
this->MooeeInvTime -= usecond();
auto nloop = grid->oSites()/Ls; auto nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{ accelerator_for(sss,nloop,Simd::Nsimd(),{
uint64_t ss=sss*Ls; uint64_t ss=sss*Ls;
@ -212,6 +223,7 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion
} }
}); });
this->MooeeInvTime += usecond();
} }
NAMESPACE_END(Grid); NAMESPACE_END(Grid);

View File

@ -298,33 +298,45 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
int LLs = in.Grid()->_rdimensions[0]; int LLs = in.Grid()->_rdimensions[0];
int len = U.Grid()->oSites(); int len = U.Grid()->oSites();
DhopFaceTime-=usecond();
st.Prepare(); st.Prepare();
st.HaloGather(in,compressor); st.HaloGather(in,compressor);
DhopFaceTime+=usecond();
DhopCommTime -=usecond();
std::vector<std::vector<CommsRequest_t> > requests; std::vector<std::vector<CommsRequest_t> > requests;
st.CommunicateBegin(requests); st.CommunicateBegin(requests);
// st.HaloExchangeOptGather(in,compressor); // Wilson compressor // st.HaloExchangeOptGather(in,compressor); // Wilson compressor
DhopFaceTime-=usecond();
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
DhopFaceTime+=usecond();
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
// Remove explicit thread mapping introduced for OPA reasons. // Remove explicit thread mapping introduced for OPA reasons.
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
DhopComputeTime-=usecond();
{ {
int interior=1; int interior=1;
int exterior=0; int exterior=0;
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
} }
DhopComputeTime+=usecond();
DhopFaceTime-=usecond();
st.CommsMerge(compressor); st.CommsMerge(compressor);
DhopFaceTime+=usecond();
st.CommunicateComplete(requests); st.CommunicateComplete(requests);
DhopCommTime +=usecond();
DhopComputeTime2-=usecond();
{ {
int interior=0; int interior=0;
int exterior=1; int exterior=1;
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior); Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
} }
DhopComputeTime2+=usecond();
} }
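// Comms/compute overlap pattern realised above (sketch):
//   gather faces -> start MPI -> merge shared-memory faces
//   -> compute interior -> merge received faces -> complete MPI
//   -> compute exterior
// so the bulk of the stencil work hides the communication latency.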
template<class Impl> template<class Impl>
@ -335,14 +347,22 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
Compressor compressor; Compressor compressor;
int LLs = in.Grid()->_rdimensions[0]; int LLs = in.Grid()->_rdimensions[0];
//double t1=usecond();
DhopTotalTime -= usecond();
DhopCommTime -= usecond();
st.HaloExchange(in,compressor); st.HaloExchange(in,compressor);
DhopCommTime += usecond();
DhopComputeTime -= usecond();
   // Dhop takes the 4d grid from U, and makes a 5d index for fermion
   {
     int interior=1;
     int exterior=1;
     Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
   }
+  DhopComputeTime += usecond();
+  DhopTotalTime   += usecond();
 }
 /*CHANGE END*/

@@ -351,6 +371,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 {
+  DhopCalls+=1;
   conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
   conformable(in.Grid(),out.Grid()); // drops the cb check

@@ -362,6 +383,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
+  DhopCalls+=1;
   conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
   conformable(in.Grid(),out.Grid()); // drops the cb check

@@ -373,6 +395,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
 {
+  DhopCalls+=2;
   conformable(in.Grid(),FermionGrid()); // verifies full grid
   conformable(in.Grid(),out.Grid());

@@ -381,6 +404,58 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField
   DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
 }

+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::Report(void)
+{
+  Coordinate latt = GridDefaultLatt();
+  RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
+  RealD NP = _FourDimGrid->_Nprocessors;
+  RealD NN = _FourDimGrid->NodeCount();
+
+  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
+
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls   : "
+            << DhopCalls << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime   /Calls       : "
+            << DhopTotalTime   / DhopCalls << " us" << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime    /Calls       : "
+            << DhopCommTime    / DhopCalls << " us" << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls        : "
+            << DhopComputeTime / DhopCalls << " us" << std::endl;
+
+  // Average the compute time
+  _FourDimGrid->GlobalSum(DhopComputeTime);
+  DhopComputeTime/=NP;
+
+  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
+  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
+
+  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
+  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
+
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil"     << std::endl; Stencil.Report();
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilEven" << std::endl; StencilEven.Report();
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilOdd"  << std::endl; StencilOdd.Report();
+}
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void)
+{
+  DhopCalls       = 0;
+  DhopTotalTime   = 0;
+  DhopCommTime    = 0;
+  DhopComputeTime = 0;
+  DhopFaceTime    = 0;
+
+  Stencil.ZeroCounters();
+  StencilEven.ZeroCounters();
+  StencilOdd.ZeroCounters();
+}
+
 /////////////////////////////////////////////////////////////////////////
 // Implement the general interface. Here we use SAME mass on all slices
 /////////////////////////////////////////////////////////////////////////
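A gloss on the counter conventions just added (an editorial note, not part of the commit): the full-grid `Dhop` bumps `DhopCalls` by 2 while the checkerboarded `DhopOE`/`DhopEO` bump it by 1, so one unit of `DhopCalls` always means one half-grid application; the trailing `/2` in the rate formulas (`// 2 for red black counting`) compensates for exactly this. Because the timers accumulate microseconds from `usecond()`, the ratio lands directly in Mflop/s with no extra unit factor:

$$
\text{mflops} \;=\; \frac{N_{\text{flops/site}} \times V \times \text{DhopCalls}}{2 \times \text{DhopComputeTime}\,[\mu\mathrm{s}]},
$$

where $V$ is the global five-dimensional volume and $N_{\text{flops/site}}$ is the per-site flop count the code asserts for each action (1154 above for improved staggered); flops per microsecond are $10^6$ flops per second, i.e. Mflop/s.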


@@ -334,6 +334,7 @@ void ImprovedStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionF
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
 {
+  DhopCalls+=2;
   conformable(in.Grid(), _grid);  // verifies full grid
   conformable(in.Grid(), out.Grid());

@@ -345,6 +346,7 @@ void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
 {
+  DhopCalls+=1;
   conformable(in.Grid(), _cbgrid);     // verifies half grid
   conformable(in.Grid(), out.Grid());  // drops the cb check

@@ -357,6 +359,7 @@ void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField
 template <class Impl>
 void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag)
 {
+  DhopCalls+=1;
   conformable(in.Grid(), _cbgrid);     // verifies half grid
   conformable(in.Grid(), out.Grid());  // drops the cb check

@@ -415,33 +418,47 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
   Compressor compressor;
   int len =  U.Grid()->oSites();

+  DhopTotalTime -= usecond();
+
+  DhopFaceTime -= usecond();
   st.Prepare();
   st.HaloGather(in,compressor);
+  DhopFaceTime += usecond();

+  DhopCommTime -=usecond();
   std::vector<std::vector<CommsRequest_t> > requests;
   st.CommunicateBegin(requests);

+  DhopFaceTime-=usecond();
   st.CommsMergeSHM(compressor);
+  DhopFaceTime+= usecond();

   //////////////////////////////////////////////////////////////////////////////////////////////////////
   // Removed explicit thread comms
   //////////////////////////////////////////////////////////////////////////////////////////////////////
+  DhopComputeTime -= usecond();
   {
     int interior=1;
     int exterior=0;
     Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
   }
+  DhopComputeTime += usecond();

   st.CommunicateComplete(requests);
+  DhopCommTime +=usecond();

   // First to enter, last to leave timing
+  DhopFaceTime -= usecond();
   st.CommsMerge(compressor);
+  DhopFaceTime -= usecond();

+  DhopComputeTime2 -= usecond();
   {
     int interior=0;
     int exterior=1;
     Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
   }
+  DhopComputeTime2 += usecond();
 }
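The overlapped-comms path above follows a fixed choreography: gather the faces, start non-blocking comms, compute the interior while messages are in flight, complete the comms, merge the received halos, then compute the exterior. A minimal stand-alone sketch of that schedule (the `Stencil` methods and the two `compute*` functions below are hypothetical stand-ins, not the Grid API):

```cpp
#include <vector>

struct Request {};

// Hypothetical stand-ins for the stencil/halo machinery used above.
struct Stencil {
  void Prepare() {}
  void HaloGather() {}                                // pack face buffers
  void CommunicateBegin(std::vector<Request> &) {}    // post non-blocking sends/recvs
  void CommsMergeSHM() {}                             // intra-node (shared-memory) merge
  void CommunicateComplete(std::vector<Request> &) {} // wait on inter-node messages
  void CommsMerge() {}                                // unpack received halos
};

void computeInterior() {} // sites with no off-node neighbours
void computeExterior() {} // boundary sites, need the merged halos

void dhopOverlapped(Stencil &st) {
  std::vector<Request> requests;
  st.Prepare();
  st.HaloGather();
  st.CommunicateBegin(requests); // comms now in flight
  st.CommsMergeSHM();            // same-node neighbours need no network
  computeInterior();             // overlapped with the network transfers
  st.CommunicateComplete(requests);
  st.CommsMerge();
  computeExterior();             // finally touch the halo-dependent sites
}
```

The point of the instrumentation added in this commit is to time each of these stages separately (`DhopFaceTime`, `DhopCommTime`, `DhopComputeTime`, `DhopComputeTime2`), so the overlap's effectiveness can be read off per call.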
@@ -454,16 +471,78 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Le
 {
   assert((dag == DaggerNo) || (dag == DaggerYes));

+  DhopTotalTime -= usecond();
+
+  DhopCommTime -= usecond();
   Compressor compressor;
   st.HaloExchange(in, compressor);
+  DhopCommTime += usecond();

+  DhopComputeTime -= usecond();
   {
     int interior=1;
     int exterior=1;
     Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
   }
+  DhopComputeTime += usecond();
+  DhopTotalTime   += usecond();
 };

+////////////////////////////////////////////////////////////////
+// Reporting
+////////////////////////////////////////////////////////////////
+template<class Impl>
+void ImprovedStaggeredFermion<Impl>::Report(void)
+{
+  Coordinate latt = _grid->GlobalDimensions();
+  RealD volume = 1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
+  RealD NP = _grid->_Nprocessors;
+  RealD NN = _grid->NodeCount();
+
+  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
+
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion Number of DhopEO Calls   : "
+            << DhopCalls << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion TotalTime   /Calls       : "
+            << DhopTotalTime   / DhopCalls << " us" << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion CommTime    /Calls       : "
+            << DhopCommTime    / DhopCalls << " us" << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion ComputeTime/Calls        : "
+            << DhopComputeTime / DhopCalls << " us" << std::endl;
+
+  // Average the compute time
+  _grid->GlobalSum(DhopComputeTime);
+  DhopComputeTime/=NP;
+
+  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
+  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
+
+  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
+  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
+
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion Stencil"     << std::endl; Stencil.Report();
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilEven" << std::endl; StencilEven.Report();
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion StencilOdd"  << std::endl; StencilOdd.Report();
+}
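The counters printed here are filled with the subtract-before/add-after idiom used throughout this comparison (`T -= usecond();` ... `T += usecond();`), which sums elapsed intervals into a running total without a separate start variable. A small self-contained illustration of the idiom (the `usecond` stand-in below is an assumption for this sketch, not Grid's own header):

```cpp
#include <chrono>
#include <cstdio>

// Stand-in for Grid's usecond(): wall clock in microseconds.
static double usecond() {
  using namespace std::chrono;
  return duration_cast<microseconds>(steady_clock::now().time_since_epoch()).count();
}

int main() {
  double computeTime = 0; // running total, like DhopComputeTime
  int calls = 0;
  for (int i = 0; i < 3; i++) {
    calls++;
    computeTime -= usecond(); // entry: total -= t_in
    // ... kernel would run here ...
    computeTime += usecond(); // exit:  total += t_out, net gain is (t_out - t_in)
  }
  std::printf("time/call = %f us\n", computeTime / calls);
  return 0;
}
```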
+template<class Impl>
+void ImprovedStaggeredFermion<Impl>::ZeroCounters(void)
+{
+  DhopCalls       = 0;
+  DhopTotalTime   = 0;
+  DhopCommTime    = 0;
+  DhopComputeTime = 0;
+  DhopFaceTime    = 0;
+
+  Stencil.ZeroCounters();
+  StencilEven.ZeroCounters();
+  StencilOdd.ZeroCounters();
+}
+
 ////////////////////////////////////////////////////////
 // Conserved current - not yet implemented.
 ////////////////////////////////////////////////////////


@@ -55,6 +55,9 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
   auto plower = &lower[0];

   // Flops = 6.0*(Nc*Ns) *Ls*vol
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+
   int nloop = grid->oSites()/Ls;
   accelerator_for(sss,nloop,Simd::Nsimd(),{
     uint64_t ss = sss*Ls;

@@ -70,6 +73,7 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
     }
   });
+  this->M5Dtime += usecond();
 }

 template<class Impl>
@@ -95,6 +99,9 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
   auto pshift_coeffs = &shift_coeffs[0];

   // Flops = 6.0*(Nc*Ns) *Ls*vol
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+
   int nloop = grid->oSites()/Ls;
   accelerator_for(sss,nloop,Simd::Nsimd(),{
     uint64_t ss = sss*Ls;

@@ -115,6 +122,7 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
     }
   });
+  this->M5Dtime += usecond();
 }

 template<class Impl>
@@ -135,6 +143,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
   auto plower = &lower[0];

   // Flops = 6.0*(Nc*Ns) *Ls*vol
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+
   int nloop = grid->oSites()/Ls;
   accelerator_for(sss,nloop,Simd::Nsimd(), {
     uint64_t ss = sss*Ls;

@@ -150,6 +161,8 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
       coalescedWrite(chi[ss+s], pdiag[s]*phi(ss+s) + pupper[s]*tmp1 + plower[s]*tmp2);
     }
   });
+
+  this->M5Dtime += usecond();
 }

 template<class Impl>
@@ -173,6 +186,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
   auto pshift_coeffs = &shift_coeffs[0];

   // Flops = 6.0*(Nc*Ns) *Ls*vol
+  this->M5Dcalls++;
+  this->M5Dtime -= usecond();
+
   auto pm = this->pm;

   int nloop = grid->oSites()/Ls;
@@ -201,6 +217,7 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
     }
   });
+  this->M5Dtime += usecond();
 }

 template<class Impl>
@@ -220,6 +237,9 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &

   if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; }

+  this->MooeeInvCalls++;
+  this->MooeeInvTime -= usecond();
+
   int nloop = grid->oSites()/Ls;
   accelerator_for(sss,nloop,Simd::Nsimd(),{
     uint64_t ss=sss*Ls;

@@ -257,6 +277,7 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
     }
   });
+  this->MooeeInvTime += usecond();
 }

 template<class Impl>
@@ -276,6 +297,8 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
   auto pueem= & this->ueem[0];
   auto pMooeeInv_shift_lc   = &MooeeInv_shift_lc[0];
   auto pMooeeInv_shift_norm = &MooeeInv_shift_norm[0];
+  this->MooeeInvCalls++;
+  this->MooeeInvTime -= usecond();

   int nloop = grid->oSites()/Ls;
   accelerator_for(sss,nloop,Simd::Nsimd(),{

@@ -320,6 +343,7 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
     }
   });
+  this->MooeeInvTime += usecond();
 }

 template<class Impl>
@@ -339,6 +363,9 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
   auto pleem= & this->leem[0];
   auto pueem= & this->ueem[0];

+  this->MooeeInvCalls++;
+  this->MooeeInvTime -= usecond();
+
   int nloop = grid->oSites()/Ls;
   accelerator_for(sss,nloop,Simd::Nsimd(),{
     uint64_t ss=sss*Ls;

@@ -375,6 +402,7 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
       coalescedWrite(chi[ss+s],res);
     }
   });
+  this->MooeeInvTime += usecond();
 }

 template<class Impl>
@@ -395,6 +423,9 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
   auto pMooeeInvDag_shift_lc   = &MooeeInvDag_shift_lc[0];
   auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0];

+  this->MooeeInvCalls++;
+  this->MooeeInvTime -= usecond();
+
   int nloop = grid->oSites()/Ls;
   accelerator_for(sss,nloop,Simd::Nsimd(),{
     uint64_t ss=sss*Ls;

@@ -438,6 +469,7 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
     }
   });
+  this->MooeeInvTime += usecond();
 }

 NAMESPACE_END(Grid);


@@ -263,6 +263,7 @@ void NaiveStaggeredFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionFiel
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
 {
+  DhopCalls+=2;
   conformable(in.Grid(), _grid);  // verifies full grid
   conformable(in.Grid(), out.Grid());

@@ -274,6 +275,7 @@ void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
 {
+  DhopCalls+=1;
   conformable(in.Grid(), _cbgrid);     // verifies half grid
   conformable(in.Grid(), out.Grid());  // drops the cb check

@@ -286,6 +288,7 @@ void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &o
 template <class Impl>
 void NaiveStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &out, int dag)
 {
+  DhopCalls+=1;
   conformable(in.Grid(), _cbgrid);     // verifies half grid
   conformable(in.Grid(), out.Grid());  // drops the cb check

@@ -342,33 +345,47 @@ void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, L
   Compressor compressor;
   int len =  U.Grid()->oSites();

+  DhopTotalTime -= usecond();
+
+  DhopFaceTime -= usecond();
   st.Prepare();
   st.HaloGather(in,compressor);
+  DhopFaceTime += usecond();

+  DhopCommTime -=usecond();
   std::vector<std::vector<CommsRequest_t> > requests;
   st.CommunicateBegin(requests);

+  DhopFaceTime-=usecond();
   st.CommsMergeSHM(compressor);
+  DhopFaceTime+= usecond();

   //////////////////////////////////////////////////////////////////////////////////////////////////////
   // Removed explicit thread comms
   //////////////////////////////////////////////////////////////////////////////////////////////////////
+  DhopComputeTime -= usecond();
   {
     int interior=1;
     int exterior=0;
     Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
   }
+  DhopComputeTime += usecond();

   st.CommunicateComplete(requests);
+  DhopCommTime +=usecond();

   // First to enter, last to leave timing
+  DhopFaceTime -= usecond();
   st.CommsMerge(compressor);
+  DhopFaceTime -= usecond();

+  DhopComputeTime2 -= usecond();
   {
     int interior=0;
     int exterior=1;
     Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
   }
+  DhopComputeTime2 += usecond();
 }

 template <class Impl>
@@ -379,16 +396,78 @@ void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, Lebes
 {
   assert((dag == DaggerNo) || (dag == DaggerYes));

+  DhopTotalTime -= usecond();
+
+  DhopCommTime -= usecond();
   Compressor compressor;
   st.HaloExchange(in, compressor);
+  DhopCommTime += usecond();

+  DhopComputeTime -= usecond();
   {
     int interior=1;
     int exterior=1;
     Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
   }
+  DhopComputeTime += usecond();
+  DhopTotalTime   += usecond();
 };

+////////////////////////////////////////////////////////////////
+// Reporting
+////////////////////////////////////////////////////////////////
+template<class Impl>
+void NaiveStaggeredFermion<Impl>::Report(void)
+{
+  Coordinate latt = _grid->GlobalDimensions();
+  RealD volume = 1;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
+  RealD NP = _grid->_Nprocessors;
+  RealD NN = _grid->NodeCount();
+
+  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
+
+  std::cout << GridLogMessage << "NaiveStaggeredFermion Number of DhopEO Calls   : "
+            << DhopCalls << std::endl;
+  std::cout << GridLogMessage << "NaiveStaggeredFermion TotalTime   /Calls       : "
+            << DhopTotalTime   / DhopCalls << " us" << std::endl;
+  std::cout << GridLogMessage << "NaiveStaggeredFermion CommTime    /Calls       : "
+            << DhopCommTime    / DhopCalls << " us" << std::endl;
+  std::cout << GridLogMessage << "NaiveStaggeredFermion ComputeTime/Calls        : "
+            << DhopComputeTime / DhopCalls << " us" << std::endl;
+
+  // Average the compute time
+  _grid->GlobalSum(DhopComputeTime);
+  DhopComputeTime/=NP;
+
+  RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
+  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
+
+  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
+  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
+
+  std::cout << GridLogMessage << "NaiveStaggeredFermion Stencil"     << std::endl; Stencil.Report();
+  std::cout << GridLogMessage << "NaiveStaggeredFermion StencilEven" << std::endl; StencilEven.Report();
+  std::cout << GridLogMessage << "NaiveStaggeredFermion StencilOdd"  << std::endl; StencilOdd.Report();
+}
+template<class Impl>
+void NaiveStaggeredFermion<Impl>::ZeroCounters(void)
+{
+  DhopCalls       = 0;
+  DhopTotalTime   = 0;
+  DhopCommTime    = 0;
+  DhopComputeTime = 0;
+  DhopFaceTime    = 0;
+
+  Stencil.ZeroCounters();
+  StencilEven.ZeroCounters();
+  StencilOdd.ZeroCounters();
+}
+
 ////////////////////////////////////////////////////////
 // Conserved current - not yet implemented.
 ////////////////////////////////////////////////////////


@@ -60,13 +60,8 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
   UmuOdd (_FourDimRedBlackGrid),
   Lebesgue(_FourDimGrid),
   LebesgueEvenOdd(_FourDimRedBlackGrid),
-  _tmp(&FiveDimRedBlackGrid),
-  Dirichlet(0)
+  _tmp(&FiveDimRedBlackGrid)
 {
-  Stencil.lo = &Lebesgue;
-  StencilEven.lo = &LebesgueEvenOdd;
-  StencilOdd.lo = &LebesgueEvenOdd;
   // some assertions
   assert(FiveDimGrid._ndimension==5);
   assert(FourDimGrid._ndimension==4);

@@ -96,19 +91,6 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
     assert(FourDimRedBlackGrid._simd_layout[d]  ==FourDimGrid._simd_layout[d]);
   }

-  if ( p.dirichlet.size() == Nd+1) {
-    Coordinate block = p.dirichlet;
-    if ( block[0] || block[1] || block[2] || block[3] || block[4] ){
-      Dirichlet = 1;
-      std::cout << GridLogMessage << " WilsonFermion: non-trivial Dirichlet condition "<< block << std::endl;
-      std::cout << GridLogMessage << " WilsonFermion: partial Dirichlet "<< p.partialDirichlet << std::endl;
-      Block = block;
-    }
-  } else {
-    Coordinate block(Nd+1,0);
-    Block = block;
-  }
-
   if (Impl::LsVectorised) {

     int nsimd = Simd::Nsimd();

@@ -143,38 +125,99 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
      StencilEven.BuildSurfaceList(LLs,vol4);
      StencilOdd.BuildSurfaceList(LLs,vol4);

-  //  std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
-  //                        <<" " << StencilEven.surface_list.size()<<std::endl;
 }

+template<class Impl>
+void WilsonFermion5D<Impl>::Report(void)
+{
+  RealD NP     = _FourDimGrid->_Nprocessors;
+  RealD NN     = _FourDimGrid->NodeCount();
+  RealD volume = Ls;
+  Coordinate latt = _FourDimGrid->GlobalDimensions();
+  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
+
+  if ( DhopCalls > 0 ) {
+    std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D Number of DhopEO Calls   : " << DhopCalls << std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D TotalTime   /Calls       : " << DhopTotalTime   / DhopCalls << " us" << std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D CommTime    /Calls       : " << DhopCommTime    / DhopCalls << " us" << std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D FaceTime    /Calls       : " << DhopFaceTime    / DhopCalls << " us" << std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D ComputeTime1/Calls       : " << DhopComputeTime / DhopCalls << " us" << std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D ComputeTime2/Calls       : " << DhopComputeTime2/ DhopCalls << " us" << std::endl;
+
+    // Average the compute time
+    _FourDimGrid->GlobalSum(DhopComputeTime);
+    DhopComputeTime/=NP;
+    RealD mflops = 1344*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
+    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
+
+    RealD Fullmflops = 1344*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
+    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
+  }
+
+  if ( DerivCalls > 0 ) {
+    std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D Number of Deriv Calls    : " <<DerivCalls <<std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls           : " <<DerivCommTime/DerivCalls<<" us" <<std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls        : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl;
+    std::cout << GridLogMessage << "WilsonFermion5D Dhop ComputeTime/Calls   : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
+
+    RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
+    std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NP << std::endl;
+
+    RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting
+    std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
+    std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NP << std::endl; }
+
+  if (DerivCalls > 0 || DhopCalls > 0){
+    std::cout << GridLogMessage << "WilsonFermion5D Stencil"     << std::endl; Stencil.Report();
+    std::cout << GridLogMessage << "WilsonFermion5D StencilEven" << std::endl; StencilEven.Report();
+    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd"  << std::endl; StencilOdd.Report();
+  }
+  if ( DhopCalls > 0){
+    std::cout << GridLogMessage << "WilsonFermion5D Stencil     Reporti()" << std::endl; Stencil.Reporti(DhopCalls);
+    std::cout << GridLogMessage << "WilsonFermion5D StencilEven Reporti()" << std::endl; StencilEven.Reporti(DhopCalls);
+    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd  Reporti()" << std::endl; StencilOdd.Reporti(DhopCalls);
+  }
+}
+
+template<class Impl>
+void WilsonFermion5D<Impl>::ZeroCounters(void) {
+  DhopCalls       = 0;
+  DhopCommTime    = 0;
+  DhopComputeTime = 0;
+  DhopComputeTime2= 0;
+  DhopFaceTime    = 0;
+  DhopTotalTime   = 0;
+
+  DerivCalls       = 0;
+  DerivCommTime    = 0;
+  DerivComputeTime = 0;
+  DerivDhopComputeTime = 0;
+
+  Stencil.ZeroCounters();
+  StencilEven.ZeroCounters();
+  StencilOdd.ZeroCounters();
+  Stencil.ZeroCountersi();
+  StencilEven.ZeroCountersi();
+  StencilOdd.ZeroCountersi();
+}
+
 template<class Impl>
 void WilsonFermion5D<Impl>::ImportGauge(const GaugeField &_Umu)
 {
   GaugeField HUmu(_Umu.Grid());
   HUmu = _Umu*(-0.5);
-  if ( Dirichlet ) {
-
-    if ( this->Params.partialDirichlet ) {
-      std::cout << GridLogMessage << " partialDirichlet BCs " <<Block<<std::endl;
-    } else {
-      std::cout << GridLogMessage << " FULL Dirichlet BCs " <<Block<<std::endl;
-    }
-
-    std:: cout << GridLogMessage << "Checking block size multiple of rank boundaries for Dirichlet"<<std::endl;
-    for(int d=0;d<Nd;d++) {
-      int GaugeBlock = Block[d+1];
-      int ldim=GaugeGrid()->LocalDimensions()[d];
-      if (GaugeBlock) assert( (GaugeBlock%ldim)==0);
-    }
-
-    if (!this->Params.partialDirichlet) {
-      std::cout << GridLogMessage << " Dirichlet filtering gauge field BCs block " <<Block<<std::endl;
-      Coordinate GaugeBlock(Nd);
-      for(int d=0;d<Nd;d++) GaugeBlock[d] = Block[d+1];
-      DirichletFilter<GaugeField> Filter(GaugeBlock);
-      Filter.applyFilter(HUmu);
-    } else {
-      std::cout << GridLogMessage << " Dirichlet "<< Dirichlet << " NOT filtered gauge field" <<std::endl;
-    }
-  }
   Impl::DoubleStore(GaugeGrid(),Umu,HUmu);
   pickCheckerboard(Even,UmuEven,Umu);
   pickCheckerboard(Odd ,UmuOdd,Umu);

@@ -216,6 +259,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
                                           const FermionField &B,
                                           int dag)
 {
+  DerivCalls++;
   assert((dag==DaggerNo) ||(dag==DaggerYes));

   conformable(st.Grid(),A.Grid());

@@ -226,12 +270,15 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
   FermionField Btilde(B.Grid());
   FermionField Atilde(B.Grid());

+  DerivCommTime-=usecond();
   st.HaloExchange(B,compressor);
+  DerivCommTime+=usecond();

   Atilde=A;
   int LLs = B.Grid()->_rdimensions[0];

+  DerivComputeTime-=usecond();
   for (int mu = 0; mu < Nd; mu++) {
     ////////////////////////////////////////////////////////////////////////
     // Flip gamma if dag
@@ -243,6 +290,8 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
     // Call the single hop
     ////////////////////////

+    DerivDhopComputeTime -= usecond();
+
     int Usites = U.Grid()->oSites();

     Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, Usites, B, Btilde, mu,gamma);
@@ -250,8 +299,10 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st,
     ////////////////////////////
     // spin trace outer product
     ////////////////////////////
+    DerivDhopComputeTime += usecond();
     Impl::InsertForce5D(mat, Btilde, Atilde, mu);
   }
+  DerivComputeTime += usecond();
 }

 template<class Impl>
@@ -309,10 +360,12 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
                                          DoubledGaugeField & U,
                                          const FermionField &in, FermionField &out,int dag)
 {
+  DhopTotalTime-=usecond();
   if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
     DhopInternalOverlappedComms(st,lo,U,in,out,dag);
   else
     DhopInternalSerialComms(st,lo,U,in,out,dag);
+  DhopTotalTime+=usecond();
 }

@@ -321,7 +374,6 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
                                                         DoubledGaugeField & U,
                                                         const FermionField &in, FermionField &out,int dag)
 {
-  GRID_TRACE("DhopInternalOverlappedComms");
   Compressor compressor(dag);

   int LLs = in.Grid()->_rdimensions[0];

@@ -330,57 +382,53 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
   /////////////////////////////
   // Start comms  // Gather intranode and extra node differentiated??
   /////////////////////////////
-  {
-    GRID_TRACE("Gather");
-    st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
-  }
+  DhopFaceTime-=usecond();
+  st.HaloExchangeOptGather(in,compressor);
+  DhopFaceTime+=usecond();

+  DhopCommTime -=usecond();
   std::vector<std::vector<CommsRequest_t> > requests;
-  auto id=traceStart("Communicate overlapped");
   st.CommunicateBegin(requests);

   /////////////////////////////
   // Overlap with comms
   /////////////////////////////
-  {
-    GRID_TRACE("MergeSHM");
-    st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
-  }
+  DhopFaceTime-=usecond();
+  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
+  DhopFaceTime+=usecond();

   /////////////////////////////
   // do the compute interior
   /////////////////////////////
   int Opt = WilsonKernelsStatic::Opt; // Why pass this. Kernels should know
+  DhopComputeTime-=usecond();
   if (dag == DaggerYes) {
-    GRID_TRACE("DhopDagInterior");
     Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
   } else {
-    GRID_TRACE("DhopInterior");
     Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
   }
+  DhopComputeTime+=usecond();

   /////////////////////////////
   // Complete comms
   /////////////////////////////
   st.CommunicateComplete(requests);
-  traceStop(id);
+  DhopCommTime +=usecond();

   /////////////////////////////
   // do the compute exterior
   /////////////////////////////
-  {
-    GRID_TRACE("Merge");
-    st.CommsMerge(compressor);
-  }
+  DhopFaceTime-=usecond();
+  st.CommsMerge(compressor);
+  DhopFaceTime+=usecond();

+  DhopComputeTime2-=usecond();
   if (dag == DaggerYes) {
-    GRID_TRACE("DhopDagExterior");
     Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
   } else {
-    GRID_TRACE("DhopExterior");
     Kernels::DhopKernel   (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
   }
+  DhopComputeTime2+=usecond();
 }

@@ -390,30 +438,29 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOr
                                                     const FermionField &in,
                                                     FermionField &out,int dag)
 {
-  GRID_TRACE("DhopInternalSerialComms");
   Compressor compressor(dag);

   int LLs = in.Grid()->_rdimensions[0];

-  {
-    GRID_TRACE("HaloExchange");
-    st.HaloExchangeOpt(in,compressor);
-  }
+  DhopCommTime-=usecond();
+  st.HaloExchangeOpt(in,compressor);
+  DhopCommTime+=usecond();

+  DhopComputeTime-=usecond();
   int Opt = WilsonKernelsStatic::Opt;
   if (dag == DaggerYes) {
-    GRID_TRACE("DhopDag");
     Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
   } else {
-    GRID_TRACE("Dhop");
     Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
   }
+  DhopComputeTime+=usecond();
 }

 template<class Impl>
 void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 {
+  DhopCalls++;
   conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
   conformable(in.Grid(),out.Grid()); // drops the cb check

@@ -425,6 +472,7 @@ void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
+  DhopCalls++;
   conformable(in.Grid(),FermionRedBlackGrid());    // verifies half grid
   conformable(in.Grid(),out.Grid()); // drops the cb check

@@ -436,6 +484,7 @@ void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int
 template<class Impl>
 void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
 {
+  DhopCalls+=2;
   conformable(in.Grid(),FermionGrid()); // verifies full grid
   conformable(in.Grid(),out.Grid());

@@ -490,17 +539,12 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
   LatComplex sk(_grid);  sk = Zero();
   LatComplex sk2(_grid); sk2= Zero();
   LatComplex W(_grid);   W= Zero();
+  LatComplex a(_grid);   a= Zero();
   LatComplex one  (_grid); one = ScalComplex(1.0,0.0);
   LatComplex cosha(_grid);
   LatComplex kmu(_grid);
   LatComplex Wea(_grid);
   LatComplex Wema(_grid);
-  LatComplex ea(_grid);
-  LatComplex ema(_grid);
-  LatComplex eaLs(_grid);
-  LatComplex emaLs(_grid);
-  LatComplex ea2Ls(_grid);
-  LatComplex ema2Ls(_grid);
   LatComplex sinha(_grid);
   LatComplex sinhaLs(_grid);
   LatComplex coshaLs(_grid);

@@ -535,29 +579,39 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
   ////////////////////////////////////////////
   cosha = (one + W*W + sk) / (abs(W)*2.0);

-  ea = (cosha + sqrt(cosha*cosha-one));
-  ema= (cosha - sqrt(cosha*cosha-one));
-  eaLs = pow(ea,Ls);
-  emaLs= pow(ema,Ls);
-  ea2Ls = pow(ea,2.0*Ls);
-  ema2Ls= pow(ema,2.0*Ls);
-  Wea= abs(W) * ea;
-  Wema= abs(W) * ema;
-  //  a=log(ea);
-
-  sinha = 0.5*(ea - ema);
-  sinhaLs = 0.5*(eaLs-emaLs);
-  coshaLs = 0.5*(eaLs+emaLs);
+  // FIXME Need a Lattice acosh
+  {
+    autoView(cosha_v,cosha,CpuRead);
+    autoView(a_v,a,CpuWrite);
+    for(int idx=0;idx<_grid->lSites();idx++){
+      Coordinate lcoor(Nd);
+      Tcomplex cc;
+      //    RealD sgn;
+      _grid->LocalIndexToLocalCoor(idx,lcoor);
+      peekLocalSite(cc,cosha_v,lcoor);
+      assert((double)real(cc)>=1.0);
+      assert(fabs((double)imag(cc))<=1.0e-15);
+      cc = ScalComplex(::acosh(real(cc)),0.0);
+      pokeLocalSite(cc,a_v,lcoor);
+    }
+  }
+
+  Wea = ( exp( a) * abs(W)  );
+  Wema= ( exp(-a) * abs(W)  );
+  sinha = 0.5*(exp( a) - exp(-a));
+  sinhaLs = 0.5*(exp( a*Ls) - exp(-a*Ls));
+  coshaLs = 0.5*(exp( a*Ls) + exp(-a*Ls));

   A = one / (abs(W) * sinha * 2.0) * one / (sinhaLs * 2.0);
-  F = eaLs * (one - Wea + (Wema - one) * mass*mass);
-  F = F + emaLs * (Wema - one + (one - Wea) * mass*mass);
+  F = exp( a*Ls) * (one - Wea + (Wema - one) * mass*mass);
+  F = F + exp(-a*Ls) * (Wema - one + (one - Wea) * mass*mass);
   F = F - abs(W) * sinha * 4.0 * mass;

-  Bpp =  (A/F) * (ema2Ls - one) * (one - Wema) * (one - mass*mass * one);
-  Bmm =  (A/F) * (one - ea2Ls) * (one - Wea) * (one - mass*mass * one);
-  App =  (A/F) * (ema2Ls - one) * ema * (ema - abs(W)) * (one - mass*mass * one);
-  Amm =  (A/F) * (one - ea2Ls) * ea * (ea - abs(W)) * (one - mass*mass * one);
+  Bpp =  (A/F) * (exp(-a*Ls*2.0) - one) * (one - Wema) * (one - mass*mass * one);
+  Bmm =  (A/F) * (one - exp(a*Ls*2.0)) * (one - Wea) * (one - mass*mass * one);
+  App =  (A/F) * (exp(-a*Ls*2.0) - one) * exp(-a) * (exp(-a) - abs(W)) * (one - mass*mass * one);
+  Amm =  (A/F) * (one - exp(a*Ls*2.0)) * exp(a) * (exp(a) - abs(W)) * (one - mass*mass * one);
   ABpm = (A/F) * abs(W) * sinha * 2.0  * (one + mass * coshaLs * 2.0 + mass*mass * one);

   //P+ source, P- source
@@ -580,29 +634,29 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt_5d(FermionField &out,const
       buf1_4d = Zero();
       ExtractSlice(buf1_4d, PRsource, (tt-1), 0);
       //G(s,t)
-      bufR_4d = bufR_4d + A * eaLs * pow(ema,f) * signW * buf1_4d + A * emaLs * pow(ea,f) * signW * buf1_4d;
+      bufR_4d = bufR_4d + A * exp(a*Ls) * exp(-a*f) * signW * buf1_4d + A * exp(-a*Ls) * exp(a*f) * signW * buf1_4d;
       //A++*exp(a(s+t))
-      bufR_4d = bufR_4d + App * pow(ea,ss) * pow(ea,tt) * signW * buf1_4d ;
+      bufR_4d = bufR_4d + App * exp(a*ss) * exp(a*tt) * signW * buf1_4d ;
       //A+-*exp(a(s-t))
-      bufR_4d = bufR_4d + ABpm * pow(ea,ss) * pow(ema,tt) * signW * buf1_4d ;
+      bufR_4d = bufR_4d + ABpm * exp(a*ss) * exp(-a*tt) * signW * buf1_4d ;
      //A-+*exp(a(-s+t))
-      bufR_4d = bufR_4d + ABpm * pow(ema,ss) * pow(ea,tt) * signW * buf1_4d ;
+      bufR_4d = bufR_4d + ABpm * exp(-a*ss) * exp(a*tt) * signW * buf1_4d ;
      //A--*exp(a(-s-t))
-      bufR_4d = bufR_4d + Amm * pow(ema,ss) * pow(ema,tt) * signW * buf1_4d ;
+      bufR_4d = bufR_4d + Amm * exp(-a*ss) * exp(-a*tt) * signW * buf1_4d ;

       //GL
       buf2_4d = Zero();
       ExtractSlice(buf2_4d, PLsource, (tt-1), 0);
       //G(s,t)
-      bufL_4d = bufL_4d + A * eaLs * pow(ema,f) * signW * buf2_4d + A * emaLs * pow(ea,f) * signW * buf2_4d;
+      bufL_4d = bufL_4d + A * exp(a*Ls) * exp(-a*f) * signW * buf2_4d + A * exp(-a*Ls) * exp(a*f) * signW * buf2_4d;
       //B++*exp(a(s+t))
-      bufL_4d = bufL_4d + Bpp * pow(ea,ss) * pow(ea,tt) * signW * buf2_4d ;
+      bufL_4d = bufL_4d + Bpp * exp(a*ss) * exp(a*tt) * signW * buf2_4d ;
       //B+-*exp(a(s-t))
-      bufL_4d = bufL_4d + ABpm * pow(ea,ss) * pow(ema,tt) * signW * buf2_4d ;
+      bufL_4d = bufL_4d + ABpm * exp(a*ss) * exp(-a*tt) * signW * buf2_4d ;
       //B-+*exp(a(-s+t))
-      bufL_4d = bufL_4d + ABpm * pow(ema,ss) * pow(ea,tt) * signW * buf2_4d ;
+      bufL_4d = bufL_4d + ABpm * exp(-a*ss) * exp(a*tt) * signW * buf2_4d ;
       //B--*exp(a(-s-t))
-      bufL_4d = bufL_4d + Bmm * pow(ema,ss) * pow(ema,tt) * signW * buf2_4d ;
+      bufL_4d = bufL_4d + Bmm * exp(-a*ss) * exp(-a*tt) * signW * buf2_4d ;
     }
     InsertSlice(bufR_4d, GR, (ss-1), 0);
     InsertSlice(bufL_4d, GL, (ss-1), 0);

@@ -721,12 +775,28 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe
   W = one - M5 + sk2;

   ////////////////////////////////////////////
-  // Cosh alpha -> exp(+/- alpha)
+  // Cosh alpha -> alpha
   ////////////////////////////////////////////
   cosha = (one + W*W + sk) / (abs(W)*2.0);

-  Wea = abs(W)*(cosha + sqrt(cosha*cosha-one));
-  Wema= abs(W)*(cosha - sqrt(cosha*cosha-one));
+  // FIXME Need a Lattice acosh
+  {
+    autoView(cosha_v,cosha,CpuRead);
+    autoView(a_v,a,CpuWrite);
+    for(int idx=0;idx<_grid->lSites();idx++){
+      Coordinate lcoor(Nd);
+      Tcomplex cc;
+      //    RealD sgn;
+      _grid->LocalIndexToLocalCoor(idx,lcoor);
+      peekLocalSite(cc,cosha_v,lcoor);
+      assert((double)real(cc)>=1.0);
+      assert(fabs((double)imag(cc))<=1.0e-15);
+      cc = ScalComplex(::acosh(real(cc)),0.0);
+      pokeLocalSite(cc,a_v,lcoor);
+    }}
+
+  Wea = ( exp( a) * abs(W)  );
+  Wema= ( exp(-a) * abs(W)  );

   num   = num + ( one - Wema ) * mass * in;
   denom= ( Wea - one ) + mass*mass * (one - Wema);
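The per-site loops introduced on the new side of this file replace the closed forms `cosha ± sqrt(cosha*cosha-one)` with an explicit `::acosh` because, as the `// FIXME Need a Lattice acosh` comment says, no lattice-wide `acosh` exists. The two formulations agree through the standard identity

$$
a = \operatorname{acosh}(\cosh a) = \log\!\left(\cosh a + \sqrt{\cosh^2 a - 1}\right)
\quad\Longrightarrow\quad
e^{\pm a} = \cosh a \pm \sqrt{\cosh^2 a - 1},
$$

so `Wea = exp(a)*abs(W)` is the same quantity as `abs(W)*(cosha + sqrt(cosha*cosha-one))`, and likewise for `Wema`. The asserts inside the loop (`real(cc) >= 1` and a vanishing imaginary part) are exactly the domain conditions under which this identity is real-valued.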


@ -60,9 +60,6 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
_tmp(&Hgrid), _tmp(&Hgrid),
anisotropyCoeff(anis) anisotropyCoeff(anis)
{ {
Stencil.lo = &Lebesgue;
StencilEven.lo = &LebesgueEvenOdd;
StencilOdd.lo = &LebesgueEvenOdd;
// Allocate the required comms buffer // Allocate the required comms buffer
ImportGauge(_Umu); ImportGauge(_Umu);
if (anisotropyCoeff.isAnisotropic){ if (anisotropyCoeff.isAnisotropic){
@ -79,6 +76,91 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
StencilOdd.BuildSurfaceList(1,vol4); StencilOdd.BuildSurfaceList(1,vol4);
} }
template<class Impl>
void WilsonFermion<Impl>::Report(void)
{
RealD NP = _grid->_Nprocessors;
RealD NN = _grid->NodeCount();
RealD volume = 1;
Coordinate latt = _grid->GlobalDimensions();
for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
if ( DhopCalls > 0 ) {
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
std::cout << GridLogMessage << "WilsonFermion Number of DhopEO Calls : " << DhopCalls << std::endl;
std::cout << GridLogMessage << "WilsonFermion TotalTime /Calls : " << DhopTotalTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion CommTime /Calls : " << DhopCommTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion FaceTime /Calls : " << DhopFaceTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion ComputeTime1/Calls : " << DhopComputeTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "WilsonFermion ComputeTime2/Calls : " << DhopComputeTime2/ DhopCalls << " us" << std::endl;
// Average the compute time
_grid->GlobalSum(DhopComputeTime);
DhopComputeTime/=NP;
RealD mflops = 1320*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
RealD Fullmflops = 1320*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
}
if ( DerivCalls > 0 ) {
std::cout << GridLogMessage << "#### Deriv calls report "<< std::endl;
std::cout << GridLogMessage << "WilsonFermion Number of Deriv Calls : " <<DerivCalls <<std::endl;
std::cout << GridLogMessage << "WilsonFermion CommTime/Calls : " <<DerivCommTime/DerivCalls<<" us" <<std::endl;
std::cout << GridLogMessage << "WilsonFermion ComputeTime/Calls : " <<DerivComputeTime/DerivCalls<<" us" <<std::endl;
std::cout << GridLogMessage << "WilsonFermion Dhop ComputeTime/Calls : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl;
// how to count flops here?
RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime;
std::cout << GridLogMessage << "Average mflops/s per call ? : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node ? : " << mflops/NP << std::endl;
// how to count flops here?
RealD Fullmflops = 144*volume*DerivCalls/(DerivDhopComputeTime+DerivCommTime)/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call (full) ? : " << Fullmflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node (full) ? : " << Fullmflops/NP << std::endl; }
if (DerivCalls > 0 || DhopCalls > 0){
std::cout << GridLogMessage << "WilsonFermion Stencil" <<std::endl; Stencil.Report();
std::cout << GridLogMessage << "WilsonFermion StencilEven"<<std::endl; StencilEven.Report();
std::cout << GridLogMessage << "WilsonFermion StencilOdd" <<std::endl; StencilOdd.Report();
}
if ( DhopCalls > 0){
std::cout << GridLogMessage << "WilsonFermion Stencil Reporti()" <<std::endl; Stencil.Reporti(DhopCalls);
std::cout << GridLogMessage << "WilsonFermion StencilEven Reporti()"<<std::endl; StencilEven.Reporti(DhopCalls);
std::cout << GridLogMessage << "WilsonFermion StencilOdd Reporti()" <<std::endl; StencilOdd.Reporti(DhopCalls);
}
}
template<class Impl>
void WilsonFermion<Impl>::ZeroCounters(void) {
DhopCalls = 0; // ok
DhopCommTime = 0;
DhopComputeTime = 0;
DhopComputeTime2= 0;
DhopFaceTime = 0;
DhopTotalTime = 0;
DerivCalls = 0; // ok
DerivCommTime = 0;
DerivComputeTime = 0;
DerivDhopComputeTime = 0;
Stencil.ZeroCounters();
StencilEven.ZeroCounters();
StencilOdd.ZeroCounters();
Stencil.ZeroCountersi();
StencilEven.ZeroCountersi();
StencilOdd.ZeroCountersi();
}
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu) void WilsonFermion<Impl>::ImportGauge(const GaugeField &_Umu)
{ {
@ -238,6 +320,7 @@ template <class Impl>
void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U, void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
GaugeField &mat, const FermionField &A, GaugeField &mat, const FermionField &A,
const FermionField &B, int dag) { const FermionField &B, int dag) {
DerivCalls++;
assert((dag == DaggerNo) || (dag == DaggerYes)); assert((dag == DaggerNo) || (dag == DaggerYes));
Compressor compressor(dag); Compressor compressor(dag);
@ -246,8 +329,11 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
FermionField Atilde(B.Grid()); FermionField Atilde(B.Grid());
Atilde = A; Atilde = A;
DerivCommTime-=usecond();
st.HaloExchange(B, compressor); st.HaloExchange(B, compressor);
DerivCommTime+=usecond();
DerivComputeTime-=usecond();
for (int mu = 0; mu < Nd; mu++) { for (int mu = 0; mu < Nd; mu++) {
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// Flip gamma (1+g)<->(1-g) if dag // Flip gamma (1+g)<->(1-g) if dag
@ -255,6 +341,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
int gamma = mu; int gamma = mu;
if (!dag) gamma += Nd; if (!dag) gamma += Nd;
DerivDhopComputeTime -= usecond();
int Ls=1; int Ls=1;
Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, B.Grid()->oSites(), B, Btilde, mu, gamma); Kernels::DhopDirKernel(st, U, st.CommBuf(), Ls, B.Grid()->oSites(), B, Btilde, mu, gamma);
@ -262,7 +349,9 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U,
// spin trace outer product // spin trace outer product
////////////////////////////////////////////////// //////////////////////////////////////////////////
Impl::InsertForce4D(mat, Btilde, Atilde, mu); Impl::InsertForce4D(mat, Btilde, Atilde, mu);
DerivDhopComputeTime += usecond();
} }
DerivComputeTime += usecond();
} }
template <class Impl> template <class Impl>
@ -309,6 +398,7 @@ void WilsonFermion<Impl>::DhopDerivEO(GaugeField &mat, const FermionField &U, co
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag) void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int dag)
{ {
DhopCalls+=2;
conformable(in.Grid(), _grid); // verifies full grid conformable(in.Grid(), _grid); // verifies full grid
conformable(in.Grid(), out.Grid()); conformable(in.Grid(), out.Grid());
@ -320,6 +410,7 @@ void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int da
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag) void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int dag)
{ {
DhopCalls++;
conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check conformable(in.Grid(), out.Grid()); // drops the cb check
@ -332,6 +423,7 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int
template <class Impl> template <class Impl>
void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
{ {
DhopCalls++;
conformable(in.Grid(), _cbgrid); // verifies half grid conformable(in.Grid(), _cbgrid); // verifies half grid
conformable(in.Grid(), out.Grid()); // drops the cb check conformable(in.Grid(), out.Grid()); // drops the cb check
@ -396,12 +488,14 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
const FermionField &in, const FermionField &in,
FermionField &out, int dag) FermionField &out, int dag)
{ {
DhopTotalTime-=usecond();
#ifdef GRID_OMP #ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,lo,U,in,out,dag); DhopInternalOverlappedComms(st,lo,U,in,out,dag);
else else
#endif #endif
DhopInternalSerial(st,lo,U,in,out,dag); DhopInternalSerial(st,lo,U,in,out,dag);
DhopTotalTime+=usecond();
} }
template <class Impl> template <class Impl>
@ -410,7 +504,6 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
const FermionField &in, const FermionField &in,
FermionField &out, int dag) FermionField &out, int dag)
{ {
GRID_TRACE("DhopOverlapped");
assert((dag == DaggerNo) || (dag == DaggerYes)); assert((dag == DaggerNo) || (dag == DaggerYes));
Compressor compressor(dag); Compressor compressor(dag);
@ -421,55 +514,53 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueO
///////////////////////////// /////////////////////////////
std::vector<std::vector<CommsRequest_t> > requests; std::vector<std::vector<CommsRequest_t> > requests;
st.Prepare(); st.Prepare();
{ DhopFaceTime-=usecond();
GRID_TRACE("Gather"); st.HaloGather(in,compressor);
st.HaloGather(in,compressor); DhopFaceTime+=usecond();
}
tracePush("Communication"); DhopCommTime -=usecond();
st.CommunicateBegin(requests); st.CommunicateBegin(requests);
///////////////////////////// /////////////////////////////
// Overlap with comms // Overlap with comms
///////////////////////////// /////////////////////////////
{ DhopFaceTime-=usecond();
GRID_TRACE("MergeSHM"); st.CommsMergeSHM(compressor);
st.CommsMergeSHM(compressor); DhopFaceTime+=usecond();
}
///////////////////////////// /////////////////////////////
// do the compute interior // do the compute interior
///////////////////////////// /////////////////////////////
int Opt = WilsonKernelsStatic::Opt; int Opt = WilsonKernelsStatic::Opt;
DhopComputeTime-=usecond();
if (dag == DaggerYes) { if (dag == DaggerYes) {
GRID_TRACE("DhopDagInterior");
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0); Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
} else { } else {
GRID_TRACE("DhopInterior");
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0); Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,1,0);
} }
DhopComputeTime+=usecond();
///////////////////////////// /////////////////////////////
// Complete comms // Complete comms
///////////////////////////// /////////////////////////////
st.CommunicateComplete(requests); st.CommunicateComplete(requests);
tracePop("Communication"); DhopCommTime +=usecond();
DhopFaceTime-=usecond();
st.CommsMerge(compressor);
DhopFaceTime+=usecond();
{
GRID_TRACE("Merge");
st.CommsMerge(compressor);
}
///////////////////////////// /////////////////////////////
// do the compute exterior // do the compute exterior
///////////////////////////// /////////////////////////////
DhopComputeTime2-=usecond();
if (dag == DaggerYes) { if (dag == DaggerYes) {
GRID_TRACE("DhopDagExterior");
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1); Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
} else { } else {
GRID_TRACE("DhopExterior");
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1); Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out,0,1);
} }
DhopComputeTime2+=usecond();
}; };
@@ -479,22 +570,20 @@ void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
                                              const FermionField &in,
                                              FermionField &out, int dag)
 {
-  GRID_TRACE("DhopSerial");
   assert((dag == DaggerNo) || (dag == DaggerYes));
   Compressor compressor(dag);
-  {
-    GRID_TRACE("HaloExchange");
-    st.HaloExchange(in, compressor);
-  }
+  DhopCommTime-=usecond();
+  st.HaloExchange(in, compressor);
+  DhopCommTime+=usecond();
+  DhopComputeTime-=usecond();
   int Opt = WilsonKernelsStatic::Opt;
   if (dag == DaggerYes) {
-    GRID_TRACE("DhopDag");
     Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
   } else {
-    GRID_TRACE("Dhop");
     Kernels::DhopKernel(Opt,st,U,st.CommBuf(),1,U.oSites(),in,out);
   }
+  DhopComputeTime+=usecond();
 };
 /*Change ends */
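A note on the timer idiom above: `X-=usecond(); ...; X+=usecond();` accumulates the elapsed microseconds of the bracketed work into the counter X. An equivalent RAII helper — a sketch, not something Grid provides under this name — would be:

#include <chrono>

// Sketch: RAII form of the "-=usecond(); work; +=usecond();" bracket.
// Adds the elapsed microseconds to a caller-owned accumulator on scope exit.
struct ScopedUsecTimer {
  double &acc;
  std::chrono::steady_clock::time_point t0;
  explicit ScopedUsecTimer(double &a)
    : acc(a), t0(std::chrono::steady_clock::now()) {}
  ~ScopedUsecTimer() {
    auto dt = std::chrono::steady_clock::now() - t0;
    acc += std::chrono::duration<double, std::micro>(dt).count();
  }
};

// Usage, mirroring the hunk above (DhopCommTime as a double accumulator):
//   { ScopedUsecTimer t(DhopCommTime); st.HaloExchange(in, compressor); }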
View File
@@ -72,15 +72,20 @@ accelerator_inline void get_stencil(StencilEntry * mem, StencilEntry &chip)
   if (SE->_is_local) {                                          \
     int perm= SE->_permute;                                     \
     auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); \
     spProj(chi,tmp);                                            \
-    Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);              \
-    Recon(result, Uchi);                                        \
+  } else if ( st.same_node[Dir] ) {                             \
+    chi = coalescedRead(buf[SE->_offset],lane);                 \
   }                                                             \
+  acceleratorSynchronise();                                     \
+  if (SE->_is_local || st.same_node[Dir] ) {                    \
+    Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);              \
+    Recon(result, Uchi);                                        \
+  }                                                             \
   acceleratorSynchronise();
 
 #define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon)               \
   SE = st.GetEntry(ptype, Dir, sF);                             \
-  if (!SE->_is_local ) {                                        \
+  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {              \
     auto chi = coalescedRead(buf[SE->_offset],lane);            \
     Impl::multLink(Uchi, U[sU], chi, Dir, SE, st);              \
     Recon(result, Uchi);                                        \
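The '+' version of the interior leg macro splits each stencil leg into two passes so SIMT lanes stay converged: first every lane obtains `chi` (a local permuted read, or a read from the intra-node comms buffer when `same_node[Dir]` holds), then, after a synchronise, the lanes that have data apply the gauge link and reconstruct; truly remote neighbours are left to the exterior macro. A scalar sketch of that control flow, with illustrative (non-Grid) names:

// Sketch: per-leg control flow of the interior kernel (scalar stand-in
// for the SIMT macro above; Leg, the loaders, and apply are illustrative).
enum class Leg { Local, SameNode, Remote };

template <class HalfSpinor, class LoadLocal, class LoadBuf, class Apply>
void stencilLegInterior(Leg leg, LoadLocal loadLocal, LoadBuf loadBuf,
                        Apply apply)
{
  HalfSpinor chi{};
  bool have = false;
  if (leg == Leg::Local)         { chi = loadLocal(); have = true; }
  else if (leg == Leg::SameNode) { chi = loadBuf();   have = true; }
  // (the real SIMT code synchronises lanes here)
  if (have) apply(chi);          // multLink + Recon
  // Leg::Remote is deferred to the exterior kernel, which runs after
  // CommsMerge has landed the inter-node faces.
}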
@@ -411,6 +416,19 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
 #undef LoopBody
 }
 
+#define KERNEL_CALL_TMP(A)                                      \
+  const uint64_t NN = Nsite*Ls;                                 \
+  auto U_p = & U_v[0];                                          \
+  auto in_p = & in_v[0];                                        \
+  auto out_p = & out_v[0];                                      \
+  auto st_p = st_v._entries_p;                                  \
+  auto st_perm = st_v._permute_type;                            \
+  accelerator_forNB( ss, NN, Simd::Nsimd(), {                   \
+      int sF = ss;                                              \
+      int sU = ss/Ls;                                           \
+      WilsonKernels<Impl>::A(st_perm,st_p,U_p,buf,sF,sU,in_p,out_p); \
+    });                                                         \
+  accelerator_barrier();
 
 #define KERNEL_CALLNB(A)                                        \
   const uint64_t NN = Nsite*Ls;                                 \
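The added KERNEL_CALL_TMP differs from KERNEL_CALLNB in that it hoists raw pointers (`U_p`, `in_p`, `out_p`, and the stencil tables) out of the view objects before entering the device lambda — a common workaround when the offload toolchain (here the SYCL path) cannot capture the views themselves. A sketch of the idiom, with a hypothetical launcher standing in for `accelerator_forNB`:

#include <cstdint>

// Hypothetical device launcher standing in for accelerator_forNB.
template <class F> void parallelForDevice(uint64_t n, F f);

// Sketch: capture only trivially-copyable raw pointers in the device
// lambda, never the view objects that own them.
template <class View>
void launchSketch(View &in_v, View &out_v, uint64_t NN)
{
  auto in_p  = &in_v[0];   // raw device pointers extracted on the host
  auto out_p = &out_v[0];
  parallelForDevice(NN, [=](uint64_t ss) {
    out_p[ss] = in_p[ss];  // placeholder for the real stencil body
  });
}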
@@ -422,34 +440,12 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
 #define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();
 
-#define KERNEL_CALL_EXT(A)                                      \
-  const uint64_t sz = st.surface_list.size();                   \
-  auto ptr = &st.surface_list[0];                               \
-  accelerator_forNB( ss, sz, Simd::Nsimd(), {                   \
-      int sF = ptr[ss];                                         \
-      int sU = sF/Ls;                                           \
-      WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v);    \
-    });                                                         \
-  accelerator_barrier();
-
 #define ASM_CALL(A)                                             \
-  thread_for( sss, Nsite, {                                     \
-    int ss = st.lo->Reorder(sss);                               \
+  thread_for( ss, Nsite, {                                      \
     int sU = ss;                                                \
     int sF = ss*Ls;                                             \
     WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \
   });
 
-#define ASM_CALL_SLICE(A)                                       \
-  auto grid = in.Grid() ;                                       \
-  int nt = grid->LocalDimensions()[4];                          \
-  int nxyz = Nsite/nt ;                                         \
-  for(int t=0;t<nt;t++){                                        \
-    thread_for( sss, nxyz, {                                    \
-      int ss = t*nxyz+sss;                                      \
-      int sU = ss;                                              \
-      int sF = ss*Ls;                                           \
-      WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \
-    });}
-
 template <class Impl>
 void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
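The deleted `KERNEL_CALL_EXT` launches the exterior kernel only over `st.surface_list` — the precomputed indices of sites whose stencil crosses a rank boundary — so the exterior pass costs O(surface) rather than O(volume). The idea, as a plain-loop sketch (the real version is an `accelerator_forNB` launch; these helper names are illustrative):

#include <cstdint>
#include <vector>

// Sketch: exterior pass driven by a precomputed surface list.
// surface[] holds the 5d site indices sF whose stencil touches data
// owned by another rank; interior sites were handled already.
template <class Kernel>
void runExteriorSketch(const std::vector<uint64_t> &surface, int Ls,
                       Kernel kernel)
{
  for (uint64_t ss = 0; ss < surface.size(); ++ss) {
    uint64_t sF = surface[ss]; // fifth-dimension-resolved site index
    uint64_t sU = sF / Ls;     // corresponding 4d gauge-field site
    kernel(sF, sU);            // apply only the exterior stencil legs
  }
}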
@@ -463,7 +459,11 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
   if( interior && exterior ) {
     if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
+#ifdef SYCL_HACK
+    if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_TMP(HandDhopSiteSycl); return; }
+#else
     if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
+#endif
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); return;}
 #endif
@@ -474,10 +474,8 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
     if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
 #endif
   } else if( exterior ) {
-    // dependent on result of merge
-    acceleratorFenceComputeStream();
-    if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;}
-    if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt); return;}
+    if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;}
+    if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); return;}
 #endif
@@ -500,20 +498,21 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDag); return;}
 #endif
+    acceleratorFenceComputeStream();
   } else if( interior ) {
-    if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteDagInt); return;}
-    if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteDagInt); return;}
+    if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagInt); return;}
+    if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagInt); return;}
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagInt); return;}
 #endif
   } else if( exterior ) {
-    // Dependent on result of merge
     acceleratorFenceComputeStream();
-    if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL_EXT(GenericDhopSiteDagExt); return;}
-    if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteDagExt); return;}
+    if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDagExt); return;}
+    if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteDagExt); return;}
 #ifndef GRID_CUDA
     if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteDagExt); return;}
 #endif
+    acceleratorFenceComputeStream();
   }
   assert(0 && " Kernel optimisation case not covered ");
 }
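On the fences above: kernels are enqueued asynchronously on the compute stream, and the exterior kernels read halo buffers filled by `CommsMerge`, so a fence must order the merge before the exterior launch (the '+' side adds a second fence to order the exterior results before whatever follows). Schematically, with hypothetical stand-in helpers:

// Hypothetical stand-ins for Grid's CommsMerge /
// acceleratorFenceComputeStream / exterior kernel launch.
void commsMerge();            // fills the halo buffers (may be async)
void fenceComputeStream();    // orders prior work on the compute stream
void launchExteriorKernel();  // reads the halo buffers

// Sketch: why a fence sits between the merge and the exterior launch.
void exteriorPhaseSketch()
{
  commsMerge();
  fenceComputeStream();   // merge must be visible before the kernel reads
  launchExteriorKernel();
}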
View File

@@ -1 +0,0 @@
-../WilsonCloverFermionInstantiation.cc.master

View File

@@ -1 +0,0 @@
-../WilsonFermionInstantiation.cc.master

View File

@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master

View File

@@ -1 +0,0 @@
-../WilsonTMFermionInstantiation.cc.master

View File

@@ -1 +0,0 @@
-#define IMPLEMENTATION SpWilsonImplD

View File

@@ -1 +0,0 @@
-../WilsonCloverFermionInstantiation.cc.master

View File

@@ -1 +0,0 @@
-../WilsonFermionInstantiation.cc.master

View File

@@ -1 +0,0 @@
-../WilsonKernelsInstantiation.cc.master

View File

@@ -1 +0,0 @@
-../WilsonTMFermionInstantiation.cc.master

View File

@@ -1 +0,0 @@
-#define IMPLEMENTATION SpWilsonImplF
Some files were not shown because too many files have changed in this diff.