mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
commit
40ee605591
@ -28,6 +28,9 @@ Author: Christoph Lehner <christoph@lhnr.de>
|
|||||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
||||||
#include <Grid/lattice/Lattice_reduction_gpu.h>
|
#include <Grid/lattice/Lattice_reduction_gpu.h>
|
||||||
#endif
|
#endif
|
||||||
|
#if defined(GRID_SYCL)
|
||||||
|
#include <Grid/lattice/Lattice_reduction_sycl.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
@ -127,7 +130,7 @@ inline Double max(const Double *arg, Integer osites)
|
|||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
|
inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
|
||||||
{
|
{
|
||||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
|
||||||
return sum_gpu(arg,osites);
|
return sum_gpu(arg,osites);
|
||||||
#else
|
#else
|
||||||
return sum_cpu(arg,osites);
|
return sum_cpu(arg,osites);
|
||||||
@ -136,7 +139,7 @@ inline typename vobj::scalar_object sum(const vobj *arg, Integer osites)
|
|||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
|
inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
|
||||||
{
|
{
|
||||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
|
||||||
return sumD_gpu(arg,osites);
|
return sumD_gpu(arg,osites);
|
||||||
#else
|
#else
|
||||||
return sumD_cpu(arg,osites);
|
return sumD_cpu(arg,osites);
|
||||||
@ -145,7 +148,7 @@ inline typename vobj::scalar_objectD sumD(const vobj *arg, Integer osites)
|
|||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
|
inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
|
||||||
{
|
{
|
||||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
|
||||||
return sumD_gpu_large(arg,osites);
|
return sumD_gpu_large(arg,osites);
|
||||||
#else
|
#else
|
||||||
return sumD_cpu(arg,osites);
|
return sumD_cpu(arg,osites);
|
||||||
@ -155,13 +158,13 @@ inline typename vobj::scalar_objectD sumD_large(const vobj *arg, Integer osites)
|
|||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
|
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
|
||||||
{
|
{
|
||||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
|
||||||
autoView( arg_v, arg, AcceleratorRead);
|
|
||||||
Integer osites = arg.Grid()->oSites();
|
Integer osites = arg.Grid()->oSites();
|
||||||
auto ssum= sum_gpu(&arg_v[0],osites);
|
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
|
||||||
|
typename vobj::scalar_object ssum;
|
||||||
|
autoView( arg_v, arg, AcceleratorRead);
|
||||||
|
ssum= sum_gpu(&arg_v[0],osites);
|
||||||
#else
|
#else
|
||||||
autoView(arg_v, arg, CpuRead);
|
autoView(arg_v, arg, CpuRead);
|
||||||
Integer osites = arg.Grid()->oSites();
|
|
||||||
auto ssum= sum_cpu(&arg_v[0],osites);
|
auto ssum= sum_cpu(&arg_v[0],osites);
|
||||||
#endif
|
#endif
|
||||||
arg.Grid()->GlobalSum(ssum);
|
arg.Grid()->GlobalSum(ssum);
|
||||||
@ -171,7 +174,7 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
|
|||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
|
inline typename vobj::scalar_object sum_large(const Lattice<vobj> &arg)
|
||||||
{
|
{
|
||||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
#if defined(GRID_CUDA)||defined(GRID_HIP)||defined(GRID_SYCL)
|
||||||
autoView( arg_v, arg, AcceleratorRead);
|
autoView( arg_v, arg, AcceleratorRead);
|
||||||
Integer osites = arg.Grid()->oSites();
|
Integer osites = arg.Grid()->oSites();
|
||||||
auto ssum= sum_gpu_large(&arg_v[0],osites);
|
auto ssum= sum_gpu_large(&arg_v[0],osites);
|
||||||
@ -235,11 +238,10 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
|
|||||||
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
|
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
|
||||||
Vector<inner_t> inner_tmp(sites);
|
Vector<inner_t> inner_tmp(sites);
|
||||||
auto inner_tmp_v = &inner_tmp[0];
|
auto inner_tmp_v = &inner_tmp[0];
|
||||||
|
|
||||||
{
|
{
|
||||||
autoView( left_v , left, AcceleratorRead);
|
autoView( left_v , left, AcceleratorRead);
|
||||||
autoView( right_v,right, AcceleratorRead);
|
autoView( right_v,right, AcceleratorRead);
|
||||||
|
// This code could read coalesce
|
||||||
// GPU - SIMT lane compliance...
|
// GPU - SIMT lane compliance...
|
||||||
accelerator_for( ss, sites, 1,{
|
accelerator_for( ss, sites, 1,{
|
||||||
auto x_l = left_v[ss];
|
auto x_l = left_v[ss];
|
||||||
|
125
Grid/lattice/Lattice_reduction_sycl.h
Normal file
125
Grid/lattice/Lattice_reduction_sycl.h
Normal file
@ -0,0 +1,125 @@
|
|||||||
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Possibly promote to double and sum
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
template <class vobj>
|
||||||
|
inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)
|
||||||
|
{
|
||||||
|
typedef typename vobj::scalar_object sobj;
|
||||||
|
typedef typename vobj::scalar_objectD sobjD;
|
||||||
|
sobj *mysum =(sobj *) malloc_shared(sizeof(sobj),*theGridAccelerator);
|
||||||
|
sobj identity; zeroit(identity);
|
||||||
|
sobj ret ;
|
||||||
|
|
||||||
|
Integer nsimd= vobj::Nsimd();
|
||||||
|
|
||||||
|
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
|
||||||
|
auto Reduction = cl::sycl::reduction(mysum,identity,std::plus<>());
|
||||||
|
cgh.parallel_for(cl::sycl::range<1>{osites},
|
||||||
|
Reduction,
|
||||||
|
[=] (cl::sycl::id<1> item, auto &sum) {
|
||||||
|
auto osite = item[0];
|
||||||
|
sum +=Reduce(lat[osite]);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
theGridAccelerator->wait();
|
||||||
|
ret = mysum[0];
|
||||||
|
free(mysum,*theGridAccelerator);
|
||||||
|
sobjD dret; convertType(dret,ret);
|
||||||
|
return dret;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class vobj>
|
||||||
|
inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
|
||||||
|
{
|
||||||
|
return sumD_gpu_tensor(lat,osites);
|
||||||
|
}
|
||||||
|
template <class vobj>
|
||||||
|
inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
|
||||||
|
{
|
||||||
|
return sumD_gpu_large(lat,osites);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class vobj>
|
||||||
|
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
|
||||||
|
{
|
||||||
|
return sumD_gpu_large(lat,osites);
|
||||||
|
}
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Return as same precision as input performing reduction in double precision though
|
||||||
|
/////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
template <class vobj>
|
||||||
|
inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
|
||||||
|
{
|
||||||
|
typedef typename vobj::scalar_object sobj;
|
||||||
|
sobj result;
|
||||||
|
result = sumD_gpu(lat,osites);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class vobj>
|
||||||
|
inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
|
||||||
|
{
|
||||||
|
typedef typename vobj::scalar_object sobj;
|
||||||
|
sobj result;
|
||||||
|
result = sumD_gpu_large(lat,osites);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
/*
|
||||||
|
template<class Double> Double svm_reduce(Double *vec,uint64_t L)
|
||||||
|
{
|
||||||
|
Double sumResult; zeroit(sumResult);
|
||||||
|
Double *d_sum =(Double *)cl::sycl::malloc_shared(sizeof(Double),*theGridAccelerator);
|
||||||
|
Double identity; zeroit(identity);
|
||||||
|
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
|
||||||
|
auto Reduction = cl::sycl::reduction(d_sum,identity,std::plus<>());
|
||||||
|
cgh.parallel_for(cl::sycl::range<1>{L},
|
||||||
|
Reduction,
|
||||||
|
[=] (cl::sycl::id<1> index, auto &sum) {
|
||||||
|
sum +=vec[index];
|
||||||
|
});
|
||||||
|
});
|
||||||
|
theGridAccelerator->wait();
|
||||||
|
Double ret = d_sum[0];
|
||||||
|
free(d_sum,*theGridAccelerator);
|
||||||
|
std::cout << " svm_reduce finished "<<L<<" sites sum = " << ret <<std::endl;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class vobj>
|
||||||
|
inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
|
||||||
|
{
|
||||||
|
typedef typename vobj::vector_type vector;
|
||||||
|
typedef typename vobj::scalar_type scalar;
|
||||||
|
|
||||||
|
typedef typename vobj::scalar_typeD scalarD;
|
||||||
|
typedef typename vobj::scalar_objectD sobjD;
|
||||||
|
|
||||||
|
sobjD ret;
|
||||||
|
scalarD *ret_p = (scalarD *)&ret;
|
||||||
|
|
||||||
|
const int nsimd = vobj::Nsimd();
|
||||||
|
const int words = sizeof(vobj)/sizeof(vector);
|
||||||
|
|
||||||
|
Vector<scalar> buffer(osites*nsimd);
|
||||||
|
scalar *buf = &buffer[0];
|
||||||
|
vector *dat = (vector *)lat;
|
||||||
|
|
||||||
|
for(int w=0;w<words;w++) {
|
||||||
|
|
||||||
|
accelerator_for(ss,osites,nsimd,{
|
||||||
|
int lane = acceleratorSIMTlane(nsimd);
|
||||||
|
buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane);
|
||||||
|
});
|
||||||
|
//Precision change at this point is to late to gain precision
|
||||||
|
ret_p[w] = svm_reduce(buf,nsimd*osites);
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
*/
|
@ -451,9 +451,20 @@ template<class vobj> void pokeLorentz(vobj &lhs,const decltype(peekIndex<Lorentz
|
|||||||
// Fermion <-> propagator assignements
|
// Fermion <-> propagator assignements
|
||||||
//////////////////////////////////////////////
|
//////////////////////////////////////////////
|
||||||
//template <class Prop, class Ferm>
|
//template <class Prop, class Ferm>
|
||||||
|
#define FAST_FERM_TO_PROP
|
||||||
template <class Fimpl>
|
template <class Fimpl>
|
||||||
void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
|
void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::FermionField &f, const int s, const int c)
|
||||||
{
|
{
|
||||||
|
#ifdef FAST_FERM_TO_PROP
|
||||||
|
autoView(p_v,p,CpuWrite);
|
||||||
|
autoView(f_v,f,CpuRead);
|
||||||
|
thread_for(idx,p_v.oSites(),{
|
||||||
|
for(int ss = 0; ss < Ns; ++ss) {
|
||||||
|
for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
|
||||||
|
p_v[idx]()(ss,s)(cc,c) = f_v[idx]()(ss)(cc); // Propagator sink index is LEFT, suitable for left mult by gauge link (e.g.)
|
||||||
|
}}
|
||||||
|
});
|
||||||
|
#else
|
||||||
for(int j = 0; j < Ns; ++j)
|
for(int j = 0; j < Ns; ++j)
|
||||||
{
|
{
|
||||||
auto pjs = peekSpin(p, j, s);
|
auto pjs = peekSpin(p, j, s);
|
||||||
@ -465,12 +476,23 @@ void FermToProp(typename Fimpl::PropagatorField &p, const typename Fimpl::Fermio
|
|||||||
}
|
}
|
||||||
pokeSpin(p, pjs, j, s);
|
pokeSpin(p, pjs, j, s);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
//template <class Prop, class Ferm>
|
//template <class Prop, class Ferm>
|
||||||
template <class Fimpl>
|
template <class Fimpl>
|
||||||
void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
|
void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::PropagatorField &p, const int s, const int c)
|
||||||
{
|
{
|
||||||
|
#ifdef FAST_FERM_TO_PROP
|
||||||
|
autoView(p_v,p,CpuRead);
|
||||||
|
autoView(f_v,f,CpuWrite);
|
||||||
|
thread_for(idx,p_v.oSites(),{
|
||||||
|
for(int ss = 0; ss < Ns; ++ss) {
|
||||||
|
for(int cc = 0; cc < Fimpl::Dimension; ++cc) {
|
||||||
|
f_v[idx]()(ss)(cc) = p_v[idx]()(ss,s)(cc,c); // LEFT index is copied across for s,c right index
|
||||||
|
}}
|
||||||
|
});
|
||||||
|
#else
|
||||||
for(int j = 0; j < Ns; ++j)
|
for(int j = 0; j < Ns; ++j)
|
||||||
{
|
{
|
||||||
auto pjs = peekSpin(p, j, s);
|
auto pjs = peekSpin(p, j, s);
|
||||||
@ -482,6 +504,7 @@ void PropToFerm(typename Fimpl::FermionField &f, const typename Fimpl::Propagato
|
|||||||
}
|
}
|
||||||
pokeSpin(f, fj, j);
|
pokeSpin(f, fj, j);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////////////////////////////////////////
|
//////////////////////////////////////////////
|
||||||
|
@ -204,15 +204,18 @@ public:
|
|||||||
typedef WilsonCloverHelpers<Impl> Helpers;
|
typedef WilsonCloverHelpers<Impl> Helpers;
|
||||||
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
|
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
|
||||||
|
|
||||||
static void MassTerm(CloverField& Clover, RealD diag_mass) {
|
static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
|
||||||
Clover += diag_mass;
|
Clover += diag_mass;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void Exponentiate_Clover(CloverDiagonalField& Diagonal,
|
static void InvertClover(CloverField& InvClover,
|
||||||
CloverTriangleField& Triangle,
|
const CloverDiagonalField& diagonal,
|
||||||
RealD csw_t, RealD diag_mass) {
|
const CloverTriangleField& triangle,
|
||||||
|
CloverDiagonalField& diagonalInv,
|
||||||
|
CloverTriangleField& triangleInv,
|
||||||
|
bool fixedBoundaries) {
|
||||||
|
|
||||||
// Do nothing
|
CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: implement Cmunu for better performances with compact layout, but don't do it
|
// TODO: implement Cmunu for better performances with compact layout, but don't do it
|
||||||
@ -237,9 +240,17 @@ public:
|
|||||||
template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
|
template <typename vtype> using iImplClover = iScalar<iMatrix<iMatrix<vtype, Impl::Dimension>, Ns>>;
|
||||||
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
|
typedef CompactWilsonCloverHelpers<Impl> CompactHelpers;
|
||||||
|
|
||||||
static void MassTerm(CloverField& Clover, RealD diag_mass) {
|
// Can this be avoided?
|
||||||
// do nothing!
|
static void IdentityTimesC(const CloverField& in, RealD c) {
|
||||||
// mass term is multiplied to exp(Clover) below
|
int DimRep = Impl::Dimension;
|
||||||
|
|
||||||
|
autoView(in_v, in, AcceleratorWrite);
|
||||||
|
|
||||||
|
accelerator_for(ss, in.Grid()->oSites(), 1, {
|
||||||
|
for (int sa=0; sa<Ns; sa++)
|
||||||
|
for (int ca=0; ca<DimRep; ca++)
|
||||||
|
in_v[ss]()(sa,sa)(ca,ca) = c;
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
static int getNMAX(RealD prec, RealD R) {
|
static int getNMAX(RealD prec, RealD R) {
|
||||||
@ -254,175 +265,62 @@ public:
|
|||||||
return NMAX;
|
return NMAX;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int getNMAX(Lattice<iImplCloverDiagonal<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
|
static int getNMAX(Lattice<iImplClover<vComplexD>> &t, RealD R) {return getNMAX(1e-12,R);}
|
||||||
static int getNMAX(Lattice<iImplCloverDiagonal<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
|
static int getNMAX(Lattice<iImplClover<vComplexF>> &t, RealD R) {return getNMAX(1e-6,R);}
|
||||||
|
|
||||||
static void ExponentiateHermitean6by6(const iMatrix<ComplexD,6> &arg, const RealD& alpha, const std::vector<RealD>& cN, const int Niter, iMatrix<ComplexD,6>& dest){
|
static void InstantiateClover(CloverField& Clover, CloverField& CloverInv, RealD csw_t, RealD diag_mass) {
|
||||||
|
|
||||||
typedef iMatrix<ComplexD,6> mat;
|
GridBase* grid = Clover.Grid();
|
||||||
|
CloverField ExpClover(grid);
|
||||||
|
|
||||||
RealD qn[6];
|
int NMAX = getNMAX(Clover, 3.*csw_t/diag_mass);
|
||||||
RealD qnold[6];
|
|
||||||
RealD p[5];
|
|
||||||
RealD trA2, trA3, trA4;
|
|
||||||
|
|
||||||
mat A2, A3, A4, A5;
|
Clover *= (1.0/diag_mass);
|
||||||
A2 = alpha * alpha * arg * arg;
|
|
||||||
A3 = alpha * arg * A2;
|
|
||||||
A4 = A2 * A2;
|
|
||||||
A5 = A2 * A3;
|
|
||||||
|
|
||||||
trA2 = toReal( trace(A2) );
|
// Taylor expansion, slow but generic
|
||||||
trA3 = toReal( trace(A3) );
|
// Horner scheme: a0 + a1 x + a2 x^2 + .. = a0 + x (a1 + x(...))
|
||||||
trA4 = toReal( trace(A4));
|
// qN = cN
|
||||||
|
// qn = cn + qn+1 X
|
||||||
p[0] = toReal( trace(A3 * A3)) / 6.0 - 0.125 * trA4 * trA2 - trA3 * trA3 / 18.0 + trA2 * trA2 * trA2/ 48.0;
|
|
||||||
p[1] = toReal( trace(A5)) / 5.0 - trA3 * trA2 / 6.0;
|
|
||||||
p[2] = toReal( trace(A4)) / 4.0 - 0.125 * trA2 * trA2;
|
|
||||||
p[3] = trA3 / 3.0;
|
|
||||||
p[4] = 0.5 * trA2;
|
|
||||||
|
|
||||||
qnold[0] = cN[Niter];
|
|
||||||
qnold[1] = 0.0;
|
|
||||||
qnold[2] = 0.0;
|
|
||||||
qnold[3] = 0.0;
|
|
||||||
qnold[4] = 0.0;
|
|
||||||
qnold[5] = 0.0;
|
|
||||||
|
|
||||||
for(int i = Niter-1; i >= 0; i--)
|
|
||||||
{
|
|
||||||
qn[0] = p[0] * qnold[5] + cN[i];
|
|
||||||
qn[1] = p[1] * qnold[5] + qnold[0];
|
|
||||||
qn[2] = p[2] * qnold[5] + qnold[1];
|
|
||||||
qn[3] = p[3] * qnold[5] + qnold[2];
|
|
||||||
qn[4] = p[4] * qnold[5] + qnold[3];
|
|
||||||
qn[5] = qnold[4];
|
|
||||||
|
|
||||||
qnold[0] = qn[0];
|
|
||||||
qnold[1] = qn[1];
|
|
||||||
qnold[2] = qn[2];
|
|
||||||
qnold[3] = qn[3];
|
|
||||||
qnold[4] = qn[4];
|
|
||||||
qnold[5] = qn[5];
|
|
||||||
}
|
|
||||||
|
|
||||||
mat unit(1.0);
|
|
||||||
|
|
||||||
dest = (qn[0] * unit + qn[1] * alpha * arg + qn[2] * A2 + qn[3] * A3 + qn[4] * A4 + qn[5] * A5);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
static void Exponentiate_Clover(CloverDiagonalField& Diagonal, CloverTriangleField& Triangle, RealD csw_t, RealD diag_mass) {
|
|
||||||
|
|
||||||
GridBase* grid = Diagonal.Grid();
|
|
||||||
int NMAX = getNMAX(Diagonal, 3.*csw_t/diag_mass);
|
|
||||||
|
|
||||||
//
|
|
||||||
// Implementation completely in Daniel's layout
|
|
||||||
//
|
|
||||||
|
|
||||||
// Taylor expansion with Cayley-Hamilton recursion
|
|
||||||
// underlying Horner scheme as above
|
|
||||||
std::vector<RealD> cn(NMAX+1);
|
std::vector<RealD> cn(NMAX+1);
|
||||||
cn[0] = 1.0;
|
cn[0] = 1.0;
|
||||||
for (int i=1; i<=NMAX; i++){
|
for (int i=1; i<=NMAX; i++)
|
||||||
cn[i] = cn[i-1] / RealD(i);
|
cn[i] = cn[i-1] / RealD(i);
|
||||||
}
|
|
||||||
|
|
||||||
// Taken over from Daniel's implementation
|
ExpClover = Zero();
|
||||||
conformable(Diagonal, Triangle);
|
IdentityTimesC(ExpClover, cn[NMAX]);
|
||||||
|
for (int i=NMAX-1; i>=0; i--)
|
||||||
|
ExpClover = ExpClover * Clover + cn[i];
|
||||||
|
|
||||||
long lsites = grid->lSites();
|
// prepare inverse
|
||||||
{
|
CloverInv = (-1.0)*Clover;
|
||||||
typedef typename SiteCloverDiagonal::scalar_object scalar_object_diagonal;
|
|
||||||
typedef typename SiteCloverTriangle::scalar_object scalar_object_triangle;
|
|
||||||
typedef iMatrix<ComplexD,6> mat;
|
|
||||||
|
|
||||||
autoView(diagonal_v, Diagonal, CpuRead);
|
Clover = ExpClover * diag_mass;
|
||||||
autoView(triangle_v, Triangle, CpuRead);
|
|
||||||
autoView(diagonalExp_v, Diagonal, CpuWrite);
|
|
||||||
autoView(triangleExp_v, Triangle, CpuWrite);
|
|
||||||
|
|
||||||
thread_for(site, lsites, { // NOTE: Not on GPU because of (peek/poke)LocalSite
|
ExpClover = Zero();
|
||||||
|
IdentityTimesC(ExpClover, cn[NMAX]);
|
||||||
|
for (int i=NMAX-1; i>=0; i--)
|
||||||
|
ExpClover = ExpClover * CloverInv + cn[i];
|
||||||
|
|
||||||
mat srcCloverOpUL(0.0); // upper left block
|
CloverInv = ExpClover * (1.0/diag_mass);
|
||||||
mat srcCloverOpLR(0.0); // lower right block
|
|
||||||
mat ExpCloverOp;
|
|
||||||
|
|
||||||
scalar_object_diagonal diagonal_tmp = Zero();
|
|
||||||
scalar_object_diagonal diagonal_exp_tmp = Zero();
|
|
||||||
scalar_object_triangle triangle_tmp = Zero();
|
|
||||||
scalar_object_triangle triangle_exp_tmp = Zero();
|
|
||||||
|
|
||||||
Coordinate lcoor;
|
|
||||||
grid->LocalIndexToLocalCoor(site, lcoor);
|
|
||||||
|
|
||||||
peekLocalSite(diagonal_tmp, diagonal_v, lcoor);
|
|
||||||
peekLocalSite(triangle_tmp, triangle_v, lcoor);
|
|
||||||
|
|
||||||
int block;
|
|
||||||
block = 0;
|
|
||||||
for(int i = 0; i < 6; i++){
|
|
||||||
for(int j = 0; j < 6; j++){
|
|
||||||
if (i == j){
|
|
||||||
srcCloverOpUL(i,j) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
|
|
||||||
}
|
|
||||||
else{
|
|
||||||
srcCloverOpUL(i,j) = static_cast<ComplexD>(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
block = 1;
|
|
||||||
for(int i = 0; i < 6; i++){
|
|
||||||
for(int j = 0; j < 6; j++){
|
|
||||||
if (i == j){
|
|
||||||
srcCloverOpLR(i,j) = static_cast<ComplexD>(TensorRemove(diagonal_tmp()(block)(i)));
|
|
||||||
}
|
|
||||||
else{
|
|
||||||
srcCloverOpLR(i,j) = static_cast<ComplexD>(TensorRemove(CompactHelpers::triangle_elem(triangle_tmp, block, i, j)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// exp(Clover)
|
|
||||||
|
|
||||||
ExponentiateHermitean6by6(srcCloverOpUL,1.0/diag_mass,cn,NMAX,ExpCloverOp);
|
|
||||||
|
|
||||||
block = 0;
|
|
||||||
for(int i = 0; i < 6; i++){
|
|
||||||
for(int j = 0; j < 6; j++){
|
|
||||||
if (i == j){
|
|
||||||
diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j);
|
|
||||||
}
|
|
||||||
else if(i < j){
|
|
||||||
triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
ExponentiateHermitean6by6(srcCloverOpLR,1.0/diag_mass,cn,NMAX,ExpCloverOp);
|
|
||||||
|
|
||||||
block = 1;
|
|
||||||
for(int i = 0; i < 6; i++){
|
|
||||||
for(int j = 0; j < 6; j++){
|
|
||||||
if (i == j){
|
|
||||||
diagonal_exp_tmp()(block)(i) = ExpCloverOp(i,j);
|
|
||||||
}
|
|
||||||
else if(i < j){
|
|
||||||
triangle_exp_tmp()(block)(CompactHelpers::triangle_index(i, j)) = ExpCloverOp(i,j);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pokeLocalSite(diagonal_exp_tmp, diagonalExp_v, lcoor);
|
|
||||||
pokeLocalSite(triangle_exp_tmp, triangleExp_v, lcoor);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
Diagonal *= diag_mass;
|
|
||||||
Triangle *= diag_mass;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void InvertClover(CloverField& InvClover,
|
||||||
|
const CloverDiagonalField& diagonal,
|
||||||
|
const CloverTriangleField& triangle,
|
||||||
|
CloverDiagonalField& diagonalInv,
|
||||||
|
CloverTriangleField& triangleInv,
|
||||||
|
bool fixedBoundaries) {
|
||||||
|
|
||||||
|
if (fixedBoundaries)
|
||||||
|
{
|
||||||
|
CompactHelpers::Invert(diagonal, triangle, diagonalInv, triangleInv);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
CompactHelpers::ConvertLayout(InvClover, diagonalInv, triangleInv);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
|
static GaugeLinkField Cmunu(std::vector<GaugeLinkField> &U, GaugeLinkField &lambda, int mu, int nu) {
|
||||||
assert(0);
|
assert(0);
|
||||||
|
@ -225,7 +225,7 @@ public:
|
|||||||
RealD csw_t;
|
RealD csw_t;
|
||||||
RealD cF;
|
RealD cF;
|
||||||
|
|
||||||
bool open_boundaries;
|
bool fixedBoundaries;
|
||||||
|
|
||||||
CloverDiagonalField Diagonal, DiagonalEven, DiagonalOdd;
|
CloverDiagonalField Diagonal, DiagonalEven, DiagonalOdd;
|
||||||
CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
|
CloverDiagonalField DiagonalInv, DiagonalInvEven, DiagonalInvOdd;
|
||||||
|
@ -48,7 +48,7 @@ CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(Gaug
|
|||||||
, csw_r(_csw_r)
|
, csw_r(_csw_r)
|
||||||
, csw_t(_csw_t)
|
, csw_t(_csw_t)
|
||||||
, cF(_cF)
|
, cF(_cF)
|
||||||
, open_boundaries(impl_p.boundary_phases[Nd-1] == 0.0)
|
, fixedBoundaries(impl_p.boundary_phases[Nd-1] == 0.0)
|
||||||
, Diagonal(&Fgrid), Triangle(&Fgrid)
|
, Diagonal(&Fgrid), Triangle(&Fgrid)
|
||||||
, DiagonalEven(&Hgrid), TriangleEven(&Hgrid)
|
, DiagonalEven(&Hgrid), TriangleEven(&Hgrid)
|
||||||
, DiagonalOdd(&Hgrid), TriangleOdd(&Hgrid)
|
, DiagonalOdd(&Hgrid), TriangleOdd(&Hgrid)
|
||||||
@ -67,7 +67,7 @@ CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(Gaug
|
|||||||
csw_r /= clover_anisotropy.xi_0;
|
csw_r /= clover_anisotropy.xi_0;
|
||||||
|
|
||||||
ImportGauge(_Umu);
|
ImportGauge(_Umu);
|
||||||
if (open_boundaries) {
|
if (fixedBoundaries) {
|
||||||
this->BoundaryMaskEven.Checkerboard() = Even;
|
this->BoundaryMaskEven.Checkerboard() = Even;
|
||||||
this->BoundaryMaskOdd.Checkerboard() = Odd;
|
this->BoundaryMaskOdd.Checkerboard() = Odd;
|
||||||
CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
|
CompactHelpers::SetupMasks(this->BoundaryMask, this->BoundaryMaskEven, this->BoundaryMaskOdd);
|
||||||
@ -77,31 +77,31 @@ CompactWilsonCloverFermion<Impl, CloverHelpers>::CompactWilsonCloverFermion(Gaug
|
|||||||
template<class Impl, class CloverHelpers>
|
template<class Impl, class CloverHelpers>
|
||||||
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
|
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Dhop(const FermionField& in, FermionField& out, int dag) {
|
||||||
WilsonBase::Dhop(in, out, dag);
|
WilsonBase::Dhop(in, out, dag);
|
||||||
if(open_boundaries) ApplyBoundaryMask(out);
|
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl, class CloverHelpers>
|
template<class Impl, class CloverHelpers>
|
||||||
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
|
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopOE(const FermionField& in, FermionField& out, int dag) {
|
||||||
WilsonBase::DhopOE(in, out, dag);
|
WilsonBase::DhopOE(in, out, dag);
|
||||||
if(open_boundaries) ApplyBoundaryMask(out);
|
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl, class CloverHelpers>
|
template<class Impl, class CloverHelpers>
|
||||||
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
|
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopEO(const FermionField& in, FermionField& out, int dag) {
|
||||||
WilsonBase::DhopEO(in, out, dag);
|
WilsonBase::DhopEO(in, out, dag);
|
||||||
if(open_boundaries) ApplyBoundaryMask(out);
|
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl, class CloverHelpers>
|
template<class Impl, class CloverHelpers>
|
||||||
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
|
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDir(const FermionField& in, FermionField& out, int dir, int disp) {
|
||||||
WilsonBase::DhopDir(in, out, dir, disp);
|
WilsonBase::DhopDir(in, out, dir, disp);
|
||||||
if(this->open_boundaries) ApplyBoundaryMask(out);
|
if(this->fixedBoundaries) ApplyBoundaryMask(out);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl, class CloverHelpers>
|
template<class Impl, class CloverHelpers>
|
||||||
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
|
void CompactWilsonCloverFermion<Impl, CloverHelpers>::DhopDirAll(const FermionField& in, std::vector<FermionField>& out) {
|
||||||
WilsonBase::DhopDirAll(in, out);
|
WilsonBase::DhopDirAll(in, out);
|
||||||
if(this->open_boundaries) {
|
if(this->fixedBoundaries) {
|
||||||
for(auto& o : out) ApplyBoundaryMask(o);
|
for(auto& o : out) ApplyBoundaryMask(o);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -112,7 +112,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::M(const FermionField& in,
|
|||||||
WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
|
WilsonBase::Dhop(in, out, DaggerNo); // call base to save applying bc
|
||||||
Mooee(in, Tmp);
|
Mooee(in, Tmp);
|
||||||
axpy(out, 1.0, out, Tmp);
|
axpy(out, 1.0, out, Tmp);
|
||||||
if(open_boundaries) ApplyBoundaryMask(out);
|
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl, class CloverHelpers>
|
template<class Impl, class CloverHelpers>
|
||||||
@ -121,19 +121,19 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mdag(const FermionField& i
|
|||||||
WilsonBase::Dhop(in, out, DaggerYes); // call base to save applying bc
|
WilsonBase::Dhop(in, out, DaggerYes); // call base to save applying bc
|
||||||
MooeeDag(in, Tmp);
|
MooeeDag(in, Tmp);
|
||||||
axpy(out, 1.0, out, Tmp);
|
axpy(out, 1.0, out, Tmp);
|
||||||
if(open_boundaries) ApplyBoundaryMask(out);
|
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl, class CloverHelpers>
|
template<class Impl, class CloverHelpers>
|
||||||
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
|
void CompactWilsonCloverFermion<Impl, CloverHelpers>::Meooe(const FermionField& in, FermionField& out) {
|
||||||
WilsonBase::Meooe(in, out);
|
WilsonBase::Meooe(in, out);
|
||||||
if(open_boundaries) ApplyBoundaryMask(out);
|
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl, class CloverHelpers>
|
template<class Impl, class CloverHelpers>
|
||||||
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
|
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MeooeDag(const FermionField& in, FermionField& out) {
|
||||||
WilsonBase::MeooeDag(in, out);
|
WilsonBase::MeooeDag(in, out);
|
||||||
if(open_boundaries) ApplyBoundaryMask(out);
|
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl, class CloverHelpers>
|
template<class Impl, class CloverHelpers>
|
||||||
@ -147,7 +147,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::Mooee(const FermionField&
|
|||||||
} else {
|
} else {
|
||||||
MooeeInternal(in, out, Diagonal, Triangle);
|
MooeeInternal(in, out, Diagonal, Triangle);
|
||||||
}
|
}
|
||||||
if(open_boundaries) ApplyBoundaryMask(out);
|
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl, class CloverHelpers>
|
template<class Impl, class CloverHelpers>
|
||||||
@ -166,7 +166,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::MooeeInv(const FermionFiel
|
|||||||
} else {
|
} else {
|
||||||
MooeeInternal(in, out, DiagonalInv, TriangleInv);
|
MooeeInternal(in, out, DiagonalInv, TriangleInv);
|
||||||
}
|
}
|
||||||
if(open_boundaries) ApplyBoundaryMask(out);
|
if(fixedBoundaries) ApplyBoundaryMask(out);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Impl, class CloverHelpers>
|
template<class Impl, class CloverHelpers>
|
||||||
@ -186,7 +186,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::MdirAll(const FermionField
|
|||||||
|
|
||||||
template<class Impl, class CloverHelpers>
|
template<class Impl, class CloverHelpers>
|
||||||
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
|
void CompactWilsonCloverFermion<Impl, CloverHelpers>::MDeriv(GaugeField& force, const FermionField& X, const FermionField& Y, int dag) {
|
||||||
assert(!open_boundaries); // TODO check for changes required for open bc
|
assert(!fixedBoundaries); // TODO check for changes required for open bc
|
||||||
|
|
||||||
// NOTE: code copied from original clover term
|
// NOTE: code copied from original clover term
|
||||||
conformable(X.Grid(), Y.Grid());
|
conformable(X.Grid(), Y.Grid());
|
||||||
@ -305,6 +305,7 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
|
|||||||
GridBase* grid = _Umu.Grid();
|
GridBase* grid = _Umu.Grid();
|
||||||
typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
|
typename Impl::GaugeLinkField Bx(grid), By(grid), Bz(grid), Ex(grid), Ey(grid), Ez(grid);
|
||||||
CloverField TmpOriginal(grid);
|
CloverField TmpOriginal(grid);
|
||||||
|
CloverField TmpInverse(grid);
|
||||||
|
|
||||||
// Compute the field strength terms mu>nu
|
// Compute the field strength terms mu>nu
|
||||||
double t2 = usecond();
|
double t2 = usecond();
|
||||||
@ -324,24 +325,27 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
|
|||||||
TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
|
TmpOriginal += Helpers::fillCloverXT(Ex) * csw_t;
|
||||||
TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
|
TmpOriginal += Helpers::fillCloverYT(Ey) * csw_t;
|
||||||
TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
|
TmpOriginal += Helpers::fillCloverZT(Ez) * csw_t;
|
||||||
// Handle mass term based on clover policy
|
|
||||||
CloverHelpers::MassTerm(TmpOriginal, this->diag_mass);
|
// Instantiate the clover term
|
||||||
|
// - In case of the standard clover the mass term is added
|
||||||
|
// - In case of the exponential clover the clover term is exponentiated
|
||||||
|
double t4 = usecond();
|
||||||
|
CloverHelpers::InstantiateClover(TmpOriginal, TmpInverse, csw_t, this->diag_mass);
|
||||||
|
|
||||||
// Convert the data layout of the clover term
|
// Convert the data layout of the clover term
|
||||||
double t4 = usecond();
|
double t5 = usecond();
|
||||||
CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
|
CompactHelpers::ConvertLayout(TmpOriginal, Diagonal, Triangle);
|
||||||
|
|
||||||
// Exponentiate the clover (nothing happens in case of the standard clover)
|
// Modify the clover term at the temporal boundaries in case of open boundary conditions
|
||||||
double t5 = usecond();
|
|
||||||
CloverHelpers::Exponentiate_Clover(Diagonal, Triangle, csw_t, this->diag_mass);
|
|
||||||
|
|
||||||
// Possible modify the boundary values
|
|
||||||
double t6 = usecond();
|
double t6 = usecond();
|
||||||
if(open_boundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
|
if(fixedBoundaries) CompactHelpers::ModifyBoundaries(Diagonal, Triangle, csw_t, cF, this->diag_mass);
|
||||||
|
|
||||||
// Invert the Clover term (explicit inversion needed for the improvement in case of open boundary conditions)
|
// Invert the Clover term
|
||||||
|
// In case of the exponential clover with (anti-)periodic boundary conditions exp(-Clover) saved
|
||||||
|
// in TmpInverse can be used. In all other cases the clover term has to be explictly inverted.
|
||||||
|
// TODO: For now this inversion is explictly done on the CPU
|
||||||
double t7 = usecond();
|
double t7 = usecond();
|
||||||
CompactHelpers::Invert(Diagonal, Triangle, DiagonalInv, TriangleInv);
|
CloverHelpers::InvertClover(TmpInverse, Diagonal, Triangle, DiagonalInv, TriangleInv, fixedBoundaries);
|
||||||
|
|
||||||
// Fill the remaining clover fields
|
// Fill the remaining clover fields
|
||||||
double t8 = usecond();
|
double t8 = usecond();
|
||||||
@ -362,10 +366,10 @@ void CompactWilsonCloverFermion<Impl, CloverHelpers>::ImportGauge(const GaugeFie
|
|||||||
std::cout << GridLogDebug << "allocations = " << (t2 - t1) / 1e6 << std::endl;
|
std::cout << GridLogDebug << "allocations = " << (t2 - t1) / 1e6 << std::endl;
|
||||||
std::cout << GridLogDebug << "field strength = " << (t3 - t2) / 1e6 << std::endl;
|
std::cout << GridLogDebug << "field strength = " << (t3 - t2) / 1e6 << std::endl;
|
||||||
std::cout << GridLogDebug << "fill clover = " << (t4 - t3) / 1e6 << std::endl;
|
std::cout << GridLogDebug << "fill clover = " << (t4 - t3) / 1e6 << std::endl;
|
||||||
std::cout << GridLogDebug << "convert = " << (t5 - t4) / 1e6 << std::endl;
|
std::cout << GridLogDebug << "instantiate clover = " << (t5 - t4) / 1e6 << std::endl;
|
||||||
std::cout << GridLogDebug << "exponentiation = " << (t6 - t5) / 1e6 << std::endl;
|
std::cout << GridLogDebug << "convert layout = " << (t6 - t5) / 1e6 << std::endl;
|
||||||
std::cout << GridLogDebug << "boundaries = " << (t7 - t6) / 1e6 << std::endl;
|
std::cout << GridLogDebug << "modify boundaries = " << (t7 - t6) / 1e6 << std::endl;
|
||||||
std::cout << GridLogDebug << "inversions = " << (t8 - t7) / 1e6 << std::endl;
|
std::cout << GridLogDebug << "invert clover = " << (t8 - t7) / 1e6 << std::endl;
|
||||||
std::cout << GridLogDebug << "pick cbs = " << (t9 - t8) / 1e6 << std::endl;
|
std::cout << GridLogDebug << "pick cbs = " << (t9 - t8) / 1e6 << std::endl;
|
||||||
std::cout << GridLogDebug << "total = " << (t9 - t0) / 1e6 << std::endl;
|
std::cout << GridLogDebug << "total = " << (t9 - t0) / 1e6 << std::endl;
|
||||||
}
|
}
|
||||||
|
4
systems/PVC/benchmarks/run-1tile.sh
Normal file → Executable file
4
systems/PVC/benchmarks/run-1tile.sh
Normal file → Executable file
@ -6,7 +6,7 @@
|
|||||||
|
|
||||||
source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
|
source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
|
||||||
|
|
||||||
export NT=16
|
export NT=8
|
||||||
|
|
||||||
export I_MPI_OFFLOAD=1
|
export I_MPI_OFFLOAD=1
|
||||||
export I_MPI_OFFLOAD_TOPOLIB=level_zero
|
export I_MPI_OFFLOAD_TOPOLIB=level_zero
|
||||||
@ -21,7 +21,7 @@ export I_MPI_OFFLOAD_CELL=tile
|
|||||||
export EnableImplicitScaling=0
|
export EnableImplicitScaling=0
|
||||||
export EnableWalkerPartition=0
|
export EnableWalkerPartition=0
|
||||||
export ZE_AFFINITY_MASK=0.0
|
export ZE_AFFINITY_MASK=0.0
|
||||||
mpiexec -launcher ssh -n 1 -host localhost ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 --cacheblocking 8.8.8.8
|
mpiexec -launcher ssh -n 1 -host localhost ./Benchmark_dwf_fp32 --mpi 1.1.1.1 --grid 32.32.32.32 --accelerator-threads $NT --comms-sequential --shm-mpi 1 --device-mem 32768
|
||||||
|
|
||||||
export ZE_AFFINITY_MASK=0
|
export ZE_AFFINITY_MASK=0
|
||||||
export I_MPI_OFFLOAD_CELL=device
|
export I_MPI_OFFLOAD_CELL=device
|
||||||
|
@ -8,7 +8,6 @@ source /nfs/site/home/paboylex/ATS/GridNew/Grid/systems/PVC-nightly/setup.sh
|
|||||||
|
|
||||||
export NT=16
|
export NT=16
|
||||||
|
|
||||||
|
|
||||||
# export IGC_EnableLSCFenceUGMBeforeEOT=0
|
# export IGC_EnableLSCFenceUGMBeforeEOT=0
|
||||||
# export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file=False"
|
# export SYCL_PROGRAM_COMPILE_OPTIONS="-ze-opt-large-register-file=False"
|
||||||
#export IGC_ShaderDumpEnable=1
|
#export IGC_ShaderDumpEnable=1
|
||||||
@ -20,14 +19,16 @@ export SYCL_DEVICE_FILTER=gpu,level_zero
|
|||||||
export I_MPI_OFFLOAD_CELL=tile
|
export I_MPI_OFFLOAD_CELL=tile
|
||||||
export EnableImplicitScaling=0
|
export EnableImplicitScaling=0
|
||||||
export EnableWalkerPartition=0
|
export EnableWalkerPartition=0
|
||||||
#export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
|
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=1
|
||||||
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
|
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
|
||||||
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0
|
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0
|
||||||
|
|
||||||
mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 > dw.2tile.1x2.log
|
for i in 0
|
||||||
mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 > dw.2tile.2x1.log
|
do
|
||||||
|
mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 --device-mem 32768
|
||||||
mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.1x2.log
|
mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_dwf_fp32 --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 --device-mem 32768
|
||||||
mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.2x1.log
|
done
|
||||||
|
#mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 1.1.1.2 --grid 32.32.32.64 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.1x2.log
|
||||||
|
#mpiexec -launcher ssh -n 2 -host localhost ./wrap4gpu.sh ./Benchmark_halo --mpi 2.1.1.1 --grid 64.32.32.32 --accelerator-threads $NT --shm-mpi 1 > halo.2tile.2x1.log
|
||||||
|
|
||||||
|
|
||||||
|
@ -2,14 +2,15 @@ INSTALL=/nfs/site/home/azusayax/install
|
|||||||
../../configure \
|
../../configure \
|
||||||
--enable-simd=GPU \
|
--enable-simd=GPU \
|
||||||
--enable-gen-simd-width=64 \
|
--enable-gen-simd-width=64 \
|
||||||
--enable-comms=mpi \
|
--enable-comms=mpi-auto \
|
||||||
--disable-accelerator-cshift \
|
--disable-accelerator-cshift \
|
||||||
--disable-gparity \
|
--disable-gparity \
|
||||||
--disable-fermion-reps \
|
--disable-fermion-reps \
|
||||||
--enable-shm=nvlink \
|
--enable-shm=nvlink \
|
||||||
--enable-accelerator=sycl \
|
--enable-accelerator=sycl \
|
||||||
--enable-unified=yes \
|
--enable-unified=no \
|
||||||
CXX=mpicxx \
|
MPICXX=mpicxx \
|
||||||
|
CXX=dpcpp \
|
||||||
LDFLAGS="-fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$INSTALL/lib" \
|
LDFLAGS="-fsycl-device-code-split=per_kernel -fsycl-device-lib=all -lze_loader -L$INSTALL/lib" \
|
||||||
CXXFLAGS="-cxx=dpcpp -fsycl-unnamed-lambda -fsycl -no-fma -I$INSTALL/include -Wtautological-constant-compare"
|
CXXFLAGS="-fsycl-unnamed-lambda -fsycl -no-fma -I$INSTALL/include -Wno-tautological-compare"
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user