1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-19 00:07:05 +01:00

Compare commits

..

15 Commits

Author SHA1 Message Date
970b4e81cb Merge 570b72a47b into 3f3661a86f 2025-01-22 14:35:06 +00:00
570b72a47b Bugfix. Sorry! 2025-01-21 15:37:39 -05:00
a5798a89ed Merge branch 'develop' into specflow 2025-01-21 12:13:24 -05:00
3f3661a86f Heading towards PVdagM multigrid 2025-01-17 14:33:35 +00:00
f7e2f9a401 Checking in spectral flow and DWF/Mobius kernel eigenvalue measurement 2025-01-16 20:47:33 +00:00
2848a9b558 DWF Kernel lanczos working(?) 2025-01-16 01:29:56 +00:00
5a4f9bf2e3 Force the ROCM version 2024-10-29 18:12:31 -04:00
f617468e04 Update Lattice_base.h 2024-10-11 10:39:16 -04:00
ee4046fe92 Added a dimension ordered column sum based reduction for scalar.
Removes dependence on MPI_Allreduce and allows for work around on
systems where this is bollox.
2024-09-27 09:26:03 -04:00
2a9cfeb9ea New files 2024-09-26 14:23:29 -04:00
1147b8ea40 Cheby poly setup 2024-09-26 14:20:32 -04:00
3f9119b39d Remove vectors used for the power spectrum table in paper 2024-09-26 14:19:41 -04:00
35e8225abd Verbose control 2024-09-26 14:18:35 -04:00
bdbfbb7a14 Merge branch 'develop' of https://github.com/paboyle/Grid into develop 2024-09-26 14:05:45 -04:00
f7d4be8d96 Calculate bytes correctly 2024-09-26 14:04:44 -04:00
140 changed files with 4932 additions and 3663 deletions

View File

@ -168,7 +168,6 @@ public:
template<class vobj>
void FFT_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int dim, int sign){
#ifndef HAVE_FFTW
std::cerr << "FFTW is not compiled but is called"<<std::endl;
assert(0);
#else
conformable(result.Grid(),vgrid);
@ -191,8 +190,7 @@ public:
Lattice<sobj> pgbuf(&pencil_g);
autoView(pgbuf_v , pgbuf, CpuWrite);
std::cout << "CPU view" << std::endl;
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
@ -215,7 +213,6 @@ public:
else if ( sign == forward ) div = 1.0;
else assert(0);
std::cout << GridLogPerformance<<"Making FFTW plan" << std::endl;
FFTW_plan p;
{
FFTW_scalar *in = (FFTW_scalar *)&pgbuf_v[0];
@ -229,7 +226,6 @@ public:
}
// Barrel shift and collect global pencil
std::cout << GridLogPerformance<<"Making pencil" << std::endl;
Coordinate lcoor(Nd), gcoor(Nd);
result = source;
int pc = processor_coor[dim];
@ -251,7 +247,6 @@ public:
}
}
std::cout <<GridLogPerformance<< "Looping orthog" << std::endl;
// Loop over orthog coords
int NN=pencil_g.lSites();
GridStopWatch timer;
@ -274,7 +269,6 @@ public:
usec += timer.useconds();
flops+= flops_call*NN;
std::cout <<GridLogPerformance<< "Writing back results " << std::endl;
// writing out result
{
autoView(pgbuf_v,pgbuf,CpuRead);
@ -291,7 +285,6 @@ public:
}
result = result*div;
std::cout <<GridLogPerformance<< "Destroying plan " << std::endl;
// destroying plan
FFTW<scalar>::fftw_destroy_plan(p);
#endif

View File

@ -55,10 +55,10 @@ NAMESPACE_BEGIN(Grid);
typedef cublasHandle_t gridblasHandle_t;
#endif
#ifdef GRID_SYCL
typedef sycl::queue *gridblasHandle_t;
typedef cl::sycl::queue *gridblasHandle_t;
#endif
#ifdef GRID_ONE_MKL
typedef sycl::queue *gridblasHandle_t;
typedef cl::sycl::queue *gridblasHandle_t;
#endif
#if !defined(GRID_SYCL) && !defined(GRID_CUDA) && !defined(GRID_HIP) && !defined(GRID_ONE_MKL)
typedef int32_t gridblasHandle_t;
@ -89,9 +89,9 @@ public:
gridblasHandle = theGridAccelerator;
#endif
#ifdef GRID_ONE_MKL
sycl::gpu_selector selector;
sycl::device selectedDevice { selector };
sycl::property_list q_prop{sycl::property::queue::in_order()};
cl::sycl::gpu_selector selector;
cl::sycl::device selectedDevice { selector };
cl::sycl::property_list q_prop{cl::sycl::property::queue::in_order()};
gridblasHandle =new sycl::queue (selectedDevice,q_prop);
#endif
gridblasInit=1;

View File

@ -144,11 +144,11 @@ public:
acceleratorCopyDeviceToDevice(&BLAS_Y[offset],&y_v[0],sizeof(scalar_object)*vol);
}
RealD t4 = usecond();
std::cout << "MulMatrix alloc took "<< t1-t0<<" us"<<std::endl;
std::cout << "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
std::cout << "MulMatrix blas took "<< t3-t2<<" us"<<std::endl;
std::cout << "MulMatrix copy took "<< t4-t3<<" us"<<std::endl;
std::cout << "MulMatrix total "<< t4-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance << "MulMatrix alloc took "<< t1-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix blas took "<< t3-t2<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix copy took "<< t4-t3<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "MulMatrix total "<< t4-t0<<" us"<<std::endl;
}
void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y)
@ -242,16 +242,16 @@ public:
RealD flops = 8.0*M*N*K;
flops = flops/(t4-t3)/1.e3;
bytes = bytes/(t4-t3)/1.e3;
std::cout << "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
std::cout << "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
std::cout << "InnerProductMatrix cp t2 "<< t2-t1<<" us"<<std::endl;
std::cout << "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
std::cout << "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
std::cout << "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
std::cout << "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
std::cout << "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
std::cout << "InnerProductMatrix cp t6 "<< t6-t5<<" us"<<std::endl;
std::cout << "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t6 "<< t6-t5<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
#else
int nrhs;
GridBase *grid;
@ -358,17 +358,17 @@ public:
flops = flops/(t4-t3)/1.e3;
bytes = bytes/(t4-t3)/1.e3;
xybytes = 4*xybytes/(t2-t1)/1.e3;
std::cout << "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
std::cout << "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
std::cout << "InnerProductMatrix cp t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
std::cout << "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
std::cout << "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
std::cout << "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
std::cout << "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
std::cout << "InnerProductMatrix cp t5 "<< t5-t4<<" us"<<std::endl;
std::cout << "InnerProductMatrix lsum t6l "<< t6l-t5<<" us"<<std::endl;
std::cout << "InnerProductMatrix gsum t6 "<< t6-t6l<<" us"<<std::endl;
std::cout << "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix cp t5 "<< t5-t4<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix lsum t6l "<< t6l-t5<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t6 "<< t6-t6l<<" us"<<std::endl;
std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
#endif
}
};

View File

@ -63,7 +63,12 @@ class TwoLevelCGmrhs
GridStopWatch SmoothTimer;
GridStopWatch InsertTimer;
/*
Field rrr;
Field sss;
Field qqq;
Field zzz;
*/
// more most opertor functions
TwoLevelCGmrhs(RealD tol,
Integer maxit,
@ -74,6 +79,12 @@ class TwoLevelCGmrhs
MaxIterations(maxit),
_FineLinop(FineLinop),
_Smoother(Smoother)
/*
rrr(fine),
sss(fine),
qqq(fine),
zzz(fine)
*/
{
grid = fine;
};
@ -81,8 +92,8 @@ class TwoLevelCGmrhs
// Vector case
virtual void operator() (std::vector<Field> &src, std::vector<Field> &x)
{
SolveSingleSystem(src,x);
// SolvePrecBlockCG(src,x);
// SolveSingleSystem(src,x);
SolvePrecBlockCG(src,x);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
@ -657,6 +668,8 @@ public:
CoarseField PleftProjMrhs(this->coarsegridmrhs);
CoarseField PleftMss_projMrhs(this->coarsegridmrhs);
// this->rrr=in[0];
#undef SMOOTHER_BLOCK_SOLVE
#if SMOOTHER_BLOCK_SOLVE
this->SmoothTimer.Start();
@ -669,6 +682,7 @@ public:
this->SmoothTimer.Stop();
}
#endif
// this->sss=Min[0];
for(int rhs=0;rhs<nrhs;rhs++) {
@ -705,9 +719,11 @@ public:
this->_Projector.blockPromote(tmp,PleftMss_proj);// tmp= Q[in - A Min]
this->PromoteTimer.Stop();
this->FineTimer.Start();
// this->qqq=tmp[0];
for(int rhs=0;rhs<nrhs;rhs++) {
axpy(out[rhs],1.0,Min[rhs],tmp[rhs]); // Min+tmp
}
// this->zzz=out[0];
this->FineTimer.Stop();
}
};

View File

@ -116,14 +116,14 @@ NAMESPACE_BEGIN(Grid);
//Compute double precision rsd and also new RHS vector.
Linop_d.HermOp(sol_d, tmp_d);
RealD norm = axpy_norm(src_d, -1., tmp_d, src_d_in); //src_d is residual vector
std::cout<<GridLogMessage<<" rsd norm "<<norm<<std::endl;
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration " <<outer_iter<<" residual "<< norm<< " target "<< stop<<std::endl;
if(norm < OuterLoopNormMult * stop){
std::cout<<GridLogMessage<<"MixedPrecisionConjugateGradient: Outer iteration converged on iteration " <<outer_iter <<std::endl;
break;
}
while(norm * inner_tol * inner_tol < stop*1.01) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
while(norm * inner_tol * inner_tol < stop) inner_tol *= 2; // inner_tol = sqrt(stop/norm) ??
PrecChangeTimer.Start();
precisionChange(src_f, src_d, pc_wk_dp_to_sp);

View File

@ -102,11 +102,11 @@ public:
assert(mass.size()==nshift);
assert(mresidual.size()==nshift);
// remove dynamic sized arrays on stack; 2d is a pain with vector
std::vector<RealD> bs(nshift);
std::vector<RealD> rsq(nshift);
std::vector<std::array<RealD,2> > z(nshift);
std::vector<int> converged(nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
RealD bs[nshift];
RealD rsq[nshift];
RealD z[nshift][2];
int converged[nshift];
const int primary =0;

View File

@ -123,11 +123,11 @@ public:
assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
std::vector<RealD> bs(nshift);
std::vector<RealD> rsq(nshift);
std::vector<RealD> rsqf(nshift);
std::vector<std::array<RealD,2> > z(nshift);
std::vector<int> converged(nshift);
RealD bs[nshift];
RealD rsq[nshift];
RealD rsqf[nshift];
RealD z[nshift][2];
int converged[nshift];
const int primary =0;

View File

@ -156,11 +156,11 @@ public:
assert(mresidual.size()==nshift);
// dynamic sized arrays on stack; 2d is a pain with vector
std::vector<RealD> bs(nshift);
std::vector<RealD> rsq(nshift);
std::vector<RealD> rsqf(nshift);
std::vector<std::array<RealD,2> > z(nshift);
std::vector<int> converged(nshift);
RealD bs[nshift];
RealD rsq[nshift];
RealD rsqf[nshift];
RealD z[nshift][2];
int converged[nshift];
const int primary =0;

View File

@ -245,9 +245,10 @@ until convergence
_HermOp(src_n,tmp);
// std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
// std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
// RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vnum = real(innerProduct(tmp,tmp)); // HermOp^2.
RealD vden = norm2(src_n);
RealD na = vnum/vden;
RealD na = std::sqrt(vnum/vden);
if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
i=_MAX_ITER_IRL_MEVAPP_;
evalMaxApprox = na;
@ -255,6 +256,7 @@ until convergence
src_n = tmp;
}
}
std::cout << GridLogIRL << " Final evalMaxApprox " << evalMaxApprox << std::endl;
std::vector<RealD> lme(Nm);
std::vector<RealD> lme2(Nm);

View File

@ -228,6 +228,70 @@ public:
}
assert(b==nn);
}
virtual void CreateSubspacePolyCheby(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,
double lo1,
int orderfilter,
double lo2,
int orderstep)
{
RealD scale;
FineField noise(FineGrid);
FineField Mn(FineGrid);
FineField tmp(FineGrid);
// New normalised noise
gaussian(RNG,noise);
scale = std::pow(norm2(noise),-0.5);
noise=noise*scale;
std::cout << GridLogMessage<<" CreateSubspacePolyCheby "<<std::endl;
// Initial matrix element
hermop.Op(noise,Mn);
std::cout<<GridLogMessage << "noise <n|MdagM|n> "<<norm2(Mn)<<std::endl;
int b =0;
{
// Filter
std::cout << GridLogMessage << "Cheby "<<lo1<<","<<hi<<" "<<orderstep<<std::endl;
Chebyshev<FineField> Cheb(lo1,hi,orderfilter);
Cheb(hermop,noise,Mn);
// normalise
scale = std::pow(norm2(Mn),-0.5); Mn=Mn*scale;
subspace[b] = Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
std::cout<<GridLogMessage << "filt ["<<b<<"] <n|n> "<<norm2(Mn)<<std::endl;
}
// Generate a full sequence of Chebyshevs
for(int n=1;n<nn;n++){
std::cout << GridLogMessage << "Cheby "<<lo2<<","<<hi<<" "<<orderstep<<std::endl;
Chebyshev<FineField> Cheb(lo2,hi,orderstep);
Cheb(hermop,subspace[n-1],Mn);
for(int m=0;m<n;m++){
ComplexD c = innerProduct(subspace[m],Mn);
Mn = Mn - c*subspace[m];
}
// normalise
scale = std::pow(norm2(Mn),-0.5);
Mn=Mn*scale;
subspace[n]=Mn;
hermop.Op(Mn,tmp);
std::cout<<GridLogMessage << "filt ["<<n<<"] <n|MdagM|n> "<<norm2(tmp)<<std::endl;
std::cout<<GridLogMessage << "filt ["<<n<<"] <n|n> "<<norm2(Mn)<<std::endl;
}
}
virtual void CreateSubspaceChebyshev(GridParallelRNG &RNG,LinearOperatorBase<FineField> &hermop,
int nn,
double hi,

View File

@ -99,7 +99,7 @@ public:
CoarseMatrix AselfInvEven;
CoarseMatrix AselfInvOdd;
deviceVector<RealD> dag_factor;
Vector<RealD> dag_factor;
///////////////////////
// Interface
@ -124,13 +124,9 @@ public:
int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview;
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
Vector<Aview> AcceleratorViewContainer;
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd();
@ -165,7 +161,7 @@ public:
coalescedWrite(out_v[ss](b),res);
});
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
};
void Mdag (const CoarseVector &in, CoarseVector &out)
@ -194,14 +190,9 @@ public:
int npoint = geom.npoint;
typedef LatticeView<Cobj> Aview;
Vector<Aview> AcceleratorViewContainer;
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd();
@ -210,10 +201,10 @@ public:
int osites=Grid()->oSites();
deviceVector<int> points(geom.npoint);
for(int p=0; p<geom.npoint; p++) {
acceleratorPut(points[p],geom.points_dagger[p]);
}
Vector<int> points(geom.npoint, 0);
for(int p=0; p<geom.npoint; p++)
points[p] = geom.points_dagger[p];
auto points_p = &points[0];
RealD* dag_factor_p = &dag_factor[0];
@ -245,7 +236,7 @@ public:
coalescedWrite(out_v[ss](b),res);
});
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
}
void MdirComms(const CoarseVector &in)
@ -260,14 +251,8 @@ public:
out.Checkerboard() = in.Checkerboard();
typedef LatticeView<Cobj> Aview;
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = A[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Vector<Aview> AcceleratorViewContainer;
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
Aview *Aview_p = & AcceleratorViewContainer[0];
autoView( out_v , out, AcceleratorWrite);
@ -300,7 +285,7 @@ public:
}
coalescedWrite(out_v[ss](b),res);
});
for(int p=0;p<geom.npoint;p++) hAcceleratorViewContainer[p].ViewClose();
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
}
void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
{
@ -484,20 +469,14 @@ public:
// determine in what order we need the points
int npoint = geom.npoint-1;
deviceVector<int> points(npoint);
for(int p=0; p<npoint; p++) {
int val = (dag && !hermitian) ? geom.points_dagger[p] : p;
acceleratorPut(points[p], val);
}
Vector<int> points(npoint, 0);
for(int p=0; p<npoint; p++)
points[p] = (dag && !hermitian) ? geom.points_dagger[p] : p;
auto points_p = &points[0];
deviceVector<Aview> AcceleratorViewContainer(geom.npoint);
hostVector<Aview> hAcceleratorViewContainer(geom.npoint);
for(int p=0;p<geom.npoint;p++) {
hAcceleratorViewContainer[p] = a[p].View(AcceleratorRead);
acceleratorPut(AcceleratorViewContainer[p],hAcceleratorViewContainer[p]);
}
Vector<Aview> AcceleratorViewContainer;
for(int p=0;p<npoint;p++) AcceleratorViewContainer.push_back(a[p].View(AcceleratorRead));
Aview *Aview_p = & AcceleratorViewContainer[0];
const int Nsimd = CComplex::Nsimd();
@ -560,7 +539,7 @@ public:
});
}
for(int p=0;p<npoint;p++) hAcceleratorViewContainer[p].ViewClose();
for(int p=0;p<npoint;p++) AcceleratorViewContainer[p].ViewClose();
}
CoarsenedMatrix(GridCartesian &CoarseGrid, int hermitian_=0) :
@ -611,13 +590,11 @@ public:
}
// GPU readable prefactor
std::vector<RealD> h_dag_factor(nbasis*nbasis);
thread_for(i, nbasis*nbasis, {
int j = i/nbasis;
int k = i%nbasis;
h_dag_factor[i] = dag_factor_eigen(j, k);
dag_factor[i] = dag_factor_eigen(j, k);
});
acceleratorCopyToDevice(&h_dag_factor[0],&dag_factor[0],dag_factor.size()*sizeof(RealD));
}
void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase<Lattice<Fobj> > &linop,

View File

@ -174,11 +174,21 @@ template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const d
////////////////////////////////////////////////////////////////////////////////
// Template typedefs
////////////////////////////////////////////////////////////////////////////////
template<class T> using hostVector = std::vector<T,alignedAllocator<T> >; // Needs autoview
template<class T> using Vector = std::vector<T,uvmAllocator<T> >; //
template<class T> using uvmVector = std::vector<T,uvmAllocator<T> >; // auto migrating page
template<class T> using deviceVector = std::vector<T,devAllocator<T> >; // device vector
#ifdef ACCELERATOR_CSHIFT
// Cshift on device
template<class T> using cshiftAllocator = devAllocator<T>;
#else
// Cshift on host
template<class T> using cshiftAllocator = std::allocator<T>;
#endif
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
template<class T> using stencilVector = std::vector<T,alignedAllocator<T> >;
template<class T> using commVector = std::vector<T,devAllocator<T> >;
template<class T> using deviceVector = std::vector<T,devAllocator<T> >;
template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
/*
template<class T> class vecView
{
protected:
@ -187,9 +197,8 @@ template<class T> class vecView
ViewMode mode;
void * cpu_ptr;
public:
// Rvalue accessor
accelerator_inline T & operator[](size_t i) const { return this->data[i]; };
vecView(Vector<T> &refer_to_me,ViewMode _mode)
vecView(std::vector<T> &refer_to_me,ViewMode _mode)
{
cpu_ptr = &refer_to_me[0];
size = refer_to_me.size();
@ -205,15 +214,26 @@ template<class T> class vecView
}
};
template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode)
template<class T> vecView<T> VectorView(std::vector<T> &vec,ViewMode _mode)
{
vecView<T> ret(vec,_mode); // does the open
return ret; // must be closed
}
// Little autoscope assister
template<class View>
class VectorViewCloser
{
View v; // Take a copy of view and call view close when I go out of scope automatically
public:
VectorViewCloser(View &_v) : v(_v) {};
~VectorViewCloser() { auto ptr = v.cpu_ptr; v.ViewClose(); MemoryManager::NotifyDeletion(ptr);}
};
#define autoVecView(v_v,v,mode) \
auto v_v = VectorView(v,mode); \
ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
*/
NAMESPACE_END(Grid);

View File

@ -1,15 +1,17 @@
#include <Grid/GridCore.h>
#ifndef GRID_UVM
#warning "Using explicit device memory copies"
NAMESPACE_BEGIN(Grid);
#define MAXLINE 512
static char print_buffer [ MAXLINE ];
#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl;
#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer << std::endl;
#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer;
//#define dprintf(...)
////////////////////////////////////////////////////////////
// For caching copies of data on device
////////////////////////////////////////////////////////////
@ -167,7 +169,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
assert(AccCache.AccPtr!=(uint64_t)NULL);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
mprintf("MemoryManager: acceleratorCopyFromDevice Flush AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
DeviceToHostBytes+=AccCache.bytes;
DeviceToHostXfer++;
AccCache.state=Consistent;
@ -182,9 +184,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes;
}
mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx\n",
(uint64_t)AccCache.bytes,
(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
mprintf("MemoryManager: acceleratorCopyToDevice Clone AccPtr %lx <- CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
HostToDeviceBytes+=AccCache.bytes;
HostToDeviceXfer++;
@ -265,7 +265,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
assert(AccCache.cpuLock==0); // Programming error
if(AccCache.state!=Empty) {
dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld\n",
dprintf("ViewOpen found entry %lx %lx : %ld %ld accLock %ld\n",
(uint64_t)AccCache.CpuPtr,
(uint64_t)CpuPtr,
(uint64_t)AccCache.bytes,

View File

@ -15,10 +15,10 @@ void check_huge_pages(void *Buf,uint64_t BYTES)
uint64_t virt_pfn = (uint64_t)Buf / page_size;
off_t offset = sizeof(uint64_t) * virt_pfn;
uint64_t npages = (BYTES + page_size-1) / page_size;
std::vector<uint64_t> pagedata(npages);
uint64_t pagedata[npages];
uint64_t ret = lseek(fd, offset, SEEK_SET);
assert(ret == offset);
ret = ::read(fd, &pagedata[0], sizeof(uint64_t)*npages);
ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
assert(ret == sizeof(uint64_t) * npages);
int nhugepages = npages / 512;
int n4ktotal, nnothuge;

View File

@ -57,29 +57,18 @@ int CartesianCommunicator::ProcessorCount(void) { return
// very VERY rarely (Log, serial RNG) we need world without a grid
////////////////////////////////////////////////////////////////////////////////
#ifdef USE_GRID_REDUCTION
void CartesianCommunicator::GlobalSum(ComplexF &c)
{
GlobalSumP2P(c);
}
void CartesianCommunicator::GlobalSum(ComplexD &c)
{
GlobalSumP2P(c);
}
#else
void CartesianCommunicator::GlobalSum(ComplexF &c)
{
GlobalSumVector((float *)&c,2);
}
void CartesianCommunicator::GlobalSum(ComplexD &c)
{
GlobalSumVector((double *)&c,2);
}
#endif
void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N)
{
GlobalSumVector((float *)c,2*N);
}
void CartesianCommunicator::GlobalSum(ComplexD &c)
{
GlobalSumVector((double *)&c,2);
}
void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
{
GlobalSumVector((double *)c,2*N);

View File

@ -127,7 +127,7 @@ public:
void GlobalSumVector(ComplexD *c,int N);
void GlobalXOR(uint32_t &);
void GlobalXOR(uint64_t &);
template<class obj> void GlobalSumP2P(obj &o)
{
std::vector<obj> column;
@ -136,7 +136,7 @@ public:
for(int d=0;d<_ndimension;d++){
column.resize(_processors[d]);
column[0] = accum;
std::vector<MpiCommsRequest_t> list;
std::vector<CommsRequest_t> list;
for(int p=1;p<_processors[d];p++){
ShiftedRanks(d,p,source,dest);
SendToRecvFromBegin(list,
@ -166,8 +166,8 @@ public:
////////////////////////////////////////////////////////////
// Face exchange, buffer swap in translational invariant way
////////////////////////////////////////////////////////////
void CommsComplete(std::vector<MpiCommsRequest_t> &list);
void SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
void CommsComplete(std::vector<CommsRequest_t> &list);
void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
@ -186,12 +186,6 @@ public:
int recv_from_rank,int do_recv,
int bytes,int dir);
double StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int do_xmit,
void *recv,
int recv_from_rank,int do_recv,
int xbytes,int rbytes,int dir);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int do_xmit,

View File

@ -257,25 +257,6 @@ CartesianCommunicator::~CartesianCommunicator()
}
}
}
#ifdef USE_GRID_REDUCTION
void CartesianCommunicator::GlobalSum(float &f){
CartesianCommunicator::GlobalSumP2P(f);
}
void CartesianCommunicator::GlobalSum(double &d)
{
CartesianCommunicator::GlobalSumP2P(d);
}
#else
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
#endif
void CartesianCommunicator::GlobalSum(uint32_t &u){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
assert(ierr==0);
@ -306,18 +287,27 @@ void CartesianCommunicator::GlobalMax(double &d)
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_MAX,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){
int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
assert(ierr==0);
}
void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &list,
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,
void *recv,
@ -342,7 +332,7 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<MpiCommsRequest_t> &
assert(ierr==0);
list.push_back(xrq);
}
void CartesianCommunicator::CommsComplete(std::vector<MpiCommsRequest_t> &list)
void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list)
{
int nreq=list.size();
@ -361,7 +351,7 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
int from,
int bytes)
{
std::vector<MpiCommsRequest_t> reqs(0);
std::vector<CommsRequest_t> reqs(0);
unsigned long xcrc = crc32(0L, Z_NULL, 0);
unsigned long rcrc = crc32(0L, Z_NULL, 0);
@ -391,23 +381,12 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
int bytes,int dir)
{
std::vector<CommsRequest_t> list;
double offbytes = StencilSendToRecvFromPrepare(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
offbytes += StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,dox,recv,from,dor,bytes,bytes,dir);
StencilSendToRecvFromComplete(list,dir);
return offbytes;
}
#ifdef ACCELERATOR_AWARE_MPI
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
return 0.0; // Do nothing -- no preparation required
}
#undef NVLINK_GET // Define to use get instead of put DMA
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
@ -431,7 +410,7 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
assert(gme == ShmRank);
double off_node_bytes=0.0;
int tag;
if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
@ -440,9 +419,15 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
list.push_back(rrq);
off_node_bytes+=rbytes;
}
#ifdef NVLINK_GET
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
#endif
}
if (dox) {
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+_processor*32;
ierr =MPI_Isend(xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
@ -450,14 +435,17 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
list.push_back(xrq);
off_node_bytes+=xbytes;
} else {
#ifndef NVLINK_GET
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
#endif
}
}
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
int nreq=list.size();
@ -465,254 +453,12 @@ void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsReque
acceleratorCopySynchronise();
if (nreq==0) return;
std::vector<MPI_Status> status(nreq);
int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
assert(ierr==0);
list.resize(0);
this->StencilBarrier();
}
#else /* NOT ... ACCELERATOR_AWARE_MPI */
///////////////////////////////////////////
// Pipeline mode through host memory
///////////////////////////////////////////
/*
* In prepare (phase 1):
* PHASE 1: (prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
* PHASE 2: (Begin)
* - complete all copies
* - post MPI send asynch
* - post device - device transfers
* PHASE 3: (Complete)
* - MPI_waitall
* - host-device transfers
*
*********************************
* NB could split this further:
*--------------------------------
* PHASE 1: (Prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
* PHASE 2: (BeginInterNode)
* - complete all copies
* - post MPI send asynch
* PHASE 3: (BeginIntraNode)
* - post device - device transfers
* PHASE 4: (Complete)
* - MPI_waitall
* - host-device transfers asynch
* - (complete all copies)
*/
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
/*
* Bring sequence from Stencil.h down to lower level.
* Assume using XeLink is ok
*/
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
MPI_Request xrq;
MPI_Request rrq;
int ierr;
int gdest = ShmRanks[dest];
int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor];
assert(dest != _processor);
assert(from != _processor);
assert(gme == ShmRank);
double off_node_bytes=0.0;
int tag;
void * host_recv = NULL;
void * host_xmit = NULL;
/*
* PHASE 1: (Prepare)
* - post MPI receive buffers asynch
* - post device - host send buffer transfer asynch
*/
if ( dor ) {
if ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) {
tag= dir+from*32;
host_recv = this->HostBufferMalloc(rbytes);
ierr=MPI_Irecv(host_recv, rbytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
CommsRequest_t srq;
srq.PacketType = InterNodeRecv;
srq.bytes = rbytes;
srq.req = rrq;
srq.host_buf = host_recv;
srq.device_buf = recv;
list.push_back(srq);
off_node_bytes+=rbytes;
}
}
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
#undef DEVICE_TO_HOST_CONCURRENT // pipeline
#ifdef DEVICE_TO_HOST_CONCURRENT
tag= dir+_processor*32;
host_xmit = this->HostBufferMalloc(xbytes);
acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
// ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
// assert(ierr==0);
// off_node_bytes+=xbytes;
CommsRequest_t srq;
srq.PacketType = InterNodeXmit;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = host_xmit;
srq.device_buf = xmit;
list.push_back(srq);
#else
tag= dir+_processor*32;
host_xmit = this->HostBufferMalloc(xbytes);
const int chunks=1;
for(int n=0;n<chunks;n++){
void * host_xmitc = (void *)( (uint64_t) host_xmit + n*xbytes/chunks);
void * xmitc = (void *)( (uint64_t) xmit + n*xbytes/chunks);
acceleratorCopyFromDeviceAsynch(xmitc, host_xmitc,xbytes/chunks); // Make this Asynch
}
acceleratorCopySynchronise(); // Complete all pending copy transfers
ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
off_node_bytes+=xbytes;
CommsRequest_t srq;
srq.PacketType = InterNodeXmit;
srq.bytes = xbytes;
srq.req = xrq;
srq.host_buf = host_xmit;
srq.device_buf = xmit;
list.push_back(srq);
#endif
}
}
return off_node_bytes;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
void *recv,
int from,int dor,
int xbytes,int rbytes,int dir)
{
int ncomm =communicator_halo.size();
int commdir=dir%ncomm;
MPI_Request xrq;
MPI_Request rrq;
int ierr;
int gdest = ShmRanks[dest];
int gfrom = ShmRanks[from];
int gme = ShmRanks[_processor];
assert(dest != _processor);
assert(from != _processor);
assert(gme == ShmRank);
double off_node_bytes=0.0;
int tag;
void * host_xmit = NULL;
////////////////////////////////
// Receives already posted
// Copies already started
////////////////////////////////
/*
* PHASE 2: (Begin)
* - complete all copies
* - post MPI send asynch
*/
// static int printed;
// if((printed<8) && this->IsBoss() ) {
// printf("dir %d doX %d doR %d Face size %ld %ld\n",dir,dox,dor,xbytes,rbytes);
// printed++;
// }
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
#ifdef DEVICE_TO_HOST_CONCURRENT
tag= dir+_processor*32;
// Find the send in the prepared list
int list_idx=-1;
for(int idx = 0; idx<list.size();idx++){
if ( (list[idx].device_buf==xmit)
&&(list[idx].PacketType==InterNodeXmit)
&&(list[idx].bytes==xbytes) ) {
list_idx = idx;
host_xmit = list[idx].host_buf;
}
}
assert(list_idx != -1); // found it
ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list[list_idx].req = xrq; // Update the MPI request in the list
off_node_bytes+=xbytes;
#endif
} else {
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
}
}
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
int nreq=list.size();
if (nreq==0) return;
std::vector<MPI_Status> status(nreq);
std::vector<MPI_Request> MpiRequests(nreq);
for(int r=0;r<nreq;r++){
MpiRequests[r] = list[r].req;
}
int ierr = MPI_Waitall(nreq,&MpiRequests[0],&status[0]);
assert(ierr==0);
for(int r=0;r<nreq;r++){
if ( list[r].PacketType==InterNodeRecv ) {
acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
}
}
acceleratorCopySynchronise(); // Complete all pending copy transfers
list.resize(0); // Delete the list
this->HostBufferFreeAll(); // Clean up the buffer allocs
this->StencilBarrier();
}
#endif
////////////////////////////////////////////
// END PIPELINE MODE / NO CUDA AWARE MPI
////////////////////////////////////////////
void CartesianCommunicator::StencilBarrier(void)
{
MPI_Barrier (ShmComm);

View File

@ -132,15 +132,6 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
{
return 2.0*bytes;
}
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int dox,
void *recv,
int recv_from_rank,int dor,
int xbytes,int rbytes, int dir)
{
return xbytes+rbytes;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int dox,

View File

@ -46,22 +46,8 @@ NAMESPACE_BEGIN(Grid);
#if defined (GRID_COMMS_MPI3)
typedef MPI_Comm Grid_MPI_Comm;
typedef MPI_Request MpiCommsRequest_t;
#ifdef ACCELERATOR_AWARE_MPI
typedef MPI_Request CommsRequest_t;
#else
enum PacketType_t { InterNodeXmit, InterNodeRecv, IntraNodeXmit, IntraNodeRecv };
typedef struct {
PacketType_t PacketType;
void *host_buf;
void *device_buf;
unsigned long bytes;
MpiCommsRequest_t req;
} CommsRequest_t;
#endif
#else
typedef int MpiCommsRequest_t;
typedef int CommsRequest_t;
typedef int Grid_MPI_Comm;
#endif

View File

@ -42,11 +42,6 @@ Author: Christoph Lehner <christoph@lhnr.de>
#ifdef ACCELERATOR_AWARE_MPI
#define GRID_SYCL_LEVEL_ZERO_IPC
#define SHM_SOCKETS
#else
#ifdef HAVE_NUMAIF_H
#warning " Using NUMAIF "
#include <numaif.h>
#endif
#endif
#include <syscall.h>
#endif
@ -542,38 +537,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
#ifndef ACCELERATOR_AWARE_MPI
printf("Host buffer allocate for GPU non-aware MPI\n");
#if 0
HostCommBuf= acceleratorAllocHost(bytes);
#else
HostCommBuf= malloc(bytes); /// CHANGE THIS TO malloc_host
#ifdef HAVE_NUMAIF_H
#warning "Moving host buffers to specific NUMA domain"
int numa;
char *numa_name=(char *)getenv("MPI_BUF_NUMA");
if(numa_name) {
unsigned long page_size = sysconf(_SC_PAGESIZE);
numa = atoi(numa_name);
unsigned long page_count = bytes/page_size;
std::vector<void *> pages(page_count);
std::vector<int> nodes(page_count,numa);
std::vector<int> status(page_count,-1);
for(unsigned long p=0;p<page_count;p++){
pages[p] =(void *) ((uint64_t) HostCommBuf + p*page_size);
}
int ret = move_pages(0,
page_count,
&pages[0],
&nodes[0],
&status[0],
MPOL_MF_MOVE);
printf("Host buffer move to numa domain %d : move_pages returned %d\n",numa,ret);
if (ret) perror(" move_pages failed for reason:");
}
#endif
acceleratorPin(HostCommBuf,bytes);
#endif
HostCommBuf= malloc(bytes);
#endif
ShmCommBuf = acceleratorAllocDevice(bytes);
if (ShmCommBuf == (void *)NULL ) {
@ -605,8 +569,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
typedef struct { int fd; pid_t pid ; ze_ipc_mem_handle_t ze; } clone_mem_t;
auto zeDevice = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
auto zeContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_device());
auto zeContext = cl::sycl::get_native<cl::sycl::backend::ext_oneapi_level_zero>(theGridAccelerator->get_context());
ze_ipc_mem_handle_t ihandle;
clone_mem_t handle;

View File

@ -51,6 +51,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#endif
NAMESPACE_BEGIN(Grid);
template<class Expression,typename std::enable_if<is_lattice_expr<Expression>::value,void>::type * = nullptr>
auto Cshift(const Expression &expr,int dim,int shift) -> decltype(closure(expr))
{

View File

@ -30,11 +30,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
extern std::vector<std::pair<int,int> > Cshift_table;
extern deviceVector<std::pair<int,int> > Cshift_table_device;
extern commVector<std::pair<int,int> > Cshift_table_device;
inline std::pair<int,int> *MapCshiftTable(void)
{
// GPU version
#ifdef ACCELERATOR_CSHIFT
uint64_t sz=Cshift_table.size();
if (Cshift_table_device.size()!=sz ) {
Cshift_table_device.resize(sz);
@ -44,13 +45,16 @@ inline std::pair<int,int> *MapCshiftTable(void)
sizeof(Cshift_table[0])*sz);
return &Cshift_table_device[0];
#else
return &Cshift_table[0];
#endif
// CPU version use identify map
}
///////////////////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split
///////////////////////////////////////////////////////////////////
template<class vobj> void
Gather_plane_simple (const Lattice<vobj> &rhs,deviceVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
{
int rd = rhs.Grid()->_rdimensions[dimension];
@ -90,10 +94,17 @@ Gather_plane_simple (const Lattice<vobj> &rhs,deviceVector<vobj> &buffer,int dim
{
auto buffer_p = & buffer[0];
auto table = MapCshiftTable();
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
});
#else
autoView(rhs_v , rhs, CpuRead);
thread_for(i,ent,{
buffer_p[table[i].first]=rhs_v[table[i].second];
});
#endif
}
}
@ -118,6 +129,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
int n1=rhs.Grid()->_slice_stride[dimension];
if ( cbmask ==0x3){
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(nn,e1*e2,1,{
int n = nn%e1;
@ -128,10 +140,21 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
});
#else
autoView(rhs_v , rhs, CpuRead);
thread_for2d(n,e1,b,e2,{
int o = n*n1;
int offset = b+n*e2;
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
});
#endif
} else {
Coordinate rdim=rhs.Grid()->_rdimensions;
Coordinate cdm =rhs.Grid()->_checker_dim_mask;
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(nn,e1*e2,1,{
int n = nn%e1;
@ -152,13 +175,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
extract<vobj>(temp,pointers,offset);
}
});
#else
autoView(rhs_v , rhs, CpuRead);
thread_for2d(n,e1,b,e2,{
Coordinate coor;
int o=n*n1;
int oindex = o+b;
int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
int ocb=1<<cb;
int offset = b+n*e2;
if ( ocb & cbmask ) {
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
}
});
#endif
}
}
//////////////////////////////////////////////////////
// Scatter for when there is no need to SIMD split
//////////////////////////////////////////////////////
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<vobj> &buffer, int dimension,int plane,int cbmask)
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask)
{
int rd = rhs.Grid()->_rdimensions[dimension];
@ -202,10 +245,17 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,deviceVector<
{
auto buffer_p = & buffer[0];
auto table = MapCshiftTable();
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v, rhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
});
#else
autoView( rhs_v, rhs, CpuWrite);
thread_for(i,ent,{
rhs_v[table[i].first]=buffer_p[table[i].second];
});
#endif
}
}
@ -228,6 +278,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
if(cbmask ==0x3 ) {
int _slice_stride = rhs.Grid()->_slice_stride[dimension];
int _slice_block = rhs.Grid()->_slice_block[dimension];
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v , rhs, AcceleratorWrite);
accelerator_for(nn,e1*e2,1,{
int n = nn%e1;
@ -236,6 +287,14 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
int offset = b+n*_slice_block;
merge(rhs_v[so+o+b],pointers,offset);
});
#else
autoView( rhs_v , rhs, CpuWrite);
thread_for2d(n,e1,b,e2,{
int o = n*_slice_stride;
int offset = b+n*_slice_block;
merge(rhs_v[so+o+b],pointers,offset);
});
#endif
} else {
// Case of SIMD split AND checker dim cannot currently be hit, except in
@ -301,11 +360,19 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
{
auto table = MapCshiftTable();
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
autoView(lhs_v , lhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
});
#else
autoView(rhs_v , rhs, CpuRead);
autoView(lhs_v , lhs, CpuWrite);
thread_for(i,ent,{
lhs_v[table[i].first]=rhs_v[table[i].second];
});
#endif
}
}
@ -345,11 +412,19 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
{
auto table = MapCshiftTable();
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v, rhs, AcceleratorRead);
autoView( lhs_v, lhs, AcceleratorWrite);
accelerator_for(i,ent,1,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
#else
autoView( rhs_v, rhs, CpuRead);
autoView( lhs_v, lhs, CpuWrite);
thread_for(i,ent,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
#endif
}
}

View File

@ -31,7 +31,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
const int Cshift_verbose=0;
template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
{
typedef typename vobj::vector_type vector_type;
@ -55,20 +55,20 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
RealD t1,t0;
t0=usecond();
if ( !comm_dim ) {
// std::cout << "CSHIFT: Cshift_local" <<std::endl;
//std::cout << "CSHIFT: Cshift_local" <<std::endl;
Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
} else if ( splice_dim ) {
// std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
//std::cout << "CSHIFT: Cshift_comms_simd call - splice_dim = " << splice_dim << " shift " << shift << " dimension = " << dimension << std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift);
} else {
// std::cout << "CSHIFT: Cshift_comms" <<std::endl;
//std::cout << "CSHIFT: Cshift_comms" <<std::endl;
Cshift_comms(ret,rhs,dimension,shift);
}
t1=usecond();
if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
// std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
return ret;
}
#if 1
template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
{
int sshift[2];
@ -94,16 +94,18 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vob
sshift[0] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Even);
sshift[1] = rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,Odd);
// std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.Checkerboard()<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
//std::cout << "Cshift_comms_simd dim "<<dimension<<"cb "<<rhs.checkerboard<<"shift "<<shift<<" sshift " << sshift[0]<<" "<<sshift[1]<<std::endl;
if ( sshift[0] == sshift[1] ) {
// std::cout << "Single pass Cshift_comms" <<std::endl;
//std::cout << "Single pass Cshift_comms" <<std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
} else {
// std::cout << "Two pass Cshift_comms" <<std::endl;
//std::cout << "Two pass Cshift_comms" <<std::endl;
Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
}
}
#define ACCELERATOR_CSHIFT_NO_COPY
#ifdef ACCELERATOR_CSHIFT_NO_COPY
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
typedef typename vobj::vector_type vector_type;
@ -123,8 +125,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
assert(shift<fd);
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
static cshiftVector<vobj> send_buf; send_buf.resize(buffer_size);
static cshiftVector<vobj> recv_buf; recv_buf.resize(buffer_size);
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
@ -159,7 +161,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
grid->Barrier();
// grid->Barrier();
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
@ -167,7 +169,7 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
recv_from_rank,
bytes);
xbytes+=bytes;
grid->Barrier();
// grid->Barrier();
tcomms+=usecond();
tscatter-=usecond();
@ -175,13 +177,13 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
tscatter+=usecond();
}
}
if (Cshift_verbose){
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
}
/*
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
*/
}
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@ -199,9 +201,9 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ;
// std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
assert(comm_dim==1);
assert(simd_layout==2);
@ -222,8 +224,8 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type);
static std::vector<deviceVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
static std::vector<deviceVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
@ -279,7 +281,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
grid->Barrier();
// grid->Barrier();
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
recv_buf_extract_mpi = &recv_buf_extract[i][0];
@ -290,7 +292,7 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
bytes);
xbytes+=bytes;
grid->Barrier();
// grid->Barrier();
tcomms+=usecond();
rpointers[i] = &recv_buf_extract[i][0];
@ -303,13 +305,13 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
tscatter+=usecond();
}
if(Cshift_verbose){
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
}
/*
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
*/
}
#else
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@ -398,13 +400,13 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
tscatter+=usecond();
}
}
if(Cshift_verbose){
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
}
/*
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
*/
}
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
@ -530,16 +532,15 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
tscatter+=usecond();
}
if(Cshift_verbose){
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
}
/*
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
*/
}
#endif
NAMESPACE_END(Grid);
#endif

View File

@ -1,5 +1,5 @@
#include <Grid/GridCore.h>
NAMESPACE_BEGIN(Grid);
std::vector<std::pair<int,int> > Cshift_table;
deviceVector<std::pair<int,int> > Cshift_table_device;
commVector<std::pair<int,int> > Cshift_table_device;
NAMESPACE_END(Grid);

View File

@ -257,30 +257,17 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
});
}
#define FAST_AXPY_NORM
template<class sobj,class vobj> inline
RealD axpy_norm(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
GRID_TRACE("axpy_norm");
#ifdef FAST_AXPY_NORM
return axpy_norm_fast(ret,a,x,y);
#else
ret = a*x+y;
RealD nn=norm2(ret);
return nn;
#endif
return axpy_norm_fast(ret,a,x,y);
}
template<class sobj,class vobj> inline
RealD axpby_norm(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice<vobj> &y)
{
GRID_TRACE("axpby_norm");
#ifdef FAST_AXPY_NORM
return axpby_norm_fast(ret,a,b,x,y);
#else
ret = a*x+b*y;
RealD nn=norm2(ret);
return nn;
#endif
return axpby_norm_fast(ret,a,b,x,y);
}
/// Trace product

View File

@ -237,12 +237,9 @@ public:
vobj vtmp;
vtmp = r;
#if 0
deviceVector<vobj> vvtmp(1);
acceleratorPut(vvtmp[0],vtmp);
vobj *vvtmp_p = & vvtmp[0];
auto me = View(AcceleratorWrite);
accelerator_for(ss,me.size(),vobj::Nsimd(),{
auto stmp=coalescedRead(*vvtmp_p);
auto stmp=coalescedRead(vtmp);
coalescedWrite(me[ss],stmp);
});
#else

View File

@ -53,19 +53,36 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
typedef decltype(basis[0]) Field;
typedef decltype(basis[0].View(AcceleratorRead)) View;
hostVector<View> h_basis_v(basis.size());
deviceVector<View> d_basis_v(basis.size());
typedef typename std::remove_reference<decltype(h_basis_v[0][0])>::type vobj;
Vector<View> basis_v; basis_v.reserve(basis.size());
typedef typename std::remove_reference<decltype(basis_v[0][0])>::type vobj;
typedef typename std::remove_reference<decltype(Qt(0,0))>::type Coeff_t;
GridBase* grid = basis[0].Grid();
for(int k=0;k<basis.size();k++){
h_basis_v[k] = basis[k].View(AcceleratorWrite);
acceleratorPut(d_basis_v[k],h_basis_v[k]);
basis_v.push_back(basis[k].View(AcceleratorWrite));
}
View *basis_vp = &d_basis_v[0];
#if ( !(defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)) )
int max_threads = thread_max();
Vector < vobj > Bt(Nm * max_threads);
thread_region
{
vobj* B = &Bt[Nm * thread_num()];
thread_for_in_region(ss, grid->oSites(),{
for(int j=j0; j<j1; ++j) B[j]=0.;
for(int j=j0; j<j1; ++j){
for(int k=k0; k<k1; ++k){
B[j] +=Qt(j,k) * basis_v[k][ss];
}
}
for(int j=j0; j<j1; ++j){
basis_v[j][ss] = B[j];
}
});
}
#else
View *basis_vp = &basis_v[0];
int nrot = j1-j0;
if (!nrot) // edge case not handled gracefully by Cuda
@ -74,19 +91,17 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
uint64_t oSites =grid->oSites();
uint64_t siteBlock=(grid->oSites()+nrot-1)/nrot; // Maximum 1 additional vector overhead
deviceVector <vobj> Bt(siteBlock * nrot);
Vector <vobj> Bt(siteBlock * nrot);
auto Bp=&Bt[0];
// GPU readable copy of matrix
hostVector<Coeff_t> h_Qt_jv(Nm*Nm);
deviceVector<Coeff_t> Qt_jv(Nm*Nm);
Vector<Coeff_t> Qt_jv(Nm*Nm);
Coeff_t *Qt_p = & Qt_jv[0];
thread_for(i,Nm*Nm,{
int j = i/Nm;
int k = i%Nm;
h_Qt_jv[i]=Qt(j,k);
Qt_p[i]=Qt(j,k);
});
acceleratorCopyToDevice(&h_Qt_jv[0],Qt_p,Nm*Nm*sizeof(Coeff_t));
// Block the loop to keep storage footprint down
for(uint64_t s=0;s<oSites;s+=siteBlock){
@ -122,8 +137,9 @@ void basisRotate(VField &basis,Matrix& Qt,int j0, int j1, int k0,int k1,int Nm)
coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
});
}
#endif
for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
}
// Extract a single rotated vector
@ -136,19 +152,16 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
result.Checkerboard() = basis[0].Checkerboard();
hostVector<View> h_basis_v(basis.size());
deviceVector<View> d_basis_v(basis.size());
Vector<View> basis_v; basis_v.reserve(basis.size());
for(int k=0;k<basis.size();k++){
h_basis_v[k]=basis[k].View(AcceleratorRead);
acceleratorPut(d_basis_v[k],h_basis_v[k]);
basis_v.push_back(basis[k].View(AcceleratorRead));
}
vobj zz=Zero();
deviceVector<double> Qt_jv(Nm);
Vector<double> Qt_jv(Nm);
double * Qt_j = & Qt_jv[0];
for(int k=0;k<Nm;++k) acceleratorPut(Qt_j[k],Qt(j,k));
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
auto basis_vp=& d_basis_v[0];
auto basis_vp=& basis_v[0];
autoView(result_v,result,AcceleratorWrite);
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
vobj zzz=Zero();
@ -158,7 +171,7 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
}
coalescedWrite(result_v[ss], B);
});
for(int k=0;k<basis.size();k++) h_basis_v[k].ViewClose();
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
}
template<class Field>

View File

@ -46,7 +46,7 @@ inline typename vobj::scalar_object sum_cpu(const vobj *arg, Integer osites)
// const int Nsimd = vobj::Nsimd();
const int nthread = GridThread::GetThreads();
std::vector<sobj> sumarray(nthread);
Vector<sobj> sumarray(nthread);
for(int i=0;i<nthread;i++){
sumarray[i]=Zero();
}
@ -75,7 +75,7 @@ inline typename vobj::scalar_objectD sumD_cpu(const vobj *arg, Integer osites)
const int nthread = GridThread::GetThreads();
std::vector<sobj> sumarray(nthread);
Vector<sobj> sumarray(nthread);
for(int i=0;i<nthread;i++){
sumarray[i]=Zero();
}
@ -290,10 +290,8 @@ template<class vobj>
inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) {
GridBase *grid = left.Grid();
bool ok;
#ifdef GRID_SYCL
uint64_t csum=0;
uint64_t csum2=0;
if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
{
// Hack
@ -302,33 +300,13 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
Integer words = left.Grid()->oSites()*sizeof(vobj)/sizeof(uint64_t);
uint64_t *base= (uint64_t *)&l_v[0];
csum=svm_xor(base,words);
ok = FlightRecorder::CsumLog(csum);
if ( !ok ) {
csum2=svm_xor(base,words);
std::cerr<< " Bad CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
} else {
// csum2=svm_xor(base,words);
// std::cerr<< " ok CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
}
assert(ok);
}
FlightRecorder::CsumLog(csum);
#endif
FlightRecorder::StepLog("rank inner product");
ComplexD nrm = rankInnerProduct(left,right);
// ComplexD nrmck=nrm;
RealD local = real(nrm);
ok = FlightRecorder::NormLog(real(nrm));
if ( !ok ) {
ComplexD nrm2 = rankInnerProduct(left,right);
RealD local2 = real(nrm2);
std::cerr<< " Bad NORM " << local << " recomputed as "<<local2<<std::endl;
assert(ok);
}
FlightRecorder::StepLog("Start global sum");
// grid->GlobalSumP2P(nrm);
FlightRecorder::NormLog(real(nrm));
grid->GlobalSum(nrm);
FlightRecorder::StepLog("Finished global sum");
// std::cout << " norm "<< nrm << " p2p norm "<<nrmck<<std::endl;
FlightRecorder::ReductionLog(local,real(nrm));
return nrm;
}
@ -365,6 +343,18 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
autoView( x_v, x, AcceleratorRead);
autoView( y_v, y, AcceleratorRead);
autoView( z_v, z, AcceleratorWrite);
#if 0
typedef decltype(innerProductD(x_v[0],y_v[0])) inner_t;
Vector<inner_t> inner_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
accelerator_for( ss, sites, nsimd,{
auto tmp = a*x_v(ss)+b*y_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProductD(tmp,tmp));
coalescedWrite(z_v[ss],tmp);
});
nrm = real(TensorRemove(sum(inner_tmp_v,sites)));
#else
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
deviceVector<inner_t> inner_tmp;
inner_tmp.resize(sites);
@ -375,44 +365,9 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
coalescedWrite(inner_tmp_v[ss],innerProduct(tmp,tmp));
coalescedWrite(z_v[ss],tmp);
});
bool ok;
#ifdef GRID_SYCL
uint64_t csum=0;
uint64_t csum2=0;
if ( FlightRecorder::LoggingMode != FlightRecorder::LoggingModeNone)
{
// z_v
{
Integer words = sites*sizeof(vobj)/sizeof(uint64_t);
uint64_t *base= (uint64_t *)&z_v[0];
csum=svm_xor(base,words);
ok = FlightRecorder::CsumLog(csum);
if ( !ok ) {
csum2=svm_xor(base,words);
std::cerr<< " Bad z_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
}
assert(ok);
}
// inner_v
{
Integer words = sites*sizeof(inner_t)/sizeof(uint64_t);
uint64_t *base= (uint64_t *)&inner_tmp_v[0];
csum=svm_xor(base,words);
ok = FlightRecorder::CsumLog(csum);
if ( !ok ) {
csum2=svm_xor(base,words);
std::cerr<< " Bad inner_tmp_v CSUM " << std::hex<< csum << " recomputed as "<<csum2<<std::dec<<std::endl;
}
assert(ok);
}
}
#endif
nrm = real(TensorRemove(sumD(inner_tmp_v,sites)));
ok = FlightRecorder::NormLog(real(nrm));
assert(ok);
RealD local = real(nrm);
#endif
grid->GlobalSum(nrm);
FlightRecorder::ReductionLog(local,real(nrm));
return nrm;
}
@ -422,7 +377,7 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
conformable(left,right);
typedef typename vobj::vector_typeD vector_type;
std::vector<ComplexD> tmp(2);
Vector<ComplexD> tmp(2);
GridBase *grid = left.Grid();
@ -432,8 +387,8 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
// GPU
typedef decltype(innerProductD(vobj(),vobj())) inner_t;
typedef decltype(innerProductD(vobj(),vobj())) norm_t;
deviceVector<inner_t> inner_tmp(sites);
deviceVector<norm_t> norm_tmp(sites);
Vector<inner_t> inner_tmp(sites);
Vector<norm_t> norm_tmp(sites);
auto inner_tmp_v = &inner_tmp[0];
auto norm_tmp_v = &norm_tmp[0];
{
@ -483,9 +438,7 @@ inline auto sum(const LatticeTrinaryExpression<Op,T1,T2,T3> & expr)
// sliceSum, sliceInnerProduct, sliceAxpy, sliceNorm etc...
//////////////////////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
std::vector<typename vobj::scalar_object> &result,
int orthogdim)
template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
{
///////////////////////////////////////////////////////
// FIXME precision promoted summation
@ -507,8 +460,8 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
std::vector<vobj> lvSum(rd); // will locally sum vectors first
std::vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
Vector<vobj> lvSum(rd); // will locally sum vectors first
Vector<sobj> lsSum(ld,Zero()); // sum across these down to scalars
ExtractBuffer<sobj> extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node
@ -556,8 +509,6 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,
scalar_type * ptr = (scalar_type *) &result[0];
int words = fd*sizeof(sobj)/sizeof(scalar_type);
grid->GlobalSumVector(ptr, words);
// std::cout << GridLogMessage << " sliceSum local"<<t_sum<<" us, host+mpi "<<t_rest<<std::endl;
}
template<class vobj> inline
std::vector<typename vobj::scalar_object>
@ -601,8 +552,8 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
int ld=grid->_ldimensions[orthogdim];
int rd=grid->_rdimensions[orthogdim];
std::vector<vector_type> lvSum(rd); // will locally sum vectors first
std::vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
Vector<vector_type> lvSum(rd); // will locally sum vectors first
Vector<scalar_type > lsSum(ld,scalar_type(0.0)); // sum across these down to scalars
ExtractBuffer<iScalar<scalar_type> > extracted(Nsimd); // splitting the SIMD
result.resize(fd); // And then global sum to return the same vector to every node for IO to file

View File

@ -214,12 +214,22 @@ inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osi
// Move out of UVM
// Turns out I had messed up the synchronise after move to compute stream
// as running this on the default stream fools the synchronise
deviceVector<sobj> buffer(numBlocks);
#undef UVM_BLOCK_BUFFER
#ifndef UVM_BLOCK_BUFFER
commVector<sobj> buffer(numBlocks);
sobj *buffer_v = &buffer[0];
sobj result;
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
accelerator_barrier();
acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
#else
Vector<sobj> buffer(numBlocks);
sobj *buffer_v = &buffer[0];
sobj result;
reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
accelerator_barrier();
result = *buffer_v;
#endif
return result;
}
@ -234,7 +244,7 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi
const int words = sizeof(vobj)/sizeof(vector);
deviceVector<vector> buffer(osites);
Vector<vector> buffer(osites);
vector *dat = (vector *)lat;
vector *buf = &buffer[0];
iScalar<vector> *tbuf =(iScalar<vector> *) &buffer[0];

View File

@ -4,28 +4,33 @@ NAMESPACE_BEGIN(Grid);
// Possibly promote to double and sum
/////////////////////////////////////////////////////////////////////////////////////////////////////////
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_tensor(const vobj *lat, Integer osites)
{
typedef typename vobj::scalar_object sobj;
typedef typename vobj::scalar_objectD sobjD;
static Vector<sobj> mysum;
mysum.resize(1);
sobj *mysum_p = & mysum[0];
sobj identity; zeroit(identity);
sobj ret; zeroit(ret);
mysum[0] = identity;
sobj ret ;
Integer nsimd= vobj::Nsimd();
{
sycl::buffer<sobj, 1> abuff(&ret, {1});
theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = sycl::reduction(abuff,cgh,identity,std::plus<>());
cgh.parallel_for(sycl::range<1>{osites},
Reduction,
[=] (sycl::id<1> item, auto &sum) {
auto osite = item[0];
sum +=Reduce(lat[osite]);
});
});
}
const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() });
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(mysum_p,identity,std::plus<>(),PropList);
cgh.parallel_for(cl::sycl::range<1>{osites},
Reduction,
[=] (cl::sycl::id<1> item, auto &sum) {
auto osite = item[0];
sum +=Reduce(lat[osite]);
});
});
theGridAccelerator->wait();
ret = mysum[0];
// free(mysum,*theGridAccelerator);
sobjD dret; convertType(dret,ret);
return dret;
}
@ -71,22 +76,59 @@ inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osite
template<class Word> Word svm_xor(Word *vec,uint64_t L)
{
Word xorResult; xorResult = 0;
static Vector<Word> d_sum;
d_sum.resize(1);
Word *d_sum_p=&d_sum[0];
Word identity; identity=0;
Word ret = 0;
{
sycl::buffer<Word, 1> abuff(&ret, {1});
theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = sycl::reduction(abuff,cgh,identity,std::bit_xor<>());
cgh.parallel_for(sycl::range<1>{L},
Reduction,
[=] (sycl::id<1> index, auto &sum) {
sum ^=vec[index];
});
});
}
d_sum[0] = identity;
const cl::sycl::property_list PropList ({ cl::sycl::property::reduction::initialize_to_identity() });
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(d_sum_p,identity,std::bit_xor<>(),PropList);
cgh.parallel_for(cl::sycl::range<1>{L},
Reduction,
[=] (cl::sycl::id<1> index, auto &sum) {
sum^=vec[index];
});
});
theGridAccelerator->wait();
Word ret = d_sum[0];
// free(d_sum,*theGridAccelerator);
return ret;
}
NAMESPACE_END(Grid);
/*
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_repack(const vobj *lat, Integer osites)
{
typedef typename vobj::vector_type vector;
typedef typename vobj::scalar_type scalar;
typedef typename vobj::scalar_typeD scalarD;
typedef typename vobj::scalar_objectD sobjD;
sobjD ret;
scalarD *ret_p = (scalarD *)&ret;
const int nsimd = vobj::Nsimd();
const int words = sizeof(vobj)/sizeof(vector);
Vector<scalar> buffer(osites*nsimd);
scalar *buf = &buffer[0];
vector *dat = (vector *)lat;
for(int w=0;w<words;w++) {
accelerator_for(ss,osites,nsimd,{
int lane = acceleratorSIMTlane(nsimd);
buf[ss*nsimd+lane] = dat[ss*words+w].getlane(lane);
});
//Precision change at this point is to late to gain precision
ret_p[w] = svm_reduce(buf,nsimd*osites);
}
return ret;
}
*/

View File

@ -21,18 +21,9 @@ NAMESPACE_BEGIN(Grid);
#if defined(GRID_CUDA) || defined(GRID_HIP)
template<class vobj>
inline void sliceSumReduction_cub_small(const vobj *Data,
std::vector<vobj> &lvSum,
const int rd,
const int e1,
const int e2,
const int stride,
const int ostride,
const int Nsimd)
{
template<class vobj> inline void sliceSumReduction_cub_small(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
size_t subvol_size = e1*e2;
deviceVector<vobj> reduction_buffer(rd*subvol_size);
commVector<vobj> reduction_buffer(rd*subvol_size);
auto rb_p = &reduction_buffer[0];
vobj zero_init;
zeroit(zero_init);
@ -103,15 +94,7 @@ inline void sliceSumReduction_cub_small(const vobj *Data,
#if defined(GRID_SYCL)
template<class vobj>
inline void sliceSumReduction_sycl_small(const vobj *Data,
std::vector <vobj> &lvSum,
const int &rd,
const int &e1,
const int &e2,
const int &stride,
const int &ostride,
const int &Nsimd)
template<class vobj> inline void sliceSumReduction_sycl_small(const vobj *Data, Vector <vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
{
size_t subvol_size = e1*e2;
@ -122,7 +105,7 @@ inline void sliceSumReduction_sycl_small(const vobj *Data,
mysum[r] = vobj_zero;
}
deviceVector<vobj> reduction_buffer(rd*subvol_size);
commVector<vobj> reduction_buffer(rd*subvol_size);
auto rb_p = &reduction_buffer[0];
@ -141,11 +124,11 @@ inline void sliceSumReduction_sycl_small(const vobj *Data,
});
for (int r = 0; r < rd; r++) {
theGridAccelerator->submit([&](sycl::handler &cgh) {
auto Reduction = sycl::reduction(&mysum[r],std::plus<>());
cgh.parallel_for(sycl::range<1>{subvol_size},
theGridAccelerator->submit([&](cl::sycl::handler &cgh) {
auto Reduction = cl::sycl::reduction(&mysum[r],std::plus<>());
cgh.parallel_for(cl::sycl::range<1>{subvol_size},
Reduction,
[=](sycl::id<1> item, auto &sum) {
[=](cl::sycl::id<1> item, auto &sum) {
auto s = item[0];
sum += rb_p[r*subvol_size+s];
});
@ -161,23 +144,14 @@ inline void sliceSumReduction_sycl_small(const vobj *Data,
}
#endif
template<class vobj>
inline void sliceSumReduction_large(const vobj *Data,
std::vector<vobj> &lvSum,
const int rd,
const int e1,
const int e2,
const int stride,
const int ostride,
const int Nsimd)
{
template<class vobj> inline void sliceSumReduction_large(const vobj *Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd) {
typedef typename vobj::vector_type vector;
const int words = sizeof(vobj)/sizeof(vector);
const int osites = rd*e1*e2;
deviceVector<vector>buffer(osites);
commVector<vector>buffer(osites);
vector *dat = (vector *)Data;
vector *buf = &buffer[0];
std::vector<vector> lvSum_small(rd);
Vector<vector> lvSum_small(rd);
vector *lvSum_ptr = (vector *)&lvSum[0];
for (int w = 0; w < words; w++) {
@ -194,18 +168,13 @@ inline void sliceSumReduction_large(const vobj *Data,
for (int r = 0; r < rd; r++) {
lvSum_ptr[w+words*r]=lvSum_small[r];
}
}
}
template<class vobj>
inline void sliceSumReduction_gpu(const Lattice<vobj> &Data,
std::vector<vobj> &lvSum,
const int rd,
const int e1,
const int e2,
const int stride,
const int ostride,
const int Nsimd)
template<class vobj> inline void sliceSumReduction_gpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
{
autoView(Data_v, Data, AcceleratorRead); //reduction libraries cannot deal with large vobjs so we split into small/large case.
if constexpr (sizeof(vobj) <= 256) {
@ -223,15 +192,7 @@ inline void sliceSumReduction_gpu(const Lattice<vobj> &Data,
}
template<class vobj>
inline void sliceSumReduction_cpu(const Lattice<vobj> &Data,
std::vector<vobj> &lvSum,
const int &rd,
const int &e1,
const int &e2,
const int &stride,
const int &ostride,
const int &Nsimd)
template<class vobj> inline void sliceSumReduction_cpu(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
{
// sum over reduced dimension planes, breaking out orthog dir
// Parallel over orthog direction
@ -247,20 +208,16 @@ inline void sliceSumReduction_cpu(const Lattice<vobj> &Data,
});
}
template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data,
std::vector<vobj> &lvSum,
const int &rd,
const int &e1,
const int &e2,
const int &stride,
const int &ostride,
const int &Nsimd)
template<class vobj> inline void sliceSumReduction(const Lattice<vobj> &Data, Vector<vobj> &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
{
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
sliceSumReduction_gpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
#else
#else
sliceSumReduction_cpu(Data, lvSum, rd, e1, e2, stride, ostride, Nsimd);
#endif
#endif
}

View File

@ -54,7 +54,7 @@ struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::ve
*
*/
template<class vobj> inline void ScatterSlice(const deviceVector<vobj> &buf,
template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf,
Lattice<vobj> &lat,
int x,
int dim,
@ -140,7 +140,7 @@ template<class vobj> inline void ScatterSlice(const deviceVector<vobj> &buf,
});
}
template<class vobj> inline void GatherSlice(deviceVector<vobj> &buf,
template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf,
const Lattice<vobj> &lat,
int x,
int dim,
@ -462,13 +462,13 @@ public:
int rNsimd = Nsimd / simd[dimension];
assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);
static deviceVector<vobj> send_buf;
static deviceVector<vobj> recv_buf;
static cshiftVector<vobj> send_buf;
static cshiftVector<vobj> recv_buf;
send_buf.resize(buffer_size*2*depth);
recv_buf.resize(buffer_size*2*depth);
std::vector<MpiCommsRequest_t> fwd_req;
std::vector<MpiCommsRequest_t> bwd_req;
std::vector<CommsRequest_t> fwd_req;
std::vector<CommsRequest_t> bwd_req;
int words = buffer_size;
int bytes = words * sizeof(vobj);

View File

@ -98,7 +98,7 @@ public:
virtual RealD S(const GaugeField& U) = 0; // evaluate the action
virtual RealD Sinitial(const GaugeField& U) { return this->S(U); } ; // if the refresh computes the action, can cache it. Alternately refreshAndAction() ?
virtual void deriv(const GaugeField& U, GaugeField& dSdU) = 0; // evaluate the action derivative
/////////////////////////////////////////////////////////////
// virtual smeared interface through configuration container
/////////////////////////////////////////////////////////////
@ -132,10 +132,6 @@ public:
template <class GaugeField >
class EmptyAction : public Action <GaugeField>
{
using Action<GaugeField>::refresh;
using Action<GaugeField>::Sinitial;
using Action<GaugeField>::deriv;
virtual void refresh(const GaugeField& U, GridSerialRNG &sRNG, GridParallelRNG& pRNG) { assert(0);}; // refresh pseudofermions
virtual RealD S(const GaugeField& U) { return 0.0;}; // evaluate the action
virtual void deriv(const GaugeField& U, GaugeField& dSdU) { assert(0); }; // evaluate the action derivative

View File

@ -55,11 +55,6 @@ public:
RealD alpha; // Mobius scale
RealD k; // EOFA normalization constant
// Device resident
deviceVector<Coeff_t> d_shift_coefficients;
deviceVector<Coeff_t> d_MooeeInv_shift_lc;
deviceVector<Coeff_t> d_MooeeInv_shift_norm;
virtual void Instantiatable(void) = 0;
// EOFA-specific operations
@ -97,11 +92,6 @@ public:
this->k = this->alpha * (_mq3-_mq2) * std::pow(this->alpha+1.0,2*Ls) /
( std::pow(this->alpha+1.0,Ls) + _mq2*std::pow(this->alpha-1.0,Ls) ) /
( std::pow(this->alpha+1.0,Ls) + _mq3*std::pow(this->alpha-1.0,Ls) );
d_shift_coefficients.resize(Ls);
d_MooeeInv_shift_lc.resize(Ls);
d_MooeeInv_shift_norm.resize(Ls);
};
};

View File

@ -90,16 +90,16 @@ public:
void M5D(const FermionField &psi,
const FermionField &phi,
FermionField &chi,
std::vector<Coeff_t> &lower,
std::vector<Coeff_t> &diag,
std::vector<Coeff_t> &upper);
Vector<Coeff_t> &lower,
Vector<Coeff_t> &diag,
Vector<Coeff_t> &upper);
void M5Ddag(const FermionField &psi,
const FermionField &phi,
FermionField &chi,
std::vector<Coeff_t> &lower,
std::vector<Coeff_t> &diag,
std::vector<Coeff_t> &upper);
Vector<Coeff_t> &lower,
Vector<Coeff_t> &diag,
Vector<Coeff_t> &upper);
virtual void Instantiatable(void)=0;
@ -119,51 +119,35 @@ public:
RealD mass_plus, mass_minus;
// Save arguments to SetCoefficientsInternal
std::vector<Coeff_t> _gamma;
Vector<Coeff_t> _gamma;
RealD _zolo_hi;
RealD _b;
RealD _c;
// possible boost
std::vector<ComplexD> qmu;
void set_qmu(std::vector<ComplexD> _qmu) { qmu=_qmu; assert(qmu.size()==Nd);};
void addQmu(const FermionField &in, FermionField &out, int dag);
// Cayley form Moebius (tanh and zolotarev)
std::vector<Coeff_t> omega;
std::vector<Coeff_t> bs; // S dependent coeffs
std::vector<Coeff_t> cs;
std::vector<Coeff_t> as;
Vector<Coeff_t> omega;
Vector<Coeff_t> bs; // S dependent coeffs
Vector<Coeff_t> cs;
Vector<Coeff_t> as;
// For preconditioning Cayley form
std::vector<Coeff_t> bee;
std::vector<Coeff_t> cee;
std::vector<Coeff_t> aee;
std::vector<Coeff_t> beo;
std::vector<Coeff_t> ceo;
std::vector<Coeff_t> aeo;
Vector<Coeff_t> bee;
Vector<Coeff_t> cee;
Vector<Coeff_t> aee;
Vector<Coeff_t> beo;
Vector<Coeff_t> ceo;
Vector<Coeff_t> aeo;
// LDU factorisation of the eeoo matrix
std::vector<Coeff_t> lee;
std::vector<Coeff_t> leem;
std::vector<Coeff_t> uee;
std::vector<Coeff_t> ueem;
std::vector<Coeff_t> dee;
// Device memory
deviceVector<Coeff_t> d_diag;
deviceVector<Coeff_t> d_upper;
deviceVector<Coeff_t> d_lower;
deviceVector<Coeff_t> d_lee;
deviceVector<Coeff_t> d_dee;
deviceVector<Coeff_t> d_uee;
deviceVector<Coeff_t> d_leem;
deviceVector<Coeff_t> d_ueem;
Vector<Coeff_t> lee;
Vector<Coeff_t> leem;
Vector<Coeff_t> uee;
Vector<Coeff_t> ueem;
Vector<Coeff_t> dee;
// Matrices of 5d ee inverse params
// std::vector<iSinglet<Simd> > MatpInv;
// std::vector<iSinglet<Simd> > MatmInv;
// std::vector<iSinglet<Simd> > MatpInvDag;
// std::vector<iSinglet<Simd> > MatmInvDag;
Vector<iSinglet<Simd> > MatpInv;
Vector<iSinglet<Simd> > MatmInv;
Vector<iSinglet<Simd> > MatpInvDag;
Vector<iSinglet<Simd> > MatmInvDag;
///////////////////////////////////////////////////////////////
// Conserved current utilities
@ -203,7 +187,7 @@ public:
protected:
virtual void SetCoefficientsZolotarev(RealD zolohi,Approx::zolotarev_data *zdata,RealD b,RealD c);
virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c);
virtual void SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c);
virtual void SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t> & gamma,RealD b,RealD c);
};
NAMESPACE_END(Grid);

View File

@ -60,50 +60,6 @@ public:
// virtual void Instantiatable(void)=0;
virtual void Instantiatable(void) =0;
void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist)
{
std::cout << "Free Propagator for PartialFraction"<<std::endl;
FermionField in_k(in.Grid());
FermionField prop_k(in.Grid());
FFT theFFT((GridCartesian *) in.Grid());
//phase for boundary condition
ComplexField coor(in.Grid());
ComplexField ph(in.Grid()); ph = Zero();
FermionField in_buf(in.Grid()); in_buf = Zero();
typedef typename Simd::scalar_type Scalar;
Scalar ci(0.0,1.0);
assert(twist.size() == Nd);//check that twist is Nd
assert(boundary.size() == Nd);//check that boundary conditions is Nd
int shift = 0;
for(unsigned int nu = 0; nu < Nd; nu++)
{
// Shift coordinate lattice index by 1 to account for 5th dimension.
LatticeCoordinate(coor, nu + shift);
double boundary_phase = ::acos(real(boundary[nu]));
ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
//momenta for propagator shifted by twist+boundary
twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
}
in_buf = exp(ci*ph*(-1.0))*in;
theFFT.FFT_all_dim(in_k,in,FFT::forward);
this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist);
theFFT.FFT_all_dim(out,prop_k,FFT::backward);
//phase for boundary condition
out = out * exp(ci*ph);
};
virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions
std::vector<Complex> boundary;
for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions
FreePropagator(in,out,mass,boundary,twist);
};
// Efficient support for multigrid coarsening
virtual void Mdir (const FermionField &in, FermionField &out,int dir,int disp);
virtual void MdirAll(const FermionField &in, std::vector<FermionField> &out);
@ -134,12 +90,12 @@ protected:
RealD mass;
RealD R;
RealD ZoloHiInv;
std::vector<double> Beta;
std::vector<double> cc;;
std::vector<double> cc_d;;
std::vector<double> sqrt_cc;
std::vector<double> See;
std::vector<double> Aee;
Vector<double> Beta;
Vector<double> cc;;
Vector<double> cc_d;;
Vector<double> sqrt_cc;
Vector<double> See;
Vector<double> Aee;
};

View File

@ -69,10 +69,10 @@ public:
// Instantiate different versions depending on Impl
/////////////////////////////////////////////////////
void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
virtual void RefreshShiftCoefficients(RealD new_shift);
@ -83,7 +83,7 @@ public:
RealD _M5, const ImplParams& p=ImplParams());
protected:
void SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c);
void SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c);
};
NAMESPACE_END(Grid);

View File

@ -102,11 +102,11 @@ public:
GaugeField &mat,
const FermionField &A, const FermionField &B, int dag);
void DhopInternal(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag);
void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag);
void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U,DoubledGaugeField &UUU,
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag);
//////////////////////////////////////////////////////////////////////////
@ -164,6 +164,8 @@ public:
DoubledGaugeField UUUmuEven;
DoubledGaugeField UUUmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
///////////////////////////////////////////////////////////////
// Conserved current utilities

View File

@ -100,6 +100,7 @@ public:
int dag);
void DhopInternal(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U,
DoubledGaugeField &UUU,
const FermionField &in,
@ -107,6 +108,7 @@ public:
int dag);
void DhopInternalOverlappedComms(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U,
DoubledGaugeField &UUU,
const FermionField &in,
@ -114,6 +116,7 @@ public:
int dag);
void DhopInternalSerialComms(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U,
DoubledGaugeField &UUU,
const FermionField &in,
@ -189,6 +192,8 @@ public:
DoubledGaugeField UUUmuEven;
DoubledGaugeField UUUmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
// Comms buffer
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;

View File

@ -42,11 +42,11 @@ public:
public:
// Shift operator coefficients for red-black preconditioned Mobius EOFA
std::vector<Coeff_t> Mooee_shift;
std::vector<Coeff_t> MooeeInv_shift_lc;
std::vector<Coeff_t> MooeeInv_shift_norm;
std::vector<Coeff_t> MooeeInvDag_shift_lc;
std::vector<Coeff_t> MooeeInvDag_shift_norm;
Vector<Coeff_t> Mooee_shift;
Vector<Coeff_t> MooeeInv_shift_lc;
Vector<Coeff_t> MooeeInv_shift_norm;
Vector<Coeff_t> MooeeInvDag_shift_lc;
Vector<Coeff_t> MooeeInvDag_shift_norm;
virtual void Instantiatable(void) {};
@ -74,18 +74,18 @@ public:
// Instantiate different versions depending on Impl
/////////////////////////////////////////////////////
void M5D(const FermionField& psi, const FermionField& phi, FermionField& chi,
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
void M5D_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
std::vector<Coeff_t>& shift_coeffs);
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
Vector<Coeff_t>& shift_coeffs);
void M5Ddag(const FermionField& psi, const FermionField& phi, FermionField& chi,
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper);
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper);
void M5Ddag_shift(const FermionField& psi, const FermionField& phi, FermionField& chi,
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper,
std::vector<Coeff_t>& shift_coeffs);
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper,
Vector<Coeff_t>& shift_coeffs);
virtual void RefreshShiftCoefficients(RealD new_shift);

View File

@ -102,11 +102,11 @@ public:
GaugeField &mat,
const FermionField &A, const FermionField &B, int dag);
void DhopInternal(StencilImpl &st, DoubledGaugeField &U,
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
void DhopInternalSerialComms(StencilImpl &st, DoubledGaugeField &U,
void DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
void DhopInternalOverlappedComms(StencilImpl &st, DoubledGaugeField &U,
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
//////////////////////////////////////////////////////////////////////////
@ -152,6 +152,9 @@ public:
DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
///////////////////////////////////////////////////////////////
// Conserved current utilities
///////////////////////////////////////////////////////////////

View File

@ -42,7 +42,7 @@ public:
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
this->MomentumSpacePropagatorHw(out,in,_m,twist);
};
};
// Constructors
OverlapWilsonCayleyTanhFermion(GaugeField &_Umu,

View File

@ -41,10 +41,6 @@ public:
public:
// Constructors
virtual void Instantiatable(void){};
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
this->MomentumSpacePropagatorHw(out,in,_m,twist);
};
OverlapWilsonCayleyZolotarevFermion(GaugeField &_Umu,
GridCartesian &FiveDimGrid,

View File

@ -41,9 +41,6 @@ public:
public:
virtual void Instantiatable(void){};
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
this->MomentumSpacePropagatorHw(out,in,_m,twist);
};
// Constructors
OverlapWilsonContFracTanhFermion(GaugeField &_Umu,
GridCartesian &FiveDimGrid,

View File

@ -40,9 +40,6 @@ public:
INHERIT_IMPL_TYPES(Impl);
virtual void Instantiatable(void){};
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
this->MomentumSpacePropagatorHw(out,in,_m,twist);
};
// Constructors
OverlapWilsonContFracZolotarevFermion(GaugeField &_Umu,
GridCartesian &FiveDimGrid,

View File

@ -41,9 +41,6 @@ public:
public:
virtual void Instantiatable(void){};
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
this->MomentumSpacePropagatorHw(out,in,_m,twist);
};
// Constructors
OverlapWilsonPartialFractionTanhFermion(GaugeField &_Umu,
GridCartesian &FiveDimGrid,

View File

@ -40,11 +40,6 @@ public:
INHERIT_IMPL_TYPES(Impl);
virtual void Instantiatable(void){};
void MomentumSpacePropagator(FermionField &out,const FermionField &in,RealD _m,std::vector<double> twist) {
this->MomentumSpacePropagatorHw(out,in,_m,twist);
};
// Constructors
OverlapWilsonPartialFractionZolotarevFermion(GaugeField &_Umu,
GridCartesian &FiveDimGrid,

View File

@ -39,7 +39,7 @@ class PartialFractionFermion5D : public WilsonFermion5D<Impl>
public:
INHERIT_IMPL_TYPES(Impl);
const int part_frac_chroma_convention=0;
const int part_frac_chroma_convention=1;
void Meooe_internal(const FermionField &in, FermionField &out,int dag);
void Mooee_internal(const FermionField &in, FermionField &out,int dag);
@ -83,78 +83,19 @@ public:
GridRedBlackCartesian &FourDimRedBlackGrid,
RealD _mass,RealD M5,const ImplParams &p= ImplParams());
PartialFractionFermion5D(GaugeField &_Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
RealD _mass,RealD M5,std::vector<RealD> &_qmu,const ImplParams &p= ImplParams());
void FreePropagator(const FermionField &in,FermionField &out,RealD mass,std::vector<Complex> boundary, std::vector<double> twist)
{
std::cout << "Free Propagator for PartialFraction"<<std::endl;
FermionField in_k(in.Grid());
FermionField prop_k(in.Grid());
FFT theFFT((GridCartesian *) in.Grid());
//phase for boundary condition
ComplexField coor(in.Grid());
ComplexField ph(in.Grid()); ph = Zero();
FermionField in_buf(in.Grid()); in_buf = Zero();
typedef typename Simd::scalar_type Scalar;
Scalar ci(0.0,1.0);
assert(twist.size() == Nd);//check that twist is Nd
assert(boundary.size() == Nd);//check that boundary conditions is Nd
int shift = 0;
for(unsigned int nu = 0; nu < Nd; nu++)
{
// Shift coordinate lattice index by 1 to account for 5th dimension.
LatticeCoordinate(coor, nu + shift);
double boundary_phase = ::acos(real(boundary[nu]));
ph = ph + boundary_phase*coor*((1./(in.Grid()->_fdimensions[nu+shift])));
//momenta for propagator shifted by twist+boundary
twist[nu] = twist[nu] + boundary_phase/((2.0*M_PI));
}
in_buf = exp(ci*ph*(-1.0))*in;
theFFT.FFT_all_dim(in_k,in,FFT::forward);
if ( this->qmu.size() ){
this->MomentumSpacePropagatorHwQ(prop_k,in_k,mass,twist,this->qmu);
} else {
this->MomentumSpacePropagatorHw(prop_k,in_k,mass,twist);
}
theFFT.FFT_all_dim(out,prop_k,FFT::backward);
//phase for boundary condition
out = out * exp(ci*ph);
};
virtual void FreePropagator(const FermionField &in,FermionField &out,RealD mass) {
std::vector<double> twist(Nd,0.0); //default: periodic boundarys in all directions
std::vector<Complex> boundary;
for(int i=0;i<Nd;i++) boundary.push_back(1);//default: periodic boundary conditions
FreePropagator(in,out,mass,boundary,twist);
};
void set_qmu(std::vector<RealD> _qmu) { qmu=_qmu; assert(qmu.size()==Nd);};
void addQmu(const FermionField &in, FermionField &out, int dag);
protected:
virtual void SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD scale);
virtual void SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata);
std::vector<RealD> qmu;
// Part frac
RealD mass;
RealD dw_diag;
RealD R;
RealD amax;
RealD scale;
std::vector<double> p;
std::vector<double> q;
Vector<double> p;
Vector<double> q;
};

View File

@ -35,7 +35,7 @@ template<class Matrix, class Field>
class KappaSimilarityTransform {
public:
INHERIT_IMPL_TYPES(Matrix);
std::vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag;
Vector<Coeff_t> kappa, kappaDag, kappaInv, kappaInvDag;
KappaSimilarityTransform (Matrix &zmob) {
for (int i=0;i<(int)zmob.bs.size();i++) {

View File

@ -49,10 +49,10 @@ template<class Impl> class StaggeredKernels : public FermionOperator<Impl> , pub
public:
void DhopImproved(StencilImpl &st,
void DhopImproved(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag, int interior,int exterior);
void DhopNaive(StencilImpl &st,
void DhopNaive(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag, int interior,int exterior);

View File

@ -47,7 +47,7 @@ public:
static int PartialCompressionFactor(GridBase *grid) { return 1;}
#endif
template<class vobj,class cobj,class compressor>
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs,
cobj *buffer,
compressor &compress,
@ -109,7 +109,7 @@ public:
// Reorder the fifth dim to be s=Ls-1 , s=0, s=1,...,Ls-2.
////////////////////////////////////////////////////////////////////////////////////////////
template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial)
{
@ -197,7 +197,7 @@ public:
#endif
template<class vobj,class cobj,class compressor>
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs,
cobj *buffer,
compressor &compress,
@ -208,7 +208,7 @@ public:
else FaceGatherSimple::Gather_plane_simple(table,rhs,buffer,compress,off,so,partial);
}
template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial)
{
@ -402,6 +402,7 @@ public:
typedef CartesianStencil<vobj,cobj,Parameters> Base;
typedef typename Base::View_type View_type;
typedef typename Base::StencilVector StencilVector;
// Vector<int> surface_list;
WilsonStencil(GridBase *grid,
@ -414,6 +415,29 @@ public:
// surface_list.resize(0);
this->same_node.resize(npoints);
};
/*
void BuildSurfaceList(int Ls,int vol4){
// find same node for SHM
// Here we know the distance is 1 for WilsonStencil
for(int point=0;point<this->_npoints;point++){
this->same_node[point] = this->SameNode(point);
}
for(int site = 0 ;site< vol4;site++){
int local = 1;
for(int point=0;point<this->_npoints;point++){
if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){
local = 0;
}
}
if(local == 0) {
surface_list.push_back(site);
}
}
}
*/
template < class compressor>
void HaloExchangeOpt(const Lattice<vobj> &source,compressor &compress)

View File

@ -126,17 +126,14 @@ public:
void DerivInternal(StencilImpl &st, DoubledGaugeField &U, GaugeField &mat,
const FermionField &A, const FermionField &B, int dag);
void DhopInternal(StencilImpl &st,
DoubledGaugeField &U,
void DhopInternal(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
void DhopInternalSerial(StencilImpl &st,
DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
void DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
void DhopInternalOverlappedComms(StencilImpl &st,
DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
void DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag);
// Constructor
WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
@ -171,6 +168,9 @@ public:
DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
WilsonAnisotropyCoefficients anisotropyCoeff;
///////////////////////////////////////////////////////////////

View File

@ -109,8 +109,6 @@ public:
void MomentumSpacePropagatorHt_5d(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
void MomentumSpacePropagatorHt(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
void MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist) ;
void MomentumSpacePropagatorHwQ(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist,
std::vector<double> qmu) ;
// Implement hopping term non-hermitian hopping term; half cb or both
// Implement s-diagonal DW
@ -119,9 +117,6 @@ public:
void DhopOE(const FermionField &in, FermionField &out,int dag);
void DhopEO(const FermionField &in, FermionField &out,int dag);
void DhopComms (const FermionField &in, FermionField &out);
void DhopCalc (const FermionField &in, FermionField &out,uint64_t *ids);
// add a DhopComm
// -- suboptimal interface will presently trigger multiple comms.
void DhopDir(const FermionField &in, FermionField &out,int dir,int disp);
@ -140,18 +135,21 @@ public:
int dag);
void DhopInternal(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out,
int dag);
void DhopInternalOverlappedComms(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out,
int dag);
void DhopInternalSerialComms(StencilImpl & st,
LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out,
@ -205,6 +203,9 @@ public:
DoubledGaugeField UmuEven;
DoubledGaugeField UmuOdd;
LebesgueOrder Lebesgue;
LebesgueOrder LebesgueEvenOdd;
// Comms buffer
// std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > comm_buf;

View File

@ -57,10 +57,6 @@ public:
int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior=1,int exterior=1) ;
static void DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out,
uint64_t *ids);
static void DhopDagKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out,
int interior=1,int exterior=1) ;

View File

@ -58,7 +58,7 @@ public:
{
// RealD eps = 1.0;
std::cout<<GridLogMessage << "ZMobiusFermion (b="<<b<<",c="<<c<<") with Ls= "<<this->Ls<<" gamma passed in"<<std::endl;
std::vector<Coeff_t> zgamma(this->Ls);
Vector<Coeff_t> zgamma(this->Ls);
for(int s=0;s<this->Ls;s++){
zgamma[s] = gamma[s];
}

View File

@ -48,8 +48,7 @@ CayleyFermion5D<Impl>::CayleyFermion5D(GaugeField &_Umu,
FourDimGrid,
FourDimRedBlackGrid,_M5,p),
mass_plus(_mass), mass_minus(_mass)
{
// qmu defaults to zero size;
{
}
///////////////////////////////////////////////////////////////
@ -157,18 +156,18 @@ template<class Impl>
void CayleyFermion5D<Impl>::M5D (const FermionField &psi, FermionField &chi)
{
int Ls=this->Ls;
std::vector<Coeff_t> diag (Ls,1.0);
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus;
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] =mass_plus;
Vector<Coeff_t> diag (Ls,1.0);
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1]=mass_minus;
Vector<Coeff_t> lower(Ls,-1.0); lower[0] =mass_plus;
M5D(psi,chi,chi,lower,diag,upper);
}
template<class Impl>
void CayleyFermion5D<Impl>::Meooe5D (const FermionField &psi, FermionField &Din)
{
int Ls=this->Ls;
std::vector<Coeff_t> diag = bs;
std::vector<Coeff_t> upper= cs;
std::vector<Coeff_t> lower= cs;
Vector<Coeff_t> diag = bs;
Vector<Coeff_t> upper= cs;
Vector<Coeff_t> lower= cs;
upper[Ls-1]=-mass_minus*upper[Ls-1];
lower[0] =-mass_plus*lower[0];
M5D(psi,psi,Din,lower,diag,upper);
@ -177,9 +176,9 @@ void CayleyFermion5D<Impl>::Meooe5D (const FermionField &psi, FermionField &D
template<class Impl> void CayleyFermion5D<Impl>::Meo5D (const FermionField &psi, FermionField &chi)
{
int Ls=this->Ls;
std::vector<Coeff_t> diag = beo;
std::vector<Coeff_t> upper(Ls);
std::vector<Coeff_t> lower(Ls);
Vector<Coeff_t> diag = beo;
Vector<Coeff_t> upper(Ls);
Vector<Coeff_t> lower(Ls);
for(int i=0;i<Ls;i++) {
upper[i]=-ceo[i];
lower[i]=-ceo[i];
@ -192,9 +191,9 @@ template<class Impl>
void CayleyFermion5D<Impl>::Mooee (const FermionField &psi, FermionField &chi)
{
int Ls=this->Ls;
std::vector<Coeff_t> diag = bee;
std::vector<Coeff_t> upper(Ls);
std::vector<Coeff_t> lower(Ls);
Vector<Coeff_t> diag = bee;
Vector<Coeff_t> upper(Ls);
Vector<Coeff_t> lower(Ls);
for(int i=0;i<Ls;i++) {
upper[i]=-cee[i];
lower[i]=-cee[i];
@ -207,9 +206,9 @@ template<class Impl>
void CayleyFermion5D<Impl>::MooeeDag (const FermionField &psi, FermionField &chi)
{
int Ls=this->Ls;
std::vector<Coeff_t> diag = bee;
std::vector<Coeff_t> upper(Ls);
std::vector<Coeff_t> lower(Ls);
Vector<Coeff_t> diag = bee;
Vector<Coeff_t> upper(Ls);
Vector<Coeff_t> lower(Ls);
for (int s=0;s<Ls;s++){
// Assemble the 5d matrix
@ -237,9 +236,9 @@ template<class Impl>
void CayleyFermion5D<Impl>::M5Ddag (const FermionField &psi, FermionField &chi)
{
int Ls=this->Ls;
std::vector<Coeff_t> diag(Ls,1.0);
std::vector<Coeff_t> upper(Ls,-1.0);
std::vector<Coeff_t> lower(Ls,-1.0);
Vector<Coeff_t> diag(Ls,1.0);
Vector<Coeff_t> upper(Ls,-1.0);
Vector<Coeff_t> lower(Ls,-1.0);
upper[Ls-1]=-mass_plus*upper[Ls-1];
lower[0] =-mass_minus*lower[0];
M5Ddag(psi,chi,chi,lower,diag,upper);
@ -249,9 +248,9 @@ template<class Impl>
void CayleyFermion5D<Impl>::MeooeDag5D (const FermionField &psi, FermionField &Din)
{
int Ls=this->Ls;
std::vector<Coeff_t> diag =bs;
std::vector<Coeff_t> upper=cs;
std::vector<Coeff_t> lower=cs;
Vector<Coeff_t> diag =bs;
Vector<Coeff_t> upper=cs;
Vector<Coeff_t> lower=cs;
for (int s=0;s<Ls;s++){
if ( s== 0 ) {
@ -271,34 +270,6 @@ void CayleyFermion5D<Impl>::MeooeDag5D (const FermionField &psi, FermionField
M5Ddag(psi,psi,Din,lower,diag,upper);
}
template<class Impl>
void CayleyFermion5D<Impl>::addQmu(const FermionField &psi,FermionField &chi, int dag)
{
if ( qmu.size() ) {
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
std::vector<ComplexD> coeff(Nd);
ComplexD ci(0,1);
assert(qmu.size()==Nd);
for(int mu=0;mu<Nd;mu++){
coeff[mu] = ci*qmu[mu];
if ( dag ) coeff[mu] = conjugate(coeff[mu]);
}
chi = chi + Gamma(Gmu[0])*psi*coeff[0];
for(int mu=1;mu<Nd;mu++){
chi = chi + Gamma(Gmu[mu])*psi*coeff[mu];
}
}
}
template<class Impl>
void CayleyFermion5D<Impl>::M (const FermionField &psi, FermionField &chi)
{
@ -306,12 +277,8 @@ void CayleyFermion5D<Impl>::M (const FermionField &psi, FermionField &chi)
// Assemble Din
Meooe5D(psi,Din);
this->DW(Din,chi,DaggerNo);
// add i q_mu gamma_mu here
addQmu(Din,chi,DaggerNo);
this->DW(Din,chi,DaggerNo);
// ((b D_W + D_w hop terms +1) on s-diag
axpby(chi,1.0,1.0,chi,psi);
@ -328,9 +295,6 @@ void CayleyFermion5D<Impl>::Mdag (const FermionField &psi, FermionField &chi)
FermionField Din(psi.Grid());
// Apply Dw
this->DW(psi,Din,DaggerYes);
// add -i conj(q_mu) gamma_mu here ... if qmu is real, gammm_5 hermitian, otherwise not.
addQmu(psi,Din,DaggerYes);
MeooeDag5D(Din,chi);
@ -430,7 +394,7 @@ void CayleyFermion5D<Impl>::MeoDeriv(GaugeField &mat,const FermionField &U,const
template<class Impl>
void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,RealD b,RealD c)
{
std::vector<Coeff_t> gamma(this->Ls);
Vector<Coeff_t> gamma(this->Ls);
for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
SetCoefficientsInternal(1.0,gamma,b,c);
}
@ -438,13 +402,13 @@ void CayleyFermion5D<Impl>::SetCoefficientsTanh(Approx::zolotarev_data *zdata,Re
template<class Impl>
void CayleyFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata,RealD b,RealD c)
{
std::vector<Coeff_t> gamma(this->Ls);
Vector<Coeff_t> gamma(this->Ls);
for(int s=0;s<this->Ls;s++) gamma[s] = zdata->gamma[s];
SetCoefficientsInternal(zolo_hi,gamma,b,c);
}
//Zolo
template<class Impl>
void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Coeff_t> & gamma,RealD b,RealD c)
void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,Vector<Coeff_t> & gamma,RealD b,RealD c)
{
int Ls=this->Ls;
@ -524,7 +488,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
leem.resize(Ls);
uee.resize(Ls);
ueem.resize(Ls);
for(int i=0;i<Ls;i++){
dee[i] = bee[i];
@ -565,18 +529,6 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
dee[Ls-1] += delta_d;
}
//////////////////////////////////////////
// Device buffers
//////////////////////////////////////////
d_diag.resize(Ls);
d_upper.resize(Ls);
d_lower.resize(Ls);
d_dee.resize(Ls);
d_lee.resize(Ls);
d_uee.resize(Ls);
d_leem.resize(Ls);
d_ueem.resize(Ls);
// int inv=1;
// this->MooeeInternalCompute(0,inv,MatpInv,MatmInv);
// this->MooeeInternalCompute(1,inv,MatpInvDag,MatmInvDag);

View File

@ -43,9 +43,9 @@ void
CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
const FermionField &phi_i,
FermionField &chi_i,
std::vector<Coeff_t> &lower,
std::vector<Coeff_t> &diag,
std::vector<Coeff_t> &upper)
Vector<Coeff_t> &lower,
Vector<Coeff_t> &diag,
Vector<Coeff_t> &upper)
{
chi_i.Checkerboard()=psi_i.Checkerboard();
@ -55,15 +55,11 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
autoView(chi , chi_i,AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard());
int Ls =this->Ls;
auto pdiag = &diag[0];
auto pupper = &upper[0];
auto plower = &lower[0];
acceleratorCopyToDevice(&diag[0] ,&this->d_diag[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&upper[0],&this->d_upper[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&lower[0],&this->d_lower[0],Ls*sizeof(Coeff_t));
auto pdiag = &d_diag[0];
auto pupper = &d_upper[0];
auto plower = &d_lower[0];
int Ls =this->Ls;
// 10 = 3 complex mult + 2 complex add
// Flops = 10.0*(Nc*Ns) *Ls*vol (/2 for red black counting)
@ -86,9 +82,9 @@ void
CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
const FermionField &phi_i,
FermionField &chi_i,
std::vector<Coeff_t> &lower,
std::vector<Coeff_t> &diag,
std::vector<Coeff_t> &upper)
Vector<Coeff_t> &lower,
Vector<Coeff_t> &diag,
Vector<Coeff_t> &upper)
{
chi_i.Checkerboard()=psi_i.Checkerboard();
GridBase *grid=psi_i.Grid();
@ -97,15 +93,11 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
autoView(chi , chi_i,AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard());
int Ls=this->Ls;
auto pdiag = &diag[0];
auto pupper = &upper[0];
auto plower = &lower[0];
acceleratorCopyToDevice(&diag[0] ,&this->d_diag[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&upper[0],&this->d_upper[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&lower[0],&this->d_lower[0],Ls*sizeof(Coeff_t));
auto pdiag = &d_diag[0];
auto pupper = &d_upper[0];
auto plower = &d_lower[0];
int Ls=this->Ls;
// Flops = 6.0*(Nc*Ns) *Ls*vol
uint64_t nloop = grid->oSites();
@ -134,17 +126,11 @@ CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi_i, FermionField &chi
int Ls=this->Ls;
acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
auto plee = & d_lee [0];
auto pdee = & d_dee [0];
auto puee = & d_uee [0];
auto pleem = & d_leem[0];
auto pueem = & d_ueem[0];
auto plee = & lee [0];
auto pdee = & dee [0];
auto puee = & uee [0];
auto pleem = & leem[0];
auto pueem = & ueem[0];
uint64_t nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -196,17 +182,11 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
autoView(psi , psi_i,AcceleratorRead);
autoView(chi , chi_i,AcceleratorWrite);
acceleratorCopyToDevice(&lee[0],&d_lee[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&dee[0],&d_dee[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&uee[0],&d_uee[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&leem[0],&d_leem[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&ueem[0],&d_ueem[0],Ls*sizeof(Coeff_t));
auto plee = & d_lee [0];
auto pdee = & d_dee [0];
auto puee = & d_uee [0];
auto pleem = & d_leem[0];
auto pueem = & d_ueem[0];
auto plee = & lee [0];
auto pdee = & dee [0];
auto puee = & uee [0];
auto pleem = & leem[0];
auto pueem = & ueem[0];
assert(psi.Checkerboard() == psi.Checkerboard());

View File

@ -1,5 +1,3 @@
#if 0
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -820,5 +818,3 @@ CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,
}
NAMESPACE_END(Grid);
#endif

View File

@ -42,13 +42,13 @@ template<class Impl>
void ContinuedFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,Approx::zolotarev_data *zdata)
{
// How to check Ls matches??
std::cout<<GridLogMessage << zdata->n << " - n"<<std::endl;
std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
// std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
// std::cout<<GridLogMessage << zdata->n << " - n"<<std::endl;
// std::cout<<GridLogMessage << zdata->da << " -da "<<std::endl;
// std::cout<<GridLogMessage << zdata->db << " -db"<<std::endl;
// std::cout<<GridLogMessage << zdata->dn << " -dn"<<std::endl;
// std::cout<<GridLogMessage << zdata->dd << " -dd"<<std::endl;
int Ls = this->Ls;
std::cout<<GridLogMessage << Ls << " Ls"<<std::endl;
assert(zdata->db==Ls);// Beta has Ls coeffs
R=(1+this->mass)/(1-this->mass);
@ -320,7 +320,7 @@ ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
int Ls = this->Ls;
conformable(solution5d.Grid(),this->FermionGrid());
conformable(exported4d.Grid(),this->GaugeGrid());
ExtractSlice(exported4d, solution5d, Ls-1, 0);
ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
}
template<class Impl>
void ContinuedFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
@ -330,7 +330,7 @@ ContinuedFractionFermion5D<Impl>::ContinuedFractionFermion5D(
conformable(input4d.Grid() ,this->GaugeGrid());
FermionField tmp(this->FermionGrid());
tmp=Zero();
InsertSlice(input4d, tmp, Ls-1, 0);
InsertSlice(input4d, tmp, Ls-1, Ls-1);
tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
this->Dminus(tmp,imported5d);
}

View File

@ -41,7 +41,7 @@ NAMESPACE_BEGIN(Grid);
// Pplus backwards..
template<class Impl>
void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionField& phi_i,FermionField& chi_i,
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
{
chi_i.Checkerboard() = psi_i.Checkerboard();
int Ls = this->Ls;
@ -50,15 +50,9 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
autoView( psi , psi_i, AcceleratorRead);
autoView( chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &this->d_diag[0];
auto pupper = &this->d_upper[0];
auto plower = &this->d_lower[0];
acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
auto pdiag = &diag[0];
auto pupper = &upper[0];
auto plower = &lower[0];
// Flops = 6.0*(Nc*Ns) *Ls*vol
auto nloop=grid->oSites()/Ls;
@ -79,7 +73,7 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
template<class Impl>
void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const FermionField& phi_i, FermionField& chi_i,
std::vector<Coeff_t>& lower, std::vector<Coeff_t>& diag, std::vector<Coeff_t>& upper)
Vector<Coeff_t>& lower, Vector<Coeff_t>& diag, Vector<Coeff_t>& upper)
{
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase* grid = psi_i.Grid();
@ -89,14 +83,9 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
autoView( phi , phi_i, AcceleratorRead);
autoView( chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &this->d_diag[0];
auto pupper = &this->d_upper[0];
auto plower = &this->d_lower[0];
acceleratorCopyToDevice(&diag[0] ,&pdiag[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
auto pdiag = &diag[0];
auto pupper = &upper[0];
auto plower = &lower[0];
// Flops = 6.0*(Nc*Ns) *Ls*vol
@ -125,17 +114,12 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
autoView( chi, chi_i, AcceleratorWrite);
int Ls = this->Ls;
auto plee = & this->d_lee [0];
auto pdee = & this->d_dee [0];
auto puee = & this->d_uee [0];
auto pleem = & this->d_leem[0];
auto pueem = & this->d_ueem[0];
acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
auto plee = & this->lee[0];
auto pdee = & this->dee[0];
auto puee = & this->uee[0];
auto pleem = & this->leem[0];
auto pueem = & this->ueem[0];
uint64_t nloop=grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{

View File

@ -131,9 +131,9 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi
else{ shiftm = -shift*(mq3-mq2); }
}
std::vector<Coeff_t> diag(Ls,1.0);
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm;
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftp;
Vector<Coeff_t> diag(Ls,1.0);
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftm;
Vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftp;
#if(0)
std::cout << GridLogMessage << "DomainWallEOFAFermion::M5D(FF&,FF&):" << std::endl;
@ -168,9 +168,9 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField&
else{ shiftm = -shift*(mq3-mq2); }
}
std::vector<Coeff_t> diag(Ls,1.0);
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp;
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftm;
Vector<Coeff_t> diag(Ls,1.0);
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = mq1 + shiftp;
Vector<Coeff_t> lower(Ls,-1.0); lower[0] = mq1 + shiftm;
this->M5Ddag(psi, chi, chi, lower, diag, upper);
}
@ -181,9 +181,9 @@ void DomainWallEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& c
{
int Ls = this->Ls;
std::vector<Coeff_t> diag = this->bee;
std::vector<Coeff_t> upper(Ls);
std::vector<Coeff_t> lower(Ls);
Vector<Coeff_t> diag = this->bee;
Vector<Coeff_t> upper(Ls);
Vector<Coeff_t> lower(Ls);
for(int s=0; s<Ls; s++){
upper[s] = -this->cee[s];
@ -200,9 +200,9 @@ void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField
{
int Ls = this->Ls;
std::vector<Coeff_t> diag = this->bee;
std::vector<Coeff_t> upper(Ls);
std::vector<Coeff_t> lower(Ls);
Vector<Coeff_t> diag = this->bee;
Vector<Coeff_t> upper(Ls);
Vector<Coeff_t> lower(Ls);
for(int s=0; s<Ls; s++){
upper[s] = -this->cee[s];
@ -218,7 +218,7 @@ void DomainWallEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField
//Zolo
template<class Impl>
void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, std::vector<Coeff_t>& gamma, RealD b, RealD c)
void DomainWallEOFAFermion<Impl>::SetCoefficientsInternal(RealD zolo_hi, Vector<Coeff_t>& gamma, RealD b, RealD c)
{
int Ls = this->Ls;
int pm = this->pm;

View File

@ -61,6 +61,8 @@ ImprovedStaggeredFermion5D<Impl>::ImprovedStaggeredFermion5D(GridCartesian
UUUmu(&FourDimGrid),
UUUmuEven(&FourDimRedBlackGrid),
UUUmuOdd(&FourDimRedBlackGrid),
Lebesgue(&FourDimGrid),
LebesgueEvenOdd(&FourDimRedBlackGrid),
_tmp(&FiveDimRedBlackGrid)
{
@ -275,18 +277,18 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
/*CHANGE */
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st,
void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,DoubledGaugeField & UUU,
const FermionField &in, FermionField &out,int dag)
{
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,U,UUU,in,out,dag);
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
else
DhopInternalSerialComms(st,U,UUU,in,out,dag);
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
}
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,DoubledGaugeField & UUU,
const FermionField &in, FermionField &out,int dag)
{
@ -311,7 +313,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
{
int interior=1;
int exterior=0;
Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
}
st.CommsMerge(compressor);
@ -321,12 +323,12 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl &
{
int interior=0;
int exterior=1;
Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
}
}
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,DoubledGaugeField & UUU,
const FermionField &in, FermionField &out,int dag)
{
@ -339,7 +341,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
{
int interior=1;
int exterior=1;
Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
}
}
/*CHANGE END*/
@ -355,7 +357,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie
assert(in.Checkerboard()==Even);
out.Checkerboard() = Odd;
DhopInternal(StencilEven,UmuOdd,UUUmuOdd,in,out,dag);
DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,UUUmuOdd,in,out,dag);
}
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
@ -366,7 +368,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie
assert(in.Checkerboard()==Odd);
out.Checkerboard() = Even;
DhopInternal(StencilOdd,UmuEven,UUUmuEven,in,out,dag);
DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,UUUmuEven,in,out,dag);
}
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
@ -376,7 +378,7 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField
out.Checkerboard() = in.Checkerboard();
DhopInternal(Stencil,Umu,UUUmu,in,out,dag);
DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
}
/////////////////////////////////////////////////////////////////////////

View File

@ -48,6 +48,8 @@ ImprovedStaggeredFermion<Impl>::ImprovedStaggeredFermion(GridCartesian &Fgrid, G
StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even
StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd
mass(_mass),
Lebesgue(_grid),
LebesgueEvenOdd(_cbgrid),
Umu(&Fgrid),
UmuEven(&Hgrid),
UmuOdd(&Hgrid),
@ -337,7 +339,7 @@ void ImprovedStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &
out.Checkerboard() = in.Checkerboard();
DhopInternal(Stencil, Umu, UUUmu, in, out, dag);
DhopInternal(Stencil, Lebesgue, Umu, UUUmu, in, out, dag);
}
template <class Impl>
@ -349,7 +351,7 @@ void ImprovedStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField
assert(in.Checkerboard() == Even);
out.Checkerboard() = Odd;
DhopInternal(StencilEven, UmuOdd, UUUmuOdd, in, out, dag);
DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, UUUmuOdd, in, out, dag);
}
template <class Impl>
@ -361,7 +363,7 @@ void ImprovedStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField
assert(in.Checkerboard() == Odd);
out.Checkerboard() = Even;
DhopInternal(StencilOdd, UmuEven, UUUmuEven, in, out, dag);
DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, UUUmuEven, in, out, dag);
}
template <class Impl>
@ -392,19 +394,19 @@ void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionFiel
template <class Impl>
void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st,
void ImprovedStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
DoubledGaugeField &UUU,
const FermionField &in,
FermionField &out, int dag)
{
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,U,UUU,in,out,dag);
DhopInternalOverlappedComms(st,lo,U,UUU,in,out,dag);
else
DhopInternalSerialComms(st,U,UUU,in,out,dag);
DhopInternalSerialComms(st,lo,U,UUU,in,out,dag);
}
template <class Impl>
void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,
void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
DoubledGaugeField &UUU,
const FermionField &in,
@ -427,7 +429,7 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
{
int interior=1;
int exterior=0;
Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
}
st.CommunicateComplete(requests);
@ -438,13 +440,13 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st
{
int interior=0;
int exterior=1;
Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
}
}
template <class Impl>
void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st,
void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
DoubledGaugeField &UUU,
const FermionField &in,
@ -458,7 +460,7 @@ void ImprovedStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st,
{
int interior=1;
int exterior=1;
Kernels::DhopImproved(st,U,UUU,in,out,dag,interior,exterior);
Kernels::DhopImproved(st,lo,U,UUU,in,out,dag,interior,exterior);
}
};

View File

@ -39,7 +39,7 @@ NAMESPACE_BEGIN(Grid);
template<class Impl>
void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper)
{
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid();
@ -50,14 +50,10 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &this->d_diag[0];
auto pupper = &this->d_upper[0];
auto plower = &this->d_lower[0];
auto pdiag = &diag[0];
auto pupper = &upper[0];
auto plower = &lower[0];
acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
// Flops = 6.0*(Nc*Ns) *Ls*vol
int nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -78,8 +74,8 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
template<class Impl>
void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
std::vector<Coeff_t> &shift_coeffs)
Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper,
Vector<Coeff_t> &shift_coeffs)
{
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid();
@ -90,18 +86,13 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
auto pm = this->pm;
int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &this->d_diag[0];
auto pupper = &this->d_upper[0];
auto plower = &this->d_lower[0];
auto pshift_coeffs = &this->d_shift_coefficients[0];
acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&shift_coeffs[0],&pshift_coeffs[0],Ls*sizeof(Coeff_t));
auto pdiag = &diag[0];
auto pupper = &upper[0];
auto plower = &lower[0];
auto pshift_coeffs = &shift_coeffs[0];
// Flops = 6.0*(Nc*Ns) *Ls*vol
int nloop = grid->oSites()/Ls;
@ -128,7 +119,7 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
template<class Impl>
void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper)
Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper)
{
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid();
@ -138,14 +129,10 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
autoView(chi , chi_i, AcceleratorWrite);
assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &this->d_diag[0];
auto pupper = &this->d_upper[0];
auto plower = &this->d_lower[0];
acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
auto pdiag = &diag[0];
auto pupper = &upper[0];
auto plower = &lower[0];
// Flops = 6.0*(Nc*Ns) *Ls*vol
int nloop = grid->oSites()/Ls;
@ -167,8 +154,8 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
template<class Impl>
void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const FermionField &phi_i, FermionField &chi_i,
std::vector<Coeff_t> &lower, std::vector<Coeff_t> &diag, std::vector<Coeff_t> &upper,
std::vector<Coeff_t> &shift_coeffs)
Vector<Coeff_t> &lower, Vector<Coeff_t> &diag, Vector<Coeff_t> &upper,
Vector<Coeff_t> &shift_coeffs)
{
chi_i.Checkerboard() = psi_i.Checkerboard();
GridBase *grid = psi_i.Grid();
@ -180,16 +167,11 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
assert(phi.Checkerboard() == psi.Checkerboard());
auto pdiag = &this->d_diag[0];
auto pupper = &this->d_upper[0];
auto plower = &this->d_lower[0];
auto pshift_coeffs = &this->d_shift_coefficients[0];
auto pdiag = &diag[0];
auto pupper = &upper[0];
auto plower = &lower[0];
auto pshift_coeffs = &shift_coeffs[0];
acceleratorCopyToDevice(&diag[0],&pdiag[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&upper[0],&pupper[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&lower[0],&plower[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&shift_coeffs[0],&pshift_coeffs[0],Ls*sizeof(Coeff_t));
// Flops = 6.0*(Nc*Ns) *Ls*vol
auto pm = this->pm;
@ -230,17 +212,11 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
autoView(psi , psi_i, AcceleratorRead);
autoView(chi , chi_i, AcceleratorWrite);
auto plee = & this->d_lee [0];
auto pdee = & this->d_dee [0];
auto puee = & this->d_uee [0];
auto pleem = & this->d_leem[0];
auto pueem = & this->d_ueem[0];
acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
auto plee = & this->lee [0];
auto pdee = & this->dee [0];
auto puee = & this->uee [0];
auto pleem= & this->leem[0];
auto pueem= & this->ueem[0];
if(this->shift != 0.0){ MooeeInv_shift(psi_i,chi_i); return; }
@ -292,23 +268,14 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
autoView(psi , psi_i, AcceleratorRead);
autoView(chi , chi_i, AcceleratorWrite);
// Move into object and constructor
auto pm = this->pm;
auto plee = & this->d_lee [0];
auto pdee = & this->d_dee [0];
auto puee = & this->d_uee [0];
auto pleem = & this->d_leem[0];
auto pueem = & this->d_ueem[0];
auto pMooeeInv_shift_lc = &this->d_MooeeInv_shift_lc[0];
auto pMooeeInv_shift_norm = &this->d_MooeeInv_shift_norm[0];
acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&MooeeInv_shift_lc[0],&pMooeeInv_shift_lc[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&MooeeInv_shift_norm[0],&pMooeeInv_shift_norm[0],Ls*sizeof(Coeff_t));
auto plee = & this->lee [0];
auto pdee = & this->dee [0];
auto puee = & this->uee [0];
auto pleem= & this->leem[0];
auto pueem= & this->ueem[0];
auto pMooeeInv_shift_lc = &MooeeInv_shift_lc[0];
auto pMooeeInv_shift_norm = &MooeeInv_shift_norm[0];
int nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -366,17 +333,11 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
autoView(psi , psi_i, AcceleratorRead);
autoView(chi , chi_i, AcceleratorWrite);
auto plee = &this->d_lee [0];
auto pdee = &this->d_dee [0];
auto puee = &this->d_uee [0];
auto pleem = &this->d_leem[0];
auto pueem = &this->d_ueem[0];
acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
auto plee = & this->lee [0];
auto pdee = & this->dee [0];
auto puee = & this->uee [0];
auto pleem= & this->leem[0];
auto pueem= & this->ueem[0];
int nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{
@ -426,25 +387,13 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
int Ls = this->Ls;
auto pm = this->pm;
auto plee = & this->d_lee [0];
auto pdee = & this->d_dee [0];
auto puee = & this->d_uee [0];
auto pleem = & this->d_leem[0];
auto pueem = & this->d_ueem[0];
auto pMooeeInvDag_shift_lc = &this->d_MooeeInv_shift_lc[0];
auto pMooeeInvDag_shift_norm = &this->d_MooeeInv_shift_norm[0];
acceleratorCopyToDevice(&this->lee[0],&plee[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->dee[0],&pdee[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->uee[0],&puee[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->leem[0],&pleem[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&this->ueem[0],&pueem[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&MooeeInvDag_shift_lc[0],&pMooeeInvDag_shift_lc[0],Ls*sizeof(Coeff_t));
acceleratorCopyToDevice(&MooeeInvDag_shift_norm[0],&pMooeeInvDag_shift_norm[0],Ls*sizeof(Coeff_t));
// auto pMooeeInvDag_shift_lc = &MooeeInvDag_shift_lc[0];
// auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0];
auto plee = & this->lee [0];
auto pdee = & this->dee [0];
auto puee = & this->uee [0];
auto pleem= & this->leem[0];
auto pueem= & this->ueem[0];
auto pMooeeInvDag_shift_lc = &MooeeInvDag_shift_lc[0];
auto pMooeeInvDag_shift_norm = &MooeeInvDag_shift_norm[0];
int nloop = grid->oSites()/Ls;
accelerator_for(sss,nloop,Simd::Nsimd(),{

View File

@ -196,9 +196,9 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
std::vector<Coeff_t> diag(Ls,1.0);
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;
Vector<Coeff_t> diag(Ls,1.0);
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
Vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;
// no shift term
if(this->shift == 0.0){ this->M5D(psi, chi, chi, lower, diag, upper); }
@ -212,9 +212,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField& psi, FermionField& chi)
{
int Ls = this->Ls;
std::vector<Coeff_t> diag(Ls,1.0);
std::vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
std::vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;
Vector<Coeff_t> diag(Ls,1.0);
Vector<Coeff_t> upper(Ls,-1.0); upper[Ls-1] = this->mq1;
Vector<Coeff_t> lower(Ls,-1.0); lower[0] = this->mq1;
// no shift term
if(this->shift == 0.0){ this->M5Ddag(psi, chi, chi, lower, diag, upper); }
@ -230,9 +230,9 @@ void MobiusEOFAFermion<Impl>::Mooee(const FermionField& psi, FermionField& chi)
int Ls = this->Ls;
// coefficients of Mooee
std::vector<Coeff_t> diag = this->bee;
std::vector<Coeff_t> upper(Ls);
std::vector<Coeff_t> lower(Ls);
Vector<Coeff_t> diag = this->bee;
Vector<Coeff_t> upper(Ls);
Vector<Coeff_t> lower(Ls);
for(int s=0; s<Ls; s++){
upper[s] = -this->cee[s];
lower[s] = -this->cee[s];
@ -253,9 +253,9 @@ void MobiusEOFAFermion<Impl>::MooeeDag(const FermionField& psi, FermionField& ch
int Ls = this->Ls;
// coefficients of MooeeDag
std::vector<Coeff_t> diag = this->bee;
std::vector<Coeff_t> upper(Ls);
std::vector<Coeff_t> lower(Ls);
Vector<Coeff_t> diag = this->bee;
Vector<Coeff_t> upper(Ls);
Vector<Coeff_t> lower(Ls);
for(int s=0; s<Ls; s++){
if(s==0) {
upper[s] = -this->cee[s+1];
@ -314,10 +314,10 @@ void MobiusEOFAFermion<Impl>::SetCoefficientsPrecondShiftOps()
// Tridiagonal solve for MooeeInvDag_shift_lc
{
Coeff_t m(0.0);
std::vector<Coeff_t> d = Mooee_shift;
std::vector<Coeff_t> u(Ls,0.0);
std::vector<Coeff_t> y(Ls,0.0);
std::vector<Coeff_t> q(Ls,0.0);
Vector<Coeff_t> d = Mooee_shift;
Vector<Coeff_t> u(Ls,0.0);
Vector<Coeff_t> y(Ls,0.0);
Vector<Coeff_t> q(Ls,0.0);
if(pm == 1){ u[0] = 1.0; }
else{ u[Ls-1] = 1.0; }

View File

@ -48,6 +48,8 @@ NaiveStaggeredFermion<Impl>::NaiveStaggeredFermion(GridCartesian &Fgrid, GridRed
StencilEven(&Hgrid, npoint, Even, directions, displacements,p), // source is Even
StencilOdd(&Hgrid, npoint, Odd, directions, displacements,p), // source is Odd
mass(_mass),
Lebesgue(_grid),
LebesgueEvenOdd(_cbgrid),
Umu(&Fgrid),
UmuEven(&Hgrid),
UmuOdd(&Hgrid),
@ -266,7 +268,7 @@ void NaiveStaggeredFermion<Impl>::Dhop(const FermionField &in, FermionField &out
out.Checkerboard() = in.Checkerboard();
DhopInternal(Stencil, Umu, in, out, dag);
DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
}
template <class Impl>
@ -278,7 +280,7 @@ void NaiveStaggeredFermion<Impl>::DhopOE(const FermionField &in, FermionField &o
assert(in.Checkerboard() == Even);
out.Checkerboard() = Odd;
DhopInternal(StencilEven, UmuOdd, in, out, dag);
DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
}
template <class Impl>
@ -290,7 +292,7 @@ void NaiveStaggeredFermion<Impl>::DhopEO(const FermionField &in, FermionField &o
assert(in.Checkerboard() == Odd);
out.Checkerboard() = Even;
DhopInternal(StencilOdd, UmuEven, in, out, dag);
DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
}
template <class Impl>
@ -321,18 +323,18 @@ void NaiveStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st,
void NaiveStaggeredFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag)
{
if ( StaggeredKernelsStatic::Comms == StaggeredKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,U,in,out,dag);
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
else
DhopInternalSerialComms(st,U,in,out,dag);
DhopInternalSerialComms(st,lo,U,in,out,dag);
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,
void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag)
@ -354,7 +356,7 @@ void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,
{
int interior=1;
int exterior=0;
Kernels::DhopNaive(st,U,in,out,dag,interior,exterior);
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
}
st.CommunicateComplete(requests);
@ -365,12 +367,12 @@ void NaiveStaggeredFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,
{
int interior=0;
int exterior=1;
Kernels::DhopNaive(st,U,in,out,dag,interior,exterior);
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
}
}
template <class Impl>
void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st,
void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag)
@ -383,7 +385,7 @@ void NaiveStaggeredFermion<Impl>::DhopInternalSerialComms(StencilImpl &st,
{
int interior=1;
int exterior=1;
Kernels::DhopNaive(st,U,in,out,dag,interior,exterior);
Kernels::DhopNaive(st,lo,U,in,out,dag,interior,exterior);
}
};

View File

@ -237,32 +237,7 @@ void PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, Fermi
// ( 0 -sqrt(p_i)*amax | 2 R gamma_5 + p0/amax 2H
//
this->DW(psi,D,DaggerNo);
// DW - DW+iqslash
// (g5 Dw)^dag = g5 Dw
// (iqmu g5 gmu)^dag = (-i qmu gmu^dag g5^dag) = i qmu g5 gmu
if ( qmu.size() ) {
std::cout<< "Mat" << "qmu ("<<qmu[0]<<","<<qmu[1]<<","<<qmu[2]<<","<<qmu[3]<<")"<<std::endl;
assert(qmu.size()==Nd);
FermionField qslash_psi(psi.Grid());
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ,
Gamma::Algebra::GammaT
};
qslash_psi = qmu[0]*(Gamma(Gmu[0])*psi);
for(int mu=1;mu<Nd;mu++){
qslash_psi = qslash_psi + qmu[mu]*(Gamma(Gmu[mu])*psi);
}
ComplexD ci(0.0,1.0);
qslash_psi = ci*qslash_psi ; // i qslash
D = D + qslash_psi;
}
this->DW(psi,D,DaggerNo);
int nblock=(Ls-1)/2;
for(int b=0;b<nblock;b++){
@ -280,55 +255,15 @@ void PartialFractionFermion5D<Impl>::M_internal(const FermionField &psi, Fermi
}
{
// The 'conventional' Cayley overlap operator is
//
// Dov = (1+m)/2 + (1-m)/2 g5 sgn Hw
//
//
// With massless limit 1/2(1+g5 sgnHw)
//
// Luscher shows quite neatly that 1+g5 sgn Hw has tree level propagator i qslash +O(a^2)
//
// However, the conventional normalisation has both a leading order factor of 2 in Zq
// at tree level AND a mass dependent (1-m) that are convenient to absorb.
//
// In WilsonFermion5DImplementation.h, the tree level propagator for Hw is
//
// num = -i sin kmu gmu
//
// denom ( sqrt(sk^2 + (2shk^2 - 1)^2
// b_k = sk2 - M5;
//
// w_k = sqrt(sk + b_k*b_k);
//
// denom= ( w_k + b_k + mass*mass) ;
//
// denom= one/denom;
// out = num*denom;
//
// Chroma, and Grid define partial fraction via 4d operator
//
// Dpf = 2/(1-m) x Dov = (1+m)/(1-m) + g5 sgn Hw
//
// Now since:
//
// (1+m)/(1-m) = (1-m)/(1-m) + 2m/(1-m) = 1 + 2m/(1-m)
//
// This corresponds to a modified mass parameter
//
// It has an annoying
//
//
double R=(1+this->mass)/(1-this->mass);
//R g5 psi[Ls] + p[0] Hw
//R g5 psi[Ls] + p[0] H
ag5xpbg5y_ssp(chi,R*scale,psi,p[nblock]*scale/amax,D,Ls-1,Ls-1);
for(int b=0;b<nblock;b++){
int s = 2*b+1;
double pp = p[nblock-1-b];
axpby_ssp(chi,1.0,chi,-sqrt(amax*pp)*scale*sign,psi,Ls-1,s);
}
}
}
@ -476,18 +411,17 @@ void PartialFractionFermion5D<Impl>::SetCoefficientsZolotarev(RealD zolo_hi,App
int Ls = this->Ls;
conformable(solution5d.Grid(),this->FermionGrid());
conformable(exported4d.Grid(),this->GaugeGrid());
ExtractSlice(exported4d, solution5d, Ls-1, 0);
ExtractSlice(exported4d, solution5d, Ls-1, Ls-1);
}
template<class Impl>
void PartialFractionFermion5D<Impl>::ImportPhysicalFermionSource(const FermionField &input4d,FermionField &imported5d)
{
//void InsertSlice(const Lattice<vobj> &lowDim,Lattice<vobj> & higherDim,int slice, int orthog)
int Ls = this->Ls;
conformable(imported5d.Grid(),this->FermionGrid());
conformable(input4d.Grid() ,this->GaugeGrid());
FermionField tmp(this->FermionGrid());
tmp=Zero();
InsertSlice(input4d, tmp, Ls-1, 0);
InsertSlice(input4d, tmp, Ls-1, Ls-1);
tmp=Gamma(Gamma::Algebra::Gamma5)*tmp;
this->Dminus(tmp,imported5d);
}
@ -508,7 +442,7 @@ PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
{
int Ls = this->Ls;
qmu.resize(0);
assert((Ls&0x1)==1); // Odd Ls required
int nrational=Ls-1;
@ -526,22 +460,6 @@ PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
Approx::zolotarev_free(zdata);
}
template<class Impl>
PartialFractionFermion5D<Impl>::PartialFractionFermion5D(GaugeField &_Umu,
GridCartesian &FiveDimGrid,
GridRedBlackCartesian &FiveDimRedBlackGrid,
GridCartesian &FourDimGrid,
GridRedBlackCartesian &FourDimRedBlackGrid,
RealD _mass,RealD M5,
std::vector<RealD> &_qmu,
const ImplParams &p)
: PartialFractionFermion5D<Impl>(_Umu,
FiveDimGrid,FiveDimRedBlackGrid,
FourDimGrid,FourDimRedBlackGrid,
_mass,M5,p)
{
qmu=_qmu;
}
NAMESPACE_END(Grid);

View File

@ -375,6 +375,23 @@ void StaggeredKernels<Impl>::DhopSiteHandExt(StencilView &st,
}
}
/*
#define DHOP_SITE_HAND_INSTANTIATE(IMPL) \
template void StaggeredKernels<IMPL>::DhopSiteHand(StencilImpl &st, LebesgueOrder &lo, \
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
SiteSpinor *buf, int LLs, int sU, \
const FermionFieldView &in, FermionFieldView &out, int dag); \
\
template void StaggeredKernels<IMPL>::DhopSiteHandInt(StencilImpl &st, LebesgueOrder &lo, \
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
SiteSpinor *buf, int LLs, int sU, \
const FermionFieldView &in, FermionFieldView &out, int dag); \
\
template void StaggeredKernels<IMPL>::DhopSiteHandExt(StencilImpl &st, LebesgueOrder &lo, \
DoubledGaugeFieldView &U,DoubledGaugeFieldView &UUU, \
SiteSpinor *buf, int LLs, int sU, \
const FermionFieldView &in, FermionFieldView &out, int dag); \
*/
#undef LOAD_CHI
#undef HAND_DECLARATIONS

View File

@ -256,7 +256,7 @@ void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldVie
});
template <class Impl>
void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st,
void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U, DoubledGaugeField &UUU,
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
{
@ -294,7 +294,7 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st,
assert(0 && " Kernel optimisation case not covered ");
}
template <class Impl>
void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st,
void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in, FermionField &out, int dag, int interior,int exterior)
{

View File

@ -58,9 +58,15 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
Umu(_FourDimGrid),
UmuEven(_FourDimRedBlackGrid),
UmuOdd (_FourDimRedBlackGrid),
Lebesgue(_FourDimGrid),
LebesgueEvenOdd(_FourDimRedBlackGrid),
_tmp(&FiveDimRedBlackGrid),
Dirichlet(0)
{
Stencil.lo = &Lebesgue;
StencilEven.lo = &LebesgueEvenOdd;
StencilOdd.lo = &LebesgueEvenOdd;
// some assertions
assert(FiveDimGrid._ndimension==5);
assert(FourDimGrid._ndimension==4);
@ -299,19 +305,19 @@ void WilsonFermion5D<Impl>::DhopDerivOE(GaugeField &mat,
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st,
void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,U,in,out,dag);
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
else
DhopInternalSerialComms(st,U,in,out,dag);
DhopInternalSerialComms(st,lo,U,in,out,dag);
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag)
{
@ -325,22 +331,22 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
// Start comms // Gather intranode and extra node differentiated??
/////////////////////////////
{
// std::cout << " WilsonFermion5D gather " <<std::endl;
GRID_TRACE("Gather");
st.HaloExchangeOptGather(in,compressor); // Put the barrier in the routine
}
// std::cout << " WilsonFermion5D Communicate Begin " <<std::endl;
std::vector<std::vector<CommsRequest_t> > requests;
auto id=traceStart("Communicate overlapped");
st.CommunicateBegin(requests);
#if 1
/////////////////////////////
// Overlap with comms
/////////////////////////////
st.CommunicateBegin(requests);
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
#endif
{
GRID_TRACE("MergeSHM");
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
}
/////////////////////////////
// do the compute interior
/////////////////////////////
@ -352,35 +358,22 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
GRID_TRACE("DhopInterior");
Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,1,0);
}
//ifdef GRID_ACCELERATED
#if 0
/////////////////////////////
// Overlap with comms -- on GPU the interior kernel call is nonblocking
/////////////////////////////
st.CommunicateBegin(requests);
st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
#endif
/////////////////////////////
// Complete comms
/////////////////////////////
// std::cout << " WilsonFermion5D Comms Complete " <<std::endl;
st.CommunicateComplete(requests);
// traceStop(id);
traceStop(id);
/////////////////////////////
// do the compute exterior
/////////////////////////////
{
// std::cout << " WilsonFermion5D Comms Merge " <<std::endl;
GRID_TRACE("Merge");
st.CommsMerge(compressor);
}
// std::cout << " WilsonFermion5D Exterior " <<std::endl;
if (dag == DaggerYes) {
GRID_TRACE("DhopDagExterior");
Kernels::DhopDagKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
@ -388,12 +381,11 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st,
GRID_TRACE("DhopExterior");
Kernels::DhopKernel (Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out,0,1);
}
// std::cout << " WilsonFermion5D Done " <<std::endl;
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U,
const FermionField &in,
FermionField &out,int dag)
@ -403,13 +395,11 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
int LLs = in.Grid()->_rdimensions[0];
// std::cout << " WilsonFermion5D Halo exch " <<std::endl;
{
GRID_TRACE("HaloExchange");
st.HaloExchangeOpt(in,compressor);
}
// std::cout << " WilsonFermion5D Dhop " <<std::endl;
int Opt = WilsonKernelsStatic::Opt;
if (dag == DaggerYes) {
GRID_TRACE("DhopDag");
@ -418,7 +408,6 @@ void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st,
GRID_TRACE("Dhop");
Kernels::DhopKernel(Opt,st,U,st.CommBuf(),LLs,U.oSites(),in,out);
}
// std::cout << " WilsonFermion5D Done " <<std::endl;
}
@ -431,7 +420,7 @@ void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int
assert(in.Checkerboard()==Even);
out.Checkerboard() = Odd;
DhopInternal(StencilEven,UmuOdd,in,out,dag);
DhopInternal(StencilEven,LebesgueEvenOdd,UmuOdd,in,out,dag);
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
@ -442,31 +431,8 @@ void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int
assert(in.Checkerboard()==Odd);
out.Checkerboard() = Even;
DhopInternal(StencilOdd,UmuEven,in,out,dag);
DhopInternal(StencilOdd,LebesgueEvenOdd,UmuEven,in,out,dag);
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopComms(const FermionField &in, FermionField &out)
{
int dag =0 ;
conformable(in.Grid(),FermionGrid()); // verifies full grid
conformable(in.Grid(),out.Grid());
out.Checkerboard() = in.Checkerboard();
Compressor compressor(dag);
Stencil.HaloExchangeOpt(in,compressor);
}
template<class Impl>
void WilsonFermion5D<Impl>::DhopCalc(const FermionField &in, FermionField &out,uint64_t *ids)
{
conformable(in.Grid(),FermionGrid()); // verifies full grid
conformable(in.Grid(),out.Grid());
out.Checkerboard() = in.Checkerboard();
int LLs = in.Grid()->_rdimensions[0];
int Opt = WilsonKernelsStatic::Opt;
Kernels::DhopKernel(Opt,Stencil,Umu,Stencil.CommBuf(),LLs,Umu.oSites(),in,out,ids);
}
template<class Impl>
void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
{
@ -475,7 +441,7 @@ void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int d
out.Checkerboard() = in.Checkerboard();
DhopInternal(Stencil,Umu,in,out,dag);
DhopInternal(Stencil,Lebesgue,Umu,in,out,dag);
}
template<class Impl>
void WilsonFermion5D<Impl>::DW(const FermionField &in, FermionField &out,int dag)
@ -769,15 +735,6 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHt(FermionField &out,const Fe
template<class Impl>
void WilsonFermion5D<Impl>::MomentumSpacePropagatorHw(FermionField &out,const FermionField &in,RealD mass,std::vector<double> twist)
{
std::vector<double> empty_q(Nd,0.0);
MomentumSpacePropagatorHwQ(out,in,mass,twist,empty_q);
}
template<class Impl>
void WilsonFermion5D<Impl>::MomentumSpacePropagatorHwQ(FermionField &out,const FermionField &in,
RealD mass,
std::vector<double> twist,
std::vector<double> qmu)
{
Gamma::Algebra Gmu [] = {
Gamma::Algebra::GammaX,
@ -793,7 +750,6 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHwQ(FermionField &out,const F
typedef typename FermionField::scalar_type ScalComplex;
typedef Lattice<iSinglet<vector_type> > LatComplex;
typedef iSpinMatrix<ScalComplex> SpinMat;
Coordinate latt_size = _grid->_fdimensions;
@ -811,10 +767,8 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHwQ(FermionField &out,const F
LatComplex kmu(_grid);
ScalComplex ci(0.0,1.0);
std::cout<< "Feynman Rule" << "qmu ("<<qmu[0]<<","<<qmu[1]<<","<<qmu[2]<<","<<qmu[3]<<")"<<std::endl;
for(int mu=0;mu<Nd;mu++) {
LatticeCoordinate(kmu,mu);
RealD TwoPiL = M_PI * 2.0/ latt_size[mu];
@ -823,18 +777,9 @@ void WilsonFermion5D<Impl>::MomentumSpacePropagatorHwQ(FermionField &out,const F
kmu = kmu + TwoPiL * one * twist[mu];//momentum for twisted boundary conditions
sk2 = sk2 + 2.0*sin(kmu*0.5)*sin(kmu*0.5);
sk = sk + sin(kmu)*sin(kmu);
sk = sk + (sin(kmu)+qmu[mu])*(sin(kmu)+qmu[mu]);
// Terms for boosted Fermion
// 1/2 [ -i gamma.(sin p + q ) ]
// [ --------------------- + 1 ]
// [ wq + b ]
//
// wq = sqrt( (sinp+q)^2 + b^2 )
//
num = num - (sin(kmu)+qmu[mu])*ci*(Gamma(Gmu[mu])*in);
num = num - sin(kmu)*ci*(Gamma(Gmu[mu])*in);
}
num = num + mass * in ;

View File

@ -52,12 +52,17 @@ WilsonFermion<Impl>::WilsonFermion(GaugeField &_Umu, GridCartesian &Fgrid,
StencilEven(&Hgrid, npoint, Even, directions,displacements,p), // source is Even
StencilOdd(&Hgrid, npoint, Odd, directions,displacements,p), // source is Odd
mass(_mass),
Lebesgue(_grid),
LebesgueEvenOdd(_cbgrid),
Umu(&Fgrid),
UmuEven(&Hgrid),
UmuOdd(&Hgrid),
_tmp(&Hgrid),
anisotropyCoeff(anis)
{
Stencil.lo = &Lebesgue;
StencilEven.lo = &LebesgueEvenOdd;
StencilOdd.lo = &LebesgueEvenOdd;
// Allocate the required comms buffer
ImportGauge(_Umu);
if (anisotropyCoeff.isAnisotropic){
@ -309,7 +314,7 @@ void WilsonFermion<Impl>::Dhop(const FermionField &in, FermionField &out, int da
out.Checkerboard() = in.Checkerboard();
DhopInternal(Stencil, Umu, in, out, dag);
DhopInternal(Stencil, Lebesgue, Umu, in, out, dag);
}
template <class Impl>
@ -321,7 +326,7 @@ void WilsonFermion<Impl>::DhopOE(const FermionField &in, FermionField &out, int
assert(in.Checkerboard() == Even);
out.Checkerboard() = Odd;
DhopInternal(StencilEven, UmuOdd, in, out, dag);
DhopInternal(StencilEven, LebesgueEvenOdd, UmuOdd, in, out, dag);
}
template <class Impl>
@ -333,7 +338,7 @@ void WilsonFermion<Impl>::DhopEO(const FermionField &in, FermionField &out,int d
assert(in.Checkerboard() == Odd);
out.Checkerboard() = Even;
DhopInternal(StencilOdd, UmuEven, in, out, dag);
DhopInternal(StencilOdd, LebesgueEvenOdd, UmuEven, in, out, dag);
}
template <class Impl>
@ -386,21 +391,21 @@ void WilsonFermion<Impl>::DhopDirCalc(const FermionField &in, FermionField &out,
};
template <class Impl>
void WilsonFermion<Impl>::DhopInternal(StencilImpl &st,
void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag)
{
#ifdef GRID_OMP
if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute )
DhopInternalOverlappedComms(st,U,in,out,dag);
DhopInternalOverlappedComms(st,lo,U,in,out,dag);
else
#endif
DhopInternalSerial(st,U,in,out,dag);
DhopInternalSerial(st,lo,U,in,out,dag);
}
template <class Impl>
void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,
void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag)
@ -469,10 +474,10 @@ void WilsonFermion<Impl>::DhopInternalOverlappedComms(StencilImpl &st,
template <class Impl>
void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag)
void WilsonFermion<Impl>::DhopInternalSerial(StencilImpl &st, LebesgueOrder &lo,
DoubledGaugeField &U,
const FermionField &in,
FermionField &out, int dag)
{
GRID_TRACE("DhopSerial");
assert((dag == DaggerNo) || (dag == DaggerYes));

View File

@ -40,11 +40,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
/// Switch off the 5d vectorised code optimisations
#undef DWFVEC5D
static std::vector<vComplexF> signsF;
static Vector<vComplexF> signsF;
template<typename vtype>
int setupSigns(std::vector<vtype>& signs ){
std::vector<vtype> bother(2);
int setupSigns(Vector<vtype>& signs ){
Vector<vtype> bother(2);
signs = bother;
vrsign(signs[0]);
visign(signs[1]);
@ -364,7 +364,7 @@ WilsonKernels<ZDomainWallVec5dImplF>::AsmDhopSiteDagExt(StencilView &st, Doubled
#include <simd/Intel512double.h>
static std::vector<vComplexD> signsD;
static Vector<vComplexD> signsD;
static int signInitD = setupSigns(signsD);
#define MAYBEPERM(A,perm) if (perm) { A ; }

View File

@ -411,46 +411,6 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
#undef LoopBody
}
#ifdef GRID_SYCL
extern "C" {
ulong SYCL_EXTERNAL __attribute__((overloadable)) intel_get_cycle_counter( void );
uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_active_channel_mask( void );
uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_grf_register( uint reg );
uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_flag_register( uint flag );
uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_control_register( uint reg );
uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_hw_thread_id( void );
uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_slice_id( void );
uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_subslice_id( void );
uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_eu_id( void );
uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_eu_thread_id( void );
void SYCL_EXTERNAL __attribute__((overloadable)) intel_eu_thread_pause( uint value );
}
#ifdef GRID_SIMT
#define MAKE_ID(A) (intel_get_eu_id()<<16)|(intel_get_slice_id()<<8)|(intel_get_subslice_id())
#else
#define MAKE_ID(A) (0)
#endif
#else
#define MAKE_ID(A) (0)
#endif
#define KERNEL_CALL_ID(A) \
const uint64_t NN = Nsite*Ls; \
accelerator_forNB( ss, NN, Simd::Nsimd(), { \
int sF = ss; \
int sU = ss/Ls; \
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
const int Nsimd = SiteHalfSpinor::Nsimd(); \
const int lane=acceleratorSIMTlane(Nsimd); \
int idx=sF*Nsimd+lane; \
uint64_t id = MAKE_ID(); \
ids[idx]=id; \
}); \
accelerator_barrier();
#define KERNEL_CALLNB(A) \
const uint64_t NN = Nsite*Ls; \
@ -458,7 +418,7 @@ extern "C" {
int sF = ss; \
int sU = ss/Ls; \
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
});
});
#define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();
@ -474,7 +434,7 @@ extern "C" {
#define ASM_CALL(A) \
thread_for( sss, Nsite, { \
int ss = sss; /*st.lo->Reorder(sss);*/ \
int ss = st.lo->Reorder(sss); \
int sU = ss; \
int sF = ss*Ls; \
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \
@ -491,8 +451,6 @@ extern "C" {
WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \
});}
template <class Impl>
void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out,
@ -527,18 +485,6 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
}
assert(0 && " Kernel optimisation case not covered ");
}
template <class Impl>
void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out,
uint64_t *ids)
{
autoView(U_v , U,AcceleratorRead);
autoView(in_v , in,AcceleratorRead);
autoView(out_v,out,AcceleratorWrite);
autoView(st_v , st,AcceleratorRead);
KERNEL_CALL_ID(GenericDhopSite);
}
template <class Impl>
void WilsonKernels<Impl>::DhopDagKernel(int Opt,StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor * buf,
int Ls, int Nsite, const FermionField &in, FermionField &out,

View File

@ -40,11 +40,6 @@ public:
INHERIT_GIMPL_TYPES(Gimpl);
using Action<GaugeField>::S;
using Action<GaugeField>::Sinitial;
using Action<GaugeField>::deriv;
using Action<GaugeField>::refresh;
private:
RealD c_plaq;
RealD c_rect;

View File

@ -43,11 +43,6 @@ class WilsonGaugeAction : public Action<typename Gimpl::GaugeField> {
public:
INHERIT_GIMPL_TYPES(Gimpl);
using Action<GaugeField>::S;
using Action<GaugeField>::Sinitial;
using Action<GaugeField>::deriv;
using Action<GaugeField>::refresh;
/////////////////////////// constructors
explicit WilsonGaugeAction(RealD beta_):beta(beta_){};

View File

@ -40,7 +40,7 @@ public:
U = Zero();
LatticeColourMatrix tmp(Uin.Grid());
std::vector<typename SU<ncolour>::Matrix> ta(Dimension);
Vector<typename SU<ncolour>::Matrix> ta(Dimension);
// Debug lines
// LatticeMatrix uno(Uin.Grid());

View File

@ -43,7 +43,7 @@ public:
U = Zero();
LatticeColourMatrix tmp(Uin.Grid());
std::vector<typename GaugeGroup<ncolour,group_name>::Matrix> eij(Dimension);
Vector<typename GaugeGroup<ncolour,group_name>::Matrix> eij(Dimension);
for (int a = 0; a < Dimension; a++)
GaugeGroupTwoIndex<ncolour, S, group_name>::base(a, eij[a]);

File diff suppressed because it is too large Load Diff

View File

@ -971,9 +971,7 @@ void BaryonUtils<FImpl>::BaryonGamma3pt(
autoView( vq_ti , q_ti , AcceleratorRead);
autoView( vq_tf , q_tf , AcceleratorRead);
deviceVector<mobj> my_Dq_spec(2);
acceleratorPut(my_Dq_spec[0],Dq_spec1);
acceleratorPut(my_Dq_spec[1],Dq_spec2);
Vector<mobj> my_Dq_spec{Dq_spec1,Dq_spec2};
mobj * Dq_spec_p = &my_Dq_spec[0];
if (group == 1) {
@ -1302,8 +1300,7 @@ void BaryonUtils<FImpl>::SigmaToNucleonEye(const PropagatorField &qq_loop,
autoView( vd_tf , qd_tf , AcceleratorRead);
autoView( vs_ti , qs_ti , AcceleratorRead);
deviceVector<mobj> my_Dq_spec(1);
acceleratorPut(my_Dq_spec[0],Du_spec);
Vector<mobj> my_Dq_spec{Du_spec};
mobj * Dq_spec_p = &my_Dq_spec[0];
if(op == "Q1"){
@ -1356,8 +1353,7 @@ void BaryonUtils<FImpl>::SigmaToNucleonNonEye(const PropagatorField &qq_ti,
autoView( vd_tf , qd_tf , AcceleratorRead );
autoView( vs_ti , qs_ti , AcceleratorRead );
deviceVector<mobj> my_Dq_spec(1);
acceleratorPut(my_Dq_spec[0],Du_spec);
Vector<mobj> my_Dq_spec{Du_spec};
mobj * Dq_spec_p = &my_Dq_spec[0];
if(op == "Q1"){
@ -1548,9 +1544,7 @@ void BaryonUtils<FImpl>::XiToSigmaEye(const PropagatorField &qq_loop,
autoView( vd_tf , qd_tf , AcceleratorRead);
autoView( vs_ti , qs_ti , AcceleratorRead);
deviceVector<mobj> my_Dq_spec(2);
acceleratorPut(my_Dq_spec[0],Dd_spec);
acceleratorPut(my_Dq_spec[0],Ds_spec);
Vector<mobj> my_Dq_spec{Dd_spec,Ds_spec};
mobj * Dq_spec_p = &my_Dq_spec[0];
if(op == "Q1"){

View File

@ -118,7 +118,7 @@ static void generatorDiagonal(int diagIndex, iGroupMatrix<cplx> &ta) {
////////////////////////////////////////////////////////////////////////
// Map a su2 subgroup number to the pair of rows that are non zero
////////////////////////////////////////////////////////////////////////
static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) {
static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::SU) {
assert((su2_index >= 0) && (su2_index < (ncolour * (ncolour - 1)) / 2));
int spare = su2_index;

View File

@ -62,7 +62,7 @@ public:
// returns i(T_Adj)^index necessary for the projectors
// see definitions above
iAdjTa = Zero();
iSUnMatrix<cplx> ta[ncolour * ncolour - 1];
Vector<iSUnMatrix<cplx> > ta(ncolour * ncolour - 1);
iSUnMatrix<cplx> tmp;
// FIXME not very efficient to get all the generators everytime

View File

@ -207,7 +207,7 @@ static void generatorZtype(int zIndex, iGroupMatrix<cplx> &ta) {
// Map a su2 subgroup number to the pair of rows that are non zero
////////////////////////////////////////////////////////////////////////
template <ONLY_IF_Sp>
static accelerator_inline void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::Sp) {
static void su2SubGroupIndex(int &i1, int &i2, int su2_index, GroupName::Sp) {
const int nsp=ncolour/2;
assert((su2_index >= 0) && (su2_index < (nsp * (nsp - 1)) / 2));

View File

@ -72,7 +72,7 @@ public:
}
// Resident in managed memory
deviceVector<GeneralStencilEntry> _entries;
Vector<GeneralStencilEntry> _entries;
GeneralLocalStencil(GridBase *grid, const std::vector<Coordinate> &shifts)
{
@ -141,7 +141,7 @@ public:
////////////////////////////////////////////////
// Store in look up table
////////////////////////////////////////////////
acceleratorPut(this->_entries[lex],SE);
this->_entries[lex] = SE;
}
});
}

View File

@ -1,4 +1,3 @@
#if 0
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
@ -242,4 +241,3 @@ void LebesgueOrder::ZGraph(void)
}
NAMESPACE_END(Grid);
#endif

View File

@ -72,7 +72,7 @@ public:
void ThreadInterleave(void);
private:
deviceVector<IndexInteger> _LebesgueReorder;
Vector<IndexInteger> _LebesgueReorder;
};

View File

@ -19,7 +19,7 @@ public:
static int PartialCompressionFactor(GridBase *grid) {return 1;};
// Decompress is after merge so ok
template<class vobj,class cobj,class compressor>
static void Gather_plane_simple (deviceVector<std::pair<int,int> >& table,
static void Gather_plane_simple (commVector<std::pair<int,int> >& table,
const Lattice<vobj> &rhs,
cobj *buffer,
compressor &compress,
@ -35,7 +35,7 @@ public:
rhs_v.ViewClose();
}
template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(deviceVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
std::vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type,int partial)
{
@ -83,6 +83,25 @@ public:
// Wilson compressor will add alternate policies for Dirichlet
// and possibly partial Dirichlet for DWF
////////////////////////////////////
/*
class FaceGatherDirichlet
{
// If it's dirichlet we don't assemble comms buffers
//
// Rely on zeroes in gauge field to drive the correct result
// NAN propgagation: field will locally wrap, so fermion should NOT contain NAN and just permute
template<class vobj,class cobj,class compressor>
static void Gather_plane_simple (commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so){};
template<class vobj,class cobj,class compressor>
static void Gather_plane_exchange(commVector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,
Vector<cobj *> pointers,int dimension,int plane,int cbmask,
compressor &compress,int type) {}
template<class decompressor,class Merger>
static void Merge(decompressor decompress,Merge &mm) { }
template<class decompressor,class Decompression>
static void Decompress(decompressor decompress,Decompression &dd) {}
};
*/
template<class vobj,class FaceGather>
class SimpleCompressorGather : public FaceGather {

View File

@ -31,6 +31,7 @@
#define STENCIL_MAX (16)
#include <Grid/stencil/SimpleCompressor.h> // subdir aggregate
#include <Grid/stencil/Lebesgue.h> // subdir aggregate
#include <Grid/stencil/GeneralLocalStencil.h>
//////////////////////////////////////////////////////////////////////////////////////////
@ -121,22 +122,17 @@ class CartesianStencilAccelerator {
StencilVector same_node;
Coordinate _simd_layout;
Parameters parameters;
ViewMode mode;
StencilEntry* _entries_p;
StencilEntry* _entries_host_p;
cobj* u_recv_buf_p;
cobj* u_send_buf_p;
accelerator_inline cobj *CommBuf(void) const { return u_recv_buf_p; }
// Not a device function
inline int GetNodeLocal(int osite,int point) const {
StencilEntry SE=this->_entries_host_p[point+this->_npoints*osite];
return SE._is_local;
accelerator_inline int GetNodeLocal(int osite,int point) const {
return this->_entries_p[point+this->_npoints*osite]._is_local;
}
accelerator_inline StencilEntry * GetEntry(int &ptype,int point,int osite) const {
ptype = this->_permute_type[point];
return & this->_entries_p[point+this->_npoints*osite];
ptype = this->_permute_type[point]; return & this->_entries_p[point+this->_npoints*osite];
}
accelerator_inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) const {
@ -169,22 +165,28 @@ class CartesianStencilView : public CartesianStencilAccelerator<vobj,cobj,Parame
{
public:
int *closed;
// StencilEntry *cpu_ptr;
StencilEntry *cpu_ptr;
ViewMode mode;
public:
// default copy constructor
CartesianStencilView (const CartesianStencilView &refer_to_me) = default;
CartesianStencilView (const CartesianStencilAccelerator<vobj,cobj,Parameters> &refer_to_me,ViewMode _mode)
: CartesianStencilAccelerator<vobj,cobj,Parameters>(refer_to_me)
: CartesianStencilAccelerator<vobj,cobj,Parameters>(refer_to_me),
cpu_ptr(this->_entries_p),
mode(_mode)
{
this->ViewOpen(_mode);
}
void ViewOpen(ViewMode _mode)
{
this->mode = _mode;
this->_entries_p =(StencilEntry *)
MemoryManager::ViewOpen(this->_entries_p,
this->_npoints*this->_osites*sizeof(StencilEntry),
mode,
AdviseDefault);
}
void ViewClose(void) { }
void ViewClose(void)
{
MemoryManager::ViewClose(this->cpu_ptr,this->mode);
}
};
@ -254,6 +256,7 @@ protected:
GridBase * _grid;
public:
GridBase *Grid(void) const { return _grid; }
LebesgueOrder *lo;
////////////////////////////////////////////////////////////////////////
// Needed to conveniently communicate gparity parameters into GPU memory
@ -270,11 +273,11 @@ public:
int face_table_computed;
int partialDirichlet;
int fullDirichlet;
std::vector<deviceVector<std::pair<int,int> > > face_table ;
deviceVector<int> surface_list;
std::vector<commVector<std::pair<int,int> > > face_table ;
Vector<int> surface_list;
std::vector<StencilEntry> _entries; // Resident in host memory
deviceVector<StencilEntry> _entries_device; // Resident in device memory
stencilVector<StencilEntry> _entries; // Resident in managed memory
commVector<StencilEntry> _entries_device; // Resident in device memory
std::vector<Packet> Packets;
std::vector<Merge> Mergers;
std::vector<Merge> MergersSHM;
@ -363,20 +366,11 @@ public:
////////////////////////////////////////////////////////////////////////
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{
FlightRecorder::StepLog("Communicate begin");
// All GPU kernel tasks must complete
// accelerator_barrier(); // All kernels should ALREADY be complete
// _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
// But the HaloGather had a barrier too.
for(int i=0;i<Packets.size();i++){
_grid->StencilSendToRecvFromPrepare(MpiReqs,
Packets[i].send_buf,
Packets[i].to_rank,Packets[i].do_send,
Packets[i].recv_buf,
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
}
acceleratorCopySynchronise();
#ifdef ACCELERATOR_AWARE_MPI
for(int i=0;i<Packets.size();i++){
_grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].send_buf,
@ -385,6 +379,23 @@ public:
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
}
#else
#warning "Using COPY VIA HOST BUFFERS IN STENCIL"
for(int i=0;i<Packets.size();i++){
// Introduce a host buffer with a cheap slab allocator and zero cost wipe all
Packets[i].host_send_buf = _grid->HostBufferMalloc(Packets[i].xbytes);
Packets[i].host_recv_buf = _grid->HostBufferMalloc(Packets[i].rbytes);
if ( Packets[i].do_send ) {
acceleratorCopyFromDevice(Packets[i].send_buf, Packets[i].host_send_buf,Packets[i].xbytes);
}
_grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].host_send_buf,
Packets[i].to_rank,Packets[i].do_send,
Packets[i].host_recv_buf,
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
}
#endif
// Get comms started then run checksums
// Having this PRIOR to the dslash seems to make Sunspot work... (!)
for(int i=0;i<Packets.size();i++){
@ -395,18 +406,27 @@ public:
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
{
FlightRecorder::StepLog("Start communicate complete");
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
if ( this->partialDirichlet ) DslashLogPartial();
else if ( this->fullDirichlet ) DslashLogDirichlet();
else DslashLogFull();
// acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete
// acceleratorCopySynchronise() is in the StencilSendToRecvFromComplete
// accelerator_barrier();
_grid->StencilBarrier();
#ifndef ACCELERATOR_AWARE_MPI
#warning "Using COPY VIA HOST BUFFERS IN STENCIL"
for(int i=0;i<Packets.size();i++){
if ( Packets[i].do_recv ) {
acceleratorCopyToDevice(Packets[i].host_recv_buf, Packets[i].recv_buf,Packets[i].rbytes);
}
}
_grid->HostBufferFreeAll();
#endif
// run any checksums
for(int i=0;i<Packets.size();i++){
if ( Packets[i].do_recv )
FlightRecorder::recvLog(Packets[i].recv_buf,Packets[i].rbytes,Packets[i].from_rank);
}
FlightRecorder::StepLog("Finish communicate complete");
}
////////////////////////////////////////////////////////////////////////
// Blocking send and receive. Either sequential or parallel.
@ -496,7 +516,6 @@ public:
HaloGatherDir(source,compress,point,face_idx);
}
accelerator_barrier(); // All my local gathers are complete
// _grid->StencilBarrier();// Synch shared memory on a single nodes
face_table_computed=1;
assert(u_comm_offset==_unified_buffer_size);
}
@ -632,10 +651,10 @@ public:
////////////////////////////////////////
void PrecomputeByteOffsets(void){
for(int i=0;i<_entries.size();i++){
if( this->_entries[i]._is_local ) {
this->_entries[i]._byte_offset = this->_entries[i]._offset*sizeof(vobj);
if( _entries[i]._is_local ) {
_entries[i]._byte_offset = _entries[i]._offset*sizeof(vobj);
} else {
this->_entries[i]._byte_offset = this->_entries[i]._offset*sizeof(cobj);
_entries[i]._byte_offset = _entries[i]._offset*sizeof(cobj);
}
}
};
@ -649,7 +668,7 @@ public:
for(int point=0;point<this->_npoints;point++){
this->same_node[point] = this->SameNode(point);
}
int32_t surface_list_size=0;
for(int site = 0 ;site< vol4;site++){
int local = 1;
for(int point=0;point<this->_npoints;point++){
@ -659,30 +678,11 @@ public:
}
if(local == 0) {
for(int s=0;s<Ls;s++){
surface_list_size++;
surface_list.push_back(site*Ls+s);
}
}
}
std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
surface_list.resize(surface_list_size);
std::vector<int> surface_list_host(surface_list_size);
int32_t ss=0;
for(int site = 0 ;site< vol4;site++){
int local = 1;
for(int point=0;point<this->_npoints;point++){
if( (!this->GetNodeLocal(site*Ls,point)) && (!this->same_node[point]) ){
local = 0;
}
}
if(local == 0) {
for(int s=0;s<Ls;s++){
int idx=site*Ls+s;
surface_list_host[ss]= idx;
ss++;
}
}
}
acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int));
//std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
}
/// Introduce a block structure and switch off comms on boundaries
void DirichletBlock(const Coordinate &dirichlet_block)
@ -770,13 +770,7 @@ public:
this->_osites = _grid->oSites();
_entries.resize(this->_npoints* this->_osites);
_entries_device.resize(this->_npoints* this->_osites);
this->_entries_host_p = &_entries[0];
this->_entries_p = &_entries_device[0];
std::cout << GridLogMessage << " Stencil object allocated for "<<std::dec<<this->_osites
<<" sites table "<<std::hex<<this->_entries_p<< " GridPtr "<<_grid<<std::dec<<std::endl;
this->_entries_p = &_entries[0];
for(int ii=0;ii<npoints;ii++){
int i = ii; // reverse direction to get SIMD comms done first
@ -853,7 +847,6 @@ public:
u_simd_send_buf[l] = (cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
}
PrecomputeByteOffsets();
acceleratorCopyToDevice(&this->_entries[0],&this->_entries_device[0],this->_entries.size()*sizeof(StencilEntry));
}
void Local (int point, int dimension,int shiftpm,int cbmask)
@ -1009,10 +1002,10 @@ public:
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
int idx=point+(lo+o+b)*this->_npoints;
this->_entries[idx]._offset =ro+o+b;
this->_entries[idx]._permute=permute;
this->_entries[idx]._is_local=1;
this->_entries[idx]._around_the_world=wrap;
_entries[idx]._offset =ro+o+b;
_entries[idx]._permute=permute;
_entries[idx]._is_local=1;
_entries[idx]._around_the_world=wrap;
}
o +=_grid->_slice_stride[dimension];
}
@ -1030,10 +1023,10 @@ public:
if ( ocb&cbmask ) {
int idx = point+(lo+o+b)*this->_npoints;
this->_entries[idx]._offset =ro+o+b;
this->_entries[idx]._is_local=1;
this->_entries[idx]._permute=permute;
this->_entries[idx]._around_the_world=wrap;
_entries[idx]._offset =ro+o+b;
_entries[idx]._is_local=1;
_entries[idx]._permute=permute;
_entries[idx]._around_the_world=wrap;
}
}
@ -1057,10 +1050,10 @@ public:
for(int n=0;n<_grid->_slice_nblock[dimension];n++){
for(int b=0;b<_grid->_slice_block[dimension];b++){
int idx=point+(so+o+b)*this->_npoints;
this->_entries[idx]._offset =offset+(bo++);
this->_entries[idx]._is_local=0;
this->_entries[idx]._permute=0;
this->_entries[idx]._around_the_world=wrap;
_entries[idx]._offset =offset+(bo++);
_entries[idx]._is_local=0;
_entries[idx]._permute=0;
_entries[idx]._around_the_world=wrap;
}
o +=_grid->_slice_stride[dimension];
}
@ -1077,10 +1070,10 @@ public:
int ocb=1<<_grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
int idx = point+(so+o+b)*this->_npoints;
this->_entries[idx]._offset =offset+(bo++);
this->_entries[idx]._is_local=0;
this->_entries[idx]._permute =0;
this->_entries[idx]._around_the_world=wrap;
_entries[idx]._offset =offset+(bo++);
_entries[idx]._is_local=0;
_entries[idx]._permute =0;
_entries[idx]._around_the_world=wrap;
}
}
o +=_grid->_slice_stride[dimension];

View File

@ -202,15 +202,15 @@ void acceleratorInit(void)
#ifdef GRID_SYCL
sycl::queue *theGridAccelerator;
sycl::queue *theCopyAccelerator;
cl::sycl::queue *theGridAccelerator;
cl::sycl::queue *theCopyAccelerator;
void acceleratorInit(void)
{
int nDevices = 1;
// sycl::gpu_selector selector;
// sycl::device selectedDevice { selector };
theGridAccelerator = new sycl::queue (sycl::gpu_selector_v);
theCopyAccelerator = new sycl::queue (sycl::gpu_selector_v);
cl::sycl::gpu_selector selector;
cl::sycl::device selectedDevice { selector };
theGridAccelerator = new sycl::queue (selectedDevice);
theCopyAccelerator = new sycl::queue (selectedDevice);
// theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway.
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
@ -242,14 +242,14 @@ void acceleratorInit(void)
gethostname(hostname, HOST_NAME_MAX+1);
if ( rank==0 ) printf(" acceleratorInit world_rank %d is host %s \n",world_rank,hostname);
auto devices = sycl::device::get_devices();
auto devices = cl::sycl::device::get_devices();
for(int d = 0;d<devices.size();d++){
#define GPU_PROP_STR(prop) \
printf("AcceleratorSyclInit: " #prop ": %s \n",devices[d].get_info<sycl::info::device::prop>().c_str());
printf("AcceleratorSyclInit: " #prop ": %s \n",devices[d].get_info<cl::sycl::info::device::prop>().c_str());
#define GPU_PROP_FMT(prop,FMT) \
printf("AcceleratorSyclInit: " #prop ": " FMT" \n",devices[d].get_info<sycl::info::device::prop>());
printf("AcceleratorSyclInit: " #prop ": " FMT" \n",devices[d].get_info<cl::sycl::info::device::prop>());
#define GPU_PROP(prop) GPU_PROP_FMT(prop,"%ld");
if ( world_rank == 0) {

View File

@ -132,17 +132,27 @@ inline void cuda_mem(void)
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
{ \
if ( num1*num2 ) { \
int nt=acceleratorThreads(); \
typedef uint64_t Iterator; \
auto lambda = [=] accelerator \
(Iterator iter1,Iterator iter2,Iterator lane) mutable { \
__VA_ARGS__; \
}; \
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
} \
int nt=acceleratorThreads(); \
typedef uint64_t Iterator; \
auto lambda = [=] accelerator \
(Iterator iter1,Iterator iter2,Iterator lane) mutable { \
__VA_ARGS__; \
}; \
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
}
#define prof_accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
{ \
int nt=acceleratorThreads(); \
typedef uint64_t Iterator; \
auto lambda = [=] accelerator \
(Iterator iter1,Iterator iter2,Iterator lane) mutable { \
__VA_ARGS__; \
}; \
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
ProfileLambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
}
#define accelerator_for6dNB(iter1, num1, \
@ -165,6 +175,19 @@ inline void cuda_mem(void)
}
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
{ \
int nt=acceleratorThreads(); \
typedef uint64_t Iterator; \
auto lambda = [=] accelerator \
(Iterator iter1,Iterator iter2,Iterator lane) mutable { \
__VA_ARGS__; \
}; \
dim3 cu_threads(nsimd,acceleratorThreads(),1); \
dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
}
template<typename lambda> __global__
void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
{
@ -176,6 +199,17 @@ void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
Lambda(x,y,z);
}
}
template<typename lambda> __global__
void ProfileLambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
{
// Weird permute is to make lane coalesce for large blocks
uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
uint64_t z = threadIdx.x;
if ( (x < num1) && (y<num2) && (z<num3) ) {
Lambda(x,y,z);
}
}
template<typename lambda> __global__
void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
@ -209,17 +243,6 @@ void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
} \
}
inline void *acceleratorAllocHost(size_t bytes)
{
void *ptr=NULL;
auto err = cudaMallocHost((void **)&ptr,bytes);
if( err != cudaSuccess ) {
ptr = (void *) NULL;
printf(" cudaMallocHost failed for %d %s \n",bytes,cudaGetErrorString(err));
assert(0);
}
return ptr;
}
inline void *acceleratorAllocShared(size_t bytes)
{
void *ptr=NULL;
@ -241,10 +264,8 @@ inline void *acceleratorAllocDevice(size_t bytes)
}
return ptr;
};
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeHost(void *ptr){ cudaFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
inline void acceleratorCopyToDeviceAsync(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) { cudaMemcpyAsync(to,from,bytes, cudaMemcpyHostToDevice, stream);}
@ -281,7 +302,7 @@ NAMESPACE_END(Grid);
// Force deterministic reductions
#define SYCL_REDUCTION_DETERMINISTIC
#include <sycl/sycl.hpp>
#include <sycl/CL/sycl.hpp>
#include <sycl/usm.hpp>
#include <level_zero/ze_api.h>
#include <sycl/ext/oneapi/backend/level_zero.hpp>
@ -293,8 +314,8 @@ inline void acceleratorMem(void)
std::cout <<" SYCL acceleratorMem not implemented"<<std::endl;
}
extern sycl::queue *theGridAccelerator;
extern sycl::queue *theCopyAccelerator;
extern cl::sycl::queue *theGridAccelerator;
extern cl::sycl::queue *theCopyAccelerator;
#ifdef __SYCL_DEVICE_ONLY__
#define GRID_SIMT
@ -305,24 +326,24 @@ extern sycl::queue *theCopyAccelerator;
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#ifdef GRID_SIMT
return __spirv::initLocalInvocationId<3, sycl::id<3>>()[2];
return __spirv::initLocalInvocationId<3, cl::sycl::id<3>>()[2];
#else
return 0;
#endif
} // SYCL specific
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
theGridAccelerator->submit([&](sycl::handler &cgh) { \
theGridAccelerator->submit([&](cl::sycl::handler &cgh) { \
unsigned long nt=acceleratorThreads(); \
if(nt < 8)nt=8; \
unsigned long unum1 = num1; \
unsigned long unum2 = num2; \
unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt; \
sycl::range<3> local {nt,1,nsimd}; \
sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd}; \
cl::sycl::range<3> local {nt,1,nsimd}; \
cl::sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd}; \
cgh.parallel_for( \
sycl::nd_range<3>(global,local), \
[=] (sycl::nd_item<3> item) /*mutable*/ \
cl::sycl::nd_range<3>(global,local), \
[=] (cl::sycl::nd_item<3> item) /*mutable*/ \
[[intel::reqd_sub_group_size(16)]] \
{ \
auto iter1 = item.get_global_id(0); \
@ -335,17 +356,12 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
#define accelerator_barrier(dummy) { theGridAccelerator->wait(); }
inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
inline void *acceleratorAllocHost(size_t bytes) { return malloc_host(bytes,*theGridAccelerator);};
inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
inline void acceleratorFreeHost(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); }
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);}
inline void acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); }
inline void acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); }
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}
@ -353,8 +369,8 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccele
inline int acceleratorIsCommunicable(void *ptr)
{
#if 0
auto uvm = sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context());
if ( uvm = sycl::usm::alloc::shared ) return 1;
auto uvm = cl::sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context());
if ( uvm = cl::sycl::usm::alloc::shared ) return 1;
else return 0;
#endif
return 1;
@ -456,16 +472,6 @@ void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
} \
}
inline void *acceleratorAllocHost(size_t bytes)
{
void *ptr=NULL;
auto err = hipMallocHost((void **)&ptr,bytes);
if( err != hipSuccess ) {
ptr = (void *) NULL;
fprintf(stderr," hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
}
return ptr;
};
inline void *acceleratorAllocShared(size_t bytes)
{
void *ptr=NULL;
@ -489,12 +495,12 @@ inline void *acceleratorAllocDevice(size_t bytes)
return ptr;
};
inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
//inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { hipMemcpy(to,from,bytes, hipMemcpyDeviceToDevice);}
//inline void acceleratorCopySynchronise(void) { }
inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
@ -511,19 +517,15 @@ inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize
#endif
inline void acceleratorPin(void *ptr,unsigned long bytes)
{
#ifdef GRID_SYCL
sycl::ext::oneapi::experimental::prepare_for_device_copy(ptr,bytes,theCopyAccelerator->get_context());
#endif
}
//////////////////////////////////////////////
// Common on all GPU targets
//////////////////////////////////////////////
#if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP)
// FIXME -- the non-blocking nature got broken March 30 2023 by PAB
#define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );
#define prof_accelerator_for( iter1, num1, nsimd, ... ) \
prof_accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );\
accelerator_barrier(dummy);
#define accelerator_for( iter, num, nsimd, ... ) \
accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } ); \
@ -572,10 +574,8 @@ inline void acceleratorCopySynchronise(void) {};
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);}
#ifdef HAVE_MM_MALLOC_H
inline void *acceleratorAllocHost(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
inline void acceleratorFreeHost(void *ptr){_mm_free(ptr);};
inline void acceleratorFreeShared(void *ptr){_mm_free(ptr);};
inline void acceleratorFreeDevice(void *ptr){_mm_free(ptr);};
#else

View File

@ -39,8 +39,6 @@ int FlightRecorder::ContinueOnFail;
int FlightRecorder::LoggingMode;
int FlightRecorder::ChecksumComms;
int FlightRecorder::ChecksumCommsSend;
const char * FlightRecorder::StepName;
int32_t FlightRecorder::StepLoggingCounter;
int32_t FlightRecorder::XmitLoggingCounter;
int32_t FlightRecorder::RecvLoggingCounter;
int32_t FlightRecorder::CsumLoggingCounter;
@ -60,8 +58,6 @@ void FlightRecorder::ResetCounters(void)
CsumLoggingCounter=0;
NormLoggingCounter=0;
ReductionLoggingCounter=0;
StepName = "No steps started";
StepLoggingCounter=0;
}
void FlightRecorder::Truncate(void)
{
@ -92,12 +88,6 @@ void FlightRecorder::SetLoggingMode(FlightRecorder::LoggingMode_t mode)
assert(0);
}
}
bool FlightRecorder::StepLog(const char *name)
{
StepName = name;
StepLoggingCounter ++;
return true;
}
void FlightRecorder::SetLoggingModePrint(void)
{
@ -121,19 +111,17 @@ uint64_t FlightRecorder::ErrorCount(void)
{
return ErrorCounter;
}
bool FlightRecorder::NormLog(double value)
void FlightRecorder::NormLog(double value)
{
uint64_t hex = * ( (uint64_t *)&value );
if(LoggingMode == LoggingModePrint) {
std::cerr<<"FlightRecorder::NormLog : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
NormLoggingCounter++;
return true;
}
if(LoggingMode == LoggingModeRecord) {
std::cerr<<"FlightRecorder::NormLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
NormLogVector.push_back(value);
NormLoggingCounter++;
return true;
}
if(LoggingMode == LoggingModeVerify) {
@ -142,9 +130,6 @@ bool FlightRecorder::NormLog(double value)
if ( (value != NormLogVector[NormLoggingCounter]) || std::isnan(value) ) {
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
FlightRecorder::StepLoggingCounter,
FlightRecorder::StepName);
std::cerr<<"FlightRecorder::NormLog Oops, I did it again "<< NormLoggingCounter
<<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<" "
<<std::hexfloat<<value<<" "<< NormLogVector[NormLoggingCounter]<<std::endl;
@ -157,9 +142,7 @@ bool FlightRecorder::NormLog(double value)
NormLoggingCounter,NormLogVector.size(),
value, NormLogVector[NormLoggingCounter]); fflush(stderr);
BACKTRACEFP(stderr);
if(!ContinueOnFail) return false;
if(!ContinueOnFail)assert(0); // Force takedown of job
ErrorCounter++;
} else {
@ -176,21 +159,18 @@ bool FlightRecorder::NormLog(double value)
}
NormLoggingCounter++;
}
return true;
}
bool FlightRecorder::CsumLog(uint64_t hex)
void FlightRecorder::CsumLog(uint64_t hex)
{
if(LoggingMode == LoggingModePrint) {
std::cerr<<"FlightRecorder::CsumLog : "<< CsumLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
CsumLoggingCounter++;
return true;
}
if(LoggingMode == LoggingModeRecord) {
std::cerr<<"FlightRecorder::CsumLog RECORDING : "<< NormLoggingCounter <<" "<<std::hex<< hex<<std::dec <<std::endl;
CsumLogVector.push_back(hex);
CsumLoggingCounter++;
return true;
}
if(LoggingMode == LoggingModeVerify) {
@ -201,9 +181,6 @@ bool FlightRecorder::CsumLog(uint64_t hex)
if ( hex != hexref ) {
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
FlightRecorder::StepLoggingCounter,
FlightRecorder::StepName);
std::cerr<<"FlightRecorder::CsumLog Oops, I did it again "<< CsumLoggingCounter
<<std::hex<<" "<<hex<<" "<<hexref<<std::dec<<std::endl;
@ -211,10 +188,9 @@ bool FlightRecorder::CsumLog(uint64_t hex)
GridHostname(),
GlobalSharedMemory::WorldShmRank,
CsumLoggingCounter,hex, hexref);
BACKTRACEFP(stderr);
fflush(stderr);
if(!ContinueOnFail) return false;
if(!ContinueOnFail) assert(0); // Force takedown of job
ErrorCounter++;
@ -231,9 +207,7 @@ bool FlightRecorder::CsumLog(uint64_t hex)
}
CsumLoggingCounter++;
}
return true;
}
void FlightRecorder::ReductionLog(double local,double global)
{
uint64_t hex_l = * ( (uint64_t *)&local );
@ -250,15 +224,11 @@ void FlightRecorder::ReductionLog(double local,double global)
if(LoggingMode == LoggingModeVerify) {
if(ReductionLoggingCounter < ReductionLogVector.size()){
if ( global != ReductionLogVector[ReductionLoggingCounter] ) {
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
FlightRecorder::StepLoggingCounter,
FlightRecorder::StepName);
fprintf(stderr,"%s:%d Oops, MPI_Allreduce did it again! Reproduce failure for norm %d/%zu glb %.16e lcl %.16e expect glb %.16e\n",
GridHostname(),
GlobalSharedMemory::WorldShmRank,
ReductionLoggingCounter,ReductionLogVector.size(),
global, local, ReductionLogVector[ReductionLoggingCounter]); fflush(stderr);
BACKTRACEFP(stderr);
if ( !ContinueOnFail ) assert(0);
@ -280,11 +250,10 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
if(LoggingMode == LoggingModeNone) return;
if ( ChecksumCommsSend ){
if(LoggingMode == LoggingModeNone) return;
uint64_t *ubuf = (uint64_t *)buf;
if(LoggingMode == LoggingModeNone) return;
#ifdef GRID_SYCL
uint64_t *ubuf = (uint64_t *)buf;
uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t));
if(LoggingMode == LoggingModePrint) {
std::cerr<<"FlightRecorder::xmitLog : "<< XmitLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
@ -298,15 +267,11 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
if(LoggingMode == LoggingModeVerify) {
if(XmitLoggingCounter < XmitLogVector.size()){
if ( _xor != XmitLogVector[XmitLoggingCounter] ) {
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
FlightRecorder::StepLoggingCounter,
FlightRecorder::StepName);
fprintf(stderr,"%s:%d Oops, send buf difference! Reproduce failure for xmit %d/%zu %lx expect glb %lx\n",
GridHostname(),
GlobalSharedMemory::WorldShmRank,
XmitLoggingCounter,XmitLogVector.size(),
_xor, XmitLogVector[XmitLoggingCounter]); fflush(stderr);
BACKTRACEFP(stderr);
if ( !ContinueOnFail ) assert(0);
@ -328,9 +293,9 @@ void FlightRecorder::xmitLog(void *buf,uint64_t bytes)
void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
{
if ( ChecksumComms ){
uint64_t *ubuf = (uint64_t *)buf;
if(LoggingMode == LoggingModeNone) return;
#ifdef GRID_SYCL
uint64_t *ubuf = (uint64_t *)buf;
uint64_t _xor = svm_xor(ubuf,bytes/sizeof(uint64_t));
if(LoggingMode == LoggingModePrint) {
std::cerr<<"FlightRecorder::recvLog : "<< RecvLoggingCounter <<" "<< std::hex << _xor <<std::dec <<std::endl;
@ -344,15 +309,11 @@ void FlightRecorder::recvLog(void *buf,uint64_t bytes,int rank)
if(LoggingMode == LoggingModeVerify) {
if(RecvLoggingCounter < RecvLogVector.size()){
if ( _xor != RecvLogVector[RecvLoggingCounter] ) {
fprintf(stderr,"FlightRecorder Oops step %d stage %s \n",
FlightRecorder::StepLoggingCounter,
FlightRecorder::StepName);
fprintf(stderr,"%s:%d Oops, recv buf difference! Reproduce failure for recv %d/%zu %lx expect glb %lx from MPI rank %d\n",
GridHostname(),
GlobalSharedMemory::WorldShmRank,
RecvLoggingCounter,RecvLogVector.size(),
_xor, RecvLogVector[RecvLoggingCounter],rank); fflush(stderr);
BACKTRACEFP(stderr);
if ( !ContinueOnFail ) assert(0);

View File

@ -12,8 +12,6 @@ class FlightRecorder {
static int LoggingMode;
static uint64_t ErrorCounter;
static const char * StepName;
static int32_t StepLoggingCounter;
static int32_t XmitLoggingCounter;
static int32_t RecvLoggingCounter;
static int32_t CsumLoggingCounter;
@ -32,9 +30,8 @@ class FlightRecorder {
static void SetLoggingModeRecord(void);
static void SetLoggingModeVerify(void);
static void SetLoggingMode(LoggingMode_t mode);
static bool StepLog(const char *name);
static bool NormLog(double value);
static bool CsumLog(uint64_t csum);
static void NormLog(double value);
static void CsumLog(uint64_t csum);
static void ReductionLog(double lcl, double glbl);
static void Truncate(void);
static void ResetCounters(void);

View File

@ -464,12 +464,16 @@ void Grid_init(int *argc,char ***argv)
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<"Performance:"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<" --comms-concurrent : Asynchronous MPI calls; several dirs at a time "<<std::endl;
std::cout<<GridLogMessage<<" --comms-sequential : Synchronous MPI calls; one dirs at a time "<<std::endl;
std::cout<<GridLogMessage<<" --comms-overlap : Overlap comms with compute "<<std::endl;
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<" --dslash-generic: Wilson kernel for generic Nc"<<std::endl;
std::cout<<GridLogMessage<<" --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;
std::cout<<GridLogMessage<<" --dslash-asm : Wilson kernel for AVX512"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
std::cout<<GridLogMessage<<" --lebesgue : Cache oblivious Lebesgue curve/Morton order/Z-graph stencil looping"<<std::endl;
std::cout<<GridLogMessage<<" --cacheblocking n.m.o.p : Hypercuboidal cache blocking"<<std::endl;
std::cout<<GridLogMessage<<std::endl;
exit(EXIT_SUCCESS);
}
@ -497,8 +501,28 @@ void Grid_init(int *argc,char ***argv)
WilsonKernelsStatic::Comms = WilsonKernelsStatic::CommsThenCompute;
StaggeredKernelsStatic::Comms = StaggeredKernelsStatic::CommsThenCompute;
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-concurrent") ){
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicyConcurrent);
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-sequential") ){
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
LebesgueOrder::UseLebesgueOrder=1;
}
CartesianCommunicator::nCommThreads = 1;
#ifdef GRID_COMMS_THREADS
if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads");
GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads);
assert(CartesianCommunicator::nCommThreads > 0);
}
#endif
if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
GridCmdOptionIntVector(arg,LebesgueOrder::Block);
}
if( GridCmdOptionExists(*argv,*argv+*argc,"--notimestamp") ){
GridLogTimestamp(0);
} else {
@ -549,34 +573,8 @@ void GridLogLayout() {
void * Grid_backtrace_buffer[_NBACKTRACE];
void Grid_usr_signal_handler(int sig,siginfo_t *si,void * ptr)
{
fprintf(stderr,"Signal handler on host %s\n",hostname);
fprintf(stderr,"FlightRecorder step %d stage %s \n",
FlightRecorder::StepLoggingCounter,
FlightRecorder::StepName);
fprintf(stderr,"Caught signal %d\n",si->si_signo);
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
fprintf(stderr," code %d\n",si->si_code);
// x86 64bit
#ifdef __linux__
#ifdef __x86_64__
ucontext_t * uc= (ucontext_t *)ptr;
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
#endif
#endif
fflush(stderr);
BACKTRACEFP(stderr);
fprintf(stderr,"Called backtrace\n");
fflush(stdout);
fflush(stderr);
return;
}
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
{
fprintf(stderr,"Signal handler on host %s\n",hostname);
fprintf(stderr,"Caught signal %d\n",si->si_signo);
fprintf(stderr," mem address %llx\n",(unsigned long long)si->si_addr);
fprintf(stderr," code %d\n",si->si_code);
@ -587,7 +585,7 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
ucontext_t * uc= (ucontext_t *)ptr;
struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
fprintf(stderr," instruction %llx\n",(unsigned long long)sc->rip);
#define REG(A) fprintf(stderr," %s %lx\n",#A,sc-> A);
#define REG(A) printf(" %s %lx\n",#A,sc-> A);
REG(rdi);
REG(rsi);
REG(rbp);
@ -620,8 +618,8 @@ void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
void Grid_exit_handler(void)
{
// BACKTRACEFP(stdout);
// fflush(stdout);
BACKTRACEFP(stdout);
fflush(stdout);
}
void Grid_debug_handler_init(void)
{
@ -629,10 +627,10 @@ void Grid_debug_handler_init(void)
sigemptyset (&sa.sa_mask);
sa.sa_sigaction= Grid_sa_signal_handler;
sa.sa_flags = SA_SIGINFO;
// sigaction(SIGSEGV,&sa,NULL);
sigaction(SIGSEGV,&sa,NULL);
sigaction(SIGTRAP,&sa,NULL);
sigaction(SIGBUS,&sa,NULL);
// sigaction(SIGUSR2,&sa,NULL);
sigaction(SIGUSR2,&sa,NULL);
feenableexcept( FE_INVALID|FE_OVERFLOW|FE_DIVBYZERO);
@ -640,14 +638,7 @@ void Grid_debug_handler_init(void)
sigaction(SIGKILL,&sa,NULL);
sigaction(SIGILL,&sa,NULL);
// Non terminating SIGUSR1/2 handler
struct sigaction sa_ping;
sigemptyset (&sa_ping.sa_mask);
sa_ping.sa_sigaction= Grid_usr_signal_handler;
sa_ping.sa_flags = SA_SIGINFO;
sigaction(SIGHUP,&sa_ping,NULL);
// atexit(Grid_exit_handler);
atexit(Grid_exit_handler);
}
NAMESPACE_END(Grid);

View File

@ -1,5 +1,5 @@
# additional include paths necessary to compile the C++ library
SUBDIRS = Grid benchmarks tests examples HMC
SUBDIRS = Grid HMC benchmarks tests examples
include $(top_srcdir)/doxygen.inc

View File

@ -644,6 +644,11 @@ int main (int argc, char ** argv)
Grid_init(&argc,&argv);
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
#ifdef KNL
LebesgueOrder::Block = std::vector<int>({8,2,2,2});
#else
LebesgueOrder::Block = std::vector<int>({2,2,2,2});
#endif
Benchmark::Decomposition();
int do_su4=1;

View File

@ -52,7 +52,7 @@ int main (int argc, char ** argv)
int threads = GridThread::GetThreads();
int Ls=8;
int Ls=16;
for(int i=0;i<argc;i++) {
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;

View File

@ -70,7 +70,7 @@ int main (int argc, char ** argv)
pRNG.SeedFixedIntegers(std::vector<int>({56,17,89,101}));
std::vector<double> stop(threads);
std::vector<Vec> sum(threads);
Vector<Vec> sum(threads);
std::vector<LatticeVec> x(threads,&Grid);
for(int t=0;t<threads;t++){

View File

@ -78,9 +78,9 @@ int main (int argc, char ** argv)
double t0,t1;
typedef typename DomainWallFermionD::Coeff_t Coeff_t;
std::vector<Coeff_t> diag = Dw.bs;
std::vector<Coeff_t> upper= Dw.cs;
std::vector<Coeff_t> lower= Dw.cs;
Vector<Coeff_t> diag = Dw.bs;
Vector<Coeff_t> upper= Dw.cs;
Vector<Coeff_t> lower= Dw.cs;
upper[Ls-1]=-Dw.mass_minus*upper[Ls-1];
lower[0] =-Dw.mass_plus*lower[0];

View File

@ -118,7 +118,7 @@ public:
fprintf(FP,"Packet bytes, direction, GB/s per node\n");
for(int lat=16;lat<=maxlat;lat+=8){
// for(int Ls=8;Ls<=8;Ls*=2){
{ int Ls=8;
{ int Ls=12;
Coordinate latt_size ({lat*mpi_layout[0],
lat*mpi_layout[1],
@ -175,8 +175,8 @@ public:
timestat.statistics(t_time);
dbytes=dbytes*ppn;
double xbytes = dbytes*0.5;
double bidibytes = dbytes;
double xbytes = dbytes;
double bidibytes = dbytes*2.0;
std::cout<<GridLogMessage << lat<<"\t"<<Ls<<"\t "
<< bytes << " \t "
@ -861,7 +861,7 @@ int main (int argc, char ** argv)
}
CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
// LebesgueOrder::Block = std::vector<int>({2,2,2,2});
LebesgueOrder::Block = std::vector<int>({2,2,2,2});
Benchmark::Decomposition();
@ -872,7 +872,7 @@ int main (int argc, char ** argv)
int do_dslash=1;
int sel=4;
std::vector<int> L_list({8,12,16,24});
std::vector<int> L_list({8,12,16,24,32});
int selm1=sel-1;
std::vector<double> clover;

View File

@ -72,7 +72,6 @@ AC_CHECK_HEADERS(malloc/malloc.h)
AC_CHECK_HEADERS(malloc.h)
AC_CHECK_HEADERS(endian.h)
AC_CHECK_HEADERS(execinfo.h)
AC_CHECK_HEADERS(numaif.h)
AC_CHECK_DECLS([ntohll],[], [], [[#include <arpa/inet.h>]])
AC_CHECK_DECLS([be64toh],[], [], [[#include <arpa/inet.h>]])
@ -129,20 +128,6 @@ case ${ac_LAPACK} in
AC_DEFINE([USE_LAPACK],[1],[use LAPACK]);;
esac
############### internal reduction
AC_ARG_ENABLE([reduction],
[AS_HELP_STRING([--enable-reduction=mpi|grid],[enable reduction])],
[ac_REDUCTION=${enable_reduction}], [ac_REDUCTION=grid])
case ${ac_REDUCTION} in
mpi)
;;
grid)
AC_DEFINE([USE_GRID_REDUCTION],[1],[use GRID REDUCTION]);;
*)
AC_DEFINE([USE_GRID_REDUCTION],[1],[use GRID REDUCTION]);;
esac
############### tracing
AC_ARG_ENABLE([tracing],
[AS_HELP_STRING([--enable-tracing=none|nvtx|roctx|timer],[enable tracing])],
@ -240,21 +225,19 @@ case ${ac_SFW_FP16} in
AC_MSG_ERROR(["SFW FP16 option not supported ${ac_SFW_FP16}"]);;
esac
############### MPI BOUNCE TO HOST
############### Default to accelerator cshift, but revert to host if UCX is buggy or other reasons
AC_ARG_ENABLE([accelerator-aware-mpi],
[AS_HELP_STRING([--enable-accelerator-aware-mpi=yes|no],[run mpi transfers from device])],
[ac_ACCELERATOR_AWARE_MPI=${enable_accelerator_aware_mpi}], [ac_ACCELERATOR_AWARE_MPI=yes])
# Force accelerator CSHIFT now
AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on device])
case ${ac_ACCELERATOR_AWARE_MPI} in
yes)
AC_DEFINE([ACCELERATOR_CSHIFT],[1],[ Cshift runs on host])
AC_DEFINE([ACCELERATOR_AWARE_MPI],[1],[ Stencil can use device pointers]);;
*);;
esac
############### SYCL/CUDA/HIP/none
AC_ARG_ENABLE([accelerator],
[AS_HELP_STRING([--enable-accelerator=cuda|sycl|hip|none],[enable none,cuda,sycl,hip acceleration])],
@ -681,6 +664,16 @@ case ${ac_SHM_FAST_PATH} in
*) ;;
esac
############### communication type selection
AC_ARG_ENABLE([comms-threads],[AS_HELP_STRING([--enable-comms-threads | --disable-comms-threads],[Use multiple threads in MPI calls])],[ac_COMMS_THREADS=${enable_comms_threads}],[ac_COMMS_THREADS=yes])
case ${ac_COMMS_THREADS} in
yes)
AC_DEFINE([GRID_COMMS_THREADING],[1],[GRID_COMMS_NONE] )
;;
*) ;;
esac
############### communication type selection
AC_ARG_ENABLE([comms],[AS_HELP_STRING([--enable-comms=none|mpi|mpi-auto],[Select communications])],[ac_COMMS=${enable_comms}],[ac_COMMS=none])

383
examples/Example_taku.cc Normal file
View File

@ -0,0 +1,383 @@
/*
* Warning: This code illustrative only: not well tested, and not meant for production use
* without regression / tests being applied
*/
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
RealD LLscale =1.0;
RealD LCscale =1.0;
template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field>
{
public:
INHERIT_GIMPL_TYPES(Gimpl);
GridBase *grid;
GaugeField U;
CovariantLaplacianCshift(GaugeField &_U) :
grid(_U.Grid()),
U(_U) { };
virtual GridBase *Grid(void) { return grid; };
virtual void M (const Field &in, Field &out)
{
out=Zero();
for(int mu=0;mu<Nd-1;mu++) {
GaugeLinkField Umu = PeekIndex<LorentzIndex>(U, mu); // NB: Inefficent
out = out - Gimpl::CovShiftForward(Umu,mu,in);
out = out - Gimpl::CovShiftBackward(Umu,mu,in);
out = out + 2.0*in;
}
};
virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian
virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid
virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid
virtual void MdirAll (const Field &in, std::vector<Field> &out) {assert(0);}; // Unimplemented need only for multigrid
};
void MakePhase(Coordinate mom,LatticeComplex &phase)
{
GridBase *grid = phase.Grid();
auto latt_size = grid->GlobalDimensions();
ComplexD ci(0.0,1.0);
phase=Zero();
LatticeComplex coor(phase.Grid());
for(int mu=0;mu<Nd;mu++){
RealD TwoPiL = M_PI * 2.0/ latt_size[mu];
LatticeCoordinate(coor,mu);
phase = phase + (TwoPiL * mom[mu]) * coor;
}
phase = exp(phase*ci);
}
void PointSource(Coordinate &coor,LatticePropagator &source)
{
// Coordinate coor({0,0,0,0});
source=Zero();
SpinColourMatrix kronecker; kronecker=1.0;
pokeSite(kronecker,source,coor);
}
void Z2WallSource(GridParallelRNG &RNG,int tslice,LatticePropagator &source)
{
GridBase *grid = source.Grid();
LatticeComplex noise(grid);
LatticeComplex zz(grid); zz=Zero();
LatticeInteger t(grid);
RealD nrm=1.0/sqrt(2);
bernoulli(RNG, noise); // 0,1 50:50
noise = (2.*noise - Complex(1,1))*nrm;
LatticeCoordinate(t,Tdir);
noise = where(t==Integer(tslice), noise, zz);
source = 1.0;
source = source*noise;
std::cout << " Z2 wall " << norm2(source) << std::endl;
}
template<class Field>
void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared)
{
typedef CovariantLaplacianCshift <PeriodicGimplR,Field> Laplacian_t;
Laplacian_t Laplacian(U);
Integer Iterations = 40;
Real width = 2.0;
Real coeff = (width*width) / Real(4*Iterations);
Field tmp(U.Grid());
smeared=unsmeared;
// chi = (1-p^2/2N)^N kronecker
for(int n = 0; n < Iterations; ++n) {
Laplacian.M(smeared,tmp);
smeared = smeared - coeff*tmp;
std::cout << " smear iter " << n<<" " <<norm2(smeared)<<std::endl;
}
}
void GaussianSource(Coordinate &site,LatticeGaugeField &U,LatticePropagator &source)
{
LatticePropagator tmp(source.Grid());
PointSource(site,source);
std::cout << " GaussianSource Kronecker "<< norm2(source)<<std::endl;
tmp = source;
GaussianSmear(U,tmp,source);
std::cout << " GaussianSource Smeared "<< norm2(source)<<std::endl;
}
void GaussianWallSource(GridParallelRNG &RNG,int tslice,LatticeGaugeField &U,LatticePropagator &source)
{
Z2WallSource(RNG,tslice,source);
auto tmp = source;
GaussianSmear(U,tmp,source);
}
void SequentialSource(int tslice,Coordinate &mom,LatticePropagator &spectator,LatticePropagator &source)
{
assert(mom.size()==Nd);
assert(mom[Tdir] == 0);
GridBase * grid = spectator.Grid();
LatticeInteger ts(grid);
LatticeCoordinate(ts,Tdir);
source = Zero();
source = where(ts==Integer(tslice),spectator,source); // Stick in a slice of the spectator, zero everywhere else
LatticeComplex phase(grid);
MakePhase(mom,phase);
source = source *phase;
}
template<class Action>
void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator)
{
GridBase *UGrid = D.GaugeGrid();
GridBase *FGrid = D.FermionGrid();
LatticeFermion src4 (UGrid);
LatticeFermion src5 (FGrid);
LatticeFermion result5(FGrid);
LatticeFermion result4(UGrid);
LatticePropagator prop5(FGrid);
ConjugateGradient<LatticeFermion> CG(1.0e-8,100000);
SchurRedBlackDiagMooeeSolve<LatticeFermion> schur(CG);
ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if have eigenvectors
for(int s=0;s<Nd;s++){
for(int c=0;c<Nc;c++){
PropToFerm<Action>(src4,source,s,c);
D.ImportPhysicalFermionSource(src4,src5);
result5=Zero();
schur(D,src5,result5,ZG);
std::cout<<GridLogMessage
<<"spin "<<s<<" color "<<c
<<" norm2(src5d) " <<norm2(src5)
<<" norm2(result5d) "<<norm2(result5)<<std::endl;
D.ExportPhysicalFermionSolution(result5,result4);
FermToProp<Action>(prop5,result5,s,c);
FermToProp<Action>(propagator,result4,s,c);
}
}
LatticePropagator Axial_mu(UGrid);
LatticePropagator Vector_mu(UGrid);
LatticeComplex PA (UGrid);
LatticeComplex VV (UGrid);
LatticeComplex PJ5q(UGrid);
LatticeComplex PP (UGrid);
std::vector<TComplex> sumPA;
std::vector<TComplex> sumVV;
std::vector<TComplex> sumPP;
std::vector<TComplex> sumPJ5q;
Gamma g5(Gamma::Algebra::Gamma5);
D.ContractConservedCurrent(prop5,prop5,Axial_mu,source,Current::Axial,Tdir);
PA = trace(g5*Axial_mu); // Pseudoscalar-Axial conserved current
sliceSum(PA,sumPA,Tdir);
int Nt{static_cast<int>(sumPA.size())};
for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PAc["<<t<<"] "<<real(TensorRemove(sumPA[t]))*LCscale<<std::endl;
PP = trace(adj(propagator)*propagator); // Pseudoscalar density
sliceSum(PP,sumPP,Tdir);
for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PP["<<t<<"] "<<real(TensorRemove(sumPP[t]))*LCscale<<std::endl;
D.ContractJ5q(prop5,PJ5q);
sliceSum(PJ5q,sumPJ5q,Tdir);
for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"PJ5q["<<t<<"] "<<real(TensorRemove(sumPJ5q[t]))<<std::endl;
Gamma::Algebra GammaV[3] = {
Gamma::Algebra::GammaX,
Gamma::Algebra::GammaY,
Gamma::Algebra::GammaZ
};
for( int mu=0;mu<3;mu++ ) {
Gamma gV(GammaV[mu]);
D.ContractConservedCurrent(prop5,prop5,Vector_mu,source,Current::Vector,mu);
// auto ss=sliceSum(Vector_mu,Tdir);
// for(int t=0;t<Nt;t++) std::cout<<GridLogMessage <<"ss["<<mu<<"]["<<t<<"] "<<ss[t]<<std::endl;
VV = trace(gV*Vector_mu); // (local) Vector-Vector conserved current
sliceSum(VV,sumVV,Tdir);
for(int t=0;t<Nt;t++){
RealD Ct = real(TensorRemove(sumVV[t]))*LCscale;
std::cout<<GridLogMessage <<"VVc["<<mu<<"]["<<t<<"] "<< Ct
<< " 2 pi^2 t^3 C(t) "<< 2 * M_PI *M_PI * t*t*t *Ct<<std::endl;
}
}
}
class MesonFile: Serializable {
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFile, std::vector<std::vector<Complex> >, data);
};
void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase)
{
const int nchannel=3;
Gamma::Algebra Gammas[nchannel][2] = {
{Gamma::Algebra::GammaX,Gamma::Algebra::GammaX},
{Gamma::Algebra::GammaY,Gamma::Algebra::GammaY},
{Gamma::Algebra::GammaZ,Gamma::Algebra::GammaZ}
};
Gamma G5(Gamma::Algebra::Gamma5);
LatticeComplex meson_CF(q1.Grid());
MesonFile MF;
for(int ch=0;ch<nchannel;ch++){
Gamma Gsrc(Gammas[ch][0]);
Gamma Gsnk(Gammas[ch][1]);
meson_CF = trace(G5*adj(q1)*G5*Gsnk*q2*adj(Gsrc));
std::vector<TComplex> meson_T;
sliceSum(meson_CF,meson_T, Tdir);
int nt=meson_T.size();
std::vector<Complex> corr(nt);
for(int t=0;t<nt;t++){
corr[t] = TensorRemove(meson_T[t])*LLscale; // Yes this is ugly, not figured a work around
std::cout << " channel "<<ch<<" t "<<t<<" " <<real(corr[t])<< " 2 pi^2 t^3 C(t) "<< 2 * M_PI *M_PI * t*t*t *real(corr[t])<<std::endl;
}
MF.data.push_back(corr);
}
{
XmlWriter WR(file);
write(WR,"MesonFile",MF);
}
}
int main (int argc, char ** argv)
{
const int Ls=32;
Grid_init(&argc,&argv);
// Double precision grids
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
GridDefaultSimd(Nd,vComplex::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
//////////////////////////////////////////////////////////////////////
// You can manage seeds however you like.
// Recommend SeedUniqueString.
//////////////////////////////////////////////////////////////////////
std::vector<int> seeds4({1,2,3,4});
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
LatticeGaugeField Umu(UGrid);
std::string config;
RealD M5=1.8;
if( argc > 1 && argv[1][0] != '-' )
{
std::cout<<GridLogMessage <<"Loading configuration from "<<argv[1]<<std::endl;
FieldMetaData header;
NerscIO::readConfiguration(Umu, header, argv[1]);
config=argv[1];
M5=1.8;
}
else
{
SU<Nc>::ColdConfiguration(Umu);
config="ColdConfig";
// RealD P=1.0; // Don't scale
RealD P=0.5871119; // 48I
// RealD P=0.6153342; // 64I
// RealD P=0.6388238 // 32Ifine
RealD u0 = sqrt(sqrt(P));
RealD M5mf = M5 - 4.0*(1.0-u0);
RealD w0 = 1.0 - M5mf;
#if 0
// M5=1.8 with U=u0
Umu = Umu * u0;
LLscale = 1.0;
LCscale = 1.0;
std::cout<<GridLogMessage <<"Gauge links are u=u0= "<<u0<<std::endl;
std::cout<<GridLogMessage <<"M5 = "<<M5<<std::endl;
#else
M5 = M5mf;
std::cout<<GridLogMessage <<"Gauge links are u=1 "<<std::endl;
std::cout<<GridLogMessage <<"u0="<<u0<<std::endl;
std::cout<<GridLogMessage <<"M5=M5mf = "<<M5<<std::endl;
LLscale = 1.0/(1-w0*w0)/(1-w0*w0);
LCscale = 1.0/(1-w0*w0)/(1-w0*w0);
#endif
std::cout<<GridLogMessage <<"LLscale = "<<LLscale<<std::endl;
std::cout<<GridLogMessage <<"LCscale = "<<LCscale<<std::endl;
}
std::vector<RealD> masses({ 0.00} ); // u/d, s, c ??
int nmass = masses.size();
std::vector<MobiusFermionD *> FermActs;
std::cout<<GridLogMessage <<"======================"<<std::endl;
std::cout<<GridLogMessage <<"MobiusFermion action as Scaled Shamir kernel"<<std::endl;
std::cout<<GridLogMessage <<"======================"<<std::endl;
for(auto mass: masses) {
RealD b=1.5;// Scale factor b+c=2, b-c=1
RealD c=0.5;
FermActs.push_back(new MobiusFermionD(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c));
}
LatticePropagator point_source(UGrid);
// LatticePropagator wall_source(UGrid);
Coordinate Origin({0,0,0,0});
PointSource (Origin,point_source);
// Z2WallSource (RNG4,0,wall_source);
std::vector<LatticePropagator> PointProps(nmass,UGrid);
// std::vector<LatticePropagator> GaussProps(nmass,UGrid);
// std::vector<LatticePropagator> Z2Props (nmass,UGrid);
for(int m=0;m<nmass;m++) {
Solve(*FermActs[m],point_source ,PointProps[m]);
}
LatticeComplex phase(UGrid);
Coordinate mom({0,0,0,0});
MakePhase(mom,phase);
for(int m1=0 ;m1<nmass;m1++) {
for(int m2=m1;m2<nmass;m2++) {
std::stringstream ssp,ssg,ssz;
ssp<<config<< "_m" << m1 << "_m"<< m2 << "_point_meson.xml";
ssz<<config<< "_m" << m1 << "_m"<< m2 << "_wall_meson.xml";
MesonTrace(ssp.str(),PointProps[m1],PointProps[m2],phase);
// MesonTrace(ssz.str(),Z2Props[m1],Z2Props[m2],phase);
}}
Grid_finalize();
}

Some files were not shown because too many files have changed in this diff Show More