mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-09 23:45:36 +00:00
Global edit with change to View usage. autoView() creates a wrapper object that closes the view when scope closes.
This commit is contained in:
parent
f39c2a240b
commit
1a4c8c3387
@ -29,9 +29,11 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#ifndef GRID_ALGORITHMS_H
|
#ifndef GRID_ALGORITHMS_H
|
||||||
#define GRID_ALGORITHMS_H
|
#define GRID_ALGORITHMS_H
|
||||||
|
|
||||||
|
NAMESPACE_CHECK(algorithms);
|
||||||
#include <Grid/algorithms/SparseMatrix.h>
|
#include <Grid/algorithms/SparseMatrix.h>
|
||||||
#include <Grid/algorithms/LinearOperator.h>
|
#include <Grid/algorithms/LinearOperator.h>
|
||||||
#include <Grid/algorithms/Preconditioner.h>
|
#include <Grid/algorithms/Preconditioner.h>
|
||||||
|
NAMESPACE_CHECK(SparseMatrix);
|
||||||
|
|
||||||
#include <Grid/algorithms/approx/Zolotarev.h>
|
#include <Grid/algorithms/approx/Zolotarev.h>
|
||||||
#include <Grid/algorithms/approx/Chebyshev.h>
|
#include <Grid/algorithms/approx/Chebyshev.h>
|
||||||
@ -41,10 +43,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <Grid/algorithms/approx/Forecast.h>
|
#include <Grid/algorithms/approx/Forecast.h>
|
||||||
#include <Grid/algorithms/approx/RemezGeneral.h>
|
#include <Grid/algorithms/approx/RemezGeneral.h>
|
||||||
#include <Grid/algorithms/approx/ZMobius.h>
|
#include <Grid/algorithms/approx/ZMobius.h>
|
||||||
|
NAMESPACE_CHECK(approx);
|
||||||
#include <Grid/algorithms/iterative/Deflation.h>
|
#include <Grid/algorithms/iterative/Deflation.h>
|
||||||
#include <Grid/algorithms/iterative/ConjugateGradient.h>
|
#include <Grid/algorithms/iterative/ConjugateGradient.h>
|
||||||
|
NAMESPACE_CHECK(ConjGrad);
|
||||||
#include <Grid/algorithms/iterative/BiCGSTAB.h>
|
#include <Grid/algorithms/iterative/BiCGSTAB.h>
|
||||||
|
NAMESPACE_CHECK(BiCGSTAB);
|
||||||
#include <Grid/algorithms/iterative/ConjugateResidual.h>
|
#include <Grid/algorithms/iterative/ConjugateResidual.h>
|
||||||
#include <Grid/algorithms/iterative/NormalEquations.h>
|
#include <Grid/algorithms/iterative/NormalEquations.h>
|
||||||
#include <Grid/algorithms/iterative/SchurRedBlack.h>
|
#include <Grid/algorithms/iterative/SchurRedBlack.h>
|
||||||
@ -62,7 +66,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
|
#include <Grid/algorithms/iterative/ImplicitlyRestartedLanczos.h>
|
||||||
#include <Grid/algorithms/iterative/PowerMethod.h>
|
#include <Grid/algorithms/iterative/PowerMethod.h>
|
||||||
|
|
||||||
|
NAMESPACE_CHECK(PowerMethod);
|
||||||
#include <Grid/algorithms/CoarsenedMatrix.h>
|
#include <Grid/algorithms/CoarsenedMatrix.h>
|
||||||
|
NAMESPACE_CHECK(CoarsendMatrix);
|
||||||
#include <Grid/algorithms/FFT.h>
|
#include <Grid/algorithms/FFT.h>
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -186,10 +186,10 @@ public:
|
|||||||
|
|
||||||
hermop.HermOp(*Tn,y);
|
hermop.HermOp(*Tn,y);
|
||||||
|
|
||||||
auto y_v = y.View(AcceleratorWrite);
|
autoView( y_v , y, AcceleratorWrite);
|
||||||
auto Tn_v = Tn->View(AcceleratorWrite);
|
autoView( Tn_v , (*Tn), AcceleratorWrite);
|
||||||
auto Tnp_v = Tnp->View(AcceleratorWrite);
|
autoView( Tnp_v , (*Tnp), AcceleratorWrite);
|
||||||
auto Tnm_v = Tnm->View(AcceleratorWrite);
|
autoView( Tnm_v , (*Tnm), AcceleratorWrite);
|
||||||
const int Nsimd = CComplex::Nsimd();
|
const int Nsimd = CComplex::Nsimd();
|
||||||
accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
|
accelerator_forNB(ss, FineGrid->oSites(), Nsimd, {
|
||||||
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
|
coalescedWrite(y_v[ss],xscale*y_v(ss)+mscale*Tn_v(ss));
|
||||||
@ -246,13 +246,14 @@ public:
|
|||||||
CartesianStencil<siteVector,siteVector,int> Stencil;
|
CartesianStencil<siteVector,siteVector,int> Stencil;
|
||||||
|
|
||||||
std::vector<CoarseMatrix> A;
|
std::vector<CoarseMatrix> A;
|
||||||
|
|
||||||
///////////////////////
|
///////////////////////
|
||||||
// Interface
|
// Interface
|
||||||
///////////////////////
|
///////////////////////
|
||||||
GridBase * Grid(void) { return _grid; }; // this is all the linalg routines need to know
|
GridBase * Grid(void) { return _grid; }; // this is all the linalg routines need to know
|
||||||
|
|
||||||
RealD M (const CoarseVector &in, CoarseVector &out){
|
RealD M (const CoarseVector &in, CoarseVector &out)
|
||||||
|
{
|
||||||
|
|
||||||
conformable(_grid,in.Grid());
|
conformable(_grid,in.Grid());
|
||||||
conformable(in.Grid(),out.Grid());
|
conformable(in.Grid(),out.Grid());
|
||||||
@ -263,12 +264,13 @@ public:
|
|||||||
double comms_usec = -usecond();
|
double comms_usec = -usecond();
|
||||||
Stencil.HaloExchange(in,compressor);
|
Stencil.HaloExchange(in,compressor);
|
||||||
comms_usec += usecond();
|
comms_usec += usecond();
|
||||||
|
|
||||||
auto in_v = in.View(AcceleratorRead);
|
autoView( in_v , in, AcceleratorRead);
|
||||||
auto out_v = out.View(AcceleratorWrite);
|
autoView( out_v , out, AcceleratorWrite);
|
||||||
typedef LatticeView<Cobj> Aview;
|
typedef LatticeView<Cobj> Aview;
|
||||||
|
|
||||||
Vector<Aview> AcceleratorViewContainer;
|
Vector<Aview> AcceleratorViewContainer;
|
||||||
|
|
||||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
|
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
|
||||||
Aview *Aview_p = & AcceleratorViewContainer[0];
|
Aview *Aview_p = & AcceleratorViewContainer[0];
|
||||||
|
|
||||||
@ -307,13 +309,15 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
coalescedWrite(out_v[ss](b),res);
|
coalescedWrite(out_v[ss](b),res);
|
||||||
});
|
});
|
||||||
usecs +=usecond();
|
usecs +=usecond();
|
||||||
|
|
||||||
double nrm_usec=-usecond();
|
double nrm_usec=-usecond();
|
||||||
RealD Nout= norm2(out);
|
RealD Nout= norm2(out);
|
||||||
nrm_usec+=usecond();
|
nrm_usec+=usecond();
|
||||||
|
|
||||||
|
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
|
||||||
|
|
||||||
return Nout;
|
return Nout;
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -346,8 +350,8 @@ public:
|
|||||||
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
|
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer.push_back(A[p].View(AcceleratorRead));
|
||||||
Aview *Aview_p = & AcceleratorViewContainer[0];
|
Aview *Aview_p = & AcceleratorViewContainer[0];
|
||||||
|
|
||||||
auto out_v = out.View(AcceleratorWrite);
|
autoView( out_v , out, AcceleratorWrite);
|
||||||
auto in_v = in.View(AcceleratorRead);
|
autoView( in_v , in, AcceleratorRead);
|
||||||
|
|
||||||
const int Nsimd = CComplex::Nsimd();
|
const int Nsimd = CComplex::Nsimd();
|
||||||
typedef decltype(coalescedRead(in_v[0])) calcVector;
|
typedef decltype(coalescedRead(in_v[0])) calcVector;
|
||||||
@ -375,6 +379,7 @@ public:
|
|||||||
}
|
}
|
||||||
coalescedWrite(out_v[ss](b),res);
|
coalescedWrite(out_v[ss](b),res);
|
||||||
});
|
});
|
||||||
|
for(int p=0;p<geom.npoint;p++) AcceleratorViewContainer[p].ViewClose();
|
||||||
}
|
}
|
||||||
void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
|
void MdirAll(const CoarseVector &in,std::vector<CoarseVector> &out)
|
||||||
{
|
{
|
||||||
@ -542,10 +547,10 @@ public:
|
|||||||
|
|
||||||
blockMaskedInnerProduct(oZProj,omask,Subspace.subspace[j],Mphi);
|
blockMaskedInnerProduct(oZProj,omask,Subspace.subspace[j],Mphi);
|
||||||
|
|
||||||
auto iZProj_v = iZProj.View(AcceleratorRead) ;
|
autoView( iZProj_v , iZProj, AcceleratorRead) ;
|
||||||
auto oZProj_v = oZProj.View(AcceleratorRead) ;
|
autoView( oZProj_v , oZProj, AcceleratorRead) ;
|
||||||
auto A_p = A[p].View(AcceleratorWrite);
|
autoView( A_p , A[p], AcceleratorWrite);
|
||||||
auto A_self = A[self_stencil].View(AcceleratorWrite);
|
autoView( A_self , A[self_stencil], AcceleratorWrite);
|
||||||
|
|
||||||
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
|
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });
|
||||||
// if( disp!= 0 ) { accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });}
|
// if( disp!= 0 ) { accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{ coalescedWrite(A_p[ss](j,i),oZProj_v(ss)); });}
|
||||||
@ -563,11 +568,11 @@ public:
|
|||||||
mult(tmp,phi,oddmask ); linop.Op(tmp,Mphio);
|
mult(tmp,phi,oddmask ); linop.Op(tmp,Mphio);
|
||||||
|
|
||||||
{
|
{
|
||||||
auto tmp_ = tmp.View(AcceleratorWrite);
|
autoView( tmp_ , tmp, AcceleratorWrite);
|
||||||
auto evenmask_ = evenmask.View(AcceleratorRead);
|
autoView( evenmask_ , evenmask, AcceleratorRead);
|
||||||
auto oddmask_ = oddmask.View(AcceleratorRead);
|
autoView( oddmask_ , oddmask, AcceleratorRead);
|
||||||
auto Mphie_ = Mphie.View(AcceleratorRead);
|
autoView( Mphie_ , Mphie, AcceleratorRead);
|
||||||
auto Mphio_ = Mphio.View(AcceleratorRead);
|
autoView( Mphio_ , Mphio, AcceleratorRead);
|
||||||
accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{
|
accelerator_for(ss, FineGrid->oSites(), Fobj::Nsimd(),{
|
||||||
coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss));
|
coalescedWrite(tmp_[ss],evenmask_(ss)*Mphie_(ss) + oddmask_(ss)*Mphio_(ss));
|
||||||
});
|
});
|
||||||
@ -575,8 +580,8 @@ public:
|
|||||||
|
|
||||||
blockProject(SelfProj,tmp,Subspace.subspace);
|
blockProject(SelfProj,tmp,Subspace.subspace);
|
||||||
|
|
||||||
auto SelfProj_ = SelfProj.View(AcceleratorRead);
|
autoView( SelfProj_ , SelfProj, AcceleratorRead);
|
||||||
auto A_self = A[self_stencil].View(AcceleratorWrite);
|
autoView( A_self , A[self_stencil], AcceleratorWrite);
|
||||||
|
|
||||||
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{
|
accelerator_for(ss, Grid()->oSites(), Fobj::Nsimd(),{
|
||||||
for(int j=0;j<nbasis;j++){
|
for(int j=0;j<nbasis;j++){
|
||||||
|
@ -36,7 +36,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
template<class scalar> struct FFTW { };
|
template<class scalar> struct FFTW { };
|
||||||
@ -190,7 +189,7 @@ public:
|
|||||||
typedef typename sobj::scalar_type scalar;
|
typedef typename sobj::scalar_type scalar;
|
||||||
|
|
||||||
Lattice<sobj> pgbuf(&pencil_g);
|
Lattice<sobj> pgbuf(&pencil_g);
|
||||||
auto pgbuf_v = pgbuf.View(CpuWrite);
|
autoView(pgbuf_v , pgbuf, CpuWrite);
|
||||||
|
|
||||||
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
|
typedef typename FFTW<scalar>::FFTW_scalar FFTW_scalar;
|
||||||
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
|
typedef typename FFTW<scalar>::FFTW_plan FFTW_plan;
|
||||||
|
@ -122,12 +122,14 @@ class BiCGSTAB : public OperatorFunction<Field>
|
|||||||
|
|
||||||
LinearCombTimer.Start();
|
LinearCombTimer.Start();
|
||||||
bo = beta * omega;
|
bo = beta * omega;
|
||||||
auto p_v = p.View(AcceleratorWrite);
|
{
|
||||||
auto r_v = r.View(AcceleratorWrite);
|
autoView( p_v , p, AcceleratorWrite);
|
||||||
auto v_v = v.View(AcceleratorWrite);
|
autoView( r_v , r, AcceleratorRead);
|
||||||
accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{
|
autoView( v_v , v, AcceleratorRead);
|
||||||
coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss));
|
accelerator_for(ss, p_v.size(), Field::vector_object::Nsimd(),{
|
||||||
});
|
coalescedWrite(p_v[ss], beta*p_v(ss) - bo*v_v(ss) + r_v(ss));
|
||||||
|
});
|
||||||
|
}
|
||||||
LinearCombTimer.Stop();
|
LinearCombTimer.Stop();
|
||||||
LinalgTimer.Stop();
|
LinalgTimer.Stop();
|
||||||
|
|
||||||
@ -142,16 +144,20 @@ class BiCGSTAB : public OperatorFunction<Field>
|
|||||||
alpha = rho / Calpha.real();
|
alpha = rho / Calpha.real();
|
||||||
|
|
||||||
LinearCombTimer.Start();
|
LinearCombTimer.Start();
|
||||||
auto h_v = h.View(AcceleratorWrite);
|
{
|
||||||
auto psi_v = psi.View(AcceleratorWrite);
|
autoView( p_v , p, AcceleratorRead);
|
||||||
accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{
|
autoView( r_v , r, AcceleratorRead);
|
||||||
coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss));
|
autoView( v_v , v, AcceleratorRead);
|
||||||
});
|
autoView( psi_v,psi, AcceleratorRead);
|
||||||
|
autoView( h_v , h, AcceleratorWrite);
|
||||||
auto s_v = s.View(AcceleratorWrite);
|
autoView( s_v , s, AcceleratorWrite);
|
||||||
accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{
|
accelerator_for(ss, h_v.size(), Field::vector_object::Nsimd(),{
|
||||||
coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss));
|
coalescedWrite(h_v[ss], alpha*p_v(ss) + psi_v(ss));
|
||||||
});
|
});
|
||||||
|
accelerator_for(ss, s_v.size(), Field::vector_object::Nsimd(),{
|
||||||
|
coalescedWrite(s_v[ss], -alpha*v_v(ss) + r_v(ss));
|
||||||
|
});
|
||||||
|
}
|
||||||
LinearCombTimer.Stop();
|
LinearCombTimer.Stop();
|
||||||
LinalgTimer.Stop();
|
LinalgTimer.Stop();
|
||||||
|
|
||||||
@ -166,13 +172,19 @@ class BiCGSTAB : public OperatorFunction<Field>
|
|||||||
omega = Comega.real() / norm2(t);
|
omega = Comega.real() / norm2(t);
|
||||||
|
|
||||||
LinearCombTimer.Start();
|
LinearCombTimer.Start();
|
||||||
auto t_v = t.View(AcceleratorWrite);
|
{
|
||||||
accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{
|
autoView( psi_v,psi, AcceleratorWrite);
|
||||||
coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss));
|
autoView( r_v , r, AcceleratorWrite);
|
||||||
coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss));
|
autoView( h_v , h, AcceleratorRead);
|
||||||
});
|
autoView( s_v , s, AcceleratorRead);
|
||||||
|
autoView( t_v , t, AcceleratorRead);
|
||||||
|
accelerator_for(ss, psi_v.size(), Field::vector_object::Nsimd(),{
|
||||||
|
coalescedWrite(psi_v[ss], h_v(ss) + omega * s_v(ss));
|
||||||
|
coalescedWrite(r_v[ss], -omega * t_v(ss) + s_v(ss));
|
||||||
|
});
|
||||||
|
}
|
||||||
LinearCombTimer.Stop();
|
LinearCombTimer.Stop();
|
||||||
|
|
||||||
cp = norm2(r);
|
cp = norm2(r);
|
||||||
LinalgTimer.Stop();
|
LinalgTimer.Stop();
|
||||||
|
|
||||||
|
@ -141,16 +141,16 @@ public:
|
|||||||
|
|
||||||
LinearCombTimer.Start();
|
LinearCombTimer.Start();
|
||||||
{
|
{
|
||||||
auto psi_v = psi.View(AcceleratorWrite);
|
autoView( psi_v , psi, AcceleratorWrite);
|
||||||
auto p_v = p.View(AcceleratorWrite);
|
autoView( p_v , p, AcceleratorWrite);
|
||||||
auto r_v = r.View(AcceleratorWrite);
|
autoView( r_v , r, AcceleratorWrite);
|
||||||
accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{
|
accelerator_for(ss,p_v.size(), Field::vector_object::Nsimd(),{
|
||||||
coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss));
|
coalescedWrite(psi_v[ss], a * p_v(ss) + psi_v(ss));
|
||||||
coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss));
|
coalescedWrite(p_v[ss] , b * p_v(ss) + r_v (ss));
|
||||||
});
|
});
|
||||||
LinearCombTimer.Stop();
|
|
||||||
LinalgTimer.Stop();
|
|
||||||
}
|
}
|
||||||
|
LinearCombTimer.Stop();
|
||||||
|
LinalgTimer.Stop();
|
||||||
|
|
||||||
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
|
std::cout << GridLogIterative << "ConjugateGradient: Iteration " << k
|
||||||
<< " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
|
<< " residual " << sqrt(cp/ssq) << " target " << Tolerance << std::endl;
|
||||||
|
@ -57,16 +57,17 @@ void basisOrthogonalize(std::vector<Field> &basis,Field &w,int k)
|
|||||||
template<class Field>
|
template<class Field>
|
||||||
void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm)
|
void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, int k0,int k1,int Nm)
|
||||||
{
|
{
|
||||||
typedef decltype(basis[0].View(CpuWrite)) View;
|
|
||||||
auto tmp_v = basis[0].View(CpuWrite);
|
|
||||||
Vector<View> basis_v(basis.size(),tmp_v);
|
|
||||||
View *basis_vp = &basis_v[0];
|
|
||||||
typedef typename Field::vector_object vobj;
|
|
||||||
GridBase* grid = basis[0].Grid();
|
GridBase* grid = basis[0].Grid();
|
||||||
|
|
||||||
for(int k=0;k<basis.size();k++){
|
typedef typename Field::vector_object vobj;
|
||||||
basis_v[k] = basis[k].View(CpuWrite);
|
typedef decltype(basis[0].View(CpuWrite)) View;
|
||||||
}
|
|
||||||
|
Vector<View> basis_v; basis_v.reserve(basis.size());
|
||||||
|
|
||||||
|
for(int k=0;k<basis.size();k++) basis_v.push_back(basis[k].View(CpuWrite));
|
||||||
|
|
||||||
|
View *basis_vp = &basis_v[0];
|
||||||
|
|
||||||
#if 1
|
#if 1
|
||||||
std::vector < vobj , commAllocator<vobj> > Bt(thread_max() * Nm); // Thread private
|
std::vector < vobj , commAllocator<vobj> > Bt(thread_max() * Nm); // Thread private
|
||||||
thread_region
|
thread_region
|
||||||
@ -142,6 +143,7 @@ void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, i
|
|||||||
coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
|
coalescedWrite(basis_vp[jj][sss],coalescedRead(Bp[ss*nrot+j]));
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -149,20 +151,22 @@ void basisRotate(std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j0, int j1, i
|
|||||||
template<class Field>
|
template<class Field>
|
||||||
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
|
void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,int j, int k0,int k1,int Nm)
|
||||||
{
|
{
|
||||||
typedef decltype(basis[0].View(AcceleratorWrite)) View;
|
|
||||||
typedef typename Field::vector_object vobj;
|
|
||||||
GridBase* grid = basis[0].Grid();
|
GridBase* grid = basis[0].Grid();
|
||||||
|
typedef typename Field::vector_object vobj;
|
||||||
|
typedef decltype(basis[0].View(AcceleratorWrite)) View;
|
||||||
|
|
||||||
result.Checkerboard() = basis[0].Checkerboard();
|
result.Checkerboard() = basis[0].Checkerboard();
|
||||||
auto result_v=result.View(AcceleratorWrite);
|
|
||||||
Vector<View> basis_v(basis.size(),result_v);
|
autoView(result_v,result, AcceleratorWrite);
|
||||||
|
Vector<View> basis_v; basis_v.reserve(basis.size());
|
||||||
View * basis_vp = &basis_v[0];
|
View * basis_vp = &basis_v[0];
|
||||||
for(int k=0;k<basis.size();k++){
|
|
||||||
basis_v[k] = basis[k].View(AcceleratorRead);
|
for(int k=0;k<basis.size();k++) basis_v.push_back(basis[k].View(AcceleratorRead));
|
||||||
}
|
|
||||||
Vector<double> Qt_jv(Nm);
|
Vector<double> Qt_jv(Nm); double * Qt_j = & Qt_jv[0];
|
||||||
double * Qt_j = & Qt_jv[0];
|
|
||||||
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
|
for(int k=0;k<Nm;++k) Qt_j[k]=Qt(j,k);
|
||||||
|
|
||||||
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
|
accelerator_for(ss, grid->oSites(),vobj::Nsimd(),{
|
||||||
auto B=coalescedRead(basis_vp[k0][ss]);
|
auto B=coalescedRead(basis_vp[k0][ss]);
|
||||||
B=Zero();
|
B=Zero();
|
||||||
@ -171,6 +175,7 @@ void basisRotateJ(Field &result,std::vector<Field> &basis,Eigen::MatrixXd& Qt,in
|
|||||||
}
|
}
|
||||||
coalescedWrite(result_v[ss], B);
|
coalescedWrite(result_v[ss], B);
|
||||||
});
|
});
|
||||||
|
for(int k=0;k<basis.size();k++) basis_v[k].ViewClose();
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Field>
|
template<class Field>
|
||||||
|
@ -169,8 +169,9 @@ private:
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
static void Print(void);
|
static void Print(void);
|
||||||
static void ViewClose(void* AccPtr,ViewMode mode);
|
static int isOpen (void* CpuPtr);
|
||||||
static void *ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
|
static void ViewClose(void* CpuPtr,ViewMode mode);
|
||||||
|
static void *ViewOpen (void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -461,6 +461,17 @@ void MemoryManager::Print(void)
|
|||||||
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
|
std::cout << GridLogDebug << "--------------------------------------------" << std::endl;
|
||||||
|
|
||||||
};
|
};
|
||||||
|
int MemoryManager::isOpen (void* _CpuPtr)
|
||||||
|
{
|
||||||
|
uint64_t CpuPtr = (uint64_t)_CpuPtr;
|
||||||
|
if ( EntryPresent(CpuPtr) ){
|
||||||
|
auto AccCacheIterator = EntryLookup(CpuPtr);
|
||||||
|
auto & AccCache = AccCacheIterator->second;
|
||||||
|
return AccCache.cpuLock+AccCache.accLock;
|
||||||
|
} else {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
|
||||||
|
@ -9,6 +9,7 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
|
|
||||||
void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
|
void MemoryManager::ViewClose(void* AccPtr,ViewMode mode){};
|
||||||
void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
|
void *MemoryManager::ViewOpen(void* CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint){ return CpuPtr; };
|
||||||
|
int MemoryManager::isOpen (void* CpuPtr) { return 0;}
|
||||||
void MemoryManager::Print(void){};
|
void MemoryManager::Print(void){};
|
||||||
void MemoryManager::NotifyDeletion(void *ptr){};
|
void MemoryManager::NotifyDeletion(void *ptr){};
|
||||||
|
|
||||||
|
@ -72,12 +72,14 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
auto rhs_v = rhs.View(AcceleratorRead);
|
{
|
||||||
auto buffer_p = & buffer[0];
|
autoView(rhs_v , rhs, AcceleratorRead);
|
||||||
auto table = &Cshift_table[0];
|
auto buffer_p = & buffer[0];
|
||||||
accelerator_for(i,ent,1,{
|
auto table = &Cshift_table[0];
|
||||||
buffer_p[table[i].first]=rhs_v[table[i].second];
|
accelerator_for(i,ent,1,{
|
||||||
});
|
buffer_p[table[i].first]=rhs_v[table[i].second];
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////
|
||||||
@ -100,8 +102,8 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
|
|||||||
int e2=rhs.Grid()->_slice_block[dimension];
|
int e2=rhs.Grid()->_slice_block[dimension];
|
||||||
int n1=rhs.Grid()->_slice_stride[dimension];
|
int n1=rhs.Grid()->_slice_stride[dimension];
|
||||||
|
|
||||||
auto rhs_v = rhs.View(AcceleratorRead);
|
|
||||||
if ( cbmask ==0x3){
|
if ( cbmask ==0x3){
|
||||||
|
autoView(rhs_v , rhs, AcceleratorRead);
|
||||||
accelerator_for2d(n,e1,b,e2,1,{
|
accelerator_for2d(n,e1,b,e2,1,{
|
||||||
int o = n*n1;
|
int o = n*n1;
|
||||||
int offset = b+n*e2;
|
int offset = b+n*e2;
|
||||||
@ -110,8 +112,8 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
|
|||||||
extract<vobj>(temp,pointers,offset);
|
extract<vobj>(temp,pointers,offset);
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
|
autoView(rhs_v , rhs, AcceleratorRead);
|
||||||
|
|
||||||
|
|
||||||
Coordinate rdim=rhs.Grid()->_rdimensions;
|
Coordinate rdim=rhs.Grid()->_rdimensions;
|
||||||
Coordinate cdm =rhs.Grid()->_checker_dim_mask;
|
Coordinate cdm =rhs.Grid()->_checker_dim_mask;
|
||||||
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
|
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
|
||||||
@ -179,12 +181,14 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
auto rhs_v = rhs.View(AcceleratorWrite);
|
{
|
||||||
auto buffer_p = & buffer[0];
|
autoView( rhs_v, rhs, AcceleratorWrite);
|
||||||
auto table = &Cshift_table[0];
|
auto buffer_p = & buffer[0];
|
||||||
accelerator_for(i,ent,1,{
|
auto table = &Cshift_table[0];
|
||||||
rhs_v[table[i].first]=buffer_p[table[i].second];
|
accelerator_for(i,ent,1,{
|
||||||
});
|
rhs_v[table[i].first]=buffer_p[table[i].second];
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
@ -204,7 +208,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
|
|||||||
int e2=rhs.Grid()->_slice_block[dimension];
|
int e2=rhs.Grid()->_slice_block[dimension];
|
||||||
|
|
||||||
if(cbmask ==0x3 ) {
|
if(cbmask ==0x3 ) {
|
||||||
auto rhs_v = rhs.View(AcceleratorWrite);
|
autoView( rhs_v , rhs, AcceleratorWrite);
|
||||||
accelerator_for2d(n,e1,b,e2,1,{
|
accelerator_for2d(n,e1,b,e2,1,{
|
||||||
int o = n*rhs.Grid()->_slice_stride[dimension];
|
int o = n*rhs.Grid()->_slice_stride[dimension];
|
||||||
int offset = b+n*rhs.Grid()->_slice_block[dimension];
|
int offset = b+n*rhs.Grid()->_slice_block[dimension];
|
||||||
@ -216,7 +220,7 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
|
|||||||
// Test_cshift_red_black code.
|
// Test_cshift_red_black code.
|
||||||
// std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
|
// std::cout << "Scatter_plane merge assert(0); think this is buggy FIXME "<< std::endl;// think this is buggy FIXME
|
||||||
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
|
std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
|
||||||
auto rhs_v = rhs.View(CpuWrite);
|
autoView( rhs_v, rhs, CpuWrite);
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int o = n*rhs.Grid()->_slice_stride[dimension];
|
int o = n*rhs.Grid()->_slice_stride[dimension];
|
||||||
@ -272,13 +276,14 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
auto rhs_v = rhs.View(AcceleratorRead);
|
{
|
||||||
auto lhs_v = lhs.View(AcceleratorWrite);
|
autoView(rhs_v , rhs, AcceleratorRead);
|
||||||
auto table = &Cshift_table[0];
|
autoView(lhs_v , lhs, AcceleratorWrite);
|
||||||
accelerator_for(i,ent,1,{
|
auto table = &Cshift_table[0];
|
||||||
lhs_v[table[i].first]=rhs_v[table[i].second];
|
accelerator_for(i,ent,1,{
|
||||||
});
|
lhs_v[table[i].first]=rhs_v[table[i].second];
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
|
template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
|
||||||
@ -315,12 +320,14 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
|
|||||||
}}
|
}}
|
||||||
}
|
}
|
||||||
|
|
||||||
auto rhs_v = rhs.View(AcceleratorRead);
|
{
|
||||||
auto lhs_v = lhs.View(AcceleratorWrite);
|
autoView( rhs_v, rhs, AcceleratorRead);
|
||||||
auto table = &Cshift_table[0];
|
autoView( lhs_v, lhs, AcceleratorWrite);
|
||||||
accelerator_for(i,ent,1,{
|
auto table = &Cshift_table[0];
|
||||||
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
|
accelerator_for(i,ent,1,{
|
||||||
});
|
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////
|
||||||
|
@ -87,12 +87,14 @@ sobj eval(const uint64_t ss, const sobj &arg)
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class lobj> accelerator_inline
|
template <class lobj> accelerator_inline
|
||||||
const lobj & eval(const uint64_t ss, const LatticeExprView<lobj> &arg)
|
const lobj & eval(const uint64_t ss, const LatticeView<lobj> &arg)
|
||||||
{
|
{
|
||||||
return arg[ss];
|
return arg[ss];
|
||||||
}
|
}
|
||||||
|
|
||||||
// What needs this?
|
// What needs this?
|
||||||
|
// Cannot be legal on accelerator
|
||||||
|
// Comparison must convert
|
||||||
#if 1
|
#if 1
|
||||||
template <class lobj> accelerator_inline
|
template <class lobj> accelerator_inline
|
||||||
const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg)
|
const lobj & eval(const uint64_t ss, const Lattice<lobj> &arg)
|
||||||
|
@ -36,9 +36,9 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
template<class obj1,class obj2,class obj3> inline
|
template<class obj1,class obj2,class obj3> inline
|
||||||
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
|
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
|
||||||
ret.Checkerboard() = lhs.Checkerboard();
|
ret.Checkerboard() = lhs.Checkerboard();
|
||||||
auto ret_v = ret.View(AcceleratorWrite);
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
auto lhs_v = lhs.View(AcceleratorRead);
|
autoView( lhs_v , lhs, AcceleratorRead);
|
||||||
auto rhs_v = rhs.View(AcceleratorRead);
|
autoView( rhs_v , rhs, AcceleratorRead);
|
||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
conformable(lhs,rhs);
|
conformable(lhs,rhs);
|
||||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
@ -55,9 +55,9 @@ void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
|
|||||||
ret.Checkerboard() = lhs.Checkerboard();
|
ret.Checkerboard() = lhs.Checkerboard();
|
||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
conformable(lhs,rhs);
|
conformable(lhs,rhs);
|
||||||
auto ret_v = ret.View(AcceleratorWrite);
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
auto lhs_v = lhs.View(AcceleratorRead);
|
autoView( lhs_v , lhs, AcceleratorRead);
|
||||||
auto rhs_v = rhs.View(AcceleratorRead);
|
autoView( rhs_v , rhs, AcceleratorRead);
|
||||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
auto lhs_t=lhs_v(ss);
|
auto lhs_t=lhs_v(ss);
|
||||||
@ -72,9 +72,9 @@ void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
|
|||||||
ret.Checkerboard() = lhs.Checkerboard();
|
ret.Checkerboard() = lhs.Checkerboard();
|
||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
conformable(lhs,rhs);
|
conformable(lhs,rhs);
|
||||||
auto ret_v = ret.View(AcceleratorWrite);
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
auto lhs_v = lhs.View(AcceleratorRead);
|
autoView( lhs_v , lhs, AcceleratorRead);
|
||||||
auto rhs_v = rhs.View(AcceleratorRead);
|
autoView( rhs_v , rhs, AcceleratorRead);
|
||||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
auto lhs_t=lhs_v(ss);
|
auto lhs_t=lhs_v(ss);
|
||||||
@ -88,9 +88,9 @@ void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const Lattice<obj3> &rhs){
|
|||||||
ret.Checkerboard() = lhs.Checkerboard();
|
ret.Checkerboard() = lhs.Checkerboard();
|
||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
conformable(lhs,rhs);
|
conformable(lhs,rhs);
|
||||||
auto ret_v = ret.View(AcceleratorWrite);
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
auto lhs_v = lhs.View(AcceleratorRead);
|
autoView( lhs_v , lhs, AcceleratorRead);
|
||||||
auto rhs_v = rhs.View(AcceleratorRead);
|
autoView( rhs_v , rhs, AcceleratorRead);
|
||||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
auto lhs_t=lhs_v(ss);
|
auto lhs_t=lhs_v(ss);
|
||||||
@ -107,8 +107,8 @@ template<class obj1,class obj2,class obj3> inline
|
|||||||
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
void mult(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
||||||
ret.Checkerboard() = lhs.Checkerboard();
|
ret.Checkerboard() = lhs.Checkerboard();
|
||||||
conformable(lhs,ret);
|
conformable(lhs,ret);
|
||||||
auto ret_v = ret.View(AcceleratorWrite);
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
auto lhs_v = lhs.View(AcceleratorRead);
|
autoView( lhs_v , lhs, AcceleratorRead);
|
||||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
mult(&tmp,&lhs_v(ss),&rhs);
|
mult(&tmp,&lhs_v(ss),&rhs);
|
||||||
@ -120,8 +120,8 @@ template<class obj1,class obj2,class obj3> inline
|
|||||||
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
void mac(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
||||||
ret.Checkerboard() = lhs.Checkerboard();
|
ret.Checkerboard() = lhs.Checkerboard();
|
||||||
conformable(ret,lhs);
|
conformable(ret,lhs);
|
||||||
auto ret_v = ret.View(AcceleratorWrite);
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
auto lhs_v = lhs.View(AcceleratorRead);
|
autoView( lhs_v , lhs, AcceleratorRead);
|
||||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
auto lhs_t=lhs_v(ss);
|
auto lhs_t=lhs_v(ss);
|
||||||
@ -134,8 +134,8 @@ template<class obj1,class obj2,class obj3> inline
|
|||||||
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
void sub(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
||||||
ret.Checkerboard() = lhs.Checkerboard();
|
ret.Checkerboard() = lhs.Checkerboard();
|
||||||
conformable(ret,lhs);
|
conformable(ret,lhs);
|
||||||
auto ret_v = ret.View(AcceleratorWrite);
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
auto lhs_v = lhs.View(AcceleratorRead);
|
autoView( lhs_v , lhs, AcceleratorRead);
|
||||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
auto lhs_t=lhs_v(ss);
|
auto lhs_t=lhs_v(ss);
|
||||||
@ -147,8 +147,8 @@ template<class obj1,class obj2,class obj3> inline
|
|||||||
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
void add(Lattice<obj1> &ret,const Lattice<obj2> &lhs,const obj3 &rhs){
|
||||||
ret.Checkerboard() = lhs.Checkerboard();
|
ret.Checkerboard() = lhs.Checkerboard();
|
||||||
conformable(lhs,ret);
|
conformable(lhs,ret);
|
||||||
auto ret_v = ret.View(AcceleratorWrite);
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
auto lhs_v = lhs.View(AcceleratorRead);
|
autoView( lhs_v , lhs, AcceleratorRead);
|
||||||
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
auto lhs_t=lhs_v(ss);
|
auto lhs_t=lhs_v(ss);
|
||||||
@ -164,8 +164,8 @@ template<class obj1,class obj2,class obj3> inline
|
|||||||
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
void mult(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
||||||
ret.Checkerboard() = rhs.Checkerboard();
|
ret.Checkerboard() = rhs.Checkerboard();
|
||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
auto ret_v = ret.View(AcceleratorWrite);
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
auto rhs_v = lhs.View(AcceleratorRead);
|
autoView( rhs_v , lhs, AcceleratorRead);
|
||||||
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
auto rhs_t=rhs_v(ss);
|
auto rhs_t=rhs_v(ss);
|
||||||
@ -178,8 +178,8 @@ template<class obj1,class obj2,class obj3> inline
|
|||||||
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
void mac(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
||||||
ret.Checkerboard() = rhs.Checkerboard();
|
ret.Checkerboard() = rhs.Checkerboard();
|
||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
auto ret_v = ret.View(AcceleratorWrite);
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
auto rhs_v = lhs.View(AcceleratorRead);
|
autoView( rhs_v , lhs, AcceleratorRead);
|
||||||
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
auto rhs_t=rhs_v(ss);
|
auto rhs_t=rhs_v(ss);
|
||||||
@ -192,8 +192,8 @@ template<class obj1,class obj2,class obj3> inline
|
|||||||
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
void sub(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
||||||
ret.Checkerboard() = rhs.Checkerboard();
|
ret.Checkerboard() = rhs.Checkerboard();
|
||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
auto ret_v = ret.View(AcceleratorWrite);
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
auto rhs_v = lhs.View(AcceleratorRead);
|
autoView( rhs_v , lhs, AcceleratorRead);
|
||||||
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
auto rhs_t=rhs_v(ss);
|
auto rhs_t=rhs_v(ss);
|
||||||
@ -205,8 +205,8 @@ template<class obj1,class obj2,class obj3> inline
|
|||||||
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
void add(Lattice<obj1> &ret,const obj2 &lhs,const Lattice<obj3> &rhs){
|
||||||
ret.Checkerboard() = rhs.Checkerboard();
|
ret.Checkerboard() = rhs.Checkerboard();
|
||||||
conformable(ret,rhs);
|
conformable(ret,rhs);
|
||||||
auto ret_v = ret.View(AcceleratorWrite);
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
auto rhs_v = lhs.View(AcceleratorRead);
|
autoView( rhs_v , lhs, AcceleratorRead);
|
||||||
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
accelerator_for(ss,rhs_v.size(),obj1::Nsimd(),{
|
||||||
decltype(coalescedRead(obj1())) tmp;
|
decltype(coalescedRead(obj1())) tmp;
|
||||||
auto rhs_t=rhs_v(ss);
|
auto rhs_t=rhs_v(ss);
|
||||||
@ -220,9 +220,9 @@ void axpy(Lattice<vobj> &ret,sobj a,const Lattice<vobj> &x,const Lattice<vobj> &
|
|||||||
ret.Checkerboard() = x.Checkerboard();
|
ret.Checkerboard() = x.Checkerboard();
|
||||||
conformable(ret,x);
|
conformable(ret,x);
|
||||||
conformable(x,y);
|
conformable(x,y);
|
||||||
auto ret_v = ret.View(AcceleratorWrite);
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
auto x_v = x.View(AcceleratorRead);
|
autoView( x_v , x, AcceleratorRead);
|
||||||
auto y_v = y.View(AcceleratorRead);
|
autoView( y_v , y, AcceleratorRead);
|
||||||
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
|
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
|
||||||
auto tmp = a*x_v(ss)+y_v(ss);
|
auto tmp = a*x_v(ss)+y_v(ss);
|
||||||
coalescedWrite(ret_v[ss],tmp);
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
@ -233,9 +233,9 @@ void axpby(Lattice<vobj> &ret,sobj a,sobj b,const Lattice<vobj> &x,const Lattice
|
|||||||
ret.Checkerboard() = x.Checkerboard();
|
ret.Checkerboard() = x.Checkerboard();
|
||||||
conformable(ret,x);
|
conformable(ret,x);
|
||||||
conformable(x,y);
|
conformable(x,y);
|
||||||
auto ret_v = ret.View(AcceleratorWrite);
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
auto x_v = x.View(AcceleratorRead);
|
autoView( x_v , x, AcceleratorRead);
|
||||||
auto y_v = y.View(AcceleratorRead);
|
autoView( y_v , y, AcceleratorRead);
|
||||||
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
|
accelerator_for(ss,x_v.size(),vobj::Nsimd(),{
|
||||||
auto tmp = a*x_v(ss)+b*y_v(ss);
|
auto tmp = a*x_v(ss)+b*y_v(ss);
|
||||||
coalescedWrite(ret_v[ss],tmp);
|
coalescedWrite(ret_v[ss],tmp);
|
||||||
|
@ -84,6 +84,7 @@ public:
|
|||||||
/////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////
|
||||||
void SetViewMode(ViewMode mode) {
|
void SetViewMode(ViewMode mode) {
|
||||||
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
|
LatticeView<vobj> accessor(*( (LatticeAccelerator<vobj> *) this),mode);
|
||||||
|
accessor.ViewClose();
|
||||||
}
|
}
|
||||||
/////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////
|
||||||
// Return a view object that may be dereferenced in site loops.
|
// Return a view object that may be dereferenced in site loops.
|
||||||
@ -123,6 +124,7 @@ public:
|
|||||||
auto tmp = eval(ss,exprCopy);
|
auto tmp = eval(ss,exprCopy);
|
||||||
vstream(me[ss],tmp);
|
vstream(me[ss],tmp);
|
||||||
});
|
});
|
||||||
|
me.ViewClose();
|
||||||
ExpressionViewClose(exprCopy);
|
ExpressionViewClose(exprCopy);
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
@ -145,6 +147,7 @@ public:
|
|||||||
auto tmp = eval(ss,exprCopy);
|
auto tmp = eval(ss,exprCopy);
|
||||||
vstream(me[ss],tmp);
|
vstream(me[ss],tmp);
|
||||||
});
|
});
|
||||||
|
me.ViewClose();
|
||||||
ExpressionViewClose(exprCopy);
|
ExpressionViewClose(exprCopy);
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
@ -166,6 +169,7 @@ public:
|
|||||||
auto tmp = eval(ss,exprCopy);
|
auto tmp = eval(ss,exprCopy);
|
||||||
vstream(me[ss],tmp);
|
vstream(me[ss],tmp);
|
||||||
});
|
});
|
||||||
|
me.ViewClose();
|
||||||
ExpressionViewClose(exprCopy);
|
ExpressionViewClose(exprCopy);
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
@ -221,6 +225,7 @@ public:
|
|||||||
thread_for(ss,me.size(),{
|
thread_for(ss,me.size(),{
|
||||||
me[ss]= r;
|
me[ss]= r;
|
||||||
});
|
});
|
||||||
|
me.ViewClose();
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -278,6 +283,7 @@ public:
|
|||||||
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
||||||
coalescedWrite(me[ss],him(ss));
|
coalescedWrite(me[ss],him(ss));
|
||||||
});
|
});
|
||||||
|
me.ViewClose(); him.ViewClose();
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -292,6 +298,7 @@ public:
|
|||||||
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
accelerator_for(ss,me.size(),vobj::Nsimd(),{
|
||||||
coalescedWrite(me[ss],him(ss));
|
coalescedWrite(me[ss],him(ss));
|
||||||
});
|
});
|
||||||
|
me.ViewClose(); him.ViewClose();
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
///////////////////////////////////////////
|
///////////////////////////////////////////
|
||||||
|
@ -78,9 +78,9 @@ template<class vfunctor,class lobj,class robj>
|
|||||||
inline Lattice<vPredicate> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
|
inline Lattice<vPredicate> LLComparison(vfunctor op,const Lattice<lobj> &lhs,const Lattice<robj> &rhs)
|
||||||
{
|
{
|
||||||
Lattice<vPredicate> ret(rhs.Grid());
|
Lattice<vPredicate> ret(rhs.Grid());
|
||||||
auto lhs_v = lhs.View(CpuRead);
|
autoView( lhs_v, lhs, CpuRead);
|
||||||
auto rhs_v = rhs.View(CpuRead);
|
autoView( rhs_v, rhs, CpuRead);
|
||||||
auto ret_v = ret.View(CpuWrite);
|
autoView( ret_v, ret, CpuWrite);
|
||||||
thread_for( ss, rhs_v.size(), {
|
thread_for( ss, rhs_v.size(), {
|
||||||
ret_v[ss]=op(lhs_v[ss],rhs_v[ss]);
|
ret_v[ss]=op(lhs_v[ss],rhs_v[ss]);
|
||||||
});
|
});
|
||||||
@ -93,8 +93,8 @@ template<class vfunctor,class lobj,class robj>
|
|||||||
inline Lattice<vPredicate> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
|
inline Lattice<vPredicate> LSComparison(vfunctor op,const Lattice<lobj> &lhs,const robj &rhs)
|
||||||
{
|
{
|
||||||
Lattice<vPredicate> ret(lhs.Grid());
|
Lattice<vPredicate> ret(lhs.Grid());
|
||||||
auto lhs_v = lhs.View(CpuRead);
|
autoView( lhs_v, lhs, CpuRead);
|
||||||
auto ret_v = ret.View(CpuWrite);
|
autoView( ret_v, ret, CpuWrite);
|
||||||
thread_for( ss, lhs_v.size(), {
|
thread_for( ss, lhs_v.size(), {
|
||||||
ret_v[ss]=op(lhs_v[ss],rhs);
|
ret_v[ss]=op(lhs_v[ss],rhs);
|
||||||
});
|
});
|
||||||
@ -107,8 +107,8 @@ template<class vfunctor,class lobj,class robj>
|
|||||||
inline Lattice<vPredicate> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
|
inline Lattice<vPredicate> SLComparison(vfunctor op,const lobj &lhs,const Lattice<robj> &rhs)
|
||||||
{
|
{
|
||||||
Lattice<vPredicate> ret(rhs.Grid());
|
Lattice<vPredicate> ret(rhs.Grid());
|
||||||
auto rhs_v = rhs.View(CpuRead);
|
autoView( rhs_v, rhs, CpuRead);
|
||||||
auto ret_v = ret.View(CpuWrite);
|
autoView( ret_v, ret, CpuWrite);
|
||||||
thread_for( ss, rhs_v.size(), {
|
thread_for( ss, rhs_v.size(), {
|
||||||
ret_v[ss]=op(lhs,rhs_v[ss]);
|
ret_v[ss]=op(lhs,rhs_v[ss]);
|
||||||
});
|
});
|
||||||
|
@ -37,7 +37,7 @@ template<class iobj> inline void LatticeCoordinate(Lattice<iobj> &l,int mu)
|
|||||||
GridBase *grid = l.Grid();
|
GridBase *grid = l.Grid();
|
||||||
int Nsimd = grid->iSites();
|
int Nsimd = grid->iSites();
|
||||||
|
|
||||||
auto l_v = l.View(CpuWrite);
|
autoView(l_v, l, CpuWrite);
|
||||||
thread_for( o, grid->oSites(), {
|
thread_for( o, grid->oSites(), {
|
||||||
vector_type vI;
|
vector_type vI;
|
||||||
Coordinate gcoor;
|
Coordinate gcoor;
|
||||||
|
@ -43,8 +43,8 @@ template<class vobj>
|
|||||||
inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
|
inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
|
||||||
{
|
{
|
||||||
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
|
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
|
||||||
auto rhs_v = rhs.View(AcceleratorRead);
|
autoView( rhs_v , rhs, AcceleratorRead);
|
||||||
auto ret_v = ret.View(AcceleratorWrite);
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
|
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
|
||||||
coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss)));
|
coalescedWrite(ret_v[ss],innerProduct(rhs_v(ss),rhs_v(ss)));
|
||||||
});
|
});
|
||||||
@ -56,9 +56,9 @@ template<class vobj>
|
|||||||
inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
|
inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs) -> Lattice<typename vobj::tensor_reduced>
|
||||||
{
|
{
|
||||||
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
|
Lattice<typename vobj::tensor_reduced> ret(rhs.Grid());
|
||||||
auto lhs_v = lhs.View(AcceleratorRead);
|
autoView( lhs_v , lhs, AcceleratorRead);
|
||||||
auto rhs_v = rhs.View(AcceleratorRead);
|
autoView( rhs_v , rhs, AcceleratorRead);
|
||||||
auto ret_v = ret.View(AcceleratorWrite);
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
|
accelerator_for(ss,rhs_v.size(),vobj::Nsimd(),{
|
||||||
coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss)));
|
coalescedWrite(ret_v[ss],innerProduct(lhs_v(ss),rhs_v(ss)));
|
||||||
});
|
});
|
||||||
@ -73,9 +73,9 @@ inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Latt
|
|||||||
typedef decltype(coalescedRead(ll())) sll;
|
typedef decltype(coalescedRead(ll())) sll;
|
||||||
typedef decltype(coalescedRead(rr())) srr;
|
typedef decltype(coalescedRead(rr())) srr;
|
||||||
Lattice<decltype(outerProduct(ll(),rr()))> ret(rhs.Grid());
|
Lattice<decltype(outerProduct(ll(),rr()))> ret(rhs.Grid());
|
||||||
auto lhs_v = lhs.View(AcceleratorRead);
|
autoView( lhs_v , lhs, AcceleratorRead);
|
||||||
auto rhs_v = rhs.View(AcceleratorRead);
|
autoView( rhs_v , rhs, AcceleratorRead);
|
||||||
auto ret_v = ret.View(AcceleratorWrite);
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
accelerator_for(ss,rhs_v.size(),1,{
|
accelerator_for(ss,rhs_v.size(),1,{
|
||||||
// FIXME had issues with scalar version of outer
|
// FIXME had issues with scalar version of outer
|
||||||
// Use vector [] operator and don't read coalesce this loop
|
// Use vector [] operator and don't read coalesce this loop
|
||||||
|
@ -51,9 +51,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
|
|||||||
int block =FullGrid->_slice_block [Orthog];
|
int block =FullGrid->_slice_block [Orthog];
|
||||||
int nblock=FullGrid->_slice_nblock[Orthog];
|
int nblock=FullGrid->_slice_nblock[Orthog];
|
||||||
int ostride=FullGrid->_ostride[Orthog];
|
int ostride=FullGrid->_ostride[Orthog];
|
||||||
auto X_v = X.View(CpuRead);
|
autoView( X_v , X, CpuRead);
|
||||||
auto Y_v = Y.View(CpuRead);
|
autoView( Y_v , Y, CpuRead);
|
||||||
auto R_v = R.View(CpuWrite);
|
autoView( R_v , R, CpuWrite);
|
||||||
thread_region
|
thread_region
|
||||||
{
|
{
|
||||||
std::vector<vobj> s_x(Nblock);
|
std::vector<vobj> s_x(Nblock);
|
||||||
@ -97,8 +97,8 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
|
|||||||
int nblock=FullGrid->_slice_nblock[Orthog];
|
int nblock=FullGrid->_slice_nblock[Orthog];
|
||||||
int ostride=FullGrid->_ostride[Orthog];
|
int ostride=FullGrid->_ostride[Orthog];
|
||||||
|
|
||||||
auto X_v = X.View(CpuRead);
|
autoView( X_v , X, CpuRead);
|
||||||
auto R_v = R.View(CpuWrite);
|
autoView( R_v , R, CpuWrite);
|
||||||
|
|
||||||
thread_region
|
thread_region
|
||||||
{
|
{
|
||||||
@ -156,8 +156,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
|
|||||||
int ostride=FullGrid->_ostride[Orthog];
|
int ostride=FullGrid->_ostride[Orthog];
|
||||||
|
|
||||||
typedef typename vobj::vector_typeD vector_typeD;
|
typedef typename vobj::vector_typeD vector_typeD;
|
||||||
auto lhs_v = lhs.View(CpuRead);
|
autoView( lhs_v , lhs, CpuRead);
|
||||||
auto rhs_v = rhs.View(CpuRead);
|
autoView( rhs_v , rhs, CpuRead);
|
||||||
thread_region {
|
thread_region {
|
||||||
std::vector<vobj> Left(Nblock);
|
std::vector<vobj> Left(Nblock);
|
||||||
std::vector<vobj> Right(Nblock);
|
std::vector<vobj> Right(Nblock);
|
||||||
|
@ -46,8 +46,8 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i) -> Lattice<decltype(peekIndex<Ind
|
|||||||
{
|
{
|
||||||
Lattice<decltype(peekIndex<Index>(vobj(),i))> ret(lhs.Grid());
|
Lattice<decltype(peekIndex<Index>(vobj(),i))> ret(lhs.Grid());
|
||||||
ret.Checkerboard()=lhs.Checkerboard();
|
ret.Checkerboard()=lhs.Checkerboard();
|
||||||
auto ret_v = ret.View(CpuWrite);
|
autoView( ret_v, ret, CpuWrite);
|
||||||
auto lhs_v = lhs.View(CpuRead);
|
autoView( lhs_v, lhs, CpuRead);
|
||||||
thread_for( ss, lhs_v.size(), {
|
thread_for( ss, lhs_v.size(), {
|
||||||
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i);
|
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i);
|
||||||
});
|
});
|
||||||
@ -58,8 +58,8 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekInd
|
|||||||
{
|
{
|
||||||
Lattice<decltype(peekIndex<Index>(vobj(),i,j))> ret(lhs.Grid());
|
Lattice<decltype(peekIndex<Index>(vobj(),i,j))> ret(lhs.Grid());
|
||||||
ret.Checkerboard()=lhs.Checkerboard();
|
ret.Checkerboard()=lhs.Checkerboard();
|
||||||
auto ret_v = ret.View(CpuWrite);
|
autoView( ret_v, ret, CpuWrite);
|
||||||
auto lhs_v = lhs.View(CpuRead);
|
autoView( lhs_v, lhs, CpuRead);
|
||||||
thread_for( ss, lhs_v.size(), {
|
thread_for( ss, lhs_v.size(), {
|
||||||
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i,j);
|
ret_v[ss] = peekIndex<Index>(lhs_v[ss],i,j);
|
||||||
});
|
});
|
||||||
@ -72,8 +72,8 @@ auto PeekIndex(const Lattice<vobj> &lhs,int i,int j) -> Lattice<decltype(peekInd
|
|||||||
template<int Index,class vobj>
|
template<int Index,class vobj>
|
||||||
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0))> & rhs,int i)
|
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0))> & rhs,int i)
|
||||||
{
|
{
|
||||||
auto rhs_v = rhs.View(CpuRead);
|
autoView( rhs_v, rhs, CpuRead);
|
||||||
auto lhs_v = lhs.View(CpuWrite);
|
autoView( lhs_v, lhs, CpuWrite);
|
||||||
thread_for( ss, lhs_v.size(), {
|
thread_for( ss, lhs_v.size(), {
|
||||||
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i);
|
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i);
|
||||||
});
|
});
|
||||||
@ -81,8 +81,8 @@ void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj()
|
|||||||
template<int Index,class vobj>
|
template<int Index,class vobj>
|
||||||
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0,0))> & rhs,int i,int j)
|
void PokeIndex(Lattice<vobj> &lhs,const Lattice<decltype(peekIndex<Index>(vobj(),0,0))> & rhs,int i,int j)
|
||||||
{
|
{
|
||||||
auto rhs_v = rhs.View(CpuRead);
|
autoView( rhs_v, rhs, CpuRead);
|
||||||
auto lhs_v = lhs.View(CpuWrite);
|
autoView( lhs_v, lhs, CpuWrite);
|
||||||
thread_for( ss, lhs_v.size(), {
|
thread_for( ss, lhs_v.size(), {
|
||||||
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i,j);
|
pokeIndex<Index>(lhs_v[ss],rhs_v[ss],i,j);
|
||||||
});
|
});
|
||||||
@ -111,7 +111,7 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){
|
|||||||
|
|
||||||
// extract-modify-merge cycle is easiest way and this is not perf critical
|
// extract-modify-merge cycle is easiest way and this is not perf critical
|
||||||
ExtractBuffer<sobj> buf(Nsimd);
|
ExtractBuffer<sobj> buf(Nsimd);
|
||||||
auto l_v = l.View(CpuWrite);
|
autoView( l_v , l, CpuWrite);
|
||||||
if ( rank == grid->ThisRank() ) {
|
if ( rank == grid->ThisRank() ) {
|
||||||
extract(l_v[odx],buf);
|
extract(l_v[odx],buf);
|
||||||
buf[idx] = s;
|
buf[idx] = s;
|
||||||
@ -141,7 +141,7 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){
|
|||||||
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
|
grid->GlobalCoorToRankIndex(rank,odx,idx,site);
|
||||||
|
|
||||||
ExtractBuffer<sobj> buf(Nsimd);
|
ExtractBuffer<sobj> buf(Nsimd);
|
||||||
auto l_v = l.View(CpuWrite);
|
autoView( l_v , l, CpuWrite);
|
||||||
extract(l_v[odx],buf);
|
extract(l_v[odx],buf);
|
||||||
|
|
||||||
s = buf[idx];
|
s = buf[idx];
|
||||||
@ -173,7 +173,7 @@ inline void peekLocalSite(sobj &s,const Lattice<vobj> &l,Coordinate &site){
|
|||||||
idx= grid->iIndex(site);
|
idx= grid->iIndex(site);
|
||||||
odx= grid->oIndex(site);
|
odx= grid->oIndex(site);
|
||||||
|
|
||||||
auto l_v = l.View(CpuRead);
|
autoView( l_v , l, CpuRead);
|
||||||
scalar_type * vp = (scalar_type *)&l_v[odx];
|
scalar_type * vp = (scalar_type *)&l_v[odx];
|
||||||
scalar_type * pt = (scalar_type *)&s;
|
scalar_type * pt = (scalar_type *)&s;
|
||||||
|
|
||||||
@ -202,7 +202,7 @@ inline void pokeLocalSite(const sobj &s,Lattice<vobj> &l,Coordinate &site){
|
|||||||
idx= grid->iIndex(site);
|
idx= grid->iIndex(site);
|
||||||
odx= grid->oIndex(site);
|
odx= grid->oIndex(site);
|
||||||
|
|
||||||
auto l_v = l.View(CpuWrite);
|
autoView( l_v , l, CpuWrite);
|
||||||
scalar_type * vp = (scalar_type *)&l_v[odx];
|
scalar_type * vp = (scalar_type *)&l_v[odx];
|
||||||
scalar_type * pt = (scalar_type *)&s;
|
scalar_type * pt = (scalar_type *)&s;
|
||||||
for(int w=0;w<words;w++){
|
for(int w=0;w<words;w++){
|
||||||
|
@ -40,8 +40,8 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
|
|
||||||
template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
|
template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
|
||||||
Lattice<vobj> ret(lhs.Grid());
|
Lattice<vobj> ret(lhs.Grid());
|
||||||
auto lhs_v = lhs.View(AcceleratorRead);
|
autoView( lhs_v, lhs, AcceleratorRead);
|
||||||
auto ret_v = ret.View(AcceleratorWrite);
|
autoView( ret_v, ret, AcceleratorWrite);
|
||||||
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
||||||
coalescedWrite(ret_v[ss], adj(lhs_v(ss)));
|
coalescedWrite(ret_v[ss], adj(lhs_v(ss)));
|
||||||
});
|
});
|
||||||
@ -50,8 +50,8 @@ template<class vobj> inline Lattice<vobj> adj(const Lattice<vobj> &lhs){
|
|||||||
|
|
||||||
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
|
template<class vobj> inline Lattice<vobj> conjugate(const Lattice<vobj> &lhs){
|
||||||
Lattice<vobj> ret(lhs.Grid());
|
Lattice<vobj> ret(lhs.Grid());
|
||||||
auto lhs_v = lhs.View(AcceleratorRead);
|
autoView( lhs_v, lhs, AcceleratorRead);
|
||||||
auto ret_v = ret.View(AcceleratorWrite);
|
autoView( ret_v, ret, AcceleratorWrite);
|
||||||
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
||||||
coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss)));
|
coalescedWrite( ret_v[ss] , conjugate(lhs_v(ss)));
|
||||||
});
|
});
|
||||||
|
@ -79,11 +79,11 @@ template<class vobj>
|
|||||||
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
|
inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
|
||||||
{
|
{
|
||||||
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
#if defined(GRID_CUDA)||defined(GRID_HIP)
|
||||||
auto arg_v = arg.View(AcceleratorRead);
|
autoView( arg_v, arg, AcceleratorRead);
|
||||||
Integer osites = arg.Grid()->oSites();
|
Integer osites = arg.Grid()->oSites();
|
||||||
auto ssum= sum_gpu(&arg_v[0],osites);
|
auto ssum= sum_gpu(&arg_v[0],osites);
|
||||||
#else
|
#else
|
||||||
auto arg_v = arg.View(CpuRead);
|
autoView(arg_v, arg, CpuRead);
|
||||||
Integer osites = arg.Grid()->oSites();
|
Integer osites = arg.Grid()->oSites();
|
||||||
auto ssum= sum_cpu(&arg_v[0],osites);
|
auto ssum= sum_cpu(&arg_v[0],osites);
|
||||||
#endif
|
#endif
|
||||||
@ -113,8 +113,8 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
|
|||||||
const uint64_t sites = grid->oSites();
|
const uint64_t sites = grid->oSites();
|
||||||
|
|
||||||
// Might make all code paths go this way.
|
// Might make all code paths go this way.
|
||||||
auto left_v = left.View(AcceleratorRead);
|
autoView( left_v , left, AcceleratorRead);
|
||||||
auto right_v=right.View(AcceleratorRead);
|
autoView( right_v,right, AcceleratorRead);
|
||||||
|
|
||||||
// GPU - SIMT lane compliance...
|
// GPU - SIMT lane compliance...
|
||||||
typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
|
typedef decltype(innerProduct(left_v[0],right_v[0])) inner_t;
|
||||||
@ -168,9 +168,9 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
|
|||||||
const uint64_t sites = grid->oSites();
|
const uint64_t sites = grid->oSites();
|
||||||
|
|
||||||
// GPU
|
// GPU
|
||||||
auto x_v=x.View(AcceleratorRead);
|
autoView( x_v, x, AcceleratorRead);
|
||||||
auto y_v=y.View(AcceleratorRead);
|
autoView( y_v, y, AcceleratorRead);
|
||||||
auto z_v=z.View(AcceleratorWrite);
|
autoView( z_v, z, AcceleratorWrite);
|
||||||
|
|
||||||
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
|
typedef decltype(innerProduct(x_v[0],y_v[0])) inner_t;
|
||||||
Vector<inner_t> inner_tmp(sites);
|
Vector<inner_t> inner_tmp(sites);
|
||||||
@ -257,7 +257,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
|
|||||||
|
|
||||||
// sum over reduced dimension planes, breaking out orthog dir
|
// sum over reduced dimension planes, breaking out orthog dir
|
||||||
// Parallel over orthog direction
|
// Parallel over orthog direction
|
||||||
auto Data_v=Data.View(CpuRead);
|
autoView( Data_v, Data, CpuRead);
|
||||||
thread_for( r,rd, {
|
thread_for( r,rd, {
|
||||||
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
|
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
|
||||||
for(int n=0;n<e1;n++){
|
for(int n=0;n<e1;n++){
|
||||||
@ -335,8 +335,8 @@ static void sliceInnerProductVector( std::vector<ComplexD> & result, const Latti
|
|||||||
int e2= grid->_slice_block [orthogdim];
|
int e2= grid->_slice_block [orthogdim];
|
||||||
int stride=grid->_slice_stride[orthogdim];
|
int stride=grid->_slice_stride[orthogdim];
|
||||||
|
|
||||||
auto lhv=lhs.View(CpuRead);
|
autoView( lhv, lhs, CpuRead);
|
||||||
auto rhv=rhs.View(CpuRead);
|
autoView( rhv, rhs, CpuRead);
|
||||||
thread_for( r,rd,{
|
thread_for( r,rd,{
|
||||||
|
|
||||||
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
|
int so=r*grid->_ostride[orthogdim]; // base offset for start of plane
|
||||||
@ -443,9 +443,9 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
|
|||||||
|
|
||||||
tensor_reduced at; at=av;
|
tensor_reduced at; at=av;
|
||||||
|
|
||||||
auto Rv=R.View(CpuWrite);
|
autoView( Rv, R, CpuWrite);
|
||||||
auto Xv=X.View(CpuRead);
|
autoView( Xv, X, CpuRead);
|
||||||
auto Yv=Y.View(CpuRead);
|
autoView( Yv, Y, CpuRead);
|
||||||
thread_for2d( n, e1, b,e2, {
|
thread_for2d( n, e1, b,e2, {
|
||||||
int ss= so+n*stride+b;
|
int ss= so+n*stride+b;
|
||||||
Rv[ss] = at*Xv[ss]+Yv[ss];
|
Rv[ss] = at*Xv[ss]+Yv[ss];
|
||||||
@ -501,9 +501,9 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
|
|||||||
int nblock=FullGrid->_slice_nblock[Orthog];
|
int nblock=FullGrid->_slice_nblock[Orthog];
|
||||||
int ostride=FullGrid->_ostride[Orthog];
|
int ostride=FullGrid->_ostride[Orthog];
|
||||||
|
|
||||||
auto X_v=X.View(CpuRead);
|
autoView( X_v, X, CpuRead);
|
||||||
auto Y_v=Y.View(CpuRead);
|
autoView( Y_v, Y, CpuRead);
|
||||||
auto R_v=R.View(CpuWrite);
|
autoView( R_v, R, CpuWrite);
|
||||||
thread_region
|
thread_region
|
||||||
{
|
{
|
||||||
Vector<vobj> s_x(Nblock);
|
Vector<vobj> s_x(Nblock);
|
||||||
@ -554,8 +554,8 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
|
|||||||
int block =FullGrid->_slice_block [Orthog];
|
int block =FullGrid->_slice_block [Orthog];
|
||||||
int nblock=FullGrid->_slice_nblock[Orthog];
|
int nblock=FullGrid->_slice_nblock[Orthog];
|
||||||
int ostride=FullGrid->_ostride[Orthog];
|
int ostride=FullGrid->_ostride[Orthog];
|
||||||
auto R_v = R.View(CpuWrite);
|
autoView( R_v, R, CpuWrite);
|
||||||
auto X_v = X.View(CpuRead);
|
autoView( X_v, X, CpuRead);
|
||||||
thread_region
|
thread_region
|
||||||
{
|
{
|
||||||
std::vector<vobj> s_x(Nblock);
|
std::vector<vobj> s_x(Nblock);
|
||||||
@ -613,8 +613,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
|
|||||||
|
|
||||||
typedef typename vobj::vector_typeD vector_typeD;
|
typedef typename vobj::vector_typeD vector_typeD;
|
||||||
|
|
||||||
auto lhs_v=lhs.View(CpuRead);
|
autoView( lhs_v, lhs, CpuRead);
|
||||||
auto rhs_v=rhs.View(CpuRead);
|
autoView( rhs_v, rhs, CpuRead);
|
||||||
thread_region
|
thread_region
|
||||||
{
|
{
|
||||||
std::vector<vobj> Left(Nblock);
|
std::vector<vobj> Left(Nblock);
|
||||||
|
@ -375,7 +375,7 @@ public:
|
|||||||
int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity
|
int osites = _grid->oSites(); // guaranteed to be <= l.Grid()->oSites() by a factor multiplicity
|
||||||
int words = sizeof(scalar_object) / sizeof(scalar_type);
|
int words = sizeof(scalar_object) / sizeof(scalar_type);
|
||||||
|
|
||||||
auto l_v = l.View(CpuWrite);
|
autoView(l_v, l, CpuWrite);
|
||||||
thread_for( ss, osites, {
|
thread_for( ss, osites, {
|
||||||
ExtractBuffer<scalar_object> buf(Nsimd);
|
ExtractBuffer<scalar_object> buf(Nsimd);
|
||||||
for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times
|
for (int m = 0; m < multiplicity; m++) { // Draw from same generator multiplicity times
|
||||||
|
@ -41,8 +41,8 @@ template<class vobj>
|
|||||||
inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))>
|
inline auto trace(const Lattice<vobj> &lhs) -> Lattice<decltype(trace(vobj()))>
|
||||||
{
|
{
|
||||||
Lattice<decltype(trace(vobj()))> ret(lhs.Grid());
|
Lattice<decltype(trace(vobj()))> ret(lhs.Grid());
|
||||||
auto ret_v = ret.View(AcceleratorWrite);
|
autoView(ret_v , ret, AcceleratorWrite);
|
||||||
auto lhs_v = lhs.View(AcceleratorRead);
|
autoView(lhs_v , lhs, AcceleratorRead);
|
||||||
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
||||||
coalescedWrite(ret_v[ss], trace(lhs_v(ss)));
|
coalescedWrite(ret_v[ss], trace(lhs_v(ss)));
|
||||||
});
|
});
|
||||||
@ -56,8 +56,8 @@ template<int Index,class vobj>
|
|||||||
inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(vobj()))>
|
inline auto TraceIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(traceIndex<Index>(vobj()))>
|
||||||
{
|
{
|
||||||
Lattice<decltype(traceIndex<Index>(vobj()))> ret(lhs.Grid());
|
Lattice<decltype(traceIndex<Index>(vobj()))> ret(lhs.Grid());
|
||||||
auto ret_v = ret.View(AcceleratorWrite);
|
autoView( ret_v , ret, AcceleratorWrite);
|
||||||
auto lhs_v = lhs.View(AcceleratorRead);
|
autoView( lhs_v , lhs, AcceleratorRead);
|
||||||
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
accelerator_for( ss, lhs_v.size(), vobj::Nsimd(), {
|
||||||
coalescedWrite(ret_v[ss], traceIndex<Index>(lhs_v(ss)));
|
coalescedWrite(ret_v[ss], traceIndex<Index>(lhs_v(ss)));
|
||||||
});
|
});
|
||||||
|
@ -46,11 +46,12 @@ inline void subdivides(GridBase *coarse,GridBase *fine)
|
|||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// remove and insert a half checkerboard
|
// remove and insert a half checkerboard
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
|
template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full)
|
||||||
|
{
|
||||||
half.Checkerboard() = cb;
|
half.Checkerboard() = cb;
|
||||||
|
|
||||||
auto half_v = half.View(CpuWrite);
|
autoView( half_v, half, CpuWrite);
|
||||||
auto full_v = full.View(CpuRead);
|
autoView( full_v, full, CpuRead);
|
||||||
thread_for(ss, full.Grid()->oSites(),{
|
thread_for(ss, full.Grid()->oSites(),{
|
||||||
int cbos;
|
int cbos;
|
||||||
Coordinate coor;
|
Coordinate coor;
|
||||||
@ -63,10 +64,11 @@ template<class vobj> inline void pickCheckerboard(int cb,Lattice<vobj> &half,con
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
|
template<class vobj> inline void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half)
|
||||||
|
{
|
||||||
int cb = half.Checkerboard();
|
int cb = half.Checkerboard();
|
||||||
auto half_v = half.View(CpuRead);
|
autoView( half_v , half, CpuRead);
|
||||||
auto full_v = full.View(CpuWrite);
|
autoView( full_v , full, CpuWrite);
|
||||||
thread_for(ss,full.Grid()->oSites(),{
|
thread_for(ss,full.Grid()->oSites(),{
|
||||||
|
|
||||||
Coordinate coor;
|
Coordinate coor;
|
||||||
@ -92,79 +94,15 @@ inline void blockProject(Lattice<iVector<CComplex,nbasis > > &coarseData,
|
|||||||
|
|
||||||
Lattice<CComplex> ip(coarse);
|
Lattice<CComplex> ip(coarse);
|
||||||
|
|
||||||
auto coarseData_ = coarseData.View(AcceleratorWrite);
|
autoView( coarseData_ , coarseData, AcceleratorWrite);
|
||||||
auto ip_ = ip.View(AcceleratorWrite);
|
autoView( ip_ , ip, AcceleratorWrite);
|
||||||
for(int v=0;v<nbasis;v++) {
|
for(int v=0;v<nbasis;v++) {
|
||||||
blockInnerProduct(ip,Basis[v],fineData);
|
blockInnerProduct(ip,Basis[v],fineData);
|
||||||
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
|
accelerator_for( sc, coarse->oSites(), vobj::Nsimd(), {
|
||||||
coalescedWrite(coarseData_[sc](v),ip_(sc));
|
coalescedWrite(coarseData_[sc](v),ip_(sc));
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#if 0
|
|
||||||
template<class vobj,class CComplex,int nbasis>
|
|
||||||
inline void blockProject1(Lattice<iVector<CComplex,nbasis > > &coarseData,
|
|
||||||
const Lattice<vobj> &fineData,
|
|
||||||
const std::vector<Lattice<vobj> > &Basis)
|
|
||||||
{
|
|
||||||
typedef iVector<CComplex,nbasis > coarseSiteData;
|
|
||||||
coarseSiteData elide;
|
|
||||||
typedef decltype(coalescedRead(elide)) ScalarComplex;
|
|
||||||
GridBase * fine = fineData.Grid();
|
|
||||||
GridBase * coarse= coarseData.Grid();
|
|
||||||
int _ndimension = coarse->_ndimension;
|
|
||||||
|
|
||||||
// checks
|
|
||||||
assert( nbasis == Basis.size() );
|
|
||||||
subdivides(coarse,fine);
|
|
||||||
for(int i=0;i<nbasis;i++){
|
|
||||||
conformable(Basis[i],fineData);
|
|
||||||
}
|
|
||||||
|
|
||||||
Coordinate block_r (_ndimension);
|
|
||||||
|
|
||||||
for(int d=0 ; d<_ndimension;d++){
|
|
||||||
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
|
|
||||||
assert(block_r[d]*coarse->_rdimensions[d] == fine->_rdimensions[d]);
|
|
||||||
}
|
|
||||||
int blockVol = fine->oSites()/coarse->oSites();
|
|
||||||
|
|
||||||
coarseData=Zero();
|
|
||||||
|
|
||||||
auto fineData_ = fineData.View(AcceleratorRead);
|
|
||||||
auto coarseData_ = coarseData.View(AcceleratorWrite);
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// To make this lock free, loop over coars parallel, and then loop over fine associated with coarse.
|
|
||||||
// Otherwise do fine inner product per site, and make the update atomic
|
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
accelerator_for( sci, nbasis*coarse->oSites(), vobj::Nsimd(), {
|
|
||||||
|
|
||||||
auto sc=sci/nbasis;
|
|
||||||
auto i=sci%nbasis;
|
|
||||||
auto Basis_ = Basis[i].View(AcceleratorRead);
|
|
||||||
|
|
||||||
Coordinate coor_c(_ndimension);
|
|
||||||
Lexicographic::CoorFromIndex(coor_c,sc,coarse->_rdimensions); // Block coordinate
|
|
||||||
|
|
||||||
int sf;
|
|
||||||
decltype(innerProduct(Basis_(sf),fineData_(sf))) reduce=Zero();
|
|
||||||
|
|
||||||
for(int sb=0;sb<blockVol;sb++){
|
|
||||||
|
|
||||||
Coordinate coor_b(_ndimension);
|
|
||||||
Coordinate coor_f(_ndimension);
|
|
||||||
|
|
||||||
Lexicographic::CoorFromIndex(coor_b,sb,block_r);
|
|
||||||
for(int d=0;d<_ndimension;d++) coor_f[d]=coor_c[d]*block_r[d]+coor_b[d];
|
|
||||||
Lexicographic::IndexFromCoor(coor_f,sf,fine->_rdimensions);
|
|
||||||
|
|
||||||
reduce=reduce+innerProduct(Basis_(sf),fineData_(sf));
|
|
||||||
}
|
|
||||||
coalescedWrite(coarseData_[sc](i),reduce);
|
|
||||||
});
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
template<class vobj,class CComplex>
|
template<class vobj,class CComplex>
|
||||||
inline void blockZAXPY(Lattice<vobj> &fineZ,
|
inline void blockZAXPY(Lattice<vobj> &fineZ,
|
||||||
@ -191,10 +129,10 @@ inline void blockZAXPY(Lattice<vobj> &fineZ,
|
|||||||
assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
|
assert(block_r[d]*coarse->_rdimensions[d]==fine->_rdimensions[d]);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto fineZ_ = fineZ.View(AcceleratorWrite);
|
autoView( fineZ_ , fineZ, AcceleratorWrite);
|
||||||
auto fineX_ = fineX.View(AcceleratorRead);
|
autoView( fineX_ , fineX, AcceleratorRead);
|
||||||
auto fineY_ = fineY.View(AcceleratorRead);
|
autoView( fineY_ , fineY, AcceleratorRead);
|
||||||
auto coarseA_= coarseA.View(AcceleratorRead);
|
autoView( coarseA_, coarseA, AcceleratorRead);
|
||||||
|
|
||||||
accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
|
accelerator_for(sf, fine->oSites(), CComplex::Nsimd(), {
|
||||||
|
|
||||||
@ -229,8 +167,8 @@ inline void blockInnerProduct(Lattice<CComplex> &CoarseInner,
|
|||||||
// Precision promotion?
|
// Precision promotion?
|
||||||
fine_inner = localInnerProduct(fineX,fineY);
|
fine_inner = localInnerProduct(fineX,fineY);
|
||||||
blockSum(coarse_inner,fine_inner);
|
blockSum(coarse_inner,fine_inner);
|
||||||
auto CoarseInner_ = CoarseInner.View(AcceleratorWrite);
|
autoView( CoarseInner_ , CoarseInner, AcceleratorWrite);
|
||||||
auto coarse_inner_ = coarse_inner.View(AcceleratorRead);
|
autoView( coarse_inner_ , coarse_inner, AcceleratorRead);
|
||||||
accelerator_for(ss, coarse->oSites(), 1, {
|
accelerator_for(ss, coarse->oSites(), 1, {
|
||||||
CoarseInner_[ss] = coarse_inner_[ss];
|
CoarseInner_[ss] = coarse_inner_[ss];
|
||||||
});
|
});
|
||||||
@ -265,8 +203,8 @@ inline void blockSum(Lattice<vobj> &coarseData,const Lattice<vobj> &fineData)
|
|||||||
|
|
||||||
// Turn this around to loop threaded over sc and interior loop
|
// Turn this around to loop threaded over sc and interior loop
|
||||||
// over sf would thread better
|
// over sf would thread better
|
||||||
auto coarseData_ = coarseData.View(AcceleratorWrite);
|
autoView( coarseData_ , coarseData, AcceleratorWrite);
|
||||||
auto fineData_ = fineData.View(AcceleratorRead);
|
autoView( fineData_ , fineData, AcceleratorRead);
|
||||||
|
|
||||||
accelerator_for(sc,coarse->oSites(),1,{
|
accelerator_for(sc,coarse->oSites(),1,{
|
||||||
|
|
||||||
@ -359,8 +297,8 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
|
|||||||
for(int d=0 ; d<_ndimension;d++){
|
for(int d=0 ; d<_ndimension;d++){
|
||||||
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
|
block_r[d] = fine->_rdimensions[d] / coarse->_rdimensions[d];
|
||||||
}
|
}
|
||||||
auto fineData_ = fineData.View(AcceleratorWrite);
|
autoView( fineData_ , fineData, AcceleratorWrite);
|
||||||
auto coarseData_ = coarseData.View(AcceleratorRead);
|
autoView( coarseData_ , coarseData, AcceleratorRead);
|
||||||
|
|
||||||
// Loop with a cache friendly loop ordering
|
// Loop with a cache friendly loop ordering
|
||||||
accelerator_for(sf,fine->oSites(),1,{
|
accelerator_for(sf,fine->oSites(),1,{
|
||||||
@ -373,7 +311,7 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
|
|||||||
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
|
Lexicographic::IndexFromCoor(coor_c,sc,coarse->_rdimensions);
|
||||||
|
|
||||||
for(int i=0;i<nbasis;i++) {
|
for(int i=0;i<nbasis;i++) {
|
||||||
/* auto basis_ = Basis[i].View( );*/
|
/* auto basis_ = Basis[i], );*/
|
||||||
if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]);
|
if(i==0) fineData_[sf]=coarseData_[sc](i) *basis_[sf]);
|
||||||
else fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]);
|
else fineData_[sf]=fineData_[sf]+coarseData_[sc](i)*basis_[sf]);
|
||||||
}
|
}
|
||||||
@ -394,8 +332,8 @@ inline void blockPromote(const Lattice<iVector<CComplex,nbasis > > &coarseData,
|
|||||||
for(int i=0;i<nbasis;i++) {
|
for(int i=0;i<nbasis;i++) {
|
||||||
Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);
|
Lattice<iScalar<CComplex> > ip = PeekIndex<0>(coarseData,i);
|
||||||
Lattice<CComplex> cip(coarse);
|
Lattice<CComplex> cip(coarse);
|
||||||
auto cip_ = cip.View(AcceleratorWrite);
|
autoView( cip_ , cip, AcceleratorWrite);
|
||||||
auto ip_ = ip.View(AcceleratorRead);
|
autoView( ip_ , ip, AcceleratorRead);
|
||||||
accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{
|
accelerator_forNB(sc,coarse->oSites(),CComplex::Nsimd(),{
|
||||||
coalescedWrite(cip_[sc], ip_(sc)());
|
coalescedWrite(cip_[sc], ip_(sc)());
|
||||||
});
|
});
|
||||||
@ -469,8 +407,8 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
|
|||||||
Coordinate rdt = Tg->_rdimensions;
|
Coordinate rdt = Tg->_rdimensions;
|
||||||
Coordinate ist = Tg->_istride;
|
Coordinate ist = Tg->_istride;
|
||||||
Coordinate ost = Tg->_ostride;
|
Coordinate ost = Tg->_ostride;
|
||||||
auto t_v = To.View(AcceleratorWrite);
|
autoView( t_v , To, AcceleratorWrite);
|
||||||
auto f_v = From.View(AcceleratorRead);
|
autoView( f_v , From, AcceleratorRead);
|
||||||
accelerator_for(idx,Fg->lSites(),1,{
|
accelerator_for(idx,Fg->lSites(),1,{
|
||||||
sobj s;
|
sobj s;
|
||||||
Coordinate Fcoor(nd);
|
Coordinate Fcoor(nd);
|
||||||
@ -717,7 +655,7 @@ unvectorizeToLexOrdArray(std::vector<sobj> &out, const Lattice<vobj> &in)
|
|||||||
}
|
}
|
||||||
|
|
||||||
//loop over outer index
|
//loop over outer index
|
||||||
auto in_v = in.View(CpuRead);
|
autoView( in_v , in, CpuRead);
|
||||||
thread_for(in_oidx,in_grid->oSites(),{
|
thread_for(in_oidx,in_grid->oSites(),{
|
||||||
//Assemble vector of pointers to output elements
|
//Assemble vector of pointers to output elements
|
||||||
ExtractPointerArray<sobj> out_ptrs(in_nsimd);
|
ExtractPointerArray<sobj> out_ptrs(in_nsimd);
|
||||||
@ -810,7 +748,7 @@ vectorizeFromLexOrdArray( std::vector<sobj> &in, Lattice<vobj> &out)
|
|||||||
icoor[lane].resize(ndim);
|
icoor[lane].resize(ndim);
|
||||||
grid->iCoorFromIindex(icoor[lane],lane);
|
grid->iCoorFromIindex(icoor[lane],lane);
|
||||||
}
|
}
|
||||||
auto out_v = out.View(CpuWrite);
|
autoView( out_v , out, CpuWrite);
|
||||||
thread_for(oidx, grid->oSites(),{
|
thread_for(oidx, grid->oSites(),{
|
||||||
//Assemble vector of pointers to output elements
|
//Assemble vector of pointers to output elements
|
||||||
ExtractPointerArray<sobj> ptrs(nsimd);
|
ExtractPointerArray<sobj> ptrs(nsimd);
|
||||||
@ -913,7 +851,7 @@ void precisionChange(Lattice<VobjOut> &out, const Lattice<VobjIn> &in)
|
|||||||
std::vector<SobjOut> in_slex_conv(in_grid->lSites());
|
std::vector<SobjOut> in_slex_conv(in_grid->lSites());
|
||||||
unvectorizeToLexOrdArray(in_slex_conv, in);
|
unvectorizeToLexOrdArray(in_slex_conv, in);
|
||||||
|
|
||||||
auto out_v = out.View(CpuWrite);
|
autoView( out_v , out, CpuWrite);
|
||||||
thread_for(out_oidx,out_grid->oSites(),{
|
thread_for(out_oidx,out_grid->oSites(),{
|
||||||
Coordinate out_ocoor(ndim);
|
Coordinate out_ocoor(ndim);
|
||||||
out_grid->oCoorFromOindex(out_ocoor, out_oidx);
|
out_grid->oCoorFromOindex(out_ocoor, out_oidx);
|
||||||
|
@ -41,8 +41,8 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
|
inline Lattice<vobj> transpose(const Lattice<vobj> &lhs){
|
||||||
Lattice<vobj> ret(lhs.Grid());
|
Lattice<vobj> ret(lhs.Grid());
|
||||||
auto ret_v = ret.View(AcceleratorWrite);
|
autoView( ret_v, ret, AcceleratorWrite);
|
||||||
auto lhs_v = lhs.View(AcceleratorRead);
|
autoView( lhs_v, lhs, AcceleratorRead);
|
||||||
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
|
||||||
coalescedWrite(ret_v[ss], transpose(lhs_v(ss)));
|
coalescedWrite(ret_v[ss], transpose(lhs_v(ss)));
|
||||||
});
|
});
|
||||||
@ -56,8 +56,8 @@ template<int Index,class vobj>
|
|||||||
inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(vobj()))>
|
inline auto TransposeIndex(const Lattice<vobj> &lhs) -> Lattice<decltype(transposeIndex<Index>(vobj()))>
|
||||||
{
|
{
|
||||||
Lattice<decltype(transposeIndex<Index>(vobj()))> ret(lhs.Grid());
|
Lattice<decltype(transposeIndex<Index>(vobj()))> ret(lhs.Grid());
|
||||||
auto ret_v = ret.View(AcceleratorWrite);
|
autoView( ret_v, ret, AcceleratorWrite);
|
||||||
auto lhs_v = lhs.View(AcceleratorRead);
|
autoView( lhs_v, lhs, AcceleratorRead);
|
||||||
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
|
accelerator_for(ss,lhs_v.size(),vobj::Nsimd(),{
|
||||||
coalescedWrite(ret_v[ss] , transposeIndex<Index>(lhs_v(ss)));
|
coalescedWrite(ret_v[ss] , transposeIndex<Index>(lhs_v(ss)));
|
||||||
});
|
});
|
||||||
|
@ -35,8 +35,8 @@ NAMESPACE_BEGIN(Grid);
|
|||||||
|
|
||||||
template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
|
template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
|
||||||
Lattice<obj> ret_i(rhs_i.Grid());
|
Lattice<obj> ret_i(rhs_i.Grid());
|
||||||
auto rhs = rhs_i.View(AcceleratorRead);
|
autoView( rhs, rhs_i, AcceleratorRead);
|
||||||
auto ret = ret_i.View(AcceleratorWrite);
|
autoView( ret, ret_i, AcceleratorWrite);
|
||||||
ret.Checkerboard() = rhs.Checkerboard();
|
ret.Checkerboard() = rhs.Checkerboard();
|
||||||
accelerator_for(ss,rhs.size(),1,{
|
accelerator_for(ss,rhs.size(),1,{
|
||||||
ret[ss]=pow(rhs[ss],y);
|
ret[ss]=pow(rhs[ss],y);
|
||||||
@ -45,8 +45,8 @@ template<class obj> Lattice<obj> pow(const Lattice<obj> &rhs_i,RealD y){
|
|||||||
}
|
}
|
||||||
template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
|
template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
|
||||||
Lattice<obj> ret_i(rhs_i.Grid());
|
Lattice<obj> ret_i(rhs_i.Grid());
|
||||||
auto rhs = rhs_i.View(AcceleratorRead);
|
autoView( rhs , rhs_i, AcceleratorRead);
|
||||||
auto ret = ret_i.View(AcceleratorWrite);
|
autoView( ret , ret_i, AcceleratorWrite);
|
||||||
ret.Checkerboard() = rhs.Checkerboard();
|
ret.Checkerboard() = rhs.Checkerboard();
|
||||||
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
|
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
|
||||||
coalescedWrite(ret[ss],mod(rhs(ss),y));
|
coalescedWrite(ret[ss],mod(rhs(ss),y));
|
||||||
@ -56,8 +56,8 @@ template<class obj> Lattice<obj> mod(const Lattice<obj> &rhs_i,Integer y){
|
|||||||
|
|
||||||
template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
|
template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
|
||||||
Lattice<obj> ret_i(rhs_i.Grid());
|
Lattice<obj> ret_i(rhs_i.Grid());
|
||||||
auto ret = ret_i.View(AcceleratorWrite);
|
autoView( ret , ret_i, AcceleratorWrite);
|
||||||
auto rhs = rhs_i.View(AcceleratorRead);
|
autoView( rhs , rhs_i, AcceleratorRead);
|
||||||
ret.Checkerboard() = rhs_i.Checkerboard();
|
ret.Checkerboard() = rhs_i.Checkerboard();
|
||||||
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
|
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
|
||||||
coalescedWrite(ret[ss],div(rhs(ss),y));
|
coalescedWrite(ret[ss],div(rhs(ss),y));
|
||||||
@ -67,8 +67,8 @@ template<class obj> Lattice<obj> div(const Lattice<obj> &rhs_i,Integer y){
|
|||||||
|
|
||||||
template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){
|
template<class obj> Lattice<obj> expMat(const Lattice<obj> &rhs_i, RealD alpha, Integer Nexp = DEFAULT_MAT_EXP){
|
||||||
Lattice<obj> ret_i(rhs_i.Grid());
|
Lattice<obj> ret_i(rhs_i.Grid());
|
||||||
auto rhs = rhs_i.View(AcceleratorRead);
|
autoView( rhs , rhs_i, AcceleratorRead);
|
||||||
auto ret = ret_i.View(AcceleratorWrite);
|
autoView( ret , ret_i, AcceleratorWrite);
|
||||||
ret.Checkerboard() = rhs.Checkerboard();
|
ret.Checkerboard() = rhs.Checkerboard();
|
||||||
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
|
accelerator_for(ss,rhs.size(),obj::Nsimd(),{
|
||||||
coalescedWrite(ret[ss],Exponentiate(rhs(ss),alpha, Nexp));
|
coalescedWrite(ret[ss],Exponentiate(rhs(ss),alpha, Nexp));
|
||||||
|
@ -25,6 +25,7 @@ void accelerator_inline conformable(GridBase *lhs,GridBase *rhs)
|
|||||||
template<class vobj> class LatticeAccelerator : public LatticeBase
|
template<class vobj> class LatticeAccelerator : public LatticeBase
|
||||||
{
|
{
|
||||||
protected:
|
protected:
|
||||||
|
//public:
|
||||||
GridBase *_grid;
|
GridBase *_grid;
|
||||||
int checkerboard;
|
int checkerboard;
|
||||||
vobj *_odata; // A managed pointer
|
vobj *_odata; // A managed pointer
|
||||||
@ -47,7 +48,7 @@ public:
|
|||||||
// The copy constructor for this will need to be used by device lambda functions
|
// The copy constructor for this will need to be used by device lambda functions
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////////////
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
class LatticeExprView : public LatticeAccelerator<vobj>
|
class LatticeView : public LatticeAccelerator<vobj>
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
// Rvalue
|
// Rvalue
|
||||||
@ -68,7 +69,12 @@ public:
|
|||||||
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
|
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
|
||||||
accelerator_inline uint64_t size(void) const { return this->_odata_size; };
|
accelerator_inline uint64_t size(void) const { return this->_odata_size; };
|
||||||
|
|
||||||
LatticeExprView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me){}
|
LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me){}
|
||||||
|
LatticeView(const LatticeView<vobj> &refer_to_me) = default; // Trivially copyable
|
||||||
|
LatticeView(const LatticeAccelerator<vobj> &refer_to_me,ViewMode mode) : LatticeAccelerator<vobj> (refer_to_me)
|
||||||
|
{
|
||||||
|
this->ViewOpen(mode);
|
||||||
|
}
|
||||||
|
|
||||||
// Host functions
|
// Host functions
|
||||||
void ViewOpen(ViewMode mode)
|
void ViewOpen(ViewMode mode)
|
||||||
@ -89,46 +95,20 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
// Little autoscope assister
|
||||||
|
template<class View>
|
||||||
///////////////////////////////////////////////////////////////////////
|
class ViewCloser
|
||||||
// An object to be stored in a shared_ptr to clean up after last view.
|
|
||||||
// UserView constructor,destructor updates view manager
|
|
||||||
// Non-copyable object??? Second base with copy/= deleted?
|
|
||||||
///////////////////////////////////////////////////////////////////////
|
|
||||||
class MemViewDeleter {
|
|
||||||
public:
|
|
||||||
void *cpu_ptr;
|
|
||||||
ViewMode mode;
|
|
||||||
~MemViewDeleter(){
|
|
||||||
MemoryManager::ViewClose(cpu_ptr,mode);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
template<class vobj>
|
|
||||||
class LatticeView : public LatticeExprView<vobj>
|
|
||||||
{
|
{
|
||||||
#ifndef GRID_UVM
|
View v; // Take a copy of view and call view close when I go out of scope automatically
|
||||||
std::shared_ptr<MemViewDeleter> Deleter;
|
public:
|
||||||
#endif
|
ViewCloser(View &_v) : v(_v) {};
|
||||||
public:
|
~ViewCloser() { v.ViewClose(); }
|
||||||
#ifdef GRID_UVM
|
|
||||||
LatticeView(const LatticeAccelerator<vobj> &refer_to_me,ViewMode mode) :
|
|
||||||
LatticeExprView<vobj> (refer_to_me)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
LatticeView(const LatticeView<vobj> &orig) : LatticeExprView<vobj>(orig) { }
|
|
||||||
LatticeView(const LatticeAccelerator<vobj> &refer_to_me,ViewMode mode) :
|
|
||||||
LatticeExprView<vobj> (refer_to_me), Deleter(new MemViewDeleter)
|
|
||||||
{
|
|
||||||
// std::cout << "FIXME - copy shared pointer? View Open in LatticeView"<<std::hex<<this->_odata<<std::dec<<" mode "<<mode <<std::endl;
|
|
||||||
this->ViewOpen(mode);
|
|
||||||
Deleter->cpu_ptr = this->cpu_ptr;
|
|
||||||
Deleter->mode = mode;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#define autoView(l_v,l,mode) \
|
||||||
|
auto l_v = l.View(mode); \
|
||||||
|
ViewCloser<decltype(l_v)> _autoView##l_v(l_v);
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Lattice expression types used by ET to assemble the AST
|
// Lattice expression types used by ET to assemble the AST
|
||||||
//
|
//
|
||||||
@ -142,7 +122,7 @@ template <typename T> using is_lattice = std::is_base_of<LatticeBase, T>;
|
|||||||
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
|
template <typename T> using is_lattice_expr = std::is_base_of<LatticeExpressionBase,T >;
|
||||||
|
|
||||||
template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
|
template<class T, bool isLattice> struct ViewMapBase { typedef T Type; };
|
||||||
template<class T> struct ViewMapBase<T,true> { typedef LatticeExprView<typename T::vector_object> Type; };
|
template<class T> struct ViewMapBase<T,true> { typedef LatticeView<typename T::vector_object> Type; };
|
||||||
template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
|
template<class T> using ViewMap = ViewMapBase<T,std::is_base_of<LatticeBase, T>::value >;
|
||||||
|
|
||||||
template <typename Op, typename _T1>
|
template <typename Op, typename _T1>
|
||||||
|
@ -232,15 +232,17 @@ public:
|
|||||||
if ( Params.twists[mu] ) {
|
if ( Params.twists[mu] ) {
|
||||||
Uconj = where(coor==neglink,-Uconj,Uconj);
|
Uconj = where(coor==neglink,-Uconj,Uconj);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto U_v = U.View(CpuRead);
|
{
|
||||||
auto Uds_v = Uds.View(CpuWrite);
|
autoView( U_v , U, CpuRead);
|
||||||
auto Uconj_v = Uconj.View(CpuRead);
|
autoView( Uconj_v , Uconj, CpuRead);
|
||||||
auto Utmp_v= Utmp.View(CpuWrite);
|
autoView( Uds_v , Uds, CpuWrite);
|
||||||
thread_foreach(ss,U_v,{
|
autoView( Utmp_v, Utmp, CpuWrite);
|
||||||
Uds_v[ss](0)(mu) = U_v[ss]();
|
thread_foreach(ss,U_v,{
|
||||||
Uds_v[ss](1)(mu) = Uconj_v[ss]();
|
Uds_v[ss](0)(mu) = U_v[ss]();
|
||||||
});
|
Uds_v[ss](1)(mu) = Uconj_v[ss]();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
U = adj(Cshift(U ,mu,-1)); // correct except for spanning the boundary
|
U = adj(Cshift(U ,mu,-1)); // correct except for spanning the boundary
|
||||||
Uconj = adj(Cshift(Uconj,mu,-1));
|
Uconj = adj(Cshift(Uconj,mu,-1));
|
||||||
@ -250,19 +252,25 @@ public:
|
|||||||
Utmp = where(coor==0,Uconj,Utmp);
|
Utmp = where(coor==0,Uconj,Utmp);
|
||||||
}
|
}
|
||||||
|
|
||||||
thread_foreach(ss,Utmp_v,{
|
{
|
||||||
Uds_v[ss](0)(mu+4) = Utmp_v[ss]();
|
autoView( Uds_v , Uds, CpuWrite);
|
||||||
});
|
autoView( Utmp_v, Utmp, CpuWrite);
|
||||||
|
thread_foreach(ss,Utmp_v,{
|
||||||
|
Uds_v[ss](0)(mu+4) = Utmp_v[ss]();
|
||||||
|
});
|
||||||
|
}
|
||||||
Utmp = Uconj;
|
Utmp = Uconj;
|
||||||
if ( Params.twists[mu] ) {
|
if ( Params.twists[mu] ) {
|
||||||
Utmp = where(coor==0,U,Utmp);
|
Utmp = where(coor==0,U,Utmp);
|
||||||
}
|
}
|
||||||
|
|
||||||
thread_foreach(ss,Utmp_v,{
|
{
|
||||||
Uds_v[ss](1)(mu+4) = Utmp_v[ss]();
|
autoView( Uds_v , Uds, CpuWrite);
|
||||||
});
|
autoView( Utmp_v, Utmp, CpuWrite);
|
||||||
|
thread_foreach(ss,Utmp_v,{
|
||||||
|
Uds_v[ss](1)(mu+4) = Utmp_v[ss]();
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -272,11 +280,14 @@ public:
|
|||||||
GaugeLinkField link(mat.Grid());
|
GaugeLinkField link(mat.Grid());
|
||||||
// use lorentz for flavour as hack.
|
// use lorentz for flavour as hack.
|
||||||
auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
|
auto tmp = TraceIndex<SpinIndex>(outerProduct(Btilde, A));
|
||||||
auto link_v = link.View(CpuWrite);
|
|
||||||
auto tmp_v = tmp.View(CpuRead);
|
{
|
||||||
thread_foreach(ss,tmp_v,{
|
autoView( link_v , link, CpuWrite);
|
||||||
link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1));
|
autoView( tmp_v , tmp, CpuRead);
|
||||||
});
|
thread_foreach(ss,tmp_v,{
|
||||||
|
link_v[ss]() = tmp_v[ss](0, 0) + conjugate(tmp_v[ss](1, 1));
|
||||||
|
});
|
||||||
|
}
|
||||||
PokeIndex<LorentzIndex>(mat, link, mu);
|
PokeIndex<LorentzIndex>(mat, link, mu);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -306,16 +317,18 @@ public:
|
|||||||
|
|
||||||
GaugeLinkField tmp(mat.Grid());
|
GaugeLinkField tmp(mat.Grid());
|
||||||
tmp = Zero();
|
tmp = Zero();
|
||||||
auto tmp_v = tmp.View(CpuWrite);
|
{
|
||||||
auto Atilde_v = Atilde.View(CpuRead);
|
autoView( tmp_v , tmp, CpuWrite);
|
||||||
auto Btilde_v = Btilde.View(CpuRead);
|
autoView( Atilde_v , Atilde, CpuRead);
|
||||||
thread_for(ss,tmp.Grid()->oSites(),{
|
autoView( Btilde_v , Btilde, CpuRead);
|
||||||
for (int s = 0; s < Ls; s++) {
|
thread_for(ss,tmp.Grid()->oSites(),{
|
||||||
int sF = s + Ls * ss;
|
for (int s = 0; s < Ls; s++) {
|
||||||
auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
|
int sF = s + Ls * ss;
|
||||||
tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
|
auto ttmp = traceIndex<SpinIndex>(outerProduct(Btilde_v[sF], Atilde_v[sF]));
|
||||||
}
|
tmp_v[ss]() = tmp_v[ss]() + ttmp(0, 0) + conjugate(ttmp(1, 1));
|
||||||
});
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
PokeIndex<LorentzIndex>(mat, tmp, mu);
|
PokeIndex<LorentzIndex>(mat, tmp, mu);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -264,8 +264,8 @@ private:
|
|||||||
{
|
{
|
||||||
CloverFieldType T(F.Grid());
|
CloverFieldType T(F.Grid());
|
||||||
T = Zero();
|
T = Zero();
|
||||||
auto T_v = T.View(CpuWrite);
|
autoView(T_v,T,CpuWrite);
|
||||||
auto F_v = F.View(CpuRead);
|
autoView(F_v,F,CpuRead);
|
||||||
thread_for(i, CloverTerm.Grid()->oSites(),
|
thread_for(i, CloverTerm.Grid()->oSites(),
|
||||||
{
|
{
|
||||||
T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
|
T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
|
||||||
@ -282,8 +282,8 @@ private:
|
|||||||
CloverFieldType T(F.Grid());
|
CloverFieldType T(F.Grid());
|
||||||
T = Zero();
|
T = Zero();
|
||||||
|
|
||||||
auto T_v = T.View(CpuWrite);
|
autoView(T_v, T,CpuWrite);
|
||||||
auto F_v = F.View(CpuRead);
|
autoView(F_v, F,CpuRead);
|
||||||
thread_for(i, CloverTerm.Grid()->oSites(),
|
thread_for(i, CloverTerm.Grid()->oSites(),
|
||||||
{
|
{
|
||||||
T_v[i]()(0, 1) = -F_v[i]()();
|
T_v[i]()(0, 1) = -F_v[i]()();
|
||||||
@ -300,8 +300,8 @@ private:
|
|||||||
CloverFieldType T(F.Grid());
|
CloverFieldType T(F.Grid());
|
||||||
T = Zero();
|
T = Zero();
|
||||||
|
|
||||||
auto T_v = T.View(CpuWrite);
|
autoView(T_v,T,CpuWrite);
|
||||||
auto F_v = F.View(CpuRead);
|
autoView(F_v,F,CpuRead);
|
||||||
thread_for(i, CloverTerm.Grid()->oSites(),
|
thread_for(i, CloverTerm.Grid()->oSites(),
|
||||||
{
|
{
|
||||||
T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
|
T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
|
||||||
@ -318,8 +318,8 @@ private:
|
|||||||
CloverFieldType T(F.Grid());
|
CloverFieldType T(F.Grid());
|
||||||
T = Zero();
|
T = Zero();
|
||||||
|
|
||||||
auto T_v = T.View(CpuWrite);
|
autoView( T_v , T, CpuWrite);
|
||||||
auto F_v = F.View(CpuRead);
|
autoView( F_v , F, CpuRead);
|
||||||
thread_for(i, CloverTerm.Grid()->oSites(),
|
thread_for(i, CloverTerm.Grid()->oSites(),
|
||||||
{
|
{
|
||||||
T_v[i]()(0, 1) = timesI(F_v[i]()());
|
T_v[i]()(0, 1) = timesI(F_v[i]()());
|
||||||
@ -336,8 +336,8 @@ private:
|
|||||||
CloverFieldType T(F.Grid());
|
CloverFieldType T(F.Grid());
|
||||||
T = Zero();
|
T = Zero();
|
||||||
|
|
||||||
auto T_v = T.View(CpuWrite);
|
autoView( T_v ,T,CpuWrite);
|
||||||
auto F_v = F.View(CpuRead);
|
autoView( F_v ,F,CpuRead);
|
||||||
thread_for(i, CloverTerm.Grid()->oSites(),
|
thread_for(i, CloverTerm.Grid()->oSites(),
|
||||||
{
|
{
|
||||||
T_v[i]()(0, 1) = -(F_v[i]()());
|
T_v[i]()(0, 1) = -(F_v[i]()());
|
||||||
@ -355,8 +355,8 @@ private:
|
|||||||
|
|
||||||
T = Zero();
|
T = Zero();
|
||||||
|
|
||||||
auto T_v = T.View(CpuWrite);
|
autoView( T_v , T,CpuWrite);
|
||||||
auto F_v = F.View(CpuRead);
|
autoView( F_v , F,CpuRead);
|
||||||
thread_for(i, CloverTerm.Grid()->oSites(),
|
thread_for(i, CloverTerm.Grid()->oSites(),
|
||||||
{
|
{
|
||||||
T_v[i]()(0, 0) = timesI(F_v[i]()());
|
T_v[i]()(0, 0) = timesI(F_v[i]()());
|
||||||
|
@ -106,9 +106,9 @@ public:
|
|||||||
const _SpinorField & phi,
|
const _SpinorField & phi,
|
||||||
int mu)
|
int mu)
|
||||||
{
|
{
|
||||||
auto out_v= out.View(CpuWrite);
|
autoView( out_v, out, CpuWrite);
|
||||||
auto phi_v= phi.View(CpuRead);
|
autoView( phi_v, phi, CpuRead);
|
||||||
auto Umu_v= Umu.View(CpuRead);
|
autoView( Umu_v, Umu, CpuRead);
|
||||||
thread_for(sss,out.Grid()->oSites(),{
|
thread_for(sss,out.Grid()->oSites(),{
|
||||||
multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu);
|
multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu);
|
||||||
});
|
});
|
||||||
@ -191,18 +191,19 @@ public:
|
|||||||
int Ls=Btilde.Grid()->_fdimensions[0];
|
int Ls=Btilde.Grid()->_fdimensions[0];
|
||||||
GaugeLinkField tmp(mat.Grid());
|
GaugeLinkField tmp(mat.Grid());
|
||||||
tmp = Zero();
|
tmp = Zero();
|
||||||
auto tmp_v = tmp.View(CpuWrite);
|
{
|
||||||
auto Btilde_v = Btilde.View(CpuRead);
|
autoView( tmp_v , tmp, CpuWrite);
|
||||||
auto Atilde_v = Atilde.View(CpuRead);
|
autoView( Btilde_v , Btilde, CpuRead);
|
||||||
thread_for(sss,tmp.Grid()->oSites(),{
|
autoView( Atilde_v , Atilde, CpuRead);
|
||||||
int sU=sss;
|
thread_for(sss,tmp.Grid()->oSites(),{
|
||||||
for(int s=0;s<Ls;s++){
|
int sU=sss;
|
||||||
int sF = s+Ls*sU;
|
for(int s=0;s<Ls;s++){
|
||||||
tmp_v[sU] = tmp_v[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde_v[sF],Atilde_v[sF])); // ordering here
|
int sF = s+Ls*sU;
|
||||||
}
|
tmp_v[sU] = tmp_v[sU]+ traceIndex<SpinIndex>(outerProduct(Btilde_v[sF],Atilde_v[sF])); // ordering here
|
||||||
});
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
PokeIndex<LorentzIndex>(mat,tmp,mu);
|
PokeIndex<LorentzIndex>(mat,tmp,mu);
|
||||||
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -50,9 +50,9 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
|
|||||||
|
|
||||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||||
GridBase *grid=psi_i.Grid();
|
GridBase *grid=psi_i.Grid();
|
||||||
auto psi = psi_i.View(AcceleratorRead);
|
autoView(psi , psi_i,AcceleratorRead);
|
||||||
auto phi = phi_i.View(AcceleratorRead);
|
autoView(phi , phi_i,AcceleratorRead);
|
||||||
auto chi = chi_i.View(AcceleratorWrite);
|
autoView(chi , chi_i,AcceleratorWrite);
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
|
|
||||||
auto pdiag = &diag[0];
|
auto pdiag = &diag[0];
|
||||||
@ -93,9 +93,9 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
|
|||||||
{
|
{
|
||||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||||
GridBase *grid=psi_i.Grid();
|
GridBase *grid=psi_i.Grid();
|
||||||
auto psi = psi_i.View(AcceleratorRead);
|
autoView(psi , psi_i,AcceleratorRead);
|
||||||
auto phi = phi_i.View(AcceleratorRead);
|
autoView(phi , phi_i,AcceleratorRead);
|
||||||
auto chi = chi_i.View(AcceleratorWrite);
|
autoView(chi , chi_i,AcceleratorWrite);
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
|
|
||||||
auto pdiag = &diag[0];
|
auto pdiag = &diag[0];
|
||||||
@ -131,8 +131,8 @@ CayleyFermion5D<Impl>::MooeeInv (const FermionField &psi_i, FermionField &chi
|
|||||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||||
GridBase *grid=psi_i.Grid();
|
GridBase *grid=psi_i.Grid();
|
||||||
|
|
||||||
auto psi = psi_i.View(AcceleratorRead);
|
autoView(psi , psi_i,AcceleratorRead);
|
||||||
auto chi = chi_i.View(AcceleratorWrite);
|
autoView(chi , chi_i,AcceleratorWrite);
|
||||||
|
|
||||||
int Ls=this->Ls;
|
int Ls=this->Ls;
|
||||||
|
|
||||||
@ -193,8 +193,8 @@ CayleyFermion5D<Impl>::MooeeInvDag (const FermionField &psi_i, FermionField &chi
|
|||||||
GridBase *grid=psi_i.Grid();
|
GridBase *grid=psi_i.Grid();
|
||||||
int Ls=this->Ls;
|
int Ls=this->Ls;
|
||||||
|
|
||||||
auto psi = psi_i.View(AcceleratorRead);
|
autoView(psi , psi_i,AcceleratorRead);
|
||||||
auto chi = chi_i.View(AcceleratorWrite);
|
autoView(chi , chi_i,AcceleratorWrite);
|
||||||
|
|
||||||
auto plee = & lee [0];
|
auto plee = & lee [0];
|
||||||
auto pdee = & dee [0];
|
auto pdee = & dee [0];
|
||||||
|
@ -65,9 +65,9 @@ CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
|
|||||||
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
|
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
|
||||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||||
GridBase *grid=psi_i.Grid();
|
GridBase *grid=psi_i.Grid();
|
||||||
auto psi = psi_i.View(CpuRead);
|
autoView(psi, psi_i,CpuRead);
|
||||||
auto phi = phi_i.View(CpuRead);
|
autoView(phi, phi_i,CpuRead);
|
||||||
auto chi = chi_i.View(CpuWrite);
|
autoView(chi, chi_i,CpuWrite);
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
int LLs = grid->_rdimensions[0];
|
int LLs = grid->_rdimensions[0];
|
||||||
const int nsimd= Simd::Nsimd();
|
const int nsimd= Simd::Nsimd();
|
||||||
@ -213,9 +213,9 @@ CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
|
|||||||
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
|
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
|
||||||
chi_i.Checkerboard()=psi_i.Checkerboard();
|
chi_i.Checkerboard()=psi_i.Checkerboard();
|
||||||
GridBase *grid=psi_i.Grid();
|
GridBase *grid=psi_i.Grid();
|
||||||
auto psi=psi_i.View(CpuRead);
|
autoView(psi,psi_i,CpuRead);
|
||||||
auto phi=phi_i.View(CpuRead);
|
autoView(phi,phi_i,CpuRead);
|
||||||
auto chi=chi_i.View(CpuWrite);
|
autoView(chi,chi_i,CpuWrite);
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
int LLs = grid->_rdimensions[0];
|
int LLs = grid->_rdimensions[0];
|
||||||
int nsimd= Simd::Nsimd();
|
int nsimd= Simd::Nsimd();
|
||||||
@ -357,8 +357,8 @@ CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi_i, FermionField
|
|||||||
Vector<iSinglet<Simd> > &Matm)
|
Vector<iSinglet<Simd> > &Matm)
|
||||||
{
|
{
|
||||||
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
|
EnableIf<Impl::LsVectorised&&EnableBool,int> sfinae=0;
|
||||||
auto psi = psi_i.View(CpuRead);
|
autoView(psi , psi_i,CpuRead);
|
||||||
auto chi = chi_i.View(CpuWrite);
|
autoView(chi , chi_i,CpuWrite);
|
||||||
#ifndef AVX512
|
#ifndef AVX512
|
||||||
{
|
{
|
||||||
SiteHalfSpinor BcastP;
|
SiteHalfSpinor BcastP;
|
||||||
@ -535,8 +535,8 @@ CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField
|
|||||||
EnableIf<Impl::LsVectorised,int> sfinae=0;
|
EnableIf<Impl::LsVectorised,int> sfinae=0;
|
||||||
#ifndef AVX512
|
#ifndef AVX512
|
||||||
{
|
{
|
||||||
auto psi = psi_i.View(CpuRead);
|
autoView(psi , psi_i,CpuRead);
|
||||||
auto chi = chi_i.View(CpuWrite);
|
autoView(chi , chi_i,CpuWrite);
|
||||||
|
|
||||||
SiteHalfSpinor BcastP;
|
SiteHalfSpinor BcastP;
|
||||||
SiteHalfSpinor BcastM;
|
SiteHalfSpinor BcastM;
|
||||||
@ -586,8 +586,8 @@ CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField
|
|||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
{
|
{
|
||||||
auto psi = psi_i.View(CpuRead);
|
autoView(psi , psi_i,CpuRead);
|
||||||
auto chi = chi_i.View(CpuWrite);
|
autoView(chi , chi_i,CpuWrite);
|
||||||
// pointers
|
// pointers
|
||||||
// MASK_REGS;
|
// MASK_REGS;
|
||||||
#define Chi_00 %zmm0
|
#define Chi_00 %zmm0
|
||||||
|
@ -46,9 +46,9 @@ void DomainWallEOFAFermion<Impl>::M5D(const FermionField& psi_i, const FermionFi
|
|||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
GridBase* grid = psi_i.Grid();
|
GridBase* grid = psi_i.Grid();
|
||||||
auto phi = phi_i.View(AcceleratorRead);
|
autoView( phi , phi_i, AcceleratorRead);
|
||||||
auto psi = psi_i.View(AcceleratorRead);
|
autoView( psi , psi_i, AcceleratorRead);
|
||||||
auto chi = chi_i.View(AcceleratorWrite);
|
autoView( chi , chi_i, AcceleratorWrite);
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
auto pdiag = &diag[0];
|
auto pdiag = &diag[0];
|
||||||
auto pupper = &upper[0];
|
auto pupper = &upper[0];
|
||||||
@ -82,9 +82,9 @@ void DomainWallEOFAFermion<Impl>::M5Ddag(const FermionField& psi_i, const Fermio
|
|||||||
GridBase* grid = psi_i.Grid();
|
GridBase* grid = psi_i.Grid();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
|
|
||||||
auto psi = psi_i.View(AcceleratorRead);
|
autoView( psi , psi_i, AcceleratorRead);
|
||||||
auto phi = phi_i.View(AcceleratorRead);
|
autoView( phi , phi_i, AcceleratorRead);
|
||||||
auto chi = chi_i.View(AcceleratorWrite);
|
autoView( chi , chi_i, AcceleratorWrite);
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
auto pdiag = &diag[0];
|
auto pdiag = &diag[0];
|
||||||
auto pupper = &upper[0];
|
auto pupper = &upper[0];
|
||||||
@ -116,8 +116,8 @@ void DomainWallEOFAFermion<Impl>::MooeeInv(const FermionField& psi_i, FermionFie
|
|||||||
{
|
{
|
||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase* grid = psi_i.Grid();
|
GridBase* grid = psi_i.Grid();
|
||||||
auto psi=psi_i.View(AcceleratorRead);
|
autoView( psi, psi_i, AcceleratorRead);
|
||||||
auto chi=chi_i.View(AcceleratorWrite);
|
autoView( chi, chi_i, AcceleratorWrite);
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
|
|
||||||
auto plee = & this->lee[0];
|
auto plee = & this->lee[0];
|
||||||
@ -172,8 +172,8 @@ void DomainWallEOFAFermion<Impl>::MooeeInvDag(const FermionField& psi_i, Fermion
|
|||||||
{
|
{
|
||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase* grid = psi_i.Grid();
|
GridBase* grid = psi_i.Grid();
|
||||||
auto psi = psi_i.View(AcceleratorRead);
|
autoView( psi, psi_i, AcceleratorRead);
|
||||||
auto chi = chi_i.View(AcceleratorWrite);
|
autoView( chi, chi_i, AcceleratorWrite);
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
|
|
||||||
auto plee = & this->lee[0];
|
auto plee = & this->lee[0];
|
||||||
|
@ -221,10 +221,10 @@ void ImprovedStaggeredFermion5D<Impl>::DhopDir(const FermionField &in, FermionFi
|
|||||||
|
|
||||||
Compressor compressor;
|
Compressor compressor;
|
||||||
Stencil.HaloExchange(in,compressor);
|
Stencil.HaloExchange(in,compressor);
|
||||||
auto Umu_v = Umu.View(CpuRead);
|
autoView( Umu_v , Umu, CpuRead);
|
||||||
auto UUUmu_v = UUUmu.View(CpuRead);
|
autoView( UUUmu_v , UUUmu, CpuRead);
|
||||||
auto in_v = in.View(CpuRead);
|
autoView( in_v , in, CpuRead);
|
||||||
auto out_v = out.View(CpuWrite);
|
autoView( out_v , out, CpuWrite);
|
||||||
thread_for( ss,Umu.Grid()->oSites(),{
|
thread_for( ss,Umu.Grid()->oSites(),{
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
int sU=ss;
|
int sU=ss;
|
||||||
|
@ -250,10 +250,10 @@ void ImprovedStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGauge
|
|||||||
////////////////////////
|
////////////////////////
|
||||||
// Call the single hop
|
// Call the single hop
|
||||||
////////////////////////
|
////////////////////////
|
||||||
auto U_v = U.View(CpuRead);
|
autoView( U_v , U, CpuRead);
|
||||||
auto UUU_v = UUU.View(CpuRead);
|
autoView( UUU_v , UUU, CpuRead);
|
||||||
auto B_v = B.View(CpuWrite);
|
autoView( B_v , B, CpuWrite);
|
||||||
auto Btilde_v = Btilde.View(CpuWrite);
|
autoView( Btilde_v , Btilde, CpuWrite);
|
||||||
thread_for(sss,B.Grid()->oSites(),{
|
thread_for(sss,B.Grid()->oSites(),{
|
||||||
Kernels::DhopDirKernel(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
|
Kernels::DhopDirKernel(st, U_v, UUU_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
|
||||||
});
|
});
|
||||||
@ -378,10 +378,10 @@ void ImprovedStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionFiel
|
|||||||
|
|
||||||
Compressor compressor;
|
Compressor compressor;
|
||||||
Stencil.HaloExchange(in, compressor);
|
Stencil.HaloExchange(in, compressor);
|
||||||
auto Umu_v = Umu.View(CpuRead);
|
autoView( Umu_v , Umu, CpuRead);
|
||||||
auto UUUmu_v = UUUmu.View(CpuRead);
|
autoView( UUUmu_v , UUUmu, CpuRead);
|
||||||
auto in_v = in.View(CpuRead);
|
autoView( in_v , in, CpuRead);
|
||||||
auto out_v = out.View(CpuWrite);
|
autoView( out_v , out, CpuWrite);
|
||||||
thread_for( sss, in.Grid()->oSites(),{
|
thread_for( sss, in.Grid()->oSites(),{
|
||||||
Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
|
Kernels::DhopDirKernel(Stencil, Umu_v, UUUmu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
|
||||||
});
|
});
|
||||||
|
@ -44,9 +44,9 @@ void MobiusEOFAFermion<Impl>::M5D(const FermionField &psi_i, const FermionField
|
|||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
auto psi = psi_i.View(AcceleratorRead);
|
autoView(psi , psi_i, AcceleratorRead);
|
||||||
auto phi = phi_i.View(AcceleratorRead);
|
autoView(phi , phi_i, AcceleratorRead);
|
||||||
auto chi = chi_i.View(AcceleratorWrite);
|
autoView(chi , chi_i, AcceleratorWrite);
|
||||||
|
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
|
|
||||||
@ -84,9 +84,9 @@ void MobiusEOFAFermion<Impl>::M5D_shift(const FermionField &psi_i, const Fermion
|
|||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
auto psi = psi_i.View(AcceleratorRead);
|
autoView(psi , psi_i, AcceleratorRead);
|
||||||
auto phi = phi_i.View(AcceleratorRead);
|
autoView(phi , phi_i, AcceleratorRead);
|
||||||
auto chi = chi_i.View(AcceleratorWrite);
|
autoView(chi , chi_i, AcceleratorWrite);
|
||||||
|
|
||||||
auto pm = this->pm;
|
auto pm = this->pm;
|
||||||
int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
|
int shift_s = (pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
|
||||||
@ -132,9 +132,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag(const FermionField &psi_i, const FermionFie
|
|||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
auto psi = psi_i.View(AcceleratorRead);
|
autoView(psi , psi_i, AcceleratorRead);
|
||||||
auto phi = phi_i.View(AcceleratorRead);
|
autoView(phi , phi_i, AcceleratorRead);
|
||||||
auto chi = chi_i.View(AcceleratorWrite);
|
autoView(chi , chi_i, AcceleratorWrite);
|
||||||
|
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
|
|
||||||
@ -174,9 +174,9 @@ void MobiusEOFAFermion<Impl>::M5Ddag_shift(const FermionField &psi_i, const Ferm
|
|||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
|
int shift_s = (this->pm == 1) ? (Ls-1) : 0; // s-component modified by shift operator
|
||||||
auto psi = psi_i.View(AcceleratorRead);
|
autoView(psi , psi_i, AcceleratorRead);
|
||||||
auto phi = phi_i.View(AcceleratorRead);
|
autoView(phi , phi_i, AcceleratorRead);
|
||||||
auto chi = chi_i.View(AcceleratorWrite);
|
autoView(chi , chi_i, AcceleratorWrite);
|
||||||
|
|
||||||
assert(phi.Checkerboard() == psi.Checkerboard());
|
assert(phi.Checkerboard() == psi.Checkerboard());
|
||||||
|
|
||||||
@ -226,8 +226,8 @@ void MobiusEOFAFermion<Impl>::MooeeInv(const FermionField &psi_i, FermionField &
|
|||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
auto psi = psi_i.View(AcceleratorRead);
|
autoView(psi , psi_i, AcceleratorRead);
|
||||||
auto chi = chi_i.View(AcceleratorWrite);
|
autoView(chi , chi_i, AcceleratorWrite);
|
||||||
|
|
||||||
auto plee = & this->lee [0];
|
auto plee = & this->lee [0];
|
||||||
auto pdee = & this->dee [0];
|
auto pdee = & this->dee [0];
|
||||||
@ -286,8 +286,8 @@ void MobiusEOFAFermion<Impl>::MooeeInv_shift(const FermionField &psi_i, FermionF
|
|||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
auto psi = psi_i.View(AcceleratorRead);
|
autoView(psi , psi_i, AcceleratorRead);
|
||||||
auto chi = chi_i.View(AcceleratorWrite);
|
autoView(chi , chi_i, AcceleratorWrite);
|
||||||
|
|
||||||
auto pm = this->pm;
|
auto pm = this->pm;
|
||||||
auto plee = & this->lee [0];
|
auto plee = & this->lee [0];
|
||||||
@ -354,8 +354,8 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag(const FermionField &psi_i, FermionFiel
|
|||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
auto psi = psi_i.View(AcceleratorRead);
|
autoView(psi , psi_i, AcceleratorRead);
|
||||||
auto chi = chi_i.View(AcceleratorWrite);
|
autoView(chi , chi_i, AcceleratorWrite);
|
||||||
|
|
||||||
auto plee = & this->lee [0];
|
auto plee = & this->lee [0];
|
||||||
auto pdee = & this->dee [0];
|
auto pdee = & this->dee [0];
|
||||||
@ -410,8 +410,8 @@ void MobiusEOFAFermion<Impl>::MooeeInvDag_shift(const FermionField &psi_i, Fermi
|
|||||||
{
|
{
|
||||||
chi_i.Checkerboard() = psi_i.Checkerboard();
|
chi_i.Checkerboard() = psi_i.Checkerboard();
|
||||||
GridBase *grid = psi_i.Grid();
|
GridBase *grid = psi_i.Grid();
|
||||||
auto psi = psi_i.View(AcceleratorRead);
|
autoView(psi , psi_i, AcceleratorRead);
|
||||||
auto chi = chi_i.View(AcceleratorWrite);
|
autoView(chi , chi_i, AcceleratorWrite);
|
||||||
int Ls = this->Ls;
|
int Ls = this->Ls;
|
||||||
|
|
||||||
auto pm = this->pm;
|
auto pm = this->pm;
|
||||||
|
@ -208,9 +208,9 @@ void NaiveStaggeredFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeFie
|
|||||||
////////////////////////
|
////////////////////////
|
||||||
// Call the single hop
|
// Call the single hop
|
||||||
////////////////////////
|
////////////////////////
|
||||||
auto U_v = U.View(CpuRead);
|
autoView( U_v , U, CpuRead);
|
||||||
auto B_v = B.View(CpuWrite);
|
autoView( B_v , B, CpuWrite);
|
||||||
auto Btilde_v = Btilde.View(CpuWrite);
|
autoView( Btilde_v , Btilde, CpuWrite);
|
||||||
thread_for(sss,B.Grid()->oSites(),{
|
thread_for(sss,B.Grid()->oSites(),{
|
||||||
Kernels::DhopDirKernel(st, U_v, U_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
|
Kernels::DhopDirKernel(st, U_v, U_v, st.CommBuf(), sss, sss, B_v, Btilde_v, mu,1);
|
||||||
});
|
});
|
||||||
@ -315,9 +315,9 @@ void NaiveStaggeredFermion<Impl>::DhopDir(const FermionField &in, FermionField &
|
|||||||
|
|
||||||
Compressor compressor;
|
Compressor compressor;
|
||||||
Stencil.HaloExchange(in, compressor);
|
Stencil.HaloExchange(in, compressor);
|
||||||
auto Umu_v = Umu.View(CpuRead);
|
autoView( Umu_v , Umu, CpuRead);
|
||||||
auto in_v = in.View(CpuRead);
|
autoView( in_v , in, CpuRead);
|
||||||
auto out_v = out.View(CpuWrite);
|
autoView( out_v , out, CpuWrite);
|
||||||
// thread_for( sss, in.Grid()->oSites(),{
|
// thread_for( sss, in.Grid()->oSites(),{
|
||||||
// Kernels::DhopDirKernel(Stencil, Umu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
|
// Kernels::DhopDirKernel(Stencil, Umu_v, Stencil.CommBuf(), sss, sss, in_v, out_v, dir, disp);
|
||||||
// });
|
// });
|
||||||
|
@ -261,11 +261,11 @@ void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
GridBase *FGrid=in.Grid();
|
GridBase *FGrid=in.Grid();
|
||||||
GridBase *UGrid=U.Grid();
|
GridBase *UGrid=U.Grid();
|
||||||
typedef StaggeredKernels<Impl> ThisKernel;
|
typedef StaggeredKernels<Impl> ThisKernel;
|
||||||
auto UUU_v = UUU.View(AcceleratorRead);
|
autoView( UUU_v , UUU, AcceleratorRead);
|
||||||
auto U_v = U.View(AcceleratorRead);
|
autoView( U_v , U, AcceleratorRead);
|
||||||
auto in_v = in.View(AcceleratorRead);
|
autoView( in_v , in, AcceleratorRead);
|
||||||
auto out_v = out.View(AcceleratorWrite);
|
autoView( out_v , out, AcceleratorWrite);
|
||||||
auto st_v = st.View(AcceleratorRead);
|
autoView( st_v , st, AcceleratorRead);
|
||||||
SiteSpinor * buf = st.CommBuf();
|
SiteSpinor * buf = st.CommBuf();
|
||||||
|
|
||||||
int Ls=1;
|
int Ls=1;
|
||||||
@ -301,11 +301,11 @@ void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st, LebesgueOrder &lo,
|
|||||||
GridBase *FGrid=in.Grid();
|
GridBase *FGrid=in.Grid();
|
||||||
GridBase *UGrid=U.Grid();
|
GridBase *UGrid=U.Grid();
|
||||||
typedef StaggeredKernels<Impl> ThisKernel;
|
typedef StaggeredKernels<Impl> ThisKernel;
|
||||||
auto UUU_v= U.View(AcceleratorRead);
|
autoView( UUU_v , U, AcceleratorRead);
|
||||||
auto U_v = U.View(AcceleratorRead);
|
autoView( U_v , U, AcceleratorRead);
|
||||||
auto in_v = in.View(AcceleratorRead);
|
autoView( in_v , in, AcceleratorRead);
|
||||||
auto out_v = out.View(AcceleratorWrite);
|
autoView( out_v , out, AcceleratorWrite);
|
||||||
auto st_v = st.View(AcceleratorRead);
|
autoView( st_v , st, AcceleratorRead);
|
||||||
SiteSpinor * buf = st.CommBuf();
|
SiteSpinor * buf = st.CommBuf();
|
||||||
|
|
||||||
int Ls=1;
|
int Ls=1;
|
||||||
|
@ -475,12 +475,12 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
|
|||||||
// Inefficient comms method but not performance critical.
|
// Inefficient comms method but not performance critical.
|
||||||
tmp1 = Cshift(q_in_1, mu, 1);
|
tmp1 = Cshift(q_in_1, mu, 1);
|
||||||
tmp2 = Cshift(q_in_2, mu, 1);
|
tmp2 = Cshift(q_in_2, mu, 1);
|
||||||
auto tmp1_v = tmp1.View(CpuWrite);
|
autoView( tmp1_v , tmp1, CpuWrite);
|
||||||
auto tmp2_v = tmp2.View(CpuWrite);
|
autoView( tmp2_v , tmp2, CpuWrite);
|
||||||
auto q_in_1_v=q_in_1.View(CpuRead);
|
autoView( q_in_1_v,q_in_1, CpuRead);
|
||||||
auto q_in_2_v=q_in_2.View(CpuRead);
|
autoView( q_in_2_v,q_in_2, CpuRead);
|
||||||
auto q_out_v = q_out.View(CpuRead);
|
autoView( q_out_v , q_out, CpuRead);
|
||||||
auto Umu_v = Umu.View(CpuRead);
|
autoView( Umu_v , Umu, CpuRead);
|
||||||
thread_for(sU, Umu.Grid()->oSites(),{
|
thread_for(sU, Umu.Grid()->oSites(),{
|
||||||
Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sU],
|
Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sU],
|
||||||
q_in_2_v[sU],
|
q_in_2_v[sU],
|
||||||
@ -526,11 +526,11 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
|||||||
tmp = lattice_cmplx*q_in;
|
tmp = lattice_cmplx*q_in;
|
||||||
tmpBwd = Cshift(tmp, mu, -1);
|
tmpBwd = Cshift(tmp, mu, -1);
|
||||||
|
|
||||||
auto coords_v = coords.View(CpuRead);
|
autoView( coords_v , coords, CpuRead);
|
||||||
auto tmpFwd_v = tmpFwd.View(CpuRead);
|
autoView( tmpFwd_v , tmpFwd, CpuRead);
|
||||||
auto tmpBwd_v = tmpBwd.View(CpuRead);
|
autoView( tmpBwd_v , tmpBwd, CpuRead);
|
||||||
auto Umu_v = Umu.View(CpuRead);
|
autoView( Umu_v , Umu, CpuRead);
|
||||||
auto q_out_v = q_out.View(CpuWrite);
|
autoView( q_out_v , q_out, CpuWrite);
|
||||||
|
|
||||||
thread_for(sU, Umu.Grid()->oSites(), {
|
thread_for(sU, Umu.Grid()->oSites(), {
|
||||||
|
|
||||||
|
@ -348,18 +348,18 @@ template <class Impl>
|
|||||||
void WilsonKernels<Impl>::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls,
|
void WilsonKernels<Impl>::DhopDirAll( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int Ls,
|
||||||
int Nsite, const FermionField &in, std::vector<FermionField> &out)
|
int Nsite, const FermionField &in, std::vector<FermionField> &out)
|
||||||
{
|
{
|
||||||
auto U_v = U.View(AcceleratorRead);
|
autoView(U_v ,U,AcceleratorRead);
|
||||||
auto in_v = in.View(AcceleratorRead);
|
autoView(in_v ,in,AcceleratorRead);
|
||||||
auto st_v = st.View(AcceleratorRead);
|
autoView(st_v ,st,AcceleratorRead);
|
||||||
|
|
||||||
auto out_Xm = out[0].View(AcceleratorWrite);
|
autoView(out_Xm,out[0],AcceleratorWrite);
|
||||||
auto out_Ym = out[1].View(AcceleratorWrite);
|
autoView(out_Ym,out[1],AcceleratorWrite);
|
||||||
auto out_Zm = out[2].View(AcceleratorWrite);
|
autoView(out_Zm,out[2],AcceleratorWrite);
|
||||||
auto out_Tm = out[3].View(AcceleratorWrite);
|
autoView(out_Tm,out[3],AcceleratorWrite);
|
||||||
auto out_Xp = out[4].View(AcceleratorWrite);
|
autoView(out_Xp,out[4],AcceleratorWrite);
|
||||||
auto out_Yp = out[5].View(AcceleratorWrite);
|
autoView(out_Yp,out[5],AcceleratorWrite);
|
||||||
auto out_Zp = out[6].View(AcceleratorWrite);
|
autoView(out_Zp,out[6],AcceleratorWrite);
|
||||||
auto out_Tp = out[7].View(AcceleratorWrite);
|
autoView(out_Tp,out[7],AcceleratorWrite);
|
||||||
auto CBp=st.CommBuf();
|
auto CBp=st.CommBuf();
|
||||||
accelerator_forNB(sss,Nsite*Ls,Simd::Nsimd(),{
|
accelerator_forNB(sss,Nsite*Ls,Simd::Nsimd(),{
|
||||||
int sU=sss/Ls;
|
int sU=sss/Ls;
|
||||||
@ -383,10 +383,10 @@ void WilsonKernels<Impl>::DhopDirKernel( StencilImpl &st, DoubledGaugeField &U,S
|
|||||||
assert(dirdisp<=7);
|
assert(dirdisp<=7);
|
||||||
assert(dirdisp>=0);
|
assert(dirdisp>=0);
|
||||||
|
|
||||||
auto U_v = U.View(AcceleratorRead);
|
autoView(U_v ,U ,AcceleratorRead);
|
||||||
auto in_v = in.View(AcceleratorRead);
|
autoView(in_v ,in ,AcceleratorRead);
|
||||||
auto out_v = out.View(AcceleratorWrite);
|
autoView(out_v,out,AcceleratorWrite);
|
||||||
auto st_v = st.View(AcceleratorRead);
|
autoView(st_v ,st ,AcceleratorRead);
|
||||||
auto CBp=st.CommBuf();
|
auto CBp=st.CommBuf();
|
||||||
#define LoopBody(Dir) \
|
#define LoopBody(Dir) \
|
||||||
case Dir : \
|
case Dir : \
|
||||||
@ -438,10 +438,10 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
|||||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||||
int interior,int exterior)
|
int interior,int exterior)
|
||||||
{
|
{
|
||||||
auto U_v = U.View(AcceleratorRead);
|
autoView(U_v , U,AcceleratorRead);
|
||||||
auto in_v = in.View(AcceleratorRead);
|
autoView(in_v , in,AcceleratorRead);
|
||||||
auto out_v = out.View(AcceleratorWrite);
|
autoView(out_v,out,AcceleratorWrite);
|
||||||
auto st_v = st.View(AcceleratorRead);
|
autoView(st_v , st,AcceleratorRead);
|
||||||
|
|
||||||
if( interior && exterior ) {
|
if( interior && exterior ) {
|
||||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
|
||||||
@ -469,10 +469,10 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
|||||||
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
int Ls, int Nsite, const FermionField &in, FermionField &out,
|
||||||
int interior,int exterior)
|
int interior,int exterior)
|
||||||
{
|
{
|
||||||
auto U_v = U.View(AcceleratorRead);
|
autoView(U_v ,U,AcceleratorRead);
|
||||||
auto in_v = in.View(AcceleratorRead);
|
autoView(in_v ,in,AcceleratorRead);
|
||||||
auto out_v = out.View(AcceleratorWrite);
|
autoView(out_v,out,AcceleratorWrite);
|
||||||
auto st_v = st.View(AcceleratorRead);
|
autoView(st_v ,st,AcceleratorRead);
|
||||||
|
|
||||||
if( interior && exterior ) {
|
if( interior && exterior ) {
|
||||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;}
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteDag); return;}
|
||||||
|
@ -86,8 +86,8 @@ public:
|
|||||||
|
|
||||||
// Move this elsewhere? FIXME
|
// Move this elsewhere? FIXME
|
||||||
static inline void AddLink(Field &U, LinkField &W, int mu) { // U[mu] += W
|
static inline void AddLink(Field &U, LinkField &W, int mu) { // U[mu] += W
|
||||||
auto U_v = U.View(CpuWrite);
|
autoView(U_v,U,CpuWrite);
|
||||||
auto W_v = W.View(CpuRead);
|
autoView(W_v,W,CpuRead);
|
||||||
thread_for( ss, U.Grid()->oSites(), {
|
thread_for( ss, U.Grid()->oSites(), {
|
||||||
U_v[ss](mu) = U_v[ss](mu) + W_v[ss]();
|
U_v[ss](mu) = U_v[ss](mu) + W_v[ss]();
|
||||||
});
|
});
|
||||||
@ -131,15 +131,15 @@ public:
|
|||||||
//static std::chrono::duration<double> diff;
|
//static std::chrono::duration<double> diff;
|
||||||
|
|
||||||
//auto start = std::chrono::high_resolution_clock::now();
|
//auto start = std::chrono::high_resolution_clock::now();
|
||||||
auto U_v = U.View(CpuWrite);
|
autoView(U_v,U,CpuWrite);
|
||||||
auto P_v = P.View(CpuRead);
|
autoView(P_v,P,CpuRead);
|
||||||
thread_for(ss, P.Grid()->oSites(),{
|
thread_for(ss, P.Grid()->oSites(),{
|
||||||
for (int mu = 0; mu < Nd; mu++) {
|
for (int mu = 0; mu < Nd; mu++) {
|
||||||
U_v[ss](mu) = ProjectOnGroup(Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu));
|
U_v[ss](mu) = ProjectOnGroup(Exponentiate(P_v[ss](mu), ep, Nexp) * U_v[ss](mu));
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
//auto end = std::chrono::high_resolution_clock::now();
|
//auto end = std::chrono::high_resolution_clock::now();
|
||||||
// diff += end - start;
|
// diff += end - start;
|
||||||
// std::cout << "Time to exponentiate matrix " << diff.count() << " s\n";
|
// std::cout << "Time to exponentiate matrix " << diff.count() << " s\n";
|
||||||
}
|
}
|
||||||
|
@ -89,8 +89,8 @@ public:
|
|||||||
action = (2.0 * Ndim + mass_square) * phisquared - lambda * phisquared * phisquared;
|
action = (2.0 * Ndim + mass_square) * phisquared - lambda * phisquared * phisquared;
|
||||||
|
|
||||||
|
|
||||||
auto p_v = p.View(CpuRead);
|
autoView( p_v , p, CpuRead);
|
||||||
auto action_v = action.View(CpuWrite);
|
autoView( action_v , action, CpuWrite);
|
||||||
for (int mu = 0; mu < Ndim; mu++)
|
for (int mu = 0; mu < Ndim; mu++)
|
||||||
{
|
{
|
||||||
// pshift = Cshift(p, mu, +1); // not efficient, implement with stencils
|
// pshift = Cshift(p, mu, +1); // not efficient, implement with stencils
|
||||||
@ -146,8 +146,8 @@ public:
|
|||||||
for (int point = 0; point < npoint; point++)
|
for (int point = 0; point < npoint; point++)
|
||||||
{
|
{
|
||||||
|
|
||||||
auto p_v = p.View(CpuRead);
|
autoView( p_v , p, CpuRead);
|
||||||
auto force_v = force.View(CpuWrite);
|
autoView( force_v , force, CpuWrite);
|
||||||
|
|
||||||
int permute_type;
|
int permute_type;
|
||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
|
@ -81,7 +81,7 @@ static Registrar<OneFlavourRatioEOFModule<FermionImplementationPolicy>,
|
|||||||
static Registrar< ConjugateGradientModule<WilsonFermionR::FermionField>,
|
static Registrar< ConjugateGradientModule<WilsonFermionR::FermionField>,
|
||||||
HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CGWFmodXMLInit("ConjugateGradient");
|
HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CGWFmodXMLInit("ConjugateGradient");
|
||||||
static Registrar< BiCGSTABModule<WilsonFermionR::FermionField>,
|
static Registrar< BiCGSTABModule<WilsonFermionR::FermionField>,
|
||||||
HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CGWFmodXMLInit("BiCGSTAB");
|
HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __BiCGWFmodXMLInit("BiCGSTAB");
|
||||||
static Registrar< ConjugateResidualModule<WilsonFermionR::FermionField>,
|
static Registrar< ConjugateResidualModule<WilsonFermionR::FermionField>,
|
||||||
HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CRWFmodXMLInit("ConjugateResidual");
|
HMC_SolverModuleFactory<solver_string, WilsonFermionR::FermionField, Serialiser> > __CRWFmodXMLInit("ConjugateResidual");
|
||||||
|
|
||||||
|
@ -185,13 +185,14 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
|
|||||||
|
|
||||||
for(int i=0;i<Lblock;i++){
|
for(int i=0;i<Lblock;i++){
|
||||||
|
|
||||||
auto lhs_v = lhs_wi[i].View(CpuRead);
|
// Recreate view potentially expensive outside of UVM mode
|
||||||
|
autoView(lhs_v,lhs_wi[i],CpuRead);
|
||||||
auto left = conjugate(lhs_v[ss]);
|
auto left = conjugate(lhs_v[ss]);
|
||||||
|
|
||||||
for(int j=0;j<Rblock;j++){
|
for(int j=0;j<Rblock;j++){
|
||||||
|
|
||||||
SpinMatrix_v vv;
|
SpinMatrix_v vv;
|
||||||
auto rhs_v = rhs_vj[j].View(CpuRead);
|
// Recreate view potentially expensive outside of UVM mode
|
||||||
|
autoView(rhs_v,rhs_vj[j],CpuRead);
|
||||||
auto right = rhs_v[ss];
|
auto right = rhs_v[ss];
|
||||||
for(int s1=0;s1<Ns;s1++){
|
for(int s1=0;s1<Ns;s1++){
|
||||||
for(int s2=0;s2<Ns;s2++){
|
for(int s2=0;s2<Ns;s2++){
|
||||||
@ -204,11 +205,10 @@ void A2Autils<FImpl>::MesonField(TensorType &mat,
|
|||||||
int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
|
int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
|
||||||
for ( int m=0;m<Nmom;m++){
|
for ( int m=0;m<Nmom;m++){
|
||||||
int idx = m+base;
|
int idx = m+base;
|
||||||
auto mom_v = mom[m].View(CpuRead);
|
autoView(mom_v,mom[m],CpuRead);
|
||||||
auto phase = mom_v[ss];
|
auto phase = mom_v[ss];
|
||||||
mac(&lvSum[idx],&vv,&phase);
|
mac(&lvSum[idx],&vv,&phase);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -371,7 +371,7 @@ void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
|
|||||||
|
|
||||||
for(int i=0;i<Lblock;i++){
|
for(int i=0;i<Lblock;i++){
|
||||||
|
|
||||||
auto wi_v = wi[i].View(CpuRead);
|
autoView(wi_v,wi[i],CpuRead);
|
||||||
auto w = conjugate(wi_v[ss]);
|
auto w = conjugate(wi_v[ss]);
|
||||||
if (g5) {
|
if (g5) {
|
||||||
w()(2)(0) = - w()(2)(0);
|
w()(2)(0) = - w()(2)(0);
|
||||||
@ -383,7 +383,7 @@ void A2Autils<FImpl>::PionFieldXX(Eigen::Tensor<ComplexD,3> &mat,
|
|||||||
}
|
}
|
||||||
for(int j=0;j<Rblock;j++){
|
for(int j=0;j<Rblock;j++){
|
||||||
|
|
||||||
auto vj_v=vj[j].View(CpuRead);
|
autoView(vj_v,vj[j],CpuRead);
|
||||||
auto v = vj_v[ss];
|
auto v = vj_v[ss];
|
||||||
auto vv = v()(0)(0);
|
auto vv = v()(0)(0);
|
||||||
|
|
||||||
@ -518,12 +518,12 @@ void A2Autils<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
|
|||||||
|
|
||||||
for(int i=0;i<Lblock;i++){
|
for(int i=0;i<Lblock;i++){
|
||||||
|
|
||||||
auto wi_v = wi[i].View(CpuRead);
|
autoView(wi_v,wi[i],CpuRead);
|
||||||
auto w = conjugate(wi_v[ss]);
|
auto w = conjugate(wi_v[ss]);
|
||||||
|
|
||||||
for(int j=0;j<Rblock;j++){
|
for(int j=0;j<Rblock;j++){
|
||||||
|
|
||||||
auto vj_v = vj[j].View(CpuRead);
|
autoView(vj_v,vj[j],CpuRead);
|
||||||
auto v = vj_v[ss];
|
auto v = vj_v[ss];
|
||||||
|
|
||||||
auto vv = w()(0)(0) * v()(0)(0)// Gamma5 Dirac basis explicitly written out
|
auto vv = w()(0)(0) * v()(0)(0)// Gamma5 Dirac basis explicitly written out
|
||||||
@ -544,7 +544,7 @@ void A2Autils<FImpl>::PionFieldWVmom(Eigen::Tensor<ComplexD,4> &mat,
|
|||||||
int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
|
int base = Nmom*i+Nmom*Lblock*j+Nmom*Lblock*Rblock*r;
|
||||||
for ( int m=0;m<Nmom;m++){
|
for ( int m=0;m<Nmom;m++){
|
||||||
int idx = m+base;
|
int idx = m+base;
|
||||||
auto mom_v = mom[m].View(CpuRead);
|
autoView(mom_v,mom[m],CpuRead);
|
||||||
auto phase = mom_v[ss];
|
auto phase = mom_v[ss];
|
||||||
mac(&lvSum[idx],&vv,&phase()()());
|
mac(&lvSum[idx],&vv,&phase()()());
|
||||||
}
|
}
|
||||||
@ -730,13 +730,13 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
|
|||||||
|
|
||||||
for(int i=0;i<Lblock;i++)
|
for(int i=0;i<Lblock;i++)
|
||||||
{
|
{
|
||||||
auto wi_v = lhs_wi[i].View(CpuRead);
|
autoView(wi_v,lhs_wi[i],CpuRead);
|
||||||
auto left = conjugate(wi_v[ss]);
|
auto left = conjugate(wi_v[ss]);
|
||||||
|
|
||||||
for(int j=0;j<Rblock;j++)
|
for(int j=0;j<Rblock;j++)
|
||||||
{
|
{
|
||||||
SpinMatrix_v vv;
|
SpinMatrix_v vv;
|
||||||
auto vj_v = rhs_vj[j].View(CpuRead);
|
autoView(vj_v,rhs_vj[j],CpuRead);
|
||||||
auto right = vj_v[ss];
|
auto right = vj_v[ss];
|
||||||
|
|
||||||
for(int s1=0;s1<Ns;s1++)
|
for(int s1=0;s1<Ns;s1++)
|
||||||
@ -752,8 +752,8 @@ void A2Autils<FImpl>::AslashField(TensorType &mat,
|
|||||||
|
|
||||||
for ( int m=0;m<Nem;m++)
|
for ( int m=0;m<Nem;m++)
|
||||||
{
|
{
|
||||||
auto emB0_v = emB0[m].View(CpuRead);
|
autoView(emB0_v,emB0[m],CpuRead);
|
||||||
auto emB1_v = emB1[m].View(CpuRead);
|
autoView(emB1_v,emB1[m],CpuRead);
|
||||||
int idx = m+base;
|
int idx = m+base;
|
||||||
auto b0 = emB0_v[ss];
|
auto b0 = emB0_v[ss];
|
||||||
auto b1 = emB1_v[ss];
|
auto b1 = emB1_v[ss];
|
||||||
@ -1014,21 +1014,21 @@ A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
|
|||||||
for(int d_o=0;d_o<N_d;d_o+=d_unroll){
|
for(int d_o=0;d_o<N_d;d_o+=d_unroll){
|
||||||
for(int t=0;t<N_t;t++){
|
for(int t=0;t<N_t;t++){
|
||||||
for(int s=0;s<N_s;s++){
|
for(int s=0;s<N_s;s++){
|
||||||
auto vs_v = vs[s].View(CpuRead);
|
autoView(vs_v,vs[s],CpuRead);
|
||||||
auto tmp1 = vs_v[ss];
|
auto tmp1 = vs_v[ss];
|
||||||
vobj tmp2 = Zero();
|
vobj tmp2 = Zero();
|
||||||
vobj tmp3 = Zero();
|
vobj tmp3 = Zero();
|
||||||
for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
|
for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
|
||||||
auto vd_v = vd[d].View(CpuRead);
|
autoView(vd_v,vd[d],CpuRead);
|
||||||
Scalar_v coeff = WW_sd(t,s,d);
|
Scalar_v coeff = WW_sd(t,s,d);
|
||||||
tmp3 = conjugate(vd_v[ss]);
|
tmp3 = conjugate(vd_v[ss]);
|
||||||
mac(&tmp2, &coeff, &tmp3);
|
mac(&tmp2, &coeff, &tmp3);
|
||||||
}
|
}
|
||||||
|
|
||||||
//////////////////////////
|
//////////////////////////
|
||||||
// Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
|
// Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
|
||||||
//////////////////////////
|
//////////////////////////
|
||||||
OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss);
|
OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss);
|
||||||
|
|
||||||
}}
|
}}
|
||||||
}
|
}
|
||||||
@ -1067,21 +1067,20 @@ A2Autils<FImpl>::ContractWWVV(std::vector<PropagatorField> &WWVV,
|
|||||||
thread_for(ss,grid->oSites(),{
|
thread_for(ss,grid->oSites(),{
|
||||||
for(int d_o=0;d_o<N_d;d_o+=d_unroll){
|
for(int d_o=0;d_o<N_d;d_o+=d_unroll){
|
||||||
for(int s=0;s<N_s;s++){
|
for(int s=0;s<N_s;s++){
|
||||||
auto vs_v = vs[s].View(CpuRead);
|
autoView(vs_v,vs[s],CpuRead);
|
||||||
auto tmp1 = vs_v[ss];
|
auto tmp1 = vs_v[ss];
|
||||||
vobj tmp2 = Zero();
|
vobj tmp2 = Zero();
|
||||||
vobj tmp3 = Zero();
|
vobj tmp3 = Zero();
|
||||||
for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
|
for(int d=d_o;d<MIN(d_o+d_unroll,N_d);d++){
|
||||||
auto vd_v = vd[d].View(CpuRead);
|
autoView(vd_v,vd[d],CpuRead);
|
||||||
Scalar_v coeff = buf(s,d);
|
Scalar_v coeff = buf(s,d);
|
||||||
tmp3 = conjugate(vd_v[ss]);
|
tmp3 = conjugate(vd_v[ss]);
|
||||||
mac(&tmp2, &coeff, &tmp3);
|
mac(&tmp2, &coeff, &tmp3);
|
||||||
}
|
}
|
||||||
|
//////////////////////////
|
||||||
//////////////////////////
|
// Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
|
||||||
// Fast outer product of tmp1 with a sum of terms suppressed by d_unroll
|
//////////////////////////
|
||||||
//////////////////////////
|
OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss);
|
||||||
OuterProductWWVV(WWVV[t], tmp1, tmp2, Ns, ss);
|
|
||||||
}}
|
}}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@ -1093,7 +1092,7 @@ inline void A2Autils<FImpl>::OuterProductWWVV(PropagatorField &WWVV,
|
|||||||
const vobj &rhs,
|
const vobj &rhs,
|
||||||
const int Ns, const int ss)
|
const int Ns, const int ss)
|
||||||
{
|
{
|
||||||
auto WWVV_v = WWVV.View(CpuWrite);
|
autoView(WWVV_v,WWVV,CpuWrite);
|
||||||
for (int s1 = 0; s1 < Ns; s1++){
|
for (int s1 = 0; s1 < Ns; s1++){
|
||||||
for (int s2 = 0; s2 < Ns; s2++){
|
for (int s2 = 0; s2 < Ns; s2++){
|
||||||
WWVV_v[ss]()(s1,s2)(0, 0) += lhs()(s1)(0) * rhs()(s2)(0);
|
WWVV_v[ss]()(s1,s2)(0, 0) += lhs()(s1)(0) * rhs()(s2)(0);
|
||||||
@ -1122,10 +1121,10 @@ void A2Autils<FImpl>::ContractFourQuarkColourDiagonal(const PropagatorField &WWV
|
|||||||
|
|
||||||
GridBase *grid = WWVV0.Grid();
|
GridBase *grid = WWVV0.Grid();
|
||||||
|
|
||||||
auto WWVV0_v = WWVV0.View(CpuRead);
|
autoView(WWVV0_v , WWVV0,CpuRead);
|
||||||
auto WWVV1_v = WWVV1.View(CpuRead);
|
autoView(WWVV1_v , WWVV1,CpuRead);
|
||||||
auto O_trtr_v= O_trtr.View(CpuWrite);
|
autoView(O_trtr_v, O_trtr,CpuWrite);
|
||||||
auto O_fig8_v= O_fig8.View(CpuWrite);
|
autoView(O_fig8_v, O_fig8,CpuWrite);
|
||||||
thread_for(ss,grid->oSites(),{
|
thread_for(ss,grid->oSites(),{
|
||||||
|
|
||||||
typedef typename ComplexField::vector_object vobj;
|
typedef typename ComplexField::vector_object vobj;
|
||||||
@ -1166,10 +1165,10 @@ void A2Autils<FImpl>::ContractFourQuarkColourMix(const PropagatorField &WWVV0,
|
|||||||
|
|
||||||
GridBase *grid = WWVV0.Grid();
|
GridBase *grid = WWVV0.Grid();
|
||||||
|
|
||||||
auto WWVV0_v = WWVV0.View(CpuRead);
|
autoView( WWVV0_v , WWVV0,CpuRead);
|
||||||
auto WWVV1_v = WWVV1.View(CpuRead);
|
autoView( WWVV1_v , WWVV1,CpuRead);
|
||||||
auto O_trtr_v= O_trtr.View(CpuWrite);
|
autoView( O_trtr_v, O_trtr,CpuWrite);
|
||||||
auto O_fig8_v= O_fig8.View(CpuWrite);
|
autoView( O_fig8_v, O_fig8,CpuWrite);
|
||||||
|
|
||||||
thread_for(ss,grid->oSites(),{
|
thread_for(ss,grid->oSites(),{
|
||||||
|
|
||||||
|
@ -273,10 +273,10 @@ void BaryonUtils<FImpl>::ContractBaryons(const PropagatorField &q1_left,
|
|||||||
for (int ie=0; ie < 6 ; ie++)
|
for (int ie=0; ie < 6 ; ie++)
|
||||||
wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0;
|
wick_contraction[ie] = (quarks_left[0] == quarks_right[epsilon[ie][0]] && quarks_left[1] == quarks_right[epsilon[ie][1]] && quarks_left[2] == quarks_right[epsilon[ie][2]]) ? 1 : 0;
|
||||||
|
|
||||||
auto vbaryon_corr= baryon_corr.View(CpuWrite);
|
autoView(vbaryon_corr, baryon_corr,CpuWrite);
|
||||||
auto v1 = q1_left.View(CpuRead);
|
autoView( v1 , q1_left, CpuRead);
|
||||||
auto v2 = q2_left.View(CpuRead);
|
autoView( v2 , q2_left, CpuRead);
|
||||||
auto v3 = q3_left.View(CpuRead);
|
autoView( v3 , q3_left, CpuRead);
|
||||||
|
|
||||||
// accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
|
// accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
|
||||||
thread_for(ss,grid->oSites(),{
|
thread_for(ss,grid->oSites(),{
|
||||||
@ -560,10 +560,10 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_Eye(const PropagatorField &qq_loop,
|
|||||||
{
|
{
|
||||||
GridBase *grid = qs_ti.Grid();
|
GridBase *grid = qs_ti.Grid();
|
||||||
|
|
||||||
auto vcorr= stn_corr.View(CpuWrite);
|
autoView( vcorr, stn_corr, CpuWrite);
|
||||||
auto vq_loop = qq_loop.View(CpuRead);
|
autoView( vq_loop , qq_loop, CpuRead);
|
||||||
auto vd_tf = qd_tf.View(CpuRead);
|
autoView( vd_tf , qd_tf, CpuRead);
|
||||||
auto vs_ti = qs_ti.View(CpuRead);
|
autoView( vs_ti , qs_ti, CpuRead);
|
||||||
|
|
||||||
// accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
|
// accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
|
||||||
thread_for(ss,grid->oSites(),{
|
thread_for(ss,grid->oSites(),{
|
||||||
@ -597,12 +597,11 @@ void BaryonUtils<FImpl>::Sigma_to_Nucleon_NonEye(const PropagatorField &qq_ti,
|
|||||||
{
|
{
|
||||||
GridBase *grid = qs_ti.Grid();
|
GridBase *grid = qs_ti.Grid();
|
||||||
|
|
||||||
auto vcorr= stn_corr.View(CpuWrite);
|
autoView( vcorr , stn_corr, CpuWrite);
|
||||||
auto vq_ti = qq_ti.View(CpuRead);
|
autoView( vq_ti , qq_ti, CpuRead);
|
||||||
auto vq_tf = qq_tf.View(CpuRead);
|
autoView( vq_tf , qq_tf, CpuRead);
|
||||||
auto vd_tf = qd_tf.View(CpuRead);
|
autoView( vd_tf , qd_tf, CpuRead);
|
||||||
auto vs_ti = qs_ti.View(CpuRead);
|
autoView( vs_ti , qs_ti, CpuRead);
|
||||||
|
|
||||||
// accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
|
// accelerator_for(ss, grid->oSites(), grid->Nsimd(), {
|
||||||
thread_for(ss,grid->oSites(),{
|
thread_for(ss,grid->oSites(),{
|
||||||
auto Dq_ti = vq_ti[ss];
|
auto Dq_ti = vq_ti[ss];
|
||||||
|
@ -47,8 +47,8 @@ void axpibg5x(Lattice<vobj> &z,const Lattice<vobj> &x,Coeff a,Coeff b)
|
|||||||
GridBase *grid=x.Grid();
|
GridBase *grid=x.Grid();
|
||||||
|
|
||||||
Gamma G5(Gamma::Algebra::Gamma5);
|
Gamma G5(Gamma::Algebra::Gamma5);
|
||||||
auto x_v = x.View(AcceleratorRead);
|
autoView(x_v, x, AcceleratorRead);
|
||||||
auto z_v = z.View(AcceleratorWrite);
|
autoView(z_v, z, AcceleratorWrite);
|
||||||
accelerator_for( ss, x_v.size(),vobj::Nsimd(), {
|
accelerator_for( ss, x_v.size(),vobj::Nsimd(), {
|
||||||
auto tmp = a*x_v(ss) + G5*(b*timesI(x_v(ss)));
|
auto tmp = a*x_v(ss) + G5*(b*timesI(x_v(ss)));
|
||||||
coalescedWrite(z_v[ss],tmp);
|
coalescedWrite(z_v[ss],tmp);
|
||||||
@ -63,9 +63,9 @@ void axpby_ssp(Lattice<vobj> &z, Coeff a,const Lattice<vobj> &x,Coeff b,const La
|
|||||||
conformable(x,z);
|
conformable(x,z);
|
||||||
GridBase *grid=x.Grid();
|
GridBase *grid=x.Grid();
|
||||||
int Ls = grid->_rdimensions[0];
|
int Ls = grid->_rdimensions[0];
|
||||||
auto x_v = x.View(AcceleratorRead);
|
autoView( x_v, x, AcceleratorRead);
|
||||||
auto y_v = y.View(AcceleratorRead);
|
autoView( y_v, y, AcceleratorRead);
|
||||||
auto z_v = z.View(AcceleratorWrite);
|
autoView( z_v, z, AcceleratorWrite);
|
||||||
// FIXME -- need a new class of accelerator_loop to implement this
|
// FIXME -- need a new class of accelerator_loop to implement this
|
||||||
//
|
//
|
||||||
uint64_t nloop = grid->oSites()/Ls;
|
uint64_t nloop = grid->oSites()/Ls;
|
||||||
@ -85,9 +85,9 @@ void ag5xpby_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const L
|
|||||||
GridBase *grid=x.Grid();
|
GridBase *grid=x.Grid();
|
||||||
int Ls = grid->_rdimensions[0];
|
int Ls = grid->_rdimensions[0];
|
||||||
Gamma G5(Gamma::Algebra::Gamma5);
|
Gamma G5(Gamma::Algebra::Gamma5);
|
||||||
auto x_v = x.View(AcceleratorRead);
|
autoView( x_v, x, AcceleratorRead);
|
||||||
auto y_v = y.View(AcceleratorRead);
|
autoView( y_v, y, AcceleratorRead);
|
||||||
auto z_v = z.View(AcceleratorWrite);
|
autoView( z_v, z, AcceleratorWrite);
|
||||||
uint64_t nloop = grid->oSites()/Ls;
|
uint64_t nloop = grid->oSites()/Ls;
|
||||||
accelerator_for(sss,nloop,vobj::Nsimd(),{
|
accelerator_for(sss,nloop,vobj::Nsimd(),{
|
||||||
uint64_t ss = sss*Ls;
|
uint64_t ss = sss*Ls;
|
||||||
@ -104,9 +104,9 @@ void axpbg5y_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const L
|
|||||||
conformable(x,z);
|
conformable(x,z);
|
||||||
GridBase *grid=x.Grid();
|
GridBase *grid=x.Grid();
|
||||||
int Ls = grid->_rdimensions[0];
|
int Ls = grid->_rdimensions[0];
|
||||||
auto x_v = x.View(AcceleratorRead);
|
autoView( x_v, x, AcceleratorRead);
|
||||||
auto y_v = y.View(AcceleratorRead);
|
autoView( y_v, y, AcceleratorRead);
|
||||||
auto z_v = z.View(AcceleratorWrite);
|
autoView( z_v, z, AcceleratorWrite);
|
||||||
Gamma G5(Gamma::Algebra::Gamma5);
|
Gamma G5(Gamma::Algebra::Gamma5);
|
||||||
uint64_t nloop = grid->oSites()/Ls;
|
uint64_t nloop = grid->oSites()/Ls;
|
||||||
accelerator_for(sss,nloop,vobj::Nsimd(),{
|
accelerator_for(sss,nloop,vobj::Nsimd(),{
|
||||||
@ -125,9 +125,9 @@ void ag5xpbg5y_ssp(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,const
|
|||||||
GridBase *grid=x.Grid();
|
GridBase *grid=x.Grid();
|
||||||
int Ls = grid->_rdimensions[0];
|
int Ls = grid->_rdimensions[0];
|
||||||
|
|
||||||
auto x_v = x.View(AcceleratorRead);
|
autoView( x_v, x, AcceleratorRead);
|
||||||
auto y_v = y.View(AcceleratorRead);
|
autoView( y_v, y, AcceleratorRead);
|
||||||
auto z_v = z.View(AcceleratorWrite);
|
autoView( z_v, z, AcceleratorWrite);
|
||||||
Gamma G5(Gamma::Algebra::Gamma5);
|
Gamma G5(Gamma::Algebra::Gamma5);
|
||||||
uint64_t nloop = grid->oSites()/Ls;
|
uint64_t nloop = grid->oSites()/Ls;
|
||||||
accelerator_for(sss,nloop,vobj::Nsimd(),{
|
accelerator_for(sss,nloop,vobj::Nsimd(),{
|
||||||
@ -147,9 +147,9 @@ void axpby_ssp_pminus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,co
|
|||||||
GridBase *grid=x.Grid();
|
GridBase *grid=x.Grid();
|
||||||
int Ls = grid->_rdimensions[0];
|
int Ls = grid->_rdimensions[0];
|
||||||
|
|
||||||
auto x_v = x.View(AcceleratorRead);
|
autoView( x_v, x, AcceleratorRead);
|
||||||
auto y_v = y.View(AcceleratorRead);
|
autoView( y_v, y, AcceleratorRead);
|
||||||
auto z_v = z.View(AcceleratorWrite);
|
autoView( z_v, z, AcceleratorWrite);
|
||||||
uint64_t nloop = grid->oSites()/Ls;
|
uint64_t nloop = grid->oSites()/Ls;
|
||||||
accelerator_for(sss,nloop,vobj::Nsimd(),{
|
accelerator_for(sss,nloop,vobj::Nsimd(),{
|
||||||
uint64_t ss = sss*Ls;
|
uint64_t ss = sss*Ls;
|
||||||
@ -168,9 +168,9 @@ void axpby_ssp_pplus(Lattice<vobj> &z,Coeff a,const Lattice<vobj> &x,Coeff b,con
|
|||||||
conformable(x,z);
|
conformable(x,z);
|
||||||
GridBase *grid=x.Grid();
|
GridBase *grid=x.Grid();
|
||||||
int Ls = grid->_rdimensions[0];
|
int Ls = grid->_rdimensions[0];
|
||||||
auto x_v = x.View(AcceleratorRead);
|
autoView( x_v, x, AcceleratorRead);
|
||||||
auto y_v = y.View(AcceleratorRead);
|
autoView( y_v, y, AcceleratorRead);
|
||||||
auto z_v = z.View(AcceleratorWrite);
|
autoView( z_v, z, AcceleratorWrite);
|
||||||
uint64_t nloop = grid->oSites()/Ls;
|
uint64_t nloop = grid->oSites()/Ls;
|
||||||
accelerator_for(sss,nloop,vobj::Nsimd(),{
|
accelerator_for(sss,nloop,vobj::Nsimd(),{
|
||||||
uint64_t ss = sss*Ls;
|
uint64_t ss = sss*Ls;
|
||||||
@ -189,8 +189,8 @@ void G5R5(Lattice<vobj> &z,const Lattice<vobj> &x)
|
|||||||
conformable(x,z);
|
conformable(x,z);
|
||||||
int Ls = grid->_rdimensions[0];
|
int Ls = grid->_rdimensions[0];
|
||||||
Gamma G5(Gamma::Algebra::Gamma5);
|
Gamma G5(Gamma::Algebra::Gamma5);
|
||||||
auto x_v = x.View(AcceleratorRead);
|
autoView( x_v, x, AcceleratorRead);
|
||||||
auto z_v = z.View(AcceleratorWrite);
|
autoView( z_v, z, AcceleratorWrite);
|
||||||
uint64_t nloop = grid->oSites()/Ls;
|
uint64_t nloop = grid->oSites()/Ls;
|
||||||
accelerator_for(sss,nloop,vobj::Nsimd(),{
|
accelerator_for(sss,nloop,vobj::Nsimd(),{
|
||||||
uint64_t ss = sss*Ls;
|
uint64_t ss = sss*Ls;
|
||||||
@ -222,8 +222,8 @@ void G5C(Lattice<iVector<CComplex, nbasis>> &z, const Lattice<iVector<CComplex,
|
|||||||
static_assert(nbasis % 2 == 0, "");
|
static_assert(nbasis % 2 == 0, "");
|
||||||
int nb = nbasis / 2;
|
int nb = nbasis / 2;
|
||||||
|
|
||||||
auto z_v = z.View(AcceleratorWrite);
|
autoView( z_v, z, AcceleratorWrite);
|
||||||
auto x_v = x.View(AcceleratorRead);
|
autoView( x_v, x, AcceleratorRead);
|
||||||
accelerator_for(ss,grid->oSites(),CComplex::Nsimd(),
|
accelerator_for(ss,grid->oSites(),CComplex::Nsimd(),
|
||||||
{
|
{
|
||||||
for(int n = 0; n < nb; ++n) {
|
for(int n = 0; n < nb; ++n) {
|
||||||
|
@ -222,10 +222,10 @@ public:
|
|||||||
conformable(subgroup, Determinant);
|
conformable(subgroup, Determinant);
|
||||||
int i0, i1;
|
int i0, i1;
|
||||||
su2SubGroupIndex(i0, i1, su2_index);
|
su2SubGroupIndex(i0, i1, su2_index);
|
||||||
auto subgroup_v = subgroup.View(CpuWrite);
|
|
||||||
auto source_v = source.View(CpuRead);
|
|
||||||
auto Determinant_v = Determinant.View(CpuWrite);
|
|
||||||
|
|
||||||
|
autoView( subgroup_v , subgroup,CpuWrite);
|
||||||
|
autoView( source_v , source,CpuRead);
|
||||||
|
autoView( Determinant_v , Determinant,CpuWrite);
|
||||||
thread_for(ss, grid->oSites(), {
|
thread_for(ss, grid->oSites(), {
|
||||||
|
|
||||||
subgroup_v[ss]()()(0, 0) = source_v[ss]()()(i0, i0);
|
subgroup_v[ss]()()(0, 0) = source_v[ss]()()(i0, i0);
|
||||||
@ -257,8 +257,8 @@ public:
|
|||||||
su2SubGroupIndex(i0, i1, su2_index);
|
su2SubGroupIndex(i0, i1, su2_index);
|
||||||
|
|
||||||
dest = 1.0; // start out with identity
|
dest = 1.0; // start out with identity
|
||||||
auto dest_v = dest.View(CpuWrite);
|
autoView( dest_v , dest, CpuWrite);
|
||||||
auto subgroup_v = subgroup.View(CpuRead);
|
autoView( subgroup_v, subgroup, CpuRead);
|
||||||
thread_for(ss, grid->oSites(),
|
thread_for(ss, grid->oSites(),
|
||||||
{
|
{
|
||||||
dest_v[ss]()()(i0, i0) = subgroup_v[ss]()()(0, 0);
|
dest_v[ss]()()(i0, i0) = subgroup_v[ss]()()(0, 0);
|
||||||
@ -266,6 +266,7 @@ public:
|
|||||||
dest_v[ss]()()(i1, i0) = subgroup_v[ss]()()(1, 0);
|
dest_v[ss]()()(i1, i0) = subgroup_v[ss]()()(1, 0);
|
||||||
dest_v[ss]()()(i1, i1) = subgroup_v[ss]()()(1, 1);
|
dest_v[ss]()()(i1, i1) = subgroup_v[ss]()()(1, 1);
|
||||||
});
|
});
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
///////////////////////////////////////////////
|
///////////////////////////////////////////////
|
||||||
@ -608,8 +609,8 @@ public:
|
|||||||
|
|
||||||
// reunitarise??
|
// reunitarise??
|
||||||
template <typename LatticeMatrixType>
|
template <typename LatticeMatrixType>
|
||||||
static void LieRandomize(GridParallelRNG &pRNG, LatticeMatrixType &out,
|
static void LieRandomize(GridParallelRNG &pRNG, LatticeMatrixType &out, double scale = 1.0)
|
||||||
double scale = 1.0) {
|
{
|
||||||
GridBase *grid = out.Grid();
|
GridBase *grid = out.Grid();
|
||||||
|
|
||||||
typedef typename LatticeMatrixType::vector_type vector_type;
|
typedef typename LatticeMatrixType::vector_type vector_type;
|
||||||
@ -618,8 +619,7 @@ public:
|
|||||||
typedef iSinglet<vector_type> vTComplexType;
|
typedef iSinglet<vector_type> vTComplexType;
|
||||||
|
|
||||||
typedef Lattice<vTComplexType> LatticeComplexType;
|
typedef Lattice<vTComplexType> LatticeComplexType;
|
||||||
typedef typename GridTypeMapper<
|
typedef typename GridTypeMapper<typename LatticeMatrixType::vector_object>::scalar_object MatrixType;
|
||||||
typename LatticeMatrixType::vector_object>::scalar_object MatrixType;
|
|
||||||
|
|
||||||
LatticeComplexType ca(grid);
|
LatticeComplexType ca(grid);
|
||||||
LatticeMatrixType lie(grid);
|
LatticeMatrixType lie(grid);
|
||||||
@ -629,6 +629,7 @@ public:
|
|||||||
MatrixType ta;
|
MatrixType ta;
|
||||||
|
|
||||||
lie = Zero();
|
lie = Zero();
|
||||||
|
|
||||||
for (int a = 0; a < AdjointDimension; a++) {
|
for (int a = 0; a < AdjointDimension; a++) {
|
||||||
random(pRNG, ca);
|
random(pRNG, ca);
|
||||||
|
|
||||||
@ -640,6 +641,7 @@ public:
|
|||||||
la = ci * ca * ta;
|
la = ci * ca * ta;
|
||||||
|
|
||||||
lie = lie + la; // e^{i la ta}
|
lie = lie + la; // e^{i la ta}
|
||||||
|
|
||||||
}
|
}
|
||||||
taExp(lie, out);
|
taExp(lie, out);
|
||||||
}
|
}
|
||||||
|
@ -67,6 +67,7 @@ void Gather_plane_simple_table (Vector<std::pair<int,int> >& table,const Lattice
|
|||||||
{
|
{
|
||||||
int num=table.size();
|
int num=table.size();
|
||||||
std::pair<int,int> *table_v = & table[0];
|
std::pair<int,int> *table_v = & table[0];
|
||||||
|
|
||||||
auto rhs_v = rhs.View(AcceleratorRead);
|
auto rhs_v = rhs.View(AcceleratorRead);
|
||||||
accelerator_forNB( i,num, vobj::Nsimd(), {
|
accelerator_forNB( i,num, vobj::Nsimd(), {
|
||||||
typedef decltype(coalescedRead(buffer[0])) compressed_t;
|
typedef decltype(coalescedRead(buffer[0])) compressed_t;
|
||||||
@ -75,6 +76,7 @@ void Gather_plane_simple_table (Vector<std::pair<int,int> >& table,const Lattice
|
|||||||
compress.Compress(&tmp_c,0,rhs_v(so+table_v[i].second));
|
compress.Compress(&tmp_c,0,rhs_v(so+table_v[i].second));
|
||||||
coalescedWrite(buffer[off+o],tmp_c);
|
coalescedWrite(buffer[off+o],tmp_c);
|
||||||
});
|
});
|
||||||
|
rhs_v.ViewClose();
|
||||||
// Further optimisation: i) software prefetch the first element of the next table entry, prefetch the table
|
// Further optimisation: i) software prefetch the first element of the next table entry, prefetch the table
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -104,6 +106,7 @@ void Gather_plane_exchange_table(Vector<std::pair<int,int> >& table,const Lattic
|
|||||||
so+tp[2*j+1].second,
|
so+tp[2*j+1].second,
|
||||||
type);
|
type);
|
||||||
});
|
});
|
||||||
|
rhs_v.ViewClose();
|
||||||
}
|
}
|
||||||
|
|
||||||
struct StencilEntry {
|
struct StencilEntry {
|
||||||
@ -181,31 +184,30 @@ class CartesianStencilAccelerator {
|
|||||||
template<class vobj,class cobj,class Parameters>
|
template<class vobj,class cobj,class Parameters>
|
||||||
class CartesianStencilView : public CartesianStencilAccelerator<vobj,cobj,Parameters>
|
class CartesianStencilView : public CartesianStencilAccelerator<vobj,cobj,Parameters>
|
||||||
{
|
{
|
||||||
#ifndef GRID_UVM
|
private:
|
||||||
std::shared_ptr<MemViewDeleter> Deleter;
|
int *closed;
|
||||||
#endif
|
StencilEntry *cpu_ptr;
|
||||||
|
ViewMode mode;
|
||||||
public:
|
public:
|
||||||
//
|
// default copy constructor
|
||||||
#ifdef GRID_UVM
|
CartesianStencilView (const CartesianStencilView &refer_to_me) = default;
|
||||||
CartesianStencilView (const CartesianStencilAccelerator<vobj,cobj,Parameters> &refer_to_me,ViewMode mode)
|
|
||||||
: CartesianStencilAccelerator<vobj,cobj,Parameters>(refer_to_me){};
|
|
||||||
#else
|
|
||||||
CartesianStencilView (const CartesianStencilView &refer_to_me)
|
|
||||||
: CartesianStencilAccelerator<vobj,cobj,Parameters>(refer_to_me), Deleter(refer_to_me.Deleter)
|
|
||||||
{ }
|
|
||||||
CartesianStencilView (const CartesianStencilAccelerator<vobj,cobj,Parameters> &refer_to_me,ViewMode mode)
|
|
||||||
: CartesianStencilAccelerator<vobj,cobj,Parameters>(refer_to_me), Deleter(new MemViewDeleter)
|
|
||||||
{
|
|
||||||
Deleter->cpu_ptr =(void *)this->_entries_p;
|
|
||||||
Deleter->mode = mode;
|
|
||||||
this->_entries_p =(StencilEntry *)
|
|
||||||
|
|
||||||
|
CartesianStencilView (const CartesianStencilAccelerator<vobj,cobj,Parameters> &refer_to_me,ViewMode _mode)
|
||||||
|
: CartesianStencilAccelerator<vobj,cobj,Parameters>(refer_to_me),
|
||||||
|
cpu_ptr(this->_entries_p),
|
||||||
|
mode(_mode)
|
||||||
|
{
|
||||||
|
this->_entries_p =(StencilEntry *)
|
||||||
MemoryManager::ViewOpen(this->_entries_p,
|
MemoryManager::ViewOpen(this->_entries_p,
|
||||||
this->_npoints*this->_osites*sizeof(StencilEntry),
|
this->_npoints*this->_osites*sizeof(StencilEntry),
|
||||||
mode,
|
mode,
|
||||||
AdviseDefault);
|
AdviseDefault);
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
void ViewClose(void)
|
||||||
|
{
|
||||||
|
MemoryManager::ViewClose(this->cpu_ptr,this->mode);
|
||||||
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -252,9 +252,9 @@ public:
|
|||||||
double start=usecond();
|
double start=usecond();
|
||||||
for(int i=0;i<Nloop;i++){
|
for(int i=0;i<Nloop;i++){
|
||||||
z=a*x-y;
|
z=a*x-y;
|
||||||
auto x_v = x.View(CpuWrite);
|
autoView( x_v , x, CpuWrite);
|
||||||
auto y_v = y.View(CpuWrite);
|
autoView( y_v , y, CpuWrite);
|
||||||
auto z_v = z.View(CpuRead);
|
autoView( z_v , z, CpuRead);
|
||||||
x_v[0]=z_v[0]; // force serial dependency to prevent optimise away
|
x_v[0]=z_v[0]; // force serial dependency to prevent optimise away
|
||||||
y_v[4]=z_v[4];
|
y_v[4]=z_v[4];
|
||||||
}
|
}
|
||||||
@ -534,11 +534,13 @@ public:
|
|||||||
{
|
{
|
||||||
LatticeGaugeField Umu5d(FGrid);
|
LatticeGaugeField Umu5d(FGrid);
|
||||||
std::vector<LatticeColourMatrix> U(4,FGrid);
|
std::vector<LatticeColourMatrix> U(4,FGrid);
|
||||||
auto Umu_v = Umu.View(CpuRead);
|
{
|
||||||
auto Umu5d_v = Umu5d.View(CpuWrite);
|
autoView( Umu_v , Umu , CpuRead);
|
||||||
for(int ss=0;ss<Umu.Grid()->oSites();ss++){
|
autoView( Umu5d_v, Umu5d, CpuWrite);
|
||||||
for(int s=0;s<Ls;s++){
|
for(int ss=0;ss<Umu.Grid()->oSites();ss++){
|
||||||
Umu5d_v[Ls*ss+s] = Umu_v[ss];
|
for(int s=0;s<Ls;s++){
|
||||||
|
Umu5d_v[Ls*ss+s] = Umu_v[ss];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ref = Zero();
|
ref = Zero();
|
||||||
|
@ -129,8 +129,8 @@ int main (int argc, char ** argv)
|
|||||||
LatticeGaugeField Umu5d(FGrid);
|
LatticeGaugeField Umu5d(FGrid);
|
||||||
std::vector<LatticeColourMatrix> U(4,FGrid);
|
std::vector<LatticeColourMatrix> U(4,FGrid);
|
||||||
{
|
{
|
||||||
auto Umu5d_v = Umu5d.View(CpuWrite);
|
autoView( Umu5d_v, Umu5d, CpuWrite);
|
||||||
auto Umu_v = Umu.View(CpuRead);
|
autoView( Umu_v , Umu , CpuRead);
|
||||||
for(int ss=0;ss<Umu.Grid()->oSites();ss++){
|
for(int ss=0;ss<Umu.Grid()->oSites();ss++){
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
Umu5d_v[Ls*ss+s] = Umu_v[ss];
|
Umu5d_v[Ls*ss+s] = Umu_v[ss];
|
||||||
@ -258,8 +258,8 @@ int main (int argc, char ** argv)
|
|||||||
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
|
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
|
||||||
tmp = U[mu]*Cshift(src,mu+1,1);
|
tmp = U[mu]*Cshift(src,mu+1,1);
|
||||||
{
|
{
|
||||||
auto ref_v = ref.View(CpuWrite);
|
autoView( ref_v, ref, CpuWrite);
|
||||||
auto tmp_v = tmp.View(CpuRead);
|
autoView( tmp_v, tmp, CpuRead);
|
||||||
for(int i=0;i<ref_v.size();i++){
|
for(int i=0;i<ref_v.size();i++){
|
||||||
ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
|
ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
|
||||||
}
|
}
|
||||||
@ -268,8 +268,8 @@ int main (int argc, char ** argv)
|
|||||||
tmp =adj(U[mu])*src;
|
tmp =adj(U[mu])*src;
|
||||||
tmp =Cshift(tmp,mu+1,-1);
|
tmp =Cshift(tmp,mu+1,-1);
|
||||||
{
|
{
|
||||||
auto ref_v = ref.View(CpuWrite);
|
autoView( ref_v, ref, CpuWrite);
|
||||||
auto tmp_v = tmp.View(CpuRead);
|
autoView( tmp_v, tmp, CpuRead);
|
||||||
for(int i=0;i<ref_v.size();i++){
|
for(int i=0;i<ref_v.size();i++){
|
||||||
ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; ;
|
ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; ;
|
||||||
}
|
}
|
||||||
|
@ -130,11 +130,13 @@ void benchDw(std::vector<int> & latt4, int Ls, int threads,int report )
|
|||||||
LatticeGaugeField Umu5d(FGrid);
|
LatticeGaugeField Umu5d(FGrid);
|
||||||
|
|
||||||
// replicate across fifth dimension
|
// replicate across fifth dimension
|
||||||
auto Umu5d_v = Umu5d.View(CpuWrite);
|
{
|
||||||
auto Umu_v = Umu.View(CpuRead);
|
autoView(Umu5d_v , Umu5d, CpuWrite);
|
||||||
for(int ss=0;ss<Umu.Grid()->oSites();ss++){
|
autoView( Umu_v , Umu, CpuRead);
|
||||||
for(int s=0;s<Ls;s++){
|
for(int ss=0;ss<Umu.Grid()->oSites();ss++){
|
||||||
Umu5d_v[Ls*ss+s] = Umu_v[ss];
|
for(int s=0;s<Ls;s++){
|
||||||
|
Umu5d_v[Ls*ss+s] = Umu_v[ss];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -79,7 +79,7 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
double start=usecond();
|
double start=usecond();
|
||||||
thread_for(t,threads,{
|
thread_for(t,threads,{
|
||||||
auto x_t = x[t].View(CpuRead);
|
autoView( x_t , x[t],CpuRead);
|
||||||
sum[t] = x_t[0];
|
sum[t] = x_t[0];
|
||||||
for(int i=0;i<Nloop;i++){
|
for(int i=0;i<Nloop;i++){
|
||||||
for(auto ss=x_t.begin();ss<x_t.end();ss++){
|
for(auto ss=x_t.begin();ss<x_t.end();ss++){
|
||||||
|
@ -177,9 +177,7 @@ int main (int argc, char ** argv)
|
|||||||
Real nn;
|
Real nn;
|
||||||
double start=usecond();
|
double start=usecond();
|
||||||
for(int i=0;i<Nloop;i++){
|
for(int i=0;i<Nloop;i++){
|
||||||
auto x_v = x.View(CpuWrite);
|
|
||||||
nn=norm2(x);
|
nn=norm2(x);
|
||||||
vsplat(x_v[0]._internal[0],nn);
|
|
||||||
}
|
}
|
||||||
double stop=usecond();
|
double stop=usecond();
|
||||||
double time = (stop-start)/Nloop*1000;
|
double time = (stop-start)/Nloop*1000;
|
||||||
|
@ -85,11 +85,11 @@ void sliceInnerProductMesonField(std::vector< std::vector<ComplexD> > &mat,
|
|||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int ss= so+n*stride+b;
|
int ss= so+n*stride+b;
|
||||||
for(int i=0;i<Lblock;i++){
|
for(int i=0;i<Lblock;i++){
|
||||||
auto lhs_v = lhs[i].View(CpuRead);
|
autoView(lhs_v, lhs[i], CpuRead);
|
||||||
auto left = conjugate(lhs_v[ss]);
|
auto left = conjugate(lhs_v[ss]);
|
||||||
for(int j=0;j<Rblock;j++){
|
for(int j=0;j<Rblock;j++){
|
||||||
int idx = i+Lblock*j+Lblock*Rblock*r;
|
int idx = i+Lblock*j+Lblock*Rblock*r;
|
||||||
auto rhs_v = rhs[j].View(CpuRead);
|
autoView(rhs_v, rhs[j], CpuRead);
|
||||||
auto right = rhs_v[ss];
|
auto right = rhs_v[ss];
|
||||||
vector_type vv = left()(0)(0) * right()(0)(0)
|
vector_type vv = left()(0)(0) * right()(0)(0)
|
||||||
+ left()(0)(1) * right()(0)(1)
|
+ left()(0)(1) * right()(0)(1)
|
||||||
@ -221,12 +221,12 @@ void sliceInnerProductMesonFieldGamma(std::vector< std::vector<ComplexD> > &mat,
|
|||||||
for(int b=0;b<e2;b++){
|
for(int b=0;b<e2;b++){
|
||||||
int ss= so+n*stride+b;
|
int ss= so+n*stride+b;
|
||||||
for(int i=0;i<Lblock;i++){
|
for(int i=0;i<Lblock;i++){
|
||||||
auto lhs_v=lhs[i].View(CpuRead);
|
autoView(lhs_v,lhs[i],CpuRead);
|
||||||
auto left = conjugate(lhs_v[ss]);
|
auto left = conjugate(lhs_v[ss]);
|
||||||
for(int j=0;j<Rblock;j++){
|
for(int j=0;j<Rblock;j++){
|
||||||
for(int mu=0;mu<Ngamma;mu++){
|
for(int mu=0;mu<Ngamma;mu++){
|
||||||
|
|
||||||
auto rhs_v = rhs[j].View(CpuRead);
|
autoView(rhs_v,rhs[j],CpuRead);
|
||||||
auto right = Gamma(gammas[mu])*rhs_v[ss];
|
auto right = Gamma(gammas[mu])*rhs_v[ss];
|
||||||
|
|
||||||
vector_type vv = left()(0)(0) * right()(0)(0)
|
vector_type vv = left()(0)(0) * right()(0)(0)
|
||||||
@ -370,12 +370,12 @@ void sliceInnerProductMesonFieldGamma1(std::vector< std::vector<ComplexD> > &mat
|
|||||||
int ss= so+n*stride+b;
|
int ss= so+n*stride+b;
|
||||||
for(int i=0;i<Lblock;i++){
|
for(int i=0;i<Lblock;i++){
|
||||||
|
|
||||||
auto lhs_v=lhs[i].View(CpuRead);
|
autoView(lhs_v,lhs[i],CpuRead);
|
||||||
auto left = conjugate(lhs_v[ss]);
|
auto left = conjugate(lhs_v[ss]);
|
||||||
for(int j=0;j<Rblock;j++){
|
for(int j=0;j<Rblock;j++){
|
||||||
|
|
||||||
SpinMatrix_v vv;
|
SpinMatrix_v vv;
|
||||||
auto rhs_v = rhs[j].View(CpuRead);
|
autoView(rhs_v,rhs[j],CpuRead);
|
||||||
auto right = rhs_v[ss];
|
auto right = rhs_v[ss];
|
||||||
for(int s1=0;s1<Ns;s1++){
|
for(int s1=0;s1<Ns;s1++){
|
||||||
for(int s2=0;s2<Ns;s2++){
|
for(int s2=0;s2<Ns;s2++){
|
||||||
@ -518,12 +518,12 @@ void sliceInnerProductMesonFieldGammaMom(std::vector< std::vector<ComplexD> > &m
|
|||||||
|
|
||||||
for(int i=0;i<Lblock;i++){
|
for(int i=0;i<Lblock;i++){
|
||||||
|
|
||||||
auto lhs_v = lhs[i].View(CpuRead);
|
autoView(lhs_v,lhs[i],CpuRead);
|
||||||
auto left = conjugate(lhs_v[ss]);
|
auto left = conjugate(lhs_v[ss]);
|
||||||
for(int j=0;j<Rblock;j++){
|
for(int j=0;j<Rblock;j++){
|
||||||
|
|
||||||
SpinMatrix_v vv;
|
SpinMatrix_v vv;
|
||||||
auto rhs_v = rhs[j].View(CpuRead);
|
autoView(rhs_v,rhs[j],CpuRead);
|
||||||
auto right = rhs_v[ss];
|
auto right = rhs_v[ss];
|
||||||
for(int s1=0;s1<Ns;s1++){
|
for(int s1=0;s1<Ns;s1++){
|
||||||
for(int s2=0;s2<Ns;s2++){
|
for(int s2=0;s2<Ns;s2++){
|
||||||
@ -537,7 +537,7 @@ void sliceInnerProductMesonFieldGammaMom(std::vector< std::vector<ComplexD> > &m
|
|||||||
// Trigger unroll
|
// Trigger unroll
|
||||||
for ( int m=0;m<Nmom;m++){
|
for ( int m=0;m<Nmom;m++){
|
||||||
int idx = m+base;
|
int idx = m+base;
|
||||||
auto mom_v = mom[m].View(CpuRead);
|
autoView(mom_v,mom[m],CpuRead);
|
||||||
auto phase = mom_v[ss];
|
auto phase = mom_v[ss];
|
||||||
mac(&lvSum[idx],&vv,&phase);
|
mac(&lvSum[idx],&vv,&phase);
|
||||||
}
|
}
|
||||||
|
@ -66,9 +66,9 @@ int main (int argc, char ** argv)
|
|||||||
LatticeColourMatrix x(&Grid);// random(pRNG,x);
|
LatticeColourMatrix x(&Grid);// random(pRNG,x);
|
||||||
LatticeColourMatrix y(&Grid);// random(pRNG,y);
|
LatticeColourMatrix y(&Grid);// random(pRNG,y);
|
||||||
|
|
||||||
auto x_v = x.View(AcceleratorRead);
|
autoView( x_v , x, AcceleratorRead);
|
||||||
auto y_v = y.View(AcceleratorRead);
|
autoView( y_v , y, AcceleratorRead);
|
||||||
auto z_v = z.View(AcceleratorWrite);
|
autoView( z_v , z, AcceleratorWrite);
|
||||||
const uint64_t Nsite = x_v.size();
|
const uint64_t Nsite = x_v.size();
|
||||||
const uint64_t nsimd = vComplex::Nsimd();
|
const uint64_t nsimd = vComplex::Nsimd();
|
||||||
for(int64_t i=0;i<Nwarm;i++){
|
for(int64_t i=0;i<Nwarm;i++){
|
||||||
@ -116,9 +116,9 @@ int main (int argc, char ** argv)
|
|||||||
LatticeColourMatrix x(&Grid);// random(pRNG,x);
|
LatticeColourMatrix x(&Grid);// random(pRNG,x);
|
||||||
LatticeColourMatrix y(&Grid);// random(pRNG,y);
|
LatticeColourMatrix y(&Grid);// random(pRNG,y);
|
||||||
|
|
||||||
auto x_v = x.View(AcceleratorWrite);
|
autoView( x_v , x, AcceleratorWrite);
|
||||||
auto y_v = y.View(AcceleratorRead);
|
autoView( y_v , y, AcceleratorRead);
|
||||||
auto z_v = z.View(AcceleratorRead);
|
autoView( z_v , z, AcceleratorRead);
|
||||||
const uint64_t Nsite = x_v.size();
|
const uint64_t Nsite = x_v.size();
|
||||||
const uint64_t nsimd = vComplex::Nsimd();
|
const uint64_t nsimd = vComplex::Nsimd();
|
||||||
for(int64_t i=0;i<Nwarm;i++){
|
for(int64_t i=0;i<Nwarm;i++){
|
||||||
@ -167,9 +167,9 @@ int main (int argc, char ** argv)
|
|||||||
LatticeColourMatrix x(&Grid);// random(pRNG,x);
|
LatticeColourMatrix x(&Grid);// random(pRNG,x);
|
||||||
LatticeColourMatrix y(&Grid);// random(pRNG,y);
|
LatticeColourMatrix y(&Grid);// random(pRNG,y);
|
||||||
|
|
||||||
auto x_v = x.View(AcceleratorRead);
|
autoView( x_v , x, AcceleratorRead);
|
||||||
auto y_v = y.View(AcceleratorRead);
|
autoView( y_v , y, AcceleratorRead);
|
||||||
auto z_v = z.View(AcceleratorWrite);
|
autoView( z_v , z, AcceleratorWrite);
|
||||||
const uint64_t Nsite = x_v.size();
|
const uint64_t Nsite = x_v.size();
|
||||||
const uint64_t nsimd = vComplex::Nsimd();
|
const uint64_t nsimd = vComplex::Nsimd();
|
||||||
for(int64_t i=0;i<Nwarm;i++){
|
for(int64_t i=0;i<Nwarm;i++){
|
||||||
@ -220,10 +220,10 @@ int main (int argc, char ** argv)
|
|||||||
LatticeColourMatrix y(&Grid);// random(pRNG,y);
|
LatticeColourMatrix y(&Grid);// random(pRNG,y);
|
||||||
LatticeColourMatrix w(&Grid);// random(pRNG,y);
|
LatticeColourMatrix w(&Grid);// random(pRNG,y);
|
||||||
|
|
||||||
auto x_v = x.View(AcceleratorRead);
|
autoView( x_v , x, AcceleratorRead);
|
||||||
auto y_v = y.View(AcceleratorRead);
|
autoView( y_v , y, AcceleratorRead);
|
||||||
auto z_v = z.View(AcceleratorRead);
|
autoView( z_v , z, AcceleratorRead);
|
||||||
auto w_v = w.View(AcceleratorWrite);
|
autoView( w_v , w, AcceleratorWrite);
|
||||||
const uint64_t Nsite = x_v.size();
|
const uint64_t Nsite = x_v.size();
|
||||||
const uint64_t nsimd = vComplex::Nsimd();
|
const uint64_t nsimd = vComplex::Nsimd();
|
||||||
for(int64_t i=0;i<Nwarm;i++){
|
for(int64_t i=0;i<Nwarm;i++){
|
||||||
|
@ -125,8 +125,8 @@ int main (int argc, char ** argv)
|
|||||||
// ref = src + Gamma(Gamma::Algebra::GammaX)* src ; // 1-gamma_x
|
// ref = src + Gamma(Gamma::Algebra::GammaX)* src ; // 1-gamma_x
|
||||||
tmp = U[mu]*Cshift(src,mu,1);
|
tmp = U[mu]*Cshift(src,mu,1);
|
||||||
{
|
{
|
||||||
auto ref_v = ref.View(CpuWrite);
|
autoView( ref_v, ref, CpuWrite);
|
||||||
auto tmp_v = tmp.View(CpuWrite);
|
autoView( tmp_v, tmp, CpuWrite);
|
||||||
for(int i=0;i<ref_v.size();i++){
|
for(int i=0;i<ref_v.size();i++){
|
||||||
ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; ;
|
ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; ;
|
||||||
}
|
}
|
||||||
@ -135,8 +135,8 @@ int main (int argc, char ** argv)
|
|||||||
tmp =adj(U[mu])*src;
|
tmp =adj(U[mu])*src;
|
||||||
tmp =Cshift(tmp,mu,-1);
|
tmp =Cshift(tmp,mu,-1);
|
||||||
{
|
{
|
||||||
auto ref_v = ref.View(CpuWrite);
|
autoView( ref_v, ref, CpuWrite);
|
||||||
auto tmp_v = tmp.View(CpuWrite);
|
autoView( tmp_v, tmp, CpuWrite);
|
||||||
for(int i=0;i<ref_v.size();i++){
|
for(int i=0;i<ref_v.size();i++){
|
||||||
ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
|
ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
|
||||||
}
|
}
|
||||||
@ -187,8 +187,8 @@ int main (int argc, char ** argv)
|
|||||||
for(int ss=0;ss<0;ss++ ){
|
for(int ss=0;ss<0;ss++ ){
|
||||||
for(int i=0;i<Ns;i++){
|
for(int i=0;i<Ns;i++){
|
||||||
for(int j=0;j<Nc;j++){
|
for(int j=0;j<Nc;j++){
|
||||||
auto ref_v = ref.View(CpuWrite);
|
autoView( ref_v, ref, CpuWrite);
|
||||||
auto result_v = result.View(CpuWrite);
|
autoView( result_v, result, CpuWrite);
|
||||||
ComplexF * ref_p = (ComplexF *)&ref_v[ss]()(i)(j);
|
ComplexF * ref_p = (ComplexF *)&ref_v[ss]()(i)(j);
|
||||||
ComplexF * res_p = (ComplexF *)&result_v[ss]()(i)(j);
|
ComplexF * res_p = (ComplexF *)&result_v[ss]()(i)(j);
|
||||||
std::cout<<GridLogMessage << ss<< " "<<i<<" "<<j<<" "<< (*ref_p)<<" " <<(*res_p)<<std::endl;
|
std::cout<<GridLogMessage << ss<< " "<<i<<" "<<j<<" "<< (*ref_p)<<" " <<(*res_p)<<std::endl;
|
||||||
@ -204,8 +204,8 @@ int main (int argc, char ** argv)
|
|||||||
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
|
// ref = src - Gamma(Gamma::Algebra::GammaX)* src ; // 1+gamma_x
|
||||||
tmp = U[mu]*Cshift(src,mu,1);
|
tmp = U[mu]*Cshift(src,mu,1);
|
||||||
{
|
{
|
||||||
auto ref_v = ref.View(CpuWrite);
|
autoView( ref_v, ref, CpuWrite);
|
||||||
auto tmp_v = tmp.View(CpuWrite);
|
autoView( tmp_v, tmp, CpuWrite);
|
||||||
for(int i=0;i<ref_v.size();i++){
|
for(int i=0;i<ref_v.size();i++){
|
||||||
ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
|
ref_v[i]+= tmp_v[i] + Gamma(Gmu[mu])*tmp_v[i]; ;
|
||||||
}
|
}
|
||||||
@ -214,8 +214,8 @@ int main (int argc, char ** argv)
|
|||||||
tmp =adj(U[mu])*src;
|
tmp =adj(U[mu])*src;
|
||||||
tmp =Cshift(tmp,mu,-1);
|
tmp =Cshift(tmp,mu,-1);
|
||||||
{
|
{
|
||||||
auto ref_v = ref.View(CpuWrite);
|
autoView( ref_v, ref, CpuWrite);
|
||||||
auto tmp_v = tmp.View(CpuWrite);
|
autoView( tmp_v, tmp, CpuWrite);
|
||||||
for(int i=0;i<ref_v.size();i++){
|
for(int i=0;i<ref_v.size();i++){
|
||||||
ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; ;
|
ref_v[i]+= tmp_v[i] - Gamma(Gmu[mu])*tmp_v[i]; ;
|
||||||
}
|
}
|
||||||
|
@ -29,7 +29,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
using namespace Grid;
|
using namespace Grid;
|
||||||
;
|
|
||||||
|
|
||||||
template<class d>
|
template<class d>
|
||||||
struct scal {
|
struct scal {
|
||||||
@ -51,6 +50,7 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
std::cout << GridLogMessage << "::::: NB: to enable a quick bit reproducibility check use the --checksums flag. " << std::endl;
|
std::cout << GridLogMessage << "::::: NB: to enable a quick bit reproducibility check use the --checksums flag. " << std::endl;
|
||||||
|
|
||||||
|
{
|
||||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
|
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexD::Nsimd()),GridDefaultMpi());
|
||||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
||||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
||||||
@ -100,6 +100,8 @@ int main (int argc, char ** argv)
|
|||||||
ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
|
ConjugateGradient<LatticeFermionD> CG(1.0e-8,10000);
|
||||||
CG(HermOpEO,src_o,result_o_2);
|
CG(HermOpEO,src_o,result_o_2);
|
||||||
|
|
||||||
|
MemoryManager::Print();
|
||||||
|
|
||||||
LatticeFermionD diff_o(FrbGrid);
|
LatticeFermionD diff_o(FrbGrid);
|
||||||
RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2);
|
RealD diff = axpy_norm(diff_o, -1.0, result_o, result_o_2);
|
||||||
|
|
||||||
@ -130,7 +132,9 @@ int main (int argc, char ** argv)
|
|||||||
std::cout << GridLogMessage << " CG checksums "<<std::hex << scidac_csuma << " "<<scidac_csumb<<std::endl;
|
std::cout << GridLogMessage << " CG checksums "<<std::hex << scidac_csuma << " "<<scidac_csumb<<std::endl;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
MemoryManager::Print();
|
||||||
|
|
||||||
Grid_finalize();
|
Grid_finalize();
|
||||||
}
|
}
|
||||||
|
@ -107,9 +107,8 @@ int main(int argc, char ** argv)
|
|||||||
// Implement a stencil code that should agree with cshift!
|
// Implement a stencil code that should agree with cshift!
|
||||||
for(int i=0;i<Check.Grid()->oSites();i++){
|
for(int i=0;i<Check.Grid()->oSites();i++){
|
||||||
auto SE = gStencil.GetEntry(0,i);
|
auto SE = gStencil.GetEntry(0,i);
|
||||||
auto check = Check.View(CpuWrite);
|
autoView(check, Check, CpuWrite);
|
||||||
auto foo = Foo.View(CpuRead);
|
autoView( foo, Foo, CpuRead);
|
||||||
|
|
||||||
// Encapsulate in a general wrapper
|
// Encapsulate in a general wrapper
|
||||||
check[i] = foo[SE->_offset]; auto tmp=check[i];
|
check[i] = foo[SE->_offset]; auto tmp=check[i];
|
||||||
if (SE->_permute & 0x1 ) { permute(check[i],tmp,0); tmp=check[i];}
|
if (SE->_permute & 0x1 ) { permute(check[i],tmp,0); tmp=check[i];}
|
||||||
@ -147,8 +146,8 @@ int main(int argc, char ** argv)
|
|||||||
}}}}
|
}}}}
|
||||||
|
|
||||||
if (nrm > 1.0e-4) {
|
if (nrm > 1.0e-4) {
|
||||||
auto check = Check.View(CpuRead);
|
autoView( check , Check, CpuRead);
|
||||||
auto bar = Bar.View(CpuRead);
|
autoView( bar , Bar, CpuRead);
|
||||||
for(int i=0;i<check.size();i++){
|
for(int i=0;i<check.size();i++){
|
||||||
std::cout << i<<" Check "<<check[i]<< "\n"<<i<<" Bar "<<bar[i]<<std::endl;
|
std::cout << i<<" Check "<<check[i]<< "\n"<<i<<" Bar "<<bar[i]<<std::endl;
|
||||||
}
|
}
|
||||||
|
@ -109,8 +109,8 @@ int main(int argc, char ** argv) {
|
|||||||
StencilEntry *SE;
|
StencilEntry *SE;
|
||||||
SE = myStencil.GetEntry(permute_type,0,i);
|
SE = myStencil.GetEntry(permute_type,0,i);
|
||||||
|
|
||||||
auto check = Check.View(CpuWrite);
|
autoView( check , Check, CpuWrite);
|
||||||
auto foo = Foo.View(CpuRead);
|
autoView( foo , Foo, CpuRead);
|
||||||
if ( SE->_is_local && SE->_permute )
|
if ( SE->_is_local && SE->_permute )
|
||||||
permute(check[i],foo[SE->_offset],permute_type);
|
permute(check[i],foo[SE->_offset],permute_type);
|
||||||
else if (SE->_is_local)
|
else if (SE->_is_local)
|
||||||
@ -151,8 +151,8 @@ int main(int argc, char ** argv) {
|
|||||||
}}}}
|
}}}}
|
||||||
|
|
||||||
if (nrm > 1.0e-4) {
|
if (nrm > 1.0e-4) {
|
||||||
auto check = Check.View(CpuRead);
|
autoView( check , Check, CpuRead);
|
||||||
auto bar = Bar.View(CpuRead);
|
autoView( bar , Bar, CpuRead);
|
||||||
for(int i=0;i<check.size();i++){
|
for(int i=0;i<check.size();i++){
|
||||||
std::cout << i<<" Check "<<check[i]<< "\n"<<i<<" Bar "<<bar[i]<<std::endl;
|
std::cout << i<<" Check "<<check[i]<< "\n"<<i<<" Bar "<<bar[i]<<std::endl;
|
||||||
}
|
}
|
||||||
@ -210,8 +210,8 @@ int main(int argc, char ** argv) {
|
|||||||
SE = EStencil.GetEntry(permute_type,0,i);
|
SE = EStencil.GetEntry(permute_type,0,i);
|
||||||
// std::cout << "Even source "<< i<<" -> " <<SE->_offset << " "<< SE->_is_local<<std::endl;
|
// std::cout << "Even source "<< i<<" -> " <<SE->_offset << " "<< SE->_is_local<<std::endl;
|
||||||
|
|
||||||
auto ocheck = OCheck.View(CpuWrite);
|
autoView( ocheck , OCheck, CpuWrite);
|
||||||
auto efoo = EFoo.View(CpuRead);
|
autoView( efoo , EFoo, CpuRead);
|
||||||
if ( SE->_is_local && SE->_permute )
|
if ( SE->_is_local && SE->_permute )
|
||||||
permute(ocheck[i],efoo[SE->_offset],permute_type);
|
permute(ocheck[i],efoo[SE->_offset],permute_type);
|
||||||
else if (SE->_is_local)
|
else if (SE->_is_local)
|
||||||
@ -226,8 +226,8 @@ int main(int argc, char ** argv) {
|
|||||||
SE = OStencil.GetEntry(permute_type,0,i);
|
SE = OStencil.GetEntry(permute_type,0,i);
|
||||||
// std::cout << "ODD source "<< i<<" -> " <<SE->_offset << " "<< SE->_is_local<<std::endl;
|
// std::cout << "ODD source "<< i<<" -> " <<SE->_offset << " "<< SE->_is_local<<std::endl;
|
||||||
|
|
||||||
auto echeck = ECheck.View(CpuWrite);
|
autoView( echeck , ECheck, CpuWrite);
|
||||||
auto ofoo = OFoo.View(CpuRead);
|
autoView( ofoo , OFoo, CpuRead);
|
||||||
if ( SE->_is_local && SE->_permute )
|
if ( SE->_is_local && SE->_permute )
|
||||||
permute(echeck[i],ofoo[SE->_offset],permute_type);
|
permute(echeck[i],ofoo[SE->_offset],permute_type);
|
||||||
else if (SE->_is_local)
|
else if (SE->_is_local)
|
||||||
|
@ -89,8 +89,8 @@ int main (int argc, char ** argv)
|
|||||||
////////////////////////////////////
|
////////////////////////////////////
|
||||||
LatticeGaugeField Umu5d(FGrid);
|
LatticeGaugeField Umu5d(FGrid);
|
||||||
{
|
{
|
||||||
auto umu5d = Umu5d.View(CpuWrite);
|
autoView(umu5d, Umu5d, CpuWrite);
|
||||||
auto umu = Umu.View(CpuRead);
|
autoView( umu, Umu , CpuRead);
|
||||||
for(int ss=0;ss<Umu.Grid()->oSites();ss++){
|
for(int ss=0;ss<Umu.Grid()->oSites();ss++){
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
umu5d[Ls*ss+s] = umu[ss];
|
umu5d[Ls*ss+s] = umu[ss];
|
||||||
|
@ -570,8 +570,8 @@ void TestConserved1(Action & Ddwf, Action & Ddwfrev,
|
|||||||
LatticeGaugeField Umu5d(FGrid);
|
LatticeGaugeField Umu5d(FGrid);
|
||||||
std::vector<LatticeColourMatrix> U(4,FGrid);
|
std::vector<LatticeColourMatrix> U(4,FGrid);
|
||||||
{
|
{
|
||||||
auto Umu5d_v = Umu5d.View(CpuWrite);
|
autoView( Umu5d_v , Umu5d, CpuWrite);
|
||||||
auto Umu_v = Umu.View(CpuRead);
|
autoView( Umu_v , Umu , CpuRead);
|
||||||
for(int ss=0;ss<Umu.Grid()->oSites();ss++){
|
for(int ss=0;ss<Umu.Grid()->oSites();ss++){
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
Umu5d_v[Ls*ss+s] = Umu_v[ss];
|
Umu5d_v[Ls*ss+s] = Umu_v[ss];
|
||||||
@ -597,8 +597,8 @@ void TestConserved1(Action & Ddwf, Action & Ddwfrev,
|
|||||||
{
|
{
|
||||||
RealD diag = 5.0 - Ddwf.M5;
|
RealD diag = 5.0 - Ddwf.M5;
|
||||||
mass = Ddwf.mass;
|
mass = Ddwf.mass;
|
||||||
auto psi=result5.View(CpuRead);
|
autoView( psi,result5,CpuRead);
|
||||||
auto chi=tmp.View(CpuWrite);
|
autoView( chi,tmp, CpuWrite);
|
||||||
thread_for(sss,UGrid->oSites(),{
|
thread_for(sss,UGrid->oSites(),{
|
||||||
uint64_t ss= sss*Ls;
|
uint64_t ss= sss*Ls;
|
||||||
typedef vSpinColourVector spinor;
|
typedef vSpinColourVector spinor;
|
||||||
|
@ -98,9 +98,9 @@ int main (int argc, char ** argv)
|
|||||||
PokeIndex<LorentzIndex>(mom,mommu,mu);
|
PokeIndex<LorentzIndex>(mom,mommu,mu);
|
||||||
|
|
||||||
// fourth order exponential approx
|
// fourth order exponential approx
|
||||||
auto mom_v = mom.View(CpuRead);
|
autoView( mom_v, mom, CpuRead);
|
||||||
auto U_v = U.View(CpuRead);
|
autoView( U_v , U, CpuRead);
|
||||||
auto Uprime_v = Uprime.View(CpuWrite);
|
autoView(Uprime_v, Uprime, CpuWrite);
|
||||||
thread_foreach( i,mom_v,{
|
thread_foreach( i,mom_v,{
|
||||||
Uprime_v[i](mu) =
|
Uprime_v[i](mu) =
|
||||||
U_v[i](mu)
|
U_v[i](mu)
|
||||||
|
@ -100,9 +100,9 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
// fourth order exponential approx
|
// fourth order exponential approx
|
||||||
|
|
||||||
auto mom_v = mom.View(CpuRead);
|
autoView( mom_v, mom, CpuRead);
|
||||||
auto U_v = U.View(CpuRead);
|
autoView( U_v , U, CpuRead);
|
||||||
auto Uprime_v = Uprime.View(CpuWrite);
|
autoView(Uprime_v, Uprime, CpuWrite);
|
||||||
|
|
||||||
thread_foreach( i,mom_v,{
|
thread_foreach( i,mom_v,{
|
||||||
Uprime_v[i](mu) = U_v[i](mu)
|
Uprime_v[i](mu) = U_v[i](mu)
|
||||||
|
@ -110,9 +110,9 @@ int main (int argc, char** argv)
|
|||||||
PokeIndex<LorentzIndex>(mom, mommu, mu);
|
PokeIndex<LorentzIndex>(mom, mommu, mu);
|
||||||
|
|
||||||
// fourth order exponential approx
|
// fourth order exponential approx
|
||||||
auto mom_v = mom.View(CpuRead);
|
autoView( mom_v, mom, CpuRead);
|
||||||
auto U_v = U.View(CpuRead);
|
autoView( U_v , U, CpuRead);
|
||||||
auto Uprime_v = Uprime.View(CpuWrite);
|
autoView(Uprime_v, Uprime, CpuWrite);
|
||||||
|
|
||||||
thread_foreach(i,mom_v,{
|
thread_foreach(i,mom_v,{
|
||||||
Uprime_v[i](mu) = U_v[i](mu)
|
Uprime_v[i](mu) = U_v[i](mu)
|
||||||
|
@ -119,9 +119,9 @@ int main (int argc, char ** argv)
|
|||||||
PokeIndex<LorentzIndex>(mom,mommu,mu);
|
PokeIndex<LorentzIndex>(mom,mommu,mu);
|
||||||
|
|
||||||
// fourth order exponential approx
|
// fourth order exponential approx
|
||||||
auto mom_v = mom.View(CpuRead);
|
autoView( mom_v, mom, CpuRead);
|
||||||
auto U_v = U.View(CpuRead);
|
autoView( U_v , U, CpuRead);
|
||||||
auto Uprime_v = Uprime.View(CpuWrite);
|
autoView(Uprime_v, Uprime, CpuWrite);
|
||||||
|
|
||||||
thread_foreach(i,mom_v,{
|
thread_foreach(i,mom_v,{
|
||||||
Uprime_v[i](mu) = U_v[i](mu)
|
Uprime_v[i](mu) = U_v[i](mu)
|
||||||
|
@ -114,9 +114,9 @@ int main (int argc, char** argv)
|
|||||||
PokeIndex<LorentzIndex>(mom, mommu, mu);
|
PokeIndex<LorentzIndex>(mom, mommu, mu);
|
||||||
|
|
||||||
// fourth order exponential approx
|
// fourth order exponential approx
|
||||||
auto mom_v = mom.View(CpuRead);
|
autoView( mom_v, mom, CpuRead);
|
||||||
auto U_v = U.View(CpuRead);
|
autoView( U_v , U, CpuRead);
|
||||||
auto Uprime_v = Uprime.View(CpuWrite);
|
autoView(Uprime_v, Uprime, CpuWrite);
|
||||||
|
|
||||||
thread_foreach(i,mom_v,{
|
thread_foreach(i,mom_v,{
|
||||||
Uprime_v[i](mu) = U_v[i](mu)
|
Uprime_v[i](mu) = U_v[i](mu)
|
||||||
|
@ -85,9 +85,9 @@ int main (int argc, char ** argv)
|
|||||||
PokeIndex<LorentzIndex>(mom,mommu,mu);
|
PokeIndex<LorentzIndex>(mom,mommu,mu);
|
||||||
|
|
||||||
// fourth order exponential approx
|
// fourth order exponential approx
|
||||||
auto Uprime_v = Uprime.View(CpuWrite);
|
autoView(Uprime_v, Uprime, CpuWrite);
|
||||||
auto U_v = U.View(CpuRead);
|
autoView( U_v , U, CpuRead);
|
||||||
auto mom_v = mom.View(CpuRead);
|
autoView( mom_v, mom, CpuRead);
|
||||||
thread_foreach(i,mom_v,{ // exp(pmu dt) * Umu
|
thread_foreach(i,mom_v,{ // exp(pmu dt) * Umu
|
||||||
Uprime_v[i](mu) = U_v[i](mu) + mom_v[i](mu)*U_v[i](mu)*dt ;
|
Uprime_v[i](mu) = U_v[i](mu) + mom_v[i](mu)*U_v[i](mu)*dt ;
|
||||||
});
|
});
|
||||||
|
@ -87,9 +87,9 @@ int main (int argc, char ** argv)
|
|||||||
PokeIndex<LorentzIndex>(mom,mommu,mu);
|
PokeIndex<LorentzIndex>(mom,mommu,mu);
|
||||||
|
|
||||||
// fourth order exponential approx
|
// fourth order exponential approx
|
||||||
auto mom_v = mom.View(CpuRead);
|
autoView( mom_v, mom, CpuRead);
|
||||||
auto Uprime_v= Uprime.View(CpuWrite);
|
autoView(Uprime_v, Uprime, CpuWrite);
|
||||||
auto U_v = U.View(CpuRead);
|
autoView( U_v , U, CpuRead);
|
||||||
thread_foreach(i,mom_v,{ // exp(pmu dt) * Umu
|
thread_foreach(i,mom_v,{ // exp(pmu dt) * Umu
|
||||||
Uprime_v[i](mu) = U_v[i](mu) + mom_v[i](mu)*U_v[i](mu)*dt ;
|
Uprime_v[i](mu) = U_v[i](mu) + mom_v[i](mu)*U_v[i](mu)*dt ;
|
||||||
});
|
});
|
||||||
|
@ -105,9 +105,9 @@ int main (int argc, char ** argv)
|
|||||||
PokeIndex<LorentzIndex>(mom,mommu,mu);
|
PokeIndex<LorentzIndex>(mom,mommu,mu);
|
||||||
|
|
||||||
// fourth order exponential approx
|
// fourth order exponential approx
|
||||||
auto U_v = U.View(CpuRead);
|
autoView( U_v , U, CpuRead);
|
||||||
auto mom_v = mom.View(CpuRead);
|
autoView( mom_v, mom, CpuRead);
|
||||||
auto Uprime_v = Uprime.View(CpuWrite);
|
autoView(Uprime_v, Uprime, CpuWrite);
|
||||||
thread_foreach(i,mom_v,{
|
thread_foreach(i,mom_v,{
|
||||||
Uprime_v[i](mu) = U_v[i](mu)
|
Uprime_v[i](mu) = U_v[i](mu)
|
||||||
+ mom_v[i](mu)*U_v[i](mu)*dt
|
+ mom_v[i](mu)*U_v[i](mu)*dt
|
||||||
|
@ -99,9 +99,9 @@ int main (int argc, char ** argv)
|
|||||||
PokeIndex<LorentzIndex>(mom,mommu,mu);
|
PokeIndex<LorentzIndex>(mom,mommu,mu);
|
||||||
|
|
||||||
// fourth order exponential approx
|
// fourth order exponential approx
|
||||||
auto mom_v = mom.View(CpuRead);
|
autoView( mom_v, mom, CpuRead);
|
||||||
auto U_v = U.View(CpuRead);
|
autoView( U_v , U, CpuRead);
|
||||||
auto Uprime_v = Uprime.View(CpuWrite);
|
autoView(Uprime_v, Uprime, CpuWrite);
|
||||||
|
|
||||||
thread_foreach(i,mom_v,{
|
thread_foreach(i,mom_v,{
|
||||||
Uprime_v[i](mu) = U_v[i](mu)
|
Uprime_v[i](mu) = U_v[i](mu)
|
||||||
|
@ -101,9 +101,9 @@ int main (int argc, char ** argv)
|
|||||||
PokeIndex<LorentzIndex>(mom,mommu,mu);
|
PokeIndex<LorentzIndex>(mom,mommu,mu);
|
||||||
|
|
||||||
// fourth order exponential approx
|
// fourth order exponential approx
|
||||||
auto U_v = U.View(CpuRead);
|
autoView( U_v , U, CpuRead);
|
||||||
auto mom_v = mom.View(CpuRead);
|
autoView( mom_v, mom, CpuRead);
|
||||||
auto Uprime_v = Uprime.View(CpuWrite);
|
autoView(Uprime_v, Uprime, CpuWrite);
|
||||||
thread_foreach(i,mom_v,{
|
thread_foreach(i,mom_v,{
|
||||||
Uprime_v[i](mu) = U_v[i](mu)
|
Uprime_v[i](mu) = U_v[i](mu)
|
||||||
+ mom_v[i](mu)*U_v[i](mu)*dt
|
+ mom_v[i](mu)*U_v[i](mu)*dt
|
||||||
|
@ -112,9 +112,9 @@ int main (int argc, char** argv)
|
|||||||
PokeIndex<LorentzIndex>(mom, mommu, mu);
|
PokeIndex<LorentzIndex>(mom, mommu, mu);
|
||||||
|
|
||||||
// fourth order exponential approx
|
// fourth order exponential approx
|
||||||
auto mom_v = mom.View(CpuRead);
|
autoView( mom_v, mom, CpuRead);
|
||||||
auto U_v = U.View(CpuRead);
|
autoView( U_v , U, CpuRead);
|
||||||
auto Uprime_v = Uprime.View(CpuWrite);
|
autoView(Uprime_v, Uprime, CpuWrite);
|
||||||
|
|
||||||
thread_foreach(i,mom_v,{
|
thread_foreach(i,mom_v,{
|
||||||
Uprime_v[i](mu) = U_v[i](mu)
|
Uprime_v[i](mu) = U_v[i](mu)
|
||||||
|
@ -115,9 +115,9 @@ int main (int argc, char** argv)
|
|||||||
SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
|
SU3::GaussianFundamentalLieAlgebraMatrix(RNG4, mommu); // Traceless antihermitian momentum; gaussian in lie alg
|
||||||
|
|
||||||
PokeIndex<LorentzIndex>(mom, mommu, mu);
|
PokeIndex<LorentzIndex>(mom, mommu, mu);
|
||||||
auto U_v = U.View(CpuRead);
|
autoView( U_v , U, CpuRead);
|
||||||
auto mom_v = mom.View(CpuRead);
|
autoView( mom_v, mom, CpuRead);
|
||||||
auto Uprime_v = Uprime.View(CpuWrite);
|
autoView(Uprime_v, Uprime, CpuWrite);
|
||||||
// fourth order exponential approx
|
// fourth order exponential approx
|
||||||
thread_foreach( i, mom_v,{
|
thread_foreach( i, mom_v,{
|
||||||
Uprime_v[i](mu) = U_v[i](mu) + mom_v[i](mu)*U_v[i](mu)*dt + mom_v[i](mu) *mom_v[i](mu) *U_v[i](mu)*(dt*dt/2.0)
|
Uprime_v[i](mu) = U_v[i](mu) + mom_v[i](mu)*U_v[i](mu)*dt + mom_v[i](mu) *mom_v[i](mu) *U_v[i](mu)*(dt*dt/2.0)
|
||||||
|
@ -101,9 +101,9 @@ int main (int argc, char ** argv)
|
|||||||
PokeIndex<LorentzIndex>(mom,mommu,mu);
|
PokeIndex<LorentzIndex>(mom,mommu,mu);
|
||||||
|
|
||||||
// fourth order exponential approx
|
// fourth order exponential approx
|
||||||
auto U_v = U.View(CpuRead);
|
autoView( U_v , U, CpuRead);
|
||||||
auto mom_v = mom.View(CpuRead);
|
autoView( mom_v, mom, CpuRead);
|
||||||
auto Uprime_v = Uprime.View(CpuWrite);
|
autoView(Uprime_v, Uprime, CpuWrite);
|
||||||
thread_foreach(i,mom_v,{
|
thread_foreach(i,mom_v,{
|
||||||
Uprime_v[i](mu) = U_v[i](mu)
|
Uprime_v[i](mu) = U_v[i](mu)
|
||||||
+ mom_v[i](mu)*U_v[i](mu)*dt
|
+ mom_v[i](mu)*U_v[i](mu)*dt
|
||||||
|
@ -87,9 +87,9 @@ int main (int argc, char ** argv)
|
|||||||
PokeIndex<LorentzIndex>(mom,mommu,mu);
|
PokeIndex<LorentzIndex>(mom,mommu,mu);
|
||||||
|
|
||||||
// fourth order exponential approx
|
// fourth order exponential approx
|
||||||
auto Uprime_v = Uprime.View(CpuWrite);
|
autoView(Uprime_v, Uprime, CpuWrite);
|
||||||
auto U_v = U.View(CpuRead);
|
autoView( U_v , U, CpuRead);
|
||||||
auto mom_v = mom.View(CpuRead);
|
autoView( mom_v, mom, CpuRead);
|
||||||
thread_foreach(i,mom_v,{ // exp(pmu dt) * Umu
|
thread_foreach(i,mom_v,{ // exp(pmu dt) * Umu
|
||||||
Uprime_v[i](mu) = U_v[i](mu) + mom_v[i](mu)*U_v[i](mu)*dt ;
|
Uprime_v[i](mu) = U_v[i](mu) + mom_v[i](mu)*U_v[i](mu)*dt ;
|
||||||
});
|
});
|
||||||
|
@ -105,9 +105,9 @@ int main (int argc, char ** argv)
|
|||||||
PokeIndex<LorentzIndex>(mom,mommu,mu);
|
PokeIndex<LorentzIndex>(mom,mommu,mu);
|
||||||
|
|
||||||
// fourth order exponential approx
|
// fourth order exponential approx
|
||||||
auto U_v = U.View(CpuRead);
|
autoView( U_v , U, CpuRead);
|
||||||
auto mom_v = mom.View(CpuRead);
|
autoView( mom_v, mom, CpuRead);
|
||||||
auto Uprime_v = Uprime.View(CpuWrite);
|
autoView(Uprime_v, Uprime, CpuWrite);
|
||||||
thread_foreach( i,mom_v,{
|
thread_foreach( i,mom_v,{
|
||||||
Uprime_v[i](mu) = U_v[i](mu);
|
Uprime_v[i](mu) = U_v[i](mu);
|
||||||
Uprime_v[i](mu) += mom_v[i](mu)*U_v[i](mu)*dt ;
|
Uprime_v[i](mu) += mom_v[i](mu)*U_v[i](mu)*dt ;
|
||||||
|
@ -105,9 +105,9 @@ int main(int argc, char **argv)
|
|||||||
Hmom -= real(sum(trace(mommu * mommu)));
|
Hmom -= real(sum(trace(mommu * mommu)));
|
||||||
PokeIndex<LorentzIndex>(mom, mommu, mu);
|
PokeIndex<LorentzIndex>(mom, mommu, mu);
|
||||||
|
|
||||||
auto Uprime_v = Uprime.View(CpuWrite);
|
autoView(Uprime_v, Uprime, CpuWrite);
|
||||||
auto U_v = U.View(CpuRead);
|
autoView( U_v , U, CpuRead);
|
||||||
auto mom_v = mom.View(CpuRead);
|
autoView( mom_v, mom, CpuRead);
|
||||||
thread_foreach(ss,mom_v,
|
thread_foreach(ss,mom_v,
|
||||||
{
|
{
|
||||||
Uprime_v[ss]._internal[mu] = ProjectOnGroup(Exponentiate(mom_v[ss]._internal[mu], dt, 12) * U_v[ss]._internal[mu]);
|
Uprime_v[ss]._internal[mu] = ProjectOnGroup(Exponentiate(mom_v[ss]._internal[mu], dt, 12) * U_v[ss]._internal[mu]);
|
||||||
|
@ -114,9 +114,9 @@ int main (int argc, char ** argv)
|
|||||||
PokeIndex<LorentzIndex>(mom,mommu,mu);
|
PokeIndex<LorentzIndex>(mom,mommu,mu);
|
||||||
|
|
||||||
// fourth order exponential approx
|
// fourth order exponential approx
|
||||||
auto mom_v = mom.View(CpuRead);
|
autoView( mom_v, mom, CpuRead);
|
||||||
auto U_v = U.View(CpuRead);
|
autoView( U_v , U, CpuRead);
|
||||||
auto Uprime_v = Uprime.View(CpuWrite);
|
autoView(Uprime_v, Uprime, CpuWrite);
|
||||||
|
|
||||||
thread_foreach(i,mom_v,{
|
thread_foreach(i,mom_v,{
|
||||||
Uprime_v[i](mu) = U_v[i](mu)
|
Uprime_v[i](mu) = U_v[i](mu)
|
||||||
|
@ -300,8 +300,8 @@ int main (int argc, char ** argv)
|
|||||||
int nb=nbasisc/2;
|
int nb=nbasisc/2;
|
||||||
CoarseAggregates.CreateSubspaceChebyshev(CRNG,PosdefLdop,nb,12.0,0.02,500,100,100,0.0);
|
CoarseAggregates.CreateSubspaceChebyshev(CRNG,PosdefLdop,nb,12.0,0.02,500,100,100,0.0);
|
||||||
for(int n=0;n<nb;n++){
|
for(int n=0;n<nb;n++){
|
||||||
auto subspace = CoarseAggregates.subspace[n].View(CpuRead);
|
autoView( subspace, CoarseAggregates.subspace[n] ,CpuRead);
|
||||||
auto subspace_g5 = CoarseAggregates.subspace[n+nb].View(CpuWrite);
|
autoView( subspace_g5,CoarseAggregates.subspace[n+nb],CpuWrite);
|
||||||
for(int nn=0;nn<nb;nn++){
|
for(int nn=0;nn<nb;nn++){
|
||||||
for(int site=0;site<Coarse5d->oSites();site++){
|
for(int site=0;site<Coarse5d->oSites();site++){
|
||||||
subspace_g5[site](nn) = subspace[site](nn);
|
subspace_g5[site](nn) = subspace[site](nn);
|
||||||
|
Loading…
Reference in New Issue
Block a user