mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-09 21:50:45 +01:00
Offload more loops
This commit is contained in:
parent
e97f3688db
commit
8720aecb80
@ -258,15 +258,16 @@ private:
|
|||||||
CloverFieldType CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO
|
CloverFieldType CloverTermDagEven, CloverTermDagOdd; // Clover term Dag EO
|
||||||
CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
|
CloverFieldType CloverTermInvDagEven, CloverTermInvDagOdd; // Clover term Inv Dag EO
|
||||||
|
|
||||||
|
public:
|
||||||
// eventually these can be compressed into 6x6 blocks instead of the 12x12
|
// eventually these can be compressed into 6x6 blocks instead of the 12x12
|
||||||
// using the DeGrand-Rossi basis for the gamma matrices
|
// using the DeGrand-Rossi basis for the gamma matrices
|
||||||
CloverFieldType fillCloverYZ(const GaugeLinkField &F)
|
CloverFieldType fillCloverYZ(const GaugeLinkField &F)
|
||||||
{
|
{
|
||||||
CloverFieldType T(F.Grid());
|
CloverFieldType T(F.Grid());
|
||||||
T = Zero();
|
T = Zero();
|
||||||
autoView(T_v,T,CpuWrite);
|
autoView(T_v,T,AcceleratorWrite);
|
||||||
autoView(F_v,F,CpuRead);
|
autoView(F_v,F,AcceleratorRead);
|
||||||
thread_for(i, CloverTerm.Grid()->oSites(),
|
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
|
||||||
{
|
{
|
||||||
T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
|
T_v[i]()(0, 1) = timesMinusI(F_v[i]()());
|
||||||
T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
|
T_v[i]()(1, 0) = timesMinusI(F_v[i]()());
|
||||||
@ -282,9 +283,9 @@ private:
|
|||||||
CloverFieldType T(F.Grid());
|
CloverFieldType T(F.Grid());
|
||||||
T = Zero();
|
T = Zero();
|
||||||
|
|
||||||
autoView(T_v, T,CpuWrite);
|
autoView(T_v, T,AcceleratorWrite);
|
||||||
autoView(F_v, F,CpuRead);
|
autoView(F_v, F,AcceleratorRead);
|
||||||
thread_for(i, CloverTerm.Grid()->oSites(),
|
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
|
||||||
{
|
{
|
||||||
T_v[i]()(0, 1) = -F_v[i]()();
|
T_v[i]()(0, 1) = -F_v[i]()();
|
||||||
T_v[i]()(1, 0) = F_v[i]()();
|
T_v[i]()(1, 0) = F_v[i]()();
|
||||||
@ -300,9 +301,9 @@ private:
|
|||||||
CloverFieldType T(F.Grid());
|
CloverFieldType T(F.Grid());
|
||||||
T = Zero();
|
T = Zero();
|
||||||
|
|
||||||
autoView(T_v,T,CpuWrite);
|
autoView(T_v,T,AcceleratorWrite);
|
||||||
autoView(F_v,F,CpuRead);
|
autoView(F_v,F,AcceleratorRead);
|
||||||
thread_for(i, CloverTerm.Grid()->oSites(),
|
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
|
||||||
{
|
{
|
||||||
T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
|
T_v[i]()(0, 0) = timesMinusI(F_v[i]()());
|
||||||
T_v[i]()(1, 1) = timesI(F_v[i]()());
|
T_v[i]()(1, 1) = timesI(F_v[i]()());
|
||||||
@ -318,9 +319,9 @@ private:
|
|||||||
CloverFieldType T(F.Grid());
|
CloverFieldType T(F.Grid());
|
||||||
T = Zero();
|
T = Zero();
|
||||||
|
|
||||||
autoView( T_v , T, CpuWrite);
|
autoView( T_v , T, AcceleratorWrite);
|
||||||
autoView( F_v , F, CpuRead);
|
autoView( F_v , F, AcceleratorRead);
|
||||||
thread_for(i, CloverTerm.Grid()->oSites(),
|
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
|
||||||
{
|
{
|
||||||
T_v[i]()(0, 1) = timesI(F_v[i]()());
|
T_v[i]()(0, 1) = timesI(F_v[i]()());
|
||||||
T_v[i]()(1, 0) = timesI(F_v[i]()());
|
T_v[i]()(1, 0) = timesI(F_v[i]()());
|
||||||
@ -336,9 +337,9 @@ private:
|
|||||||
CloverFieldType T(F.Grid());
|
CloverFieldType T(F.Grid());
|
||||||
T = Zero();
|
T = Zero();
|
||||||
|
|
||||||
autoView( T_v ,T,CpuWrite);
|
autoView( T_v ,T,AcceleratorWrite);
|
||||||
autoView( F_v ,F,CpuRead);
|
autoView( F_v ,F,AcceleratorRead);
|
||||||
thread_for(i, CloverTerm.Grid()->oSites(),
|
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
|
||||||
{
|
{
|
||||||
T_v[i]()(0, 1) = -(F_v[i]()());
|
T_v[i]()(0, 1) = -(F_v[i]()());
|
||||||
T_v[i]()(1, 0) = (F_v[i]()());
|
T_v[i]()(1, 0) = (F_v[i]()());
|
||||||
@ -355,9 +356,9 @@ private:
|
|||||||
|
|
||||||
T = Zero();
|
T = Zero();
|
||||||
|
|
||||||
autoView( T_v , T,CpuWrite);
|
autoView( T_v , T,AcceleratorWrite);
|
||||||
autoView( F_v , F,CpuRead);
|
autoView( F_v , F,AcceleratorRead);
|
||||||
thread_for(i, CloverTerm.Grid()->oSites(),
|
accelerator_for(i, CloverTerm.Grid()->oSites(),1,
|
||||||
{
|
{
|
||||||
T_v[i]()(0, 0) = timesI(F_v[i]()());
|
T_v[i]()(0, 0) = timesI(F_v[i]()());
|
||||||
T_v[i]()(1, 1) = timesMinusI(F_v[i]()());
|
T_v[i]()(1, 1) = timesMinusI(F_v[i]()());
|
||||||
|
@ -106,10 +106,10 @@ public:
|
|||||||
const _SpinorField & phi,
|
const _SpinorField & phi,
|
||||||
int mu)
|
int mu)
|
||||||
{
|
{
|
||||||
autoView( out_v, out, CpuWrite);
|
autoView( out_v, out, AcceleratorWrite);
|
||||||
autoView( phi_v, phi, CpuRead);
|
autoView( phi_v, phi, AcceleratorRead);
|
||||||
autoView( Umu_v, Umu, CpuRead);
|
autoView( Umu_v, Umu, AcceleratorRead);
|
||||||
thread_for(sss,out.Grid()->oSites(),{
|
accelerator_for(sss,out.Grid()->oSites(),1,{
|
||||||
multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu);
|
multLink(out_v[sss],Umu_v[sss],phi_v[sss],mu);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@ -192,10 +192,10 @@ public:
|
|||||||
GaugeLinkField tmp(mat.Grid());
|
GaugeLinkField tmp(mat.Grid());
|
||||||
tmp = Zero();
|
tmp = Zero();
|
||||||
{
|
{
|
||||||
autoView( tmp_v , tmp, CpuWrite);
|
autoView( tmp_v , tmp, AcceleratorWrite);
|
||||||
autoView( Btilde_v , Btilde, CpuRead);
|
autoView( Btilde_v , Btilde, AcceleratorRead);
|
||||||
autoView( Atilde_v , Atilde, CpuRead);
|
autoView( Atilde_v , Atilde, AcceleratorRead);
|
||||||
thread_for(sss,tmp.Grid()->oSites(),{
|
accelerator_for(sss,tmp.Grid()->oSites(),1,{
|
||||||
int sU=sss;
|
int sU=sss;
|
||||||
for(int s=0;s<Ls;s++){
|
for(int s=0;s<Ls;s++){
|
||||||
int sF = s+Ls*sU;
|
int sF = s+Ls*sU;
|
||||||
|
@ -467,32 +467,7 @@ void WilsonFermion<Impl>::ContractConservedCurrent(PropagatorField &q_in_1,
|
|||||||
conformable(_grid, q_in_1.Grid());
|
conformable(_grid, q_in_1.Grid());
|
||||||
conformable(_grid, q_in_2.Grid());
|
conformable(_grid, q_in_2.Grid());
|
||||||
conformable(_grid, q_out.Grid());
|
conformable(_grid, q_out.Grid());
|
||||||
#if 0
|
assert(0);
|
||||||
PropagatorField tmp1(_grid), tmp2(_grid);
|
|
||||||
q_out = Zero();
|
|
||||||
|
|
||||||
// Forward, need q1(x + mu), q2(x). Backward, need q1(x), q2(x + mu).
|
|
||||||
// Inefficient comms method but not performance critical.
|
|
||||||
tmp1 = Cshift(q_in_1, mu, 1);
|
|
||||||
tmp2 = Cshift(q_in_2, mu, 1);
|
|
||||||
autoView( tmp1_v , tmp1, CpuWrite);
|
|
||||||
autoView( tmp2_v , tmp2, CpuWrite);
|
|
||||||
autoView( q_in_1_v,q_in_1, CpuRead);
|
|
||||||
autoView( q_in_2_v,q_in_2, CpuRead);
|
|
||||||
autoView( q_out_v , q_out, CpuRead);
|
|
||||||
autoView( Umu_v , Umu, CpuRead);
|
|
||||||
thread_for(sU, Umu.Grid()->oSites(),{
|
|
||||||
Kernels::ContractConservedCurrentSiteFwd(tmp1_v[sU],
|
|
||||||
q_in_2_v[sU],
|
|
||||||
q_out_v[sU],
|
|
||||||
Umu_v, sU, mu);
|
|
||||||
Kernels::ContractConservedCurrentSiteBwd(q_in_1_v[sU],
|
|
||||||
tmp2_v[sU],
|
|
||||||
q_out_v[sU],
|
|
||||||
Umu_v, sU, mu);
|
|
||||||
});
|
|
||||||
#else
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -508,62 +483,7 @@ void WilsonFermion<Impl>::SeqConservedCurrent(PropagatorField &q_in,
|
|||||||
{
|
{
|
||||||
conformable(_grid, q_in.Grid());
|
conformable(_grid, q_in.Grid());
|
||||||
conformable(_grid, q_out.Grid());
|
conformable(_grid, q_out.Grid());
|
||||||
#if 0
|
assert(0);
|
||||||
|
|
||||||
// Lattice<iSinglet<Simd>> ph(_grid), coor(_grid);
|
|
||||||
Complex i(0.0,1.0);
|
|
||||||
PropagatorField tmpFwd(_grid), tmpBwd(_grid), tmp(_grid);
|
|
||||||
unsigned int tshift = (mu == Tp) ? 1 : 0;
|
|
||||||
unsigned int LLt = GridDefaultLatt()[Tp];
|
|
||||||
|
|
||||||
q_out = Zero();
|
|
||||||
LatticeInteger coords(_grid);
|
|
||||||
LatticeCoordinate(coords, Tp);
|
|
||||||
|
|
||||||
// Need q(x + mu) and q(x - mu).
|
|
||||||
tmp = Cshift(q_in, mu, 1);
|
|
||||||
tmpFwd = tmp*lattice_cmplx;
|
|
||||||
tmp = lattice_cmplx*q_in;
|
|
||||||
tmpBwd = Cshift(tmp, mu, -1);
|
|
||||||
|
|
||||||
autoView( coords_v , coords, CpuRead);
|
|
||||||
autoView( tmpFwd_v , tmpFwd, CpuRead);
|
|
||||||
autoView( tmpBwd_v , tmpBwd, CpuRead);
|
|
||||||
autoView( Umu_v , Umu, CpuRead);
|
|
||||||
autoView( q_out_v , q_out, CpuWrite);
|
|
||||||
|
|
||||||
thread_for(sU, Umu.Grid()->oSites(), {
|
|
||||||
|
|
||||||
// Compute the sequential conserved current insertion only if our simd
|
|
||||||
// object contains a timeslice we need.
|
|
||||||
vPredicate t_mask;
|
|
||||||
t_mask() = ((coords_v[sU] >= tmin) && (coords_v[sU] <= tmax));
|
|
||||||
Integer timeSlices = Reduce(t_mask());
|
|
||||||
|
|
||||||
if (timeSlices > 0) {
|
|
||||||
Kernels::SeqConservedCurrentSiteFwd(tmpFwd_v[sU],
|
|
||||||
q_out_v[sU],
|
|
||||||
Umu_v, sU, mu, t_mask);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Repeat for backward direction.
|
|
||||||
t_mask() = ((coords_v[sU] >= (tmin + tshift)) &&
|
|
||||||
(coords_v[sU] <= (tmax + tshift)));
|
|
||||||
|
|
||||||
//if tmax = LLt-1 (last timeslice) include timeslice 0 if the time is shifted (mu=3)
|
|
||||||
unsigned int t0 = 0;
|
|
||||||
if((tmax==LLt-1) && (tshift==1)) t_mask() = (t_mask() || (coords_v[sU] == t0 ));
|
|
||||||
|
|
||||||
timeSlices = Reduce(t_mask());
|
|
||||||
|
|
||||||
if (timeSlices > 0) {
|
|
||||||
Kernels::SeqConservedCurrentSiteBwd(tmpBwd_v[sU],
|
|
||||||
q_out_v[sU],
|
|
||||||
Umu_v, sU, mu, t_mask);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
#else
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user