mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-13 01:05:36 +00:00
Implemented acclerator-optimized versions of localCopyRegion and insertSliceLocal to speed up padding
Fixed const correctness on PaddedCell methods Fixed compile issues on Crusher Added timing breakdowns for PaddedCell::Expand and the padded implementations of the staples, visible under --log Performance Optimized kernel for StaplePadded Test_iwasaki_action_newstaple now repeats the calculation 10 times and reports average timings
This commit is contained in:
parent
bb71e9a96a
commit
f44dce390f
@ -697,8 +697,68 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
|
|||||||
for(int d=0;d<nd;d++){
|
for(int d=0;d<nd;d++){
|
||||||
assert(Fg->_processors[d] == Tg->_processors[d]);
|
assert(Fg->_processors[d] == Tg->_processors[d]);
|
||||||
}
|
}
|
||||||
|
|
||||||
// the above should guarantee that the operations are local
|
// the above should guarantee that the operations are local
|
||||||
|
|
||||||
|
#if 1
|
||||||
|
|
||||||
|
size_t nsite = 1;
|
||||||
|
for(int i=0;i<nd;i++) nsite *= RegionSize[i];
|
||||||
|
|
||||||
|
size_t tbytes = 4*nsite*sizeof(int);
|
||||||
|
int *table = (int*)malloc(tbytes);
|
||||||
|
|
||||||
|
thread_for(idx, nsite, {
|
||||||
|
Coordinate from_coor, to_coor;
|
||||||
|
size_t rem = idx;
|
||||||
|
for(int i=0;i<nd;i++){
|
||||||
|
size_t base_i = rem % RegionSize[i]; rem /= RegionSize[i];
|
||||||
|
from_coor[i] = base_i + FromLowerLeft[i];
|
||||||
|
to_coor[i] = base_i + ToLowerLeft[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
int foidx = Fg->oIndex(from_coor);
|
||||||
|
int fiidx = Fg->iIndex(from_coor);
|
||||||
|
int toidx = Tg->oIndex(to_coor);
|
||||||
|
int tiidx = Tg->iIndex(to_coor);
|
||||||
|
int* tt = table + 4*idx;
|
||||||
|
tt[0] = foidx;
|
||||||
|
tt[1] = fiidx;
|
||||||
|
tt[2] = toidx;
|
||||||
|
tt[3] = tiidx;
|
||||||
|
});
|
||||||
|
|
||||||
|
int* table_d = (int*)acceleratorAllocDevice(tbytes);
|
||||||
|
acceleratorCopyToDevice(table,table_d,tbytes);
|
||||||
|
|
||||||
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
|
|
||||||
|
autoView(from_v,From,AcceleratorRead);
|
||||||
|
autoView(to_v,To,AcceleratorWrite);
|
||||||
|
|
||||||
|
accelerator_for(idx,nsite,1,{
|
||||||
|
static const int words=sizeof(vobj)/sizeof(vector_type);
|
||||||
|
int* tt = table_d + 4*idx;
|
||||||
|
int from_oidx = *tt++;
|
||||||
|
int from_lane = *tt++;
|
||||||
|
int to_oidx = *tt++;
|
||||||
|
int to_lane = *tt;
|
||||||
|
|
||||||
|
const vector_type* from = (const vector_type *)&from_v[from_oidx];
|
||||||
|
vector_type* to = (vector_type *)&to_v[to_oidx];
|
||||||
|
|
||||||
|
scalar_type stmp;
|
||||||
|
for(int w=0;w<words;w++){
|
||||||
|
stmp = getlane(from[w], from_lane);
|
||||||
|
putlane(to[w], stmp, to_lane);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
acceleratorFreeDevice(table_d);
|
||||||
|
free(table);
|
||||||
|
|
||||||
|
|
||||||
|
#else
|
||||||
Coordinate ldf = Fg->_ldimensions;
|
Coordinate ldf = Fg->_ldimensions;
|
||||||
Coordinate rdf = Fg->_rdimensions;
|
Coordinate rdf = Fg->_rdimensions;
|
||||||
Coordinate isf = Fg->_istride;
|
Coordinate isf = Fg->_istride;
|
||||||
@ -738,6 +798,8 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -830,6 +892,8 @@ void ExtractSlice(Lattice<vobj> &lowDim,const Lattice<vobj> & higherDim,int slic
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//Insert subvolume orthogonal to direction 'orthog' with slice index 'slice_lo' from 'lowDim' onto slice index 'slice_hi' of higherDim
|
||||||
|
//The local dimensions of both 'lowDim' and 'higherDim' orthogonal to 'orthog' should be the same
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
|
void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int slice_lo,int slice_hi, int orthog)
|
||||||
{
|
{
|
||||||
@ -851,6 +915,65 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if 1
|
||||||
|
size_t nsite = lg->lSites()/lg->LocalDimensions()[orthog];
|
||||||
|
size_t tbytes = 4*nsite*sizeof(int);
|
||||||
|
int *table = (int*)malloc(tbytes);
|
||||||
|
|
||||||
|
thread_for(idx,nsite,{
|
||||||
|
Coordinate lcoor(nl);
|
||||||
|
Coordinate hcoor(nh);
|
||||||
|
lcoor[orthog] = slice_lo;
|
||||||
|
hcoor[orthog] = slice_hi;
|
||||||
|
size_t rem = idx;
|
||||||
|
for(int mu=0;mu<nl;mu++){
|
||||||
|
if(mu != orthog){
|
||||||
|
int xmu = rem % lg->LocalDimensions()[mu]; rem /= lg->LocalDimensions()[mu];
|
||||||
|
lcoor[mu] = hcoor[mu] = xmu;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int loidx = lg->oIndex(lcoor);
|
||||||
|
int liidx = lg->iIndex(lcoor);
|
||||||
|
int hoidx = hg->oIndex(hcoor);
|
||||||
|
int hiidx = hg->iIndex(hcoor);
|
||||||
|
int* tt = table + 4*idx;
|
||||||
|
tt[0] = loidx;
|
||||||
|
tt[1] = liidx;
|
||||||
|
tt[2] = hoidx;
|
||||||
|
tt[3] = hiidx;
|
||||||
|
});
|
||||||
|
|
||||||
|
int* table_d = (int*)acceleratorAllocDevice(tbytes);
|
||||||
|
acceleratorCopyToDevice(table,table_d,tbytes);
|
||||||
|
|
||||||
|
typedef typename vobj::vector_type vector_type;
|
||||||
|
typedef typename vobj::scalar_type scalar_type;
|
||||||
|
|
||||||
|
autoView(lowDim_v,lowDim,AcceleratorRead);
|
||||||
|
autoView(higherDim_v,higherDim,AcceleratorWrite);
|
||||||
|
|
||||||
|
accelerator_for(idx,nsite,1,{
|
||||||
|
static const int words=sizeof(vobj)/sizeof(vector_type);
|
||||||
|
int* tt = table_d + 4*idx;
|
||||||
|
int from_oidx = *tt++;
|
||||||
|
int from_lane = *tt++;
|
||||||
|
int to_oidx = *tt++;
|
||||||
|
int to_lane = *tt;
|
||||||
|
|
||||||
|
const vector_type* from = (const vector_type *)&lowDim_v[from_oidx];
|
||||||
|
vector_type* to = (vector_type *)&higherDim_v[to_oidx];
|
||||||
|
|
||||||
|
scalar_type stmp;
|
||||||
|
for(int w=0;w<words;w++){
|
||||||
|
stmp = getlane(from[w], from_lane);
|
||||||
|
putlane(to[w], stmp, to_lane);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
acceleratorFreeDevice(table_d);
|
||||||
|
free(table);
|
||||||
|
|
||||||
|
#else
|
||||||
// the above should guarantee that the operations are local
|
// the above should guarantee that the operations are local
|
||||||
autoView(lowDimv,lowDim,CpuRead);
|
autoView(lowDimv,lowDim,CpuRead);
|
||||||
autoView(higherDimv,higherDim,CpuWrite);
|
autoView(higherDimv,higherDim,CpuWrite);
|
||||||
@ -866,6 +989,7 @@ void InsertSliceLocal(const Lattice<vobj> &lowDim, Lattice<vobj> & higherDim,int
|
|||||||
pokeLocalSite(s,higherDimv,hcoor);
|
pokeLocalSite(s,higherDimv,hcoor);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -95,7 +95,7 @@ public:
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline Lattice<vobj> Extract(const Lattice<vobj> &in)
|
inline Lattice<vobj> Extract(const Lattice<vobj> &in) const
|
||||||
{
|
{
|
||||||
Lattice<vobj> out(unpadded_grid);
|
Lattice<vobj> out(unpadded_grid);
|
||||||
|
|
||||||
@ -106,7 +106,7 @@ public:
|
|||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline Lattice<vobj> Exchange(const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>())
|
inline Lattice<vobj> Exchange(const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
|
||||||
{
|
{
|
||||||
GridBase *old_grid = in.Grid();
|
GridBase *old_grid = in.Grid();
|
||||||
int dims = old_grid->Nd();
|
int dims = old_grid->Nd();
|
||||||
@ -118,7 +118,7 @@ public:
|
|||||||
}
|
}
|
||||||
// expand up one dim at a time
|
// expand up one dim at a time
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
inline Lattice<vobj> Expand(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>())
|
inline Lattice<vobj> Expand(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
|
||||||
{
|
{
|
||||||
GridBase *old_grid = in.Grid();
|
GridBase *old_grid = in.Grid();
|
||||||
GridCartesian *new_grid = grids[dim];//These are new grids
|
GridCartesian *new_grid = grids[dim];//These are new grids
|
||||||
@ -130,20 +130,40 @@ public:
|
|||||||
else conformable(old_grid,grids[dim-1]);
|
else conformable(old_grid,grids[dim-1]);
|
||||||
|
|
||||||
std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
|
std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
|
||||||
|
|
||||||
|
double tins=0, tshift=0;
|
||||||
|
|
||||||
// Middle bit
|
// Middle bit
|
||||||
|
double t = usecond();
|
||||||
for(int x=0;x<local[dim];x++){
|
for(int x=0;x<local[dim];x++){
|
||||||
InsertSliceLocal(in,padded,x,depth+x,dim);
|
InsertSliceLocal(in,padded,x,depth+x,dim);
|
||||||
}
|
}
|
||||||
|
tins += usecond() - t;
|
||||||
|
|
||||||
// High bit
|
// High bit
|
||||||
|
t = usecond();
|
||||||
shifted = cshift.Cshift(in,dim,depth);
|
shifted = cshift.Cshift(in,dim,depth);
|
||||||
|
tshift += usecond() - t;
|
||||||
|
|
||||||
|
t=usecond();
|
||||||
for(int x=0;x<depth;x++){
|
for(int x=0;x<depth;x++){
|
||||||
InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim);
|
InsertSliceLocal(shifted,padded,local[dim]-depth+x,depth+local[dim]+x,dim);
|
||||||
}
|
}
|
||||||
|
tins += usecond() - t;
|
||||||
|
|
||||||
// Low bit
|
// Low bit
|
||||||
|
t = usecond();
|
||||||
shifted = cshift.Cshift(in,dim,-depth);
|
shifted = cshift.Cshift(in,dim,-depth);
|
||||||
|
tshift += usecond() - t;
|
||||||
|
|
||||||
|
t = usecond();
|
||||||
for(int x=0;x<depth;x++){
|
for(int x=0;x<depth;x++){
|
||||||
InsertSliceLocal(shifted,padded,x,x,dim);
|
InsertSliceLocal(shifted,padded,x,x,dim);
|
||||||
}
|
}
|
||||||
|
tins += usecond() - t;
|
||||||
|
|
||||||
|
std::cout << GridLogPerformance << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl;
|
||||||
|
|
||||||
return padded;
|
return padded;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -359,6 +359,7 @@ public:
|
|||||||
assert(Cell.depth >= 1);
|
assert(Cell.depth >= 1);
|
||||||
GridBase *ggrid = U_padded[0].Grid(); //padded cell grid
|
GridBase *ggrid = U_padded[0].Grid(); //padded cell grid
|
||||||
|
|
||||||
|
double t0 = usecond();
|
||||||
//Generate shift arrays
|
//Generate shift arrays
|
||||||
std::vector<Coordinate> shifts;
|
std::vector<Coordinate> shifts;
|
||||||
for(int mu=0;mu<Nd;mu++){
|
for(int mu=0;mu<Nd;mu++){
|
||||||
@ -382,54 +383,73 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
int shift_mu_off = shifts.size()/Nd;
|
||||||
|
|
||||||
|
double t1 = usecond();
|
||||||
//Generate local stencil
|
//Generate local stencil
|
||||||
GeneralLocalStencil gStencil(ggrid,shifts);
|
GeneralLocalStencil gStencil(ggrid,shifts);
|
||||||
|
|
||||||
|
double t2 = usecond();
|
||||||
|
|
||||||
|
//Open views to padded gauge links and keep open over mu loop
|
||||||
|
typedef LatticeView<typename GaugeMat::vector_object> GaugeViewType;
|
||||||
|
size_t vsize = Nd*sizeof(GaugeViewType);
|
||||||
|
GaugeViewType* Ug_dirs_v_host = (GaugeViewType*)malloc(vsize);
|
||||||
|
for(int i=0;i<Nd;i++) Ug_dirs_v_host[i] = U_padded[i].View(AcceleratorRead);
|
||||||
|
GaugeViewType* Ug_dirs_v = (GaugeViewType*)acceleratorAllocDevice(vsize);
|
||||||
|
acceleratorCopyToDevice(Ug_dirs_v_host,Ug_dirs_v,vsize);
|
||||||
|
|
||||||
GaugeMat gStaple(ggrid);
|
GaugeMat gStaple(ggrid);
|
||||||
|
|
||||||
int off = 0;
|
int outer_off = 0;
|
||||||
for(int mu=0;mu<Nd;mu++){
|
for(int mu=0;mu<Nd;mu++){
|
||||||
gStaple = Zero();
|
{ //view scope
|
||||||
|
autoView( gStaple_v , gStaple, AcceleratorWrite);
|
||||||
for(int nu=0;nu<Nd;nu++){
|
auto gStencil_v = gStencil.View();
|
||||||
if(nu != mu){
|
|
||||||
{
|
accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
|
||||||
autoView( rgStaple_v , gStaple, AcceleratorRead);
|
decltype(coalescedRead(Ug_dirs_v[0][0])) stencil_ss;
|
||||||
autoView( gStaple_v , gStaple, AcceleratorWrite);
|
stencil_ss = Zero();
|
||||||
auto gStencil_v = gStencil.View();
|
int off = outer_off;
|
||||||
autoView( Ug_mu_v , U_padded[mu], AcceleratorRead);
|
|
||||||
autoView( Ug_nu_v , U_padded[nu], AcceleratorRead);
|
for(int nu=0;nu<Nd;nu++){
|
||||||
|
if(nu != mu){
|
||||||
accelerator_for(ss, ggrid->oSites(), ggrid->Nsimd(), {
|
GeneralStencilEntry const* e = gStencil_v.GetEntry(off++,ss);
|
||||||
auto stencil_ss = coalescedRead(rgStaple_v[ss]);
|
auto U0 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
|
||||||
|
e = gStencil_v.GetEntry(off++,ss);
|
||||||
GeneralStencilEntry const* e = gStencil_v.GetEntry(off,ss);
|
auto U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
|
||||||
auto Udag_nu_x = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd));
|
e = gStencil_v.GetEntry(off++,ss);
|
||||||
e = gStencil_v.GetEntry(off+1,ss);
|
auto U2 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
|
||||||
auto Udag_mu_xpnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd));
|
|
||||||
e = gStencil_v.GetEntry(off+2,ss);
|
|
||||||
auto U_nu_xpmu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd);
|
|
||||||
|
|
||||||
stencil_ss = stencil_ss + U_nu_xpmu * Udag_mu_xpnu * Udag_nu_x;
|
stencil_ss = stencil_ss + U2 * U1 * U0;
|
||||||
|
|
||||||
e = gStencil_v.GetEntry(off+3,ss);
|
e = gStencil_v.GetEntry(off++,ss);
|
||||||
auto U_nu_xmnu = coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd);
|
U0 = coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd);
|
||||||
e = gStencil_v.GetEntry(off+4,ss);
|
e = gStencil_v.GetEntry(off++,ss);
|
||||||
auto Udag_mu_xmnu = adj(coalescedReadGeneralPermute(Ug_mu_v[e->_offset], e->_permute, Nd));
|
U1 = adj(coalescedReadGeneralPermute(Ug_dirs_v[mu][e->_offset], e->_permute, Nd));
|
||||||
e = gStencil_v.GetEntry(off+5,ss);
|
e = gStencil_v.GetEntry(off++,ss);
|
||||||
auto Udag_nu_xmnu_pmu = adj(coalescedReadGeneralPermute(Ug_nu_v[e->_offset], e->_permute, Nd));
|
U2 = adj(coalescedReadGeneralPermute(Ug_dirs_v[nu][e->_offset], e->_permute, Nd));
|
||||||
|
|
||||||
stencil_ss = stencil_ss + Udag_nu_xmnu_pmu * Udag_mu_xmnu * U_nu_xmnu;
|
stencil_ss = stencil_ss + U2 * U1 * U0;
|
||||||
|
|
||||||
coalescedWrite(gStaple_v[ss],stencil_ss);
|
|
||||||
}
|
}
|
||||||
);
|
}
|
||||||
} //ensure views are all closed!
|
|
||||||
off += 6;
|
coalescedWrite(gStaple_v[ss],stencil_ss);
|
||||||
}//nu!=mu
|
}
|
||||||
}//nu loop
|
);
|
||||||
|
} //ensure views are all closed!
|
||||||
|
|
||||||
staple[mu] = Cell.Extract(gStaple);
|
staple[mu] = Cell.Extract(gStaple);
|
||||||
|
outer_off += shift_mu_off;
|
||||||
}//mu loop
|
}//mu loop
|
||||||
|
|
||||||
|
for(int i=0;i<Nd;i++) Ug_dirs_v_host[i].ViewClose();
|
||||||
|
free(Ug_dirs_v_host);
|
||||||
|
acceleratorFreeDevice(Ug_dirs_v);
|
||||||
|
|
||||||
|
double t3=usecond();
|
||||||
|
|
||||||
|
std::cout << GridLogPerformance << "StaplePaddedAll timings: coord:" << (t1-t0)/1000 << "ms, stencil:" << (t2-t1)/1000 << "ms, kernel:" << (t3-t2)/1000 << "ms" << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -805,7 +825,7 @@ public:
|
|||||||
// the sum over all staples on each site
|
// the sum over all staples on each site
|
||||||
//////////////////////////////////////////////////
|
//////////////////////////////////////////////////
|
||||||
static void RectStapleDouble(GaugeMat &U2, const GaugeMat &U, int mu) {
|
static void RectStapleDouble(GaugeMat &U2, const GaugeMat &U, int mu) {
|
||||||
U2 = U * CshiftLink(U, mu, 1);
|
U2 = U * Gimpl::CshiftLink(U, mu, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////
|
||||||
@ -826,9 +846,9 @@ public:
|
|||||||
|
|
||||||
// Up staple ___ ___
|
// Up staple ___ ___
|
||||||
// | |
|
// | |
|
||||||
tmp = CshiftLink(adj(U[nu]), nu, -1);
|
tmp = Gimpl::CshiftLink(adj(U[nu]), nu, -1);
|
||||||
tmp = adj(U2[mu]) * tmp;
|
tmp = adj(U2[mu]) * tmp;
|
||||||
tmp = CshiftLink(tmp, mu, -2);
|
tmp = Gimpl::CshiftLink(tmp, mu, -2);
|
||||||
|
|
||||||
Staple2x1 = Gimpl::CovShiftForward(U[nu], nu, tmp);
|
Staple2x1 = Gimpl::CovShiftForward(U[nu], nu, tmp);
|
||||||
|
|
||||||
@ -836,14 +856,14 @@ public:
|
|||||||
// |___ ___|
|
// |___ ___|
|
||||||
//
|
//
|
||||||
tmp = adj(U2[mu]) * U[nu];
|
tmp = adj(U2[mu]) * U[nu];
|
||||||
Staple2x1 += Gimpl::CovShiftBackward(U[nu], nu, CshiftLink(tmp, mu, -2));
|
Staple2x1 += Gimpl::CovShiftBackward(U[nu], nu, Gimpl::CshiftLink(tmp, mu, -2));
|
||||||
|
|
||||||
// ___ ___
|
// ___ ___
|
||||||
// | ___|
|
// | ___|
|
||||||
// |___ ___|
|
// |___ ___|
|
||||||
//
|
//
|
||||||
|
|
||||||
Stap += CshiftLink(Gimpl::CovShiftForward(U[mu], mu, Staple2x1), mu, 1);
|
Stap += Gimpl::CshiftLink(Gimpl::CovShiftForward(U[mu], mu, Staple2x1), mu, 1);
|
||||||
|
|
||||||
// ___ ___
|
// ___ ___
|
||||||
// |___ |
|
// |___ |
|
||||||
@ -852,7 +872,7 @@ public:
|
|||||||
|
|
||||||
// tmp= Staple2x1* Cshift(U[mu],mu,-2);
|
// tmp= Staple2x1* Cshift(U[mu],mu,-2);
|
||||||
// Stap+= Cshift(tmp,mu,1) ;
|
// Stap+= Cshift(tmp,mu,1) ;
|
||||||
Stap += CshiftLink(Staple2x1, mu, 1) * CshiftLink(U[mu], mu, -1);
|
Stap += Gimpl::CshiftLink(Staple2x1, mu, 1) * Gimpl::CshiftLink(U[mu], mu, -1);
|
||||||
;
|
;
|
||||||
|
|
||||||
// --
|
// --
|
||||||
@ -860,10 +880,10 @@ public:
|
|||||||
//
|
//
|
||||||
// | |
|
// | |
|
||||||
|
|
||||||
tmp = CshiftLink(adj(U2[nu]), nu, -2);
|
tmp = Gimpl::CshiftLink(adj(U2[nu]), nu, -2);
|
||||||
tmp = Gimpl::CovShiftBackward(U[mu], mu, tmp);
|
tmp = Gimpl::CovShiftBackward(U[mu], mu, tmp);
|
||||||
tmp = U2[nu] * CshiftLink(tmp, nu, 2);
|
tmp = U2[nu] * Gimpl::CshiftLink(tmp, nu, 2);
|
||||||
Stap += CshiftLink(tmp, mu, 1);
|
Stap += Gimpl::CshiftLink(tmp, mu, 1);
|
||||||
|
|
||||||
// | |
|
// | |
|
||||||
//
|
//
|
||||||
@ -872,8 +892,8 @@ public:
|
|||||||
|
|
||||||
tmp = Gimpl::CovShiftBackward(U[mu], mu, U2[nu]);
|
tmp = Gimpl::CovShiftBackward(U[mu], mu, U2[nu]);
|
||||||
tmp = adj(U2[nu]) * tmp;
|
tmp = adj(U2[nu]) * tmp;
|
||||||
tmp = CshiftLink(tmp, nu, -2);
|
tmp = Gimpl::CshiftLink(tmp, nu, -2);
|
||||||
Stap += CshiftLink(tmp, mu, 1);
|
Stap += Gimpl::CshiftLink(tmp, mu, 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1006,6 +1026,7 @@ public:
|
|||||||
assert(Cell.depth >= 2);
|
assert(Cell.depth >= 2);
|
||||||
GridBase *ggrid = U_padded[0].Grid(); //padded cell grid
|
GridBase *ggrid = U_padded[0].Grid(); //padded cell grid
|
||||||
|
|
||||||
|
double t0 = usecond();
|
||||||
std::vector<Coordinate> shifts;
|
std::vector<Coordinate> shifts;
|
||||||
for (int mu = 0; mu < Nd; mu++){
|
for (int mu = 0; mu < Nd; mu++){
|
||||||
for (int nu = 0; nu < Nd; nu++) {
|
for (int nu = 0; nu < Nd; nu++) {
|
||||||
@ -1060,8 +1081,11 @@ public:
|
|||||||
}
|
}
|
||||||
size_t nshift = shifts.size();
|
size_t nshift = shifts.size();
|
||||||
int mu_off_delta = nshift / Nd;
|
int mu_off_delta = nshift / Nd;
|
||||||
|
double t1 = usecond();
|
||||||
|
|
||||||
GeneralLocalStencil gStencil(ggrid,shifts);
|
GeneralLocalStencil gStencil(ggrid,shifts);
|
||||||
|
double t2 = usecond();
|
||||||
|
|
||||||
//Open views to padded gauge links and keep open over mu loop
|
//Open views to padded gauge links and keep open over mu loop
|
||||||
typedef LatticeView<typename GaugeMat::vector_object> GaugeViewType;
|
typedef LatticeView<typename GaugeMat::vector_object> GaugeViewType;
|
||||||
size_t vsize = Nd*sizeof(GaugeViewType);
|
size_t vsize = Nd*sizeof(GaugeViewType);
|
||||||
@ -1183,6 +1207,10 @@ public:
|
|||||||
for(int i=0;i<Nd;i++) Ug_dirs_v_host[i].ViewClose();
|
for(int i=0;i<Nd;i++) Ug_dirs_v_host[i].ViewClose();
|
||||||
free(Ug_dirs_v_host);
|
free(Ug_dirs_v_host);
|
||||||
acceleratorFreeDevice(Ug_dirs_v);
|
acceleratorFreeDevice(Ug_dirs_v);
|
||||||
|
|
||||||
|
double t3 = usecond();
|
||||||
|
|
||||||
|
std::cout << GridLogPerformance << "RectStaplePaddedAll timings: coord:" << (t1-t0)/1000 << "ms, stencil:" << (t2-t1)/1000 << "ms, kernel:" << (t3-t2)/1000 << "ms" << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -1198,13 +1226,18 @@ public:
|
|||||||
StapleAll(Stap, U);
|
StapleAll(Stap, U);
|
||||||
RectStapleAll(RectStap, U);
|
RectStapleAll(RectStap, U);
|
||||||
#else
|
#else
|
||||||
|
double t0 = usecond();
|
||||||
//Use the padded cell with maximal reuse
|
//Use the padded cell with maximal reuse
|
||||||
PaddedCell Ghost(2, dynamic_cast<GridCartesian*>(U[0].Grid()));
|
PaddedCell Ghost(2, dynamic_cast<GridCartesian*>(U[0].Grid()));
|
||||||
CshiftImplGauge<Gimpl> cshift_impl;
|
CshiftImplGauge<Gimpl> cshift_impl;
|
||||||
std::vector<GaugeMat> U_pad(Nd, Ghost.grids.back());
|
std::vector<GaugeMat> U_pad(Nd, Ghost.grids.back());
|
||||||
for(int mu=0;mu<Nd;mu++) U_pad[mu] = Ghost.Exchange(U[mu], cshift_impl);
|
for(int mu=0;mu<Nd;mu++) U_pad[mu] = Ghost.Exchange(U[mu], cshift_impl);
|
||||||
|
double t1 = usecond();
|
||||||
StaplePaddedAll(Stap, U_pad, Ghost);
|
StaplePaddedAll(Stap, U_pad, Ghost);
|
||||||
|
double t2 = usecond();
|
||||||
RectStaplePaddedAll(RectStap, U_pad, Ghost);
|
RectStaplePaddedAll(RectStap, U_pad, Ghost);
|
||||||
|
double t3 = usecond();
|
||||||
|
std::cout << GridLogPerformance << "StapleAndRectStapleAll timings: pad:" << (t1-t0)/1000 << "ms, staple:" << (t2-t1)/1000 << "ms, rect-staple:" << (t3-t2)/1000 << "ms" << std::endl;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -165,18 +165,24 @@ int main (int argc, char ** argv)
|
|||||||
IwasakiGaugeActionOrig<Gimpl> action_orig(beta);
|
IwasakiGaugeActionOrig<Gimpl> action_orig(beta);
|
||||||
IwasakiGaugeAction<Gimpl> action_new(beta);
|
IwasakiGaugeAction<Gimpl> action_new(beta);
|
||||||
|
|
||||||
double t0 = usecond();
|
double torig=0, tnew=0;
|
||||||
action_orig.deriv(U, derivOrig);
|
int ntest = 10;
|
||||||
double t1 = usecond();
|
for(int i=0;i<ntest;i++){
|
||||||
action_new.deriv(U, derivNew);
|
double t0 = usecond();
|
||||||
double t2 = usecond();
|
action_orig.deriv(U, derivOrig);
|
||||||
|
double t1 = usecond();
|
||||||
|
action_new.deriv(U, derivNew);
|
||||||
|
double t2 = usecond();
|
||||||
|
|
||||||
GaugeLorentz diff = derivOrig - derivNew;
|
GaugeLorentz diff = derivOrig - derivNew;
|
||||||
double n = norm2(diff);
|
double n = norm2(diff);
|
||||||
std::cout << GridLogMessage << "Difference " << n << " (expect 0)" << std::endl;
|
std::cout << GridLogMessage << "Difference " << n << " (expect 0)" << std::endl;
|
||||||
assert(n<1e-10);
|
assert(n<1e-10);
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Timings orig: " << (t1-t0)/1000 << "ms, new: " << (t2-t1)/1000 << "ms" << std::endl;
|
std::cout << GridLogMessage << "Timings orig: " << (t1-t0)/1000 << "ms, new: " << (t2-t1)/1000 << "ms" << std::endl;
|
||||||
|
torig += (t1-t0)/1000; tnew += (t2-t1)/1000;
|
||||||
|
}
|
||||||
|
std::cout << GridLogMessage << "Avg timings " << ntest << " iterations: orig:" << torig/ntest << "ms, new:" << tnew/ntest << "ms" << std::endl;
|
||||||
|
|
||||||
Grid_finalize();
|
Grid_finalize();
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user