mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
Eliminate both the GPU issue and the threading bottleneck by avoiding malloc in coordinate handling
This commit is contained in:
parent
e158b60bce
commit
4962f59477
@ -115,9 +115,9 @@ private:
|
|||||||
double flops_call;
|
double flops_call;
|
||||||
uint64_t usec;
|
uint64_t usec;
|
||||||
|
|
||||||
std::vector<int> dimensions;
|
Coordinate dimensions;
|
||||||
std::vector<int> processors;
|
Coordinate processors;
|
||||||
std::vector<int> processor_coor;
|
Coordinate processor_coor;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
@ -137,7 +137,7 @@ public:
|
|||||||
{
|
{
|
||||||
flops=0;
|
flops=0;
|
||||||
usec =0;
|
usec =0;
|
||||||
std::vector<int> layout(Nd,1);
|
Coordinate layout(Nd,1);
|
||||||
sgrid = new GridCartesian(dimensions,layout,processors);
|
sgrid = new GridCartesian(dimensions,layout,processors);
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -146,7 +146,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,std::vector<int> mask,int sign){
|
void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,Coordinate mask,int sign){
|
||||||
|
|
||||||
conformable(result.Grid(),vgrid);
|
conformable(result.Grid(),vgrid);
|
||||||
conformable(source.Grid(),vgrid);
|
conformable(source.Grid(),vgrid);
|
||||||
@ -162,7 +162,7 @@ public:
|
|||||||
|
|
||||||
template<class vobj>
|
template<class vobj>
|
||||||
void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){
|
void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){
|
||||||
std::vector<int> mask(Nd,1);
|
Coordinate mask(Nd,1);
|
||||||
FFT_dim_mask(result,source,mask,sign);
|
FFT_dim_mask(result,source,mask,sign);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -178,8 +178,8 @@ public:
|
|||||||
int L = vgrid->_ldimensions[dim];
|
int L = vgrid->_ldimensions[dim];
|
||||||
int G = vgrid->_fdimensions[dim];
|
int G = vgrid->_fdimensions[dim];
|
||||||
|
|
||||||
std::vector<int> layout(Nd,1);
|
Coordinate layout(Nd,1);
|
||||||
std::vector<int> pencil_gd(vgrid->_fdimensions);
|
Coordinate pencil_gd(vgrid->_fdimensions);
|
||||||
|
|
||||||
pencil_gd[dim] = G*processors[dim];
|
pencil_gd[dim] = G*processors[dim];
|
||||||
|
|
||||||
@ -228,45 +228,37 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Barrel shift and collect global pencil
|
// Barrel shift and collect global pencil
|
||||||
std::vector<int> lcoor(Nd), gcoor(Nd);
|
Coordinate lcoor(Nd), gcoor(Nd);
|
||||||
result = source;
|
result = source;
|
||||||
int pc = processor_coor[dim];
|
int pc = processor_coor[dim];
|
||||||
for(int p=0;p<processors[dim];p++) {
|
for(int p=0;p<processors[dim];p++) {
|
||||||
thread_region {
|
thread_loop( (int idx=0;idx<sgrid->lSites();idx++), {
|
||||||
std::vector<int> cbuf(Nd);
|
Coordinate cbuf(Nd);
|
||||||
sobj s;
|
sobj s;
|
||||||
|
sgrid->LocalIndexToLocalCoor(idx,cbuf);
|
||||||
thread_loop_in_region( (int idx=0;idx<sgrid->lSites();idx++), {
|
peekLocalSite(s,result,cbuf);
|
||||||
sgrid->LocalIndexToLocalCoor(idx,cbuf);
|
cbuf[dim]+=((pc+p) % processors[dim])*L;
|
||||||
peekLocalSite(s,result,cbuf);
|
// cbuf[dim]+=p*L;
|
||||||
cbuf[dim]+=((pc+p) % processors[dim])*L;
|
pokeLocalSite(s,pgbuf,cbuf);
|
||||||
// cbuf[dim]+=p*L;
|
});
|
||||||
pokeLocalSite(s,pgbuf,cbuf);
|
if (p != processors[dim] - 1) {
|
||||||
} );
|
result = Cshift(result,dim,L);
|
||||||
}
|
}
|
||||||
if (p != processors[dim] - 1)
|
|
||||||
{
|
|
||||||
result = Cshift(result,dim,L);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Loop over orthog coords
|
// Loop over orthog coords
|
||||||
int NN=pencil_g.lSites();
|
int NN=pencil_g.lSites();
|
||||||
GridStopWatch timer;
|
GridStopWatch timer;
|
||||||
timer.Start();
|
timer.Start();
|
||||||
thread_region {
|
thread_loop( (int idx=0;idx<NN;idx++), {
|
||||||
|
Coordinate cbuf(Nd);
|
||||||
std::vector<int> cbuf(Nd);
|
pencil_g.LocalIndexToLocalCoor(idx, cbuf);
|
||||||
|
if ( cbuf[dim] == 0 ) { // restricts loop to plane at lcoor[dim]==0
|
||||||
thread_loop_in_region( (int idx=0;idx<NN;idx++), {
|
FFTW_scalar *in = (FFTW_scalar *)&pgbuf[idx];
|
||||||
pencil_g.LocalIndexToLocalCoor(idx, cbuf);
|
FFTW_scalar *out= (FFTW_scalar *)&pgbuf[idx];
|
||||||
if ( cbuf[dim] == 0 ) { // restricts loop to plane at lcoor[dim]==0
|
FFTW<scalar>::fftw_execute_dft(p,in,out);
|
||||||
FFTW_scalar *in = (FFTW_scalar *)&pgbuf[idx];
|
}
|
||||||
FFTW_scalar *out= (FFTW_scalar *)&pgbuf[idx];
|
});
|
||||||
FFTW<scalar>::fftw_execute_dft(p,in,out);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
timer.Stop();
|
timer.Stop();
|
||||||
|
|
||||||
// performance counting
|
// performance counting
|
||||||
@ -277,19 +269,15 @@ public:
|
|||||||
flops+= flops_call*NN;
|
flops+= flops_call*NN;
|
||||||
|
|
||||||
// writing out result
|
// writing out result
|
||||||
thread_region {
|
thread_loop( (int idx=0;idx<sgrid->lSites();idx++), {
|
||||||
|
Coordinate clbuf(Nd), cgbuf(Nd);
|
||||||
std::vector<int> clbuf(Nd), cgbuf(Nd);
|
sobj s;
|
||||||
sobj s;
|
sgrid->LocalIndexToLocalCoor(idx,clbuf);
|
||||||
|
cgbuf = clbuf;
|
||||||
thread_loop_in_region( (int idx=0;idx<sgrid->lSites();idx++), {
|
cgbuf[dim] = clbuf[dim]+L*pc;
|
||||||
sgrid->LocalIndexToLocalCoor(idx,clbuf);
|
peekLocalSite(s,pgbuf,cgbuf);
|
||||||
cgbuf = clbuf;
|
pokeLocalSite(s,result,clbuf);
|
||||||
cgbuf[dim] = clbuf[dim]+L*pc;
|
});
|
||||||
peekLocalSite(s,pgbuf,cgbuf);
|
|
||||||
pokeLocalSite(s,result,clbuf);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
result = result*div;
|
result = result*div;
|
||||||
|
|
||||||
// destroying plan
|
// destroying plan
|
||||||
|
Loading…
Reference in New Issue
Block a user