1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00

Eliminate both GPU issue and threading bottleneck by avoiding malloc in coordinate handling

This commit is contained in:
paboyle 2018-02-24 22:24:37 +00:00
parent e158b60bce
commit 4962f59477

View File

@ -115,9 +115,9 @@ private:
double flops_call; double flops_call;
uint64_t usec; uint64_t usec;
std::vector<int> dimensions; Coordinate dimensions;
std::vector<int> processors; Coordinate processors;
std::vector<int> processor_coor; Coordinate processor_coor;
public: public:
@ -137,7 +137,7 @@ public:
{ {
flops=0; flops=0;
usec =0; usec =0;
std::vector<int> layout(Nd,1); Coordinate layout(Nd,1);
sgrid = new GridCartesian(dimensions,layout,processors); sgrid = new GridCartesian(dimensions,layout,processors);
}; };
@ -146,7 +146,7 @@ public:
} }
template<class vobj> template<class vobj>
void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,std::vector<int> mask,int sign){ void FFT_dim_mask(Lattice<vobj> &result,const Lattice<vobj> &source,Coordinate mask,int sign){
conformable(result.Grid(),vgrid); conformable(result.Grid(),vgrid);
conformable(source.Grid(),vgrid); conformable(source.Grid(),vgrid);
@ -162,7 +162,7 @@ public:
template<class vobj> template<class vobj>
void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){ void FFT_all_dim(Lattice<vobj> &result,const Lattice<vobj> &source,int sign){
std::vector<int> mask(Nd,1); Coordinate mask(Nd,1);
FFT_dim_mask(result,source,mask,sign); FFT_dim_mask(result,source,mask,sign);
} }
@ -178,8 +178,8 @@ public:
int L = vgrid->_ldimensions[dim]; int L = vgrid->_ldimensions[dim];
int G = vgrid->_fdimensions[dim]; int G = vgrid->_fdimensions[dim];
std::vector<int> layout(Nd,1); Coordinate layout(Nd,1);
std::vector<int> pencil_gd(vgrid->_fdimensions); Coordinate pencil_gd(vgrid->_fdimensions);
pencil_gd[dim] = G*processors[dim]; pencil_gd[dim] = G*processors[dim];
@ -228,45 +228,37 @@ public:
} }
// Barrel shift and collect global pencil // Barrel shift and collect global pencil
std::vector<int> lcoor(Nd), gcoor(Nd); Coordinate lcoor(Nd), gcoor(Nd);
result = source; result = source;
int pc = processor_coor[dim]; int pc = processor_coor[dim];
for(int p=0;p<processors[dim];p++) { for(int p=0;p<processors[dim];p++) {
thread_region { thread_loop( (int idx=0;idx<sgrid->lSites();idx++), {
std::vector<int> cbuf(Nd); Coordinate cbuf(Nd);
sobj s; sobj s;
sgrid->LocalIndexToLocalCoor(idx,cbuf);
thread_loop_in_region( (int idx=0;idx<sgrid->lSites();idx++), { peekLocalSite(s,result,cbuf);
sgrid->LocalIndexToLocalCoor(idx,cbuf); cbuf[dim]+=((pc+p) % processors[dim])*L;
peekLocalSite(s,result,cbuf); // cbuf[dim]+=p*L;
cbuf[dim]+=((pc+p) % processors[dim])*L; pokeLocalSite(s,pgbuf,cbuf);
// cbuf[dim]+=p*L; });
pokeLocalSite(s,pgbuf,cbuf); if (p != processors[dim] - 1) {
} ); result = Cshift(result,dim,L);
} }
if (p != processors[dim] - 1)
{
result = Cshift(result,dim,L);
}
} }
// Loop over orthog coords // Loop over orthog coords
int NN=pencil_g.lSites(); int NN=pencil_g.lSites();
GridStopWatch timer; GridStopWatch timer;
timer.Start(); timer.Start();
thread_region { thread_loop( (int idx=0;idx<NN;idx++), {
Coordinate cbuf(Nd);
std::vector<int> cbuf(Nd); pencil_g.LocalIndexToLocalCoor(idx, cbuf);
if ( cbuf[dim] == 0 ) { // restricts loop to plane at lcoor[dim]==0
thread_loop_in_region( (int idx=0;idx<NN;idx++), { FFTW_scalar *in = (FFTW_scalar *)&pgbuf[idx];
pencil_g.LocalIndexToLocalCoor(idx, cbuf); FFTW_scalar *out= (FFTW_scalar *)&pgbuf[idx];
if ( cbuf[dim] == 0 ) { // restricts loop to plane at lcoor[dim]==0 FFTW<scalar>::fftw_execute_dft(p,in,out);
FFTW_scalar *in = (FFTW_scalar *)&pgbuf[idx]; }
FFTW_scalar *out= (FFTW_scalar *)&pgbuf[idx]; });
FFTW<scalar>::fftw_execute_dft(p,in,out);
}
});
}
timer.Stop(); timer.Stop();
// performance counting // performance counting
@ -277,19 +269,15 @@ public:
flops+= flops_call*NN; flops+= flops_call*NN;
// writing out result // writing out result
thread_region { thread_loop( (int idx=0;idx<sgrid->lSites();idx++), {
Coordinate clbuf(Nd), cgbuf(Nd);
std::vector<int> clbuf(Nd), cgbuf(Nd); sobj s;
sobj s; sgrid->LocalIndexToLocalCoor(idx,clbuf);
cgbuf = clbuf;
thread_loop_in_region( (int idx=0;idx<sgrid->lSites();idx++), { cgbuf[dim] = clbuf[dim]+L*pc;
sgrid->LocalIndexToLocalCoor(idx,clbuf); peekLocalSite(s,pgbuf,cgbuf);
cgbuf = clbuf; pokeLocalSite(s,result,clbuf);
cgbuf[dim] = clbuf[dim]+L*pc; });
peekLocalSite(s,pgbuf,cgbuf);
pokeLocalSite(s,result,clbuf);
});
}
result = result*div; result = result*div;
// destroying plan // destroying plan