mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
Fixed the stencil sector; Wilson now agrees between the stencil-based implementation
and the cshift-based implementation. Managed to reduce the volume of code in this sector a little, but further consolidation would be good, perhaps by factoring common logic out into simple helper functions.
This commit is contained in:
parent
b0485894b3
commit
dcc23faa4a
@ -39,13 +39,6 @@
|
|||||||
|
|
||||||
namespace Grid {
|
namespace Grid {
|
||||||
|
|
||||||
struct CommsRequest {
|
|
||||||
int words;
|
|
||||||
int unified_buffer_offset;
|
|
||||||
int tag;
|
|
||||||
int to_rank;
|
|
||||||
int from_rank;
|
|
||||||
} ;
|
|
||||||
|
|
||||||
|
|
||||||
class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
|
class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
|
||||||
@ -69,7 +62,6 @@ namespace Grid {
|
|||||||
int _unified_buffer_size;
|
int _unified_buffer_size;
|
||||||
int _request_count;
|
int _request_count;
|
||||||
|
|
||||||
std::vector<CommsRequest> CommsRequests;
|
|
||||||
|
|
||||||
CartesianStencil(GridBase *grid,
|
CartesianStencil(GridBase *grid,
|
||||||
int npoints,
|
int npoints,
|
||||||
@ -90,7 +82,6 @@ namespace Grid {
|
|||||||
template<class vobj,class cobj, class compressor> void
|
template<class vobj,class cobj, class compressor> void
|
||||||
HaloExchange(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
|
HaloExchange(const Lattice<vobj> &source,std::vector<cobj,alignedAllocator<cobj> > &u_comm_buf,compressor &compress)
|
||||||
{
|
{
|
||||||
std::cout<< "HaloExchange comm_buf.size()="<< u_comm_buf.size()<<" unified_buffer_size"<< _unified_buffer_size<< std::endl;
|
|
||||||
// conformable(source._grid,_grid);
|
// conformable(source._grid,_grid);
|
||||||
assert(source._grid==_grid);
|
assert(source._grid==_grid);
|
||||||
if (u_comm_buf.size() != _unified_buffer_size ) u_comm_buf.resize(_unified_buffer_size);
|
if (u_comm_buf.size() != _unified_buffer_size ) u_comm_buf.resize(_unified_buffer_size);
|
||||||
@ -141,7 +132,6 @@ namespace Grid {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
std::cout<< "HaloExchange complete"<< std::endl;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class vobj,class cobj, class compressor>
|
template<class vobj,class cobj, class compressor>
|
||||||
@ -194,24 +184,18 @@ namespace Grid {
|
|||||||
_grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
_grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
|
||||||
assert (xmit_to_rank != _grid->ThisRank());
|
assert (xmit_to_rank != _grid->ThisRank());
|
||||||
assert (recv_from_rank != _grid->ThisRank());
|
assert (recv_from_rank != _grid->ThisRank());
|
||||||
|
|
||||||
// FIXME Implement asynchronous send & also avoid buffer copy
|
// FIXME Implement asynchronous send & also avoid buffer copy
|
||||||
_grid->SendToRecvFrom((void *)&send_buf[0],
|
_grid->SendToRecvFrom((void *)&send_buf[0],
|
||||||
xmit_to_rank,
|
xmit_to_rank,
|
||||||
(void *)&recv_buf[0],
|
(void *)&recv_buf[0],
|
||||||
recv_from_rank,
|
recv_from_rank,
|
||||||
bytes);
|
bytes);
|
||||||
printf("GatherStartComms communicated offnode x %d\n",x);fflush(stdout);
|
|
||||||
|
|
||||||
printf("GatherStartComms inserting %le to u_comm_offset %d buf size %d for dim %d shift %d\n",
|
|
||||||
*( (RealF *) &recv_buf[0]),
|
|
||||||
u_comm_offset,buffer_size,
|
|
||||||
dimension,shift
|
|
||||||
); fflush(stdout);
|
|
||||||
for(int i=0;i<buffer_size;i++){
|
for(int i=0;i<buffer_size;i++){
|
||||||
u_comm_buf[u_comm_offset+i]=recv_buf[i];
|
u_comm_buf[u_comm_offset+i]=recv_buf[i];
|
||||||
}
|
}
|
||||||
u_comm_offset+=buffer_size;
|
u_comm_offset+=buffer_size;
|
||||||
printf("GatherStartComms inserted x %d\n",x);fflush(stdout);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -248,7 +232,7 @@ namespace Grid {
|
|||||||
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
|
int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension];
|
||||||
int words = sizeof(cobj)/sizeof(vector_type);
|
int words = sizeof(cobj)/sizeof(vector_type);
|
||||||
|
|
||||||
/* FIXME ALTERNATE BUFFER DETERMINATION */
|
/* FIXME ALTERNATE BUFFER DETERMINATION ; possibly slow to allocate*/
|
||||||
std::vector<std::vector<scalar_object> > send_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
|
std::vector<std::vector<scalar_object> > send_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
|
||||||
std::vector<std::vector<scalar_object> > recv_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
|
std::vector<std::vector<scalar_object> > recv_buf_extract(Nsimd,std::vector<scalar_object>(buffer_size) );
|
||||||
int bytes = buffer_size*sizeof(scalar_object);
|
int bytes = buffer_size*sizeof(scalar_object);
|
||||||
@ -267,25 +251,21 @@ namespace Grid {
|
|||||||
for(int x=0;x<rd;x++){
|
for(int x=0;x<rd;x++){
|
||||||
|
|
||||||
int any_offnode = ( ((x+sshift)%fd) >= rd );
|
int any_offnode = ( ((x+sshift)%fd) >= rd );
|
||||||
std::cout<<"any_offnode ="<<any_offnode<<std::endl;
|
|
||||||
if ( any_offnode ) {
|
if ( any_offnode ) {
|
||||||
// FIXME call local permute copy if none are offnode.
|
|
||||||
for(int i=0;i<Nsimd;i++){
|
for(int i=0;i<Nsimd;i++){
|
||||||
pointers[i] = &send_buf_extract[i][0];
|
pointers[i] = &send_buf_extract[i][0];
|
||||||
}
|
}
|
||||||
int sx = (x+sshift)%rd;
|
int sx = (x+sshift)%rd;
|
||||||
|
|
||||||
std::cout<< "Gathering "<< x <<std::endl;
|
|
||||||
Gather_plane_extract<cobj>(rhs,pointers,dimension,sx,cbmask,compress);
|
Gather_plane_extract<cobj>(rhs,pointers,dimension,sx,cbmask,compress);
|
||||||
std::cout<< "Gathered "<<std::endl;
|
|
||||||
for(int i=0;i<Nsimd;i++){
|
for(int i=0;i<Nsimd;i++){
|
||||||
|
|
||||||
std::vector<int> icoor;
|
|
||||||
_grid->iCoorFromIindex(icoor,i);
|
|
||||||
|
|
||||||
int inner_bit = (Nsimd>>(permute_type+1));
|
int inner_bit = (Nsimd>>(permute_type+1));
|
||||||
int ic= (i&inner_bit)? 1:0;
|
int ic= (i&inner_bit)? 1:0;
|
||||||
assert(ic==icoor[dimension]);
|
|
||||||
|
|
||||||
int my_coor = rd*ic + x;
|
int my_coor = rd*ic + x;
|
||||||
int nbr_coor = my_coor+sshift;
|
int nbr_coor = my_coor+sshift;
|
||||||
@ -301,12 +281,9 @@ namespace Grid {
|
|||||||
if (nbr_ic) nbr_lane|=inner_bit;
|
if (nbr_ic) nbr_lane|=inner_bit;
|
||||||
assert (sx == nbr_ox);
|
assert (sx == nbr_ox);
|
||||||
|
|
||||||
std::cout<<"nbr_proc "<<nbr_proc<< " x "<<x<<" nbr_x "<<nbr_ox << " lane "<<i << " nbr_lane "<<nbr_lane
|
|
||||||
<< " nbr_ic "<<nbr_ic << " mycoor "<< my_coor<< " nbr_coor "<<nbr_coor<<std::endl;
|
|
||||||
|
|
||||||
if(nbr_proc){
|
if(nbr_proc){
|
||||||
|
|
||||||
std::cout<< "MPI sending "<<std::endl;
|
|
||||||
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
_grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
|
||||||
|
|
||||||
_grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0],
|
_grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0],
|
||||||
@ -314,23 +291,20 @@ namespace Grid {
|
|||||||
(void *)&recv_buf_extract[i][0],
|
(void *)&recv_buf_extract[i][0],
|
||||||
recv_from_rank,
|
recv_from_rank,
|
||||||
bytes);
|
bytes);
|
||||||
std::cout<< "MPI complete "<<std::endl;
|
|
||||||
|
|
||||||
rpointers[i] = &recv_buf_extract[i][0];
|
rpointers[i] = &recv_buf_extract[i][0];
|
||||||
std::cout<<"lane "<<i<<" data "<<*( (Real *) rpointers[i])<<std::endl;
|
|
||||||
} else {
|
} else {
|
||||||
rpointers[i] = &send_buf_extract[nbr_lane][0];
|
rpointers[i] = &send_buf_extract[nbr_lane][0];
|
||||||
std::cout<<"lane "<<i<<" data "<<*( (Real *) rpointers[i])<<std::endl;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Here we don't want to scatter, just place into a buffer.
|
// Here we don't want to scatter, just place into a buffer.
|
||||||
std::cout<< "merging u_comm_offset "<< u_comm_offset<<" comm_buf_size" << u_comm_buf.size() <<std::endl;
|
|
||||||
|
|
||||||
for(int i=0;i<buffer_size;i++){
|
for(int i=0;i<buffer_size;i++){
|
||||||
assert(u_comm_offset+i<_unified_buffer_size);
|
assert(u_comm_offset+i<_unified_buffer_size);
|
||||||
merge(u_comm_buf[u_comm_offset+i],rpointers,i);
|
merge(u_comm_buf[u_comm_offset+i],rpointers,i);
|
||||||
}
|
}
|
||||||
|
|
||||||
u_comm_offset+=buffer_size;
|
u_comm_offset+=buffer_size;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -24,7 +24,6 @@ const int WilsonMatrix::Tm = 7;
|
|||||||
|
|
||||||
void Point(int p) {
|
void Point(int p) {
|
||||||
mu=p;
|
mu=p;
|
||||||
std::cout << "WilsonCompressor.Point " << mu<<std::endl;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
vHalfSpinColourVector operator () (const vSpinColourVector &in)
|
vHalfSpinColourVector operator () (const vSpinColourVector &in)
|
||||||
@ -193,7 +192,6 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
|
|||||||
chi_p = &tmp;
|
chi_p = &tmp;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
std::cout<<"Xm for site "<<ss<<" l "<<local<<" p "<<perm<<" chi "<<Reduce(TensorRemove(innerProduct(*chi_p,*chi_p)))<<std::endl;
|
|
||||||
mult(&(Uchi()),&(Umu._odata[ss](Xm)),&(*chi_p)());
|
mult(&(Uchi()),&(Umu._odata[ss](Xm)),&(*chi_p)());
|
||||||
accumReconXm(result,Uchi);
|
accumReconXm(result,Uchi);
|
||||||
|
|
||||||
|
@ -19,7 +19,6 @@ namespace Grid {
|
|||||||
_distances = distances;
|
_distances = distances;
|
||||||
_unified_buffer_size=0;
|
_unified_buffer_size=0;
|
||||||
_request_count =0;
|
_request_count =0;
|
||||||
CommsRequests.resize(0);
|
|
||||||
|
|
||||||
int osites = _grid->oSites();
|
int osites = _grid->oSites();
|
||||||
|
|
||||||
@ -117,6 +116,7 @@ namespace Grid {
|
|||||||
GridBase *grid=_grid;
|
GridBase *grid=_grid;
|
||||||
|
|
||||||
int fd = _grid->_fdimensions[dimension];
|
int fd = _grid->_fdimensions[dimension];
|
||||||
|
int ld = _grid->_ldimensions[dimension];
|
||||||
int rd = _grid->_rdimensions[dimension];
|
int rd = _grid->_rdimensions[dimension];
|
||||||
int pd = _grid->_processors[dimension];
|
int pd = _grid->_processors[dimension];
|
||||||
int simd_layout = _grid->_simd_layout[dimension];
|
int simd_layout = _grid->_simd_layout[dimension];
|
||||||
@ -137,9 +137,10 @@ namespace Grid {
|
|||||||
|
|
||||||
for(int x=0;x<rd;x++){
|
for(int x=0;x<rd;x++){
|
||||||
|
|
||||||
int comm_proc = ((x+sshift)/rd)%pd;
|
int offnode = (((x+sshift)%fd) >= rd );
|
||||||
int offnode = (comm_proc!=0);
|
// int comm_proc = ((x+sshift)/ld)%pd;
|
||||||
int sx = (x+sshift)%rd;
|
// int offnode = (comm_proc!=0);
|
||||||
|
int sx = (x+sshift)%rd;
|
||||||
|
|
||||||
if (!offnode) {
|
if (!offnode) {
|
||||||
|
|
||||||
@ -157,17 +158,9 @@ namespace Grid {
|
|||||||
int recv_from_rank;
|
int recv_from_rank;
|
||||||
int xmit_to_rank;
|
int xmit_to_rank;
|
||||||
|
|
||||||
CommsRequest cr;
|
int unified_buffer_offset = _unified_buffer_size;
|
||||||
|
|
||||||
cr.tag = _request_count++;
|
|
||||||
cr.words = words;
|
|
||||||
cr.unified_buffer_offset = _unified_buffer_size;
|
|
||||||
_unified_buffer_size += words;
|
_unified_buffer_size += words;
|
||||||
grid->ShiftedRanks(dimension,comm_proc,cr.to_rank,cr.from_rank);
|
ScatterPlane(point,dimension,x,cbmask,unified_buffer_offset); // permute/extract/merge is done in comms phase
|
||||||
|
|
||||||
CommsRequests.push_back(cr);
|
|
||||||
|
|
||||||
ScatterPlane(point,dimension,x,cbmask,cr.unified_buffer_offset); // permute/extract/merge is done in comms phase
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -21,7 +21,7 @@ int main (int argc, char ** argv)
|
|||||||
Grid_init(&argc,&argv);
|
Grid_init(&argc,&argv);
|
||||||
|
|
||||||
std::vector<int> simd_layout({1,1,2,2});
|
std::vector<int> simd_layout({1,1,2,2});
|
||||||
std::vector<int> mpi_layout ({2,1,1,2});
|
std::vector<int> mpi_layout ({2,2,2,2});
|
||||||
std::vector<int> latt_size ({8,8,8,8});
|
std::vector<int> latt_size ({8,8,8,8});
|
||||||
|
|
||||||
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
||||||
@ -76,7 +76,8 @@ int main (int argc, char ** argv)
|
|||||||
std::cout << "norm result "<< norm2(result)<<std::endl;
|
std::cout << "norm result "<< norm2(result)<<std::endl;
|
||||||
std::cout << "norm ref "<< norm2(ref)<<std::endl;
|
std::cout << "norm ref "<< norm2(ref)<<std::endl;
|
||||||
|
|
||||||
for(int ss=0;ss<10;ss++ ){
|
// for(int ss=0;ss<10;ss++ ){
|
||||||
|
for(int ss=0;ss<0;ss++ ){
|
||||||
for(int i=0;i<Ns;i++){
|
for(int i=0;i<Ns;i++){
|
||||||
for(int j=0;j<Nc;j++){
|
for(int j=0;j<Nc;j++){
|
||||||
ComplexF * ref_p = (ComplexF *)&ref._odata[ss]()(i)(j);
|
ComplexF * ref_p = (ComplexF *)&ref._odata[ss]()(i)(j);
|
||||||
|
Loading…
Reference in New Issue
Block a user