mirror of
https://github.com/paboyle/Grid.git
synced 2024-12-23 19:35:26 +00:00
Pushed the overlap comms tweaks
This commit is contained in:
parent
dafc74020c
commit
fc6ad65751
@ -116,7 +116,8 @@ int main (int argc, char ** argv)
|
||||
|
||||
typename DomainWallFermionR::ImplParams params;
|
||||
params.overlapCommsCompute = overlapComms;
|
||||
|
||||
|
||||
RealD NP = UGrid->_Nprocessors;
|
||||
|
||||
DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
|
||||
|
||||
@ -136,6 +137,7 @@ int main (int argc, char ** argv)
|
||||
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
||||
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NP<<std::endl;
|
||||
err = ref-result;
|
||||
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
||||
Dw.Report();
|
||||
@ -193,6 +195,7 @@ int main (int argc, char ** argv)
|
||||
double flops=(1344.0*volume*ncall)/2;
|
||||
|
||||
std::cout<<GridLogMessage << "Deo mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||
std::cout<<GridLogMessage << "Deo mflop/s per node "<< flops/(t1-t0)/NP<<std::endl;
|
||||
}
|
||||
|
||||
Dw.DhopEO(src_o,r_e,DaggerNo);
|
||||
|
@ -96,6 +96,7 @@ namespace Grid {
|
||||
Integer to_rank;
|
||||
Integer from_rank;
|
||||
Integer bytes;
|
||||
volatile Integer done;
|
||||
};
|
||||
|
||||
std::vector<Packet> Packets;
|
||||
@ -107,6 +108,8 @@ namespace Grid {
|
||||
p.to_rank = to;
|
||||
p.from_rank= from;
|
||||
p.bytes = bytes;
|
||||
p.done = 0;
|
||||
comms_bytes+=2.0*bytes;
|
||||
Packets.push_back(p);
|
||||
}
|
||||
|
||||
@ -118,6 +121,7 @@ namespace Grid {
|
||||
Packets[i].recv_buf,
|
||||
Packets[i].from_rank,
|
||||
Packets[i].bytes);
|
||||
Packets[i].done = 1;
|
||||
}
|
||||
commtime+=usecond();
|
||||
}
|
||||
@ -129,27 +133,38 @@ namespace Grid {
|
||||
cobj * mpointer;
|
||||
std::vector<scalar_object *> rpointers;
|
||||
Integer buffer_size;
|
||||
Integer packet_id;
|
||||
};
|
||||
|
||||
std::vector<Merge> Mergers;
|
||||
|
||||
void AddMerge(cobj *merge_p,std::vector<scalar_object *> &rpointers,Integer buffer_size) {
|
||||
void AddMerge(cobj *merge_p,std::vector<scalar_object *> &rpointers,Integer buffer_size,Integer packet_id) {
|
||||
Merge m;
|
||||
m.mpointer = merge_p;
|
||||
m.rpointers= rpointers;
|
||||
m.buffer_size = buffer_size;
|
||||
m.packet_id = packet_id;
|
||||
Mergers.push_back(m);
|
||||
}
|
||||
|
||||
void CommsMerge(void ) {
|
||||
mergetime-=usecond();
|
||||
//PARALLEL_NESTED_LOOP2
|
||||
for(int i=0;i<Mergers.size();i++){
|
||||
|
||||
|
||||
spintime-=usecond();
|
||||
int packet_id = Mergers[i].packet_id;
|
||||
while(! Packets[packet_id].done ); // spin for completion
|
||||
spintime+=usecond();
|
||||
|
||||
mergetime-=usecond();
|
||||
PARALLEL_FOR_LOOP
|
||||
for(int o=0;o<Mergers[i].buffer_size;o++){
|
||||
merge(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
|
||||
merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
|
||||
}
|
||||
}
|
||||
mergetime+=usecond();
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////
|
||||
@ -188,6 +203,8 @@ PARALLEL_FOR_LOOP
|
||||
double commtime;
|
||||
double halogtime;
|
||||
double mergetime;
|
||||
double spintime;
|
||||
double comms_bytes;
|
||||
double gathermtime;
|
||||
double splicetime;
|
||||
double nosplicetime;
|
||||
@ -206,9 +223,11 @@ PARALLEL_FOR_LOOP
|
||||
commtime=0;
|
||||
halogtime=0;
|
||||
mergetime=0;
|
||||
spintime=0;
|
||||
gathermtime=0;
|
||||
splicetime=0;
|
||||
nosplicetime=0;
|
||||
comms_bytes=0;
|
||||
#endif
|
||||
_npoints = npoints;
|
||||
_grid = grid;
|
||||
@ -218,8 +237,9 @@ PARALLEL_FOR_LOOP
|
||||
|
||||
int osites = _grid->oSites();
|
||||
|
||||
for(int i=0;i<npoints;i++){
|
||||
for(int ii=0;ii<npoints;ii++){
|
||||
|
||||
int i = ii; // reverse direction to get SIMD comms done first
|
||||
int point = i;
|
||||
|
||||
_entries[i].resize( osites);
|
||||
@ -512,10 +532,10 @@ PARALLEL_FOR_LOOP
|
||||
|
||||
void HaloExchangeComplete(std::thread &thr)
|
||||
{
|
||||
CommsMerge(); // spins
|
||||
jointime-=usecond();
|
||||
thr.join();
|
||||
jointime+=usecond();
|
||||
CommsMerge();
|
||||
}
|
||||
|
||||
void HaloGather(const Lattice<vobj> &source,compressor &compress)
|
||||
@ -750,7 +770,7 @@ PARALLEL_FOR_LOOP
|
||||
}
|
||||
}
|
||||
|
||||
AddMerge(&comm_buf[u_comm_offset],rpointers,buffer_size);
|
||||
AddMerge(&comm_buf[u_comm_offset],rpointers,buffer_size,Packets.size()-1);
|
||||
|
||||
u_comm_offset +=buffer_size;
|
||||
}
|
||||
|
@ -249,7 +249,9 @@ void WilsonFermion5D<Impl>::Report(void)
|
||||
std::cout<<GridLogMessage << "Stencil gather "<<Stencil.gathertime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil gather simd "<<Stencil.gathermtime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil merge simd "<<Stencil.mergetime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil spin simd "<<Stencil.spintime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "********************"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil MB/s "<<(double)Stencil.comms_bytes/Stencil.commtime<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil comm time "<<Stencil.commtime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "Stencil join time "<<Stencil.jointime<<" us"<<std::endl;
|
||||
std::cout<<GridLogMessage << "********************"<<std::endl;
|
||||
@ -425,6 +427,8 @@ void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, Le
|
||||
// assert((dag==DaggerNo) ||(dag==DaggerYes));
|
||||
alltime-=usecond();
|
||||
|
||||
int calls;
|
||||
int updates;
|
||||
Compressor compressor(dag);
|
||||
|
||||
// Assume balanced KMP_AFFINITY; this is forced in GridThread.h
|
||||
|
@ -62,13 +62,13 @@ namespace Grid {
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
|
||||
|
||||
void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
|
||||
|
||||
void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
|
||||
int DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
|
||||
|
||||
int DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true);
|
||||
|
||||
WilsonKernels(const ImplParams &p= ImplParams());
|
||||
|
||||
|
@ -310,7 +310,7 @@ namespace QCD {
|
||||
|
||||
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
|
||||
{
|
||||
@ -385,6 +385,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
|
||||
if ( SE->_permute ) {
|
||||
PERMUTE_DIR(3); // T==0, Z==1, Y==2, X==3 expect 1,2,2,2 simd layout etc...
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if ( Nonlocal && (!SE->_is_local) ) {
|
||||
@ -397,7 +398,6 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
|
||||
num++;
|
||||
}
|
||||
|
||||
|
||||
// Yp
|
||||
SE=st.GetEntry(ptype,Yp,ss);
|
||||
offset = SE->_offset;
|
||||
@ -556,6 +556,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
|
||||
vstream(ref()(3)(0),result_30*(-0.5));
|
||||
vstream(ref()(3)(1),result_31*(-0.5));
|
||||
vstream(ref()(3)(2),result_32*(-0.5));
|
||||
return 1;
|
||||
} else if ( num ) {
|
||||
vstream(ref()(0)(0),ref()(0)(0)+result_00*(-0.5));
|
||||
vstream(ref()(0)(1),ref()(0)(1)+result_01*(-0.5));
|
||||
@ -569,14 +570,16 @@ void WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF
|
||||
vstream(ref()(3)(0),ref()(3)(0)+result_30*(-0.5));
|
||||
vstream(ref()(3)(1),ref()(3)(1)+result_31*(-0.5));
|
||||
vstream(ref()(3)(2),ref()(3)(2)+result_32*(-0.5));
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
template<class Impl>
|
||||
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
|
||||
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
|
||||
int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
|
||||
{
|
||||
@ -822,6 +825,7 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeFiel
|
||||
vstream(ref()(3)(0),result_30*(-0.5));
|
||||
vstream(ref()(3)(1),result_31*(-0.5));
|
||||
vstream(ref()(3)(2),result_32*(-0.5));
|
||||
return 1;
|
||||
} else if ( num ) {
|
||||
vstream(ref()(0)(0),ref()(0)(0)+result_00*(-0.5));
|
||||
vstream(ref()(0)(1),ref()(0)(1)+result_01*(-0.5));
|
||||
@ -835,7 +839,9 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeFiel
|
||||
vstream(ref()(3)(0),ref()(3)(0)+result_30*(-0.5));
|
||||
vstream(ref()(3)(1),ref()(3)(1)+result_31*(-0.5));
|
||||
vstream(ref()(3)(2),ref()(3)(2)+result_32*(-0.5));
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -217,5 +217,50 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Interleave per-lane scalar objects into one SIMD-vectorised object,
// copying lane by lane (outer loop over lanes, inner loop over words).
//
// `vec` is addressed as a flat array of scalar_type in which word w of SIMD
// lane i sits at index w*Nsimd + i.  For every lane i the first `words`
// scalar entries of extracted[i][offset] are copied into that lane, so
// words*Nsimd scalar entries of `vec` are written in total.
//
//   vec       : destination object.
//   extracted : one buffer pointer per SIMD lane; entries 0..Nsimd-1 are
//               dereferenced, so the caller must supply at least Nsimd
//               pointers, each valid at index `offset`.
//   offset    : index of the scalar object to read inside each lane's buffer.
//
// NOTE(review): the indexing presumes sizeof(vector_type) ==
// Nsimd*sizeof(scalar_type) and that scalar_object holds at least `words`
// contiguous scalar_type entries - confirm against the tensor definitions.
//
// merge2 performs the same copy with the two loops interchanged.
template<class vobj> inline
void merge1(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset)
{
  typedef typename vobj::scalar_type scalar_type ;
  typedef typename vobj::vector_type vector_type ;

  const int Nsimd=vobj::vector_type::Nsimd();
  // Number of vector_type elements in one vobj, i.e. scalar words per lane.
  const int words=sizeof(vobj)/sizeof(vector_type);

  scalar_type *vp = reinterpret_cast<scalar_type *>(&vec);

  //  assert( (((uint64_t)vp)&(sizeof(scalar_type)-1)) == 0);

  for(int i=0;i<Nsimd;i++){
    // Source object for lane i, viewed as a flat run of scalar words.
    scalar_type *pointer = reinterpret_cast<scalar_type *>(&extracted[i][offset]);
    for(int w=0;w<words;w++){
      vp[w*Nsimd+i] = pointer[w];
    }
  }
}
|
||||
|
||||
// Interleave per-lane scalar objects into one SIMD-vectorised object,
// copying word by word (outer loop over words, inner loop over lanes).
//
// `vec` is addressed as a flat array of scalar_type in which word w of SIMD
// lane i sits at index w*Nsimd + i.  For every word w, entry w of
// extracted[i][offset] is copied into lane i, so words*Nsimd scalar entries
// of `vec` are written in total and consecutive writes touch consecutive
// destination indices.
//
//   vec       : destination object.
//   extracted : one buffer pointer per SIMD lane; entries 0..Nsimd-1 are
//               dereferenced, so the caller must supply at least Nsimd
//               pointers, each valid at index `offset`.
//   offset    : index of the scalar object to read inside each lane's buffer.
//
// NOTE(review): the indexing presumes sizeof(vector_type) ==
// Nsimd*sizeof(scalar_type) and that scalar_object holds at least `words`
// contiguous scalar_type entries - confirm against the tensor definitions.
//
// merge1 performs the same copy with the two loops interchanged.
template<class vobj> inline
void merge2(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset)
{
  typedef typename vobj::scalar_type scalar_type ;
  typedef typename vobj::vector_type vector_type ;

  const int Nsimd=vobj::vector_type::Nsimd();
  // Number of vector_type elements in one vobj, i.e. scalar words per lane.
  const int words=sizeof(vobj)/sizeof(vector_type);

  scalar_type *vp = reinterpret_cast<scalar_type *>(&vec);
  //  assert( (((uint64_t)vp)&(sizeof(scalar_type)-1)) == 0);

  for(int w=0;w<words;w++){
    for(int i=0;i<Nsimd;i++){
      // Source object for lane i, viewed as a flat run of scalar words.
      scalar_type *pointer = reinterpret_cast<scalar_type *>(&extracted[i][offset]);
      vp[w*Nsimd+i] = pointer[w];
    }
  }
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user