From fc6ad657514c7966291c19f22af89de5d5a96f93 Mon Sep 17 00:00:00 2001 From: paboyle Date: Mon, 11 Jan 2016 06:34:22 -0800 Subject: [PATCH] Pushed the overlap comms tweaks --- benchmarks/Benchmark_dwf.cc | 5 ++- lib/Stencil.h | 34 ++++++++++++---- lib/qcd/action/fermion/WilsonFermion5D.cc | 4 ++ lib/qcd/action/fermion/WilsonKernels.h | 14 +++---- lib/qcd/action/fermion/WilsonKernelsHand.cc | 12 ++++-- lib/tensors/Tensor_extract_merge.h | 45 +++++++++++++++++++++ 6 files changed, 96 insertions(+), 18 deletions(-) diff --git a/benchmarks/Benchmark_dwf.cc b/benchmarks/Benchmark_dwf.cc index 9bd20c7a..c1033a1c 100644 --- a/benchmarks/Benchmark_dwf.cc +++ b/benchmarks/Benchmark_dwf.cc @@ -116,7 +116,8 @@ int main (int argc, char ** argv) typename DomainWallFermionR::ImplParams params; params.overlapCommsCompute = overlapComms; - + + RealD NP = UGrid->_Nprocessors; DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params); @@ -136,6 +137,7 @@ int main (int argc, char ** argv) std::cout< Packets; @@ -107,6 +108,8 @@ namespace Grid { p.to_rank = to; p.from_rank= from; p.bytes = bytes; + p.done = 0; + comms_bytes+=2.0*bytes; Packets.push_back(p); } @@ -118,6 +121,7 @@ namespace Grid { Packets[i].recv_buf, Packets[i].from_rank, Packets[i].bytes); + Packets[i].done = 1; } commtime+=usecond(); } @@ -129,27 +133,38 @@ namespace Grid { cobj * mpointer; std::vector rpointers; Integer buffer_size; + Integer packet_id; }; std::vector Mergers; - void AddMerge(cobj *merge_p,std::vector &rpointers,Integer buffer_size) { + void AddMerge(cobj *merge_p,std::vector &rpointers,Integer buffer_size,Integer packet_id) { Merge m; m.mpointer = merge_p; m.rpointers= rpointers; m.buffer_size = buffer_size; + m.packet_id = packet_id; Mergers.push_back(m); } void CommsMerge(void ) { - mergetime-=usecond(); + //PARALLEL_NESTED_LOOP2 for(int i=0;ioSites(); - for(int i=0;i &source,compressor &compress) @@ -750,7 +770,7 @@ PARALLEL_FOR_LOOP } } - AddMerge(&comm_buf[u_comm_offset],rpointers,buffer_size); + AddMerge(&comm_buf[u_comm_offset],rpointers,buffer_size,Packets.size()-1); u_comm_offset +=buffer_size; } diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 0bffbb44..c9982e4a 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -249,7 +249,9 @@ void WilsonFermion5D::Report(void) std::cout<::DhopInternalCommsOverlapCompute(StencilImpl & st, Le // assert((dag==DaggerNo) ||(dag==DaggerYes)); alltime-=usecond(); + int calls; + int updates; Compressor compressor(dag); // Assume balanced KMP_AFFINITY; this is forced in GridThread.h diff --git a/lib/qcd/action/fermion/WilsonKernels.h b/lib/qcd/action/fermion/WilsonKernels.h index bec4aaa5..b7698730 100644 --- a/lib/qcd/action/fermion/WilsonKernels.h +++ b/lib/qcd/action/fermion/WilsonKernels.h @@ -62,13 +62,13 @@ namespace Grid { std::vector > &buf, int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true); - void DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, - std::vector > &buf, - int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true); - - void DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, - std::vector > &buf, - int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true); + int DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, + std::vector > &buf, + int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true); + + int DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, + std::vector > &buf, + int sF,int sU,const FermionField &in, FermionField &out,bool local= true, bool nonlocal=true); WilsonKernels(const ImplParams &p= ImplParams()); diff --git a/lib/qcd/action/fermion/WilsonKernelsHand.cc b/lib/qcd/action/fermion/WilsonKernelsHand.cc index 99eb8224..d3d0a6bc 100644 --- a/lib/qcd/action/fermion/WilsonKernelsHand.cc +++ b/lib/qcd/action/fermion/WilsonKernelsHand.cc @@ -310,7 +310,7 @@ namespace QCD { template -void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, +int WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal) { @@ -385,6 +385,7 @@ void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF if ( SE->_permute ) { PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc... } + } if ( Nonlocal && (!SE->_is_local) ) { @@ -397,7 +398,6 @@ void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF num++; } - // Yp SE=st.GetEntry(ptype,Yp,ss); offset = SE->_offset; @@ -556,6 +556,7 @@ void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF vstream(ref()(3)(0),result_30*(-0.5)); vstream(ref()(3)(1),result_31*(-0.5)); vstream(ref()(3)(2),result_32*(-0.5)); + return 1; } else if ( num ) { vstream(ref()(0)(0),ref()(0)(0)+result_00*(-0.5)); vstream(ref()(0)(1),ref()(0)(1)+result_01*(-0.5)); @@ -569,14 +570,16 @@ void WilsonKernels::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeF vstream(ref()(3)(0),ref()(3)(0)+result_30*(-0.5)); vstream(ref()(3)(1),ref()(3)(1)+result_31*(-0.5)); vstream(ref()(3)(2),ref()(3)(2)+result_32*(-0.5)); + return 1; } + return 0; } template -void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, +int WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, std::vector > &buf, int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal) { @@ -822,6 +825,7 @@ void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeFiel vstream(ref()(3)(0),result_30*(-0.5)); vstream(ref()(3)(1),result_31*(-0.5)); vstream(ref()(3)(2),result_32*(-0.5)); + return 1; } else if ( num ) { vstream(ref()(0)(0),ref()(0)(0)+result_00*(-0.5)); vstream(ref()(0)(1),ref()(0)(1)+result_01*(-0.5)); @@ -835,7 +839,9 @@ void WilsonKernels::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeFiel vstream(ref()(3)(0),ref()(3)(0)+result_30*(-0.5)); vstream(ref()(3)(1),ref()(3)(1)+result_31*(-0.5)); vstream(ref()(3)(2),ref()(3)(2)+result_32*(-0.5)); + return 1; } + return 0; } /* diff --git a/lib/tensors/Tensor_extract_merge.h b/lib/tensors/Tensor_extract_merge.h index e10c666c..9fd780c5 100644 --- a/lib/tensors/Tensor_extract_merge.h +++ b/lib/tensors/Tensor_extract_merge.h @@ -217,5 +217,50 @@ void merge(vobj &vec,std::vector &extracted,int } } } + +template inline +void merge1(vobj &vec,std::vector &extracted,int offset) +{ + typedef typename vobj::scalar_type scalar_type ; + typedef typename vobj::vector_type vector_type ; + + const int Nsimd=vobj::vector_type::Nsimd(); + const int words=sizeof(vobj)/sizeof(vector_type); + + scalar_type *pointer; + scalar_type *vp = (scalar_type *)&vec; + + // assert( (((uint64_t)vp)&(sizeof(scalar_type)-1)) == 0); + + for(int i=0;i inline +void merge2(vobj &vec,std::vector &extracted,int offset) +{ + typedef typename vobj::scalar_type scalar_type ; + typedef typename vobj::vector_type vector_type ; + + const int Nsimd=vobj::vector_type::Nsimd(); + const int words=sizeof(vobj)/sizeof(vector_type); + + scalar_type *pointer; + scalar_type *vp = (scalar_type *)&vec; + // assert( (((uint64_t)vp)&(sizeof(scalar_type)-1)) == 0); + + for(int w=0;w