1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-04-04 11:15:55 +01:00

Regressing to not overlap comms and compute becasue bluewaters, edison, and cori are so rubbish at it.

This commit is contained in:
Peter Boyle 2016-02-16 13:56:44 -06:00
parent 340a29b735
commit 81395e85d1
3 changed files with 290 additions and 21 deletions

View File

@ -101,7 +101,16 @@ namespace Grid {
std::vector<Packet> Packets; std::vector<Packet> Packets;
#define SEND_IMMEDIATE
#define SERIAL_SENDS
void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){ void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){
comms_bytes+=2.0*bytes;
#ifdef SEND_IMMEDIATE
commtime-=usecond();
_grid->SendToRecvFrom(xmit,to,rcv,from,bytes);
commtime+=usecond();
#endif
Packet p; Packet p;
p.send_buf = xmit; p.send_buf = xmit;
p.recv_buf = rcv; p.recv_buf = rcv;
@ -111,19 +120,21 @@ namespace Grid {
p.done = 0; p.done = 0;
comms_bytes+=2.0*bytes; comms_bytes+=2.0*bytes;
Packets.push_back(p); Packets.push_back(p);
} }
#undef SERIAL_SENDS
#ifdef SERIAL_SENDS #ifdef SERIAL_SENDS
void Communicate(void ) { void Communicate(void ) {
commtime-=usecond(); commtime-=usecond();
for(int i=0;i<Packets.size();i++){ for(int i=0;i<Packets.size();i++){
#ifndef SEND_IMMEDIATE
_grid->SendToRecvFrom( _grid->SendToRecvFrom(
Packets[i].send_buf, Packets[i].send_buf,
Packets[i].to_rank, Packets[i].to_rank,
Packets[i].recv_buf, Packets[i].recv_buf,
Packets[i].from_rank, Packets[i].from_rank,
Packets[i].bytes); Packets[i].bytes);
#endif
Packets[i].done = 1; Packets[i].done = 1;
} }
commtime+=usecond(); commtime+=usecond();
@ -138,18 +149,22 @@ namespace Grid {
for(int ii=0;ii<concurrency;ii++){ for(int ii=0;ii<concurrency;ii++){
int j = i+ii; int j = i+ii;
if ( j<Packets.size() ) { if ( j<Packets.size() ) {
#ifndef SEND_IMMEDIATE
_grid->SendToRecvFromBegin(reqs[j], _grid->SendToRecvFromBegin(reqs[j],
Packets[j].send_buf, Packets[j].send_buf,
Packets[j].to_rank, Packets[j].to_rank,
Packets[j].recv_buf, Packets[j].recv_buf,
Packets[j].from_rank, Packets[j].from_rank,
Packets[j].bytes); Packets[j].bytes);
#endif
} }
} }
for(int ii=0;ii<concurrency;ii++){ for(int ii=0;ii<concurrency;ii++){
int j = i+ii; int j = i+ii;
if ( j<Packets.size() ) { if ( j<Packets.size() ) {
#ifndef SEND_IMMEDIATE
_grid->SendToRecvFromComplete(reqs[i]); _grid->SendToRecvFromComplete(reqs[i]);
#endif
} }
} }
for(int ii=0;ii<concurrency;ii++){ for(int ii=0;ii<concurrency;ii++){
@ -181,7 +196,17 @@ namespace Grid {
m.rpointers= rpointers; m.rpointers= rpointers;
m.buffer_size = buffer_size; m.buffer_size = buffer_size;
m.packet_id = packet_id; m.packet_id = packet_id;
#ifdef SEND_IMMEDIATE
mergetime-=usecond();
PARALLEL_FOR_LOOP
for(int o=0;o<m.buffer_size;o++){
merge1(m.mpointer[o],m.rpointers,o);
}
mergetime+=usecond();
#else
Mergers.push_back(m); Mergers.push_back(m);
#endif
} }
void CommsMerge(void ) { void CommsMerge(void ) {
@ -193,12 +218,14 @@ namespace Grid {
while(! Packets[packet_id].done ); // spin for completion while(! Packets[packet_id].done ); // spin for completion
spintime+=usecond(); spintime+=usecond();
#ifndef SEND_IMMEDIATE
mergetime-=usecond(); mergetime-=usecond();
PARALLEL_FOR_LOOP PARALLEL_FOR_LOOP
for(int o=0;o<Mergers[i].buffer_size;o++){ for(int o=0;o<Mergers[i].buffer_size;o++){
merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o); merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
} }
mergetime+=usecond(); mergetime+=usecond();
#endif
} }
} }
@ -689,15 +716,8 @@ PARALLEL_FOR_LOOP
assert (recv_from_rank != _grid->ThisRank()); assert (recv_from_rank != _grid->ThisRank());
// FIXME Implement asynchronous send & also avoid buffer copy // FIXME Implement asynchronous send & also avoid buffer copy
/*
_grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&comm_buf[u_comm_offset],
recv_from_rank,
bytes);
*/
AddPacket((void *)&u_send_buf[u_comm_offset], AddPacket((void *)&u_send_buf[u_comm_offset],
(void *)&comm_buf[u_comm_offset], (void *) &comm_buf[u_comm_offset],
xmit_to_rank, xmit_to_rank,
recv_from_rank, recv_from_rank,
bytes); bytes);

View File

@ -280,11 +280,11 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo,
DoubledGaugeField & U, DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) const FermionField &in, FermionField &out,int dag)
{ {
if ( Impl::overlapCommsCompute () ) { // if ( Impl::overlapCommsCompute () ) {
DhopInternalCommsOverlapCompute(st,lo,U,in,out,dag); // DhopInternalCommsOverlapCompute(st,lo,U,in,out,dag);
} else { // } else {
DhopInternalCommsThenCompute(st,lo,U,in,out,dag); DhopInternalCommsThenCompute(st,lo,U,in,out,dag);
} // }
} }
template<class Impl> template<class Impl>
@ -319,7 +319,7 @@ void WilsonFermion5D<Impl>::DhopInternalCommsThenCompute(StencilImpl & st, Lebes
dslashtime -=usecond(); dslashtime -=usecond();
if ( dag == DaggerYes ) { if ( dag == DaggerYes ) {
if( this->HandOptDslash ) { if( this->HandOptDslash ) {
#pragma omp parallel for schedule(static) PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){ for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss; int sU=ss;
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){
@ -398,7 +398,7 @@ PARALLEL_FOR_LOOP
} }
*/ */
#pragma omp parallel for schedule(static) PARALLEL_FOR_LOOP
for(int ss=0;ss<U._grid->oSites();ss++){ for(int ss=0;ss<U._grid->oSites();ss++){
int sU=ss; int sU=ss;
for(int s=0;s<Ls;s++){ for(int s=0;s<Ls;s++){
@ -426,6 +426,7 @@ void WilsonFermion5D<Impl>::DhopInternalCommsOverlapCompute(StencilImpl & st, Le
DoubledGaugeField & U, DoubledGaugeField & U,
const FermionField &in, FermionField &out,int dag) const FermionField &in, FermionField &out,int dag)
{ {
assert(0);
// assert((dag==DaggerNo) ||(dag==DaggerYes)); // assert((dag==DaggerNo) ||(dag==DaggerYes));
alltime-=usecond(); alltime-=usecond();

View File

@ -308,7 +308,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
namespace Grid { namespace Grid {
namespace QCD { namespace QCD {
#if 0
template<class Impl> template<class Impl>
int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U, int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
@ -844,11 +844,12 @@ int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField
return 0; return 0;
} }
/* #else
template<class Impl> template<class Impl>
void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U, int WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf, std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal) int ss,int sU,const FermionField &in, FermionField &out, bool Local, bool Nonlocal)
{ {
typedef typename Simd::scalar_type S; typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V; typedef typename Simd::vector_type V;
@ -1087,7 +1088,254 @@ void WilsonKernels<Impl >::DiracOptHandDhopSite(StencilImpl &st,DoubledGaugeFiel
vstream(ref()(3)(2),result_32); vstream(ref()(3)(2),result_32);
} }
} }
*/
template<class Impl>
int WilsonKernels<Impl >::DiracOptHandDhopSiteDag(StencilImpl &st,DoubledGaugeField &U,
std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> > &buf,
int ss,int sU,const FermionField &in, FermionField &out,bool l, bool nl)
{
// std::cout << "Hand op Dhop "<<std::endl;
typedef typename Simd::scalar_type S;
typedef typename Simd::vector_type V;
REGISTER Simd result_00; // 12 regs on knc
REGISTER Simd result_01;
REGISTER Simd result_02;
REGISTER Simd result_10;
REGISTER Simd result_11;
REGISTER Simd result_12;
REGISTER Simd result_20;
REGISTER Simd result_21;
REGISTER Simd result_22;
REGISTER Simd result_30;
REGISTER Simd result_31;
REGISTER Simd result_32; // 20 left
REGISTER Simd Chi_00; // two spinor; 6 regs
REGISTER Simd Chi_01;
REGISTER Simd Chi_02;
REGISTER Simd Chi_10;
REGISTER Simd Chi_11;
REGISTER Simd Chi_12; // 14 left
REGISTER Simd UChi_00; // two spinor; 6 regs
REGISTER Simd UChi_01;
REGISTER Simd UChi_02;
REGISTER Simd UChi_10;
REGISTER Simd UChi_11;
REGISTER Simd UChi_12; // 8 left
REGISTER Simd U_00; // two rows of U matrix
REGISTER Simd U_10;
REGISTER Simd U_20;
REGISTER Simd U_01;
REGISTER Simd U_11;
REGISTER Simd U_21; // 2 reg left.
#define Chimu_00 Chi_00
#define Chimu_01 Chi_01
#define Chimu_02 Chi_02
#define Chimu_10 Chi_10
#define Chimu_11 Chi_11
#define Chimu_12 Chi_12
#define Chimu_20 UChi_00
#define Chimu_21 UChi_01
#define Chimu_22 UChi_02
#define Chimu_30 UChi_10
#define Chimu_31 UChi_11
#define Chimu_32 UChi_12
StencilEntry *SE;
int offset,local,perm, ptype;
// Xp
SE=st.GetEntry(ptype,Xp,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
XP_PROJ;
if ( perm) {
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
}
{
MULT_2SPIN(Xp);
}
XP_RECON;
// Yp
SE=st.GetEntry(ptype,Yp,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
YP_PROJ;
if ( perm) {
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
}
{
MULT_2SPIN(Yp);
}
YP_RECON_ACCUM;
// Zp
SE=st.GetEntry(ptype,Zp,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
ZP_PROJ;
if ( perm) {
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
}
{
MULT_2SPIN(Zp);
}
ZP_RECON_ACCUM;
// Tp
SE=st.GetEntry(ptype,Tp,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
TP_PROJ;
if ( perm) {
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
}
{
MULT_2SPIN(Tp);
}
TP_RECON_ACCUM;
// Xm
SE=st.GetEntry(ptype,Xm,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
XM_PROJ;
if ( perm) {
PERMUTE_DIR(3); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
}
{
MULT_2SPIN(Xm);
}
XM_RECON_ACCUM;
// Ym
SE=st.GetEntry(ptype,Ym,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
YM_PROJ;
if ( perm) {
PERMUTE_DIR(2); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
}
{
MULT_2SPIN(Ym);
}
YM_RECON_ACCUM;
// Zm
SE=st.GetEntry(ptype,Zm,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
ZM_PROJ;
if ( perm) {
PERMUTE_DIR(1); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
}
{
MULT_2SPIN(Zm);
}
ZM_RECON_ACCUM;
// Tm
SE=st.GetEntry(ptype,Tm,ss);
offset = SE->_offset;
local = SE->_is_local;
perm = SE->_permute;
if ( local ) {
LOAD_CHIMU;
TM_PROJ;
if ( perm) {
PERMUTE_DIR(0); // T==0, Z==1, Y==2, Z==3 expect 1,2,2,2 simd layout etc...
}
} else {
LOAD_CHI;
}
{
MULT_2SPIN(Tm);
}
TM_RECON_ACCUM;
{
SiteSpinor & ref (out._odata[ss]);
vstream(ref()(0)(0),result_00);
vstream(ref()(0)(1),result_01);
vstream(ref()(0)(2),result_02);
vstream(ref()(1)(0),result_10);
vstream(ref()(1)(1),result_11);
vstream(ref()(1)(2),result_12);
vstream(ref()(2)(0),result_20);
vstream(ref()(2)(1),result_21);
vstream(ref()(2)(2),result_22);
vstream(ref()(3)(0),result_30);
vstream(ref()(3)(1),result_31);
vstream(ref()(3)(2),result_32);
}
}
#endif
//////////////////////////////////////////////// ////////////////////////////////////////////////
// Specialise Gparity to simple implementation // Specialise Gparity to simple implementation
//////////////////////////////////////////////// ////////////////////////////////////////////////