From d19321dfdec93688affa998e14b88352aecee3b6 Mon Sep 17 00:00:00 2001 From: paboyle Date: Sun, 10 Jan 2016 19:20:16 +0000 Subject: [PATCH] Overlap comms compute changes --- lib/Stencil.h | 274 +++++++++++++--------- lib/algorithms/CoarsenedMatrix.h | 6 +- lib/cshift/Cshift_common.h | 6 +- lib/qcd/action/fermion/WilsonFermion.cc | 39 ++- lib/qcd/action/fermion/WilsonFermion.h | 3 - lib/qcd/action/fermion/WilsonFermion5D.cc | 59 +++-- lib/qcd/action/fermion/WilsonFermion5D.h | 1 + tests/Test_stencil.cc | 16 +- 8 files changed, 220 insertions(+), 184 deletions(-) diff --git a/lib/Stencil.h b/lib/Stencil.h index f48fc6f1..b1479a5e 100644 --- a/lib/Stencil.h +++ b/lib/Stencil.h @@ -7,8 +7,6 @@ Copyright (C) 2015 Author: Peter Boyle -Author: Peter Boyle -Author: paboyle This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -88,11 +86,78 @@ namespace Grid { typedef typename cobj::scalar_type scalar_type; typedef typename cobj::scalar_object scalar_object; + ////////////////////////////////////////// + // Comms packet queue for asynch thread + ////////////////////////////////////////// + + struct Packet { + void * send_buf; + void * recv_buf; + Integer to_rank; + Integer from_rank; + Integer bytes; + }; + + std::vector Packets; + + void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){ + Packet p; + p.send_buf = xmit; + p.recv_buf = rcv; + p.to_rank = to; + p.from_rank= from; + p.bytes = bytes; + Packets.push_back(p); + } + + void Communicate(void ) { + for(int i=0;iSendToRecvFrom(Packets[i].send_buf, + Packets[i].to_rank, + Packets[i].recv_buf, + Packets[i].from_rank, + Packets[i].bytes); + } + } + + /////////////////////////////////////////// + // Simd merge queue for asynch comms + /////////////////////////////////////////// + struct Merge { + cobj * mpointer; + std::vector rpointers; + Integer buffer_size; + }; + + std::vector Mergers; + + void AddMerge(cobj *merge_p,std::vector &rpointers,Integer buffer_size) { + Merge m; + m.mpointer = merge_p; + m.rpointers= rpointers; + m.buffer_size = buffer_size; + Mergers.push_back(m); + } + + void CommsMerge(void ) { + mergetime-=usecond(); + for(int i=0;i _directions; std::vector _distances; @@ -101,19 +166,21 @@ namespace Grid { // npoints x Osites() of these std::vector > _entries; - - // Comms buffers - std::vector > send_buf_extract; - std::vector > recv_buf_extract; - std::vector pointers; - std::vector rpointers; - Vector send_buf; - inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point][osite]; } + // Comms buffers + std::vector > u_simd_send_buf; + std::vector > u_simd_recv_buf; + Vector u_send_buf; + Vector comm_buf; + int u_comm_offset; int _unified_buffer_size; - int _request_count; + ///////////////////////////////////////// + // Timing info; ugly; possibly temporary + ///////////////////////////////////////// +#define TIMING_HACK +#ifdef TIMING_HACK double buftime; double gathertime; double commtime; @@ -124,9 +191,7 @@ namespace Grid { double gathermtime; double splicetime; double nosplicetime; - - - +#endif CartesianStencil(GridBase *grid, int npoints, @@ -135,6 +200,7 @@ namespace Grid { const std::vector &distances) : _entries(npoints), _permute_type(npoints), _comm_buf_size(npoints) { +#ifdef TIMING_HACK gathertime=0; commtime=0; commstime=0; @@ -145,13 +211,12 @@ namespace Grid { buftime=0; splicetime=0; nosplicetime=0; - +#endif _npoints = npoints; _grid = grid; _directions = directions; _distances = distances; _unified_buffer_size=0; - _request_count =0; int osites = _grid->oSites(); @@ -197,22 +262,25 @@ namespace Grid { sshift[1] = _grid->CheckerBoardShiftForCB(_checkerboard,dimension,shift,Odd); if ( sshift[0] == sshift[1] ) { - // std::cout<<"Comms 0x3"< o"<<_entries[i][ss]._offset<<"; l"<< - // _entries[i][ss]._is_local<<"; p"<<_entries[i][ss]._permute<Nsimd(); + u_simd_send_buf.resize(Nsimd); + u_simd_recv_buf.resize(Nsimd); + for(int l=0;l_fdimensions[dimension]; @@ -276,17 +344,15 @@ namespace Grid { assert(shift_slice_nblock[dimension]*_grid->_slice_block[dimension]; // done in reduced dims, so SIMD factored - // std::cout << " dim " <CheckerBoardShiftForCB(_checkerboard,dimension,shift,cb); - for(int x=0;xPermuteType(dimension); int sx = (x+sshift)%rd; @@ -310,16 +376,9 @@ namespace Grid { } else { int comm_proc = ((x+sshift)/rd)%pd; offnode = (comm_proc!= 0); - // std::cout << "Stencil x "<x) && (grid->_processor_coor[dimension]==0) ) { wraparound = 1; @@ -337,15 +396,13 @@ namespace Grid { int words = buffer_size; if (cbmask != 0x3) words=words>>1; - // GatherPlaneSimple (point,dimension,sx,cbmask); - int rank = grid->_processor; int recv_from_rank; int xmit_to_rank; int unified_buffer_offset = _unified_buffer_size; _unified_buffer_size += words; - // std::cout<< "Comms dim "< &directions, -// const std::vector &distances); + std::thread HaloExchangeBegin(const Lattice &source,compressor &compress) { + Mergers.resize(0); + Packets.resize(0); + HaloGather(source,compress); + return std::thread([&] { this->Communicate(); }); + } - // Add to tables for various cases; is this mistaken. only local if 1 proc in dim - // Can this be avoided with simpler coding of comms? - // void Local (int point, int dimension,int shift,int cbmask); - // void Comms (int point, int dimension,int shift,int cbmask); - // void CopyPlane(int point, int dimension,int lplane,int rplane,int cbmask,int permute,int wrap); - // void ScatterPlane (int point,int dimension,int plane,int cbmask,int offset,int wrap); - - // Could allow a functional munging of the halo to another type during the comms. - // this could implement the 16bit/32bit/64bit compression. - void HaloExchange(const Lattice &source,std::vector > &u_comm_buf,compressor &compress) + void HaloExchange(const Lattice &source,compressor &compress) + { + auto thr = HaloExchangeBegin(source,compress); + HaloExchangeComplete(thr); + } + + void HaloExchangeComplete(std::thread &thr) { - std::thread thr = HaloExchangeBegin(source,u_comm_buf,compress); thr.join(); + CommsMerge(); } - std::thread HaloExchangeBegin(const Lattice &source,std::vector > & u_comm_buf,compressor &compress) { - return std::thread([&] { this->HaloExchangeBlocking(source,u_comm_buf,compress); }); - } - - void HaloExchangeBlocking(const Lattice &source,std::vector > &u_comm_buf,compressor &compress) + void HaloGather(const Lattice &source,compressor &compress) { // conformable(source._grid,_grid); assert(source._grid==_grid); halotime-=usecond(); - if (u_comm_buf.size() != _unified_buffer_size ) u_comm_buf.resize(_unified_buffer_size); - int u_comm_offset=0; + + assert (comm_buf.size() == _unified_buffer_size ); + u_comm_offset=0; // Gather all comms buffers for(int point = 0 ; point < _npoints; point++) { @@ -506,35 +558,34 @@ namespace Grid { if ( sshift[0] == sshift[1] ) { if (splice_dim) { splicetime-=usecond(); - GatherStartCommsSimd(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress); + GatherSimd(source,dimension,shift,0x3,compress); splicetime+=usecond(); } else { nosplicetime-=usecond(); - GatherStartComms(source,dimension,shift,0x3,u_comm_buf,u_comm_offset,compress); + Gather(source,dimension,shift,0x3,compress); nosplicetime+=usecond(); } } else { - // std::cout << "dim "< &rhs,int dimension,int shift,int cbmask, - std::vector > &u_comm_buf, - int &u_comm_offset,compressor & compress) + void Gather(const Lattice &rhs,int dimension,int shift,int cbmask,compressor & compress) { typedef typename cobj::vector_type vector_type; typedef typename cobj::scalar_type scalar_type; @@ -555,8 +606,6 @@ namespace Grid { int buffer_size = _grid->_slice_nblock[dimension]*_grid->_slice_block[dimension]; - if(send_buf.size()CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); @@ -573,7 +622,7 @@ namespace Grid { int bytes = words * sizeof(cobj); gathertime-=usecond(); - Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask,compress); + Gather_plane_simple (rhs,u_send_buf,dimension,sx,cbmask,compress,u_comm_offset); gathertime+=usecond(); int rank = _grid->_processor; @@ -585,11 +634,19 @@ namespace Grid { // FIXME Implement asynchronous send & also avoid buffer copy commtime-=usecond(); + /* _grid->SendToRecvFrom((void *)&send_buf[0], xmit_to_rank, - (void *)&u_comm_buf[u_comm_offset], + (void *)&comm_buf[u_comm_offset], recv_from_rank, bytes); + */ + AddPacket((void *)&u_send_buf[u_comm_offset], + (void *)&comm_buf[u_comm_offset], + xmit_to_rank, + recv_from_rank, + bytes); + commtime+=usecond(); u_comm_offset+=words; @@ -598,13 +655,10 @@ namespace Grid { } - void GatherStartCommsSimd(const Lattice &rhs,int dimension,int shift,int cbmask, - std::vector > &u_comm_buf, - int &u_comm_offset,compressor &compress) + void GatherSimd(const Lattice &rhs,int dimension,int shift,int cbmask,compressor &compress) { buftime-=usecond(); const int Nsimd = _grid->Nsimd(); - int fd = _grid->_fdimensions[dimension]; int rd = _grid->_rdimensions[dimension]; @@ -628,20 +682,11 @@ namespace Grid { assert(cbmask==0x3); // Fixme think there is a latent bug if not true - // Should grow to max size and then cost very little thereafter - send_buf_extract.resize(Nsimd); - recv_buf_extract.resize(Nsimd); - for(int l=0;l rpointers(Nsimd); + std::vector spointers(Nsimd); + buftime+=usecond(); /////////////////////////////////////////// @@ -659,16 +704,19 @@ namespace Grid { if ( any_offnode ) { for(int i=0;i(rhs,pointers,dimension,sx,cbmask,compress); + Gather_plane_extract(rhs,spointers,dimension,sx,cbmask,compress); gathermtime+=usecond(); for(int i=0;i>(permute_type+1)); int ic= (i&inner_bit)? 1:0; @@ -680,45 +728,43 @@ namespace Grid { int nbr_ox = (nbr_lcoor%rd); // outer coord of peer int nbr_lane = (i&(~inner_bit)); - int recv_from_rank; - int xmit_to_rank; - if (nbr_ic) nbr_lane|=inner_bit; assert (sx == nbr_ox); - + auto rp = &u_simd_recv_buf[i ][u_comm_offset]; + auto sp = &u_simd_send_buf[nbr_lane][u_comm_offset]; + + void *vrp = (void *)rp; + void *vsp = (void *)sp; + + if(nbr_proc){ + int recv_from_rank; + int xmit_to_rank; + _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); commstime-=usecond(); - _grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0], - xmit_to_rank, - (void *)&recv_buf_extract[i][0], - recv_from_rank, - bytes); + AddPacket( vsp,vrp,xmit_to_rank,recv_from_rank,bytes); commstime+=usecond(); - rpointers[i] = &recv_buf_extract[i][0]; + rpointers[i] = rp; } else { - rpointers[i] = &send_buf_extract[nbr_lane][0]; + + rpointers[i] = sp; + } } - // std::cout << " CommsSimd ["< A; - std::vector > comm_buf; /////////////////////// // Interface @@ -217,7 +216,7 @@ namespace Grid { conformable(in._grid,out._grid); SimpleCompressor compressor; - Stencil.HaloExchange(in,comm_buf,compressor); + Stencil.HaloExchange(in,compressor); PARALLEL_FOR_LOOP for(int ss=0;ssoSites();ss++){ @@ -234,7 +233,7 @@ PARALLEL_FOR_LOOP } else if(SE->_is_local) { nbr = in._odata[SE->_offset]; } else { - nbr = comm_buf[SE->_offset]; + nbr = Stencil.comm_buf[SE->_offset]; } res = res + A[point]._odata[ss]*nbr; } @@ -258,7 +257,6 @@ PARALLEL_FOR_LOOP Stencil(&CoarseGrid,geom.npoint,Even,geom.directions,geom.displacements), A(geom.npoint,&CoarseGrid) { - comm_buf.resize(Stencil._unified_buffer_size); }; void CoarsenOperator(GridBase *FineGrid,LinearOperatorBase > &linop, diff --git a/lib/cshift/Cshift_common.h b/lib/cshift/Cshift_common.h index 8caf1a52..82c5f124 100644 --- a/lib/cshift/Cshift_common.h +++ b/lib/cshift/Cshift_common.h @@ -44,7 +44,7 @@ public: // Gather for when there is no need to SIMD split with compression /////////////////////////////////////////////////////////////////// template void -Gather_plane_simple (const Lattice &rhs,std::vector > &buffer,int dimension,int plane,int cbmask,compressor &compress) +Gather_plane_simple (const Lattice &rhs,std::vector > &buffer,int dimension,int plane,int cbmask,compressor &compress, int off=0) { int rd = rhs._grid->_rdimensions[dimension]; @@ -63,7 +63,7 @@ PARALLEL_NESTED_LOOP2 for(int b=0;b_slice_stride[dimension]; int bo = n*rhs._grid->_slice_block[dimension]; - buffer[bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid); + buffer[off+bo+b]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid); } } } else { @@ -73,7 +73,7 @@ PARALLEL_NESTED_LOOP2 int o = n*rhs._grid->_slice_stride[dimension]; int ocb=1<CheckerBoardFromOindex(o+b);// Could easily be a table lookup if ( ocb &cbmask ) { - buffer[bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid); + buffer[off+bo++]=compress(rhs._odata[so+o+b],dimension,plane,so+o+b,rhs._grid); } } } diff --git a/lib/qcd/action/fermion/WilsonFermion.cc b/lib/qcd/action/fermion/WilsonFermion.cc index 8f35f8a5..57d7862b 100644 --- a/lib/qcd/action/fermion/WilsonFermion.cc +++ b/lib/qcd/action/fermion/WilsonFermion.cc @@ -58,7 +58,6 @@ namespace QCD { UmuOdd (&Hgrid) { // Allocate the required comms buffer - comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO ImportGauge(_Umu); } @@ -153,7 +152,7 @@ namespace QCD { FermionField Atilde(B._grid); Atilde = A; - st.HaloExchange(B,comm_buf,compressor); + st.HaloExchange(B,compressor); for(int mu=0;muoSites();sss++){ - Kernels::DiracOptDhopDir(st,U,comm_buf,sss,sss,B,Btilde,mu,gamma); + Kernels::DiracOptDhopDir(st,U,st.comm_buf,sss,sss,B,Btilde,mu,gamma); } ////////////////////////////////////////////////// @@ -274,11 +273,11 @@ PARALLEL_FOR_LOOP Compressor compressor(dag); - Stencil.HaloExchange(in,comm_buf,compressor); + Stencil.HaloExchange(in,compressor); PARALLEL_FOR_LOOP for(int sss=0;sssoSites();sss++){ - Kernels::DiracOptDhopDir(Stencil,Umu,comm_buf,sss,sss,in,out,dirdisp,gamma); + Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.comm_buf,sss,sss,in,out,dirdisp,gamma); } }; @@ -300,30 +299,30 @@ PARALLEL_FOR_LOOP assert((dag==DaggerNo) ||(dag==DaggerYes)); Compressor compressor(dag); - st.HaloExchange(in,comm_buf,compressor); + st.HaloExchange(in,compressor); if ( dag == DaggerYes ) { if( HandOptDslash ) { PARALLEL_FOR_LOOP for(int sss=0;sssoSites();sss++){ - Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sss,sss,in,out); + Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out); } } else { PARALLEL_FOR_LOOP for(int sss=0;sssoSites();sss++){ - Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sss,sss,in,out); + Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out); } } } else { if( HandOptDslash ) { PARALLEL_FOR_LOOP for(int sss=0;sssoSites();sss++){ - Kernels::DiracOptHandDhopSite(st,U,comm_buf,sss,sss,in,out); + Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out); } } else { PARALLEL_FOR_LOOP for(int sss=0;sssoSites();sss++){ - Kernels::DiracOptDhopSite(st,U,comm_buf,sss,sss,in,out); + Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out); } } } @@ -338,8 +337,7 @@ PARALLEL_FOR_LOOP Compressor compressor(dag); - std::thread comms_thread = st.HaloExchangeBegin(in,comm_buf,compressor); - comms_thread.join(); + auto handle = st.HaloExchangeBegin(in,compressor); bool local = true; bool nonlocal = false; @@ -347,28 +345,29 @@ PARALLEL_FOR_LOOP if( HandOptDslash ) { PARALLEL_FOR_LOOP for(int sss=0;sssoSites();sss++){ - Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal); + Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal); } } else { PARALLEL_FOR_LOOP for(int sss=0;sssoSites();sss++){ - Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal); + Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal); } } } else { if( HandOptDslash ) { PARALLEL_FOR_LOOP for(int sss=0;sssoSites();sss++){ - Kernels::DiracOptHandDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal); + Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal); } } else { PARALLEL_FOR_LOOP for(int sss=0;sssoSites();sss++){ - Kernels::DiracOptDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal); + Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal); } } } + st.HaloExchangeComplete(handle); local = false; nonlocal = true; @@ -376,24 +375,24 @@ PARALLEL_FOR_LOOP if( HandOptDslash ) { PARALLEL_FOR_LOOP for(int sss=0;sssoSites();sss++){ - Kernels::DiracOptHandDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal); + Kernels::DiracOptHandDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal); } } else { PARALLEL_FOR_LOOP for(int sss=0;sssoSites();sss++){ - Kernels::DiracOptDhopSiteDag(st,U,comm_buf,sss,sss,in,out,local,nonlocal); + Kernels::DiracOptDhopSiteDag(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal); } } } else { if( HandOptDslash ) { PARALLEL_FOR_LOOP for(int sss=0;sssoSites();sss++){ - Kernels::DiracOptHandDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal); + Kernels::DiracOptHandDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal); } } else { PARALLEL_FOR_LOOP for(int sss=0;sssoSites();sss++){ - Kernels::DiracOptDhopSite(st,U,comm_buf,sss,sss,in,out,local,nonlocal); + Kernels::DiracOptDhopSite(st,U,st.comm_buf,sss,sss,in,out,local,nonlocal); } } } diff --git a/lib/qcd/action/fermion/WilsonFermion.h b/lib/qcd/action/fermion/WilsonFermion.h index 5d835c41..6bd9fb38 100644 --- a/lib/qcd/action/fermion/WilsonFermion.h +++ b/lib/qcd/action/fermion/WilsonFermion.h @@ -152,9 +152,6 @@ namespace Grid { DoubledGaugeField Umu; DoubledGaugeField UmuEven; DoubledGaugeField UmuOdd; - - // Comms buffer - std::vector > comm_buf; }; diff --git a/lib/qcd/action/fermion/WilsonFermion5D.cc b/lib/qcd/action/fermion/WilsonFermion5D.cc index 9a1669b6..ba2d7e32 100644 --- a/lib/qcd/action/fermion/WilsonFermion5D.cc +++ b/lib/qcd/action/fermion/WilsonFermion5D.cc @@ -98,12 +98,11 @@ WilsonFermion5D::WilsonFermion5D(GaugeField &_Umu, } // Allocate the required comms buffer - comm_buf.resize(Stencil._unified_buffer_size); // this is always big enough to contain EO - ImportGauge(_Umu); commtime=0; jointime=0; dslashtime=0; + dslash1time=0; } template void WilsonFermion5D::ImportGauge(const GaugeField &_Umu) @@ -121,7 +120,7 @@ void WilsonFermion5D::DhopDir(const FermionField &in, FermionField &out,in // assert( (dir>=0)&&(dir<4) ); //must do x,y,z or t; Compressor compressor(DaggerNo); - Stencil.HaloExchange(in,comm_buf,compressor); + Stencil.HaloExchange(in,compressor); int skip = (disp==1) ? 0 : 1; @@ -136,7 +135,7 @@ PARALLEL_FOR_LOOP for(int s=0;s::DerivInternal(StencilImpl & st, FermionField Btilde(B._grid); FermionField Atilde(B._grid); - st.HaloExchange(B,comm_buf,compressor); + st.HaloExchange(B,compressor); Atilde=A; @@ -184,7 +183,7 @@ PARALLEL_FOR_LOOP assert ( sF< B._grid->oSites()); assert ( sU< U._grid->oSites()); - Kernels::DiracOptDhopDir(st,U,comm_buf,sF,sU,B,Btilde,mu,gamma); + Kernels::DiracOptDhopDir(st,U,st.comm_buf,sF,sU,B,Btilde,mu,gamma); //////////////////////////// // spin trace outer product @@ -236,9 +235,10 @@ template void WilsonFermion5D::Report(void) { std::cout<::DhopInternalCommsThenCompute(StencilImpl & st, Lebes int nwork = U._grid->oSites(); commtime -=usecond(); - std::thread thr = st.HaloExchangeBegin(in,comm_buf,compressor); + auto handle = st.HaloExchangeBegin(in,compressor); + st.HaloExchangeComplete(handle); commtime +=usecond(); jointime -=usecond(); - thr.join(); jointime +=usecond(); // Dhop takes the 4d grid from U, and makes a 5d index for fermion @@ -319,7 +319,7 @@ void WilsonFermion5D::DhopInternalCommsThenCompute(StencilImpl & st, Lebes int sU=ss; for(int s=0;s::DhopInternalCommsOverlapCompute(StencilImpl & st, Le int nwork = U._grid->oSites(); commtime -=usecond(); - std::thread thr = st.HaloExchangeBegin(in,comm_buf,compressor); + auto handle = st.HaloExchangeBegin(in,compressor); commtime +=usecond(); // Dhop takes the 4d grid from U, and makes a 5d index for fermion @@ -450,7 +450,7 @@ PARALLEL_FOR_LOOP int sU=ss; for(int s=0;sHandOptDslash ) { PARALLEL_FOR_LOOP @@ -503,7 +503,7 @@ PARALLEL_FOR_LOOP int sU=ss; for(int s=0;s > comm_buf(myStencil._unified_buffer_size); SimpleCompressor compress; - myStencil.HaloExchange(Foo,comm_buf,compress); + myStencil.HaloExchange(Foo,compress); Bar = Cshift(Foo,dir,disp); @@ -117,7 +116,7 @@ int main (int argc, char ** argv) else if (SE->_is_local) Check._odata[i] = Foo._odata[SE->_offset]; else - Check._odata[i] = comm_buf[SE->_offset]; + Check._odata[i] = myStencil.comm_buf[SE->_offset]; } Real nrmC = norm2(Check); @@ -181,13 +180,10 @@ int main (int argc, char ** argv) ocoor[dir]=(ocoor[dir]+disp)%Fine._rdimensions[dir]; } - std::vector > Ecomm_buf(EStencil._unified_buffer_size); - std::vector > Ocomm_buf(OStencil._unified_buffer_size); - SimpleCompressor compress; - EStencil.HaloExchange(EFoo,Ecomm_buf,compress); - OStencil.HaloExchange(OFoo,Ocomm_buf,compress); + EStencil.HaloExchange(EFoo,compress); + OStencil.HaloExchange(OFoo,compress); Bar = Cshift(Foo,dir,disp); @@ -211,7 +207,7 @@ int main (int argc, char ** argv) else if (SE->_is_local) OCheck._odata[i] = EFoo._odata[SE->_offset]; else - OCheck._odata[i] = Ecomm_buf[SE->_offset]; + OCheck._odata[i] = EStencil.comm_buf[SE->_offset]; } for(int i=0;ioSites();i++){ int permute_type; @@ -224,7 +220,7 @@ int main (int argc, char ** argv) else if (SE->_is_local) ECheck._odata[i] = OFoo._odata[SE->_offset]; else - ECheck._odata[i] = Ocomm_buf[SE->_offset]; + ECheck._odata[i] = OStencil.comm_buf[SE->_offset]; } setCheckerboard(Check,ECheck);