diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index 1db03813..7ace084c 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -364,9 +364,10 @@ public: //////////////////////////////////////////////////////////////////////// void CommunicateBegin(std::vector > &reqs) { + FlightRecorder::StepLog("Communicate begin"); // All GPU kernel tasks must complete - accelerator_barrier(); // All kernels should ALREADY be complete - _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer + // accelerator_barrier(); // All kernels should ALREADY be complete + // _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer // But the HaloGather had a barrier too. for(int i=0;iStencilSendToRecvFromBegin(MpiReqs, @@ -386,18 +387,20 @@ public: void CommunicateComplete(std::vector > &reqs) { + FlightRecorder::StepLog("Start communicate complete"); _grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done if ( this->partialDirichlet ) DslashLogPartial(); else if ( this->fullDirichlet ) DslashLogDirichlet(); else DslashLogFull(); - acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete - accelerator_barrier(); + // acceleratorCopySynchronise();// is in the StencilSendToRecvFromComplete + // accelerator_barrier(); _grid->StencilBarrier(); // run any checksums for(int i=0;i void HaloGather(const Lattice &source,compressor &compress) { - accelerator_barrier(); + // accelerator_barrier(); _grid->StencilBarrier();// Synch shared memory on a single nodes assert(source.Grid()==_grid); @@ -487,7 +490,7 @@ public: HaloGatherDir(source,compress,point,face_idx); } accelerator_barrier(); // All my local gathers are complete - _grid->StencilBarrier();// Synch shared memory on a single nodes + // _grid->StencilBarrier();// Synch shared memory on a single nodes face_table_computed=1; assert(u_comm_offset==_unified_buffer_size); }