diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h index fa03183c..2a666a04 100644 --- a/Grid/stencil/Stencil.h +++ b/Grid/stencil/Stencil.h @@ -501,6 +501,9 @@ public: void HaloGather(const Lattice &source,compressor &compress) { // accelerator_barrier(); + ////////////////////////////////// + // I will overwrite my send buffers + ////////////////////////////////// _grid->StencilBarrier();// Synch shared memory on a single nodes assert(source.Grid()==_grid); @@ -514,7 +517,12 @@ public: HaloGatherDir(source,compress,point,face_idx); } accelerator_barrier(); // All my local gathers are complete - // _grid->StencilBarrier();// Synch shared memory on a single nodes +#ifdef NVLINK_GET + #warning "NVLINK_GET" + _grid->StencilBarrier(); // He can now get my local gather, I can get his + // Synch shared memory on a single node; could use an asynchronous barrier here and defer the check + // Or issue barrier AFTER the DMA is running +#endif face_table_computed=1; assert(u_comm_offset==_unified_buffer_size); } @@ -553,6 +561,7 @@ public: coalescedWrite(to[j] ,coalescedRead(from [j])); }); acceleratorFenceComputeStream(); + // Also fenced in WilsonKernels } }