mirror of https://github.com/paboyle/Grid.git synced 2025-06-11 03:46:55 +01:00

Compare commits


17 Commits

Author SHA1 Message Date
dabf3cb207 Merge 570b72a47b into ba9bbe0221 2025-02-13 14:16:15 +00:00
ba9bbe0221 Bounce MPI through host 2025-02-12 19:34:59 +00:00
4c3dd82d84 CSHIFT with bounce through Host memory on MPI packets 2025-02-12 19:09:53 +00:00
44e911b5b7 Comment change 2025-02-12 17:37:55 +00:00
a7a16df9d0 GET not PUT has a kinder barrier sequence for NVLINK-type access: when
the GET is done, I can use the data without a barrier. Moves a barrier to a nicer
place, overlapped with the DtoH DMA
2025-02-12 14:59:28 +00:00
382e0abefd Was issuing a double fence -- the gather also fences 2025-02-12 14:57:28 +00:00
6fdefe5b90 Barrier sequencing when doing "GET" rather than "PUT" is different.
This gives somewhat better timing for barriers
2025-02-12 14:55:20 +00:00
4788dd8e2e More states in packet progression for GPU non aware MPI 2025-02-12 14:53:57 +00:00
1cc5f221f3 GET not PUT ordering is better as I know when I've got all MY data 2025-02-12 14:53:05 +00:00
93251bfba0 GET not PUT for better ordering in the downstream dependent kernels -- I
know when I'm done, so we can move a barrier / handshake between ranks
intranode to a point off the critical path
2025-02-12 14:50:21 +00:00
18b79508b8 New line better for pretty print 2025-02-12 14:49:48 +00:00
4de5ed1613 Remove vector view. The std::vector will not inform the Memory manager of
deletion, and so a stale entry could be left. It is not and should not be
used.
2025-02-12 14:48:46 +00:00
0baaddbe98 Pipeline mode commit on Aurora. 5+ TF/s on 16^3x32 per tile at 384
nodes.
More concurrency/fine-grained scheduling is possible.
2025-02-04 19:27:26 +00:00
570b72a47b Bugfix. Sorry! 2025-01-21 15:37:39 -05:00
a5798a89ed Merge branch 'develop' into specflow 2025-01-21 12:13:24 -05:00
f7e2f9a401 Checking in spectral flow and DWF/Mobius kernel eigenvalue measurement 2025-01-16 20:47:33 +00:00
2848a9b558 DWF Kernel lanczos working(?) 2025-01-16 01:29:56 +00:00
19 changed files with 813 additions and 348 deletions
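Several commits above make the same argument for GET over PUT on the NVLINK/intranode path: with PUT, the reader needs a barrier to prove the neighbour's remote write has landed before a dependent kernel may run; with GET, completion of the reader's own copy is the only ordering required, so the inter-rank handshake can move off the critical path and overlap with the DtoH DMA. A minimal sketch of the two sequences, with hypothetical helper names (not the Grid API):

// PUT: A writes into B's receive buffer. B cannot consume it until a
// barrier proves A's DMA has completed.
//   rank A:  copyDeviceToDevice(myGather, B_recvBuf);  barrier();
//   rank B:  barrier();  /* wait for A */   useInKernel(B_recvBuf);
//
// GET: B pulls A's gathered face itself. The only barrier needed is the
// one proving A's send buffer is filled, and B knows its data is ready
// as soon as its own copy event completes.
//   rank A:  gather(mySendBuf);  barrier();   // "my face is ready"
//   rank B:  barrier();  ev = copyDeviceToDevice(A_sendBuf, myRecvBuf);
//            eventWait(ev);  useInKernel(myRecvBuf);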

View File

@ -245,9 +245,10 @@ until convergence
_HermOp(src_n,tmp);
// std::cout << GridLogMessage<< tmp<<std::endl; exit(0);
// std::cout << GridLogIRL << " _HermOp " << norm2(tmp) << std::endl;
RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
// RealD vnum = real(innerProduct(src_n,tmp)); // HermOp.
RealD vnum = real(innerProduct(tmp,tmp)); // HermOp^2.
RealD vden = norm2(src_n);
RealD na = vnum/vden;
RealD na = std::sqrt(vnum/vden);
if (fabs(evalMaxApprox/na - 1.0) < 0.0001)
i=_MAX_ITER_IRL_MEVAPP_;
evalMaxApprox = na;
@ -255,6 +256,7 @@ until convergence
src_n = tmp;
}
}
std::cout << GridLogIRL << " Final evalMaxApprox " << evalMaxApprox << std::endl;
std::vector<RealD> lme(Nm);
std::vector<RealD> lme2(Nm);
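The change above swaps the power-iteration estimate of the extreme eigenvalue from a Rayleigh quotient of H to one built from H^2. Writing x = src_n, H = _HermOp, and tmp = Hx, the old and new estimates are

\lambda \;\approx\; \frac{\langle x, Hx\rangle}{\langle x, x\rangle}
\qquad\longrightarrow\qquad
\lambda \;\approx\; \sqrt{\frac{\langle Hx, Hx\rangle}{\langle x, x\rangle}}
\;=\; \sqrt{\frac{\lVert Hx \rVert^{2}}{\lVert x \rVert^{2}}}

Both converge to the modulus of the extreme eigenvalue as the iteration proceeds, which is why the new code takes std::sqrt(vnum/vden).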

View File

@ -175,10 +175,11 @@ template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const d
// Template typedefs
////////////////////////////////////////////////////////////////////////////////
template<class T> using hostVector = std::vector<T,alignedAllocator<T> >; // Needs autoview
template<class T> using Vector = std::vector<T,uvmAllocator<T> >; //
template<class T> using Vector = std::vector<T,uvmAllocator<T> >; // Really want to deprecate
template<class T> using uvmVector = std::vector<T,uvmAllocator<T> >; // auto migrating page
template<class T> using deviceVector = std::vector<T,devAllocator<T> >; // device vector
/*
template<class T> class vecView
{
protected:
@ -214,6 +215,7 @@ template<class T> vecView<T> VectorView(Vector<T> &vec,ViewMode _mode)
#define autoVecView(v_v,v,mode) \
auto v_v = VectorView(v,mode); \
ViewCloser<decltype(v_v)> _autoView##v_v(v_v);
*/
NAMESPACE_END(Grid);

View File

@ -9,6 +9,7 @@ static char print_buffer [ MAXLINE ];
#define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer << std::endl;
#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer << std::endl;
//#define dprintf(...)
//#define mprintf(...)
////////////////////////////////////////////////////////////
// For caching copies of data on device
@ -109,7 +110,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
///////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
dprintf("MemoryManager: Discard(%lx) %lx",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr);
assert(AccCache.accLock==0);
assert(AccCache.cpuLock==0);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
@ -119,7 +120,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
DeviceBytes -=AccCache.bytes;
LRUremove(AccCache);
AccCache.AccPtr=(uint64_t) NULL;
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld\n",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
dprintf("MemoryManager: Free(%lx) LRU %ld Total %ld",(uint64_t)AccCache.AccPtr,DeviceLRUBytes,DeviceBytes);
}
uint64_t CpuPtr = AccCache.CpuPtr;
EntryErase(CpuPtr);
@ -139,7 +140,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
///////////////////////////////////////////////////////////////////////////
assert(AccCache.state!=Empty);
mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld\n",
mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld",
(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
(uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock);
if (AccCache.accLock!=0) return;
@ -153,7 +154,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
AccCache.AccPtr=(uint64_t)NULL;
AccCache.state=CpuDirty; // CPU primary now
DeviceBytes -=AccCache.bytes;
dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);
dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld ",(uint64_t)AccCache.AccPtr,DeviceBytes);
}
// uint64_t CpuPtr = AccCache.CpuPtr;
DeviceEvictions++;
@ -167,7 +168,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
assert(AccCache.AccPtr!=(uint64_t)NULL);
assert(AccCache.CpuPtr!=(uint64_t)NULL);
acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
mprintf("MemoryManager: acceleratorCopyFromDevice Flush size %ld AccPtr %lx -> CpuPtr %lx",(uint64_t)AccCache.bytes,(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
DeviceToHostBytes+=AccCache.bytes;
DeviceToHostXfer++;
AccCache.state=Consistent;
@ -182,7 +183,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
DeviceBytes+=AccCache.bytes;
}
mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx\n",
mprintf("MemoryManager: acceleratorCopyToDevice Clone size %ld AccPtr %lx <- CpuPtr %lx",
(uint64_t)AccCache.bytes,
(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
@ -210,7 +211,7 @@ void MemoryManager::CpuDiscard(AcceleratorViewEntry &AccCache)
void MemoryManager::ViewClose(void* Ptr,ViewMode mode)
{
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
dprintf("AcceleratorViewClose %lx\n",(uint64_t)Ptr);
dprintf("AcceleratorViewClose %lx",(uint64_t)Ptr);
AcceleratorViewClose((uint64_t)Ptr);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
CpuViewClose((uint64_t)Ptr);
@ -222,7 +223,7 @@ void *MemoryManager::ViewOpen(void* _CpuPtr,size_t bytes,ViewMode mode,ViewAdvis
{
uint64_t CpuPtr = (uint64_t)_CpuPtr;
if( (mode==AcceleratorRead)||(mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard) ){
dprintf("AcceleratorViewOpen %lx\n",(uint64_t)CpuPtr);
dprintf("AcceleratorViewOpen %lx",(uint64_t)CpuPtr);
return (void *) AcceleratorViewOpen(CpuPtr,bytes,mode,hint);
} else if( (mode==CpuRead)||(mode==CpuWrite)){
return (void *)CpuViewOpen(CpuPtr,bytes,mode,hint);
@ -265,7 +266,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
assert(AccCache.cpuLock==0); // Programming error
if(AccCache.state!=Empty) {
dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld\n",
dprintf("ViewOpen found entry %lx %lx : sizes %ld %ld accLock %ld",
(uint64_t)AccCache.CpuPtr,
(uint64_t)CpuPtr,
(uint64_t)AccCache.bytes,
@ -305,7 +306,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.state = Consistent; // Empty + AccRead => Consistent
}
AccCache.accLock= 1;
dprintf("Copied Empty entry into device accLock= %d\n",AccCache.accLock);
dprintf("Copied Empty entry into device accLock= %d",AccCache.accLock);
} else if(AccCache.state==CpuDirty ){
if(mode==AcceleratorWriteDiscard) {
CpuDiscard(AccCache);
@ -318,21 +319,21 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
AccCache.state = Consistent; // CpuDirty + AccRead => Consistent
}
AccCache.accLock++;
dprintf("CpuDirty entry into device ++accLock= %d\n",AccCache.accLock);
dprintf("CpuDirty entry into device ++accLock= %d",AccCache.accLock);
} else if(AccCache.state==Consistent) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // Consistent + AcceleratorWrite=> AccDirty
else
AccCache.state = Consistent; // Consistent + AccRead => Consistent
AccCache.accLock++;
dprintf("Consistent entry into device ++accLock= %d\n",AccCache.accLock);
dprintf("Consistent entry into device ++accLock= %d",AccCache.accLock);
} else if(AccCache.state==AccDirty) {
if((mode==AcceleratorWrite)||(mode==AcceleratorWriteDiscard))
AccCache.state = AccDirty; // AccDirty + AcceleratorWrite=> AccDirty
else
AccCache.state = AccDirty; // AccDirty + AccRead => AccDirty
AccCache.accLock++;
dprintf("AccDirty entry ++accLock= %d\n",AccCache.accLock);
dprintf("AccDirty entry ++accLock= %d",AccCache.accLock);
} else {
assert(0);
}
@ -341,7 +342,7 @@ uint64_t MemoryManager::AcceleratorViewOpen(uint64_t CpuPtr,size_t bytes,ViewMod
// If view is opened on device must remove from LRU
if(AccCache.LRU_valid==1){
// must possibly remove from LRU as now locked on GPU
dprintf("AccCache entry removed from LRU \n");
dprintf("AccCache entry removed from LRU ");
LRUremove(AccCache);
}
@ -364,10 +365,10 @@ void MemoryManager::AcceleratorViewClose(uint64_t CpuPtr)
AccCache.accLock--;
// Move to LRU queue if not locked and close on device
if(AccCache.accLock==0) {
dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
dprintf("AccleratorViewClose %lx AccLock decremented to %ld move to LRU queue",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
LRUinsert(AccCache);
} else {
dprintf("AccleratorViewClose %lx AccLock decremented to %ld\n",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
dprintf("AccleratorViewClose %lx AccLock decremented to %ld",(uint64_t)CpuPtr,(uint64_t)AccCache.accLock);
}
}
void MemoryManager::CpuViewClose(uint64_t CpuPtr)
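Every string edit in this file drops a trailing \n for the same reason: the mprintf/dprintf macros at the top already append std::endl, so the old format strings printed a blank line after each message. A minimal reproduction of the pitfall, using print_buffer/MAXLINE as defined in the file:

#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << print_buffer << std::endl;
dprintf("Discard(%lx)\n",(uint64_t)ptr);  // old: "\n" plus std::endl -> spurious blank line
dprintf("Discard(%lx)",(uint64_t)ptr);    // new: one newline, from std::endl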

View File

@ -33,6 +33,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
///////////////////////////////////
#include <Grid/communicator/SharedMemory.h>
#define NVLINK_GET
NAMESPACE_BEGIN(Grid);
extern bool Stencil_force_mpi ;
@ -192,6 +194,11 @@ public:
void *recv,
int recv_from_rank,int do_recv,
int xbytes,int rbytes,int dir);
// Could do a PollHtoD and have a CommsMerge dependence
void StencilSendToRecvFromPollDtoH (std::vector<CommsRequest_t> &list);
void StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list);
double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int do_xmit,

View File

@ -30,6 +30,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
NAMESPACE_BEGIN(Grid);
Grid_MPI_Comm CartesianCommunicator::communicator_world;
////////////////////////////////////////////
@ -362,8 +363,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
int bytes)
{
std::vector<MpiCommsRequest_t> reqs(0);
unsigned long xcrc = crc32(0L, Z_NULL, 0);
unsigned long rcrc = crc32(0L, Z_NULL, 0);
int myrank = _processor;
int ierr;
@ -379,9 +378,6 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
communicator,MPI_STATUS_IGNORE);
assert(ierr==0);
// xcrc = crc32(xcrc,(unsigned char *)xmit,bytes);
// rcrc = crc32(rcrc,(unsigned char *)recv,bytes);
// printf("proc %d SendToRecvFrom %d bytes xcrc %lx rcrc %lx\n",_processor,bytes,xcrc,rcrc); fflush
}
// Basic Halo comms primitive
double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
@ -399,6 +395,8 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
#ifdef ACCELERATOR_AWARE_MPI
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int dest,int dox,
@ -561,53 +559,105 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
#undef DEVICE_TO_HOST_CONCURRENT // pipeline
#ifdef DEVICE_TO_HOST_CONCURRENT
tag= dir+_processor*32;
host_xmit = this->HostBufferMalloc(xbytes);
acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
CommsRequest_t srq;
srq.ev = acceleratorCopyFromDeviceAsynch(xmit, host_xmit,xbytes); // Make this Asynch
// ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
// assert(ierr==0);
// off_node_bytes+=xbytes;
CommsRequest_t srq;
srq.PacketType = InterNodeXmit;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = host_xmit;
srq.device_buf = xmit;
srq.tag = tag;
srq.dest = dest;
srq.commdir = commdir;
list.push_back(srq);
#else
tag= dir+_processor*32;
host_xmit = this->HostBufferMalloc(xbytes);
const int chunks=1;
for(int n=0;n<chunks;n++){
void * host_xmitc = (void *)( (uint64_t) host_xmit + n*xbytes/chunks);
void * xmitc = (void *)( (uint64_t) xmit + n*xbytes/chunks);
acceleratorCopyFromDeviceAsynch(xmitc, host_xmitc,xbytes/chunks); // Make this Asynch
}
acceleratorCopySynchronise(); // Complete all pending copy transfers
ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
off_node_bytes+=xbytes;
CommsRequest_t srq;
srq.PacketType = InterNodeXmit;
srq.bytes = xbytes;
srq.req = xrq;
srq.host_buf = host_xmit;
srq.device_buf = xmit;
list.push_back(srq);
#endif
}
}
return off_node_bytes;
}
/*
* In the interest of better pipelining, poll for completion on each DtoH and
* start MPI_ISend in the meantime
*/
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list)
{
int pending = 0;
do {
pending = 0;
for(int idx = 0; idx<list.size();idx++){
if ( list[idx].PacketType==InterNodeRecv ) {
int flag = 0;
MPI_Status status;
int ierr = MPI_Test(&list[idx].req,&flag,&status);
assert(ierr==0);
if ( flag ) {
// std::cout << " PollIrecv "<<idx<<" flag "<<flag<<std::endl;
acceleratorCopyToDeviceAsynch(list[idx].host_buf,list[idx].device_buf,list[idx].bytes);
list[idx].PacketType=InterNodeReceiveHtoD;
} else {
pending ++;
}
}
}
// std::cout << " PollIrecv "<<pending<<" pending requests"<<std::endl;
} while ( pending );
}
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list)
{
int pending = 0;
do {
pending = 0;
for(int idx = 0; idx<list.size();idx++){
if ( list[idx].PacketType==InterNodeXmit ) {
if ( acceleratorEventIsComplete(list[idx].ev) ) {
void *host_xmit = list[idx].host_buf;
uint32_t xbytes = list[idx].bytes;
int dest = list[idx].dest;
int tag = list[idx].tag;
int commdir = list[idx].commdir;
///////////////////
// Send packet
///////////////////
// std::cout << " DtoH is complete for index "<<idx<<" calling MPI_Isend "<<std::endl;
MPI_Request xrq;
int ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list[idx].req = xrq; // Update the MPI request in the list
list[idx].PacketType=InterNodeXmitISend;
} else {
// not done, so return to polling loop
pending++;
}
}
}
} while (pending);
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,
@ -644,69 +694,84 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
* - complete all copies
* - post MPI send asynch
*/
#ifdef NVLINK_GET
if ( dor ) {
// static int printed;
// if((printed<8) && this->IsBoss() ) {
// printf("dir %d doX %d doR %d Face size %ld %ld\n",dir,dox,dor,xbytes,rbytes);
// printed++;
// }
if ( ! ( (gfrom ==MPI_UNDEFINED) || Stencil_force_mpi ) ) {
// Intranode
void *shm = (void *) this->ShmBufferTranslate(from,xmit);
assert(shm!=NULL);
CommsRequest_t srq;
srq.ev = acceleratorCopyDeviceToDeviceAsynch(shm,recv,rbytes);
srq.PacketType = IntraNodeRecv;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = NULL;
srq.device_buf = xmit;
srq.tag = -1;
srq.dest = dest;
srq.commdir = dir;
list.push_back(srq);
}
}
#else
if (dox) {
if ( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) {
#ifdef DEVICE_TO_HOST_CONCURRENT
tag= dir+_processor*32;
// Find the send in the prepared list
int list_idx=-1;
for(int idx = 0; idx<list.size();idx++){
if ( (list[idx].device_buf==xmit)
&&(list[idx].PacketType==InterNodeXmit)
&&(list[idx].bytes==xbytes) ) {
list_idx = idx;
host_xmit = list[idx].host_buf;
}
}
assert(list_idx != -1); // found it
ierr =MPI_Isend(host_xmit, xbytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list[list_idx].req = xrq; // Update the MPI request in the list
off_node_bytes+=xbytes;
#endif
} else {
if ( !( (gdest == MPI_UNDEFINED) || Stencil_force_mpi ) ) {
// Intranode
void *shm = (void *) this->ShmBufferTranslate(dest,recv);
assert(shm!=NULL);
acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
CommsRequest_t srq;
srq.ev = acceleratorCopyDeviceToDeviceAsynch(xmit,shm,xbytes);
srq.PacketType = IntraNodeXmit;
srq.bytes = xbytes;
// srq.req = xrq;
srq.host_buf = NULL;
srq.device_buf = xmit;
srq.tag = -1;
srq.dest = dest;
srq.commdir = dir;
list.push_back(srq);
}
}
#endif
return off_node_bytes;
}
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list,int dir)
{
int nreq=list.size();
// int nreq=list.size();
if (nreq==0) return;
std::vector<MPI_Status> status(nreq);
std::vector<MPI_Request> MpiRequests(nreq);
// if (nreq==0) return;
// std::vector<MPI_Status> status(nreq);
// std::vector<MPI_Request> MpiRequests(nreq);
for(int r=0;r<nreq;r++){
MpiRequests[r] = list[r].req;
}
// for(int r=0;r<nreq;r++){
// MpiRequests[r] = list[r].req;
// }
int ierr = MPI_Waitall(nreq,&MpiRequests[0],&status[0]);
assert(ierr==0);
// int ierr = MPI_Waitall(nreq,&MpiRequests[0],&status[0]); // Sends are guaranteed in order. No harm in not completing.
// assert(ierr==0);
for(int r=0;r<nreq;r++){
if ( list[r].PacketType==InterNodeRecv ) {
acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
}
}
// for(int r=0;r<nreq;r++){
// if ( list[r].PacketType==InterNodeRecv ) {
// acceleratorCopyToDeviceAsynch(list[r].host_buf,list[r].device_buf,list[r].bytes);
// }
// }
acceleratorCopySynchronise(); // Complete all pending copy transfers D2D
acceleratorCopySynchronise(); // Complete all pending copy transfers
list.resize(0); // Delete the list
this->HostBufferFreeAll(); // Clean up the buffer allocs
this->StencilBarrier();
#ifndef NVLINK_GET
this->StencilBarrier(); // if PUT must check our nbrs have filled our receive buffers.
#endif
}
#endif
////////////////////////////////////////////

View File

@ -132,6 +132,8 @@ double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
{
return 2.0*bytes;
}
void CartesianCommunicator::StencilSendToRecvFromPollIRecv(std::vector<CommsRequest_t> &list) {};
void CartesianCommunicator::StencilSendToRecvFromPollDtoH(std::vector<CommsRequest_t> &list) {};
double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequest_t> &list,
void *xmit,
int xmit_to_rank,int dox,
@ -139,7 +141,7 @@ double CartesianCommunicator::StencilSendToRecvFromPrepare(std::vector<CommsRequ
int recv_from_rank,int dor,
int xbytes,int rbytes, int dir)
{
return xbytes+rbytes;
return 0.0;
}
double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
void *xmit,

View File

@ -50,12 +50,30 @@ typedef MPI_Request MpiCommsRequest_t;
#ifdef ACCELERATOR_AWARE_MPI
typedef MPI_Request CommsRequest_t;
#else
enum PacketType_t { InterNodeXmit, InterNodeRecv, IntraNodeXmit, IntraNodeRecv };
/*
* Enable state transitions as each packet flows.
*/
enum PacketType_t {
FaceGather,
InterNodeXmit,
InterNodeRecv,
IntraNodeXmit,
IntraNodeRecv,
InterNodeXmitISend,
InterNodeReceiveHtoD
};
/*
*Package arguments needed for various actions along packet flow
*/
typedef struct {
PacketType_t PacketType;
void *host_buf;
void *device_buf;
int dest;
int tag;
int commdir;
unsigned long bytes;
acceleratorEvent_t ev;
MpiCommsRequest_t req;
} CommsRequest_t;
#endif
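Read against the polling loops in the MPI communicator above, the intended lifecycle of a packet in the non-accelerator-aware path appears to be the following (a reconstruction from the code, not a documented contract):

// Send side (inter-node):
//   FaceGather           -> face gathered into the device send buffer
//   InterNodeXmit        -> DtoH copy issued; srq.ev records the copy event
//   InterNodeXmitISend   -> PollDtoH saw the event complete and posted MPI_Isend
// Receive side (inter-node):
//   InterNodeRecv        -> MPI_Irecv outstanding on the host buffer
//   InterNodeReceiveHtoD -> PollIRecv saw MPI_Test succeed and issued the HtoD copy
// Intranode (shared memory / NVLINK):
//   IntraNodeXmit, IntraNodeRecv -> direct device-to-device copy, no MPI involved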

View File

@ -68,7 +68,7 @@ template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension
if(Cshift_verbose) std::cout << GridLogPerformance << "Cshift took "<< (t1-t0)/1e3 << " ms"<<std::endl;
return ret;
}
#if 1
template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
{
int sshift[2];
@ -125,7 +125,11 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static deviceVector<vobj> send_buf; send_buf.resize(buffer_size);
static deviceVector<vobj> recv_buf; recv_buf.resize(buffer_size);
#ifndef ACCELERATOR_AWARE_MPI
static hostVector<vobj> hsend_buf; hsend_buf.resize(buffer_size);
static hostVector<vobj> hrecv_buf; hrecv_buf.resize(buffer_size);
#endif
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
RealD tcopy=0.0;
@ -156,16 +160,29 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r
// int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
grid->Barrier();
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
#else
// bouncy bouncy
acceleratorCopyFromDevice(&send_buf[0],&hsend_buf[0],bytes);
grid->SendToRecvFrom((void *)&hsend_buf[0],
xmit_to_rank,
(void *)&hrecv_buf[0],
recv_from_rank,
bytes);
acceleratorCopyToDevice(&hrecv_buf[0],&recv_buf[0],bytes);
#endif
xbytes+=bytes;
grid->Barrier();
tcomms+=usecond();
@ -226,12 +243,17 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
static std::vector<deviceVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
#ifndef ACCELERATOR_AWARE_MPI
hostVector<scalar_object> hsend_buf; hsend_buf.resize(buffer_size);
hostVector<scalar_object> hrecv_buf; hrecv_buf.resize(buffer_size);
#endif
int bytes = buffer_size*sizeof(scalar_object);
ExtractPointerArray<scalar_object> pointers(Nsimd); //
@ -283,11 +305,22 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
send_buf_extract_mpi = &send_buf_extract[nbr_lane][0];
recv_buf_extract_mpi = &recv_buf_extract[i][0];
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank,
(void *)recv_buf_extract_mpi,
recv_from_rank,
bytes);
#else
// bouncy bouncy
acceleratorCopyFromDevice((void *)send_buf_extract_mpi,(void *)&hsend_buf[0],bytes);
grid->SendToRecvFrom((void *)&hsend_buf[0],
xmit_to_rank,
(void *)&hrecv_buf[0],
recv_from_rank,
bytes);
acceleratorCopyToDevice((void *)&hrecv_buf[0],(void *)recv_buf_extract_mpi,bytes);
#endif
xbytes+=bytes;
grid->Barrier();
@ -311,234 +344,6 @@ template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
}
}
#else
template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
GridBase *grid=rhs.Grid();
Lattice<vobj> temp(rhs.Grid());
int fd = rhs.Grid()->_fdimensions[dimension];
int rd = rhs.Grid()->_rdimensions[dimension];
int pd = rhs.Grid()->_processors[dimension];
int simd_layout = rhs.Grid()->_simd_layout[dimension];
int comm_dim = rhs.Grid()->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
int buffer_size = rhs.Grid()->_slice_nblock[dimension]*rhs.Grid()->_slice_block[dimension];
static cshiftVector<vobj> send_buf_v; send_buf_v.resize(buffer_size);
static cshiftVector<vobj> recv_buf_v; recv_buf_v.resize(buffer_size);
vobj *send_buf;
vobj *recv_buf;
{
grid->ShmBufferFreeAll();
size_t bytes = buffer_size*sizeof(vobj);
send_buf=(vobj *)grid->ShmBufferMalloc(bytes);
recv_buf=(vobj *)grid->ShmBufferMalloc(bytes);
}
int cb= (cbmask==0x2)? Odd : Even;
int sshift= rhs.Grid()->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
for(int x=0;x<rd;x++){
int sx = (x+sshift)%rd;
int comm_proc = ((x+sshift)/rd)%pd;
if (comm_proc==0) {
tcopy-=usecond();
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
tcopy+=usecond();
} else {
int words = buffer_size;
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
tgather-=usecond();
Gather_plane_simple (rhs,send_buf_v,dimension,sx,cbmask);
tgather+=usecond();
// int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
// grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_v[0],(void *)&send_buf[0],bytes);
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
xbytes+=bytes;
acceleratorCopyDeviceToDevice((void *)&recv_buf[0],(void *)&recv_buf_v[0],bytes);
// grid->Barrier();
tcomms+=usecond();
tscatter-=usecond();
Scatter_plane_simple (ret,recv_buf_v,dimension,x,cbmask);
tscatter+=usecond();
}
}
if(Cshift_verbose){
std::cout << GridLogPerformance << " Cshift copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s "<<2*xbytes<< " Bytes "<<std::endl;
}
}
template<class vobj> void Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
GridBase *grid=rhs.Grid();
const int Nsimd = grid->Nsimd();
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_object scalar_object;
typedef typename vobj::scalar_type scalar_type;
int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
int pd = grid->_processors[dimension];
int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ;
//std::cout << "Cshift_comms_simd dim "<< dimension << " fd "<<fd<<" rd "<<rd
// << " ld "<<ld<<" pd " << pd<<" simd_layout "<<simd_layout
// << " comm_dim " << comm_dim << " cbmask " << cbmask <<std::endl;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
RealD tcopy=0.0;
RealD tgather=0.0;
RealD tscatter=0.0;
RealD tcomms=0.0;
uint64_t xbytes=0;
int permute_type=grid->PermuteType(dimension);
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
// int words = sizeof(vobj)/sizeof(vector_type);
static std::vector<cshiftVector<scalar_object> > send_buf_extract; send_buf_extract.resize(Nsimd);
static std::vector<cshiftVector<scalar_object> > recv_buf_extract; recv_buf_extract.resize(Nsimd);
scalar_object * recv_buf_extract_mpi;
scalar_object * send_buf_extract_mpi;
{
size_t bytes = sizeof(scalar_object)*buffer_size;
grid->ShmBufferFreeAll();
send_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
recv_buf_extract_mpi = (scalar_object *)grid->ShmBufferMalloc(bytes);
}
for(int s=0;s<Nsimd;s++){
send_buf_extract[s].resize(buffer_size);
recv_buf_extract[s].resize(buffer_size);
}
int bytes = buffer_size*sizeof(scalar_object);
ExtractPointerArray<scalar_object> pointers(Nsimd); //
ExtractPointerArray<scalar_object> rpointers(Nsimd); // received pointers
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? Odd : Even;
int sshift= grid->CheckerBoardShiftForCB(rhs.Checkerboard(),dimension,shift,cb);
// loop over outer coord planes orthog to dim
for(int x=0;x<rd;x++){
// FIXME call local permute copy if none are offnode.
for(int i=0;i<Nsimd;i++){
pointers[i] = &send_buf_extract[i][0];
}
tgather-=usecond();
int sx = (x+sshift)%rd;
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
tgather+=usecond();
for(int i=0;i<Nsimd;i++){
int inner_bit = (Nsimd>>(permute_type+1));
int ic= (i&inner_bit)? 1:0;
int my_coor = rd*ic + x;
int nbr_coor = my_coor+sshift;
int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
int nbr_ic = (nbr_coor%ld)/rd; // inner coord of peer
int nbr_ox = (nbr_coor%rd); // outer coord of peer
int nbr_lane = (i&(~inner_bit));
int recv_from_rank;
int xmit_to_rank;
if (nbr_ic) nbr_lane|=inner_bit;
assert (sx == nbr_ox);
if(nbr_proc){
grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);
tcomms-=usecond();
// grid->Barrier();
acceleratorCopyDeviceToDevice((void *)&send_buf_extract[nbr_lane][0],(void *)send_buf_extract_mpi,bytes);
grid->SendToRecvFrom((void *)send_buf_extract_mpi,
xmit_to_rank,
(void *)recv_buf_extract_mpi,
recv_from_rank,
bytes);
acceleratorCopyDeviceToDevice((void *)recv_buf_extract_mpi,(void *)&recv_buf_extract[i][0],bytes);
xbytes+=bytes;
// grid->Barrier();
tcomms+=usecond();
rpointers[i] = &recv_buf_extract[i][0];
} else {
rpointers[i] = &send_buf_extract[nbr_lane][0];
}
}
tscatter-=usecond();
Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
tscatter+=usecond();
}
if(Cshift_verbose){
std::cout << GridLogPerformance << " Cshift (s) copy "<<tcopy/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) gather "<<tgather/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) scatter "<<tscatter/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift (s) comm "<<tcomms/1e3<<" ms"<<std::endl;
std::cout << GridLogPerformance << " Cshift BW "<<(2.0*xbytes)/tcomms<<" MB/s"<<std::endl;
}
}
#endif
NAMESPACE_END(Grid);
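The #ifndef ACCELERATOR_AWARE_MPI branches added in this file all follow one pattern: bounce each message through host buffers so MPI never sees a device pointer. A self-contained sketch of the pattern, with MPI_Sendrecv standing in for grid->SendToRecvFrom and hypothetical buffer names:

// Stage a device-resident face through host memory for non-GPU-aware MPI.
std::vector<char> hsend(bytes), hrecv(bytes);               // host bounce buffers
acceleratorCopyFromDevice(dev_send, hsend.data(), bytes);   // DtoH, blocking
MPI_Sendrecv(hsend.data(), bytes, MPI_CHAR, xmit_to_rank, tag,
             hrecv.data(), bytes, MPI_CHAR, recv_from_rank, tag,
             comm, MPI_STATUS_IGNORE);                      // host pointers only
acceleratorCopyToDevice(hrecv.data(), dev_recv, bytes);     // HtoD, blocking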

View File

@ -466,6 +466,12 @@ public:
static deviceVector<vobj> recv_buf;
send_buf.resize(buffer_size*2*depth);
recv_buf.resize(buffer_size*2*depth);
#ifndef ACCELERATOR_AWARE_MPI
static hostVector<vobj> hsend_buf;
static hostVector<vobj> hrecv_buf;
hsend_buf.resize(buffer_size*2*depth);
hrecv_buf.resize(buffer_size*2*depth);
#endif
std::vector<MpiCommsRequest_t> fwd_req;
std::vector<MpiCommsRequest_t> bwd_req;
@ -495,9 +501,17 @@ public:
t_gather+=usecond()-t;
t=usecond();
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFromBegin(fwd_req,
(void *)&send_buf[d*buffer_size], xmit_to_rank,
(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
#else
acceleratorCopyFromDevice(&send_buf[d*buffer_size],&hsend_buf[d*buffer_size],bytes);
grid->SendToRecvFromBegin(fwd_req,
(void *)&hsend_buf[d*buffer_size], xmit_to_rank,
(void *)&hrecv_buf[d*buffer_size], recv_from_rank, bytes, tag);
acceleratorCopyToDevice(&hrecv_buf[d*buffer_size],&recv_buf[d*buffer_size],bytes);
#endif
t_comms+=usecond()-t;
}
for ( int d=0;d < depth ; d ++ ) {
@ -508,9 +522,17 @@ public:
t_gather+= usecond() - t;
t=usecond();
#ifdef ACCELERATOR_AWARE_MPI
grid->SendToRecvFromBegin(bwd_req,
(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
#else
acceleratorCopyFromDevice(&send_buf[(d+depth)*buffer_size],&hsend_buf[(d+depth)*buffer_size],bytes);
grid->SendToRecvFromBegin(bwd_req,
(void *)&hsend_buf[(d+depth)*buffer_size], recv_from_rank,
(void *)&hrecv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
acceleratorCopyToDevice(&hrecv_buf[(d+depth)*buffer_size],&recv_buf[(d+depth)*buffer_size],bytes);
#endif
t_comms+=usecond()-t;
}

View File

@ -484,6 +484,12 @@ public:
this->face_table_computed=1;
assert(this->u_comm_offset==this->_unified_buffer_size);
accelerator_barrier();
#ifdef NVLINK_GET
#warning "NVLINK_GET"
this->_grid->StencilBarrier(); // He can now get my local gather, I can get his
// Synch shared memory on a single node; could use an asynchronous barrier here and defer the check
// Or issue the barrier AFTER the DMA is running
#endif
}
};

View File

@ -504,7 +504,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
autoView(st_v , st,AcceleratorRead);
if( interior && exterior ) {
acceleratorFenceComputeStream();
// acceleratorFenceComputeStream();
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSite); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
#ifndef GRID_CUDA
@ -517,7 +517,7 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); return;}
#endif
} else if( exterior ) {
// dependent on result of merge
// // dependent on result of merge
acceleratorFenceComputeStream();
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL_EXT(GenericDhopSiteExt); return;}
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL_EXT(HandDhopSiteExt); return;}

View File

@ -363,12 +363,16 @@ public:
////////////////////////////////////////////////////////////////////////
void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
{
// std::cout << "Communicate Begin "<<std::endl;
// _grid->Barrier();
FlightRecorder::StepLog("Communicate begin");
// All GPU kernel tasks must complete
// accelerator_barrier(); // All kernels should ALREADY be complete
// _grid->StencilBarrier(); // Everyone is here, so noone running slow and still using receive buffer
// But the HaloGather had a barrier too.
for(int i=0;i<Packets.size();i++){
// std::cout << "Communicate prepare "<<i<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromPrepare(MpiReqs,
Packets[i].send_buf,
Packets[i].to_rank,Packets[i].do_send,
@ -376,8 +380,15 @@ public:
Packets[i].from_rank,Packets[i].do_recv,
Packets[i].xbytes,Packets[i].rbytes,i);
}
// std::cout << "Communicate PollDtoH "<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromPollDtoH (MpiReqs); /* Starts MPI*/
// std::cout << "Communicate CopySynch "<<std::endl;
// _grid->Barrier();
acceleratorCopySynchronise();
// Starts intranode
for(int i=0;i<Packets.size();i++){
// std::cout << "Communicate Begin "<<i<<std::endl;
_grid->StencilSendToRecvFromBegin(MpiReqs,
Packets[i].send_buf,
Packets[i].to_rank,Packets[i].do_send,
@ -395,7 +406,14 @@ public:
void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
{
// std::cout << "Communicate Complete "<<std::endl;
// _grid->Barrier();
FlightRecorder::StepLog("Start communicate complete");
// std::cout << "Communicate Complete PollIRecv "<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromPollIRecv(MpiReqs);
// std::cout << "Communicate Complete Complete "<<std::endl;
// _grid->Barrier();
_grid->StencilSendToRecvFromComplete(MpiReqs,0); // MPI is done
if ( this->partialDirichlet ) DslashLogPartial();
else if ( this->fullDirichlet ) DslashLogDirichlet();
@ -483,6 +501,9 @@ public:
void HaloGather(const Lattice<vobj> &source,compressor &compress)
{
// accelerator_barrier();
//////////////////////////////////
// I will overwrite my send buffers
//////////////////////////////////
_grid->StencilBarrier();// Synch shared memory on a single node
assert(source.Grid()==_grid);
@ -496,7 +517,12 @@ public:
HaloGatherDir(source,compress,point,face_idx);
}
accelerator_barrier(); // All my local gathers are complete
// _grid->StencilBarrier();// Synch shared memory on a single nodes
#ifdef NVLINK_GET
#warning "NVLINK_GET"
_grid->StencilBarrier(); // He can now get my local gather, I can get his
// Synch shared memory on a single node; could use an asynchronous barrier here and defer the check
// Or issue the barrier AFTER the DMA is running
#endif
face_table_computed=1;
assert(u_comm_offset==_unified_buffer_size);
}
@ -535,6 +561,7 @@ public:
coalescedWrite(to[j] ,coalescedRead(from [j]));
});
acceleratorFenceComputeStream();
// Also fenced in WilsonKernels
}
}
@ -663,7 +690,6 @@ public:
}
}
}
std::cout << "BuildSurfaceList size is "<<surface_list.size()<<std::endl;
surface_list.resize(surface_list_size);
std::vector<int> surface_list_host(surface_list_size);
int32_t ss=0;
@ -683,6 +709,7 @@ public:
}
}
acceleratorCopyToDevice(&surface_list_host[0],&surface_list[0],surface_list_size*sizeof(int));
std::cout << GridLogMessage<<"BuildSurfaceList size is "<<surface_list_size<<std::endl;
}
/// Introduce a block structure and switch off comms on boundaries
void DirichletBlock(const Coordinate &dirichlet_block)
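Taken together, the stencil changes in this file give each halo exchange the phase ordering below (reconstructed from the calls above; the commented-out Barrier() lines were evidently used to debug exactly this sequencing):

// CommunicateBegin:
//   StencilSendToRecvFromPrepare   - issue DtoH copies per packet, record events
//   StencilSendToRecvFromPollDtoH  - post MPI_Isend as each DtoH completes
//   acceleratorCopySynchronise     - drain pending intranode device-to-device copies
//   StencilSendToRecvFromBegin     - start the remaining sends / NVLINK GETs
// CommunicateComplete:
//   StencilSendToRecvFromPollIRecv - issue HtoD copy as each MPI receive lands
//   StencilSendToRecvFromComplete  - synchronise remaining copies, free host buffers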

View File

@ -343,9 +343,26 @@ inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorCopySynchronise(void) { theCopyAccelerator->wait(); }
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes);}
inline void acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); }
inline void acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); }
///////
// Asynch event interface
///////
typedef sycl::event acceleratorEvent_t;
inline void acceleratorEventWait(acceleratorEvent_t ev)
{
ev.wait();
}
inline int acceleratorEventIsComplete(acceleratorEvent_t ev)
{
return (ev.get_info<sycl::info::event::command_execution_status>() == sycl::info::event_command_status::complete);
}
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes);}
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes) { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}
@ -358,8 +375,10 @@ inline int acceleratorIsCommunicable(void *ptr)
else return 0;
#endif
return 1;
}
#endif
//////////////////////////////////////////////
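The event-returning Asynch copies above let a caller overlap a transfer with other work and poll rather than block. A minimal usage sketch, assuming the helpers above are in scope (host_buf, dev_buf and the MPI arguments are placeholders):

acceleratorEvent_t ev = acceleratorCopyFromDeviceAsynch(dev_buf, host_buf, bytes);
do_other_work();                             // copy engine runs concurrently
while (!acceleratorEventIsComplete(ev)) { }  // or block with acceleratorEventWait(ev)
MPI_Isend(host_buf, bytes, MPI_CHAR, dest, tag, comm, &req); // host data now valid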

View File

@ -52,7 +52,7 @@ int main (int argc, char ** argv)
int threads = GridThread::GetThreads();
int Ls=8;
int Ls=16;
for(int i=0;i<argc;i++) {
if(std::string(argv[i]) == "-Ls"){
std::stringstream ss(argv[i+1]); ss >> Ls;

View File

@ -32,15 +32,9 @@ export MPICH_OFI_NIC_POLICY=GPU
# Local vol 16.16.16.32
#
VOL 128.64.128.96
MPI 4.4.4.3
NPROC 192
mpiexec -np 192 -ppn 12 -envall ./gpu_tile.sh ./Benchmark_dwf_fp32 --mpi 4.4.4.3 --grid 128.64.128.96 --shm-mpi 0 --shm 2048 --device-mem 32000 --accelerator-threads 32 --comms-overlap
LX=32
LX=16
LY=16
LZ=32
LZ=16
LT=32
NX=2

View File

@ -19,7 +19,7 @@ export ONEAPI_DEVICE_FILTER=gpu,level_zero
export SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS=0
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:3
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:4
export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY=1
#export SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=0:2
#export SYCL_PI_LEVEL_ZERO_USM_RESIDENT=1

View File

@ -0,0 +1,6 @@
<?xml version="1.0"?>
<grid>
<LanczosParameters>
<mass>-3.5</mass>
</LanczosParameters>
</grid>

View File

@ -0,0 +1,278 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_dwf_lanczos.cc
Copyright (C) 2015
Author: Chulwoo Jung <chulwoo@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
;
typedef WilsonFermionD FermionOp;
typedef typename WilsonFermionD::FermionField FermionField;
RealD AllZero(RealD x) { return 0.; }
namespace Grid {
#if 0
template<typename Field>
class RationalHermOp : public LinearFunction<Field> {
public:
using LinearFunction<Field>::operator();
// OperatorFunction<Field> & _poly;
LinearOperatorBase<Field> &_Linop;
RealD _massDen, _massNum;
FunctionHermOp(LinearOperatorBase<Field>& linop, RealD massDen,RealD massNum)
: _Linop(linop) ,_massDen(massDen),_massNum(massNum) {};
void operator()(const Field& in, Field& out) {
// _poly(_Linop,in,out);
}
};
#endif
template<class Matrix,class Field>
class InvG5LinearOperator : public LinearOperatorBase<Field> {
Matrix &_Mat;
RealD _num;
RealD _Tol;
Integer _MaxIt;
Gamma g5;
public:
InvG5LinearOperator(Matrix &Mat,RealD num): _Mat(Mat),_num(num), _Tol(1e-12),_MaxIt(10000), g5(Gamma::Algebra::Gamma5) {};
// Support for coarsening to a multigrid
void OpDiag (const Field &in, Field &out) {
assert(0);
_Mat.Mdiag(in,out);
}
void OpDir (const Field &in, Field &out,int dir,int disp) {
assert(0);
_Mat.Mdir(in,out,dir,disp);
}
void OpDirAll (const Field &in, std::vector<Field> &out){
assert(0);
_Mat.MdirAll(in,out);
};
void Op (const Field &in, Field &out){
assert(0);
_Mat.M(in,out);
}
void AdjOp (const Field &in, Field &out){
assert(0);
_Mat.Mdag(in,out);
}
void HermOpAndNorm(const Field &in, Field &out,RealD &n1,RealD &n2){
HermOp(in,out);
ComplexD dot = innerProduct(in,out);
n1=real(dot);
n2=norm2(out);
}
void HermOp(const Field &in, Field &out){
Field tmp(in.Grid());
MdagMLinearOperator<Matrix,Field> denom(_Mat);
ConjugateGradient<Field> CG(_Tol,_MaxIt);
_Mat.M(in,tmp);
tmp += _num*in;
_Mat.Mdag(tmp,out);
CG(denom,out,tmp);
out = g5*tmp;
}
};
struct LanczosParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
RealD, mass ,
RealD, resid,
RealD, ChebyLow,
RealD, ChebyHigh,
Integer, ChebyOrder)
// Integer, StartTrajectory,
// Integer, Trajectories, /* @brief Number of sweeps in this run */
// bool, MetropolisTest,
// Integer, NoMetropolisUntil,
// std::string, StartingType,
// Integer, SW,
// RealD, Kappa,
// IntegratorParameters, MD)
LanczosParameters() {
////////////////////////////// Default values
mass = 0;
// MetropolisTest = true;
// NoMetropolisUntil = 10;
// StartTrajectory = 0;
// SW = 2;
// Trajectories = 10;
// StartingType = "HotStart";
/////////////////////////////////
}
template <class ReaderClass >
LanczosParameters(Reader<ReaderClass> & TheReader){
initialize(TheReader);
}
template < class ReaderClass >
void initialize(Reader<ReaderClass> &TheReader){
// std::cout << GridLogMessage << "Reading HMC\n";
read(TheReader, "HMC", *this);
}
void print_parameters() const {
// std::cout << GridLogMessage << "[HMC parameters] Trajectories : " << Trajectories << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Start trajectory : " << StartTrajectory << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs : " << NoMetropolisUntil << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Starting type : " << StartingType << "\n";
// MD.print_parameters();
}
};
}
int main(int argc, char** argv) {
Grid_init(&argc, &argv);
GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(
GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian* UrbGrid =
SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian* FGrid = UGrid;
GridRedBlackCartesian* FrbGrid = UrbGrid;
// printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n", UGrid, UrbGrid, FGrid, FrbGrid);
std::vector<int> seeds4({1, 2, 3, 4});
std::vector<int> seeds5({5, 6, 7, 8});
GridParallelRNG RNG5(FGrid);
RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid);
RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5rb(FrbGrid);
RNG5.SeedFixedIntegers(seeds5);
LatticeGaugeField Umu(UGrid);
// SU<Nc>::HotConfiguration(RNG4, Umu);
FieldMetaData header;
std::string file("./config");
int precision32 = 0;
int tworow = 0;
// NerscIO::writeConfiguration(Umu,file,tworow,precision32);
NerscIO::readConfiguration(Umu,header,file);
/*
std::vector<LatticeColourMatrix> U(4, UGrid);
for (int mu = 0; mu < Nd; mu++) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
}
*/
int Nstop = 5;
int Nk = 10;
int Np = 90;
int Nm = Nk + Np;
int MaxIt = 10000;
RealD resid = 1.0e-5;
RealD mass = -1.0;
LanczosParameters LanParams;
#if 1
{
XmlReader HMCrd("LanParams.xml");
read(HMCrd,"LanczosParameters",LanParams);
}
#else
{
LanParams.mass = mass;
}
#endif
std::cout << GridLogMessage<< LanParams <<std::endl;
{
XmlWriter HMCwr("LanParams.xml.out");
write(HMCwr,"LanczosParameters",LanParams);
}
mass=LanParams.mass;
resid=LanParams.resid;
while ( mass > - 5.0){
FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,2.+mass);
InvG5LinearOperator<FermionOp,LatticeFermion> HermOp(WilsonOperator,-2.); /// <-----
//SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);
// Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
std::vector<double> Coeffs{0, 0, 1.};
Polynomial<FermionField> PolyX(Coeffs);
Chebyshev<FermionField> Cheby(LanParams.ChebyLow,LanParams.ChebyHigh,LanParams.ChebyOrder);
FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
// InvHermOp<FermionField> Op(WilsonOperator,HermOp);
PlainHermOp<FermionField> Op (HermOp);
// PlainHermOp<FermionField> Op2 (HermOp2);
ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op, Nstop, Nk, Nm, resid, MaxIt);
std::vector<RealD> eval(Nm);
FermionField src(FGrid);
gaussian(RNG5, src);
std::vector<FermionField> evec(Nm, FGrid);
for (int i = 0; i < 1; i++) {
std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
<< std::endl;
};
int Nconv;
IRL.calc(eval, evec, src, Nconv);
std::cout << mass <<" : " << eval << std::endl;
Gamma g5(Gamma::Algebra::Gamma5) ;
ComplexD dot;
FermionField tmp(FGrid);
for (int i = 0; i < Nstop ; i++) {
tmp = g5*evec[i];
dot = innerProduct(tmp,evec[i]);
std::cout << mass << " : " << eval[i] << " " << real(dot) << " " << imag(dot) << std::endl ;
}
src = evec[0]+evec[1]+evec[2];
mass += -0.1;
}
Grid_finalize();
}

View File

@ -0,0 +1,211 @@
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Source file: ./tests/Test_dwf_lanczos.cc
Copyright (C) 2015
Author: Chulwoo Jung <chulwoo@bnl.gov>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
See the full license in the file "LICENSE" in the top level distribution
directory
*************************************************************************************/
/* END LEGAL */
#include <Grid/Grid.h>
using namespace std;
using namespace Grid;
;
typedef WilsonFermionD FermionOp;
typedef typename WilsonFermionD::FermionField FermionField;
RealD AllZero(RealD x) { return 0.; }
namespace Grid {
struct LanczosParameters: Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(LanczosParameters,
RealD, mass ,
RealD, ChebyLow,
RealD, ChebyHigh,
Integer, ChebyOrder)
// Integer, StartTrajectory,
// Integer, Trajectories, /* @brief Number of sweeps in this run */
// bool, MetropolisTest,
// Integer, NoMetropolisUntil,
// std::string, StartingType,
// Integer, SW,
// RealD, Kappa,
// IntegratorParameters, MD)
LanczosParameters() {
////////////////////////////// Default values
mass = 0;
// MetropolisTest = true;
// NoMetropolisUntil = 10;
// StartTrajectory = 0;
// SW = 2;
// Trajectories = 10;
// StartingType = "HotStart";
/////////////////////////////////
}
template <class ReaderClass >
LanczosParameters(Reader<ReaderClass> & TheReader){
initialize(TheReader);
}
template < class ReaderClass >
void initialize(Reader<ReaderClass> &TheReader){
// std::cout << GridLogMessage << "Reading HMC\n";
read(TheReader, "HMC", *this);
}
void print_parameters() const {
// std::cout << GridLogMessage << "[HMC parameters] Trajectories : " << Trajectories << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Start trajectory : " << StartTrajectory << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Metropolis test (on/off): " << std::boolalpha << MetropolisTest << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Thermalization trajs : " << NoMetropolisUntil << "\n";
// std::cout << GridLogMessage << "[HMC parameters] Starting type : " << StartingType << "\n";
// MD.print_parameters();
}
};
}
int main(int argc, char** argv) {
Grid_init(&argc, &argv);
GridCartesian* UGrid = SpaceTimeGrid::makeFourDimGrid(
GridDefaultLatt(), GridDefaultSimd(Nd, vComplex::Nsimd()),
GridDefaultMpi());
GridRedBlackCartesian* UrbGrid =
SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
GridCartesian* FGrid = UGrid;
GridRedBlackCartesian* FrbGrid = UrbGrid;
// printf("UGrid=%p UrbGrid=%p FGrid=%p FrbGrid=%p\n", UGrid, UrbGrid, FGrid, FrbGrid);
std::vector<int> seeds4({1, 2, 3, 4});
std::vector<int> seeds5({5, 6, 7, 8});
GridParallelRNG RNG5(FGrid);
RNG5.SeedFixedIntegers(seeds5);
GridParallelRNG RNG4(UGrid);
RNG4.SeedFixedIntegers(seeds4);
GridParallelRNG RNG5rb(FrbGrid);
RNG5.SeedFixedIntegers(seeds5);
LatticeGaugeField Umu(UGrid);
// SU<Nc>::HotConfiguration(RNG4, Umu);
FieldMetaData header;
std::string file("./config");
int precision32 = 0;
int tworow = 0;
// NerscIO::writeConfiguration(Umu,file,tworow,precision32);
NerscIO::readConfiguration(Umu,header,file);
/*
std::vector<LatticeColourMatrix> U(4, UGrid);
for (int mu = 0; mu < Nd; mu++) {
U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
}
*/
int Nstop = 10;
int Nk = 20;
int Np = 80;
int Nm = Nk + Np;
int MaxIt = 10000;
RealD resid = 1.0e-5;
RealD mass = -1.0;
LanczosParameters LanParams;
#if 1
{
XmlReader HMCrd("LanParams.xml");
read(HMCrd,"LanczosParameters",LanParams);
}
#else
{
LanParams.mass = mass;
}
#endif
std::cout << GridLogMessage<< LanParams <<std::endl;
{
XmlWriter HMCwr("LanParams.xml.out");
write(HMCwr,"LanczosParameters",LanParams);
}
mass=LanParams.mass;
while ( mass > - 5.0){
FermionOp WilsonOperator(Umu,*FGrid,*FrbGrid,mass);
MdagMLinearOperator<FermionOp,FermionField> HermOp(WilsonOperator); /// <-----
//SchurDiagTwoOperator<FermionOp,FermionField> HermOp(WilsonOperator);
Gamma5HermitianLinearOperator <FermionOp,LatticeFermion> HermOp2(WilsonOperator); /// <-----
std::vector<double> Coeffs{0, 1.};
Polynomial<FermionField> PolyX(Coeffs);
// Chebyshev<FermionField> Cheby(0.5, 60., 31);
// RealD, ChebyLow,
// RealD, ChebyHigh,
// Integer, ChebyOrder)
Chebyshev<FermionField> Cheby(LanParams.ChebyLow,LanParams.ChebyHigh,LanParams.ChebyOrder);
FunctionHermOp<FermionField> OpCheby(Cheby,HermOp);
PlainHermOp<FermionField> Op (HermOp);
PlainHermOp<FermionField> Op2 (HermOp2);
ImplicitlyRestartedLanczos<FermionField> IRL(OpCheby, Op2, Nstop, Nk, Nm, resid, MaxIt);
std::vector<RealD> eval(Nm);
FermionField src(FGrid);
gaussian(RNG5, src);
std::vector<FermionField> evec(Nm, FGrid);
for (int i = 0; i < 1; i++) {
std::cout << i << " / " << Nm << " grid pointer " << evec[i].Grid()
<< std::endl;
};
int Nconv;
IRL.calc(eval, evec, src, Nconv);
std::cout << mass <<" : " << eval << std::endl;
Gamma g5(Gamma::Algebra::Gamma5) ;
ComplexD dot;
FermionField tmp(FGrid);
for (int i = 0; i < Nstop ; i++) {
tmp = g5*evec[i];
dot = innerProduct(tmp,evec[i]);
std::cout << mass << " : " << eval[i] << " " << real(dot) << " " << imag(dot) << std::endl ;
}
src = evec[0]+evec[1]+evec[2];
mass += -0.1;
}
Grid_finalize();
}