Better messaging

Tests on Frontier with real speed-up: 3.5x on 16^3 at mq=0.01
Normal memory SendToRecvFrom asynchronous for use in general stencil
2025-06-21 17:22:03 +01:00 · 2023-10-20 19:33:04 -04:00 · 2023-10-20 19:27:13 -04:00 · 2023-10-20 19:27:13 -04:00 · 2023-10-20 19:27:13 -04:00 · 2023-10-20 19:27:13 -04:00
13 changed files with 609 additions and 138 deletions
--- a/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h
+++ b/Grid/algorithms/multigrid/GeneralCoarsenedMatrix.h
@ -47,6 +47,7 @@ public:
  typedef Lattice<siteVector>                 CoarseVector;
  typedef Lattice<iMatrix<CComplex,nbasis > > CoarseMatrix;
  typedef iMatrix<CComplex,nbasis >  Cobj;
+  typedef iVector<CComplex,nbasis >  Cvec;
  typedef Lattice< CComplex >   CoarseScalar; // used for inner products on fine field
  typedef Lattice<Fobj >        FineField;
  typedef CoarseVector Field;
@ -62,6 +63,7 @@ public:
  
  std::vector<CoarseMatrix> _A;
  std::vector<CoarseMatrix> _Adag;
+  std::vector<CoarseVector> MultTemporaries;

  ///////////////////////
  // Interface
@ -124,11 +126,8 @@ public:
  }
  void Mult (std::vector<CoarseMatrix> &A,const CoarseVector &in, CoarseVector &out)
  {
-    RealD tviews=0;
-    RealD ttot=0;
-    RealD tmult=0;
-    RealD texch=0;
-    RealD text=0;
+    RealD tviews=0;    RealD ttot=0;    RealD tmult=0;   RealD texch=0;    RealD text=0; RealD ttemps=0; RealD tcopy=0;
+
    ttot=-usecond();
    conformable(CoarseGrid(),in.Grid());
    conformable(in.Grid(),out.Grid());
@ -136,71 +135,102 @@ public:
    CoarseVector tin=in;

    texch-=usecond();
-    CoarseVector pin  = Cell.Exchange(tin);
+    CoarseVector pin = Cell.ExchangePeriodic(tin);
    texch+=usecond();

-    CoarseVector pout(pin.Grid()); pout=Zero();
+    CoarseVector pout(pin.Grid());

    int npoint = geom.npoint;
    typedef LatticeView<Cobj> Aview;
+    typedef LatticeView<Cvec> Vview;
      
    const int Nsimd = CComplex::Nsimd();
    
-    int osites=pin.Grid()->oSites();
-    //    int gsites=pin.Grid()->gSites();
+    int64_t osites=pin.Grid()->oSites();

-    RealD flops = 1.0* npoint * nbasis * nbasis * 8 * osites;
-    RealD bytes = (1.0*osites*sizeof(siteMatrix)*npoint+2.0*osites*sizeof(siteVector))*npoint;
-      
-    //    for(int point=0;point<npoint;point++){
-    //      conformable(A[point],pin);
-    //    }
+    RealD flops = 1.0* npoint * nbasis * nbasis * 8.0 * osites * CComplex::Nsimd();
+    RealD bytes = 1.0*osites*sizeof(siteMatrix)*npoint
+                + 2.0*osites*sizeof(siteVector)*npoint;
      
    {
      tviews-=usecond();
      autoView( in_v , pin, AcceleratorRead);
-      autoView( out_v , pout, AcceleratorWrite);
+      autoView( out_v , pout, AcceleratorWriteDiscard);
      autoView( Stencil_v  , Stencil, AcceleratorRead);
      tviews+=usecond();

-      for(int point=0;point<npoint;point++){
+      // Static and prereserve to keep UVM region live and not resized across multiple calls
+      ttemps-=usecond();
+      MultTemporaries.resize(npoint,pin.Grid());       
+      ttemps+=usecond();
+      std::vector<Aview> AcceleratorViewContainer_h;
+      std::vector<Vview> AcceleratorVecViewContainer_h; 
+
      tviews-=usecond();
-	autoView( A_v, A[point],AcceleratorRead);
+      for(int p=0;p<npoint;p++) {
+	AcceleratorViewContainer_h.push_back(      A[p].View(AcceleratorRead));
+	AcceleratorVecViewContainer_h.push_back(MultTemporaries[p].View(AcceleratorWrite));
+      }
      tviews+=usecond();
+
+      static deviceVector<Aview> AcceleratorViewContainer; AcceleratorViewContainer.resize(npoint);
+      static deviceVector<Vview> AcceleratorVecViewContainer; AcceleratorVecViewContainer.resize(npoint); 
+      
+      auto Aview_p = &AcceleratorViewContainer[0];
+      auto Vview_p = &AcceleratorVecViewContainer[0];
+      tcopy-=usecond();
+      acceleratorCopyToDevice(&AcceleratorViewContainer_h[0],&AcceleratorViewContainer[0],npoint *sizeof(Aview));
+      acceleratorCopyToDevice(&AcceleratorVecViewContainer_h[0],&AcceleratorVecViewContainer[0],npoint *sizeof(Vview));
+      tcopy+=usecond();
+
      tmult-=usecond();
-	accelerator_for(sss, osites*nbasis, Nsimd, {
-
-	    typedef decltype(coalescedRead(in_v[0]))    calcVector;
-
-	    int ss = sss/nbasis;
-	    int b  = sss%nbasis;
-
+      accelerator_for(spb, osites*nbasis*npoint, Nsimd, {
+	  typedef decltype(coalescedRead(in_v[0](0))) calcComplex;
+	  int32_t ss   = spb/(nbasis*npoint);
+	  int32_t bp   = spb%(nbasis*npoint);
+	  int32_t b    = bp/npoint;
+	  int32_t point= bp%npoint;
 	  auto SE  = Stencil_v.GetEntry(point,ss);
 	  auto nbr = coalescedReadGeneralPermute(in_v[SE->_offset],SE->_permute,Nd);
-	    auto res = out_v(ss)(b);
-	    for(int bb=0;bb<nbasis;bb++) {
-	      res = res + coalescedRead(A_v[ss](b,bb))*nbr(bb);
+	  auto res = coalescedRead(Aview_p[point][ss](b,0))*nbr(0);
+	  for(int bb=1;bb<nbasis;bb++) {
+	    res = res + coalescedRead(Aview_p[point][ss](b,bb))*nbr(bb);
+	  }
+	  coalescedWrite(Vview_p[point][ss](b),res);
+      });
+      accelerator_for(sb, osites*nbasis, Nsimd, {
+	  int ss = sb/nbasis;
+	  int b  = sb%nbasis;
+	  auto res = coalescedRead(Vview_p[0][ss](b));
+	  for(int point=1;point<npoint;point++){
+	    res = res + coalescedRead(Vview_p[point][ss](b));
 	  }
 	  coalescedWrite(out_v[ss](b),res);
      });
-
      tmult+=usecond();
+      for(int p=0;p<npoint;p++) {
+	AcceleratorViewContainer_h[p].ViewClose();
+	AcceleratorVecViewContainer_h[p].ViewClose();
      }
    }
+
    text-=usecond();
    out = Cell.Extract(pout);
    text+=usecond();
    ttot+=usecond();
    
-    std::cout << GridLogDebug<<"Coarse Mult Aviews "<<tviews<<" us"<<std::endl;
-    std::cout << GridLogDebug<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
-    std::cout << GridLogDebug<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
-    std::cout << GridLogDebug<<"Coarse Mult ext  "<<text<<" us"<<std::endl;
-    std::cout << GridLogDebug<<"Coarse Mult tot  "<<ttot<<" us"<<std::endl;
-    std::cout << GridLogDebug<<"Coarse Kernel flop/s "<< flops/tmult<<" mflop/s"<<std::endl;
-    std::cout << GridLogDebug<<"Coarse Kernel bytes/s"<< bytes/tmult<<" MB/s"<<std::endl;
-    std::cout << GridLogDebug<<"Coarse overall flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
-    std::cout << GridLogDebug<<"Coarse total bytes   "<< bytes/1e6<<" MB"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Mult Aviews "<<tviews<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Mult exch "<<texch<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Mult mult "<<tmult<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Mult ext  "<<text<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Mult temps "<<ttemps<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Mult copy  "<<tcopy<<" us"<<std::endl;
+    std::cout << GridLogPerformance<<"Coarse Mult tot  "<<ttot<<" us"<<std::endl;
+    //    std::cout << GridLogPerformance<<std::endl;
+    //    std::cout << GridLogPerformance<<"Coarse Kernel flop/s "<< flops/tmult<<" mflop/s"<<std::endl;
+    //    std::cout << GridLogPerformance<<"Coarse Kernel bytes/s"<< bytes/tmult<<" MB/s"<<std::endl;
+    //    std::cout << GridLogPerformance<<"Coarse overall flops/s "<< flops/ttot<<" mflop/s"<<std::endl;
+    //    std::cout << GridLogPerformance<<"Coarse total bytes   "<< bytes/1e6<<" MB"<<std::endl;

  };

--- a/Grid/allocator/MemoryManager.h
+++ b/Grid/allocator/MemoryManager.h
@ -209,9 +209,9 @@ private:
  static void     CpuViewClose(uint64_t Ptr);
  static uint64_t CpuViewOpen(uint64_t  CpuPtr,size_t bytes,ViewMode mode,ViewAdvise hint);
 #endif
-  static void NotifyDeletion(void * CpuPtr);

 public:
+  static void NotifyDeletion(void * CpuPtr);
  static void Print(void);
  static void PrintAll(void);
  static void PrintState( void* CpuPtr);
--- a/Grid/allocator/MemoryManagerCache.cc
+++ b/Grid/allocator/MemoryManagerCache.cc
@ -8,7 +8,7 @@ NAMESPACE_BEGIN(Grid);
 static char print_buffer [ MAXLINE ];

 #define mprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
-#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogMemory << print_buffer;
+#define dprintf(...) snprintf (print_buffer,MAXLINE, __VA_ARGS__ ); std::cout << GridLogDebug << print_buffer;
 //#define dprintf(...) 


@ -111,7 +111,7 @@ void MemoryManager::AccDiscard(AcceleratorViewEntry &AccCache)
  ///////////////////////////////////////////////////////////
  assert(AccCache.state!=Empty);
  
-  mprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
+  dprintf("MemoryManager: Discard(%lx) %lx\n",(uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr); 
  assert(AccCache.accLock==0);
  assert(AccCache.cpuLock==0);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
@ -141,7 +141,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
  ///////////////////////////////////////////////////////////////////////////
  assert(AccCache.state!=Empty);
  
-  mprintf("MemoryManager: Evict cpu %lx acc %lx cpuLock %ld accLock %ld\n",
+  mprintf("MemoryManager: Evict CpuPtr %lx AccPtr %lx cpuLock %ld accLock %ld\n",
 	  (uint64_t)AccCache.CpuPtr,(uint64_t)AccCache.AccPtr,
 	  (uint64_t)AccCache.cpuLock,(uint64_t)AccCache.accLock); 
  if (AccCache.accLock!=0) return;
@ -155,7 +155,7 @@ void MemoryManager::Evict(AcceleratorViewEntry &AccCache)
    AccCache.AccPtr=(uint64_t)NULL;
    AccCache.state=CpuDirty; // CPU primary now
    DeviceBytes   -=AccCache.bytes;
-    dprintf("MemoryManager: Free(%lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);  
+    dprintf("MemoryManager: Free(AccPtr %lx) footprint now %ld \n",(uint64_t)AccCache.AccPtr,DeviceBytes);  
  }
  //  uint64_t CpuPtr = AccCache.CpuPtr;
  DeviceEvictions++;
@ -169,7 +169,7 @@ void MemoryManager::Flush(AcceleratorViewEntry &AccCache)
  assert(AccCache.AccPtr!=(uint64_t)NULL);
  assert(AccCache.CpuPtr!=(uint64_t)NULL);
  acceleratorCopyFromDevice((void *)AccCache.AccPtr,(void *)AccCache.CpuPtr,AccCache.bytes);
-  mprintf("MemoryManager: Flush  %lx -> %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: acceleratorCopyFromDevice Flush AccPtr %lx -> CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
  DeviceToHostBytes+=AccCache.bytes;
  DeviceToHostXfer++;
  AccCache.state=Consistent;
@ -184,7 +184,7 @@ void MemoryManager::Clone(AcceleratorViewEntry &AccCache)
    AccCache.AccPtr=(uint64_t)AcceleratorAllocate(AccCache.bytes);
    DeviceBytes+=AccCache.bytes;
  }
-  mprintf("MemoryManager: Clone %lx <- %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
+  mprintf("MemoryManager: acceleratorCopyToDevice   Clone AccPtr %lx <- CpuPtr %lx\n",(uint64_t)AccCache.AccPtr,(uint64_t)AccCache.CpuPtr); fflush(stdout);
  acceleratorCopyToDevice((void *)AccCache.CpuPtr,(void *)AccCache.AccPtr,AccCache.bytes);
  HostToDeviceBytes+=AccCache.bytes;
  HostToDeviceXfer++;
--- a/Grid/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@ -138,6 +138,14 @@ public:
  ////////////////////////////////////////////////////////////
  // Face exchange, buffer swap in translational invariant way
  ////////////////////////////////////////////////////////////
+  void CommsComplete(std::vector<CommsRequest_t> &list);
+  void SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+			   void *xmit,
+			   int dest,
+			   void *recv,
+			   int from,
+			   int bytes,int dir);
+  
  void SendToRecvFrom(void *xmit,
 		      int xmit_to_rank,
 		      void *recv,
--- a/Grid/communicator/Communicator_mpi3.cc
+++ b/Grid/communicator/Communicator_mpi3.cc
@ -306,6 +306,44 @@ void CartesianCommunicator::GlobalSumVector(double *d,int N)
  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
 }
+
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list, // Posts one non-blocking receive and one non-blocking send; both requests are appended to list
+						void *xmit,   // send buffer, bytes long
+						int dest,     // rank the data is sent to
+						void *recv,   // receive buffer, bytes long
+						int from,     // rank the data is received from
+						int bytes,int dir) // message size in bytes; dir is folded into the MPI tag
+{
+  MPI_Request xrq; // send request
+  MPI_Request rrq; // receive request
+
+  assert(dest != _processor); // sending to _processor (presumably this rank) is not handled here
+  assert(from != _processor); // likewise for receiving
+
+  int tag;
+
+  tag= dir+from*32; // receive tag built from the sender's rank; mirrors the send tag below
+  int ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator,&rrq);
+  assert(ierr==0);
+  list.push_back(rrq);
+  
+  tag= dir+_processor*32; // NOTE(review): callers may pass dir >= 32 and the tag grows with rank; confirm it stays below MPI_TAG_UB
+  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator,&xrq);
+  assert(ierr==0);
+  list.push_back(xrq); // the transfer is only complete once CommsComplete(list) has been called
+}
+void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list) // Waits for every request in list, then empties the list
+{
+  int nreq=list.size();
+
+  if (nreq==0) return; // nothing outstanding; also avoids taking &list[0] of an empty vector
+
+  std::vector<MPI_Status> status(nreq);
+  int ierr = MPI_Waitall(nreq,&list[0],&status[0]); // blocks until all posted sends and receives have finished
+  assert(ierr==0);
+  list.resize(0); // leave the list ready for reuse
+}
+
 // Basic Halo comms primitive
 void CartesianCommunicator::SendToRecvFrom(void *xmit,
 					   int dest,
--- a/Grid/communicator/Communicator_none.cc
+++ b/Grid/communicator/Communicator_none.cc
@ -91,6 +91,17 @@ void CartesianCommunicator::SendToRecvFrom(void *xmit,
 {
  assert(0);
 }
+void CartesianCommunicator::CommsComplete(std::vector<CommsRequest_t> &list){ assert(0);} // stub in the "none" communicator: not expected to be called, asserts unconditionally
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list, // stub in the "none" communicator; all arguments are ignored
+						void *xmit,
+						int dest,
+						void *recv,
+						int from,
+						int bytes,int dir)
+{
+  assert(0); // not expected to be called in this build; asserts unconditionally
+}
+
 void CartesianCommunicator::AllToAll(int dim,void  *in,void *out,uint64_t words,uint64_t bytes)
 {
  bcopy(in,out,bytes*words);
--- a/Grid/lattice/Lattice_crc.h
+++ b/Grid/lattice/Lattice_crc.h
@ -29,7 +29,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 NAMESPACE_BEGIN(Grid);

-template<class vobj> void DumpSliceNorm(std::string s,Lattice<vobj> &f,int mu=-1)
+template<class vobj> void DumpSliceNorm(std::string s,const Lattice<vobj> &f,int mu=-1)
 {
  auto ff = localNorm2(f);
  if ( mu==-1 ) mu = f.Grid()->Nd()-1;
--- a/Grid/lattice/PaddedCell.h
+++ b/Grid/lattice/PaddedCell.h
@ -45,6 +45,170 @@ struct CshiftImplGauge: public CshiftImplBase<typename Gimpl::GaugeLinkField::ve
  typename Gimpl::GaugeLinkField Cshift(const typename Gimpl::GaugeLinkField &in, int dir, int shift) const override{ return Gimpl::CshiftLink(in,dir,shift); }
 };  

+
+/*
+ *
+ * TODO: 
+ *  -- address elements of vobj via thread block in Scatter/Gather
+ *  -- overlap comms with motion in Face_exchange
+ *
+ */
+
+template<class vobj> inline void ScatterSlice(const cshiftVector<vobj> &buf, // Unpacks one face held in buf into the slice at coordinate x (along dim) of lat
+					      Lattice<vobj> &lat,  // destination lattice; only the addressed slice is written
+					      int x,               // local coordinate of the target slice along dim
+					      int dim,             // dimension the slice is orthogonal to
+					      int offset=0)        // index of the first element of buf to read
+{
+  const int Nsimd=vobj::Nsimd();
+  typedef typename vobj::scalar_object sobj;
+
+  GridBase *grid = lat.Grid();
+  Coordinate simd = grid->_simd_layout;
+  int Nd          = grid->Nd();
+  int block       = grid->_slice_block[dim];
+  int stride      = grid->_slice_stride[dim];
+  int nblock      = grid->_slice_nblock[dim];
+  int rd          = grid->_rdimensions[dim];
+
+  int ox = x%rd; // outer (reduced-grid) coordinate of the slice
+  int ix = x/rd; // SIMD-lane coordinate of the slice along dim
+
+  int isites = 1; for(int d=0;d<Nd;d++) if( d!=dim) isites*=simd[d]; // NOTE(review): isites is not used below
+
+  Coordinate rsimd= simd;  rsimd[dim]=1; // maybe reduce Nsimd
+
+  int rNsimd = 1; for(int d=0;d<Nd;d++) rNsimd*=rsimd[d]; // lane count with the dim direction removed
+  int rNsimda= Nsimd/simd[dim]; // should be equal
+  assert(rNsimda==rNsimd);
+  int face_ovol=block*nblock; // outer sites in one slice
+
+  //  assert(buf.size()==face_ovol*rNsimd);
+
+  /*This will work GPU ONLY unless rNsimd is put in the lexico index*/
+  //Let's make it work on GPU and then make a special accelerator_for that
+  //doesn't hide the SIMD direction and keeps explicit in the threadIdx
+  //for cross platform
+  // FIXME -- can put internal indices into thread loop
+  auto buf_p = & buf[0];
+  autoView(lat_v, lat, AcceleratorWrite); // fix: the kernel modifies lat via insertLane, so a write view (not AcceleratorRead) is required; not a discard view because the other slices must be kept
+  accelerator_for(ss, face_ovol/simd[dim],Nsimd,{
+
+    // scalar layout won't coalesce
+    int blane=acceleratorSIMTlane(Nsimd); // buffer lane
+    int olane=blane%rNsimd;               // reduced lattice lane
+    int obit =blane/rNsimd;
+
+    ///////////////////////////////////////////////////////////////
+    // osite -- potentially one bit from simd in the buffer: (ss<<1)|obit
+    ///////////////////////////////////////////////////////////////
+    int ssp = ss*simd[dim]+obit; // index of the outer site within the face
+    int b    = ssp%block;
+    int n    = ssp/block;
+    int osite= b+n*stride + ox*block;
+
+    ////////////////////////////////////////////
+    // isite -- map lane within buffer to lane within lattice
+    ////////////////////////////////////////////
+    Coordinate icoor;
+    int lane;
+    Lexicographic::CoorFromIndex(icoor,olane,rsimd);
+    icoor[dim]=ix; // place the lane at the slice's SIMD coordinate along dim
+    Lexicographic::IndexFromCoor(icoor,lane,simd);
+
+    ///////////////////////////////////////////
+    // Transfer into lattice - will coalesce
+    ///////////////////////////////////////////
+    sobj obj = extractLane(blane,buf_p[ss+offset]);
+    insertLane(lane,lat_v[osite],obj);
+  });
+}
+
+template<class vobj> inline void GatherSlice(cshiftVector<vobj> &buf, // Packs the slice at coordinate x (along dim) of lat into buf; inverse mapping of ScatterSlice
+					     const Lattice<vobj> &lat, // source lattice, only read
+					     int x,                    // local coordinate of the source slice along dim
+					     int dim,                  // dimension the slice is orthogonal to
+					     int offset=0)             // index of the first element of buf to write
+{
+  const int Nsimd=vobj::Nsimd();
+  typedef typename vobj::scalar_object sobj;
+
+  autoView(lat_v, lat, AcceleratorRead); // lattice is only read by the kernel below
+
+  GridBase *grid = lat.Grid();
+  Coordinate simd = grid->_simd_layout;
+  int Nd          = grid->Nd();
+  int block       = grid->_slice_block[dim];
+  int stride      = grid->_slice_stride[dim];
+  int nblock      = grid->_slice_nblock[dim];
+  int rd          = grid->_rdimensions[dim];
+
+  int ox = x%rd; // outer (reduced-grid) coordinate of the slice
+  int ix = x/rd; // SIMD-lane coordinate of the slice along dim
+
+  int isites = 1; for(int d=0;d<Nd;d++) if( d!=dim) isites*=simd[d]; // NOTE(review): isites is not used below
+
+  Coordinate rsimd= simd;  rsimd[dim]=1; // maybe reduce Nsimd
+
+  int rNsimd = 1; for(int d=0;d<Nd;d++) rNsimd*=rsimd[d]; // lane count with the dim direction removed
+  
+  int face_ovol=block*nblock; // outer sites in one slice
+
+  //  assert(buf.size()==face_ovol*rNsimd);
+
+  /*This will work GPU ONLY unless rNsimd is put in the lexico index*/
+  //Let's make it work on GPU and then make a special accelerator_for that
+  //doesn't hide the SIMD direction and keeps explicit in the threadIdx
+  //for cross platform
+  //For CPU perhaps just run a loop over Nsimd
+  auto buf_p = & buf[0];
+  accelerator_for(ss, face_ovol/simd[dim],Nsimd,{
+
+    // scalar layout won't coalesce
+    int blane=acceleratorSIMTlane(Nsimd); // buffer lane
+    int olane=blane%rNsimd;               // reduced lattice lane
+    int obit =blane/rNsimd;
+
+    ////////////////////////////////////////////
+    // osite
+    ////////////////////////////////////////////
+    int ssp = ss*simd[dim]+obit; // index of the outer site within the face
+    int b    = ssp%block;
+    int n    = ssp/block;
+    int osite= b+n*stride + ox*block;
+
+    ////////////////////////////////////////////
+    // isite -- map lane within buffer to lane within lattice
+    ////////////////////////////////////////////
+    Coordinate icoor;
+    int lane;
+    Lexicographic::CoorFromIndex(icoor,olane,rsimd);
+    icoor[dim]=ix; // select the slice's SIMD coordinate along dim
+    Lexicographic::IndexFromCoor(icoor,lane,simd);
+
+    ///////////////////////////////////////////
+    // Take out of lattice
+    ///////////////////////////////////////////
+
+    sobj obj = extractLane(lane,lat_v[osite]);
+    insertLane(blane,buf_p[ss+offset],obj);
+
+  });
+  /*
+  Disabled debug check: copies the gathered words back from the device and prints their norm.
+  int words =block*nblock/simd[dim];
+  std::vector<vobj> tbuf(words);
+  acceleratorCopyFromDevice((void *)&buf[offset],(void *)&tbuf[0],words*sizeof(vobj));
+  typedef typename vobj::scalar_type scalar;
+  scalar *sbuf = (scalar *)&tbuf[0];
+  scalar tmp=0.0;
+  for(int w=0;w<words*sizeof(vobj)/sizeof(scalar);w++){
+    tmp=tmp+conjugate(sbuf[w])*sbuf[w];
+  }
+  std::cout << " Gathered buffer norm "<<tmp<<std::endl;
+  */
+}
+
+
 class PaddedCell {
 public:
  GridCartesian * unpadded_grid;
@ -82,19 +246,20 @@ public:
    Coordinate processors=unpadded_grid->_processors;
    Coordinate plocal    =unpadded_grid->LocalDimensions();
    Coordinate global(dims);
-
+    GridCartesian *old_grid = unpadded_grid;
    // expand up one dim at a time
    for(int d=0;d<dims;d++){

      if ( processors[d] > 1 ) { 
 	plocal[d] += 2*depth; 
-      }
      
 	for(int d=0;d<dims;d++){
 	  global[d] = plocal[d]*processors[d];
 	}

-      grids.push_back(new GridCartesian(global,simd,processors));
+	old_grid = new GridCartesian(global,simd,processors);
+      }
+      grids.push_back(old_grid);
    }
  };
  template<class vobj>
@ -125,6 +290,17 @@ public:
    }
    return tmp;
  }
+  template<class vobj> // Pads a field in every dimension in turn by calling ExpandPeriodic; returns the padded copy
+  inline Lattice<vobj> ExchangePeriodic(const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
+  {
+    GridBase *old_grid = in.Grid(); // used here only to obtain the number of dimensions
+    int dims = old_grid->Nd();
+    Lattice<vobj> tmp = in; // running result; moves to the next grid after each dimension
+    for(int d=0;d<dims;d++){
+      tmp = ExpandPeriodic(d,tmp,cshift); // rvalue && assignment
+    }
+    return tmp;
+  }
  // expand up one dim at a time
  template<class vobj>
  inline Lattice<vobj> Expand(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
@ -148,13 +324,22 @@ public:

    if ( islocal ) {

+      // replace with a copy and maybe grid swizzle
      double t = usecond();
-      for(int x=0;x<local[dim];x++){
-	InsertSliceLocal(in,padded,x,x,dim);
-      }
+      padded = in;
      tins += usecond() - t;
      
    } else {
+
+      //////////////////////////////////////////////
+      // Replace sequence with
+      // ---------------------
+      // (i) Gather high face(s); start comms
+      // (ii) Gather low  face(s); start comms
+      // (iii) Copy middle bit with localCopyRegion
+      // (iv) Complete high face(s), insert slice(s)
+      // (v) Complete low  face(s), insert slice(s)
+      //////////////////////////////////////////////
      // Middle bit
      double t = usecond();
      for(int x=0;x<local[dim];x++){
@ -183,14 +368,198 @@ public:
 	InsertSliceLocal(shifted,padded,x,x,dim);
      }
      tins += usecond() - t;
+
    }
-    std::cout << GridLogDebug << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl;
+    std::cout << GridLogPerformance << "PaddedCell::Expand timings: cshift:" << tshift/1000 << "ms, insert-slice:" << tins/1000 << "ms" << std::endl;
    
    return padded;
  }

+  template<class vobj> // Pads a field along one dimension: plain copy if that dimension is not split across ranks, Face_exchange otherwise
+  inline Lattice<vobj> ExpandPeriodic(int dim, const Lattice<vobj> &in, const CshiftImplBase<vobj> &cshift = CshiftImplDefault<vobj>()) const
+  {
+    Coordinate processors=unpadded_grid->_processors;
+    GridBase *old_grid = in.Grid();
+    GridCartesian *new_grid = grids[dim];//These are new grids
+    Lattice<vobj>  padded(new_grid); // result, defined on the grid for this expansion step
+    Lattice<vobj> shifted(old_grid);    // NOTE(review): not used in this function
+    Coordinate local     =old_grid->LocalDimensions();
+    Coordinate plocal    =new_grid->LocalDimensions();
+    if(dim==0) conformable(old_grid,unpadded_grid); // input must live on the grid of the previous step
+    else       conformable(old_grid,grids[dim-1]);
+
+    //    std::cout << " dim "<<dim<<" local "<<local << " padding to "<<plocal<<std::endl;
+    double tins=0, tshift=0; // NOTE(review): tins is accumulated but never reported; tshift is unused
+
+    int islocal = 0 ;
+    if ( processors[dim] == 1 ) islocal = 1; // a single rank in this dimension: no comms needed
+
+    if ( islocal ) {
+
+      // replace with a copy and maybe grid swizzle
+      double t = usecond();
+      padded = in; // presumably grids[dim] has the same extent as the input grid in this case -- confirm against the constructor
+      tins += usecond() - t;
+      
+    } else {
+
+      //////////////////////////////////////////////
+      // Replace sequence with
+      // ---------------------
+      // (i) Gather high face(s); start comms
+      // (ii) Gather low  face(s); start comms
+      // (iii) Copy middle bit with localCopyRegion
+      // (iv) Complete high face(s), insert slice(s)
+      // (v) Complete low  face(s), insert slice(s)
+      //////////////////////////////////////////////
+      Face_exchange(in,padded,dim,depth); // cshift is not used on this path
+    }
+    return padded;
+  }
+  template<class vobj> // Fills "to" with the interior of "from" plus depth halo planes on each side of dimension, exchanged with the neighbouring ranks
+  void Face_exchange(const Lattice<vobj> &from,
+		     Lattice<vobj> &to,
+		     int dimension,int depth) const // this depth parameter shadows the class member of the same name
+  {
+    typedef typename vobj::vector_type vector_type;
+    typedef typename vobj::scalar_type scalar_type;
+    typedef typename vobj::scalar_object sobj;
+
+    RealD t_gather=0.0; // accumulated timings in microseconds, reported at the end
+    RealD t_scatter=0.0;
+    RealD t_comms=0.0;
+    RealD t_copy=0.0;
+    
+    //    std::cout << GridLogMessage << "dimension " <<dimension<<std::endl;
+    //    DumpSliceNorm(std::string("Face_exchange from"),from,dimension);
+    GridBase *grid=from.Grid();
+    GridBase *new_grid=to.Grid();
+
+    Coordinate lds = from.Grid()->_ldimensions;
+    Coordinate nlds=   to.Grid()->_ldimensions; // NOTE(review): nlds is not used below
+    Coordinate simd= from.Grid()->_simd_layout;
+    int ld    = lds[dimension]; // local extent of the source along dimension
+    int nld   = to.Grid()->_ldimensions[dimension]; // local extent of the destination along dimension
+    const int Nsimd = vobj::Nsimd();
+
+    assert(depth<=lds[dimension]); // A must be on neighbouring node
+    assert(depth>0);   // A caller bug if zero
+    assert(ld+2*depth==nld); // destination must be exactly depth planes larger on each side
+    ////////////////////////////////////////////////////////////////////////////
+    // Face size and byte calculations
+    ////////////////////////////////////////////////////////////////////////////
+    int buffer_size = 1;
+    for(int d=0;d<lds.size();d++){
+      if ( d!= dimension) buffer_size=buffer_size*lds[d];
+    }
+    buffer_size = buffer_size  / Nsimd; // vobj elements per face plane
+    int rNsimd = Nsimd / simd[dimension]; // NOTE(review): rNsimd is not used below
+    assert( buffer_size == from.Grid()->_slice_nblock[dimension]*from.Grid()->_slice_block[dimension] / simd[dimension]);
+
+    static cshiftVector<vobj> send_buf; // static: storage is kept between calls and only resized
+    static cshiftVector<vobj> recv_buf;
+    send_buf.resize(buffer_size*2*depth);    // depth planes for each of the two faces
+    recv_buf.resize(buffer_size*2*depth);
+
+    std::vector<CommsRequest_t> fwd_req;   // requests for the planes gathered from x=0..depth-1
+    std::vector<CommsRequest_t> bwd_req;   // requests for the planes gathered from x=ld-depth..ld-1
+
+    int words = buffer_size;
+    int bytes = words * sizeof(vobj); // bytes per plane; NOTE(review): narrowed to int, confirm it cannot overflow for large faces
+
+    ////////////////////////////////////////////////////////////////////////////
+    // Communication coords
+    ////////////////////////////////////////////////////////////////////////////
+    int comm_proc = 1;
+    int xmit_to_rank;
+    int recv_from_rank;
+    grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
+
+    ////////////////////////////////////////////////////////////////////////////
+    // Gather all surface terms up to depth "d"
+    ////////////////////////////////////////////////////////////////////////////
+    RealD t;
+    int plane=0; // running plane index into send_buf
+    for ( int d=0;d < depth ; d ++ ) {
+      int tag = d*1024 + dimension*2+0; // passed as the "dir" argument of SendToRecvFromBegin
+
+      t=usecond();
+      GatherSlice(send_buf,from,d,dimension,plane*buffer_size); plane++;
+      t_gather+=usecond()-t;
+
+      t=usecond();
+      grid->SendToRecvFromBegin(fwd_req,
+				(void *)&send_buf[d*buffer_size], xmit_to_rank,
+				(void *)&recv_buf[d*buffer_size], recv_from_rank, bytes, tag);
+      t_comms+=usecond()-t;
+     }
+    for ( int d=0;d < depth ; d ++ ) {
+      int tag = d*1024 + dimension*2+1; // +1 keeps these tags distinct from those of the first loop
+
+      t=usecond();
+      GatherSlice(send_buf,from,ld-depth+d,dimension,plane*buffer_size); plane++;
+      t_gather+= usecond() - t;
+
+      t=usecond();
+      grid->SendToRecvFromBegin(bwd_req,
+				(void *)&send_buf[(d+depth)*buffer_size], recv_from_rank,
+				(void *)&recv_buf[(d+depth)*buffer_size], xmit_to_rank, bytes,tag);
+      t_comms+=usecond()-t;
+    }
+
+    ////////////////////////////////////////////////////////////////////////////
+    // Copy interior -- overlap this with comms
+    ////////////////////////////////////////////////////////////////////////////
+    int Nd = new_grid->Nd();
+    Coordinate LL(Nd,0); // source region starts at the origin
+    Coordinate sz = grid->_ldimensions; // the whole local source volume is copied
+    Coordinate toLL(Nd,0);
+    toLL[dimension]=depth; // destination is offset by depth to leave room for the low halo
+    t=usecond();
+    localCopyRegion(from,to,LL,toLL,sz);
+    t_copy= usecond() - t;
+    
+    ////////////////////////////////////////////////////////////////////////////
+    // Scatter all faces
+    ////////////////////////////////////////////////////////////////////////////
+    //    DumpSliceNorm(std::string("Face_exchange to before scatter"),to,dimension);
+
+    plane=0; // restart the plane counter, now indexing recv_buf
+
+    t=usecond();
+    grid->CommsComplete(fwd_req);
+    t_comms+= usecond() - t;
+
+    t=usecond();
+    for ( int d=0;d < depth ; d ++ ) {
+      ScatterSlice(recv_buf,to,nld-depth+d,dimension,plane*buffer_size); plane++; // first depth planes fill the top halo
+    }
+    t_scatter= usecond() - t;
+
+    t=usecond();
+    grid->CommsComplete(bwd_req);
+    t_comms+= usecond() - t;
+    
+    t=usecond();
+    for ( int d=0;d < depth ; d ++ ) {
+      ScatterSlice(recv_buf,to,d,dimension,plane*buffer_size); plane++; // remaining planes fill the bottom halo
+    }
+    t_scatter+= usecond() - t;
+    //    DumpSliceNorm(std::string("Face_exchange to scatter 1st "),to,dimension);
+
+    //DumpSliceNorm(std::string("Face_exchange to done"),to,dimension);
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << t_gather/1000  << "ms"<<std::endl;
+    //    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: gather :" << 2.0*bytes/t_gather << "MB/s"<<std::endl;
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << t_scatter/1000   << "ms"<<std::endl;
+    //    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: scatter:" << 2.0*bytes/t_scatter<< "MB/s"<<std::endl;
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: copy   :" << t_copy/1000      << "ms"<<std::endl;
+    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms  :" << t_comms/1000     << "ms"<<std::endl;
+    //    std::cout << GridLogPerformance << "PaddedCell::Expand new timings: comms  :" << (RealD)4.0*bytes/t_comms   << "MB/s"<<std::endl;
+  }
+  
 };
 

 NAMESPACE_END(Grid);

+
--- a/Grid/parallelIO/BinaryIO.h
+++ b/Grid/parallelIO/BinaryIO.h
@ -165,7 +165,7 @@ class BinaryIO {
 	 * FIXME -- 128^3 x 256 x 16 will overflow.
 	 */
 	
-	int global_site;
+	int64_t global_site;

 	Lexicographic::CoorFromIndex(coor,local_site,local_vol);

@ -175,8 +175,8 @@ class BinaryIO {

 	Lexicographic::IndexFromCoor(coor,global_site,global_vol);

-	uint32_t gsite29   = global_site%29;
-	uint32_t gsite31   = global_site%31;
+	uint64_t gsite29   = global_site%29;
+	uint64_t gsite31   = global_site%31;
 	
 	site_crc = crc32(0,(unsigned char *)site_buf,sizeof(fobj));
 	//	std::cout << "Site "<<local_site << " crc "<<std::hex<<site_crc<<std::dec<<std::endl;
@ -545,7 +545,9 @@ class BinaryIO {
 				       const std::string &format,
 				       uint32_t &nersc_csum,
 				       uint32_t &scidac_csuma,
-				       uint32_t &scidac_csumb)
+				       uint32_t &scidac_csumb,
+				       int control=BINARYIO_LEXICOGRAPHIC
+				       )
  {
    typedef typename vobj::scalar_object sobj;
    typedef typename vobj::Realified::scalar_type word;    word w=0;
@ -556,7 +558,7 @@ class BinaryIO {
    std::vector<sobj> scalardata(lsites); 
    std::vector<fobj>     iodata(lsites); // Munge, checksum, byte order in here
    
-    IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
+    IOobject(w,grid,iodata,file,offset,format,BINARYIO_READ|control,
 	     nersc_csum,scidac_csuma,scidac_csumb);

    GridStopWatch timer; 
@ -582,7 +584,8 @@ class BinaryIO {
 					  const std::string &format,
 					  uint32_t &nersc_csum,
 					  uint32_t &scidac_csuma,
-					  uint32_t &scidac_csumb)
+					  uint32_t &scidac_csumb,
+					  int control=BINARYIO_LEXICOGRAPHIC)
  {
    typedef typename vobj::scalar_object sobj;
    typedef typename vobj::Realified::scalar_type word;    word w=0;
@ -607,7 +610,7 @@ class BinaryIO {
    while (attemptsLeft >= 0)
    {
      grid->Barrier();
-      IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|BINARYIO_LEXICOGRAPHIC,
+      IOobject(w,grid,iodata,file,offset,format,BINARYIO_WRITE|control,
 	             nersc_csum,scidac_csuma,scidac_csumb);
      if (checkWrite)
      {
@ -617,7 +620,7 @@ class BinaryIO {

        std::cout << GridLogMessage << "writeLatticeObject: read back object" << std::endl;
        grid->Barrier();
-        IOobject(w,grid,ckiodata,file,ckoffset,format,BINARYIO_READ|BINARYIO_LEXICOGRAPHIC,
+        IOobject(w,grid,ckiodata,file,ckoffset,format,BINARYIO_READ|control,
 	               cknersc_csum,ckscidac_csuma,ckscidac_csumb);
        if ((cknersc_csum != nersc_csum) or (ckscidac_csuma != scidac_csuma) or (ckscidac_csumb != scidac_csumb))
        {
--- a/Grid/parallelIO/IldgIO.h
+++ b/Grid/parallelIO/IldgIO.h
@ -206,7 +206,7 @@ class GridLimeReader : public BinaryIO {
  // Read a generic lattice field and verify checksum
  ////////////////////////////////////////////
  template<class vobj>
-  void readLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
+  void readLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name,int control=BINARYIO_LEXICOGRAPHIC)
  {
    typedef typename vobj::scalar_object sobj;
    scidacChecksum scidacChecksum_;
@ -238,7 +238,7 @@ class GridLimeReader : public BinaryIO {
 	uint64_t offset= ftello(File);
 	//	std::cout << " ReadLatticeObject from offset "<<offset << std::endl;
 	BinarySimpleMunger<sobj,sobj> munge;
-	BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
+	BinaryIO::readLatticeObject< vobj, sobj >(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb,control);
 	std::cout << GridLogMessage << "SciDAC checksum A " << std::hex << scidac_csuma << std::dec << std::endl;
 	std::cout << GridLogMessage << "SciDAC checksum B " << std::hex << scidac_csumb << std::dec << std::endl;
 	/////////////////////////////////////////////
@ -408,7 +408,7 @@ class GridLimeWriter : public BinaryIO
  // in communicator used by the field.Grid()
  ////////////////////////////////////////////////////
  template<class vobj>
-  void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
+  void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name,int control=BINARYIO_LEXICOGRAPHIC)
  {
    ////////////////////////////////////////////////////////////////////
    // NB: FILE and iostream are jointly writing disjoint sequences in the
@ -459,7 +459,7 @@ class GridLimeWriter : public BinaryIO
    ///////////////////////////////////////////
    std::string format = getFormatString<vobj>();
    BinarySimpleMunger<sobj,sobj> munge;
-    BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb);
+    BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb,control);

    ///////////////////////////////////////////
    // Wind forward and close the record
@ -512,7 +512,8 @@ class ScidacWriter : public GridLimeWriter {
  ////////////////////////////////////////////////
  template <class vobj, class userRecord>
  void writeScidacFieldRecord(Lattice<vobj> &field,userRecord _userRecord,
-                              const unsigned int recordScientificPrec = 0) 
+                              const unsigned int recordScientificPrec = 0,
+			      int control=BINARYIO_LEXICOGRAPHIC)
  {
    GridBase * grid = field.Grid();

@ -534,7 +535,7 @@ class ScidacWriter : public GridLimeWriter {
      writeLimeObject(0,0,_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
    }
    // Collective call
-    writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA));      // Closes message with checksum
+    writeLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA),control);      // Closes message with checksum
  }
 };

@ -553,7 +554,8 @@ class ScidacReader : public GridLimeReader {
  // Read generic lattice field in scidac format
  ////////////////////////////////////////////////
  template <class vobj, class userRecord>
-  void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord) 
+  void readScidacFieldRecord(Lattice<vobj> &field,userRecord &_userRecord,
+			     int control=BINARYIO_LEXICOGRAPHIC) 
  {
    typedef typename vobj::scalar_object sobj;
    GridBase * grid = field.Grid();
@ -571,7 +573,7 @@ class ScidacReader : public GridLimeReader {
    readLimeObject(header ,std::string("FieldMetaData"),std::string(GRID_FORMAT)); // Open message 
    readLimeObject(_userRecord,_userRecord.SerialisableClassName(),std::string(SCIDAC_RECORD_XML));
    readLimeObject(_scidacRecord,_scidacRecord.SerialisableClassName(),std::string(SCIDAC_PRIVATE_RECORD_XML));
-    readLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA));
+    readLimeLatticeBinaryObject(field,std::string(ILDG_BINARY_DATA),control);
  }
  void skipPastBinaryRecord(void) {
    std::string rec_name(ILDG_BINARY_DATA);
--- a/Grid/util/Init.cc
+++ b/Grid/util/Init.cc
@ -283,6 +283,7 @@ void GridBanner(void)
    std::cout << "Build " << GRID_BUILD_STR(GRID_BUILD_REF) << std::endl;
 #endif
    std::cout << std::endl;
+    std::cout << std::setprecision(9);
 }

 void Grid_init(int *argc,char ***argv)
--- a/tests/debug/Test_general_coarse_hdcg.cc
+++ b/tests/debug/Test_general_coarse_hdcg.cc
@ -144,10 +144,10 @@ int main (int argc, char ** argv)
 {
  Grid_init(&argc,&argv);

-  const int Ls=16;
+  const int Ls=24;
  const int nbasis = 40;
  const int cb = 0 ;
-  RealD mass=0.01;
+  RealD mass=0.00078;
  RealD M5=1.8;
  RealD b=1.5;
  RealD c=0.5;
@ -219,10 +219,10 @@ int main (int argc, char ** argv)
  ////////////////////////////////////////////////////////////
  LittleDiracOperator LittleDiracOp(geom,FrbGrid,Coarse5d);

-  bool load=true;
+  bool load=false;
  if ( load ) {
-    LoadBasis(Aggregates,"Subspace.scidac");
-    LoadOperator(LittleDiracOp,"LittleDiracOp.scidac");
+    LoadBasis(Aggregates,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.scidac");
+    LoadOperator(LittleDiracOp,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.scidac");
  } else {
    Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,
 				       95.0,0.1,
@ -235,8 +235,8 @@ int main (int argc, char ** argv)
 				       100,
 				       0.0);
    LittleDiracOp.CoarsenOperator(FineHermOp,Aggregates);
-    SaveBasis(Aggregates,"Subspace.scidac");
-    SaveOperator(LittleDiracOp,"LittleDiracOp.scidac");
+    SaveBasis(Aggregates,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.scidac");
+    SaveOperator(LittleDiracOp,"/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.scidac");
  }
  
  // Try projecting to one hop only
@ -261,7 +261,10 @@ int main (int argc, char ** argv)
  int Nconv;
  std::vector<RealD>            eval(Nm);
  std::vector<CoarseVector>     evec(Nm,Coarse5d);
-  CoarseVector c_src(Coarse5d); c_src=1.0;
+  CoarseVector c_src(Coarse5d);
+  //c_src=1.0;
+  random(CRNG,c_src);
+
  CoarseVector c_res(Coarse5d); 
  CoarseVector c_ref(Coarse5d); 

--- a/tests/debug/Test_general_coarse_hdcg_phys.cc
+++ b/tests/debug/Test_general_coarse_hdcg_phys.cc
@ -44,7 +44,8 @@ void SaveOperator(Coarsened &Operator,std::string file)
  WR.open(file);
  for(int p=0;p<Operator._A.size();p++){
    auto tmp = Operator.Cell.Extract(Operator._A[p]);
-    WR.writeScidacFieldRecord(tmp,record);
+    WR.writeScidacFieldRecord(tmp,record,0,0);
+    //    WR.writeScidacFieldRecord(tmp,record,0,BINARYIO_LEXICOGRAPHIC);
  }
  WR.close();
 #endif
@ -59,7 +60,8 @@ void LoadOperator(Coarsened &Operator,std::string file)
  assert(Operator._A.size()==Operator.geom.npoint);
  for(int p=0;p<Operator.geom.npoint;p++){
    conformable(Operator._A[p].Grid(),Operator.CoarseGrid());
-    RD.readScidacFieldRecord(Operator._A[p],record);
+    //    RD.readScidacFieldRecord(Operator._A[p],record,BINARYIO_LEXICOGRAPHIC);
+    RD.readScidacFieldRecord(Operator._A[p],record,0);
  }    
  RD.close();
  Operator.ExchangeCoarseLinks();
@ -73,7 +75,8 @@ void SaveBasis(aggregation &Agg,std::string file)
  ScidacWriter WR(Agg.FineGrid->IsBoss());
  WR.open(file);
  for(int b=0;b<Agg.subspace.size();b++){
-    WR.writeScidacFieldRecord(Agg.subspace[b],record);
+    //WR.writeScidacFieldRecord(Agg.subspace[b],record,0,BINARYIO_LEXICOGRAPHIC);
+    WR.writeScidacFieldRecord(Agg.subspace[b],record,0,0);
  }
  WR.close();
 #endif
@ -86,7 +89,8 @@ void LoadBasis(aggregation &Agg, std::string file)
  ScidacReader RD ;
  RD.open(file);
  for(int b=0;b<Agg.subspace.size();b++){
-    RD.readScidacFieldRecord(Agg.subspace[b],record);
+    //    RD.readScidacFieldRecord(Agg.subspace[b],record,BINARYIO_LEXICOGRAPHIC);
+    RD.readScidacFieldRecord(Agg.subspace[b],record,0);
  }    
  RD.close();
 #endif
@ -182,7 +186,7 @@ int main (int argc, char ** argv)
  LatticeGaugeField Umu(UGrid);

  FieldMetaData header;
-  std::string file("ckpoint_lat.4000");
+  std::string file("ckpoint_lat.2250");
  NerscIO::readConfiguration(Umu,header,file);

  //////////////////////// Fermion action //////////////////////////////////
@ -219,24 +223,26 @@ int main (int argc, char ** argv)
  ////////////////////////////////////////////////////////////
  LittleDiracOperator LittleDiracOp(geom,FrbGrid,Coarse5d);

-  bool load=true;
-  if ( load ) {
-    LoadBasis(Aggregates,"Subspace.scidac");
-    LoadOperator(LittleDiracOp,"LittleDiracOp.scidac");
+  std::string subspace_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/Subspace.phys.nolex.scidac");
+  std::string ldop_file("/lustre/orion/phy157/proj-shared/phy157_dwf/paboyle/LittleDiracOp.phys.nolex.scidac");
+  bool load_agg=true;
+  bool load_mat=true;
+  if ( load_agg ) {
+    LoadBasis(Aggregates,subspace_file);
  } else {
    Aggregates.CreateSubspaceChebyshev(RNG5,HermOpEO,nbasis,
-				       95.0,0.1,
-				       //				     400,200,200 -- 48 iters
-				       //				     600,200,200 -- 38 iters, 162s
-				       //				     600,200,100 -- 38 iters, 169s
-				       //				     600,200,50  -- 88 iters. 370s 
-				       800,
+				       95.0,0.05,
+				       1000,
+				       200,
 				       200,
-				       100,
 				       0.0);
+    SaveBasis(Aggregates,subspace_file);
+  }
+  if ( load_mat ) {
+    LoadOperator(LittleDiracOp,ldop_file);
+  } else {
    LittleDiracOp.CoarsenOperator(FineHermOp,Aggregates);
-    SaveBasis(Aggregates,"Subspace.scidac");
-    SaveOperator(LittleDiracOp,"LittleDiracOp.scidac");
+    SaveOperator(LittleDiracOp,ldop_file);
  }
  
  // Try projecting to one hop only
@ -250,13 +256,14 @@ int main (int argc, char ** argv)
  //////////////////////////////////////////
  // Build a coarse lanczos
  //////////////////////////////////////////
-  Chebyshev<CoarseVector>      IRLCheby(0.2,40.0,71);  // 1 iter
+  //  Chebyshev<CoarseVector>      IRLCheby(0.01,44.0,201);  // 1 iter
+  Chebyshev<CoarseVector>      IRLCheby(0.005,44.0,401);  // 1 iter
  FunctionHermOp<CoarseVector> IRLOpCheby(IRLCheby,CoarseOp);
  PlainHermOp<CoarseVector>    IRLOp    (CoarseOp);
-  int Nk=48;
-  int Nm=64;
+  int Nk=160;
+  int Nm=240;
  int Nstop=Nk;
-  ImplicitlyRestartedLanczos<CoarseVector> IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-5,20);
+  ImplicitlyRestartedLanczos<CoarseVector> IRL(IRLOpCheby,IRLOp,Nstop,Nk,Nm,1.0e-3,20);

  int Nconv;
  std::vector<RealD>            eval(Nm);
@ -281,20 +288,20 @@ int main (int argc, char ** argv)
  //  HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,CoarseZeroGuesser);
  HPDSolver<CoarseVector> HPDSolve(CoarseOp,CG,DeflCoarseGuesser);
  c_res=Zero();
-  HPDSolve(c_src,c_res); c_ref = c_res;
-  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
-  std::cout << GridLogMessage<<"ref norm "<<norm2(c_ref)<<std::endl;
+  //  HPDSolve(c_src,c_res); c_ref = c_res;
+  //  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
+  //  std::cout << GridLogMessage<<"ref norm "<<norm2(c_ref)<<std::endl;
  //////////////////////////////////////////////////////////////////////////
  // Deflated (with real op EV's) solve for the projected coarse op
  // Work towards ADEF1 in the coarse space
  //////////////////////////////////////////////////////////////////////////
  HPDSolver<CoarseVector> HPDSolveProj(CoarseOpProj,CG,DeflCoarseGuesser);
  c_res=Zero();
-  HPDSolveProj(c_src,c_res);
-  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
-  std::cout << GridLogMessage<<"res norm "<<norm2(c_res)<<std::endl;
-  c_res = c_res - c_ref;
-  std::cout << "Projected solver error "<<norm2(c_res)<<std::endl;
+  //  HPDSolveProj(c_src,c_res);
+  //  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
+  //  std::cout << GridLogMessage<<"res norm "<<norm2(c_res)<<std::endl;
+  //  c_res = c_res - c_ref;
+  //  std::cout << "Projected solver error "<<norm2(c_res)<<std::endl;

  //////////////////////////////////////////////////////////////////////
  // Coarse ADEF1 with deflation space
@ -331,22 +338,22 @@ int main (int argc, char ** argv)
 	   CoarseSmoother,
 	   evec,eval);

-  c_res=Zero();
-  cADEF1(c_src,c_res);
-  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
-  std::cout << GridLogMessage<<"cADEF1 res norm "<<norm2(c_res)<<std::endl;
-  c_res = c_res - c_ref;
-  std::cout << "cADEF1 solver error "<<norm2(c_res)<<std::endl;
+  //  c_res=Zero();
+  //  cADEF1(c_src,c_res);
+  //  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
+  //  std::cout << GridLogMessage<<"cADEF1 res norm "<<norm2(c_res)<<std::endl;
+  //  c_res = c_res - c_ref;
+  //  std::cout << "cADEF1 solver error "<<norm2(c_res)<<std::endl;
  
  //  cADEF1.Tolerance = 4.0e-2;
  //  cADEF1.Tolerance = 1.0e-1;
-  cADEF1.Tolerance = 5.0e-2;
-  c_res=Zero();
-  cADEF1(c_src,c_res);
-  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
-  std::cout << GridLogMessage<<"cADEF1 res norm "<<norm2(c_res)<<std::endl;
-  c_res = c_res - c_ref;
-  std::cout << "cADEF1 solver error "<<norm2(c_res)<<std::endl;
+  //  cADEF1.Tolerance = 5.0e-2;
+  //  c_res=Zero();
+  //  cADEF1(c_src,c_res);
+  //  std::cout << GridLogMessage<<"src norm "<<norm2(c_src)<<std::endl;
+  //  std::cout << GridLogMessage<<"cADEF1 res norm "<<norm2(c_res)<<std::endl;
+  //  c_res = c_res - c_ref;
+  //  std::cout << "cADEF1 solver error "<<norm2(c_res)<<std::endl;
  
  //////////////////////////////////////////
  // Build a smoother
@ -379,7 +386,7 @@ int main (int argc, char ** argv)

    for(int o=0;o<ords.size();o++){

-      ConjugateGradient<CoarseVector>  CGsloppy(4.0e-2,maxit,false);
+      ConjugateGradient<CoarseVector>  CGsloppy(5.0e-2,maxit,false);
      HPDSolver<CoarseVector> HPDSolveSloppy(CoarseOp,CGsloppy,DeflCoarseGuesser);
      
      //    ChebyshevSmoother<LatticeFermionD,HermFineMatrix > Smoother(lo,92,10,FineHermOp); // 36 best case
@ -404,13 +411,12 @@ int main (int argc, char ** argv)
 		 HPDSolve,
 		 Aggregates);
      
-      result=Zero();
-      HDCGdefl(src,result);
+      //      result=Zero();
+      //      HDCGdefl(src,result);

      result=Zero();
      HDCG(src,result);
      
-      
    }
  }
Author	SHA1	Message	Date
Peter Boyle	351795ac3a	Better messaging	2023-10-20 19:33:04 -04:00
Peter Boyle	9c9c42d0df	Tests on frontier with real speed up . 3.5x on 16^3 at mq=0.01	2023-10-20 19:27:13 -04:00
Peter Boyle	b6ad1bafc7	Normal memory SendToRecvFrom asynchronous for use in general stencil code	2023-10-20 19:27:13 -04:00
Peter Boyle	a5ca40f446	Better verbose -- track CPU GPU motion under --log Memory, others go to debug output stream	2023-10-20 19:27:13 -04:00
Peter Boyle	9ab54c5565	Overlap comms & data copy/buffer assembly in Ghost zone exchange	2023-10-20 19:27:13 -04:00
Peter Boyle	4341d96bde	Massively sped up coarse grid mult, comms Save 3ms spend (60% of time !) on cudaMalloc !!	2023-10-20 19:27:13 -04:00
Peter Boyle	5fac47a26d	Faster halo exchange	2023-10-20 19:27:13 -04:00
Peter Boyle	e064f17346	Faster halo exchange	2023-10-20 19:27:13 -04:00
Peter Boyle	afe10ba2a2	More digits	2023-10-20 19:27:13 -04:00
Peter Boyle	7cc3435ba8	Improved General coarsened matrix	2023-10-20 19:27:13 -04:00
Peter Boyle	541772313c	Verbosity	2023-10-20 19:27:13 -04:00
Peter Boyle	3747494a09	Notify delete public	2023-10-20 19:27:13 -04:00
Peter Boyle	f2b98d0dcc	Const safety	2023-10-20 19:27:13 -04:00
Peter Boyle	80471bf762	Alternate implementation involving face operations	2023-10-20 19:27:13 -04:00
Peter Boyle	a06f63c110	Improved I/O and non-lexico option exposed to SciDAC format	2023-10-20 19:27:13 -04:00
Peter Boyle	0ae4478cd9	Checkpoint the subspace and ldop	2023-10-20 19:27:13 -04:00
Peter Boyle	ae4e705e09	Use random vec as easier for debug	2023-10-20 19:27:13 -04:00
Peter Boyle	f5dcea9dbf	Updates for Frontier	2023-10-20 19:27:12 -04:00