diff --git a/lib/communicator/SharedMemoryMPI.cc b/lib/communicator/SharedMemoryMPI.cc
index 45edbb07..1fa84dfb 100644
--- a/lib/communicator/SharedMemoryMPI.cc
+++ b/lib/communicator/SharedMemoryMPI.cc
@@ -226,6 +226,48 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
 };
 #endif // MMAP
 
+#ifdef GRID_MPI3_SHM_NONE
+void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
+{
+  std::cout << "SharedMemoryAllocate "<< bytes<< " MMAP anonymous implementation "<<std::endl;
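The remainder of the GRID_MPI3_SHM_NONE allocator is not reproduced above. As a rough standalone sketch of the idea (nothing is actually shared between ranks, so each rank simply maps its own anonymous pages), the following is assumed code, not the committed Grid implementation; the names SharedMemoryAllocateNone, WorldShmSize, WorldShmCommBufs and ShmAllocBytes are placeholders, and the real member function also receives the flags argument shown in the hunk above, which the sketch ignores.

// Hedged sketch only: per-rank anonymous mmap, no inter-rank sharing.
// All names below are placeholders, not Grid's actual members.
#include <sys/mman.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

static int                 WorldShmSize = 1;     // ranks on this node (placeholder)
static std::vector<void *> WorldShmCommBufs;     // one buffer per rank (placeholder)
static uint64_t            ShmAllocBytes = 0;

void SharedMemoryAllocateNone(uint64_t bytes)
{
  std::cout << "SharedMemoryAllocate " << bytes
            << " MMAP anonymous implementation" << std::endl;

  WorldShmCommBufs.resize(WorldShmSize);
  for (int r = 0; r < WorldShmSize; r++) {
    // Anonymous, private pages: visible to this rank only, zero-initialised.
    void *ptr = mmap(NULL, bytes, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (ptr == MAP_FAILED) { perror("mmap"); exit(EXIT_FAILURE); }
    WorldShmCommBufs[r] = ptr;
  }
  ShmAllocBytes = bytes;
}

int main()
{
  SharedMemoryAllocateNone(1024ULL * 1024ULL);   // exercise the sketch with 1 MiB
  std::cout << "allocated " << ShmAllocBytes << " bytes per rank" << std::endl;
  return 0;
}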
diff --git a/lib/parallelIO/BinaryIO.h b/lib/parallelIO/BinaryIO.h
--- a/lib/parallelIO/BinaryIO.h
+++ b/lib/parallelIO/BinaryIO.h
-      int lsites = grid->lSites();
+      uint64_t lsites = grid->lSites();
       std::vector<sobj> scalardata(lsites);
       unvectorizeToLexOrdArray(scalardata,lat);
@@ -160,7 +160,9 @@ class BinaryIO {
       /*
        * Scidac csum is rather more heavyweight
+       * FIXME -- 128^3 x 256 x 16 will overflow.
        */
+      int global_site;
 
       Lexicographic::CoorFromIndex(coor,local_site,local_vol);
@@ -261,7 +263,7 @@ class BinaryIO {
                           GridBase *grid,
                           std::vector<fobj> &iodata,
                           std::string file,
-                          Integer offset,
+                          uint64_t offset,
                           const std::string &format, int control,
                           uint32_t &nersc_csum,
                           uint32_t &scidac_csuma,
@@ -523,7 +525,7 @@ class BinaryIO {
   static inline void readLatticeObject(Lattice<vobj> &Umu,
                                        std::string file,
                                        munger munge,
-                                       Integer offset,
+                                       uint64_t offset,
                                        const std::string &format,
                                        uint32_t &nersc_csum,
                                        uint32_t &scidac_csuma,
@@ -533,7 +535,7 @@ class BinaryIO {
     typedef typename vobj::Realified::scalar_type word;    word w=0;
     GridBase *grid = Umu._grid;
-    int lsites = grid->lSites();
+    uint64_t lsites = grid->lSites();
 
     std::vector<sobj> scalardata(lsites);
     std::vector<fobj> iodata(lsites); // Munge, checksum, byte order in here
@@ -544,7 +546,7 @@ class BinaryIO {
     GridStopWatch timer;
     timer.Start();
 
-    parallel_for(int x=0;x<lsites;x++) munge(iodata[x], scalardata[x]);
+    parallel_for(uint64_t x=0;x<lsites;x++) munge(iodata[x], scalardata[x]);
 
     vectorizeFromLexOrdArray(scalardata,Umu);
     grid->Barrier();
@@ -560,7 +562,7 @@ class BinaryIO {
   static inline void writeLatticeObject(Lattice<vobj> &Umu,
                                         std::string file,
                                         munger munge,
-                                        Integer offset,
+                                        uint64_t offset,
                                         const std::string &format,
                                         uint32_t &nersc_csum,
                                         uint32_t &scidac_csuma,
@@ -569,7 +571,7 @@ class BinaryIO {
     typedef typename vobj::scalar_object sobj;
     typedef typename vobj::Realified::scalar_type word;    word w=0;
     GridBase *grid = Umu._grid;
-    int lsites = grid->lSites();
+    uint64_t lsites = grid->lSites();
 
     std::vector<sobj> scalardata(lsites);
     std::vector<fobj> iodata(lsites); // Munge, checksum, byte order in here
@@ -580,7 +582,7 @@ class BinaryIO {
     GridStopWatch timer; timer.Start();
     unvectorizeToLexOrdArray(scalardata,Umu);
-    parallel_for(int x=0;x<lsites;x++) munge(scalardata[x],iodata[x]);
+    parallel_for(uint64_t x=0;x<lsites;x++) munge(scalardata[x],iodata[x]);
     grid->Barrier();
     timer.Stop();
@@ -597,7 +599,7 @@ class BinaryIO {
   static inline void readRNG(GridSerialRNG &serial,
                              GridParallelRNG &parallel,
                              std::string file,
-                             Integer offset,
+                             uint64_t offset,
                              uint32_t &nersc_csum,
                              uint32_t &scidac_csuma,
                              uint32_t &scidac_csumb)
@@ -610,8 +612,8 @@ class BinaryIO {
     std::string format = "IEEE32BIG";
 
     GridBase *grid = parallel._grid;
-    int gsites = grid->gSites();
-    int lsites = grid->lSites();
+    uint64_t gsites = grid->gSites();
+    uint64_t lsites = grid->lSites();
 
     uint32_t nersc_csum_tmp   = 0;
     uint32_t scidac_csuma_tmp = 0;
@@ -626,7 +628,7 @@ class BinaryIO {
               nersc_csum,scidac_csuma,scidac_csumb);
 
     timer.Start();
-    parallel_for(int lidx=0;lidx<lsites;lidx++){
+    parallel_for(uint64_t lidx=0;lidx<lsites;lidx++){
       std::vector<RngStateType> tmp(RngStateCount);
       std::copy(iodata[lidx].begin(),iodata[lidx].end(),tmp.begin());
       parallel.SetState(tmp,lidx);
@@ -659,7 +661,7 @@ class BinaryIO {
   static inline void writeRNG(GridSerialRNG &serial,
                               GridParallelRNG &parallel,
                               std::string file,
-                              Integer offset,
+                              uint64_t offset,
                               uint32_t &nersc_csum,
                               uint32_t &scidac_csuma,
                               uint32_t &scidac_csumb)
@@ -670,8 +672,8 @@ class BinaryIO {
     typedef std::array<RngStateType,RngStateCount> RNGstate;
 
     GridBase *grid = parallel._grid;
-    int gsites = grid->gSites();
-    int lsites = grid->lSites();
+    uint64_t gsites = grid->gSites();
+    uint64_t lsites = grid->lSites();
 
     uint32_t nersc_csum_tmp;
     uint32_t scidac_csuma_tmp;
@@ -684,7 +686,7 @@ class BinaryIO {
     timer.Start();
     std::vector<RNGstate> iodata(lsites);
-    parallel_for(int lidx=0;lidx<lsites;lidx++){
+    parallel_for(uint64_t lidx=0;lidx<lsites;lidx++){
      std::vector<RngStateType> tmp(RngStateCount);
      parallel.GetState(tmp,lidx);
      std::copy(tmp.begin(),tmp.end(),iodata[lidx].begin());
diff --git a/lib/parallelIO/IldgIO.h b/lib/parallelIO/IldgIO.h
index b0bd7e2c..b81d1e43 100644
--- a/lib/parallelIO/IldgIO.h
+++ b/lib/parallelIO/IldgIO.h
@@ -337,6 +337,20 @@ class GridLimeWriter : public BinaryIO {
   template<class vobj>
   void writeLimeLatticeBinaryObject(Lattice<vobj> &field,std::string record_name)
   {
+    ////////////////////////////////////////////////////////////////////
+    // NB: FILE and iostream are jointly writing disjoint sequences in the
+    // the same file through different file handles (integer units).
+    //
+    // These are both buffered, so why I think this code is right is as follows.
+    //
+    // i)  write record header to FILE *File, telegraphing the size; flush
+    // ii) ftello reads the offset from FILE *File .
+    // iii) iostream / MPI Open independently seek this offset. Write sequence direct to disk.
+    //      Closes iostream and flushes.
+    // iv) fseek on FILE * to end of this disjoint section.
+    //  v) Continue writing scidac record.
+    ////////////////////////////////////////////////////////////////////
+
     ////////////////////////////////////////////
     // Create record header
     ////////////////////////////////////////////
@@ -350,25 +364,24 @@ class GridLimeWriter : public BinaryIO {
     //    std::cout << "W Gsites " <<field._grid->_gsites<<std::endl;
+    uint64_t offset1 = ftello(File);
     std::string format = getFormatString<vobj>();
     BinarySimpleMunger<sobj,sobj> munge;
-    BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset, format,nersc_csum,scidac_csuma,scidac_csumb);
-    // fseek(File,0,SEEK_END); offset = ftello(File);std::cout << " offset now "<<offset<<std::endl;
+    BinaryIO::writeLatticeObject<vobj,sobj>(field, filename, munge, offset1, format,nersc_csum,scidac_csuma,scidac_csumb);
+
+    ///////////////////////////////////////////
+    // Wind forward and close the record
+    ///////////////////////////////////////////
+    fseek(File,0,SEEK_END);
+    uint64_t offset2 = ftello(File);     // std::cout << " now at offset "<<offset2<<std::endl;
 
     err=limeWriterCloseRecord(LimeW);  assert(err>=0);
 
     ////////////////////////////////////////
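The comment block added to writeLimeLatticeBinaryObject above describes a two-handle write pattern: the buffered FILE* owns the LIME record framing, while BinaryIO::writeLatticeObject (iostream / MPI-IO) drops the payload into the same file at a known offset. Below is a minimal standalone illustration of steps (i) through (v), assuming POSIX ftello/fseek and using std::fstream in place of Grid's parallel writer; the file name and header format are made up for the example and are not part of the patch.

#include <cstdio>
#include <cstdint>
#include <cassert>
#include <fstream>
#include <string>
#include <vector>

// Toy two-handle write: a buffered FILE* frames the record, a second
// independent handle writes the payload bytes at a known offset.
int main()
{
  const std::string filename = "toy_record.bin";
  const std::vector<char> payload(1024, 'x');          // stand-in for the lattice data

  FILE *File = std::fopen(filename.c_str(), "w+");
  assert(File != nullptr);

  // (i) write a header through the FILE* handle, telegraphing the payload size; flush
  std::fprintf(File, "record-header payload=%zu\n", payload.size());
  std::fflush(File);

  // (ii) read the current offset from the same FILE*
  uint64_t offset1 = ftello(File);

  // (iii) an independent handle seeks to that offset and writes the payload directly
  {
    std::fstream direct(filename, std::ios::in | std::ios::out | std::ios::binary);
    direct.seekp(offset1);
    direct.write(payload.data(), payload.size());
  } // closed and flushed here

  // (iv) wind the FILE* forward past the bytes written behind its back
  fseek(File, 0, SEEK_END);
  uint64_t offset2 = ftello(File);
  assert(offset2 - offset1 == payload.size());

  // (v) carry on writing the trailer / next record through the FILE* as normal
  std::fprintf(File, "record-trailer\n");
  std::fclose(File);
  return 0;
}

The size check mirrors the idea behind the wind-forward step: if the independent writer put its bytes anywhere other than the telegraphed disjoint section, comparing the before and after offsets exposes it before the next record is written.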
diff --git a/lib/parallelIO/NerscIO.h b/lib/parallelIO/NerscIO.h
index 786839f2..e2c2bc39 100644
--- a/lib/parallelIO/NerscIO.h
+++ b/lib/parallelIO/NerscIO.h
@@ -57,7 +57,7 @@ namespace Grid {
 // for the header-reader
 static inline int readHeader(std::string file,GridBase *grid,  FieldMetaData &field)
 {
-  int offset=0;
+  uint64_t offset=0;
   std::map<std::string,std::string> header;
   std::string line;
@@ -139,7 +139,7 @@ namespace Grid {
   typedef Lattice<iLorentzColourMatrix<vsimd> > GaugeField;
 
   GridBase *grid = Umu._grid;
-  int offset = readHeader(file,Umu._grid,header);
+  uint64_t offset = readHeader(file,Umu._grid,header);
 
   FieldMetaData clone(header);
@@ -236,7 +236,7 @@ namespace Grid {
   GaugeStatistics(Umu,header);
   MachineCharacteristics(header);
 
-  int offset;
+  uint64_t offset;
 
   truncate(file);
@@ -278,7 +278,7 @@ namespace Grid {
   header.plaquette=0.0;
   MachineCharacteristics(header);
 
-  int offset;
+  uint64_t offset;
 
 #ifdef RNG_RANLUX
   header.floating_point = std::string("UINT64");
@@ -313,7 +313,7 @@ namespace Grid {
   GridBase *grid = parallel._grid;
 
-  int offset = readHeader(file,grid,header);
+  uint64_t offset = readHeader(file,grid,header);
 
   FieldMetaData clone(header);
diff --git a/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc b/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc
index 3dff4b90..b55b66d9 100644
--- a/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc
+++ b/tests/lanczos/Test_dwf_compressed_lanczos_reorg.cc
@@ -180,7 +180,6 @@ int main (int argc, char ** argv) {
   GridCartesian         * CoarseGrid4   = SpaceTimeGrid::makeFourDimGrid(coarseLatt, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
   GridRedBlackCartesian * CoarseGrid4rb = SpaceTimeGrid::makeFourDimRedBlackGrid(CoarseGrid4);
   GridCartesian         * CoarseGrid5   = SpaceTimeGrid::makeFiveDimGrid(cLs,CoarseGrid4);
-  GridRedBlackCartesian * CoarseGrid5rb = SpaceTimeGrid::makeFourDimRedBlackGrid(CoarseGrid5);
 
   // Gauge field
   LatticeGaugeField Umu(UGrid);
@@ -206,7 +205,7 @@ int main (int argc, char ** argv) {
   const int nbasis= 60;
   assert(nbasis==Ns1);
-  LocalCoherenceLanczosScidac<vSpinColourVector,vTComplex,nbasis> _LocalCoherenceLanczos(FrbGrid,CoarseGrid5rb,HermOp,Odd);
+  LocalCoherenceLanczosScidac<vSpinColourVector,vTComplex,nbasis> _LocalCoherenceLanczos(FrbGrid,CoarseGrid5,HermOp,Odd);
   std::cout << GridLogMessage << "Constructed LocalCoherenceLanczos" << std::endl;
 
   assert( (Params.doFine)||(Params.doFineRead));
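Most of the hunks above replace int/Integer with uint64_t for site counts and file offsets; the FIXME added in BinaryIO.h spells out why: a 128^3 x 256 lattice with Ls=16 already has 2^33 sites, beyond the reach of a 32-bit signed index, and byte offsets into a gauge configuration overflow far sooner. The small standalone program below works through the arithmetic; the 576 bytes/site figure assumes a double-precision SU(3) gauge field (4 links of 3x3 complex doubles) and is an illustration, not a number taken from the patch.

#include <cstdint>
#include <climits>
#include <iostream>

int main()
{
  // Worked numbers behind the FIXME: a 128^3 x 256 lattice with Ls=16.
  const uint64_t vol4d = 128ULL*128*128*256;   // 2^29 four-dimensional sites
  const uint64_t sites = vol4d*16;             // 2^33 five-dimensional sites

  std::cout << "5d sites          : " << sites << "\n";
  std::cout << "fits in int32?    : "
            << (sites <= (uint64_t)INT32_MAX ? "yes" : "no") << "\n";

  // Byte offsets blow past 2^31 even in 4d: assuming a double-precision
  // gauge field of 4 links x 3x3 complex doubles = 576 bytes per site.
  const uint64_t gauge_bytes = vol4d*576;      // ~3.1e11 bytes
  std::cout << "gauge field bytes : " << gauge_bytes
            << "  (INT32_MAX = " << INT32_MAX << ")\n";
  return 0;
}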