Merge branch 'develop' into feature/json-fix

2025-11-03 13:34:33 +00:00 · 2017-09-08 13:42:20 +01:00
parent 7cb2b11f26 1184ed29ae
commit 13fa70ac1a
40 changed files with 2426 additions and 563 deletions
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -10,8 +10,8 @@ if BUILD_COMMS_MPI3
  extra_sources+=communicator/Communicator_base.cc
 endif

-if BUILD_COMMS_MPI3L
-  extra_sources+=communicator/Communicator_mpi3_leader.cc
+if BUILD_COMMS_MPIT
+  extra_sources+=communicator/Communicator_mpit.cc
  extra_sources+=communicator/Communicator_base.cc
 endif

--- a/lib/algorithms/iterative/BlockConjugateGradient.h
+++ b/lib/algorithms/iterative/BlockConjugateGradient.h
@@ -87,15 +87,22 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  sliceInnerProductMatrix(m_rr,R,R,Orthog);

-  ////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Cholesky from Eigen
-  // There exists a ldlt that is documented as more stable
-  ////////////////////////////////////////////////////////////////////////////////////////////////////
-  Eigen::MatrixXcd L    = m_rr.llt().matrixL(); 
+  // Force m_rr to be manifestly Hermitian, removing rounding-related asymmetry before the Cholesky factorisation
+  m_rr = 0.5*(m_rr+m_rr.adjoint());

+#if 0
+  std::cout << " Calling Cholesky  ldlt on m_rr "  << m_rr <<std::endl;
+  Eigen::MatrixXcd L_ldlt = m_rr.ldlt().matrixL(); 
+  std::cout << " Called Cholesky  ldlt on m_rr "  << L_ldlt <<std::endl;
+  auto  D_ldlt = m_rr.ldlt().vectorD(); 
+  std::cout << " Called Cholesky  ldlt on m_rr "  << D_ldlt <<std::endl;
+#endif
+
+  //  std::cout << " Calling Cholesky  llt on m_rr "  <<std::endl;
+  Eigen::MatrixXcd L    = m_rr.llt().matrixL(); 
+  //  std::cout << " Called Cholesky  llt on m_rr "  << L <<std::endl;
  C    = L.adjoint();
  Cinv = C.inverse();
-
  ////////////////////////////////////////////////////////////////////////////////////////////////////
  // Q = R C^{-1}
  //
@@ -103,7 +110,6 @@ void ThinQRfact (Eigen::MatrixXcd &m_rr,
  //
  // NB maddMatrix conventions are Right multiplication X[j] a[j,i] already
  ////////////////////////////////////////////////////////////////////////////////////////////////////
-  // FIXME:: make a sliceMulMatrix to avoid zero vector
  sliceMulMatrix(Q,Cinv,R,Orthog);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -199,7 +205,12 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)

  Linop.HermOp(X, AD);
  tmp = B - AD;  
+  //std::cout << GridLogMessage << " initial tmp " << norm2(tmp)<< std::endl;
  ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
+  //std::cout << GridLogMessage << " initial Q " << norm2(Q)<< std::endl;
+  //std::cout << GridLogMessage << " m_rr " << m_rr<<std::endl;
+  //std::cout << GridLogMessage << " m_C " << m_C<<std::endl;
+  //std::cout << GridLogMessage << " m_Cinv " << m_Cinv<<std::endl;
  D=Q;

  std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
@@ -221,13 +232,15 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
    MatrixTimer.Start();
    Linop.HermOp(D, Z);      
    MatrixTimer.Stop();
+    //std::cout << GridLogMessage << " norm2 Z " <<norm2(Z)<<std::endl;

    //4. M  = [D^dag Z]^{-1}
    sliceInnerTimer.Start();
    sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
    sliceInnerTimer.Stop();
    m_M       = m_DZ.inverse();
-
+    //std::cout << GridLogMessage << " m_DZ " <<m_DZ<<std::endl;
+    
    //5. X  = X + D MC
    m_tmp     = m_M * m_C;
    sliceMaddTimer.Start();
--- a/lib/allocator/AlignedAllocator.cc
+++ b/lib/allocator/AlignedAllocator.cc
@@ -1,7 +1,5 @@
-
-
-
 #include <Grid/GridCore.h>
+#include <fcntl.h>

 namespace Grid {

@@ -11,7 +9,7 @@ int PointerCache::victim;

 void *PointerCache::Insert(void *ptr,size_t bytes) {

-  if (bytes < 4096 ) return NULL;
+  if (bytes < 4096 ) return ptr;

 #ifdef GRID_OMP
  assert(omp_in_parallel()==0);
@@ -63,4 +61,37 @@ void *PointerCache::Lookup(size_t bytes) {
  return NULL;
 }

+
+// Diagnostic (Linux only): read /proc/self/pagemap for [Buf,Buf+BYTES) and print how many 4k pages are not frame-contiguous within their group of 512.
+void check_huge_pages(void *Buf,uint64_t BYTES)
+{
+#ifdef __linux__
+  int fd = open("/proc/self/pagemap", O_RDONLY);
+  assert(fd >= 0);
+  const int page_size = 4096;
+  uint64_t virt_pfn = (uint64_t)Buf / page_size;
+  off_t offset = sizeof(uint64_t) * virt_pfn; // pagemap is indexed by virtual page number, one 64-bit entry each
+  uint64_t npages = (BYTES + page_size-1) / page_size;
+  uint64_t pagedata[npages]; // NOTE(review): runtime-sized stack array; very large BYTES may exhaust the stack
+  uint64_t ret = lseek(fd, offset, SEEK_SET);
+  assert(ret == offset);
+  ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
+  assert(ret == sizeof(uint64_t) * npages);
+  close(fd); // all entries are in pagedata now; without this every call leaked a descriptor
+  int nhugepages = npages / 512; // only whole groups of 512 pages are examined; a trailing partial group is ignored
+  int n4ktotal = 0, nnothuge = 0;
+  for (int i = 0; i < nhugepages; ++i) {
+    uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size; // low 55 bits of the entry, scaled to a byte address
+    for (int j = 0; j < 512; ++j) {
+      uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
+      ++n4ktotal;
+      if (pageaddr != baseaddr + j * page_size) // page j does not sit j*4k after the first page of its group
+	++nnothuge;
+      }
+  }
+  int rank = CartesianCommunicator::RankWorld();
+  printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
+#endif
+}
+
 }
--- a/lib/allocator/AlignedAllocator.h
+++ b/lib/allocator/AlignedAllocator.h
@@ -64,6 +64,8 @@ namespace Grid {

  };

+  void check_huge_pages(void *Buf,uint64_t BYTES);
+
 ////////////////////////////////////////////////////////////////////
 // A lattice of something, but assume the something is SIMDized.
 ////////////////////////////////////////////////////////////////////
@@ -92,12 +94,20 @@ public:
    size_type bytes = __n*sizeof(_Tp);

    _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
-    
+    //    if ( ptr != NULL ) 
+    //      std::cout << "alignedAllocator "<<__n << " cache hit "<< std::hex << ptr <<std::dec <<std::endl;
+
+    //////////////////
+    // Hack: 2MB alignment; could be made an option, but probably doesn't need configurability
+    //////////////////
+//define GRID_ALLOC_ALIGN (128)
+#define GRID_ALLOC_ALIGN (2*1024*1024)
 #ifdef HAVE_MM_MALLOC_H
-    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,128);
+    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
 #else
-    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes);
+    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
 #endif
+    //    std::cout << "alignedAllocator " << std::hex << ptr <<std::dec <<std::endl;
    // First touch optimise in threaded loop
    uint8_t *cp = (uint8_t *)ptr;
 #ifdef GRID_OMP
@@ -111,6 +121,7 @@ public:

  void deallocate(pointer __p, size_type __n) { 
    size_type bytes = __n * sizeof(_Tp);
+
    pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);

 #ifdef HAVE_MM_MALLOC_H
@@ -189,16 +200,18 @@ public:
  pointer allocate(size_type __n, const void* _p= 0) 
  {
 #ifdef HAVE_MM_MALLOC_H
-    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
+    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),GRID_ALLOC_ALIGN);
 #else
-    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
+    _Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,__n*sizeof(_Tp));
 #endif
    size_type bytes = __n*sizeof(_Tp);
    uint8_t *cp = (uint8_t *)ptr;
+    if ( ptr ) { 
    // One touch per 4k page, static OMP loop to catch same loop order
 #pragma omp parallel for schedule(static)
-    for(size_type n=0;n<bytes;n+=4096){
-      cp[n]=0;
+      for(size_type n=0;n<bytes;n+=4096){
+	cp[n]=0;
+      }
    }
    return ptr;
  }
--- a/lib/cartesian/Cartesian_base.h
+++ b/lib/cartesian/Cartesian_base.h
@@ -185,17 +185,18 @@ public:
    ////////////////////////////////////////////////////////////////

    void show_decomposition(){
-      std::cout << GridLogMessage << "Full Dimensions    : " << _fdimensions << std::endl;
-      std::cout << GridLogMessage << "Global Dimensions  : " << _gdimensions << std::endl;
-      std::cout << GridLogMessage << "Local Dimensions   : " << _ldimensions << std::endl;
-      std::cout << GridLogMessage << "Reduced Dimensions : " << _rdimensions << std::endl;
-      std::cout << GridLogMessage << "Outer strides      : " << _ostride << std::endl;
-      std::cout << GridLogMessage << "Inner strides      : " << _istride << std::endl;
-      std::cout << GridLogMessage << "iSites             : " << _isites << std::endl;
-      std::cout << GridLogMessage << "oSites             : " << _osites << std::endl;
-      std::cout << GridLogMessage << "lSites             : " << lSites() << std::endl;        
-      std::cout << GridLogMessage << "gSites             : " << gSites() << std::endl;
-      std::cout << GridLogMessage << "Nd                 : " << _ndimension << std::endl;             
+      std::cout << GridLogMessage << "\tFull Dimensions    : " << _fdimensions << std::endl;
+      std::cout << GridLogMessage << "\tSIMD layout        : " << _simd_layout << std::endl;
+      std::cout << GridLogMessage << "\tGlobal Dimensions  : " << _gdimensions << std::endl;
+      std::cout << GridLogMessage << "\tLocal Dimensions   : " << _ldimensions << std::endl;
+      std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions << std::endl;
+      std::cout << GridLogMessage << "\tOuter strides      : " << _ostride << std::endl;
+      std::cout << GridLogMessage << "\tInner strides      : " << _istride << std::endl;
+      std::cout << GridLogMessage << "\tiSites             : " << _isites << std::endl;
+      std::cout << GridLogMessage << "\toSites             : " << _osites << std::endl;
+      std::cout << GridLogMessage << "\tlSites             : " << lSites() << std::endl;        
+      std::cout << GridLogMessage << "\tgSites             : " << gSites() << std::endl;
+      std::cout << GridLogMessage << "\tNd                 : " << _ndimension << std::endl;             
    } 

    ////////////////////////////////////////////////////////////////
--- a/lib/cartesian/Cartesian_full.h
+++ b/lib/cartesian/Cartesian_full.h
@@ -62,77 +62,81 @@ public:
      return shift;
    }
    GridCartesian(const std::vector<int> &dimensions,
-		  const std::vector<int> &simd_layout,
-		  const std::vector<int> &processor_grid
-		  ) : GridBase(processor_grid)
+                  const std::vector<int> &simd_layout,
+                  const std::vector<int> &processor_grid) : GridBase(processor_grid)
    {
-        ///////////////////////
-        // Grid information
-        ///////////////////////
-        _ndimension = dimensions.size();
-            
-        _fdimensions.resize(_ndimension);
-        _gdimensions.resize(_ndimension);
-        _ldimensions.resize(_ndimension);
-        _rdimensions.resize(_ndimension);
-        _simd_layout.resize(_ndimension);
-	_lstart.resize(_ndimension);
-	_lend.resize(_ndimension);
-            
-        _ostride.resize(_ndimension);
-        _istride.resize(_ndimension);
-            
-        _fsites = _gsites = _osites = _isites = 1;
+      ///////////////////////
+      // Grid information
+      ///////////////////////
+      _ndimension = dimensions.size();

-        for(int d=0;d<_ndimension;d++){
-	  _fdimensions[d] = dimensions[d]; // Global dimensions
-	  _gdimensions[d] = _fdimensions[d]; // Global dimensions
-	  _simd_layout[d] = simd_layout[d];
-	  _fsites = _fsites * _fdimensions[d];
-	  _gsites = _gsites * _gdimensions[d];
+      _fdimensions.resize(_ndimension);
+      _gdimensions.resize(_ndimension);
+      _ldimensions.resize(_ndimension);
+      _rdimensions.resize(_ndimension);
+      _simd_layout.resize(_ndimension);
+      _lstart.resize(_ndimension);
+      _lend.resize(_ndimension);

-	  //FIXME check for exact division
+      _ostride.resize(_ndimension);
+      _istride.resize(_ndimension);

-	  // Use a reduced simd grid
-	  _ldimensions[d]= _gdimensions[d]/_processors[d];  //local dimensions
-	  _rdimensions[d]= _ldimensions[d]/_simd_layout[d]; //overdecomposition
-	  _lstart[d]     = _processor_coor[d]*_ldimensions[d];
-	  _lend[d]       = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1;
-	  _osites  *= _rdimensions[d];
-	  _isites  *= _simd_layout[d];
-                
-	  // Addressing support
-	  if ( d==0 ) {
-	    _ostride[d] = 1;
-	    _istride[d] = 1;
-	  } else {
-	    _ostride[d] = _ostride[d-1]*_rdimensions[d-1];
-	    _istride[d] = _istride[d-1]*_simd_layout[d-1];
-	  }
+      _fsites = _gsites = _osites = _isites = 1;
+
+      for (int d = 0; d < _ndimension; d++)
+      {
+        _fdimensions[d] = dimensions[d];   // Global dimensions
+        _gdimensions[d] = _fdimensions[d]; // Global dimensions
+        _simd_layout[d] = simd_layout[d];
+        _fsites = _fsites * _fdimensions[d];
+        _gsites = _gsites * _gdimensions[d];
+
+        // Use a reduced simd grid
+        _ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions
+        assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
+
+        _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
+        assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
+
+        _lstart[d] = _processor_coor[d] * _ldimensions[d];
+        _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
+        _osites *= _rdimensions[d];
+        _isites *= _simd_layout[d];
+
+        // Addressing support
+        if (d == 0)
+        {
+          _ostride[d] = 1;
+          _istride[d] = 1;
        }
-        
-        ///////////////////////
-        // subplane information
-        ///////////////////////
-        _slice_block.resize(_ndimension);
-        _slice_stride.resize(_ndimension);
-        _slice_nblock.resize(_ndimension);
-            
-        int block =1;
-        int nblock=1;
-        for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];
-            
-        for(int d=0;d<_ndimension;d++){
-            nblock/=_rdimensions[d];
-            _slice_block[d] =block;
-            _slice_stride[d]=_ostride[d]*_rdimensions[d];
-            _slice_nblock[d]=nblock;
-            block = block*_rdimensions[d];
+        else
+        {
+          _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
+          _istride[d] = _istride[d - 1] * _simd_layout[d - 1];
        }
+      }

+      ///////////////////////
+      // subplane information
+      ///////////////////////
+      _slice_block.resize(_ndimension);
+      _slice_stride.resize(_ndimension);
+      _slice_nblock.resize(_ndimension);
+
+      int block = 1;
+      int nblock = 1;
+      for (int d = 0; d < _ndimension; d++)
+        nblock *= _rdimensions[d];
+
+      for (int d = 0; d < _ndimension; d++)
+      {
+        nblock /= _rdimensions[d];
+        _slice_block[d] = block;
+        _slice_stride[d] = _ostride[d] * _rdimensions[d];
+        _slice_nblock[d] = nblock;
+        block = block * _rdimensions[d];
+      }
    };
 };
-
-
 }
 #endif
--- a/lib/cartesian/Cartesian_red_black.h
+++ b/lib/cartesian/Cartesian_red_black.h
@@ -131,21 +131,21 @@ public:
      Init(dimensions,simd_layout,processor_grid,checker_dim_mask,0);
    }
    void Init(const std::vector<int> &dimensions,
-	      const std::vector<int> &simd_layout,
-	      const std::vector<int> &processor_grid,
-	      const std::vector<int> &checker_dim_mask,
-	      int checker_dim)
+              const std::vector<int> &simd_layout,
+              const std::vector<int> &processor_grid,
+              const std::vector<int> &checker_dim_mask,
+              int checker_dim)
    {
-    ///////////////////////
-    // Grid information
-    ///////////////////////
+      ///////////////////////
+      // Grid information
+      ///////////////////////
      _checker_dim = checker_dim;
-      assert(checker_dim_mask[checker_dim]==1);
+      assert(checker_dim_mask[checker_dim] == 1);
      _ndimension = dimensions.size();
-      assert(checker_dim_mask.size()==_ndimension);
-      assert(processor_grid.size()==_ndimension);
-      assert(simd_layout.size()==_ndimension);
-      
+      assert(checker_dim_mask.size() == _ndimension);
+      assert(processor_grid.size() == _ndimension);
+      assert(simd_layout.size() == _ndimension);
+
      _fdimensions.resize(_ndimension);
      _gdimensions.resize(_ndimension);
      _ldimensions.resize(_ndimension);
@@ -153,114 +153,133 @@ public:
      _simd_layout.resize(_ndimension);
      _lstart.resize(_ndimension);
      _lend.resize(_ndimension);
-      
+
      _ostride.resize(_ndimension);
      _istride.resize(_ndimension);
-      
+
      _fsites = _gsites = _osites = _isites = 1;
-	
-      _checker_dim_mask=checker_dim_mask;

-      for(int d=0;d<_ndimension;d++){
-	_fdimensions[d] = dimensions[d];
-	_gdimensions[d] = _fdimensions[d];
-	_fsites = _fsites * _fdimensions[d];
-	_gsites = _gsites * _gdimensions[d];
-        
-	if (d==_checker_dim) {
-	  _gdimensions[d] = _gdimensions[d]/2; // Remove a checkerboard
-	}
-	_ldimensions[d] = _gdimensions[d]/_processors[d];
-	_lstart[d]     = _processor_coor[d]*_ldimensions[d];
-	_lend[d]       = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1;
+      _checker_dim_mask = checker_dim_mask;

-	// Use a reduced simd grid
-	_simd_layout[d] = simd_layout[d];
-	_rdimensions[d]= _ldimensions[d]/_simd_layout[d];
-	assert(_rdimensions[d]>0);
+      for (int d = 0; d < _ndimension; d++)
+      {
+        _fdimensions[d] = dimensions[d];
+        _gdimensions[d] = _fdimensions[d];
+        _fsites = _fsites * _fdimensions[d];
+        _gsites = _gsites * _gdimensions[d];

-	// all elements of a simd vector must have same checkerboard.
-	// If Ls vectorised, this must still be the case; e.g. dwf rb5d
-	if ( _simd_layout[d]>1 ) {
-	  if ( checker_dim_mask[d] ) { 
-	    assert( (_rdimensions[d]&0x1) == 0 );
-	  }
-	}
+        if (d == _checker_dim)
+        {
+          assert((_gdimensions[d] & 0x1) == 0);
+          _gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard
+        }
+        _ldimensions[d] = _gdimensions[d] / _processors[d];
+        assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
+        _lstart[d] = _processor_coor[d] * _ldimensions[d];
+        _lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;

-	_osites *= _rdimensions[d];
-	_isites *= _simd_layout[d];
-        
-	// Addressing support
-	if ( d==0 ) {
-	  _ostride[d] = 1;
-	  _istride[d] = 1;
-	} else {
-	  _ostride[d] = _ostride[d-1]*_rdimensions[d-1];
-	  _istride[d] = _istride[d-1]*_simd_layout[d-1];
-	}
+        // Use a reduced simd grid
+        _simd_layout[d] = simd_layout[d];
+        _rdimensions[d] = _ldimensions[d] / _simd_layout[d]; // integer division; exact divisibility is asserted on the next line
+        assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
+        assert(_rdimensions[d] > 0);

+        // all elements of a simd vector must have same checkerboard.
+        // If Ls vectorised, this must still be the case; e.g. dwf rb5d
+        if (_simd_layout[d] > 1)
+        {
+          if (checker_dim_mask[d])
+          {
+            assert((_rdimensions[d] & 0x1) == 0);
+          }
+        }

+        _osites *= _rdimensions[d];
+        _isites *= _simd_layout[d];
+
+        // Addressing support
+        if (d == 0)
+        {
+          _ostride[d] = 1;
+          _istride[d] = 1;
+        }
+        else
+        {
+          _ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
+          _istride[d] = _istride[d - 1] * _simd_layout[d - 1];
+        }
      }
-            
+
      ////////////////////////////////////////////////////////////////////////////////////////////
      // subplane information
      ////////////////////////////////////////////////////////////////////////////////////////////
      _slice_block.resize(_ndimension);
      _slice_stride.resize(_ndimension);
      _slice_nblock.resize(_ndimension);
-        
-      int block =1;
-      int nblock=1;
-      for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];
-      
-      for(int d=0;d<_ndimension;d++){
-	nblock/=_rdimensions[d];
-	_slice_block[d] =block;
-	_slice_stride[d]=_ostride[d]*_rdimensions[d];
-	_slice_nblock[d]=nblock;
-	block = block*_rdimensions[d];
+
+      int block = 1;
+      int nblock = 1;
+      for (int d = 0; d < _ndimension; d++)
+        nblock *= _rdimensions[d];
+
+      for (int d = 0; d < _ndimension; d++)
+      {
+        nblock /= _rdimensions[d];
+        _slice_block[d] = block;
+        _slice_stride[d] = _ostride[d] * _rdimensions[d];
+        _slice_nblock[d] = nblock;
+        block = block * _rdimensions[d];
      }

      ////////////////////////////////////////////////
      // Create a checkerboard lookup table
      ////////////////////////////////////////////////
      int rvol = 1;
-      for(int d=0;d<_ndimension;d++){
-	rvol=rvol * _rdimensions[d];
+      for (int d = 0; d < _ndimension; d++)
+      {
+        rvol = rvol * _rdimensions[d];
      }
      _checker_board.resize(rvol);
-      for(int osite=0;osite<_osites;osite++){
-	_checker_board[osite] = CheckerBoardFromOindex (osite);
+      for (int osite = 0; osite < _osites; osite++)
+      {
+        _checker_board[osite] = CheckerBoardFromOindex(osite);
      }
-      
    };
-protected:
+
+  protected:
    virtual int oIndex(std::vector<int> &coor)
    {
-      int idx=0;
-      for(int d=0;d<_ndimension;d++) {
-	if( d==_checker_dim ) {
-	  idx+=_ostride[d]*((coor[d]/2)%_rdimensions[d]);
-	} else {
-	  idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
-	}
+      int idx = 0;
+      for (int d = 0; d < _ndimension; d++)
+      {
+        if (d == _checker_dim)
+        {
+          idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]);
+        }
+        else
+        {
+          idx += _ostride[d] * (coor[d] % _rdimensions[d]);
+        }
      }
      return idx;
    };
-        
+
    virtual int iIndex(std::vector<int> &lcoor)
    {
-        int idx=0;
-        for(int d=0;d<_ndimension;d++) {
-	  if( d==_checker_dim ) {
-	    idx+=_istride[d]*(lcoor[d]/(2*_rdimensions[d]));
-	  } else { 
-	    idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
-	  }
-	}
-        return idx;
+      int idx = 0;
+      for (int d = 0; d < _ndimension; d++)
+      {
+        if (d == _checker_dim)
+        {
+          idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d]));
+        }
+        else
+        {
+          idx += _istride[d] * (lcoor[d] / _rdimensions[d]);
+        }
+      }
+      return idx;
    }
 };
-
 }
 #endif
--- a/lib/communicator/Communicator_base.cc
+++ b/lib/communicator/Communicator_base.cc
@@ -26,6 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/GridCore.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <limits.h>
+#include <sys/mman.h>

 namespace Grid {

@@ -33,8 +37,11 @@ namespace Grid {
 // Info that is setup once and indept of cartesian layout
 ///////////////////////////////////////////////////////////////
 void *              CartesianCommunicator::ShmCommBuf;
-uint64_t            CartesianCommunicator::MAX_MPI_SHM_BYTES   = 128*1024*1024; 
-CartesianCommunicator::CommunicatorPolicy_t  CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
+uint64_t            CartesianCommunicator::MAX_MPI_SHM_BYTES   = 1024LL*1024LL*1024LL; 
+CartesianCommunicator::CommunicatorPolicy_t  
+CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
+int CartesianCommunicator::nCommThreads = -1;
+int CartesianCommunicator::Hugepages = 0;

 /////////////////////////////////
 // Alloc, free shmem region
@@ -89,25 +96,43 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
  GlobalSumVector((double *)c,2*N);
 }

-#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L)
+#if !defined( GRID_COMMS_MPI3) 

 int                      CartesianCommunicator::NodeCount(void)    { return ProcessorCount();};
 int                      CartesianCommunicator::RankCount(void)    { return ProcessorCount();};
-
-double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-						       void *xmit,
-						       int xmit_to_rank,
-						       void *recv,
-						       int recv_from_rank,
-						       int bytes)
+#endif
+#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPIT)
+double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
+						     int xmit_to_rank,
+						     void *recv,
+						     int recv_from_rank,
+						     int bytes, int dir)
 {
+  std::vector<CommsRequest_t> list;
+  // Discard the "dir"
+  SendToRecvFromBegin   (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
+  SendToRecvFromComplete(list);
+  return 2.0*bytes;
+}
+double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+							 void *xmit,
+							 int xmit_to_rank,
+							 void *recv,
+							 int recv_from_rank,
+							 int bytes, int dir)
+{
+  // Discard the "dir"
  SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
  return 2.0*bytes;
 }
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {
  SendToRecvFromComplete(waitall);
 }
+#endif
+
+#if !defined( GRID_COMMS_MPI3) 
+
 void CartesianCommunicator::StencilBarrier(void){};

 commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector;
@@ -121,8 +146,25 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
  return NULL;
 }
 void CartesianCommunicator::ShmInitGeneric(void){
+#if 1
+
+  int mmap_flag = MAP_SHARED | MAP_ANONYMOUS;
+#ifdef MAP_HUGETLB
+  if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
+#endif
+  ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag, -1, 0); 
+  if (ShmCommBuf == (void *)MAP_FAILED) {
+    perror("mmap failed ");
+    exit(EXIT_FAILURE);  
+  }
+#ifdef MADV_HUGEPAGE
+  if (!Hugepages ) madvise(ShmCommBuf,MAX_MPI_SHM_BYTES,MADV_HUGEPAGE);
+#endif
+#else 
  ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
  ShmCommBuf=(void *)&ShmBufStorageVector[0];
+#endif
+  bzero(ShmCommBuf,MAX_MPI_SHM_BYTES);
 }

 #endif
--- a/lib/communicator/Communicator_base.h
+++ b/lib/communicator/Communicator_base.h
@@ -38,7 +38,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifdef GRID_COMMS_MPI3
 #include <mpi.h>
 #endif
-#ifdef GRID_COMMS_MPI3L
+#ifdef GRID_COMMS_MPIT
 #include <mpi.h>
 #endif
 #ifdef GRID_COMMS_SHMEM
@@ -50,12 +50,24 @@ namespace Grid {
 class CartesianCommunicator {
  public:    

-  // 65536 ranks per node adequate for now
+
+  ////////////////////////////////////////////
+  // Isend/Irecv/Wait, or Sendrecv blocking
+  ////////////////////////////////////////////
+  enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
+  static CommunicatorPolicy_t CommunicatorPolicy;
+  static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
+
+  ///////////////////////////////////////////
+  // Up to 65536 ranks per node adequate for now
  // 128MB shared memory for comms enought for 48^4 local vol comms
  // Give external control (command line override?) of this
-
-  static const int      MAXLOG2RANKSPERNODE = 16;            
-  static uint64_t MAX_MPI_SHM_BYTES;
+  ///////////////////////////////////////////
+  static const int MAXLOG2RANKSPERNODE = 16;            
+  static uint64_t  MAX_MPI_SHM_BYTES;
+  static int       nCommThreads;
+  // use explicit huge pages
+  static int       Hugepages;

  // Communicator should know nothing of the physics grid, only processor grid.
  int              _Nprocessors;     // How many in all
@@ -64,14 +76,18 @@ class CartesianCommunicator {
  std::vector<int> _processor_coor;  // linear processor coordinate
  unsigned long _ndimension;

-#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPI3L)
+#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
  static MPI_Comm communicator_world;
-         MPI_Comm communicator;
+
+  MPI_Comm              communicator;
+  std::vector<MPI_Comm> communicator_halo;
+
  typedef MPI_Request CommsRequest_t;
 #else 
  typedef int CommsRequest_t;
 #endif

+
  ////////////////////////////////////////////////////////////////////
  // Helper functionality for SHM Windows common to all other impls
  ////////////////////////////////////////////////////////////////////
@@ -117,11 +133,7 @@ class CartesianCommunicator {
  /////////////////////////////////
  static void * ShmCommBuf;

-  // Isend/Irecv/Wait, or Sendrecv blocking
-  enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
-  static CommunicatorPolicy_t CommunicatorPolicy;
-  static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
-
+  
  size_t heap_top;
  size_t heap_bytes;

@@ -211,14 +223,21 @@ class CartesianCommunicator {
  
  void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);

+  double StencilSendToRecvFrom(void *xmit,
+			       int xmit_to_rank,
+			       void *recv,
+			       int recv_from_rank,
+			       int bytes,int dir);
+
  double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-				  void *xmit,
-				  int xmit_to_rank,
-				  void *recv,
-				  int recv_from_rank,
-				  int bytes);
+				    void *xmit,
+				    int xmit_to_rank,
+				    void *recv,
+				    int recv_from_rank,
+				    int bytes,int dir);
  
-  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
+  
+  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
  void StencilBarrier(void);

  ////////////////////////////////////////////////////////////
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@@ -41,9 +41,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifdef HAVE_NUMAIF_H
 #include <numaif.h>
 #endif
-#ifndef SHM_HUGETLB
-#define SHM_HUGETLB 04000
-#endif
+

 namespace Grid {

@@ -200,7 +198,46 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
  ShmCommBuf = 0;
  ShmCommBufs.resize(ShmSize);

-#if 1
+  ////////////////////////////////////////////////////////////////////////////////////////////
+  // hugetlbfs and similar filesystems provide files that can be mmap'ed as huge pages
+  ////////////////////////////////////////////////////////////////////////////////////////////
+#ifdef GRID_MPI3_SHMMMAP
+  char shm_name [NAME_MAX];
+  for(int r=0;r<ShmSize;r++){
+    // Map one file per (GroupRank, r) under GRID_SHM_PATH (presumably a hugetlbfs-style mount -- confirm in the build configuration)
+    size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
+    snprintf(shm_name,sizeof(shm_name),GRID_SHM_PATH "/Grid_mpi3_shm_%d_%d",GroupRank,r); // bounded: a long path must not overrun shm_name
+    //sprintf(shm_name,"/var/lib/hugetlbfs/group/wheel/pagesize-2MB/" "Grid_mpi3_shm_%d_%d",GroupRank,r);
+    //    printf("Opening file %s \n",shm_name);
+    int fd=open(shm_name,O_RDWR|O_CREAT,0666);
+    if ( fd == -1) { 
+      printf("open %s failed\n",shm_name);
+      perror("open hugetlbfs");
+      exit(EXIT_FAILURE); // was exit(0), which reported success to the launcher on a fatal error
+    }
+    int mmap_flag = MAP_SHARED ;
+#ifdef MAP_POPULATE    
+    mmap_flag|=MAP_POPULATE;
+#endif
+#ifdef MAP_HUGETLB
+    if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
+#endif
+    void *ptr = (void *) mmap(NULL, size, PROT_READ | PROT_WRITE, mmap_flag,fd, 0); 
+    if ( ptr == (void *)MAP_FAILED ) {    
+      printf("mmap %s failed\n",shm_name);
+      perror("failed mmap");      assert(0);    
+    }
+    assert(((uint64_t)ptr&0x3F)==0);
+    ShmCommBufs[r] =ptr;
+    close(fd); // the mapping remains valid after close; previously one descriptor leaked per local rank
+  }
+#endif
+  ////////////////////////////////////////////////////////////////////////////////////////////
+  // POSIX shm_open: as far as I know Linux does not allow EXPLICIT huge pages in this case;
+  // tmpfs (according to Larry Meadows) does not support explicit huge pages, and tmpfs is what
+  // backs the POSIX shm virtual file system
+  ////////////////////////////////////////////////////////////////////////////////////////////
+#ifdef GRID_MPI3_SHMOPEN
  char shm_name [NAME_MAX];
  if ( ShmRank == 0 ) {
    for(int r=0;r<ShmSize;r++){
@@ -213,13 +250,22 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
      int fd=shm_open(shm_name,O_RDWR|O_CREAT,0666);
      if ( fd < 0 ) {	perror("failed shm_open");	assert(0);      }
      ftruncate(fd, size);
+      
+      int mmap_flag = MAP_SHARED;
+#ifdef MAP_POPULATE 
+      mmap_flag |= MAP_POPULATE;
+#endif
+#ifdef MAP_HUGETLB
+      if (Hugepages) mmap_flag |= MAP_HUGETLB;
+#endif
+      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);

-      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-      if ( ptr == MAP_FAILED ) {       perror("failed mmap");      assert(0);    }
+      if ( ptr == (void * )MAP_FAILED ) {       perror("failed mmap");      assert(0);    }
      assert(((uint64_t)ptr&0x3F)==0);

-      // Try to force numa domain on the shm segment if we have numaif.h
-#ifdef HAVE_NUMAIF_H
+// Experimental (currently compiled out): try to force the NUMA domain of the shm segment if we have numaif.h
+#if 0
+//#ifdef HAVE_NUMAIF_H
 	int status;
 	int flags=MPOL_MF_MOVE;
 #ifdef KNL
@@ -236,7 +282,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
 	  if (ierr && (page==0)) perror("numa relocate command failed");
 	}
 #endif
-      ShmCommBufs[r] =ptr;
+	ShmCommBufs[r] =ptr;
      
    }
  }
@@ -258,21 +304,32 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
      ShmCommBufs[r] =ptr;
    }
  }
-
-#else
+#endif
+  ////////////////////////////////////////////////////////////////////////////////////////////
+  // SHMGET SHMAT and SHM_HUGETLB flag
+  ////////////////////////////////////////////////////////////////////////////////////////////
+#ifdef GRID_MPI3_SHMGET
  std::vector<int> shmids(ShmSize);

  if ( ShmRank == 0 ) {
    for(int r=0;r<ShmSize;r++){
      size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
-      key_t key   = 0x4545 + r;
-      if ((shmids[r]= shmget(key,size, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) {
+      key_t key   = IPC_PRIVATE;
+      int flags = IPC_CREAT | SHM_R | SHM_W;
+#ifdef SHM_HUGETLB
+      if (Hugepages) flags|=SHM_HUGETLB;
+#endif
+      if ((shmids[r]= shmget(key,size, flags)) ==-1) {
 	int errsv = errno;
 	printf("Errno %d\n",errsv);
+	printf("key   %d\n",key);
+	printf("size  %lld\n",size);
+	printf("flags %d\n",flags);
 	perror("shmget");
 	exit(1);
+      } else { 
+	printf("shmid: 0x%x\n", shmids[r]);
      }
-      printf("shmid: 0x%x\n", shmids[r]);
    }
  }
  MPI_Barrier(ShmComm);
@@ -397,8 +454,14 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 { 
  int ierr;
  communicator=communicator_world;
+
  _ndimension = processors.size();

+  communicator_halo.resize (2*_ndimension);
+  for(int i=0;i<_ndimension*2;i++){
+    MPI_Comm_dup(communicator,&communicator_halo[i]);
+  }
+
  ////////////////////////////////////////////////////////////////
  // Assert power of two shm_size.
  ////////////////////////////////////////////////////////////////
@@ -621,13 +684,27 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
  }
 }

-double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
-						       void *xmit,
-						       int dest,
-						       void *recv,
-						       int from,
-						       int bytes)
+double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
+						     int dest,
+						     void *recv,
+						     int from,
+						     int bytes,int dir)
 {
+  std::vector<CommsRequest_t> pending; // requests queued by Begin, waited on by Complete
+  const double off_node = StencilSendToRecvFromBegin(pending,xmit,dest,recv,from,bytes,dir);
+  StencilSendToRecvFromComplete(pending,dir);
+  return off_node;
+}
+
+double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+							 void *xmit,
+							 int dest,
+							 void *recv,
+							 int from,
+							 int bytes,int dir)
+{
+  assert(dir < communicator_halo.size());
+
  MPI_Request xrq;
  MPI_Request rrq;

@@ -646,26 +723,26 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
  gfrom = MPI_UNDEFINED;
 #endif
  if ( gfrom ==MPI_UNDEFINED) {
-    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
+    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[dir],&rrq);
    assert(ierr==0);
    list.push_back(rrq);
    off_node_bytes+=bytes;
  }

  if ( gdest == MPI_UNDEFINED ) {
-    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
+    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[dir],&xrq);
    assert(ierr==0);
    list.push_back(xrq);
    off_node_bytes+=bytes;
  }

  if ( CommunicatorPolicy == CommunicatorPolicySequential ) { 
-    this->StencilSendToRecvFromComplete(list);
+    this->StencilSendToRecvFromComplete(list,dir);
  }

  return off_node_bytes;
 }
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {
  SendToRecvFromComplete(waitall);
 }
--- a/lib/communicator/Communicator_mpit.cc
+++ b/lib/communicator/Communicator_mpit.cc
@@ -0,0 +1,286 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/communicator/Communicator_mpit.cc
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#include <Grid/GridCore.h>
+#include <Grid/GridQCDcore.h>
+#include <Grid/qcd/action/ActionCore.h>
+#include <mpi.h>
+
+namespace Grid {
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+// Info that is set up once and is independent of the cartesian layout
+///////////////////////////////////////////////////////////////////////////////////////////////////
+MPI_Comm CartesianCommunicator::communicator_world;
+
+// Should error check all MPI calls.
+void CartesianCommunicator::Init(int *argc, char ***argv) {
+  int flag;
+  int provided;
+  MPI_Initialized(&flag); // needed to coexist with other libs apparently
+  // Initialise MPI ourselves, or query the thread level granted to whoever already did.
+  if ( !flag ) MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
+  else         MPI_Query_thread(&provided);
+  if ( provided != MPI_THREAD_MULTIPLE ) { // without full thread support, do comms before compute
+    QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
+  }
+  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
+  ShmInitGeneric();
+}
+
+CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
+{
+  // Record the requested processor grid; every dimension is periodic.
+  _processors = processors;
+  _ndimension = processors.size();
+  _processor_coor.resize(_ndimension);
+  std::vector<int> wrap(_ndimension,1);
+
+  // Cartesian communicator (reordering allowed), then this rank and its coordinates in it.
+  MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&wrap[0],1,&communicator);
+  MPI_Comm_rank(communicator,&_processor);
+  MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
+
+  // Total rank count is the product of the grid extents.
+  _Nprocessors=1;
+  for(int d=0;d<_ndimension;d++) _Nprocessors*=_processors[d];
+
+  // One duplicated communicator per halo direction index (two per dimension).
+  communicator_halo.resize (2*_ndimension);
+  for(int h=0;h<2*_ndimension;h++){
+    MPI_Comm_dup(communicator,&communicator_halo[h]);
+  }
+  // The communicator must contain exactly as many ranks as the grid describes.
+  int cart_size;
+  MPI_Comm_size(communicator,&cart_size);
+  assert(cart_size==_Nprocessors);
+}
+void CartesianCommunicator::GlobalSum(uint32_t &u){ // in-place sum of u over `communicator`
+  const int rc = MPI_Allreduce(MPI_IN_PLACE, &u, 1, MPI_UINT32_T, MPI_SUM, communicator);
+  assert(rc == 0);
+}
+void CartesianCommunicator::GlobalSum(uint64_t &u){ // in-place sum of u over `communicator`
+  const int rc = MPI_Allreduce(MPI_IN_PLACE, &u, 1, MPI_UINT64_T, MPI_SUM, communicator);
+  assert(rc == 0);
+}
+void CartesianCommunicator::GlobalXOR(uint32_t &u){ // in-place bitwise XOR of u over `communicator`
+  const int rc = MPI_Allreduce(MPI_IN_PLACE, &u, 1, MPI_UINT32_T, MPI_BXOR, communicator);
+  assert(rc == 0);
+}
+void CartesianCommunicator::GlobalXOR(uint64_t &u){ // in-place bitwise XOR of u over `communicator`
+  const int rc = MPI_Allreduce(MPI_IN_PLACE, &u, 1, MPI_UINT64_T, MPI_BXOR, communicator);
+  assert(rc == 0);
+}
+void CartesianCommunicator::GlobalSum(float &f){ // in-place sum of f over `communicator`
+  const int rc = MPI_Allreduce(MPI_IN_PLACE, &f, 1, MPI_FLOAT, MPI_SUM, communicator);
+  assert(rc == 0);
+}
+void CartesianCommunicator::GlobalSumVector(float *f,int N)
+{ // element-wise in-place sum of the N floats at f over `communicator`
+  const int rc = MPI_Allreduce(MPI_IN_PLACE, f, N, MPI_FLOAT, MPI_SUM, communicator);
+  assert(rc == 0);
+}
+void CartesianCommunicator::GlobalSum(double &d)
+{ // in-place sum of d over `communicator`
+  const int rc = MPI_Allreduce(MPI_IN_PLACE, &d, 1, MPI_DOUBLE, MPI_SUM, communicator);
+  assert(rc == 0);
+}
+void CartesianCommunicator::GlobalSumVector(double *d,int N)
+{ // element-wise in-place sum of the N doubles at d over `communicator`
+  const int rc = MPI_Allreduce(MPI_IN_PLACE, d, N, MPI_DOUBLE, MPI_SUM, communicator);
+  assert(rc == 0);
+}
+void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
+{ // source/dest ranks for a displacement of `shift` along dimension `dim`
+  const int rc = MPI_Cart_shift(communicator, dim, shift, &source, &dest);
+  assert(rc == 0);
+}
+int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
+{ // rank in `communicator` of the process at cartesian coordinates `coor`
+  int owner;
+  const int rc = MPI_Cart_rank(communicator, &coor[0], &owner);
+  assert(rc == 0);
+  return owner;
+}
+void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
+{ // fills `coor` (resized to _ndimension) with the cartesian coordinates of `rank`
+  coor.resize(_ndimension);
+  const int rc = MPI_Cart_coords(communicator, rank, _ndimension, &coor[0]);
+  assert(rc == 0);
+}
+
+// Basic Halo comms primitive
+void CartesianCommunicator::SendToRecvFrom(void *xmit,
+                                           int dest,
+                                           void *recv,
+                                           int from,
+                                           int bytes)
+{
+  std::vector<CommsRequest_t> pending; // requests queued by Begin, waited on by Complete
+  SendToRecvFromBegin(pending, xmit, dest, recv, from, bytes);
+  SendToRecvFromComplete(pending);
+}
+
+void CartesianCommunicator::SendRecvPacket(void *xmit,
+                                           void *recv,
+                                           int sender,
+                                           int receiver,
+                                           int bytes)
+{
+  // The message tag is the sender's rank; sender and receiver must be different ranks.
+  assert(sender != receiver);
+  if ( _processor == sender ) {
+    MPI_Send(xmit, bytes, MPI_CHAR, receiver, sender, communicator);
+  }
+  if ( _processor == receiver ) {
+    MPI_Status recv_status;
+    MPI_Recv(recv, bytes, MPI_CHAR, sender, sender, communicator, &recv_status);
+  }
+}
+
+// Basic Halo comms primitive
+void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+                                                void *xmit,
+                                                int dest,
+                                                void *recv,
+                                                int from,
+                                                int bytes)
+{
+  const int me = _processor; // outgoing tag; incoming messages are tagged with the source rank
+  if ( CommunicatorPolicy != CommunicatorPolicyConcurrent ) {
+    // Non-concurrent policy: one blocking exchange, nothing is queued on the list.
+    // Give the CPU to MPI immediately; can use threads to overlap optionally
+    int rc=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,me,
+                        recv,bytes,MPI_CHAR,from,from,
+                        communicator,MPI_STATUS_IGNORE);
+    assert(rc==0);
+    return;
+  }
+
+  // Concurrent policy: post the receive first, then the send, and queue both requests.
+  MPI_Request send_req;
+  MPI_Request recv_req;
+  int rc =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&recv_req);
+  rc    |=MPI_Isend(xmit, bytes, MPI_CHAR,dest,me,communicator,&send_req);
+  assert(rc==0);
+  list.push_back(send_req);
+  list.push_back(recv_req);
+}
+void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
+{
+  // Only the concurrent policy waits here; an empty list has nothing to wait on.
+  if ( CommunicatorPolicy != CommunicatorPolicyConcurrent || list.empty() ) return;
+  int nreq=list.size();
+  std::vector<MPI_Status> status(nreq);
+  int ierr = MPI_Waitall(nreq,list.data(),status.data()); // data() never indexes element 0
+  assert(ierr==0);
+}
+
+void CartesianCommunicator::Barrier(void)
+{ // block until every rank of `communicator` has reached this call
+  const int rc = MPI_Barrier(communicator);
+  assert(rc == 0);
+}
+
+void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
+{
+  // Broadcast `bytes` raw bytes at `data` from rank `root`
+  // to every rank of the cartesian communicator.
+  const int rc = MPI_Bcast(data, bytes, MPI_BYTE, root, communicator);
+
+  // Any non-zero MPI return code trips the assert.
+  assert(rc == 0);
+}
+  ///////////////////////////////////////////////////////
+  // Should only be used prior to Grid Init finished.
+  // Check for this?
+  ///////////////////////////////////////////////////////
+int CartesianCommunicator::RankWorld(void){ 
+  int world_rank; // rank of this process in communicator_world
+  MPI_Comm_rank(communicator_world, &world_rank);
+  return world_rank;
+}
+void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
+{
+  // Broadcast `bytes` raw bytes at `data` from rank `root`
+  // to every rank of communicator_world.
+  const int rc = MPI_Bcast(data, bytes, MPI_BYTE, root, communicator_world);
+
+  // Any non-zero MPI return code trips the assert.
+  assert(rc == 0);
+}
+
+double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+                                                         void *xmit,
+                                                         int xmit_to_rank,
+                                                         void *recv,
+                                                         int recv_from_rank,
+                                                         int bytes,int dir)
+{
+  int myrank = _processor;
+  int ierr;
+  assert(dir >= 0 && dir < (int)communicator_halo.size());
+  // Non-blocking exchange on communicator_halo[dir]; returns the bytes posted (send + receive).
+  // Give the CPU to MPI immediately; can use threads to overlap optionally
+  MPI_Request req[2];
+  ierr =MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
+  ierr|=MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank  ,myrank        , communicator_halo[dir],&req[0]);
+  assert(ierr==0); // check the MPI return codes instead of discarding them
+
+  list.push_back(req[0]);
+  list.push_back(req[1]);
+  return 2.0*bytes;
+}
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
+{ 
+  // Nothing to wait on for an empty list; data() avoids indexing element 0 of an empty vector.
+  if ( !waitall.empty() ) MPI_Waitall((int)waitall.size(), waitall.data(), MPI_STATUSES_IGNORE);
+}
+double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,
+                                                    int xmit_to_rank,
+                                                    void *recv,
+                                                    int recv_from_rank,
+                                                    int bytes,int dir)
+{
+  int myrank = _processor;
+  int ierr;
+  assert(dir >= 0 && dir < (int)communicator_halo.size());
+  // Blocking exchange on communicator_halo[dir]; returns the bytes moved (send + receive).
+  MPI_Request req[2];
+  ierr =MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
+  ierr|=MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank  ,myrank        , communicator_halo[dir],&req[0]);
+  assert(ierr==0); // both requests must be valid before waiting on them
+  ierr =MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
+  assert(ierr==0);
+  return 2.0*bytes;
+}
+
+
+
+}
+
--- a/lib/cshift/Cshift.h
+++ b/lib/cshift/Cshift.h
@@ -42,7 +42,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/cshift/Cshift_mpi.h>
 #endif 

-#ifdef GRID_COMMS_MPI3L
+#ifdef GRID_COMMS_MPIT
 #include <Grid/cshift/Cshift_mpi.h>
 #endif 

--- a/lib/lattice/Lattice_reduction.h
+++ b/lib/lattice/Lattice_reduction.h
@@ -369,6 +369,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
  }
 };

+/*
 inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
 {
  int NN    = BlockSolverGrid->_ndimension;
@@ -387,6 +388,7 @@ inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Or
  }
  return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); 
 }
+*/

 template<class vobj>
 static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
@@ -398,14 +400,15 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
  int Nblock = X._grid->GlobalDimensions()[Orthog];

  GridBase *FullGrid  = X._grid;
-  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);

-  Lattice<vobj> Xslice(SliceGrid);
-  Lattice<vobj> Rslice(SliceGrid);
+  //  Lattice<vobj> Xslice(SliceGrid);
+  //  Lattice<vobj> Rslice(SliceGrid);

  assert( FullGrid->_simd_layout[Orthog]==1);
  int nh =  FullGrid->_ndimension;
-  int nl = SliceGrid->_ndimension;
+  //  int nl = SliceGrid->_ndimension;
+  int nl = nh-1;

  //FIXME package in a convenient iterator
  //Should loop over a plane orthogonal to direction "Orthog"
@@ -448,14 +451,14 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
  int Nblock = X._grid->GlobalDimensions()[Orthog];

  GridBase *FullGrid  = X._grid;
-  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
-
-  Lattice<vobj> Xslice(SliceGrid);
-  Lattice<vobj> Rslice(SliceGrid);
+  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+  //  Lattice<vobj> Xslice(SliceGrid);
+  //  Lattice<vobj> Rslice(SliceGrid);

  assert( FullGrid->_simd_layout[Orthog]==1);
  int nh =  FullGrid->_ndimension;
-  int nl = SliceGrid->_ndimension;
+  //  int nl = SliceGrid->_ndimension;
+  int nl=1;

  //FIXME package in a convenient iterator
  //Should loop over a plane orthogonal to direction "Orthog"
@@ -498,18 +501,19 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>
  typedef typename vobj::vector_type vector_type;
  
  GridBase *FullGrid  = lhs._grid;
-  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
  
  int Nblock = FullGrid->GlobalDimensions()[Orthog];
  
-  Lattice<vobj> Lslice(SliceGrid);
-  Lattice<vobj> Rslice(SliceGrid);
+  //  Lattice<vobj> Lslice(SliceGrid);
+  //  Lattice<vobj> Rslice(SliceGrid);
  
  mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);

  assert( FullGrid->_simd_layout[Orthog]==1);
  int nh =  FullGrid->_ndimension;
-  int nl = SliceGrid->_ndimension;
+  //  int nl = SliceGrid->_ndimension;
+  int nl = nh-1;

  //FIXME package in a convenient iterator
  //Should loop over a plane orthogonal to direction "Orthog"
@@ -550,6 +554,14 @@ static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj>
      mat += mat_thread;
    }  
  }
+
+  for(int i=0;i<Nblock;i++){
+  for(int j=0;j<Nblock;j++){
+    ComplexD sum = mat(i,j);
+    FullGrid->GlobalSum(sum);
+    mat(i,j)=sum;
+  }}
+
  return;
 }

--- a/lib/log/Log.cc
+++ b/lib/log/Log.cc
@@ -95,7 +95,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
 ////////////////////////////////////////////////////////////
 void Grid_quiesce_nodes(void) {
  int me = 0;
-#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPI3L)
+#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
  MPI_Comm_rank(MPI_COMM_WORLD, &me);
 #endif
 #ifdef GRID_COMMS_SHMEM
--- a/lib/parallelIO/BinaryIO.h
+++ b/lib/parallelIO/BinaryIO.h
@@ -29,7 +29,7 @@
 #ifndef GRID_BINARY_IO_H
 #define GRID_BINARY_IO_H

-#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) 
+#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT) 
 #define USE_MPI_IO
 #else
 #undef  USE_MPI_IO
@@ -98,35 +98,39 @@ class BinaryIO {

    NerscChecksum(grid,scalardata,nersc_csum);
  }
-  
-  template<class fobj> static inline void NerscChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &nersc_csum)
+
+  template <class fobj>
+  static inline void NerscChecksum(GridBase *grid, std::vector<fobj> &fbuf, uint32_t &nersc_csum)
  {
-    const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
+    const uint64_t size32 = sizeof(fobj) / sizeof(uint32_t);

-
-    uint64_t lsites              =grid->lSites();
-    if (fbuf.size()==1) {
-      lsites=1;
+    uint64_t lsites = grid->lSites();
+    if (fbuf.size() == 1)
+    {
+      lsites = 1;
    }

-#pragma omp parallel
-    { 
-      uint32_t nersc_csum_thr=0;
+    #pragma omp parallel
+    {
+      uint32_t nersc_csum_thr = 0;

-#pragma omp for
-      for(uint64_t local_site=0;local_site<lsites;local_site++){
-	uint32_t * site_buf = (uint32_t *)&fbuf[local_site];
-	for(uint64_t j=0;j<size32;j++){
-	  nersc_csum_thr=nersc_csum_thr+site_buf[j];
-	}
+      #pragma omp for
+      for (uint64_t local_site = 0; local_site < lsites; local_site++)
+      {
+        uint32_t *site_buf = (uint32_t *)&fbuf[local_site];
+        for (uint64_t j = 0; j < size32; j++)
+        {
+          nersc_csum_thr = nersc_csum_thr + site_buf[j];
+        }
      }

-#pragma omp critical
+      #pragma omp critical
      {
-	nersc_csum  += nersc_csum_thr;
+        nersc_csum += nersc_csum_thr;
      }
    }
  }
+
  template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb)
  {
    const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
@@ -266,7 +270,7 @@ class BinaryIO {
    grid->Barrier();
    GridStopWatch timer; 
    GridStopWatch bstimer;
-
+    
    nersc_csum=0;
    scidac_csuma=0;
    scidac_csumb=0;
@@ -362,18 +366,22 @@ class BinaryIO {
 #else 
 	assert(0);
 #endif
-      } else { 
-	std::cout<< GridLogMessage<< "C++ read I/O "<< file<<" : "
-		 << iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
-	std::ifstream fin;
-	fin.open(file,std::ios::binary|std::ios::in);
-	if ( control & BINARYIO_MASTER_APPEND )  {
-	  fin.seekg(-sizeof(fobj),fin.end);
-	} else { 
-	  fin.seekg(offset+myrank*lsites*sizeof(fobj));
-	}
-	fin.read((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fin.fail()==0);
-	fin.close();
+      } else {
+        std::cout << GridLogMessage << "C++ read I/O " << file << " : "
+                  << iodata.size() * sizeof(fobj) << " bytes" << std::endl;
+        std::ifstream fin;
+        fin.open(file, std::ios::binary | std::ios::in);
+        if (control & BINARYIO_MASTER_APPEND)
+        {
+          fin.seekg(-sizeof(fobj), fin.end);
+        }
+        else
+        {
+          fin.seekg(offset + myrank * lsites * sizeof(fobj));
+        }
+        fin.read((char *)&iodata[0], iodata.size() * sizeof(fobj));
+        assert(fin.fail() == 0);
+        fin.close();
      }
      timer.Stop();

@@ -405,30 +413,78 @@ class BinaryIO {
      timer.Start();
      if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
 #ifdef USE_MPI_IO
-	std::cout<< GridLogMessage<< "MPI write I/O "<< file<< std::endl;
-	ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDWR|MPI_MODE_CREATE,MPI_INFO_NULL, &fh); assert(ierr==0);
-	ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);                        assert(ierr==0);
-	ierr=MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);                                        assert(ierr==0);
-	MPI_File_close(&fh);
-	MPI_Type_free(&fileArray);
-	MPI_Type_free(&localArray);
+        std::cout << GridLogMessage << "MPI write I/O " << file << std::endl;
+        ierr = MPI_File_open(grid->communicator, (char *)file.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh);
+        std::cout << GridLogMessage << "Checking for errors" << std::endl;
+        if (ierr != MPI_SUCCESS)
+        {
+          char error_string[MPI_MAX_ERROR_STRING]; // size MPI requires for MPI_Error_string
+          int length_of_error_string, error_class;
+
+          MPI_Error_class(ierr, &error_class); // print the error class text first, then the specific code
+          MPI_Error_string(error_class, error_string, &length_of_error_string);
+          fprintf(stderr, "%3d: %s\n", myrank, error_string);
+          MPI_Error_string(ierr, error_string, &length_of_error_string);
+          fprintf(stderr, "%3d: %s\n", myrank, error_string);
+          MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0);
+        }
+
+        std::cout << GridLogDebug << "MPI write I/O set view " << file << std::endl;
+        ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);
+        assert(ierr == 0);
+
+        std::cout << GridLogDebug << "MPI write I/O write all " << file << std::endl;
+        ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
+        assert(ierr == 0);
+
+        MPI_File_close(&fh);
+        MPI_Type_free(&fileArray);
+        MPI_Type_free(&localArray);
 #else 
 	assert(0);
 #endif
      } else { 
-	std::ofstream fout; fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
-	std::cout<< GridLogMessage<< "C++ write I/O "<< file<<" : "
-		 << iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
-	if ( control & BINARYIO_MASTER_APPEND )  {
+        
+	std::ofstream fout; 
+  fout.exceptions ( std::fstream::failbit | std::fstream::badbit );
+  try {
+    fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
+  } catch (const std::fstream::failure& exc) {
+    std::cout << GridLogError << "Error in opening the file " << file << " for output" <<std::endl;
+    std::cout << GridLogError << "Exception description: " << exc.what() << std::endl;
+    std::cout << GridLogError << "Probable cause: wrong path, inaccessible location "<< std::endl;
+    #ifdef USE_MPI_IO
+    MPI_Abort(MPI_COMM_WORLD,1);
+    #else
+    exit(1);
+    #endif
+  }
+	std::cout << GridLogMessage<< "C++ write I/O "<< file<<" : "
+		        << iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
+	
+  if ( control & BINARYIO_MASTER_APPEND )  {
 	  fout.seekp(0,fout.end);
 	} else {
 	  fout.seekp(offset+myrank*lsites*sizeof(fobj));
 	}
-	fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fout.fail()==0);
+  
+  try {
+  	fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));//assert( fout.fail()==0);
+  }
+  catch (const std::fstream::failure& exc) {
+    std::cout << "Exception in writing file " << file << std::endl;
+    std::cout << GridLogError << "Exception description: "<< exc.what() << std::endl;
+    #ifdef USE_MPI_IO
+    MPI_Abort(MPI_COMM_WORLD,1);
+    #else
+    exit(1);
+    #endif
+  }
+
 	fout.close();
-      }
-      timer.Stop();
-    }
+  }
+  timer.Stop();
+  }

    std::cout<<GridLogMessage<<"IOobject: ";
    if ( control & BINARYIO_READ) std::cout << " read  ";
@@ -442,11 +498,14 @@ class BinaryIO {
    //////////////////////////////////////////////////////////////////////////////
    // Safety check
    //////////////////////////////////////////////////////////////////////////////
-    grid->Barrier();
-    grid->GlobalSum(nersc_csum);
-    grid->GlobalXOR(scidac_csuma);
-    grid->GlobalXOR(scidac_csumb);
-    grid->Barrier();
+    // if the data size is 1 we do not want to sum over the MPI ranks
+    if (iodata.size() != 1){
+      grid->Barrier();
+      grid->GlobalSum(nersc_csum);
+      grid->GlobalXOR(scidac_csuma);
+      grid->GlobalXOR(scidac_csumb);
+      grid->Barrier();
+    }
  }

  /////////////////////////////////////////////////////////////////////////////
@@ -546,9 +605,9 @@ class BinaryIO {
    int gsites = grid->gSites();
    int lsites = grid->lSites();

-    uint32_t nersc_csum_tmp;
-    uint32_t scidac_csuma_tmp;
-    uint32_t scidac_csumb_tmp;
+    uint32_t nersc_csum_tmp   = 0;
+    uint32_t scidac_csuma_tmp = 0;
+    uint32_t scidac_csumb_tmp = 0;

    GridStopWatch timer;

--- a/lib/qcd/action/fermion/CayleyFermion5D.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5D.cc
@@ -414,7 +414,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
  for(int i=0; i < Ls; i++){
    as[i] = 1.0;
    omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code
-    //    assert(fabs(omega[i])>0.0);
+    assert(omega[i]!=Coeff_t(0.0));
    bs[i] = 0.5*(bpc/omega[i] + bmc);
    cs[i] = 0.5*(bpc/omega[i] - bmc);
  }
@@ -429,7 +429,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
  
  for(int i=0;i<Ls;i++){
    bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);     
-    //    assert(fabs(bee[i])>0.0);
+    assert(bee[i]!=Coeff_t(0.0));
    cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
    beo[i]=as[i]*bs[i];
    ceo[i]=-as[i]*cs[i];
@@ -455,11 +455,17 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
    dee[i] = bee[i];
    
    if ( i < Ls-1 ) {
+
+      assert(bee[i]!=Coeff_t(0.0));
+      assert(bee[0]!=Coeff_t(0.0));
      
      lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
      
      leem[i]=mass*cee[Ls-1]/bee[0];
-      for(int j=0;j<i;j++)  leem[i]*= aee[j]/bee[j+1];
+      for(int j=0;j<i;j++) {
+	assert(bee[j+1]!=Coeff_t(0.0));
+	leem[i]*= aee[j]/bee[j+1];
+      }
      
      uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row
      
@@ -478,7 +484,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
  { 
    Coeff_t delta_d=mass*cee[Ls-1];
    for(int j=0;j<Ls-1;j++) {
-      //      assert(fabs(bee[j])>0.0);
+      assert(bee[j] != Coeff_t(0.0));
      delta_d *= cee[j]/bee[j];
    }
    dee[Ls-1] += delta_d;
--- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc
+++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.cc
@@ -230,8 +230,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
 {
  Compressor compressor;
  int LLs = in._grid->_rdimensions[0];
+
+
+
+  DhopTotalTime -= usecond();
+  DhopCommTime -= usecond();
  st.HaloExchange(in,compressor);
+  DhopCommTime += usecond();
  
+  DhopComputeTime -= usecond();
  // Dhop takes the 4d grid from U, and makes a 5d index for fermion
  if (dag == DaggerYes) {
    parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
@@ -244,12 +251,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
 	Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
    }
  }
+  DhopComputeTime += usecond();
+  DhopTotalTime   += usecond();
 }


 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
 {
+  DhopCalls+=1;
  conformable(in._grid,FermionRedBlackGrid());    // verifies half grid
  conformable(in._grid,out._grid); // drops the cb check

@@ -261,6 +271,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
 {
+  DhopCalls+=1;
  conformable(in._grid,FermionRedBlackGrid());    // verifies half grid
  conformable(in._grid,out._grid); // drops the cb check

@@ -272,6 +283,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie
 template<class Impl>
 void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
 {
+  DhopCalls+=2;
  conformable(in._grid,FermionGrid()); // verifies full grid
  conformable(in._grid,out._grid);

@@ -280,6 +292,54 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField
  DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
 }

+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::Report(void) 
+{
+  std::vector<int> latt = GridDefaultLatt();          
+  RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
+  RealD NP = _FourDimGrid->_Nprocessors;
+  RealD NN = _FourDimGrid->NodeCount();
+  // Print per-call timings and flop rates; uses GlobalSum, so presumably every rank must call it.
+  std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
+
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls   : " 
+	    << DhopCalls   << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime   /Calls       : " 
+	    << DhopTotalTime   / DhopCalls << " us" << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime    /Calls       : " 
+	    << DhopCommTime    / DhopCalls << " us" << std::endl;
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls        : " 
+	    << DhopComputeTime / DhopCalls << " us" << std::endl;
+
+  // Average the compute time over ranks in a local copy, so the counter itself is not overwritten
+  double AvgComputeTime = DhopComputeTime;
+  _FourDimGrid->GlobalSum(AvgComputeTime);
+  AvgComputeTime/=NP;
+  RealD mflops = 1154*volume*DhopCalls/AvgComputeTime/2; // 2 for red black counting
+  std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NN << std::endl;
+  
+  RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
+  std::cout << GridLogMessage << "Average mflops/s per call (full)         : " << Fullmflops << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
+  std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
+
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil"    <<std::endl;  Stencil.Report();
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilEven"<<std::endl;  StencilEven.Report();
+  std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
+}
+template<class Impl>
+void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void) 
+{
+  // Reset the Dhop call count and accumulated times held by this operator.
+  DhopCalls = DhopTotalTime = DhopCommTime = DhopComputeTime = 0;
+
+  // Reset the counters kept by each of the three stencils.
+  Stencil.ZeroCounters();
+  StencilEven.ZeroCounters();
+  StencilOdd.ZeroCounters();
+}

 /////////////////////////////////////////////////////////////////////////
 // Implement the general interface. Here we use SAME mass on all slices
--- a/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h
+++ b/lib/qcd/action/fermion/ImprovedStaggeredFermion5D.h
@@ -55,6 +55,16 @@ namespace QCD {
      FermionField _tmp;
      FermionField &tmp(void) { return _tmp; }

+      ////////////////////////////////////////
+      // Performance monitoring
+      ////////////////////////////////////////
+      void Report(void);
+      void ZeroCounters(void);
+      double DhopTotalTime;
+      double DhopCalls;
+      double DhopCommTime;
+      double DhopComputeTime;
+
      ///////////////////////////////////////////////////////////////
      // Implement the abstract base
      ///////////////////////////////////////////////////////////////
--- a/lib/qcd/action/fermion/WilsonCompressor.h
+++ b/lib/qcd/action/fermion/WilsonCompressor.h
@@ -238,7 +238,33 @@ template<typename HCS,typename HS,typename S> using WilsonCompressor = WilsonCom
 template<class vobj,class cobj>
 class WilsonStencil : public CartesianStencil<vobj,cobj> {
 public:
-
+  double timer0;
+  double timer1;
+  double timer2;
+  double timer3;
+  double timer4;
+  double timer5;
+  double timer6;
+  uint64_t callsi;
+  // Zero the WilsonStencil-level instrumentation: timer0..timer6 and callsi.
+  void ZeroCountersi(void)
+  {
+    // timer0: time spent in HaloGatherOpt, accumulated by HaloExchangeOptGather.
+    timer0 = 0;
+    // timer1..timer6: remaining phase timers, cleared in one chained assignment.
+    timer1 = timer2 = timer3 = timer4 = timer5 = timer6 = 0;
+
+    // callsi: incremented once per HaloExchangeOptGather call.
+    callsi = 0;
+  }
+  // Print every non-zero timer: timer0..timer3 divided by `calls`, timer4 as its raw total.
+  void Reporti(int calls) {
+    if ( timer0 != 0 ) std::cout << GridLogMessage << " timer0 (HaloGatherOpt) " << timer0/calls << std::endl;
+    if ( timer1 != 0 ) std::cout << GridLogMessage << " timer1 (Communicate)   " << timer1/calls << std::endl;
+    if ( timer2 != 0 ) std::cout << GridLogMessage << " timer2 (CommsMerge )   " << timer2/calls << std::endl;
+    if ( timer3 != 0 ) std::cout << GridLogMessage << " timer3 (commsMergeShm) " << timer3/calls << std::endl;
+    if ( timer4 != 0 ) std::cout << GridLogMessage << " timer4 " << timer4 << std::endl;
+  }
  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;

  std::vector<int> same_node;
@@ -252,6 +278,7 @@ public:
    : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) ,
    same_node(npoints)
  { 
+    ZeroCountersi();
    surface_list.resize(0);
  };

@@ -261,7 +288,6 @@ public:
    // Here we know the distance is 1 for WilsonStencil
    for(int point=0;point<this->_npoints;point++){
      same_node[point] = this->SameNode(point);
-      //      std::cout << " dir " <<point<<" same_node " <<same_node[point]<<std::endl;
    }
    
    for(int site = 0 ;site< vol4;site++){
@@ -282,17 +308,28 @@ public:
  {
    std::vector<std::vector<CommsRequest_t> > reqs;
    this->HaloExchangeOptGather(source,compress);
-    this->CommunicateBegin(reqs);
-    this->CommunicateComplete(reqs);
+    double t1=usecond();
+    // Asynchronous MPI calls multidirectional, Isend etc...
+    //    this->CommunicateBegin(reqs);
+    //    this->CommunicateComplete(reqs);
+    // Non-overlapped directions within a thread. Asynchronous calls except MPI3, threaded up to comm threads ways.
+    this->Communicate();
+    double t2=usecond(); timer1 += t2-t1;
    this->CommsMerge(compress);
+    double t3=usecond(); timer2 += t3-t2;
    this->CommsMergeSHM(compress);
+    double t4=usecond(); timer3 += t4-t3;
  }
  
  template <class compressor>
  void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress) 
  {
    this->Prepare();
+    double t0=usecond();
    this->HaloGatherOpt(source,compress);
+    double t1=usecond();
+    timer0 += t1-t0;
+    callsi++;
  }

  template <class compressor>
@@ -304,7 +341,9 @@ public:
    typedef typename compressor::SiteHalfSpinor     SiteHalfSpinor;
    typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor;

+    this->mpi3synctime_g-=usecond();
    this->_grid->StencilBarrier();
+    this->mpi3synctime_g+=usecond();

    assert(source._grid==this->_grid);
    this->halogtime-=usecond();
@@ -323,7 +362,6 @@ public:
    int dag = compress.dag;
    int face_idx=0;
    if ( dag ) { 
-      //	std::cout << " Optimised Dagger compress " <<std::endl;
      assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
      assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
      assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -123,22 +123,24 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  int vol4;
  vol4=FourDimGrid.oSites();
  Stencil.BuildSurfaceList(LLs,vol4);
+
  vol4=FourDimRedBlackGrid.oSites();
  StencilEven.BuildSurfaceList(LLs,vol4);
   StencilOdd.BuildSurfaceList(LLs,vol4);

-  std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
-                       <<" " << StencilEven.surface_list.size()<<std::endl;
+   //  std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
+   //                       <<" " << StencilEven.surface_list.size()<<std::endl;

 }
     
 template<class Impl>
 void WilsonFermion5D<Impl>::Report(void)
 {
-    std::vector<int> latt = GridDefaultLatt();          
-    RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
-    RealD NP = _FourDimGrid->_Nprocessors;
-    RealD NN = _FourDimGrid->NodeCount();
+  RealD NP     = _FourDimGrid->_Nprocessors;
+  RealD NN     = _FourDimGrid->NodeCount();
+  RealD volume = Ls;  
+  std::vector<int> latt = _FourDimGrid->GlobalDimensions();
+  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];

  if ( DhopCalls > 0 ) {
    std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
@@ -184,6 +186,11 @@ void WilsonFermion5D<Impl>::Report(void)
    std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl;  StencilEven.Report();
    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
  }
+  if ( DhopCalls > 0){
+    std::cout << GridLogMessage << "WilsonFermion5D Stencil     Reporti()"    <<std::endl;  Stencil.Reporti(DhopCalls);
+    std::cout << GridLogMessage << "WilsonFermion5D StencilEven Reporti()"<<std::endl;  StencilEven.Reporti(DhopCalls);
+    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd  Reporti()" <<std::endl;  StencilOdd.Reporti(DhopCalls);
+  }
 }

 template<class Impl>
@@ -203,6 +210,9 @@ void WilsonFermion5D<Impl>::ZeroCounters(void) {
  Stencil.ZeroCounters();
  StencilEven.ZeroCounters();
  StencilOdd.ZeroCounters();
+  Stencil.ZeroCountersi();
+  StencilEven.ZeroCountersi();
+  StencilOdd.ZeroCountersi();
 }


@@ -379,7 +389,6 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 {
 #ifdef GRID_OMP
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
-  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;

  Compressor compressor(dag);

@@ -388,46 +397,70 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg

  DhopFaceTime-=usecond();
  st.HaloExchangeOptGather(in,compressor);
-  DhopFaceTime+=usecond();
-  std::vector<std::vector<CommsRequest_t> > reqs;
-
-  // Rely on async comms; start comms before merge of local data
-  DhopCommTime-=usecond();
-  st.CommunicateBegin(reqs);
-
-  DhopFaceTime-=usecond();
-  st.CommsMergeSHM(compressor);
+  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
  DhopFaceTime+=usecond();

-  // Perhaps use omp task and region
-#pragma omp parallel 
+  double ctime=0;
+  double ptime=0;
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Ugly explicit thread mapping introduced for OPA reasons.
+  //////////////////////////////////////////////////////////////////////////////////////////////////////
+#pragma omp parallel reduction(max:ctime) reduction(max:ptime)
  { 
+    int tid = omp_get_thread_num();
    int nthreads = omp_get_num_threads();
-    int me = omp_get_thread_num();
-    int myoff, mywork;
-
-    GridThread::GetWork(len,me-1,mywork,myoff,nthreads-1);
-    int sF = LLs * myoff;
-
-    if ( me == 0 ) {
-      st.CommunicateComplete(reqs);
-      DhopCommTime+=usecond();
-    } else { 
-      // Interior links in stencil
-      if ( me==1 ) DhopComputeTime-=usecond();
-      if (dag == DaggerYes) Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0);
-      else      	    Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0);
-      if ( me==1 ) DhopComputeTime+=usecond();
+    int ncomms = CartesianCommunicator::nCommThreads;
+    if (ncomms == -1) ncomms = 1;
+    assert(nthreads > ncomms);
+    if (tid >= ncomms) {
+      double start = usecond();
+      nthreads -= ncomms;
+      int ttid = tid - ncomms;
+      int n = U._grid->oSites();
+      int chunk = n / nthreads;
+      int rem = n % nthreads;
+      int myblock, myn;
+      if (ttid < rem) {
+	myblock = ttid * chunk + ttid;
+	myn = chunk+1;
+      } else {
+	myblock = ttid*chunk + rem;
+	myn = chunk;
+      }
+      
+      // do the compute
+      if (dag == DaggerYes) {
+	for (int ss = myblock; ss < myblock+myn; ++ss) {
+	  int sU = ss;
+	  int sF = LLs * sU;
+	  Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
+	}
+      } else {
+	for (int ss = myblock; ss < myblock+myn; ++ss) {
+	  int sU = ss;
+	  int sF = LLs * sU;
+	  Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
+	}
+      }
+	ptime = usecond() - start;
+    }
+    {
+      double start = usecond();
+      st.CommunicateThreaded();
+      ctime = usecond() - start;
    }
  }
+  DhopCommTime += ctime;
+  DhopComputeTime+=ptime;
+
+  // First to enter, last to leave timing
+  st.CollateThreads();

  DhopFaceTime-=usecond();
  st.CommsMerge(compressor);
  DhopFaceTime+=usecond();

-  // Load imbalance alert. Should use dynamic schedule OMP for loop
-  // Perhaps create a list of only those sites with face work, and 
-  // load balance process the list.
  DhopComputeTime2-=usecond();
  if (dag == DaggerYes) {
    int sz=st.surface_list.size();
@@ -448,11 +481,9 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 #else 
  assert(0);
 #endif
-
 }


-
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
 					 DoubledGaugeField & U,
--- a/lib/qcd/hmc/HMCResourceManager.h
+++ b/lib/qcd/hmc/HMCResourceManager.h
@@ -165,7 +165,7 @@ class HMCResourceManager {
  // Grids
  //////////////////////////////////////////////////////////////

-  void AddGrid(std::string s, GridModule& M) {
+  void AddGrid(const std::string s, GridModule& M) {
    // Check for name clashes
    auto search = Grids.find(s);
    if (search != Grids.end()) {
@@ -174,14 +174,24 @@ class HMCResourceManager {
      exit(1);
    }
    Grids[s] = std::move(M);
+    std::cout << GridLogMessage << "::::::::::::::::::::::::::::::::::::::::" <<std::endl;
+    std::cout << GridLogMessage << "HMCResourceManager:" << std::endl;
+    std::cout << GridLogMessage << "Created grid set with name '" << s << "' and decomposition for the full cartesian " << std::endl;
+    Grids[s].show_full_decomposition();
+    std::cout << GridLogMessage << "::::::::::::::::::::::::::::::::::::::::" <<std::endl;
  }

  // Add a named grid set, 4d shortcut
-  void AddFourDimGrid(std::string s) {
+  void AddFourDimGrid(const std::string s) {
    GridFourDimModule<vComplex> Mod;
    AddGrid(s, Mod);
  }

+  // Add a named grid set, 4d shortcut + tweak simd lanes (arguments are only read, so taken by const reference)
+  void AddFourDimGrid(const std::string &s, const std::vector<int> &simd_decomposition) {
+    GridFourDimModule<vComplex> Mod(simd_decomposition);
+    AddGrid(s, Mod);
+  }


  GridCartesian* GetCartesian(std::string s = "") {
--- a/lib/qcd/hmc/HMC_GridModules.h
+++ b/lib/qcd/hmc/HMC_GridModules.h
@@ -33,28 +33,29 @@ directory
 namespace Grid {

 // Resources
-// Modules for grids 
+// Modules for grids

 // Introduce another namespace HMCModules?

-class GridModuleParameters: Serializable{   
+class GridModuleParameters: Serializable{
 public:
  GRID_SERIALIZABLE_CLASS_MEMBERS(GridModuleParameters,
  std::string, lattice,
  std::string, mpi);

-  std::vector<int> getLattice(){return strToVec<int>(lattice);}
-  std::vector<int> getMpi()    {return strToVec<int>(mpi);}
+  std::vector<int> getLattice() const {return strToVec<int>(lattice);}
+  std::vector<int> getMpi()     const {return strToVec<int>(mpi);}

-  void check(){
-    if (getLattice().size() != getMpi().size()) {
-      std::cout << GridLogError 
+
+  void check() const {
+    if (getLattice().size() != getMpi().size() ) {
+      std::cout << GridLogError
                << "Error in GridModuleParameters: lattice and mpi dimensions "
                   "do not match"
                << std::endl;
      exit(1);
    }
-  }    
+  }

  template <class ReaderClass>
  GridModuleParameters(Reader<ReaderClass>& Reader, std::string n = "LatticeGrid"):name(n) {
@@ -75,51 +76,94 @@ private:
 // Lower level class
 class GridModule {
 public:
-  GridCartesian* get_full() { 
+  GridCartesian* get_full() {
    std::cout << GridLogDebug << "Getting cartesian in module"<< std::endl;
    return grid_.get(); }
-  GridRedBlackCartesian* get_rb() { 
+  GridRedBlackCartesian* get_rb() {
    std::cout << GridLogDebug << "Getting rb-cartesian in module"<< std::endl;
    return rbgrid_.get(); }

  void set_full(GridCartesian* grid) { grid_.reset(grid); }
  void set_rb(GridRedBlackCartesian* rbgrid) { rbgrid_.reset(rbgrid); }
+  void show_full_decomposition(){ grid_->show_decomposition(); } // forwards to the GridCartesian held in grid_, which is dereferenced unchecked
+  void show_rb_decomposition(){ rbgrid_->show_decomposition(); } // same for the GridRedBlackCartesian held in rbgrid_

 protected:
  std::unique_ptr<GridCartesian> grid_;
  std::unique_ptr<GridRedBlackCartesian> rbgrid_;
-  
+
 };

 ////////////////////////////////////
 // Classes for the user
 ////////////////////////////////////
 // Note: the space time grid should be out of the QCD namespace
-template< class vector_type>
-class GridFourDimModule : public GridModule {
- public:
-  GridFourDimModule() {
+template <class vector_type>
+class GridFourDimModule : public GridModule
+{
+public:
+  GridFourDimModule()
+  {
    using namespace QCD;
    set_full(SpaceTimeGrid::makeFourDimGrid(
-        GridDefaultLatt(), GridDefaultSimd(4, vector_type::Nsimd()),
+        GridDefaultLatt(), 
+        GridDefaultSimd(4, vector_type::Nsimd()),
        GridDefaultMpi()));
    set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
  }

-  GridFourDimModule(GridModuleParameters Params) {
+  // Build the 4d grid pair with a caller-supplied SIMD lane layout: tweak_simd must
+  // have 4 entries whose product equals vector_type::Nsimd(). Either violation is
+  // reported on GridLogError and ends the program with exit(1).
+  GridFourDimModule(const std::vector<int> tweak_simd)
+  {
+    using namespace QCD;
+    if (tweak_simd.size() != 4)
+    {
+      std::cout << GridLogError
+                << "Error in GridFourDimModule: SIMD size different from 4"
+                << std::endl;
+      exit(1);
+    }
+
+    // The lanes multiply (they do not add) up to the SIMD width.
+    int simd_product = 1;
+    for (const auto &n : tweak_simd) simd_product *= n;
+    std::cout << GridLogDebug << "TweakSIMD: " << tweak_simd << "  Product: " << simd_product << std::endl;
+    if (simd_product != vector_type::Nsimd())
+    {
+      // Stop here: returning would leave grid_ and rbgrid_ null for every later user.
+      std::cout << GridLogError
+                << "Error in GridFourDimModule: SIMD lanes must multiply to "
+                << vector_type::Nsimd() << std::endl;
+      exit(1);
+    }
+
+    set_full(SpaceTimeGrid::makeFourDimGrid(
+        GridDefaultLatt(),
+        tweak_simd,
+        GridDefaultMpi()));
+    set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
+  }
+
+  GridFourDimModule(const GridModuleParameters Params)
+  {
    using namespace QCD;
-    Params.check();
    std::vector<int> lattice_v = Params.getLattice();
    std::vector<int> mpi_v = Params.getMpi();
-    if (lattice_v.size() == 4) {
+    if (lattice_v.size() == 4)
+    {
      set_full(SpaceTimeGrid::makeFourDimGrid(
-          lattice_v, GridDefaultSimd(4, vector_type::Nsimd()),
+          lattice_v, 
+          GridDefaultSimd(4, vector_type::Nsimd()),
          mpi_v));
      set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
-    } else {
-      std::cout << GridLogError 
-          << "Error in GridFourDimModule: lattice dimension different from 4"
-          << std::endl;
+    }
+    else
+    {
+      std::cout << GridLogError
+                << "Error in GridFourDimModule: lattice dimension different from 4"
+                << std::endl;
      exit(1);
    }
  }
--- a/lib/qcd/modules/ObservableModules.h
+++ b/lib/qcd/modules/ObservableModules.h
@@ -84,8 +84,6 @@ class PlaquetteMod: public ObservableModule<PlaquetteLogger<Impl>, NoParameters>
  typedef ObservableModule<PlaquetteLogger<Impl>, NoParameters> ObsBase;
  using ObsBase::ObsBase; // for constructors

-
-
  // acquire resource
  virtual void initialize(){
    this->ObservablePtr.reset(new PlaquetteLogger<Impl>());
@@ -94,23 +92,22 @@ class PlaquetteMod: public ObservableModule<PlaquetteLogger<Impl>, NoParameters>
  PlaquetteMod(): ObsBase(NoParameters()){}
 };

+
 template < class Impl >
-class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, NoParameters>{
-  typedef ObservableModule<TopologicalCharge<Impl>, NoParameters> ObsBase;
+class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters>{
+  typedef ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters> ObsBase;
  using ObsBase::ObsBase; // for constructors

-
-
  // acquire resource
  virtual void initialize(){
-    this->ObservablePtr.reset(new TopologicalCharge<Impl>());
+    this->ObservablePtr.reset(new TopologicalCharge<Impl>(this->Par_));
  }
  public:
-  TopologicalChargeMod(): ObsBase(NoParameters()){}
+  TopologicalChargeMod(TopologyObsParameters Par): ObsBase(Par){}
+  TopologicalChargeMod(): ObsBase(){}
 };


-
 }// QCD temporarily here


--- a/lib/qcd/observables/topological_charge.h
+++ b/lib/qcd/observables/topological_charge.h
@@ -33,9 +33,45 @@ directory
 namespace Grid {
 namespace QCD {

+// Smearing settings used by TopologicalCharge: steps, step_size and meas_interval are
+// passed to the WilsonFlow constructor, maxTau to WilsonFlow::smear_adaptive.
+struct TopologySmearingParameters : Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(TopologySmearingParameters,
+    int, steps,
+    float, step_size,
+    int, meas_interval,
+    float, maxTau);
+
+    TopologySmearingParameters(int s = 0, float ss = 0.0f, int mi = 0, float mT = 0.0f):
+        steps(s), step_size(ss), meas_interval(mi), maxTau(mT){}
+
+    template < class ReaderClass > // fills the members through read(Reader, "Smearing", *this)
+    TopologySmearingParameters(Reader<ReaderClass>& Reader){
+        read(Reader, "Smearing", *this);  
+    }  
+};
+
+// Settings of the topological-charge observable: it is measured when traj % interval == 0,
+// and the field is smeared first (using the Smearing parameters) when do_smearing is set.
+struct TopologyObsParameters : Serializable {
+    GRID_SERIALIZABLE_CLASS_MEMBERS(TopologyObsParameters,
+      int, interval,
+      bool, do_smearing,
+      TopologySmearingParameters, Smearing);  
+    // `smearing` sets the do_smearing flag; Smearing keeps its default-constructed values.
+    TopologyObsParameters(int interval = 1, bool smearing = false):
+        interval(interval), do_smearing(smearing){}
+
+    template <class ReaderClass >
+      TopologyObsParameters(Reader<ReaderClass>& Reader){
+        read(Reader, "TopologyMeasurement", *this);
+  }
+};
 // this is only defined for a gauge theory
 template <class Impl>
 class TopologicalCharge : public HmcObservable<typename Impl::Field> {
+    TopologyObsParameters Pars;
+
 public:
    // here forces the Impl to be of gauge fields
    // if not the compiler will complain
@@ -44,20 +80,39 @@ class TopologicalCharge : public HmcObservable<typename Impl::Field> {
    // necessary for HmcObservable compatibility
    typedef typename Impl::Field Field;

+    TopologicalCharge(int interval = 1, bool do_smearing = false): // both arguments are forwarded to the TopologyObsParameters constructor
+        Pars(interval, do_smearing){}
+    // Construct from an existing parameter set, which is copied into Pars.
+    TopologicalCharge(TopologyObsParameters P):Pars(P){
+        std::cout << GridLogDebug << "Creating TopologicalCharge " << std::endl;
+    }
+
    void TrajectoryComplete(int traj,
                            Field &U,
                            GridSerialRNG &sRNG,
                            GridParallelRNG &pRNG) {

-    Real q = WilsonLoops<Impl>::TopologicalCharge(U);
+    if (traj%Pars.interval == 0){
+        // Smearing
+        Field Usmear = U;
+        int def_prec = std::cout.precision();
+        
+        if (Pars.do_smearing){
+            // using wilson flow by default here
+            WilsonFlow<PeriodicGimplR> WF(Pars.Smearing.steps, Pars.Smearing.step_size, Pars.Smearing.meas_interval);
+            WF.smear_adaptive(Usmear, U, Pars.Smearing.maxTau);
+            Real T0   = WF.energyDensityPlaquette(Usmear);
+            std::cout << GridLogMessage << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
+                      << "T0                : [ " << traj << " ] "<< T0 << std::endl;
+        }

-    int def_prec = std::cout.precision();
+        Real q    = WilsonLoops<Impl>::TopologicalCharge(Usmear);
+        std::cout << GridLogMessage
+            << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
+            << "Topological Charge: [ " << traj << " ] "<< q << std::endl;

-    std::cout << GridLogMessage
-        << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
-        << "Topological Charge: [ " << traj << " ] "<< q << std::endl;
-
-    std::cout.precision(def_prec);
+        std::cout.precision(def_prec);
+        }
    }

 };
--- a/lib/qcd/utils/GaugeFix.h
+++ b/lib/qcd/utils/GaugeFix.h
@@ -26,12 +26,14 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    /*  END LEGAL */
 //#include <Grid/Grid.h>

-using namespace Grid;
-using namespace Grid::QCD;
+#ifndef GRID_QCD_GAUGE_FIX_H
+#define GRID_QCD_GAUGE_FIX_H
+namespace Grid {
+namespace QCD {

 template <class Gimpl> 
 class FourierAcceleratedGaugeFixer  : public Gimpl {
-  public:
+ public:
  INHERIT_GIMPL_TYPES(Gimpl);

  typedef typename Gimpl::GaugeLinkField GaugeMat;
@@ -186,3 +188,6 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
  }  
 };

+}
+}
+#endif
--- a/lib/simd/Grid_neon.h
+++ b/lib/simd/Grid_neon.h
@@ -82,11 +82,11 @@ namespace Optimization {
      double tmp[2]={a,b};
      return vld1q_f64(tmp);
    }
-    //Real double // N:tbc
+    //Real double
    inline float64x2_t operator()(double a){
      return vdupq_n_f64(a);
    }
-    //Integer // N:tbc
+    //Integer
    inline uint32x4_t operator()(Integer a){
      return vdupq_n_u32(a);
    }
@@ -124,33 +124,32 @@ namespace Optimization {
  // Nils: Vset untested; not used currently in Grid at all;
  // git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
  struct Vset{
-    // Complex float // N:ok
+    // Complex float
    inline float32x4_t operator()(Grid::ComplexF *a){
      float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()};
      return vld1q_f32(tmp);
    }
-    // Complex double // N:ok
+    // Complex double
    inline float64x2_t operator()(Grid::ComplexD *a){
      double tmp[2]={a[0].imag(),a[0].real()};
      return vld1q_f64(tmp);
    }
-    // Real float // N:ok
+    // Real float
    inline float32x4_t operator()(float *a){
      float tmp[4]={a[3],a[2],a[1],a[0]};
      return vld1q_f32(tmp);
    }
-    // Real double // N:ok
+    // Real double
    inline float64x2_t operator()(double *a){
      double tmp[2]={a[1],a[0]};
      return vld1q_f64(tmp);
    }
-    // Integer // N:ok
+    // Integer
    inline uint32x4_t operator()(Integer *a){
      return vld1q_dup_u32(a);
    }
  };

-  // N:leaving as is
  template <typename Out_type, typename In_type>
  struct Reduce{
    //Need templated class to overload output type
@@ -249,9 +248,9 @@ namespace Optimization {
      return vfmaq_f32(r4, r0, a); //  ar*br-ai*bi ai*br+ar*bi ...

      // no fma, use mul and add
-      //float32x4_t r5;
-      //r5 = vmulq_f32(r0, a);
-      //return vaddq_f32(r4, r5);
+      // float32x4_t r5;
+      // r5 = vmulq_f32(r0, a);
+      // return vaddq_f32(r4, r5);
    }
    // Complex double
    inline float64x2_t operator()(float64x2_t a, float64x2_t b){
@@ -272,9 +271,9 @@ namespace Optimization {
      return vfmaq_f64(r4, r0, a); //  ar*br-ai*bi ai*br+ar*bi

      // no fma, use mul and add
-      //float64x2_t r5;
-      //r5 = vmulq_f64(r0, a);
-      //return vaddq_f64(r4, r5);
+      // float64x2_t r5;
+      // r5 = vmulq_f64(r0, a);
+      // return vaddq_f64(r4, r5);
    }
  };

@@ -421,11 +420,6 @@ namespace Optimization {
      }
    }

-// working, but no restriction on n
-//    template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n); };
-//    template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n); };
-
-// restriction on n
    template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); };
    template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); };

@@ -441,7 +435,7 @@ namespace Optimization {
      sb = vcvt_high_f32_f16(h);
      // there is no direct conversion from lower float32x4_t to float64x2_t
      // vextq_f16 not supported by clang 3.8 / 4.0 / arm clang
-      //float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang
+      // float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang
      // workaround for clang
      uint32x4_t h1u = reinterpret_cast<uint32x4_t>(h);
      float16x8_t h1 = reinterpret_cast<float16x8_t>(vextq_u32(h1u, h1u, 2));
@@ -547,7 +541,7 @@ namespace Optimization {


  //Complex double Reduce
-  template<> // N:by Boyle
+  template<>
  inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
    u128d conv; conv.v = in;
    return Grid::ComplexD(conv.f[0],conv.f[1]);
@@ -562,9 +556,7 @@ namespace Optimization {
  //Integer Reduce
  template<>
  inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
-    // FIXME unimplemented
-    printf("Reduce : Missing integer implementation -> FIX\n");
-    assert(0);
+    return vaddvq_u32(in);
  }
 }

@@ -603,4 +595,5 @@ namespace Optimization {
  typedef Optimization::TimesMinusI TimesMinusISIMD;
  typedef Optimization::TimesI      TimesISIMD;

-}
+}
+
--- a/lib/stencil/Stencil.h
+++ b/lib/stencil/Stencil.h
@@ -176,6 +176,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
  // Timing info; ugly; possibly temporary
  /////////////////////////////////////////
  double commtime;
+  double mpi3synctime;
+  double mpi3synctime_g;
+  double shmmergetime;
  double gathertime;
  double gathermtime;
  double halogtime;
@@ -185,6 +188,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
  double splicetime;
  double nosplicetime;
  double calls;
+  std::vector<double> comm_bytes_thr;
+  std::vector<double> comm_time_thr;
+  std::vector<double> comm_enter_thr;
+  std::vector<double> comm_leave_thr;

  ////////////////////////////////////////
  // Stencil query
@@ -248,35 +255,120 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
  //////////////////////////////////////////
  // Comms packet queue for asynch thread
  //////////////////////////////////////////
+  // Per-thread halo communication; call from inside an OpenMP parallel region.
+  // Only threads whose id is below the comm-thread count take part: thread t
+  // handles packets t, t+nthreads, ... and records its bytes and enter/leave times.
+  void CommunicateThreaded()
+  {
+#ifdef GRID_OMP
+    int mythread = omp_get_thread_num();
+    int nthreads = CartesianCommunicator::nCommThreads;
+#else
+    int mythread = 0;
+    int nthreads = 1;
+#endif
+    // nCommThreads == -1 is treated as a single communicating thread.
+    if (nthreads == -1) nthreads = 1;
+    if (mythread >= nthreads) return;   // not a communicating thread
+
+    comm_enter_thr[mythread] = usecond();
+    for (int pkt = mythread; pkt < Packets.size(); pkt += nthreads) {
+      const uint64_t sent = _grid->StencilSendToRecvFrom(Packets[pkt].send_buf, Packets[pkt].to_rank,
+                                                         Packets[pkt].recv_buf, Packets[pkt].from_rank,
+                                                         Packets[pkt].bytes, pkt);
+      comm_bytes_thr[mythread] += sent;
+    }
+    comm_leave_thr[mythread] = usecond();
+    comm_time_thr[mythread] += comm_leave_thr[mythread] - comm_enter_thr[mythread];
+  }
+  
+  // Fold the per-thread records written by CommunicateThreaded() into the stencil
+  // totals and clear them: bytes go to comms_bytes, and commtime grows by
+  // (latest leave - earliest enter) over the comm threads - first in, last out.
+  // NOTE(review): nCommThreads == -1 is not mapped to 1 here, so the loop is then
+  // skipped; Report() later folds any non-zero comm_time_thr entry - confirm intended.
+  void CollateThreads(void)
+  {
+    const int ncomm = CartesianCommunicator::nCommThreads;
+    double earliest = 0.0;
+    double latest   = 0.0;
+
+    for (int thr = 0; thr < ncomm; ++thr) {
+      const double entered = comm_enter_thr[thr];
+      const double left    = comm_leave_thr[thr];
+
+      comms_bytes += comm_bytes_thr[thr];
+      comm_enter_thr[thr] = comm_leave_thr[thr] = comm_time_thr[thr] = 0.0;
+      comm_bytes_thr[thr] = 0;
+
+      // Seed with the first thread's enter time, then keep the smallest non-zero one.
+      if ( earliest == 0.0 || (entered > 0.0 && entered < earliest) ) earliest = entered;
+      if ( left > latest ) latest = left;
+    }
+    commtime += latest - earliest;
+  }
  void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
    reqs.resize(Packets.size());
    commtime-=usecond();
    for(int i=0;i<Packets.size();i++){
      comms_bytes+=_grid->StencilSendToRecvFromBegin(reqs[i],
-					  Packets[i].send_buf,
-					  Packets[i].to_rank,
-					  Packets[i].recv_buf,
-					  Packets[i].from_rank,
-					  Packets[i].bytes);
+						     Packets[i].send_buf,
+						     Packets[i].to_rank,
+						     Packets[i].recv_buf,
+						     Packets[i].from_rank,
+						     Packets[i].bytes,i);
    }
  }

  void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
    for(int i=0;i<Packets.size();i++){
-      _grid->StencilSendToRecvFromComplete(reqs[i]);
+      _grid->StencilSendToRecvFromComplete(reqs[i],i);
    }
    commtime+=usecond();
  }
+  void Communicate(void) // non-overlapped exchange: communicating thread t sends/receives packets t, t+nthreads, ...
+  {
+#ifdef GRID_OMP
+#pragma omp parallel 
+    {
+      // This routine opens its own parallel region; only threads below nthreads communicate.
+      int mythread  = omp_get_thread_num();
+      int maxthreads= omp_get_max_threads();
+      int nthreads = CartesianCommunicator::nCommThreads;
+      assert(nthreads <= maxthreads);
+      // nCommThreads == -1 is treated as a single communicating thread.
+      if (nthreads == -1) nthreads = 1;
+#else
+      int mythread = 0;
+      int nthreads = 1;
+#endif
+      if (mythread < nthreads) {
+	for (int i = mythread; i < Packets.size(); i += nthreads) {
+	  double start = usecond();
+	  comm_bytes_thr[mythread] += _grid->StencilSendToRecvFrom(Packets[i].send_buf,
+								   Packets[i].to_rank,
+								   Packets[i].recv_buf,
+								   Packets[i].from_rank,
+								   Packets[i].bytes,i);
+	  comm_time_thr[mythread] += usecond() - start; // per-thread total; the report code takes the max over threads
+	}
+      }
+#ifdef GRID_OMP
+    }
+#endif
+  }
  
  template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress) 
  {
    std::vector<std::vector<CommsRequest_t> > reqs;
    Prepare();
    HaloGather(source,compress);
-    CommunicateBegin(reqs);
-    CommunicateComplete(reqs);
+    // Concurrent
+    //CommunicateBegin(reqs);
+    //CommunicateComplete(reqs);
+    // Sequential, possibly threaded
+    Communicate();
    CommsMergeSHM(compress); 
    CommsMerge(compress); 
  }
@@ -337,7 +429,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
  template<class compressor>
  void HaloGather(const Lattice<vobj> &source,compressor &compress)
  {
+    mpi3synctime_g-=usecond();
    _grid->StencilBarrier();// Synch shared memory on a single nodes
+    mpi3synctime_g+=usecond();

    // conformable(source._grid,_grid);
    assert(source._grid==_grid);
@@ -397,8 +491,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
    CommsMerge(decompress,Mergers,Decompressions); 
  }
  template<class decompressor>  void CommsMergeSHM(decompressor decompress) {
+    mpi3synctime-=usecond();    
    _grid->StencilBarrier();// Synch shared memory on a single nodes
+    mpi3synctime+=usecond();    
+    shmmergetime-=usecond();    
    CommsMerge(decompress,MergersSHM,DecompressionsSHM);
+    shmmergetime+=usecond();    
  }

  template<class decompressor>
@@ -442,7 +540,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 		  int checkerboard,
 		  const std::vector<int> &directions,
 		  const std::vector<int> &distances) 
-   :   _permute_type(npoints), _comm_buf_size(npoints)
+   : _permute_type(npoints), 
+    _comm_buf_size(npoints),
+    comm_bytes_thr(npoints), 
+    comm_enter_thr(npoints),
+    comm_leave_thr(npoints), 
+       comm_time_thr(npoints)
  {
    face_table_computed=0;
    _npoints = npoints;
@@ -996,6 +1099,15 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
  void ZeroCounters(void) {
    gathertime = 0.;
    commtime = 0.;
+    mpi3synctime=0.;
+    mpi3synctime_g=0.;
+    shmmergetime=0.;
+    for(int i=0;i<_npoints;i++){
+      comm_time_thr[i]=0;
+      comm_bytes_thr[i]=0;
+      comm_enter_thr[i]=0;
+      comm_leave_thr[i]=0;
+    }
    halogtime = 0.;
    mergetime = 0.;
    decompresstime = 0.;
@@ -1011,6 +1123,18 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 #define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
    RealD NP = _grid->_Nprocessors;
    RealD NN = _grid->NodeCount();
+    double t = 0;
+    // Threaded comms slots ran in parallel: charge the slowest slot's time (max) but the sum
+    // of the bytes. The slots are sized by _npoints, so bound the scan by it, not a literal 8.
+    int threaded = 0 ;
+    for (int i = 0; i < _npoints; ++i) {
+      if ( comm_time_thr[i]>0.0 ) {
+	threaded = 1;
+	comms_bytes += comm_bytes_thr[i];
+	if (t < comm_time_thr[i]) t = comm_time_thr[i];
+      }
+    }
+    if (threaded) commtime += t;
    
    _grid->GlobalSum(commtime);    commtime/=NP;
    if ( calls > 0. ) {
@@ -1026,6 +1150,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 	std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl;
 	std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl;
      }
+      PRINTIT(mpi3synctime);
+      PRINTIT(mpi3synctime_g);
+      PRINTIT(shmmergetime);
      PRINTIT(splicetime);
      PRINTIT(nosplicetime);
    }
--- a/lib/tensors/Tensor_arith_mul.h
+++ b/lib/tensors/Tensor_arith_mul.h
@@ -98,7 +98,9 @@ template<class rtype,class vtype,class mtype,int N>
 strong_inline void mult(iVector<rtype,N> * __restrict__ ret,
                 const iVector<vtype,N> * __restrict__ rhs,
                 const iScalar<mtype> * __restrict__ lhs){
-    mult(ret,lhs,rhs);
+    for(int c1=0;c1<N;c1++){ // one element-level mult per vector component
+        mult(&ret->_internal[c1],&rhs->_internal[c1],&lhs->_internal); // vector element first, then the scalar's _internal
+    }                 
 }
    

--- a/lib/util/Init.cc
+++ b/lib/util/Init.cc
@@ -219,9 +219,15 @@ void Grid_init(int *argc,char ***argv)
    int MB;
    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm");
    GridCmdOptionInt(arg,MB);
-    CartesianCommunicator::MAX_MPI_SHM_BYTES = MB*1024*1024;
+    uint64_t MB64 = MB; // widen to 64 bits before scaling; a 32-bit int product MB*1024*1024 overflows once MB reaches 2048
+    CartesianCommunicator::MAX_MPI_SHM_BYTES = MB64*1024LL*1024LL;
  }

+  if( GridCmdOptionExists(*argv,*argv+*argc,"--shm-hugepages") ){
+    CartesianCommunicator::Hugepages = 1;
+  }
+
+
  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
    Grid_debug_handler_init();
  }
@@ -304,6 +310,7 @@ void Grid_init(int *argc,char ***argv)
    std::cout<<GridLogMessage<<"  --threads n     : default number of OMP threads"<<std::endl;
    std::cout<<GridLogMessage<<"  --grid n.n.n.n  : default Grid size"<<std::endl;
    std::cout<<GridLogMessage<<"  --shm  M        : allocate M megabytes of shared memory for comms"<<std::endl;
+    std::cout<<GridLogMessage<<"  --shm-hugepages : use explicit huge pages in mmap call "<<std::endl;    
    std::cout<<GridLogMessage<<std::endl;
    std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
    std::cout<<GridLogMessage<<std::endl;
@@ -317,7 +324,7 @@ void Grid_init(int *argc,char ***argv)
    std::cout<<GridLogMessage<<std::endl;
    std::cout<<GridLogMessage<<"  --comms-concurrent : Asynchronous MPI calls; several dirs at a time "<<std::endl;    
    std::cout<<GridLogMessage<<"  --comms-sequential : Synchronous MPI calls; one dirs at a time "<<std::endl;    
-    std::cout<<GridLogMessage<<"  --comms-overlap : Overlap comms with compute "<<std::endl;    
+    std::cout<<GridLogMessage<<"  --comms-overlap    : Overlap comms with compute "<<std::endl;    
    std::cout<<GridLogMessage<<std::endl;
    std::cout<<GridLogMessage<<"  --dslash-generic: Wilson kernel for generic Nc"<<std::endl;    
    std::cout<<GridLogMessage<<"  --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;    
@@ -356,10 +363,15 @@ void Grid_init(int *argc,char ***argv)
  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-sequential") ){
    CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
  }
+
  if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
    LebesgueOrder::UseLebesgueOrder=1;
  }
-
+  CartesianCommunicator::nCommThreads = -1; // stays -1 unless --comms-threads supplies a value below
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){ // NOTE(review): no --comms-threads entry among the help lines visible in this patch - confirm it is documented
+    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads");
+    GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads);
+  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
    GridCmdOptionIntVector(arg,LebesgueOrder::Block);
@@ -374,10 +386,13 @@ void Grid_init(int *argc,char ***argv)
 		  Grid_default_latt,
 		  Grid_default_mpi);

-  std::cout << GridLogDebug << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
+  std::cout << GridLogMessage << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
+  if ( CartesianCommunicator::Hugepages) {
+    std::cout << GridLogMessage << "Mapped stencil comms buffers as MAP_HUGETLB "<<std::endl;
+  }

  if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
-    std::cout<<GridLogMessage<<"Grid Decomposition\n";
+    std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
    std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
    std::cout<<GridLogMessage<<"\tMPI tasks      : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
    std::cout<<GridLogMessage<<"\tvRealF         : "<<sizeof(vRealF)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
@@ -393,7 +408,7 @@ void Grid_init(int *argc,char ***argv)

 void Grid_finalize(void)
 {
-#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
+#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
  MPI_Finalize();
  Grid_unquiesce_nodes();
 #endif