mirror of https://github.com/paboyle/Grid.git synced 2025-06-13 04:37:05 +01:00

Merge branch 'develop' into feature/hadrons

2017-08-24 17:05:45 +01:00
42 changed files with 1906 additions and 512 deletions

View File

@ -199,7 +199,12 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
Linop.HermOp(X, AD);
tmp = B - AD;
//std::cout << GridLogMessage << " initial tmp " << norm2(tmp)<< std::endl;
ThinQRfact (m_rr, m_C, m_Cinv, Q, tmp);
//std::cout << GridLogMessage << " initial Q " << norm2(Q)<< std::endl;
//std::cout << GridLogMessage << " m_rr " << m_rr<<std::endl;
//std::cout << GridLogMessage << " m_C " << m_C<<std::endl;
//std::cout << GridLogMessage << " m_Cinv " << m_Cinv<<std::endl;
D=Q;
std::cout << GridLogMessage<<"BlockCGrQ computed initial residual and QR fact " <<std::endl;
@ -221,13 +226,15 @@ void BlockCGrQsolve(LinearOperatorBase<Field> &Linop, const Field &B, Field &X)
MatrixTimer.Start();
Linop.HermOp(D, Z);
MatrixTimer.Stop();
//std::cout << GridLogMessage << " norm2 Z " <<norm2(Z)<<std::endl;
//4. M = [D^dag Z]^{-1}
sliceInnerTimer.Start();
sliceInnerProductMatrix(m_DZ,D,Z,Orthog);
sliceInnerTimer.Stop();
m_M = m_DZ.inverse();
//std::cout << GridLogMessage << " m_DZ " <<m_DZ<<std::endl;
//5. X = X + D MC
m_tmp = m_M * m_C;
sliceMaddTimer.Start();

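For orientation: ThinQRfact computes a thin QR factorisation of the block residual, tmp = Q C with Q having orthonormal columns in the slice inner product, and m_Cinv = C^{-1}. A minimal dense analogue using Eigen's Householder QR (a sketch; the name thinQR and the dense types are illustrative, the real routine acts on Lattice fields sliced along Orthog):

#include <Eigen/Dense>

// Thin QR sketch: tmp = Q * C with Q orthonormal, C upper triangular.
void thinQR(const Eigen::MatrixXcd &tmp,
            Eigen::MatrixXcd &Q,
            Eigen::MatrixXcd &C,
            Eigen::MatrixXcd &Cinv)
{
  Eigen::HouseholderQR<Eigen::MatrixXcd> qr(tmp);
  const int n = tmp.cols();
  Q    = qr.householderQ() * Eigen::MatrixXcd::Identity(tmp.rows(), n);
  C    = qr.matrixQR().topRows(n).triangularView<Eigen::Upper>();
  Cinv = C.inverse();
}

BlockCGrQ then carries m_C and m_Cinv through steps 4-5 above (m_M = [D^dag Z]^{-1}, X updated by D M C).
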
View File

@ -11,7 +11,7 @@ int PointerCache::victim;
void *PointerCache::Insert(void *ptr,size_t bytes) {
if (bytes < 4096 ) return NULL;
if (bytes < 4096 ) return ptr;
#ifdef GRID_OMP
assert(omp_in_parallel()==0);

View File

@ -98,7 +98,14 @@ public:
#else
if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes);
#endif
// First touch optimise in threaded loop
uint8_t *cp = (uint8_t *)ptr;
#ifdef GRID_OMP
#pragma omp parallel for
#endif
for(size_type n=0;n<bytes;n+=4096){
cp[n]=0;
}
return ptr;
}
@ -186,6 +193,13 @@ public:
#else
_Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
#endif
size_type bytes = __n*sizeof(_Tp);
uint8_t *cp = (uint8_t *)ptr;
// One touch per 4k page, static OMP loop to catch same loop order
#pragma omp parallel for schedule(static)
for(size_type n=0;n<bytes;n+=4096){
cp[n]=0;
}
return ptr;
}
void deallocate(pointer __p, size_type) {

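Both loops above rely on first-touch NUMA placement: on Linux a page is physically backed on the node of the thread that first writes it, so touching one byte per 4 KiB page under the same static OpenMP schedule as later compute loops co-locates data with the threads that use it. A standalone sketch of the idiom (assumes OpenMP; numa_aware_alloc is an illustrative name):

#include <cstdint>
#include <cstdlib>

// Allocate aligned memory and first-touch one byte per 4 KiB page in a
// static OpenMP loop, so pages fault in on their consumers' NUMA nodes.
void *numa_aware_alloc(std::size_t bytes)
{
  void *ptr = nullptr;
  if (posix_memalign(&ptr, 128, bytes) != 0) return nullptr;
  uint8_t *cp = static_cast<uint8_t *>(ptr);
#pragma omp parallel for schedule(static)
  for (std::size_t n = 0; n < bytes; n += 4096) cp[n] = 0;
  return ptr;
}
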
View File

@ -185,17 +185,18 @@ public:
////////////////////////////////////////////////////////////////
void show_decomposition(){
std::cout << GridLogMessage << "Full Dimensions : " << _fdimensions << std::endl;
std::cout << GridLogMessage << "Global Dimensions : " << _gdimensions << std::endl;
std::cout << GridLogMessage << "Local Dimensions : " << _ldimensions << std::endl;
std::cout << GridLogMessage << "Reduced Dimensions : " << _rdimensions << std::endl;
std::cout << GridLogMessage << "Outer strides : " << _ostride << std::endl;
std::cout << GridLogMessage << "Inner strides : " << _istride << std::endl;
std::cout << GridLogMessage << "iSites : " << _isites << std::endl;
std::cout << GridLogMessage << "oSites : " << _osites << std::endl;
std::cout << GridLogMessage << "lSites : " << lSites() << std::endl;
std::cout << GridLogMessage << "gSites : " << gSites() << std::endl;
std::cout << GridLogMessage << "Nd : " << _ndimension << std::endl;
std::cout << GridLogMessage << "\tFull Dimensions : " << _fdimensions << std::endl;
std::cout << GridLogMessage << "\tSIMD layout : " << _simd_layout << std::endl;
std::cout << GridLogMessage << "\tGlobal Dimensions : " << _gdimensions << std::endl;
std::cout << GridLogMessage << "\tLocal Dimensions : " << _ldimensions << std::endl;
std::cout << GridLogMessage << "\tReduced Dimensions : " << _rdimensions << std::endl;
std::cout << GridLogMessage << "\tOuter strides : " << _ostride << std::endl;
std::cout << GridLogMessage << "\tInner strides : " << _istride << std::endl;
std::cout << GridLogMessage << "\tiSites : " << _isites << std::endl;
std::cout << GridLogMessage << "\toSites : " << _osites << std::endl;
std::cout << GridLogMessage << "\tlSites : " << lSites() << std::endl;
std::cout << GridLogMessage << "\tgSites : " << gSites() << std::endl;
std::cout << GridLogMessage << "\tNd : " << _ndimension << std::endl;
}
////////////////////////////////////////////////////////////////

View File

@ -62,77 +62,81 @@ public:
return shift;
}
GridCartesian(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid
) : GridBase(processor_grid)
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid) : GridBase(processor_grid)
{
///////////////////////
// Grid information
///////////////////////
_ndimension = dimensions.size();
_fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension);
_ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension);
_lstart.resize(_ndimension);
_lend.resize(_ndimension);
_ostride.resize(_ndimension);
_istride.resize(_ndimension);
_fsites = _gsites = _osites = _isites = 1;
///////////////////////
// Grid information
///////////////////////
_ndimension = dimensions.size();
for(int d=0;d<_ndimension;d++){
_fdimensions[d] = dimensions[d]; // Global dimensions
_gdimensions[d] = _fdimensions[d]; // Global dimensions
_simd_layout[d] = simd_layout[d];
_fsites = _fsites * _fdimensions[d];
_gsites = _gsites * _gdimensions[d];
_fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension);
_ldimensions.resize(_ndimension);
_rdimensions.resize(_ndimension);
_simd_layout.resize(_ndimension);
_lstart.resize(_ndimension);
_lend.resize(_ndimension);
//FIXME check for exact division
_ostride.resize(_ndimension);
_istride.resize(_ndimension);
// Use a reduced simd grid
_ldimensions[d]= _gdimensions[d]/_processors[d]; //local dimensions
_rdimensions[d]= _ldimensions[d]/_simd_layout[d]; //overdecomposition
_lstart[d] = _processor_coor[d]*_ldimensions[d];
_lend[d] = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1;
_osites *= _rdimensions[d];
_isites *= _simd_layout[d];
// Addressing support
if ( d==0 ) {
_ostride[d] = 1;
_istride[d] = 1;
} else {
_ostride[d] = _ostride[d-1]*_rdimensions[d-1];
_istride[d] = _istride[d-1]*_simd_layout[d-1];
}
_fsites = _gsites = _osites = _isites = 1;
for (int d = 0; d < _ndimension; d++)
{
_fdimensions[d] = dimensions[d]; // Global dimensions
_gdimensions[d] = _fdimensions[d]; // Global dimensions
_simd_layout[d] = simd_layout[d];
_fsites = _fsites * _fdimensions[d];
_gsites = _gsites * _gdimensions[d];
// Use a reduced simd grid
_ldimensions[d] = _gdimensions[d] / _processors[d]; //local dimensions
assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
_rdimensions[d] = _ldimensions[d] / _simd_layout[d]; //overdecomposition
assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
_lstart[d] = _processor_coor[d] * _ldimensions[d];
_lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
_osites *= _rdimensions[d];
_isites *= _simd_layout[d];
// Addressing support
if (d == 0)
{
_ostride[d] = 1;
_istride[d] = 1;
}
///////////////////////
// subplane information
///////////////////////
_slice_block.resize(_ndimension);
_slice_stride.resize(_ndimension);
_slice_nblock.resize(_ndimension);
int block =1;
int nblock=1;
for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];
for(int d=0;d<_ndimension;d++){
nblock/=_rdimensions[d];
_slice_block[d] =block;
_slice_stride[d]=_ostride[d]*_rdimensions[d];
_slice_nblock[d]=nblock;
block = block*_rdimensions[d];
else
{
_ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
_istride[d] = _istride[d - 1] * _simd_layout[d - 1];
}
}
///////////////////////
// subplane information
///////////////////////
_slice_block.resize(_ndimension);
_slice_stride.resize(_ndimension);
_slice_nblock.resize(_ndimension);
int block = 1;
int nblock = 1;
for (int d = 0; d < _ndimension; d++)
nblock *= _rdimensions[d];
for (int d = 0; d < _ndimension; d++)
{
nblock /= _rdimensions[d];
_slice_block[d] = block;
_slice_stride[d] = _ostride[d] * _rdimensions[d];
_slice_nblock[d] = nblock;
block = block * _rdimensions[d];
}
};
};
}
#endif

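The stride recursion above (_ostride[d] = _ostride[d-1]*_rdimensions[d-1], and likewise for _istride) is plain lexicographic addressing. A small sketch of how a coordinate maps to a site index with such strides:

#include <vector>

// Lexicographic site index: stride[0]=1, stride[d]=stride[d-1]*dim[d-1].
int lex_index(const std::vector<int> &coor,
              const std::vector<int> &dims,
              const std::vector<int> &stride)
{
  int idx = 0;
  for (std::size_t d = 0; d < coor.size(); d++)
    idx += stride[d] * (coor[d] % dims[d]);
  return idx;
}

This matches the oIndex/iIndex pattern of the red-black grid below, up to the halving of the checkerboard dimension.
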
View File

@ -131,21 +131,21 @@ public:
Init(dimensions,simd_layout,processor_grid,checker_dim_mask,0);
}
void Init(const std::vector<int> &dimensions,
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid,
const std::vector<int> &checker_dim_mask,
int checker_dim)
const std::vector<int> &simd_layout,
const std::vector<int> &processor_grid,
const std::vector<int> &checker_dim_mask,
int checker_dim)
{
///////////////////////
// Grid information
///////////////////////
///////////////////////
// Grid information
///////////////////////
_checker_dim = checker_dim;
assert(checker_dim_mask[checker_dim]==1);
assert(checker_dim_mask[checker_dim] == 1);
_ndimension = dimensions.size();
assert(checker_dim_mask.size()==_ndimension);
assert(processor_grid.size()==_ndimension);
assert(simd_layout.size()==_ndimension);
assert(checker_dim_mask.size() == _ndimension);
assert(processor_grid.size() == _ndimension);
assert(simd_layout.size() == _ndimension);
_fdimensions.resize(_ndimension);
_gdimensions.resize(_ndimension);
_ldimensions.resize(_ndimension);
@ -153,114 +153,133 @@ public:
_simd_layout.resize(_ndimension);
_lstart.resize(_ndimension);
_lend.resize(_ndimension);
_ostride.resize(_ndimension);
_istride.resize(_ndimension);
_fsites = _gsites = _osites = _isites = 1;
_checker_dim_mask=checker_dim_mask;
for(int d=0;d<_ndimension;d++){
_fdimensions[d] = dimensions[d];
_gdimensions[d] = _fdimensions[d];
_fsites = _fsites * _fdimensions[d];
_gsites = _gsites * _gdimensions[d];
if (d==_checker_dim) {
_gdimensions[d] = _gdimensions[d]/2; // Remove a checkerboard
}
_ldimensions[d] = _gdimensions[d]/_processors[d];
_lstart[d] = _processor_coor[d]*_ldimensions[d];
_lend[d] = _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1;
_checker_dim_mask = checker_dim_mask;
// Use a reduced simd grid
_simd_layout[d] = simd_layout[d];
_rdimensions[d]= _ldimensions[d]/_simd_layout[d];
assert(_rdimensions[d]>0);
for (int d = 0; d < _ndimension; d++)
{
_fdimensions[d] = dimensions[d];
_gdimensions[d] = _fdimensions[d];
_fsites = _fsites * _fdimensions[d];
_gsites = _gsites * _gdimensions[d];
// all elements of a simd vector must have same checkerboard.
// If Ls vectorised, this must still be the case; e.g. dwf rb5d
if ( _simd_layout[d]>1 ) {
if ( checker_dim_mask[d] ) {
assert( (_rdimensions[d]&0x1) == 0 );
}
}
if (d == _checker_dim)
{
assert((_gdimensions[d] & 0x1) == 0);
_gdimensions[d] = _gdimensions[d] / 2; // Remove a checkerboard
}
_ldimensions[d] = _gdimensions[d] / _processors[d];
assert(_ldimensions[d] * _processors[d] == _gdimensions[d]);
_lstart[d] = _processor_coor[d] * _ldimensions[d];
_lend[d] = _processor_coor[d] * _ldimensions[d] + _ldimensions[d] - 1;
_osites *= _rdimensions[d];
_isites *= _simd_layout[d];
// Addressing support
if ( d==0 ) {
_ostride[d] = 1;
_istride[d] = 1;
} else {
_ostride[d] = _ostride[d-1]*_rdimensions[d-1];
_istride[d] = _istride[d-1]*_simd_layout[d-1];
}
// Use a reduced simd grid
_simd_layout[d] = simd_layout[d];
_rdimensions[d] = _ldimensions[d] / _simd_layout[d]; // integer division; exactness asserted below
assert(_rdimensions[d] * _simd_layout[d] == _ldimensions[d]);
assert(_rdimensions[d] > 0);
// all elements of a simd vector must have same checkerboard.
// If Ls vectorised, this must still be the case; e.g. dwf rb5d
if (_simd_layout[d] > 1)
{
if (checker_dim_mask[d])
{
assert((_rdimensions[d] & 0x1) == 0);
}
}
_osites *= _rdimensions[d];
_isites *= _simd_layout[d];
// Addressing support
if (d == 0)
{
_ostride[d] = 1;
_istride[d] = 1;
}
else
{
_ostride[d] = _ostride[d - 1] * _rdimensions[d - 1];
_istride[d] = _istride[d - 1] * _simd_layout[d - 1];
}
}
////////////////////////////////////////////////////////////////////////////////////////////
// subplane information
////////////////////////////////////////////////////////////////////////////////////////////
_slice_block.resize(_ndimension);
_slice_stride.resize(_ndimension);
_slice_nblock.resize(_ndimension);
int block =1;
int nblock=1;
for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];
for(int d=0;d<_ndimension;d++){
nblock/=_rdimensions[d];
_slice_block[d] =block;
_slice_stride[d]=_ostride[d]*_rdimensions[d];
_slice_nblock[d]=nblock;
block = block*_rdimensions[d];
int block = 1;
int nblock = 1;
for (int d = 0; d < _ndimension; d++)
nblock *= _rdimensions[d];
for (int d = 0; d < _ndimension; d++)
{
nblock /= _rdimensions[d];
_slice_block[d] = block;
_slice_stride[d] = _ostride[d] * _rdimensions[d];
_slice_nblock[d] = nblock;
block = block * _rdimensions[d];
}
////////////////////////////////////////////////
// Create a checkerboard lookup table
////////////////////////////////////////////////
int rvol = 1;
for(int d=0;d<_ndimension;d++){
rvol=rvol * _rdimensions[d];
for (int d = 0; d < _ndimension; d++)
{
rvol = rvol * _rdimensions[d];
}
_checker_board.resize(rvol);
for(int osite=0;osite<_osites;osite++){
_checker_board[osite] = CheckerBoardFromOindex (osite);
for (int osite = 0; osite < _osites; osite++)
{
_checker_board[osite] = CheckerBoardFromOindex(osite);
}
};
protected:
protected:
virtual int oIndex(std::vector<int> &coor)
{
int idx=0;
for(int d=0;d<_ndimension;d++) {
if( d==_checker_dim ) {
idx+=_ostride[d]*((coor[d]/2)%_rdimensions[d]);
} else {
idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
}
int idx = 0;
for (int d = 0; d < _ndimension; d++)
{
if (d == _checker_dim)
{
idx += _ostride[d] * ((coor[d] / 2) % _rdimensions[d]);
}
else
{
idx += _ostride[d] * (coor[d] % _rdimensions[d]);
}
}
return idx;
};
virtual int iIndex(std::vector<int> &lcoor)
{
int idx=0;
for(int d=0;d<_ndimension;d++) {
if( d==_checker_dim ) {
idx+=_istride[d]*(lcoor[d]/(2*_rdimensions[d]));
} else {
idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
}
}
return idx;
int idx = 0;
for (int d = 0; d < _ndimension; d++)
{
if (d == _checker_dim)
{
idx += _istride[d] * (lcoor[d] / (2 * _rdimensions[d]));
}
else
{
idx += _istride[d] * (lcoor[d] / _rdimensions[d]);
}
}
return idx;
}
};
}
#endif

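The _checker_board table caches, per outer site, the parity that CheckerBoardFromOindex derives from the site coordinate. The underlying rule is coordinate-sum parity over the masked dimensions; a hedged sketch of that rule (the real function works from the outer index and reduced dimensions):

#include <vector>

// Checkerboard parity: 0 = even (red), 1 = odd (black), counting only
// dimensions included in the checkerboard mask.
int checkerboard(const std::vector<int> &coor, const std::vector<int> &mask)
{
  int cb = 0;
  for (std::size_t d = 0; d < coor.size(); d++)
    if (mask[d]) cb += coor[d];
  return cb & 0x1;
}
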
View File

@ -37,7 +37,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/mman.h>
//#include <zlib.h>
#include <zlib.h>
#ifdef HAVE_NUMAIF_H
#include <numaif.h>
#endif
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
@ -214,6 +217,25 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
void * ptr = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if ( ptr == MAP_FAILED ) { perror("failed mmap"); assert(0); }
assert(((uint64_t)ptr&0x3F)==0);
// Try to force numa domain on the shm segment if we have numaif.h
#ifdef HAVE_NUMAIF_H
int status;
int flags=MPOL_MF_MOVE;
#ifdef KNL
int nodes=1; // numa domain == MCDRAM
// Find out if in SNC2,SNC4 mode ?
#else
int nodes=r; // numa domain == MPI ID
#endif
unsigned long count=1;
for(uint64_t page=0;page<size;page+=4096){
void *pages = (void *) ( page + (uint64_t)ptr );
uint64_t *cow_it = (uint64_t *)pages; *cow_it = 1;
ierr= move_pages(0,count, &pages,&nodes,&status,flags);
if (ierr && (page==0)) perror("numa relocate command failed");
}
#endif
ShmCommBufs[r] =ptr;
}

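move_pages(2) can only migrate pages that are already faulted in, which is why the loop writes each page (*cow_it = 1) before asking the kernel to move it. A standalone sketch of the same idiom (Linux-only; requires the numaif.h headers, and the target node is an assumption):

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <numaif.h> // move_pages, MPOL_MF_MOVE

// Touch each 4 KiB page, then migrate it to the requested NUMA node.
void bind_pages_to_node(void *ptr, std::size_t size, int node)
{
  for (std::uint64_t page = 0; page < size; page += 4096) {
    void *pages = (void *)(page + (std::uint64_t)ptr);
    *(volatile std::uint64_t *)pages = 1; // fault the page in
    int status;
    long ierr = move_pages(0, 1, &pages, &node, &status, MPOL_MF_MOVE);
    if (ierr && page == 0) perror("move_pages failed");
  }
}
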
View File

@ -369,6 +369,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
}
};
/*
inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
{
int NN = BlockSolverGrid->_ndimension;
@ -387,6 +388,7 @@ inline GridBase *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Or
}
return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);
}
*/
template<class vobj>
static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
@ -398,14 +400,15 @@ static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice
int Nblock = X._grid->GlobalDimensions()[Orthog];
GridBase *FullGrid = X._grid;
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
Lattice<vobj> Xslice(SliceGrid);
Lattice<vobj> Rslice(SliceGrid);
// Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension;
int nl = SliceGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
@ -448,14 +451,14 @@ static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<
int Nblock = X._grid->GlobalDimensions()[Orthog];
GridBase *FullGrid = X._grid;
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
Lattice<vobj> Xslice(SliceGrid);
Lattice<vobj> Rslice(SliceGrid);
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// Lattice<vobj> Xslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension;
int nl = SliceGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
int nl=1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
@ -498,18 +501,19 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
typedef typename vobj::vector_type vector_type;
GridBase *FullGrid = lhs._grid;
GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
// GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
int Nblock = FullGrid->GlobalDimensions()[Orthog];
Lattice<vobj> Lslice(SliceGrid);
Lattice<vobj> Rslice(SliceGrid);
// Lattice<vobj> Lslice(SliceGrid);
// Lattice<vobj> Rslice(SliceGrid);
mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
assert( FullGrid->_simd_layout[Orthog]==1);
int nh = FullGrid->_ndimension;
int nl = SliceGrid->_ndimension;
// int nl = SliceGrid->_ndimension;
int nl = nh-1;
//FIXME package in a convenient iterator
//Should loop over a plane orthogonal to direction "Orthog"
@ -540,7 +544,8 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
auto tmp = innerProduct(Left[i],Right[j]);
vector_typeD rtmp = TensorRemove(tmp);
// vector_typeD rtmp = TensorRemove(tmp);
auto rtmp = TensorRemove(tmp);
mat_thread(i,j) += Reduce(rtmp);
}}
}}
@ -549,6 +554,14 @@ static void sliceInnerProductMatrix( Eigen::MatrixXcd &mat, const Lattice<vobj>
mat += mat_thread;
}
}
for(int i=0;i<Nblock;i++){
for(int j=0;j<Nblock;j++){
ComplexD sum = mat(i,j);
FullGrid->GlobalSum(sum);
mat(i,j)=sum;
}}
return;
}

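The added loop completes the reduction across ranks: each node accumulated only its local slice sites, so every matrix element must be summed globally. Element-wise GlobalSum over the Nblock x Nblock matrix is equivalent to a single allreduce over its storage; a raw-MPI sketch (assumes std::complex<double> entries, as Eigen::MatrixXcd provides):

#include <mpi.h>
#include <Eigen/Dense>

// Sum a node-local complex matrix over all ranks, in place.
void global_sum(Eigen::MatrixXcd &mat, MPI_Comm comm)
{
  MPI_Allreduce(MPI_IN_PLACE, mat.data(), (int)mat.size(),
                MPI_C_DOUBLE_COMPLEX, MPI_SUM, comm);
}
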
View File

@ -98,35 +98,39 @@ class BinaryIO {
NerscChecksum(grid,scalardata,nersc_csum);
}
template<class fobj> static inline void NerscChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &nersc_csum)
template <class fobj>
static inline void NerscChecksum(GridBase *grid, std::vector<fobj> &fbuf, uint32_t &nersc_csum)
{
const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
const uint64_t size32 = sizeof(fobj) / sizeof(uint32_t);
uint64_t lsites =grid->lSites();
if (fbuf.size()==1) {
lsites=1;
uint64_t lsites = grid->lSites();
if (fbuf.size() == 1)
{
lsites = 1;
}
#pragma omp parallel
{
uint32_t nersc_csum_thr=0;
#pragma omp parallel
{
uint32_t nersc_csum_thr = 0;
#pragma omp for
for(uint64_t local_site=0;local_site<lsites;local_site++){
uint32_t * site_buf = (uint32_t *)&fbuf[local_site];
for(uint64_t j=0;j<size32;j++){
nersc_csum_thr=nersc_csum_thr+site_buf[j];
}
#pragma omp for
for (uint64_t local_site = 0; local_site < lsites; local_site++)
{
uint32_t *site_buf = (uint32_t *)&fbuf[local_site];
for (uint64_t j = 0; j < size32; j++)
{
nersc_csum_thr = nersc_csum_thr + site_buf[j];
}
}
#pragma omp critical
#pragma omp critical
{
nersc_csum += nersc_csum_thr;
nersc_csum += nersc_csum_thr;
}
}
}
template<class fobj> static inline void ScidacChecksum(GridBase *grid,std::vector<fobj> &fbuf,uint32_t &scidac_csuma,uint32_t &scidac_csumb)
{
const uint64_t size32 = sizeof(fobj)/sizeof(uint32_t);
@ -266,7 +270,7 @@ class BinaryIO {
grid->Barrier();
GridStopWatch timer;
GridStopWatch bstimer;
nersc_csum=0;
scidac_csuma=0;
scidac_csumb=0;
@ -362,18 +366,22 @@ class BinaryIO {
#else
assert(0);
#endif
} else {
std::cout<< GridLogMessage<< "C++ read I/O "<< file<<" : "
<< iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
std::ifstream fin;
fin.open(file,std::ios::binary|std::ios::in);
if ( control & BINARYIO_MASTER_APPEND ) {
fin.seekg(-sizeof(fobj),fin.end);
} else {
fin.seekg(offset+myrank*lsites*sizeof(fobj));
}
fin.read((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fin.fail()==0);
fin.close();
} else {
std::cout << GridLogMessage << "C++ read I/O " << file << " : "
<< iodata.size() * sizeof(fobj) << " bytes" << std::endl;
std::ifstream fin;
fin.open(file, std::ios::binary | std::ios::in);
if (control & BINARYIO_MASTER_APPEND)
{
fin.seekg(-sizeof(fobj), fin.end);
}
else
{
fin.seekg(offset + myrank * lsites * sizeof(fobj));
}
fin.read((char *)&iodata[0], iodata.size() * sizeof(fobj));
assert(fin.fail() == 0);
fin.close();
}
timer.Stop();
@ -405,30 +413,78 @@ class BinaryIO {
timer.Start();
if ( (control & BINARYIO_LEXICOGRAPHIC) && (nrank > 1) ) {
#ifdef USE_MPI_IO
std::cout<< GridLogMessage<< "MPI write I/O "<< file<< std::endl;
ierr=MPI_File_open(grid->communicator,(char *) file.c_str(), MPI_MODE_RDWR|MPI_MODE_CREATE,MPI_INFO_NULL, &fh); assert(ierr==0);
ierr=MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL); assert(ierr==0);
ierr=MPI_File_write_all(fh, &iodata[0], 1, localArray, &status); assert(ierr==0);
MPI_File_close(&fh);
MPI_Type_free(&fileArray);
MPI_Type_free(&localArray);
std::cout << GridLogMessage << "MPI write I/O " << file << std::endl;
ierr = MPI_File_open(grid->communicator, (char *)file.c_str(), MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &fh);
std::cout << GridLogMessage << "Checking for errors" << std::endl;
if (ierr != MPI_SUCCESS)
{
char error_string[BUFSIZ];
int length_of_error_string, error_class;
MPI_Error_class(ierr, &error_class);
MPI_Error_string(error_class, error_string, &length_of_error_string);
fprintf(stderr, "%3d: %s\n", myrank, error_string);
MPI_Error_string(ierr, error_string, &length_of_error_string);
fprintf(stderr, "%3d: %s\n", myrank, error_string);
MPI_Abort(MPI_COMM_WORLD, 1); //assert(ierr == 0);
}
std::cout << GridLogDebug << "MPI write I/O set view " << file << std::endl;
ierr = MPI_File_set_view(fh, disp, mpiObject, fileArray, "native", MPI_INFO_NULL);
assert(ierr == 0);
std::cout << GridLogDebug << "MPI write I/O write all " << file << std::endl;
ierr = MPI_File_write_all(fh, &iodata[0], 1, localArray, &status);
assert(ierr == 0);
MPI_File_close(&fh);
MPI_Type_free(&fileArray);
MPI_Type_free(&localArray);
#else
assert(0);
#endif
} else {
std::ofstream fout; fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
std::cout<< GridLogMessage<< "C++ write I/O "<< file<<" : "
<< iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
if ( control & BINARYIO_MASTER_APPEND ) {
std::ofstream fout;
fout.exceptions ( std::fstream::failbit | std::fstream::badbit );
try {
fout.open(file,std::ios::binary|std::ios::out|std::ios::in);
} catch (const std::fstream::failure& exc) {
std::cout << GridLogError << "Error in opening the file " << file << " for output" <<std::endl;
std::cout << GridLogError << "Exception description: " << exc.what() << std::endl;
std::cout << GridLogError << "Probable cause: wrong path, inaccessible location "<< std::endl;
#ifdef USE_MPI_IO
MPI_Abort(MPI_COMM_WORLD,1);
#else
exit(1);
#endif
}
std::cout << GridLogMessage<< "C++ write I/O "<< file<<" : "
<< iodata.size()*sizeof(fobj)<<" bytes"<<std::endl;
if ( control & BINARYIO_MASTER_APPEND ) {
fout.seekp(0,fout.end);
} else {
fout.seekp(offset+myrank*lsites*sizeof(fobj));
}
fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));assert( fout.fail()==0);
try {
fout.write((char *)&iodata[0],iodata.size()*sizeof(fobj));//assert( fout.fail()==0);
}
catch (const std::fstream::failure& exc) {
std::cout << "Exception in writing file " << file << std::endl;
std::cout << GridLogError << "Exception description: "<< exc.what() << std::endl;
#ifdef USE_MPI_IO
MPI_Abort(MPI_COMM_WORLD,1);
#else
exit(1);
#endif
}
fout.close();
}
timer.Stop();
}
}
timer.Stop();
}
std::cout<<GridLogMessage<<"IOobject: ";
if ( control & BINARYIO_READ) std::cout << " read ";
@ -442,11 +498,14 @@ class BinaryIO {
//////////////////////////////////////////////////////////////////////////////
// Safety check
//////////////////////////////////////////////////////////////////////////////
grid->Barrier();
grid->GlobalSum(nersc_csum);
grid->GlobalXOR(scidac_csuma);
grid->GlobalXOR(scidac_csumb);
grid->Barrier();
// if the data size is 1 we do not want to sum over the MPI ranks
if (iodata.size() != 1){
grid->Barrier();
grid->GlobalSum(nersc_csum);
grid->GlobalXOR(scidac_csuma);
grid->GlobalXOR(scidac_csumb);
grid->Barrier();
}
}
/////////////////////////////////////////////////////////////////////////////
@ -546,9 +605,9 @@ class BinaryIO {
int gsites = grid->gSites();
int lsites = grid->lSites();
uint32_t nersc_csum_tmp;
uint32_t scidac_csuma_tmp;
uint32_t scidac_csumb_tmp;
uint32_t nersc_csum_tmp = 0;
uint32_t scidac_csuma_tmp = 0;
uint32_t scidac_csumb_tmp = 0;
GridStopWatch timer;

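The NerscChecksum change above is the classic per-thread partial reduction: each thread sums into a private accumulator and the partials are merged under a critical section. A minimal standalone version of the pattern (assumes OpenMP):

#include <cstdint>
#include <vector>

// Parallel additive checksum with thread-private partials.
uint32_t checksum32(const std::vector<uint32_t> &words)
{
  uint32_t csum = 0;
#pragma omp parallel
  {
    uint32_t csum_thr = 0;
#pragma omp for
    for (int64_t i = 0; i < (int64_t)words.size(); i++)
      csum_thr += words[i];
#pragma omp critical
    csum += csum_thr;
  }
  return csum;
}

With modern OpenMP this collapses to a single parallel for with reduction(+:csum); the explicit form generalises to the scidac checksums, which merge partials with XOR (cf. GlobalXOR below).
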
View File

@ -40,7 +40,7 @@ const PerformanceCounter::PerformanceCounterConfig PerformanceCounter::Performan
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES , "CPUCYCLES.........." , INSTRUCTIONS},
{ PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS , "INSTRUCTIONS......." , CPUCYCLES },
// 4
#ifdef AVX512
#ifdef KNL
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", CPUCYCLES },
{ PERF_TYPE_RAW, RawConfig(0x01,0x04), "L1_MISS_LOADS......", L1D_READ_ACCESS },
{ PERF_TYPE_RAW, RawConfig(0x40,0x04), "ALL_LOADS..........", L1D_READ_ACCESS },

View File

@ -230,8 +230,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
{
Compressor compressor;
int LLs = in._grid->_rdimensions[0];
DhopTotalTime -= usecond();
DhopCommTime -= usecond();
st.HaloExchange(in,compressor);
DhopCommTime += usecond();
DhopComputeTime -= usecond();
// Dhop takes the 4d grid from U, and makes a 5d index for fermion
if (dag == DaggerYes) {
parallel_for (int ss = 0; ss < U._grid->oSites(); ss++) {
@ -244,12 +251,15 @@ void ImprovedStaggeredFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOr
Kernels::DhopSite(st,lo,U,UUU,st.CommBuf(),LLs,sU,in,out);
}
}
DhopComputeTime += usecond();
DhopTotalTime += usecond();
}
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag)
{
DhopCalls+=1;
conformable(in._grid,FermionRedBlackGrid()); // verifies half grid
conformable(in._grid,out._grid); // drops the cb check
@ -261,6 +271,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopOE(const FermionField &in, FermionFie
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag)
{
DhopCalls+=1;
conformable(in._grid,FermionRedBlackGrid()); // verifies half grid
conformable(in._grid,out._grid); // drops the cb check
@ -272,6 +283,7 @@ void ImprovedStaggeredFermion5D<Impl>::DhopEO(const FermionField &in, FermionFie
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag)
{
DhopCalls+=2;
conformable(in._grid,FermionGrid()); // verifies full grid
conformable(in._grid,out._grid);
@ -280,6 +292,54 @@ void ImprovedStaggeredFermion5D<Impl>::Dhop(const FermionField &in, FermionField
DhopInternal(Stencil,Lebesgue,Umu,UUUmu,in,out,dag);
}
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::Report(void)
{
std::vector<int> latt = GridDefaultLatt();
RealD volume = Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
RealD NP = _FourDimGrid->_Nprocessors;
RealD NN = _FourDimGrid->NodeCount();
std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Number of DhopEO Calls : "
<< DhopCalls << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D TotalTime /Calls : "
<< DhopTotalTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D CommTime /Calls : "
<< DhopCommTime / DhopCalls << " us" << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D ComputeTime/Calls : "
<< DhopComputeTime / DhopCalls << " us" << std::endl;
// Average the compute time
_FourDimGrid->GlobalSum(DhopComputeTime);
DhopComputeTime/=NP;
RealD mflops = 1154*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call : " << mflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank : " << mflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node : " << mflops/NN << std::endl;
RealD Fullmflops = 1154*volume*DhopCalls/(DhopTotalTime)/2; // 2 for red black counting
std::cout << GridLogMessage << "Average mflops/s per call (full) : " << Fullmflops << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per rank (full): " << Fullmflops/NP << std::endl;
std::cout << GridLogMessage << "Average mflops/s per call per node (full): " << Fullmflops/NN << std::endl;
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D Stencil" <<std::endl; Stencil.Report();
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilEven"<<std::endl; StencilEven.Report();
std::cout << GridLogMessage << "ImprovedStaggeredFermion5D StencilOdd" <<std::endl; StencilOdd.Report();
}
template<class Impl>
void ImprovedStaggeredFermion5D<Impl>::ZeroCounters(void)
{
DhopCalls = 0;
DhopTotalTime = 0;
DhopCommTime = 0;
DhopComputeTime = 0;
Stencil.ZeroCounters();
StencilEven.ZeroCounters();
StencilOdd.ZeroCounters();
}
/////////////////////////////////////////////////////////////////////////
// Implement the general interface. Here we use SAME mass on all slices

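The DhopTotalTime/DhopCommTime/DhopComputeTime counters use the -=/+= usecond() idiom: subtracting the entry timestamp and adding the exit timestamp leaves the elapsed interval accumulated in place, with no per-call start variable. A sketch with a stand-in timer (usecond here approximates Grid's microsecond clock):

#include <chrono>

// Stand-in for Grid's usecond(): monotonic wall clock in microseconds.
static double usecond(void)
{
  using namespace std::chrono;
  return (double)duration_cast<microseconds>(
      steady_clock::now().time_since_epoch()).count();
}

double DhopCommTime = 0.0;

void halo_exchange_timed(void)
{
  DhopCommTime -= usecond(); // subtract entry time
  // ... communication ...
  DhopCommTime += usecond(); // add exit time: net effect += elapsed
}
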
View File

@ -55,6 +55,16 @@ namespace QCD {
FermionField _tmp;
FermionField &tmp(void) { return _tmp; }
////////////////////////////////////////
// Performance monitoring
////////////////////////////////////////
void Report(void);
void ZeroCounters(void);
double DhopTotalTime;
double DhopCalls;
double DhopCommTime;
double DhopComputeTime;
///////////////////////////////////////////////////////////////
// Implement the abstract base
///////////////////////////////////////////////////////////////

View File

@ -93,6 +93,8 @@ class ScalarImplTypes {
class ScalarAdjMatrixImplTypes {
public:
typedef S Simd;
typedef QCD::SU<N> Group;
template <typename vtype>
using iImplField = iScalar<iScalar<iMatrix<vtype, N>>>;
template <typename vtype>
@ -108,7 +110,7 @@ class ScalarImplTypes {
typedef Field PropagatorField;
static inline void generate_momenta(Field& P, GridParallelRNG& pRNG) {
QCD::SU<N>::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, P);
}
static inline Field projectForce(Field& P) {return P;}
@ -122,11 +124,11 @@ class ScalarImplTypes {
}
static inline void HotConfiguration(GridParallelRNG &pRNG, Field &U) {
QCD::SU<N>::LieRandomize(pRNG, U);
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U);
}
static inline void TepidConfiguration(GridParallelRNG &pRNG, Field &U) {
QCD::SU<N>::LieRandomize(pRNG, U, 0.01);
Group::GaussianFundamentalLieAlgebraMatrix(pRNG, U, 0.01);
}
static inline void ColdConfiguration(GridParallelRNG &pRNG, Field &U) {

View File

@ -81,7 +81,7 @@ namespace Grid {
phiStencil.HaloExchange(p, compressor);
Field action(p._grid), pshift(p._grid), phisquared(p._grid);
phisquared = p*p;
action = (2.0*Ndim + mass_square)*phisquared + lambda*phisquared*phisquared;
action = (2.0*Ndim + mass_square)*phisquared - lambda/24.*phisquared*phisquared;
for (int mu = 0; mu < Ndim; mu++) {
// pshift = Cshift(p, mu, +1); // not efficient, implement with stencils
parallel_for (int i = 0; i < p._grid->oSites(); i++) {
@ -98,7 +98,7 @@ namespace Grid {
permute(temp2, *temp, permute_type);
action._odata[i] -= temp2*(*t_p) + (*t_p)*temp2;
} else {
action._odata[i] -= *temp*(*t_p) + (*t_p)*(*temp);
action._odata[i] -= (*temp)*(*t_p) + (*t_p)*(*temp);
}
} else {
action._odata[i] -= phiStencil.CommBuf()[SE->_offset]*(*t_p) + (*t_p)*phiStencil.CommBuf()[SE->_offset];
@ -113,7 +113,7 @@ namespace Grid {
virtual void deriv(const Field &p, Field &force) {
assert(p._grid->Nd() == Ndim);
force = (2.0*Ndim + mass_square)*p + 2.0*lambda*p*p*p;
force = (2.0*Ndim + mass_square)*p - lambda/12.*p*p*p;
// move this outside
static Stencil phiStencil(p._grid, npoint, 0, directions, displacements);
phiStencil.HaloExchange(p, compressor);

View File

@ -76,7 +76,7 @@ struct HMCparameters: Serializable {
template < class ReaderClass >
void initialize(Reader<ReaderClass> &TheReader){
std::cout << "Reading HMC\n";
std::cout << GridLogMessage << "Reading HMC\n";
read(TheReader, "HMC", *this);
}

View File

@ -165,7 +165,7 @@ class HMCResourceManager {
// Grids
//////////////////////////////////////////////////////////////
void AddGrid(std::string s, GridModule& M) {
void AddGrid(const std::string s, GridModule& M) {
// Check for name clashes
auto search = Grids.find(s);
if (search != Grids.end()) {
@ -174,14 +174,24 @@ class HMCResourceManager {
exit(1);
}
Grids[s] = std::move(M);
std::cout << GridLogMessage << "::::::::::::::::::::::::::::::::::::::::" <<std::endl;
std::cout << GridLogMessage << "HMCResourceManager:" << std::endl;
std::cout << GridLogMessage << "Created grid set with name '" << s << "' and decomposition for the full cartesian " << std::endl;
Grids[s].show_full_decomposition();
std::cout << GridLogMessage << "::::::::::::::::::::::::::::::::::::::::" <<std::endl;
}
// Add a named grid set, 4d shortcut
void AddFourDimGrid(std::string s) {
void AddFourDimGrid(const std::string s) {
GridFourDimModule<vComplex> Mod;
AddGrid(s, Mod);
}
// Add a named grid set, 4d shortcut + tweak simd lanes
void AddFourDimGrid(const std::string s, const std::vector<int> simd_decomposition) {
GridFourDimModule<vComplex> Mod(simd_decomposition);
AddGrid(s, Mod);
}
GridCartesian* GetCartesian(std::string s = "") {
@ -253,6 +263,7 @@ class HMCResourceManager {
template<class T, class... Types>
void AddObservable(Types&&... Args){
ObservablesList.push_back(std::unique_ptr<T>(new T(std::forward<Types>(Args)...)));
ObservablesList.back()->print_parameters();
}
std::vector<HmcObservable<typename ImplementationPolicy::Field>* > GetObservables(){
@ -297,4 +308,4 @@ private:
}
}
#endif // HMC_RESOURCE_MANAGER_H
#endif // HMC_RESOURCE_MANAGER_H

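AddGrid is a guarded registry insert: look the name up first, fail loudly on a clash, then move the module into the map so the manager owns it. The same shape in a generic sketch:

#include <cstdlib>
#include <iostream>
#include <map>
#include <string>
#include <utility>

// Refuse duplicate names; take ownership of the module by move.
template <class Module>
void add_named(std::map<std::string, Module> &registry,
               const std::string &name, Module &&mod)
{
  if (registry.find(name) != registry.end()) {
    std::cout << "Error: resource '" << name << "' already exists\n";
    std::exit(1);
  }
  registry[name] = std::move(mod);
}
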
View File

@ -33,28 +33,29 @@ directory
namespace Grid {
// Resources
// Modules for grids
// Modules for grids
// Introduce another namespace HMCModules?
class GridModuleParameters: Serializable{
class GridModuleParameters: Serializable{
public:
GRID_SERIALIZABLE_CLASS_MEMBERS(GridModuleParameters,
std::string, lattice,
std::string, mpi);
std::vector<int> getLattice(){return strToVec<int>(lattice);}
std::vector<int> getMpi() {return strToVec<int>(mpi);}
std::vector<int> getLattice() const {return strToVec<int>(lattice);}
std::vector<int> getMpi() const {return strToVec<int>(mpi);}
void check(){
if (getLattice().size() != getMpi().size()) {
std::cout << GridLogError
void check() const {
if (getLattice().size() != getMpi().size() ) {
std::cout << GridLogError
<< "Error in GridModuleParameters: lattice and mpi dimensions "
"do not match"
<< std::endl;
exit(1);
}
}
}
template <class ReaderClass>
GridModuleParameters(Reader<ReaderClass>& Reader, std::string n = "LatticeGrid"):name(n) {
@ -75,51 +76,94 @@ private:
// Lower level class
class GridModule {
public:
GridCartesian* get_full() {
GridCartesian* get_full() {
std::cout << GridLogDebug << "Getting cartesian in module"<< std::endl;
return grid_.get(); }
GridRedBlackCartesian* get_rb() {
GridRedBlackCartesian* get_rb() {
std::cout << GridLogDebug << "Getting rb-cartesian in module"<< std::endl;
return rbgrid_.get(); }
void set_full(GridCartesian* grid) { grid_.reset(grid); }
void set_rb(GridRedBlackCartesian* rbgrid) { rbgrid_.reset(rbgrid); }
void show_full_decomposition(){ grid_->show_decomposition(); }
void show_rb_decomposition(){ rbgrid_->show_decomposition(); }
protected:
std::unique_ptr<GridCartesian> grid_;
std::unique_ptr<GridRedBlackCartesian> rbgrid_;
};
////////////////////////////////////
// Classes for the user
////////////////////////////////////
// Note: the space time grid should be out of the QCD namespace
template< class vector_type>
class GridFourDimModule : public GridModule {
public:
GridFourDimModule() {
template <class vector_type>
class GridFourDimModule : public GridModule
{
public:
GridFourDimModule()
{
using namespace QCD;
set_full(SpaceTimeGrid::makeFourDimGrid(
GridDefaultLatt(), GridDefaultSimd(4, vector_type::Nsimd()),
GridDefaultLatt(),
GridDefaultSimd(4, vector_type::Nsimd()),
GridDefaultMpi()));
set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
}
GridFourDimModule(GridModuleParameters Params) {
GridFourDimModule(const std::vector<int> tweak_simd)
{
using namespace QCD;
if (tweak_simd.size() != 4)
{
std::cout << GridLogError
<< "Error in GridFourDimModule: SIMD size different from 4"
<< std::endl;
exit(1);
}
// Checks that the product agrees with the expectation
int simd_product = 1;
for (auto &n : tweak_simd)
simd_product *= n;
std::cout << GridLogDebug << "TweakSIMD: " << tweak_simd << " Product: " << simd_product << std::endl;
if (simd_product == vector_type::Nsimd())
{
set_full(SpaceTimeGrid::makeFourDimGrid(
GridDefaultLatt(),
tweak_simd,
GridDefaultMpi()));
set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
}
else
{
std::cout << GridLogError
<< "Error in GridFourDimModule: SIMD lanes must sum to "
<< vector_type::Nsimd()
<< std::endl;
}
}
GridFourDimModule(const GridModuleParameters Params)
{
using namespace QCD;
Params.check();
std::vector<int> lattice_v = Params.getLattice();
std::vector<int> mpi_v = Params.getMpi();
if (lattice_v.size() == 4) {
if (lattice_v.size() == 4)
{
set_full(SpaceTimeGrid::makeFourDimGrid(
lattice_v, GridDefaultSimd(4, vector_type::Nsimd()),
lattice_v,
GridDefaultSimd(4, vector_type::Nsimd()),
mpi_v));
set_rb(SpaceTimeGrid::makeFourDimRedBlackGrid(grid_.get()));
} else {
std::cout << GridLogError
<< "Error in GridFourDimModule: lattice dimension different from 4"
<< std::endl;
}
else
{
std::cout << GridLogError
<< "Error in GridFourDimModule: lattice dimension different from 4"
<< std::endl;
exit(1);
}
}

View File

@ -84,8 +84,6 @@ class PlaquetteMod: public ObservableModule<PlaquetteLogger<Impl>, NoParameters>
typedef ObservableModule<PlaquetteLogger<Impl>, NoParameters> ObsBase;
using ObsBase::ObsBase; // for constructors
// acquire resource
virtual void initialize(){
this->ObservablePtr.reset(new PlaquetteLogger<Impl>());
@ -94,23 +92,22 @@ class PlaquetteMod: public ObservableModule<PlaquetteLogger<Impl>, NoParameters>
PlaquetteMod(): ObsBase(NoParameters()){}
};
template < class Impl >
class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, NoParameters>{
typedef ObservableModule<TopologicalCharge<Impl>, NoParameters> ObsBase;
class TopologicalChargeMod: public ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters>{
typedef ObservableModule<TopologicalCharge<Impl>, TopologyObsParameters> ObsBase;
using ObsBase::ObsBase; // for constructors
// acquire resource
virtual void initialize(){
this->ObservablePtr.reset(new TopologicalCharge<Impl>());
this->ObservablePtr.reset(new TopologicalCharge<Impl>(this->Par_));
}
public:
TopologicalChargeMod(): ObsBase(NoParameters()){}
TopologicalChargeMod(TopologyObsParameters Par): ObsBase(Par){}
TopologicalChargeMod(): ObsBase(){}
};
}// QCD temporarily here

View File

@ -33,9 +33,45 @@ directory
namespace Grid {
namespace QCD {
struct TopologySmearingParameters : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(TopologySmearingParameters,
int, steps,
float, step_size,
int, meas_interval,
float, maxTau);
TopologySmearingParameters(int s = 0, float ss = 0.0f, int mi = 0, float mT = 0.0f):
steps(s), step_size(ss), meas_interval(mi), maxTau(mT){}
template < class ReaderClass >
TopologySmearingParameters(Reader<ReaderClass>& Reader){
read(Reader, "Smearing", *this);
}
};
struct TopologyObsParameters : Serializable {
GRID_SERIALIZABLE_CLASS_MEMBERS(TopologyObsParameters,
int, interval,
bool, do_smearing,
TopologySmearingParameters, Smearing);
TopologyObsParameters(int interval = 1, bool smearing = false):
interval(interval), Smearing(smearing){}
template <class ReaderClass >
TopologyObsParameters(Reader<ReaderClass>& Reader){
read(Reader, "TopologyMeasurement", *this);
}
};
// this is only defined for a gauge theory
template <class Impl>
class TopologicalCharge : public HmcObservable<typename Impl::Field> {
TopologyObsParameters Pars;
public:
// here forces the Impl to be of gauge fields
// if not the compiler will complain
@ -44,20 +80,39 @@ class TopologicalCharge : public HmcObservable<typename Impl::Field> {
// necessary for HmcObservable compatibility
typedef typename Impl::Field Field;
TopologicalCharge(int interval = 1, bool do_smearing = false):
Pars(interval, do_smearing){}
TopologicalCharge(TopologyObsParameters P):Pars(P){
std::cout << GridLogDebug << "Creating TopologicalCharge " << std::endl;
}
void TrajectoryComplete(int traj,
Field &U,
GridSerialRNG &sRNG,
GridParallelRNG &pRNG) {
Real q = WilsonLoops<Impl>::TopologicalCharge(U);
if (traj%Pars.interval == 0){
// Smearing
Field Usmear = U;
int def_prec = std::cout.precision();
if (Pars.do_smearing){
// using wilson flow by default here
WilsonFlow<PeriodicGimplR> WF(Pars.Smearing.steps, Pars.Smearing.step_size, Pars.Smearing.meas_interval);
WF.smear_adaptive(Usmear, U, Pars.Smearing.maxTau);
Real T0 = WF.energyDensityPlaquette(Usmear);
std::cout << GridLogMessage << std::setprecision(std::numeric_limits<Real>::digits10 + 1)
<< "T0 : [ " << traj << " ] "<< T0 << std::endl;
}
int def_prec = std::cout.precision();
Real q = WilsonLoops<Impl>::TopologicalCharge(Usmear);
std::cout << GridLogMessage
<< std::setprecision(std::numeric_limits<Real>::digits10 + 1)
<< "Topological Charge: [ " << traj << " ] "<< q << std::endl;
std::cout << GridLogMessage
<< std::setprecision(std::numeric_limits<Real>::digits10 + 1)
<< "Topological Charge: [ " << traj << " ] "<< q << std::endl;
std::cout.precision(def_prec);
std::cout.precision(def_prec);
}
}
};

View File

@ -108,7 +108,7 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real
if (maxTau - taus < epsilon){
epsilon = maxTau-taus;
}
std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
//std::cout << GridLogMessage << "Integration epsilon : " << epsilon << std::endl;
GaugeField Z(U._grid);
GaugeField Zprime(U._grid);
GaugeField tmp(U._grid), Uprime(U._grid);
@ -138,10 +138,10 @@ void WilsonFlow<Gimpl>::evolve_step_adaptive(typename Gimpl::GaugeField &U, Real
// adjust integration step
taus += epsilon;
std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
//std::cout << GridLogMessage << "Adjusting integration step with distance: " << diff << std::endl;
epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.);
std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;
//std::cout << GridLogMessage << "New epsilon : " << epsilon << std::endl;
}
@ -166,7 +166,6 @@ void WilsonFlow<Gimpl>::smear(GaugeField& out, const GaugeField& in) const {
out = in;
for (unsigned int step = 1; step <= Nstep; step++) {
auto start = std::chrono::high_resolution_clock::now();
std::cout << GridLogMessage << "Evolution time :"<< tau(step) << std::endl;
evolve_step(out);
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
@ -191,7 +190,7 @@ void WilsonFlow<Gimpl>::smear_adaptive(GaugeField& out, const GaugeField& in, Re
unsigned int step = 0;
do{
step++;
std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
//std::cout << GridLogMessage << "Evolution time :"<< taus << std::endl;
evolve_step_adaptive(out, maxTau);
std::cout << GridLogMessage << "[WilsonFlow] Energy density (plaq) : "
<< step << " "

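The controller epsilon = epsilon*0.95*std::pow(1e-4/diff,1./3.) is the standard step-size update for an embedded integrator whose local error estimate diff = ||U - U'|| is third order; in formulas, with tolerance tol = 10^{-4} and safety factor 0.95:

\epsilon_{\mathrm{new}} = 0.95\,\epsilon \left( \frac{\mathrm{tol}}{\| U - U' \|} \right)^{1/3}, \qquad \tau \leftarrow \tau + \epsilon,

and the final partial step is clipped to epsilon = maxTau - taus so the flow lands exactly on maxTau.
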
View File

@ -26,12 +26,14 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
/* END LEGAL */
//#include <Grid/Grid.h>
using namespace Grid;
using namespace Grid::QCD;
#ifndef GRID_QCD_GAUGE_FIX_H
#define GRID_QCD_GAUGE_FIX_H
namespace Grid {
namespace QCD {
template <class Gimpl>
class FourierAcceleratedGaugeFixer : public Gimpl {
public:
public:
INHERIT_GIMPL_TYPES(Gimpl);
typedef typename Gimpl::GaugeLinkField GaugeMat;
@ -186,3 +188,6 @@ class FourierAcceleratedGaugeFixer : public Gimpl {
}
};
}
}
#endif

View File

@ -716,8 +716,7 @@ template<typename GaugeField,typename GaugeMat>
for (int a = 0; a < AdjointDimension; a++) {
generator(a, Ta);
auto tmp = - 2.0 * (trace(timesI(Ta) * in)) * scale;// 2.0 for the normalization of the trace in the fundamental rep
pokeColour(h_out, tmp, a);
pokeColour(h_out, - 2.0 * (trace(timesI(Ta) * in)) * scale, a);
}
}

View File

@ -65,10 +65,12 @@ Hdf5Reader::Hdf5Reader(const std::string &fileName)
Hdf5Type<unsigned int>::type());
}
void Hdf5Reader::push(const std::string &s)
bool Hdf5Reader::push(const std::string &s)
{
group_ = group_.openGroup(s);
path_.push_back(s);
return true;
}
void Hdf5Reader::pop(void)

View File

@ -54,7 +54,7 @@ namespace Grid
public:
Hdf5Reader(const std::string &fileName);
virtual ~Hdf5Reader(void) = default;
void push(const std::string &s);
bool push(const std::string &s);
void pop(void);
template <typename U>
void readDefault(const std::string &s, U &output);

View File

@ -1,13 +1,14 @@
/*************************************************************************************
/*************************************************************************************
Grid physics library, www.github.com/paboyle/Grid
Grid physics library, www.github.com/paboyle/Grid
Source file: ./lib/simd/Grid_neon.h
Copyright (C) 2015
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
Author: Nils Meyer <nils.meyer@ur.de>
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: neo <cossu@post.kek.jp>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -26,19 +27,25 @@ Author: neo <cossu@post.kek.jp>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
//----------------------------------------------------------------------
/*! @file Grid_sse4.h
@brief Optimization libraries for NEON (ARM) instructions set ARMv8
Experimental - Using intrinsics - DEVELOPING!
/*
ARMv8 NEON intrinsics layer by
Nils Meyer <nils.meyer@ur.de>,
University of Regensburg, Germany
SFB/TRR55
*/
// Time-stamp: <2015-07-10 17:45:09 neo>
//----------------------------------------------------------------------
#ifndef GEN_SIMD_WIDTH
#define GEN_SIMD_WIDTH 16u
#endif
#include "Grid_generic_types.h"
#include <arm_neon.h>
// ARMv8 supports double precision
namespace Grid {
namespace Optimization {
template<class vtype>
@ -46,16 +53,20 @@ namespace Optimization {
float32x4_t f;
vtype v;
};
union u128f {
float32x4_t v;
float f[4];
};
union u128d {
float64x2_t v;
double f[4];
double f[2];
};
// half precision
union u128h {
float16x8_t v;
uint16_t f[8];
};
struct Vsplat{
//Complex float
inline float32x4_t operator()(float a, float b){
@ -64,31 +75,31 @@ namespace Optimization {
}
// Real float
inline float32x4_t operator()(float a){
return vld1q_dup_f32(&a);
return vdupq_n_f32(a);
}
//Complex double
inline float32x4_t operator()(double a, double b){
float tmp[4]={(float)a,(float)b,(float)a,(float)b};
return vld1q_f32(tmp);
inline float64x2_t operator()(double a, double b){
double tmp[2]={a,b};
return vld1q_f64(tmp);
}
//Real double
inline float32x4_t operator()(double a){
return vld1q_dup_f32(&a);
//Real double // N:tbc
inline float64x2_t operator()(double a){
return vdupq_n_f64(a);
}
//Integer
//Integer // N:tbc
inline uint32x4_t operator()(Integer a){
return vld1q_dup_u32(&a);
return vdupq_n_u32(a);
}
};
struct Vstore{
//Float
//Float
inline void operator()(float32x4_t a, float* F){
vst1q_f32(F, a);
}
//Double
inline void operator()(float32x4_t a, double* D){
vst1q_f32((float*)D, a);
inline void operator()(float64x2_t a, double* D){
vst1q_f64(D, a);
}
//Integer
inline void operator()(uint32x4_t a, Integer* I){
@ -97,54 +108,54 @@ namespace Optimization {
};
struct Vstream{
//Float
struct Vstream{ // N:equivalents to _mm_stream_p* in NEON?
//Float // N:generic
inline void operator()(float * a, float32x4_t b){
memcpy(a,&b,4*sizeof(float));
}
//Double
inline void operator()(double * a, float32x4_t b){
//Double // N:generic
inline void operator()(double * a, float64x2_t b){
memcpy(a,&b,2*sizeof(double));
}
};
// Nils: Vset untested; not used currently in Grid at all;
// git commit 4a8c4ccfba1d05159348d21a9698028ea847e77b
struct Vset{
// Complex float
// Complex float // N:ok
inline float32x4_t operator()(Grid::ComplexF *a){
float32x4_t foo;
return foo;
float tmp[4]={a[1].imag(),a[1].real(),a[0].imag(),a[0].real()};
return vld1q_f32(tmp);
}
// Complex double
inline float32x4_t operator()(Grid::ComplexD *a){
float32x4_t foo;
return foo;
// Complex double // N:ok
inline float64x2_t operator()(Grid::ComplexD *a){
double tmp[2]={a[0].imag(),a[0].real()};
return vld1q_f64(tmp);
}
// Real float
// Real float // N:ok
inline float32x4_t operator()(float *a){
float32x4_t foo;
return foo;
float tmp[4]={a[3],a[2],a[1],a[0]};
return vld1q_f32(tmp);
}
// Real double
inline float32x4_t operator()(double *a){
float32x4_t foo;
return foo;
// Real double // N:ok
inline float64x2_t operator()(double *a){
double tmp[2]={a[1],a[0]};
return vld1q_f64(tmp);
}
// Integer
// Integer // N:ok
inline uint32x4_t operator()(Integer *a){
uint32x4_t foo;
return foo;
return vld1q_dup_u32(a);
}
};
// N:leaving as is
template <typename Out_type, typename In_type>
struct Reduce{
//Need templated class to overload output type
//General form must generate error if compiled
inline Out_type operator()(In_type in){
inline Out_type operator()(In_type in){
printf("Error, using wrong Reduce function\n");
exit(1);
return 0;
@ -184,26 +195,98 @@ namespace Optimization {
}
};
struct MultRealPart{
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
float32x4_t re = vtrn1q_f32(a, a);
return vmulq_f32(re, b);
}
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
float64x2_t re = vzip1q_f64(a, a);
return vmulq_f64(re, b);
}
};
struct MaddRealPart{
inline float32x4_t operator()(float32x4_t a, float32x4_t b, float32x4_t c){
float32x4_t re = vtrn1q_f32(a, a);
return vfmaq_f32(c, re, b);
}
inline float64x2_t operator()(float64x2_t a, float64x2_t b, float64x2_t c){
float64x2_t re = vzip1q_f64(a, a);
return vfmaq_f64(c, re, b);
}
};
struct Div{
// Real float
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
return vdivq_f32(a, b);
}
// Real double
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
return vdivq_f64(a, b);
}
};
struct MultComplex{
// Complex float
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
float32x4_t foo;
return foo;
float32x4_t r0, r1, r2, r3, r4;
// a = ar ai Ar Ai
// b = br bi Br Bi
// collect real/imag part, negate bi and Bi
r0 = vtrn1q_f32(b, b); // br br Br Br
r1 = vnegq_f32(b); // -br -bi -Br -Bi
r2 = vtrn2q_f32(b, r1); // bi -bi Bi -Bi
// the fun part
r3 = vmulq_f32(r2, a); // bi*ar -bi*ai ...
r4 = vrev64q_f32(r3); // -bi*ai bi*ar ...
// fma(a,b,c) = a+b*c
return vfmaq_f32(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi ...
// no fma, use mul and add
//float32x4_t r5;
//r5 = vmulq_f32(r0, a);
//return vaddq_f32(r4, r5);
}
// Complex double
inline float64x2_t operator()(float64x2_t a, float64x2_t b){
float32x4_t foo;
return foo;
float64x2_t r0, r1, r2, r3, r4;
// b = br bi
// collect real/imag part, negate bi
r0 = vtrn1q_f64(b, b); // br br
r1 = vnegq_f64(b); // -br -bi
r2 = vtrn2q_f64(b, r1); // bi -bi
// the fun part
r3 = vmulq_f64(r2, a); // bi*ar -bi*ai
r4 = vextq_f64(r3,r3,1); // -bi*ai bi*ar
// fma(a,b,c) = a+b*c
return vfmaq_f64(r4, r0, a); // ar*br-ai*bi ai*br+ar*bi
// no fma, use mul and add
//float64x2_t r5;
//r5 = vmulq_f64(r0, a);
//return vaddq_f64(r4, r5);
}
};
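The float path above builds the complex product from one transpose pair, a negation, a 64-bit reverse, and a single fused multiply-add. A small AArch64-only harness for checking the sequence against std::complex (illustrative, not part of the library):

#include <arm_neon.h>
#include <complex>
#include <cstdio>

int main()
{
  std::complex<float> a(1.f, 2.f), b(3.f, 4.f), A(-1.f, 0.5f), B(2.f, -3.f);
  float av[4] = {a.real(), a.imag(), A.real(), A.imag()};
  float bv[4] = {b.real(), b.imag(), B.real(), B.imag()};
  float32x4_t va = vld1q_f32(av), vb = vld1q_f32(bv);

  float32x4_t r0 = vtrn1q_f32(vb, vb);    // br br Br Br
  float32x4_t r1 = vnegq_f32(vb);         // -br -bi -Br -Bi
  float32x4_t r2 = vtrn2q_f32(vb, r1);    // bi -bi Bi -Bi
  float32x4_t r3 = vmulq_f32(r2, va);     // bi*ar -bi*ai ...
  float32x4_t r4 = vrev64q_f32(r3);       // -bi*ai bi*ar ...
  float32x4_t rr = vfmaq_f32(r4, r0, va); // ar*br-ai*bi ai*br+ar*bi ...

  float out[4]; vst1q_f32(out, rr);
  std::complex<float> ab = a * b, AB = A * B;
  std::printf("neon (%g,%g)(%g,%g) ref (%g,%g)(%g,%g)\n",
              out[0], out[1], out[2], out[3],
              ab.real(), ab.imag(), AB.real(), AB.imag());
  return 0;
}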
struct Mult{
// Real float
inline float32x4_t mac(float32x4_t a, float32x4_t b, float32x4_t c){
return vaddq_f32(vmulq_f32(b,c),a);
//return vaddq_f32(vmulq_f32(b,c),a);
return vfmaq_f32(a, b, c);
}
inline float64x2_t mac(float64x2_t a, float64x2_t b, float64x2_t c){
return vaddq_f64(vmulq_f64(b,c),a);
//return vaddq_f64(vmulq_f64(b,c),a);
return vfmaq_f64(a, b, c);
}
inline float32x4_t operator()(float32x4_t a, float32x4_t b){
return vmulq_f32(a,b);
@ -221,89 +304,275 @@ namespace Optimization {
struct Conj{
// Complex single
inline float32x4_t operator()(float32x4_t in){
return in;
// ar ai br bi -> ar -ai br -bi
float32x4_t r0, r1;
r0 = vnegq_f32(in); // -ar -ai -br -bi
r1 = vrev64q_f32(r0); // -ai -ar -bi -br
return vtrn1q_f32(in, r1); // ar -ai br -bi
}
// Complex double
//inline float32x4_t operator()(float32x4_t in){
// return 0;
//}
inline float64x2_t operator()(float64x2_t in){
float64x2_t r0, r1;
r0 = vextq_f64(in, in, 1); // ai ar
r1 = vnegq_f64(r0); // -ai -ar
return vextq_f64(r0, r1, 1); // ar -ai
}
// do not define for integer input
};
struct TimesMinusI{
//Complex single
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
return in;
// ar ai br bi -> ai -ar bi -br
float32x4_t r0, r1;
r0 = vnegq_f32(in); // -ar -ai -br -bi
r1 = vrev64q_f32(in); // ai ar bi br
return vtrn1q_f32(r1, r0); // ai -ar bi -br
}
//Complex double
//inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
// return in;
//}
inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
// a ib -> b -ia
float64x2_t tmp;
tmp = vnegq_f64(in);
return vextq_f64(in, tmp, 1);
}
};
struct TimesI{
//Complex single
inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
//need shuffle
return in;
// ar ai br bi -> -ai ar -bi br
float32x4_t r0, r1;
r0 = vnegq_f32(in); // -ar -ai -br -bi
r1 = vrev64q_f32(r0); // -ai -ar -bi -br
return vtrn1q_f32(r1, in); // -ai ar -bi br
}
//Complex double
//inline float32x4_t operator()(float32x4_t in, float32x4_t ret){
// return 0;
//}
inline float64x2_t operator()(float64x2_t in, float64x2_t ret){
// a ib -> -b ia
float64x2_t tmp;
tmp = vnegq_f64(in);
return vextq_f64(tmp, in, 1);
}
};
struct Permute{
static inline float32x4_t Permute0(float32x4_t in){ // N:ok
// AB CD -> CD AB
return vextq_f32(in, in, 2);
};
static inline float32x4_t Permute1(float32x4_t in){ // N:ok
// AB CD -> BA DC
return vrev64q_f32(in);
};
static inline float32x4_t Permute2(float32x4_t in){ // N:not used by Boyle
return in;
};
static inline float32x4_t Permute3(float32x4_t in){ // N:not used by Boyle
return in;
};
static inline float64x2_t Permute0(float64x2_t in){ // N:ok
// AB -> BA
return vextq_f64(in, in, 1);
};
static inline float64x2_t Permute1(float64x2_t in){ // N:not used by Boyle
return in;
};
static inline float64x2_t Permute2(float64x2_t in){ // N:not used by Boyle
return in;
};
static inline float64x2_t Permute3(float64x2_t in){ // N:not used by Boyle
return in;
};
};
struct Rotate{
static inline float32x4_t rotate(float32x4_t in,int n){ // N:ok
switch(n){
case 0: // AB CD -> AB CD
return tRotate<0>(in);
break;
case 1: // AB CD -> BC DA
return tRotate<1>(in);
break;
case 2: // AB CD -> CD AB
return tRotate<2>(in);
break;
case 3: // AB CD -> DA BC
return tRotate<3>(in);
break;
default: assert(0);
}
}
static inline float64x2_t rotate(float64x2_t in,int n){ // N:ok
switch(n){
case 0: // AB -> AB
return tRotate<0>(in);
break;
case 1: // AB -> BA
return tRotate<1>(in);
break;
default: assert(0);
}
}
// working, but no restriction on n
// template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n); };
// template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n); };
// restriction on n
template<int n> static inline float32x4_t tRotate(float32x4_t in){ return vextq_f32(in,in,n%4); };
template<int n> static inline float64x2_t tRotate(float64x2_t in){ return vextq_f64(in,in,n%2); };
};
struct PrecisionChange {
static inline float16x8_t StoH (const float32x4_t &a,const float32x4_t &b) {
float16x4_t h = vcvt_f16_f32(a);
return vcvt_high_f16_f32(h, b);
}
static inline void HtoS (float16x8_t h,float32x4_t &sa,float32x4_t &sb) {
sb = vcvt_high_f32_f16(h);
// the lower half of the float16x8_t must be moved to the high lanes first
// vextq_f16 not supported by clang 3.8 / 4.0 / arm clang
//float16x8_t h1 = vextq_f16(h, h, 4); // correct, but not supported by clang
// workaround for clang
uint32x4_t h1u = reinterpret_cast<uint32x4_t>(h);
float16x8_t h1 = reinterpret_cast<float16x8_t>(vextq_u32(h1u, h1u, 2));
sa = vcvt_high_f32_f16(h1);
}
static inline float32x4_t DtoS (float64x2_t a,float64x2_t b) {
float32x2_t s = vcvt_f32_f64(a);
return vcvt_high_f32_f64(s, b);
}
static inline void StoD (float32x4_t s,float64x2_t &a,float64x2_t &b) {
b = vcvt_high_f64_f32(s);
// there is no direct conversion from lower float32x4_t to float64x2_t
float32x4_t s1 = vextq_f32(s, s, 2);
a = vcvt_high_f64_f32(s1);
}
static inline float16x8_t DtoH (float64x2_t a,float64x2_t b,float64x2_t c,float64x2_t d) {
float32x4_t s1 = DtoS(a, b);
float32x4_t s2 = DtoS(c, d);
return StoH(s1, s2);
}
static inline void HtoD (float16x8_t h,float64x2_t &a,float64x2_t &b,float64x2_t &c,float64x2_t &d) {
float32x4_t s1, s2;
HtoS(h, s1, s2);
StoD(s1, a, b);
StoD(s2, c, d);
}
};
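// Illustrative sketch (assumption, not in the original header): a double ->
// half -> double round trip through DtoH/HtoD. demo_precision_change and the
// constants are hypothetical; the values are exactly representable in fp16,
// so the round trip is lossless.
#if 0 // standalone AArch64 illustration only
static void demo_precision_change(void) {
  float64x2_t a = vdupq_n_f64(1.5),   b = vdupq_n_f64(-2.25);
  float64x2_t c = vdupq_n_f64(0.125), d = vdupq_n_f64(3.0);
  float16x8_t h = PrecisionChange::DtoH(a, b, c, d); // pack four doubles into eight halves
  float64x2_t ra, rb, rc, rd;
  PrecisionChange::HtoD(h, ra, rb, rc, rd);          // ra==a, rb==b, rc==c, rd==d
}
#endif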
//////////////////////////////////////////////
// Exchange support
struct Exchange{
static inline void Exchange0(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
// in1: ABCD -> out1: ABEF
// in2: EFGH -> out2: CDGH
// z: CDAB
float32x4_t z = vextq_f32(in1, in1, 2);
// out1: ABEF
out1 = vextq_f32(z, in2, 2);
// z: GHEF
z = vextq_f32(in2, in2, 2);
// out2: CDGH
out2 = vextq_f32(in1, z, 2);
};
static inline void Exchange1(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
// in1: ABCD -> out1: AECG
// in2: EFGH -> out2: BFDH
out1 = vtrn1q_f32(in1, in2);
out2 = vtrn2q_f32(in1, in2);
};
static inline void Exchange2(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
assert(0);
return;
};
static inline void Exchange3(float32x4_t &out1,float32x4_t &out2,float32x4_t in1,float32x4_t in2){
assert(0);
return;
};
// double precision
static inline void Exchange0(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
// in1: AB -> out1: AC
// in2: CD -> out2: BD
out1 = vzip1q_f64(in1, in2);
out2 = vzip2q_f64(in1, in2);
};
static inline void Exchange1(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
assert(0);
return;
};
static inline void Exchange2(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
assert(0);
return;
};
static inline void Exchange3(float64x2_t &out1,float64x2_t &out2,float64x2_t in1,float64x2_t in2){
assert(0);
return;
};
};
//////////////////////////////////////////////
// Some Template specialization
template < typename vtype >
void permute(vtype &a, vtype b, int perm) {
  // editorial assumption: dispatch to the static Permute functions above
  // (an empty body here would make Gpermute below a silent no-op)
  a = (perm==0) ? Permute::Permute0(b) : (perm==1) ? Permute::Permute1(b)
    : (perm==2) ? Permute::Permute2(b) : Permute::Permute3(b);
};
//Complex float Reduce
template<>
inline Grid::ComplexF Reduce<Grid::ComplexF, float32x4_t>::operator()(float32x4_t in){
float32x4_t v1; // two complex
v1 = Optimization::Permute::Permute0(in);
v1 = vaddq_f32(v1,in);
u128f conv; conv.v=v1;
return Grid::ComplexF(conv.f[0],conv.f[1]);
}
//Real float Reduce
template<>
inline Grid::RealF Reduce<Grid::RealF, float32x4_t>::operator()(float32x4_t in){
return vaddvq_f32(in);
}
//Complex double Reduce
template<>
inline Grid::ComplexD Reduce<Grid::ComplexD, float64x2_t>::operator()(float64x2_t in){
u128d conv; conv.v = in;
return Grid::ComplexD(conv.f[0],conv.f[1]);
}
//Real double Reduce
template<>
inline Grid::RealD Reduce<Grid::RealD, float64x2_t>::operator()(float64x2_t in){
return vaddvq_f64(in);
}
//Integer Reduce
template<>
inline Integer Reduce<Integer, uint32x4_t>::operator()(uint32x4_t in){
// editorial sketch (assumption): across-vector add of the four 32-bit lanes
return vaddvq_u32(in);
}
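// Illustrative sketch (hypothetical inputs): expected reductions.
//   Reduce<RealF,float32x4_t>()({1,2,3,4})    == 10
//   Reduce<ComplexF,float32x4_t>()({1,2,3,4}) == (1+2i)+(3+4i) = 4+6i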
}
//////////////////////////////////////////////////////////////////////////////////////
// Here assign types
namespace Grid {
// typedef Optimization::vech SIMD_Htype; // Reduced precision type
typedef float16x8_t SIMD_Htype; // Half precision type
typedef float32x4_t SIMD_Ftype; // Single precision type
typedef float64x2_t SIMD_Dtype; // Double precision type
typedef uint32x4_t SIMD_Itype; // Integer type
@@ -312,13 +581,6 @@ namespace Grid {
inline void prefetch_HINT_T0(const char *ptr){};
// Gpermute function
template < typename VectorSIMD >
inline void Gpermute(VectorSIMD &y,const VectorSIMD &b, int perm ) {
Optimization::permute(y.v,b.v,perm);
}
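// Illustrative sketch: for a hypothetical SIMD wrapper y,b with NEON payload .v,
//   Gpermute(y, b, 1);  // y.v = Optimization::Permute::Permute1(b.v)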
// Function name aliases
typedef Optimization::Vsplat VsplatSIMD;
typedef Optimization::Vstore VstoreSIMD;
@@ -326,16 +588,19 @@ namespace Grid {
typedef Optimization::Vstream VstreamSIMD;
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
// Arithmetic operations
typedef Optimization::Sum SumSIMD;
typedef Optimization::Sub SubSIMD;
typedef Optimization::Div DivSIMD;
typedef Optimization::Mult MultSIMD;
typedef Optimization::MultComplex MultComplexSIMD;
typedef Optimization::MultRealPart MultRealPartSIMD;
typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
}
}

View File

@@ -53,7 +53,7 @@ directory
#if defined IMCI
#include "Grid_imci.h"
#endif
#ifdef NEONV8
#include "Grid_neon.h"
#endif
#if defined QPX

View File

@@ -32,8 +32,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
namespace Grid {
int LebesgueOrder::UseLebesgueOrder;
#ifdef KNL
std::vector<int> LebesgueOrder::Block({8,2,2,2});
#else
std::vector<int> LebesgueOrder::Block({2,2,2,2});
#endif
LebesgueOrder::IndexInteger LebesgueOrder::alignup(IndexInteger n){
n--; // 1000 0011 --> 1000 0010
n |= n >> 1; // 1000 0010 | 0100 0001 = 1100 0011
@@ -51,8 +54,31 @@ LebesgueOrder::LebesgueOrder(GridBase *_grid)
if ( Block[0]==0) ZGraph();
else if ( Block[1]==0) NoBlocking();
else CartesianBlocking();
if (0) {
std::cout << "Thread Interleaving"<<std::endl;
ThreadInterleave();
}
}
void LebesgueOrder::ThreadInterleave(void)
{
std::vector<IndexInteger> reorder = _LebesgueReorder;
std::vector<IndexInteger> throrder;
int vol = _LebesgueReorder.size();
int threads = GridThread::GetThreads();
int blockbits=3;
int blocklen = 8;
int msk = 0x7;
for(int t=0;t<threads;t++){
for(int ss=0;ss<vol;ss++){
if ( ( ss >> blockbits) % threads == t ) {
throrder.push_back(reorder[ss]);
}
}
}
_LebesgueReorder = throrder;
}
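// Illustrative sketch (hypothetical numbers): with threads=2 and blockbits=3
// the volume is cut into 8-site blocks and thread t keeps blocks whose index
// satisfies index%2==t; e.g. vol=32 regroups the block order to 0,2,1,3,
// i.e. sites 0-7,16-23 for thread 0 followed by 8-15,24-31 for thread 1.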
void LebesgueOrder::NoBlocking(void)
{
std::cout<<GridLogDebug<<"Lexicographic : no cache blocking"<<std::endl;

View File

@@ -70,6 +70,8 @@ namespace Grid {
std::vector<IndexInteger> & xi,
std::vector<IndexInteger> &dims);
void ThreadInterleave(void);
private:
std::vector<IndexInteger> _LebesgueReorder;

View File

@@ -98,7 +98,9 @@ template<class rtype,class vtype,class mtype,int N>
strong_inline void mult(iVector<rtype,N> * __restrict__ ret,
const iVector<vtype,N> * __restrict__ rhs,
const iScalar<mtype> * __restrict__ lhs){
for(int c1=0;c1<N;c1++){
mult(&ret->_internal[c1],&rhs->_internal[c1],&lhs->_internal);
}
}
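// Illustrative sketch (hypothetical types): this overload right-multiplies
// each component by the scalar, preserving operand order for non-commutative
// internal types, e.g.
//   iVector<ComplexF,3> v, r;  iScalar<ComplexF> s;
//   mult(&r, &v, &s);          // r[c] = v[c] * s for c = 0,1,2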

View File

@@ -377,7 +377,7 @@ void Grid_init(int *argc,char ***argv)
std::cout << GridLogDebug << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
std::cout<<GridLogMessage<<"Grid Decomposition\n";
std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
std::cout<<GridLogMessage<<"\tMPI tasks : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
std::cout<<GridLogMessage<<"\tvRealF : "<<sizeof(vRealF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;