mirror of
https://github.com/paboyle/Grid.git
synced 2025-07-12 19:27:05 +01:00
Compare commits
14 Commits
feature/bl
...
feature/se
Author | SHA1 | Date | |
---|---|---|---|
a269a3d919 | |||
0c4f585496 | |||
33d2df46a0 | |||
2df308f649 | |||
298a6ec51e | |||
e5dbe488a6 | |||
393727b93b | |||
2b1fcd78c3 | |||
0a4e0b49a0 | |||
76af169f05 | |||
7b89232251 | |||
ef0ddd5d04 | |||
9b73dacf50 | |||
244b4aa07f |
3
.gitignore
vendored
3
.gitignore
vendored
@ -10,8 +10,6 @@
|
|||||||
*~
|
*~
|
||||||
*#
|
*#
|
||||||
*.sublime-*
|
*.sublime-*
|
||||||
.ctags
|
|
||||||
tags
|
|
||||||
|
|
||||||
# Precompiled Headers #
|
# Precompiled Headers #
|
||||||
#######################
|
#######################
|
||||||
@ -90,6 +88,7 @@ Thumbs.db
|
|||||||
# build directory #
|
# build directory #
|
||||||
###################
|
###################
|
||||||
build*/*
|
build*/*
|
||||||
|
Documentation/_build
|
||||||
|
|
||||||
# IDE related files #
|
# IDE related files #
|
||||||
#####################
|
#####################
|
||||||
|
@ -240,14 +240,12 @@ public:
|
|||||||
Field T0(grid); T0 = in;
|
Field T0(grid); T0 = in;
|
||||||
Field T1(grid);
|
Field T1(grid);
|
||||||
Field T2(grid);
|
Field T2(grid);
|
||||||
Field Tout(grid);
|
|
||||||
Field y(grid);
|
Field y(grid);
|
||||||
|
|
||||||
Field *Tnm = &T0;
|
Field *Tnm = &T0;
|
||||||
Field *Tn = &T1;
|
Field *Tn = &T1;
|
||||||
Field *Tnp = &T2;
|
Field *Tnp = &T2;
|
||||||
|
|
||||||
std::cout << GridLogMessage << "Chebyshev() starts"<<std::endl;
|
|
||||||
// Tn=T1 = (xscale M + mscale)in
|
// Tn=T1 = (xscale M + mscale)in
|
||||||
RealD xscale = 2.0/(hi-lo);
|
RealD xscale = 2.0/(hi-lo);
|
||||||
RealD mscale = -(hi+lo)/(hi-lo);
|
RealD mscale = -(hi+lo)/(hi-lo);
|
||||||
@ -256,7 +254,7 @@ public:
|
|||||||
|
|
||||||
// sum = .5 c[0] T0 + c[1] T1
|
// sum = .5 c[0] T0 + c[1] T1
|
||||||
// out = ()*T0 + Coeffs[1]*T1;
|
// out = ()*T0 + Coeffs[1]*T1;
|
||||||
axpby(Tout,0.5*Coeffs[0],Coeffs[1],T0,T1);
|
axpby(out,0.5*Coeffs[0],Coeffs[1],T0,T1);
|
||||||
for(int n=2;n<order;n++){
|
for(int n=2;n<order;n++){
|
||||||
|
|
||||||
Linop.HermOp(*Tn,y);
|
Linop.HermOp(*Tn,y);
|
||||||
@ -277,7 +275,7 @@ public:
|
|||||||
axpby(y,xscale,mscale,y,(*Tn));
|
axpby(y,xscale,mscale,y,(*Tn));
|
||||||
axpby(*Tnp,2.0,-1.0,y,(*Tnm));
|
axpby(*Tnp,2.0,-1.0,y,(*Tnm));
|
||||||
if ( Coeffs[n] != 0.0) {
|
if ( Coeffs[n] != 0.0) {
|
||||||
axpy(Tout,Coeffs[n],*Tnp,Tout);
|
axpy(out,Coeffs[n],*Tnp,out);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
// Cycle pointers to avoid copies
|
// Cycle pointers to avoid copies
|
||||||
@ -287,8 +285,6 @@ public:
|
|||||||
Tnp =swizzle;
|
Tnp =swizzle;
|
||||||
|
|
||||||
}
|
}
|
||||||
out = Tout;
|
|
||||||
std::cout << GridLogMessage << "Chebyshev() ends"<<std::endl;
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -381,26 +377,24 @@ public:
|
|||||||
Field T0(grid); T0 = in;
|
Field T0(grid); T0 = in;
|
||||||
Field T1(grid);
|
Field T1(grid);
|
||||||
Field T2(grid);
|
Field T2(grid);
|
||||||
Field Tout(grid);
|
|
||||||
Field y(grid);
|
Field y(grid);
|
||||||
|
|
||||||
Field *Tnm = &T0;
|
Field *Tnm = &T0;
|
||||||
Field *Tn = &T1;
|
Field *Tn = &T1;
|
||||||
Field *Tnp = &T2;
|
Field *Tnp = &T2;
|
||||||
|
|
||||||
std::cout << GridLogMessage << "ChebyshevLanczos() starts"<<std::endl;
|
|
||||||
// Tn=T1 = (xscale M )*in
|
// Tn=T1 = (xscale M )*in
|
||||||
AminusMuSq(Linop,T0,T1);
|
AminusMuSq(Linop,T0,T1);
|
||||||
|
|
||||||
// sum = .5 c[0] T0 + c[1] T1
|
// sum = .5 c[0] T0 + c[1] T1
|
||||||
Tout = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
|
out = (0.5*Coeffs[0])*T0 + Coeffs[1]*T1;
|
||||||
for(int n=2;n<order;n++){
|
for(int n=2;n<order;n++){
|
||||||
|
|
||||||
AminusMuSq(Linop,*Tn,y);
|
AminusMuSq(Linop,*Tn,y);
|
||||||
|
|
||||||
*Tnp=2.0*y-(*Tnm);
|
*Tnp=2.0*y-(*Tnm);
|
||||||
|
|
||||||
Tout=Tout+Coeffs[n]* (*Tnp);
|
out=out+Coeffs[n]* (*Tnp);
|
||||||
|
|
||||||
// Cycle pointers to avoid copies
|
// Cycle pointers to avoid copies
|
||||||
Field *swizzle = Tnm;
|
Field *swizzle = Tnm;
|
||||||
@ -409,8 +403,6 @@ public:
|
|||||||
Tnp =swizzle;
|
Tnp =swizzle;
|
||||||
|
|
||||||
}
|
}
|
||||||
out=Tout;
|
|
||||||
std::cout << GridLogMessage << "ChebyshevLanczos() ends"<<std::endl;
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -44,7 +44,7 @@ void MemoryManager::AcceleratorFree (void *ptr,size_t bytes)
|
|||||||
if ( __freeme ) {
|
if ( __freeme ) {
|
||||||
acceleratorFreeDevice(__freeme);
|
acceleratorFreeDevice(__freeme);
|
||||||
total_device-=bytes;
|
total_device-=bytes;
|
||||||
// PrintBytes();
|
// PrintBytes();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
void *MemoryManager::SharedAllocate(size_t bytes)
|
void *MemoryManager::SharedAllocate(size_t bytes)
|
||||||
@ -53,8 +53,8 @@ void *MemoryManager::SharedAllocate(size_t bytes)
|
|||||||
if ( ptr == (void *) NULL ) {
|
if ( ptr == (void *) NULL ) {
|
||||||
ptr = (void *) acceleratorAllocShared(bytes);
|
ptr = (void *) acceleratorAllocShared(bytes);
|
||||||
total_shared+=bytes;
|
total_shared+=bytes;
|
||||||
std::cout <<GridLogMessage<<"AcceleratorAllocate: allocated Shared pointer "<<std::hex<<ptr<<std::dec<<std::endl;
|
// std::cout <<"AcceleratorAllocate: allocated Shared pointer "<<std::hex<<ptr<<std::dec<<std::endl;
|
||||||
// PrintBytes();
|
// PrintBytes();
|
||||||
}
|
}
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
@ -74,7 +74,6 @@ void *MemoryManager::CpuAllocate(size_t bytes)
|
|||||||
if ( ptr == (void *) NULL ) {
|
if ( ptr == (void *) NULL ) {
|
||||||
ptr = (void *) acceleratorAllocShared(bytes);
|
ptr = (void *) acceleratorAllocShared(bytes);
|
||||||
total_host+=bytes;
|
total_host+=bytes;
|
||||||
// std::cout << GridLogMessage<< "MemoryManager:: CpuAllocate total_host= "<<total_host<<" "<< ptr << std::endl;
|
|
||||||
}
|
}
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
@ -84,7 +83,6 @@ void MemoryManager::CpuFree (void *_ptr,size_t bytes)
|
|||||||
void *__freeme = Insert(_ptr,bytes,Cpu);
|
void *__freeme = Insert(_ptr,bytes,Cpu);
|
||||||
if ( __freeme ) {
|
if ( __freeme ) {
|
||||||
acceleratorFreeShared(__freeme);
|
acceleratorFreeShared(__freeme);
|
||||||
// std::cout << GridLogMessage<< "MemoryManager:: CpuFree total_host= "<<total_host<<" "<< __freeme << std::endl;
|
|
||||||
total_host-=bytes;
|
total_host-=bytes;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -35,9 +35,6 @@ Author: Christoph Lehner <christoph@lhnr.de>
|
|||||||
#endif
|
#endif
|
||||||
#ifdef GRID_HIP
|
#ifdef GRID_HIP
|
||||||
#include <hip/hip_runtime_api.h>
|
#include <hip/hip_runtime_api.h>
|
||||||
#endif
|
|
||||||
#ifdef GRID_SYCl
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
@ -73,7 +70,6 @@ void GlobalSharedMemory::Init(Grid_MPI_Comm comm)
|
|||||||
WorldNodes = WorldSize/WorldShmSize;
|
WorldNodes = WorldSize/WorldShmSize;
|
||||||
assert( (WorldNodes * WorldShmSize) == WorldSize );
|
assert( (WorldNodes * WorldShmSize) == WorldSize );
|
||||||
|
|
||||||
|
|
||||||
// FIXME: Check all WorldShmSize are the same ?
|
// FIXME: Check all WorldShmSize are the same ?
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////
|
||||||
@ -450,47 +446,7 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Hugetlbfs mapping intended
|
// Hugetlbfs mapping intended
|
||||||
////////////////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
#if defined(GRID_CUDA) ||defined(GRID_HIP) || defined(GRID_SYCL)
|
#if defined(GRID_CUDA) ||defined(GRID_HIP)
|
||||||
|
|
||||||
//if defined(GRID_SYCL)
|
|
||||||
#if 0
|
|
||||||
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|
||||||
{
|
|
||||||
void * ShmCommBuf ;
|
|
||||||
assert(_ShmSetup==1);
|
|
||||||
assert(_ShmAlloc==0);
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// allocate the pointer array for shared windows for our group
|
|
||||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
MPI_Barrier(WorldShmComm);
|
|
||||||
WorldShmCommBufs.resize(WorldShmSize);
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Each MPI rank should allocate our own buffer
|
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
ShmCommBuf = acceleratorAllocDevice(bytes);
|
|
||||||
|
|
||||||
if (ShmCommBuf == (void *)NULL ) {
|
|
||||||
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
|
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
|
|
||||||
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
|
|
||||||
|
|
||||||
SharedMemoryZero(ShmCommBuf,bytes);
|
|
||||||
|
|
||||||
assert(WorldShmSize == 1);
|
|
||||||
for(int r=0;r<WorldShmSize;r++){
|
|
||||||
WorldShmCommBufs[r] = ShmCommBuf;
|
|
||||||
}
|
|
||||||
_ShmAllocBytes=bytes;
|
|
||||||
_ShmAlloc=1;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined(GRID_CUDA) ||defined(GRID_HIP) ||defined(GRID_SYCL)
|
|
||||||
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||||
{
|
{
|
||||||
void * ShmCommBuf ;
|
void * ShmCommBuf ;
|
||||||
@ -513,16 +469,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Each MPI rank should allocate our own buffer
|
// Each MPI rank should allocate our own buffer
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
|
|
||||||
auto zeDevice = cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_device());
|
|
||||||
auto zeContext= cl::sycl::get_native<cl::sycl::backend::level_zero>(theGridAccelerator->get_context());
|
|
||||||
ze_device_mem_alloc_desc_t zeDesc = {};
|
|
||||||
zeMemAllocDevice(zeContext,&zeDesc,bytes,2*1024*1024,zeDevice,&ShmCommBuf);
|
|
||||||
std::cout << WorldRank << header " SharedMemoryMPI.cc zeMemAllocDevice "<< bytes
|
|
||||||
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
|
|
||||||
#else
|
|
||||||
ShmCommBuf = acceleratorAllocDevice(bytes);
|
ShmCommBuf = acceleratorAllocDevice(bytes);
|
||||||
#endif
|
|
||||||
if (ShmCommBuf == (void *)NULL ) {
|
if (ShmCommBuf == (void *)NULL ) {
|
||||||
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
|
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
@ -532,8 +480,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
|
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
|
||||||
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
|
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
|
||||||
}
|
}
|
||||||
// SharedMemoryZero(ShmCommBuf,bytes);
|
SharedMemoryZero(ShmCommBuf,bytes);
|
||||||
std::cout<< "Setting up IPC"<<std::endl;
|
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Loop over ranks/gpu's on our node
|
// Loop over ranks/gpu's on our node
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
@ -543,23 +491,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
//////////////////////////////////////////////////
|
//////////////////////////////////////////////////
|
||||||
// If it is me, pass around the IPC access key
|
// If it is me, pass around the IPC access key
|
||||||
//////////////////////////////////////////////////
|
//////////////////////////////////////////////////
|
||||||
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
|
|
||||||
ze_ipc_mem_handle_t handle;
|
|
||||||
if ( r==WorldShmRank ) {
|
|
||||||
auto err = zeMemGetIpcHandle(zeContext,ShmCommBuf,&handle);
|
|
||||||
if ( err != ZE_RESULT_SUCCESS ) {
|
|
||||||
std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
|
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
} else {
|
|
||||||
std::cerr << "SharedMemoryMPI.cc zeMemGetIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
|
|
||||||
}
|
|
||||||
std::cerr<<"Allocated IpcHandle rank "<<r<<" (hex) ";
|
|
||||||
for(int c=0;c<ZE_MAX_IPC_HANDLE_SIZE;c++){
|
|
||||||
std::cerr<<std::hex<<(uint32_t)((uint8_t)handle.data[c])<<std::dec;
|
|
||||||
}
|
|
||||||
std::cerr<<std::endl;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
#ifdef GRID_CUDA
|
#ifdef GRID_CUDA
|
||||||
cudaIpcMemHandle_t handle;
|
cudaIpcMemHandle_t handle;
|
||||||
if ( r==WorldShmRank ) {
|
if ( r==WorldShmRank ) {
|
||||||
@ -596,25 +527,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
// If I am not the source, overwrite thisBuf with remote buffer
|
// If I am not the source, overwrite thisBuf with remote buffer
|
||||||
///////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////
|
||||||
void * thisBuf = ShmCommBuf;
|
void * thisBuf = ShmCommBuf;
|
||||||
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
|
|
||||||
if ( r!=WorldShmRank ) {
|
|
||||||
thisBuf = nullptr;
|
|
||||||
std::cerr<<"Using IpcHandle rank "<<r<<" ";
|
|
||||||
for(int c=0;c<ZE_MAX_IPC_HANDLE_SIZE;c++){
|
|
||||||
std::cerr<<std::hex<<(uint32_t)((uint8_t)handle.data[c])<<std::dec;
|
|
||||||
}
|
|
||||||
std::cerr<<std::endl;
|
|
||||||
auto err = zeMemOpenIpcHandle(zeContext,zeDevice,handle,0,&thisBuf);
|
|
||||||
if ( err != ZE_RESULT_SUCCESS ) {
|
|
||||||
std::cerr << "SharedMemoryMPI.cc "<<zeContext<<" "<<zeDevice<<std::endl;
|
|
||||||
std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle failed for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
|
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
} else {
|
|
||||||
std::cerr << "SharedMemoryMPI.cc zeMemOpenIpcHandle succeeded for rank "<<r<<" "<<std::hex<<err<<std::dec<<std::endl;
|
|
||||||
}
|
|
||||||
assert(thisBuf!=nullptr);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
#ifdef GRID_CUDA
|
#ifdef GRID_CUDA
|
||||||
if ( r!=WorldShmRank ) {
|
if ( r!=WorldShmRank ) {
|
||||||
auto err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
|
auto err = cudaIpcOpenMemHandle(&thisBuf,handle,cudaIpcMemLazyEnablePeerAccess);
|
||||||
@ -645,8 +557,6 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
_ShmAllocBytes=bytes;
|
_ShmAllocBytes=bytes;
|
||||||
_ShmAlloc=1;
|
_ShmAlloc=1;
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
#else
|
#else
|
||||||
#ifdef GRID_MPI3_SHMMMAP
|
#ifdef GRID_MPI3_SHMMMAP
|
||||||
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
||||||
@ -817,16 +727,16 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
|
|||||||
/////////////////////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////////////////////
|
||||||
void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
|
void GlobalSharedMemory::SharedMemoryZero(void *dest,size_t bytes)
|
||||||
{
|
{
|
||||||
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
|
#ifdef GRID_CUDA
|
||||||
acceleratorMemSet(dest,0,bytes);
|
cudaMemset(dest,0,bytes);
|
||||||
#else
|
#else
|
||||||
bzero(dest,bytes);
|
bzero(dest,bytes);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
|
void GlobalSharedMemory::SharedMemoryCopy(void *dest,void *src,size_t bytes)
|
||||||
{
|
{
|
||||||
#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
|
#ifdef GRID_CUDA
|
||||||
acceleratorCopyToDevice(src,dest,bytes);
|
cudaMemcpy(dest,src,bytes,cudaMemcpyDefault);
|
||||||
#else
|
#else
|
||||||
bcopy(src,dest,bytes);
|
bcopy(src,dest,bytes);
|
||||||
#endif
|
#endif
|
||||||
@ -890,7 +800,7 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
//SharedMemoryTest();
|
SharedMemoryTest();
|
||||||
}
|
}
|
||||||
//////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////
|
||||||
// On node barrier
|
// On node barrier
|
||||||
|
@ -198,7 +198,7 @@ public:
|
|||||||
std::cerr << " nersc_csum " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
|
std::cerr << " nersc_csum " <<std::hex<< nersc_csum << " " << header.checksum<< std::dec<< std::endl;
|
||||||
exit(0);
|
exit(0);
|
||||||
}
|
}
|
||||||
assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-1 );
|
assert(fabs(clone.plaquette -header.plaquette ) < 1.0e-5 );
|
||||||
assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
|
assert(fabs(clone.link_trace-header.link_trace) < 1.0e-6 );
|
||||||
assert(nersc_csum == header.checksum );
|
assert(nersc_csum == header.checksum );
|
||||||
|
|
||||||
|
35
Grid/serialisation/BaseIO.cc
Normal file
35
Grid/serialisation/BaseIO.cc
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./lib/serialisation/BaseIO.h
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Michael Marshall <michael.marshall@ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
|
||||||
|
#include <Grid/GridCore.h>
|
||||||
|
|
||||||
|
NAMESPACE_BEGIN(Grid)
|
||||||
|
|
||||||
|
std::uint64_t EigenIO::EigenResizeCounter(0);
|
||||||
|
|
||||||
|
NAMESPACE_END(Grid)
|
@ -9,6 +9,7 @@
|
|||||||
Author: Antonin Portelli <antonin.portelli@me.com>
|
Author: Antonin Portelli <antonin.portelli@me.com>
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
||||||
|
Author: Michael Marshall <michael.marshall@ed.ac.uk>
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
This program is free software; you can redistribute it and/or modify
|
||||||
it under the terms of the GNU General Public License as published by
|
it under the terms of the GNU General Public License as published by
|
||||||
@ -30,6 +31,7 @@ Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
|||||||
#ifndef GRID_SERIALISATION_ABSTRACT_READER_H
|
#ifndef GRID_SERIALISATION_ABSTRACT_READER_H
|
||||||
#define GRID_SERIALISATION_ABSTRACT_READER_H
|
#define GRID_SERIALISATION_ABSTRACT_READER_H
|
||||||
|
|
||||||
|
#include <atomic>
|
||||||
#include <type_traits>
|
#include <type_traits>
|
||||||
#include <Grid/tensors/Tensors.h>
|
#include <Grid/tensors/Tensors.h>
|
||||||
#include <Grid/serialisation/VectorUtils.h>
|
#include <Grid/serialisation/VectorUtils.h>
|
||||||
@ -110,6 +112,10 @@ namespace Grid {
|
|||||||
template <typename ET>
|
template <typename ET>
|
||||||
inline typename std::enable_if<is_tensor_of_container<ET>::value, typename Traits<ET>::scalar_type *>::type
|
inline typename std::enable_if<is_tensor_of_container<ET>::value, typename Traits<ET>::scalar_type *>::type
|
||||||
getFirstScalar(ET &eigenTensor) { return eigenTensor.data()->begin(); }
|
getFirstScalar(ET &eigenTensor) { return eigenTensor.data()->begin(); }
|
||||||
|
|
||||||
|
// Counter for resized EigenTensors (poor man's substitute for allocator)
|
||||||
|
// Defined in BinaryIO.cc
|
||||||
|
extern std::uint64_t EigenResizeCounter;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Abstract writer/reader classes ////////////////////////////////////////////
|
// Abstract writer/reader classes ////////////////////////////////////////////
|
||||||
@ -497,8 +503,14 @@ namespace Grid {
|
|||||||
typename std::enable_if<EigenIO::is_tensor_variable<ETensor>::value, void>::type
|
typename std::enable_if<EigenIO::is_tensor_variable<ETensor>::value, void>::type
|
||||||
Reader<T>::Reshape(ETensor &t, const std::array<typename ETensor::Index, ETensor::NumDimensions> &dims )
|
Reader<T>::Reshape(ETensor &t, const std::array<typename ETensor::Index, ETensor::NumDimensions> &dims )
|
||||||
{
|
{
|
||||||
|
#ifdef GRID_OMP
|
||||||
|
// The memory counter is the reason this must be done from the primary thread
|
||||||
|
assert(omp_in_parallel()==0 && "Deserialisation which resizes Eigen tensor must happen from primary thread");
|
||||||
|
#endif
|
||||||
|
EigenIO::EigenResizeCounter -= static_cast<uint64_t>(t.size()) * sizeof(typename ETensor::Scalar);
|
||||||
//t.reshape( dims );
|
//t.reshape( dims );
|
||||||
t.resize( dims );
|
t.resize( dims );
|
||||||
|
EigenIO::EigenResizeCounter += static_cast<uint64_t>(t.size()) * sizeof(typename ETensor::Scalar);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -1,3 +1,34 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./Grid/serialisation/VectorUtils.h
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Antonin Portelli <antonin.portelli@me.com>
|
||||||
|
Author: Peter Boyle <paboyle@ed.ac.uk>
|
||||||
|
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
||||||
|
Author: Michael Marshall <michael.marshall@ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
|
||||||
#include <Grid/Grid.h>
|
#include <Grid/Grid.h>
|
||||||
|
|
||||||
using namespace Grid;
|
using namespace Grid;
|
||||||
|
@ -1,3 +1,34 @@
|
|||||||
|
/*************************************************************************************
|
||||||
|
|
||||||
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
|
Source file: ./Grid/serialisation/VectorUtils.h
|
||||||
|
|
||||||
|
Copyright (C) 2015
|
||||||
|
|
||||||
|
Author: Peter Boyle <paboyle@ed.ac.uk>
|
||||||
|
Author: Antonin Portelli <antonin.portelli@me.com>
|
||||||
|
Author: Guido Cossu <guido.cossu@ed.ac.uk>
|
||||||
|
Author: Michael Marshall <michael.marshall@ed.ac.uk>
|
||||||
|
|
||||||
|
This program is free software; you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License along
|
||||||
|
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
|
||||||
|
See the full license in the file "LICENSE" in the top level distribution directory
|
||||||
|
*************************************************************************************/
|
||||||
|
/* END LEGAL */
|
||||||
|
|
||||||
#ifndef GRID_SERIALISATION_HDF5_H
|
#ifndef GRID_SERIALISATION_HDF5_H
|
||||||
#define GRID_SERIALISATION_HDF5_H
|
#define GRID_SERIALISATION_HDF5_H
|
||||||
|
|
||||||
@ -34,11 +65,13 @@ namespace Grid
|
|||||||
template <typename U>
|
template <typename U>
|
||||||
void writeDefault(const std::string &s, const U &x);
|
void writeDefault(const std::string &s, const U &x);
|
||||||
template <typename U>
|
template <typename U>
|
||||||
typename std::enable_if<element<std::vector<U>>::is_number, void>::type
|
void writeRagged(const std::string &s, const std::vector<U> &x);
|
||||||
|
template <typename U>
|
||||||
|
typename std::enable_if<is_flattenable<std::vector<U>>::value>::type
|
||||||
writeDefault(const std::string &s, const std::vector<U> &x);
|
writeDefault(const std::string &s, const std::vector<U> &x);
|
||||||
template <typename U>
|
template <typename U>
|
||||||
typename std::enable_if<!element<std::vector<U>>::is_number, void>::type
|
typename std::enable_if<!is_flattenable<std::vector<U>>::value>::type
|
||||||
writeDefault(const std::string &s, const std::vector<U> &x);
|
writeDefault(const std::string &s, const std::vector<U> &x) { writeRagged(s, x); }
|
||||||
template <typename U>
|
template <typename U>
|
||||||
void writeMultiDim(const std::string &s, const std::vector<size_t> & Dimensions, const U * pDataRowMajor, size_t NumElements);
|
void writeMultiDim(const std::string &s, const std::vector<size_t> & Dimensions, const U * pDataRowMajor, size_t NumElements);
|
||||||
H5NS::Group & getGroup(void);
|
H5NS::Group & getGroup(void);
|
||||||
@ -64,11 +97,13 @@ namespace Grid
|
|||||||
template <typename U>
|
template <typename U>
|
||||||
void readDefault(const std::string &s, U &output);
|
void readDefault(const std::string &s, U &output);
|
||||||
template <typename U>
|
template <typename U>
|
||||||
typename std::enable_if<element<std::vector<U>>::is_number, void>::type
|
void readRagged(const std::string &s, std::vector<U> &x);
|
||||||
|
template <typename U>
|
||||||
|
typename std::enable_if<is_flattenable<std::vector<U>>::value>::type
|
||||||
readDefault(const std::string &s, std::vector<U> &x);
|
readDefault(const std::string &s, std::vector<U> &x);
|
||||||
template <typename U>
|
template <typename U>
|
||||||
typename std::enable_if<!element<std::vector<U>>::is_number, void>::type
|
typename std::enable_if<!is_flattenable<std::vector<U>>::value>::type
|
||||||
readDefault(const std::string &s, std::vector<U> &x);
|
readDefault(const std::string &s, std::vector<U> &x) { readRagged(s, x); }
|
||||||
template <typename U>
|
template <typename U>
|
||||||
void readMultiDim(const std::string &s, std::vector<U> &buf, std::vector<size_t> &dim);
|
void readMultiDim(const std::string &s, std::vector<U> &buf, std::vector<size_t> &dim);
|
||||||
H5NS::Group & getGroup(void);
|
H5NS::Group & getGroup(void);
|
||||||
@ -176,24 +211,30 @@ namespace Grid
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <typename U>
|
template <typename U>
|
||||||
typename std::enable_if<element<std::vector<U>>::is_number, void>::type
|
typename std::enable_if<is_flattenable<std::vector<U>>::value>::type
|
||||||
Hdf5Writer::writeDefault(const std::string &s, const std::vector<U> &x)
|
Hdf5Writer::writeDefault(const std::string &s, const std::vector<U> &x)
|
||||||
{
|
{
|
||||||
// alias to element type
|
if (isRegularShape(x))
|
||||||
typedef typename element<std::vector<U>>::type Element;
|
{
|
||||||
|
// alias to element type
|
||||||
// flatten the vector and getting dimensions
|
using Scalar = typename is_flattenable<std::vector<U>>::type;
|
||||||
Flatten<std::vector<U>> flat(x);
|
|
||||||
std::vector<size_t> dim;
|
// flatten the vector and getting dimensions
|
||||||
const auto &flatx = flat.getFlatVector();
|
Flatten<std::vector<U>> flat(x);
|
||||||
for (auto &d: flat.getDim())
|
std::vector<size_t> dim;
|
||||||
dim.push_back(d);
|
const auto &flatx = flat.getFlatVector();
|
||||||
writeMultiDim<Element>(s, dim, &flatx[0], flatx.size());
|
for (auto &d: flat.getDim())
|
||||||
|
dim.push_back(d);
|
||||||
|
writeMultiDim<Scalar>(s, dim, &flatx[0], flatx.size());
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
writeRagged(s, x);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename U>
|
template <typename U>
|
||||||
typename std::enable_if<!element<std::vector<U>>::is_number, void>::type
|
void Hdf5Writer::writeRagged(const std::string &s, const std::vector<U> &x)
|
||||||
Hdf5Writer::writeDefault(const std::string &s, const std::vector<U> &x)
|
|
||||||
{
|
{
|
||||||
push(s);
|
push(s);
|
||||||
writeSingleAttribute(x.size(), HDF5_GRID_GUARD "vector_size",
|
writeSingleAttribute(x.size(), HDF5_GRID_GUARD "vector_size",
|
||||||
@ -229,7 +270,7 @@ namespace Grid
|
|||||||
void Hdf5Reader::readMultiDim(const std::string &s, std::vector<U> &buf, std::vector<size_t> &dim)
|
void Hdf5Reader::readMultiDim(const std::string &s, std::vector<U> &buf, std::vector<size_t> &dim)
|
||||||
{
|
{
|
||||||
// alias to element type
|
// alias to element type
|
||||||
typedef typename element<std::vector<U>>::type Element;
|
using Scalar = typename is_flattenable<std::vector<U>>::type;
|
||||||
|
|
||||||
// read the dimensions
|
// read the dimensions
|
||||||
H5NS::DataSpace dataSpace;
|
H5NS::DataSpace dataSpace;
|
||||||
@ -260,37 +301,44 @@ namespace Grid
|
|||||||
H5NS::DataSet dataSet;
|
H5NS::DataSet dataSet;
|
||||||
|
|
||||||
dataSet = group_.openDataSet(s);
|
dataSet = group_.openDataSet(s);
|
||||||
dataSet.read(buf.data(), Hdf5Type<Element>::type());
|
dataSet.read(buf.data(), Hdf5Type<Scalar>::type());
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
H5NS::Attribute attribute;
|
H5NS::Attribute attribute;
|
||||||
|
|
||||||
attribute = group_.openAttribute(s);
|
attribute = group_.openAttribute(s);
|
||||||
attribute.read(Hdf5Type<Element>::type(), buf.data());
|
attribute.read(Hdf5Type<Scalar>::type(), buf.data());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename U>
|
template <typename U>
|
||||||
typename std::enable_if<element<std::vector<U>>::is_number, void>::type
|
typename std::enable_if<is_flattenable<std::vector<U>>::value>::type
|
||||||
Hdf5Reader::readDefault(const std::string &s, std::vector<U> &x)
|
Hdf5Reader::readDefault(const std::string &s, std::vector<U> &x)
|
||||||
{
|
{
|
||||||
// alias to element type
|
if (H5Lexists (group_.getId(), s.c_str(), H5P_DEFAULT) > 0
|
||||||
typedef typename element<std::vector<U>>::type Element;
|
&& H5Aexists_by_name(group_.getId(), s.c_str(), HDF5_GRID_GUARD "vector_size", H5P_DEFAULT ) > 0)
|
||||||
|
{
|
||||||
|
readRagged(s, x);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// alias to element type
|
||||||
|
using Scalar = typename is_flattenable<std::vector<U>>::type;
|
||||||
|
|
||||||
std::vector<size_t> dim;
|
std::vector<size_t> dim;
|
||||||
std::vector<Element> buf;
|
std::vector<Scalar> buf;
|
||||||
readMultiDim( s, buf, dim );
|
readMultiDim( s, buf, dim );
|
||||||
|
|
||||||
// reconstruct the multidimensional vector
|
// reconstruct the multidimensional vector
|
||||||
Reconstruct<std::vector<U>> r(buf, dim);
|
Reconstruct<std::vector<U>> r(buf, dim);
|
||||||
|
|
||||||
x = r.getVector();
|
x = r.getVector();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename U>
|
template <typename U>
|
||||||
typename std::enable_if<!element<std::vector<U>>::is_number, void>::type
|
void Hdf5Reader::readRagged(const std::string &s, std::vector<U> &x)
|
||||||
Hdf5Reader::readDefault(const std::string &s, std::vector<U> &x)
|
|
||||||
{
|
{
|
||||||
uint64_t size;
|
uint64_t size;
|
||||||
|
|
||||||
|
@ -118,13 +118,13 @@ static inline std::string SerialisableClassName(void) {return std::string(#cname
|
|||||||
static constexpr bool isEnum = false; \
|
static constexpr bool isEnum = false; \
|
||||||
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_MEMBER,__VA_ARGS__))\
|
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_MEMBER,__VA_ARGS__))\
|
||||||
template <typename T>\
|
template <typename T>\
|
||||||
static inline void write(Writer<T> &WR,const std::string &s, const cname &obj){ \
|
static inline void write(::Grid::Writer<T> &WR,const std::string &s, const cname &obj){ \
|
||||||
push(WR,s);\
|
push(WR,s);\
|
||||||
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_WRITE_MEMBER,__VA_ARGS__)) \
|
GRID_MACRO_EVAL(GRID_MACRO_MAP(GRID_MACRO_WRITE_MEMBER,__VA_ARGS__)) \
|
||||||
pop(WR);\
|
pop(WR);\
|
||||||
}\
|
}\
|
||||||
template <typename T>\
|
template <typename T>\
|
||||||
static inline void read(Reader<T> &RD,const std::string &s, cname &obj){ \
|
static inline void read(::Grid::Reader<T> &RD,const std::string &s, cname &obj){ \
|
||||||
if (!push(RD,s))\
|
if (!push(RD,s))\
|
||||||
{\
|
{\
|
||||||
std::cout << ::Grid::GridLogWarning << "IO: Cannot open node '" << s << "'" << std::endl; \
|
std::cout << ::Grid::GridLogWarning << "IO: Cannot open node '" << s << "'" << std::endl; \
|
||||||
|
@ -9,7 +9,8 @@
|
|||||||
Author: Antonin Portelli <antonin.portelli@me.com>
|
Author: Antonin Portelli <antonin.portelli@me.com>
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||||
Author: paboyle <paboyle@ph.ed.ac.uk>
|
Author: paboyle <paboyle@ph.ed.ac.uk>
|
||||||
|
Author: Michael Marshall <michael.marshall@ed.ac.uk>
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
This program is free software; you can redistribute it and/or modify
|
||||||
it under the terms of the GNU General Public License as published by
|
it under the terms of the GNU General Public License as published by
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
the Free Software Foundation; either version 2 of the License, or
|
||||||
@ -236,21 +237,36 @@ namespace Grid {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Vector element trait //////////////////////////////////////////////////////
|
// is_flattenable<T>::value is true if T is a std::vector<> which can be flattened //////////////////////
|
||||||
template <typename T>
|
template <typename T, typename V = void>
|
||||||
struct element
|
struct is_flattenable : std::false_type
|
||||||
{
|
{
|
||||||
typedef T type;
|
using type = T;
|
||||||
static constexpr bool is_number = false;
|
using grid_type = T;
|
||||||
|
static constexpr int vecRank = 0;
|
||||||
|
static constexpr bool isGridTensor = false;
|
||||||
|
static constexpr bool children_flattenable = std::is_arithmetic<T>::value or is_complex<T>::value;
|
||||||
};
|
};
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
struct element<std::vector<T>>
|
struct is_flattenable<T, typename std::enable_if<isGridTensor<T>::value>::type> : std::false_type
|
||||||
{
|
{
|
||||||
typedef typename element<T>::type type;
|
using type = typename GridTypeMapper<T>::scalar_type;
|
||||||
static constexpr bool is_number = std::is_arithmetic<T>::value
|
using grid_type = T;
|
||||||
or is_complex<T>::value
|
static constexpr int vecRank = 0;
|
||||||
or element<T>::is_number;
|
static constexpr bool isGridTensor = true;
|
||||||
|
static constexpr bool children_flattenable = true;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
struct is_flattenable<std::vector<T>, typename std::enable_if<is_flattenable<T>::children_flattenable>::type>
|
||||||
|
: std::true_type
|
||||||
|
{
|
||||||
|
using type = typename is_flattenable<T>::type;
|
||||||
|
using grid_type = typename is_flattenable<T>::grid_type;
|
||||||
|
static constexpr bool isGridTensor = is_flattenable<T>::isGridTensor;
|
||||||
|
static constexpr int vecRank = is_flattenable<T>::vecRank + 1;
|
||||||
|
static constexpr bool children_flattenable = true;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Vector flattening utility class ////////////////////////////////////////////
|
// Vector flattening utility class ////////////////////////////////////////////
|
||||||
@ -259,23 +275,30 @@ namespace Grid {
|
|||||||
class Flatten
|
class Flatten
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
typedef typename element<V>::type Element;
|
using Scalar = typename is_flattenable<V>::type;
|
||||||
|
static constexpr bool isGridTensor = is_flattenable<V>::isGridTensor;
|
||||||
public:
|
public:
|
||||||
explicit Flatten(const V &vector);
|
explicit Flatten(const V &vector);
|
||||||
const V & getVector(void);
|
const V & getVector(void) const { return vector_; }
|
||||||
const std::vector<Element> & getFlatVector(void);
|
const std::vector<Scalar> & getFlatVector(void) const { return flatVector_; }
|
||||||
const std::vector<size_t> & getDim(void);
|
const std::vector<size_t> & getDim(void) const { return dim_; }
|
||||||
private:
|
private:
|
||||||
void accumulate(const Element &e);
|
template <typename W> typename std::enable_if<!is_flattenable<W>::value && !is_flattenable<W>::isGridTensor>::type
|
||||||
template <typename W>
|
accumulate(const W &e);
|
||||||
void accumulate(const W &v);
|
template <typename W> typename std::enable_if<!is_flattenable<W>::value && is_flattenable<W>::isGridTensor>::type
|
||||||
void accumulateDim(const Element &e);
|
accumulate(const W &e);
|
||||||
template <typename W>
|
template <typename W> typename std::enable_if< is_flattenable<W>::value>::type
|
||||||
void accumulateDim(const W &v);
|
accumulate(const W &v);
|
||||||
|
template <typename W> typename std::enable_if<!is_flattenable<W>::value && !is_flattenable<W>::isGridTensor>::type
|
||||||
|
accumulateDim(const W &e) {} // Innermost is a scalar - do nothing
|
||||||
|
template <typename W> typename std::enable_if<!is_flattenable<W>::value && is_flattenable<W>::isGridTensor>::type
|
||||||
|
accumulateDim(const W &e);
|
||||||
|
template <typename W> typename std::enable_if< is_flattenable<W>::value>::type
|
||||||
|
accumulateDim(const W &v);
|
||||||
private:
|
private:
|
||||||
const V &vector_;
|
const V &vector_;
|
||||||
std::vector<Element> flatVector_;
|
std::vector<Scalar> flatVector_;
|
||||||
std::vector<size_t> dim_;
|
std::vector<size_t> dim_;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Class to reconstruct a multidimensional std::vector
|
// Class to reconstruct a multidimensional std::vector
|
||||||
@ -283,38 +306,57 @@ namespace Grid {
|
|||||||
class Reconstruct
|
class Reconstruct
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
typedef typename element<V>::type Element;
|
using Scalar = typename is_flattenable<V>::type;
|
||||||
|
static constexpr bool isGridTensor = is_flattenable<V>::isGridTensor;
|
||||||
public:
|
public:
|
||||||
Reconstruct(const std::vector<Element> &flatVector,
|
Reconstruct(const std::vector<Scalar> &flatVector,
|
||||||
const std::vector<size_t> &dim);
|
const std::vector<size_t> &dim);
|
||||||
const V & getVector(void);
|
const V & getVector(void) const { return vector_; }
|
||||||
const std::vector<Element> & getFlatVector(void);
|
const std::vector<Scalar> & getFlatVector(void) const { return flatVector_; }
|
||||||
const std::vector<size_t> & getDim(void);
|
const std::vector<size_t> & getDim(void) const { return dim_; }
|
||||||
private:
|
private:
|
||||||
void fill(std::vector<Element> &v);
|
template <typename W> typename std::enable_if<!is_flattenable<W>::value && !is_flattenable<W>::isGridTensor>::type
|
||||||
template <typename W>
|
fill(W &v);
|
||||||
void fill(W &v);
|
template <typename W> typename std::enable_if<!is_flattenable<W>::value && is_flattenable<W>::isGridTensor>::type
|
||||||
void resize(std::vector<Element> &v, const unsigned int dim);
|
fill(W &v);
|
||||||
template <typename W>
|
template <typename W> typename std::enable_if< is_flattenable<W>::value>::type
|
||||||
void resize(W &v, const unsigned int dim);
|
fill(W &v);
|
||||||
|
template <typename W> typename std::enable_if< is_flattenable<W>::value && is_flattenable<W>::vecRank==1>::type
|
||||||
|
resize(W &v, const unsigned int dim);
|
||||||
|
template <typename W> typename std::enable_if< is_flattenable<W>::value && (is_flattenable<W>::vecRank>1)>::type
|
||||||
|
resize(W &v, const unsigned int dim);
|
||||||
|
template <typename W> typename std::enable_if<!is_flattenable<W>::isGridTensor>::type
|
||||||
|
checkInnermost(const W &e) {} // Innermost is a scalar - do nothing
|
||||||
|
template <typename W> typename std::enable_if< is_flattenable<W>::isGridTensor>::type
|
||||||
|
checkInnermost(const W &e);
|
||||||
private:
|
private:
|
||||||
V vector_;
|
V vector_;
|
||||||
const std::vector<Element> &flatVector_;
|
const std::vector<Scalar> &flatVector_;
|
||||||
std::vector<size_t> dim_;
|
std::vector<size_t> dim_;
|
||||||
size_t ind_{0};
|
size_t ind_{0};
|
||||||
unsigned int dimInd_{0};
|
unsigned int dimInd_{0};
|
||||||
};
|
};
|
||||||
|
|
||||||
// Flatten class template implementation
|
// Flatten class template implementation
|
||||||
template <typename V>
|
template <typename V>
|
||||||
void Flatten<V>::accumulate(const Element &e)
|
template <typename W> typename std::enable_if<!is_flattenable<W>::value && !is_flattenable<W>::isGridTensor>::type
|
||||||
|
Flatten<V>::accumulate(const W &e)
|
||||||
{
|
{
|
||||||
flatVector_.push_back(e);
|
flatVector_.push_back(e);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename V>
|
template <typename V>
|
||||||
template <typename W>
|
template <typename W> typename std::enable_if<!is_flattenable<W>::value && is_flattenable<W>::isGridTensor>::type
|
||||||
void Flatten<V>::accumulate(const W &v)
|
Flatten<V>::accumulate(const W &e)
|
||||||
|
{
|
||||||
|
for (const Scalar &x: e) {
|
||||||
|
flatVector_.push_back(x);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename V>
|
||||||
|
template <typename W> typename std::enable_if<is_flattenable<W>::value>::type
|
||||||
|
Flatten<V>::accumulate(const W &v)
|
||||||
{
|
{
|
||||||
for (auto &e: v)
|
for (auto &e: v)
|
||||||
{
|
{
|
||||||
@ -323,11 +365,17 @@ namespace Grid {
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <typename V>
|
template <typename V>
|
||||||
void Flatten<V>::accumulateDim(const Element &e) {};
|
template <typename W> typename std::enable_if<!is_flattenable<W>::value && is_flattenable<W>::isGridTensor>::type
|
||||||
|
Flatten<V>::accumulateDim(const W &e)
|
||||||
|
{
|
||||||
|
using Traits = GridTypeMapper<typename is_flattenable<W>::grid_type>;
|
||||||
|
for (int rank=0; rank < Traits::Rank; ++rank)
|
||||||
|
dim_.push_back(Traits::Dimension(rank));
|
||||||
|
}
|
||||||
|
|
||||||
template <typename V>
|
template <typename V>
|
||||||
template <typename W>
|
template <typename W> typename std::enable_if<is_flattenable<W>::value>::type
|
||||||
void Flatten<V>::accumulateDim(const W &v)
|
Flatten<V>::accumulateDim(const W &v)
|
||||||
{
|
{
|
||||||
dim_.push_back(v.size());
|
dim_.push_back(v.size());
|
||||||
accumulateDim(v[0]);
|
accumulateDim(v[0]);
|
||||||
@ -337,42 +385,36 @@ namespace Grid {
|
|||||||
Flatten<V>::Flatten(const V &vector)
|
Flatten<V>::Flatten(const V &vector)
|
||||||
: vector_(vector)
|
: vector_(vector)
|
||||||
{
|
{
|
||||||
accumulate(vector_);
|
|
||||||
accumulateDim(vector_);
|
accumulateDim(vector_);
|
||||||
}
|
std::size_t TotalSize{ dim_[0] };
|
||||||
|
for (int i = 1; i < dim_.size(); ++i) {
|
||||||
template <typename V>
|
TotalSize *= dim_[i];
|
||||||
const V & Flatten<V>::getVector(void)
|
}
|
||||||
{
|
flatVector_.reserve(TotalSize);
|
||||||
return vector_;
|
accumulate(vector_);
|
||||||
}
|
|
||||||
|
|
||||||
template <typename V>
|
|
||||||
const std::vector<typename Flatten<V>::Element> &
|
|
||||||
Flatten<V>::getFlatVector(void)
|
|
||||||
{
|
|
||||||
return flatVector_;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename V>
|
|
||||||
const std::vector<size_t> & Flatten<V>::getDim(void)
|
|
||||||
{
|
|
||||||
return dim_;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Reconstruct class template implementation
|
// Reconstruct class template implementation
|
||||||
template <typename V>
|
template <typename V>
|
||||||
void Reconstruct<V>::fill(std::vector<Element> &v)
|
template <typename W> typename std::enable_if<!is_flattenable<W>::value && !is_flattenable<W>::isGridTensor>::type
|
||||||
|
Reconstruct<V>::fill(W &v)
|
||||||
|
{
|
||||||
|
v = flatVector_[ind_++];
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename V>
|
||||||
|
template <typename W> typename std::enable_if<!is_flattenable<W>::value && is_flattenable<W>::isGridTensor>::type
|
||||||
|
Reconstruct<V>::fill(W &v)
|
||||||
{
|
{
|
||||||
for (auto &e: v)
|
for (auto &e: v)
|
||||||
{
|
{
|
||||||
e = flatVector_[ind_++];
|
e = flatVector_[ind_++];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename V>
|
template <typename V>
|
||||||
template <typename W>
|
template <typename W> typename std::enable_if<is_flattenable<W>::value>::type
|
||||||
void Reconstruct<V>::fill(W &v)
|
Reconstruct<V>::fill(W &v)
|
||||||
{
|
{
|
||||||
for (auto &e: v)
|
for (auto &e: v)
|
||||||
{
|
{
|
||||||
@ -381,14 +423,15 @@ namespace Grid {
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <typename V>
|
template <typename V>
|
||||||
void Reconstruct<V>::resize(std::vector<Element> &v, const unsigned int dim)
|
template <typename W> typename std::enable_if<is_flattenable<W>::value && is_flattenable<W>::vecRank==1>::type
|
||||||
|
Reconstruct<V>::resize(W &v, const unsigned int dim)
|
||||||
{
|
{
|
||||||
v.resize(dim_[dim]);
|
v.resize(dim_[dim]);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename V>
|
template <typename V>
|
||||||
template <typename W>
|
template <typename W> typename std::enable_if<is_flattenable<W>::value && (is_flattenable<W>::vecRank>1)>::type
|
||||||
void Reconstruct<V>::resize(W &v, const unsigned int dim)
|
Reconstruct<V>::resize(W &v, const unsigned int dim)
|
||||||
{
|
{
|
||||||
v.resize(dim_[dim]);
|
v.resize(dim_[dim]);
|
||||||
for (auto &e: v)
|
for (auto &e: v)
|
||||||
@ -398,34 +441,31 @@ namespace Grid {
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <typename V>
|
template <typename V>
|
||||||
Reconstruct<V>::Reconstruct(const std::vector<Element> &flatVector,
|
template <typename W> typename std::enable_if<is_flattenable<W>::isGridTensor>::type
|
||||||
|
Reconstruct<V>::checkInnermost(const W &)
|
||||||
|
{
|
||||||
|
using Traits = GridTypeMapper<typename is_flattenable<W>::grid_type>;
|
||||||
|
const int gridRank{Traits::Rank};
|
||||||
|
const int dimRank{static_cast<int>(dim_.size())};
|
||||||
|
assert(dimRank >= gridRank && "Tensor rank too low for Grid tensor");
|
||||||
|
for (int i=0; i<gridRank; ++i) {
|
||||||
|
assert(dim_[dimRank - gridRank + i] == Traits::Dimension(i) && "Tensor dimension doesn't match Grid tensor");
|
||||||
|
}
|
||||||
|
dim_.resize(dimRank - gridRank);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename V>
|
||||||
|
Reconstruct<V>::Reconstruct(const std::vector<Scalar> &flatVector,
|
||||||
const std::vector<size_t> &dim)
|
const std::vector<size_t> &dim)
|
||||||
: flatVector_(flatVector)
|
: flatVector_(flatVector)
|
||||||
, dim_(dim)
|
, dim_(dim)
|
||||||
{
|
{
|
||||||
|
checkInnermost(vector_);
|
||||||
|
assert(dim_.size() == is_flattenable<V>::vecRank && "Tensor rank doesn't match nested std::vector rank");
|
||||||
resize(vector_, 0);
|
resize(vector_, 0);
|
||||||
fill(vector_);
|
fill(vector_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename V>
|
|
||||||
const V & Reconstruct<V>::getVector(void)
|
|
||||||
{
|
|
||||||
return vector_;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename V>
|
|
||||||
const std::vector<typename Reconstruct<V>::Element> &
|
|
||||||
Reconstruct<V>::getFlatVector(void)
|
|
||||||
{
|
|
||||||
return flatVector_;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename V>
|
|
||||||
const std::vector<size_t> & Reconstruct<V>::getDim(void)
|
|
||||||
{
|
|
||||||
return dim_;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Vector IO utilities ///////////////////////////////////////////////////////
|
// Vector IO utilities ///////////////////////////////////////////////////////
|
||||||
// helper function to read space-separated values
|
// helper function to read space-separated values
|
||||||
template <typename T>
|
template <typename T>
|
||||||
@ -459,6 +499,64 @@ namespace Grid {
|
|||||||
|
|
||||||
return os;
|
return os;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// In general, scalar types are considered "flattenable" (regularly shaped)
|
||||||
|
template <typename T>
|
||||||
|
bool isRegularShapeHelper(const std::vector<T> &, std::vector<std::size_t> &, int, bool)
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
bool isRegularShapeHelper(const std::vector<std::vector<T>> &v, std::vector<std::size_t> &Dims, int Depth, bool bFirst)
|
||||||
|
{
|
||||||
|
if( bFirst)
|
||||||
|
{
|
||||||
|
assert( Dims.size() == Depth && "Bug: Delete this message after testing" );
|
||||||
|
Dims.push_back(v[0].size());
|
||||||
|
if (!Dims[Depth])
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
assert( Dims.size() >= Depth + 1 && "Bug: Delete this message after testing" );
|
||||||
|
}
|
||||||
|
for (std::size_t i = 0; i < v.size(); ++i)
|
||||||
|
{
|
||||||
|
if (v[i].size() != Dims[Depth] || !isRegularShapeHelper(v[i], Dims, Depth + 1, bFirst && i==0))
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
bool isRegularShape(const T &t) { return true; }
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
bool isRegularShape(const std::vector<T> &v) { return !v.empty(); }
|
||||||
|
|
||||||
|
// Return non-zero if all dimensions of this std::vector<std::vector<T>> are regularly shaped
|
||||||
|
template <typename T>
|
||||||
|
bool isRegularShape(const std::vector<std::vector<T>> &v)
|
||||||
|
{
|
||||||
|
if (v.empty() || v[0].empty())
|
||||||
|
return false;
|
||||||
|
// Make sure all of my rows are the same size
|
||||||
|
std::vector<std::size_t> Dims;
|
||||||
|
Dims.reserve(is_flattenable<T>::vecRank);
|
||||||
|
Dims.push_back(v.size());
|
||||||
|
Dims.push_back(v[0].size());
|
||||||
|
for (std::size_t i = 0; i < Dims[0]; ++i)
|
||||||
|
{
|
||||||
|
if (v[i].size() != Dims[1] || !isRegularShapeHelper(v[i], Dims, 2, i==0))
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// helper function to read space-separated values
|
// helper function to read space-separated values
|
||||||
|
@ -417,7 +417,7 @@ public:
|
|||||||
stream << "{";
|
stream << "{";
|
||||||
for (int j = 0; j < N; j++) {
|
for (int j = 0; j < N; j++) {
|
||||||
stream << o._internal[i][j];
|
stream << o._internal[i][j];
|
||||||
if (i < N - 1) stream << ",";
|
if (j < N - 1) stream << ",";
|
||||||
}
|
}
|
||||||
stream << "}";
|
stream << "}";
|
||||||
if (i != N - 1) stream << "\n\t\t";
|
if (i != N - 1) stream << "\n\t\t";
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
/*************************************************************************************
|
/*************************************************************************************
|
||||||
n
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
Grid physics library, www.github.com/paboyle/Grid
|
||||||
|
|
||||||
Source file: ./lib/tensors/Tensor_extract_merge.h
|
Source file: ./lib/tensors/Tensor_extract_merge.h
|
||||||
@ -153,7 +153,7 @@ void insertLane(int lane, vobj & __restrict__ vec,const typename vobj::scalar_ob
|
|||||||
// Extract to a bunch of scalar object pointers of different scalar type, with offset. Useful for precision change
|
// Extract to a bunch of scalar object pointers of different scalar type, with offset. Useful for precision change
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
template<class vobj, class sobj> accelerator
|
template<class vobj, class sobj> accelerator
|
||||||
void extract(const vobj &vec,const ExtractPointerArray<sobj> &extracted, int offset)
|
void extract(const vobj &vec,ExtractPointerArray<sobj> &extracted, int offset)
|
||||||
{
|
{
|
||||||
typedef typename GridTypeMapper<sobj>::scalar_type sobj_scalar_type;
|
typedef typename GridTypeMapper<sobj>::scalar_type sobj_scalar_type;
|
||||||
typedef typename GridTypeMapper<vobj>::scalar_type scalar_type;
|
typedef typename GridTypeMapper<vobj>::scalar_type scalar_type;
|
||||||
@ -181,7 +181,7 @@ void extract(const vobj &vec,const ExtractPointerArray<sobj> &extracted, int off
|
|||||||
// Merge bunch of scalar object pointers of different scalar type, with offset. Useful for precision change
|
// Merge bunch of scalar object pointers of different scalar type, with offset. Useful for precision change
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
template<class vobj, class sobj> accelerator
|
template<class vobj, class sobj> accelerator
|
||||||
void merge(vobj &vec,const ExtractPointerArray<sobj> &extracted, int offset)
|
void merge(vobj &vec,ExtractPointerArray<sobj> &extracted, int offset)
|
||||||
{
|
{
|
||||||
typedef typename GridTypeMapper<sobj>::scalar_type sobj_scalar_type;
|
typedef typename GridTypeMapper<sobj>::scalar_type sobj_scalar_type;
|
||||||
typedef typename GridTypeMapper<vobj>::scalar_type scalar_type;
|
typedef typename GridTypeMapper<vobj>::scalar_type scalar_type;
|
||||||
|
@ -171,6 +171,7 @@ void acceleratorInit(void)
|
|||||||
#ifdef GRID_SYCL
|
#ifdef GRID_SYCL
|
||||||
|
|
||||||
cl::sycl::queue *theGridAccelerator;
|
cl::sycl::queue *theGridAccelerator;
|
||||||
|
|
||||||
void acceleratorInit(void)
|
void acceleratorInit(void)
|
||||||
{
|
{
|
||||||
int nDevices = 1;
|
int nDevices = 1;
|
||||||
@ -178,10 +179,6 @@ void acceleratorInit(void)
|
|||||||
cl::sycl::device selectedDevice { selector };
|
cl::sycl::device selectedDevice { selector };
|
||||||
theGridAccelerator = new sycl::queue (selectedDevice);
|
theGridAccelerator = new sycl::queue (selectedDevice);
|
||||||
|
|
||||||
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
|
|
||||||
zeInit(0);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
char * localRankStr = NULL;
|
char * localRankStr = NULL;
|
||||||
int rank = 0, world_rank=0;
|
int rank = 0, world_rank=0;
|
||||||
#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
|
#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
|
||||||
|
@ -39,10 +39,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
#ifdef HAVE_MM_MALLOC_H
|
#ifdef HAVE_MM_MALLOC_H
|
||||||
#include <mm_malloc.h>
|
#include <mm_malloc.h>
|
||||||
#endif
|
#endif
|
||||||
#ifdef __APPLE__
|
|
||||||
// no memalign
|
|
||||||
inline void *memalign(size_t align, size_t bytes) { return malloc(bytes); }
|
|
||||||
#endif
|
|
||||||
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
@ -198,9 +194,6 @@ inline void *acceleratorAllocShared(size_t bytes)
|
|||||||
ptr = (void *) NULL;
|
ptr = (void *) NULL;
|
||||||
printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
|
printf(" cudaMallocManaged failed for %d %s \n",bytes,cudaGetErrorString(err));
|
||||||
}
|
}
|
||||||
size_t free,total;
|
|
||||||
cudaMemGetInfo(&free,&total);
|
|
||||||
std::cout<<"cudaMemGetInfo "<<free<<" / "<<total<<std::endl;
|
|
||||||
return ptr;
|
return ptr;
|
||||||
};
|
};
|
||||||
inline void *acceleratorAllocDevice(size_t bytes)
|
inline void *acceleratorAllocDevice(size_t bytes)
|
||||||
@ -211,9 +204,6 @@ inline void *acceleratorAllocDevice(size_t bytes)
|
|||||||
ptr = (void *) NULL;
|
ptr = (void *) NULL;
|
||||||
printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
|
printf(" cudaMalloc failed for %d %s \n",bytes,cudaGetErrorString(err));
|
||||||
}
|
}
|
||||||
size_t free,total;
|
|
||||||
cudaMemGetInfo(&free,&total);
|
|
||||||
std::cout<<"cudaMemGetInfo "<<free<<" / "<<total<<std::endl;
|
|
||||||
return ptr;
|
return ptr;
|
||||||
};
|
};
|
||||||
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
|
inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
|
||||||
@ -243,13 +233,6 @@ inline int acceleratorIsCommunicable(void *ptr)
|
|||||||
NAMESPACE_END(Grid);
|
NAMESPACE_END(Grid);
|
||||||
#include <CL/sycl.hpp>
|
#include <CL/sycl.hpp>
|
||||||
#include <CL/sycl/usm.hpp>
|
#include <CL/sycl/usm.hpp>
|
||||||
|
|
||||||
#define GRID_SYCL_LEVEL_ZERO_IPC
|
|
||||||
|
|
||||||
#ifdef GRID_SYCL_LEVEL_ZERO_IPC
|
|
||||||
#include <level_zero/ze_api.h>
|
|
||||||
#include <CL/sycl/backend/level_zero.hpp>
|
|
||||||
#endif
|
|
||||||
NAMESPACE_BEGIN(Grid);
|
NAMESPACE_BEGIN(Grid);
|
||||||
|
|
||||||
extern cl::sycl::queue *theGridAccelerator;
|
extern cl::sycl::queue *theGridAccelerator;
|
||||||
@ -274,14 +257,11 @@ accelerator_inline int acceleratorSIMTlane(int Nsimd) {
|
|||||||
unsigned long nt=acceleratorThreads(); \
|
unsigned long nt=acceleratorThreads(); \
|
||||||
unsigned long unum1 = num1; \
|
unsigned long unum1 = num1; \
|
||||||
unsigned long unum2 = num2; \
|
unsigned long unum2 = num2; \
|
||||||
if(nt < 8)nt=8; \
|
|
||||||
cl::sycl::range<3> local {nt,1,nsimd}; \
|
cl::sycl::range<3> local {nt,1,nsimd}; \
|
||||||
cl::sycl::range<3> global{unum1,unum2,nsimd}; \
|
cl::sycl::range<3> global{unum1,unum2,nsimd}; \
|
||||||
cgh.parallel_for<class dslash>( \
|
cgh.parallel_for<class dslash>( \
|
||||||
cl::sycl::nd_range<3>(global,local), \
|
cl::sycl::nd_range<3>(global,local), \
|
||||||
[=] (cl::sycl::nd_item<3> item) /*mutable*/ \
|
[=] (cl::sycl::nd_item<3> item) /*mutable*/ { \
|
||||||
[[intel::reqd_sub_group_size(8)]] \
|
|
||||||
{ \
|
|
||||||
auto iter1 = item.get_global_id(0); \
|
auto iter1 = item.get_global_id(0); \
|
||||||
auto iter2 = item.get_global_id(1); \
|
auto iter2 = item.get_global_id(1); \
|
||||||
auto lane = item.get_global_id(2); \
|
auto lane = item.get_global_id(2); \
|
||||||
@ -429,8 +409,6 @@ inline void acceleratorMemSet(void *base,int value,size_t bytes) { hipMemset(bas
|
|||||||
|
|
||||||
#undef GRID_SIMT
|
#undef GRID_SIMT
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#define accelerator
|
#define accelerator
|
||||||
#define accelerator_inline strong_inline
|
#define accelerator_inline strong_inline
|
||||||
#define accelerator_for(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ });
|
#define accelerator_for(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ });
|
||||||
|
@ -56,8 +56,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
static int
|
static int
|
||||||
feenableexcept (unsigned int excepts)
|
feenableexcept (unsigned int excepts)
|
||||||
{
|
{
|
||||||
#if 0
|
|
||||||
// Fails on Apple M1
|
|
||||||
static fenv_t fenv;
|
static fenv_t fenv;
|
||||||
unsigned int new_excepts = excepts & FE_ALL_EXCEPT;
|
unsigned int new_excepts = excepts & FE_ALL_EXCEPT;
|
||||||
unsigned int old_excepts; // previous masks
|
unsigned int old_excepts; // previous masks
|
||||||
@ -72,8 +70,6 @@ feenableexcept (unsigned int excepts)
|
|||||||
|
|
||||||
iold_excepts = (int) old_excepts;
|
iold_excepts = (int) old_excepts;
|
||||||
return ( fesetenv (&fenv) ? -1 : iold_excepts );
|
return ( fesetenv (&fenv) ? -1 : iold_excepts );
|
||||||
#endif
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -167,13 +163,6 @@ void GridCmdOptionInt(std::string &str,int & val)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// ypj [add]
|
|
||||||
void GridCmdOptionFloat(std::string &str,double & val)
|
|
||||||
{
|
|
||||||
std::stringstream ss(str);
|
|
||||||
ss>>val;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
void GridParseLayout(char **argv,int argc,
|
void GridParseLayout(char **argv,int argc,
|
||||||
Coordinate &latt_c,
|
Coordinate &latt_c,
|
||||||
|
@ -57,8 +57,7 @@ void GridCmdOptionCSL(std::string str,std::vector<std::string> & vec);
|
|||||||
template<class VectorInt>
|
template<class VectorInt>
|
||||||
void GridCmdOptionIntVector(const std::string &str,VectorInt & vec);
|
void GridCmdOptionIntVector(const std::string &str,VectorInt & vec);
|
||||||
void GridCmdOptionInt(std::string &str,int & val);
|
void GridCmdOptionInt(std::string &str,int & val);
|
||||||
// ypj [add]
|
|
||||||
void GridCmdOptionFloat(std::string &str,double & val);
|
|
||||||
|
|
||||||
void GridParseLayout(char **argv,int argc,
|
void GridParseLayout(char **argv,int argc,
|
||||||
std::vector<int> &latt,
|
std::vector<int> &latt,
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
# additional include paths necessary to compile the C++ library
|
# additional include paths necessary to compile the C++ library
|
||||||
SUBDIRS = Grid HMC benchmarks tests examples
|
SUBDIRS = Grid HMC benchmarks tests
|
||||||
|
|
||||||
include $(top_srcdir)/doxygen.inc
|
include $(top_srcdir)/doxygen.inc
|
||||||
|
|
||||||
|
@ -133,30 +133,34 @@ public:
|
|||||||
|
|
||||||
std::vector<HalfSpinColourVectorD *> xbuf(8);
|
std::vector<HalfSpinColourVectorD *> xbuf(8);
|
||||||
std::vector<HalfSpinColourVectorD *> rbuf(8);
|
std::vector<HalfSpinColourVectorD *> rbuf(8);
|
||||||
//Grid.ShmBufferFreeAll();
|
Grid.ShmBufferFreeAll();
|
||||||
uint64_t bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
|
|
||||||
for(int d=0;d<8;d++){
|
for(int d=0;d<8;d++){
|
||||||
xbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
|
xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
|
||||||
rbuf[d] = (HalfSpinColourVectorD *)acceleratorAllocDevice(bytes);
|
rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
|
||||||
// bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
|
// bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
|
||||||
// bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
|
// bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
|
||||||
int ncomm;
|
int ncomm;
|
||||||
double dbytes;
|
double dbytes;
|
||||||
|
std::vector<double> times(Nloop);
|
||||||
|
for(int i=0;i<Nloop;i++){
|
||||||
|
|
||||||
for(int dir=0;dir<8;dir++) {
|
double start=usecond();
|
||||||
int mu =dir % 4;
|
|
||||||
if (mpi_layout[mu]>1 ) {
|
|
||||||
|
|
||||||
std::vector<double> times(Nloop);
|
dbytes=0;
|
||||||
for(int i=0;i<Nloop;i++){
|
ncomm=0;
|
||||||
|
|
||||||
dbytes=0;
|
thread_for(dir,8,{
|
||||||
double start=usecond();
|
|
||||||
|
double tbytes;
|
||||||
|
int mu =dir % 4;
|
||||||
|
|
||||||
|
if (mpi_layout[mu]>1 ) {
|
||||||
|
|
||||||
int xmit_to_rank;
|
int xmit_to_rank;
|
||||||
int recv_from_rank;
|
int recv_from_rank;
|
||||||
|
|
||||||
if ( dir == mu ) {
|
if ( dir == mu ) {
|
||||||
int comm_proc=1;
|
int comm_proc=1;
|
||||||
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
||||||
@ -164,38 +168,40 @@ public:
|
|||||||
int comm_proc = mpi_layout[mu]-1;
|
int comm_proc = mpi_layout[mu]-1;
|
||||||
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
|
||||||
}
|
}
|
||||||
Grid.SendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
|
tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
|
||||||
(void *)&rbuf[dir][0], recv_from_rank,
|
(void *)&rbuf[dir][0], recv_from_rank,
|
||||||
bytes);
|
bytes,dir);
|
||||||
dbytes+=bytes;
|
thread_critical {
|
||||||
|
ncomm++;
|
||||||
double stop=usecond();
|
dbytes+=tbytes;
|
||||||
t_time[i] = stop-start; // microseconds
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
timestat.statistics(t_time);
|
});
|
||||||
|
Grid.Barrier();
|
||||||
dbytes=dbytes*ppn;
|
double stop=usecond();
|
||||||
double xbytes = dbytes*0.5;
|
t_time[i] = stop-start; // microseconds
|
||||||
double bidibytes = dbytes;
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage << lat<<"\t"<<Ls<<"\t "
|
|
||||||
<< bytes << " \t "
|
|
||||||
<<xbytes/timestat.mean<<" \t "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " \t "
|
|
||||||
<<xbytes/timestat.max <<" "<< xbytes/timestat.min
|
|
||||||
<< "\t\t"<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
|
|
||||||
<< bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
for(int d=0;d<8;d++){
|
|
||||||
acceleratorFreeDevice(xbuf[d]);
|
timestat.statistics(t_time);
|
||||||
acceleratorFreeDevice(rbuf[d]);
|
|
||||||
}
|
dbytes=dbytes*ppn;
|
||||||
}
|
double xbytes = dbytes*0.5;
|
||||||
}
|
// double rbytes = dbytes*0.5;
|
||||||
|
double bidibytes = dbytes;
|
||||||
|
|
||||||
|
std::cout<<GridLogMessage << lat<<"\t"<<Ls<<"\t "
|
||||||
|
<< bytes << " \t "
|
||||||
|
<<xbytes/timestat.mean<<" \t "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " \t "
|
||||||
|
<<xbytes/timestat.max <<" "<< xbytes/timestat.min
|
||||||
|
<< "\t\t"<< bidibytes/timestat.mean<< " " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
|
||||||
|
<< bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
static void Memory(void)
|
static void Memory(void)
|
||||||
@ -275,6 +281,7 @@ public:
|
|||||||
|
|
||||||
|
|
||||||
uint64_t lmax=32;
|
uint64_t lmax=32;
|
||||||
|
#define NLOOP (1000*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
|
||||||
|
|
||||||
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
|
GridSerialRNG sRNG; sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
|
||||||
for(int lat=8;lat<=lmax;lat+=8){
|
for(int lat=8;lat<=lmax;lat+=8){
|
||||||
@ -438,7 +445,7 @@ public:
|
|||||||
// 1344= 3*(2*8+6)*2*8 + 8*3*2*2 + 3*4*2*8
|
// 1344= 3*(2*8+6)*2*8 + 8*3*2*2 + 3*4*2*8
|
||||||
// 1344 = Nc* (6+(Nc-1)*8)*2*Nd + Nd*Nc*2*2 + Nd*Nc*Ns*2
|
// 1344 = Nc* (6+(Nc-1)*8)*2*Nd + Nd*Nc*2*2 + Nd*Nc*Ns*2
|
||||||
// double flops=(1344.0*volume)/2;
|
// double flops=(1344.0*volume)/2;
|
||||||
#if 0
|
#if 1
|
||||||
double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + Nd*Nc*Ns + Nd*Nc*Ns*2;
|
double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + Nd*Nc*Ns + Nd*Nc*Ns*2;
|
||||||
#else
|
#else
|
||||||
double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + 2*Nd*Nc*Ns + 2*Nd*Nc*Ns*2;
|
double fps = Nc* (6+(Nc-1)*8)*Ns*Nd + 2*Nd*Nc*Ns + 2*Nd*Nc*Ns*2;
|
||||||
@ -709,12 +716,12 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
if ( do_su4 ) {
|
if ( do_su4 ) {
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << " SU(4) benchmark " <<std::endl;
|
std::cout<<GridLogMessage << " Memory benchmark " <<std::endl;
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
Benchmark::SU4();
|
Benchmark::SU4();
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( do_comms ) {
|
if ( do_comms && (NN>1) ) {
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
std::cout<<GridLogMessage << " Communications benchmark " <<std::endl;
|
std::cout<<GridLogMessage << " Communications benchmark " <<std::endl;
|
||||||
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
|
||||||
|
@ -815,7 +815,6 @@ AC_CONFIG_FILES(tests/smearing/Makefile)
|
|||||||
AC_CONFIG_FILES(tests/qdpxx/Makefile)
|
AC_CONFIG_FILES(tests/qdpxx/Makefile)
|
||||||
AC_CONFIG_FILES(tests/testu01/Makefile)
|
AC_CONFIG_FILES(tests/testu01/Makefile)
|
||||||
AC_CONFIG_FILES(benchmarks/Makefile)
|
AC_CONFIG_FILES(benchmarks/Makefile)
|
||||||
AC_CONFIG_FILES(examples/Makefile)
|
|
||||||
AC_OUTPUT
|
AC_OUTPUT
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
|
Binary file not shown.
@ -1787,7 +1787,7 @@ Hdf5Writer Hdf5Reader HDF5
|
|||||||
|
|
||||||
Write interfaces, similar to the XML facilities in QDP++ are presented. However,
|
Write interfaces, similar to the XML facilities in QDP++ are presented. However,
|
||||||
the serialisation routines are automatically generated by the macro, and a virtual
|
the serialisation routines are automatically generated by the macro, and a virtual
|
||||||
reader adn writer interface enables writing to any of a number of formats.
|
reader and writer interface enables writing to any of a number of formats.
|
||||||
|
|
||||||
**Example**::
|
**Example**::
|
||||||
|
|
||||||
@ -1814,6 +1814,91 @@ reader adn writer interface enables writing to any of a number of formats.
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Eigen tensor support -- added 2019H1
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
The Serialisation library was expanded in 2019 to support de/serialisation of
|
||||||
|
Eigen tensors. De/serialisation of existing types was not changed. Data files
|
||||||
|
without Eigen tensors remain compatible with earlier versions of Grid and other readers.
|
||||||
|
Conversely, data files containing serialised Eigen tensors is a breaking change.
|
||||||
|
|
||||||
|
Eigen tensor serialisation support was added to BaseIO, which was modified to provide a Traits class
|
||||||
|
to recognise Eigen tensors with elements that are either: primitive scalars (arithmetic and complex types);
|
||||||
|
or Grid tensors.
|
||||||
|
|
||||||
|
**Traits determining de/serialisable scalars**::
|
||||||
|
|
||||||
|
// Is this an Eigen tensor
|
||||||
|
template<typename T> struct is_tensor : std::integral_constant<bool,
|
||||||
|
std::is_base_of<Eigen::TensorBase<T, Eigen::ReadOnlyAccessors>, T>::value> {};
|
||||||
|
// Is this an Eigen tensor of a supported scalar
|
||||||
|
template<typename T, typename V = void> struct is_tensor_of_scalar : public std::false_type {};
|
||||||
|
template<typename T> struct is_tensor_of_scalar<T, typename std::enable_if<is_tensor<T>::value && is_scalar<typename T::Scalar>::value>::type> : public std::true_type {};
|
||||||
|
// Is this an Eigen tensor of a supported container
|
||||||
|
template<typename T, typename V = void> struct is_tensor_of_container : public std::false_type {};
|
||||||
|
template<typename T> struct is_tensor_of_container<T, typename std::enable_if<is_tensor<T>::value && isGridTensor<typename T::Scalar>::value>::type> : public std::true_type {};
|
||||||
|
|
||||||
|
|
||||||
|
Eigen tensors are regular, multidimensional objects, and each Reader/Writer
|
||||||
|
was extended to support this new datatype. Where the Eigen tensor contains
|
||||||
|
a Grid tensor, the dimensions of the data written are the dimensions of the
|
||||||
|
Eigen tensor plus the dimensions of the underlying Grid scalar. Dimensions
|
||||||
|
of size 1 are preserved.
|
||||||
|
|
||||||
|
**New Reader/Writer methods for multi-dimensional data**::
|
||||||
|
|
||||||
|
template <typename U>
|
||||||
|
void readMultiDim(const std::string &s, std::vector<U> &buf, std::vector<size_t> &dim);
|
||||||
|
template <typename U>
|
||||||
|
void writeMultiDim(const std::string &s, const std::vector<size_t> & Dimensions, const U * pDataRowMajor, size_t NumElements);
|
||||||
|
|
||||||
|
|
||||||
|
On readback, the Eigen tensor rank must match the data being read, but the tensor
|
||||||
|
dimensions will be resized if necessary. Resizing is not possible for Eigen::TensorMap<T>
|
||||||
|
because these tensors use a buffer provided at construction, and this buffer cannot be changed.
|
||||||
|
Deserialisation failures cause Grid to assert.
|
||||||
|
|
||||||
|
|
||||||
|
HDF5 Optimisations -- added June 2021
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
Grid serialisation is intended to be light, deterministic and provide a layer of abstraction over
|
||||||
|
multiple file formats. HDF5 excels at handling multi-dimensional data, and the Grid HDF5Reader/HDF5Writer exploits this.
|
||||||
|
When serialising nested ``std::vector<T>``, where ``T`` is an arithmetic or complex type,
|
||||||
|
the Hdf5Writer writes the data as an Hdf5 DataSet object.
|
||||||
|
|
||||||
|
However, nested ``std::vector<std::vector<...T>>`` might be "ragged", i.e. not necessarily regular. E.g. a 3d nested
|
||||||
|
``std::vector`` might contain 2 rows, the first being a 2x2 block and the second row being a 1 x 2 block.
|
||||||
|
A bug existed whereby this was not checked on write, so nested, ragged vectors
|
||||||
|
were written as a regular dataset, with a buffer under/overrun and jumbled contents.
|
||||||
|
|
||||||
|
Clearly this was not used in production, as the bug went undetected until now. Fixing this bug
|
||||||
|
is an opportunity to further optimise the HDF5 file format.
|
||||||
|
|
||||||
|
The goals of this change are to:
|
||||||
|
|
||||||
|
* Make changes to the Hdf5 file format only -- i.e. do not impact other file formats
|
||||||
|
|
||||||
|
* Implement file format changes in such a way that they are transparent to the Grid reader
|
||||||
|
|
||||||
|
* Correct the bug for ragged vectors of numeric / complex types
|
||||||
|
|
||||||
|
* Extend the support of nested std::vector<T> to arbitrarily nested Grid tensors
|
||||||
|
|
||||||
|
|
||||||
|
The trait class ``element`` has been redefined to ``is_flattenable``, which is a trait class for
|
||||||
|
potentially "flattenable" objects. These are (possibly nested) ``std::vector<T>`` where ``T`` is
|
||||||
|
an arithmetic, complex or Grid tensor type. Flattenable objects are tested on write
|
||||||
|
(with the function ``isRegularShape``) to see whether they actually are regular.
|
||||||
|
|
||||||
|
Flattenable, regular objects are written to a multidimensional HDF5 DataSet.
|
||||||
|
Otherwise, an Hdf5 sub group is created with the object "name", and each element of the outer dimension is
|
||||||
|
recursively written to as object "name_n", where n is a 0-indexed number.
|
||||||
|
|
||||||
|
On readback (by Grid)), the presence of a subgroup containing the attribute ``Grid_vector_size`` triggers a
|
||||||
|
"ragged read", otherwise a read from a DataSet is attempted.
|
||||||
|
|
||||||
|
|
||||||
Data parallel field IO
|
Data parallel field IO
|
||||||
-----------------------
|
-----------------------
|
||||||
|
|
||||||
|
@ -1,396 +0,0 @@
|
|||||||
#include <Grid/Grid.h>
|
|
||||||
using namespace Grid;
|
|
||||||
|
|
||||||
/*
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Grid/algorithms/SparseMatrix.h: Interface defining what I expect of a general sparse matrix, such as a Fermion action
|
|
||||||
/////////////////////////////////////////////////////////////////////////////////////////////
|
|
||||||
template<class Field> class SparseMatrixBase {
|
|
||||||
public:
|
|
||||||
virtual GridBase *Grid(void) =0;
|
|
||||||
|
|
||||||
virtual void M (const Field &in, Field &out)=0;
|
|
||||||
virtual void Mdag (const Field &in, Field &out)=0;
|
|
||||||
virtual void MdagM(const Field &in, Field &out) {
|
|
||||||
Field tmp (in.Grid());
|
|
||||||
M(in,tmp);
|
|
||||||
Mdag(tmp,out);
|
|
||||||
}
|
|
||||||
virtual void Mdiag (const Field &in, Field &out)=0;
|
|
||||||
virtual void Mdir (const Field &in, Field &out,int dir, int disp)=0;
|
|
||||||
virtual void MdirAll (const Field &in, std::vector<Field> &out)=0;
|
|
||||||
};
|
|
||||||
*/
|
|
||||||
|
|
||||||
const std::vector<int> directions ({Xdir,Ydir,Zdir,Xdir,Ydir,Zdir});
|
|
||||||
const std::vector<int> displacements({1,1,1,-1,-1,-1});
|
|
||||||
|
|
||||||
template<class Field> class FreeLaplacianCshift : public SparseMatrixBase<Field>
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
GridBase *grid;
|
|
||||||
FreeLaplacianCshift(GridBase *_grid)
|
|
||||||
{
|
|
||||||
grid=_grid;
|
|
||||||
};
|
|
||||||
virtual GridBase *Grid(void) { return grid; };
|
|
||||||
|
|
||||||
virtual void M (const Field &in, Field &out)
|
|
||||||
{
|
|
||||||
out = Zero();
|
|
||||||
for(int mu=0;mu<Nd-1;mu++) {
|
|
||||||
out = out + Cshift(in,mu,1) + Cshift(in,mu,-1) - 2.0 * in;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian
|
|
||||||
virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid
|
|
||||||
virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid
|
|
||||||
virtual void MdirAll (const Field &in, std::vector<Field> &out) {assert(0);}; // Unimplemented need only for multigrid
|
|
||||||
};
|
|
||||||
|
|
||||||
template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field>
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
INHERIT_GIMPL_TYPES(Gimpl);
|
|
||||||
|
|
||||||
GridBase *grid;
|
|
||||||
GaugeField U;
|
|
||||||
|
|
||||||
CovariantLaplacianCshift(GaugeField &_U) :
|
|
||||||
grid(_U.Grid()),
|
|
||||||
U(_U) { };
|
|
||||||
|
|
||||||
virtual GridBase *Grid(void) { return grid; };
|
|
||||||
|
|
||||||
virtual void M (const Field &in, Field &out)
|
|
||||||
{
|
|
||||||
out=Zero();
|
|
||||||
for(int mu=0;mu<Nd-1;mu++) {
|
|
||||||
GaugeLinkField Umu = PeekIndex<LorentzIndex>(U, mu); // NB: Inefficent
|
|
||||||
out = out + Gimpl::CovShiftForward(Umu,mu,in);
|
|
||||||
out = out + Gimpl::CovShiftBackward(Umu,mu,in);
|
|
||||||
out = out - 2.0*in;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian
|
|
||||||
virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid
|
|
||||||
virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid
|
|
||||||
virtual void MdirAll (const Field &in, std::vector<Field> &out) {assert(0);}; // Unimplemented need only for multigrid
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
#define LEG_LOAD(Dir) \
|
|
||||||
SE = st.GetEntry(ptype, Dir, ss); \
|
|
||||||
if (SE->_is_local ) { \
|
|
||||||
int perm= SE->_permute; \
|
|
||||||
chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); \
|
|
||||||
} else { \
|
|
||||||
chi = coalescedRead(buf[SE->_offset],lane); \
|
|
||||||
} \
|
|
||||||
acceleratorSynchronise();
|
|
||||||
|
|
||||||
template<class Field> class FreeLaplacianStencil : public SparseMatrixBase<Field>
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
typedef typename Field::vector_object siteObject;
|
|
||||||
typedef CartesianStencil<siteObject, siteObject, int> StencilImpl;
|
|
||||||
|
|
||||||
GridBase *grid;
|
|
||||||
StencilImpl Stencil;
|
|
||||||
SimpleCompressor<siteObject> Compressor;
|
|
||||||
|
|
||||||
FreeLaplacianStencil(GridBase *_grid)
|
|
||||||
: Stencil (_grid,6,Even,directions,displacements,0), grid(_grid)
|
|
||||||
{ };
|
|
||||||
|
|
||||||
virtual GridBase *Grid(void) { return grid; };
|
|
||||||
|
|
||||||
virtual void M (const Field &_in, Field &_out)
|
|
||||||
{
|
|
||||||
|
|
||||||
///////////////////////////////////////////////
|
|
||||||
// Halo exchange for this geometry of stencil
|
|
||||||
///////////////////////////////////////////////
|
|
||||||
Stencil.HaloExchange(_in, Compressor);
|
|
||||||
|
|
||||||
///////////////////////////////////
|
|
||||||
// Arithmetic expressions
|
|
||||||
///////////////////////////////////
|
|
||||||
|
|
||||||
// Views; device friendly/accessible pointers
|
|
||||||
auto st = Stencil.View(AcceleratorRead);
|
|
||||||
auto buf = st.CommBuf();
|
|
||||||
autoView( in , _in , AcceleratorRead);
|
|
||||||
autoView( out , _out , AcceleratorWrite);
|
|
||||||
|
|
||||||
typedef typename Field::vector_object vobj;
|
|
||||||
typedef decltype(coalescedRead(in[0])) calcObj;
|
|
||||||
|
|
||||||
const int Nsimd = vobj::Nsimd();
|
|
||||||
const uint64_t NN = grid->oSites();
|
|
||||||
|
|
||||||
accelerator_for( ss, NN, Nsimd, {
|
|
||||||
|
|
||||||
StencilEntry *SE;
|
|
||||||
|
|
||||||
const int lane=acceleratorSIMTlane(Nsimd);
|
|
||||||
|
|
||||||
calcObj chi;
|
|
||||||
calcObj res;
|
|
||||||
int ptype;
|
|
||||||
|
|
||||||
res = coalescedRead(in[ss])*(-6.0);
|
|
||||||
LEG_LOAD(0); res = res + chi;
|
|
||||||
LEG_LOAD(1); res = res + chi;
|
|
||||||
LEG_LOAD(2); res = res + chi;
|
|
||||||
LEG_LOAD(3); res = res + chi;
|
|
||||||
LEG_LOAD(4); res = res + chi;
|
|
||||||
LEG_LOAD(5); res = res + chi;
|
|
||||||
|
|
||||||
coalescedWrite(out[ss], res,lane);
|
|
||||||
|
|
||||||
});
|
|
||||||
};
|
|
||||||
virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian
|
|
||||||
virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid
|
|
||||||
virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid
|
|
||||||
virtual void MdirAll (const Field &in, std::vector<Field> &out) {assert(0);}; // Unimplemented need only for multigrid
|
|
||||||
};
|
|
||||||
|
|
||||||
template<class Gimpl,class Field> class CovariantLaplacianStencil : public SparseMatrixBase<Field>
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
INHERIT_GIMPL_TYPES(Gimpl);
|
|
||||||
|
|
||||||
typedef typename Field::vector_object siteObject;
|
|
||||||
|
|
||||||
template <typename vtype> using iImplDoubledGaugeField = iVector<iScalar<iMatrix<vtype, Nc> >, Nds>;
|
|
||||||
typedef iImplDoubledGaugeField<Simd> SiteDoubledGaugeField;
|
|
||||||
typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField;
|
|
||||||
|
|
||||||
typedef CartesianStencil<siteObject, siteObject, int> StencilImpl;
|
|
||||||
|
|
||||||
GridBase *grid;
|
|
||||||
StencilImpl Stencil;
|
|
||||||
SimpleCompressor<siteObject> Compressor;
|
|
||||||
DoubledGaugeField Uds;
|
|
||||||
CovariantLaplacianStencil(GaugeField &Umu)
|
|
||||||
:
|
|
||||||
grid(Umu.Grid()),
|
|
||||||
Stencil (grid,6,Even,directions,displacements,0),
|
|
||||||
Uds(grid)
|
|
||||||
{
|
|
||||||
for (int mu = 0; mu < Nd; mu++) {
|
|
||||||
auto U = PeekIndex<LorentzIndex>(Umu, mu);
|
|
||||||
PokeIndex<LorentzIndex>(Uds, U, mu );
|
|
||||||
U = adj(Cshift(U, mu, -1));
|
|
||||||
PokeIndex<LorentzIndex>(Uds, U, mu + 4);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
virtual GridBase *Grid(void) { return grid; };
|
|
||||||
|
|
||||||
virtual void M (const Field &_in, Field &_out)
|
|
||||||
{
|
|
||||||
///////////////////////////////////////////////
|
|
||||||
// Halo exchange for this geometry of stencil
|
|
||||||
///////////////////////////////////////////////
|
|
||||||
Stencil.HaloExchange(_in, Compressor);
|
|
||||||
|
|
||||||
///////////////////////////////////
|
|
||||||
// Arithmetic expressions
|
|
||||||
///////////////////////////////////
|
|
||||||
auto st = Stencil.View(AcceleratorRead);
|
|
||||||
auto buf = st.CommBuf();
|
|
||||||
|
|
||||||
autoView( in , _in , AcceleratorRead);
|
|
||||||
autoView( out , _out , AcceleratorWrite);
|
|
||||||
autoView( U , Uds , AcceleratorRead);
|
|
||||||
|
|
||||||
typedef typename Field::vector_object vobj;
|
|
||||||
typedef decltype(coalescedRead(in[0])) calcObj;
|
|
||||||
typedef decltype(coalescedRead(U[0](0))) calcLink;
|
|
||||||
|
|
||||||
const int Nsimd = vobj::Nsimd();
|
|
||||||
const uint64_t NN = grid->oSites();
|
|
||||||
|
|
||||||
accelerator_for( ss, NN, Nsimd, {
|
|
||||||
|
|
||||||
StencilEntry *SE;
|
|
||||||
|
|
||||||
const int lane=acceleratorSIMTlane(Nsimd);
|
|
||||||
|
|
||||||
calcObj chi;
|
|
||||||
calcObj res;
|
|
||||||
calcObj Uchi;
|
|
||||||
calcLink UU;
|
|
||||||
int ptype;
|
|
||||||
|
|
||||||
res = coalescedRead(in[ss])*(-6.0);
|
|
||||||
|
|
||||||
#define LEG_LOAD_MULT(leg,polarisation) \
|
|
||||||
UU = coalescedRead(U[ss](polarisation)); \
|
|
||||||
LEG_LOAD(leg); \
|
|
||||||
mult(&Uchi(), &UU, &chi()); \
|
|
||||||
res = res + Uchi;
|
|
||||||
|
|
||||||
LEG_LOAD_MULT(0,Xp);
|
|
||||||
LEG_LOAD_MULT(1,Yp);
|
|
||||||
LEG_LOAD_MULT(2,Zp);
|
|
||||||
LEG_LOAD_MULT(3,Xm);
|
|
||||||
LEG_LOAD_MULT(4,Ym);
|
|
||||||
LEG_LOAD_MULT(5,Zm);
|
|
||||||
|
|
||||||
coalescedWrite(out[ss], res,lane);
|
|
||||||
});
|
|
||||||
};
|
|
||||||
virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian
|
|
||||||
virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid
|
|
||||||
virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid
|
|
||||||
virtual void MdirAll (const Field &in, std::vector<Field> &out) {assert(0);}; // Unimplemented need only for multigrid
|
|
||||||
};
|
|
||||||
|
|
||||||
#undef LEG_LOAD_MULT
|
|
||||||
#undef LEG_LOAD
|
|
||||||
|
|
||||||
int main(int argc, char ** argv)
|
|
||||||
{
|
|
||||||
Grid_init(&argc, &argv);
|
|
||||||
|
|
||||||
typedef LatticeColourVector Field;
|
|
||||||
|
|
||||||
auto latt_size = GridDefaultLatt();
|
|
||||||
auto simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
|
|
||||||
auto mpi_layout = GridDefaultMpi();
|
|
||||||
|
|
||||||
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
|
||||||
GridParallelRNG RNG(&Grid); RNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
|
|
||||||
|
|
||||||
FreeLaplacianCshift<Field> FLcs(&Grid);
|
|
||||||
FreeLaplacianStencil<Field> FLst(&Grid);
|
|
||||||
|
|
||||||
LatticeGaugeField U(&Grid);
|
|
||||||
|
|
||||||
SU<Nc>::ColdConfiguration(RNG,U);
|
|
||||||
|
|
||||||
std::cout << " Gauge field has norm " <<norm2(U)<<std::endl;
|
|
||||||
|
|
||||||
CovariantLaplacianCshift <PeriodicGimplR,Field> CLcs(U);
|
|
||||||
CovariantLaplacianStencil<PeriodicGimplR,Field> CLst(U);
|
|
||||||
|
|
||||||
Field in(&Grid); gaussian(RNG,in);
|
|
||||||
Field out_FLcs(&Grid);
|
|
||||||
Field out_FLst(&Grid);
|
|
||||||
Field out_CLcs(&Grid);
|
|
||||||
Field out_CLst(&Grid);
|
|
||||||
Field diff(&Grid);
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////
|
|
||||||
// First test: in free field these should all agree
|
|
||||||
////////////////////////////////////////////////////////
|
|
||||||
FLcs.M(in,out_FLcs);
|
|
||||||
FLst.M(in,out_FLst);
|
|
||||||
CLcs.M(in,out_CLcs);
|
|
||||||
CLst.M(in,out_CLst);
|
|
||||||
|
|
||||||
std:: cout << "******************************************************************" <<std::endl;
|
|
||||||
std:: cout << " Test A: consistency of four different Laplacian implementations " <<std::endl;
|
|
||||||
std:: cout << "******************************************************************" <<std::endl;
|
|
||||||
std:: cout << " Input test vector " <<norm2(in)<<std::endl;
|
|
||||||
std:: cout << "--------------------------------------------------------" <<std::endl;
|
|
||||||
std:: cout << " Free cshift output vector " <<norm2(out_FLcs)<<std::endl;
|
|
||||||
std:: cout << " Free stencil output vector " <<norm2(out_FLst)<<std::endl;
|
|
||||||
std:: cout << " Cov cshift output vector " <<norm2(out_CLcs)<<std::endl;
|
|
||||||
std:: cout << " Cov stencil output vector " <<norm2(out_CLst)<<std::endl;
|
|
||||||
std:: cout << "--------------------------------------------------------" <<std::endl;
|
|
||||||
|
|
||||||
diff = out_FLcs - out_FLst;
|
|
||||||
std:: cout << " Difference between free Cshift Laplacian and free Stencil Laplacian = " <<norm2(diff)<<std::endl;
|
|
||||||
|
|
||||||
diff = out_FLcs - out_CLcs;
|
|
||||||
std:: cout << " Difference between free Cshift Laplacian and covariant Cshift Laplacian = " <<norm2(diff)<<std::endl;
|
|
||||||
|
|
||||||
diff = out_FLcs - out_CLst;
|
|
||||||
std:: cout << " Difference between free Cshift Laplacian and covariant Stencil Laplacian = " <<norm2(diff)<<std::endl;
|
|
||||||
std:: cout << "--------------------------------------------------------" <<std::endl;
|
|
||||||
|
|
||||||
|
|
||||||
std:: cout << "******************************************************************" <<std::endl;
|
|
||||||
std:: cout << " Test B: gauge covariance " <<std::endl;
|
|
||||||
std:: cout << "******************************************************************" <<std::endl;
|
|
||||||
|
|
||||||
LatticeGaugeField U_GT(&Grid); // Gauge transformed field
|
|
||||||
LatticeColourMatrix g(&Grid); // local Gauge xform matrix
|
|
||||||
|
|
||||||
U_GT = U;
|
|
||||||
// Make a random xform to teh gauge field
|
|
||||||
SU<Nc>::RandomGaugeTransform(RNG,U_GT,g); // Unit gauge
|
|
||||||
|
|
||||||
Field in_GT(&Grid);
|
|
||||||
Field out_GT(&Grid);
|
|
||||||
|
|
||||||
Field out_CLcs_GT(&Grid);
|
|
||||||
Field out_CLst_GT(&Grid);
|
|
||||||
|
|
||||||
CovariantLaplacianCshift <PeriodicGimplR,Field> CLcs_GT(U_GT);
|
|
||||||
CovariantLaplacianStencil<PeriodicGimplR,Field> CLst_GT(U_GT);
|
|
||||||
|
|
||||||
in_GT = g*in;
|
|
||||||
out_GT = g*out_FLcs;
|
|
||||||
|
|
||||||
// Check M^GT_xy in_GT = g(x) M_xy g^dag(y) g(y) in = g(x) out(x)
|
|
||||||
CLcs_GT.M(in_GT,out_CLcs_GT);
|
|
||||||
CLst_GT.M(in_GT,out_CLst_GT);
|
|
||||||
|
|
||||||
diff = out_CLcs_GT - out_GT;
|
|
||||||
std:: cout << " Difference between Gauge xformed result and covariant Cshift Laplacian in xformed gauge = " <<norm2(diff)<<std::endl;
|
|
||||||
|
|
||||||
diff = out_CLst_GT - out_GT;
|
|
||||||
std:: cout << " Difference between Gauge xformed result and covariant Stencil Laplacian in xformed gauge = " <<norm2(diff)<<std::endl;
|
|
||||||
std:: cout << "--------------------------------------------------------" <<std::endl;
|
|
||||||
|
|
||||||
|
|
||||||
std:: cout << "******************************************************************" <<std::endl;
|
|
||||||
std:: cout << " Test C: compare in free Field to \"Feynman rule\" " <<std::endl;
|
|
||||||
std:: cout << "******************************************************************" <<std::endl;
|
|
||||||
|
|
||||||
std::vector<int> dim_mask({1,1,1,0}); // 3d FFT
|
|
||||||
FFT theFFT(&Grid);
|
|
||||||
Field out(&Grid);
|
|
||||||
Field F_out(&Grid);
|
|
||||||
Field F_in(&Grid);
|
|
||||||
|
|
||||||
// FFT the random input vector
|
|
||||||
theFFT.FFT_dim_mask(F_in,in,dim_mask,FFT::forward);
|
|
||||||
|
|
||||||
// Convolution theorem: multiply by Fourier representation of (discrete) Laplacian to apply diff op
|
|
||||||
LatticeComplexD lap(&Grid); lap = Zero();
|
|
||||||
LatticeComplexD kmu(&Grid);
|
|
||||||
ComplexD ci(0.0,1.0);
|
|
||||||
for(int mu=0;mu<3;mu++) {
|
|
||||||
|
|
||||||
RealD TwoPiL = M_PI * 2.0/ latt_size[mu];
|
|
||||||
|
|
||||||
LatticeCoordinate(kmu,mu);
|
|
||||||
kmu = TwoPiL * kmu;
|
|
||||||
|
|
||||||
// (e^ik_mu + e^-ik_mu - 2) = 2( cos kmu - 1) ~ 2 (1 - k_mu^2/2 -1 ) = - k_mu^2 + O(k^4)
|
|
||||||
lap = lap + 2.0*cos(kmu) - 2.0;
|
|
||||||
|
|
||||||
}
|
|
||||||
F_out = lap * F_in;
|
|
||||||
|
|
||||||
// Inverse FFT the result
|
|
||||||
theFFT.FFT_dim_mask(out,F_out,dim_mask,FFT::backward);
|
|
||||||
|
|
||||||
std::cout<<"Fourier xformed (in) "<<norm2(F_in)<<std::endl;
|
|
||||||
std::cout<<"Fourier xformed Laplacian x (in) "<<norm2(F_out)<<std::endl;
|
|
||||||
|
|
||||||
std::cout<<"Momentum space Laplacian application "<< norm2(out)<<std::endl;
|
|
||||||
std::cout<<"Stencil Laplacian application "<< norm2(out_CLcs)<<std::endl;
|
|
||||||
|
|
||||||
diff = out_CLcs - out;
|
|
||||||
std::cout<<"diff "<< norm2(diff)<<std::endl;
|
|
||||||
|
|
||||||
Grid_finalize();
|
|
||||||
}
|
|
@ -1,127 +0,0 @@
|
|||||||
#include <Grid/Grid.h>
|
|
||||||
using namespace Grid;
|
|
||||||
|
|
||||||
// Function used for Chebyshev smearing
|
|
||||||
//
|
|
||||||
Real MomentumSmearing(Real p2)
|
|
||||||
{
|
|
||||||
return (1 - 4.0*p2) * exp(-p2/4);
|
|
||||||
}
|
|
||||||
Real DistillationSmearing(Real p2)
|
|
||||||
{
|
|
||||||
if ( p2 > 0.5 ) return 0.0;
|
|
||||||
else return 1.0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Flip sign to make prop to p^2, not -p^2 relative to last example
|
|
||||||
template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field>
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
INHERIT_GIMPL_TYPES(Gimpl);
|
|
||||||
|
|
||||||
GridBase *grid;
|
|
||||||
GaugeField U;
|
|
||||||
|
|
||||||
CovariantLaplacianCshift(GaugeField &_U) :
|
|
||||||
grid(_U.Grid()),
|
|
||||||
U(_U) { };
|
|
||||||
|
|
||||||
virtual GridBase *Grid(void) { return grid; };
|
|
||||||
|
|
||||||
virtual void M (const Field &in, Field &out)
|
|
||||||
{
|
|
||||||
out=Zero();
|
|
||||||
for(int mu=0;mu<Nd-1;mu++) {
|
|
||||||
GaugeLinkField Umu = PeekIndex<LorentzIndex>(U, mu); // NB: Inefficent
|
|
||||||
out = out - Gimpl::CovShiftForward(Umu,mu,in);
|
|
||||||
out = out - Gimpl::CovShiftBackward(Umu,mu,in);
|
|
||||||
out = out + 2.0*in;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian
|
|
||||||
virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid
|
|
||||||
virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid
|
|
||||||
virtual void MdirAll (const Field &in, std::vector<Field> &out) {assert(0);}; // Unimplemented need only for multigrid
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
int main(int argc, char ** argv)
|
|
||||||
{
|
|
||||||
Grid_init(&argc, &argv);
|
|
||||||
|
|
||||||
typedef LatticeColourVector Field;
|
|
||||||
|
|
||||||
auto latt_size = GridDefaultLatt();
|
|
||||||
auto simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
|
|
||||||
auto mpi_layout = GridDefaultMpi();
|
|
||||||
|
|
||||||
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
|
||||||
GridParallelRNG RNG(&Grid); RNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
|
|
||||||
|
|
||||||
|
|
||||||
LatticeGaugeField U(&Grid);
|
|
||||||
|
|
||||||
SU<Nc>::ColdConfiguration(RNG,U);
|
|
||||||
|
|
||||||
typedef CovariantLaplacianCshift <PeriodicGimplR,Field> Laplacian_t;
|
|
||||||
Laplacian_t Laplacian(U);
|
|
||||||
|
|
||||||
|
|
||||||
ColourVector ColourKronecker;
|
|
||||||
ColourKronecker = Zero();
|
|
||||||
ColourKronecker()()(0) = 1.0;
|
|
||||||
|
|
||||||
Coordinate site({latt_size[0]/2,
|
|
||||||
latt_size[1]/2,
|
|
||||||
latt_size[2]/2,
|
|
||||||
0});
|
|
||||||
|
|
||||||
Field kronecker(&Grid);
|
|
||||||
kronecker = Zero();
|
|
||||||
pokeSite(ColourKronecker,kronecker,site);
|
|
||||||
|
|
||||||
|
|
||||||
Field psi(&Grid), chi(&Grid);
|
|
||||||
|
|
||||||
//////////////////////////////////////
|
|
||||||
// Classic Wuppertal smearing
|
|
||||||
//////////////////////////////////////
|
|
||||||
|
|
||||||
Integer Iterations = 80;
|
|
||||||
Real width = 2.0;
|
|
||||||
Real coeff = (width*width) / Real(4*Iterations);
|
|
||||||
|
|
||||||
chi=kronecker;
|
|
||||||
// chi = (1-p^2/2N)^N kronecker
|
|
||||||
for(int n = 0; n < Iterations; ++n) {
|
|
||||||
Laplacian.M(chi,psi);
|
|
||||||
chi = chi - coeff*psi;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::cout << " Wuppertal smeared operator is chi = \n" << chi <<std::endl;
|
|
||||||
|
|
||||||
/////////////////////////////////////
|
|
||||||
// Chebyshev smearing
|
|
||||||
/////////////////////////////////////
|
|
||||||
RealD lo = 0.0;
|
|
||||||
RealD hi = 12.0; // Analytic free field bound
|
|
||||||
HermitianLinearOperator<Laplacian_t,Field> HermOp(Laplacian);
|
|
||||||
|
|
||||||
std::cout << " Checking spectral range of our POSITIVE definite operator \n";
|
|
||||||
PowerMethod<Field> PM;
|
|
||||||
PM(HermOp,kronecker);
|
|
||||||
|
|
||||||
// Chebyshev<Field> ChebySmear(lo,hi,20,DistillationSmearing);
|
|
||||||
Chebyshev<Field> ChebySmear(lo,hi,20,MomentumSmearing);
|
|
||||||
{
|
|
||||||
std::ofstream of("chebysmear");
|
|
||||||
ChebySmear.csv(of);
|
|
||||||
}
|
|
||||||
|
|
||||||
ChebySmear(HermOp,kronecker,chi);
|
|
||||||
|
|
||||||
std::cout << " Chebyshev smeared operator is chi = \n" << chi <<std::endl;
|
|
||||||
|
|
||||||
Grid_finalize();
|
|
||||||
}
|
|
@ -1,129 +0,0 @@
|
|||||||
#include <Grid/Grid.h>
|
|
||||||
using namespace Grid;
|
|
||||||
|
|
||||||
template<class Field>
|
|
||||||
void SimpleConjugateGradient(LinearOperatorBase<Field> &HPDop,const Field &b, Field &x)
|
|
||||||
{
|
|
||||||
RealD cp, c, alpha, d, beta, ssq, qq;
|
|
||||||
RealD Tolerance=1.0e-10;
|
|
||||||
int MaxIterations=10000;
|
|
||||||
|
|
||||||
Field p(b), mmp(b), r(b);
|
|
||||||
|
|
||||||
HPDop.HermOpAndNorm(x, mmp, d, beta);
|
|
||||||
|
|
||||||
r = b - mmp;
|
|
||||||
p = r;
|
|
||||||
|
|
||||||
cp = alpha = norm2(p);
|
|
||||||
ssq = norm2(b);
|
|
||||||
|
|
||||||
RealD rsq = Tolerance * Tolerance * ssq;
|
|
||||||
|
|
||||||
for (int k = 1; k <= MaxIterations; k++) {
|
|
||||||
c = cp;
|
|
||||||
|
|
||||||
HPDop.HermOp(p, mmp);
|
|
||||||
|
|
||||||
d = real(innerProduct(p,mmp));
|
|
||||||
|
|
||||||
alpha = c / d;
|
|
||||||
|
|
||||||
r = r - alpha *mmp;
|
|
||||||
cp = norm2(r);
|
|
||||||
beta = cp / c;
|
|
||||||
|
|
||||||
x = x + alpha* p ;
|
|
||||||
p = r + beta* p ;
|
|
||||||
|
|
||||||
std::cout << "iteration "<<k<<" cp " <<std::sqrt(cp/ssq) << std::endl;
|
|
||||||
if (cp <= rsq) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
assert(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// Flip sign to make prop to p^2, not -p^2 relative to last example
|
|
||||||
template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field>
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
INHERIT_GIMPL_TYPES(Gimpl);
|
|
||||||
|
|
||||||
GridBase *grid;
|
|
||||||
GaugeField U;
|
|
||||||
RealD m2=1.0e-2;
|
|
||||||
CovariantLaplacianCshift(GaugeField &_U) :
|
|
||||||
grid(_U.Grid()),
|
|
||||||
U(_U) { };
|
|
||||||
|
|
||||||
virtual GridBase *Grid(void) { return grid; };
|
|
||||||
|
|
||||||
virtual void M (const Field &in, Field &out)
|
|
||||||
{
|
|
||||||
out=Zero();
|
|
||||||
for(int mu=0;mu<Nd-1;mu++) {
|
|
||||||
GaugeLinkField Umu = PeekIndex<LorentzIndex>(U, mu); // NB: Inefficent
|
|
||||||
out = out - Gimpl::CovShiftForward(Umu,mu,in);
|
|
||||||
out = out - Gimpl::CovShiftBackward(Umu,mu,in);
|
|
||||||
out = out + 2.0*in + m2*in;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian
|
|
||||||
virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid
|
|
||||||
virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid
|
|
||||||
virtual void MdirAll (const Field &in, std::vector<Field> &out) {assert(0);}; // Unimplemented need only for multigrid
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
int main(int argc, char ** argv)
|
|
||||||
{
|
|
||||||
Grid_init(&argc, &argv);
|
|
||||||
|
|
||||||
typedef LatticeColourVector Field;
|
|
||||||
|
|
||||||
auto latt_size = GridDefaultLatt();
|
|
||||||
auto simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
|
|
||||||
auto mpi_layout = GridDefaultMpi();
|
|
||||||
|
|
||||||
GridCartesian Grid(latt_size,simd_layout,mpi_layout);
|
|
||||||
GridParallelRNG RNG(&Grid); RNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
|
|
||||||
|
|
||||||
|
|
||||||
LatticeGaugeField U(&Grid);
|
|
||||||
|
|
||||||
SU<Nc>::ColdConfiguration(RNG,U);
|
|
||||||
|
|
||||||
typedef CovariantLaplacianCshift <PeriodicGimplR,Field> Laplacian_t;
|
|
||||||
Laplacian_t Laplacian(U);
|
|
||||||
|
|
||||||
|
|
||||||
ColourVector ColourKronecker;
|
|
||||||
ColourKronecker = Zero();
|
|
||||||
ColourKronecker()()(0) = 1.0;
|
|
||||||
|
|
||||||
Coordinate site({0,0,0,0}); // Point source at origin
|
|
||||||
|
|
||||||
Field kronecker(&Grid);
|
|
||||||
kronecker = Zero();
|
|
||||||
pokeSite(ColourKronecker,kronecker,site);
|
|
||||||
|
|
||||||
Field psi(&Grid); psi=Zero();
|
|
||||||
|
|
||||||
HermitianLinearOperator<Laplacian_t,Field> HermOp(Laplacian);
|
|
||||||
SimpleConjugateGradient(HermOp, kronecker,psi);
|
|
||||||
|
|
||||||
Field r(&Grid);
|
|
||||||
Laplacian.M(psi,r);
|
|
||||||
r=kronecker-r;
|
|
||||||
|
|
||||||
std::cout << "True residual "<< norm2(r) <<std::endl;
|
|
||||||
|
|
||||||
// Optionally print the result vector
|
|
||||||
// std::cout << psi<<std::endl;
|
|
||||||
|
|
||||||
Grid_finalize();
|
|
||||||
}
|
|
@ -1,314 +0,0 @@
|
|||||||
/*
|
|
||||||
* Warning: This code illustrative only: not well tested, and not meant for production use
|
|
||||||
* without regression / tests being applied
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <Grid/Grid.h>
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
using namespace Grid;
|
|
||||||
|
|
||||||
template<class Gimpl,class Field> class CovariantLaplacianCshift : public SparseMatrixBase<Field>
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
INHERIT_GIMPL_TYPES(Gimpl);
|
|
||||||
|
|
||||||
GridBase *grid;
|
|
||||||
GaugeField U;
|
|
||||||
|
|
||||||
CovariantLaplacianCshift(GaugeField &_U) :
|
|
||||||
grid(_U.Grid()),
|
|
||||||
U(_U) { };
|
|
||||||
|
|
||||||
virtual GridBase *Grid(void) { return grid; };
|
|
||||||
|
|
||||||
virtual void M (const Field &in, Field &out)
|
|
||||||
{
|
|
||||||
out=Zero();
|
|
||||||
for(int mu=0;mu<Nd-1;mu++) {
|
|
||||||
GaugeLinkField Umu = PeekIndex<LorentzIndex>(U, mu); // NB: Inefficent
|
|
||||||
out = out - Gimpl::CovShiftForward(Umu,mu,in);
|
|
||||||
out = out - Gimpl::CovShiftBackward(Umu,mu,in);
|
|
||||||
out = out + 2.0*in;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
virtual void Mdag (const Field &in, Field &out) { M(in,out);}; // Laplacian is hermitian
|
|
||||||
virtual void Mdiag (const Field &in, Field &out) {assert(0);}; // Unimplemented need only for multigrid
|
|
||||||
virtual void Mdir (const Field &in, Field &out,int dir, int disp){assert(0);}; // Unimplemented need only for multigrid
|
|
||||||
virtual void MdirAll (const Field &in, std::vector<Field> &out) {assert(0);}; // Unimplemented need only for multigrid
|
|
||||||
};
|
|
||||||
|
|
||||||
void MakePhase(Coordinate mom,LatticeComplex &phase)
|
|
||||||
{
|
|
||||||
GridBase *grid = phase.Grid();
|
|
||||||
auto latt_size = grid->GlobalDimensions();
|
|
||||||
ComplexD ci(0.0,1.0);
|
|
||||||
phase=Zero();
|
|
||||||
|
|
||||||
LatticeComplex coor(phase.Grid());
|
|
||||||
for(int mu=0;mu<Nd;mu++){
|
|
||||||
RealD TwoPiL = M_PI * 2.0/ latt_size[mu];
|
|
||||||
LatticeCoordinate(coor,mu);
|
|
||||||
phase = phase + (TwoPiL * mom[mu]) * coor;
|
|
||||||
}
|
|
||||||
phase = exp(phase*ci);
|
|
||||||
}
|
|
||||||
void PointSource(Coordinate &coor,LatticePropagator &source)
|
|
||||||
{
|
|
||||||
// Coordinate coor({0,0,0,0});
|
|
||||||
source=Zero();
|
|
||||||
SpinColourMatrix kronecker; kronecker=1.0;
|
|
||||||
pokeSite(kronecker,source,coor);
|
|
||||||
}
|
|
||||||
void Z2WallSource(GridParallelRNG &RNG,int tslice,LatticePropagator &source)
|
|
||||||
{
|
|
||||||
GridBase *grid = source.Grid();
|
|
||||||
LatticeComplex noise(grid);
|
|
||||||
LatticeComplex zz(grid); zz=Zero();
|
|
||||||
LatticeInteger t(grid);
|
|
||||||
|
|
||||||
RealD nrm=1.0/sqrt(2);
|
|
||||||
bernoulli(RNG, noise); // 0,1 50:50
|
|
||||||
|
|
||||||
noise = (2.*noise - Complex(1,1))*nrm;
|
|
||||||
|
|
||||||
LatticeCoordinate(t,Tdir);
|
|
||||||
noise = where(t==Integer(tslice), noise, zz);
|
|
||||||
|
|
||||||
source = 1.0;
|
|
||||||
source = source*noise;
|
|
||||||
std::cout << " Z2 wall " << norm2(source) << std::endl;
|
|
||||||
}
|
|
||||||
template<class Field>
|
|
||||||
void GaussianSmear(LatticeGaugeField &U,Field &unsmeared,Field &smeared)
|
|
||||||
{
|
|
||||||
typedef CovariantLaplacianCshift <PeriodicGimplR,Field> Laplacian_t;
|
|
||||||
Laplacian_t Laplacian(U);
|
|
||||||
|
|
||||||
Integer Iterations = 40;
|
|
||||||
Real width = 2.0;
|
|
||||||
Real coeff = (width*width) / Real(4*Iterations);
|
|
||||||
|
|
||||||
Field tmp(U.Grid());
|
|
||||||
smeared=unsmeared;
|
|
||||||
// chi = (1-p^2/2N)^N kronecker
|
|
||||||
for(int n = 0; n < Iterations; ++n) {
|
|
||||||
Laplacian.M(smeared,tmp);
|
|
||||||
smeared = smeared - coeff*tmp;
|
|
||||||
std::cout << " smear iter " << n<<" " <<norm2(smeared)<<std::endl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
void GaussianSource(Coordinate &site,LatticeGaugeField &U,LatticePropagator &source)
|
|
||||||
{
|
|
||||||
LatticePropagator tmp(source.Grid());
|
|
||||||
PointSource(site,source);
|
|
||||||
std::cout << " GaussianSource Kronecker "<< norm2(source)<<std::endl;
|
|
||||||
tmp = source;
|
|
||||||
GaussianSmear(U,tmp,source);
|
|
||||||
std::cout << " GaussianSource Smeared "<< norm2(source)<<std::endl;
|
|
||||||
}
|
|
||||||
void GaussianWallSource(GridParallelRNG &RNG,int tslice,LatticeGaugeField &U,LatticePropagator &source)
|
|
||||||
{
|
|
||||||
Z2WallSource(RNG,tslice,source);
|
|
||||||
auto tmp = source;
|
|
||||||
GaussianSmear(U,tmp,source);
|
|
||||||
}
|
|
||||||
void SequentialSource(int tslice,Coordinate &mom,LatticePropagator &spectator,LatticePropagator &source)
|
|
||||||
{
|
|
||||||
assert(mom.size()==Nd);
|
|
||||||
assert(mom[Tdir] == 0);
|
|
||||||
|
|
||||||
GridBase * grid = spectator.Grid();
|
|
||||||
|
|
||||||
|
|
||||||
LatticeInteger ts(grid);
|
|
||||||
LatticeCoordinate(ts,Tdir);
|
|
||||||
source = Zero();
|
|
||||||
source = where(ts==Integer(tslice),spectator,source); // Stick in a slice of the spectator, zero everywhere else
|
|
||||||
|
|
||||||
LatticeComplex phase(grid);
|
|
||||||
MakePhase(mom,phase);
|
|
||||||
|
|
||||||
source = source *phase;
|
|
||||||
}
|
|
||||||
template<class Action>
|
|
||||||
void Solve(Action &D,LatticePropagator &source,LatticePropagator &propagator)
|
|
||||||
{
|
|
||||||
GridBase *UGrid = D.GaugeGrid();
|
|
||||||
GridBase *FGrid = D.FermionGrid();
|
|
||||||
|
|
||||||
LatticeFermion src4 (UGrid);
|
|
||||||
LatticeFermion src5 (FGrid);
|
|
||||||
LatticeFermion result5(FGrid);
|
|
||||||
LatticeFermion result4(UGrid);
|
|
||||||
|
|
||||||
ConjugateGradient<LatticeFermion> CG(1.0e-8,100000);
|
|
||||||
SchurRedBlackDiagMooeeSolve<LatticeFermion> schur(CG);
|
|
||||||
ZeroGuesser<LatticeFermion> ZG; // Could be a DeflatedGuesser if have eigenvectors
|
|
||||||
for(int s=0;s<Nd;s++){
|
|
||||||
for(int c=0;c<Nc;c++){
|
|
||||||
PropToFerm<Action>(src4,source,s,c);
|
|
||||||
|
|
||||||
D.ImportPhysicalFermionSource(src4,src5);
|
|
||||||
|
|
||||||
result5=Zero();
|
|
||||||
schur(D,src5,result5,ZG);
|
|
||||||
std::cout<<GridLogMessage
|
|
||||||
<<"spin "<<s<<" color "<<c
|
|
||||||
<<" norm2(src5d) " <<norm2(src5)
|
|
||||||
<<" norm2(result5d) "<<norm2(result5)<<std::endl;
|
|
||||||
|
|
||||||
D.ExportPhysicalFermionSolution(result5,result4);
|
|
||||||
|
|
||||||
FermToProp<Action>(propagator,result4,s,c);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
class MesonFile: Serializable {
|
|
||||||
public:
|
|
||||||
GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFile, std::vector<std::vector<Complex> >, data);
|
|
||||||
};
|
|
||||||
|
|
||||||
void MesonTrace(std::string file,LatticePropagator &q1,LatticePropagator &q2,LatticeComplex &phase)
|
|
||||||
{
|
|
||||||
const int nchannel=4;
|
|
||||||
Gamma::Algebra Gammas[nchannel][2] = {
|
|
||||||
{Gamma::Algebra::Gamma5 ,Gamma::Algebra::Gamma5},
|
|
||||||
{Gamma::Algebra::GammaTGamma5,Gamma::Algebra::GammaTGamma5},
|
|
||||||
{Gamma::Algebra::GammaTGamma5,Gamma::Algebra::Gamma5},
|
|
||||||
{Gamma::Algebra::Gamma5 ,Gamma::Algebra::GammaTGamma5}
|
|
||||||
};
|
|
||||||
|
|
||||||
Gamma G5(Gamma::Algebra::Gamma5);
|
|
||||||
|
|
||||||
LatticeComplex meson_CF(q1.Grid());
|
|
||||||
MesonFile MF;
|
|
||||||
|
|
||||||
for(int ch=0;ch<nchannel;ch++){
|
|
||||||
|
|
||||||
Gamma Gsrc(Gammas[ch][0]);
|
|
||||||
Gamma Gsnk(Gammas[ch][1]);
|
|
||||||
|
|
||||||
meson_CF = trace(G5*adj(q1)*G5*Gsnk*q2*adj(Gsrc));
|
|
||||||
|
|
||||||
std::vector<TComplex> meson_T;
|
|
||||||
sliceSum(meson_CF,meson_T, Tdir);
|
|
||||||
|
|
||||||
int nt=meson_T.size();
|
|
||||||
|
|
||||||
std::vector<Complex> corr(nt);
|
|
||||||
for(int t=0;t<nt;t++){
|
|
||||||
corr[t] = TensorRemove(meson_T[t]); // Yes this is ugly, not figured a work around
|
|
||||||
std::cout << " channel "<<ch<<" t "<<t<<" " <<corr[t]<<std::endl;
|
|
||||||
}
|
|
||||||
MF.data.push_back(corr);
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
XmlWriter WR(file);
|
|
||||||
write(WR,"MesonFile",MF);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
int main (int argc, char ** argv)
|
|
||||||
{
|
|
||||||
const int Ls=8;
|
|
||||||
|
|
||||||
Grid_init(&argc,&argv);
|
|
||||||
|
|
||||||
// Double precision grids
|
|
||||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(),
|
|
||||||
GridDefaultSimd(Nd,vComplex::Nsimd()),
|
|
||||||
GridDefaultMpi());
|
|
||||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
|
||||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
|
|
||||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
|
||||||
// You can manage seeds however you like.
|
|
||||||
// Recommend SeedUniqueString.
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
|
||||||
std::vector<int> seeds4({1,2,3,4});
|
|
||||||
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
|
||||||
|
|
||||||
LatticeGaugeField Umu(UGrid);
|
|
||||||
std::string config;
|
|
||||||
if( argc > 1 && argv[1][0] != '-' )
|
|
||||||
{
|
|
||||||
std::cout<<GridLogMessage <<"Loading configuration from "<<argv[1]<<std::endl;
|
|
||||||
FieldMetaData header;
|
|
||||||
NerscIO::readConfiguration(Umu, header, argv[1]);
|
|
||||||
config=argv[1];
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
std::cout<<GridLogMessage <<"Using hot configuration"<<std::endl;
|
|
||||||
SU<Nc>::ColdConfiguration(Umu);
|
|
||||||
// SU<Nc>::HotConfiguration(RNG4,Umu);
|
|
||||||
config="HotConfig";
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<RealD> masses({ 0.03,0.04,0.45} ); // u/d, s, c ??
|
|
||||||
|
|
||||||
int nmass = masses.size();
|
|
||||||
|
|
||||||
std::vector<MobiusFermionR *> FermActs;
|
|
||||||
|
|
||||||
std::cout<<GridLogMessage <<"======================"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage <<"MobiusFermion action as Scaled Shamir kernel"<<std::endl;
|
|
||||||
std::cout<<GridLogMessage <<"======================"<<std::endl;
|
|
||||||
|
|
||||||
for(auto mass: masses) {
|
|
||||||
|
|
||||||
RealD M5=1.0;
|
|
||||||
RealD b=1.5;// Scale factor b+c=2, b-c=1
|
|
||||||
RealD c=0.5;
|
|
||||||
|
|
||||||
FermActs.push_back(new MobiusFermionR(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,b,c));
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
LatticePropagator point_source(UGrid);
|
|
||||||
LatticePropagator wall_source(UGrid);
|
|
||||||
LatticePropagator gaussian_source(UGrid);
|
|
||||||
|
|
||||||
Coordinate Origin({0,0,0,0});
|
|
||||||
PointSource (Origin,point_source);
|
|
||||||
Z2WallSource (RNG4,0,wall_source);
|
|
||||||
GaussianSource(Origin,Umu,gaussian_source);
|
|
||||||
|
|
||||||
std::vector<LatticePropagator> PointProps(nmass,UGrid);
|
|
||||||
std::vector<LatticePropagator> GaussProps(nmass,UGrid);
|
|
||||||
std::vector<LatticePropagator> Z2Props (nmass,UGrid);
|
|
||||||
|
|
||||||
for(int m=0;m<nmass;m++) {
|
|
||||||
|
|
||||||
Solve(*FermActs[m],point_source ,PointProps[m]);
|
|
||||||
Solve(*FermActs[m],gaussian_source,GaussProps[m]);
|
|
||||||
Solve(*FermActs[m],wall_source ,Z2Props[m]);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
LatticeComplex phase(UGrid);
|
|
||||||
Coordinate mom({0,0,0,0});
|
|
||||||
MakePhase(mom,phase);
|
|
||||||
|
|
||||||
for(int m1=0 ;m1<nmass;m1++) {
|
|
||||||
for(int m2=m1;m2<nmass;m2++) {
|
|
||||||
std::stringstream ssp,ssg,ssz;
|
|
||||||
|
|
||||||
ssp<<config<< "_m" << m1 << "_m"<< m2 << "_point_meson.xml";
|
|
||||||
ssg<<config<< "_m" << m1 << "_m"<< m2 << "_smeared_meson.xml";
|
|
||||||
ssz<<config<< "_m" << m1 << "_m"<< m2 << "_wall_meson.xml";
|
|
||||||
|
|
||||||
MesonTrace(ssp.str(),PointProps[m1],PointProps[m2],phase);
|
|
||||||
MesonTrace(ssg.str(),GaussProps[m1],GaussProps[m2],phase);
|
|
||||||
MesonTrace(ssz.str(),Z2Props[m1],Z2Props[m2],phase);
|
|
||||||
}}
|
|
||||||
|
|
||||||
Grid_finalize();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1,6 +0,0 @@
|
|||||||
SUBDIRS = .
|
|
||||||
|
|
||||||
include Make.inc
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -82,18 +82,3 @@ for f in $TESTS; do
|
|||||||
echo >> Make.inc
|
echo >> Make.inc
|
||||||
done
|
done
|
||||||
cd ..
|
cd ..
|
||||||
|
|
||||||
# examples Make.inc
|
|
||||||
cd $home/examples/
|
|
||||||
echo> Make.inc
|
|
||||||
TESTS=`ls *.cc`
|
|
||||||
TESTLIST=`echo ${TESTS} | sed s/.cc//g `
|
|
||||||
echo bin_PROGRAMS = ${TESTLIST} > Make.inc
|
|
||||||
echo >> Make.inc
|
|
||||||
for f in $TESTS; do
|
|
||||||
BNAME=`basename $f .cc`
|
|
||||||
echo ${BNAME}_SOURCES=$f >> Make.inc
|
|
||||||
echo ${BNAME}_LDADD='$(top_builddir)/Grid/libGrid.a'>> Make.inc
|
|
||||||
echo >> Make.inc
|
|
||||||
done
|
|
||||||
cd ..
|
|
||||||
|
@ -48,7 +48,9 @@ public:
|
|||||||
std::vector<double>, array,
|
std::vector<double>, array,
|
||||||
std::vector<std::vector<double> >, twodimarray,
|
std::vector<std::vector<double> >, twodimarray,
|
||||||
std::vector<std::vector<std::vector<std::complex<double>> > >, cmplx3darray,
|
std::vector<std::vector<std::vector<std::complex<double>> > >, cmplx3darray,
|
||||||
SpinColourMatrix, scm
|
SpinColourMatrix, scm,
|
||||||
|
std::vector<std::vector<std::vector<int> > >, ragged,
|
||||||
|
std::vector<std::vector<SpinColourMatrix> >, vscm
|
||||||
);
|
);
|
||||||
myclass() {}
|
myclass() {}
|
||||||
myclass(int i)
|
myclass(int i)
|
||||||
@ -56,6 +58,10 @@ public:
|
|||||||
, twodimarray(3,std::vector<double>(5, 1.23456))
|
, twodimarray(3,std::vector<double>(5, 1.23456))
|
||||||
, cmplx3darray(3,std::vector<std::vector<std::complex<double>>>(5, std::vector<std::complex<double>>(7, std::complex<double>(1.2, 3.4))))
|
, cmplx3darray(3,std::vector<std::vector<std::complex<double>>>(5, std::vector<std::complex<double>>(7, std::complex<double>(1.2, 3.4))))
|
||||||
, ve(2, myenum::blue)
|
, ve(2, myenum::blue)
|
||||||
|
, ragged( {{{i+1},{i+2,i+3}}, // ragged
|
||||||
|
{{i+4,i+5,i+6,i+7},{i+8,i+9,i+10,i+11},{i+12,i+13,i+14,i+15}}, // block
|
||||||
|
{{i+16,i+17},{i+18,i+19,i+20}}} ) //ragged
|
||||||
|
, vscm(3, std::vector<SpinColourMatrix>(5))
|
||||||
{
|
{
|
||||||
e=myenum::red;
|
e=myenum::red;
|
||||||
x=i;
|
x=i;
|
||||||
@ -68,6 +74,13 @@ public:
|
|||||||
scm()(0, 2)(1, 1) = 6.336;
|
scm()(0, 2)(1, 1) = 6.336;
|
||||||
scm()(2, 1)(2, 2) = 7.344;
|
scm()(2, 1)(2, 2) = 7.344;
|
||||||
scm()(1, 1)(2, 0) = 8.3534;
|
scm()(1, 1)(2, 0) = 8.3534;
|
||||||
|
int Counter = i;
|
||||||
|
for( auto & v : vscm ) {
|
||||||
|
for( auto & j : v ) {
|
||||||
|
j = std::complex<double>(Counter, -Counter);
|
||||||
|
Counter++;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -1,73 +0,0 @@
|
|||||||
#Example script
|
|
||||||
DIR=/gpfs/alpine/phy157/proj-shared/phy157dwf/chulwoo/Grid/BL/build/tests/lanczos
|
|
||||||
BIN=${DIR}/Test_dwf_block_lanczos
|
|
||||||
|
|
||||||
VOL='--grid 16.16.16.32 '
|
|
||||||
GRID='--mpi 1.1.1.4 '
|
|
||||||
CONF='--gconf ckpoint_lat.IEEE64BIG.2000 '
|
|
||||||
OPT='--mass 0.01 --M5 1.8 --phase in.params --omega in.params --shm 4096'
|
|
||||||
#BL='--rbl 16.1024.128.1000.10 --split 1.1.4.4 --check_int 100 --resid 1.0e-5 --cheby_l 0.007 --cheby_u 7 --cheby_n 51'
|
|
||||||
BL='--rbl 4.128.16.100.10 --split 1.1.1.4 --check_int 25 --resid 1.0e-5 --cheby_l 0.007 --cheby_u 7 --cheby_n 51'
|
|
||||||
|
|
||||||
ARGS=${CONF}" "${OPT}" "${BL}" "${VOL}" "${GRID}
|
|
||||||
export APP="${BIN} ${ARGS}"
|
|
||||||
echo APP=${APP}
|
|
||||||
#export JS="jsrun --nrs 32 -a4 -g4 -c42 -dpacked -b packed:7 --smpiargs="-gpu" "
|
|
||||||
export JS="jsrun --nrs 1 -a4 -g4 -c42 -dpacked -b packed:10 --smpiargs="-gpu" "
|
|
||||||
$JS $APP
|
|
||||||
|
|
||||||
#sample in.param
|
|
||||||
|
|
||||||
boundary_phase 0 1 0
|
|
||||||
boundary_phase 1 1 0
|
|
||||||
boundary_phase 2 1 0
|
|
||||||
boundary_phase 3 -1 0
|
|
||||||
|
|
||||||
omega 0 0.5 0
|
|
||||||
omega 1 0.5 0
|
|
||||||
omega 2 0.5 0
|
|
||||||
omega 3 0.5 0
|
|
||||||
omega 4 0.5 0
|
|
||||||
omega 5 0.5 0
|
|
||||||
omega 6 0.5 0
|
|
||||||
omega 7 0.5 0
|
|
||||||
omega 8 0.5 0
|
|
||||||
omega 9 0.5 0
|
|
||||||
omega 10 0.5 0
|
|
||||||
omega 11 0.5 0
|
|
||||||
|
|
||||||
|
|
||||||
#output
|
|
||||||
|
|
||||||
Grid : Message : 1.717474 s : Gauge Configuration ckpoint_lat.IEEE64BIG.2000
|
|
||||||
Grid : Message : 1.717478 s : boundary_phase[0] = (1,0)
|
|
||||||
Grid : Message : 1.717497 s : boundary_phase[1] = (1,0)
|
|
||||||
Grid : Message : 1.717500 s : boundary_phase[2] = (1,0)
|
|
||||||
Grid : Message : 1.717503 s : boundary_phase[3] = (-1,0)
|
|
||||||
Grid : Message : 1.717506 s : Ls 12
|
|
||||||
Grid : Message : 1.717507 s : mass 0.01
|
|
||||||
Grid : Message : 1.717510 s : M5 1.8
|
|
||||||
Grid : Message : 1.717512 s : mob_b 1.5
|
|
||||||
Grid : Message : 1.717514 s : omega[0] = (0.5,0)
|
|
||||||
Grid : Message : 1.717517 s : omega[1] = (0.5,0)
|
|
||||||
Grid : Message : 1.717520 s : omega[2] = (0.5,0)
|
|
||||||
Grid : Message : 1.717523 s : omega[3] = (0.5,0)
|
|
||||||
Grid : Message : 1.717526 s : omega[4] = (0.5,0)
|
|
||||||
Grid : Message : 1.717529 s : omega[5] = (0.5,0)
|
|
||||||
Grid : Message : 1.717532 s : omega[6] = (0.5,0)
|
|
||||||
Grid : Message : 1.717535 s : omega[7] = (0.5,0)
|
|
||||||
Grid : Message : 1.717538 s : omega[8] = (0.5,0)
|
|
||||||
Grid : Message : 1.717541 s : omega[9] = (0.5,0)
|
|
||||||
Grid : Message : 1.717544 s : omega[10] = (0.5,0)
|
|
||||||
Grid : Message : 1.717547 s : omega[11] = (0.5,0)
|
|
||||||
Grid : Message : 1.717550 s : Nu 4
|
|
||||||
Grid : Message : 1.717551 s : Nk 128
|
|
||||||
Grid : Message : 1.717552 s : Np 16
|
|
||||||
Grid : Message : 1.717553 s : Nm 288
|
|
||||||
Grid : Message : 1.717554 s : Nstop 100
|
|
||||||
Grid : Message : 1.717555 s : Ntest 25
|
|
||||||
Grid : Message : 1.717557 s : MaxIter 10
|
|
||||||
Grid : Message : 1.717558 s : resid 1e-05
|
|
||||||
Grid : Message : 1.717560 s : Cheby Poly 0.007,7,51
|
|
||||||
|
|
||||||
|
|
@ -1,403 +0,0 @@
|
|||||||
/*************************************************************************************
|
|
||||||
|
|
||||||
Grid physics library, www.github.com/paboyle/Grid
|
|
||||||
|
|
||||||
Source file: ./tests/Test_dwf_block_lanczos.cc
|
|
||||||
|
|
||||||
Copyright (C) 2015
|
|
||||||
|
|
||||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation; either version 2 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along
|
|
||||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
|
||||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
||||||
|
|
||||||
See the full license in the file "LICENSE" in the top level distribution directory
|
|
||||||
*************************************************************************************/
|
|
||||||
/* END LEGAL */
|
|
||||||
#include <Grid/Grid.h>
|
|
||||||
#include <Grid/util/Init.h>
|
|
||||||
#include <Grid/algorithms/iterative/ImplicitlyRestartedBlockLanczos.h>
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
using namespace Grid;
|
|
||||||
//using namespace Grid::QCD;
|
|
||||||
|
|
||||||
//typedef typename GparityDomainWallFermionR::FermionField FermionField;
|
|
||||||
typedef typename ZMobiusFermionF::FermionField FermionField;
|
|
||||||
|
|
||||||
RealD AllZero(RealD x){ return 0.;}
|
|
||||||
|
|
||||||
class CmdJobParams
|
|
||||||
{
|
|
||||||
public:
|
|
||||||
std::string gaugefile;
|
|
||||||
|
|
||||||
int Ls;
|
|
||||||
double mass;
|
|
||||||
double M5;
|
|
||||||
double mob_b;
|
|
||||||
std::vector<ComplexD> omega;
|
|
||||||
std::vector<Complex> boundary_phase;
|
|
||||||
std::vector<int> mpi_split;
|
|
||||||
|
|
||||||
LanczosType Impl;
|
|
||||||
int Nu;
|
|
||||||
int Nk;
|
|
||||||
int Np;
|
|
||||||
int Nm;
|
|
||||||
int Nstop;
|
|
||||||
int Ntest;
|
|
||||||
int MaxIter;
|
|
||||||
double resid;
|
|
||||||
|
|
||||||
double low;
|
|
||||||
double high;
|
|
||||||
int order;
|
|
||||||
|
|
||||||
CmdJobParams()
|
|
||||||
: gaugefile("Hot"),
|
|
||||||
Ls(8), mass(0.01), M5(1.8), mob_b(1.5),
|
|
||||||
Impl(LanczosType::irbl),mpi_split(4,1),
|
|
||||||
Nu(4), Nk(200), Np(200), Nstop(100), Ntest(1), MaxIter(10), resid(1.0e-8),
|
|
||||||
low(0.2), high(5.5), order(11)
|
|
||||||
{Nm=Nk+Np;};
|
|
||||||
|
|
||||||
void Parse(char **argv, int argc);
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
void CmdJobParams::Parse(char **argv,int argc)
|
|
||||||
{
|
|
||||||
std::string arg;
|
|
||||||
std::vector<int> vi;
|
|
||||||
double re,im;
|
|
||||||
int expect, idx;
|
|
||||||
std::string vstr;
|
|
||||||
std::ifstream pfile;
|
|
||||||
|
|
||||||
if( GridCmdOptionExists(argv,argv+argc,"--gconf") ){
|
|
||||||
gaugefile = GridCmdOptionPayload(argv,argv+argc,"--gconf");
|
|
||||||
}
|
|
||||||
|
|
||||||
if( GridCmdOptionExists(argv,argv+argc,"--phase") ){
|
|
||||||
arg = GridCmdOptionPayload(argv,argv+argc,"--phase");
|
|
||||||
pfile.open(arg);
|
|
||||||
assert(pfile);
|
|
||||||
expect = 0;
|
|
||||||
while( pfile >> vstr ) {
|
|
||||||
if ( vstr.compare("boundary_phase") == 0 ) {
|
|
||||||
pfile >> vstr;
|
|
||||||
GridCmdOptionInt(vstr,idx);
|
|
||||||
assert(expect==idx);
|
|
||||||
pfile >> vstr;
|
|
||||||
GridCmdOptionFloat(vstr,re);
|
|
||||||
pfile >> vstr;
|
|
||||||
GridCmdOptionFloat(vstr,im);
|
|
||||||
boundary_phase.push_back({re,im});
|
|
||||||
expect++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
pfile.close();
|
|
||||||
} else {
|
|
||||||
for (int i=0; i<4; ++i) boundary_phase.push_back({1.,0.});
|
|
||||||
}
|
|
||||||
|
|
||||||
if( GridCmdOptionExists(argv,argv+argc,"--omega") ){
|
|
||||||
arg = GridCmdOptionPayload(argv,argv+argc,"--omega");
|
|
||||||
pfile.open(arg);
|
|
||||||
assert(pfile);
|
|
||||||
Ls = 0;
|
|
||||||
while( pfile >> vstr ) {
|
|
||||||
if ( vstr.compare("omega") == 0 ) {
|
|
||||||
pfile >> vstr;
|
|
||||||
GridCmdOptionInt(vstr,idx);
|
|
||||||
assert(Ls==idx);
|
|
||||||
pfile >> vstr;
|
|
||||||
GridCmdOptionFloat(vstr,re);
|
|
||||||
pfile >> vstr;
|
|
||||||
GridCmdOptionFloat(vstr,im);
|
|
||||||
omega.push_back({re,im});
|
|
||||||
Ls++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
pfile.close();
|
|
||||||
} else {
|
|
||||||
if( GridCmdOptionExists(argv,argv+argc,"--Ls") ){
|
|
||||||
arg = GridCmdOptionPayload(argv,argv+argc,"--Ls");
|
|
||||||
GridCmdOptionInt(arg,Ls);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if( GridCmdOptionExists(argv,argv+argc,"--mass") ){
|
|
||||||
arg = GridCmdOptionPayload(argv,argv+argc,"--mass");
|
|
||||||
GridCmdOptionFloat(arg,mass);
|
|
||||||
}
|
|
||||||
|
|
||||||
if( GridCmdOptionExists(argv,argv+argc,"--M5") ){
|
|
||||||
arg = GridCmdOptionPayload(argv,argv+argc,"--M5");
|
|
||||||
GridCmdOptionFloat(arg,M5);
|
|
||||||
}
|
|
||||||
|
|
||||||
if( GridCmdOptionExists(argv,argv+argc,"--mob_b") ){
|
|
||||||
arg = GridCmdOptionPayload(argv,argv+argc,"--mob_b");
|
|
||||||
GridCmdOptionFloat(arg,mob_b);
|
|
||||||
}
|
|
||||||
|
|
||||||
if( GridCmdOptionExists(argv,argv+argc,"--irbl") ){
|
|
||||||
arg = GridCmdOptionPayload(argv,argv+argc,"--irbl");
|
|
||||||
GridCmdOptionIntVector(arg,vi);
|
|
||||||
Nu = vi[0];
|
|
||||||
Nk = vi[1];
|
|
||||||
Np = vi[2];
|
|
||||||
Nstop = vi[3];
|
|
||||||
MaxIter = vi[4];
|
|
||||||
// ypj[fixme] mode overriding message is needed.
|
|
||||||
Impl = LanczosType::irbl;
|
|
||||||
Nm = Nk+Np;
|
|
||||||
}
|
|
||||||
|
|
||||||
// block Lanczos with explicit extension of its dimensions
|
|
||||||
if( GridCmdOptionExists(argv,argv+argc,"--rbl") ){
|
|
||||||
arg = GridCmdOptionPayload(argv,argv+argc,"--rbl");
|
|
||||||
GridCmdOptionIntVector(arg,vi);
|
|
||||||
Nu = vi[0];
|
|
||||||
Nk = vi[1];
|
|
||||||
Np = vi[2]; // vector space is enlarged by adding Np vectors
|
|
||||||
Nstop = vi[3];
|
|
||||||
MaxIter = vi[4];
|
|
||||||
// ypj[fixme] mode overriding message is needed.
|
|
||||||
Impl = LanczosType::rbl;
|
|
||||||
Nm = Nk+Np*MaxIter;
|
|
||||||
}
|
|
||||||
|
|
||||||
#if 1
|
|
||||||
// block Lanczos with explicit extension of its dimensions
|
|
||||||
if( GridCmdOptionExists(argv,argv+argc,"--split") ){
|
|
||||||
arg = GridCmdOptionPayload(argv,argv+argc,"--split");
|
|
||||||
GridCmdOptionIntVector(arg,vi);
|
|
||||||
for(int i=0;i<mpi_split.size();i++)
|
|
||||||
mpi_split[i] = vi[i];
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if( GridCmdOptionExists(argv,argv+argc,"--check_int") ){
|
|
||||||
arg = GridCmdOptionPayload(argv,argv+argc,"--check_int");
|
|
||||||
GridCmdOptionInt(arg,Ntest);
|
|
||||||
}
|
|
||||||
|
|
||||||
if( GridCmdOptionExists(argv,argv+argc,"--resid") ){
|
|
||||||
arg = GridCmdOptionPayload(argv,argv+argc,"--resid");
|
|
||||||
GridCmdOptionFloat(arg,resid);
|
|
||||||
}
|
|
||||||
|
|
||||||
if( GridCmdOptionExists(argv,argv+argc,"--cheby_l") ){
|
|
||||||
arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_l");
|
|
||||||
GridCmdOptionFloat(arg,low);
|
|
||||||
}
|
|
||||||
|
|
||||||
if( GridCmdOptionExists(argv,argv+argc,"--cheby_u") ){
|
|
||||||
arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_u");
|
|
||||||
GridCmdOptionFloat(arg,high);
|
|
||||||
}
|
|
||||||
|
|
||||||
if( GridCmdOptionExists(argv,argv+argc,"--cheby_n") ){
|
|
||||||
arg = GridCmdOptionPayload(argv,argv+argc,"--cheby_n");
|
|
||||||
GridCmdOptionInt(arg,order);
|
|
||||||
}
|
|
||||||
|
|
||||||
if ( CartesianCommunicator::RankWorld() == 0 ) {
|
|
||||||
std::streamsize ss = std::cout.precision();
|
|
||||||
std::cout << GridLogMessage <<" Gauge Configuration "<< gaugefile << '\n';
|
|
||||||
std::cout.precision(15);
|
|
||||||
for ( int i=0; i<4; ++i ) std::cout << GridLogMessage <<" boundary_phase["<< i << "] = " << boundary_phase[i] << '\n';
|
|
||||||
std::cout.precision(ss);
|
|
||||||
std::cout << GridLogMessage <<" Ls "<< Ls << '\n';
|
|
||||||
std::cout << GridLogMessage <<" mass "<< mass << '\n';
|
|
||||||
std::cout << GridLogMessage <<" M5 "<< M5 << '\n';
|
|
||||||
std::cout << GridLogMessage <<" mob_b "<< mob_b << '\n';
|
|
||||||
std::cout.precision(15);
|
|
||||||
for ( int i=0; i<Ls; ++i ) std::cout << GridLogMessage <<" omega["<< i << "] = " << omega[i] << '\n';
|
|
||||||
std::cout.precision(ss);
|
|
||||||
std::cout << GridLogMessage <<" Nu "<< Nu << '\n';
|
|
||||||
std::cout << GridLogMessage <<" Nk "<< Nk << '\n';
|
|
||||||
std::cout << GridLogMessage <<" Np "<< Np << '\n';
|
|
||||||
std::cout << GridLogMessage <<" Nm "<< Nm << '\n';
|
|
||||||
std::cout << GridLogMessage <<" Nstop "<< Nstop << '\n';
|
|
||||||
std::cout << GridLogMessage <<" Ntest "<< Ntest << '\n';
|
|
||||||
std::cout << GridLogMessage <<" MaxIter "<< MaxIter << '\n';
|
|
||||||
std::cout << GridLogMessage <<" resid "<< resid << '\n';
|
|
||||||
std::cout << GridLogMessage <<" Cheby Poly "<< low << "," << high << "," << order << std::endl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
int main (int argc, char ** argv)
|
|
||||||
{
|
|
||||||
Grid_init(&argc,&argv);
|
|
||||||
|
|
||||||
CmdJobParams JP;
|
|
||||||
JP.Parse(argv,argc);
|
|
||||||
|
|
||||||
GridCartesian * UGridD = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
|
|
||||||
GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplexF::Nsimd()),GridDefaultMpi());
|
|
||||||
GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
|
|
||||||
GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(JP.Ls,UGrid);
|
|
||||||
GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(JP.Ls,UGrid);
|
|
||||||
|
|
||||||
std::vector<int> seeds4({1,2,3,4});
|
|
||||||
std::vector<int> seeds5({5,6,7,8});
|
|
||||||
GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
|
|
||||||
GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
|
|
||||||
// ypj [note] why seed RNG5 again? bug? In this case, run with a default seed().
|
|
||||||
GridParallelRNG RNG5rb(FrbGrid); RNG5rb.SeedFixedIntegers(seeds5);
|
|
||||||
|
|
||||||
LatticeGaugeField UmuD(UGridD);
|
|
||||||
LatticeGaugeFieldF Umu(UGrid);
|
|
||||||
std::vector<LatticeColourMatrixF> U(4,UGrid);
|
|
||||||
|
|
||||||
if ( JP.gaugefile.compare("Hot") == 0 ) {
|
|
||||||
SU3::HotConfiguration(RNG4, UmuD);
|
|
||||||
} else {
|
|
||||||
FieldMetaData header;
|
|
||||||
NerscIO::readConfiguration(UmuD,header,JP.gaugefile);
|
|
||||||
// ypj [fixme] additional checks for the loaded configuration?
|
|
||||||
}
|
|
||||||
precisionChange(Umu,UmuD);
|
|
||||||
|
|
||||||
for(int mu=0;mu<Nd;mu++){
|
|
||||||
U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
|
|
||||||
}
|
|
||||||
|
|
||||||
RealD mass = JP.mass;
|
|
||||||
RealD M5 = JP.M5;
|
|
||||||
|
|
||||||
// ypj [fixme] flexible support for a various Fermions
|
|
||||||
// RealD mob_b = JP.mob_b; // Gparity
|
|
||||||
// std::vector<ComplexD> omega; // ZMobius
|
|
||||||
|
|
||||||
// GparityMobiusFermionD ::ImplParams params;
|
|
||||||
// std::vector<int> twists({1,1,1,0});
|
|
||||||
// params.twists = twists;
|
|
||||||
// GparityMobiusFermionR Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,mob_b,mob_b-1.,params);
|
|
||||||
// SchurDiagTwoOperator<GparityMobiusFermionR,FermionField> HermOp(Ddwf);
|
|
||||||
|
|
||||||
|
|
||||||
// int mrhs = JP.Nu;
|
|
||||||
int Ndir=4;
|
|
||||||
auto mpi_layout = GridDefaultMpi();
|
|
||||||
std::vector<int> mpi_split (Ndir,1);
|
|
||||||
#if 0
|
|
||||||
int tmp=mrhs, dir=0;
|
|
||||||
std::cout << GridLogMessage << "dir= "<<dir <<"tmp= "<<tmp<<"mpi_split= "<<mpi_split[dir]<<"mpi_layout= "<<mpi_split[dir]<<std::endl;
|
|
||||||
while ( tmp> 1) {
|
|
||||||
if ((mpi_split[dir]*2) <= mpi_layout[dir]){
|
|
||||||
mpi_split[dir] *=2;
|
|
||||||
tmp = tmp/2;
|
|
||||||
}
|
|
||||||
std::cout << GridLogMessage << "dir= "<<dir <<"tmp= "<<tmp<<"mpi_split= "<<mpi_split[dir]<<"mpi_layout= "<<mpi_layout[dir]<<std::endl;
|
|
||||||
dir = (dir+1)%Ndir;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
int mrhs=1;
|
|
||||||
for(int i =0;i<Ndir;i++){
|
|
||||||
mpi_split[i] = mpi_layout[i] / JP.mpi_split[i] ;
|
|
||||||
mrhs *= JP.mpi_split[i];
|
|
||||||
}
|
|
||||||
std::cout << GridLogMessage << "mpi_layout= " << mpi_layout << std::endl;
|
|
||||||
std::cout << GridLogMessage << "mpi_split= " << mpi_split << std::endl;
|
|
||||||
std::cout << GridLogMessage << "mrhs= " << mrhs << std::endl;
|
|
||||||
// assert(JP.Nu==tmp);
|
|
||||||
|
|
||||||
/////////////////////////////////////////////
|
|
||||||
// Split into 1^4 mpi communicators
|
|
||||||
/////////////////////////////////////////////
|
|
||||||
GridCartesian * SGrid = new GridCartesian(GridDefaultLatt(),
|
|
||||||
GridDefaultSimd(Nd,vComplexF::Nsimd()),
|
|
||||||
mpi_split,
|
|
||||||
*UGrid);
|
|
||||||
|
|
||||||
GridCartesian * SFGrid = SpaceTimeGrid::makeFiveDimGrid(JP.Ls,SGrid);
|
|
||||||
GridRedBlackCartesian * SrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(SGrid);
|
|
||||||
GridRedBlackCartesian * SFrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(JP.Ls,SGrid);
|
|
||||||
|
|
||||||
LatticeGaugeFieldF s_Umu(SGrid);
|
|
||||||
Grid_split (Umu,s_Umu);
|
|
||||||
|
|
||||||
//WilsonFermionR::ImplParams params;
|
|
||||||
ZMobiusFermionF::ImplParams params;
|
|
||||||
params.overlapCommsCompute = true;
|
|
||||||
params.boundary_phases = JP.boundary_phase;
|
|
||||||
ZMobiusFermionF Ddwf(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,JP.omega,1.,0.,params);
|
|
||||||
// SchurDiagTwoOperator<ZMobiusFermionR,FermionField> HermOp(Ddwf);
|
|
||||||
SchurDiagOneOperator<ZMobiusFermionF,FermionField> HermOp(Ddwf);
|
|
||||||
ZMobiusFermionF Dsplit(s_Umu,*SFGrid,*SFrbGrid,*SGrid,*SrbGrid,mass,M5,JP.omega,1.,0.,params);
|
|
||||||
// SchurDiagTwoOperator<ZMobiusFermionR,FermionField> SHermOp(Dsplit);
|
|
||||||
SchurDiagOneOperator<ZMobiusFermionF,FermionField> SHermOp(Dsplit);
|
|
||||||
|
|
||||||
//std::vector<double> Coeffs { 0.,-1.};
|
|
||||||
// ypj [note] this may not be supported by some compilers
|
|
||||||
std::vector<double> Coeffs({ 0.,-1.});
|
|
||||||
Polynomial<FermionField> PolyX(Coeffs);
|
|
||||||
//Chebyshev<FermionField> Cheb(0.2,5.5,11);
|
|
||||||
Chebyshev<FermionField> Cheb(JP.low,JP.high,JP.order);
|
|
||||||
// Cheb.csv(std::cout);
|
|
||||||
ImplicitlyRestartedBlockLanczos<FermionField> IRBL(HermOp, SHermOp,
|
|
||||||
FrbGrid,SFrbGrid,mrhs,
|
|
||||||
Cheb,
|
|
||||||
JP.Nstop, JP.Ntest,
|
|
||||||
JP.Nu, JP.Nk, JP.Nm,
|
|
||||||
JP.resid,
|
|
||||||
JP.MaxIter,
|
|
||||||
IRBLdiagonaliseWithEigen);
|
|
||||||
// IRBLdiagonaliseWithLAPACK);
|
|
||||||
IRBL.split_test=0;
|
|
||||||
|
|
||||||
std::vector<RealD> eval(JP.Nm);
|
|
||||||
|
|
||||||
std::vector<FermionField> src(JP.Nu,FrbGrid);
|
|
||||||
if (0)
|
|
||||||
{
|
|
||||||
// in case RNG is too slow
|
|
||||||
std::cout << GridLogMessage << "Using RNG5"<<std::endl;
|
|
||||||
FermionField src_tmp(FGrid);
|
|
||||||
for ( int i=0; i<JP.Nu; ++i ){
|
|
||||||
// gaussian(RNG5,src_tmp);
|
|
||||||
ComplexD rnd;
|
|
||||||
RealD re;
|
|
||||||
fillScalar(re,RNG5._gaussian[0],RNG5._generators[0]);
|
|
||||||
std::cout << i <<" / "<< JP.Nm <<" re "<< re << std::endl;
|
|
||||||
// printf("%d / %d re %e\n",i,FGrid->_processor,re);
|
|
||||||
src_tmp=re;
|
|
||||||
pickCheckerboard(Odd,src[i],src_tmp);
|
|
||||||
}
|
|
||||||
RNG5.Report();
|
|
||||||
} else {
|
|
||||||
std::cout << GridLogMessage << "Using RNG5rb"<<std::endl;
|
|
||||||
for ( int i=0; i<JP.Nu; ++i )
|
|
||||||
gaussian(RNG5rb,src[i]);
|
|
||||||
RNG5rb.Report();
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<FermionField> evec(JP.Nm,FrbGrid);
|
|
||||||
for(int i=0;i<1;++i){
|
|
||||||
std::cout << GridLogMessage << i <<" / "<< JP.Nm <<" grid pointer "<< evec[i].Grid() << std::endl;
|
|
||||||
};
|
|
||||||
|
|
||||||
int Nconv;
|
|
||||||
IRBL.calc(eval,evec,src,Nconv,JP.Impl);
|
|
||||||
|
|
||||||
|
|
||||||
Grid_finalize();
|
|
||||||
}
|
|
@ -1,27 +0,0 @@
|
|||||||
#!/bin/bash
# Run script for Cori (NERSC): block-Lanczos DWF eigensolver test on 16 KNL
# nodes. The shebang must be the very first line of the file (it was
# previously preceded by a comment, making it inert), and all #SBATCH
# directives must appear before the first executable command.
#SBATCH -t 30
#SBATCH -N 16
#SBATCH -C knl
#SBATCH -A mp13
#SBATCH --ntasks-per-node=1

# One MPI rank per node; fill each KNL with OpenMP threads.
export OMP_PROC_BIND=true
export OMP_PLACES=threads
export OMP_NUM_THREADS=64

#BIN=benchmarks/Benchmark_ITT
#OPT='--cpu-bind=cores'
BIN=./Test_dwf_block_lanczos
rundir=.

# Option groups; these are deliberately expanded UNQUOTED on the srun line
# so the shell word-splits each group back into individual arguments.
CONF='--gconf '${rundir}'/ckpoint_lat.IEEE64BIG.5000'
VOL='--grid 16.16.16.32'
GRID='--mpi 1.2.2.4'
OPT='--mass 0.00054 --M5 1.8 --phase in.params --omega in.params --shm 4096'
BL='--rbl 4.32.32.30.7 --split 1.2.2.1 --check_int 8 --resid 1.0e-5 --cheby_l 0.0027 --cheby_u 7 --cheby_n 51'

#echo srun $OPT $BIN $gridoptions $jobparams
#srun $OPT $BIN $gridoptions $jobparams

# Echo the full command for the job log, then launch.
echo srun $BIN $CONF $OPT $BL $VOL $GRID
srun $BIN $CONF $OPT $BL $VOL $GRID
|
|
@ -1,17 +0,0 @@
|
|||||||
boundary_phase 0 1 0
|
|
||||||
boundary_phase 1 1 0
|
|
||||||
boundary_phase 2 1 0
|
|
||||||
boundary_phase 3 -1 0
|
|
||||||
|
|
||||||
omega 0 0.375 0
|
|
||||||
omega 1 0.375 0
|
|
||||||
omega 2 0.375 0
|
|
||||||
omega 3 0.375 0
|
|
||||||
omega 4 0.375 0
|
|
||||||
omega 5 0.375 0
|
|
||||||
omega 6 0.375 0
|
|
||||||
omega 7 0.375 0
|
|
||||||
omega 8 0.375 0
|
|
||||||
omega 9 0.375 0
|
|
||||||
omega 10 0.375 0
|
|
||||||
omega 11 0.375 0
|
|
Reference in New Issue
Block a user