Mirror of https://github.com/paboyle/Grid.git (synced 2025-10-31 03:54:33 +00:00)

Compare commits: feature/lu ... feature/mp (39 commits)
| SHA1 |
|---|
| b820076b91 |
| 09f66100d3 |
| d7d92af09d |
| 460d0753a1 |
| 8f8058f8a5 |
| d97a27f483 |
| 7c3363b91e |
| b94478fa51 |
| 13bf0482e3 |
| a795b5705e |
| 392e064513 |
| b6a65059a2 |
| ea25a4d9ac |
| c190221fd3 |
| 0fcd2e7188 |
| 910b8dd6a1 |
| 75ebd3a0d1 |
| 09fd5c43a7 |
| f22317748f |
| 6a9eae6b6b |
| fad96cf250 |
| f331809c27 |
| 2c54a53d0a |
| 306160ad9a |
| 20a091c3ed |
| 202078eb1b |
| a762b1fb71 |
| 5b5925b8e5 |
| b58adc6a4b |
| f9d5e95d72 |
| 4f8e636a43 |
| 9b39f35ae6 |
| 5fe2b85cbd |
| c7cccaaa69 |
| cbcfea466f |
| 4955672fc3 |
| 39f1c880b8 |
| 8c043da5b7 |
| 3cbe974eb4 |

@@ -153,9 +153,10 @@ int main (int argc, char ** argv)
     std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
     std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NP<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
     err = ref-result; 
     std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
+    assert (norm2(err)< 1.0e-5 );
     Dw.Report();
   }
 
@@ -192,7 +193,7 @@ int main (int argc, char ** argv)
 
     std::cout<<GridLogMessage << "Called Dw s_inner "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
     std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NP<<std::endl;
+    std::cout<<GridLogMessage << "mflop/s per rank =  "<< flops/(t1-t0)/NP<<std::endl;
     sDw.Report();
   
     if(0){
@@ -208,8 +209,7 @@ int main (int argc, char ** argv)
 
     std::cout<<GridLogMessage<< "res norms "<< norm2(result)<<" " <<norm2(sresult)<<std::endl;
 
-    RealF sum=0;
+    RealD sum=0;
     for(int x=0;x<latt4[0];x++){
     for(int y=0;y<latt4[1];y++){
     for(int z=0;z<latt4[2];z++){
@@ -227,12 +227,12 @@ int main (int argc, char ** argv)
       }
     }}}}}
     std::cout<<GridLogMessage<<" difference between normal and simd is "<<sum<<std::endl;
+    assert (sum< 1.0e-5 );
 
 
     if (1) {
 
       LatticeFermion sr_eo(sFGrid);
-      LatticeFermion serr(sFGrid);
 
       LatticeFermion ssrc_e (sFrbGrid);
       LatticeFermion ssrc_o (sFrbGrid);
@@ -244,8 +244,6 @@ int main (int argc, char ** argv)
 
       setCheckerboard(sr_eo,ssrc_o);
       setCheckerboard(sr_eo,ssrc_e);
-      serr = sr_eo-ssrc; 
-      std::cout<<GridLogMessage << "EO src norm diff   "<< norm2(serr)<<std::endl;
 
       sr_e = zero;
       sr_o = zero;
@@ -263,7 +261,7 @@ int main (int argc, char ** argv)
       double flops=(1344.0*volume*ncall)/2;
 
       std::cout<<GridLogMessage << "sDeo mflop/s =   "<< flops/(t1-t0)<<std::endl;
-      std::cout<<GridLogMessage << "sDeo mflop/s per node   "<< flops/(t1-t0)/NP<<std::endl;
+      std::cout<<GridLogMessage << "sDeo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl;
       sDw.Report();
 
       sDw.DhopEO(ssrc_o,sr_e,DaggerNo);
@@ -273,9 +271,18 @@ int main (int argc, char ** argv)
       pickCheckerboard(Even,ssrc_e,sresult);
       pickCheckerboard(Odd ,ssrc_o,sresult);
       ssrc_e = ssrc_e - sr_e;
+      RealD error = norm2(ssrc_e);
+
       std::cout<<GridLogMessage << "sE norm diff   "<< norm2(ssrc_e)<< "  vec nrm"<<norm2(sr_e) <<std::endl;
       ssrc_o = ssrc_o - sr_o;
 
+      error+= norm2(ssrc_o);
       std::cout<<GridLogMessage << "sO norm diff   "<< norm2(ssrc_o)<< "  vec nrm"<<norm2(sr_o) <<std::endl;
+      if(error>1.0e-5) { 
+	setCheckerboard(ssrc,ssrc_o);
+	setCheckerboard(ssrc,ssrc_e);
+	std::cout<< ssrc << std::endl;
+      }
     }
 
 
@@ -307,7 +314,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
   err = ref-result; 
   std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
+  assert(norm2(err)<1.0e-5);
   LatticeFermion src_e (FrbGrid);
   LatticeFermion src_o (FrbGrid);
   LatticeFermion r_e   (FrbGrid);
@@ -334,7 +341,7 @@ int main (int argc, char ** argv)
     double flops=(1344.0*volume*ncall)/2;
 
     std::cout<<GridLogMessage << "Deo mflop/s =   "<< flops/(t1-t0)<<std::endl;
-    std::cout<<GridLogMessage << "Deo mflop/s per node   "<< flops/(t1-t0)/NP<<std::endl;
+    std::cout<<GridLogMessage << "Deo mflop/s per rank   "<< flops/(t1-t0)/NP<<std::endl;
     Dw.Report();
   }
   Dw.DhopEO(src_o,r_e,DaggerNo);
@@ -350,11 +357,14 @@ int main (int argc, char ** argv)
 
   err = r_eo-result; 
   std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
+  assert(norm2(err)<1.0e-5);
 
   pickCheckerboard(Even,src_e,err);
   pickCheckerboard(Odd,src_o,err);
   std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
   std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;
+  assert(norm2(src_e)<1.0e-5);
+  assert(norm2(src_o)<1.0e-5);
 
 
   }
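The benchmark hunks above do two things: the printed figures are relabelled from "per node" to "per rank", and the previously print-only norm checks become hard failures via `assert`. As a hedged illustration of that second pattern only (plain C++ with a stand-in `norm2()` over `std::vector`, not Grid code):

```cpp
// Sketch only: a stand-in norm2() replaces Grid's norm2(LatticeFermion). The point is
// that the benchmark now asserts on the residual instead of merely printing it.
#include <cassert>
#include <cstddef>
#include <iostream>
#include <vector>

static double norm2(const std::vector<double> &v) {
  double s = 0.0;
  for (double x : v) s += x * x;
  return s;
}

int main() {
  std::vector<double> ref(16, 1.0), result(16, 1.0), err(16);
  for (std::size_t i = 0; i < err.size(); i++) err[i] = ref[i] - result[i];
  std::cout << "norm diff   " << norm2(err) << std::endl;
  assert(norm2(err) < 1.0e-5);   // new behaviour: fail loudly when the reference check breaks
  return 0;
}
```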

@@ -244,6 +244,9 @@ case ${ac_COMMS} in
     mpi)
       AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
     ;;
+    mpi3)
+      AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
+    ;;
     shmem)
       AC_DEFINE([GRID_COMMS_SHMEM],[1],[GRID_COMMS_SHMEM] )
     ;;
@@ -253,6 +256,7 @@ case ${ac_COMMS} in
 esac
 AM_CONDITIONAL(BUILD_COMMS_SHMEM,[ test "X${ac_COMMS}X" == "XshmemX" ])
 AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" || test "X${ac_COMMS}X" == "Xmpi-autoX" ])
+AM_CONDITIONAL(BUILD_COMMS_MPI3,[ test "X${ac_COMMS}X" == "Xmpi3X"] )
 AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])
 
 ############### RNG selection

@@ -40,14 +40,6 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <mm_malloc.h>
 #endif
 
-#ifdef GRID_COMMS_SHMEM
-extern "C" { 
-#include <mpp/shmem.h>
-extern void * shmem_align(size_t, size_t);
-extern void  shmem_free(void *);
-}
-#endif
-
 namespace Grid {
 
 ////////////////////////////////////////////////////////////////////
@@ -65,53 +57,21 @@ public:
   typedef _Tp        value_type;
 
   template<typename _Tp1>  struct rebind { typedef alignedAllocator<_Tp1> other; };
-
   alignedAllocator() throw() { }
-
   alignedAllocator(const alignedAllocator&) throw() { }
-
   template<typename _Tp1> alignedAllocator(const alignedAllocator<_Tp1>&) throw() { }
-
   ~alignedAllocator() throw() { }
-
   pointer       address(reference __x)       const { return &__x; }
-  //  const_pointer address(const_reference __x) const { return &__x; }
-
   size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
 
   pointer allocate(size_type __n, const void* _p= 0)
   { 
-#ifdef GRID_COMMS_SHMEM
-    _Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64);
-
-#define PARANOID_SYMMETRIC_HEAP
-#ifdef PARANOID_SYMMETRIC_HEAP
-    static void * bcast;
-    static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
-
-    bcast = (void *) ptr;
-    shmem_broadcast32((void *)&bcast,(void *)&bcast,sizeof(void *)/4,0,0,0,shmem_n_pes(),psync);
-
-    if ( bcast != ptr ) {
-      std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout);
-      BACKTRACEFILE();
-      exit(0);
-    }
-
-    assert( bcast == (void *) ptr);
-
-#endif 
-#else
-
 #ifdef HAVE_MM_MALLOC_H
     _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
 #else
     _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
 #endif
-
-#endif
     _Tp tmp;
 #ifdef GRID_NUMA
 #pragma omp parallel for schedule(static)
@@ -123,27 +83,109 @@ public:
   }
 
   void deallocate(pointer __p, size_type) { 
-#ifdef GRID_COMMS_SHMEM
-    shmem_free((void *)__p);
-#else
 #ifdef HAVE_MM_MALLOC_H
     _mm_free((void *)__p); 
 #else
     free((void *)__p);
-#endif
 #endif
   }
   void construct(pointer __p, const _Tp& __val) { };
   void construct(pointer __p) { };
 
   void destroy(pointer __p) { };
 };
+template<typename _Tp>  inline bool operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
+template<typename _Tp>  inline bool operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
 
-template<typename _Tp>  inline bool
-operator==(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return true; }
-
-template<typename _Tp>  inline bool
-operator!=(const alignedAllocator<_Tp>&, const alignedAllocator<_Tp>&){ return false; }
-
+//////////////////////////////////////////////////////////////////////////////////////////
+// MPI3 : comms must use shm region
+// SHMEM: comms must use symmetric heap
+//////////////////////////////////////////////////////////////////////////////////////////
+#ifdef GRID_COMMS_SHMEM
+extern "C" { 
+#include <mpp/shmem.h>
+extern void * shmem_align(size_t, size_t);
+extern void  shmem_free(void *);
+}
+#define PARANOID_SYMMETRIC_HEAP
+#endif
+
+template<typename _Tp>
+class commAllocator {
+public: 
+  typedef std::size_t     size_type;
+  typedef std::ptrdiff_t  difference_type;
+  typedef _Tp*       pointer;
+  typedef const _Tp* const_pointer;
+  typedef _Tp&       reference;
+  typedef const _Tp& const_reference;
+  typedef _Tp        value_type;
+
+  template<typename _Tp1>  struct rebind { typedef commAllocator<_Tp1> other; };
+  commAllocator() throw() { }
+  commAllocator(const commAllocator&) throw() { }
+  template<typename _Tp1> commAllocator(const commAllocator<_Tp1>&) throw() { }
+  ~commAllocator() throw() { }
+  pointer       address(reference __x)       const { return &__x; }
+  size_type  max_size() const throw() { return size_t(-1) / sizeof(_Tp); }
+
+#ifdef GRID_COMMS_SHMEM
+  pointer allocate(size_type __n, const void* _p= 0)
+  {
+#ifdef CRAY
+    _Tp *ptr = (_Tp *) shmem_align(__n*sizeof(_Tp),64);
+#else
+    _Tp *ptr = (_Tp *) shmem_align(64,__n*sizeof(_Tp));
+#endif
+#ifdef PARANOID_SYMMETRIC_HEAP
+    static void * bcast;
+    static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
+
+    bcast = (void *) ptr;
+    shmem_broadcast32((void *)&bcast,(void *)&bcast,sizeof(void *)/4,0,0,0,shmem_n_pes(),psync);
+
+    if ( bcast != ptr ) {
+      std::printf("inconsistent alloc pe %d %lx %lx \n",shmem_my_pe(),bcast,ptr);std::fflush(stdout);
+      //      BACKTRACEFILE();
+      exit(0);
+    }
+    assert( bcast == (void *) ptr);
+#endif 
+    return ptr;
+  }
+  void deallocate(pointer __p, size_type) { 
+    shmem_free((void *)__p);
+  }
+#else
+  pointer allocate(size_type __n, const void* _p= 0) 
+  {
+#ifdef HAVE_MM_MALLOC_H
+    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
+#else
+    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
+#endif
+    return ptr;
+  }
+  void deallocate(pointer __p, size_type) { 
+#ifdef HAVE_MM_MALLOC_H
+    _mm_free((void *)__p); 
+#else
+    free((void *)__p);
+#endif
+  }
+#endif
+  void construct(pointer __p, const _Tp& __val) { };
+  void construct(pointer __p) { };
+  void destroy(pointer __p) { };
+};
+template<typename _Tp>  inline bool operator==(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return true; }
+template<typename _Tp>  inline bool operator!=(const commAllocator<_Tp>&, const commAllocator<_Tp>&){ return false; }
+
+////////////////////////////////////////////////////////////////////////////////
+// Template typedefs
+////////////////////////////////////////////////////////////////////////////////
+template<class T> using Vector     = std::vector<T,alignedAllocator<T> >;
+template<class T> using commVector = std::vector<T,commAllocator<T> >;
+template<class T> using Matrix     = std::vector<std::vector<T,alignedAllocator<T> > >;
     
 }; // namespace Grid
 #endif
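For orientation, the `Vector`, `commVector` and `Matrix` aliases added at the end of this hunk behave like `std::vector` with the corresponding allocator plugged in: aligned host memory for compute, and (when SHMEM comms are enabled) symmetric-heap memory for communication buffers. A minimal hedged sketch of their use; the umbrella header name is an assumption, and this is not code from the repository:

```cpp
// Sketch only: exercises the template typedefs declared above. Element values are left
// default-initialised because alignedAllocator::construct() is intentionally a no-op.
#include <Grid/Grid.h>   // assumed umbrella header providing the allocator typedefs

int main() {
  Grid::Vector<double>     aligned(1024);   // 128-byte aligned storage for kernels
  Grid::commVector<double> comms(1024);     // symmetric-heap storage when SHMEM comms are enabled
  Grid::Matrix<double>     m(4, Grid::Vector<double>(4));
  return (aligned.size() == 1024 && comms.size() == 1024 && m.size() == 4) ? 0 : 1;
}
```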

@@ -38,6 +38,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/cshift/Cshift_mpi.h>
 #endif 
 
+#ifdef GRID_COMMS_MPI3
+#include <Grid/cshift/Cshift_mpi.h>
+#endif 
+
 #ifdef GRID_COMMS_SHMEM
 #include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator
 #endif 

lib/FFT.h (27 lines changed)

@@ -200,18 +200,14 @@ namespace Grid {
 					       sign,FFTW_ESTIMATE);
 	}
 
-	double add,mul,fma;
-	FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
-	flops_call = add+mul+2.0*fma;
-
-	GridStopWatch timer;
-
+    std::vector<int> lcoor(Nd), gcoor(Nd);
 
 	// Barrel shift and collect global pencil
 	for(int p=0;p<processors[dim];p++) { 
 
 	  for(int idx=0;idx<sgrid->lSites();idx++) { 
 
-	    std::vector<int> lcoor(Nd);
+	    
     	    sgrid->LocalIndexToLocalCoor(idx,lcoor);
 
 	    sobj s;
@@ -228,14 +224,11 @@ namespace Grid {
 	
 	// Loop over orthog coords
 	int NN=pencil_g.lSites();
-
-	GridStopWatch Timer;
-	Timer.Start();
+	GridStopWatch timer;
+	timer.Start();
 
 PARALLEL_FOR_LOOP
 	for(int idx=0;idx<NN;idx++) {
 
-	  std::vector<int> lcoor(Nd);
 	  pencil_g.LocalIndexToLocalCoor(idx,lcoor);
 
 	  if ( lcoor[dim] == 0 ) {  // restricts loop to plane at lcoor[dim]==0
@@ -245,15 +238,17 @@ PARALLEL_FOR_LOOP
 	  }
 	}
 
-        Timer.Stop();
-	usec += Timer.useconds();
-	flops+= flops_call*NN;
-
+        timer.Stop();
+          double add,mul,fma;
+          FFTW<scalar>::fftw_flops(p,&add,&mul,&fma);
+          flops_call = add+mul+2.0*fma;
+          usec += timer.useconds();
+          flops+= flops_call*NN;
         int pc = processor_coor[dim];
         for(int idx=0;idx<sgrid->lSites();idx++) {
-	  std::vector<int> lcoor(Nd);
 	  sgrid->LocalIndexToLocalCoor(idx,lcoor);
-	  std::vector<int> gcoor = lcoor;
+	  gcoor = lcoor;
 	  // extract the result
 	  sobj s;
 	  gcoor[dim] = lcoor[dim]+L*pc;

lib/Init.cc (16 lines changed)

@@ -171,14 +171,17 @@ std::string GridCmdVectorIntToString(const std::vector<int> & vec){
 /////////////////////////////////////////////////////////
 //
 /////////////////////////////////////////////////////////
+static int Grid_is_initialised = 0;
+
+
 void Grid_init(int *argc,char ***argv)
 {
+  GridLogger::StopWatch.Start();
+
   CartesianCommunicator::Init(argc,argv);
 
   // Parse command line args.
 
-  GridLogger::StopWatch.Start();
-
   std::string arg;
   std::vector<std::string> logstreams;
   std::string defaultLog("Error,Warning,Message,Performance");
@@ -216,11 +219,14 @@ void Grid_init(int *argc,char ***argv)
   if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
     LebesgueOrder::UseLebesgueOrder=1;
   }
 
   if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
     arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
     GridCmdOptionIntVector(arg,LebesgueOrder::Block);
   }
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--timestamp") ){
+    GridLogTimestamp(1);
+  }
 
   GridParseLayout(*argv,*argc,
 		  Grid_default_latt,
 		  Grid_default_mpi);
@@ -274,12 +280,14 @@ void Grid_init(int *argc,char ***argv)
   std::cout << "GNU General Public License for more details."<<std::endl;
   std::cout << COL_BACKGROUND <<std::endl;
   std::cout << std::endl;
 
+  Grid_is_initialised = 1;
 }
 
   
 void Grid_finalize(void)
 {
-#ifdef GRID_COMMS_MPI
+#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
   MPI_Finalize();
   Grid_unquiesce_nodes();
 #endif
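The new `--timestamp` option simply calls the `GridLogTimestamp(int)` hook that the following hunks declare (Init header) and define (Log.cc). A hedged sketch of enabling it programmatically rather than on the command line; the umbrella header name is an assumption, and this is not code from the repository:

```cpp
// Sketch only: turn timestamped logging on from code instead of passing --timestamp.
#include <Grid/Grid.h>   // assumed umbrella header
#include <iostream>

int main(int argc, char **argv) {
  Grid::Grid_init(&argc, &argv);   // now also starts GridLogger::StopWatch first thing
  Grid::GridLogTimestamp(1);       // same effect as the new --timestamp command-line flag
  std::cout << Grid::GridLogMessage << "this line carries an elapsed-time stamp" << std::endl;
  Grid::Grid_finalize();
  return 0;
}
```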
@@ -33,6 +33,7 @@ namespace Grid {
 
   void Grid_init(int *argc,char ***argv);
   void Grid_finalize(void);
+
   // internal, controled with --handle
   void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
   void Grid_debug_handler_init(void);
@@ -44,6 +45,7 @@ namespace Grid {
   const std::vector<int> &GridDefaultMpi(void);
   const int              &GridThreads(void)  ;
   void                    GridSetThreads(int t) ;
+  void GridLogTimestamp(int);
 
   // Common parsing chores
   std::string GridCmdOptionPayload(char ** begin, char ** end, const std::string & option);
@@ -34,8 +34,13 @@ directory
 namespace Grid {
 
 GridStopWatch Logger::StopWatch;
+int Logger::timestamp;
 std::ostream Logger::devnull(0);
 
+void GridLogTimestamp(int on){
+  Logger::Timestamp(on);
+}
+
 Colours GridLogColours(0);
 GridLogger GridLogError(1, "Error", GridLogColours, "RED");
 GridLogger GridLogWarning(1, "Warning", GridLogColours, "YELLOW");
@@ -73,7 +78,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
 ////////////////////////////////////////////////////////////
 void Grid_quiesce_nodes(void) {
   int me = 0;
-#ifdef GRID_COMMS_MPI
+#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3)
   MPI_Comm_rank(MPI_COMM_WORLD, &me);
 #endif
 #ifdef GRID_COMMS_SHMEM

lib/Log.h (23 lines changed)

@@ -39,8 +39,9 @@
 
 namespace Grid {
 
+//////////////////////////////////////////////////////////////////////////////////////////////////
 // Dress the output; use std::chrono for time stamping via the StopWatch class
-int Rank(void); // used for early stage debug before library init
+//////////////////////////////////////////////////////////////////////////////////////////////////
 
 
 class Colours{
@@ -55,7 +56,6 @@ public:
 
   void Active(bool activate){
     is_active=activate;
-
     if (is_active){
      colour["BLACK"]  ="\033[30m";
      colour["RED"]    ="\033[31m";
@@ -77,10 +77,7 @@ public:
      colour["WHITE"] ="";
      colour["NORMAL"]="";
    }
-
-
  };
-
};
 
 
@@ -88,6 +85,7 @@ class Logger {
 protected:
   Colours &Painter;
   int active;
+  static int timestamp;
   std::string name, topName;
   std::string COLOUR;
 
@@ -99,8 +97,7 @@ public:
   std::string evidence() {return Painter.colour["YELLOW"];}
   std::string colour() {return Painter.colour[COLOUR];}
 
-  Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col)
-  : active(on),
+  Logger(std::string topNm, int on, std::string nm, Colours& col_class, std::string col)  : active(on),
     name(nm),
     topName(topNm),
     Painter(col_class),
@@ -108,16 +105,20 @@ public:
   
   void Active(int on) {active = on;};
   int  isActive(void) {return active;};
+  static void Timestamp(int on) {timestamp = on;};
   
   friend std::ostream& operator<< (std::ostream& stream, Logger& log){
 
     if ( log.active ) {
+      stream << log.background()<< log.topName << log.background()<< " : ";
+      stream << log.colour() <<std::setw(14) << std::left << log.name << log.background() << " : ";
+      if ( log.timestamp ) {
 	StopWatch.Stop();
 	GridTime now = StopWatch.Elapsed();
 	StopWatch.Start();
-      stream << log.background()<< log.topName << log.background()<< " : ";
-      stream << log.colour() <<std::setw(14) << std::left << log.name << log.background() << " : ";
-      stream << log.evidence()<< now << log.background() << " : " << log.colour();
+	stream << log.evidence()<< now << log.background() << " : " ;
+      }
+      stream << log.colour();
       return stream;
     } else { 
       return devnull;
@@ -149,7 +150,7 @@ extern void * Grid_backtrace_buffer[_NBACKTRACE];
 
 #define BACKTRACEFILE() {\
 char string[20];					\
-std::sprintf(string,"backtrace.%d",Rank());				\
+std::sprintf(string,"backtrace.%d",CartesianCommunicator::RankWorld()); \
 std::FILE * fp = std::fopen(string,"w");				\
 BACKTRACEFP(fp)\
 std::fclose(fp);	    \
@@ -1,14 +1,22 @@
 extra_sources=
 if BUILD_COMMS_MPI
   extra_sources+=communicator/Communicator_mpi.cc
+  extra_sources+=communicator/Communicator_base.cc
+endif
+
+if BUILD_COMMS_MPI3
+  extra_sources+=communicator/Communicator_mpi3.cc
+  extra_sources+=communicator/Communicator_base.cc
 endif
 
 if BUILD_COMMS_SHMEM
   extra_sources+=communicator/Communicator_shmem.cc
+  extra_sources+=communicator/Communicator_base.cc
 endif
 
 if BUILD_COMMS_NONE
   extra_sources+=communicator/Communicator_none.cc
+  extra_sources+=communicator/Communicator_base.cc
 endif
 
 #

lib/Stencil.h (267 lines changed)

@@ -70,20 +70,20 @@
 
 namespace Grid {
 
-template<class vobj,class cobj,class compressor> void 
-Gather_plane_simple_table_compute (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress, int off,std::vector<std::pair<int,int> >& table)
+inline void Gather_plane_simple_table_compute (GridBase *grid,int dimension,int plane,int cbmask,
+					       int off,std::vector<std::pair<int,int> > & table)
 {
   table.resize(0);
-  int rd = rhs._grid->_rdimensions[dimension];
+  int rd = grid->_rdimensions[dimension];
 
-  if ( !rhs._grid->CheckerBoarded(dimension) ) {
+  if ( !grid->CheckerBoarded(dimension) ) {
     cbmask = 0x3;
   }
-  int so= plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
-  int e1=rhs._grid->_slice_nblock[dimension];
-  int e2=rhs._grid->_slice_block[dimension];
+  int so= plane*grid->_ostride[dimension]; // base offset for start of plane 
+  int e1=grid->_slice_nblock[dimension];
+  int e2=grid->_slice_block[dimension];
 
-  int stride=rhs._grid->_slice_stride[dimension];
+  int stride=grid->_slice_stride[dimension];
   if ( cbmask == 0x3 ) { 
     table.resize(e1*e2);
     for(int n=0;n<e1;n++){
@@ -99,7 +99,7 @@ Gather_plane_simple_table_compute (const Lattice<vobj> &rhs,std::vector<cobj,ali
      for(int n=0;n<e1;n++){
        for(int b=0;b<e2;b++){
 	 int o  = n*stride;
-	 int ocb=1<<rhs._grid->CheckerBoardFromOindexTable(o+b);
+	 int ocb=1<<grid->CheckerBoardFromOindexTable(o+b);
 	 if ( ocb &cbmask ) {
 	   table[bo]=std::pair<int,int>(bo,o+b); bo++;
 	 }
@@ -109,8 +109,7 @@ Gather_plane_simple_table_compute (const Lattice<vobj> &rhs,std::vector<cobj,ali
 }
 
 template<class vobj,class cobj,class compressor> void 
-Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,
-			   compressor &compress, int off,int so)
+Gather_plane_simple_table (std::vector<std::pair<int,int> >& table,const Lattice<vobj> &rhs,cobj *buffer,compressor &compress, int off,int so)
 {
 PARALLEL_FOR_LOOP     
      for(int i=0;i<table.size();i++){
@@ -118,19 +117,6 @@ PARALLEL_FOR_LOOP
      }
 }
 
-template<class vobj,class cobj,class compressor> void 
-Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress, int off,
-			     double &t_table ,double & t_data )
-{
-  std::vector<std::pair<int,int> > table;
-  Gather_plane_simple_table_compute (rhs, buffer,dimension,plane,cbmask,compress,off,table);
-  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
-  Gather_plane_simple_table         (table,rhs,buffer,compress,off,so);
-}
-
 
 struct StencilEntry { 
   uint64_t _offset;
   uint64_t _byte_offset;
@@ -143,6 +129,7 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAl
 class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal fill in.
  public:
 
+  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
   typedef uint32_t StencilInteger;
   typedef typename cobj::vector_type vector_type;
   typedef typename cobj::scalar_type scalar_type;
@@ -158,7 +145,6 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAl
     Integer to_rank;
     Integer from_rank;
     Integer bytes;
-	 volatile Integer done;
   };
 
   std::vector<Packet> Packets;
@@ -166,81 +152,53 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAl
   int face_table_computed;
   std::vector<std::vector<std::pair<int,int> > > face_table ;
   
-#define SEND_IMMEDIATE
-#define SERIAL_SENDS
 
   void AddPacket(void *xmit,void * rcv, Integer to,Integer from,Integer bytes){
- #ifdef SEND_IMMEDIATE
-	 commtime-=usecond();
-	 _grid->SendToRecvFrom(xmit,to,rcv,from,bytes);
-	 commtime+=usecond();
- #endif
     Packet p;
     p.send_buf = xmit;
     p.recv_buf = rcv;
     p.to_rank  = to;
     p.from_rank= from;
     p.bytes    = bytes;
-	 p.done     = 0;
     comms_bytes+=2.0*bytes;
     Packets.push_back(p);
 
   }
 
- #ifdef SERIAL_SENDS
-       void Communicate(void ) { 
+  void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
+  {
+    reqs.resize(Packets.size());
     commtime-=usecond();
     for(int i=0;i<Packets.size();i++){
- #ifndef SEND_IMMEDIATE
-	   _grid->SendToRecvFrom(
+	_grid->StencilSendToRecvFromBegin(reqs[i],
 					  Packets[i].send_buf,
 					  Packets[i].to_rank,
 					  Packets[i].recv_buf,
 					  Packets[i].from_rank,
 					  Packets[i].bytes);
- #endif
-	   Packets[i].done = 1;
+	/*
+      }else{
+	_grid->SendToRecvFromBegin(reqs[i],
+				   Packets[i].send_buf,
+				   Packets[i].to_rank,
+				   Packets[i].recv_buf,
+				   Packets[i].from_rank,
+				   Packets[i].bytes);
+      }
+	*/
     }
     commtime+=usecond();
   }
- #else
-       void Communicate(void ) { 
-	 typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
-	 std::vector<std::vector<CommsRequest_t> > reqs(Packets.size());
+  void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
+  {
     commtime-=usecond();
-	 const int concurrency=2;
-	 for(int i=0;i<Packets.size();i+=concurrency){
-	   for(int ii=0;ii<concurrency;ii++){
-	     int j = i+ii;
-	     if ( j<Packets.size() ) {
- #ifndef SEND_IMMEDIATE
-	       _grid->SendToRecvFromBegin(reqs[j],
-					  Packets[j].send_buf,
-					  Packets[j].to_rank,
-					  Packets[j].recv_buf,
-					  Packets[j].from_rank,
-					  Packets[j].bytes);
- #endif
-	     }
-	   }
-	   for(int ii=0;ii<concurrency;ii++){
-	     int j = i+ii;
-	     if ( j<Packets.size() ) {
- #ifndef SEND_IMMEDIATE
-	       _grid->SendToRecvFromComplete(reqs[i]);
- #endif
-	     }
-	   }
-	   for(int ii=0;ii<concurrency;ii++){
-	     int j = i+ii;
-	     if ( j<Packets.size() ) {
-	       Packets[j].done = 1;
-	     }
-	   }
+    for(int i=0;i<Packets.size();i++){
+      //      if( ShmDirectCopy ) 
+	_grid->StencilSendToRecvFromComplete(reqs[i]);
+	//      else 
+	//	_grid->SendToRecvFromComplete(reqs[i]);
     }
     commtime+=usecond();
   }
- #endif
 
   ///////////////////////////////////////////
   // Simd merge queue for asynch comms
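In this hunk the spin-on-`done` immediate sends are replaced by an explicit begin/complete pair: `CommunicateBegin` posts every packet through `StencilSendToRecvFromBegin` and stores the requests, and `CommunicateComplete` later waits on them. For readers unfamiliar with the pattern, here is a minimal self-contained sketch of the same post-everything-then-wait structure written in plain MPI rather than Grid's communicator wrappers; it is an illustration only, not Grid code:

```cpp
// Plain-MPI sketch of the begin/complete structure adopted above; not Grid code.
#include <mpi.h>
#include <vector>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  int to   = (rank + 1) % size;          // one direction of a neighbour exchange
  int from = (rank - 1 + size) % size;

  std::vector<double> sendbuf(1024, rank), recvbuf(1024, 0.0);
  MPI_Request reqs[2];

  // "CommunicateBegin": post all transfers up front and keep the requests.
  MPI_Irecv(recvbuf.data(), (int)recvbuf.size(), MPI_DOUBLE, from, 0, MPI_COMM_WORLD, &reqs[0]);
  MPI_Isend(sendbuf.data(), (int)sendbuf.size(), MPI_DOUBLE, to,   0, MPI_COMM_WORLD, &reqs[1]);

  // ... local gather/merge work could overlap here ...

  // "CommunicateComplete": wait on the whole request set.
  MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);

  MPI_Finalize();
  return 0;
}
```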
@@ -260,36 +218,19 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAl
     m.rpointers= rpointers;
     m.buffer_size = buffer_size;
     m.packet_id   = packet_id;
- #ifdef SEND_IMMEDIATE
-	 mergetime-=usecond();
- PARALLEL_FOR_LOOP
-	 for(int o=0;o<m.buffer_size;o++){
-	   merge1(m.mpointer[o],m.rpointers,o);
-	 }
-	 mergetime+=usecond();
- #else
     Mergers.push_back(m);
- #endif
 
   }
 
   void CommsMerge(void ) { 
-	 //PARALLEL_NESTED_LOOP2 
     for(int i=0;i<Mergers.size();i++){	
       
-	 spintime-=usecond();
-	 int packet_id = Mergers[i].packet_id;
-	 while(! Packets[packet_id].done ); // spin for completion
-	 spintime+=usecond();
 
- #ifndef SEND_IMMEDIATE
       mergetime-=usecond();
 PARALLEL_FOR_LOOP
       for(int o=0;o<Mergers[i].buffer_size;o++){
 	merge1(Mergers[i].mpointer[o],Mergers[i].rpointers,o);
       }
       mergetime+=usecond();
- #endif
 
     }
   }
@@ -312,24 +253,19 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAl
   // Flat vector, change layout for cache friendly.
   Vector<StencilEntry>  _entries;
   
-       inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; }
 
   void PrecomputeByteOffsets(void){
     for(int i=0;i<_entries.size();i++){
       if( _entries[i]._is_local ) {
 	_entries[i]._byte_offset = _entries[i]._offset*sizeof(vobj);
       } else { 
-	     // PrecomputeByteOffsets [5] 16384/32768 140735768678528 140735781261056 2581581952
 	_entries[i]._byte_offset = _entries[i]._offset*sizeof(cobj);
       }
     }
   };
 
-       inline uint64_t Touch(int ent) {
-	 //	 _mm_prefetch((char *)&_entries[ent],_MM_HINT_T0);
-       }
+  inline StencilEntry * GetEntry(int &ptype,int point,int osite) { ptype = _permute_type[point]; return & _entries[point+_npoints*osite]; }
   inline uint64_t GetInfo(int &ptype,int &local,int &perm,int point,int ent,uint64_t base) {
-	 uint64_t cbase = (uint64_t)&comm_buf[0];
+    uint64_t cbase = (uint64_t)&u_recv_buf_p[0];
     local = _entries[ent]._is_local;
     perm  = _entries[ent]._permute;
     if (perm)  ptype = _permute_type[point]; 
@@ -340,20 +276,33 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAl
     }
   }
   inline uint64_t GetPFInfo(int ent,uint64_t base) {
-	 uint64_t cbase = (uint64_t)&comm_buf[0];
+    uint64_t cbase = (uint64_t)&u_recv_buf_p[0];
     int local = _entries[ent]._is_local;
     if (local) return  base + _entries[ent]._byte_offset;
     else       return cbase + _entries[ent]._byte_offset;
   }
   
-       // Comms buffers
-       std::vector<Vector<scalar_object> > u_simd_send_buf;
-       std::vector<Vector<scalar_object> > u_simd_recv_buf;
-       Vector<cobj>          u_send_buf;
-       Vector<cobj>          comm_buf;
+  ///////////////////////////////////////////////////////////
+  // Unified Comms buffers for all directions
+  ///////////////////////////////////////////////////////////
+  // Vectors that live on the symmetric heap in case of SHMEM
+  //  std::vector<commVector<scalar_object> > u_simd_send_buf_hide;
+  //  std::vector<commVector<scalar_object> > u_simd_recv_buf_hide;
+  //  commVector<cobj>          u_send_buf_hide;
+  //  commVector<cobj>          u_recv_buf_hide;
 
+  // These are used; either SHM objects or refs to the above symmetric heap vectors
+  // depending on comms target
+  cobj* u_recv_buf_p;
+  cobj* u_send_buf_p;
+  std::vector<scalar_object *> u_simd_send_buf;
+  std::vector<scalar_object *> u_simd_recv_buf;
+
   int u_comm_offset;
   int _unified_buffer_size;
   
+  cobj *CommBuf(void) { return u_recv_buf_p; }
+
   /////////////////////////////////////////
   // Timing info; ugly; possibly temporary
   /////////////////////////////////////////
@@ -435,7 +384,6 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAl
       int i = ii; // reverse direction to get SIMD comms done first
       int point = i;
       
-
       int dimension    = directions[i];
       int displacement = distances[i];
       int shift = displacement;
@@ -482,18 +430,25 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAl
 	}
       }
     }
-       u_send_buf.resize(_unified_buffer_size);
-       comm_buf.resize(_unified_buffer_size);
 
-       PrecomputeByteOffsets();
-
+    /////////////////////////////////////////////////////////////////////////////////
+    // Try to allocate for receiving in a shared memory region, fall back to buffer
+    /////////////////////////////////////////////////////////////////////////////////
     const int Nsimd = grid->Nsimd();
 
+    _grid->ShmBufferFreeAll();
 
     u_simd_send_buf.resize(Nsimd);
     u_simd_recv_buf.resize(Nsimd);
 
+    u_send_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
+    u_recv_buf_p=(cobj *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(cobj));
     for(int l=0;l<Nsimd;l++){
-	 u_simd_send_buf[l].resize(_unified_buffer_size);
-	 u_simd_recv_buf[l].resize(_unified_buffer_size);
+      u_simd_recv_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
+      u_simd_send_buf[l] = (scalar_object *)_grid->ShmBufferMalloc(_unified_buffer_size*sizeof(scalar_object));
     }
 
+    PrecomputeByteOffsets();
   }
 
   void Local     (int point, int dimension,int shiftpm,int cbmask)
@@ -717,38 +672,22 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAl
     }
   }
   
-       template<class compressor>
-       void HaloExchange(const Lattice<vobj> &source,compressor &compress) 
+  template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress) 
   {
+    std::vector<std::vector<CommsRequest_t> > reqs;
     calls++;
     Mergers.resize(0);
     Packets.resize(0);
+    _grid->StencilBarrier();
     HaloGather(source,compress);
-	 this->Communicate();
+    this->CommunicateBegin(reqs);
+    _grid->StencilBarrier();
+    this->CommunicateComplete(reqs);
+    _grid->StencilBarrier();
     CommsMerge(); // spins
   }
-#if 0
-       // Overlapping comms and compute typically slows down compute and  is useless
-       // unless memory bandwidth greatly exceeds network
-       template<class compressor>
-       std::thread HaloExchangeBegin(const Lattice<vobj> &source,compressor &compress) {
-	 Mergers.resize(0); 
-	 Packets.resize(0);
-	 HaloGather(source,compress);
-	 return std::thread([&] { this->Communicate(); });
-       }
-       void HaloExchangeComplete(std::thread &thr) 
-       {
-	 CommsMerge(); // spins
-	 jointime-=usecond();
-	 thr.join();
-	 jointime+=usecond();
-       }
-#endif
-       template<class compressor>
-       void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
+  
+  template<class compressor> void HaloGatherDir(const Lattice<vobj> &source,compressor &compress,int point,int & face_idx)
   {
     int dimension    = _directions[point];
     int displacement = _distances[point];
@@ -806,7 +745,6 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAl
     assert(source._grid==_grid);
     halogtime-=usecond();
     
-	 assert (comm_buf.size() == _unified_buffer_size );
     u_comm_offset=0;
     
     // Gather all comms buffers
| @@ -863,37 +801,48 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAl | |||||||
| 	if ( !face_table_computed ) { | 	if ( !face_table_computed ) { | ||||||
| 	  t_table-=usecond(); | 	  t_table-=usecond(); | ||||||
| 	  face_table.resize(face_idx+1); | 	  face_table.resize(face_idx+1); | ||||||
| 		 Gather_plane_simple_table_compute (rhs,u_send_buf,dimension,sx,cbmask,compress,u_comm_offset,face_table[face_idx]); | 	  Gather_plane_simple_table_compute ((GridBase *)_grid,dimension,sx,cbmask,u_comm_offset, | ||||||
|  | 					     face_table[face_idx]); | ||||||
| 	  t_table+=usecond(); | 	  t_table+=usecond(); | ||||||
| 	} | 	} | ||||||
| 	       t_data-=usecond(); |  | ||||||
| 	       Gather_plane_simple_table         (face_table[face_idx],rhs,u_send_buf,compress,u_comm_offset,so); |  | ||||||
| 	       face_idx++; |  | ||||||
| 	       t_data+=usecond(); |  | ||||||
| 	       gathertime+=usecond(); |  | ||||||
| 	 | 	 | ||||||
| 	       //	       Gather_plane_simple_stencil (rhs,u_send_buf,dimension,sx,cbmask,compress,u_comm_offset,t_table,t_data); |  | ||||||
| 	 | 	 | ||||||
| 	int rank           = _grid->_processor; | 	int rank           = _grid->_processor; | ||||||
| 	int recv_from_rank; | 	int recv_from_rank; | ||||||
| 	int xmit_to_rank; | 	int xmit_to_rank; | ||||||
| 	_grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); | 	_grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank); | ||||||
|  | 	 | ||||||
| 	assert (xmit_to_rank   != _grid->ThisRank()); | 	assert (xmit_to_rank   != _grid->ThisRank()); | ||||||
| 	assert (recv_from_rank != _grid->ThisRank()); | 	assert (recv_from_rank != _grid->ThisRank()); | ||||||
| 	 | 	 | ||||||
| 	       //      FIXME Implement asynchronous send & also avoid buffer copy | 	///////////////////////////////////////////////////////// | ||||||
| 	       AddPacket((void *)&u_send_buf[u_comm_offset], | 	// try the direct copy if possible | ||||||
| 			 (void *)  &comm_buf[u_comm_offset], | 	///////////////////////////////////////////////////////// | ||||||
|  |  | ||||||
|  |  | ||||||
|  | 	cobj *send_buf = (cobj *)_grid->ShmBufferTranslate(xmit_to_rank,u_recv_buf_p); | ||||||
|  | 	if ( send_buf==NULL ) {  | ||||||
|  | 	  send_buf = u_send_buf_p; | ||||||
|  | 	} | ||||||
|  | 	//	std::cout << " send_bufs  "<<std::hex<< send_buf <<" ubp "<<u_send_buf_p <<std::dec<<std::endl; | ||||||
|  | 	t_data-=usecond(); | ||||||
|  | 	assert(u_send_buf_p!=NULL); | ||||||
|  | 	assert(send_buf!=NULL); | ||||||
|  | 	Gather_plane_simple_table         (face_table[face_idx],rhs,send_buf,compress,u_comm_offset,so);  face_idx++; | ||||||
|  | 	t_data+=usecond(); | ||||||
|  | 	 | ||||||
|  | 	AddPacket((void *)&send_buf[u_comm_offset], | ||||||
|  | 		  (void *)&u_recv_buf_p[u_comm_offset], | ||||||
| 		  xmit_to_rank, | 		  xmit_to_rank, | ||||||
| 		  recv_from_rank, | 		  recv_from_rank, | ||||||
| 		  bytes); | 		  bytes); | ||||||
|  |  | ||||||
|  | 	gathertime+=usecond(); | ||||||
| 	u_comm_offset+=words; | 	u_comm_offset+=words; | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
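The essential change in the gather above is the choice of destination: when ShmBufferTranslate can resolve the equivalent of our receive buffer inside the destination rank's shared segment (i.e. the destination shares the node), the face is gathered straight into that rank's receive buffer; otherwise it is gathered into the local send buffer and the packet added below carries it over MPI. A small sketch of that decision (the template and parameter names are illustrative, not the Grid API):

    // shm_translate is any callable mimicking _grid->ShmBufferTranslate(rank, ptr):
    // it returns the peer's view of ptr for an on-node rank, or nullptr otherwise.
    template<class cobj, class Translate>
    cobj *choose_gather_target(Translate &&shm_translate,
                               int   xmit_to_rank,
                               cobj *u_recv_buf_p,    // our receive buffer
                               cobj *u_send_buf_p)    // fallback local send buffer
    {
      cobj *target = (cobj *)shm_translate(xmit_to_rank, (void *)u_recv_buf_p);
      if (target == nullptr) target = u_send_buf_p;   // off-node neighbour: MPI copies it later
      return target;
    }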
|    |    | ||||||
|  |  | ||||||
|   template<class compressor> |   template<class compressor> | ||||||
|   void  GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx) |   void  GatherSimd(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor &compress,int & face_idx) | ||||||
|   { |   { | ||||||
| @@ -974,10 +923,6 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAl | |||||||
| 	  auto rp = &u_simd_recv_buf[i       ][u_comm_offset]; | 	  auto rp = &u_simd_recv_buf[i       ][u_comm_offset]; | ||||||
| 	  auto sp = &u_simd_send_buf[nbr_lane][u_comm_offset]; | 	  auto sp = &u_simd_send_buf[nbr_lane][u_comm_offset]; | ||||||
| 	   | 	   | ||||||
| 		 void *vrp = (void *)rp; |  | ||||||
| 		 void *vsp = (void *)sp; |  | ||||||
|  |  | ||||||
|  |  | ||||||
| 	  if(nbr_proc){ | 	  if(nbr_proc){ | ||||||
| 	     | 	     | ||||||
| 	    int recv_from_rank; | 	    int recv_from_rank; | ||||||
| @@ -985,9 +930,17 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAl | |||||||
| 	     | 	     | ||||||
| 	    _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);  | 	    _grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank);  | ||||||
|   |   | ||||||
| 		   AddPacket( vsp,vrp,xmit_to_rank,recv_from_rank,bytes); | 	    scalar_object *shm = (scalar_object *) _grid->ShmBufferTranslate(recv_from_rank,sp); | ||||||
|  | 	    //	    if ((ShmDirectCopy==0)||(shm==NULL)) {  | ||||||
|  | 	    if (shm==NULL) {  | ||||||
|  | 	      shm = rp; | ||||||
|  | 	    }  | ||||||
| 	     | 	     | ||||||
| 		   rpointers[i] = rp; | 	    // if Direct, StencilSendToRecvFrom will suppress copy to a peer on node | ||||||
|  | 	    // assuming above pointer flip | ||||||
|  | 	    AddPacket((void *)sp,(void *)rp,xmit_to_rank,recv_from_rank,bytes); | ||||||
|  | 	     | ||||||
|  | 	    rpointers[i] = shm; | ||||||
| 	     | 	     | ||||||
| 	  } else {  | 	  } else {  | ||||||
| 	     | 	     | ||||||
| @@ -996,7 +949,7 @@ Gather_plane_simple_stencil (const Lattice<vobj> &rhs,std::vector<cobj,alignedAl | |||||||
| 	  } | 	  } | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	       AddMerge(&comm_buf[u_comm_offset],rpointers,buffer_size,Packets.size()-1); | 	AddMerge(&u_recv_buf_p[u_comm_offset],rpointers,buffer_size,Packets.size()-1); | ||||||
| 	 | 	 | ||||||
| 	u_comm_offset     +=buffer_size; | 	u_comm_offset     +=buffer_size; | ||||||
|       } |       } | ||||||
|   | |||||||
| @@ -127,6 +127,22 @@ class GridThread { | |||||||
|     ThreadBarrier(); |     ThreadBarrier(); | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|  |   static void bcopy(const void *src, void *dst, size_t len) { | ||||||
|  | #ifdef GRID_OMP | ||||||
|  | #pragma omp parallel  | ||||||
|  |     { | ||||||
|  |       const char *c_src =(char *) src; | ||||||
|  |       char *c_dest=(char *) dst; | ||||||
|  |       int me,mywork,myoff; | ||||||
|  |       GridThread::GetWorkBarrier(len,me, mywork,myoff); | ||||||
|  |       ::bcopy(&c_src[myoff],&c_dest[myoff],mywork); // libc bcopy; an unqualified call would recurse into this helper | ||||||
|  |     } | ||||||
|  | #else  | ||||||
|  |     ::bcopy(src,dst,len); | ||||||
|  | #endif | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |  | ||||||
| }; | }; | ||||||
|  |  | ||||||
| } | } | ||||||
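GridThread::bcopy above splits the byte range over the OpenMP team via GetWorkBarrier before copying each slice. A standalone sketch of an even work split of that kind (a hypothetical helper, not the exact Grid routine):

    #include <algorithm>
    #include <cstddef>

    // Divide nwork items over nthreads; thread 'me' gets [myoff, myoff + mywork).
    void split_work(size_t nwork, int nthreads, int me, size_t &mywork, size_t &myoff) {
      size_t chunk = (nwork + nthreads - 1) / nthreads;     // ceil(nwork / nthreads)
      myoff  = std::min(nwork, chunk * (size_t)me);
      mywork = std::min(nwork - myoff, chunk);
    }
    // e.g. split_work(10, 4, 3, mywork, myoff) yields myoff = 9, mywork = 1.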
|   | |||||||
| @@ -77,7 +77,7 @@ public: | |||||||
|     // GridCartesian / GridRedBlackCartesian |     // GridCartesian / GridRedBlackCartesian | ||||||
|     //////////////////////////////////////////////////////////////// |     //////////////////////////////////////////////////////////////// | ||||||
|     virtual int CheckerBoarded(int dim)=0; |     virtual int CheckerBoarded(int dim)=0; | ||||||
|     virtual int CheckerBoard(std::vector<int> site)=0; |     virtual int CheckerBoard(std::vector<int> &site)=0; | ||||||
|     virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0; |     virtual int CheckerBoardDestination(int source_cb,int shift,int dim)=0; | ||||||
|     virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0; |     virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0; | ||||||
|     virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0; |     virtual int CheckerBoardShiftForCB(int source_cb,int dim,int shift,int cb)=0; | ||||||
|   | |||||||
| @@ -49,7 +49,7 @@ public: | |||||||
|     virtual int CheckerBoarded(int dim){ |     virtual int CheckerBoarded(int dim){ | ||||||
|       return 0; |       return 0; | ||||||
|     } |     } | ||||||
|     virtual int CheckerBoard(std::vector<int> site){ |     virtual int CheckerBoard(std::vector<int> &site){ | ||||||
|         return 0; |         return 0; | ||||||
|     } |     } | ||||||
|     virtual int CheckerBoardDestination(int cb,int shift,int dim){ |     virtual int CheckerBoardDestination(int cb,int shift,int dim){ | ||||||
|   | |||||||
| @@ -49,7 +49,7 @@ public: | |||||||
|       if( dim==_checker_dim) return 1; |       if( dim==_checker_dim) return 1; | ||||||
|       else return 0; |       else return 0; | ||||||
|     } |     } | ||||||
|     virtual int CheckerBoard(std::vector<int> site){ |     virtual int CheckerBoard(std::vector<int> &site){ | ||||||
|       int linear=0; |       int linear=0; | ||||||
|       assert(site.size()==_ndimension); |       assert(site.size()==_ndimension); | ||||||
|       for(int d=0;d<_ndimension;d++){  |       for(int d=0;d<_ndimension;d++){  | ||||||
|   | |||||||
							
								
								
									
132  lib/communicator/Communicator_base.cc  Normal file
							| @@ -0,0 +1,132 @@ | |||||||
|  |     /************************************************************************************* | ||||||
|  |  | ||||||
|  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
|  |  | ||||||
|  |     Source file: ./lib/communicator/Communicator_base.cc | ||||||
|  |  | ||||||
|  |     Copyright (C) 2015 | ||||||
|  |  | ||||||
|  | Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||||
|  |  | ||||||
|  |     This program is free software; you can redistribute it and/or modify | ||||||
|  |     it under the terms of the GNU General Public License as published by | ||||||
|  |     the Free Software Foundation; either version 2 of the License, or | ||||||
|  |     (at your option) any later version. | ||||||
|  |  | ||||||
|  |     This program is distributed in the hope that it will be useful, | ||||||
|  |     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  |     GNU General Public License for more details. | ||||||
|  |  | ||||||
|  |     You should have received a copy of the GNU General Public License along | ||||||
|  |     with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|  |     *************************************************************************************/ | ||||||
|  |     /*  END LEGAL */ | ||||||
|  | #include "Grid.h" | ||||||
|  | namespace Grid { | ||||||
|  |  | ||||||
|  | /////////////////////////////////////////////////////////////// | ||||||
|  | // Info that is set up once and independent of cartesian layout | ||||||
|  | /////////////////////////////////////////////////////////////// | ||||||
|  | int CartesianCommunicator::ShmRank; | ||||||
|  | int CartesianCommunicator::ShmSize; | ||||||
|  | int CartesianCommunicator::GroupRank; | ||||||
|  | int CartesianCommunicator::GroupSize; | ||||||
|  | int CartesianCommunicator::WorldRank; | ||||||
|  | int CartesianCommunicator::WorldSize; | ||||||
|  | int CartesianCommunicator::Slave; | ||||||
|  | void *              CartesianCommunicator::ShmCommBuf; | ||||||
|  |  | ||||||
|  | ///////////////////////////////// | ||||||
|  | // Alloc, free shmem region | ||||||
|  | ///////////////////////////////// | ||||||
|  | void *CartesianCommunicator::ShmBufferMalloc(size_t bytes){ | ||||||
|  |   //  bytes = (bytes+sizeof(vRealD))&(~(sizeof(vRealD)-1));// align up bytes | ||||||
|  |   void *ptr = (void *)heap_top; | ||||||
|  |   heap_top  += bytes; | ||||||
|  |   heap_bytes+= bytes; | ||||||
|  |   std::cout <<"Shm alloc "<<ptr<<std::endl; | ||||||
|  |   assert(heap_bytes < MAX_MPI_SHM_BYTES); | ||||||
|  |   return ptr; | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::ShmBufferFreeAll(void) {  | ||||||
|  |   heap_top  =(size_t)ShmBufferSelf(); | ||||||
|  |   heap_bytes=0; | ||||||
|  | } | ||||||
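The commented-out line in ShmBufferMalloc hints at rounding each request up to the vector width; note that as written it adds sizeof(vRealD) rather than sizeof(vRealD)-1 before masking, so it would bump even already-aligned sizes. The conventional power-of-two round-up looks like this (a sketch, not part of the commit):

    #include <cstddef>

    // Round bytes up to the next multiple of align (align must be a power of two).
    static inline size_t align_up(size_t bytes, size_t align) {
      return (bytes + align - 1) & ~(align - 1);
    }
    // e.g. align_up(100, 32) == 128 and align_up(128, 32) == 128.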
|  |  | ||||||
|  | ///////////////////////////////// | ||||||
|  | // Grid information queries | ||||||
|  | ///////////////////////////////// | ||||||
|  | int                      CartesianCommunicator::IsBoss(void)            { return _processor==0; }; | ||||||
|  | int                      CartesianCommunicator::BossRank(void)          { return 0; }; | ||||||
|  | int                      CartesianCommunicator::ThisRank(void)          { return _processor; }; | ||||||
|  | const std::vector<int> & CartesianCommunicator::ThisProcessorCoor(void) { return _processor_coor; }; | ||||||
|  | const std::vector<int> & CartesianCommunicator::ProcessorGrid(void)     { return _processors; }; | ||||||
|  | int                      CartesianCommunicator::ProcessorCount(void)    { return _Nprocessors; }; | ||||||
|  |  | ||||||
|  | //////////////////////////////////////////////////////////////////////////////// | ||||||
|  | // very VERY rarely (Log, serial RNG) we need world without a grid | ||||||
|  | //////////////////////////////////////////////////////////////////////////////// | ||||||
|  | int  CartesianCommunicator::RankWorld(void){ return WorldRank; }; | ||||||
|  | int CartesianCommunicator::Ranks    (void) { return WorldSize; }; | ||||||
|  | int CartesianCommunicator::Nodes    (void) { return GroupSize; }; | ||||||
|  | int CartesianCommunicator::Cores    (void) { return ShmSize;   }; | ||||||
|  | int CartesianCommunicator::NodeRank (void) { return GroupRank; }; | ||||||
|  | int CartesianCommunicator::CoreRank (void) { return ShmRank;   }; | ||||||
|  |  | ||||||
|  | void CartesianCommunicator::GlobalSum(ComplexF &c) | ||||||
|  | { | ||||||
|  |   GlobalSumVector((float *)&c,2); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSumVector(ComplexF *c,int N) | ||||||
|  | { | ||||||
|  |   GlobalSumVector((float *)c,2*N); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSum(ComplexD &c) | ||||||
|  | { | ||||||
|  |   GlobalSumVector((double *)&c,2); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N) | ||||||
|  | { | ||||||
|  |   GlobalSumVector((double *)c,2*N); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #ifndef GRID_COMMS_MPI3 | ||||||
|  |  | ||||||
|  | void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
|  | 						       void *xmit, | ||||||
|  | 						       int xmit_to_rank, | ||||||
|  | 						       void *recv, | ||||||
|  | 						       int recv_from_rank, | ||||||
|  | 						       int bytes) | ||||||
|  | { | ||||||
|  |   SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall) | ||||||
|  | { | ||||||
|  |   SendToRecvFromComplete(waitall); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::StencilBarrier(void){}; | ||||||
|  |  | ||||||
|  | commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector; | ||||||
|  |  | ||||||
|  | void *CartesianCommunicator::ShmBufferSelf(void) { return ShmCommBuf; } | ||||||
|  |  | ||||||
|  | void *CartesianCommunicator::ShmBuffer(int rank) { | ||||||
|  |   return NULL; | ||||||
|  | } | ||||||
|  | void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {  | ||||||
|  |   return NULL; | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::ShmInitGeneric(void){ | ||||||
|  |   ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES); | ||||||
|  |   ShmCommBuf=(void *)&ShmBufStorageVector[0]; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #endif | ||||||
|  |    | ||||||
|  | } | ||||||
|  |  | ||||||
| @@ -34,77 +34,137 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #ifdef GRID_COMMS_MPI | #ifdef GRID_COMMS_MPI | ||||||
| #include <mpi.h> | #include <mpi.h> | ||||||
| #endif | #endif | ||||||
|  | #ifdef GRID_COMMS_MPI3 | ||||||
|  | #include <mpi.h> | ||||||
|  | #endif | ||||||
| #ifdef GRID_COMMS_SHMEM | #ifdef GRID_COMMS_SHMEM | ||||||
| #include <mpp/shmem.h> | #include <mpp/shmem.h> | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
| class CartesianCommunicator { | class CartesianCommunicator { | ||||||
|   public:     |   public:     | ||||||
|  |  | ||||||
|   // Communicator should know nothing of the physics grid, only processor grid. |   // 65536 ranks per node adequate for now | ||||||
|  |   // 128MB shared memory for comms enough for 48^4 local vol comms | ||||||
|  |   // Give external control (command line override?) of this | ||||||
|  |  | ||||||
|  |   static const int      MAXLOG2RANKSPERNODE = 16;             | ||||||
|  |   static const uint64_t MAX_MPI_SHM_BYTES   = 128*1024*1024;  | ||||||
|  |  | ||||||
|  |   // Communicator should know nothing of the physics grid, only processor grid. | ||||||
|   int              _Nprocessors;     // How many in all |   int              _Nprocessors;     // How many in all | ||||||
|   std::vector<int> _processors;      // Which dimensions get relayed out over processors lanes. |   std::vector<int> _processors;      // Which dimensions get relayed out over processors lanes. | ||||||
|   int              _processor;       // linear processor rank |   int              _processor;       // linear processor rank | ||||||
|   std::vector<int> _processor_coor;  // linear processor coordinate |   std::vector<int> _processor_coor;  // linear processor coordinate | ||||||
|   unsigned long _ndimension; |   unsigned long _ndimension; | ||||||
|  |  | ||||||
| #ifdef GRID_COMMS_MPI | #if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) | ||||||
|   MPI_Comm communicator; |   MPI_Comm communicator; | ||||||
|  |   static MPI_Comm communicator_world; | ||||||
|   typedef MPI_Request CommsRequest_t; |   typedef MPI_Request CommsRequest_t; | ||||||
| #else  | #else  | ||||||
|   typedef int CommsRequest_t; |   typedef int CommsRequest_t; | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  |   //////////////////////////////////////////////////////////////////// | ||||||
|  |   // Helper functionality for SHM Windows common to all other impls | ||||||
|  |   //////////////////////////////////////////////////////////////////// | ||||||
|  |   // Longer term; drop this in favour of a master / slave model with  | ||||||
|  |   // cartesian communicator on a subset of ranks, slave ranks controlled | ||||||
|  |   // by group leader with data xfer via shared memory | ||||||
|  |   //////////////////////////////////////////////////////////////////// | ||||||
|  | #ifdef  GRID_COMMS_MPI3 | ||||||
|  |   std::vector<int>  WorldDims; | ||||||
|  |   std::vector<int>  GroupDims; | ||||||
|  |   std::vector<int>  ShmDims; | ||||||
|  |    | ||||||
|  |   std::vector<int> GroupCoor; | ||||||
|  |   std::vector<int> ShmCoor; | ||||||
|  |   std::vector<int> WorldCoor; | ||||||
|  |    | ||||||
|  |   static std::vector<int> GroupRanks;  | ||||||
|  |   static std::vector<int> MyGroup; | ||||||
|  |   static int ShmSetup; | ||||||
|  |   static MPI_Win ShmWindow;  | ||||||
|  |   static MPI_Comm ShmComm; | ||||||
|  |    | ||||||
|  |   std::vector<int>  LexicographicToWorldRank; | ||||||
|  |    | ||||||
|  |   static std::vector<void *> ShmCommBufs; | ||||||
|  | #else  | ||||||
|  |   static void ShmInitGeneric(void); | ||||||
|  |   static commVector<uint8_t> ShmBufStorageVector; | ||||||
|  | #endif  | ||||||
|  |   static void * ShmCommBuf; | ||||||
|  |   size_t heap_top; | ||||||
|  |   size_t heap_bytes; | ||||||
|  |   void *ShmBufferSelf(void); | ||||||
|  |   void *ShmBuffer(int rank); | ||||||
|  |   void *ShmBufferTranslate(int rank,void * local_p); | ||||||
|  |   void *ShmBufferMalloc(size_t bytes); | ||||||
|  |   void ShmBufferFreeAll(void) ; | ||||||
|  |    | ||||||
|  |   //////////////////////////////////////////////// | ||||||
|  |   // Must call in Grid startup | ||||||
|  |   //////////////////////////////////////////////// | ||||||
|   static void Init(int *argc, char ***argv); |   static void Init(int *argc, char ***argv); | ||||||
|    |    | ||||||
|     // Constructor |   //////////////////////////////////////////////// | ||||||
|  |   // Constructor of any given grid | ||||||
|  |   //////////////////////////////////////////////// | ||||||
|   CartesianCommunicator(const std::vector<int> &pdimensions_in); |   CartesianCommunicator(const std::vector<int> &pdimensions_in); | ||||||
|    |    | ||||||
|     // Wraps MPI_Cart routines |   //////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   // Wraps MPI_Cart routines, or implements equivalent on other impls | ||||||
|  |   //////////////////////////////////////////////////////////////////////////////////////// | ||||||
|   void ShiftedRanks(int dim,int shift,int & source, int & dest); |   void ShiftedRanks(int dim,int shift,int & source, int & dest); | ||||||
|   int  RankFromProcessorCoor(std::vector<int> &coor); |   int  RankFromProcessorCoor(std::vector<int> &coor); | ||||||
|   void ProcessorCoorFromRank(int rank,std::vector<int> &coor); |   void ProcessorCoorFromRank(int rank,std::vector<int> &coor); | ||||||
|    |    | ||||||
|   ///////////////////////////////// |   ///////////////////////////////// | ||||||
|     // Grid information queries |   // Grid information and queries | ||||||
|   ///////////////////////////////// |   ///////////////////////////////// | ||||||
|     int                      IsBoss(void)            { return _processor==0; }; |   static int ShmRank; | ||||||
|     int                      BossRank(void)          { return 0; }; |   static int ShmSize; | ||||||
|     int                      ThisRank(void)          { return _processor; }; |   static int GroupSize; | ||||||
|     const std::vector<int> & ThisProcessorCoor(void) { return _processor_coor; }; |   static int GroupRank; | ||||||
|     const std::vector<int> & ProcessorGrid(void)     { return _processors; }; |   static int WorldRank; | ||||||
|     int                      ProcessorCount(void)    { return _Nprocessors; }; |   static int WorldSize; | ||||||
|  |   static int Slave; | ||||||
|  |    | ||||||
|  |   int                      IsBoss(void)            ; | ||||||
|  |   int                      BossRank(void)          ; | ||||||
|  |   int                      ThisRank(void)          ; | ||||||
|  |   const std::vector<int> & ThisProcessorCoor(void) ; | ||||||
|  |   const std::vector<int> & ProcessorGrid(void)     ; | ||||||
|  |   int                      ProcessorCount(void)    ; | ||||||
|  |   static int Ranks    (void); | ||||||
|  |   static int Nodes    (void); | ||||||
|  |   static int Cores    (void); | ||||||
|  |   static int NodeRank (void); | ||||||
|  |   static int CoreRank (void); | ||||||
|  |  | ||||||
|  |   //////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   // very VERY rarely (Log, serial RNG) we need world without a grid | ||||||
|  |   //////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   static int  RankWorld(void) ; | ||||||
|  |   static void BroadcastWorld(int root,void* data, int bytes); | ||||||
|    |    | ||||||
|   //////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////// | ||||||
|   // Reduction |   // Reduction | ||||||
|   //////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////// | ||||||
|   void GlobalSum(RealF &); |   void GlobalSum(RealF &); | ||||||
|   void GlobalSumVector(RealF *,int N); |   void GlobalSumVector(RealF *,int N); | ||||||
|  |  | ||||||
|   void GlobalSum(RealD &); |   void GlobalSum(RealD &); | ||||||
|   void GlobalSumVector(RealD *,int N); |   void GlobalSumVector(RealD *,int N); | ||||||
|  |  | ||||||
|   void GlobalSum(uint32_t &); |   void GlobalSum(uint32_t &); | ||||||
|   void GlobalSum(uint64_t &); |   void GlobalSum(uint64_t &); | ||||||
|  |   void GlobalSum(ComplexF &c); | ||||||
|     void GlobalSum(ComplexF &c) |   void GlobalSumVector(ComplexF *c,int N); | ||||||
|     { |   void GlobalSum(ComplexD &c); | ||||||
|       GlobalSumVector((float *)&c,2); |   void GlobalSumVector(ComplexD *c,int N); | ||||||
|     } |  | ||||||
|     void GlobalSumVector(ComplexF *c,int N) |  | ||||||
|     { |  | ||||||
|       GlobalSumVector((float *)c,2*N); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     void GlobalSum(ComplexD &c) |  | ||||||
|     { |  | ||||||
|       GlobalSumVector((double *)&c,2); |  | ||||||
|     } |  | ||||||
|     void GlobalSumVector(ComplexD *c,int N) |  | ||||||
|     { |  | ||||||
|       GlobalSumVector((double *)c,2*N); |  | ||||||
|     } |  | ||||||
|    |    | ||||||
|   template<class obj> void GlobalSum(obj &o){ |   template<class obj> void GlobalSum(obj &o){ | ||||||
|     typedef typename obj::scalar_type scalar_type; |     typedef typename obj::scalar_type scalar_type; | ||||||
| @@ -112,6 +172,7 @@ class CartesianCommunicator { | |||||||
|     scalar_type * ptr = (scalar_type *)& o; |     scalar_type * ptr = (scalar_type *)& o; | ||||||
|     GlobalSumVector(ptr,words); |     GlobalSumVector(ptr,words); | ||||||
|   } |   } | ||||||
|  |    | ||||||
|   //////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////// | ||||||
|   // Face exchange, buffer swap in translational invariant way |   // Face exchange, buffer swap in translational invariant way | ||||||
|   //////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////// | ||||||
| @@ -133,8 +194,19 @@ class CartesianCommunicator { | |||||||
| 			   void *recv, | 			   void *recv, | ||||||
| 			   int recv_from_rank, | 			   int recv_from_rank, | ||||||
| 			   int bytes); | 			   int bytes); | ||||||
|  |    | ||||||
|   void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall); |   void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall); | ||||||
|  |  | ||||||
|  |   void StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
|  | 				  void *xmit, | ||||||
|  | 				  int xmit_to_rank, | ||||||
|  | 				  void *recv, | ||||||
|  | 				  int recv_from_rank, | ||||||
|  | 				  int bytes); | ||||||
|  |    | ||||||
|  |   void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall); | ||||||
|  |   void StencilBarrier(void); | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////// | ||||||
|   // Barrier |   // Barrier | ||||||
|   //////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////// | ||||||
| @@ -144,13 +216,12 @@ class CartesianCommunicator { | |||||||
|   // Broadcast a buffer and composite larger |   // Broadcast a buffer and composite larger | ||||||
|   //////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////// | ||||||
|   void Broadcast(int root,void* data, int bytes); |   void Broadcast(int root,void* data, int bytes); | ||||||
|  |    | ||||||
|   template<class obj> void Broadcast(int root,obj &data) |   template<class obj> void Broadcast(int root,obj &data) | ||||||
|     { |     { | ||||||
|       Broadcast(root,(void *)&data,sizeof(data)); |       Broadcast(root,(void *)&data,sizeof(data)); | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
|     static void BroadcastWorld(int root,void* data, int bytes); |  | ||||||
|  |  | ||||||
| };  | };  | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -30,6 +30,12 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
|  |  | ||||||
|  | /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | // Info that is set up once and independent of cartesian layout | ||||||
|  | /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | MPI_Comm CartesianCommunicator::communicator_world; | ||||||
|  |  | ||||||
| // Should error check all MPI calls. | // Should error check all MPI calls. | ||||||
| void CartesianCommunicator::Init(int *argc, char ***argv) { | void CartesianCommunicator::Init(int *argc, char ***argv) { | ||||||
|   int flag; |   int flag; | ||||||
| @@ -37,12 +43,15 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { | |||||||
|   if ( !flag ) { |   if ( !flag ) { | ||||||
|     MPI_Init(argc,argv); |     MPI_Init(argc,argv); | ||||||
|   } |   } | ||||||
| } |   MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); | ||||||
|  |   MPI_Comm_rank(communicator_world,&WorldRank); | ||||||
|   int Rank(void) { |   MPI_Comm_size(communicator_world,&WorldSize); | ||||||
|     int pe; |   ShmRank=0; | ||||||
|     MPI_Comm_rank(MPI_COMM_WORLD,&pe); |   ShmSize=1; | ||||||
|     return pe; |   GroupRank=WorldRank; | ||||||
|  |   GroupSize=WorldSize; | ||||||
|  |   Slave    =0; | ||||||
|  |   ShmInitGeneric(); | ||||||
| } | } | ||||||
|  |  | ||||||
| CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | ||||||
| @@ -54,7 +63,7 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | |||||||
|   _processors = processors; |   _processors = processors; | ||||||
|   _processor_coor.resize(_ndimension); |   _processor_coor.resize(_ndimension); | ||||||
|    |    | ||||||
|   MPI_Cart_create(MPI_COMM_WORLD, _ndimension,&_processors[0],&periodic[0],1,&communicator); |   MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator); | ||||||
|   MPI_Comm_rank(communicator,&_processor); |   MPI_Comm_rank(communicator,&_processor); | ||||||
|   MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]); |   MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]); | ||||||
|  |  | ||||||
| @@ -67,7 +76,6 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | |||||||
|    |    | ||||||
|   assert(Size==_Nprocessors); |   assert(Size==_Nprocessors); | ||||||
| } | } | ||||||
|  |  | ||||||
| void CartesianCommunicator::GlobalSum(uint32_t &u){ | void CartesianCommunicator::GlobalSum(uint32_t &u){ | ||||||
|   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); |   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); | ||||||
|   assert(ierr==0); |   assert(ierr==0); | ||||||
| @@ -168,7 +176,6 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> & | |||||||
|   int nreq=list.size(); |   int nreq=list.size(); | ||||||
|   std::vector<MPI_Status> status(nreq); |   std::vector<MPI_Status> status(nreq); | ||||||
|   int ierr = MPI_Waitall(nreq,&list[0],&status[0]); |   int ierr = MPI_Waitall(nreq,&list[0],&status[0]); | ||||||
|  |  | ||||||
|   assert(ierr==0); |   assert(ierr==0); | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -187,14 +194,17 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes) | |||||||
| 		     communicator); | 		     communicator); | ||||||
|   assert(ierr==0); |   assert(ierr==0); | ||||||
| } | } | ||||||
|  |   /////////////////////////////////////////////////////// | ||||||
|  |   // Should only be used before Grid Init has finished. | ||||||
|  |   // Check for this? | ||||||
|  |   /////////////////////////////////////////////////////// | ||||||
| void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) | void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) | ||||||
| { | { | ||||||
|   int ierr= MPI_Bcast(data, |   int ierr= MPI_Bcast(data, | ||||||
| 		      bytes, | 		      bytes, | ||||||
| 		      MPI_BYTE, | 		      MPI_BYTE, | ||||||
| 		      root, | 		      root, | ||||||
| 		      MPI_COMM_WORLD); | 		      communicator_world); | ||||||
|   assert(ierr==0); |   assert(ierr==0); | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
574  lib/communicator/Communicator_mpi3.cc  Normal file
							| @@ -0,0 +1,574 @@ | |||||||
|  |     /************************************************************************************* | ||||||
|  |  | ||||||
|  |     Grid physics library, www.github.com/paboyle/Grid  | ||||||
|  |  | ||||||
|  |     Source file: ./lib/communicator/Communicator_mpi3.cc | ||||||
|  |  | ||||||
|  |     Copyright (C) 2015 | ||||||
|  |  | ||||||
|  | Author: Peter Boyle <paboyle@ph.ed.ac.uk> | ||||||
|  |  | ||||||
|  |     This program is free software; you can redistribute it and/or modify | ||||||
|  |     it under the terms of the GNU General Public License as published by | ||||||
|  |     the Free Software Foundation; either version 2 of the License, or | ||||||
|  |     (at your option) any later version. | ||||||
|  |  | ||||||
|  |     This program is distributed in the hope that it will be useful, | ||||||
|  |     but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  |     GNU General Public License for more details. | ||||||
|  |  | ||||||
|  |     You should have received a copy of the GNU General Public License along | ||||||
|  |     with this program; if not, write to the Free Software Foundation, Inc., | ||||||
|  |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||||||
|  |  | ||||||
|  |     See the full license in the file "LICENSE" in the top level distribution directory | ||||||
|  |     *************************************************************************************/ | ||||||
|  |     /*  END LEGAL */ | ||||||
|  | #include "Grid.h" | ||||||
|  | #include <mpi.h> | ||||||
|  |  | ||||||
|  | namespace Grid { | ||||||
|  |  | ||||||
|  |  | ||||||
|  | /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | // Info that is set up once and independent of cartesian layout | ||||||
|  | /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | int CartesianCommunicator::ShmSetup = 0; | ||||||
|  |  | ||||||
|  | MPI_Comm CartesianCommunicator::communicator_world; | ||||||
|  | MPI_Comm CartesianCommunicator::ShmComm; | ||||||
|  | MPI_Win  CartesianCommunicator::ShmWindow; | ||||||
|  |  | ||||||
|  | std::vector<int> CartesianCommunicator::GroupRanks;   | ||||||
|  | std::vector<int> CartesianCommunicator::MyGroup; | ||||||
|  | std::vector<void *> CartesianCommunicator::ShmCommBufs; | ||||||
|  |  | ||||||
|  | void *CartesianCommunicator::ShmBufferSelf(void) | ||||||
|  | { | ||||||
|  |   return ShmCommBufs[ShmRank]; | ||||||
|  | } | ||||||
|  | void *CartesianCommunicator::ShmBuffer(int rank) | ||||||
|  | { | ||||||
|  |   int gpeer = GroupRanks[rank]; | ||||||
|  |   if (gpeer == MPI_UNDEFINED){ | ||||||
|  |     return NULL; | ||||||
|  |   } else {  | ||||||
|  |     return ShmCommBufs[gpeer]; | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) | ||||||
|  | { | ||||||
|  |   int gpeer = GroupRanks[rank]; | ||||||
|  |   if (gpeer == MPI_UNDEFINED){ | ||||||
|  |     return NULL; | ||||||
|  |   } else {  | ||||||
|  |     uint64_t offset = (uint64_t)local_p - (uint64_t)ShmCommBufs[ShmRank]; | ||||||
|  |     uint64_t remote = (uint64_t)ShmCommBufs[gpeer]+offset; | ||||||
|  |     return (void *) remote; | ||||||
|  |   } | ||||||
|  | } | ||||||
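ShmBufferTranslate relies on every rank's segment being mapped locally at a (possibly different) base address: a local pointer's offset inside this rank's segment is simply reapplied to the peer's base. A tiny worked example with made-up addresses:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t my_base   = 0x7f0000000000ull;  // ShmCommBufs[ShmRank]  (hypothetical)
      uint64_t peer_base = 0x7f0008000000ull;  // ShmCommBufs[gpeer]    (hypothetical)
      uint64_t local_p   = my_base + 0x1234;   // somewhere inside my segment

      uint64_t offset = local_p - my_base;
      uint64_t remote = peer_base + offset;    // same object, seen through the peer's mapping
      assert(remote == peer_base + 0x1234);
      return 0;
    }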
|  |  | ||||||
|  | void CartesianCommunicator::Init(int *argc, char ***argv) { | ||||||
|  |   int flag; | ||||||
|  |   MPI_Initialized(&flag); // needed to coexist with other libs apparently | ||||||
|  |   if ( !flag ) { | ||||||
|  |     MPI_Init(argc,argv); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world); | ||||||
|  |   MPI_Comm_rank(communicator_world,&WorldRank); | ||||||
|  |   MPI_Comm_size(communicator_world,&WorldSize); | ||||||
|  |  | ||||||
|  |   ///////////////////////////////////////////////////////////////////// | ||||||
|  |   // Split into groups that can share memory | ||||||
|  |   ///////////////////////////////////////////////////////////////////// | ||||||
|  |   MPI_Comm_split_type(communicator_world, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&ShmComm); | ||||||
|  |   MPI_Comm_rank(ShmComm     ,&ShmRank); | ||||||
|  |   MPI_Comm_size(ShmComm     ,&ShmSize); | ||||||
|  |   GroupSize = WorldSize/ShmSize; | ||||||
|  |  | ||||||
|  |   ///////////////////////////////////////////////////////////////////// | ||||||
|  |   // find world ranks in our SHM group (i.e. which ranks are on our node) | ||||||
|  |   ///////////////////////////////////////////////////////////////////// | ||||||
|  |   MPI_Group WorldGroup, ShmGroup; | ||||||
|  |   MPI_Comm_group (communicator_world, &WorldGroup);  | ||||||
|  |   MPI_Comm_group (ShmComm, &ShmGroup); | ||||||
|  |    | ||||||
|  |   std::vector<int> world_ranks(WorldSize);  | ||||||
|  |   GroupRanks.resize(WorldSize);  | ||||||
|  |   MyGroup.resize(ShmSize); | ||||||
|  |   for(int r=0;r<WorldSize;r++) world_ranks[r]=r; | ||||||
|  |    | ||||||
|  |   MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &GroupRanks[0]);  | ||||||
|  |  | ||||||
|  |   /////////////////////////////////////////////////////////////////// | ||||||
|  |   // Identify who is in my group and nominate the leader | ||||||
|  |   /////////////////////////////////////////////////////////////////// | ||||||
|  |   int g=0; | ||||||
|  |   for(int rank=0;rank<WorldSize;rank++){ | ||||||
|  |     if(GroupRanks[rank]!=MPI_UNDEFINED){ | ||||||
|  |       assert(g<ShmSize); | ||||||
|  |       MyGroup[g++] = rank; | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   std::sort(MyGroup.begin(),MyGroup.end(),std::less<int>()); | ||||||
|  |   int myleader = MyGroup[0]; | ||||||
|  |    | ||||||
|  |   std::vector<int> leaders_1hot(WorldSize,0); | ||||||
|  |   std::vector<int> leaders_group(GroupSize,0); | ||||||
|  |   leaders_1hot [ myleader ] = 1; | ||||||
|  |      | ||||||
|  |   /////////////////////////////////////////////////////////////////// | ||||||
|  |   // global sum leaders over comm world | ||||||
|  |   /////////////////////////////////////////////////////////////////// | ||||||
|  |   int ierr=MPI_Allreduce(MPI_IN_PLACE,&leaders_1hot[0],WorldSize,MPI_INT,MPI_SUM,communicator_world); | ||||||
|  |   assert(ierr==0); | ||||||
|  |    | ||||||
|  |   /////////////////////////////////////////////////////////////////// | ||||||
|  |   // find the group leaders world rank | ||||||
|  |   /////////////////////////////////////////////////////////////////// | ||||||
|  |   int group=0; | ||||||
|  |   for(int l=0;l<WorldSize;l++){ | ||||||
|  |     if(leaders_1hot[l]){ | ||||||
|  |       leaders_group[group++] = l; | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   /////////////////////////////////////////////////////////////////// | ||||||
|  |   // Identify the rank of the group in which I (and my leader) live | ||||||
|  |   /////////////////////////////////////////////////////////////////// | ||||||
|  |   GroupRank=-1; | ||||||
|  |   for(int g=0;g<GroupSize;g++){ | ||||||
|  |     if (myleader == leaders_group[g]){ | ||||||
|  |       GroupRank=g; | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   assert(GroupRank!=-1); | ||||||
|  |    | ||||||
|  |   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   // allocate the shared window for our group | ||||||
|  |   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |    | ||||||
|  |   ShmCommBuf = 0; | ||||||
|  |   ierr = MPI_Win_allocate_shared(MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,ShmComm,&ShmCommBuf,&ShmWindow); | ||||||
|  |   assert(ierr==0); | ||||||
|  |   // KNL hack -- force to numa-domain 1 in flat | ||||||
|  | #if 0 | ||||||
|  |   //#include <numaif.h> | ||||||
|  |   for(uint64_t page=0;page<MAX_MPI_SHM_BYTES;page+=4096){ | ||||||
|  |     void *pages = (void *) ( page + ShmCommBuf ); | ||||||
|  |     int status; | ||||||
|  |     int flags=MPOL_MF_MOVE_ALL; | ||||||
|  |     int nodes=1; // numa domain == MCDRAM | ||||||
|  |     unsigned long count=1; | ||||||
|  |     ierr= move_pages(0,count, &pages,&nodes,&status,flags); | ||||||
|  |     if (ierr && (page==0)) perror("numa relocate command failed"); | ||||||
|  |   } | ||||||
|  | #endif | ||||||
|  |   MPI_Win_lock_all (MPI_MODE_NOCHECK, ShmWindow); | ||||||
|  |    | ||||||
|  |   ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   // Plan: allocate a fixed SHM region, used purely as scratch during stencil comms, with no per-call allocate/free. | ||||||
|  |   ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   ShmCommBufs.resize(ShmSize); | ||||||
|  |   for(int r=0;r<ShmSize;r++){ | ||||||
|  |     MPI_Aint sz; | ||||||
|  |     int dsp_unit; | ||||||
|  |     MPI_Win_shared_query (ShmWindow, r, &sz, &dsp_unit, &ShmCommBufs[r]); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   // Verbose for now | ||||||
|  |   ////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |   if (WorldRank == 0){ | ||||||
|  |     std::cout<<GridLogMessage<< "Grid MPI-3 configuration: detected "; | ||||||
|  |     std::cout<< WorldSize << " Ranks " ; | ||||||
|  |     std::cout<< GroupSize << " Nodes " ; | ||||||
|  |     std::cout<<  ShmSize  << " ranks per node"<<std::endl; | ||||||
|  |      | ||||||
|  |     std::cout<<GridLogMessage     <<"Grid MPI-3 configuration: allocated shared memory region of size "; | ||||||
|  |     std::cout<<std::hex << MAX_MPI_SHM_BYTES <<" ShmCommBuf address = "<<ShmCommBuf << std::dec<<std::endl; | ||||||
|  |  | ||||||
|  |     for(int g=0;g<GroupSize;g++){ | ||||||
|  |       std::cout<<GridLogMessage<<" Node "<<g<<" led by MPI rank "<<leaders_group[g]<<std::endl; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     std::cout<<GridLogMessage<<" Boss Node Shm Pointers are {"; | ||||||
|  |     for(int g=0;g<ShmSize;g++){ | ||||||
|  |       std::cout<<std::hex<<ShmCommBufs[g]<<std::dec; | ||||||
|  |       if(g!=ShmSize-1) std::cout<<","; | ||||||
|  |       else std::cout<<"}"<<std::endl; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   for(int g=0;g<GroupSize;g++){ | ||||||
|  |     if ( (ShmRank == 0) && (GroupRank==g) )  std::cout<<GridLogMessage<<"["<<g<<"] Node Group "<<g<<" is ranks {"; | ||||||
|  |     for(int r=0;r<ShmSize;r++){ | ||||||
|  |       if ( (ShmRank == 0) && (GroupRank==g) ) { | ||||||
|  | 	std::cout<<MyGroup[r]; | ||||||
|  | 	if(r<ShmSize-1) std::cout<<","; | ||||||
|  | 	else std::cout<<"}"<<std::endl; | ||||||
|  |       } | ||||||
|  |       MPI_Barrier(communicator_world); | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   assert(ShmSetup==0);  ShmSetup=1; | ||||||
|  | } | ||||||
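Init above does the full job of grouping ranks, electing node leaders and reporting the layout. Stripped to the MPI-3 shared-memory essentials, the window setup it performs amounts to the following standalone program (error handling omitted; the 1 MB size is arbitrary where Grid uses MAX_MPI_SHM_BYTES):

    #include <mpi.h>
    #include <vector>
    #include <cstdio>

    int main(int argc, char **argv) {
      MPI_Init(&argc, &argv);

      // On-node communicator: every rank in it can map the others' memory.
      MPI_Comm shm_comm;
      MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shm_comm);

      int shm_rank, shm_size;
      MPI_Comm_rank(shm_comm, &shm_rank);
      MPI_Comm_size(shm_comm, &shm_size);

      // One shared window; each rank contributes a segment of 'bytes'.
      const MPI_Aint bytes = 1 << 20;
      void *my_base = nullptr;
      MPI_Win win;
      MPI_Win_allocate_shared(bytes, 1, MPI_INFO_NULL, shm_comm, &my_base, &win);
      MPI_Win_lock_all(MPI_MODE_NOCHECK, win);

      // Query every on-node rank's segment base, as ShmCommBufs does.
      std::vector<void *> bases(shm_size);
      for (int r = 0; r < shm_size; r++) {
        MPI_Aint sz; int disp_unit;
        MPI_Win_shared_query(win, r, &sz, &disp_unit, &bases[r]);
      }
      std::printf("rank %d of %d on node, my segment at %p\n", shm_rank, shm_size, my_base);

      MPI_Win_unlock_all(win);
      MPI_Win_free(&win);
      MPI_Finalize();
      return 0;
    }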
|  |  | ||||||
|  |  | ||||||
|  | //////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | // Want to implement some magic ... Group sub-cubes into those on same node | ||||||
|  | //////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) | ||||||
|  | { | ||||||
|  |   std::vector<int> coor = _processor_coor; | ||||||
|  |  | ||||||
|  |   assert(std::abs(shift) <_processors[dim]); | ||||||
|  |  | ||||||
|  |   coor[dim] = (_processor_coor[dim] + shift + _processors[dim])%_processors[dim]; | ||||||
|  |   Lexicographic::IndexFromCoor(coor,source,_processors); | ||||||
|  |   source = LexicographicToWorldRank[source]; | ||||||
|  |  | ||||||
|  |   coor[dim] = (_processor_coor[dim] - shift + _processors[dim])%_processors[dim]; | ||||||
|  |   Lexicographic::IndexFromCoor(coor,dest,_processors); | ||||||
|  |   dest = LexicographicToWorldRank[dest]; | ||||||
|  | } | ||||||
|  | int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) | ||||||
|  | { | ||||||
|  |   int rank; | ||||||
|  |   Lexicographic::IndexFromCoor(coor,rank,_processors); | ||||||
|  |   rank = LexicographicToWorldRank[rank]; | ||||||
|  |   return rank; | ||||||
|  | } | ||||||
|  | void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor) | ||||||
|  | { | ||||||
|  |   Lexicographic::CoorFromIndex(coor,rank,_processors); | ||||||
|  |   rank = LexicographicToWorldRank[rank]; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | ||||||
|  | {  | ||||||
|  |   int ierr; | ||||||
|  |  | ||||||
|  |   communicator=communicator_world; | ||||||
|  |  | ||||||
|  |   _ndimension = processors.size(); | ||||||
|  |    | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   // Assert power of two shm_size. | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   int log2size = -1; | ||||||
|  |   for(int i=0;i<=MAXLOG2RANKSPERNODE;i++){   | ||||||
|  |     if ( (0x1<<i) == ShmSize ) { | ||||||
|  |       log2size = i; | ||||||
|  |       break; | ||||||
|  |     } | ||||||
|  |   } | ||||||
|  |   assert(log2size != -1); | ||||||
|  |    | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   // Identify subblock of ranks on node spreading across dims | ||||||
|  |   // in a maximally symmetrical way | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   int dim = 0; | ||||||
|  |    | ||||||
|  |   std::vector<int> WorldDims = processors; | ||||||
|  |  | ||||||
|  |   ShmDims.resize(_ndimension,1); | ||||||
|  |   GroupDims.resize(_ndimension); | ||||||
|  |      | ||||||
|  |   ShmCoor.resize(_ndimension); | ||||||
|  |   GroupCoor.resize(_ndimension); | ||||||
|  |   WorldCoor.resize(_ndimension); | ||||||
|  |  | ||||||
|  |   for(int l2=0;l2<log2size;l2++){ | ||||||
|  |     while ( WorldDims[dim] / ShmDims[dim] <= 1 ) dim=(dim+1)%_ndimension; | ||||||
|  |     ShmDims[dim]*=2; | ||||||
|  |     dim=(dim+1)%_ndimension; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   // Establish torus of processes and nodes with sub-blockings | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   for(int d=0;d<_ndimension;d++){ | ||||||
|  |     GroupDims[d] = WorldDims[d]/ShmDims[d]; | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   // Check processor counts match | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   _Nprocessors=1; | ||||||
|  |   _processors = processors; | ||||||
|  |   _processor_coor.resize(_ndimension); | ||||||
|  |   for(int i=0;i<_ndimension;i++){ | ||||||
|  |     _Nprocessors*=_processors[i]; | ||||||
|  |   } | ||||||
|  |   assert(WorldSize==_Nprocessors); | ||||||
|  |        | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   // Establish mapping between lexico physics coord and WorldRank | ||||||
|  |   //  | ||||||
|  |   //////////////////////////////////////////////////////////////// | ||||||
|  |   LexicographicToWorldRank.resize(WorldSize,0); | ||||||
|  |   Lexicographic::CoorFromIndex(GroupCoor,GroupRank,GroupDims); | ||||||
|  |   Lexicographic::CoorFromIndex(ShmCoor,ShmRank,ShmDims); | ||||||
|  |   for(int d=0;d<_ndimension;d++){ | ||||||
|  |     WorldCoor[d] = GroupCoor[d]*ShmDims[d]+ShmCoor[d]; | ||||||
|  |   } | ||||||
|  |   _processor_coor = WorldCoor; | ||||||
|  |  | ||||||
|  |   int lexico; | ||||||
|  |   Lexicographic::IndexFromCoor(WorldCoor,lexico,WorldDims); | ||||||
|  |   LexicographicToWorldRank[lexico]=WorldRank; | ||||||
|  |   _processor = lexico; | ||||||
|  |  | ||||||
|  |   /////////////////////////////////////////////////////////////////// | ||||||
|  |   // global sum Lexico to World mapping | ||||||
|  |   /////////////////////////////////////////////////////////////////// | ||||||
|  |   ierr=MPI_Allreduce(MPI_IN_PLACE,&LexicographicToWorldRank[0],WorldSize,MPI_INT,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  |    | ||||||
|  | }; | ||||||
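The constructor above rebuilds the physics-coordinate-to-world-rank map by composing node (Group) and on-node (Shm) coordinates and then indexing the result lexicographically. A self-contained worked example of the coordinate/index mapping, assuming the first dimension runs fastest (the exact convention lives in Grid's Lexicographic helpers):

    #include <cassert>
    #include <vector>

    static int IndexFromCoor(const std::vector<int> &coor, const std::vector<int> &dims) {
      int idx = 0, stride = 1;
      for (size_t d = 0; d < dims.size(); d++) { idx += coor[d] * stride; stride *= dims[d]; }
      return idx;
    }
    static void CoorFromIndex(std::vector<int> &coor, int idx, const std::vector<int> &dims) {
      coor.resize(dims.size());
      for (size_t d = 0; d < dims.size(); d++) { coor[d] = idx % dims[d]; idx /= dims[d]; }
    }

    int main() {
      std::vector<int> dims = {2, 2, 2, 4};      // e.g. a 2.2.2.4 processor grid
      std::vector<int> coor = {1, 0, 1, 3};
      int idx = IndexFromCoor(coor, dims);       // 1 + 0*2 + 1*4 + 3*8 = 29
      std::vector<int> back;
      CoorFromIndex(back, idx, dims);
      assert(back == coor);
      return 0;
    }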
|  |  | ||||||
|  | void CartesianCommunicator::GlobalSum(uint32_t &u){ | ||||||
|  |   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSum(uint64_t &u){ | ||||||
|  |   int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSum(float &f){ | ||||||
|  |   int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSumVector(float *f,int N) | ||||||
|  | { | ||||||
|  |   int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSum(double &d) | ||||||
|  | { | ||||||
|  |   int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  | void CartesianCommunicator::GlobalSumVector(double *d,int N) | ||||||
|  | { | ||||||
|  |   int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | // Basic Halo comms primitive | ||||||
|  | void CartesianCommunicator::SendToRecvFrom(void *xmit, | ||||||
|  | 					   int dest, | ||||||
|  | 					   void *recv, | ||||||
|  | 					   int from, | ||||||
|  | 					   int bytes) | ||||||
|  | { | ||||||
|  |   std::vector<CommsRequest_t> reqs(0); | ||||||
|  |   SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes); | ||||||
|  |   SendToRecvFromComplete(reqs); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void CartesianCommunicator::SendRecvPacket(void *xmit, | ||||||
|  | 					   void *recv, | ||||||
|  | 					   int sender, | ||||||
|  | 					   int receiver, | ||||||
|  | 					   int bytes) | ||||||
|  | { | ||||||
|  |   MPI_Status stat; | ||||||
|  |   assert(sender != receiver); | ||||||
|  |   int tag = sender; | ||||||
|  |   if ( _processor == sender ) { | ||||||
|  |     MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator); | ||||||
|  |   } | ||||||
|  |   if ( _processor == receiver ) {  | ||||||
|  |     MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat); | ||||||
|  |   } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // Basic Halo comms primitive | ||||||
|  | void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
|  | 						void *xmit, | ||||||
|  | 						int dest, | ||||||
|  | 						void *recv, | ||||||
|  | 						int from, | ||||||
|  | 						int bytes) | ||||||
|  | { | ||||||
|  | #if 0 | ||||||
|  |   this->StencilBarrier(); | ||||||
|  |  | ||||||
|  |   MPI_Request xrq; | ||||||
|  |   MPI_Request rrq; | ||||||
|  |    | ||||||
|  |   static int sequence; | ||||||
|  |  | ||||||
|  |   int ierr; | ||||||
|  |   int tag; | ||||||
|  |   int check; | ||||||
|  |  | ||||||
|  |   assert(dest != _processor); | ||||||
|  |   assert(from != _processor); | ||||||
|  |    | ||||||
|  |   int gdest = GroupRanks[dest]; | ||||||
|  |   int gfrom = GroupRanks[from]; | ||||||
|  |   int gme   = GroupRanks[_processor]; | ||||||
|  |  | ||||||
|  |   sequence++; | ||||||
|  |    | ||||||
|  |   char *from_ptr = (char *)ShmCommBufs[ShmRank]; | ||||||
|  |  | ||||||
|  |   int small = (bytes<MAX_MPI_SHM_BYTES); | ||||||
|  |  | ||||||
|  |   typedef uint64_t T; | ||||||
|  |   int words = bytes/sizeof(T); | ||||||
|  |  | ||||||
|  |   assert(((size_t)bytes &(sizeof(T)-1))==0); | ||||||
|  |   assert(gme == ShmRank); | ||||||
|  |  | ||||||
|  |   if ( small && (gdest !=MPI_UNDEFINED) ) { | ||||||
|  |  | ||||||
|  |     char *to_ptr   = (char *)ShmCommBufs[gdest]; | ||||||
|  |  | ||||||
|  |     assert(gme != gdest); | ||||||
|  |  | ||||||
|  |     T *ip = (T *)xmit; | ||||||
|  |     T *op = (T *)to_ptr; | ||||||
|  | PARALLEL_FOR_LOOP  | ||||||
|  |     for(int w=0;w<words;w++) { | ||||||
|  |       op[w]=ip[w]; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     bcopy(&_processor,&to_ptr[bytes],sizeof(_processor)); | ||||||
|  |     bcopy(&  sequence,&to_ptr[bytes+4],sizeof(sequence)); | ||||||
|  |   } else {  | ||||||
|  |     ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); | ||||||
|  |     assert(ierr==0); | ||||||
|  |     list.push_back(xrq); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   this->StencilBarrier(); | ||||||
|  |    | ||||||
|  |   if (small && (gfrom !=MPI_UNDEFINED) ) { | ||||||
|  |     T *ip = (T *)from_ptr; | ||||||
|  |     T *op = (T *)recv; | ||||||
|  | PARALLEL_FOR_LOOP  | ||||||
|  |     for(int w=0;w<words;w++) { | ||||||
|  |       op[w]=ip[w]; | ||||||
|  |     } | ||||||
|  |     bcopy(&from_ptr[bytes]  ,&tag  ,sizeof(tag)); | ||||||
|  |     bcopy(&from_ptr[bytes+4],&check,sizeof(check)); | ||||||
|  |     assert(check==sequence); | ||||||
|  |     assert(tag==from); | ||||||
|  |   } else {  | ||||||
|  |     ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); | ||||||
|  |     assert(ierr==0); | ||||||
|  |     list.push_back(rrq); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  |   this->StencilBarrier(); | ||||||
|  |  | ||||||
|  | #else | ||||||
|  |   MPI_Request xrq; | ||||||
|  |   MPI_Request rrq; | ||||||
|  |   int rank = _processor; | ||||||
|  |   int ierr; | ||||||
|  |   ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); | ||||||
|  |   ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); | ||||||
|  |    | ||||||
|  |   assert(ierr==0); | ||||||
|  |  | ||||||
|  |   list.push_back(xrq); | ||||||
|  |   list.push_back(rrq); | ||||||
|  | #endif | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list, | ||||||
|  | 						       void *xmit, | ||||||
|  | 						       int dest, | ||||||
|  | 						       void *recv, | ||||||
|  | 						       int from, | ||||||
|  | 						       int bytes) | ||||||
|  | { | ||||||
|  |   MPI_Request xrq; | ||||||
|  |   MPI_Request rrq; | ||||||
|  |  | ||||||
|  |   int ierr; | ||||||
|  |  | ||||||
|  |   assert(dest != _processor); | ||||||
|  |   assert(from != _processor); | ||||||
|  |    | ||||||
|  |   int gdest = GroupRanks[dest]; | ||||||
|  |   int gfrom = GroupRanks[from]; | ||||||
|  |   int gme   = GroupRanks[_processor]; | ||||||
|  |  | ||||||
|  |   assert(gme == ShmRank); | ||||||
|  |  | ||||||
|  |   if ( gdest == MPI_UNDEFINED ) { | ||||||
|  |     ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq); | ||||||
|  |     assert(ierr==0); | ||||||
|  |     list.push_back(xrq); | ||||||
|  |   } | ||||||
|  |    | ||||||
|  |   if ( gfrom ==MPI_UNDEFINED) { | ||||||
|  |     ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq); | ||||||
|  |     assert(ierr==0); | ||||||
|  |     list.push_back(rrq); | ||||||
|  |   } | ||||||
|  |  | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list) | ||||||
|  | { | ||||||
|  |   SendToRecvFromComplete(list); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void CartesianCommunicator::StencilBarrier(void) | ||||||
|  | { | ||||||
|  |   MPI_Win_sync (ShmWindow);    | ||||||
|  |   MPI_Barrier  (ShmComm); | ||||||
|  |   MPI_Win_sync (ShmWindow);    | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list) | ||||||
|  | { | ||||||
|  |   int nreq=list.size(); | ||||||
|  |   std::vector<MPI_Status> status(nreq); | ||||||
|  |   int ierr = MPI_Waitall(nreq,&list[0],&status[0]); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void CartesianCommunicator::Barrier(void) | ||||||
|  | { | ||||||
|  |   int ierr = MPI_Barrier(communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void CartesianCommunicator::Broadcast(int root,void* data, int bytes) | ||||||
|  | { | ||||||
|  |   int ierr=MPI_Bcast(data, | ||||||
|  | 		     bytes, | ||||||
|  | 		     MPI_BYTE, | ||||||
|  | 		     root, | ||||||
|  | 		     communicator); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) | ||||||
|  | { | ||||||
|  |   int ierr= MPI_Bcast(data, | ||||||
|  | 		      bytes, | ||||||
|  | 		      MPI_BYTE, | ||||||
|  | 		      root, | ||||||
|  | 		      communicator_world); | ||||||
|  |   assert(ierr==0); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | } | ||||||
|  |  | ||||||
| @@ -28,12 +28,22 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk> | |||||||
| #include "Grid.h" | #include "Grid.h" | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
|  | /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | // Info that is set up once and independent of the Cartesian layout | ||||||
|  | /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |  | ||||||
| void CartesianCommunicator::Init(int *argc, char *** arv) | void CartesianCommunicator::Init(int *argc, char *** arv) | ||||||
| { | { | ||||||
|  |   WorldRank = 0; | ||||||
|  |   WorldSize = 1; | ||||||
|  |   ShmRank=0; | ||||||
|  |   ShmSize=1; | ||||||
|  |   GroupRank=WorldRank; | ||||||
|  |   GroupSize=WorldSize; | ||||||
|  |   Slave    =0; | ||||||
|  |   ShmInitGeneric(); | ||||||
| } | } | ||||||
|  |  | ||||||
| int Rank(void ){ return 0; }; |  | ||||||
|  |  | ||||||
| CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | ||||||
| { | { | ||||||
|   _processors = processors; |   _processors = processors; | ||||||
| @@ -89,30 +99,16 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> & | |||||||
|   assert(0); |   assert(0); | ||||||
| } | } | ||||||
|  |  | ||||||
| void CartesianCommunicator::Barrier(void) | void CartesianCommunicator::Barrier(void){} | ||||||
| { | void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {} | ||||||
| } | void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { } | ||||||
|  | int  CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) {  return 0;} | ||||||
| void CartesianCommunicator::Broadcast(int root,void* data, int bytes) | void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor){  assert(0);} | ||||||
| { |  | ||||||
| } |  | ||||||
| void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) |  | ||||||
| { |  | ||||||
| } |  | ||||||
|  |  | ||||||
|  |  | ||||||
| void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) | void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest) | ||||||
| { | { | ||||||
|   source =0; |   source =0; | ||||||
|   dest=0; |   dest=0; | ||||||
| } | } | ||||||
| int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor) |  | ||||||
| { |  | ||||||
|   return 0; |  | ||||||
| } |  | ||||||
| void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor) |  | ||||||
| { |  | ||||||
| } |  | ||||||
|  |  | ||||||
|  |  | ||||||
| } | } | ||||||
|   | |||||||
| @@ -39,17 +39,22 @@ namespace Grid { | |||||||
|     BACKTRACEFILE();		   \ |     BACKTRACEFILE();		   \ | ||||||
|   }\ |   }\ | ||||||
| } | } | ||||||
| int Rank(void) { |  | ||||||
|   return shmem_my_pe(); |  | ||||||
| } | /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  | // Info that is set up once and independent of the Cartesian layout | ||||||
|  | /////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |  | ||||||
| typedef struct HandShake_t {  | typedef struct HandShake_t {  | ||||||
|   uint64_t seq_local; |   uint64_t seq_local; | ||||||
|   uint64_t seq_remote; |   uint64_t seq_remote; | ||||||
| } HandShake; | } HandShake; | ||||||
|  |  | ||||||
|  |  | ||||||
| static Vector< HandShake > XConnections; | static Vector< HandShake > XConnections; | ||||||
| static Vector< HandShake > RConnections; | static Vector< HandShake > RConnections; | ||||||
|  |  | ||||||
|  |  | ||||||
| void CartesianCommunicator::Init(int *argc, char ***argv) { | void CartesianCommunicator::Init(int *argc, char ***argv) { | ||||||
|   shmem_init(); |   shmem_init(); | ||||||
|   XConnections.resize(shmem_n_pes()); |   XConnections.resize(shmem_n_pes()); | ||||||
| @@ -60,8 +65,17 @@ void CartesianCommunicator::Init(int *argc, char ***argv) { | |||||||
|     RConnections[pe].seq_local = 0; |     RConnections[pe].seq_local = 0; | ||||||
|     RConnections[pe].seq_remote= 0; |     RConnections[pe].seq_remote= 0; | ||||||
|   } |   } | ||||||
|  |   WorldSize = shmem_n_pes(); | ||||||
|  |   WorldRank = shmem_my_pe(); | ||||||
|  |   ShmRank=0; | ||||||
|  |   ShmSize=1; | ||||||
|  |   GroupRank=WorldRank; | ||||||
|  |   GroupSize=WorldSize; | ||||||
|  |   Slave    =0; | ||||||
|   shmem_barrier_all(); |   shmem_barrier_all(); | ||||||
|  |   ShmInitGeneric(); | ||||||
| } | } | ||||||
|  |  | ||||||
| CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors) | ||||||
| { | { | ||||||
|   _ndimension = processors.size(); |   _ndimension = processors.size(); | ||||||
| @@ -230,12 +244,9 @@ void CartesianCommunicator::SendRecvPacket(void *xmit, | |||||||
|  |  | ||||||
|   if ( _processor == sender ) { |   if ( _processor == sender ) { | ||||||
|  |  | ||||||
|     printf("Sender SHMEM pt2pt %d -> %d\n",sender,receiver); |  | ||||||
|     // Check he has posted a receive |     // Check he has posted a receive | ||||||
|     while(SendSeq->seq_remote == SendSeq->seq_local); |     while(SendSeq->seq_remote == SendSeq->seq_local); | ||||||
|  |  | ||||||
|     printf("Sender receive %d posted\n",sender,receiver); |  | ||||||
|  |  | ||||||
|     // Advance our send count |     // Advance our send count | ||||||
|     seq = ++(SendSeq->seq_local); |     seq = ++(SendSeq->seq_local); | ||||||
|      |      | ||||||
| @@ -244,26 +255,19 @@ void CartesianCommunicator::SendRecvPacket(void *xmit, | |||||||
|     shmem_putmem(recv,xmit,bytes,receiver); |     shmem_putmem(recv,xmit,bytes,receiver); | ||||||
|     shmem_fence(); |     shmem_fence(); | ||||||
|  |  | ||||||
|     printf("Sender sent payload %d\n",seq); |  | ||||||
|     //Notify him we're done |     //Notify him we're done | ||||||
|     shmem_putmem((void *)&(RecvSeq->seq_remote),&seq,sizeof(seq),receiver); |     shmem_putmem((void *)&(RecvSeq->seq_remote),&seq,sizeof(seq),receiver); | ||||||
|     shmem_fence(); |     shmem_fence(); | ||||||
|     printf("Sender ringing door bell  %d\n",seq); |  | ||||||
|   } |   } | ||||||
|   if ( _processor == receiver ) { |   if ( _processor == receiver ) { | ||||||
|  |  | ||||||
|     printf("Receiver SHMEM pt2pt %d->%d\n",sender,receiver); |  | ||||||
|     // Post a receive |     // Post a receive | ||||||
|     seq = ++(RecvSeq->seq_local); |     seq = ++(RecvSeq->seq_local); | ||||||
|     shmem_putmem((void *)&(SendSeq->seq_remote),&seq,sizeof(seq),sender); |     shmem_putmem((void *)&(SendSeq->seq_remote),&seq,sizeof(seq),sender); | ||||||
|  |  | ||||||
|     printf("Receiver Opening letter box %d\n",seq); |  | ||||||
|  |  | ||||||
|      |  | ||||||
|     // Now wait until he has advanced our reception counter |     // Now wait until he has advanced our reception counter | ||||||
|     while(RecvSeq->seq_remote != RecvSeq->seq_local); |     while(RecvSeq->seq_remote != RecvSeq->seq_local); | ||||||
|  |  | ||||||
|     printf("Receiver Got the mail %d\n",seq); |  | ||||||
|   } |   } | ||||||
| } | } | ||||||
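To recap the handshake the code above implements (a descriptive summary of the existing logic, not new behaviour):

  // Receiver: bumps RecvSeq->seq_local and shmem_putmem's it into the
  //           sender's SendSeq->seq_remote, i.e. posts the receive.
  // Sender:   spins until SendSeq->seq_remote differs from its seq_local,
  //           bumps its own count, puts the payload, fences, then puts the
  //           new count into the receiver's RecvSeq->seq_remote.
  // Receiver: spins until RecvSeq->seq_remote equals its seq_local.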
|  |  | ||||||
|   | |||||||
| @@ -45,7 +45,7 @@ public: | |||||||
| // Gather for when there is no need to SIMD split with compression | // Gather for when there is no need to SIMD split with compression | ||||||
| /////////////////////////////////////////////////////////////////// | /////////////////////////////////////////////////////////////////// | ||||||
| template<class vobj,class cobj,class compressor> void  | template<class vobj,class cobj,class compressor> void  | ||||||
| Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<cobj,alignedAllocator<cobj> > &buffer,int dimension,int plane,int cbmask,compressor &compress, int off=0) | Gather_plane_simple (const Lattice<vobj> &rhs,commVector<cobj> &buffer,int dimension,int plane,int cbmask,compressor &compress, int off=0) | ||||||
| { | { | ||||||
|   int rd = rhs._grid->_rdimensions[dimension]; |   int rd = rhs._grid->_rdimensions[dimension]; | ||||||
|  |  | ||||||
| @@ -114,6 +114,7 @@ PARALLEL_NESTED_LOOP2 | |||||||
| 	int o      =   n*n1; | 	int o      =   n*n1; | ||||||
| 	int offset = b+n*n2; | 	int offset = b+n*n2; | ||||||
| 	cobj temp =compress(rhs._odata[so+o+b]); | 	cobj temp =compress(rhs._odata[so+o+b]); | ||||||
|  |  | ||||||
| 	extract<cobj>(temp,pointers,offset); | 	extract<cobj>(temp,pointers,offset); | ||||||
|  |  | ||||||
|       } |       } | ||||||
| @@ -121,6 +122,7 @@ PARALLEL_NESTED_LOOP2 | |||||||
|   } else {  |   } else {  | ||||||
|  |  | ||||||
|     assert(0); // FIXME: this path is believed to be buggy |     assert(0); // FIXME: this path is believed to be buggy | ||||||
|  |  | ||||||
|     for(int n=0;n<e1;n++){ |     for(int n=0;n<e1;n++){ | ||||||
|       for(int b=0;b<e2;b++){ |       for(int b=0;b<e2;b++){ | ||||||
| 	int o=n*rhs._grid->_slice_stride[dimension]; | 	int o=n*rhs._grid->_slice_stride[dimension]; | ||||||
| @@ -139,7 +141,7 @@ PARALLEL_NESTED_LOOP2 | |||||||
| ////////////////////////////////////////////////////// | ////////////////////////////////////////////////////// | ||||||
| // Gather for when there is no need to SIMD split | // Gather for when there is no need to SIMD split | ||||||
| ////////////////////////////////////////////////////// | ////////////////////////////////////////////////////// | ||||||
| template<class vobj> void Gather_plane_simple (const Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer,             int dimension,int plane,int cbmask) | template<class vobj> void Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask) | ||||||
| { | { | ||||||
|   SimpleCompressor<vobj> dontcompress; |   SimpleCompressor<vobj> dontcompress; | ||||||
|   Gather_plane_simple (rhs,buffer,dimension,plane,cbmask,dontcompress); |   Gather_plane_simple (rhs,buffer,dimension,plane,cbmask,dontcompress); | ||||||
| @@ -157,7 +159,7 @@ template<class vobj> void Gather_plane_extract(const Lattice<vobj> &rhs,std::vec | |||||||
| ////////////////////////////////////////////////////// | ////////////////////////////////////////////////////// | ||||||
| // Scatter for when there is no need to SIMD split | // Scatter for when there is no need to SIMD split | ||||||
| ////////////////////////////////////////////////////// | ////////////////////////////////////////////////////// | ||||||
| template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer, int dimension,int plane,int cbmask) | template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask) | ||||||
| { | { | ||||||
|   int rd = rhs._grid->_rdimensions[dimension]; |   int rd = rhs._grid->_rdimensions[dimension]; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -119,8 +119,8 @@ template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &r | |||||||
|   assert(shift<fd); |   assert(shift<fd); | ||||||
|    |    | ||||||
|   int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension]; |   int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension]; | ||||||
|   std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size); |   commVector<vobj> send_buf(buffer_size); | ||||||
|   std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size); |   commVector<vobj> recv_buf(buffer_size); | ||||||
|  |  | ||||||
|   int cb= (cbmask==0x2)? Odd : Even; |   int cb= (cbmask==0x2)? Odd : Even; | ||||||
|   int sshift= rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); |   int sshift= rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb); | ||||||
| @@ -191,8 +191,8 @@ template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vo | |||||||
|   int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; |   int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension]; | ||||||
|   int words = sizeof(vobj)/sizeof(vector_type); |   int words = sizeof(vobj)/sizeof(vector_type); | ||||||
|  |  | ||||||
|   std::vector<Vector<scalar_object> >   send_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) ); |   std::vector<commVector<scalar_object> >   send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) ); | ||||||
|   std::vector<Vector<scalar_object> >   recv_buf_extract(Nsimd,Vector<scalar_object>(buffer_size) ); |   std::vector<commVector<scalar_object> >   recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) ); | ||||||
|  |  | ||||||
|   int bytes = buffer_size*sizeof(scalar_object); |   int bytes = buffer_size*sizeof(scalar_object); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -65,9 +65,6 @@ public: | |||||||
|      |      | ||||||
| class LatticeExpressionBase {}; | class LatticeExpressionBase {}; | ||||||
|  |  | ||||||
| template<class T> using Vector = std::vector<T,alignedAllocator<T> >;               // Aligned allocator?? |  | ||||||
| template<class T> using Matrix = std::vector<std::vector<T,alignedAllocator<T> > >; // Aligned allocator?? |  | ||||||
|  |  | ||||||
| template <typename Op, typename T1>                            | template <typename Op, typename T1>                            | ||||||
| class LatticeUnaryExpression  : public std::pair<Op,std::tuple<T1> > , public LatticeExpressionBase { | class LatticeUnaryExpression  : public std::pair<Op,std::tuple<T1> > , public LatticeExpressionBase { | ||||||
|  public: |  public: | ||||||
|   | |||||||
| @@ -164,15 +164,17 @@ PARALLEL_FOR_LOOP | |||||||
|       assert( l.checkerboard== l._grid->CheckerBoard(site)); |       assert( l.checkerboard== l._grid->CheckerBoard(site)); | ||||||
|       assert( sizeof(sobj)*Nsimd == sizeof(vobj)); |       assert( sizeof(sobj)*Nsimd == sizeof(vobj)); | ||||||
|  |  | ||||||
|  |       static const int words=sizeof(vobj)/sizeof(vector_type); | ||||||
|       int odx,idx; |       int odx,idx; | ||||||
|       idx= grid->iIndex(site); |       idx= grid->iIndex(site); | ||||||
|       odx= grid->oIndex(site); |       odx= grid->oIndex(site); | ||||||
|  |  | ||||||
|       std::vector<sobj> buf(Nsimd); |       scalar_type * vp = (scalar_type *)&l._odata[odx]; | ||||||
|  |       scalar_type * pt = (scalar_type *)&s; | ||||||
|        |        | ||||||
|       extract(l._odata[odx],buf); |       for(int w=0;w<words;w++){ | ||||||
|        |         pt[w] = vp[idx+w*Nsimd]; | ||||||
|       s = buf[idx]; |       } | ||||||
|        |        | ||||||
|       return; |       return; | ||||||
|     }; |     }; | ||||||
| @@ -190,18 +192,17 @@ PARALLEL_FOR_LOOP | |||||||
|       assert( l.checkerboard== l._grid->CheckerBoard(site)); |       assert( l.checkerboard== l._grid->CheckerBoard(site)); | ||||||
|       assert( sizeof(sobj)*Nsimd == sizeof(vobj)); |       assert( sizeof(sobj)*Nsimd == sizeof(vobj)); | ||||||
|  |  | ||||||
|  |       static const int words=sizeof(vobj)/sizeof(vector_type); | ||||||
|       int odx,idx; |       int odx,idx; | ||||||
|       idx= grid->iIndex(site); |       idx= grid->iIndex(site); | ||||||
|       odx= grid->oIndex(site); |       odx= grid->oIndex(site); | ||||||
|  |  | ||||||
|       std::vector<sobj> buf(Nsimd); |       scalar_type * vp = (scalar_type *)&l._odata[odx]; | ||||||
|  |       scalar_type * pt = (scalar_type *)&s; | ||||||
|        |        | ||||||
|       // extract-modify-merge cycle is easiest way and this is not perf critical |       for(int w=0;w<words;w++){ | ||||||
|       extract(l._odata[odx],buf); |         vp[idx+w*Nsimd] = pt[w]; | ||||||
|        |       } | ||||||
|       buf[idx] = s; |  | ||||||
|  |  | ||||||
|       merge(l._odata[odx],buf); |  | ||||||
|  |  | ||||||
|       return; |       return; | ||||||
|     }; |     }; | ||||||
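The replacement above drops the extract/modify/merge cycle in favour of a direct strided copy. The key fact is the memory layout of a vectorised object: the scalar words of the Nsimd SIMD lanes are interleaved, so word w of lane idx sits at flat offset idx + w*Nsimd when the vobj is viewed as an array of scalar_type. A hedged illustration (names are expository only):

  // vobj viewed as scalar_type[]: word0(lane0) word0(lane1) ... word0(laneNsimd-1) word1(lane0) ...
  for(int w=0;w<words;w++) pt[w] = vp[idx + w*Nsimd];   // peek: gather one lane
  for(int w=0;w<words;w++) vp[idx + w*Nsimd] = pt[w];   // poke: scatter it back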
|   | |||||||
| @@ -31,14 +31,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
|  |  | ||||||
| #include <random> | #include <random> | ||||||
|  |  | ||||||
| // Have not enabled RNG_SPRNG_SHA256 by default yet. |  | ||||||
| // Uncomment the following line to see the effect of the new RNG. |  | ||||||
| // #define RNG_SPRNG_SHA256 |  | ||||||
|  |  | ||||||
| #ifdef RNG_SPRNG_SHA256 |  | ||||||
| #include "rng/sprng-sha256.h" |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -118,11 +110,7 @@ namespace Grid { | |||||||
|     int _seeded; |     int _seeded; | ||||||
|     // One generator per site. |     // One generator per site. | ||||||
|     // Uniform and Gaussian distributions from these generators. |     // Uniform and Gaussian distributions from these generators. | ||||||
| #ifdef RNG_SPRNG_SHA256 | #ifdef RNG_RANLUX | ||||||
|     typedef uint32_t      RngStateType; |  | ||||||
|     typedef SprngSha256 RngEngine; |  | ||||||
|     static const int RngStateCount = 22; |  | ||||||
| #elif defined RNG_RANLUX |  | ||||||
|     typedef uint64_t      RngStateType; |     typedef uint64_t      RngStateType; | ||||||
|     typedef std::ranlux48 RngEngine; |     typedef std::ranlux48 RngEngine; | ||||||
|     static const int RngStateCount = 15; |     static const int RngStateCount = 15; | ||||||
| @@ -285,34 +273,6 @@ namespace Grid { | |||||||
|     } |     } | ||||||
|  |  | ||||||
|  |  | ||||||
| #ifdef RNG_SPRNG_SHA256 |  | ||||||
|     template<class source> void Seed(source &src) |  | ||||||
|     { |  | ||||||
|       std::vector<int> gcoor; |  | ||||||
|  |  | ||||||
|       long gsites = _grid->_gsites; |  | ||||||
|  |  | ||||||
|       RngState rs; |  | ||||||
|       for (int i = 0; i < 8; ++i) { |  | ||||||
|         splitRngState(rs, rs, src()); |  | ||||||
|       } |  | ||||||
|  |  | ||||||
|       for(long gidx=0;gidx<gsites;gidx++){ |  | ||||||
|  |  | ||||||
|         int rank,o_idx,i_idx; |  | ||||||
|         _grid->GlobalIndexToGlobalCoor(gidx,gcoor); |  | ||||||
|         _grid->GlobalCoorToRankIndex(rank,o_idx,i_idx,gcoor); |  | ||||||
|  |  | ||||||
|         int l_idx=generator_idx(o_idx,i_idx); |  | ||||||
|  |  | ||||||
|         if( rank == _grid->ThisRank() ){ |  | ||||||
|           splitRngState(_generators[l_idx].rs, rs, gidx); |  | ||||||
|         } |  | ||||||
|       } |  | ||||||
|       _seeded=1; |  | ||||||
|  |  | ||||||
|     } |  | ||||||
| #else |  | ||||||
|     // This loop could be made faster to avoid the Amdahl bottleneck by |     // This loop could be made faster to avoid the Amdahl bottleneck by | ||||||
|     // i)  seed generators on each timeslice, for x=y=z=0; |     // i)  seed generators on each timeslice, for x=y=z=0; | ||||||
|     // ii) seed generators on each z for x=y=0 |     // ii) seed generators on each z for x=y=0 | ||||||
| @@ -337,8 +297,9 @@ namespace Grid { | |||||||
|  |  | ||||||
| 	int l_idx=generator_idx(o_idx,i_idx); | 	int l_idx=generator_idx(o_idx,i_idx); | ||||||
|  |  | ||||||
| 	std::vector<int> site_seeds(4); | 	const int num_rand_seed=16; | ||||||
| 	for(int i=0;i<4;i++){ | 	std::vector<int> site_seeds(num_rand_seed); | ||||||
|  | 	for(int i=0;i<site_seeds.size();i++){ | ||||||
| 	  site_seeds[i]= ui(pseeder); | 	  site_seeds[i]= ui(pseeder); | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| @@ -352,7 +313,6 @@ namespace Grid { | |||||||
|       } |       } | ||||||
|       _seeded=1; |       _seeded=1; | ||||||
|     }     |     }     | ||||||
| #endif |  | ||||||
|  |  | ||||||
|     //FIXME implement generic IO and create state save/restore |     //FIXME implement generic IO and create state save/restore | ||||||
|     //void SaveState(const std::string<char> &file); |     //void SaveState(const std::string<char> &file); | ||||||
|   | |||||||
| @@ -1,353 +0,0 @@ | |||||||
| // vim: set ts=2 sw=2 expandtab: |  | ||||||
|  |  | ||||||
| // Copyright (c) 2016 Luchang Jin |  | ||||||
| // All rights reserved. |  | ||||||
|  |  | ||||||
| // This program is free software: you can redistribute it and/or modify |  | ||||||
| // it under the terms of the GNU General Public License as published by |  | ||||||
| // the Free Software Foundation, either version 2 of the License, or |  | ||||||
| // (at your option) any later version. |  | ||||||
| // |  | ||||||
| // This program is distributed in the hope that it will be useful, |  | ||||||
| // but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
| // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
| // GNU General Public License for more details. |  | ||||||
| // |  | ||||||
| // You should have received a copy of the GNU General Public License |  | ||||||
| // along with this program.  If not, see <http://www.gnu.org/licenses/>. |  | ||||||
|  |  | ||||||
| #pragma once |  | ||||||
|  |  | ||||||
| #ifndef INCLUDE_RNG_STATE_H |  | ||||||
| #define INCLUDE_RNG_STATE_H |  | ||||||
|  |  | ||||||
| #include "show.h" |  | ||||||
|  |  | ||||||
| #ifndef USE_OPENSSL |  | ||||||
| #include "sha256.h" |  | ||||||
| #else |  | ||||||
| #include <openssl/sha.h> |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| #include <stdint.h> |  | ||||||
| #include <endian.h> |  | ||||||
| #include <cstring> |  | ||||||
| #include <cmath> |  | ||||||
| #include <cassert> |  | ||||||
| #include <string> |  | ||||||
| #include <ostream> |  | ||||||
| #include <istream> |  | ||||||
| #include <vector> |  | ||||||
|  |  | ||||||
| #ifdef CURRENT_DEFAULT_NAMESPACE_NAME |  | ||||||
| namespace CURRENT_DEFAULT_NAMESPACE_NAME { |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| struct RngState; |  | ||||||
|  |  | ||||||
| inline void reset(RngState& rs); |  | ||||||
|  |  | ||||||
| inline void reset(RngState& rs, const std::string& seed); |  | ||||||
|  |  | ||||||
| inline void reset(RngState& rs, const long seed) |  | ||||||
| { |  | ||||||
|   reset(rs, show(seed)); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline void splitRngState(RngState& rs, const RngState& rs0, const std::string& sindex); |  | ||||||
|  |  | ||||||
| inline void splitRngState(RngState& rs, const RngState& rs0, const long sindex = 0) |  | ||||||
| { |  | ||||||
|   splitRngState(rs, rs0, show(sindex)); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline uint64_t randGen(RngState& rs); |  | ||||||
|  |  | ||||||
| inline double uRandGen(RngState& rs, const double upper = 1.0, const double lower = 0.0); |  | ||||||
|  |  | ||||||
| inline double gRandGen(RngState& rs, const double sigma = 1.0, const double center = 0.0); |  | ||||||
|  |  | ||||||
| inline void computeHashWithInput(uint32_t hash[8], const RngState& rs, const std::string& input); |  | ||||||
|  |  | ||||||
| struct RngState |  | ||||||
| { |  | ||||||
|   uint64_t numBytes; |  | ||||||
|   uint32_t hash[8]; |  | ||||||
|   unsigned long index; |  | ||||||
|   // |  | ||||||
|   uint64_t cache[3]; |  | ||||||
|   double gaussian; |  | ||||||
|   int cacheAvail; |  | ||||||
|   bool gaussianAvail; |  | ||||||
|   // |  | ||||||
|   inline void init() |  | ||||||
|   { |  | ||||||
|     reset(*this); |  | ||||||
|   } |  | ||||||
|   // |  | ||||||
|   RngState() |  | ||||||
|   { |  | ||||||
|     init(); |  | ||||||
|   } |  | ||||||
|   RngState(const std::string& seed) |  | ||||||
|   { |  | ||||||
|     reset(*this, seed); |  | ||||||
|   } |  | ||||||
|   RngState(const long seed) |  | ||||||
|   { |  | ||||||
|     reset(*this, seed); |  | ||||||
|   } |  | ||||||
|   RngState(const RngState& rs0, const std::string& sindex) |  | ||||||
|   { |  | ||||||
|     splitRngState(*this, rs0, sindex); |  | ||||||
|   } |  | ||||||
|   RngState(const RngState& rs0, const long sindex) |  | ||||||
|   { |  | ||||||
|     splitRngState(*this, rs0, sindex); |  | ||||||
|   } |  | ||||||
|   // |  | ||||||
|   RngState split(const std::string& sindex) |  | ||||||
|   { |  | ||||||
|     RngState rs(*this, sindex); |  | ||||||
|     return rs; |  | ||||||
|   } |  | ||||||
|   RngState split(const long sindex) |  | ||||||
|   { |  | ||||||
|     RngState rs(*this, sindex); |  | ||||||
|     return rs; |  | ||||||
|   } |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| const size_t RNG_STATE_NUM_OF_INT32 = 2 + 8 + 2 + 3 * 2 + 2 + 1 + 1; |  | ||||||
|  |  | ||||||
| inline uint64_t patchTwoUint32(const uint32_t a, const uint32_t b) |  | ||||||
| { |  | ||||||
|   return (uint64_t)a << 32 | (uint64_t)b; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline void splitTwoUint32(uint32_t& a, uint32_t& b, const uint64_t x) |  | ||||||
| { |  | ||||||
|   b = (uint32_t)x; |  | ||||||
|   a = (uint32_t)(x >> 32); |  | ||||||
|   assert(x == patchTwoUint32(a, b)); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline void exportRngState(uint32_t* v, const RngState& rs) |  | ||||||
| { |  | ||||||
|   assert(22 == RNG_STATE_NUM_OF_INT32); |  | ||||||
|   splitTwoUint32(v[0], v[1], rs.numBytes); |  | ||||||
|   for (int i = 0; i < 8; ++i) { |  | ||||||
|     v[2 + i] = rs.hash[i]; |  | ||||||
|   } |  | ||||||
|   splitTwoUint32(v[10], v[11], rs.index); |  | ||||||
|   for (int i = 0; i < 3; ++i) { |  | ||||||
|     splitTwoUint32(v[12 + i * 2], v[12 + i * 2 + 1], rs.cache[i]); |  | ||||||
|   } |  | ||||||
|   union { |  | ||||||
|     double d; |  | ||||||
|     uint64_t l; |  | ||||||
|   } g; |  | ||||||
|   g.d = rs.gaussian; |  | ||||||
|   splitTwoUint32(v[18], v[19], g.l); |  | ||||||
|   v[20] = rs.cacheAvail; |  | ||||||
|   v[21] = rs.gaussianAvail; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline void importRngState(RngState& rs, const uint32_t* v) |  | ||||||
| { |  | ||||||
|   assert(22 == RNG_STATE_NUM_OF_INT32); |  | ||||||
|   rs.numBytes = patchTwoUint32(v[0], v[1]); |  | ||||||
|   for (int i = 0; i < 8; ++i) { |  | ||||||
|     rs.hash[i] = v[2 + i]; |  | ||||||
|   } |  | ||||||
|   rs.index = patchTwoUint32(v[10], v[11]); |  | ||||||
|   for (int i = 0; i < 3; ++i) { |  | ||||||
|     rs.cache[i] = patchTwoUint32(v[12 + i * 2], v[12 + i * 2 + 1]); |  | ||||||
|   } |  | ||||||
|   union { |  | ||||||
|     double d; |  | ||||||
|     uint64_t l; |  | ||||||
|   } g; |  | ||||||
|   g.l = patchTwoUint32(v[18], v[19]); |  | ||||||
|   rs.gaussian = g.d; |  | ||||||
|   rs.cacheAvail = v[20]; |  | ||||||
|   rs.gaussianAvail = v[21]; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline void exportRngState(std::vector<uint32_t>& v, const RngState& rs) |  | ||||||
| { |  | ||||||
|   v.resize(RNG_STATE_NUM_OF_INT32); |  | ||||||
|   exportRngState(v.data(), rs); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline void importRngState(RngState& rs, const std::vector<uint32_t>& v) |  | ||||||
| { |  | ||||||
|   assert(RNG_STATE_NUM_OF_INT32 == v.size()); |  | ||||||
|   importRngState(rs, v.data()); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline std::ostream& operator<<(std::ostream& os, const RngState& rs) |  | ||||||
| { |  | ||||||
|   std::vector<uint32_t> v(RNG_STATE_NUM_OF_INT32); |  | ||||||
|   exportRngState(v, rs); |  | ||||||
|   for (size_t i = 0; i < v.size() - 1; ++i) { |  | ||||||
|     os << v[i] << " "; |  | ||||||
|   } |  | ||||||
|   os << v.back(); |  | ||||||
|   return os; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline std::istream& operator>>(std::istream& is, RngState& rs) |  | ||||||
| { |  | ||||||
|   std::vector<uint32_t> v(RNG_STATE_NUM_OF_INT32); |  | ||||||
|   for (size_t i = 0; i < v.size(); ++i) { |  | ||||||
|     is >> v[i]; |  | ||||||
|   } |  | ||||||
|   importRngState(rs, v); |  | ||||||
|   return is; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline std::string show(const RngState& rs) |  | ||||||
| { |  | ||||||
|   return shows(rs); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline bool operator==(const RngState& rs1, const RngState& rs2) |  | ||||||
| { |  | ||||||
|   return 0 == memcmp(&rs1, &rs2, sizeof(RngState)); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline void reset(RngState& rs) |  | ||||||
| { |  | ||||||
|   std::memset(&rs, 0, sizeof(RngState)); |  | ||||||
|   rs.numBytes = 0; |  | ||||||
|   rs.hash[0] = 0; |  | ||||||
|   rs.hash[1] = 0; |  | ||||||
|   rs.hash[2] = 0; |  | ||||||
|   rs.hash[3] = 0; |  | ||||||
|   rs.hash[4] = 0; |  | ||||||
|   rs.hash[5] = 0; |  | ||||||
|   rs.hash[6] = 0; |  | ||||||
|   rs.hash[7] = 0; |  | ||||||
|   rs.index = 0; |  | ||||||
|   rs.cache[0] = 0; |  | ||||||
|   rs.cache[1] = 0; |  | ||||||
|   rs.cache[2] = 0; |  | ||||||
|   rs.gaussian = 0.0; |  | ||||||
|   rs.cacheAvail = 0; |  | ||||||
|   rs.gaussianAvail = false; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline void reset(RngState& rs, const std::string& seed) |  | ||||||
| { |  | ||||||
|   reset(rs); |  | ||||||
|   splitRngState(rs, rs, seed); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline void computeHashWithInput(uint32_t hash[8], const RngState& rs, const std::string& input) |  | ||||||
| { |  | ||||||
|   std::string data(32, ' '); |  | ||||||
|   for (int i = 0; i < 8; ++i) { |  | ||||||
|     data[i*4 + 0] = (rs.hash[i] >> 24) & 0xFF; |  | ||||||
|     data[i*4 + 1] = (rs.hash[i] >> 16) & 0xFF; |  | ||||||
|     data[i*4 + 2] = (rs.hash[i] >>  8) & 0xFF; |  | ||||||
|     data[i*4 + 3] =  rs.hash[i]        & 0xFF; |  | ||||||
|   } |  | ||||||
|   data += input; |  | ||||||
| #ifndef USE_OPENSSL |  | ||||||
|   sha256::computeHash(hash, (const uint8_t*)data.c_str(), data.length()); |  | ||||||
| #else |  | ||||||
|   { |  | ||||||
|     uint8_t rawHash[32]; |  | ||||||
|     SHA256((unsigned char*)data.c_str(), data.length(), rawHash); |  | ||||||
|     for (int i = 0; i < 8; ++i) { |  | ||||||
|       hash[i] = (((uint32_t)rawHash[i*4 + 0]) << 24) |  | ||||||
|               + (((uint32_t)rawHash[i*4 + 1]) << 16) |  | ||||||
|               + (((uint32_t)rawHash[i*4 + 2]) <<  8) |  | ||||||
|               + ( (uint32_t)rawHash[i*4 + 3]); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
| #endif |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline void splitRngState(RngState& rs, const RngState& rs0, const std::string& sindex) |  | ||||||
|   // produce a new rng ``rs'' uniquely identified by ``rs0'' and ``sindex'' |  | ||||||
|   // will not affect old rng ``rs0'' |  | ||||||
|   // the function should behave correctly even if ``rs'' is actually ``rs0'' |  | ||||||
| { |  | ||||||
|   std::string input = ssprintf("[%lu] {%s}", rs0.index, sindex.c_str()); |  | ||||||
|   rs.numBytes = rs0.numBytes + 64 * ((32 + input.length() + 1 + 8 - 1) / 64 + 1); |  | ||||||
|   computeHashWithInput(rs.hash, rs0, input); |  | ||||||
|   rs.index = 0; |  | ||||||
|   rs.cache[0] = 0; |  | ||||||
|   rs.cache[1] = 0; |  | ||||||
|   rs.cache[2] = 0; |  | ||||||
|   rs.gaussian = 0.0; |  | ||||||
|   rs.cacheAvail = 0; |  | ||||||
|   rs.gaussianAvail = false; |  | ||||||
| } |  | ||||||
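For context, the splitting interface removed by this patch composes hierarchically. A brief usage sketch, with an illustrative seed string and index:

  RngState root("ensemble-seed");        // reset from a string seed
  long gidx = 12345;                     // e.g. a global site index
  RngState site = root.split(gidx);      // child stream unique to (root, gidx)
  uint64_t bits = randGen(site);         // 64 pseudo-random bits
  double   u    = uRandGen(site);        // uniform in [0,1)
  double   g    = gRandGen(site);        // unit Gaussian via the polar method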
|  |  | ||||||
| inline uint64_t randGen(RngState& rs) |  | ||||||
| { |  | ||||||
|   assert(0 <= rs.cacheAvail && rs.cacheAvail <= 3); |  | ||||||
|   rs.index += 1; |  | ||||||
|   if (rs.cacheAvail > 0) { |  | ||||||
|     rs.cacheAvail -= 1; |  | ||||||
|     uint64_t r = rs.cache[rs.cacheAvail]; |  | ||||||
|     rs.cache[rs.cacheAvail] = 0; |  | ||||||
|     return r; |  | ||||||
|   } else { |  | ||||||
|     uint32_t hash[8]; |  | ||||||
|     computeHashWithInput(hash, rs, ssprintf("[%lu]", rs.index)); |  | ||||||
|     rs.cache[0] = patchTwoUint32(hash[0], hash[1]); |  | ||||||
|     rs.cache[1] = patchTwoUint32(hash[2], hash[3]); |  | ||||||
|     rs.cache[2] = patchTwoUint32(hash[4], hash[5]); |  | ||||||
|     rs.cacheAvail = 3; |  | ||||||
|     return patchTwoUint32(hash[6], hash[7]); |  | ||||||
|   } |  | ||||||
| } |  | ||||||
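Note that each SHA-256 evaluation yields 256 bits, i.e. four 64-bit words: randGen returns one immediately and caches the other three, so the hash is recomputed only on every fourth draw.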
|  |  | ||||||
| inline double uRandGen(RngState& rs, const double upper, const double lower) |  | ||||||
| { |  | ||||||
|   uint64_t u = randGen(rs); |  | ||||||
|   const double fac = 1.0 / (256.0 * 256.0 * 256.0 * 256.0) / (256.0 * 256.0 * 256.0 * 256.0); |  | ||||||
|   return u * fac * (upper - lower) + lower; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline double gRandGen(RngState& rs, const double sigma, const double center) |  | ||||||
| { |  | ||||||
|   rs.index += 1; |  | ||||||
|   if (rs.gaussianAvail) { |  | ||||||
|     rs.gaussianAvail = false; |  | ||||||
|     return rs.gaussian * sigma + center; |  | ||||||
|   } else { |  | ||||||
|     // pick 2 uniform numbers in the square extending from |  | ||||||
|     // -1 to 1 in each direction, see if they are in the |  | ||||||
|     // unit circle, and try again if they are not. |  | ||||||
|     int num_try = 1; |  | ||||||
|     double v1, v2, rsq; |  | ||||||
|     do { |  | ||||||
|       v1 = uRandGen(rs, 1.0, -1.0); |  | ||||||
|       v2 = uRandGen(rs, 1.0, -1.0); |  | ||||||
|       if ((num_try % 1000)==0) { |  | ||||||
|         printf("gRandGen : WARNING num_try=%d v1=%e v2=%e\n",num_try,v1,v2); |  | ||||||
|       } |  | ||||||
|       rsq = v1*v1 + v2*v2; |  | ||||||
|       num_try++; |  | ||||||
|     } while ((num_try < 10000) && (rsq >= 1.0 || rsq == 0)); |  | ||||||
|     if (num_try > 9999) { |  | ||||||
|       printf("gRandGen : WARNING failed after 10000 tries (corrupted RNG?), returning ridiculous numbers (1e+10)\n"); |  | ||||||
|       return 1e+10; |  | ||||||
|     } |  | ||||||
|     double fac = std::sqrt(-2.0 * std::log(rsq)/rsq); |  | ||||||
|     rs.gaussian = v1 * fac; |  | ||||||
|     rs.gaussianAvail = true; |  | ||||||
|     return v2 * fac * sigma + center; |  | ||||||
|   } |  | ||||||
| } |  | ||||||
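The rejection loop above is the Marsaglia polar form of the Box-Muller transform: for a uniform point (v1, v2) in the unit disc with rsq = v1*v1 + v2*v2 in (0,1), two independent unit Gaussians are

  g1 = v1 * sqrt(-2*log(rsq)/rsq),   g2 = v2 * sqrt(-2*log(rsq)/rsq);

the code returns g2 (scaled by sigma and shifted by center) and caches g1 for the next call.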
|  |  | ||||||
| #ifdef CURRENT_DEFAULT_NAMESPACE_NAME |  | ||||||
| } |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| #endif |  | ||||||
| @@ -1,348 +0,0 @@ | |||||||
| // vim: set ts=2 sw=2 expandtab: |  | ||||||
|  |  | ||||||
| // Copyright (c) 2016 Luchang Jin |  | ||||||
| // All rights reserved. |  | ||||||
|  |  | ||||||
| // This program is free software: you can redistribute it and/or modify |  | ||||||
| // it under the terms of the GNU General Public License as published by |  | ||||||
| // the Free Software Foundation, either version 2 of the License, or |  | ||||||
| // (at your option) any later version. |  | ||||||
| // |  | ||||||
| // This program is distributed in the hope that it will be useful, |  | ||||||
| // but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
| // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
| // GNU General Public License for more details. |  | ||||||
| // |  | ||||||
| // You should have received a copy of the GNU General Public License |  | ||||||
| // along with this program.  If not, see <http://www.gnu.org/licenses/>. |  | ||||||
|  |  | ||||||
| // Code within namespace sha256 are originally from Stephan Brumme. |  | ||||||
| // see http://create.stephan-brumme.com/disclaimer.html |  | ||||||
|  |  | ||||||
| #pragma once |  | ||||||
|  |  | ||||||
| #include <stdint.h> |  | ||||||
| #include <endian.h> |  | ||||||
| #include <cstring> |  | ||||||
| #include <cmath> |  | ||||||
| #include <cassert> |  | ||||||
| #include <string> |  | ||||||
| #include <ostream> |  | ||||||
| #include <istream> |  | ||||||
| #include <vector> |  | ||||||
|  |  | ||||||
| #ifdef CURRENT_DEFAULT_NAMESPACE_NAME |  | ||||||
| namespace CURRENT_DEFAULT_NAMESPACE_NAME { |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| namespace sha256 { |  | ||||||
|  |  | ||||||
|   const size_t BlockSize = 512 / 8; |  | ||||||
|  |  | ||||||
|   const size_t HashBytes = 32; |  | ||||||
|  |  | ||||||
|   const size_t HashValues = HashBytes / 4; |  | ||||||
|  |  | ||||||
|   inline uint32_t rotate(uint32_t a, uint32_t c) |  | ||||||
|   { |  | ||||||
|     return (a >> c) | (a << (32 - c)); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   inline uint32_t swap(uint32_t x) |  | ||||||
|   { |  | ||||||
|     return (x >> 24) | |  | ||||||
|       ((x >>  8) & 0x0000FF00) | |  | ||||||
|       ((x <<  8) & 0x00FF0000) | |  | ||||||
|       (x << 24); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   inline uint32_t f1(uint32_t e, uint32_t f, uint32_t g) |  | ||||||
|   // mix functions for processBlock() |  | ||||||
|   { |  | ||||||
|     uint32_t term1 = rotate(e, 6) ^ rotate(e, 11) ^ rotate(e, 25); |  | ||||||
|     uint32_t term2 = (e & f) ^ (~e & g); //(g ^ (e & (f ^ g))) |  | ||||||
|     return term1 + term2; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   inline uint32_t f2(uint32_t a, uint32_t b, uint32_t c) |  | ||||||
|   // mix functions for processBlock() |  | ||||||
|   { |  | ||||||
|     uint32_t term1 = rotate(a, 2) ^ rotate(a, 13) ^ rotate(a, 22); |  | ||||||
|     uint32_t term2 = ((a | b) & c) | (a & b); //(a & (b ^ c)) ^ (b & c); |  | ||||||
|     return term1 + term2; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   inline void processBlock(uint32_t newHash[8], const uint32_t oldHash[8], const uint8_t data[64]) |  | ||||||
|     // process 64 bytes of data |  | ||||||
|     // newHash and oldHash may be the same buffer |  | ||||||
|   { |  | ||||||
|     // get last hash |  | ||||||
|     uint32_t a = oldHash[0]; |  | ||||||
|     uint32_t b = oldHash[1]; |  | ||||||
|     uint32_t c = oldHash[2]; |  | ||||||
|     uint32_t d = oldHash[3]; |  | ||||||
|     uint32_t e = oldHash[4]; |  | ||||||
|     uint32_t f = oldHash[5]; |  | ||||||
|     uint32_t g = oldHash[6]; |  | ||||||
|     uint32_t h = oldHash[7]; |  | ||||||
|     // data represented as 16x 32-bit words |  | ||||||
|     const uint32_t* input = (uint32_t*) data; |  | ||||||
|     // convert to big endian |  | ||||||
|     uint32_t words[64]; |  | ||||||
|     int i; |  | ||||||
|     for (i = 0; i < 16; i++) { |  | ||||||
| #if defined(__BYTE_ORDER) && (__BYTE_ORDER != 0) && (__BYTE_ORDER == __BIG_ENDIAN) |  | ||||||
|       words[i] =      input[i]; |  | ||||||
| #else |  | ||||||
|       words[i] = swap(input[i]); |  | ||||||
| #endif |  | ||||||
|     } |  | ||||||
|     uint32_t x,y; // temporaries |  | ||||||
|     // first round |  | ||||||
|     x = h + f1(e,f,g) + 0x428a2f98 + words[ 0]; y = f2(a,b,c); d += x; h = x + y; |  | ||||||
|     x = g + f1(d,e,f) + 0x71374491 + words[ 1]; y = f2(h,a,b); c += x; g = x + y; |  | ||||||
|     x = f + f1(c,d,e) + 0xb5c0fbcf + words[ 2]; y = f2(g,h,a); b += x; f = x + y; |  | ||||||
|     x = e + f1(b,c,d) + 0xe9b5dba5 + words[ 3]; y = f2(f,g,h); a += x; e = x + y; |  | ||||||
|     x = d + f1(a,b,c) + 0x3956c25b + words[ 4]; y = f2(e,f,g); h += x; d = x + y; |  | ||||||
|     x = c + f1(h,a,b) + 0x59f111f1 + words[ 5]; y = f2(d,e,f); g += x; c = x + y; |  | ||||||
|     x = b + f1(g,h,a) + 0x923f82a4 + words[ 6]; y = f2(c,d,e); f += x; b = x + y; |  | ||||||
|     x = a + f1(f,g,h) + 0xab1c5ed5 + words[ 7]; y = f2(b,c,d); e += x; a = x + y; |  | ||||||
|     // second round |  | ||||||
|     x = h + f1(e,f,g) + 0xd807aa98 + words[ 8]; y = f2(a,b,c); d += x; h = x + y; |  | ||||||
|     x = g + f1(d,e,f) + 0x12835b01 + words[ 9]; y = f2(h,a,b); c += x; g = x + y; |  | ||||||
|     x = f + f1(c,d,e) + 0x243185be + words[10]; y = f2(g,h,a); b += x; f = x + y; |  | ||||||
|     x = e + f1(b,c,d) + 0x550c7dc3 + words[11]; y = f2(f,g,h); a += x; e = x + y; |  | ||||||
|     x = d + f1(a,b,c) + 0x72be5d74 + words[12]; y = f2(e,f,g); h += x; d = x + y; |  | ||||||
|     x = c + f1(h,a,b) + 0x80deb1fe + words[13]; y = f2(d,e,f); g += x; c = x + y; |  | ||||||
|     x = b + f1(g,h,a) + 0x9bdc06a7 + words[14]; y = f2(c,d,e); f += x; b = x + y; |  | ||||||
|     x = a + f1(f,g,h) + 0xc19bf174 + words[15]; y = f2(b,c,d); e += x; a = x + y; |  | ||||||
|     // extend to 24 words |  | ||||||
|     for (; i < 24; i++) |  | ||||||
|       words[i] = words[i-16] + |  | ||||||
|         (rotate(words[i-15],  7) ^ rotate(words[i-15], 18) ^ (words[i-15] >>  3)) + |  | ||||||
|         words[i-7] + |  | ||||||
|         (rotate(words[i- 2], 17) ^ rotate(words[i- 2], 19) ^ (words[i- 2] >> 10)); |  | ||||||
|     // third round |  | ||||||
|     x = h + f1(e,f,g) + 0xe49b69c1 + words[16]; y = f2(a,b,c); d += x; h = x + y; |  | ||||||
|     x = g + f1(d,e,f) + 0xefbe4786 + words[17]; y = f2(h,a,b); c += x; g = x + y; |  | ||||||
|     x = f + f1(c,d,e) + 0x0fc19dc6 + words[18]; y = f2(g,h,a); b += x; f = x + y; |  | ||||||
|     x = e + f1(b,c,d) + 0x240ca1cc + words[19]; y = f2(f,g,h); a += x; e = x + y; |  | ||||||
|     x = d + f1(a,b,c) + 0x2de92c6f + words[20]; y = f2(e,f,g); h += x; d = x + y; |  | ||||||
|     x = c + f1(h,a,b) + 0x4a7484aa + words[21]; y = f2(d,e,f); g += x; c = x + y; |  | ||||||
|     x = b + f1(g,h,a) + 0x5cb0a9dc + words[22]; y = f2(c,d,e); f += x; b = x + y; |  | ||||||
|     x = a + f1(f,g,h) + 0x76f988da + words[23]; y = f2(b,c,d); e += x; a = x + y; |  | ||||||
|     // extend to 32 words |  | ||||||
|     for (; i < 32; i++) |  | ||||||
|       words[i] = words[i-16] + |  | ||||||
|         (rotate(words[i-15],  7) ^ rotate(words[i-15], 18) ^ (words[i-15] >>  3)) + |  | ||||||
|         words[i-7] + |  | ||||||
|         (rotate(words[i- 2], 17) ^ rotate(words[i- 2], 19) ^ (words[i- 2] >> 10)); |  | ||||||
|     // fourth round |  | ||||||
|     x = h + f1(e,f,g) + 0x983e5152 + words[24]; y = f2(a,b,c); d += x; h = x + y; |  | ||||||
|     x = g + f1(d,e,f) + 0xa831c66d + words[25]; y = f2(h,a,b); c += x; g = x + y; |  | ||||||
|     x = f + f1(c,d,e) + 0xb00327c8 + words[26]; y = f2(g,h,a); b += x; f = x + y; |  | ||||||
|     x = e + f1(b,c,d) + 0xbf597fc7 + words[27]; y = f2(f,g,h); a += x; e = x + y; |  | ||||||
|     x = d + f1(a,b,c) + 0xc6e00bf3 + words[28]; y = f2(e,f,g); h += x; d = x + y; |  | ||||||
|     x = c + f1(h,a,b) + 0xd5a79147 + words[29]; y = f2(d,e,f); g += x; c = x + y; |  | ||||||
|     x = b + f1(g,h,a) + 0x06ca6351 + words[30]; y = f2(c,d,e); f += x; b = x + y; |  | ||||||
|     x = a + f1(f,g,h) + 0x14292967 + words[31]; y = f2(b,c,d); e += x; a = x + y; |  | ||||||
|     // extend to 40 words |  | ||||||
|     for (; i < 40; i++) |  | ||||||
|       words[i] = words[i-16] + |  | ||||||
|         (rotate(words[i-15],  7) ^ rotate(words[i-15], 18) ^ (words[i-15] >>  3)) + |  | ||||||
|         words[i-7] + |  | ||||||
|         (rotate(words[i- 2], 17) ^ rotate(words[i- 2], 19) ^ (words[i- 2] >> 10)); |  | ||||||
|     // fifth round |  | ||||||
|     x = h + f1(e,f,g) + 0x27b70a85 + words[32]; y = f2(a,b,c); d += x; h = x + y; |  | ||||||
|     x = g + f1(d,e,f) + 0x2e1b2138 + words[33]; y = f2(h,a,b); c += x; g = x + y; |  | ||||||
|     x = f + f1(c,d,e) + 0x4d2c6dfc + words[34]; y = f2(g,h,a); b += x; f = x + y; |  | ||||||
|     x = e + f1(b,c,d) + 0x53380d13 + words[35]; y = f2(f,g,h); a += x; e = x + y; |  | ||||||
|     x = d + f1(a,b,c) + 0x650a7354 + words[36]; y = f2(e,f,g); h += x; d = x + y; |  | ||||||
|     x = c + f1(h,a,b) + 0x766a0abb + words[37]; y = f2(d,e,f); g += x; c = x + y; |  | ||||||
|     x = b + f1(g,h,a) + 0x81c2c92e + words[38]; y = f2(c,d,e); f += x; b = x + y; |  | ||||||
|     x = a + f1(f,g,h) + 0x92722c85 + words[39]; y = f2(b,c,d); e += x; a = x + y; |  | ||||||
|     // extend to 48 words |  | ||||||
|     for (; i < 48; i++) |  | ||||||
|       words[i] = words[i-16] + |  | ||||||
|         (rotate(words[i-15],  7) ^ rotate(words[i-15], 18) ^ (words[i-15] >>  3)) + |  | ||||||
|         words[i-7] + |  | ||||||
|         (rotate(words[i- 2], 17) ^ rotate(words[i- 2], 19) ^ (words[i- 2] >> 10)); |  | ||||||
|     // sixth round |  | ||||||
|     x = h + f1(e,f,g) + 0xa2bfe8a1 + words[40]; y = f2(a,b,c); d += x; h = x + y; |  | ||||||
|     x = g + f1(d,e,f) + 0xa81a664b + words[41]; y = f2(h,a,b); c += x; g = x + y; |  | ||||||
|     x = f + f1(c,d,e) + 0xc24b8b70 + words[42]; y = f2(g,h,a); b += x; f = x + y; |  | ||||||
|     x = e + f1(b,c,d) + 0xc76c51a3 + words[43]; y = f2(f,g,h); a += x; e = x + y; |  | ||||||
|     x = d + f1(a,b,c) + 0xd192e819 + words[44]; y = f2(e,f,g); h += x; d = x + y; |  | ||||||
|     x = c + f1(h,a,b) + 0xd6990624 + words[45]; y = f2(d,e,f); g += x; c = x + y; |  | ||||||
|     x = b + f1(g,h,a) + 0xf40e3585 + words[46]; y = f2(c,d,e); f += x; b = x + y; |  | ||||||
|     x = a + f1(f,g,h) + 0x106aa070 + words[47]; y = f2(b,c,d); e += x; a = x + y; |  | ||||||
|     // extend to 56 words |  | ||||||
|     for (; i < 56; i++) |  | ||||||
|       words[i] = words[i-16] + |  | ||||||
|         (rotate(words[i-15],  7) ^ rotate(words[i-15], 18) ^ (words[i-15] >>  3)) + |  | ||||||
|         words[i-7] + |  | ||||||
|         (rotate(words[i- 2], 17) ^ rotate(words[i- 2], 19) ^ (words[i- 2] >> 10)); |  | ||||||
|     // seventh round |  | ||||||
|     x = h + f1(e,f,g) + 0x19a4c116 + words[48]; y = f2(a,b,c); d += x; h = x + y; |  | ||||||
|     x = g + f1(d,e,f) + 0x1e376c08 + words[49]; y = f2(h,a,b); c += x; g = x + y; |  | ||||||
|     x = f + f1(c,d,e) + 0x2748774c + words[50]; y = f2(g,h,a); b += x; f = x + y; |  | ||||||
|     x = e + f1(b,c,d) + 0x34b0bcb5 + words[51]; y = f2(f,g,h); a += x; e = x + y; |  | ||||||
|     x = d + f1(a,b,c) + 0x391c0cb3 + words[52]; y = f2(e,f,g); h += x; d = x + y; |  | ||||||
|     x = c + f1(h,a,b) + 0x4ed8aa4a + words[53]; y = f2(d,e,f); g += x; c = x + y; |  | ||||||
|     x = b + f1(g,h,a) + 0x5b9cca4f + words[54]; y = f2(c,d,e); f += x; b = x + y; |  | ||||||
|     x = a + f1(f,g,h) + 0x682e6ff3 + words[55]; y = f2(b,c,d); e += x; a = x + y; |  | ||||||
|     // extend to 64 words |  | ||||||
|     for (; i < 64; i++) |  | ||||||
|       words[i] = words[i-16] + |  | ||||||
|         (rotate(words[i-15],  7) ^ rotate(words[i-15], 18) ^ (words[i-15] >>  3)) + |  | ||||||
|         words[i-7] + |  | ||||||
|         (rotate(words[i- 2], 17) ^ rotate(words[i- 2], 19) ^ (words[i- 2] >> 10)); |  | ||||||
|     // eighth round |  | ||||||
|     x = h + f1(e,f,g) + 0x748f82ee + words[56]; y = f2(a,b,c); d += x; h = x + y; |  | ||||||
|     x = g + f1(d,e,f) + 0x78a5636f + words[57]; y = f2(h,a,b); c += x; g = x + y; |  | ||||||
|     x = f + f1(c,d,e) + 0x84c87814 + words[58]; y = f2(g,h,a); b += x; f = x + y; |  | ||||||
|     x = e + f1(b,c,d) + 0x8cc70208 + words[59]; y = f2(f,g,h); a += x; e = x + y; |  | ||||||
|     x = d + f1(a,b,c) + 0x90befffa + words[60]; y = f2(e,f,g); h += x; d = x + y; |  | ||||||
|     x = c + f1(h,a,b) + 0xa4506ceb + words[61]; y = f2(d,e,f); g += x; c = x + y; |  | ||||||
|     x = b + f1(g,h,a) + 0xbef9a3f7 + words[62]; y = f2(c,d,e); f += x; b = x + y; |  | ||||||
|     x = a + f1(f,g,h) + 0xc67178f2 + words[63]; y = f2(b,c,d); e += x; a = x + y; |  | ||||||
|     // update hash |  | ||||||
|     newHash[0] = a + oldHash[0]; |  | ||||||
|     newHash[1] = b + oldHash[1]; |  | ||||||
|     newHash[2] = c + oldHash[2]; |  | ||||||
|     newHash[3] = d + oldHash[3]; |  | ||||||
|     newHash[4] = e + oldHash[4]; |  | ||||||
|     newHash[5] = f + oldHash[5]; |  | ||||||
|     newHash[6] = g + oldHash[6]; |  | ||||||
|     newHash[7] = h + oldHash[7]; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   inline void processInput( |  | ||||||
|       uint32_t hash[8], |  | ||||||
|       const uint32_t oldHash[8], const uint64_t numBytes, |  | ||||||
|       const uint8_t* input, const size_t inputSize) |  | ||||||
|     // process final block, less than 64 bytes |  | ||||||
|     // newHash and oldHash may be the same buffer |  | ||||||
|   { |  | ||||||
|     // the input bytes are considered as bit strings, where the first bit is the most significant bit of the byte |  | ||||||
|     // - append "1" bit to message |  | ||||||
|     // - append "0" bits until message length in bit mod 512 is 448 |  | ||||||
|     // - append length as 64 bit integer |  | ||||||
|     // process initial parts of input |  | ||||||
|     std::memmove(hash, oldHash, 32); |  | ||||||
|     const int nBlocks = inputSize / 64; |  | ||||||
|     for (int i = 0; i < nBlocks; ++i) { |  | ||||||
|       processBlock(hash, hash, input + i * 64); |  | ||||||
|     } |  | ||||||
|     // initialize buffer from input |  | ||||||
|     const size_t bufferSize = inputSize - nBlocks * 64; |  | ||||||
|     unsigned char buffer[BlockSize]; |  | ||||||
|     std::memcpy(buffer, input + nBlocks * 64, bufferSize); |  | ||||||
|     // number of bits |  | ||||||
|     size_t paddedLength = bufferSize * 8; |  | ||||||
|     // plus one bit set to 1 (always appended) |  | ||||||
|     paddedLength++; |  | ||||||
|     // number of bits must be (numBits % 512) = 448 |  | ||||||
|     size_t lower11Bits = paddedLength & 511; |  | ||||||
|     if (lower11Bits <= 448) { |  | ||||||
|       paddedLength +=       448 - lower11Bits; |  | ||||||
|     } else { |  | ||||||
|       paddedLength += 512 + 448 - lower11Bits; |  | ||||||
|     } |  | ||||||
|     // convert from bits to bytes |  | ||||||
|     paddedLength /= 8; |  | ||||||
|     // only needed if additional data flows over into a second block |  | ||||||
|     unsigned char extra[BlockSize]; |  | ||||||
|     // append a "1" bit, 128 => binary 10000000 |  | ||||||
|     if (bufferSize < BlockSize) { |  | ||||||
|       buffer[bufferSize] = 128; |  | ||||||
|     } else { |  | ||||||
|       extra[0] = 128; |  | ||||||
|     } |  | ||||||
|     size_t i; |  | ||||||
|     for (i = bufferSize + 1; i < BlockSize; i++) { |  | ||||||
|       buffer[i] = 0; |  | ||||||
|     } |  | ||||||
|     for (; i < paddedLength; i++) { |  | ||||||
|       extra[i - BlockSize] = 0; |  | ||||||
|     } |  | ||||||
|     // add message length in bits as 64 bit number |  | ||||||
|     uint64_t msgBits = 8 * (numBytes + inputSize); |  | ||||||
|     // find right position |  | ||||||
|     unsigned char* addLength; |  | ||||||
|     if (paddedLength < BlockSize) { |  | ||||||
|       addLength = buffer + paddedLength; |  | ||||||
|     } else { |  | ||||||
|       addLength = extra + paddedLength - BlockSize; |  | ||||||
|     } |  | ||||||
|     // must be big endian |  | ||||||
|     *addLength++ = (unsigned char)((msgBits >> 56) & 0xFF); |  | ||||||
|     *addLength++ = (unsigned char)((msgBits >> 48) & 0xFF); |  | ||||||
|     *addLength++ = (unsigned char)((msgBits >> 40) & 0xFF); |  | ||||||
|     *addLength++ = (unsigned char)((msgBits >> 32) & 0xFF); |  | ||||||
|     *addLength++ = (unsigned char)((msgBits >> 24) & 0xFF); |  | ||||||
|     *addLength++ = (unsigned char)((msgBits >> 16) & 0xFF); |  | ||||||
|     *addLength++ = (unsigned char)((msgBits >>  8) & 0xFF); |  | ||||||
|     *addLength   = (unsigned char)( msgBits        & 0xFF); |  | ||||||
|     // process blocks |  | ||||||
|     processBlock(hash, hash, buffer); |  | ||||||
|     // flowed over into a second block ? |  | ||||||
|     if (paddedLength > BlockSize) { |  | ||||||
|       processBlock(hash, hash, extra); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
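As a worked instance of the padding rule above: a 3-byte final fragment is 24 bits; the mandatory 1-bit makes 25, zero-padding to 448 bits gives paddedLength = 56 bytes, and the 64-bit length field fills the block out to exactly 512 bits, so everything fits in buffer and the extra block is never touched. Any final fragment of 56 bytes or more pushes the length field into the second block.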
|  |  | ||||||
|   inline void setInitialHash(uint32_t hash[8]) |  | ||||||
|   { |  | ||||||
|     hash[0] = 0x6a09e667; |  | ||||||
|     hash[1] = 0xbb67ae85; |  | ||||||
|     hash[2] = 0x3c6ef372; |  | ||||||
|     hash[3] = 0xa54ff53a; |  | ||||||
|     hash[4] = 0x510e527f; |  | ||||||
|     hash[5] = 0x9b05688c; |  | ||||||
|     hash[6] = 0x1f83d9ab; |  | ||||||
|     hash[7] = 0x5be0cd19; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   inline void computeHash(uint32_t hash[8], const void* data, const size_t size) |  | ||||||
|   { |  | ||||||
|     uint32_t initHash[8]; |  | ||||||
|     setInitialHash(initHash); |  | ||||||
|     processInput(hash, initHash, 0, (const uint8_t*)data, size); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   inline void rawHashFromHash(uint8_t rawHash[HashBytes], const uint32_t hash[HashValues]) |  | ||||||
|   { |  | ||||||
|     uint8_t* current = rawHash; |  | ||||||
|     for (size_t i = 0; i < HashValues; i++) { |  | ||||||
|       *current++ = (hash[i] >> 24) & 0xFF; |  | ||||||
|       *current++ = (hash[i] >> 16) & 0xFF; |  | ||||||
|       *current++ = (hash[i] >>  8) & 0xFF; |  | ||||||
|       *current++ =  hash[i]        & 0xFF; |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   inline std::string showRawHash(const uint8_t rawHash[HashBytes]) |  | ||||||
|   { |  | ||||||
|     std::string result; |  | ||||||
|     result.reserve(2 * HashBytes); |  | ||||||
|     for (size_t i = 0; i < HashBytes; i++) { |  | ||||||
|       static const char dec2hex[16+1] = "0123456789abcdef"; |  | ||||||
|       result += dec2hex[(rawHash[i] >> 4) & 15]; |  | ||||||
|       result += dec2hex[ rawHash[i]       & 15]; |  | ||||||
|     } |  | ||||||
|     return result; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   inline std::string showHash(const uint32_t hash[8]) |  | ||||||
|   { |  | ||||||
|     unsigned char rawHash[HashBytes]; |  | ||||||
|     rawHashFromHash(rawHash, hash); |  | ||||||
|     return showRawHash(rawHash); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #ifdef CURRENT_DEFAULT_NAMESPACE_NAME |  | ||||||
| } |  | ||||||
| #endif |  | ||||||
| @@ -1,125 +0,0 @@ | |||||||
| // vim: set ts=2 sw=2 expandtab: |  | ||||||
|  |  | ||||||
| // Copyright (c) 2014 Luchang Jin |  | ||||||
| // All rights reserved. |  | ||||||
|  |  | ||||||
| // This program is free software: you can redistribute it and/or modify |  | ||||||
| // it under the terms of the GNU General Public License as published by |  | ||||||
| // the Free Software Foundation, either version 2 of the License, or |  | ||||||
| // (at your option) any later version. |  | ||||||
| // |  | ||||||
| // This program is distributed in the hope that it will be useful, |  | ||||||
| // but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
| // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
| // GNU General Public License for more details. |  | ||||||
| // |  | ||||||
| // You should have received a copy of the GNU General Public License |  | ||||||
| // along with this program.  If not, see <http://www.gnu.org/licenses/>. |  | ||||||
|  |  | ||||||
| #pragma once |  | ||||||
|  |  | ||||||
| #ifndef INCLUDE_SHOW_H |  | ||||||
| #define INCLUDE_SHOW_H |  | ||||||
|  |  | ||||||
| #include <sstream> |  | ||||||
| #include <string> |  | ||||||
| #include <cstdarg> |  | ||||||
| #include <cstring> |  | ||||||
| #include <cstdlib> |  | ||||||
| #include <cstdio> |  | ||||||
| #include <sstream> |  | ||||||
|  |  | ||||||
| #ifdef CURRENT_DEFAULT_NAMESPACE_NAME |  | ||||||
| namespace CURRENT_DEFAULT_NAMESPACE_NAME { |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| inline std::string vssprintf(const char* fmt, va_list args) |  | 
| { |  | 
|   char* cstr = NULL; |  | 
|   if (vasprintf(&cstr, fmt, args) < 0) { |  | 
|     return std::string();  // allocation/format failure: return empty rather than read an unset pointer |  | 
|   } |  | 
|   std::string str(cstr); |  | 
|   std::free(cstr); |  | 
|   return str; |  | 
| } |  | 
|  |  | 
| inline std::string ssprintf(const char* fmt, ...) |  | 
| { |  | 
|   va_list args; |  | 
|   va_start(args, fmt); |  | 
|   std::string str = vssprintf(fmt, args); |  | 
|   va_end(args);  // balance va_start before returning |  | 
|   return str; |  | 
| } |  | 
|  |  | ||||||
| inline std::string show() |  | ||||||
| { |  | ||||||
|   return ""; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline std::string show(const int& x) |  | ||||||
| { |  | ||||||
|   return ssprintf("%d", x); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline std::string show(const unsigned int& x) |  | ||||||
| { |  | ||||||
|   return ssprintf("%u", x); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline std::string show(const long& x) |  | ||||||
| { |  | ||||||
|   return ssprintf("%ld", x); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline std::string show(const unsigned long& x) |  | ||||||
| { |  | ||||||
|   return ssprintf("%lu", x); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline std::string show(const double& x) |  | ||||||
| { |  | ||||||
|   return ssprintf("%24.17E", x); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline std::string show(const bool& x) |  | ||||||
| { |  | ||||||
|   return x ? "true" : "false"; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline std::string show(const std::string& x) |  | ||||||
| { |  | ||||||
|   std::ostringstream out; |  | ||||||
|   out << x; |  | ||||||
|   return out.str(); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template <class T> |  | ||||||
| std::string shows(const T& x) |  | ||||||
| { |  | ||||||
|   std::ostringstream out; |  | ||||||
|   out << x; |  | ||||||
|   return out.str(); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| template <class T> |  | ||||||
| T& reads(T& x, const std::string& str) |  | ||||||
| { |  | ||||||
|   std::istringstream in(str); |  | ||||||
|   in >> x; |  | ||||||
|   return x; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline void fdisplay(FILE* fp, const std::string& str) |  | ||||||
| { |  | ||||||
|   fprintf(fp, "%s", str.c_str()); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline void fdisplayln(FILE* fp, const std::string& str) |  | ||||||
| { |  | ||||||
|   fprintf(fp, "%s\n", str.c_str()); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #ifdef CURRENT_DEFAULT_NAMESPACE_NAME |  | ||||||
| } |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| #endif |  | ||||||
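The formatting helpers above form a tiny print/parse layer. A hedged round-trip sketch, compiled against the header shown above; the main driver is hypothetical and not part of the original file:

  #include <cstdio>

  // Hypothetical driver; show/reads/shows/fdisplayln are the helpers defined above.
  int main()
  {
    double x = 0.0;
    reads(x, show(3.5));            // show() prints with "%24.17E", reads() parses it back
    fdisplayln(stdout, show(x));    // ->  3.50000000000000000E+00
    fdisplayln(stdout, shows(42));  // shows() covers any type with operator<<
    return 0;
  }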
| @@ -1,115 +0,0 @@ | |||||||
| // vim: set ts=2 sw=2 expandtab: |  | ||||||
|  |  | ||||||
| // Copyright (c) 2016 Luchang Jin |  | ||||||
| // All rights reserved. |  | ||||||
|  |  | ||||||
| // This program is free software: you can redistribute it and/or modify |  | ||||||
| // it under the terms of the GNU General Public License as published by |  | ||||||
| // the Free Software Foundation, either version 2 of the License, or |  | ||||||
| // (at your option) any later version. |  | ||||||
| // |  | ||||||
| // This program is distributed in the hope that it will be useful, |  | ||||||
| // but WITHOUT ANY WARRANTY; without even the implied warranty of |  | ||||||
| // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the |  | ||||||
| // GNU General Public License for more details. |  | ||||||
| // |  | ||||||
| // You should have received a copy of the GNU General Public License |  | ||||||
| // along with this program.  If not, see <http://www.gnu.org/licenses/>. |  | ||||||
|  |  | ||||||
| #pragma once |  | ||||||
|  |  | ||||||
| #ifndef INCLUDE_SPRNG_SHA256_H |  | ||||||
| #define INCLUDE_SPRNG_SHA256_H |  | ||||||
|  |  | ||||||
| #include "rng-state.h" |  | ||||||
|  |  | ||||||
| #include <array> |  | ||||||
| #include <cstring> |  | ||||||
| #include <ostream> |  | ||||||
| #include <istream> |  | ||||||
|  |  | ||||||
| #ifdef CURRENT_DEFAULT_NAMESPACE_NAME |  | ||||||
| namespace CURRENT_DEFAULT_NAMESPACE_NAME { |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| struct SprngSha256 |  | ||||||
| { |  | ||||||
|   RngState rs; |  | ||||||
|   // |  | ||||||
|   using result_type = uint64_t; |  | ||||||
|   // |  | ||||||
|   static constexpr result_type default_seed = 0; |  | ||||||
|   // |  | ||||||
|   explicit SprngSha256(result_type val = default_seed) |  | ||||||
|   { |  | ||||||
|     seed(val); |  | ||||||
|   } |  | ||||||
|   template<typename Sseq, typename = typename |  | ||||||
|     std::enable_if<!std::is_same<Sseq, SprngSha256>::value> |  | ||||||
|     ::type> |  | ||||||
|   explicit SprngSha256(Sseq& q) |  | ||||||
|   { |  | ||||||
|     seed(q); |  | ||||||
|   } |  | ||||||
|   // |  | ||||||
|   static constexpr result_type min() |  | ||||||
|   { |  | ||||||
|     return 0; |  | ||||||
|   } |  | ||||||
|   // |  | ||||||
|   static constexpr result_type max() |  | ||||||
|   { |  | ||||||
|     return UINT64_MAX; |  | ||||||
|   } |  | ||||||
|   // |  | ||||||
|   void seed(result_type val = default_seed) |  | ||||||
|   { |  | ||||||
|     reset(rs, (long)val); |  | ||||||
|   } |  | ||||||
|   template <class Sseq> |  | ||||||
|   typename std::enable_if<std::is_class<Sseq>::value>::type |  | ||||||
|   seed(Sseq& q) |  | ||||||
|   { |  | ||||||
|     std::array<uint32_t, 8> seq; |  | ||||||
|     q.generate(seq.begin(), seq.end()); |  | ||||||
|     reset(rs); |  | ||||||
|     for (size_t i = 0; i < seq.size(); ++i) { |  | ||||||
|       splitRngState(rs, rs, seq[i]); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
|   // |  | ||||||
|   result_type operator()() |  | ||||||
|   { |  | ||||||
|     return randGen(rs); |  | ||||||
|   } |  | ||||||
|   // |  | ||||||
|   void discard(unsigned long long z) |  | ||||||
|   { |  | ||||||
|     for (unsigned long long i = 0; i < z; ++i) { |  | ||||||
|       randGen(rs); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
| }; |  | ||||||
|  |  | ||||||
| inline std::ostream& operator<<(std::ostream& os, const SprngSha256& ss) |  | ||||||
| { |  | ||||||
|   os << ss.rs; |  | ||||||
|   return os; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline std::istream& operator>>(std::istream& is, SprngSha256& ss) |  | ||||||
| { |  | ||||||
|   is >> ss.rs; |  | ||||||
|   return is; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| inline bool operator==(const SprngSha256& ss1, const SprngSha256& ss2) |  | ||||||
| { |  | ||||||
|   return ss1.rs == ss2.rs; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #ifdef CURRENT_DEFAULT_NAMESPACE_NAME |  | ||||||
| } |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| #endif |  | ||||||
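SprngSha256 above wraps RngState behind the standard UniformRandomBitGenerator interface (result_type, min(), max(), operator(), seed(), discard()), so it can drive the <random> distributions directly. A hedged sketch, assuming rng-state.h from the same source tree is available; the main driver is illustrative only:

  #include <iostream>
  #include <random>

  // Hypothetical driver; SprngSha256 comes from the header shown above.
  int main()
  {
    SprngSha256 rng(12345);                                // seeds through reset(rs, val)
    std::uniform_real_distribution<double> uni(0.0, 1.0);
    std::cout << uni(rng) << std::endl;                    // any <random> distribution accepts it
    rng.discard(100);                                      // advance by drawing and dropping values
    return 0;
  }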
| @@ -33,7 +33,6 @@ directory | |||||||
| #define GRID_QCD_FERMION_OPERATOR_IMPL_H | #define GRID_QCD_FERMION_OPERATOR_IMPL_H | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
| namespace QCD { | namespace QCD { | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -108,13 +107,14 @@ namespace Grid { | |||||||
|   INHERIT_GIMPL_TYPES(Base)	 \ |   INHERIT_GIMPL_TYPES(Base)	 \ | ||||||
|   INHERIT_FIMPL_TYPES(Base) |   INHERIT_FIMPL_TYPES(Base) | ||||||
|    |    | ||||||
|     /////// |   ///////////////////////////////////////////////////////////////////////////// | ||||||
|   // Single flavour four spinors with colour index |   // Single flavour four spinors with colour index | ||||||
|     /////// |   ///////////////////////////////////////////////////////////////////////////// | ||||||
|   template <class S, class Representation = FundamentalRepresentation,class _Coeff_t = RealD > |   template <class S, class Representation = FundamentalRepresentation,class _Coeff_t = RealD > | ||||||
|     class WilsonImpl |   class WilsonImpl : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > { | ||||||
|       : public PeriodicGaugeImpl<GaugeImplTypes<S, Representation::Dimension > > { |  | ||||||
|     public: |     public: | ||||||
|  |  | ||||||
|     static const int Dimension = Representation::Dimension; |     static const int Dimension = Representation::Dimension; | ||||||
|     typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl; |     typedef PeriodicGaugeImpl<GaugeImplTypes<S, Dimension > > Gimpl; | ||||||
|        |        | ||||||
| @@ -124,7 +124,6 @@ namespace Grid { | |||||||
|     const bool LsVectorised=false; |     const bool LsVectorised=false; | ||||||
|     typedef _Coeff_t Coeff_t; |     typedef _Coeff_t Coeff_t; | ||||||
|  |  | ||||||
|  |  | ||||||
|     INHERIT_GIMPL_TYPES(Gimpl); |     INHERIT_GIMPL_TYPES(Gimpl); | ||||||
|        |        | ||||||
|     template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Dimension>, Ns> >; |     template <typename vtype> using iImplSpinor            = iScalar<iVector<iVector<vtype, Dimension>, Ns> >; | ||||||
| @@ -158,8 +157,7 @@ namespace Grid { | |||||||
|     } |     } | ||||||
|        |        | ||||||
|     template <class ref> |     template <class ref> | ||||||
|       inline void loadLinkElement(Simd ®, |     inline void loadLinkElement(Simd ®, ref &memory) { | ||||||
| 				  ref &memory) { |  | ||||||
|       reg = memory; |       reg = memory; | ||||||
|     } |     } | ||||||
|        |        | ||||||
| @@ -202,9 +200,10 @@ namespace Grid { | |||||||
|     } |     } | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|     /////// |   //////////////////////////////////////////////////////////////////////////////////// | ||||||
|   // Single flavour four spinors with colour index, 5d redblack |   // Single flavour four spinors with colour index, 5d redblack | ||||||
|     /////// |   //////////////////////////////////////////////////////////////////////////////////// | ||||||
|  |  | ||||||
| template<class S,int Nrepresentation=Nc,class _Coeff_t = RealD> | template<class S,int Nrepresentation=Nc,class _Coeff_t = RealD> | ||||||
| class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > {  | class DomainWallVec5dImpl :  public PeriodicGaugeImpl< GaugeImplTypes< S,Nrepresentation> > {  | ||||||
|   public: |   public: | ||||||
| @@ -227,12 +226,9 @@ namespace Grid { | |||||||
|   typedef Lattice<SiteSpinor> FermionField; |   typedef Lattice<SiteSpinor> FermionField; | ||||||
|    |    | ||||||
|   // Make the doubled gauge field a *scalar* |   // Make the doubled gauge field a *scalar* | ||||||
|       typedef iImplDoubledGaugeField<typename Simd::scalar_type> |   typedef iImplDoubledGaugeField<typename Simd::scalar_type>  SiteDoubledGaugeField;  // This is a scalar | ||||||
|       SiteDoubledGaugeField;  // This is a scalar |   typedef iImplGaugeField<typename Simd::scalar_type>         SiteScalarGaugeField;  // scalar | ||||||
|       typedef iImplGaugeField<typename Simd::scalar_type> |   typedef iImplGaugeLink<typename Simd::scalar_type>          SiteScalarGaugeLink;  // scalar | ||||||
|       SiteScalarGaugeField;  // scalar |  | ||||||
|       typedef iImplGaugeLink<typename Simd::scalar_type> |  | ||||||
|       SiteScalarGaugeLink;  // scalar |  | ||||||
|        |        | ||||||
|   typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField; |   typedef Lattice<SiteDoubledGaugeField> DoubledGaugeField; | ||||||
|        |        | ||||||
| @@ -250,6 +246,7 @@ namespace Grid { | |||||||
|   inline void loadLinkElement(Simd ®, ref &memory) { |   inline void loadLinkElement(Simd ®, ref &memory) { | ||||||
|     vsplat(reg, memory); |     vsplat(reg, memory); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U, |   inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U, | ||||||
| 		       const SiteHalfSpinor &chi, int mu, StencilEntry *SE, | 		       const SiteHalfSpinor &chi, int mu, StencilEntry *SE, | ||||||
| 		       StencilImpl &St) { | 		       StencilImpl &St) { | ||||||
| @@ -262,8 +259,8 @@ namespace Grid { | |||||||
|     mult(&phi(), &UU(), &chi()); |     mult(&phi(), &UU(), &chi()); | ||||||
|   } |   } | ||||||
|        |        | ||||||
|       inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds, |   inline void DoubleStore(GridBase *GaugeGrid, DoubledGaugeField &Uds,const GaugeField &Umu)  | ||||||
| 			      const GaugeField &Umu) { |   { | ||||||
|     SiteScalarGaugeField ScalarUmu; |     SiteScalarGaugeField ScalarUmu; | ||||||
|     SiteDoubledGaugeField ScalarUds; |     SiteDoubledGaugeField ScalarUds; | ||||||
|      |      | ||||||
| @@ -289,13 +286,13 @@ namespace Grid { | |||||||
|     } |     } | ||||||
|   } |   } | ||||||
|        |        | ||||||
|       inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, |   inline void InsertForce4D(GaugeField &mat, FermionField &Btilde,FermionField &A, int mu)  | ||||||
| 				FermionField &A, int mu) { |   { | ||||||
|     assert(0); |     assert(0); | ||||||
|   } |   } | ||||||
|        |        | ||||||
|       inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, |   inline void InsertForce5D(GaugeField &mat, FermionField &Btilde,FermionField Ã, int mu)  | ||||||
| 				FermionField Ã, int mu) { |   { | ||||||
| 	assert(0); | 	assert(0); | ||||||
|   } |   } | ||||||
| }; | }; | ||||||
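The scalar SiteDoubledGaugeField and the vsplat in loadLinkElement above reflect the Ls-vectorised layout: the SIMD lanes run over the fifth dimension, and the gauge links do not depend on s, so a single scalar link element is broadcast to every lane rather than gathered per lane. A minimal sketch of that broadcast; the helper name broadcastLinkElement is introduced here and vsplat is assumed in scope as in the code above:

  // Illustrative only: the broadcast that loadLinkElement above performs.
  template <class Vec, class Scal>
  inline void broadcastLinkElement(Vec &reg, const Scal &elem)
  {
    vsplat(reg, elem);  // every SIMD lane (every s-slice) receives the same link element
  }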
| @@ -305,9 +302,9 @@ namespace Grid { | |||||||
|     //////////////////////////////////////////////////////////////////////////////////////// |     //////////////////////////////////////////////////////////////////////////////////////// | ||||||
|      |      | ||||||
| template <class S, int Nrepresentation,class _Coeff_t = RealD> | template <class S, int Nrepresentation,class _Coeff_t = RealD> | ||||||
|     class GparityWilsonImpl | class GparityWilsonImpl : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > { | ||||||
|       : public ConjugateGaugeImpl<GaugeImplTypes<S, Nrepresentation> > { |  | ||||||
|  public: |  public: | ||||||
|  |  | ||||||
|  static const int Dimension = Nrepresentation; |  static const int Dimension = Nrepresentation; | ||||||
|  |  | ||||||
|  const bool LsVectorised=false; |  const bool LsVectorised=false; | ||||||
| @@ -317,15 +314,9 @@ namespace Grid { | |||||||
|   |   | ||||||
|  INHERIT_GIMPL_TYPES(Gimpl); |  INHERIT_GIMPL_TYPES(Gimpl); | ||||||
|        |        | ||||||
|       template <typename vtype> |  template <typename vtype> using iImplSpinor            = iVector<iVector<iVector<vtype, Nrepresentation>, Ns>, Ngp>; | ||||||
|       using iImplSpinor = |  template <typename vtype> using iImplHalfSpinor        = iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>, Ngp>; | ||||||
|       iVector<iVector<iVector<vtype, Nrepresentation>, Ns>, Ngp>; |  template <typename vtype> using iImplDoubledGaugeField = iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>, Ngp>; | ||||||
|       template <typename vtype> |  | ||||||
|       using iImplHalfSpinor = |  | ||||||
| 	iVector<iVector<iVector<vtype, Nrepresentation>, Nhs>, Ngp>; |  | ||||||
|       template <typename vtype> |  | ||||||
|       using iImplDoubledGaugeField = |  | ||||||
| 	iVector<iVector<iScalar<iMatrix<vtype, Nrepresentation> >, Nds>, Ngp>; |  | ||||||
|        |        | ||||||
|  typedef iImplSpinor<Simd> SiteSpinor; |  typedef iImplSpinor<Simd> SiteSpinor; | ||||||
|  typedef iImplHalfSpinor<Simd> SiteHalfSpinor; |  typedef iImplHalfSpinor<Simd> SiteHalfSpinor; | ||||||
| @@ -341,7 +332,6 @@ namespace Grid { | |||||||
|        |        | ||||||
|  ImplParams Params; |  ImplParams Params; | ||||||
|  |  | ||||||
|  |  | ||||||
|  GparityWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){}; |  GparityWilsonImpl(const ImplParams &p = ImplParams()) : Params(p){}; | ||||||
|  |  | ||||||
|  bool overlapCommsCompute(void) { return Params.overlapCommsCompute; }; |  bool overlapCommsCompute(void) { return Params.overlapCommsCompute; }; | ||||||
| @@ -351,6 +341,7 @@ namespace Grid { | |||||||
|  inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U, |  inline void multLink(SiteHalfSpinor &phi, const SiteDoubledGaugeField &U, | ||||||
| 		      const SiteHalfSpinor &chi, int mu, StencilEntry *SE, | 		      const SiteHalfSpinor &chi, int mu, StencilEntry *SE, | ||||||
| 		      StencilImpl &St) { | 		      StencilImpl &St) { | ||||||
|  |  | ||||||
|   typedef SiteHalfSpinor vobj; |   typedef SiteHalfSpinor vobj; | ||||||
|    typedef typename SiteHalfSpinor::scalar_object sobj; |    typedef typename SiteHalfSpinor::scalar_object sobj; | ||||||
| 	 | 	 | ||||||
| @@ -419,7 +410,6 @@ namespace Grid { | |||||||
|  |  | ||||||
|  inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) |  inline void DoubleStore(GridBase *GaugeGrid,DoubledGaugeField &Uds,const GaugeField &Umu) | ||||||
|  { |  { | ||||||
| 	 |  | ||||||
|    conformable(Uds._grid,GaugeGrid); |    conformable(Uds._grid,GaugeGrid); | ||||||
|    conformable(Umu._grid,GaugeGrid); |    conformable(Umu._grid,GaugeGrid); | ||||||
|     |     | ||||||
| @@ -429,7 +419,6 @@ namespace Grid { | |||||||
|     |     | ||||||
|    Lattice<iScalar<vInteger> > coor(GaugeGrid); |    Lattice<iScalar<vInteger> > coor(GaugeGrid); | ||||||
| 	 | 	 | ||||||
| 	 |  | ||||||
|    for(int mu=0;mu<Nd;mu++){ |    for(int mu=0;mu<Nd;mu++){ | ||||||
| 	   | 	   | ||||||
|      LatticeCoordinate(coor,mu); |      LatticeCoordinate(coor,mu); | ||||||
| @@ -443,7 +432,6 @@ namespace Grid { | |||||||
|        Uconj = where(coor==neglink,-Uconj,Uconj); |        Uconj = where(coor==neglink,-Uconj,Uconj); | ||||||
|      } |      } | ||||||
| 	   | 	   | ||||||
| 	   |  | ||||||
| PARALLEL_FOR_LOOP | PARALLEL_FOR_LOOP | ||||||
|      for(auto ss=U.begin();ss<U.end();ss++){ |      for(auto ss=U.begin();ss<U.end();ss++){ | ||||||
|        Uds[ss](0)(mu) = U[ss](); |        Uds[ss](0)(mu) = U[ss](); | ||||||
| @@ -477,8 +465,8 @@ namespace Grid { | |||||||
|  } |  } | ||||||
|        |        | ||||||
|        |        | ||||||
|       inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, |  inline void InsertForce4D(GaugeField &mat, FermionField &Btilde, FermionField &A, int mu) { | ||||||
| 				FermionField &A, int mu) { |  | ||||||
|    // DhopDir provides U or Uconj depending on coor/flavour. |    // DhopDir provides U or Uconj depending on coor/flavour. | ||||||
|    GaugeLinkField link(mat._grid); |    GaugeLinkField link(mat._grid); | ||||||
|    // use lorentz for flavour as hack. |    // use lorentz for flavour as hack. | ||||||
| @@ -491,8 +479,8 @@ namespace Grid { | |||||||
|    return; |    return; | ||||||
|  } |  } | ||||||
|        |        | ||||||
|       inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, |  inline void InsertForce5D(GaugeField &mat, FermionField &Btilde, FermionField Ã, int mu) { | ||||||
| 				FermionField Ã, int mu) { |  | ||||||
|    int Ls = Btilde._grid->_fdimensions[0]; |    int Ls = Btilde._grid->_fdimensions[0]; | ||||||
| 	 | 	 | ||||||
|    GaugeLinkField tmp(mat._grid); |    GaugeLinkField tmp(mat._grid); | ||||||
| @@ -508,13 +496,13 @@ namespace Grid { | |||||||
|    PokeIndex<LorentzIndex>(mat, tmp, mu); |    PokeIndex<LorentzIndex>(mat, tmp, mu); | ||||||
|    return; |    return; | ||||||
|  } |  } | ||||||
|  |  | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  typedef WilsonImpl<vComplex,  FundamentalRepresentation > WilsonImplR;   // Real.. whichever prec |  typedef WilsonImpl<vComplex,  FundamentalRepresentation > WilsonImplR;   // Real.. whichever prec | ||||||
|  typedef WilsonImpl<vComplexF, FundamentalRepresentation > WilsonImplF;  // Float |  typedef WilsonImpl<vComplexF, FundamentalRepresentation > WilsonImplF;  // Float | ||||||
|  typedef WilsonImpl<vComplexD, FundamentalRepresentation > WilsonImplD;  // Double |  typedef WilsonImpl<vComplexD, FundamentalRepresentation > WilsonImplD;  // Double | ||||||
|  |  | ||||||
|  |  | ||||||
|  typedef WilsonImpl<vComplex,  FundamentalRepresentation, ComplexD > ZWilsonImplR; // Real.. whichever prec |  typedef WilsonImpl<vComplex,  FundamentalRepresentation, ComplexD > ZWilsonImplR; // Real.. whichever prec | ||||||
|  typedef WilsonImpl<vComplexF, FundamentalRepresentation, ComplexD > ZWilsonImplF; // Float |  typedef WilsonImpl<vComplexF, FundamentalRepresentation, ComplexD > ZWilsonImplF; // Float | ||||||
|  typedef WilsonImpl<vComplexD, FundamentalRepresentation, ComplexD > ZWilsonImplD; // Double |  typedef WilsonImpl<vComplexD, FundamentalRepresentation, ComplexD > ZWilsonImplD; // Double | ||||||
| @@ -538,6 +526,7 @@ namespace Grid { | |||||||
|  typedef GparityWilsonImpl<vComplex , Nc> GparityWilsonImplR;  // Real.. whichever prec |  typedef GparityWilsonImpl<vComplex , Nc> GparityWilsonImplR;  // Real.. whichever prec | ||||||
|  typedef GparityWilsonImpl<vComplexF, Nc> GparityWilsonImplF;  // Float |  typedef GparityWilsonImpl<vComplexF, Nc> GparityWilsonImplF;  // Float | ||||||
|  typedef GparityWilsonImpl<vComplexD, Nc> GparityWilsonImplD;  // Double |  typedef GparityWilsonImpl<vComplexD, Nc> GparityWilsonImplD;  // Double | ||||||
| } |  | ||||||
| } | }} | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
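These Impl typedefs are the policy classes that fix the SIMD type, gauge representation and coefficient type; a fermion action is then a template instantiated on one of them. A hedged sketch of the pattern (the actual action typedefs and constructor signatures live in the fermion action headers, not in this diff):

  // Illustrative only: the action template is instantiated on an Impl policy.
  // WilsonFermionR matches Grid's own naming; GparityWilsonFermionR is shown
  // here as the analogous pairing and may differ from the actual typedef.
  typedef WilsonFermion<WilsonImplR>        WilsonFermionR;
  typedef WilsonFermion<GparityWilsonImplR> GparityWilsonFermionR;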
|   | |||||||
| @@ -166,7 +166,7 @@ void WilsonFermion<Impl>::DerivInternal(StencilImpl &st, DoubledGaugeField &U, | |||||||
|     //////////////////////// |     //////////////////////// | ||||||
|     PARALLEL_FOR_LOOP |     PARALLEL_FOR_LOOP | ||||||
|     for (int sss = 0; sss < B._grid->oSites(); sss++) { |     for (int sss = 0; sss < B._grid->oSites(); sss++) { | ||||||
|       Kernels::DiracOptDhopDir(st, U, st.comm_buf, sss, sss, B, Btilde, mu, |       Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sss, sss, B, Btilde, mu, | ||||||
|                                gamma); |                                gamma); | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -277,7 +277,7 @@ void WilsonFermion<Impl>::DhopDirDisp(const FermionField &in, FermionField &out, | |||||||
|  |  | ||||||
|   PARALLEL_FOR_LOOP |   PARALLEL_FOR_LOOP | ||||||
|   for (int sss = 0; sss < in._grid->oSites(); sss++) { |   for (int sss = 0; sss < in._grid->oSites(); sss++) { | ||||||
|     Kernels::DiracOptDhopDir(Stencil, Umu, Stencil.comm_buf, sss, sss, in, out, |     Kernels::DiracOptDhopDir(Stencil, Umu, Stencil.CommBuf(), sss, sss, in, out, | ||||||
|                              dirdisp, gamma); |                              dirdisp, gamma); | ||||||
|   } |   } | ||||||
| }; | }; | ||||||
| @@ -295,13 +295,13 @@ void WilsonFermion<Impl>::DhopInternal(StencilImpl &st, LebesgueOrder &lo, | |||||||
|   if (dag == DaggerYes) { |   if (dag == DaggerYes) { | ||||||
|     PARALLEL_FOR_LOOP |     PARALLEL_FOR_LOOP | ||||||
|     for (int sss = 0; sss < in._grid->oSites(); sss++) { |     for (int sss = 0; sss < in._grid->oSites(); sss++) { | ||||||
|       Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sss, sss, 1, 1, in, |       Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, | ||||||
|                                    out); |                                    out); | ||||||
|     } |     } | ||||||
|   } else { |   } else { | ||||||
|     PARALLEL_FOR_LOOP |     PARALLEL_FOR_LOOP | ||||||
|     for (int sss = 0; sss < in._grid->oSites(); sss++) { |     for (int sss = 0; sss < in._grid->oSites(); sss++) { | ||||||
|       Kernels::DiracOptDhopSite(st, lo, U, st.comm_buf, sss, sss, 1, 1, in, |       Kernels::DiracOptDhopSite(st, lo, U, st.CommBuf(), sss, sss, 1, 1, in, | ||||||
|                                 out); |                                 out); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|   | |||||||
| @@ -185,18 +185,14 @@ void WilsonFermion5D<Impl>::Report(void) | |||||||
|   if ( DhopCalls > 0 ) { |   if ( DhopCalls > 0 ) { | ||||||
|     std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; |     std::cout << GridLogMessage << "#### Dhop calls report " << std::endl; | ||||||
|     std::cout << GridLogMessage << "WilsonFermion5D Number of Dhop Calls     : " << DhopCalls   << std::endl; |     std::cout << GridLogMessage << "WilsonFermion5D Number of Dhop Calls     : " << DhopCalls   << std::endl; | ||||||
|     std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " << DhopCommTime |     std::cout << GridLogMessage << "WilsonFermion5D Total Communication time : " << DhopCommTime<< " us" << std::endl; | ||||||
|               << " us" << std::endl; |     std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls           : " << DhopCommTime / DhopCalls << " us" << std::endl; | ||||||
|     std::cout << GridLogMessage << "WilsonFermion5D CommTime/Calls           : " |     std::cout << GridLogMessage << "WilsonFermion5D Total Compute time       : " << DhopComputeTime << " us" << std::endl; | ||||||
|               << DhopCommTime / DhopCalls << " us" << std::endl; |     std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls        : " << DhopComputeTime / DhopCalls << " us" << std::endl; | ||||||
|     std::cout << GridLogMessage << "WilsonFermion5D Total Compute time       : " |  | ||||||
|               << DhopComputeTime << " us" << std::endl; |  | ||||||
|     std::cout << GridLogMessage << "WilsonFermion5D ComputeTime/Calls        : " |  | ||||||
|               << DhopComputeTime / DhopCalls << " us" << std::endl; |  | ||||||
|  |  | ||||||
|     RealD mflops = 1344*volume*DhopCalls/DhopComputeTime; |     RealD mflops = 1344*volume*DhopCalls/DhopComputeTime/2; // 2 for red black counting | ||||||
|     std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl; |     std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl; | ||||||
|     std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NP << std::endl; |     std::cout << GridLogMessage << "Average mflops/s per call per rank       : " << mflops/NP << std::endl; | ||||||
|  |  | ||||||
|    } |    } | ||||||
|  |  | ||||||
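The /2 added to the mflops estimate above pairs with the call counting moved further down in this file (DhopCalls++ in DhopOE and DhopEO, DhopCalls+=2 in Dhop): a red-black half-checkerboard application touches only half the sites, so each counted call does half the work while a full Dhop counts as two such calls. A hedged restatement of the arithmetic, with volume the site count used by Report() and times in microseconds as in the surrounding code; flopsPerCall is a name introduced here:

  // Illustrative arithmetic only: 1344 flops per site for one full Dhop, and a
  // checkerboarded (OE/EO) call touches half the sites.
  RealD flopsPerCall = 1344.0 * volume / 2.0;
  RealD mflops       = flopsPerCall * DhopCalls / DhopComputeTime;  // time in us -> MFlop/s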
| @@ -210,12 +206,9 @@ void WilsonFermion5D<Impl>::Report(void) | |||||||
|     std::cout << GridLogMessage << "WilsonFermion5D Total Dhop Compute time  : " <<DerivDhopComputeTime <<" us"<<std::endl; |     std::cout << GridLogMessage << "WilsonFermion5D Total Dhop Compute time  : " <<DerivDhopComputeTime <<" us"<<std::endl; | ||||||
|     std::cout << GridLogMessage << "WilsonFermion5D Dhop ComputeTime/Calls   : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl; |     std::cout << GridLogMessage << "WilsonFermion5D Dhop ComputeTime/Calls   : " <<DerivDhopComputeTime/DerivCalls<<" us" <<std::endl; | ||||||
|      |      | ||||||
|  |  | ||||||
|  |  | ||||||
|     RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime; |     RealD mflops = 144*volume*DerivCalls/DerivDhopComputeTime; | ||||||
|     std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl; |     std::cout << GridLogMessage << "Average mflops/s per call                : " << mflops << std::endl; | ||||||
|     std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NP << std::endl; |     std::cout << GridLogMessage << "Average mflops/s per call per node       : " << mflops/NP << std::endl; | ||||||
|  |  | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   if (DerivCalls > 0 || DhopCalls > 0){ |   if (DerivCalls > 0 || DhopCalls > 0){ | ||||||
| @@ -275,7 +268,7 @@ PARALLEL_FOR_LOOP | |||||||
|     for(int s=0;s<Ls;s++){ |     for(int s=0;s<Ls;s++){ | ||||||
|       int sU=ss; |       int sU=ss; | ||||||
|       int sF = s+Ls*sU;  |       int sF = s+Ls*sU;  | ||||||
|       Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.comm_buf,sF,sU,in,out,dirdisp,gamma); |       Kernels::DiracOptDhopDir(Stencil,Umu,Stencil.CommBuf(),sF,sU,in,out,dirdisp,gamma); | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
| }; | }; | ||||||
| @@ -327,8 +320,7 @@ void WilsonFermion5D<Impl>::DerivInternal(StencilImpl & st, | |||||||
|         assert(sF < B._grid->oSites()); |         assert(sF < B._grid->oSites()); | ||||||
|         assert(sU < U._grid->oSites()); |         assert(sU < U._grid->oSites()); | ||||||
|  |  | ||||||
|         Kernels::DiracOptDhopDir(st, U, st.comm_buf, sF, sU, B, Btilde, mu, |         Kernels::DiracOptDhopDir(st, U, st.CommBuf(), sF, sU, B, Btilde, mu, gamma); | ||||||
|                                  gamma); |  | ||||||
|  |  | ||||||
|         //////////////////////////// |         //////////////////////////// | ||||||
|         // spin trace outer product |         // spin trace outer product | ||||||
| @@ -396,7 +388,6 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo, | |||||||
| 					 DoubledGaugeField & U, | 					 DoubledGaugeField & U, | ||||||
| 					 const FermionField &in, FermionField &out,int dag) | 					 const FermionField &in, FermionField &out,int dag) | ||||||
| { | { | ||||||
|   DhopCalls++; |  | ||||||
|   //  assert((dag==DaggerNo) ||(dag==DaggerYes)); |   //  assert((dag==DaggerNo) ||(dag==DaggerYes)); | ||||||
|   Compressor compressor(dag); |   Compressor compressor(dag); | ||||||
|  |  | ||||||
| @@ -413,8 +404,7 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo, | |||||||
|     for (int ss = 0; ss < U._grid->oSites(); ss++) { |     for (int ss = 0; ss < U._grid->oSites(); ss++) { | ||||||
|       int sU = ss; |       int sU = ss; | ||||||
|       int sF = LLs * sU; |       int sF = LLs * sU; | ||||||
|       Kernels::DiracOptDhopSiteDag(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in, |       Kernels::DiracOptDhopSiteDag(st, lo, U, st.CommBuf(), sF, sU, LLs, 1, in, out); | ||||||
|                                    out); |  | ||||||
|     } |     } | ||||||
| #ifdef AVX512 | #ifdef AVX512 | ||||||
|   } else if (stat.is_init() ) { |   } else if (stat.is_init() ) { | ||||||
| @@ -428,11 +418,10 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo, | |||||||
|     int mythread = omp_get_thread_num(); |     int mythread = omp_get_thread_num(); | ||||||
|     stat.enter(mythread); |     stat.enter(mythread); | ||||||
| #pragma omp for nowait | #pragma omp for nowait | ||||||
|    for(int ss=0;ss<U._grid->oSites();ss++) |     for(int ss=0;ss<U._grid->oSites();ss++) { | ||||||
|     { |  | ||||||
|       int sU=ss; |       int sU=ss; | ||||||
|       int sF=LLs*sU; |       int sF=LLs*sU; | ||||||
|        Kernels::DiracOptDhopSite(st,lo,U,st.comm_buf,sF,sU,LLs,1,in,out); |       Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out); | ||||||
|     } |     } | ||||||
|     stat.exit(mythread); |     stat.exit(mythread); | ||||||
|     } |     } | ||||||
| @@ -443,8 +432,7 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo, | |||||||
|     for (int ss = 0; ss < U._grid->oSites(); ss++) { |     for (int ss = 0; ss < U._grid->oSites(); ss++) { | ||||||
|       int sU = ss; |       int sU = ss; | ||||||
|       int sF = LLs * sU; |       int sF = LLs * sU; | ||||||
|       Kernels::DiracOptDhopSite(st, lo, U, st.comm_buf, sF, sU, LLs, 1, in, |       Kernels::DiracOptDhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out); | ||||||
|                                 out); |  | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|   DhopComputeTime+=usecond(); |   DhopComputeTime+=usecond(); | ||||||
| @@ -454,6 +442,7 @@ void WilsonFermion5D<Impl>::DhopInternal(StencilImpl & st, LebesgueOrder &lo, | |||||||
| template<class Impl> | template<class Impl> | ||||||
| void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag) | void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int dag) | ||||||
| { | { | ||||||
|  |   DhopCalls++; | ||||||
|   conformable(in._grid,FermionRedBlackGrid());    // verifies half grid |   conformable(in._grid,FermionRedBlackGrid());    // verifies half grid | ||||||
|   conformable(in._grid,out._grid); // drops the cb check |   conformable(in._grid,out._grid); // drops the cb check | ||||||
|  |  | ||||||
| @@ -465,6 +454,7 @@ void WilsonFermion5D<Impl>::DhopOE(const FermionField &in, FermionField &out,int | |||||||
| template<class Impl> | template<class Impl> | ||||||
| void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) | void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int dag) | ||||||
| { | { | ||||||
|  |   DhopCalls++; | ||||||
|   conformable(in._grid,FermionRedBlackGrid());    // verifies half grid |   conformable(in._grid,FermionRedBlackGrid());    // verifies half grid | ||||||
|   conformable(in._grid,out._grid); // drops the cb check |   conformable(in._grid,out._grid); // drops the cb check | ||||||
|  |  | ||||||
| @@ -476,6 +466,7 @@ void WilsonFermion5D<Impl>::DhopEO(const FermionField &in, FermionField &out,int | |||||||
| template<class Impl> | template<class Impl> | ||||||
| void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag) | void WilsonFermion5D<Impl>::Dhop(const FermionField &in, FermionField &out,int dag) | ||||||
| { | { | ||||||
|  |   DhopCalls+=2; | ||||||
|   conformable(in._grid,FermionGrid()); // verifies full grid |   conformable(in._grid,FermionGrid()); // verifies full grid | ||||||
|   conformable(in._grid,out._grid); |   conformable(in._grid,out._grid); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -34,7 +34,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
| #include <Grid/Stat.h> | #include <Grid/Stat.h> | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
| namespace QCD { | namespace QCD { | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////////////////// | ||||||
| @@ -182,7 +181,7 @@ namespace Grid { | |||||||
|     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf; |     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  comm_buf; | ||||||
|      |      | ||||||
|   }; |   }; | ||||||
|   } |  | ||||||
| } | }} | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
|   | |||||||
| @@ -43,9 +43,8 @@ WilsonKernels<Impl>::WilsonKernels(const ImplParams &p) : Base(p){}; | |||||||
| //////////////////////////////////////////// | //////////////////////////////////////////// | ||||||
|  |  | ||||||
| template <class Impl> | template <class Impl> | ||||||
| void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag( | void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, | ||||||
|     StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, | 						     SiteHalfSpinor *buf, int sF, | ||||||
|     std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, int sF, |  | ||||||
| 						     int sU, const FermionField &in, FermionField &out) { | 						     int sU, const FermionField &in, FermionField &out) { | ||||||
|   SiteHalfSpinor tmp; |   SiteHalfSpinor tmp; | ||||||
|   SiteHalfSpinor chi; |   SiteHalfSpinor chi; | ||||||
| @@ -220,9 +219,8 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSiteDag( | |||||||
|  |  | ||||||
| // Need controls to do interior, exterior, or both | // Need controls to do interior, exterior, or both | ||||||
| template <class Impl> | template <class Impl> | ||||||
| void WilsonKernels<Impl>::DiracOptGenericDhopSite( | void WilsonKernels<Impl>::DiracOptGenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, | ||||||
|     StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, | 						  SiteHalfSpinor *buf, int sF, | ||||||
|     std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, int sF, |  | ||||||
| 						  int sU, const FermionField &in, FermionField &out) { | 						  int sU, const FermionField &in, FermionField &out) { | ||||||
|   SiteHalfSpinor tmp; |   SiteHalfSpinor tmp; | ||||||
|   SiteHalfSpinor chi; |   SiteHalfSpinor chi; | ||||||
| @@ -396,10 +394,9 @@ void WilsonKernels<Impl>::DiracOptGenericDhopSite( | |||||||
| }; | }; | ||||||
|  |  | ||||||
| template <class Impl> | template <class Impl> | ||||||
| void WilsonKernels<Impl>::DiracOptDhopDir( | void WilsonKernels<Impl>::DiracOptDhopDir( StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor *buf, int sF, | ||||||
|     StencilImpl &st, DoubledGaugeField &U, |  | ||||||
|     std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, int sF, |  | ||||||
| 					   int sU, const FermionField &in, FermionField &out, int dir, int gamma) { | 					   int sU, const FermionField &in, FermionField &out, int dir, int gamma) { | ||||||
|  |  | ||||||
|   SiteHalfSpinor tmp; |   SiteHalfSpinor tmp; | ||||||
|   SiteHalfSpinor chi; |   SiteHalfSpinor chi; | ||||||
|   SiteSpinor result; |   SiteSpinor result; | ||||||
|   | |||||||
| @@ -32,7 +32,6 @@ directory | |||||||
| #define GRID_QCD_DHOP_H | #define GRID_QCD_DHOP_H | ||||||
|  |  | ||||||
| namespace Grid { | namespace Grid { | ||||||
|  |  | ||||||
| namespace QCD { | namespace QCD { | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////////////////////////////////////////////////////////////////////// |   //////////////////////////////////////////////////////////////////////////////////////////////////////////////// | ||||||
| @@ -56,16 +55,11 @@ namespace Grid { | |||||||
|     |     | ||||||
|   template <bool EnableBool = true> |   template <bool EnableBool = true> | ||||||
|   typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type |   typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type | ||||||
| 	DiracOptDhopSite( |   DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, | ||||||
| 			 StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, | 		   int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { | ||||||
| 			 std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, |  | ||||||
| 			 int sF, int sU, int Ls, int Ns, const FermionField &in, |  | ||||||
| 			 FermionField &out) { |  | ||||||
| #ifdef AVX512 | #ifdef AVX512 | ||||||
|     if (AsmOpt) { |     if (AsmOpt) { | ||||||
| 	  WilsonKernels<Impl>::DiracOptAsmDhopSite(st, lo, U, buf, sF, sU, Ls, Ns, |       WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out); | ||||||
| 						   in, out); |  | ||||||
|  |  | ||||||
|     } else { |     } else { | ||||||
| #else | #else | ||||||
|     { |     { | ||||||
| @@ -73,11 +67,9 @@ namespace Grid { | |||||||
|       for (int site = 0; site < Ns; site++) { |       for (int site = 0; site < Ns; site++) { | ||||||
| 	for (int s = 0; s < Ls; s++) { | 	for (int s = 0; s < Ls; s++) { | ||||||
| 	  if (HandOpt) | 	  if (HandOpt) | ||||||
| 		  WilsonKernels<Impl>::DiracOptHandDhopSite(st, lo, U, buf, sF, sU, | 	    WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out); | ||||||
| 							    in, out); |  | ||||||
| 	  else | 	  else | ||||||
| 		  WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, | 	    WilsonKernels<Impl>::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out); | ||||||
| 							       in, out); |  | ||||||
| 	  sF++; | 	  sF++; | ||||||
| 	} | 	} | ||||||
| 	sU++; | 	sU++; | ||||||
| @@ -87,15 +79,12 @@ namespace Grid { | |||||||
|       |       | ||||||
|   template <bool EnableBool = true> |   template <bool EnableBool = true> | ||||||
|   typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type |   typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type | ||||||
| 	  DiracOptDhopSite( |   DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, | ||||||
| 			   StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, | 		   int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { | ||||||
| 			   std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, |       | ||||||
| 			   int sF, int sU, int Ls, int Ns, const FermionField &in, |  | ||||||
| 			   FermionField &out) { |  | ||||||
|     for (int site = 0; site < Ns; site++) { |     for (int site = 0; site < Ns; site++) { | ||||||
|       for (int s = 0; s < Ls; s++) { |       for (int s = 0; s < Ls; s++) { | ||||||
| 	      WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in, | 	WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in, out); | ||||||
| 							   out); |  | ||||||
| 	sF++; | 	sF++; | ||||||
|       } |       } | ||||||
|       sU++; |       sU++; | ||||||
| @@ -103,17 +92,12 @@ namespace Grid { | |||||||
|   } |   } | ||||||
|       |       | ||||||
|   template <bool EnableBool = true> |   template <bool EnableBool = true> | ||||||
| 	  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool, |   typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,void>::type | ||||||
| 				  void>::type |   DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, | ||||||
| 	  DiracOptDhopSiteDag( | 		      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { | ||||||
| 			      StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, |  | ||||||
| 			      std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, |  | ||||||
| 			      int sF, int sU, int Ls, int Ns, const FermionField &in, |  | ||||||
| 			      FermionField &out) { |  | ||||||
| #ifdef AVX512 | #ifdef AVX512 | ||||||
|     if (AsmOpt) { |     if (AsmOpt) { | ||||||
| 				      WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st, lo, U, buf, sF, sU, Ls, |       WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out); | ||||||
| 										  Ns, in, out); |  | ||||||
|     } else { |     } else { | ||||||
| #else | #else | ||||||
|     { |     { | ||||||
| @@ -121,11 +105,9 @@ namespace Grid { | |||||||
|       for (int site = 0; site < Ns; site++) { |       for (int site = 0; site < Ns; site++) { | ||||||
| 	for (int s = 0; s < Ls; s++) { | 	for (int s = 0; s < Ls; s++) { | ||||||
| 	  if (HandOpt) | 	  if (HandOpt) | ||||||
| 					      WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st, lo, U, buf, sF, sU, | 	    WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out); | ||||||
| 											   in, out); |  | ||||||
| 	  else | 	  else | ||||||
| 					      WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF, | 	    WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out); | ||||||
| 											      sU, in, out); |  | ||||||
| 	  sF++; | 	  sF++; | ||||||
| 	} | 	} | ||||||
| 	sU++; | 	sU++; | ||||||
| @@ -134,73 +116,48 @@ namespace Grid { | |||||||
|   } |   } | ||||||
|  |  | ||||||
|   template <bool EnableBool = true> |   template <bool EnableBool = true> | ||||||
| 				      typename std::enable_if< |   typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool,void>::type | ||||||
| 				      (Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, |   DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U,SiteHalfSpinor * buf, | ||||||
| 				      void>::type | 		      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) { | ||||||
| 				      DiracOptDhopSiteDag( |  | ||||||
| 							  StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, |  | ||||||
| 							  std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, |  | ||||||
| 							  int sF, int sU, int Ls, int Ns, const FermionField &in, |  | ||||||
| 							  FermionField &out) { |  | ||||||
|     for (int site = 0; site < Ns; site++) { |     for (int site = 0; site < Ns; site++) { | ||||||
|       for (int s = 0; s < Ls; s++) { |       for (int s = 0; s < Ls; s++) { | ||||||
| 					    WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st, lo, U, buf, sF, sU, | 	WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out); | ||||||
| 											    in, out); |  | ||||||
| 	sF++; | 	sF++; | ||||||
|       } |       } | ||||||
|       sU++; |       sU++; | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
| 				    void DiracOptDhopDir( |   void DiracOptDhopDir(StencilImpl &st, DoubledGaugeField &U,SiteHalfSpinor * buf, | ||||||
| 							 StencilImpl &st, DoubledGaugeField &U, | 		       int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, int gamma); | ||||||
| 							 std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, |  | ||||||
| 							 int sF, int sU, const FermionField &in, FermionField &out, int dirdisp, |  | ||||||
| 							 int gamma); |  | ||||||
|        |        | ||||||
| private: | private: | ||||||
|      // Specialised variants |      // Specialised variants | ||||||
| 				    void DiracOptGenericDhopSite( |   void DiracOptGenericDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, | ||||||
| 								 StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, |  | ||||||
| 								 std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, |  | ||||||
| 			       int sF, int sU, const FermionField &in, FermionField &out); | 			       int sF, int sU, const FermionField &in, FermionField &out); | ||||||
|        |        | ||||||
| 				    void DiracOptGenericDhopSiteDag( |   void DiracOptGenericDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, | ||||||
| 								    StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, |  | ||||||
| 								    std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, |  | ||||||
| 				  int sF, int sU, const FermionField &in, FermionField &out); | 				  int sF, int sU, const FermionField &in, FermionField &out); | ||||||
|  |  | ||||||
| 				    void DiracOptAsmDhopSite( |   void DiracOptAsmDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, | ||||||
| 							     StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, | 			   int sF, int sU, int Ls, int Ns, const FermionField &in,FermionField &out); | ||||||
| 							     std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, |  | ||||||
| 							     int sF, int sU, int Ls, int Ns, const FermionField &in, |  | ||||||
| 							     FermionField &out); |  | ||||||
|  |  | ||||||
| 				    void DiracOptAsmDhopSiteDag( |   void DiracOptAsmDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, | ||||||
| 								StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, | 			      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out); | ||||||
| 								std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, |  | ||||||
| 								int sF, int sU, int Ls, int Ns, const FermionField &in, |  | ||||||
| 								FermionField &out); |  | ||||||
|  |  | ||||||
| 				    void DiracOptHandDhopSite( |   void DiracOptHandDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, | ||||||
| 							      StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, |  | ||||||
| 							      std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, |  | ||||||
| 			    int sF, int sU, const FermionField &in, FermionField &out); | 			    int sF, int sU, const FermionField &in, FermionField &out); | ||||||
|  |  | ||||||
| 				    void DiracOptHandDhopSiteDag( |   void DiracOptHandDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf, | ||||||
| 								 StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, |  | ||||||
| 								 std::vector<SiteHalfSpinor, alignedAllocator<SiteHalfSpinor> > &buf, |  | ||||||
| 			       int sF, int sU, const FermionField &in, FermionField &out); | 			       int sF, int sU, const FermionField &in, FermionField &out); | ||||||
|        |        | ||||||
| public: | public: | ||||||
|  |  | ||||||
|   WilsonKernels(const ImplParams &p = ImplParams()); |   WilsonKernels(const ImplParams &p = ImplParams()); | ||||||
|  |  | ||||||
| }; | }; | ||||||
|      |      | ||||||
|       } | }} | ||||||
|     } |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #endif | #endif | ||||||
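Across these kernel hunks the half-spinor communication buffer is now passed as a bare SiteHalfSpinor* and obtained via st.CommBuf() instead of reaching into the st.comm_buf vector directly, which keeps the kernel signatures free of the allocator-laden std::vector type. The accessor itself is not shown in this diff; a hedged sketch of what the stencil class is assumed to provide, given the comm_buf vector member visible in one of the hunks above:

  // Hedged sketch of the assumed accessor (not part of this diff): hand the
  // kernels a raw pointer into the aligned communication buffer.
  SiteHalfSpinor *CommBuf(void) { return comm_buf.data(); }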
|   | |||||||
| @@ -38,26 +38,22 @@ namespace Grid { | |||||||
| /////////////////////////////////////////////////////////// | /////////////////////////////////////////////////////////// | ||||||
| // Default to no assembler implementation | // Default to no assembler implementation | ||||||
| /////////////////////////////////////////////////////////// | /////////////////////////////////////////////////////////// | ||||||
|     template<class Impl> | template<class Impl> void  | ||||||
|       void WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, | WilsonKernels<Impl >::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, | ||||||
|                              std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf, |  | ||||||
|                              int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) |  | ||||||
|     { |  | ||||||
|       assert(0); |  | ||||||
|     } |  | ||||||
|     template<class Impl> |  | ||||||
|       void WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, |  | ||||||
|                                 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf, |  | ||||||
| 					  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) | 					  int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) | ||||||
| { | { | ||||||
|   assert(0); |   assert(0); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | template<class Impl> void  | ||||||
|  | WilsonKernels<Impl >::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, | ||||||
|  | 					     int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) | ||||||
|  | { | ||||||
|  |   assert(0); | ||||||
|  | } | ||||||
|  |  | ||||||
| #if defined(AVX512)  | #if defined(AVX512)  | ||||||
|      |      | ||||||
|      |  | ||||||
|     /////////////////////////////////////////////////////////// |     /////////////////////////////////////////////////////////// | ||||||
|     // If we are AVX512 specialise the single precision routine |     // If we are AVX512 specialise the single precision routine | ||||||
|     /////////////////////////////////////////////////////////// |     /////////////////////////////////////////////////////////// | ||||||
| @@ -84,16 +80,14 @@ namespace Grid { | |||||||
| #define FX(A) WILSONASM_ ##A | #define FX(A) WILSONASM_ ##A | ||||||
|    |    | ||||||
| #undef KERNEL_DAG | #undef KERNEL_DAG | ||||||
|     template<> | template<> void  | ||||||
|     void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, | WilsonKernels<WilsonImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, | ||||||
| 							 std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf, |  | ||||||
| 						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) | 						int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) | ||||||
| #include <qcd/action/fermion/WilsonKernelsAsmBody.h> | #include <qcd/action/fermion/WilsonKernelsAsmBody.h> | ||||||
|        |        | ||||||
| #define KERNEL_DAG | #define KERNEL_DAG | ||||||
|     template<> | template<> void  | ||||||
|     void WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, | WilsonKernels<WilsonImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, | ||||||
| 							    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf, |  | ||||||
| 						   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) | 						   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) | ||||||
| #include <qcd/action/fermion/WilsonKernelsAsmBody.h> | #include <qcd/action/fermion/WilsonKernelsAsmBody.h> | ||||||
| 				     | 				     | ||||||
| @@ -109,31 +103,26 @@ namespace Grid { | |||||||
| #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) | #define MULT_2SPIN(ptr,pf) MULT_ADDSUB_2SPIN_LS(ptr,pf) | ||||||
| 				     | 				     | ||||||
| #undef KERNEL_DAG | #undef KERNEL_DAG | ||||||
|     template<> | template<> void  | ||||||
|     void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, | WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf, | ||||||
| 								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf, |  | ||||||
| 							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) | 							 int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) | ||||||
| #include <qcd/action/fermion/WilsonKernelsAsmBody.h> | #include <qcd/action/fermion/WilsonKernelsAsmBody.h> | ||||||
| 				     | 				     | ||||||
| #define KERNEL_DAG | #define KERNEL_DAG | ||||||
|     template<> | template<> void  | ||||||
|     void WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, | WilsonKernels<DomainWallVec5dImplF>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,SiteHalfSpinor *buf, | ||||||
| 								     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf, |  | ||||||
| 							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) | 							    int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out) | ||||||
| #include <qcd/action/fermion/WilsonKernelsAsmBody.h> | #include <qcd/action/fermion/WilsonKernelsAsmBody.h> | ||||||
| 				     | 				     | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  |  | ||||||
| #define INSTANTIATE_ASM(A)\ | #define INSTANTIATE_ASM(A)\ | ||||||
| template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\ | template void WilsonKernels<A>::DiracOptAsmDhopSite(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\ | ||||||
|                                    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,\ |  | ||||||
|                                   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\ |                                   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\ | ||||||
| template void WilsonKernels<A>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U,\ |  \ | ||||||
|                                    std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,\ | template void WilsonKernels<A>::DiracOptAsmDhopSiteDag(StencilImpl &st,LebesgueOrder & lo,DoubledGaugeField &U, SiteHalfSpinor *buf,\ | ||||||
|                                   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\ |                                   int ss,int ssU,int Ls,int Ns,const FermionField &in, FermionField &out);\ | ||||||
|  |  | ||||||
|  |  | ||||||
| INSTANTIATE_ASM(WilsonImplF); | INSTANTIATE_ASM(WilsonImplF); | ||||||
| INSTANTIATE_ASM(WilsonImplD); | INSTANTIATE_ASM(WilsonImplD); | ||||||
| INSTANTIATE_ASM(ZWilsonImplF); | INSTANTIATE_ASM(ZWilsonImplF); | ||||||
| @@ -144,6 +133,6 @@ INSTANTIATE_ASM(DomainWallVec5dImplF); | |||||||
| INSTANTIATE_ASM(DomainWallVec5dImplD); | INSTANTIATE_ASM(DomainWallVec5dImplD); | ||||||
| INSTANTIATE_ASM(ZDomainWallVec5dImplF); | INSTANTIATE_ASM(ZDomainWallVec5dImplF); | ||||||
| INSTANTIATE_ASM(ZDomainWallVec5dImplD); | INSTANTIATE_ASM(ZDomainWallVec5dImplD); | ||||||
|   } |  | ||||||
| } | }} | ||||||
|  |  | ||||||
|   | |||||||
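Taken together, the hunks above replace the aligned std::vector reference in every kernel signature with a raw SiteHalfSpinor pointer. The standalone sketch below (made-up Site/kernel names, not Grid's actual classes) illustrates the shape of that change: the caller keeps ownership of the buffer and the kernel no longer commits to a particular container type.

// Sketch only: illustrative names, not Grid's API.
#include <cstdio>
#include <vector>

struct Site { double v[4]; };

// Old-style interface: the container type is baked into the kernel signature.
void kernel_old(std::vector<Site> &buf, int ss) { std::printf("%g\n", buf[ss].v[0]); }

// New-style interface: only the element type matters; ownership stays with the caller.
void kernel_new(Site *buf, int ss) { std::printf("%g\n", buf[ss].v[0]); }

int main() {
  std::vector<Site> comm(8, Site{{1, 2, 3, 4}});
  kernel_old(comm, 0);
  kernel_new(comm.data(), 0);   // same storage, looser coupling
  return 0;
}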
| @@ -311,9 +311,8 @@ namespace Grid { | |||||||
| namespace QCD { | namespace QCD { | ||||||
|  |  | ||||||
|  |  | ||||||
|   template<class Impl> | template<class Impl> void  | ||||||
|   void WilsonKernels<Impl>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, | WilsonKernels<Impl>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor  *buf, | ||||||
| 					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf, |  | ||||||
| 					  int ss,int sU,const FermionField &in, FermionField &out) | 					  int ss,int sU,const FermionField &in, FermionField &out) | ||||||
| { | { | ||||||
|   typedef typename Simd::scalar_type S; |   typedef typename Simd::scalar_type S; | ||||||
| @@ -555,8 +554,7 @@ namespace QCD { | |||||||
| } | } | ||||||
|  |  | ||||||
| template<class Impl> | template<class Impl> | ||||||
|   void WilsonKernels<Impl>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, | void WilsonKernels<Impl>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, | ||||||
| 					       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf, |  | ||||||
| 						  int ss,int sU,const FermionField &in, FermionField &out) | 						  int ss,int sU,const FermionField &in, FermionField &out) | ||||||
| { | { | ||||||
|   //  std::cout << "Hand op Dhop "<<std::endl; |   //  std::cout << "Hand op Dhop "<<std::endl; | ||||||
| @@ -798,37 +796,34 @@ namespace QCD { | |||||||
|   } |   } | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
|   //////////////////////////////////////////////// |   //////////////////////////////////////////////// | ||||||
|   // Specialise Gparity to simple implementation |   // Specialise Gparity to simple implementation | ||||||
|   //////////////////////////////////////////////// |   //////////////////////////////////////////////// | ||||||
| template<> | template<> void  | ||||||
| void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, | WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, | ||||||
| 							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf, | 							SiteHalfSpinor *buf, | ||||||
| 							int sF,int sU,const FermionField &in, FermionField &out) | 							int sF,int sU,const FermionField &in, FermionField &out) | ||||||
| { | { | ||||||
|   assert(0); |   assert(0); | ||||||
| } | } | ||||||
|  |  | ||||||
| template<> | template<> void  | ||||||
| void WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, | WilsonKernels<GparityWilsonImplF>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, | ||||||
| 								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf, | 							   SiteHalfSpinor *buf, | ||||||
| 							   int sF,int sU,const FermionField &in, FermionField &out) | 							   int sF,int sU,const FermionField &in, FermionField &out) | ||||||
| { | { | ||||||
|   assert(0); |   assert(0); | ||||||
| } | } | ||||||
|  |  | ||||||
| template<> | template<> void  | ||||||
| void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, | WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, | ||||||
| 							     std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf, |  | ||||||
| 							int sF,int sU,const FermionField &in, FermionField &out) | 							int sF,int sU,const FermionField &in, FermionField &out) | ||||||
| { | { | ||||||
|   assert(0); |   assert(0); | ||||||
| } | } | ||||||
|  |  | ||||||
| template<> | template<> void  | ||||||
| void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U, | WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf, | ||||||
| 								std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf, |  | ||||||
| 							   int sF,int sU,const FermionField &in, FermionField &out) | 							   int sF,int sU,const FermionField &in, FermionField &out) | ||||||
| { | { | ||||||
|   assert(0); |   assert(0); | ||||||
| @@ -840,11 +835,9 @@ void WilsonKernels<GparityWilsonImplD>::DiracOptHandDhopSiteDag(StencilImpl &st, | |||||||
| // Need Nc=3 though // | // Need Nc=3 though // | ||||||
|  |  | ||||||
| #define INSTANTIATE_THEM(A) \ | #define INSTANTIATE_THEM(A) \ | ||||||
| template void WilsonKernels<A>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\ | template void WilsonKernels<A>::DiracOptHandDhopSite(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ | ||||||
| 							       std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,\ |  | ||||||
| 						     int ss,int sU,const FermionField &in, FermionField &out); \ | 						     int ss,int sU,const FermionField &in, FermionField &out); \ | ||||||
| template void WilsonKernels<A>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,\ | template void WilsonKernels<A>::DiracOptHandDhopSiteDag(StencilImpl &st,LebesgueOrder &lo,DoubledGaugeField &U,SiteHalfSpinor *buf,\ | ||||||
| 								  std::vector<SiteHalfSpinor,alignedAllocator<SiteHalfSpinor> >  &buf,\ |  | ||||||
| 							int ss,int sU,const FermionField &in, FermionField &out); | 							int ss,int sU,const FermionField &in, FermionField &out); | ||||||
|  |  | ||||||
| INSTANTIATE_THEM(WilsonImplF); | INSTANTIATE_THEM(WilsonImplF); | ||||||
|   | |||||||
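The INSTANTIATE_ASM / INSTANTIATE_THEM macros above rely on explicit template instantiation so that the kernel bodies can live in a .cc file while still being emitted for every implementation policy. A minimal sketch of that pattern, with hypothetical class and policy names:

// Minimal sketch of the explicit-instantiation macro pattern; names are hypothetical.
#include <cstdio>

template <class Impl> struct Kernels {
  static void DhopSite(int ss)    { std::printf("site %d\n", ss); }
  static void DhopSiteDag(int ss) { std::printf("site-dag %d\n", ss); }
};

struct ImplF {};   // stand-ins for the real implementation policies
struct ImplD {};

// One macro stamps out both member functions for a given policy.
#define INSTANTIATE_SKETCH(A)                \
  template void Kernels<A>::DhopSite(int);   \
  template void Kernels<A>::DhopSiteDag(int);

INSTANTIATE_SKETCH(ImplF);
INSTANTIATE_SKETCH(ImplD);

int main() { Kernels<ImplF>::DhopSite(0); return 0; }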
| @@ -42,6 +42,16 @@ Author: paboyle <paboyle@ph.ed.ac.uk> | |||||||
| namespace Grid{ | namespace Grid{ | ||||||
| namespace Optimization { | namespace Optimization { | ||||||
|  |  | ||||||
|  |   union u512f { | ||||||
|  |     __m512 v; | ||||||
|  |     float f[16]; | ||||||
|  |   }; | ||||||
|  |  | ||||||
|  |   union u512d { | ||||||
|  |     __m512d v; | ||||||
|  |     double f[8]; | ||||||
|  |   }; | ||||||
|  |    | ||||||
|   struct Vsplat{ |   struct Vsplat{ | ||||||
|     //Complex float |     //Complex float | ||||||
|     inline __m512 operator()(float a, float b){ |     inline __m512 operator()(float a, float b){ | ||||||
| @@ -361,8 +371,14 @@ namespace Optimization { | |||||||
|   // Some Template specialization |   // Some Template specialization | ||||||
|  |  | ||||||
|   // Hack for CLANG until mm512_reduce_add_ps etc... are implemented in GCC and Clang releases |   // Hack for CLANG until mm512_reduce_add_ps etc... are implemented in GCC and Clang releases | ||||||
| #undef GNU_CLANG_COMPILER  | #ifndef __INTEL_COMPILER
| #ifdef GNU_CLANG_COMPILER | #warning "Slow reduction due to incomplete reduce intrinsics"
|   //Complex float Reduce |   //Complex float Reduce | ||||||
|   template<> |   template<> | ||||||
|     inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){ |     inline Grid::ComplexF Reduce<Grid::ComplexF, __m512>::operator()(__m512 in){ | ||||||
|   | |||||||
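The u512f/u512d unions added above exist so that, on compilers without the _mm512_reduce_add_* intrinsics, a reduction can fall back to summing scalar lanes (hence the "Slow reduction" warning for non-Intel compilers). A sketch of that fallback, assuming an AVX-512 capable compiler (-mavx512f); only the union definition mirrors the code above, the rest is illustrative:

// Union-based lane spill for the slow-reduction fallback.
#include <immintrin.h>
#include <cstdio>

union u512f {
  __m512 v;
  float  f[16];
};

static inline float reduce_add_fallback(__m512 in) {
  u512f tmp;
  tmp.v = in;
  float sum = 0.f;
  for (int i = 0; i < 16; i++) sum += tmp.f[i];   // scalar sum over all lanes
  return sum;
}

int main() {
  __m512 x = _mm512_set1_ps(1.0f);
  std::printf("%f\n", reduce_add_fallback(x));    // expect 16.0
  return 0;
}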
| @@ -250,8 +250,7 @@ void merge(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int | |||||||
|   } |   } | ||||||
|  } |  } | ||||||
|  |  | ||||||
| template<class vobj> inline  | template<class vobj> inline void merge1(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset) | ||||||
| void merge1(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset) |  | ||||||
| { | { | ||||||
|   typedef typename vobj::scalar_type scalar_type ; |   typedef typename vobj::scalar_type scalar_type ; | ||||||
|   typedef typename vobj::vector_type vector_type ; |   typedef typename vobj::vector_type vector_type ; | ||||||
| @@ -269,8 +268,7 @@ void merge1(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int | |||||||
|   }} |   }} | ||||||
| } | } | ||||||
|  |  | ||||||
| template<class vobj> inline  | template<class vobj> inline void merge2(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset) | ||||||
| void merge2(vobj &vec,std::vector<typename vobj::scalar_object *> &extracted,int offset) |  | ||||||
| { | { | ||||||
|   typedef typename vobj::scalar_type scalar_type ; |   typedef typename vobj::scalar_type scalar_type ; | ||||||
|   typedef typename vobj::vector_type vector_type ; |   typedef typename vobj::vector_type vector_type ; | ||||||
|   | |||||||
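The merge1/merge2 hunks above only tighten the declarations, but the operation they perform (interleaving one scalar object per SIMD lane back into a vectorized object) can be pictured with this simplified, hypothetical layout (fixed lane count, plain float fields; not Grid's actual types):

// Toy illustration of a merge-style helper; all names and types are hypothetical.
#include <cstdio>

constexpr int Nsimd = 4;

struct ScalarSite { float re, im; };                  // one lane's worth of data
struct VectorSite { float re[Nsimd], im[Nsimd]; };    // struct-of-arrays layout

void merge_sketch(VectorSite &vec, ScalarSite *extracted[Nsimd]) {
  for (int lane = 0; lane < Nsimd; lane++) {
    vec.re[lane] = extracted[lane]->re;               // lane-by-lane interleave
    vec.im[lane] = extracted[lane]->im;
  }
}

int main() {
  ScalarSite s[Nsimd] = {{0,0},{1,1},{2,2},{3,3}};
  ScalarSite *ptrs[Nsimd] = {&s[0], &s[1], &s[2], &s[3]};
  VectorSite v;
  merge_sketch(v, ptrs);
  std::printf("%g %g\n", v.re[2], v.im[3]);           // 2 3
  return 0;
}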
| @@ -116,7 +116,7 @@ int main (int argc, char ** argv) | |||||||
| 	  else if (SE->_is_local) | 	  else if (SE->_is_local) | ||||||
| 	    Check._odata[i] = Foo._odata[SE->_offset]; | 	    Check._odata[i] = Foo._odata[SE->_offset]; | ||||||
| 	  else  | 	  else  | ||||||
| 	    Check._odata[i] = myStencil.comm_buf[SE->_offset]; | 	    Check._odata[i] = myStencil.CommBuf()[SE->_offset]; | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	Real nrmC = norm2(Check); | 	Real nrmC = norm2(Check); | ||||||
| @@ -207,7 +207,7 @@ int main (int argc, char ** argv) | |||||||
| 	  else if (SE->_is_local) | 	  else if (SE->_is_local) | ||||||
| 	    OCheck._odata[i] = EFoo._odata[SE->_offset]; | 	    OCheck._odata[i] = EFoo._odata[SE->_offset]; | ||||||
| 	  else  | 	  else  | ||||||
| 	    OCheck._odata[i] = EStencil.comm_buf[SE->_offset]; | 	    OCheck._odata[i] = EStencil.CommBuf()[SE->_offset]; | ||||||
| 	} | 	} | ||||||
| 	for(int i=0;i<ECheck._grid->oSites();i++){ | 	for(int i=0;i<ECheck._grid->oSites();i++){ | ||||||
| 	  int permute_type; | 	  int permute_type; | ||||||
| @@ -220,7 +220,7 @@ int main (int argc, char ** argv) | |||||||
| 	  else if (SE->_is_local) | 	  else if (SE->_is_local) | ||||||
| 	    ECheck._odata[i] = OFoo._odata[SE->_offset]; | 	    ECheck._odata[i] = OFoo._odata[SE->_offset]; | ||||||
| 	  else  | 	  else  | ||||||
| 	    ECheck._odata[i] = OStencil.comm_buf[SE->_offset]; | 	    ECheck._odata[i] = OStencil.CommBuf()[SE->_offset]; | ||||||
| 	} | 	} | ||||||
| 	 | 	 | ||||||
| 	setCheckerboard(Check,ECheck); | 	setCheckerboard(Check,ECheck); | ||||||
|   | |||||||
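The test updates above track an interface change in the stencil: the communication buffer is no longer reached through a public comm_buf member but through a CommBuf() accessor. A toy version of that encapsulation (class and member names are illustrative, not Grid's actual stencil):

// Sketch of hiding the communication buffer behind an accessor.
#include <vector>
#include <cstdio>

struct HalfSpinor { double v; };

class StencilSketch {
public:
  HalfSpinor *CommBuf() { return u_comm_buf.data(); }  // new-style access
  void resize(std::size_t n) { u_comm_buf.resize(n); }
private:
  std::vector<HalfSpinor> u_comm_buf;                  // was: a public comm_buf member
};

int main() {
  StencilSketch st;
  st.resize(4);
  st.CommBuf()[0].v = 1.5;               // mirrors Check._odata[i] = st.CommBuf()[offset]
  std::printf("%f\n", st.CommBuf()[0].v);
  return 0;
}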