Mirror of https://github.com/paboyle/Grid.git, synced 2024-11-10 15:55:37 +00:00

Merge pull request #69 from jch1g10/feature/qed-fvol

Commit f7293f2ddb

README.md (28 lines changed)
@@ -20,7 +20,7 @@ License: GPL v2.
 
 Last update Nov 2016.
 
-_Please send all pull requests to the `develop` branch._
+_Please do not send pull requests to the `master` branch which is reserved for releases._
 
 ### Bug report
 
@@ -29,7 +29,7 @@ _To help us tracking and solving more efficiently issues with Grid, please repor
 When you file an issue, please go though the following checklist:
 
 1. Check that the code is pointing to the `HEAD` of `develop` or any commit in `master` which is tagged with a version number.
-2. Give a description of the target platform (CPU, network, compiler).
+2. Give a description of the target platform (CPU, network, compiler). Please give the full CPU part description, using for example `cat /proc/cpuinfo | grep 'model name' | uniq` (Linux) or `sysctl machdep.cpu.brand_string` (macOS) and the full output the `--version` option of your compiler.
 3. Give the exact `configure` command used.
 4. Attach `config.log`.
 5. Attach `config.summary`.
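For convenience, the platform details requested in the new wording above can be gathered in one go; a minimal sketch (the compiler name `g++` is only an example):

```bash
# Collect the information requested in the bug-report checklist.
cat /proc/cpuinfo | grep 'model name' | uniq    # full CPU part description (Linux)
# sysctl machdep.cpu.brand_string               # equivalent command on macOS
g++ --version                                   # full output of the compiler's --version
```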
@@ -45,7 +45,7 @@ are provided, similar to HPF and cmfortran, and user control is given over the m
 array indices to both MPI tasks and SIMD processing elements.
 
 * Identically shaped arrays then be processed with perfect data parallelisation.
-* Such identically shapped arrays are called conformable arrays.
+* Such identically shaped arrays are called conformable arrays.
 
 The transformation is based on the observation that Cartesian array processing involves
 identical processing to be performed on different regions of the Cartesian array.
@@ -127,14 +127,15 @@ make -C tests/<subdir> tests
 
 The following options can be use with the `--enable-simd=` option to target different communication interfaces:
 
 | `<comm>` | Description |
-| ------------- | -------------------------------------------- |
+| -------------- | ------------------------------------------------------------- |
 | `none` | no communications |
 | `mpi[-auto]` | MPI communications |
 | `mpi3[-auto]` | MPI communications using MPI 3 shared memory |
+| `mpi3l[-auto]` | MPI communications using MPI 3 shared memory and leader model |
 | `shmem ` | Cray SHMEM communications |
 
-For `mpi` and `mpi3` the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the informations from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan though a list of default names).
+For the MPI interfaces the optional `-auto` suffix instructs the `configure` scripts to determine all the necessary compilation and linking flags. This is done by extracting the informations from the MPI wrapper specified in the environment variable `MPICXX` (if not specified `configure` will scan though a list of default names).
 
 ### Possible SIMD types
 
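To illustrate the new `mpi3l` layer added to the table above, a hypothetical configure invocation could look as follows; the SIMD target `AVX2` and the wrapper name `mpicxx` are examples only:

```bash
# Illustrative only: select the MPI 3 shared-memory "leader model" comms layer and
# let configure deduce compilation/linking flags from the MPI wrapper named in MPICXX.
MPICXX=mpicxx ../configure --enable-comms=mpi3l-auto --enable-simd=AVX2
```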
@@ -160,7 +161,7 @@ Alternatively, some CPU codenames can be directly used:
 | `BGQ` | Blue Gene/Q |
 
 #### Notes:
-- We currently support AVX512 only for the Intel compiler. Support for GCC and clang will appear in future versions.
+- We currently support AVX512 only for the Intel compiler. Support for GCC and clang will appear in future versions of Grid when the AVX512 support within GCC and clang will be more advanced.
 - For BG/Q only [bgclang](http://trac.alcf.anl.gov/projects/llvm-bgq) is supported. We do not presently plan to support more compilers for this platform.
 - BG/Q performances are currently rather poor. This is being investigated for future versions.
 
@@ -171,7 +172,7 @@ The following configuration is recommended for the Intel Knights Landing platfor
 ``` bash
 ../configure --enable-precision=double\
              --enable-simd=KNL \
-             --enable-comms=mpi3-auto \
+             --enable-comms=mpi-auto \
              --with-gmp=<path> \
              --with-mpfr=<path> \
              --enable-mkl \
@@ -183,10 +184,9 @@ where `<path>` is the UNIX prefix where GMP and MPFR are installed. If you are w
 ``` bash
 ../configure --enable-precision=double\
              --enable-simd=KNL \
-             --enable-comms=mpi3 \
+             --enable-comms=mpi \
              --with-gmp=<path> \
              --with-mpfr=<path> \
              --enable-mkl \
              CXX=CC CC=cc
 ```
 
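As a purely illustrative instance of the KNL template above, with GMP and MPFR installed under a hypothetical prefix `$HOME/local` and an assumed Intel toolchain (`icpc`/`mpiicpc` are not prescribed by the README):

```bash
# Hypothetical KNL build outside the Cray wrappers; paths and compiler names are examples.
MPICXX=mpiicpc ../configure --enable-precision=double \
             --enable-simd=KNL \
             --enable-comms=mpi-auto \
             --with-gmp=$HOME/local \
             --with-mpfr=$HOME/local \
             --enable-mkl \
             CXX=icpc
```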
@@ -42,15 +42,14 @@ int main (int argc, char ** argv)
 
   int Nloop=10;
   int nmu=0;
-  for(int mu=0;mu<4;mu++) if (mpi_layout[mu]>1) nmu++;
+  for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
 
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << "= Benchmarking concurrent halo exchange in "<<nmu<<" dimensions"<<std::endl;
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
+  int maxlat=16;
+  for(int lat=4;lat<=maxlat;lat+=2){
 
-  for(int lat=4;lat<=32;lat+=2){
   for(int Ls=1;Ls<=16;Ls*=2){
 
     std::vector<int> latt_size ({lat*mpi_layout[0],
@@ -125,7 +124,7 @@ int main (int argc, char ** argv)
   std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
 
 
-  for(int lat=4;lat<=32;lat+=2){
+  for(int lat=4;lat<=maxlat;lat+=2){
   for(int Ls=1;Ls<=16;Ls*=2){
 
     std::vector<int> latt_size ({lat,lat,lat,lat});
@@ -194,128 +193,83 @@ int main (int argc, char ** argv)
   }
   }
 
-#if 0
+  Nloop=100;
 
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "= Benchmarking sequential persistent halo exchange in "<<nmu<<" dimensions"<<std::endl;
+  std::cout<<GridLogMessage << "= Benchmarking concurrent STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
   std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
   std::cout<<GridLogMessage << " L "<<"\t\t"<<" Ls "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;
 
-  for(int lat=4;lat<=32;lat+=2){
+  for(int lat=4;lat<=maxlat;lat+=2){
   for(int Ls=1;Ls<=16;Ls*=2){
 
-    std::vector<int> latt_size ({lat,lat,lat,lat});
+    std::vector<int> latt_size ({lat*mpi_layout[0],
+                                 lat*mpi_layout[1],
+                                 lat*mpi_layout[2],
+                                 lat*mpi_layout[3]});
 
     GridCartesian Grid(latt_size,simd_layout,mpi_layout);
 
-    std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
-    std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+    std::vector<HalfSpinColourVectorD *> xbuf(8);
+    std::vector<HalfSpinColourVectorD *> rbuf(8);
+    Grid.ShmBufferFreeAll();
+    for(int d=0;d<8;d++){
+      xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+      rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
+    }
 
     int ncomm;
     int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
 
+    double start=usecond();
+    for(int i=0;i<Nloop;i++){
 
-      std::vector<CartesianCommunicator::CommsRequest_t> empty;
-      std::vector<std::vector<CartesianCommunicator::CommsRequest_t> > requests_fwd(Nd,empty);
-      std::vector<std::vector<CartesianCommunicator::CommsRequest_t> > requests_bwd(Nd,empty);
+      std::vector<CartesianCommunicator::CommsRequest_t> requests;
 
-      for(int mu=0;mu<4;mu++){
       ncomm=0;
-      if (mpi_layout[mu]>1 ) {
-        ncomm++;
+      for(int mu=0;mu<4;mu++){
 
-        int comm_proc;
-        int xmit_to_rank;
-        int recv_from_rank;
+        if (mpi_layout[mu]>1 ) {
 
-        comm_proc=1;
-        Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-        Grid.SendToRecvFromInit(requests_fwd[mu],
-                                (void *)&xbuf[mu][0],
-                                xmit_to_rank,
-                                (void *)&rbuf[mu][0],
-                                recv_from_rank,
-                                bytes);
+          ncomm++;
+          int comm_proc=1;
+          int xmit_to_rank;
+          int recv_from_rank;
 
-        comm_proc = mpi_layout[mu]-1;
-        Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
-        Grid.SendToRecvFromInit(requests_bwd[mu],
-                                (void *)&xbuf[mu+4][0],
-                                xmit_to_rank,
-                                (void *)&rbuf[mu+4][0],
-                                recv_from_rank,
-                                bytes);
+          Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+          Grid.StencilSendToRecvFromBegin(requests,
+                                          (void *)&xbuf[mu][0],
+                                          xmit_to_rank,
+                                          (void *)&rbuf[mu][0],
+                                          recv_from_rank,
+                                          bytes);
 
-      }
-      }
+          comm_proc = mpi_layout[mu]-1;
 
-      {
-        double start=usecond();
-        for(int i=0;i<Nloop;i++){
+          Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
+          Grid.StencilSendToRecvFromBegin(requests,
+                                          (void *)&xbuf[mu+4][0],
+                                          xmit_to_rank,
+                                          (void *)&rbuf[mu+4][0],
+                                          recv_from_rank,
+                                          bytes);
 
-          for(int mu=0;mu<4;mu++){
 
-            if (mpi_layout[mu]>1 ) {
 
-              Grid.SendToRecvFromBegin(requests_fwd[mu]);
-              Grid.SendToRecvFromComplete(requests_fwd[mu]);
-              Grid.SendToRecvFromBegin(requests_bwd[mu]);
-              Grid.SendToRecvFromComplete(requests_bwd[mu]);
-            }
         }
-          Grid.Barrier();
       }
-      double stop=usecond();
+      Grid.StencilSendToRecvFromComplete(requests);
+      Grid.Barrier();
 
-      double dbytes = bytes;
-      double xbytes = Nloop*dbytes*2.0*ncomm;
-      double rbytes = xbytes;
-      double bidibytes = xbytes+rbytes;
 
-      double time = stop-start;
 
-      std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
 
     }
+    double stop=usecond();
 
+    double dbytes = bytes;
+    double xbytes = Nloop*dbytes*2.0*ncomm;
+    double rbytes = xbytes;
+    double bidibytes = xbytes+rbytes;
 
-    {
-      double start=usecond();
-      for(int i=0;i<Nloop;i++){
+    double time = stop-start; // microseconds
 
-        for(int mu=0;mu<4;mu++){
 
-          if (mpi_layout[mu]>1 ) {
 
-            Grid.SendToRecvFromBegin(requests_fwd[mu]);
-            Grid.SendToRecvFromBegin(requests_bwd[mu]);
-            Grid.SendToRecvFromComplete(requests_fwd[mu]);
-            Grid.SendToRecvFromComplete(requests_bwd[mu]);
-          }
-        }
-        Grid.Barrier();
-      }
 
-      double stop=usecond();
 
-      double dbytes = bytes;
-      double xbytes = Nloop*dbytes*2.0*ncomm;
-      double rbytes = xbytes;
-      double bidibytes = xbytes+rbytes;
 
-      double time = stop-start;
 
-      std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
 
-    }
 
+    std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
   }
   }
 
-#endif
 
   Grid_finalize();
 }
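To exercise this halo-exchange benchmark across ranks, an invocation along the following lines could be used; the binary name and rank layout are illustrative, while `--mpi n.n.n.n` and `--threads n` are the decomposition flags documented in the `lib/Init.cc` changes further down:

```bash
# Hypothetical run: 8 MPI ranks decomposed as 2x2x2x1, 8 OpenMP threads per rank.
mpirun -np 8 ./benchmarks/Benchmark_comms --mpi 2.2.2.1 --threads 8
```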
@@ -44,7 +44,6 @@ struct scal {
   Gamma::GammaT
 };
 
-bool overlapComms = false;
 typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
 typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
 typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;
@@ -54,10 +53,6 @@ int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
 
-  if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
-    overlapComms = true;
-  }
 
   int threads = GridThread::GetThreads();
   std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
 
@@ -126,14 +121,21 @@ int main (int argc, char ** argv)
 
   RealD NP = UGrid->_Nprocessors;
 
-  for(int doasm=1;doasm<2;doasm++){
 
-  QCD::WilsonKernelsStatic::AsmOpt=doasm;
 
   DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
 
-  std::cout<<GridLogMessage << "Naive wilson implementation "<<std::endl;
-  std::cout << GridLogMessage<< "Calling Dw"<<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop "<<std::endl;
+  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
+  if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+  if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
 
   int ncall =100;
   if (1) {
 
@@ -162,6 +164,17 @@ int main (int argc, char ** argv)
 
   if (1)
   {
 
+    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+    std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::Dhop "<<std::endl;
+    std::cout << GridLogMessage<< "* Vectorising fifth dimension by "<<vComplex::Nsimd()<<std::endl;
+    if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+    if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+    if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+    if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
+    if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
+    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
 
     typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
     LatticeFermion ssrc(sFGrid);
     LatticeFermion sref(sFGrid);
@@ -248,6 +261,16 @@ int main (int argc, char ** argv)
     sr_e = zero;
     sr_o = zero;
 
+    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+    std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::DhopEO "<<std::endl;
+    std::cout << GridLogMessage<< "* Vectorising fifth dimension by "<<vComplex::Nsimd()<<std::endl;
+    if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+    if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+    if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+    if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
+    if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
+    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
 
     sDw.ZeroCounters();
     sDw.stat.init("DhopEO");
     double t0=usecond();
@@ -308,7 +331,7 @@ int main (int argc, char ** argv)
     ref = -0.5*ref;
   }
   Dw.Dhop(src,result,1);
-  std::cout << GridLogMessage << "Naive wilson implementation Dag" << std::endl;
+  std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
   std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
   std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
   std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
@@ -322,13 +345,22 @@ int main (int argc, char ** argv)
   LatticeFermion r_eo (FGrid);
 
 
-  std::cout<<GridLogMessage << "Calling Deo and Doe"<<std::endl;
+  std::cout<<GridLogMessage << "Calling Deo and Doe and assert Deo+Doe == Dunprec"<<std::endl;
   pickCheckerboard(Even,src_e,src);
   pickCheckerboard(Odd,src_o,src);
 
   std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
   std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;
 
+  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::DhopEO "<<std::endl;
+  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
+  if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+  if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
+  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
   {
     Dw.ZeroCounters();
     double t0=usecond();
@@ -366,8 +398,5 @@ int main (int argc, char ** argv)
   assert(norm2(src_e)<1.0e-5);
   assert(norm2(src_o)<1.0e-5);
 
 
-  }
 
   Grid_finalize();
 }
@@ -1,153 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./benchmarks/Benchmark_dwf.cc
-
-    Copyright (C) 2015
-
-Author: Peter Boyle <paboyle@ph.ed.ac.uk>
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/* END LEGAL */
-#include <Grid/Grid.h>
-
-using namespace std;
-using namespace Grid;
-using namespace Grid::QCD;
-
-template<class d>
-struct scal {
-  d internal;
-};
-
-Gamma::GammaMatrix Gmu [] = {
-  Gamma::GammaX,
-  Gamma::GammaY,
-  Gamma::GammaZ,
-  Gamma::GammaT
-};
-
-bool overlapComms = false;
-
-int main (int argc, char ** argv)
-{
-  Grid_init(&argc,&argv);
-
-  if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
-    overlapComms = true;
-  }
-
-  int threads = GridThread::GetThreads();
-  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
-
-  std::vector<int> latt4 = GridDefaultLatt();
-  const int Ls=16;
-  GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
-  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
-  GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
-  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
-
-  std::vector<int> seeds4({1,2,3,4});
-  std::vector<int> seeds5({5,6,7,8});
-
-  GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
-  GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
-
-  LatticeFermion src (FGrid); random(RNG5,src);
-  LatticeFermion result(FGrid); result=zero;
-  LatticeFermion ref(FGrid); ref=zero;
-  LatticeFermion tmp(FGrid);
-  LatticeFermion err(FGrid);
-
-  ColourMatrix cm = Complex(1.0,0.0);
-
-  LatticeGaugeField Umu(UGrid);
-  random(RNG4,Umu);
-
-  LatticeGaugeField Umu5d(FGrid);
-
-  // replicate across fifth dimension
-  for(int ss=0;ss<Umu._grid->oSites();ss++){
-    for(int s=0;s<Ls;s++){
-      Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
-    }
-  }
-
-  ////////////////////////////////////
-  // Naive wilson implementation
-  ////////////////////////////////////
-  std::vector<LatticeColourMatrix> U(4,FGrid);
-  for(int mu=0;mu<Nd;mu++){
-    U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
-  }
-
-  if (1)
-  {
-    ref = zero;
-    for(int mu=0;mu<Nd;mu++){
-
-      tmp = U[mu]*Cshift(src,mu+1,1);
-      ref=ref + tmp - Gamma(Gmu[mu])*tmp;
-
-      tmp =adj(U[mu])*src;
-      tmp =Cshift(tmp,mu+1,-1);
-      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
-    }
-    ref = -0.5*ref;
-  }
-
-  RealD mass=0.1;
-  RealD M5 =1.8;
-
-  typename DomainWallFermionR::ImplParams params;
-  params.overlapCommsCompute = overlapComms;
-
-  RealD NP = UGrid->_Nprocessors;
-
-  QCD::WilsonKernelsStatic::AsmOpt=1;
-
-  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);
-
-  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
-  int ncall =50;
-  if (1) {
-
-    double t0=usecond();
-    for(int i=0;i<ncall;i++){
-      Dw.Dhop(src,result,0);
-    }
-    double t1=usecond();
-
-    double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-    double flops=1344*volume*ncall;
-
-    std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
-    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
-    std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
-    std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NP<<std::endl;
-    err = ref-result;
-    std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
-    // Dw.Report();
-  }
-  Grid_finalize();
-}
@@ -51,16 +51,18 @@ int main (int argc, char ** argv)
 {
   Grid_init(&argc,&argv);
 
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
 
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
 
   const int Ls=8;
   int threads = GridThread::GetThreads();
   std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;
 
-  if ( getenv("ASMOPT") ) {
-    QCD::WilsonKernelsStatic::AsmOpt=1;
-  } else {
-    QCD::WilsonKernelsStatic::AsmOpt=0;
-  }
 
   std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
   std::cout<<GridLogMessage << "= Benchmarking DWF"<<std::endl;
   std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
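With this change the kernel variant is no longer chosen through the `ASMOPT` environment variable but through the runtime flags printed in the banner above; an illustrative invocation (the binary name is a placeholder) would be:

```bash
# Old behaviour (removed above): kernel variant selected via an environment variable.
# ASMOPT=1 ./benchmarks/<benchmark>
# New behaviour: kernel variant selected at run time with a command-line flag.
./benchmarks/<benchmark> --dslash-asm --threads 8
```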
@@ -58,6 +58,19 @@ int main (int argc, char ** argv)
   std::vector<int> seeds({1,2,3,4});
   RealD mass = 0.1;
 
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
+  std::cout << GridLogMessage<< "* Benchmarking WilsonFermionR::Dhop "<<std::endl;
+  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
+  if ( sizeof(Real)==4 ) std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
+  if ( sizeof(Real)==8 ) std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3 WilsonKernels" <<std::endl;
+  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3 WilsonKernels" <<std::endl;
+  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
 
   std::cout<<GridLogMessage << "============================================================================="<< std::endl;
   std::cout<<GridLogMessage << "= Benchmarking Wilson" << std::endl;
   std::cout<<GridLogMessage << "============================================================================="<< std::endl;
@@ -1,175 +0,0 @@
-/*************************************************************************************
-
-    Grid physics library, www.github.com/paboyle/Grid
-
-    Source file: ./tests/Test_zmm.cc
-
-    Copyright (C) 2015
-
-Author: paboyle <paboyle@ph.ed.ac.uk>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License along
-    with this program; if not, write to the Free Software Foundation, Inc.,
-    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-
-    See the full license in the file "LICENSE" in the top level distribution directory
-*************************************************************************************/
-/* END LEGAL */
-#include <Grid/Grid.h>
-
-
-using namespace Grid;
-using namespace Grid::QCD;
-
-
-int bench(std::ofstream &os, std::vector<int> &latt4,int Ls);
-
-int main(int argc,char **argv)
-{
-  Grid_init(&argc,&argv);
-  std::ofstream os("zmm.dat");
-
-  os << "#V Ls Lxy Lzt C++ Asm OMP L1 " <<std::endl;
-  std::cout<<GridLogMessage << "====================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "= Benchmarking ZMM"<<std::endl;
-  std::cout<<GridLogMessage << "====================================================================="<<std::endl;
-  std::cout<<GridLogMessage << "Volume \t\t\t\tC++DW/MFLOPs\tASM-DW/MFLOPs\tdiff"<<std::endl;
-  std::cout<<GridLogMessage << "====================================================================="<<std::endl;
-  for(int L=4;L<=32;L+=4){
-    for(int m=1;m<=2;m++){
-      for(int Ls=8;Ls<=16;Ls+=8){
-        std::vector<int> grid({L,L,m*L,m*L});
-        std::cout << GridLogMessage <<"\t";
-        for(int i=0;i<4;i++) {
-          std::cout << grid[i]<<"x";
-        }
-        std::cout << Ls<<"\t\t";
-        bench(os,grid,Ls);
-      }
-    }
-  }
-}
-
-int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
-{
-  GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
-  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
-  GridCartesian * FGrid = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
-  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);
-
-  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
-  std::vector<int> mpi_layout = GridDefaultMpi();
-  int threads = GridThread::GetThreads();
-
-  std::vector<int> seeds4({1,2,3,4});
-  std::vector<int> seeds5({5,6,7,8});
-
-  GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4);
-
-  LatticeFermion src (FGrid);
-  LatticeFermion tmp (FGrid);
-  LatticeFermion srce(FrbGrid);
-
-  LatticeFermion resulto(FrbGrid); resulto=zero;
-  LatticeFermion resulta(FrbGrid); resulta=zero;
-  LatticeFermion junk(FrbGrid); junk=zero;
-  LatticeFermion diff(FrbGrid);
-  LatticeGaugeField Umu(UGrid);
-
-  double mfc, mfa, mfo, mfl1;
-
-  GridParallelRNG RNG4(UGrid); RNG4.SeedFixedIntegers(seeds4);
-  GridParallelRNG RNG5(FGrid); RNG5.SeedFixedIntegers(seeds5);
-  random(RNG5,src);
-#if 1
-  random(RNG4,Umu);
-#else
-  int mmu=2;
-  std::vector<LatticeColourMatrix> U(4,UGrid);
-  for(int mu=0;mu<Nd;mu++){
-    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
-    if ( mu!=mmu ) U[mu] = zero;
-    if ( mu==mmu ) U[mu] = 1.0;
-    PokeIndex<LorentzIndex>(Umu,U[mu],mu);
-  }
-#endif
-  pickCheckerboard(Even,srce,src);
-
-  RealD mass=0.1;
-  RealD M5 =1.8;
-  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);
-
-  int ncall=50;
-  double t0=usecond();
-  for(int i=0;i<ncall;i++){
-    Dw.DhopOE(srce,resulto,0);
-  }
-  double t1=usecond();
-
-  double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
-  double flops=1344*volume/2;
-
-  mfc = flops*ncall/(t1-t0);
-  std::cout<<mfc<<"\t\t";
-
-  QCD::WilsonKernelsStatic::AsmOpt=1;
-  t0=usecond();
-  for(int i=0;i<ncall;i++){
-    Dw.DhopOE(srce,resulta,0);
-  }
-  t1=usecond();
-  mfa = flops*ncall/(t1-t0);
-  std::cout<<mfa<<"\t\t";
-  /*
-  int dag=DaggerNo;
-  t0=usecond();
-  for(int i=0;i<1;i++){
-    Dw.DhopInternalOMPbench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
-  }
-  t1=usecond();
-  mfo = flops*100/(t1-t0);
-  std::cout<<GridLogMessage << "Called ASM-OMP Dw"<< " mflop/s = "<< mfo<<std::endl;
-
-  t0=usecond();
-  for(int i=0;i<1;i++){
-    Dw.DhopInternalL1bench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
-  }
-  t1=usecond();
-  mfl1= flops*100/(t1-t0);
-  std::cout<<GridLogMessage << "Called ASM-L1 Dw"<< " mflop/s = "<< mfl1<<std::endl;
-  os << latt4[0]*latt4[1]*latt4[2]*latt4[3]<< " "<<Ls<<" "<< latt4[0] <<" " <<latt4[2]<< " "
-     << mfc<<" "
-     << mfa<<" "
-     << mfo<<" "
-     << mfl1<<std::endl;
-  */
-
-#if 0
-  for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
-    Dw.DhopOE(srce,resulta,0);
-    PerformanceCounter Counter(i);
-    Counter.Start();
-    Dw.DhopOE(srce,resulta,0);
-    Counter.Stop();
-    Counter.Report();
-  }
-#endif
-  //resulta = (-0.5) * resulta;
-
-  diff = resulto-resulta;
-  std::cout<<norm2(diff)<<std::endl;
-  return 0;
-}
@@ -1,18 +1,12 @@
 #!/usr/bin/env bash
 
 EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.2.9.tar.bz2'
-FFTW_URL=http://www.fftw.org/fftw-3.3.4.tar.gz
 
 echo "-- deploying Eigen source..."
 wget ${EIGEN_URL} --no-check-certificate
 ./scripts/update_eigen.sh `basename ${EIGEN_URL}`
 rm `basename ${EIGEN_URL}`
 
-echo "-- copying fftw prototypes..."
-wget ${FFTW_URL}
-./scripts/update_fftw.sh `basename ${FFTW_URL}`
-rm `basename ${FFTW_URL}`
 
 echo '-- generating Make.inc files...'
 ./scripts/filelist
 echo '-- generating configure script...'
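For context, the overall workflow around this script is unchanged: run the bootstrap once, then configure out of tree. A minimal sketch (build directory name and configure options are arbitrary examples):

```bash
# Illustrative out-of-tree build after bootstrapping.
./bootstrap.sh
mkdir -p build && cd build
../configure --enable-simd=AVX2 --enable-comms=mpi-auto
make
```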
configure.ac (27 lines changed)
@@ -253,15 +253,23 @@ AC_ARG_ENABLE([comms],[AC_HELP_STRING([--enable-comms=none|mpi|mpi-auto|mpi3|mpi
 case ${ac_COMMS} in
     none)
       AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
+      comms_type='none'
     ;;
-    mpi|mpi-auto)
-      AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
+    mpi3l*)
+      AC_DEFINE([GRID_COMMS_MPI3L],[1],[GRID_COMMS_MPI3L] )
+      comms_type='mpi3l'
     ;;
-    mpi3|mpi3-auto)
+    mpi3*)
      AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
+      comms_type='mpi3'
+    ;;
+    mpi*)
+      AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
+      comms_type='mpi'
     ;;
     shmem)
      AC_DEFINE([GRID_COMMS_SHMEM],[1],[GRID_COMMS_SHMEM] )
+      comms_type='shmem'
     ;;
     *)
      AC_MSG_ERROR([${ac_COMMS} unsupported --enable-comms option]);
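The glob patterns above are order sensitive: `mpi3l*` must be tested before `mpi3*`, and `mpi3*` before `mpi*`, otherwise the more specific values (including their `-auto` variants) would never match. A minimal sketch demonstrating the mapping:

```bash
# Sketch of the case ordering used in configure.ac: most specific pattern first.
for ac_COMMS in none mpi mpi-auto mpi3 mpi3-auto mpi3l mpi3l-auto shmem; do
  case ${ac_COMMS} in
    none)   comms_type='none'  ;;
    mpi3l*) comms_type='mpi3l' ;;
    mpi3*)  comms_type='mpi3'  ;;
    mpi*)   comms_type='mpi'   ;;
    shmem)  comms_type='shmem' ;;
  esac
  echo "${ac_COMMS} -> ${comms_type}"
done
```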
@@ -279,12 +287,11 @@ case ${ac_COMMS} in
     ;;
 esac
 
-AM_CONDITIONAL(BUILD_COMMS_SHMEM,[ test "X${ac_COMMS}X" == "XshmemX" ])
-AM_CONDITIONAL(BUILD_COMMS_MPI,
-  [ test "X${ac_COMMS}X" == "XmpiX" || test "X${ac_COMMS}X" == "Xmpi-autoX" ])
-AM_CONDITIONAL(BUILD_COMMS_MPI3,
-  [ test "X${ac_COMMS}X" == "Xmpi3X" || test "X${ac_COMMS}X" == "Xmpi3-autoX" ])
-AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])
+AM_CONDITIONAL(BUILD_COMMS_SHMEM, [ test "${comms_type}X" == "shmemX" ])
+AM_CONDITIONAL(BUILD_COMMS_MPI, [ test "${comms_type}X" == "mpiX" ])
+AM_CONDITIONAL(BUILD_COMMS_MPI3, [ test "${comms_type}X" == "mpi3X" ] )
+AM_CONDITIONAL(BUILD_COMMS_MPI3L, [ test "${comms_type}X" == "mpi3lX" ] )
+AM_CONDITIONAL(BUILD_COMMS_NONE, [ test "${comms_type}X" == "noneX" ])
 
 ############### RNG selection
 AC_ARG_ENABLE([rng],[AC_HELP_STRING([--enable-rng=ranlux48|mt19937],\
@@ -379,7 +386,7 @@ compiler version : ${ax_cv_gxx_version}
 ----- BUILD OPTIONS -----------------------------------
 SIMD : ${ac_SIMD}
 Threading : ${ac_openmp}
-Communications type : ${ac_COMMS}
+Communications type : ${comms_type}
 Default precision : ${ac_PRECISION}
 RNG choice : ${ac_RNG}
 GMP : `if test "x$have_gmp" = xtrue; then echo yes; else echo no; fi`
@@ -42,6 +42,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/cshift/Cshift_mpi.h>
 #endif
 
+#ifdef GRID_COMMS_MPI3L
+#include <Grid/cshift/Cshift_mpi.h>
+#endif
 
 #ifdef GRID_COMMS_SHMEM
 #include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator
 #endif
lib/Init.cc (158 lines changed)
@@ -147,6 +147,13 @@ void GridCmdOptionIntVector(std::string &str,std::vector<int> & vec)
   return;
 }
 
+void GridCmdOptionInt(std::string &str,int & val)
+{
+  std::stringstream ss(str);
+  ss>>val;
+  return;
+}
 
 
 void GridParseLayout(char **argv,int argc,
                      std::vector<int> &latt,
@@ -177,14 +184,12 @@ void GridParseLayout(char **argv,int argc,
     assert(ompthreads.size()==1);
     GridThread::SetThreads(ompthreads[0]);
   }
 
   if( GridCmdOptionExists(argv,argv+argc,"--cores") ){
-    std::vector<int> cores(0);
+    int cores;
     arg= GridCmdOptionPayload(argv,argv+argc,"--cores");
-    GridCmdOptionIntVector(arg,cores);
-    GridThread::SetCores(cores[0]);
+    GridCmdOptionInt(arg,cores);
+    GridThread::SetCores(cores);
   }
 
 }
 
 std::string GridCmdVectorIntToString(const std::vector<int> & vec){
@@ -193,7 +198,7 @@ std::string GridCmdVectorIntToString(const std::vector<int> & vec){
   return oss.str();
 }
 /////////////////////////////////////////////////////////
-//
+// Reinit guard
 /////////////////////////////////////////////////////////
 static int Grid_is_initialised = 0;
 
@@ -202,27 +207,31 @@ void Grid_init(int *argc,char ***argv)
 {
   GridLogger::StopWatch.Start();
 
+  std::string arg;
+
+  ////////////////////////////////////
+  // Shared memory block size
+  ////////////////////////////////////
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--shm") ){
+    int MB;
+    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm");
+    GridCmdOptionInt(arg,MB);
+    CartesianCommunicator::MAX_MPI_SHM_BYTES = MB*1024*1024;
+  }
 
   CartesianCommunicator::Init(argc,argv);
 
-  // Parse command line args.
+  ////////////////////////////////////
+  // Logging
+  ////////////////////////////////////
 
-  std::string arg;
   std::vector<std::string> logstreams;
   std::string defaultLog("Error,Warning,Message,Performance");
 
   GridCmdOptionCSL(defaultLog,logstreams);
   GridLogConfigure(logstreams);
 
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--help") ){
-    std::cout<<GridLogMessage<<"--help : this message"<<std::endl;
-    std::cout<<GridLogMessage<<"--debug-signals : catch sigsegv and print a blame report"<<std::endl;
-    std::cout<<GridLogMessage<<"--debug-stdout : print stdout from EVERY node"<<std::endl;
-    std::cout<<GridLogMessage<<"--decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
-    std::cout<<GridLogMessage<<"--mpi n.n.n.n : default MPI decomposition"<<std::endl;
-    std::cout<<GridLogMessage<<"--threads n : default number of OMP threads"<<std::endl;
-    std::cout<<GridLogMessage<<"--grid n.n.n.n : default Grid size"<<std::endl;
-    std::cout<<GridLogMessage<<"--log list : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
-    exit(EXIT_SUCCESS);
+  if( !GridCmdOptionExists(*argv,*argv+*argc,"--debug-stdout") ){
+    Grid_quiesce_nodes();
   }
 
   if( GridCmdOptionExists(*argv,*argv+*argc,"--log") ){
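The new `--shm` option above sizes the MPI shared-memory comms window before the communicator is initialised; an illustrative call (the binary name is a placeholder) might be:

```bash
# Reserve 512 MB of shared-memory stencil comms buffers per rank; --help lists all options.
mpirun -np 4 ./benchmarks/<benchmark> --shm 512 --mpi 2.2.1.1
./benchmarks/<benchmark> --help
```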
@@ -231,38 +240,39 @@ void Grid_init(int *argc,char ***argv)
     GridLogConfigure(logstreams);
   }
 
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
-    Grid_debug_handler_init();
-  }
-  if( !GridCmdOptionExists(*argv,*argv+*argc,"--debug-stdout") ){
-    Grid_quiesce_nodes();
-  }
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){
-    QCD::WilsonKernelsStatic::HandOpt=1;
-  }
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
-    LebesgueOrder::UseLebesgueOrder=1;
-  }
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
-    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
-    GridCmdOptionIntVector(arg,LebesgueOrder::Block);
-  }
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--timestamp") ){
-    GridLogTimestamp(1);
+  ////////////////////////////////////
+  // Help message
+  ////////////////////////////////////
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--help") ){
+    std::cout<<GridLogMessage<<" --help : this message"<<std::endl;
+    std::cout<<GridLogMessage<<std::endl;
+    std::cout<<GridLogMessage<<"Geometry:"<<std::endl;
+    std::cout<<GridLogMessage<<" --mpi n.n.n.n : default MPI decomposition"<<std::endl;
+    std::cout<<GridLogMessage<<" --threads n : default number of OMP threads"<<std::endl;
+    std::cout<<GridLogMessage<<" --grid n.n.n.n : default Grid size"<<std::endl;
+    std::cout<<GridLogMessage<<" --shm M : allocate M megabytes of shared memory for comms"<<std::endl;
+    std::cout<<GridLogMessage<<std::endl;
+    std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
+    std::cout<<GridLogMessage<<" --log list : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
+    std::cout<<GridLogMessage<<" --decomposition : report on default omp,mpi and simd decomposition"<<std::endl;
+    std::cout<<GridLogMessage<<" --debug-signals : catch sigsegv and print a blame report"<<std::endl;
+    std::cout<<GridLogMessage<<" --debug-stdout : print stdout from EVERY node"<<std::endl;
+    std::cout<<GridLogMessage<<" --notimestamp : suppress millisecond resolution stamps"<<std::endl;
+    std::cout<<GridLogMessage<<std::endl;
+    std::cout<<GridLogMessage<<"Performance:"<<std::endl;
+    std::cout<<GridLogMessage<<" --dslash-generic: Wilson kernel for generic Nc"<<std::endl;
+    std::cout<<GridLogMessage<<" --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;
+    std::cout<<GridLogMessage<<" --dslash-asm : Wilson kernel for AVX512"<<std::endl;
+    std::cout<<GridLogMessage<<" --lebesgue : Cache oblivious Lebesgue curve/Morton order/Z-graph stencil looping"<<std::endl;
+    std::cout<<GridLogMessage<<" --cacheblocking n.m.o.p : Hypercuboidal cache blocking"<<std::endl;
+    std::cout<<GridLogMessage<<std::endl;
+    exit(EXIT_SUCCESS);
   }
 
-  GridParseLayout(*argv,*argc,
-                  Grid_default_latt,
-                  Grid_default_mpi);
-  if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
-    std::cout<<GridLogMessage<<"Grid Decomposition\n";
-    std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
-    std::cout<<GridLogMessage<<"\tMPI tasks : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
-    std::cout<<GridLogMessage<<"\tvRealF : "<<sizeof(vRealF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
-    std::cout<<GridLogMessage<<"\tvRealD : "<<sizeof(vRealD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
-    std::cout<<GridLogMessage<<"\tvComplexF : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
-    std::cout<<GridLogMessage<<"\tvComplexD : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
-  }
+  ////////////////////////////////////
+  // Banner
+  ////////////////////////////////////
 
   std::string COL_RED = GridLogColours.colour["RED"];
   std::string COL_PURPLE = GridLogColours.colour["PURPLE"];
@@ -272,7 +282,6 @@ void Grid_init(int *argc,char ***argv)
   std::string COL_YELLOW = GridLogColours.colour["YELLOW"];
   std::string COL_BACKGROUND = GridLogColours.colour["NORMAL"];
 
-
   std::cout <<std::endl;
   std::cout <<COL_RED << "__|__|__|__|__"<< "|__|__|_"<<COL_PURPLE<<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
   std::cout <<COL_RED << "__|__|__|__|__"<< "|__|__|_"<<COL_PURPLE<<"_|__|__|"<< "__|__|__|__|__"<<std::endl;
@@ -305,6 +314,55 @@ void Grid_init(int *argc,char ***argv)
   std::cout << COL_BACKGROUND <<std::endl;
   std::cout << std::endl;
 
+  ////////////////////////////////////
+  // Debug and performance options
+  ////////////////////////////////////
+
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
+    Grid_debug_handler_init();
+  }
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-unroll") ){
+    QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptHandUnroll;
+  }
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-asm") ){
+    QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptInlineAsm;
+  }
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-generic") ){
+    QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptGeneric;
+  }
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
+    LebesgueOrder::UseLebesgueOrder=1;
+  }
+
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
+    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
+    GridCmdOptionIntVector(arg,LebesgueOrder::Block);
+  }
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--notimestamp") ){
+    GridLogTimestamp(0);
+  } else {
+    GridLogTimestamp(1);
+  }
+
+  GridParseLayout(*argv,*argc,
+                  Grid_default_latt,
+                  Grid_default_mpi);
+
+  std::cout << GridLogMessage << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
+
+  if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
+    std::cout<<GridLogMessage<<"Grid Decomposition\n";
+    std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
+    std::cout<<GridLogMessage<<"\tMPI tasks : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
+    std::cout<<GridLogMessage<<"\tvRealF : "<<sizeof(vRealF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
+    std::cout<<GridLogMessage<<"\tvRealD : "<<sizeof(vRealD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
+    std::cout<<GridLogMessage<<"\tvComplexF : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
+    std::cout<<GridLogMessage<<"\tvComplexD : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
+  }
+
   Grid_is_initialised = 1;
 }
 
|
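The option handling added above goes through Grid's GridCmdOptionExists/GridCmdOptionPayload helpers, which scan the argv range for a flag and for the token that follows it. As a rough illustration only (this is not Grid's implementation, and the names below are placeholders), a minimal scan of that kind looks like:

// Minimal sketch (assumption, not Grid's code) of an argv scan of the kind
// implied by GridCmdOptionExists / GridCmdOptionPayload in the hunk above.
#include <algorithm>
#include <string>

bool CmdOptionExists(char **begin, char **end, const std::string &option) {
  return std::find(begin, end, option) != end;          // flag present anywhere in argv
}

std::string CmdOptionPayload(char **begin, char **end, const std::string &option) {
  char **itr = std::find(begin, end, option);
  if (itr != end && ++itr != end) return std::string(*itr);  // token following the flag
  return std::string("");
}

A caller would invoke these as CmdOptionExists(argv, argv+argc, "--cacheblocking") and then parse the payload string, mirroring the calls shown in the diff.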
@@ -9,6 +9,11 @@ if BUILD_COMMS_MPI3
   extra_sources+=communicator/Communicator_base.cc
 endif

+if BUILD_COMMS_MPI3L
+  extra_sources+=communicator/Communicator_mpi3_leader.cc
+  extra_sources+=communicator/Communicator_base.cc
+endif
+
 if BUILD_COMMS_SHMEM
   extra_sources+=communicator/Communicator_shmem.cc
   extra_sources+=communicator/Communicator_base.cc
@@ -43,6 +43,9 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #else
 #include <sys/syscall.h>
 #endif
+#ifdef __x86_64__
+#include <x86intrin.h>
+#endif

 namespace Grid {

@@ -86,7 +89,6 @@ inline uint64_t cyclecount(void){
   return tmp;
 }
 #elif defined __x86_64__
-#include <x86intrin.h>
 inline uint64_t cyclecount(void){
   return __rdtsc();
   //  unsigned int dummy;
@@ -31,14 +31,8 @@ namespace Grid {
 ///////////////////////////////////////////////////////////////
 // Info that is setup once and indept of cartesian layout
 ///////////////////////////////////////////////////////////////
-int CartesianCommunicator::ShmRank;
-int CartesianCommunicator::ShmSize;
-int CartesianCommunicator::GroupRank;
-int CartesianCommunicator::GroupSize;
-int CartesianCommunicator::WorldRank;
-int CartesianCommunicator::WorldSize;
-int CartesianCommunicator::Slave;
 void * CartesianCommunicator::ShmCommBuf;
+uint64_t CartesianCommunicator::MAX_MPI_SHM_BYTES = 128*1024*1024;

 /////////////////////////////////
 // Alloc, free shmem region
@@ -48,7 +42,12 @@ void *CartesianCommunicator::ShmBufferMalloc(size_t bytes){
   void *ptr = (void *)heap_top;
   heap_top  += bytes;
   heap_bytes+= bytes;
-  assert(heap_bytes < MAX_MPI_SHM_BYTES);
+  if (heap_bytes >= MAX_MPI_SHM_BYTES) {
+    std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
+    std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
+    std::cout<< " Current value is " << (MAX_MPI_SHM_BYTES/(1024*1024)) <<std::endl;
+    assert(heap_bytes<MAX_MPI_SHM_BYTES);
+  }
   return ptr;
 }
 void CartesianCommunicator::ShmBufferFreeAll(void) {
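The hunk above replaces a bare assert with a diagnostic that points at a "--shm <MB>" flag, which only makes sense now that MAX_MPI_SHM_BYTES has become a mutable static rather than a compile-time constant. A minimal sketch, assuming a flag of that form is wired up during initialisation (the wiring itself is not part of this hunk, and the names below are placeholders, not Grid's):

// Hypothetical sketch: resize the shared comms heap from a "--shm <MB>" value
// before any stencil buffers are carved out of it.
#include <cstdint>
#include <string>

struct CartesianCommunicatorStub {                 // stand-in for the real class
  static uint64_t MAX_MPI_SHM_BYTES;               // mirrors the now-mutable static
};
uint64_t CartesianCommunicatorStub::MAX_MPI_SHM_BYTES = 128ULL*1024*1024;  // default 128 MB

void SetShmSizeMB(const std::string &mb) {
  CartesianCommunicatorStub::MAX_MPI_SHM_BYTES = std::stoull(mb)*1024ULL*1024ULL;
}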
@@ -69,12 +68,6 @@ int CartesianCommunicator::ProcessorCount(void) { return
 ////////////////////////////////////////////////////////////////////////////////
 // very VERY rarely (Log, serial RNG) we need world without a grid
 ////////////////////////////////////////////////////////////////////////////////
-int CartesianCommunicator::RankWorld(void){ return WorldRank; };
-int CartesianCommunicator::Ranks (void) { return WorldSize; };
-int CartesianCommunicator::Nodes (void) { return GroupSize; };
-int CartesianCommunicator::Cores (void) { return ShmSize; };
-int CartesianCommunicator::NodeRank (void) { return GroupRank; };
-int CartesianCommunicator::CoreRank (void) { return ShmRank; };

 void CartesianCommunicator::GlobalSum(ComplexF &c)
 {
@@ -93,7 +86,7 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
   GlobalSumVector((double *)c,2*N);
 }

-#ifndef GRID_COMMS_MPI3
+#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L)

 void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                        void *xmit,
@@ -1,3 +1,4 @@
 /*************************************************************************************

 Grid physics library, www.github.com/paboyle/Grid
@@ -37,6 +38,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifdef GRID_COMMS_MPI3
 #include <mpi.h>
 #endif
+#ifdef GRID_COMMS_MPI3L
+#include <mpi.h>
+#endif
 #ifdef GRID_COMMS_SHMEM
 #include <mpp/shmem.h>
 #endif
@@ -51,7 +55,7 @@ class CartesianCommunicator {
   // Give external control (command line override?) of this

   static const int MAXLOG2RANKSPERNODE = 16;
-  static const uint64_t MAX_MPI_SHM_BYTES = 128*1024*1024;
+  static uint64_t MAX_MPI_SHM_BYTES;

   // Communicator should know nothing of the physics grid, only processor grid.
   int _Nprocessors; // How many in all
@@ -60,9 +64,9 @@ class CartesianCommunicator {
   std::vector<int> _processor_coor; // linear processor coordinate
   unsigned long _ndimension;

-#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
+#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPI3L)
-  MPI_Comm communicator;
   static MPI_Comm communicator_world;
+  MPI_Comm communicator;
   typedef MPI_Request CommsRequest_t;
 #else
   typedef int CommsRequest_t;
@@ -75,7 +79,15 @@ class CartesianCommunicator {
   // cartesian communicator on a subset of ranks, slave ranks controlled
   // by group leader with data xfer via shared memory
   ////////////////////////////////////////////////////////////////////
 #ifdef GRID_COMMS_MPI3

+  static int ShmRank;
+  static int ShmSize;
+  static int GroupRank;
+  static int GroupSize;
+  static int WorldRank;
+  static int WorldSize;
+
   std::vector<int> WorldDims;
   std::vector<int> GroupDims;
   std::vector<int> ShmDims;
@@ -93,13 +105,20 @@ class CartesianCommunicator {
   std::vector<int> LexicographicToWorldRank;

   static std::vector<void *> ShmCommBufs;

 #else
   static void ShmInitGeneric(void);
   static commVector<uint8_t> ShmBufStorageVector;
 #endif

+  /////////////////////////////////
+  // Grid information and queries
+  // Implemented in Communicator_base.C
+  /////////////////////////////////
   static void * ShmCommBuf;
   size_t heap_top;
   size_t heap_bytes;

   void *ShmBufferSelf(void);
   void *ShmBuffer(int rank);
   void *ShmBufferTranslate(int rank,void * local_p);
@@ -123,28 +142,12 @@ class CartesianCommunicator {
   int RankFromProcessorCoor(std::vector<int> &coor);
   void ProcessorCoorFromRank(int rank,std::vector<int> &coor);

-  /////////////////////////////////
-  // Grid information and queries
-  /////////////////////////////////
-  static int ShmRank;
-  static int ShmSize;
-  static int GroupSize;
-  static int GroupRank;
-  static int WorldRank;
-  static int WorldSize;
-  static int Slave;
-
   int IsBoss(void) ;
   int BossRank(void) ;
   int ThisRank(void) ;
   const std::vector<int> & ThisProcessorCoor(void) ;
   const std::vector<int> & ProcessorGrid(void) ;
   int ProcessorCount(void) ;
-  static int Ranks (void);
-  static int Nodes (void);
-  static int Cores (void);
-  static int NodeRank (void);
-  static int CoreRank (void);

   ////////////////////////////////////////////////////////////////////////////////
   // very VERY rarely (Log, serial RNG) we need world without a grid
@@ -44,13 +44,6 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
     MPI_Init(argc,argv);
   }
   MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
-  MPI_Comm_rank(communicator_world,&WorldRank);
-  MPI_Comm_size(communicator_world,&WorldSize);
-  ShmRank=0;
-  ShmSize=1;
-  GroupRank=WorldRank;
-  GroupSize=WorldSize;
-  Slave =0;
   ShmInitGeneric();
 }

@@ -198,6 +191,11 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
 // Should only be used prior to Grid Init finished.
 // Check for this?
 ///////////////////////////////////////////////////////
+int CartesianCommunicator::RankWorld(void){
+  int r;
+  MPI_Comm_rank(communicator_world,&r);
+  return r;
+}
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 {
   int ierr= MPI_Bcast(data,
@@ -30,12 +30,18 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>

 namespace Grid {

 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Info that is setup once and indept of cartesian layout
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 int CartesianCommunicator::ShmSetup = 0;

+int CartesianCommunicator::ShmRank;
+int CartesianCommunicator::ShmSize;
+int CartesianCommunicator::GroupRank;
+int CartesianCommunicator::GroupSize;
+int CartesianCommunicator::WorldRank;
+int CartesianCommunicator::WorldSize;
+
 MPI_Comm CartesianCommunicator::communicator_world;
 MPI_Comm CartesianCommunicator::ShmComm;
 MPI_Win  CartesianCommunicator::ShmWindow;

@@ -97,15 +103,15 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {

   std::vector<int> world_ranks(WorldSize);
   GroupRanks.resize(WorldSize);
-  MyGroup.resize(ShmSize);
   for(int r=0;r<WorldSize;r++) world_ranks[r]=r;

   MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &GroupRanks[0]);

   ///////////////////////////////////////////////////////////////////
   // Identify who is in my group and noninate the leader
   ///////////////////////////////////////////////////////////////////
   int g=0;
+  MyGroup.resize(ShmSize);
   for(int rank=0;rank<WorldSize;rank++){
     if(GroupRanks[rank]!=MPI_UNDEFINED){
       assert(g<ShmSize);
lib/communicator/Communicator_mpi3_leader.cc  (new file, 870 lines)
@@ -0,0 +1,870 @@
/*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid

    Source file: ./lib/communicator/Communicator_mpi.cc

    Copyright (C) 2015

    Author: Peter Boyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory

*************************************************************************************/
/*  END LEGAL */
#include "Grid.h"
#include <mpi.h>

////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// Workarounds:
/// i)  bloody mac os doesn't implement unnamed semaphores since it is "optional" posix.
///     darwin dispatch semaphores don't seem to be multiprocess.
///
/// ii) openmpi under --mca shmem posix works with two squadrons per node;
///     openmpi under default mca settings (I think --mca shmem mmap) on MacOS makes two squadrons map the SAME
///     memory as each other, despite their living on different communicators. This appears to be a bug in OpenMPI.
///
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#include <semaphore.h>
typedef sem_t *Grid_semaphore;

#define SEM_INIT(S)      S = sem_open(sem_name,0,0600,0); assert ( S != SEM_FAILED );
#define SEM_INIT_EXCL(S) sem_unlink(sem_name); S = sem_open(sem_name,O_CREAT|O_EXCL,0600,0); assert ( S != SEM_FAILED );
#define SEM_POST(S) assert ( sem_post(S) == 0 );
#define SEM_WAIT(S) assert ( sem_wait(S) == 0 );

#include <sys/mman.h>

namespace Grid {

enum { COMMAND_ISEND, COMMAND_IRECV, COMMAND_WAITALL };

struct Descriptor {
  uint64_t buf;
  size_t bytes;
  int rank;
  int tag;
  int command;
  MPI_Request request;
};

const int pool = 48;

class SlaveState {
public:
  volatile int head;
  volatile int start;
  volatile int tail;
  volatile Descriptor Descrs[pool];
};

class Slave {
public:
  Grid_semaphore sem_head;
  Grid_semaphore sem_tail;
  SlaveState *state;
  MPI_Comm squadron;
  uint64_t base;
  int universe_rank;
  int vertical_rank;
  char sem_name [NAME_MAX];
  ////////////////////////////////////////////////////////////
  // Descriptor circular pointers
  ////////////////////////////////////////////////////////////
  Slave() {};

  void Init(SlaveState * _state,MPI_Comm _squadron,int _universe_rank,int _vertical_rank);

  void SemInit(void) {
    sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank);
    // printf("SEM_NAME: %s \n",sem_name);
    SEM_INIT(sem_head);
    sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank);
    // printf("SEM_NAME: %s \n",sem_name);
    SEM_INIT(sem_tail);
  }
  void SemInitExcl(void) {
    sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank);
    // printf("SEM_INIT_EXCL: %s \n",sem_name);
    SEM_INIT_EXCL(sem_head);
    sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank);
    // printf("SEM_INIT_EXCL: %s \n",sem_name);
    SEM_INIT_EXCL(sem_tail);
  }
  void WakeUpDMA(void) {
    SEM_POST(sem_head);
  };
  void WakeUpCompute(void) {
    SEM_POST(sem_tail);
  };
  void WaitForCommand(void) {
    SEM_WAIT(sem_head);
  };
  void WaitForComplete(void) {
    SEM_WAIT(sem_tail);
  };
  void EventLoop (void) {
    // std::cout<< " Entering event loop "<<std::endl;
    while(1){
      WaitForCommand();
      // std::cout << "Getting command "<<std::endl;
      Event();
    }
  }

  int Event (void) ;

  uint64_t QueueCommand(int command,void *buf, int bytes, int hashtag, MPI_Comm comm,int u_rank) ;

  void WaitAll() {
    // std::cout << "Queueing WAIT command "<<std::endl;
    QueueCommand(COMMAND_WAITALL,0,0,0,squadron,0);
    // std::cout << "Waking up DMA "<<std::endl;
    WakeUpDMA();
    // std::cout << "Waiting from semaphore "<<std::endl;
    WaitForComplete();
    // std::cout << "Checking FIFO is empty "<<std::endl;
    assert ( state->tail == state->head );
  }
};

////////////////////////////////////////////////////////////////////////
// One instance of a data mover.
// Master and Slave must agree on location in shared memory
////////////////////////////////////////////////////////////////////////

class MPIoffloadEngine {
public:

  static std::vector<Slave> Slaves;

  static int ShmSetup;

  static int UniverseRank;
  static int UniverseSize;

  static MPI_Comm communicator_universe;
  static MPI_Comm communicator_cached;

  static MPI_Comm HorizontalComm;
  static int HorizontalRank;
  static int HorizontalSize;

  static MPI_Comm VerticalComm;
  static MPI_Win  VerticalWindow;
  static int VerticalSize;
  static int VerticalRank;

  static std::vector<void *> VerticalShmBufs;
  static std::vector<std::vector<int> > UniverseRanks;
  static std::vector<int> UserCommunicatorToWorldRanks;

  static MPI_Group WorldGroup, CachedGroup;

  static void CommunicatorInit (MPI_Comm &communicator_world,
                                MPI_Comm &ShmComm,
                                void * &ShmCommBuf);

  static void MapCommRankToWorldRank(int &hashtag, int & comm_world_peer,int tag, MPI_Comm comm,int commrank);

  /////////////////////////////////////////////////////////
  // routines for master proc must handle any communicator
  /////////////////////////////////////////////////////////

  static void QueueSend(int slave,void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
    // std::cout<< " Queueing send "<< bytes<< " slave "<< slave << " to comm "<<rank <<std::endl;
    Slaves[slave].QueueCommand(COMMAND_ISEND,buf,bytes,tag,comm,rank);
    // std::cout << "Queued send command to rank "<< rank<< " via "<<slave <<std::endl;
    Slaves[slave].WakeUpDMA();
    // std::cout << "Waking up DMA "<< slave<<std::endl;
  };

  static void QueueRecv(int slave, void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
    // std::cout<< " Queueing recv "<< bytes<< " slave "<< slave << " from comm "<<rank <<std::endl;
    Slaves[slave].QueueCommand(COMMAND_IRECV,buf,bytes,tag,comm,rank);
    // std::cout << "Queued recv command from rank "<< rank<< " via "<<slave <<std::endl;
    Slaves[slave].WakeUpDMA();
    // std::cout << "Waking up DMA "<< slave<<std::endl;
  };

  static void WaitAll() {
    for(int s=1;s<VerticalSize;s++) {
      // std::cout << "Waiting for slave "<< s<<std::endl;
      Slaves[s].WaitAll();
    }
    // std::cout << " Wait all Complete "<<std::endl;
  };

  static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){
    int basework = nwork/units;
    int backfill = units-(nwork%units);
    if ( me >= units ) {
      mywork = myoff = 0;
    } else {
      mywork = (nwork+me)/units;
      myoff  = basework * me;
      if ( me > backfill )
        myoff+= (me-backfill);
    }
    return;
  };

  static void QueueMultiplexedSend(void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
    uint8_t * cbuf = (uint8_t *) buf;
    int mywork, myoff, procs;
    procs = VerticalSize-1;
    for(int s=0;s<procs;s++) {
      GetWork(bytes,s,mywork,myoff,procs);
      QueueSend(s+1,&cbuf[myoff],mywork,tag,comm,rank);
    }
  };

  static void QueueMultiplexedRecv(void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
    uint8_t * cbuf = (uint8_t *) buf;
    int mywork, myoff, procs;
    procs = VerticalSize-1;
    for(int s=0;s<procs;s++) {
      GetWork(bytes,s,mywork,myoff,procs);
      QueueRecv(s+1,&cbuf[myoff],mywork,tag,comm,rank);
    }
  };

};

///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////

std::vector<Slave> MPIoffloadEngine::Slaves;

int MPIoffloadEngine::UniverseRank;
int MPIoffloadEngine::UniverseSize;

MPI_Comm  MPIoffloadEngine::communicator_universe;
MPI_Comm  MPIoffloadEngine::communicator_cached;
MPI_Group MPIoffloadEngine::WorldGroup;
MPI_Group MPIoffloadEngine::CachedGroup;

MPI_Comm MPIoffloadEngine::HorizontalComm;
int      MPIoffloadEngine::HorizontalRank;
int      MPIoffloadEngine::HorizontalSize;

MPI_Comm MPIoffloadEngine::VerticalComm;
int      MPIoffloadEngine::VerticalSize;
int      MPIoffloadEngine::VerticalRank;
MPI_Win  MPIoffloadEngine::VerticalWindow;
std::vector<void *> MPIoffloadEngine::VerticalShmBufs;
std::vector<std::vector<int> > MPIoffloadEngine::UniverseRanks;
std::vector<int> MPIoffloadEngine::UserCommunicatorToWorldRanks;

int MPIoffloadEngine::ShmSetup = 0;

void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world,
                                         MPI_Comm &ShmComm,
                                         void * &ShmCommBuf)
{
  int flag;
  assert(ShmSetup==0);

  //////////////////////////////////////////////////////////////////////
  // Universe is all nodes prior to squadron grouping
  //////////////////////////////////////////////////////////////////////
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_universe);
  MPI_Comm_rank(communicator_universe,&UniverseRank);
  MPI_Comm_size(communicator_universe,&UniverseSize);

  /////////////////////////////////////////////////////////////////////
  // Split into groups that can share memory (Verticals)
  /////////////////////////////////////////////////////////////////////
#undef MPI_SHARED_MEM_DEBUG
#ifdef MPI_SHARED_MEM_DEBUG
  MPI_Comm_split(communicator_universe,(UniverseRank/4),UniverseRank,&VerticalComm);
#else
  MPI_Comm_split_type(communicator_universe, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&VerticalComm);
#endif
  MPI_Comm_rank(VerticalComm ,&VerticalRank);
  MPI_Comm_size(VerticalComm ,&VerticalSize);

  //////////////////////////////////////////////////////////////////////
  // Split into horizontal groups by rank in squadron
  //////////////////////////////////////////////////////////////////////
  MPI_Comm_split(communicator_universe,VerticalRank,UniverseRank,&HorizontalComm);
  MPI_Comm_rank(HorizontalComm,&HorizontalRank);
  MPI_Comm_size(HorizontalComm,&HorizontalSize);
  assert(HorizontalSize*VerticalSize==UniverseSize);

  ////////////////////////////////////////////////////////////////////////////////
  // What is my place in the world
  ////////////////////////////////////////////////////////////////////////////////
  int WorldRank=0;
  if(VerticalRank==0) WorldRank = HorizontalRank;
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&WorldRank,1,MPI_INT,MPI_SUM,VerticalComm);
  assert(ierr==0);

  ////////////////////////////////////////////////////////////////////////////////
  // Where is the world in the universe?
  ////////////////////////////////////////////////////////////////////////////////
  UniverseRanks = std::vector<std::vector<int> >(HorizontalSize,std::vector<int>(VerticalSize,0));
  UniverseRanks[WorldRank][VerticalRank] = UniverseRank;
  for(int w=0;w<HorizontalSize;w++){
    ierr=MPI_Allreduce(MPI_IN_PLACE,&UniverseRanks[w][0],VerticalSize,MPI_INT,MPI_SUM,communicator_universe);
    assert(ierr==0);
  }

  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  // allocate the shared window for our group, pass back Shm info to CartesianCommunicator
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
  VerticalShmBufs.resize(VerticalSize);

#undef MPI_SHARED_MEM
#ifdef MPI_SHARED_MEM
  ierr = MPI_Win_allocate_shared(CartesianCommunicator::MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,VerticalComm,&ShmCommBuf,&VerticalWindow);
  ierr|= MPI_Win_lock_all (MPI_MODE_NOCHECK, VerticalWindow);
  assert(ierr==0);
  // std::cout<<"SHM "<<ShmCommBuf<<std::endl;

  for(int r=0;r<VerticalSize;r++){
    MPI_Aint sz;
    int dsp_unit;
    MPI_Win_shared_query (VerticalWindow, r, &sz, &dsp_unit, &VerticalShmBufs[r]);
    // std::cout<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
  }
#else
  char shm_name [NAME_MAX];
  MPI_Barrier(VerticalComm);

  if ( VerticalRank == 0 ) {
    for(int r=0;r<VerticalSize;r++){

      size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
      if ( r>0 ) size = sizeof(SlaveState);

      sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldRank,r);

      shm_unlink(shm_name);

      int fd=shm_open(shm_name,O_RDWR|O_CREAT,0600);
      if ( fd < 0 ) {
        perror("failed shm_open");
        assert(0);
      }

      ftruncate(fd, size);

      VerticalShmBufs[r] = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

      if ( VerticalShmBufs[r] == MAP_FAILED ) {
        perror("failed mmap");
        assert(0);
      }

      uint64_t * check = (uint64_t *) VerticalShmBufs[r];
      check[0] = WorldRank;
      check[1] = r;

      // std::cout<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
    }
  }

  MPI_Barrier(VerticalComm);

  if ( VerticalRank != 0 ) {
    for(int r=0;r<VerticalSize;r++){

      size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES ;
      if ( r>0 ) size = sizeof(SlaveState);

      sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldRank,r);

      int fd=shm_open(shm_name,O_RDWR|O_CREAT,0600);
      if ( fd<0 ) {
        perror("failed shm_open");
        assert(0);
      }
      VerticalShmBufs[r] = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

      uint64_t * check = (uint64_t *) VerticalShmBufs[r];
      assert(check[0]== WorldRank);
      assert(check[1]== r);
      std::cerr<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
    }
  }
#endif
  MPI_Barrier(VerticalComm);

  //////////////////////////////////////////////////////////////////////
  // Map rank of leader on node in their in new world, to the
  // rank in this vertical plane's horizontal communicator
  //////////////////////////////////////////////////////////////////////
  communicator_world = HorizontalComm;
  ShmComm            = VerticalComm;
  ShmCommBuf         = VerticalShmBufs[0];
  MPI_Comm_group (communicator_world, &WorldGroup);

  ///////////////////////////////////////////////////////////
  // Start the slave data movers
  ///////////////////////////////////////////////////////////
  if ( VerticalRank != 0 ) {
    Slave indentured;
    indentured.Init( (SlaveState *) VerticalShmBufs[VerticalRank], VerticalComm, UniverseRank,VerticalRank);
    indentured.SemInitExcl();// init semaphore in shared memory
    MPI_Barrier(VerticalComm);
    MPI_Barrier(VerticalComm);
    indentured.EventLoop();
    assert(0);
  } else {
    Slaves.resize(VerticalSize);
    for(int i=1;i<VerticalSize;i++){
      Slaves[i].Init((SlaveState *)VerticalShmBufs[i],VerticalComm, UniverseRanks[HorizontalRank][i],i);
    }
    MPI_Barrier(VerticalComm);
    for(int i=1;i<VerticalSize;i++){
      Slaves[i].SemInit();// init semaphore in shared memory
    }
    MPI_Barrier(VerticalComm);
  }

  ///////////////////////////////////////////////////////////
  // Verbose for now
  ///////////////////////////////////////////////////////////

  ShmSetup=1;

  if (UniverseRank == 0){

    std::cout<<GridLogMessage << "Grid MPI-3 configuration: detected ";
    std::cout<<UniverseSize << " Ranks " ;
    std::cout<<HorizontalSize << " Nodes " ;
    std::cout<<VerticalSize << " with ranks-per-node "<<std::endl;

    std::cout<<GridLogMessage << "Grid MPI-3 configuration: using one lead process per node " << std::endl;
    std::cout<<GridLogMessage << "Grid MPI-3 configuration: reduced communicator has size " << HorizontalSize << std::endl;

    for(int g=0;g<HorizontalSize;g++){
      std::cout<<GridLogMessage<<" Node "<<g<<" led by MPI rank "<< UniverseRanks[g][0]<<std::endl;
    }

    for(int g=0;g<HorizontalSize;g++){
      std::cout<<GridLogMessage<<" { ";
      for(int s=0;s<VerticalSize;s++){
        std::cout<< UniverseRanks[g][s];
        if ( s<VerticalSize-1 ) {
          std::cout<<",";
        }
      }
      std::cout<<" } "<<std::endl;
    }
  }
};

///////////////////////////////////////////////////////////////////////////////////////////////
// Map the communicator into communicator_world, and find the neighbour.
// Cache the mappings; cache size is 1.
///////////////////////////////////////////////////////////////////////////////////////////////
void MPIoffloadEngine::MapCommRankToWorldRank(int &hashtag, int & comm_world_peer,int tag, MPI_Comm comm,int rank) {

  if ( comm == HorizontalComm ) {
    comm_world_peer = rank;
    // std::cout << " MapCommRankToWorldRank horiz " <<rank<<"->"<<comm_world_peer<<std::endl;
  } else if ( comm == communicator_cached ) {
    comm_world_peer = UserCommunicatorToWorldRanks[rank];
    // std::cout << " MapCommRankToWorldRank cached " <<rank<<"->"<<comm_world_peer<<std::endl;
  } else {

    int size;

    MPI_Comm_size(comm,&size);

    UserCommunicatorToWorldRanks.resize(size);

    std::vector<int> cached_ranks(size);

    for(int r=0;r<size;r++) {
      cached_ranks[r]=r;
    }

    communicator_cached=comm;

    MPI_Comm_group(communicator_cached, &CachedGroup);

    MPI_Group_translate_ranks(CachedGroup,size,&cached_ranks[0],WorldGroup, &UserCommunicatorToWorldRanks[0]);

    comm_world_peer = UserCommunicatorToWorldRanks[rank];
    // std::cout << " MapCommRankToWorldRank cache miss " <<rank<<"->"<<comm_world_peer<<std::endl;

    assert(comm_world_peer != MPI_UNDEFINED);
  }

  assert( (tag & (~0xFFFFL)) ==0);

  uint64_t icomm = (uint64_t)comm;
  int comm_hash = ((icomm>>0 )&0xFFFF)^((icomm>>16)&0xFFFF)
                ^ ((icomm>>32)&0xFFFF)^((icomm>>48)&0xFFFF);

  // hashtag = (comm_hash<<15) | tag;
  hashtag = tag;

};

void Slave::Init(SlaveState * _state,MPI_Comm _squadron,int _universe_rank,int _vertical_rank)
{
  squadron=_squadron;
  universe_rank=_universe_rank;
  vertical_rank=_vertical_rank;
  state   =_state;
  // std::cout << "state "<<_state<<" comm "<<_squadron<<" universe_rank"<<universe_rank <<std::endl;
  state->head = state->tail = state->start = 0;
  base = (uint64_t)MPIoffloadEngine::VerticalShmBufs[0];
  int rank; MPI_Comm_rank(_squadron,&rank);
}
#define PERI_PLUS(A) ( (A+1)%pool )
int Slave::Event (void) {

  static int tail_last;
  static int head_last;
  static int start_last;
  int ierr;

  ////////////////////////////////////////////////////
  // Try to advance the start pointers
  ////////////////////////////////////////////////////
  int s=state->start;
  if ( s != state->head ) {
    switch ( state->Descrs[s].command ) {
    case COMMAND_ISEND:
      /*
      std::cout<< " Send "<<s << " ptr "<< state<<" "<< state->Descrs[s].buf<< "["<<state->Descrs[s].bytes<<"]"
               << " to " << state->Descrs[s].rank<< " tag" << state->Descrs[s].tag
               << " Comm " << MPIoffloadEngine::communicator_universe<< " me " <<universe_rank<< std::endl;
      */
      ierr = MPI_Isend((void *)(state->Descrs[s].buf+base),
                       state->Descrs[s].bytes,
                       MPI_CHAR,
                       state->Descrs[s].rank,
                       state->Descrs[s].tag,
                       MPIoffloadEngine::communicator_universe,
                       (MPI_Request *)&state->Descrs[s].request);
      assert(ierr==0);
      state->start = PERI_PLUS(s);
      return 1;
      break;

    case COMMAND_IRECV:
      /*
      std::cout<< " Recv "<<s << " ptr "<< state<<" "<< state->Descrs[s].buf<< "["<<state->Descrs[s].bytes<<"]"
               << " from " << state->Descrs[s].rank<< " tag" << state->Descrs[s].tag
               << " Comm " << MPIoffloadEngine::communicator_universe<< " me "<< universe_rank<< std::endl;
      */
      ierr=MPI_Irecv((void *)(state->Descrs[s].buf+base),
                     state->Descrs[s].bytes,
                     MPI_CHAR,
                     state->Descrs[s].rank,
                     state->Descrs[s].tag,
                     MPIoffloadEngine::communicator_universe,
                     (MPI_Request *)&state->Descrs[s].request);

      // std::cout<< " Request is "<<state->Descrs[s].request<<std::endl;
      // std::cout<< " Request0 is "<<state->Descrs[0].request<<std::endl;
      assert(ierr==0);
      state->start = PERI_PLUS(s);
      return 1;
      break;

    case COMMAND_WAITALL:

      for(int t=state->tail;t!=s; t=PERI_PLUS(t) ){
        MPI_Wait((MPI_Request *)&state->Descrs[t].request,MPI_STATUS_IGNORE);
      };
      s=PERI_PLUS(s);
      state->start = s;
      state->tail  = s;

      WakeUpCompute();

      return 1;
      break;

    default:
      assert(0);
      break;
    }
  }
  return 0;
}
//////////////////////////////////////////////////////////////////////////////
// External interaction with the queue
//////////////////////////////////////////////////////////////////////////////

uint64_t Slave::QueueCommand(int command,void *buf, int bytes, int tag, MPI_Comm comm,int commrank)
{
  /////////////////////////////////////////
  // Spin; if FIFO is full until not full
  /////////////////////////////////////////
  int head =state->head;
  int next = PERI_PLUS(head);

  // Set up descriptor
  int worldrank;
  int hashtag;
  MPI_Comm    communicator;
  MPI_Request request;

  MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,tag,comm,commrank);

  uint64_t relative= (uint64_t)buf - base;
  state->Descrs[head].buf    = relative;
  state->Descrs[head].bytes  = bytes;
  state->Descrs[head].rank   = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank];
  state->Descrs[head].tag    = hashtag;
  state->Descrs[head].command= command;

  /*
  if ( command == COMMAND_ISEND ) {
    std::cout << "QueueSend from "<< universe_rank <<" to commrank " << commrank
              << " to worldrank " << worldrank <<std::endl;
    std::cout << " via VerticalRank "<< vertical_rank <<" to universerank " << MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank]<<std::endl;
    std::cout << " QueueCommand "<<buf<<"["<<bytes<<"]" << std::endl;
  }
  if ( command == COMMAND_IRECV ) {
    std::cout << "QueueRecv on "<< universe_rank <<" from commrank " << commrank
              << " from worldrank " << worldrank <<std::endl;
    std::cout << " via VerticalRank "<< vertical_rank <<" from universerank " << MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank]<<std::endl;
    std::cout << " QueueSend "<<buf<<"["<<bytes<<"]" << std::endl;
  }
  */
  // Block until FIFO has space
  while( state->tail==next );

  // Msync on weak order architectures
  // Advance pointer
  state->head = next;

  return 0;
}


///////////////////////////////////////////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////////////////////////////////////////

MPI_Comm CartesianCommunicator::communicator_world;

void CartesianCommunicator::Init(int *argc, char ***argv)
{
  int flag;
  MPI_Initialized(&flag); // needed to coexist with other libs apparently
  if ( !flag ) {
    MPI_Init(argc,argv);
  }
  communicator_world = MPI_COMM_WORLD;
  MPI_Comm ShmComm;
  MPIoffloadEngine::CommunicatorInit (communicator_world,ShmComm,ShmCommBuf);
}
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
  int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
  assert(ierr==0);
}
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
{
  int rank;
  int ierr=MPI_Cart_rank  (communicator, &coor[0], &rank);
  assert(ierr==0);
  return rank;
}
void CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
{
  coor.resize(_ndimension);
  int ierr=MPI_Cart_coords  (communicator, rank, _ndimension,&coor[0]);
  assert(ierr==0);
}

CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
{
  _ndimension = processors.size();
  std::vector<int> periodic(_ndimension,1);

  _Nprocessors=1;
  _processors = processors;

  for(int i=0;i<_ndimension;i++){
    _Nprocessors*=_processors[i];
  }

  int Size;
  MPI_Comm_size(communicator_world,&Size);
  assert(Size==_Nprocessors);

  _processor_coor.resize(_ndimension);
  MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator);
  MPI_Comm_rank  (communicator,&_processor);
  MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
};

void CartesianCommunicator::GlobalSum(uint32_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(uint64_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(float &f){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(float *f,int N)
{
  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSum(double &d)
{
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
}
void CartesianCommunicator::GlobalSumVector(double *d,int N)
{
  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
}

// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
                                           int dest,
                                           void *recv,
                                           int from,
                                           int bytes)
{
  std::vector<CommsRequest_t> reqs(0);
  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
  SendToRecvFromComplete(reqs);
}

void CartesianCommunicator::SendRecvPacket(void *xmit,
                                           void *recv,
                                           int sender,
                                           int receiver,
                                           int bytes)
{
  MPI_Status stat;
  assert(sender != receiver);
  int tag = sender;
  if ( _processor == sender ) {
    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
  }
  if ( _processor == receiver ) {
    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
  }
}

// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                void *xmit,
                                                int dest,
                                                void *recv,
                                                int from,
                                                int bytes)
{
  MPI_Request xrq;
  MPI_Request rrq;
  int rank = _processor;
  int ierr;
  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
  ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);

  assert(ierr==0);

  list.push_back(xrq);
  list.push_back(rrq);
}

void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
                                                       void *xmit,
                                                       int dest,
                                                       void *recv,
                                                       int from,
                                                       int bytes)
{
  uint64_t xmit_i = (uint64_t) xmit;
  uint64_t recv_i = (uint64_t) recv;
  uint64_t shm    = (uint64_t) ShmCommBuf;
  // assert xmit and recv lie in shared memory region
  assert( (xmit_i >= shm) && (xmit_i+bytes <= shm+MAX_MPI_SHM_BYTES) );
  assert( (recv_i >= shm) && (recv_i+bytes <= shm+MAX_MPI_SHM_BYTES) );
  assert(from!=_processor);
  assert(dest!=_processor);
  MPIoffloadEngine::QueueMultiplexedSend(xmit,bytes,_processor,communicator,dest);
  MPIoffloadEngine::QueueMultiplexedRecv(recv,bytes,from,communicator,from);
}


void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
  MPIoffloadEngine::WaitAll();
}

void CartesianCommunicator::StencilBarrier(void)
{
}

void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
{
  int nreq=list.size();
  std::vector<MPI_Status> status(nreq);
  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
  assert(ierr==0);
}

void CartesianCommunicator::Barrier(void)
{
  int ierr = MPI_Barrier(communicator);
  assert(ierr==0);
}

void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
  int ierr=MPI_Bcast(data,
                     bytes,
                     MPI_BYTE,
                     root,
                     communicator);
  assert(ierr==0);
}

void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
  int ierr= MPI_Bcast(data,
                      bytes,
                      MPI_BYTE,
                      root,
                      communicator_world);
  assert(ierr==0);
}

void *CartesianCommunicator::ShmBufferSelf(void) { return ShmCommBuf; }

void *CartesianCommunicator::ShmBuffer(int rank) {
  return NULL;
}
void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
  return NULL;
}

};
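In the new file above, StencilSendToRecvFromBegin stripes each halo message over the node's data-mover slaves via GetWork. A standalone illustration of that split (same arithmetic as GetWork above; the driver, function name and sample sizes here are ours, purely for illustration):

// Hypothetical driver reproducing the GetWork split: stripe nwork bytes over `units` slaves.
#include <cstdio>

static void GetWorkSketch(int nwork, int me, int &mywork, int &myoff, int units) {
  int basework = nwork/units;
  int backfill = units-(nwork%units);
  if ( me >= units ) { mywork = myoff = 0; }
  else {
    mywork = (nwork+me)/units;        // later slaves absorb the remainder bytes
    myoff  = basework*me;
    if ( me > backfill ) myoff += (me-backfill);
  }
}

int main(void) {
  int mywork, myoff;
  for (int s=0; s<3; s++) {           // e.g. 100 bytes split over 3 data-mover slaves
    GetWorkSketch(100, s, mywork, myoff, 3);
    printf("slave %d: %d bytes at offset %d\n", s, mywork, myoff);
  }
  return 0;
}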
|
|
@ -34,13 +34,6 @@ namespace Grid {
|
|||||||
|
|
||||||
void CartesianCommunicator::Init(int *argc, char *** arv)
|
void CartesianCommunicator::Init(int *argc, char *** arv)
|
||||||
{
|
{
|
||||||
WorldRank = 0;
|
|
||||||
WorldSize = 1;
|
|
||||||
ShmRank=0;
|
|
||||||
ShmSize=1;
|
|
||||||
GroupRank=WorldRank;
|
|
||||||
GroupSize=WorldSize;
|
|
||||||
Slave =0;
|
|
||||||
ShmInitGeneric();
|
ShmInitGeneric();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -99,6 +92,7 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
|
|||||||
assert(0);
|
assert(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int CartesianCommunicator::RankWorld(void){return 0;}
|
||||||
void CartesianCommunicator::Barrier(void){}
|
void CartesianCommunicator::Barrier(void){}
|
||||||
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
|
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
|
||||||
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
|
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
|
||||||
|
@ -50,11 +50,16 @@ typedef struct HandShake_t {
|
|||||||
uint64_t seq_remote;
|
uint64_t seq_remote;
|
||||||
} HandShake;
|
} HandShake;
|
||||||
|
|
||||||
|
std::array<long,_SHMEM_REDUCE_SYNC_SIZE> make_psync_init(void) {
|
||||||
|
array<long,_SHMEM_REDUCE_SYNC_SIZE> ret;
|
||||||
|
ret.fill(SHMEM_SYNC_VALUE);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync_init = make_psync_init();
|
||||||
|
|
||||||
static Vector< HandShake > XConnections;
|
static Vector< HandShake > XConnections;
|
||||||
static Vector< HandShake > RConnections;
|
static Vector< HandShake > RConnections;
|
||||||
|
|
||||||
|
|
||||||
void CartesianCommunicator::Init(int *argc, char ***argv) {
|
void CartesianCommunicator::Init(int *argc, char ***argv) {
|
||||||
shmem_init();
|
shmem_init();
|
||||||
XConnections.resize(shmem_n_pes());
|
XConnections.resize(shmem_n_pes());
|
||||||
@ -65,13 +70,6 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
    RConnections[pe].seq_local = 0;
    RConnections[pe].seq_remote= 0;
  }
- WorldSize = shmem_n_pes();
- WorldRank = shmem_my_pe();
- ShmRank=0;
- ShmSize=1;
- GroupRank=WorldRank;
- GroupSize=WorldSize;
- Slave =0;
  shmem_barrier_all();
  ShmInitGeneric();
}

@ -103,7 +101,7 @@ void CartesianCommunicator::GlobalSum(uint32_t &u){
  static long long source ;
  static long long dest ;
  static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
- static long psync[_SHMEM_REDUCE_SYNC_SIZE];
+ static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;

  // int nreduce=1;
  // int pestart=0;

@ -119,7 +117,7 @@ void CartesianCommunicator::GlobalSum(uint64_t &u){
  static long long source ;
  static long long dest ;
  static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
- static long psync[_SHMEM_REDUCE_SYNC_SIZE];
+ static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;

  // int nreduce=1;
  // int pestart=0;

@ -135,7 +133,7 @@ void CartesianCommunicator::GlobalSum(float &f){
  static float source ;
  static float dest ;
  static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
- static long psync[_SHMEM_REDUCE_SYNC_SIZE];
+ static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;

  source = f;
  dest =0.0;

@ -147,7 +145,7 @@ void CartesianCommunicator::GlobalSumVector(float *f,int N)
  static float source ;
  static float dest = 0 ;
  static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
- static long psync[_SHMEM_REDUCE_SYNC_SIZE];
+ static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;

  if ( shmem_addr_accessible(f,_processor) ){
    shmem_float_sum_to_all(f,f,N,0,0,_Nprocessors,llwrk,psync);

@ -166,7 +164,7 @@ void CartesianCommunicator::GlobalSum(double &d)
  static double source;
  static double dest ;
  static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
- static long psync[_SHMEM_REDUCE_SYNC_SIZE];
+ static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;

  source = d;
  dest = 0;

@ -178,7 +176,8 @@ void CartesianCommunicator::GlobalSumVector(double *d,int N)
  static double source ;
  static double dest ;
  static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
- static long psync[_SHMEM_REDUCE_SYNC_SIZE];
+ static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;

  if ( shmem_addr_accessible(d,_processor) ){
    shmem_double_sum_to_all(d,d,N,0,0,_Nprocessors,llwrk,psync);

@ -295,7 +294,7 @@ void CartesianCommunicator::Barrier(void)
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
- static long psync[_SHMEM_REDUCE_SYNC_SIZE];
+ static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
  static uint32_t word;
  uint32_t *array = (uint32_t *) data;
  assert( (bytes % 4)==0);

@ -318,7 +317,7 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
- static long psync[_SHMEM_REDUCE_SYNC_SIZE];
+ static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync = psync_init;
  static uint32_t word;
  uint32_t *array = (uint32_t *) data;
  assert( (bytes % 4)==0);
@ -32,8 +32,7 @@ directory
namespace Grid {
namespace QCD {

- int WilsonKernelsStatic::HandOpt;
- int WilsonKernelsStatic::AsmOpt;
+ int WilsonKernelsStatic::Opt;

template <class Impl>
WilsonKernels<Impl>::WilsonKernels(const ImplParams &p) : Base(p){};

@ -40,9 +40,9 @@ namespace QCD {
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
class WilsonKernelsStatic {
 public:
+ enum { OptGeneric, OptHandUnroll, OptInlineAsm };
  // S-direction is INNERMOST and takes no part in the parity.
- static int AsmOpt; // these are a temporary hack
- static int HandOpt; // these are a temporary hack
+ static int Opt; // these are a temporary hack
};

template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic {
@ -56,24 +56,40 @@ public:
  template <bool EnableBool = true>
  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
  DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
-                  int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
+                  int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out)
+ {
+   switch(Opt) {
#ifdef AVX512
-   if (AsmOpt) {
-     WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
-   } else {
- #else
-   {
- #endif
+   case OptInlineAsm:
    for (int site = 0; site < Ns; site++) {
      for (int s = 0; s < Ls; s++) {
-       if (HandOpt)
-         WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out);
-       else
-         WilsonKernels<Impl>::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out);
+       WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
        sF++;
      }
      sU++;
    }
+   break;
+ #endif
+   case OptHandUnroll:
+     for (int site = 0; site < Ns; site++) {
+       for (int s = 0; s < Ls; s++) {
+         WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out);
+         sF++;
+       }
+       sU++;
+     }
+     break;
+   case OptGeneric:
+     for (int site = 0; site < Ns; site++) {
+       for (int s = 0; s < Ls; s++) {
+         WilsonKernels<Impl>::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out);
+         sF++;
+       }
+       sU++;
+     }
+     break;
+   default:
+     assert(0);
    }
  }

@ -81,7 +97,7 @@ public:
  typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
  DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
                   int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
+   // no kernel choice
    for (int site = 0; site < Ns; site++) {
      for (int s = 0; s < Ls; s++) {
        WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in, out);

@ -95,23 +111,39 @@ public:
  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,void>::type
  DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
                      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {

+   switch(Opt) {
#ifdef AVX512
-   if (AsmOpt) {
-     WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
-   } else {
- #else
-   {
- #endif
+   case OptInlineAsm:
    for (int site = 0; site < Ns; site++) {
      for (int s = 0; s < Ls; s++) {
-       if (HandOpt)
-         WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
-       else
-         WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
+       WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
        sF++;
      }
      sU++;
    }
+   break;
+ #endif
+   case OptHandUnroll:
+     for (int site = 0; site < Ns; site++) {
+       for (int s = 0; s < Ls; s++) {
+         WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
+         sF++;
+       }
+       sU++;
+     }
+     break;
+   case OptGeneric:
+     for (int site = 0; site < Ns; site++) {
+       for (int s = 0; s < Ls; s++) {
+         WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
+         sF++;
+       }
+       sU++;
+     }
+     break;
+   default:
+     assert(0);
    }
  }

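The two kernel hunks above replace the `AsmOpt`/`HandOpt` boolean pair with a single `Opt` selector dispatched through a `switch` over `OptGeneric`, `OptHandUnroll` and `OptInlineAsm`. A stripped-down sketch of the same dispatch pattern; the kernel bodies and names below are hypothetical stand-ins, not the Grid kernels:

// Sketch of the "enum selector + switch" dispatch replacing two boolean flags.
#include <cassert>
#include <cstdio>

// Stand-in for WilsonKernelsStatic: one selector instead of AsmOpt/HandOpt.
struct KernelsStatic {
  enum { OptGeneric, OptHandUnroll, OptInlineAsm };
  static int Opt;
};
int KernelsStatic::Opt = KernelsStatic::OptGeneric;

// Hypothetical kernels standing in for DiracOptGeneric/Hand/Asm DhopSite.
static void GenericKernel(int site) { std::printf("generic %d\n", site); }
static void HandKernel(int site)    { std::printf("hand    %d\n", site); }
static void AsmKernel(int site)     { std::printf("asm     %d\n", site); }

void DhopSite(int Ns) {
  switch (KernelsStatic::Opt) {
  case KernelsStatic::OptInlineAsm:
    for (int site = 0; site < Ns; site++) AsmKernel(site);
    break;
  case KernelsStatic::OptHandUnroll:
    for (int site = 0; site < Ns; site++) HandKernel(site);
    break;
  case KernelsStatic::OptGeneric:
    for (int site = 0; site < Ns; site++) GenericKernel(site);
    break;
  default:
    assert(0); // unknown kernel selection
  }
}

int main() {
  KernelsStatic::Opt = KernelsStatic::OptHandUnroll; // pick the hand-unrolled path
  DhopSite(4);
  return 0;
}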
@ -172,6 +172,8 @@ namespace QCD{
    pokeLorentz(aTilde, r, mu);
  }
  fft.FFT_all_dim(out, aTilde, FFT::backward);

+ out = real(out);
}
// template<class Gimpl>
// void Photon<Gimpl>::FeynmanGaugeMomentumSpacePropagator_L(GaugeField &out,
@ -32,7 +32,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
namespace Grid {

int LebesgueOrder::UseLebesgueOrder;
- std::vector<int> LebesgueOrder::Block({2,2,2,2});
+ std::vector<int> LebesgueOrder::Block({8,2,2,2});

LebesgueOrder::IndexInteger LebesgueOrder::alignup(IndexInteger n){
  n--; // 1000 0011 --> 1000 0010
@ -5,7 +5,7 @@

BEGIN_QEDFVOL_NAMESPACE

- template <class Gimpl> class WilsonLoops : public Gimpl {
+ template <class Gimpl> class NewWilsonLoops : public Gimpl {
public:
  INHERIT_GIMPL_TYPES(Gimpl);

@ -45,7 +45,7 @@ public:
                            const std::vector<GaugeMat> &U) {
    LatticeComplex sitePlaq(U[0]._grid);
    Plaq = zero;
-   for (int mu = 1; mu < Nd; mu++) {
+   for (int mu = 1; mu < U[0]._grid->_ndimension; mu++) {
      for (int nu = 0; nu < mu; nu++) {
        traceDirPlaquette(sitePlaq, U, mu, nu);
        Plaq = Plaq + sitePlaq;
@ -55,10 +55,10 @@ public:
  //////////////////////////////////////////////////
  // sum over all x,y,z,t and over all planes of plaquette
  //////////////////////////////////////////////////
- static RealD sumPlaquette(const GaugeLorentz &Umu) {
+ static Real sumPlaquette(const GaugeLorentz &Umu) {
    std::vector<GaugeMat> U(4, Umu._grid);

-   for (int mu = 0; mu < Nd; mu++) {
+   for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }

@ -73,11 +73,12 @@ public:
  //////////////////////////////////////////////////
  // average over all x,y,z,t and over all planes of plaquette
  //////////////////////////////////////////////////
- static RealD avgPlaquette(const GaugeLorentz &Umu) {
-   RealD sumplaq = sumPlaquette(Umu);
-   double vol = Umu._grid->gSites();
-   double faces = (1.0 * Nd * (Nd - 1)) / 2.0;
-   return sumplaq / vol / faces / Nc; // Nd , Nc dependent... FIXME
+ static Real avgPlaquette(const GaugeLorentz &Umu) {
+   int ndim = Umu._grid->_ndimension;
+   Real sumplaq = sumPlaquette(Umu);
+   Real vol = Umu._grid->gSites();
+   Real faces = (1.0 * ndim * (ndim - 1)) / 2.0;
+   return sumplaq / vol / faces / Nc; // Nc dependent... FIXME
  }

  //////////////////////////////////////////////////
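Spelled out, the normalisation used by `avgPlaquette` above is, schematically,

\langle P \rangle \;=\; \frac{1}{V \, N_{\mathrm{faces}} \, N_c} \sum_{x} \sum_{\mu<\nu} \mathrm{tr}\, P_{\mu\nu}(x),
\qquad
N_{\mathrm{faces}} \;=\; \frac{n_{\mathrm{dim}}\,(n_{\mathrm{dim}}-1)}{2},

so for n_dim = 4 there are 6 plaquette planes per site; the remaining division by N_c is what the "Nc dependent... FIXME" comment refers to.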
@ -112,18 +113,53 @@ public:
                               const int Rmu, const int Rnu,
                               const int mu, const int nu) {
    GaugeMat sp(U[0]._grid);
-   WilsonLoop(sp, U, Rmu, Rnu, mu, nu);
+   wilsonLoop(sp, U, Rmu, Rnu, mu, nu);
    wl = trace(sp);
  }
  //////////////////////////////////////////////////
  // sum over all planes of Wilson loop
  //////////////////////////////////////////////////
  static void siteWilsonLoop(LatticeComplex &Wl,
-                            const std::vector<GaugeMat> &U
+                            const std::vector<GaugeMat> &U,
                             const int R1, const int R2) {
    LatticeComplex siteWl(U[0]._grid);
    Wl = zero;
-   for (int mu = 1; mu < Nd; mu++) {
+   for (int mu = 1; mu < U[0]._grid->_ndimension; mu++) {
+     for (int nu = 0; nu < mu; nu++) {
+       traceWilsonLoop(siteWl, U, R1, R2, mu, nu);
+       Wl = Wl + siteWl;
+       traceWilsonLoop(siteWl, U, R2, R1, mu, nu);
+       Wl = Wl + siteWl;
+     }
+   }
+ }
+ //////////////////////////////////////////////////
+ // sum over planes of Wilson loop with length R1
+ // in the time direction
+ //////////////////////////////////////////////////
+ static void siteTimelikeWilsonLoop(LatticeComplex &Wl,
+                                    const std::vector<GaugeMat> &U,
+                                    const int R1, const int R2) {
+   LatticeComplex siteWl(U[0]._grid);
+
+   int ndim = U[0]._grid->_ndimension;
+
+   Wl = zero;
+   for (int nu = 0; nu < ndim - 1; nu++) {
+     traceWilsonLoop(siteWl, U, R1, R2, ndim-1, nu);
+     Wl = Wl + siteWl;
+   }
+ }
+ //////////////////////////////////////////////////
+ // sum Wilson loop over all planes orthogonal to the time direction
+ //////////////////////////////////////////////////
+ static void siteSpatialWilsonLoop(LatticeComplex &Wl,
+                                   const std::vector<GaugeMat> &U,
+                                   const int R1, const int R2) {
+   LatticeComplex siteWl(U[0]._grid);
+
+   Wl = zero;
+   for (int mu = 1; mu < U[0]._grid->_ndimension - 1; mu++) {
      for (int nu = 0; nu < mu; nu++) {
        traceWilsonLoop(siteWl, U, R1, R2, mu, nu);
        Wl = Wl + siteWl;
@ -135,11 +171,11 @@ public:
  //////////////////////////////////////////////////
  // sum over all x,y,z,t and over all planes of Wilson loop
  //////////////////////////////////////////////////
- static RealD sumWilsonLoop(const GaugeLorentz &Umu,
+ static Real sumWilsonLoop(const GaugeLorentz &Umu,
                             const int R1, const int R2) {
    std::vector<GaugeMat> U(4, Umu._grid);

-   for (int mu = 0; mu < Nd; mu++) {
+   for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
      U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
    }

@ -152,15 +188,77 @@ public:
    return p.real();
  }
  //////////////////////////////////////////////////
+ // sum over all x,y,z,t and over all planes of timelike Wilson loop
+ //////////////////////////////////////////////////
+ static Real sumTimelikeWilsonLoop(const GaugeLorentz &Umu,
+                                   const int R1, const int R2) {
+   std::vector<GaugeMat> U(4, Umu._grid);
+
+   for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
+     U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
+   }
+
+   LatticeComplex Wl(Umu._grid);
+
+   siteTimelikeWilsonLoop(Wl, U, R1, R2);
+
+   TComplex Tp = sum(Wl);
+   Complex p = TensorRemove(Tp);
+   return p.real();
+ }
+ //////////////////////////////////////////////////
+ // sum over all x,y,z,t and over all planes of spatial Wilson loop
+ //////////////////////////////////////////////////
+ static Real sumSpatialWilsonLoop(const GaugeLorentz &Umu,
+                                  const int R1, const int R2) {
+   std::vector<GaugeMat> U(4, Umu._grid);
+
+   for (int mu = 0; mu < Umu._grid->_ndimension; mu++) {
+     U[mu] = PeekIndex<LorentzIndex>(Umu, mu);
+   }
+
+   LatticeComplex Wl(Umu._grid);
+
+   siteSpatialWilsonLoop(Wl, U, R1, R2);
+
+   TComplex Tp = sum(Wl);
+   Complex p = TensorRemove(Tp);
+   return p.real();
+ }
+ //////////////////////////////////////////////////
  // average over all x,y,z,t and over all planes of Wilson loop
  //////////////////////////////////////////////////
- static RealD avgPlaquette(const GaugeLorentz &Umu,
+ static Real avgWilsonLoop(const GaugeLorentz &Umu,
                            const int R1, const int R2) {
-   RealD sumWl = sumWilsonLoop(Umu);
-   double vol = Umu._grid->gSites();
-   double faces = 1.0 * Nd * (Nd - 1);
-   return sumWl / vol / faces / Nc; // Nd , Nc dependent... FIXME
+   int ndim = Umu._grid->_ndimension;
+   Real sumWl = sumWilsonLoop(Umu, R1, R2);
+   Real vol = Umu._grid->gSites();
+   Real faces = 1.0 * ndim * (ndim - 1);
+   return sumWl / vol / faces / Nc; // Nc dependent... FIXME
  }
+ //////////////////////////////////////////////////
+ // average over all x,y,z,t and over all planes of timelike Wilson loop
+ //////////////////////////////////////////////////
+ static Real avgTimelikeWilsonLoop(const GaugeLorentz &Umu,
+                                   const int R1, const int R2) {
+   int ndim = Umu._grid->_ndimension;
+   Real sumWl = sumTimelikeWilsonLoop(Umu, R1, R2);
+   Real vol = Umu._grid->gSites();
+   Real faces = 1.0 * (ndim - 1);
+   return sumWl / vol / faces / Nc; // Nc dependent... FIXME
+ }
+ //////////////////////////////////////////////////
+ // average over all x,y,z,t and over all planes of spatial Wilson loop
+ //////////////////////////////////////////////////
+ static Real avgSpatialWilsonLoop(const GaugeLorentz &Umu,
+                                  const int R1, const int R2) {
+   int ndim = Umu._grid->_ndimension;
+   Real sumWl = sumSpatialWilsonLoop(Umu, R1, R2);
+   Real vol = Umu._grid->gSites();
+   Real faces = 1.0 * (ndim - 1) * (ndim - 2);
+   return sumWl / vol / faces / Nc; // Nc dependent... FIXME
+ }
+ };

END_QEDFVOL_NAMESPACE

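Reading the three averages above together, each one divides the summed trace by the volume, by N_c, and by a plane count:

\langle W(R_1,R_2) \rangle \;=\; \frac{1}{V \, N_{\mathrm{faces}} \, N_c} \sum_{x} \sum_{\text{planes}} \mathrm{tr}\, W_{\mu\nu}(x;R_1,R_2),
\qquad
N_{\mathrm{faces}} \;=\; n_d(n_d-1),\;\; (n_d-1),\;\; (n_d-1)(n_d-2)

for the all-plane, timelike and spatial versions respectively. For the all-plane case the factor n_d(n_d-1) matches the fact that `siteWilsonLoop` visits every mu<nu plane twice, once as an R1 x R2 loop and once as an R2 x R1 loop; the division by N_c is again flagged as a FIXME in the code.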
@ -1,4 +1,5 @@
#include <Global.hpp>
+ #include <WilsonLoops.h>

using namespace Grid;
using namespace QCD;

@ -24,10 +25,11 @@ public:
  typedef Lattice<SiteGaugeField> GaugeField;
};

typedef QedGimpl<vComplex> QedGimplR;
+ typedef PeriodicGaugeImpl<QedGimplR> QedPeriodicGimplR;
typedef Photon<QedGimplR> PhotonR;
typedef PhotonR::GaugeField EmField;
typedef PhotonR::GaugeLinkField EmComp;

int main(int argc, char *argv[])
{
@ -60,55 +62,31 @@ int main(int argc, char *argv[])
  PhotonR photon(PhotonR::Gauge::Feynman,
                 PhotonR::ZmScheme::QedL);
  EmField a(&grid);
+ EmField expA(&grid);
+
+ Real wlA, logWlA;

  pRNG.SeedRandomDevice();
  photon.StochasticField(a, pRNG);

- // Calculate log of plaquette
- EmComp plaqA(&grid);
- EmComp wlA(&grid);
- EmComp tmp(&grid);
- std::vector<EmComp> a_comp(4, &grid);
+ // Exponentiate photon field
+ Complex imag_unit(0, 1);
+ expA = exp(imag_unit*a);

- for (int dir = 0; dir < Nd; dir++) {
-   a_comp[dir] = PeekIndex<LorentzIndex>(a, dir);
+ // Calculate Wilson loops
+ for(int i=1; i<=10; i++){
+   LOG(Message) << i << 'x' << i << " Wilson loop" << std::endl;
+   wlA = NewWilsonLoops<QedPeriodicGimplR>::avgWilsonLoop(expA, i, i) * 3;
+   logWlA = -2*log(wlA);
+   LOG(Message) << "-2log(W) average: " << logWlA << std::endl;
+   wlA = NewWilsonLoops<QedPeriodicGimplR>::avgTimelikeWilsonLoop(expA, i, i) * 3;
+   logWlA = -2*log(wlA);
+   LOG(Message) << "-2log(W) timelike: " << logWlA << std::endl;
+   wlA = NewWilsonLoops<QedPeriodicGimplR>::avgSpatialWilsonLoop(expA, i, i) * 3;
+   logWlA = -2*log(wlA);
+   LOG(Message) << "-2log(W) spatial: " << logWlA << std::endl;
  }

- plaqA = zero;
- wlA = zero;
-
- for(int mu = 1; mu < Nd; mu++) {
-   for(int nu = 0; nu < mu; nu++) {
-     tmp = a_comp[mu] + Cshift(a_comp[nu], mu, 1) - Cshift(a_comp[mu], nu, 1) - a_comp[nu];
-     plaqA = plaqA + cos(tmp);
-
-     tmp = a_comp[mu] + Cshift(a_comp[mu], mu, 1)
-         + Cshift(a_comp[nu], mu, 2) + Cshift(Cshift(a_comp[nu], mu, 2), nu, 1)
-         - Cshift(Cshift(a_comp[mu], nu, 2), mu, 1) - Cshift(a_comp[mu], nu, 2)
-         - Cshift(a_comp[nu], nu, 1) - a_comp[nu];
-     wlA = wlA + cos(tmp);
-   }
- }
-
- Real vol = grid.gSites();
- Real faces = (1.0 * Nd * (Nd - 1)) / 2.0;
-
- Complex avgPlaqA = sum(trace(plaqA));
- avgPlaqA = avgPlaqA / vol / faces;
-
- Complex avgWlA = sum(trace(wlA));
- avgWlA = avgWlA / vol / faces;
-
- TComplex tplaqsite;
- LatticeComplex plaqtrace = trace(plaqA);
- std::vector<int> site0 = {0,0,0,0};
- peekSite(tplaqsite, plaqtrace, site0);
- Complex plaqsite = TensorRemove(tplaqsite);
-
- LOG(Message) << "Plaquette average: " << avgPlaqA << std::endl;
- LOG(Message) << "2x2 Wilson Loop average: " << avgWlA << std::endl;
- LOG(Message) << "Plaquette (one site): " << plaqsite / faces << std::endl;

  // epilogue
  LOG(Message) << "Grid is finalizing now" << std::endl;
  Grid_finalize();
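Two remarks on the rewritten measurement, offered as an interpretation rather than something stated in the diff. The `* 3` factor presumably undoes the division by N_c in the `avg*WilsonLoop` normalisation, which is flagged as a FIXME for this U(1) case. Printing `-2*log(W)` makes sense because, for links `exp(i*a)` built from the zero-mean Gaussian field produced by `StochasticField`, the phase X accumulated around a loop is itself Gaussian, so

\langle W \rangle \;=\; \langle e^{\,iX} \rangle \;=\; e^{-\tfrac{1}{2}\langle X^2 \rangle}
\quad\Longrightarrow\quad
-2\log\langle W \rangle \;=\; \langle X^2 \rangle ,

i.e. the quantity logged is the variance of the loop phase of the stochastic photon field.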
@ -1 +0,0 @@
- ./configure --host=arm-linux-gnueabihf CXX=clang++-3.5 CXXFLAGS='-std=c++11 -O3 -target arm-linux-gnueabihf -I/usr/arm-linux-gnueabihf/include/ -I/home/neo/Codes/gmp6.0/gmp-arm/include/ -I/usr/arm-linux-gnueabihf/include/c++/4.8.2/arm-linux-gnueabihf/ -L/home/neo/Codes/gmp6.0/gmp-arm/lib/ -I/home/neo/Codes/mpfr3.1.2/mpfr-arm/include/ -L/home/neo/Codes/mpfr3.1.2/mpfr-arm/lib/ -static -mcpu=cortex-a7' --enable-simd=NEONv7

@ -1,3 +0,0 @@
- #./configure --host=arm-linux-gnueabihf CXX=clang++-3.5 CXXFLAGS='-std=c++11 -O3 -target arm-linux-gnueabihf -I/usr/arm-linux-gnueabihf/include/ -I/home/neo/Codes/gmp6.0/gmp-arm/include/ -I/usr/lib/llvm-3.5/lib/clang/3.5.0/include/ -L/home/neo/Codes/gmp6.0/gmp-arm/lib/ -I/home/neo/Codes/mpfr3.1.2/mpfr-arm/include/ -L/home/neo/Codes/mpfr3.1.2/mpfr-arm/lib/ -static -mcpu=cortex-a57' --enable-simd=NEONv7
-
- ./configure --host=aarch64-linux-gnu CXX=clang++-3.5 CXXFLAGS='-std=c++11 -O3 -target aarch64-linux-gnu -static -I/home/neo/Codes/gmp6.0/gmp-armv8/include/ -L/home/neo/Codes/gmp6.0/gmp-armv8/lib/ -I/home/neo/Codes/mpfr3.1.2/mpfr-armv8/include/ -L/home/neo/Codes/mpfr3.1.2/mpfr-armv8/lib/ -I/usr/aarch64-linux-gnu/include/ -I/usr/aarch64-linux-gnu/include/c++/4.8.2/aarch64-linux-gnu/' --enable-simd=NEONv7

@ -1,9 +0,0 @@
- for omp in 1 2 4
- do
- echo > wilson.t$omp
- for vol in 4.4.4.4 4.4.4.8 4.4.8.8 4.8.8.8 8.8.8.8 8.8.8.16 8.8.16.16 8.16.16.16
- do
- perf=` ./benchmarks/Grid_wilson --grid $vol --omp $omp | grep mflop | awk '{print $3}'`
- echo $vol $perf >> wilson.t$omp
- done
- done
@ -1,46 +0,0 @@
- #!/bin/bash -e
-
- DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi clang-sse"
- EXTRADIRS="g++-avx g++-sse4 icpc-avx icpc-avx2 icpc-avx512"
- BLACK="\033[30m"
- RED="\033[31m"
- GREEN="\033[32m"
- YELLOW="\033[33m"
- BLUE="\033[34m"
- PINK="\033[35m"
- CYAN="\033[36m"
- WHITE="\033[37m"
- NORMAL="\033[0;39m"
-
- for D in $DIRS
- do
-
- echo
- echo -e $RED ==============================
- echo -e $GREEN $D
- echo -e $RED ==============================
- echo -e $BLUE
-
- cd builds/$D
- make clean all -j 8
- cd ../../
- echo -e $NORMAL
- done
-
- if [ "X$1" == "Xextra" ]
- then
- for D in $EXTRADIRS
- do
-
- echo
- echo -e $RED ==============================
- echo -e $RED $D
- echo -e $RED ==============================
- echo -e $BLUE
-
- cd builds/$D
- make clean all -j 8
- cd ../../
- echo -e $NORMAL
- done
- fi
@ -1,11 +0,0 @@
- #!/bin/bash
-
- DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi icpc-avx icpc-avx2 icpc-avx512 g++-sse4 g++-avx clang-sse icpc-avx-openmp-mpi icpc-avx-openmp"
-
- for D in $DIRS
- do
- mkdir -p builds/$D
- cd builds/$D
- ../../scripts/configure-commands $D
- cd ../..
- done
@ -1,89 +0,0 @@
- #!/bin/bash
- WD=$1
- BLACK="\033[30m"
- RED="\033[31m"
- GREEN="\033[32m"
- YELLOW="\033[33m"
- BLUE="\033[34m"
- PINK="\033[35m"
- CYAN="\033[36m"
- WHITE="\033[37m"
- NORMAL="\033[0;39m"
- echo
- echo -e $RED ==============================
- echo -e $GREEN $WD
- echo -e $RED ==============================
- echo -e $YELLOW
-
- case $WD in
- g++-avx)
- CXX=g++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
- ;;
- g++-avx-openmp)
- CXX=g++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LIBS="-fopenmp -lgmp -lmpfr" --enable-comms=none
- ;;
- g++5-sse4)
- CXX=g++-5 ../../configure --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
- ;;
- g++5-avx)
- CXX=g++-5 ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
- ;;
- icpc-avx)
- CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
- ;;
- icpc-avx-openmp-mpi)
- CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
- ;;
- icpc-avx-openmp)
- CXX=icpc ../../configure --enable-precision=single --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LIBS="-fopenmp -lgmp -lmpfr" --enable-comms=mpi
- ;;
- icpc-avx2)
- CXX=icpc ../../configure --enable-simd=AVX2 CXXFLAGS="-march=core-avx2 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
- ;;
- icpc-avx512)
- CXX=icpc ../../configure --enable-simd=AVX512 CXXFLAGS="-xCOMMON-AVX512 -O3 -std=c++11" --host=none LIBS="-lgmp -lmpfr" --enable-comms=none
- ;;
- icpc-mic)
- CXX=icpc ../../configure --host=none --enable-simd=IMCI CXXFLAGS="-mmic -O3 -std=c++11" LDFLAGS=-mmic LIBS="-lgmp -lmpfr" --enable-comms=none
- ;;
- icpc-mic-avx512)
- CXX=icpc ../../configure --host=none --enable-simd=IMCI CXXFLAGS="-xCOMMON_AVX512 -O3 -std=c++11" LDFLAGS=-xCOMMON_AVX512 LIBS="-lgmp -lmpfr" --enable-comms=none
- ;;
- clang-sse)
- CXX=clang++ ../../configure --enable-precision=single --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
- ;;
- clang-avx)
- CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
- ;;
- clang-avx2)
- CXX=clang++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
- ;;
- clang-avx-openmp)
- CXX=clang-omp++ ../../configure --enable-precision=double --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LDFLAGS="-fopenmp" LIBS="-lgmp -lmpfr" --enable-comms=none
- ;;
- clang-xc30)
- CXX=$HOME/Clang/install/bin/clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11 -I/opt/gcc/4.9.2/snos/include/g++/x86_64-suse-linux/ -I/opt/gcc/4.9.2/snos/include/g++/ " LDFLAGS="" LIBS="-lgmp -lmpfr" --enable-comms=none
- ;;
- clang-xc30-openmp)
- CXX=$HOME/Clang/install/bin/clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11 -I/opt/gcc/4.9.2/snos/include/g++/x86_64-suse-linux/ -I/opt/gcc/4.9.2/snos/include/g++/ " LDFLAGS="-fopenmp" LIBS="-lgmp -lmpfr" --enable-comms=none
- ;;
- clang-avx2-openmp)
- CXX=clang-omp++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -fopenmp -O3 -std=c++11" LDFLAGS="-fopenmp" LIBS="-lgmp -lmpfr" --enable-comms=none
- ;;
- clang-avx-openmp-mpi)
- CXX=clang-omp++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
- ;;
- clang-avx2-openmp-mpi)
- CXX=clang-omp++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
- ;;
- clang-avx-mpi)
- CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -lgmp -lmpfr" --enable-comms=mpi
- ;;
- clang-avx2-mpi)
- CXX=clang++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -lgmp -lmpfr" --enable-comms=mpi
- ;;
- clang-avx2)
- CXX=clang++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -std=c++11" LDFLAGS="-L/usr/local/lib/" LIBS="-lgmp -lmpfr" --enable-comms=none
- ;;
- esac
- echo -e $NORMAL
@ -1,10 +0,0 @@
- #!/bin/bash
- DIRS="g++-avx-openmp g++-avx clang-xc30 clang-xc30-openmp"
-
- for D in $DIRS
- do
- mkdir -p builds/$D
- cd builds/$D
- ../../scripts/configure-commands $D
- cd ../..
- done

@ -1,10 +0,0 @@
- #!/bin/bash
- DIRS="build-icpc-mic"
-
- for D in $DIRS
- do
- mkdir -p $D
- cd $D
- ../configure-commands
- cd ..
- done
@ -12,6 +12,7 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: $1

Copyright (C) 2015
+ Copyright (C) 2016

EOF

@ -38,8 +39,21 @@ See the full license in the file "LICENSE" in the top level distribution directo
/* END LEGAL */
EOF

cat message > tmp.fil
- cat $1 >> tmp.fil
+ NOTICE=`grep -n "END LEGAL" $1 | awk '{ print $1 }' `
+
+ if [ "X$NOTICE" != "X" ]
+ then
+   echo "found notice ending on line $NOTICE"
+   awk 'BEGIN { P=0 } { if ( P ) print } /END LEGAL/{P=1} ' $1 >> tmp.fil
+ else
+   cat $1 >> tmp.fil
+ fi
+
cp tmp.fil $1

shift
|
@ -1,2 +0,0 @@
|
|||||||
module swap PrgEnv-cray PrgEnv-intel
|
|
||||||
module swap intel/14.0.4.211 intel/15.0.2.164
|
|
@ -1,4 +0,0 @@
|
|||||||
aclocal -I m4
|
|
||||||
autoheader -f
|
|
||||||
automake -f --add-missing
|
|
||||||
autoconf -f
|
|
@ -1,18 +0,0 @@
- #!/usr/bin/env bash
-
- if (( $# != 1 )); then
-   echo "usage: `basename $0` <archive>" 1>&2
-   exit 1
- fi
- ARC=$1
-
- INITDIR=`pwd`
- rm -rf lib/fftw
- mkdir lib/fftw
-
- ARCDIR=`tar -tf ${ARC} | head -n1 | sed -e 's@/.*@@'`
- tar -xf ${ARC}
- cp ${ARCDIR}/api/fftw3.h lib/fftw/
-
- cd ${INITDIR}
- rm -rf ${ARCDIR}
@ -1,7 +0,0 @@
- plot 'wilson.t1' u 2 w l t "AVX1-OMP=1"
- replot 'wilson.t2' u 2 w l t "AVX1-OMP=2"
- replot 'wilson.t4' u 2 w l t "AVX1-OMP=4"
- set terminal 'pdf'
- set output 'wilson_clang.pdf'
- replot
- quit
@ -102,16 +102,14 @@ int main (int argc, char ** argv)
  PokeIndex<LorentzIndex>(mom,mommu,mu);

  // fourth order exponential approx
- parallel_for(auto i=mom.begin();i<mom.end();i++){
-   Uprime[i](mu) =
-     U[i](mu)
-     + mom[i](mu)*U[i](mu)*dt
-     + mom[i](mu) *mom[i](mu) *U[i](mu)*(dt*dt/2.0)
-     + mom[i](mu) *mom[i](mu) *mom[i](mu) *U[i](mu)*(dt*dt*dt/6.0)
-     + mom[i](mu) *mom[i](mu) *mom[i](mu) *mom[i](mu) *U[i](mu)*(dt*dt*dt*dt/24.0)
-     + mom[i](mu) *mom[i](mu) *mom[i](mu) *mom[i](mu) *mom[i](mu) *U[i](mu)*(dt*dt*dt*dt*dt/120.0)
-     + mom[i](mu) *mom[i](mu) *mom[i](mu) *mom[i](mu) *mom[i](mu) *mom[i](mu) *U[i](mu)*(dt*dt*dt*dt*dt*dt/720.0)
-     ;
+ parallel_for(auto i=mom.begin();i<mom.end();i++) {
+   Uprime[i](mu) = U[i](mu);
+   Uprime[i](mu) += mom[i](mu)*U[i](mu)*dt ;
+   Uprime[i](mu) += mom[i](mu) *mom[i](mu) *U[i](mu)*(dt*dt/2.0);
+   Uprime[i](mu) += mom[i](mu) *mom[i](mu) *mom[i](mu) *U[i](mu)*(dt*dt*dt/6.0);
+   Uprime[i](mu) += mom[i](mu) *mom[i](mu) *mom[i](mu) *mom[i](mu) *U[i](mu)*(dt*dt*dt*dt/24.0);
+   Uprime[i](mu) += mom[i](mu) *mom[i](mu) *mom[i](mu) *mom[i](mu) *mom[i](mu) *U[i](mu)*(dt*dt*dt*dt*dt/120.0);
+   Uprime[i](mu) += mom[i](mu) *mom[i](mu) *mom[i](mu) *mom[i](mu) *mom[i](mu) *mom[i](mu) *U[i](mu)*(dt*dt*dt*dt*dt*dt/720.0);
  }

}
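Both the old expression and the new `+=` form implement the same truncated exponential of the momentum acting on the link (despite the "fourth order" comment, the series is carried to sixth order):

U'_\mu(x) \;=\; \Big( \sum_{k=0}^{6} \frac{(\,M_\mu(x)\,dt\,)^k}{k!} \Big)\, U_\mu(x)
\;\approx\; e^{\,M_\mu(x)\,dt}\; U_\mu(x),

where M_\mu(x) is the conjugate momentum `mom[i](mu)` and the coefficients 1/2, 1/6, 1/24, 1/120, 1/720 in the code are the factorials 1/k!.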