Mirror of https://github.com/paboyle/Grid.git
Synced 2025-11-04 05:54:32 +00:00

Commit: Merge branch 'develop' into release/v0.6.0
@@ -42,15 +42,14 @@ int main (int argc, char ** argv)
  int Nloop=10;
  int nmu=0;
  for(int mu=0;mu<4;mu++) if (mpi_layout[mu]>1) nmu++;
  for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;

  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking concurrent halo exchange in "<<nmu<<" dimensions"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;

  for(int lat=4;lat<=32;lat+=2){
  int maxlat=16;
  for(int lat=4;lat<=maxlat;lat+=2){
    for(int Ls=1;Ls<=16;Ls*=2){

      std::vector<int> latt_size  ({lat*mpi_layout[0],

@@ -125,7 +124,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;

  for(int lat=4;lat<=32;lat+=2){
  for(int lat=4;lat<=maxlat;lat+=2){
    for(int Ls=1;Ls<=16;Ls*=2){

      std::vector<int> latt_size  ({lat,lat,lat,lat});
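A brief aside on the two rate columns (illustrative only, not part of the diff): timings are taken with usecond(), so dividing a byte count by the elapsed microseconds directly yields MB/s. With the accounting used further down in this file the figures are formed roughly as:

      // Sketch of the rate accounting used by this benchmark (see the formulas later in the diff):
      //   time      = stop - start;                 // microseconds
      //   xbytes    = Nloop * bytes * 2.0 * ncomm;  // payload sent: both neighbours per partitioned axis
      //   bidibytes = xbytes + rbytes;              // add the matching receives
      //   MB/s uni  = xbytes    / time;             // bytes per microsecond == MB/s
      //   MB/s bidi = bidibytes / time;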
@@ -194,128 +193,83 @@ int main (int argc, char ** argv)
    }
  }  

#if 0

  Nloop=100;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking sequential persistent halo exchange in "<<nmu<<" dimensions"<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking concurrent STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "  L  "<<"\t\t"<<" Ls  "<<"\t\t"<<"bytes"<<"\t\t"<<"MB/s uni"<<"\t\t"<<"MB/s bidi"<<std::endl;

  for(int lat=4;lat<=32;lat+=2){
  for(int lat=4;lat<=maxlat;lat+=2){
    for(int Ls=1;Ls<=16;Ls*=2){

      std::vector<int> latt_size  ({lat,lat,lat,lat});
      std::vector<int> latt_size  ({lat*mpi_layout[0],
      				    lat*mpi_layout[1],
      				    lat*mpi_layout[2],
      				    lat*mpi_layout[3]});

      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);

      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));

      std::vector<HalfSpinColourVectorD *> xbuf(8);
      std::vector<HalfSpinColourVectorD *> rbuf(8);
      Grid.ShmBufferFreeAll();
      for(int d=0;d<8;d++){
	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
      }

      int ncomm;
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);

      double start=usecond();
      for(int i=0;i<Nloop;i++){

      std::vector<CartesianCommunicator::CommsRequest_t> empty;
      std::vector<std::vector<CartesianCommunicator::CommsRequest_t> > requests_fwd(Nd,empty);
      std::vector<std::vector<CartesianCommunicator::CommsRequest_t> > requests_bwd(Nd,empty);
	std::vector<CartesianCommunicator::CommsRequest_t> requests;

      for(int mu=0;mu<4;mu++){
	ncomm=0;
	if (mpi_layout[mu]>1 ) {
	  ncomm++;

	  int comm_proc;
	  int xmit_to_rank;
	  int recv_from_rank;

	  comm_proc=1;
	  Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
	  Grid.SendToRecvFromInit(requests_fwd[mu],
				  (void *)&xbuf[mu][0],
				  xmit_to_rank,
				  (void *)&rbuf[mu][0],
				  recv_from_rank,
				  bytes);

	  comm_proc = mpi_layout[mu]-1;
	  Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
	  Grid.SendToRecvFromInit(requests_bwd[mu],
				  (void *)&xbuf[mu+4][0],
				  xmit_to_rank,
				  (void *)&rbuf[mu+4][0],
				  recv_from_rank,
				  bytes);

	}
      }
      {
	double start=usecond();
	for(int i=0;i<Nloop;i++){
	for(int mu=0;mu<4;mu++){

	  if (mpi_layout[mu]>1 ) {

	  for(int mu=0;mu<4;mu++){
	    ncomm++;
	    int comm_proc=1;
	    int xmit_to_rank;
	    int recv_from_rank;

	    if (mpi_layout[mu]>1 ) {

	      Grid.SendToRecvFromBegin(requests_fwd[mu]);
	      Grid.SendToRecvFromComplete(requests_fwd[mu]);
	      Grid.SendToRecvFromBegin(requests_bwd[mu]);
	      Grid.SendToRecvFromComplete(requests_bwd[mu]);
	    }
	  }
	  Grid.Barrier();
	}
	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
	    Grid.StencilSendToRecvFromBegin(requests,
					    (void *)&xbuf[mu][0],
					    xmit_to_rank,
					    (void *)&rbuf[mu][0],
					    recv_from_rank,
					    bytes);

	double stop=usecond();

	double dbytes    = bytes;
	double xbytes    = Nloop*dbytes*2.0*ncomm;
	double rbytes    = xbytes;
	double bidibytes = xbytes+rbytes;

	double time = stop-start;

	std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;

      }

      {
	double start=usecond();
	for(int i=0;i<Nloop;i++){
	    comm_proc = mpi_layout[mu]-1;

	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
	    Grid.StencilSendToRecvFromBegin(requests,
					    (void *)&xbuf[mu+4][0],
					    xmit_to_rank,
					    (void *)&rbuf[mu+4][0],
					    recv_from_rank,
					    bytes);

	  for(int mu=0;mu<4;mu++){

	    if (mpi_layout[mu]>1 ) {

	      Grid.SendToRecvFromBegin(requests_fwd[mu]);
	      Grid.SendToRecvFromBegin(requests_bwd[mu]);
	      Grid.SendToRecvFromComplete(requests_fwd[mu]);
	      Grid.SendToRecvFromComplete(requests_bwd[mu]);
	    }
	  }
	  Grid.Barrier();
	}

	double stop=usecond();

	double dbytes    = bytes;
	double xbytes    = Nloop*dbytes*2.0*ncomm;
	double rbytes    = xbytes;
	double bidibytes = xbytes+rbytes;

	double time = stop-start;

	std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
	Grid.StencilSendToRecvFromComplete(requests);
	Grid.Barrier();

      }
      double stop=usecond();

      double dbytes    = bytes;
      double xbytes    = Nloop*dbytes*2.0*ncomm;
      double rbytes    = xbytes;
      double bidibytes = xbytes+rbytes;

      double time = stop-start; // microseconds

      std::cout<<GridLogMessage << lat<<"\t\t"<<Ls<<"\t\t"<<bytes<<"\t\t"<<xbytes/time<<"\t\t"<<bidibytes/time<<std::endl;
    }
  }

#endif

  }    
  Grid_finalize();
}
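As an illustrative aside (not part of the commit), the pattern the reworked loop above exercises can be summarised roughly as follows, using only the calls that appear in this diff (shared-memory buffers from ShmBufferMalloc, a single request list, and the StencilSendToRecvFrom pair):

      // Sketch only: one iteration of the concurrent STENCIL halo exchange.
      std::vector<CartesianCommunicator::CommsRequest_t> requests;
      for(int mu=0;mu<4;mu++){
        if ( mpi_layout[mu]>1 ) {
          int xmit_to_rank, recv_from_rank;

          // forward neighbour in direction mu
          Grid.ShiftedRanks(mu,1,xmit_to_rank,recv_from_rank);
          Grid.StencilSendToRecvFromBegin(requests,(void *)&xbuf[mu][0],xmit_to_rank,
                                                   (void *)&rbuf[mu][0],recv_from_rank,bytes);

          // backward neighbour in direction mu
          Grid.ShiftedRanks(mu,mpi_layout[mu]-1,xmit_to_rank,recv_from_rank);
          Grid.StencilSendToRecvFromBegin(requests,(void *)&xbuf[mu+4][0],xmit_to_rank,
                                                   (void *)&rbuf[mu+4][0],recv_from_rank,bytes);
        }
      }
      Grid.StencilSendToRecvFromComplete(requests);   // wait for all outstanding exchanges
      Grid.Barrier();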
@@ -44,7 +44,6 @@ struct scal {
    Gamma::GammaT
  };

bool overlapComms = false;
typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;
@@ -54,10 +53,6 @@ int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
    overlapComms = true;
  }

  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

@@ -126,14 +121,21 @@ int main (int argc, char ** argv)

  RealD NP = UGrid->_Nprocessors;

  for(int doasm=1;doasm<2;doasm++){

    QCD::WilsonKernelsStatic::AsmOpt=doasm;

  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);

  std::cout<<GridLogMessage << "Naive wilson implementation "<<std::endl;
  std::cout << GridLogMessage<< "Calling Dw"<<std::endl;

  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::Dhop                  "<<std::endl;
  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
  if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
  if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;

  int ncall =100;
  if (1) {
@@ -162,6 +164,17 @@ int main (int argc, char ** argv)

  if (1)
  {

    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
    std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::Dhop "<<std::endl;
    std::cout << GridLogMessage<< "* Vectorising fifth dimension by "<<vComplex::Nsimd()<<std::endl;
    if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
    if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
    if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
    if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
    if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
    std::cout << GridLogMessage<< "*********************************************************" <<std::endl;

    typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
    LatticeFermion ssrc(sFGrid);
    LatticeFermion sref(sFGrid);
@@ -248,6 +261,16 @@ int main (int argc, char ** argv)
      sr_e = zero;
      sr_o = zero;

      std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
      std::cout << GridLogMessage<< "* Benchmarking WilsonFermion5D<DomainWallVec5dImplR>::DhopEO "<<std::endl;
      std::cout << GridLogMessage<< "* Vectorising fifth dimension by "<<vComplex::Nsimd()<<std::endl;
      if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
      if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
      if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
      if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
      if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
      std::cout << GridLogMessage<< "*********************************************************" <<std::endl;

      sDw.ZeroCounters();
      sDw.stat.init("DhopEO");
      double t0=usecond();
@@ -308,7 +331,7 @@ int main (int argc, char ** argv)
    ref = -0.5*ref;
  }
  Dw.Dhop(src,result,1);
  std::cout << GridLogMessage << "Naive wilson implementation Dag" << std::endl;
  std::cout << GridLogMessage << "Compare to naive wilson implementation Dag to verify correctness" << std::endl;
  std::cout<<GridLogMessage << "Called DwDag"<<std::endl;
  std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
  std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
@@ -322,13 +345,22 @@ int main (int argc, char ** argv)
  LatticeFermion r_eo  (FGrid);

  std::cout<<GridLogMessage << "Calling Deo and Doe"<<std::endl;
  std::cout<<GridLogMessage << "Calling Deo and Doe and assert Deo+Doe == Dunprec"<<std::endl;
  pickCheckerboard(Even,src_e,src);
  pickCheckerboard(Odd,src_o,src);

  std::cout<<GridLogMessage << "src_e"<<norm2(src_e)<<std::endl;
  std::cout<<GridLogMessage << "src_o"<<norm2(src_o)<<std::endl;

  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
  std::cout << GridLogMessage<< "* Benchmarking DomainWallFermionR::DhopEO                "<<std::endl;
  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
  if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
  if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
  std::cout << GridLogMessage<< "*********************************************************" <<std::endl;
  {
    Dw.ZeroCounters();
    double t0=usecond();
@@ -366,8 +398,5 @@ int main (int argc, char ** argv)
  assert(norm2(src_e)<1.0e-5);
  assert(norm2(src_o)<1.0e-5);

  }

  Grid_finalize();
}
@@ -1,153 +0,0 @@
    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

    Source file: ./benchmarks/Benchmark_dwf.cc

    Copyright (C) 2015

Author: Peter Boyle <paboyle@ph.ed.ac.uk>
Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
#include <Grid/Grid.h>

using namespace std;
using namespace Grid;
using namespace Grid::QCD;

template<class d>
struct scal {
  d internal;
};

  Gamma::GammaMatrix Gmu [] = {
    Gamma::GammaX,
    Gamma::GammaY,
    Gamma::GammaZ,
    Gamma::GammaT
  };

bool overlapComms = false;

int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  if( GridCmdOptionExists(argv,argv+argc,"--asynch") ){
    overlapComms = true;
  }

  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

  std::vector<int> latt4 = GridDefaultLatt();
  const int Ls=16;
  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});

  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);

  LatticeFermion src   (FGrid); random(RNG5,src);
  LatticeFermion result(FGrid); result=zero;
  LatticeFermion    ref(FGrid);    ref=zero;
  LatticeFermion    tmp(FGrid);
  LatticeFermion    err(FGrid);

  ColourMatrix cm = Complex(1.0,0.0);

  LatticeGaugeField Umu(UGrid); 
  random(RNG4,Umu);

  LatticeGaugeField Umu5d(FGrid); 

  // replicate across fifth dimension
  for(int ss=0;ss<Umu._grid->oSites();ss++){
    for(int s=0;s<Ls;s++){
      Umu5d._odata[Ls*ss+s] = Umu._odata[ss];
    }
  }

  ////////////////////////////////////
  // Naive wilson implementation
  ////////////////////////////////////
  std::vector<LatticeColourMatrix> U(4,FGrid);
  for(int mu=0;mu<Nd;mu++){
    U[mu] = PeekIndex<LorentzIndex>(Umu5d,mu);
  }

  if (1)
  {
    ref = zero;
    for(int mu=0;mu<Nd;mu++){

      tmp = U[mu]*Cshift(src,mu+1,1);
      ref=ref + tmp - Gamma(Gmu[mu])*tmp;

      tmp =adj(U[mu])*src;
      tmp =Cshift(tmp,mu+1,-1);
      ref=ref + tmp + Gamma(Gmu[mu])*tmp;
    }
    ref = -0.5*ref;
  }

  RealD mass=0.1;
  RealD M5  =1.8;

  typename DomainWallFermionR::ImplParams params; 
  params.overlapCommsCompute = overlapComms;

  RealD NP = UGrid->_Nprocessors;

  QCD::WilsonKernelsStatic::AsmOpt=1;

  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5,params);

  std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
  int ncall =50;
  if (1) {

    double t0=usecond();
    for(int i=0;i<ncall;i++){
      Dw.Dhop(src,result,0);
    }
    double t1=usecond();

    double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
    double flops=1344*volume*ncall;

    std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
    std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
    std::cout<<GridLogMessage << "norm ref    "<< norm2(ref)<<std::endl;
    std::cout<<GridLogMessage << "mflop/s =   "<< flops/(t1-t0)<<std::endl;
    std::cout<<GridLogMessage << "mflop/s per node =  "<< flops/(t1-t0)/NP<<std::endl;
    err = ref-result; 
    std::cout<<GridLogMessage << "norm diff   "<< norm2(err)<<std::endl;
    //    Dw.Report();
  }
  Grid_finalize();
}
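For scale, a worked example of the flop accounting in the file above (illustrative only): the benchmark charges 1344 floating point operations per site per Dhop application and times in microseconds, so mflop/s is simply flops/(t1-t0).

  // Example: a 16^4 lattice with Ls=16 and ncall=50
  double volume = 16.0*16.0*16.0*16.0*16.0;   // 4d volume times Ls = 1,048,576 sites
  double flops  = 1344.0*volume*50;           // ~7.05e10 flops over the 50 calls
  // If those 50 calls take 1.0e6 us in total, flops/(t1-t0) ~ 7.0e4 mflop/s, i.e. roughly 70 Gflop/s.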
@@ -51,16 +51,18 @@ int main (int argc, char ** argv)
{
  Grid_init(&argc,&argv);

  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;

  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;

  const int Ls=8;
  int threads = GridThread::GetThreads();
  std::cout<<GridLogMessage << "Grid is setup to use "<<threads<<" threads"<<std::endl;

  if ( getenv("ASMOPT") )  {
    QCD::WilsonKernelsStatic::AsmOpt=1;
  } else { 
    QCD::WilsonKernelsStatic::AsmOpt=0;
  }

  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking DWF"<<std::endl;
  std::cout<<GridLogMessage << "=========================================================================="<<std::endl;

@@ -58,6 +58,19 @@ int main (int argc, char ** argv)
  std::vector<int> seeds({1,2,3,4});
  RealD mass = 0.1;

  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  std::cout << GridLogMessage<< "* Kernel options --dslash-generic, --dslash-unroll, --dslash-asm" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;
  std::cout << GridLogMessage<< "* Benchmarking WilsonFermionR::Dhop                  "<<std::endl;
  std::cout << GridLogMessage<< "* Vectorising space-time by "<<vComplex::Nsimd()<<std::endl;
  if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
  if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
  if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
  std::cout << GridLogMessage<< "*****************************************************************" <<std::endl;

  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
  std::cout<<GridLogMessage << "= Benchmarking Wilson" << std::endl;
  std::cout<<GridLogMessage << "============================================================================="<< std::endl;
@@ -1,175 +0,0 @@
    /*************************************************************************************

    Grid physics library, www.github.com/paboyle/Grid 

    Source file: ./tests/Test_zmm.cc

    Copyright (C) 2015

Author: paboyle <paboyle@ph.ed.ac.uk>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
#include <Grid/Grid.h>

using namespace Grid;
using namespace Grid::QCD;

int bench(std::ofstream &os, std::vector<int> &latt4,int Ls);

int main(int argc,char **argv)
{
  Grid_init(&argc,&argv);
  std::ofstream os("zmm.dat");

  os << "#V Ls Lxy Lzt C++ Asm OMP L1 " <<std::endl;
  std::cout<<GridLogMessage << "====================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking ZMM"<<std::endl;
  std::cout<<GridLogMessage << "====================================================================="<<std::endl;
  std::cout<<GridLogMessage << "Volume \t\t\t\tC++DW/MFLOPs\tASM-DW/MFLOPs\tdiff"<<std::endl;
  std::cout<<GridLogMessage << "====================================================================="<<std::endl;
  for(int L=4;L<=32;L+=4){
    for(int m=1;m<=2;m++){
      for(int Ls=8;Ls<=16;Ls+=8){
	std::vector<int> grid({L,L,m*L,m*L});
  std::cout << GridLogMessage <<"\t";
	for(int i=0;i<4;i++) { 
	  std::cout << grid[i]<<"x";
	}
	std::cout << Ls<<"\t\t";
	bench(os,grid,Ls);
      }
    }
  }
}
int bench(std::ofstream &os, std::vector<int> &latt4,int Ls)
{

  GridCartesian         * UGrid   = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
  GridRedBlackCartesian * UrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
  GridCartesian         * FGrid   = SpaceTimeGrid::makeFiveDimGrid(Ls,UGrid);
  GridRedBlackCartesian * FrbGrid = SpaceTimeGrid::makeFiveDimRedBlackGrid(Ls,UGrid);

  std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplex::Nsimd());
  std::vector<int> mpi_layout  = GridDefaultMpi();
  int threads = GridThread::GetThreads();

  std::vector<int> seeds4({1,2,3,4});
  std::vector<int> seeds5({5,6,7,8});

  GridSerialRNG sRNG; sRNG.SeedFixedIntegers(seeds4);

  LatticeFermion src (FGrid);
  LatticeFermion tmp (FGrid);
  LatticeFermion srce(FrbGrid);

  LatticeFermion resulto(FrbGrid); resulto=zero;
  LatticeFermion resulta(FrbGrid); resulta=zero;
  LatticeFermion junk(FrbGrid); junk=zero;
  LatticeFermion diff(FrbGrid); 
  LatticeGaugeField Umu(UGrid);

  double mfc, mfa, mfo, mfl1;

  GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
  GridParallelRNG          RNG5(FGrid);  RNG5.SeedFixedIntegers(seeds5);
  random(RNG5,src);
#if 1
  random(RNG4,Umu);
#else
  int mmu=2;
  std::vector<LatticeColourMatrix> U(4,UGrid);
  for(int mu=0;mu<Nd;mu++){
    U[mu] = PeekIndex<LorentzIndex>(Umu,mu);
    if ( mu!=mmu ) U[mu] = zero;
    if ( mu==mmu ) U[mu] = 1.0;
    PokeIndex<LorentzIndex>(Umu,U[mu],mu);
  }
#endif
 pickCheckerboard(Even,srce,src);

  RealD mass=0.1;
  RealD M5  =1.8;
  DomainWallFermionR Dw(Umu,*FGrid,*FrbGrid,*UGrid,*UrbGrid,mass,M5);

  int ncall=50;
  double t0=usecond();
  for(int i=0;i<ncall;i++){
    Dw.DhopOE(srce,resulto,0);
  }
  double t1=usecond();

  double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
  double flops=1344*volume/2;

  mfc = flops*ncall/(t1-t0);
  std::cout<<mfc<<"\t\t";

  QCD::WilsonKernelsStatic::AsmOpt=1;
  t0=usecond();
  for(int i=0;i<ncall;i++){
    Dw.DhopOE(srce,resulta,0);
  }
  t1=usecond();
  mfa = flops*ncall/(t1-t0);
  std::cout<<mfa<<"\t\t";
  /*
  int dag=DaggerNo;
  t0=usecond();
  for(int i=0;i<1;i++){
    Dw.DhopInternalOMPbench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
  }
  t1=usecond();
  mfo = flops*100/(t1-t0);
  std::cout<<GridLogMessage << "Called ASM-OMP Dw"<< " mflop/s =   "<< mfo<<std::endl;

  t0=usecond();
  for(int i=0;i<1;i++){
    Dw.DhopInternalL1bench(Dw.StencilEven,Dw.LebesgueEvenOdd,Dw.UmuOdd,srce,resulta,dag);
  }
  t1=usecond();
  mfl1= flops*100/(t1-t0);
  std::cout<<GridLogMessage << "Called ASM-L1 Dw"<< " mflop/s =   "<< mfl1<<std::endl;
  os << latt4[0]*latt4[1]*latt4[2]*latt4[3]<< " "<<Ls<<" "<< latt4[0] <<" " <<latt4[2]<< " "
     << mfc<<" "
     << mfa<<" "
     << mfo<<" "
     << mfl1<<std::endl;
  */

#if 0
  for(int i=0;i< PerformanceCounter::NumTypes(); i++ ){
    Dw.DhopOE(srce,resulta,0);
    PerformanceCounter Counter(i);
    Counter.Start();
    Dw.DhopOE(srce,resulta,0);
    Counter.Stop();
    Counter.Report();
  }
#endif
  //resulta = (-0.5) * resulta;

  diff = resulto-resulta;
  std::cout<<norm2(diff)<<std::endl;
  return 0;
}
@@ -1,18 +1,12 @@
#!/usr/bin/env bash

EIGEN_URL='http://bitbucket.org/eigen/eigen/get/3.2.9.tar.bz2'
FFTW_URL=http://www.fftw.org/fftw-3.3.4.tar.gz

echo "-- deploying Eigen source..."
wget ${EIGEN_URL} --no-check-certificate
./scripts/update_eigen.sh `basename ${EIGEN_URL}`
rm `basename ${EIGEN_URL}`

echo "-- copying fftw prototypes..."
wget ${FFTW_URL}
./scripts/update_fftw.sh `basename ${FFTW_URL}`
rm `basename ${FFTW_URL}`

echo '-- generating Make.inc files...'
./scripts/filelist
echo '-- generating configure script...'

configure.ac (10 changed lines):
@@ -260,6 +260,9 @@ case ${ac_COMMS} in
     mpi3|mpi3-auto)
        AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
     ;;
     mpi3l)
       AC_DEFINE([GRID_COMMS_MPI3L],[1],[GRID_COMMS_MPI3L] )
     ;;
     shmem)
        AC_DEFINE([GRID_COMMS_SHMEM],[1],[GRID_COMMS_SHMEM] )
     ;;
@@ -280,10 +283,9 @@ case ${ac_COMMS} in
esac

AM_CONDITIONAL(BUILD_COMMS_SHMEM,[ test "X${ac_COMMS}X" == "XshmemX" ])
AM_CONDITIONAL(BUILD_COMMS_MPI,
               [ test "X${ac_COMMS}X" == "XmpiX" || test "X${ac_COMMS}X" == "Xmpi-autoX" ])
AM_CONDITIONAL(BUILD_COMMS_MPI3,
               [ test "X${ac_COMMS}X" == "Xmpi3X" || test "X${ac_COMMS}X" == "Xmpi3-autoX" ])
AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" || test "X${ac_COMMS}X" == "Xmpi-autoX" ])
AM_CONDITIONAL(BUILD_COMMS_MPI3,[ test "X${ac_COMMS}X" == "Xmpi3X"] )
AM_CONDITIONAL(BUILD_COMMS_MPI3L,[ test "X${ac_COMMS}X" == "Xmpi3lX"] )
AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])

############### RNG selection
@@ -42,6 +42,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
#include <Grid/cshift/Cshift_mpi.h>
#endif 

#ifdef GRID_COMMS_MPI3L
#include <Grid/cshift/Cshift_mpi.h>
#endif 

#ifdef GRID_COMMS_SHMEM
#include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator
#endif 

lib/Init.cc (156 changed lines):
@@ -123,6 +123,13 @@ void GridCmdOptionIntVector(std::string &str,std::vector<int> & vec)
  return;
}

void GridCmdOptionInt(std::string &str,int & val)
{
  std::stringstream ss(str);
  ss>>val;
  return;
}

void GridParseLayout(char **argv,int argc,
		     std::vector<int> &latt,
@@ -153,14 +160,12 @@ void GridParseLayout(char **argv,int argc,
    assert(ompthreads.size()==1);
    GridThread::SetThreads(ompthreads[0]);
  }

  if( GridCmdOptionExists(argv,argv+argc,"--cores") ){
    std::vector<int> cores(0);
    int cores;
    arg= GridCmdOptionPayload(argv,argv+argc,"--cores");
    GridCmdOptionIntVector(arg,cores);
    GridThread::SetCores(cores[0]);
    GridCmdOptionInt(arg,cores);
    GridThread::SetCores(cores);
  }

}

std::string GridCmdVectorIntToString(const std::vector<int> & vec){
@@ -169,7 +174,7 @@ std::string GridCmdVectorIntToString(const std::vector<int> & vec){
  return oss.str();
}
/////////////////////////////////////////////////////////
//
// Reinit guard
/////////////////////////////////////////////////////////
static int Grid_is_initialised = 0;
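Purely as an illustration of the new helper (names taken from the hunk above, values hypothetical): GridCmdOptionInt parses a single integer payload, which the --cores handling and, further below, the --shm handling now use instead of the vector variant.

  std::string arg("512");
  int MB = 0;
  GridCmdOptionInt(arg,MB);                                 // MB == 512
  CartesianCommunicator::MAX_MPI_SHM_BYTES = MB*1024*1024;  // as Grid_init does for --shm below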
@@ -178,27 +183,31 @@ void Grid_init(int *argc,char ***argv)
{
  GridLogger::StopWatch.Start();

  std::string arg;

  ////////////////////////////////////
  // Shared memory block size
  ////////////////////////////////////
  if( GridCmdOptionExists(*argv,*argv+*argc,"--shm") ){
    int MB;
    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--shm");
    GridCmdOptionInt(arg,MB);
    CartesianCommunicator::MAX_MPI_SHM_BYTES = MB*1024*1024;
  }

  CartesianCommunicator::Init(argc,argv);

  // Parse command line args.
  ////////////////////////////////////
  // Logging
  ////////////////////////////////////

  std::string arg;
  std::vector<std::string> logstreams;
  std::string defaultLog("Error,Warning,Message,Performance");

  GridCmdOptionCSL(defaultLog,logstreams);
  GridLogConfigure(logstreams);

  if( GridCmdOptionExists(*argv,*argv+*argc,"--help") ){
    std::cout<<GridLogMessage<<"--help : this message"<<std::endl;
    std::cout<<GridLogMessage<<"--debug-signals : catch sigsegv and print a blame report"<<std::endl;
    std::cout<<GridLogMessage<<"--debug-stdout  : print stdout from EVERY node"<<std::endl;    
    std::cout<<GridLogMessage<<"--decomposition : report on default omp,mpi and simd decomposition"<<std::endl;    
    std::cout<<GridLogMessage<<"--mpi n.n.n.n   : default MPI decomposition"<<std::endl;    
    std::cout<<GridLogMessage<<"--threads n     : default number of OMP threads"<<std::endl;
    std::cout<<GridLogMessage<<"--grid n.n.n.n  : default Grid size"<<std::endl;    
    std::cout<<GridLogMessage<<"--log list      : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
    exit(EXIT_SUCCESS);
  if( !GridCmdOptionExists(*argv,*argv+*argc,"--debug-stdout") ){
    Grid_quiesce_nodes();
  }

  if( GridCmdOptionExists(*argv,*argv+*argc,"--log") ){
@@ -207,38 +216,39 @@ void Grid_init(int *argc,char ***argv)
    GridLogConfigure(logstreams);
  }

  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
    Grid_debug_handler_init();
  }
  if( !GridCmdOptionExists(*argv,*argv+*argc,"--debug-stdout") ){
    Grid_quiesce_nodes();
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-opt") ){
    QCD::WilsonKernelsStatic::HandOpt=1;
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
    LebesgueOrder::UseLebesgueOrder=1;
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
    GridCmdOptionIntVector(arg,LebesgueOrder::Block);
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--timestamp") ){
    GridLogTimestamp(1);
  ////////////////////////////////////
  // Help message
  ////////////////////////////////////

  if( GridCmdOptionExists(*argv,*argv+*argc,"--help") ){
    std::cout<<GridLogMessage<<"  --help : this message"<<std::endl;
    std::cout<<GridLogMessage<<std::endl;
    std::cout<<GridLogMessage<<"Geometry:"<<std::endl;
    std::cout<<GridLogMessage<<"  --mpi n.n.n.n   : default MPI decomposition"<<std::endl;    
    std::cout<<GridLogMessage<<"  --threads n     : default number of OMP threads"<<std::endl;
    std::cout<<GridLogMessage<<"  --grid n.n.n.n  : default Grid size"<<std::endl;    
    std::cout<<GridLogMessage<<"  --shm  M        : allocate M megabytes of shared memory for comms"<<std::endl;    
    std::cout<<GridLogMessage<<std::endl;
    std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
    std::cout<<GridLogMessage<<"  --log list      : comma separted list of streams from Error,Warning,Message,Performance,Iterative,Integrator,Debug,Colours"<<std::endl;
    std::cout<<GridLogMessage<<"  --decomposition : report on default omp,mpi and simd decomposition"<<std::endl;    
    std::cout<<GridLogMessage<<"  --debug-signals : catch sigsegv and print a blame report"<<std::endl;
    std::cout<<GridLogMessage<<"  --debug-stdout  : print stdout from EVERY node"<<std::endl;    
    std::cout<<GridLogMessage<<"  --timestamp     : tag with millisecond resolution stamps"<<std::endl;    
    std::cout<<GridLogMessage<<std::endl;
    std::cout<<GridLogMessage<<"Performance:"<<std::endl;
    std::cout<<GridLogMessage<<"  --dslash-generic: Wilson kernel for generic Nc"<<std::endl;    
    std::cout<<GridLogMessage<<"  --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;    
    std::cout<<GridLogMessage<<"  --dslash-asm    : Wilson kernel for AVX512"<<std::endl;    
    std::cout<<GridLogMessage<<"  --lebesgue      : Cache oblivious Lebesgue curve/Morton order/Z-graph stencil looping"<<std::endl;    
    std::cout<<GridLogMessage<<"  --cacheblocking n.m.o.p : Hypercuboidal cache blocking"<<std::endl;    
    std::cout<<GridLogMessage<<std::endl;
    exit(EXIT_SUCCESS);
  }

  GridParseLayout(*argv,*argc,
		  Grid_default_latt,
		  Grid_default_mpi);
  if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
    std::cout<<GridLogMessage<<"Grid Decomposition\n";
    std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
    std::cout<<GridLogMessage<<"\tMPI tasks      : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
    std::cout<<GridLogMessage<<"\tvRealF         : "<<sizeof(vRealF)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
    std::cout<<GridLogMessage<<"\tvRealD         : "<<sizeof(vRealD)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
    std::cout<<GridLogMessage<<"\tvComplexF      : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
    std::cout<<GridLogMessage<<"\tvComplexD      : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
  }
  ////////////////////////////////////
  // Banner
  ////////////////////////////////////

  std::string COL_RED    = GridLogColours.colour["RED"];
  std::string COL_PURPLE = GridLogColours.colour["PURPLE"];
@@ -247,7 +257,6 @@ void Grid_init(int *argc,char ***argv)
  std::string COL_BLUE   = GridLogColours.colour["BLUE"];
  std::string COL_YELLOW = GridLogColours.colour["YELLOW"];
  std::string COL_BACKGROUND = GridLogColours.colour["NORMAL"];

  std::cout <<std::endl;
  std::cout <<COL_RED  << "__|__|__|__|__"<<             "|__|__|_"<<COL_PURPLE<<"_|__|__|"<<                "__|__|__|__|__"<<std::endl; 
@@ -281,6 +290,53 @@ void Grid_init(int *argc,char ***argv)
  std::cout << COL_BACKGROUND <<std::endl;
  std::cout << std::endl;

  ////////////////////////////////////
  // Debug and performance options
  ////////////////////////////////////

  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
    Grid_debug_handler_init();
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-unroll") ){
    QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptHandUnroll;
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-asm") ){
    QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptInlineAsm;
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--dslash-generic") ){
    QCD::WilsonKernelsStatic::Opt=QCD::WilsonKernelsStatic::OptGeneric;
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
    LebesgueOrder::UseLebesgueOrder=1;
  }

  if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
    GridCmdOptionIntVector(arg,LebesgueOrder::Block);
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--timestamp") ){
    GridLogTimestamp(1);
  }

  GridParseLayout(*argv,*argc,
		  Grid_default_latt,
		  Grid_default_mpi);

  std::cout << GridLogMessage << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;

  if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
    std::cout<<GridLogMessage<<"Grid Decomposition\n";
    std::cout<<GridLogMessage<<"\tOpenMP threads : "<<GridThread::GetThreads()<<std::endl;
    std::cout<<GridLogMessage<<"\tMPI tasks      : "<<GridCmdVectorIntToString(GridDefaultMpi())<<std::endl;
    std::cout<<GridLogMessage<<"\tvRealF         : "<<sizeof(vRealF)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealF::Nsimd()))<<std::endl;
    std::cout<<GridLogMessage<<"\tvRealD         : "<<sizeof(vRealD)*8    <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vRealD::Nsimd()))<<std::endl;
    std::cout<<GridLogMessage<<"\tvComplexF      : "<<sizeof(vComplexF)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexF::Nsimd()))<<std::endl;
    std::cout<<GridLogMessage<<"\tvComplexD      : "<<sizeof(vComplexD)*8 <<"bits ; " <<GridCmdVectorIntToString(GridDefaultSimd(4,vComplexD::Nsimd()))<<std::endl;
  }

  Grid_is_initialised = 1;
}
@@ -9,6 +9,11 @@ if BUILD_COMMS_MPI3
  extra_sources+=communicator/Communicator_base.cc
endif

if BUILD_COMMS_MPI3L
  extra_sources+=communicator/Communicator_mpi3_leader.cc
  extra_sources+=communicator/Communicator_base.cc
endif

if BUILD_COMMS_SHMEM
  extra_sources+=communicator/Communicator_shmem.cc
  extra_sources+=communicator/Communicator_base.cc
@@ -31,14 +31,8 @@ namespace Grid {
///////////////////////////////////////////////////////////////
// Info that is setup once and indept of cartesian layout
///////////////////////////////////////////////////////////////
int CartesianCommunicator::ShmRank;
int CartesianCommunicator::ShmSize;
int CartesianCommunicator::GroupRank;
int CartesianCommunicator::GroupSize;
int CartesianCommunicator::WorldRank;
int CartesianCommunicator::WorldSize;
int CartesianCommunicator::Slave;
void *              CartesianCommunicator::ShmCommBuf;
uint64_t            CartesianCommunicator::MAX_MPI_SHM_BYTES   = 128*1024*1024; 

/////////////////////////////////
// Alloc, free shmem region
@@ -48,7 +42,12 @@ void *CartesianCommunicator::ShmBufferMalloc(size_t bytes){
  void *ptr = (void *)heap_top;
  heap_top  += bytes;
  heap_bytes+= bytes;
  assert(heap_bytes < MAX_MPI_SHM_BYTES);
  if (heap_bytes >= MAX_MPI_SHM_BYTES) {
    std::cout<< " ShmBufferMalloc exceeded shared heap size -- try increasing with --shm <MB> flag" <<std::endl;
    std::cout<< " Parameter specified in units of MB (megabytes) " <<std::endl;
    std::cout<< " Current value is " << (MAX_MPI_SHM_BYTES/(1024*1024)) <<std::endl;
    assert(heap_bytes<MAX_MPI_SHM_BYTES);
  }
  return ptr;
}
void CartesianCommunicator::ShmBufferFreeAll(void) { 
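To see why the friendlier overflow message matters, a rough sizing example (illustrative only; it assumes HalfSpinColourVectorD is 2 spin x 3 colour complex doubles, i.e. 96 bytes):

  // Benchmark_comms allocates 16 halo buffers of lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD) bytes each.
  // At lat=16, Ls=16 that is 16*16*16*16*96 ~ 6.3 MB per buffer, ~100 MB in total, which sits just
  // under the 128*1024*1024 byte default and motivates the runtime --shm override announced here.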
@@ -69,12 +68,6 @@ int                      CartesianCommunicator::ProcessorCount(void)    { return
////////////////////////////////////////////////////////////////////////////////
// very VERY rarely (Log, serial RNG) we need world without a grid
////////////////////////////////////////////////////////////////////////////////
int  CartesianCommunicator::RankWorld(void){ return WorldRank; };
int CartesianCommunicator::Ranks    (void) { return WorldSize; };
int CartesianCommunicator::Nodes    (void) { return GroupSize; };
int CartesianCommunicator::Cores    (void) { return ShmSize;   };
int CartesianCommunicator::NodeRank (void) { return GroupRank; };
int CartesianCommunicator::CoreRank (void) { return ShmRank;   };

void CartesianCommunicator::GlobalSum(ComplexF &c)
{
@@ -93,7 +86,7 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
  GlobalSumVector((double *)c,2*N);
}

#ifndef GRID_COMMS_MPI3
#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L)

void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
						       void *xmit,
@@ -1,3 +1,4 @@
 | 
			
		||||
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
@@ -37,6 +38,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
#ifdef GRID_COMMS_MPI3
 | 
			
		||||
#include <mpi.h>
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef GRID_COMMS_MPI3L
 | 
			
		||||
#include <mpi.h>
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef GRID_COMMS_SHMEM
 | 
			
		||||
#include <mpp/shmem.h>
 | 
			
		||||
#endif
 | 
			
		||||
@@ -51,7 +55,7 @@ class CartesianCommunicator {
 | 
			
		||||
  // Give external control (command line override?) of this
 | 
			
		||||
 | 
			
		||||
  static const int      MAXLOG2RANKSPERNODE = 16;            
 | 
			
		||||
  static const uint64_t MAX_MPI_SHM_BYTES   = 128*1024*1024; 
 | 
			
		||||
  static uint64_t MAX_MPI_SHM_BYTES;
 | 
			
		||||
 | 
			
		||||
  // Communicator should know nothing of the physics grid, only processor grid.
 | 
			
		||||
  int              _Nprocessors;     // How many in all
 | 
			
		||||
@@ -60,9 +64,9 @@ class CartesianCommunicator {
 | 
			
		||||
  std::vector<int> _processor_coor;  // linear processor coordinate
 | 
			
		||||
  unsigned long _ndimension;
 | 
			
		||||
 | 
			
		||||
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
 | 
			
		||||
  MPI_Comm communicator;
 | 
			
		||||
#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPI3L)
 | 
			
		||||
  static MPI_Comm communicator_world;
 | 
			
		||||
         MPI_Comm communicator;
 | 
			
		||||
  typedef MPI_Request CommsRequest_t;
 | 
			
		||||
#else 
 | 
			
		||||
  typedef int CommsRequest_t;
 | 
			
		||||
@@ -75,7 +79,15 @@ class CartesianCommunicator {
 | 
			
		||||
  // cartesian communicator on a subset of ranks, slave ranks controlled
 | 
			
		||||
  // by group leader with data xfer via shared memory
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////
 | 
			
		||||
#ifdef  GRID_COMMS_MPI3
 | 
			
		||||
#ifdef GRID_COMMS_MPI3
 | 
			
		||||
 | 
			
		||||
  static int ShmRank;
 | 
			
		||||
  static int ShmSize;
 | 
			
		||||
  static int GroupRank;
 | 
			
		||||
  static int GroupSize;
 | 
			
		||||
  static int WorldRank;
 | 
			
		||||
  static int WorldSize;
 | 
			
		||||
 | 
			
		||||
  std::vector<int>  WorldDims;
 | 
			
		||||
  std::vector<int>  GroupDims;
 | 
			
		||||
  std::vector<int>  ShmDims;
 | 
			
		||||
@@ -83,7 +95,7 @@ class CartesianCommunicator {
 | 
			
		||||
  std::vector<int> GroupCoor;
 | 
			
		||||
  std::vector<int> ShmCoor;
 | 
			
		||||
  std::vector<int> WorldCoor;
 | 
			
		||||
  
 | 
			
		||||
 | 
			
		||||
  static std::vector<int> GroupRanks; 
 | 
			
		||||
  static std::vector<int> MyGroup;
 | 
			
		||||
  static int ShmSetup;
 | 
			
		||||
@@ -93,13 +105,20 @@ class CartesianCommunicator {
 | 
			
		||||
  std::vector<int>  LexicographicToWorldRank;
 | 
			
		||||
  
 | 
			
		||||
  static std::vector<void *> ShmCommBufs;
 | 
			
		||||
 | 
			
		||||
#else 
 | 
			
		||||
  static void ShmInitGeneric(void);
 | 
			
		||||
  static commVector<uint8_t> ShmBufStorageVector;
 | 
			
		||||
#endif 
 | 
			
		||||
 | 
			
		||||
  /////////////////////////////////
 | 
			
		||||
  // Grid information and queries
 | 
			
		||||
  // Implemented in Communicator_base.C
 | 
			
		||||
  /////////////////////////////////
 | 
			
		||||
  static void * ShmCommBuf;
 | 
			
		||||
  size_t heap_top;
 | 
			
		||||
  size_t heap_bytes;
 | 
			
		||||
 | 
			
		||||
  void *ShmBufferSelf(void);
 | 
			
		||||
  void *ShmBuffer(int rank);
 | 
			
		||||
  void *ShmBufferTranslate(int rank,void * local_p);
 | 
			
		||||
@@ -123,28 +142,12 @@ class CartesianCommunicator {
 | 
			
		||||
  int  RankFromProcessorCoor(std::vector<int> &coor);
 | 
			
		||||
  void ProcessorCoorFromRank(int rank,std::vector<int> &coor);
 | 
			
		||||
  
 | 
			
		||||
  /////////////////////////////////
 | 
			
		||||
  // Grid information and queries
 | 
			
		||||
  /////////////////////////////////
 | 
			
		||||
  static int ShmRank;
 | 
			
		||||
  static int ShmSize;
 | 
			
		||||
  static int GroupSize;
 | 
			
		||||
  static int GroupRank;
 | 
			
		||||
  static int WorldRank;
 | 
			
		||||
  static int WorldSize;
 | 
			
		||||
  static int Slave;
 | 
			
		||||
  
 | 
			
		||||
  int                      IsBoss(void)            ;
 | 
			
		||||
  int                      BossRank(void)          ;
 | 
			
		||||
  int                      ThisRank(void)          ;
 | 
			
		||||
  const std::vector<int> & ThisProcessorCoor(void) ;
 | 
			
		||||
  const std::vector<int> & ProcessorGrid(void)     ;
 | 
			
		||||
  int                      ProcessorCount(void)    ;
 | 
			
		||||
  static int Ranks    (void);
 | 
			
		||||
  static int Nodes    (void);
 | 
			
		||||
  static int Cores    (void);
 | 
			
		||||
  static int NodeRank (void);
 | 
			
		||||
  static int CoreRank (void);
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // very VERY rarely (Log, serial RNG) we need world without a grid
 | 
			
		||||
 
 | 
			
		||||
@@ -44,13 +44,6 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
 | 
			
		||||
    MPI_Init(argc,argv);
 | 
			
		||||
  }
 | 
			
		||||
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
 | 
			
		||||
  MPI_Comm_rank(communicator_world,&WorldRank);
 | 
			
		||||
  MPI_Comm_size(communicator_world,&WorldSize);
 | 
			
		||||
  ShmRank=0;
 | 
			
		||||
  ShmSize=1;
 | 
			
		||||
  GroupRank=WorldRank;
 | 
			
		||||
  GroupSize=WorldSize;
 | 
			
		||||
  Slave    =0;
 | 
			
		||||
  ShmInitGeneric();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -198,6 +191,11 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
 | 
			
		||||
  // Should only be used prior to Grid Init finished.
 | 
			
		||||
  // Check for this?
 | 
			
		||||
  ///////////////////////////////////////////////////////
 | 
			
		||||
int CartesianCommunicator::RankWorld(void){ 
 | 
			
		||||
  int r; 
 | 
			
		||||
  MPI_Comm_rank(communicator_world,&r);
 | 
			
		||||
  return r;
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 | 
			
		||||
{
 | 
			
		||||
  int ierr= MPI_Bcast(data,
 | 
			
		||||
 
 | 
			
		||||
@@ -30,12 +30,18 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
///////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Info that is setup once and indept of cartesian layout
 | 
			
		||||
///////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
int CartesianCommunicator::ShmSetup = 0;
 | 
			
		||||
 | 
			
		||||
int CartesianCommunicator::ShmRank;
 | 
			
		||||
int CartesianCommunicator::ShmSize;
 | 
			
		||||
int CartesianCommunicator::GroupRank;
 | 
			
		||||
int CartesianCommunicator::GroupSize;
 | 
			
		||||
int CartesianCommunicator::WorldRank;
 | 
			
		||||
int CartesianCommunicator::WorldSize;
 | 
			
		||||
 | 
			
		||||
MPI_Comm CartesianCommunicator::communicator_world;
 | 
			
		||||
MPI_Comm CartesianCommunicator::ShmComm;
 | 
			
		||||
MPI_Win  CartesianCommunicator::ShmWindow;
 | 
			
		||||
@@ -97,15 +103,15 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
 | 
			
		||||
  
 | 
			
		||||
  std::vector<int> world_ranks(WorldSize); 
 | 
			
		||||
  GroupRanks.resize(WorldSize); 
 | 
			
		||||
  MyGroup.resize(ShmSize);
 | 
			
		||||
  for(int r=0;r<WorldSize;r++) world_ranks[r]=r;
 | 
			
		||||
  
 | 
			
		||||
  MPI_Group_translate_ranks (WorldGroup,WorldSize,&world_ranks[0],ShmGroup, &GroupRanks[0]); 
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Identify who is in my group and noninate the leader
 | 
			
		||||
    ///////////////////////////////////////////////////////////////////
 | 
			
		||||
  ///////////////////////////////////////////////////////////////////
 | 
			
		||||
  int g=0;
 | 
			
		||||
  MyGroup.resize(ShmSize);
 | 
			
		||||
  for(int rank=0;rank<WorldSize;rank++){
 | 
			
		||||
    if(GroupRanks[rank]!=MPI_UNDEFINED){
 | 
			
		||||
      assert(g<ShmSize);
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										871
									
								
								lib/communicator/Communicator_mpi3_leader.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										871
									
								
								lib/communicator/Communicator_mpi3_leader.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,871 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./lib/communicator/Communicator_mpi.cc
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#include "Grid.h"
 | 
			
		||||
#include <mpi.h>
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
/// Workarounds:
 | 
			
		||||
/// i) bloody mac os doesn't implement unnamed semaphores since it is "optional" posix.
 | 
			
		||||
///    darwin dispatch semaphores don't seem to be multiprocess.
 | 
			
		||||
///
 | 
			
		||||
/// ii) openmpi under --mca shmem posix works with two squadrons per node; 
 | 
			
		||||
///     openmpi under default mca settings (I think --mca shmem mmap) on MacOS makes two squadrons map the SAME
 | 
			
		||||
///     memory as each other, despite their living on different communicators. This appears to be a bug in OpenMPI.
 | 
			
		||||
///
 | 
			
		||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
#include <semaphore.h>
 | 
			
		||||
typedef sem_t *Grid_semaphore;
 | 
			
		||||
 | 
			
		||||
#define SEM_INIT(S)      S = sem_open(sem_name,0,0600,0); assert ( S != SEM_FAILED );
 | 
			
		||||
#define SEM_INIT_EXCL(S) sem_unlink(sem_name); S = sem_open(sem_name,O_CREAT|O_EXCL,0600,0); assert ( S != SEM_FAILED );
 | 
			
		||||
#define SEM_POST(S) assert ( sem_post(S) == 0 ); 
 | 
			
		||||
#define SEM_WAIT(S) assert ( sem_wait(S) == 0 );
 | 
			
		||||
 | 
			
		||||
#include <sys/mman.h>
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
namespace Grid {
 | 
			
		||||
 | 
			
		||||
enum { COMMAND_ISEND, COMMAND_IRECV, COMMAND_WAITALL };
 | 
			
		||||
 | 
			
		||||
struct Descriptor {
 | 
			
		||||
  uint64_t buf;
 | 
			
		||||
  size_t bytes;
 | 
			
		||||
  int rank;
 | 
			
		||||
  int tag;
 | 
			
		||||
  int command;
 | 
			
		||||
  MPI_Request request;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
const int pool = 48;
 | 
			
		||||
 | 
			
		||||
class SlaveState {
 | 
			
		||||
public:
 | 
			
		||||
  volatile int head;
 | 
			
		||||
  volatile int start;
 | 
			
		||||
  volatile int tail;
 | 
			
		||||
  volatile Descriptor Descrs[pool];
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
class Slave {
 | 
			
		||||
public:
 | 
			
		||||
  Grid_semaphore  sem_head;
 | 
			
		||||
  Grid_semaphore  sem_tail;
 | 
			
		||||
  SlaveState *state;
 | 
			
		||||
  MPI_Comm squadron;
 | 
			
		||||
  uint64_t     base;
 | 
			
		||||
  int universe_rank;
 | 
			
		||||
  int vertical_rank;
 | 
			
		||||
  char sem_name [NAME_MAX];
 | 
			
		||||
  ////////////////////////////////////////////////////////////
 | 
			
		||||
  // Descriptor circular pointers
 | 
			
		||||
  ////////////////////////////////////////////////////////////
 | 
			
		||||
  Slave() {};
 | 
			
		||||
 | 
			
		||||
  void Init(SlaveState * _state,MPI_Comm _squadron,int _universe_rank,int _vertical_rank);
 | 
			
		||||
 | 
			
		||||
  void SemInit(void) {
 | 
			
		||||
    sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank);
 | 
			
		||||
    printf("SEM_NAME: %s \n",sem_name);
 | 
			
		||||
    SEM_INIT(sem_head);
 | 
			
		||||
    sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank);
 | 
			
		||||
    printf("SEM_NAME: %s \n",sem_name);
 | 
			
		||||
    SEM_INIT(sem_tail);
 | 
			
		||||
  }  
 | 
			
		||||
  void SemInitExcl(void) {
 | 
			
		||||
    sprintf(sem_name,"/Grid_mpi3_sem_head_%d",universe_rank);
 | 
			
		||||
    printf("SEM_INIT_EXCL: %s \n",sem_name);
 | 
			
		||||
    SEM_INIT_EXCL(sem_head);
 | 
			
		||||
    sprintf(sem_name,"/Grid_mpi3_sem_tail_%d",universe_rank);
 | 
			
		||||
    printf("SEM_INIT_EXCL: %s \n",sem_name);
 | 
			
		||||
    SEM_INIT_EXCL(sem_tail);
 | 
			
		||||
  }  
 | 
			
		||||
  void WakeUpDMA(void) { 
 | 
			
		||||
    SEM_POST(sem_head);
 | 
			
		||||
  };
 | 
			
		||||
  void WakeUpCompute(void) { 
 | 
			
		||||
    SEM_POST(sem_tail);
 | 
			
		||||
  };
 | 
			
		||||
  void WaitForCommand(void) { 
 | 
			
		||||
    SEM_WAIT(sem_head);
 | 
			
		||||
  };
 | 
			
		||||
  void WaitForComplete(void) { 
 | 
			
		||||
    SEM_WAIT(sem_tail);
 | 
			
		||||
  };
 | 
			
		||||
  void EventLoop (void) {
 | 
			
		||||
    std::cout<< " Entering event loop "<<std::endl;
 | 
			
		||||
    while(1){
 | 
			
		||||
      WaitForCommand();
 | 
			
		||||
      //      std::cout << "Getting command "<<std::endl;
 | 
			
		||||
      Event();
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  int Event (void) ;
 | 
			
		||||
 | 
			
		||||
  uint64_t QueueCommand(int command,void *buf, int bytes, int hashtag, MPI_Comm comm,int u_rank) ;
 | 
			
		||||
 | 
			
		||||
  void WaitAll() {
 | 
			
		||||
    //    std::cout << "Queueing WAIT command  "<<std::endl;
 | 
			
		||||
    QueueCommand(COMMAND_WAITALL,0,0,0,squadron,0);
 | 
			
		||||
    //    std::cout << "Waking up DMA "<<std::endl;
 | 
			
		||||
    WakeUpDMA();
 | 
			
		||||
    //    std::cout << "Waiting from semaphore "<<std::endl;
 | 
			
		||||
    WaitForComplete();
 | 
			
		||||
    //    std::cout << "Checking FIFO is empty "<<std::endl;
 | 
			
		||||
    assert ( state->tail == state->head );
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// One instance of a data mover.
 | 
			
		||||
// Master and Slave must agree on location in shared memory
 | 
			
		||||
////////////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
class MPIoffloadEngine { 
 | 
			
		||||
public:
 | 
			
		||||
 | 
			
		||||
  static std::vector<Slave> Slaves;
 | 
			
		||||
 | 
			
		||||
  static int ShmSetup;
 | 
			
		||||
  
 | 
			
		||||
  static int UniverseRank;
 | 
			
		||||
  static int UniverseSize;
 | 
			
		||||
  
 | 
			
		||||
  static MPI_Comm communicator_universe;
 | 
			
		||||
  static MPI_Comm communicator_cached;
 | 
			
		||||
 | 
			
		||||
  static MPI_Comm HorizontalComm;
 | 
			
		||||
  static int HorizontalRank;
 | 
			
		||||
  static int HorizontalSize;
 | 
			
		||||
  
 | 
			
		||||
  static MPI_Comm VerticalComm;
 | 
			
		||||
  static MPI_Win  VerticalWindow; 
 | 
			
		||||
  static int VerticalSize;
 | 
			
		||||
  static int VerticalRank;
 | 
			
		||||
  
 | 
			
		||||
  static std::vector<void *> VerticalShmBufs;
 | 
			
		||||
  static std::vector<std::vector<int> > UniverseRanks;
 | 
			
		||||
  static std::vector<int> UserCommunicatorToWorldRanks; 
 | 
			
		||||
  
 | 
			
		||||
  static MPI_Group WorldGroup, CachedGroup;
 | 
			
		||||
  
 | 
			
		||||
  static void CommunicatorInit (MPI_Comm &communicator_world,
 | 
			
		||||
				MPI_Comm &ShmComm,
 | 
			
		||||
				void * &ShmCommBuf);
 | 
			
		||||
 | 
			
		||||
  static void MapCommRankToWorldRank(int &hashtag, int & comm_world_peer,int tag, MPI_Comm comm,int commrank);
 | 
			
		||||
 | 
			
		||||
  /////////////////////////////////////////////////////////
 | 
			
		||||
  // routines for master proc must handle any communicator
 | 
			
		||||
  /////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
  static void QueueSend(int slave,void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
 | 
			
		||||
     //    std::cout<< " Queueing send  "<< bytes<< " slave "<< slave << " to comm "<<rank  <<std::endl;
 | 
			
		||||
    Slaves[slave].QueueCommand(COMMAND_ISEND,buf,bytes,tag,comm,rank);
 | 
			
		||||
    //    std::cout << "Queued send command to rank "<< rank<< " via "<<slave <<std::endl;
 | 
			
		||||
    Slaves[slave].WakeUpDMA();
 | 
			
		||||
    //    std::cout << "Waking up DMA "<< slave<<std::endl;
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  static void QueueRecv(int slave, void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
 | 
			
		||||
    //    std::cout<< " Queueing recv "<< bytes<< " slave "<< slave << " from comm "<<rank  <<std::endl;
 | 
			
		||||
    Slaves[slave].QueueCommand(COMMAND_IRECV,buf,bytes,tag,comm,rank);
 | 
			
		||||
    //    std::cout << "Queued recv command from rank "<< rank<< " via "<<slave <<std::endl;
 | 
			
		||||
    Slaves[slave].WakeUpDMA();
 | 
			
		||||
    //    std::cout << "Waking up DMA "<< slave<<std::endl;
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  static void WaitAll() {
 | 
			
		||||
    for(int s=1;s<VerticalSize;s++) {
 | 
			
		||||
      //      std::cout << "Waiting for slave "<< s<<std::endl;
 | 
			
		||||
      Slaves[s].WaitAll();
 | 
			
		||||
    }
 | 
			
		||||
    //    std::cout << " Wait all Complete "<<std::endl;
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  static void GetWork(int nwork, int me, int & mywork, int & myoff,int units){
 | 
			
		||||
    int basework = nwork/units;
 | 
			
		||||
    int backfill = units-(nwork%units);
 | 
			
		||||
    if ( me >= units ) { 
 | 
			
		||||
      mywork = myoff = 0;
 | 
			
		||||
    } else { 
 | 
			
		||||
      mywork = (nwork+me)/units;
 | 
			
		||||
      myoff  = basework * me;
 | 
			
		||||
      if ( me > backfill ) 
 | 
			
		||||
	myoff+= (me-backfill);
 | 
			
		||||
    }
 | 
			
		||||
    return;
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  static void QueueMultiplexedSend(void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
 | 
			
		||||
    uint8_t * cbuf = (uint8_t *) buf;
 | 
			
		||||
    int mywork, myoff, procs;
 | 
			
		||||
    procs = VerticalSize-1;
 | 
			
		||||
    for(int s=0;s<procs;s++) {
 | 
			
		||||
      GetWork(bytes,s,mywork,myoff,procs);
 | 
			
		||||
      QueueSend(s+1,&cbuf[myoff],mywork,tag,comm,rank);
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  static void QueueMultiplexedRecv(void *buf, int bytes, int tag, MPI_Comm comm,int rank) {
 | 
			
		||||
    uint8_t * cbuf = (uint8_t *) buf;
 | 
			
		||||
    int mywork, myoff, procs;
 | 
			
		||||
    procs = VerticalSize-1;
 | 
			
		||||
    for(int s=0;s<procs;s++) {
 | 
			
		||||
      GetWork(bytes,s,mywork,myoff,procs);
 | 
			
		||||
      QueueRecv(s+1,&cbuf[myoff],mywork,tag,comm,rank);
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
///////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Info that is setup once and indept of cartesian layout
 | 
			
		||||
///////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
std::vector<Slave> MPIoffloadEngine::Slaves;
 | 
			
		||||
    
 | 
			
		||||
int MPIoffloadEngine::UniverseRank;
 | 
			
		||||
int MPIoffloadEngine::UniverseSize;
 | 
			
		||||
 | 
			
		||||
MPI_Comm  MPIoffloadEngine::communicator_universe;
 | 
			
		||||
MPI_Comm  MPIoffloadEngine::communicator_cached;
 | 
			
		||||
MPI_Group MPIoffloadEngine::WorldGroup;
 | 
			
		||||
MPI_Group MPIoffloadEngine::CachedGroup;
 | 
			
		||||
 | 
			
		||||
MPI_Comm MPIoffloadEngine::HorizontalComm;
 | 
			
		||||
int      MPIoffloadEngine::HorizontalRank;
 | 
			
		||||
int      MPIoffloadEngine::HorizontalSize;
 | 
			
		||||
 | 
			
		||||
MPI_Comm MPIoffloadEngine::VerticalComm;
 | 
			
		||||
int      MPIoffloadEngine::VerticalSize;
 | 
			
		||||
int      MPIoffloadEngine::VerticalRank;
 | 
			
		||||
MPI_Win  MPIoffloadEngine::VerticalWindow; 
 | 
			
		||||
std::vector<void *>            MPIoffloadEngine::VerticalShmBufs;
 | 
			
		||||
std::vector<std::vector<int> > MPIoffloadEngine::UniverseRanks;
 | 
			
		||||
std::vector<int>               MPIoffloadEngine::UserCommunicatorToWorldRanks; 
 | 
			
		||||
 | 
			
		||||
int MPIoffloadEngine::ShmSetup = 0;
 | 
			
		||||
 | 
			
		||||
void MPIoffloadEngine::CommunicatorInit (MPI_Comm &communicator_world,
 | 
			
		||||
					 MPI_Comm &ShmComm,
 | 
			
		||||
					 void * &ShmCommBuf)
 | 
			
		||||
{      
 | 
			
		||||
  int flag;
 | 
			
		||||
  assert(ShmSetup==0);  
 | 
			
		||||
  
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Universe is all nodes prior to squadron grouping
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////
 | 
			
		||||
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_universe);
 | 
			
		||||
  MPI_Comm_rank(communicator_universe,&UniverseRank);
 | 
			
		||||
  MPI_Comm_size(communicator_universe,&UniverseSize);
 | 
			
		||||
  
 | 
			
		||||
  /////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Split into groups that can share memory (Verticals)
 | 
			
		||||
  /////////////////////////////////////////////////////////////////////
 | 
			
		||||
#define MPI_SHARED_MEM_DEBUG
 | 
			
		||||
#ifdef  MPI_SHARED_MEM_DEBUG
 | 
			
		||||
  MPI_Comm_split(communicator_universe,(UniverseRank/4),UniverseRank,&VerticalComm);
 | 
			
		||||
#else 
 | 
			
		||||
  MPI_Comm_split_type(communicator_universe, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,&VerticalComm);
 | 
			
		||||
#endif
 | 
			
		||||
  MPI_Comm_rank(VerticalComm     ,&VerticalRank);
 | 
			
		||||
  MPI_Comm_size(VerticalComm     ,&VerticalSize);
 | 
			
		||||
  
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Split into horizontal groups by rank in squadron
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////
 | 
			
		||||
  MPI_Comm_split(communicator_universe,VerticalRank,UniverseRank,&HorizontalComm);
 | 
			
		||||
  MPI_Comm_rank(HorizontalComm,&HorizontalRank);
 | 
			
		||||
  MPI_Comm_size(HorizontalComm,&HorizontalSize);
 | 
			
		||||
  assert(HorizontalSize*VerticalSize==UniverseSize);
 | 
			
		||||
  
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // What is my place in the world
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  int WorldRank=0;
 | 
			
		||||
  if(VerticalRank==0) WorldRank = HorizontalRank;
 | 
			
		||||
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&WorldRank,1,MPI_INT,MPI_SUM,VerticalComm);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
  
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Where is the world in the universe?
 | 
			
		||||
  ////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  UniverseRanks = std::vector<std::vector<int> >(HorizontalSize,std::vector<int>(VerticalSize,0));
 | 
			
		||||
  UniverseRanks[WorldRank][VerticalRank] = UniverseRank;
 | 
			
		||||
  for(int w=0;w<HorizontalSize;w++){
 | 
			
		||||
    ierr=MPI_Allreduce(MPI_IN_PLACE,&UniverseRanks[w][0],VerticalSize,MPI_INT,MPI_SUM,communicator_universe);
 | 
			
		||||
    assert(ierr==0);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // allocate the shared window for our group, pass back Shm info to CartesianCommunicator
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  VerticalShmBufs.resize(VerticalSize);
 | 
			
		||||
 | 
			
		||||
#undef MPI_SHARED_MEM
 | 
			
		||||
#ifdef MPI_SHARED_MEM
 | 
			
		||||
  ierr = MPI_Win_allocate_shared(CartesianCommunicator::MAX_MPI_SHM_BYTES,1,MPI_INFO_NULL,VerticalComm,&ShmCommBuf,&VerticalWindow);
 | 
			
		||||
  ierr|= MPI_Win_lock_all (MPI_MODE_NOCHECK, VerticalWindow);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
  //  std::cout<<"SHM "<<ShmCommBuf<<std::endl;
 | 
			
		||||
 | 
			
		||||
  for(int r=0;r<VerticalSize;r++){
 | 
			
		||||
    MPI_Aint sz;
 | 
			
		||||
    int dsp_unit;
 | 
			
		||||
    MPI_Win_shared_query (VerticalWindow, r, &sz, &dsp_unit, &VerticalShmBufs[r]);
 | 
			
		||||
    //    std::cout<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
 | 
			
		||||
  }
 | 
			
		||||
#else 
 | 
			
		||||
  char shm_name [NAME_MAX];
 | 
			
		||||
  MPI_Barrier(VerticalComm);
 | 
			
		||||
 | 
			
		||||
  if ( VerticalRank == 0 ) {
 | 
			
		||||
    for(int r=0;r<VerticalSize;r++){
 | 
			
		||||
 | 
			
		||||
      size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
 | 
			
		||||
      if ( r>0 ) size = sizeof(SlaveState);
 | 
			
		||||
 | 
			
		||||
      sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldRank,r);
 | 
			
		||||
      
 | 
			
		||||
      shm_unlink(shm_name);
 | 
			
		||||
 | 
			
		||||
      int fd=shm_open(shm_name,O_RDWR|O_CREAT,0600);
 | 
			
		||||
      if ( fd < 0 ) {
 | 
			
		||||
	perror("failed shm_open");
 | 
			
		||||
	assert(0);
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      ftruncate(fd, size);
 | 
			
		||||
 | 
			
		||||
      VerticalShmBufs[r] = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 | 
			
		||||
 | 
			
		||||
      if ( VerticalShmBufs[r] == MAP_FAILED ) { 
 | 
			
		||||
	perror("failed mmap");
 | 
			
		||||
	assert(0);
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      uint64_t * check = (uint64_t *) VerticalShmBufs[r];
 | 
			
		||||
      check[0] = WorldRank;
 | 
			
		||||
      check[1] = r;
 | 
			
		||||
 | 
			
		||||
      //      std::cout<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  MPI_Barrier(VerticalComm);
 | 
			
		||||
 | 
			
		||||
  if ( VerticalRank != 0 ) { 
 | 
			
		||||
  for(int r=0;r<VerticalSize;r++){
 | 
			
		||||
 | 
			
		||||
    size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES ;
 | 
			
		||||
    if ( r>0 ) size = sizeof(SlaveState);
 | 
			
		||||
    
 | 
			
		||||
    sprintf(shm_name,"/Grid_mpi3_shm_%d_%d",WorldRank,r);
 | 
			
		||||
    
 | 
			
		||||
    int fd=shm_open(shm_name,O_RDWR|O_CREAT,0600);
 | 
			
		||||
    if ( fd<0 ) {
 | 
			
		||||
      perror("failed shm_open");
 | 
			
		||||
      assert(0);
 | 
			
		||||
    }
 | 
			
		||||
    VerticalShmBufs[r] = mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 | 
			
		||||
 | 
			
		||||
    uint64_t * check = (uint64_t *) VerticalShmBufs[r];
 | 
			
		||||
    assert(check[0]== WorldRank);
 | 
			
		||||
    assert(check[1]== r);
 | 
			
		||||
    std::cerr<<"SHM "<<r<<" " <<VerticalShmBufs[r]<<std::endl;
 | 
			
		||||
  }
 | 
			
		||||
  }
 | 
			
		||||
#endif
 | 
			
		||||
  MPI_Barrier(VerticalComm);
 | 
			
		||||
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Map rank of leader on node in their in new world, to the
 | 
			
		||||
  // rank in this vertical plane's horizontal communicator
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////
 | 
			
		||||
  communicator_world = HorizontalComm;
 | 
			
		||||
  ShmComm            = VerticalComm;
 | 
			
		||||
  ShmCommBuf         = VerticalShmBufs[0];
 | 
			
		||||
  MPI_Comm_group (communicator_world, &WorldGroup); 
 | 
			
		||||
  
 | 
			
		||||
  ///////////////////////////////////////////////////////////
 | 
			
		||||
  // Start the slave data movers
 | 
			
		||||
  ///////////////////////////////////////////////////////////
 | 
			
		||||
  if ( VerticalRank != 0 ) {
 | 
			
		||||
    Slave indentured;
 | 
			
		||||
    indentured.Init( (SlaveState *) VerticalShmBufs[VerticalRank], VerticalComm, UniverseRank,VerticalRank);
 | 
			
		||||
    indentured.SemInitExcl();// init semaphore in shared memory
 | 
			
		||||
    MPI_Barrier(VerticalComm);
 | 
			
		||||
    MPI_Barrier(VerticalComm);
 | 
			
		||||
    indentured.EventLoop();
 | 
			
		||||
    assert(0);
 | 
			
		||||
  } else {
 | 
			
		||||
    Slaves.resize(VerticalSize);
 | 
			
		||||
    for(int i=1;i<VerticalSize;i++){
 | 
			
		||||
      Slaves[i].Init((SlaveState *)VerticalShmBufs[i],VerticalComm, UniverseRanks[HorizontalRank][i],i);
 | 
			
		||||
    }
 | 
			
		||||
    MPI_Barrier(VerticalComm);
 | 
			
		||||
    for(int i=1;i<VerticalSize;i++){
 | 
			
		||||
      Slaves[i].SemInit();// init semaphore in shared memory
 | 
			
		||||
    }
 | 
			
		||||
    MPI_Barrier(VerticalComm);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  ///////////////////////////////////////////////////////////
 | 
			
		||||
  // Verbose for now
 | 
			
		||||
  ///////////////////////////////////////////////////////////
 | 
			
		||||
  
 | 
			
		||||
  ShmSetup=1;
 | 
			
		||||
  
 | 
			
		||||
  if (UniverseRank == 0){
 | 
			
		||||
      
 | 
			
		||||
    std::cout<<GridLogMessage << "Grid MPI-3 configuration: detected ";
 | 
			
		||||
    std::cout<<UniverseSize   << " Ranks " ;
 | 
			
		||||
    std::cout<<HorizontalSize << " Nodes " ;
 | 
			
		||||
    std::cout<<VerticalSize   << " with ranks-per-node "<<std::endl;
 | 
			
		||||
    
 | 
			
		||||
    std::cout<<GridLogMessage << "Grid MPI-3 configuration: using one lead process per node " << std::endl;
 | 
			
		||||
    std::cout<<GridLogMessage << "Grid MPI-3 configuration: reduced communicator has size " << HorizontalSize << std::endl;
 | 
			
		||||
    
 | 
			
		||||
    for(int g=0;g<HorizontalSize;g++){
 | 
			
		||||
      std::cout<<GridLogMessage<<" Node "<<g<<" led by MPI rank "<< UniverseRanks[g][0]<<std::endl;
 | 
			
		||||
    }
 | 
			
		||||
    
 | 
			
		||||
    for(int g=0;g<HorizontalSize;g++){
 | 
			
		||||
      std::cout<<GridLogMessage<<" { ";
 | 
			
		||||
      for(int s=0;s<VerticalSize;s++){
 | 
			
		||||
	std::cout<< UniverseRanks[g][s];
 | 
			
		||||
	if ( s<VerticalSize-1 ) {
 | 
			
		||||
	  std::cout<<",";
 | 
			
		||||
	}
 | 
			
		||||
      }
 | 
			
		||||
      std::cout<<" } "<<std::endl;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
  ///////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // Map the communicator into communicator_world, and find the neighbour.
 | 
			
		||||
  // Cache the mappings; cache size is 1.
 | 
			
		||||
  ///////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
void MPIoffloadEngine::MapCommRankToWorldRank(int &hashtag, int & comm_world_peer,int tag, MPI_Comm comm,int rank) {
 | 
			
		||||
 | 
			
		||||
  if ( comm == HorizontalComm ) {
 | 
			
		||||
    comm_world_peer = rank;
 | 
			
		||||
    //    std::cout << " MapCommRankToWorldRank  horiz " <<rank<<"->"<<comm_world_peer<<std::endl;
 | 
			
		||||
  } else if ( comm == communicator_cached ) {
 | 
			
		||||
    comm_world_peer = UserCommunicatorToWorldRanks[rank];
 | 
			
		||||
    //    std::cout << " MapCommRankToWorldRank  cached " <<rank<<"->"<<comm_world_peer<<std::endl;
 | 
			
		||||
  } else { 
 | 
			
		||||
    
 | 
			
		||||
    int size;
 | 
			
		||||
 | 
			
		||||
    MPI_Comm_size(comm,&size);
 | 
			
		||||
 | 
			
		||||
    UserCommunicatorToWorldRanks.resize(size);
 | 
			
		||||
 | 
			
		||||
    std::vector<int> cached_ranks(size); 
 | 
			
		||||
 | 
			
		||||
    for(int r=0;r<size;r++) {
 | 
			
		||||
      cached_ranks[r]=r;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    communicator_cached=comm;
 | 
			
		||||
    
 | 
			
		||||
    MPI_Comm_group(communicator_cached, &CachedGroup);
 | 
			
		||||
    
 | 
			
		||||
    MPI_Group_translate_ranks(CachedGroup,size,&cached_ranks[0],WorldGroup, &UserCommunicatorToWorldRanks[0]); 
 | 
			
		||||
    
 | 
			
		||||
    comm_world_peer = UserCommunicatorToWorldRanks[rank];
 | 
			
		||||
    //    std::cout << " MapCommRankToWorldRank  cache miss " <<rank<<"->"<<comm_world_peer<<std::endl;
 | 
			
		||||
    
 | 
			
		||||
    assert(comm_world_peer != MPI_UNDEFINED);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  assert( (tag & (~0xFFFFL)) ==0); 
 | 
			
		||||
  
 | 
			
		||||
  uint64_t icomm = (uint64_t)comm;
 | 
			
		||||
  int comm_hash = ((icomm>>0 )&0xFFFF)^((icomm>>16)&0xFFFF)
 | 
			
		||||
                ^ ((icomm>>32)&0xFFFF)^((icomm>>48)&0xFFFF);
 | 
			
		||||
  
 | 
			
		||||
  //  hashtag = (comm_hash<<15) | tag;      
 | 
			
		||||
  hashtag = tag;      
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
void Slave::Init(SlaveState * _state,MPI_Comm _squadron,int _universe_rank,int _vertical_rank)
 | 
			
		||||
{
 | 
			
		||||
  squadron=_squadron;
 | 
			
		||||
  universe_rank=_universe_rank;
 | 
			
		||||
  vertical_rank=_vertical_rank;
 | 
			
		||||
  state   =_state;
 | 
			
		||||
  std::cout << "state "<<_state<<" comm "<<_squadron<<" universe_rank"<<universe_rank <<std::endl;
 | 
			
		||||
  state->head = state->tail = state->start = 0;
 | 
			
		||||
  base = (uint64_t)MPIoffloadEngine::VerticalShmBufs[0];
 | 
			
		||||
  int rank; MPI_Comm_rank(_squadron,&rank);
 | 
			
		||||
}
 | 
			
		||||
#define PERI_PLUS(A) ( (A+1)%pool )
 | 
			
		||||
int Slave::Event (void) {
 | 
			
		||||
 | 
			
		||||
  static int tail_last;
 | 
			
		||||
  static int head_last;
 | 
			
		||||
  static int start_last;
 | 
			
		||||
  int ierr;
 | 
			
		||||
 | 
			
		||||
  ////////////////////////////////////////////////////
 | 
			
		||||
  // Try to advance the start pointers
 | 
			
		||||
  ////////////////////////////////////////////////////
 | 
			
		||||
  int s=state->start;
 | 
			
		||||
  if ( s != state->head ) {
 | 
			
		||||
    switch ( state->Descrs[s].command ) {
 | 
			
		||||
    case COMMAND_ISEND:
 | 
			
		||||
      /*
 | 
			
		||||
            std::cout<< " Send "<<s << " ptr "<< state<<" "<< state->Descrs[s].buf<< "["<<state->Descrs[s].bytes<<"]"
 | 
			
		||||
      	       << " to " << state->Descrs[s].rank<< " tag" << state->Descrs[s].tag
 | 
			
		||||
       << " Comm " << MPIoffloadEngine::communicator_universe<< " me " <<universe_rank<< std::endl;
 | 
			
		||||
      */
 | 
			
		||||
      ierr = MPI_Isend((void *)(state->Descrs[s].buf+base), 
 | 
			
		||||
		       state->Descrs[s].bytes, 
 | 
			
		||||
		       MPI_CHAR,
 | 
			
		||||
		       state->Descrs[s].rank,
 | 
			
		||||
		       state->Descrs[s].tag,
 | 
			
		||||
		       MPIoffloadEngine::communicator_universe,
 | 
			
		||||
		       (MPI_Request *)&state->Descrs[s].request);
 | 
			
		||||
      assert(ierr==0);
 | 
			
		||||
      state->start = PERI_PLUS(s);
 | 
			
		||||
      return 1;
 | 
			
		||||
      break;
 | 
			
		||||
 | 
			
		||||
    case COMMAND_IRECV:
 | 
			
		||||
      /*
 | 
			
		||||
      std::cout<< " Recv "<<s << " ptr "<< state<<" "<< state->Descrs[s].buf<< "["<<state->Descrs[s].bytes<<"]"
 | 
			
		||||
	       << " from " << state->Descrs[s].rank<< " tag" << state->Descrs[s].tag
 | 
			
		||||
	       << " Comm " << MPIoffloadEngine::communicator_universe<< " me "<< universe_rank<< std::endl;
 | 
			
		||||
      */
 | 
			
		||||
      ierr=MPI_Irecv((void *)(state->Descrs[s].buf+base), 
 | 
			
		||||
		     state->Descrs[s].bytes, 
 | 
			
		||||
		     MPI_CHAR,
 | 
			
		||||
		     state->Descrs[s].rank,
 | 
			
		||||
		     state->Descrs[s].tag,
 | 
			
		||||
		     MPIoffloadEngine::communicator_universe,
 | 
			
		||||
		     (MPI_Request *)&state->Descrs[s].request);
 | 
			
		||||
 | 
			
		||||
      //      std::cout<< " Request is "<<state->Descrs[s].request<<std::endl;
 | 
			
		||||
      //      std::cout<< " Request0 is "<<state->Descrs[0].request<<std::endl;
 | 
			
		||||
      assert(ierr==0);
 | 
			
		||||
      state->start = PERI_PLUS(s);
 | 
			
		||||
      return 1;
 | 
			
		||||
      break;
 | 
			
		||||
 | 
			
		||||
    case COMMAND_WAITALL:
 | 
			
		||||
 | 
			
		||||
      for(int t=state->tail;t!=s; t=PERI_PLUS(t) ){
 | 
			
		||||
	MPI_Wait((MPI_Request *)&state->Descrs[t].request,MPI_STATUS_IGNORE);
 | 
			
		||||
      };
 | 
			
		||||
      s=PERI_PLUS(s);
 | 
			
		||||
      state->start = s;
 | 
			
		||||
      state->tail  = s;
 | 
			
		||||
 | 
			
		||||
      WakeUpCompute();
 | 
			
		||||
 | 
			
		||||
      return 1;
 | 
			
		||||
      break;
 | 
			
		||||
 | 
			
		||||
    default:
 | 
			
		||||
      assert(0);
 | 
			
		||||
      break;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  return 0;
 | 
			
		||||
}
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  // External interaction with the queue
 | 
			
		||||
  //////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
  
 | 
			
		||||
uint64_t Slave::QueueCommand(int command,void *buf, int bytes, int tag, MPI_Comm comm,int commrank) 
 | 
			
		||||
{
 | 
			
		||||
  /////////////////////////////////////////
 | 
			
		||||
  // Spin; if FIFO is full until not full
 | 
			
		||||
  /////////////////////////////////////////
 | 
			
		||||
  int head =state->head;
 | 
			
		||||
  int next = PERI_PLUS(head);
 | 
			
		||||
    
 | 
			
		||||
  // Set up descriptor
 | 
			
		||||
  int worldrank;
 | 
			
		||||
  int hashtag;
 | 
			
		||||
  MPI_Comm    communicator;
 | 
			
		||||
  MPI_Request request;
 | 
			
		||||
  
 | 
			
		||||
  MPIoffloadEngine::MapCommRankToWorldRank(hashtag,worldrank,tag,comm,commrank);
 | 
			
		||||
 | 
			
		||||
  uint64_t relative= (uint64_t)buf - base;
 | 
			
		||||
  state->Descrs[head].buf    = relative;
 | 
			
		||||
  state->Descrs[head].bytes  = bytes;
 | 
			
		||||
  state->Descrs[head].rank   = MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank];
 | 
			
		||||
  state->Descrs[head].tag    = hashtag;
 | 
			
		||||
  state->Descrs[head].command= command;
 | 
			
		||||
 | 
			
		||||
  /*  
 | 
			
		||||
  if ( command == COMMAND_ISEND ) { 
 | 
			
		||||
  std::cout << "QueueSend from "<< universe_rank <<" to commrank " << commrank 
 | 
			
		||||
            << " to worldrank " << worldrank <<std::endl;
 | 
			
		||||
  std::cout << " via VerticalRank "<< vertical_rank <<" to universerank " << MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank]<<std::endl;
 | 
			
		||||
  std::cout << " QueueCommand "<<buf<<"["<<bytes<<"]" << std::endl;
 | 
			
		||||
  } 
 | 
			
		||||
  if ( command == COMMAND_IRECV ) { 
 | 
			
		||||
  std::cout << "QueueRecv on "<< universe_rank <<" from commrank " << commrank 
 | 
			
		||||
            << " from worldrank " << worldrank <<std::endl;
 | 
			
		||||
  std::cout << " via VerticalRank "<< vertical_rank <<" from universerank " << MPIoffloadEngine::UniverseRanks[worldrank][vertical_rank]<<std::endl;
 | 
			
		||||
  std::cout << " QueueSend "<<buf<<"["<<bytes<<"]" << std::endl;
 | 
			
		||||
  } 
 | 
			
		||||
  */
 | 
			
		||||
  // Block until FIFO has space
 | 
			
		||||
  while( state->tail==next );
 | 
			
		||||
 | 
			
		||||
  // Msync on weak order architectures
 | 
			
		||||
  // Advance pointer
 | 
			
		||||
  state->head = next;
 | 
			
		||||
 | 
			
		||||
  return 0;
 | 
			
		||||
}
 | 
			
		||||
  
 | 
			
		||||
 | 
			
		||||
///////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
// Info that is setup once and indept of cartesian layout
 | 
			
		||||
///////////////////////////////////////////////////////////////////////////////////////////////////
 | 
			
		||||
 | 
			
		||||
MPI_Comm CartesianCommunicator::communicator_world;
 | 
			
		||||
 | 
			
		||||
void CartesianCommunicator::Init(int *argc, char ***argv) 
 | 
			
		||||
{
 | 
			
		||||
  int flag;
 | 
			
		||||
  MPI_Initialized(&flag); // needed to coexist with other libs apparently
 | 
			
		||||
  if ( !flag ) {
 | 
			
		||||
    MPI_Init(argc,argv);
 | 
			
		||||
  }
 | 
			
		||||
  communicator_world = MPI_COMM_WORLD;
 | 
			
		||||
  MPI_Comm ShmComm;
 | 
			
		||||
  MPIoffloadEngine::CommunicatorInit (communicator_world,ShmComm,ShmCommBuf);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 | 
			
		||||
{
 | 
			
		||||
  int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
 | 
			
		||||
{
 | 
			
		||||
  int rank;
 | 
			
		||||
  int ierr=MPI_Cart_rank  (communicator, &coor[0], &rank);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
  return rank;
 | 
			
		||||
}
 | 
			
		||||
void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
 | 
			
		||||
{
 | 
			
		||||
  coor.resize(_ndimension);
 | 
			
		||||
  int ierr=MPI_Cart_coords  (communicator, rank, _ndimension,&coor[0]);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 | 
			
		||||
{ 
 | 
			
		||||
  _ndimension = processors.size();
 | 
			
		||||
  std::vector<int> periodic(_ndimension,1);
 | 
			
		||||
 | 
			
		||||
  _Nprocessors=1;
 | 
			
		||||
  _processors = processors;
 | 
			
		||||
 | 
			
		||||
  for(int i=0;i<_ndimension;i++){
 | 
			
		||||
    _Nprocessors*=_processors[i];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  int Size; 
 | 
			
		||||
  MPI_Comm_size(communicator_world,&Size);
 | 
			
		||||
  assert(Size==_Nprocessors);
 | 
			
		||||
 | 
			
		||||
  _processor_coor.resize(_ndimension);
 | 
			
		||||
  MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator);
 | 
			
		||||
  MPI_Comm_rank  (communicator,&_processor);
 | 
			
		||||
  MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
void CartesianCommunicator::GlobalSum(uint32_t &u){
 | 
			
		||||
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSum(uint64_t &u){
 | 
			
		||||
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSum(float &f){
 | 
			
		||||
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSumVector(float *f,int N)
 | 
			
		||||
{
 | 
			
		||||
  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSum(double &d)
 | 
			
		||||
{
 | 
			
		||||
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
void CartesianCommunicator::GlobalSumVector(double *d,int N)
 | 
			
		||||
{
 | 
			
		||||
  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Basic Halo comms primitive
 | 
			
		||||
void CartesianCommunicator::SendToRecvFrom(void *xmit,
 | 
			
		||||
					   int dest,
 | 
			
		||||
					   void *recv,
 | 
			
		||||
					   int from,
 | 
			
		||||
					   int bytes)
 | 
			
		||||
{
 | 
			
		||||
  std::vector<CommsRequest_t> reqs(0);
 | 
			
		||||
  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
 | 
			
		||||
  SendToRecvFromComplete(reqs);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void CartesianCommunicator::SendRecvPacket(void *xmit,
 | 
			
		||||
					   void *recv,
 | 
			
		||||
					   int sender,
 | 
			
		||||
					   int receiver,
 | 
			
		||||
					   int bytes)
 | 
			
		||||
{
 | 
			
		||||
  MPI_Status stat;
 | 
			
		||||
  assert(sender != receiver);
 | 
			
		||||
  int tag = sender;
 | 
			
		||||
  if ( _processor == sender ) {
 | 
			
		||||
    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
 | 
			
		||||
  }
 | 
			
		||||
  if ( _processor == receiver ) { 
 | 
			
		||||
    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Basic Halo comms primitive
 | 
			
		||||
void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 | 
			
		||||
						void *xmit,
 | 
			
		||||
						int dest,
 | 
			
		||||
						void *recv,
 | 
			
		||||
						int from,
 | 
			
		||||
						int bytes)
 | 
			
		||||
{
 | 
			
		||||
  MPI_Request xrq;
 | 
			
		||||
  MPI_Request rrq;
 | 
			
		||||
  int rank = _processor;
 | 
			
		||||
  int ierr;
 | 
			
		||||
  ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
 | 
			
		||||
  ierr|=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
 | 
			
		||||
  
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
 | 
			
		||||
  list.push_back(xrq);
 | 
			
		||||
  list.push_back(rrq);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 | 
			
		||||
						       void *xmit,
 | 
			
		||||
						       int dest,
 | 
			
		||||
						       void *recv,
 | 
			
		||||
						       int from,
 | 
			
		||||
						       int bytes)
 | 
			
		||||
{
 | 
			
		||||
  uint64_t xmit_i = (uint64_t) xmit;
 | 
			
		||||
  uint64_t recv_i = (uint64_t) recv;
 | 
			
		||||
  uint64_t shm    = (uint64_t) ShmCommBuf;
 | 
			
		||||
  // assert xmit and recv lie in shared memory region
 | 
			
		||||
  assert( (xmit_i >= shm) && (xmit_i+bytes <= shm+MAX_MPI_SHM_BYTES) );
 | 
			
		||||
  assert( (recv_i >= shm) && (recv_i+bytes <= shm+MAX_MPI_SHM_BYTES) );
 | 
			
		||||
  assert(from!=_processor);
 | 
			
		||||
  assert(dest!=_processor);
 | 
			
		||||
  MPIoffloadEngine::QueueMultiplexedSend(xmit,bytes,_processor,communicator,dest);
 | 
			
		||||
  MPIoffloadEngine::QueueMultiplexedRecv(recv,bytes,from,communicator,from);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 | 
			
		||||
{
 | 
			
		||||
  MPIoffloadEngine::WaitAll();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void CartesianCommunicator::StencilBarrier(void)
 | 
			
		||||
{
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 | 
			
		||||
{
 | 
			
		||||
  int nreq=list.size();
 | 
			
		||||
  std::vector<MPI_Status> status(nreq);
 | 
			
		||||
  int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void CartesianCommunicator::Barrier(void)
 | 
			
		||||
{
 | 
			
		||||
  int ierr = MPI_Barrier(communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
 | 
			
		||||
{
 | 
			
		||||
  int ierr=MPI_Bcast(data,
 | 
			
		||||
		     bytes,
 | 
			
		||||
		     MPI_BYTE,
 | 
			
		||||
		     root,
 | 
			
		||||
		     communicator);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 | 
			
		||||
{
 | 
			
		||||
  int ierr= MPI_Bcast(data,
 | 
			
		||||
		      bytes,
 | 
			
		||||
		      MPI_BYTE,
 | 
			
		||||
		      root,
 | 
			
		||||
		      communicator_world);
 | 
			
		||||
  assert(ierr==0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void *CartesianCommunicator::ShmBufferSelf(void) { return ShmCommBuf; }
 | 
			
		||||
 | 
			
		||||
void *CartesianCommunicator::ShmBuffer(int rank) {
 | 
			
		||||
  return NULL;
 | 
			
		||||
}
 | 
			
		||||
void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) { 
 | 
			
		||||
  return NULL;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
@@ -34,13 +34,6 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
void CartesianCommunicator::Init(int *argc, char *** arv)
 | 
			
		||||
{
 | 
			
		||||
  WorldRank = 0;
 | 
			
		||||
  WorldSize = 1;
 | 
			
		||||
  ShmRank=0;
 | 
			
		||||
  ShmSize=1;
 | 
			
		||||
  GroupRank=WorldRank;
 | 
			
		||||
  GroupSize=WorldSize;
 | 
			
		||||
  Slave    =0;
 | 
			
		||||
  ShmInitGeneric();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -99,6 +92,7 @@ void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &
 | 
			
		||||
  assert(0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
int  CartesianCommunicator::RankWorld(void){return 0;}
 | 
			
		||||
void CartesianCommunicator::Barrier(void){}
 | 
			
		||||
void CartesianCommunicator::Broadcast(int root,void* data, int bytes) {}
 | 
			
		||||
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes) { }
 | 
			
		||||
 
 | 
			
		||||
@@ -50,11 +50,16 @@ typedef struct HandShake_t {
 | 
			
		||||
  uint64_t seq_remote;
 | 
			
		||||
} HandShake;
 | 
			
		||||
 | 
			
		||||
std::array<long,_SHMEM_REDUCE_SYNC_SIZE> make_psync_init(void) {
 | 
			
		||||
  array<long,_SHMEM_REDUCE_SYNC_SIZE> ret;
 | 
			
		||||
  ret.fill(SHMEM_SYNC_VALUE);
 | 
			
		||||
  return ret;
 | 
			
		||||
}
 | 
			
		||||
static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync_init = make_psync_init();
 | 
			
		||||
 | 
			
		||||
static Vector< HandShake > XConnections;
 | 
			
		||||
static Vector< HandShake > RConnections;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
void CartesianCommunicator::Init(int *argc, char ***argv) {
 | 
			
		||||
  shmem_init();
 | 
			
		||||
  XConnections.resize(shmem_n_pes());
 | 
			
		||||
@@ -65,13 +70,6 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
 | 
			
		||||
    RConnections[pe].seq_local = 0;
 | 
			
		||||
    RConnections[pe].seq_remote= 0;
 | 
			
		||||
  }
 | 
			
		||||
  WorldSize = shmem_n_pes();
 | 
			
		||||
  WorldRank = shmem_my_pe();
 | 
			
		||||
  ShmRank=0;
 | 
			
		||||
  ShmSize=1;
 | 
			
		||||
  GroupRank=WorldRank;
 | 
			
		||||
  GroupSize=WorldSize;
 | 
			
		||||
  Slave    =0;
 | 
			
		||||
  shmem_barrier_all();
 | 
			
		||||
  ShmInitGeneric();
 | 
			
		||||
}
 | 
			
		||||
@@ -103,7 +101,7 @@ void CartesianCommunicator::GlobalSum(uint32_t &u){
  static long long source ;
  static long long dest   ;
  static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
  static long      psync[_SHMEM_REDUCE_SYNC_SIZE];
  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;

  //  int nreduce=1;
  //  int pestart=0;
@@ -119,7 +117,7 @@ void CartesianCommunicator::GlobalSum(uint64_t &u){
  static long long source ;
  static long long dest   ;
  static long long llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
  static long      psync[_SHMEM_REDUCE_SYNC_SIZE];
  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;

  //  int nreduce=1;
  //  int pestart=0;
@@ -135,7 +133,7 @@ void CartesianCommunicator::GlobalSum(float &f){
  static float source ;
  static float dest   ;
  static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
  static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;

  source = f;
  dest   =0.0;
@@ -147,7 +145,7 @@ void CartesianCommunicator::GlobalSumVector(float *f,int N)
  static float source ;
  static float dest   = 0 ;
  static float llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
  static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;

  if ( shmem_addr_accessible(f,_processor)  ){
    shmem_float_sum_to_all(f,f,N,0,0,_Nprocessors,llwrk,psync);
@@ -166,7 +164,7 @@ void CartesianCommunicator::GlobalSum(double &d)
  static double source;
  static double dest  ;
  static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
  static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;

  source = d;
  dest   = 0;
@@ -178,7 +176,8 @@ void CartesianCommunicator::GlobalSumVector(double *d,int N)
  static double source ;
  static double dest   ;
  static double llwrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];
  static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;


  if ( shmem_addr_accessible(d,_processor)  ){
    shmem_double_sum_to_all(d,d,N,0,0,_Nprocessors,llwrk,psync);
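Side note on the pSync change running through the reductions above: each GlobalSum previously declared its own uninitialised static long psync[_SHMEM_REDUCE_SYNC_SIZE], whereas the new code copies a std::array filled once from psync_init. Only the tail of that helper is visible at the top of this diff, so the following is a minimal sketch of what such a helper can look like, not a verbatim copy of the repository's code:

  // Hypothetical reconstruction: fill a pSync array once with the value
  // OpenSHMEM expects before the first collective that uses it.
  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> make_psync_init(void) {
    std::array<long,_SHMEM_REDUCE_SYNC_SIZE> ret;
    ret.fill(SHMEM_SYNC_VALUE);
    return ret;
  }
  // Call sites copy psync_init; when a raw long* is needed for the
  // shmem_*_to_all reductions it is available via psync.data().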
@@ -295,7 +294,7 @@ void CartesianCommunicator::Barrier(void)
}
void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
{
  static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;
  static uint32_t word;
  uint32_t *array = (uint32_t *) data;
  assert( (bytes % 4)==0);
@@ -318,7 +317,7 @@ void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
}
void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
{
  static long  psync[_SHMEM_REDUCE_SYNC_SIZE];
  static std::array<long,_SHMEM_REDUCE_SYNC_SIZE> psync =  psync_init;
  static uint32_t word;
  uint32_t *array = (uint32_t *) data;
  assert( (bytes % 4)==0);
 
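Broadcast and BroadcastWorld above treat the payload as 32-bit words (hence the assert on bytes % 4) and keep a static, symmetric uint32_t word to relay data. The body of the loop is not shown in these hunks, so the sketch below only illustrates one way such a word-relay broadcast can be written with standard OpenSHMEM calls; it is not the code hidden behind the @@ markers:

  // Illustration, assuming "word" is the static (symmetric) uint32_t and
  // "array", "bytes", "root" are as declared above.
  for (int w = 0; w < bytes / 4; w++) {
    if (shmem_my_pe() == root) word = array[w];          // root publishes one word
    shmem_barrier_all();                                 // make it globally visible
    shmem_getmem(&word, &word, sizeof(uint32_t), root);  // every PE fetches root's copy
    array[w] = word;
    shmem_barrier_all();                                 // before root reuses "word"
  }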
@@ -32,8 +32,7 @@ directory
namespace Grid {
namespace QCD {

int WilsonKernelsStatic::HandOpt;
int WilsonKernelsStatic::AsmOpt;
int WilsonKernelsStatic::Opt;

template <class Impl>
WilsonKernels<Impl>::WilsonKernels(const ImplParams &p) : Base(p){};
 
@@ -40,9 +40,9 @@ namespace QCD {
  ////////////////////////////////////////////////////////////////////////////////////////////////////////////////
class WilsonKernelsStatic { 
 public:
  enum { OptGeneric, OptHandUnroll, OptInlineAsm };
  // S-direction is INNERMOST and takes no part in the parity.
  static int AsmOpt;  // these are a temporary hack
  static int HandOpt; // these are a temporary hack
  static int Opt;  // these are a temporary hack
};
 
template<class Impl> class WilsonKernels : public FermionOperator<Impl> , public WilsonKernelsStatic { 
@@ -56,24 +56,40 @@ public:
  template <bool EnableBool = true>
  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 &&EnableBool, void>::type
  DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
		   int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
		   int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) 
  {
    switch(Opt) {
#ifdef AVX512
    if (AsmOpt) {
      WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
    } else {
#else
    {
#endif
    case OptInlineAsm:
      for (int site = 0; site < Ns; site++) {
	for (int s = 0; s < Ls; s++) {
	  if (HandOpt)
	    WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out);
	  else
	    WilsonKernels<Impl>::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out);
	  WilsonKernels<Impl>::DiracOptAsmDhopSite(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
	  sF++;
	}
	sU++;
      }
      break;
#endif
    case OptHandUnroll:
      for (int site = 0; site < Ns; site++) {
	for (int s = 0; s < Ls; s++) {
	  WilsonKernels<Impl>::DiracOptHandDhopSite(st,lo,U,buf,sF,sU,in,out);
	  sF++;
	}
	sU++;
      }
      break;
    case OptGeneric:
      for (int site = 0; site < Ns; site++) {
	for (int s = 0; s < Ls; s++) {
	  WilsonKernels<Impl>::DiracOptGenericDhopSite(st,lo,U,buf,sF,sU,in,out);
	  sF++;
	}
	sU++;
      }
      break;
    default:
      assert(0);
    }
  }
     
@@ -81,7 +97,7 @@ public:
  typename std::enable_if<(Impl::Dimension != 3 || (Impl::Dimension == 3 && Nc != 3)) && EnableBool, void>::type
  DiracOptDhopSite(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
		   int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {
     
    // no kernel choice  
    for (int site = 0; site < Ns; site++) {
      for (int s = 0; s < Ls; s++) {
	WilsonKernels<Impl>::DiracOptGenericDhopSite(st, lo, U, buf, sF, sU, in, out);
@@ -95,23 +111,39 @@ public:
  typename std::enable_if<Impl::Dimension == 3 && Nc == 3 && EnableBool,void>::type
  DiracOptDhopSiteDag(StencilImpl &st, LebesgueOrder &lo, DoubledGaugeField &U, SiteHalfSpinor * buf,
		      int sF, int sU, int Ls, int Ns, const FermionField &in, FermionField &out) {

    switch(Opt) {
#ifdef AVX512
    if (AsmOpt) {
      WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
    } else {
#else
    {
#endif
    case OptInlineAsm:
      for (int site = 0; site < Ns; site++) {
	for (int s = 0; s < Ls; s++) {
	  if (HandOpt)
	    WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
	  else
	    WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
	  WilsonKernels<Impl>::DiracOptAsmDhopSiteDag(st,lo,U,buf,sF,sU,Ls,Ns,in,out);
	  sF++;
	}
	sU++;
      }
      break;
#endif
    case OptHandUnroll:
      for (int site = 0; site < Ns; site++) {
	for (int s = 0; s < Ls; s++) {
	  WilsonKernels<Impl>::DiracOptHandDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
	  sF++;
	}
	sU++;
      }
      break;
    case OptGeneric:
      for (int site = 0; site < Ns; site++) {
	for (int s = 0; s < Ls; s++) {
	  WilsonKernels<Impl>::DiracOptGenericDhopSiteDag(st,lo,U,buf,sF,sU,in,out);
	  sF++;
	}
	sU++;
      }
      break;
    default:
      assert(0);
    }
  }

 
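The two hunks above replace the old AsmOpt/HandOpt flag pair with a single WilsonKernelsStatic::Opt selector dispatched through a switch over OptGeneric, OptHandUnroll and OptInlineAsm. How the selector gets set is not part of this diff; as an illustration only, a benchmark or test could pick a kernel variant like this:

  // Illustration: choose the Wilson kernel variant before calling Dhop.
  // OptInlineAsm is only compiled in under AVX512, so fall back otherwise.
#ifdef AVX512
  WilsonKernelsStatic::Opt = WilsonKernelsStatic::OptInlineAsm;
#else
  WilsonKernelsStatic::Opt = WilsonKernelsStatic::OptHandUnroll;
#endif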
@@ -32,7 +32,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
namespace Grid {

int LebesgueOrder::UseLebesgueOrder;
std::vector<int> LebesgueOrder::Block({2,2,2,2});
std::vector<int> LebesgueOrder::Block({8,2,2,2});

LebesgueOrder::IndexInteger LebesgueOrder::alignup(IndexInteger n){
  n--;           // 1000 0011 --> 1000 0010
 
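The alignup routine that opens above is the usual round-up-to-the-next-power-of-two bit trick; only its first statement survives in this hunk, so the completion below is a sketch of the standard technique (with an assumed 32-bit unsigned type), not a copy of the file:

  // Smear the highest set bit into every lower position, then add one.
  static inline uint32_t alignup_pow2(uint32_t n) {
    n--;            // 1000 0011 --> 1000 0010
    n |= n >> 1;    // 1000 0010 --> 1100 0011
    n |= n >> 2;
    n |= n >> 4;
    n |= n >> 8;
    n |= n >> 16;
    return n + 1;   // smallest power of two >= the original n
  }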
@@ -1 +0,0 @@
./configure --host=arm-linux-gnueabihf  CXX=clang++-3.5 CXXFLAGS='-std=c++11 -O3 -target arm-linux-gnueabihf -I/usr/arm-linux-gnueabihf/include/ -I/home/neo/Codes/gmp6.0/gmp-arm/include/ -I/usr/arm-linux-gnueabihf/include/c++/4.8.2/arm-linux-gnueabihf/ -L/home/neo/Codes/gmp6.0/gmp-arm/lib/ -I/home/neo/Codes/mpfr3.1.2/mpfr-arm/include/ -L/home/neo/Codes/mpfr3.1.2/mpfr-arm/lib/ -static -mcpu=cortex-a7' --enable-simd=NEONv7
@@ -1,3 +0,0 @@
#./configure --host=arm-linux-gnueabihf  CXX=clang++-3.5 CXXFLAGS='-std=c++11 -O3 -target arm-linux-gnueabihf -I/usr/arm-linux-gnueabihf/include/ -I/home/neo/Codes/gmp6.0/gmp-arm/include/ -I/usr/lib/llvm-3.5/lib/clang/3.5.0/include/ -L/home/neo/Codes/gmp6.0/gmp-arm/lib/ -I/home/neo/Codes/mpfr3.1.2/mpfr-arm/include/ -L/home/neo/Codes/mpfr3.1.2/mpfr-arm/lib/ -static -mcpu=cortex-a57' --enable-simd=NEONv7

./configure --host=aarch64-linux-gnu  CXX=clang++-3.5 CXXFLAGS='-std=c++11 -O3 -target aarch64-linux-gnu -static -I/home/neo/Codes/gmp6.0/gmp-armv8/include/ -L/home/neo/Codes/gmp6.0/gmp-armv8/lib/ -I/home/neo/Codes/mpfr3.1.2/mpfr-armv8/include/ -L/home/neo/Codes/mpfr3.1.2/mpfr-armv8/lib/ -I/usr/aarch64-linux-gnu/include/ -I/usr/aarch64-linux-gnu/include/c++/4.8.2/aarch64-linux-gnu/' --enable-simd=NEONv7
@@ -1,9 +0,0 @@
for omp in 1 2 4
do
echo > wilson.t$omp
for vol in 4.4.4.4 4.4.4.8 4.4.8.8  4.8.8.8  8.8.8.8   8.8.8.16 8.8.16.16  8.16.16.16
do   
perf=` ./benchmarks/Grid_wilson --grid $vol --omp $omp  | grep mflop | awk '{print $3}'`
echo $vol $perf >> wilson.t$omp
done
done
@@ -1,46 +0,0 @@
#!/bin/bash  -e

DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi clang-sse"
EXTRADIRS="g++-avx g++-sse4 icpc-avx icpc-avx2 icpc-avx512"
BLACK="\033[30m"
RED="\033[31m"
GREEN="\033[32m"
YELLOW="\033[33m"
BLUE="\033[34m"
PINK="\033[35m"
CYAN="\033[36m"
WHITE="\033[37m"
NORMAL="\033[0;39m"

for D in $DIRS
do

echo
echo -e $RED ==============================
echo -e $GREEN $D
echo -e $RED ==============================
echo -e $BLUE

  cd builds/$D
  make clean all -j 8
  cd ../../
echo -e $NORMAL
done

if [ "X$1" == "Xextra" ]
then
for D in $EXTRADIRS
do

echo
echo -e $RED ==============================
echo -e $RED $D
echo -e $RED ==============================
echo -e $BLUE

  cd builds/$D
  make clean all -j 8
  cd ../../
echo -e $NORMAL
done
fi
@@ -1,11 +0,0 @@
#!/bin/bash

DIRS="clang-avx clang-avx-openmp clang-avx-openmp-mpi clang-avx-mpi clang-avx2 clang-avx2-openmp clang-avx2-openmp-mpi clang-avx2-mpi icpc-avx icpc-avx2 icpc-avx512 g++-sse4 g++-avx clang-sse icpc-avx-openmp-mpi icpc-avx-openmp"

for D in $DIRS
do
  mkdir -p builds/$D
  cd builds/$D
  ../../scripts/configure-commands $D
  cd ../..
done
@@ -1,89 +0,0 @@
#!/bin/bash
WD=$1
BLACK="\033[30m"
RED="\033[31m"
GREEN="\033[32m"
YELLOW="\033[33m"
BLUE="\033[34m"
PINK="\033[35m"
CYAN="\033[36m"
WHITE="\033[37m"
NORMAL="\033[0;39m"
echo
echo -e $RED ==============================
echo -e $GREEN $WD
echo -e $RED ==============================
echo -e $YELLOW

case $WD in
g++-avx)
  CXX=g++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
  ;;
g++-avx-openmp)
  CXX=g++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LIBS="-fopenmp -lgmp -lmpfr" --enable-comms=none
  ;;
g++5-sse4)
  CXX=g++-5 ../../configure --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
  ;;
g++5-avx)
  CXX=g++-5 ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
  ;;
icpc-avx)
  CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
  ;;
icpc-avx-openmp-mpi)
CXX=icpc ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
;;
icpc-avx-openmp)
CXX=icpc ../../configure --enable-precision=single --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LIBS="-fopenmp -lgmp -lmpfr" --enable-comms=mpi
;;
icpc-avx2)
  CXX=icpc ../../configure --enable-simd=AVX2 CXXFLAGS="-march=core-avx2 -O3 -std=c++11" LIBS="-lgmp -lmpfr" --enable-comms=none
  ;;
icpc-avx512)
  CXX=icpc ../../configure --enable-simd=AVX512 CXXFLAGS="-xCOMMON-AVX512 -O3  -std=c++11" --host=none  LIBS="-lgmp -lmpfr" --enable-comms=none
  ;;
icpc-mic)
  CXX=icpc ../../configure --host=none --enable-simd=IMCI CXXFLAGS="-mmic -O3  -std=c++11" LDFLAGS=-mmic LIBS="-lgmp -lmpfr" --enable-comms=none
  ;;
icpc-mic-avx512)
  CXX=icpc ../../configure --host=none --enable-simd=IMCI CXXFLAGS="-xCOMMON_AVX512 -O3  -std=c++11" LDFLAGS=-xCOMMON_AVX512 LIBS="-lgmp -lmpfr" --enable-comms=none
  ;;
clang-sse)
CXX=clang++ ../../configure --enable-precision=single --enable-simd=SSE4 CXXFLAGS="-msse4 -O3 -std=c++11"  LIBS="-lgmp -lmpfr" --enable-comms=none
  ;;
clang-avx)
CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11"  LIBS="-lgmp -lmpfr" --enable-comms=none
  ;;
clang-avx2)
CXX=clang++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -std=c++11"  LIBS="-lgmp -lmpfr" --enable-comms=none
  ;;
clang-avx-openmp)
CXX=clang-omp++ ../../configure --enable-precision=double --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11" LDFLAGS="-fopenmp"  LIBS="-lgmp -lmpfr" --enable-comms=none
  ;;
clang-xc30)
CXX=$HOME/Clang/install/bin/clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -std=c++11 -I/opt/gcc/4.9.2/snos/include/g++/x86_64-suse-linux/ -I/opt/gcc/4.9.2/snos/include/g++/ " LDFLAGS=""  LIBS="-lgmp -lmpfr" --enable-comms=none
  ;;
clang-xc30-openmp)
CXX=$HOME/Clang/install/bin/clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -std=c++11 -I/opt/gcc/4.9.2/snos/include/g++/x86_64-suse-linux/ -I/opt/gcc/4.9.2/snos/include/g++/ " LDFLAGS="-fopenmp"  LIBS="-lgmp -lmpfr" --enable-comms=none
  ;;
clang-avx2-openmp)
CXX=clang-omp++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -fopenmp -O3 -std=c++11" LDFLAGS="-fopenmp"  LIBS="-lgmp -lmpfr" --enable-comms=none
  ;;
clang-avx-openmp-mpi)
CXX=clang-omp++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
;;
clang-avx2-openmp-mpi)
CXX=clang-omp++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -fopenmp -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -fopenmp -lgmp -lmpfr" --enable-comms=mpi
;;
clang-avx-mpi)
CXX=clang++ ../../configure --enable-simd=AVX CXXFLAGS="-mavx -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -lgmp -lmpfr" --enable-comms=mpi
;;
clang-avx2-mpi)
CXX=clang++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -I/opt/local/include/openmpi-mp/ -std=c++11" LDFLAGS=-L/opt/local/lib/openmpi-mp/ LIBS="-lmpi -lmpi_cxx -lgmp -lmpfr" --enable-comms=mpi
;;
clang-avx2)
CXX=clang++ ../../configure --enable-simd=AVX2 CXXFLAGS="-mavx2 -mfma -O3 -std=c++11" LDFLAGS="-L/usr/local/lib/" LIBS="-lgmp -lmpfr" --enable-comms=none
;;
esac
echo -e $NORMAL
@@ -1,10 +0,0 @@
#!/bin/bash
DIRS="g++-avx-openmp g++-avx clang-xc30 clang-xc30-openmp"

for D in $DIRS
do
  mkdir -p builds/$D
  cd builds/$D
  ../../scripts/configure-commands $D
  cd ../..
done
@@ -1,10 +0,0 @@
#!/bin/bash
DIRS="build-icpc-mic"

for D in $DIRS
do
  mkdir -p $D
 cd $D
  ../configure-commands
  cd ..
done
@@ -12,6 +12,7 @@ Grid physics library, www.github.com/paboyle/Grid
Source file: $1

Copyright (C) 2015
Copyright (C) 2016

EOF

@@ -38,8 +39,21 @@ See the full license in the file "LICENSE" in the top level distribution directo
/*  END LEGAL */
EOF


cat message > tmp.fil
cat $1 >> tmp.fil

NOTICE=`grep -n "END LEGAL" $1 | awk '{ print $1 }'  `

if [ "X$NOTICE" != "X" ]
then
    echo "found notice ending on line $NOTICE"
    awk 'BEGIN { P=0 } { if ( P ) print } /END LEGAL/{P=1} ' $1 >> tmp.fil
else
    cat $1 >> tmp.fil
      
fi


cp tmp.fil $1

shift
 
@@ -1,2 +0,0 @@
module swap PrgEnv-cray PrgEnv-intel
module swap intel/14.0.4.211 intel/15.0.2.164
@@ -1,4 +0,0 @@
aclocal -I m4
autoheader -f
automake -f --add-missing
autoconf -f
@@ -1,18 +0,0 @@
#!/usr/bin/env bash

if (( $# != 1 )); then
    echo "usage: `basename $0` <archive>" 1>&2
    exit 1
fi
ARC=$1

INITDIR=`pwd`
rm -rf lib/fftw
mkdir lib/fftw

ARCDIR=`tar -tf ${ARC} | head -n1 | sed -e 's@/.*@@'`
tar -xf ${ARC}
cp ${ARCDIR}/api/fftw3.h lib/fftw/

cd ${INITDIR}
rm -rf ${ARCDIR}
@@ -1,7 +0,0 @@
plot 'wilson.t1' u 2 w l t "AVX1-OMP=1"
replot 'wilson.t2' u 2 w l t "AVX1-OMP=2"
replot 'wilson.t4' u 2 w l t "AVX1-OMP=4"
set terminal 'pdf'
set output 'wilson_clang.pdf'
replot
quit