Merge branch 'develop' of https://github.com/paboyle/Grid into develop

2025-10-22 16:54:48 +01:00 · 2017-09-04 14:10:15 +01:00
parent b83b2b1415 4b4c2a715b
commit a5fe07c077
24 changed files with 1182 additions and 267 deletions
--- a/.travis.yml
+++ b/.travis.yml
@@ -9,68 +9,6 @@ matrix:
    - os:        osx
      osx_image: xcode8.3
      compiler: clang
    - compiler: gcc
      dist: trusty
      sudo: required
      addons:
        apt:
          sources:
            - ubuntu-toolchain-r-test
          packages:
            - g++-4.9
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - libopenmpi-dev
            - openmpi-bin
            - binutils-dev
      env: VERSION=-4.9
    - compiler: gcc
      dist: trusty
      sudo: required
      addons:
        apt:
          sources:
            - ubuntu-toolchain-r-test
          packages:
            - g++-5
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - libopenmpi-dev
            - openmpi-bin
            - binutils-dev
      env: VERSION=-5
    - compiler: clang
      dist: trusty
      addons:
        apt:
          sources:
            - ubuntu-toolchain-r-test
          packages:
            - g++-4.8
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - libopenmpi-dev
            - openmpi-bin
            - binutils-dev
      env: CLANG_LINK=http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
    - compiler: clang
      dist: trusty
      addons:
        apt:
          sources:
            - ubuntu-toolchain-r-test
          packages:
            - g++-4.8
            - libmpfr-dev
            - libgmp-dev
            - libmpc-dev
            - libopenmpi-dev
            - openmpi-bin
            - binutils-dev
      env: CLANG_LINK=http://llvm.org/releases/3.7.0/clang+llvm-3.7.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz
 before_install:
    - export GRIDDIR=`pwd`
@@ -106,9 +44,3 @@ script:
    - make -j4
    - ./benchmarks/Benchmark_dwf --threads 1 --debug-signals
    - make check
    - echo make clean
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then ../configure --enable-precision=single --enable-simd=SSE4 --enable-comms=mpi-auto ; fi
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then make -j4; fi
    - if [[ "$TRAVIS_OS_NAME" == "linux" ]] && [[ "$CC" == "clang" ]]; then mpirun.openmpi -n 2 ./benchmarks/Benchmark_dwf --threads 1 --mpi 2.1.1.1; fi
--- a/README.md
+++ b/README.md
@@ -1,18 +1,4 @@
-# Grid
+# Grid [![Teamcity status](http://ci.cliath.ph.ed.ac.uk/app/rest/builds/aggregated/strob:(buildType:(affectedProject(id:Grid)),branch:name:develop)/statusIcon.svg)](http://ci.cliath.ph.ed.ac.uk/project.html?projectId=Grid&tab=projectOverview) [![Travis status](https://travis-ci.org/paboyle/Grid.svg?branch=develop)](https://travis-ci.org/paboyle/Grid)
 <table>
 <tr>
    <td>Last stable release</td>
    <td><a href="https://travis-ci.org/paboyle/Grid">
    <img src="https://travis-ci.org/paboyle/Grid.svg?branch=master"></a>
    </td>
 </tr>
 <tr>
    <td>Development branch</td>
    <td><a href="https://travis-ci.org/paboyle/Grid">
    <img src="https://travis-ci.org/paboyle/Grid.svg?branch=develop"></a>
    </td>
 </tr>
 </table>
 **Data parallel C++ mathematical object library.**
--- a/4
+++ b/4
@@ -6,6 +6,7 @@ Large item work list:
 1)- BG/Q port and check
 2)- Christoph's local basis expansion Lanczos
 3)- Precision conversion and sort out localConvert      <-- partial
  - Consistent linear solver flop count/rate -- PARTIAL, time but no flop/s yet
 4)- Physical propagator interface
 5)- Conserved currents
@@ -13,7 +14,8 @@ Large item work list:
 7)- HDCR resume
 Recent DONE 
-- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O
+
 -- MultiRHS with spread out extra dim -- Go through filesystem with SciDAC I/O.  <--- DONE
 -- Lanczos Remove DenseVector, DenseMatrix; Use Eigen instead. <-- DONE
 -- GaugeFix into central location                      <-- DONE
 -- Scidac and Ildg metadata handling                   <-- DONE
--- a/benchmarks/Benchmark_ITT.cc
+++ b/benchmarks/Benchmark_ITT.cc
@@ -32,6 +32,19 @@ using namespace std;
 using namespace Grid;
 using namespace Grid::QCD;
 typedef WilsonFermion5D<DomainWallVec5dImplR> WilsonFermion5DR;
 typedef WilsonFermion5D<DomainWallVec5dImplF> WilsonFermion5DF;
 typedef WilsonFermion5D<DomainWallVec5dImplD> WilsonFermion5DD;
 std::vector<int> L_list;
 std::vector<int> Ls_list;
 std::vector<double> mflop_list;
 double mflop_ref;
 double mflop_ref_err;
 int NN_global;
 struct time_statistics{
  double mean;
@@ -95,13 +108,15 @@ public:
  static void Comms(void)
  {
-    int Nloop=100;
+    int Nloop=200;
    int nmu=0;
    int maxlat=32;
    std::vector<int> simd_layout = GridDefaultSimd(Nd,vComplexD::Nsimd());
    std::vector<int> mpi_layout  = GridDefaultMpi();
    for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
    std::vector<double> t_time(Nloop);
    time_statistics timestat;
@@ -133,13 +148,14 @@ public:
 	  bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	}
 	int ncomm;
 	int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
 	int ncomm;
 	double dbytes;
 	std::vector<double> times(Nloop);
 	for(int i=0;i<Nloop;i++){
 	  double start=usecond();
 	  std::vector<CartesianCommunicator::CommsRequest_t> requests;
 	  dbytes=0;
 	  ncomm=0;
@@ -150,7 +166,6 @@ public:
 	    if (mpi_layout[mu]>1 ) {
 	      ncomm++;
 	      int xmit_to_rank;
 	      int recv_from_rank;
 	      if ( dir == mu ) { 
@@ -160,18 +175,18 @@ public:
 		int comm_proc = mpi_layout[mu]-1;
 		Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	      }
-#if 0
+	      tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
-	      tbytes= Grid.StencilSendToRecvFromBegin(requests,
+						 (void *)&rbuf[dir][0], recv_from_rank,
-						      (void *)&xbuf[dir][0],
+						 bytes,dir);
 						      xmit_to_rank,
 						      (void *)&rbuf[dir][0],
 						      recv_from_rank,
 						      bytes,dir);
 	      Grid.StencilSendToRecvFromComplete(requests,dir);
 #endif
 	      requests.resize(0);
 #ifdef GRID_OMP
 #pragma omp atomic
 #endif
 	      ncomm++;
 #ifdef GRID_OMP
 #pragma omp atomic
 #endif
 	      dbytes+=tbytes;
 	    }
 	  }
@@ -181,13 +196,15 @@ public:
 	}
 	timestat.statistics(t_time);
 	//	for(int i=0;i<t_time.size();i++){
 	//	  std::cout << i<<" "<<t_time[i]<<std::endl;
 	//	}
 	dbytes=dbytes*ppn;
 	double xbytes    = dbytes*0.5;
 	double rbytes    = dbytes*0.5;
 	double bidibytes = dbytes;
 	std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
 		 <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
 		 <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
@@ -196,7 +213,8 @@ public:
 		 << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
-      }
+	
 	    }
    }    
    return;
@@ -218,7 +236,7 @@ public:
    std::cout<<GridLogMessage << "----------------------------------------------------------"<<std::endl;
  uint64_t lmax=48;
-#define NLOOP (10*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
+#define NLOOP (100*lmax*lmax*lmax*lmax/lat/lat/lat/lat)
    GridSerialRNG          sRNG;      sRNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));
    for(int lat=8;lat<=lmax;lat+=4){
@@ -253,8 +271,7 @@ public:
    }
  };
-
+  static double DWF5(int Ls,int L)
  static void DWF(int Ls,int L)
  {
    RealD mass=0.1;
    RealD M5  =1.8;
@@ -262,6 +279,7 @@ public:
    double mflops;
    double mflops_best = 0;
    double mflops_worst= 0;
    std::vector<double> mflops_all;
    ///////////////////////////////////////////////////////
    // Set/Get the layout & grid size
@@ -274,6 +292,189 @@ public:
 								       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
    uint64_t NP = TmpGrid->RankCount();
    uint64_t NN = TmpGrid->NodeCount();
    NN_global=NN;
    uint64_t SHM=NP/NN;
    std::vector<int> internal;
    if      ( SHM == 1 )   internal = std::vector<int>({1,1,1,1});
    else if ( SHM == 2 )   internal = std::vector<int>({2,1,1,1});
    else if ( SHM == 4 )   internal = std::vector<int>({2,2,1,1});
    else if ( SHM == 8 )   internal = std::vector<int>({2,2,2,1});
    else assert(0);
    std::vector<int> nodes({mpi[0]/internal[0],mpi[1]/internal[1],mpi[2]/internal[2],mpi[3]/internal[3]});
    std::vector<int> latt4({local[0]*nodes[0],local[1]*nodes[1],local[2]*nodes[2],local[3]*nodes[3]});
    ///////// Welcome message ////////////
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << "Benchmark DWF Ls vec on "<<L<<"^4 local volume "<<std::endl;
    std::cout<<GridLogMessage << "* Global volume  : "<<GridCmdVectorIntToString(latt4)<<std::endl;
    std::cout<<GridLogMessage << "* Ls             : "<<Ls<<std::endl;
    std::cout<<GridLogMessage << "* MPI ranks      : "<<GridCmdVectorIntToString(mpi)<<std::endl;
    std::cout<<GridLogMessage << "* Intranode      : "<<GridCmdVectorIntToString(internal)<<std::endl;
    std::cout<<GridLogMessage << "* nodes          : "<<GridCmdVectorIntToString(nodes)<<std::endl;
    std::cout<<GridLogMessage << "* Using "<<threads<<" threads"<<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    ///////// Lattice Init ////////////
    GridCartesian         * UGrid    = SpaceTimeGrid::makeFourDimGrid(latt4, GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
    GridRedBlackCartesian * UrbGrid  = SpaceTimeGrid::makeFourDimRedBlackGrid(UGrid);
    GridCartesian         * sUGrid   = SpaceTimeGrid::makeFourDimDWFGrid(latt4,GridDefaultMpi());
    GridRedBlackCartesian * sUrbGrid = SpaceTimeGrid::makeFourDimRedBlackGrid(sUGrid);
    GridCartesian         * sFGrid   = SpaceTimeGrid::makeFiveDimDWFGrid(Ls,UGrid);
    GridRedBlackCartesian * sFrbGrid = SpaceTimeGrid::makeFiveDimDWFRedBlackGrid(Ls,UGrid);
    ///////// RNG Init ////////////
    std::vector<int> seeds4({1,2,3,4});
    std::vector<int> seeds5({5,6,7,8});
    GridParallelRNG          RNG4(UGrid);  RNG4.SeedFixedIntegers(seeds4);
    GridParallelRNG          RNG5(sFGrid);  RNG5.SeedFixedIntegers(seeds5);
    std::cout << GridLogMessage << "Initialised RNGs" << std::endl;
    ///////// Source preparation ////////////
    LatticeFermion src   (sFGrid); random(RNG5,src);
    LatticeFermion tmp   (sFGrid);
    RealD N2 = 1.0/::sqrt(norm2(src));
    src = src*N2;
    LatticeGaugeField Umu(UGrid);  SU3::HotConfiguration(RNG4,Umu); 
    WilsonFermion5DR sDw(Umu,*sFGrid,*sFrbGrid,*sUGrid,*sUrbGrid,M5);
    LatticeFermion src_e (sFrbGrid);
    LatticeFermion src_o (sFrbGrid);
    LatticeFermion r_e   (sFrbGrid);
    LatticeFermion r_o   (sFrbGrid);
    LatticeFermion r_eo  (sFGrid);
    LatticeFermion err   (sFGrid);
    {
      pickCheckerboard(Even,src_e,src);
      pickCheckerboard(Odd,src_o,src);
 #if defined(AVX512) 
      const int num_cases = 6;
      std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
 #else
      const int num_cases = 4;
      std::string fmt("U/S ; U/O ; G/S ; G/O ");
 #endif
      controls Cases [] = {
 #ifdef AVX512
 	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
 #endif
 	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{ QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{ QCD::WilsonKernelsStatic::OptGeneric   , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  }
      }; 
      for(int c=0;c<num_cases;c++) {
 	QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
 	QCD::WilsonKernelsStatic::Opt   = Cases[c].Opt;
 	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptInlineAsm ) std::cout << GridLogMessage<< "* Using Asm Nc=3   WilsonKernels" <<std::endl;
 	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsAndCompute ) std::cout << GridLogMessage<< "* Using Overlapped Comms/Compute" <<std::endl;
 	if ( WilsonKernelsStatic::Comms == WilsonKernelsStatic::CommsThenCompute) std::cout << GridLogMessage<< "* Using sequential comms compute" <<std::endl;
 	if ( sizeof(Real)==4 )   std::cout << GridLogMessage<< "* SINGLE precision "<<std::endl;
 	if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 	int nwarm = 100;
 	double t0=usecond();
 	sFGrid->Barrier();
 	for(int i=0;i<nwarm;i++){
 	  sDw.DhopEO(src_o,r_e,DaggerNo);
 	}
 	sFGrid->Barrier();
 	double t1=usecond();
 	//	uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
 	//	if (ncall < 500) ncall = 500;
 	uint64_t ncall = 500;
 	sFGrid->Broadcast(0,&ncall,sizeof(ncall));
 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
 	sDw.ZeroCounters();
 	time_statistics timestat;
 	std::vector<double> t_time(ncall);
 	for(uint64_t i=0;i<ncall;i++){
 	  t0=usecond();
 	  sDw.DhopEO(src_o,r_e,DaggerNo);
 	  t1=usecond();
 	  t_time[i] = t1-t0;
 	}
 	sFGrid->Barrier();
 	double volume=Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
 	double flops=(1344.0*volume)/2;
 	double mf_hi, mf_lo, mf_err;
 	timestat.statistics(t_time);
 	mf_hi = flops/timestat.min;
 	mf_lo = flops/timestat.max;
 	mf_err= flops/timestat.min * timestat.err/timestat.mean;
 	mflops = flops/timestat.mean;
 	mflops_all.push_back(mflops);
 	if ( mflops_best == 0   ) mflops_best = mflops;
 	if ( mflops_worst== 0   ) mflops_worst= mflops;
 	if ( mflops>mflops_best ) mflops_best = mflops;
 	if ( mflops<mflops_worst) mflops_worst= mflops;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s =   "<< mflops << " ("<<mf_err<<") " << mf_lo<<"-"<<mf_hi <<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per rank   "<< mflops/NP<<std::endl;
 	std::cout<<GridLogMessage << std::fixed << std::setprecision(1)<<"sDeo mflop/s per node   "<< mflops/NN<<std::endl;
 	sDw.Report();
      }
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " sDeo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< mflops_worst/mflops_best <<std::endl;
      std::cout<<GridLogMessage <<fmt << std::endl;
      std::cout<<GridLogMessage ;
      for(int i=0;i<mflops_all.size();i++){
 	std::cout<<mflops_all[i]/NN<<" ; " ;
      }
      std::cout<<std::endl;
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    }
    return mflops_best;
  }
  static double DWF(int Ls,int L)
  {
    RealD mass=0.1;
    RealD M5  =1.8;
    double mflops;
    double mflops_best = 0;
    double mflops_worst= 0;
    std::vector<double> mflops_all;
    ///////////////////////////////////////////////////////
    // Set/Get the layout & grid size
    ///////////////////////////////////////////////////////
    int threads = GridThread::GetThreads();
    std::vector<int> mpi = GridDefaultMpi(); assert(mpi.size()==4);
    std::vector<int> local({L,L,L,L});
    GridCartesian         * TmpGrid   = SpaceTimeGrid::makeFourDimGrid(std::vector<int>({64,64,64,64}), 
 								       GridDefaultSimd(Nd,vComplex::Nsimd()),GridDefaultMpi());
    uint64_t NP = TmpGrid->RankCount();
    uint64_t NN = TmpGrid->NodeCount();
    NN_global=NN;
    uint64_t SHM=NP/NN;
    std::vector<int> internal;
@@ -364,13 +565,15 @@ public:
 #if defined(AVX512) 
      const int num_cases = 6;
      std::string fmt("A/S ; A/O ; U/S ; U/O ; G/S ; G/O ");
 #else
      const int num_cases = 4;
      std::string fmt("U/S ; U/O ; G/S ; G/O ");
 #endif
      controls Cases [] = {
-#if defined(AVX512) 
+#ifdef AVX512
 	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{ QCD::WilsonKernelsStatic::OptInlineAsm , QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
 #endif
 	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsThenCompute ,CartesianCommunicator::CommunicatorPolicySequential  },
 	{ QCD::WilsonKernelsStatic::OptHandUnroll, QCD::WilsonKernelsStatic::CommsAndCompute  ,CartesianCommunicator::CommunicatorPolicySequential  },
@@ -380,6 +583,10 @@ public:
      for(int c=0;c<num_cases;c++) {
 	QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
 	QCD::WilsonKernelsStatic::Opt   = Cases[c].Opt;
 	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptGeneric   ) std::cout << GridLogMessage<< "* Using GENERIC Nc WilsonKernels" <<std::endl;
 	if ( WilsonKernelsStatic::Opt == WilsonKernelsStatic::OptHandUnroll) std::cout << GridLogMessage<< "* Using Nc=3       WilsonKernels" <<std::endl;
@@ -390,11 +597,7 @@ public:
 	if ( sizeof(Real)==8 )   std::cout << GridLogMessage<< "* DOUBLE precision "<<std::endl;
 	std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-
+	int nwarm = 200;
 	QCD::WilsonKernelsStatic::Comms = Cases[c].CommsOverlap;
 	QCD::WilsonKernelsStatic::Opt   = Cases[c].Opt;
 	CartesianCommunicator::SetCommunicatorPolicy(Cases[c].CommsAsynch);
 	int nwarm = 10;
 	double t0=usecond();
 	FGrid->Barrier();
 	for(int i=0;i<nwarm;i++){
@@ -402,7 +605,10 @@ public:
 	}
 	FGrid->Barrier();
 	double t1=usecond();
-	uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
+	//	uint64_t ncall = (uint64_t) 2.5*1000.0*1000.0*nwarm/(t1-t0);
 	//	if (ncall < 500) ncall = 500;
 	uint64_t ncall = 1000;
 	FGrid->Broadcast(0,&ncall,sizeof(ncall));
 	//	std::cout << GridLogMessage << " Estimate " << ncall << " calls per second"<<std::endl;
@@ -428,7 +634,7 @@ public:
 	mf_err= flops/timestat.min * timestat.err/timestat.mean;
 	mflops = flops/timestat.mean;
-
+	mflops_all.push_back(mflops);
 	if ( mflops_best == 0   ) mflops_best = mflops;
 	if ( mflops_worst== 0   ) mflops_worst= mflops;
 	if ( mflops>mflops_best ) mflops_best = mflops;
@@ -450,12 +656,20 @@ public:
      }
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best  mflop/s        =   "<< mflops_best <<std::endl;
+      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Best  mflop/s        =   "<< mflops_best << " ; " << mflops_best/NN<<" per node " <<std::endl;
-      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s        =   "<< mflops_worst<<std::endl;
+      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Deo Worst mflop/s        =   "<< mflops_worst<< " ; " << mflops_worst/NN<<" per node " <<std::endl;
      std::cout<<GridLogMessage << L<<"^4 x "<<Ls<< " Performance Robustness   =   "<< mflops_worst/mflops_best <<std::endl;
      std::cout<<GridLogMessage <<fmt << std::endl;
      std::cout<<GridLogMessage ;
      for(int i=0;i<mflops_all.size();i++){
 	std::cout<<mflops_all[i]/NN<<" ; " ;
      }
      std::cout<<std::endl;
      std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    }
    return mflops_best;
  }
 };
@@ -465,8 +679,11 @@ int main (int argc, char ** argv)
  Grid_init(&argc,&argv);
  CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
 #ifdef KNL
  LebesgueOrder::Block = std::vector<int>({8,2,2,2});
 #else
  LebesgueOrder::Block = std::vector<int>({2,2,2,2});
-
+#endif
  Benchmark::Decomposition();
  int do_memory=1;
@@ -493,26 +710,66 @@ int main (int argc, char ** argv)
    // empty for now
  }
  int sel=2;
  std::vector<int> L_list({8,12,16,24});
  std::vector<double> wilson;
  std::vector<double> dwf4;
  std::vector<double> dwf5;
  if ( do_wilson ) {
    int Ls=1;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Wilson dslash 4D vectorised" <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    Benchmark::DWF(Ls,16);
+    for(int l=0;l<L_list.size();l++){
-    Benchmark::DWF(Ls,24);
+      wilson.push_back(Benchmark::DWF(1,L_list[l]));
-    Benchmark::DWF(Ls,32);
+    }
  }
  int Ls=16;
  if ( do_dwf ) {
    int Ls=16;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
-    Benchmark::DWF(Ls,8);
+    for(int l=0;l<L_list.size();l++){
-    Benchmark::DWF(Ls,12);
+      dwf4.push_back(Benchmark::DWF(Ls,L_list[l]));
-    Benchmark::DWF(Ls,16);
+    }
    Benchmark::DWF(Ls,24);
  }
  if ( do_dwf ) {
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    std::cout<<GridLogMessage << " Domain wall dslash 4D vectorised" <<std::endl;
    std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
    for(int l=0;l<L_list.size();l++){
      dwf5.push_back(Benchmark::DWF5(Ls,L_list[l]));
    }
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Summary table Ls="<<Ls <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "L \t\t Wilson \t DWF4 \t DWF5 " <<std::endl;
  for(int l=0;l<L_list.size();l++){
    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]<<" \t "<<dwf4[l]<<" \t "<<dwf5[l] <<std::endl;
  }
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Per Node Summary table Ls="<<Ls <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  int NN=NN_global;
  std::cout<<GridLogMessage << " L \t\t Wilson\t\t DWF4  \t\t DWF5 " <<std::endl;
  for(int l=0;l<L_list.size();l++){
    std::cout<<GridLogMessage << L_list[l] <<" \t\t "<< wilson[l]/NN<<" \t "<<dwf4[l]/NN<<" \t "<<dwf5[l] /NN<<std::endl;
  }
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  std::cout<<GridLogMessage << " Comparison point result: "  << dwf4[sel]/NN <<std::endl;
  std::cout<<GridLogMessage << "=================================================================================="<<std::endl;
  }
  Grid_finalize();
 }
--- a/benchmarks/Benchmark_comms.cc
+++ b/benchmarks/Benchmark_comms.cc
@@ -68,7 +68,7 @@ int main (int argc, char ** argv)
  int Nloop=100;
  int nmu=0;
-  int maxlat=24;
+  int maxlat=32;
  for(int mu=0;mu<Nd;mu++) if (mpi_layout[mu]>1) nmu++;
  std::cout << GridLogMessage << "Number of iterations to average: "<< Nloop << std::endl;
@@ -80,7 +80,7 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  header();
  for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+    for(int Ls=8;Ls<=8;Ls*=2){
      std::vector<int> latt_size  ({lat*mpi_layout[0],
      				    lat*mpi_layout[1],
@@ -92,11 +92,16 @@ int main (int argc, char ** argv)
      RealD Nnode = Grid.NodeCount();
      RealD ppn = Nrank/Nnode;
-      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+      std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);	
-      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+      std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
      int ncomm;
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
      for(int mu=0;mu<8;mu++){
 	xbuf[mu].resize(lat*lat*lat*Ls);
 	rbuf[mu].resize(lat*lat*lat*Ls);
 	//	std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
      }
      for(int i=0;i<Nloop;i++){
      double start=usecond();
@@ -112,7 +117,6 @@ int main (int argc, char ** argv)
 	    int comm_proc=1;
 	    int xmit_to_rank;
 	    int recv_from_rank;
 	    Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	    Grid.SendToRecvFromBegin(requests,
 				   (void *)&xbuf[mu][0],
@@ -163,7 +167,7 @@ int main (int argc, char ** argv)
  header();
  for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+    for(int Ls=8;Ls<=8;Ls*=2){
      std::vector<int> latt_size  ({lat,lat,lat,lat});
@@ -172,9 +176,14 @@ int main (int argc, char ** argv)
      RealD Nnode = Grid.NodeCount();
      RealD ppn = Nrank/Nnode;
-      std::vector<std::vector<HalfSpinColourVectorD> > xbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+      std::vector<Vector<HalfSpinColourVectorD> > xbuf(8);
-      std::vector<std::vector<HalfSpinColourVectorD> > rbuf(8,std::vector<HalfSpinColourVectorD>(lat*lat*lat*Ls));
+      std::vector<Vector<HalfSpinColourVectorD> > rbuf(8);
      for(int mu=0;mu<8;mu++){
 	xbuf[mu].resize(lat*lat*lat*Ls);
 	rbuf[mu].resize(lat*lat*lat*Ls);
 	//	std::cout << " buffers " << std::hex << (uint64_t)&xbuf[mu][0] <<" " << (uint64_t)&rbuf[mu][0] <<std::endl;
      }
      int ncomm;
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
@@ -249,7 +258,7 @@ int main (int argc, char ** argv)
  header();
  for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+    for(int Ls=8;Ls<=8;Ls*=2){
      std::vector<int> latt_size  ({lat*mpi_layout[0],
      				    lat*mpi_layout[1],
@@ -299,7 +308,7 @@ int main (int argc, char ** argv)
 					      xmit_to_rank,
 					      (void *)&rbuf[mu][0],
 					      recv_from_rank,
-					      bytes);
+					      bytes,mu);
 	    comm_proc = mpi_layout[mu]-1;
@@ -310,11 +319,11 @@ int main (int argc, char ** argv)
 					      xmit_to_rank,
 					      (void *)&rbuf[mu+4][0],
 					      recv_from_rank,
-					      bytes);
+					      bytes,mu+4);
 	  }
 	}
-	Grid.StencilSendToRecvFromComplete(requests);
+	Grid.StencilSendToRecvFromComplete(requests,0);
 	Grid.Barrier();
 	double stop=usecond();
 	t_time[i] = stop-start; // microseconds
@@ -346,7 +355,7 @@ int main (int argc, char ** argv)
  header();
  for(int lat=4;lat<=maxlat;lat+=4){
-    for(int Ls=8;Ls<=32;Ls*=2){
+    for(int Ls=8;Ls<=8;Ls*=2){
      std::vector<int> latt_size  ({lat*mpi_layout[0],
      				    lat*mpi_layout[1],
@@ -393,8 +402,8 @@ int main (int argc, char ** argv)
 					      xmit_to_rank,
 					      (void *)&rbuf[mu][0],
 					      recv_from_rank,
-					      bytes);
+					      bytes,mu);
-	    Grid.StencilSendToRecvFromComplete(requests);
+	    Grid.StencilSendToRecvFromComplete(requests,mu);
 	    requests.resize(0);
 	    comm_proc = mpi_layout[mu]-1;
@@ -406,8 +415,8 @@ int main (int argc, char ** argv)
 					      xmit_to_rank,
 					      (void *)&rbuf[mu+4][0],
 					      recv_from_rank,
-					      bytes);
+					      bytes,mu+4);
-	    Grid.StencilSendToRecvFromComplete(requests);
+	    Grid.StencilSendToRecvFromComplete(requests,mu+4);
 	    requests.resize(0);
 	  }
@@ -436,5 +445,97 @@ int main (int argc, char ** argv)
    }
  }    
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= Benchmarking threaded STENCIL halo exchange in "<<nmu<<" dimensions"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  header();
  for(int lat=4;lat<=maxlat;lat+=4){
    for(int Ls=8;Ls<=8;Ls*=2){
      std::vector<int> latt_size  ({lat*mpi_layout[0],
      				    lat*mpi_layout[1],
      				    lat*mpi_layout[2],
      				    lat*mpi_layout[3]});
      GridCartesian     Grid(latt_size,simd_layout,mpi_layout);
      RealD Nrank = Grid._Nprocessors;
      RealD Nnode = Grid.NodeCount();
      RealD ppn = Nrank/Nnode;
      std::vector<HalfSpinColourVectorD *> xbuf(8);
      std::vector<HalfSpinColourVectorD *> rbuf(8);
      Grid.ShmBufferFreeAll();
      for(int d=0;d<8;d++){
 	xbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	rbuf[d] = (HalfSpinColourVectorD *)Grid.ShmBufferMalloc(lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	bzero((void *)xbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
 	bzero((void *)rbuf[d],lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD));
      }
      int ncomm;
      int bytes=lat*lat*lat*Ls*sizeof(HalfSpinColourVectorD);
      double dbytes;
      for(int i=0;i<Nloop;i++){
 	double start=usecond();
 	std::vector<CartesianCommunicator::CommsRequest_t> requests;
 	dbytes=0;
 	ncomm=0;
 	parallel_for(int dir=0;dir<8;dir++){
 	  double tbytes;
 	  int mu =dir % 4;
 	  if (mpi_layout[mu]>1 ) {
 	    ncomm++;
 	    int xmit_to_rank;
 	    int recv_from_rank;
 	    if ( dir == mu ) { 
 	      int comm_proc=1;
 	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	    } else { 
 	      int comm_proc = mpi_layout[mu]-1;
 	      Grid.ShiftedRanks(mu,comm_proc,xmit_to_rank,recv_from_rank);
 	    }
 	    tbytes= Grid.StencilSendToRecvFrom((void *)&xbuf[dir][0], xmit_to_rank,
 					       (void *)&rbuf[dir][0], recv_from_rank, bytes,dir);
 #pragma omp atomic
 	    dbytes+=tbytes;
 	  }
 	}
 	Grid.Barrier();
 	double stop=usecond();
 	t_time[i] = stop-start; // microseconds
      }
      timestat.statistics(t_time);
      dbytes=dbytes*ppn;
      double xbytes    = dbytes*0.5;
      double rbytes    = dbytes*0.5;
      double bidibytes = dbytes;
      std::cout<<GridLogMessage << std::setw(4) << lat<<"\t"<<Ls<<"\t"
               <<std::setw(11) << bytes<< std::fixed << std::setprecision(1) << std::setw(7)
               <<std::right<< xbytes/timestat.mean<<"  "<< xbytes*timestat.err/(timestat.mean*timestat.mean)<< " "
               <<xbytes/timestat.max <<" "<< xbytes/timestat.min  
               << "\t\t"<<std::setw(7)<< bidibytes/timestat.mean<< "  " << bidibytes*timestat.err/(timestat.mean*timestat.mean) << " "
               << bidibytes/timestat.max << " " << bidibytes/timestat.min << std::endl;
    }
  }    
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  std::cout<<GridLogMessage << "= All done; Bye Bye"<<std::endl;
  std::cout<<GridLogMessage << "===================================================================================================="<<std::endl;
  Grid_finalize();
 }
--- a/benchmarks/Benchmark_dwf.cc
+++ b/benchmarks/Benchmark_dwf.cc
@@ -503,9 +503,9 @@ int main (int argc, char ** argv)
  std::cout<<GridLogMessage << "norm diff even  "<< norm2(src_e)<<std::endl;
  std::cout<<GridLogMessage << "norm diff odd   "<< norm2(src_o)<<std::endl;
-  //assert(norm2(src_e)<1.0e-4);
+  assert(norm2(src_e)<1.0e-4);
-  //assert(norm2(src_o)<1.0e-4);
+  assert(norm2(src_o)<1.0e-4);
  Grid_finalize();
  exit(0);
 }
--- a/configure.ac
+++ b/configure.ac
@@ -342,14 +342,14 @@ case ${ac_COMMS} in
        AC_DEFINE([GRID_COMMS_NONE],[1],[GRID_COMMS_NONE] )
        comms_type='none'
     ;;
     mpi3l*)
       AC_DEFINE([GRID_COMMS_MPI3L],[1],[GRID_COMMS_MPI3L] )
       comms_type='mpi3l'
     ;;
     mpi3*)
        AC_DEFINE([GRID_COMMS_MPI3],[1],[GRID_COMMS_MPI3] )
        comms_type='mpi3'
     ;;
     mpit)
        AC_DEFINE([GRID_COMMS_MPIT],[1],[GRID_COMMS_MPIT] )
        comms_type='mpit'
     ;;
     mpi*)
        AC_DEFINE([GRID_COMMS_MPI],[1],[GRID_COMMS_MPI] )
        comms_type='mpi'
@@ -377,7 +377,7 @@ esac
 AM_CONDITIONAL(BUILD_COMMS_SHMEM, [ test "${comms_type}X" == "shmemX" ])
 AM_CONDITIONAL(BUILD_COMMS_MPI,   [ test "${comms_type}X" == "mpiX" ])
 AM_CONDITIONAL(BUILD_COMMS_MPI3,  [ test "${comms_type}X" == "mpi3X" ] )
-AM_CONDITIONAL(BUILD_COMMS_MPI3L, [ test "${comms_type}X" == "mpi3lX" ] )
+AM_CONDITIONAL(BUILD_COMMS_MPIT,  [ test "${comms_type}X" == "mpitX" ] )
 AM_CONDITIONAL(BUILD_COMMS_NONE,  [ test "${comms_type}X" == "noneX" ])
 ############### RNG selection
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -10,8 +10,8 @@ if BUILD_COMMS_MPI3
  extra_sources+=communicator/Communicator_base.cc
 endif
-if BUILD_COMMS_MPI3L
+if BUILD_COMMS_MPIT
-  extra_sources+=communicator/Communicator_mpi3_leader.cc
+  extra_sources+=communicator/Communicator_mpit.cc
  extra_sources+=communicator/Communicator_base.cc
 endif
--- a/lib/allocator/AlignedAllocator.cc
+++ b/lib/allocator/AlignedAllocator.cc
@@ -1,7 +1,5 @@
 #include <Grid/GridCore.h>
 #include <fcntl.h>
 namespace Grid {
@@ -63,4 +61,37 @@ void *PointerCache::Lookup(size_t bytes) {
  return NULL;
 }
 void check_huge_pages(void *Buf,uint64_t BYTES)
 {
 #ifdef __linux__
  int fd = open("/proc/self/pagemap", O_RDONLY);
  assert(fd >= 0);
  const int page_size = 4096;
  uint64_t virt_pfn = (uint64_t)Buf / page_size;
  off_t offset = sizeof(uint64_t) * virt_pfn;
  uint64_t npages = (BYTES + page_size-1) / page_size;
  uint64_t pagedata[npages];
  uint64_t ret = lseek(fd, offset, SEEK_SET);
  assert(ret == offset);
  ret = ::read(fd, pagedata, sizeof(uint64_t)*npages);
  assert(ret == sizeof(uint64_t) * npages);
  int nhugepages = npages / 512;
  int n4ktotal, nnothuge;
  n4ktotal = 0;
  nnothuge = 0;
  for (int i = 0; i < nhugepages; ++i) {
    uint64_t baseaddr = (pagedata[i*512] & 0x7fffffffffffffULL) * page_size;
    for (int j = 0; j < 512; ++j) {
      uint64_t pageaddr = (pagedata[i*512+j] & 0x7fffffffffffffULL) * page_size;
      ++n4ktotal;
      if (pageaddr != baseaddr + j * page_size)
 	++nnothuge;
      }
  }
  int rank = CartesianCommunicator::RankWorld();
  printf("rank %d Allocated %d 4k pages, %d not in huge pages\n", rank, n4ktotal, nnothuge);
 #endif
 }
 }
--- a/lib/allocator/AlignedAllocator.h
+++ b/lib/allocator/AlignedAllocator.h
@@ -64,6 +64,8 @@ namespace Grid {
  };
  void check_huge_pages(void *Buf,uint64_t BYTES);
 ////////////////////////////////////////////////////////////////////
 // A lattice of something, but assume the something is SIMDized.
 ////////////////////////////////////////////////////////////////////
@@ -92,12 +94,20 @@ public:
    size_type bytes = __n*sizeof(_Tp);
    _Tp *ptr = (_Tp *) PointerCache::Lookup(bytes);
    //    if ( ptr != NULL ) 
    //      std::cout << "alignedAllocator "<<__n << " cache hit "<< std::hex << ptr <<std::dec <<std::endl;
    //////////////////
    // Hack 2MB align; could make option probably doesn't need configurability
    //////////////////
 //define GRID_ALLOC_ALIGN (128)
 #define GRID_ALLOC_ALIGN (2*1024*1024)
 #ifdef HAVE_MM_MALLOC_H
-    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,128);
+    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) _mm_malloc(bytes,GRID_ALLOC_ALIGN);
 #else
-    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(128,bytes);
+    if ( ptr == (_Tp *) NULL ) ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,bytes);
 #endif
    //    std::cout << "alignedAllocator " << std::hex << ptr <<std::dec <<std::endl;
    // First touch optimise in threaded loop
    uint8_t *cp = (uint8_t *)ptr;
 #ifdef GRID_OMP
@@ -111,6 +121,7 @@ public:
  void deallocate(pointer __p, size_type __n) { 
    size_type bytes = __n * sizeof(_Tp);
    pointer __freeme = (pointer)PointerCache::Insert((void *)__p,bytes);
 #ifdef HAVE_MM_MALLOC_H
@@ -189,9 +200,9 @@ public:
  pointer allocate(size_type __n, const void* _p= 0) 
  {
 #ifdef HAVE_MM_MALLOC_H
-    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),128);
+    _Tp * ptr = (_Tp *) _mm_malloc(__n*sizeof(_Tp),GRID_ALLOC_ALIGN);
 #else
-    _Tp * ptr = (_Tp *) memalign(128,__n*sizeof(_Tp));
+    _Tp * ptr = (_Tp *) memalign(GRID_ALLOC_ALIGN,__n*sizeof(_Tp));
 #endif
    size_type bytes = __n*sizeof(_Tp);
    uint8_t *cp = (uint8_t *)ptr;
--- a/lib/communicator/Communicator_base.cc
+++ b/lib/communicator/Communicator_base.cc
@@ -26,6 +26,10 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/GridCore.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <limits.h>
 #include <sys/mman.h>
 namespace Grid {
@@ -34,7 +38,10 @@ namespace Grid {
 ///////////////////////////////////////////////////////////////
 void *              CartesianCommunicator::ShmCommBuf;
 uint64_t            CartesianCommunicator::MAX_MPI_SHM_BYTES   = 128*1024*1024; 
-CartesianCommunicator::CommunicatorPolicy_t  CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
+CartesianCommunicator::CommunicatorPolicy_t  
 CartesianCommunicator::CommunicatorPolicy= CartesianCommunicator::CommunicatorPolicyConcurrent;
 int CartesianCommunicator::nCommThreads = -1;
 int CartesianCommunicator::Hugepages = 0;
 /////////////////////////////////
 // Alloc, free shmem region
@@ -89,25 +96,43 @@ void CartesianCommunicator::GlobalSumVector(ComplexD *c,int N)
  GlobalSumVector((double *)c,2*N);
 }
-#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPI3L)
+#if !defined( GRID_COMMS_MPI3) 
 int                      CartesianCommunicator::NodeCount(void)    { return ProcessorCount();};
 int                      CartesianCommunicator::RankCount(void)    { return ProcessorCount();};
-
+#endif
-double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+#if !defined( GRID_COMMS_MPI3) && !defined (GRID_COMMS_MPIT)
-						       void *xmit,
+double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
-						       int xmit_to_rank,
+						     int xmit_to_rank,
-						       void *recv,
+						     void *recv,
-						       int recv_from_rank,
+						     int recv_from_rank,
-						       int bytes)
+						     int bytes, int dir)
 {
  std::vector<CommsRequest_t> list;
  // Discard the "dir"
  SendToRecvFromBegin   (list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
  SendToRecvFromComplete(list);
  return 2.0*bytes;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
 							 int xmit_to_rank,
 							 void *recv,
 							 int recv_from_rank,
 							 int bytes, int dir)
 {
  // Discard the "dir"
  SendToRecvFromBegin(list,xmit,xmit_to_rank,recv,recv_from_rank,bytes);
  return 2.0*bytes;
 }
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {
  SendToRecvFromComplete(waitall);
 }
 #endif
 #if !defined( GRID_COMMS_MPI3) 
 void CartesianCommunicator::StencilBarrier(void){};
 commVector<uint8_t> CartesianCommunicator::ShmBufStorageVector;
@@ -121,8 +146,22 @@ void *CartesianCommunicator::ShmBufferTranslate(int rank,void * local_p) {
  return NULL;
 }
 void CartesianCommunicator::ShmInitGeneric(void){
 #if 1
  int mmap_flag = MAP_SHARED | MAP_ANONYMOUS;
 #ifdef MAP_HUGETLB
  if ( Hugepages ) mmap_flag |= MAP_HUGETLB;
 #endif
  ShmCommBuf =(void *) mmap(NULL, MAX_MPI_SHM_BYTES, PROT_READ | PROT_WRITE, mmap_flag, -1, 0); 
  if (ShmCommBuf == (void *)MAP_FAILED) {
    perror("mmap failed ");
    exit(EXIT_FAILURE);  
  }
 #else 
  ShmBufStorageVector.resize(MAX_MPI_SHM_BYTES);
  ShmCommBuf=(void *)&ShmBufStorageVector[0];
 #endif
  bzero(ShmCommBuf,MAX_MPI_SHM_BYTES);
 }
 #endif
--- a/lib/communicator/Communicator_base.h
+++ b/lib/communicator/Communicator_base.h
@@ -38,7 +38,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifdef GRID_COMMS_MPI3
 #include <mpi.h>
 #endif
-#ifdef GRID_COMMS_MPI3L
+#ifdef GRID_COMMS_MPIT
 #include <mpi.h>
 #endif
 #ifdef GRID_COMMS_SHMEM
@@ -50,12 +50,24 @@ namespace Grid {
 class CartesianCommunicator {
  public:    
-  // 65536 ranks per node adequate for now
+
  ////////////////////////////////////////////
  // Isend/Irecv/Wait, or Sendrecv blocking
  ////////////////////////////////////////////
  enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
  static CommunicatorPolicy_t CommunicatorPolicy;
  static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
  ///////////////////////////////////////////
  // Up to 65536 ranks per node adequate for now
  // 128MB shared memory for comms enought for 48^4 local vol comms
  // Give external control (command line override?) of this
-
+  ///////////////////////////////////////////
-  static const int      MAXLOG2RANKSPERNODE = 16;            
+  static const int MAXLOG2RANKSPERNODE = 16;            
-  static uint64_t MAX_MPI_SHM_BYTES;
+  static uint64_t  MAX_MPI_SHM_BYTES;
  static int       nCommThreads;
  // use explicit huge pages
  static int       Hugepages;
  // Communicator should know nothing of the physics grid, only processor grid.
  int              _Nprocessors;     // How many in all
@@ -64,14 +76,18 @@ class CartesianCommunicator {
  std::vector<int> _processor_coor;  // linear processor coordinate
  unsigned long _ndimension;
-#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPI3L)
+#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
  static MPI_Comm communicator_world;
-         MPI_Comm communicator;
+
  MPI_Comm              communicator;
  std::vector<MPI_Comm> communicator_halo;
  typedef MPI_Request CommsRequest_t;
 #else 
  typedef int CommsRequest_t;
 #endif
  ////////////////////////////////////////////////////////////////////
  // Helper functionality for SHM Windows common to all other impls
  ////////////////////////////////////////////////////////////////////
@@ -117,10 +133,6 @@ class CartesianCommunicator {
  /////////////////////////////////
  static void * ShmCommBuf;
  // Isend/Irecv/Wait, or Sendrecv blocking
  enum CommunicatorPolicy_t { CommunicatorPolicyConcurrent, CommunicatorPolicySequential };
  static CommunicatorPolicy_t CommunicatorPolicy;
  static void SetCommunicatorPolicy(CommunicatorPolicy_t policy ) { CommunicatorPolicy = policy; }
  size_t heap_top;
  size_t heap_bytes;
@@ -211,14 +223,21 @@ class CartesianCommunicator {
  void SendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
-  double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+  double StencilSendToRecvFrom(void *xmit,
-				  void *xmit,
+			       int xmit_to_rank,
-				  int xmit_to_rank,
+			       void *recv,
-				  void *recv,
+			       int recv_from_rank,
-				  int recv_from_rank,
+			       int bytes,int dir);
 				  int bytes);
-  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall);
+  double StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 				    void *xmit,
 				    int xmit_to_rank,
 				    void *recv,
 				    int recv_from_rank,
 				    int bytes,int dir);
  void StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int i);
  void StencilBarrier(void);
  ////////////////////////////////////////////////////////////
--- a/lib/communicator/Communicator_mpi3.cc
+++ b/lib/communicator/Communicator_mpi3.cc
@@ -41,9 +41,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #ifdef HAVE_NUMAIF_H
 #include <numaif.h>
 #endif
-#ifndef SHM_HUGETLB
+
 #define SHM_HUGETLB 04000
 #endif
 namespace Grid {
@@ -214,12 +212,18 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
      if ( fd < 0 ) {	perror("failed shm_open");	assert(0);      }
      ftruncate(fd, size);
-      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+      int mmap_flag = MAP_SHARED;
 #ifdef MAP_HUGETLB
      if (Hugepages) mmap_flag |= MAP_HUGETLB;
 #endif
      void * ptr =  mmap(NULL,size, PROT_READ | PROT_WRITE, mmap_flag, fd, 0);
      if ( ptr == MAP_FAILED ) {       perror("failed mmap");      assert(0);    }
      assert(((uint64_t)ptr&0x3F)==0);
-      // Try to force numa domain on the shm segment if we have numaif.h
+// Experiments; Experiments; Try to force numa domain on the shm segment if we have numaif.h
-#ifdef HAVE_NUMAIF_H
+#if 0
 //#ifdef HAVE_NUMAIF_H
 	int status;
 	int flags=MPOL_MF_MOVE;
 #ifdef KNL
@@ -266,7 +270,11 @@ void CartesianCommunicator::Init(int *argc, char ***argv) {
    for(int r=0;r<ShmSize;r++){
      size_t size = CartesianCommunicator::MAX_MPI_SHM_BYTES;
      key_t key   = 0x4545 + r;
-      if ((shmids[r]= shmget(key,size, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) {
+      int flags = IPC_CREAT | SHM_R | SHM_W;
 #ifdef SHM_HUGETLB
      flags|=SHM_HUGETLB;
 #endif
      if ((shmids[r]= shmget(key,size, flags)) < 0) {
 	int errsv = errno;
 	printf("Errno %d\n",errsv);
 	perror("shmget");
@@ -397,8 +405,14 @@ CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 { 
  int ierr;
  communicator=communicator_world;
  _ndimension = processors.size();
  communicator_halo.resize (2*_ndimension);
  for(int i=0;i<_ndimension*2;i++){
    MPI_Comm_dup(communicator,&communicator_halo[i]);
  }
  ////////////////////////////////////////////////////////////////
  // Assert power of two shm_size.
  ////////////////////////////////////////////////////////////////
@@ -621,13 +635,27 @@ void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &lis
  }
 }
-double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
+double CartesianCommunicator::StencilSendToRecvFrom( void *xmit,
-						       void *xmit,
+						     int dest,
-						       int dest,
+						     void *recv,
-						       void *recv,
+						     int from,
-						       int from,
+						     int bytes,int dir)
 						       int bytes)
 {
  std::vector<CommsRequest_t> list;
  double offbytes = StencilSendToRecvFromBegin(list,xmit,dest,recv,from,bytes,dir);
  StencilSendToRecvFromComplete(list,dir);
  return offbytes;
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
 							 int dest,
 							 void *recv,
 							 int from,
 							 int bytes,int dir)
 {
  assert(dir < communicator_halo.size());
  MPI_Request xrq;
  MPI_Request rrq;
@@ -646,26 +674,26 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
  gfrom = MPI_UNDEFINED;
 #endif
  if ( gfrom ==MPI_UNDEFINED) {
-    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
+    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[dir],&rrq);
    assert(ierr==0);
    list.push_back(rrq);
    off_node_bytes+=bytes;
  }
  if ( gdest == MPI_UNDEFINED ) {
-    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
+    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[dir],&xrq);
    assert(ierr==0);
    list.push_back(xrq);
    off_node_bytes+=bytes;
  }
  if ( CommunicatorPolicy == CommunicatorPolicySequential ) { 
-    this->StencilSendToRecvFromComplete(list);
+    this->StencilSendToRecvFromComplete(list,dir);
  }
  return off_node_bytes;
 }
-void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall)
+void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 {
  SendToRecvFromComplete(waitall);
 }
--- a/lib/communicator/Communicator_mpit.cc
+++ b/lib/communicator/Communicator_mpit.cc
@@ -0,0 +1,286 @@
    /*************************************************************************************
    Grid physics library, www.github.com/paboyle/Grid 
    Source file: ./lib/communicator/Communicator_mpi.cc
    Copyright (C) 2015
 Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
    See the full license in the file "LICENSE" in the top level distribution directory
    *************************************************************************************/
    /*  END LEGAL */
 #include <Grid/GridCore.h>
 #include <Grid/GridQCDcore.h>
 #include <Grid/qcd/action/ActionCore.h>
 #include <mpi.h>
 namespace Grid {
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 // Info that is setup once and indept of cartesian layout
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 MPI_Comm CartesianCommunicator::communicator_world;
 // Should error check all MPI calls.
 void CartesianCommunicator::Init(int *argc, char ***argv) {
  int flag;
  int provided;
  MPI_Initialized(&flag); // needed to coexist with other libs apparently
  if ( !flag ) {
    MPI_Init_thread(argc,argv,MPI_THREAD_MULTIPLE,&provided);
    if ( provided != MPI_THREAD_MULTIPLE ) {
      QCD::WilsonKernelsStatic::Comms = QCD::WilsonKernelsStatic::CommsThenCompute;
    }
  }
  MPI_Comm_dup (MPI_COMM_WORLD,&communicator_world);
  ShmInitGeneric();
 }
 CartesianCommunicator::CartesianCommunicator(const std::vector<int> &processors)
 {
  _ndimension = processors.size();
  std::vector<int> periodic(_ndimension,1);
  _Nprocessors=1;
  _processors = processors;
  _processor_coor.resize(_ndimension);
  MPI_Cart_create(communicator_world, _ndimension,&_processors[0],&periodic[0],1,&communicator);
  MPI_Comm_rank(communicator,&_processor);
  MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
  for(int i=0;i<_ndimension;i++){
    _Nprocessors*=_processors[i];
  }
  communicator_halo.resize (2*_ndimension);
  for(int i=0;i<_ndimension*2;i++){
    MPI_Comm_dup(communicator,&communicator_halo[i]);
  }
  int Size; 
  MPI_Comm_size(communicator,&Size);
  assert(Size==_Nprocessors);
 }
 void CartesianCommunicator::GlobalSum(uint32_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_SUM,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(uint64_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_SUM,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalXOR(uint32_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT32_T,MPI_BXOR,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalXOR(uint64_t &u){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&u,1,MPI_UINT64_T,MPI_BXOR,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(float &f){
  int ierr=MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSumVector(float *f,int N)
 {
  int ierr=MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSum(double &d)
 {
  int ierr = MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::GlobalSumVector(double *d,int N)
 {
  int ierr = MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 {
  int ierr=MPI_Cart_shift(communicator,dim,shift,&source,&dest);
  assert(ierr==0);
 }
 int CartesianCommunicator::RankFromProcessorCoor(std::vector<int> &coor)
 {
  int rank;
  int ierr=MPI_Cart_rank  (communicator, &coor[0], &rank);
  assert(ierr==0);
  return rank;
 }
 void  CartesianCommunicator::ProcessorCoorFromRank(int rank, std::vector<int> &coor)
 {
  coor.resize(_ndimension);
  int ierr=MPI_Cart_coords  (communicator, rank, _ndimension,&coor[0]);
  assert(ierr==0);
 }
 // Basic Halo comms primitive
 void CartesianCommunicator::SendToRecvFrom(void *xmit,
 					   int dest,
 					   void *recv,
 					   int from,
 					   int bytes)
 {
  std::vector<CommsRequest_t> reqs(0);
  SendToRecvFromBegin(reqs,xmit,dest,recv,from,bytes);
  SendToRecvFromComplete(reqs);
 }
 void CartesianCommunicator::SendRecvPacket(void *xmit,
 					   void *recv,
 					   int sender,
 					   int receiver,
 					   int bytes)
 {
  MPI_Status stat;
  assert(sender != receiver);
  int tag = sender;
  if ( _processor == sender ) {
    MPI_Send(xmit, bytes, MPI_CHAR,receiver,tag,communicator);
  }
  if ( _processor == receiver ) { 
    MPI_Recv(recv, bytes, MPI_CHAR,sender,tag,communicator,&stat);
  }
 }
 // Basic Halo comms primitive
 void CartesianCommunicator::SendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 						void *xmit,
 						int dest,
 						void *recv,
 						int from,
 						int bytes)
 {
  int myrank = _processor;
  int ierr;
  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { 
    MPI_Request xrq;
    MPI_Request rrq;
    ierr =MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&rrq);
    ierr|=MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&xrq);
    assert(ierr==0);
    list.push_back(xrq);
    list.push_back(rrq);
  } else { 
    // Give the CPU to MPI immediately; can use threads to overlap optionally
    ierr=MPI_Sendrecv(xmit,bytes,MPI_CHAR,dest,myrank,
 		      recv,bytes,MPI_CHAR,from, from,
 		      communicator,MPI_STATUS_IGNORE);
    assert(ierr==0);
  }
 }
 void CartesianCommunicator::SendToRecvFromComplete(std::vector<CommsRequest_t> &list)
 {
  if ( CommunicatorPolicy == CommunicatorPolicyConcurrent ) { 
    int nreq=list.size();
    std::vector<MPI_Status> status(nreq);
    int ierr = MPI_Waitall(nreq,&list[0],&status[0]);
    assert(ierr==0);
  }
 }
 void CartesianCommunicator::Barrier(void)
 {
  int ierr = MPI_Barrier(communicator);
  assert(ierr==0);
 }
 void CartesianCommunicator::Broadcast(int root,void* data, int bytes)
 {
  int ierr=MPI_Bcast(data,
 		     bytes,
 		     MPI_BYTE,
 		     root,
 		     communicator);
  assert(ierr==0);
 }
  ///////////////////////////////////////////////////////
  // Should only be used prior to Grid Init finished.
  // Check for this?
  ///////////////////////////////////////////////////////
 int CartesianCommunicator::RankWorld(void){ 
  int r; 
  MPI_Comm_rank(communicator_world,&r);
  return r;
 }
 void CartesianCommunicator::BroadcastWorld(int root,void* data, int bytes)
 {
  int ierr= MPI_Bcast(data,
 		      bytes,
 		      MPI_BYTE,
 		      root,
 		      communicator_world);
  assert(ierr==0);
 }
 double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsRequest_t> &list,
 							 void *xmit,
 							 int xmit_to_rank,
 							 void *recv,
 							 int recv_from_rank,
 							 int bytes,int dir)
 {
  int myrank = _processor;
  int ierr;
  assert(dir < communicator_halo.size());
  //  std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
  // Give the CPU to MPI immediately; can use threads to overlap optionally
  MPI_Request req[2];
  MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
  MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank  ,myrank        , communicator_halo[dir],&req[0]);
  list.push_back(req[0]);
  list.push_back(req[1]);
  return 2.0*bytes;
 }
 void CartesianCommunicator::StencilSendToRecvFromComplete(std::vector<CommsRequest_t> &waitall,int dir)
 { 
  int nreq=waitall.size();
  MPI_Waitall(nreq, &waitall[0], MPI_STATUSES_IGNORE);
 };
 double CartesianCommunicator::StencilSendToRecvFrom(void *xmit,
 						    int xmit_to_rank,
 						    void *recv,
 						    int recv_from_rank,
 						    int bytes,int dir)
 {
  int myrank = _processor;
  int ierr;
  assert(dir < communicator_halo.size());
  //  std::cout << " sending on communicator "<<dir<<" " <<communicator_halo[dir]<<std::endl;
  // Give the CPU to MPI immediately; can use threads to overlap optionally
  MPI_Request req[2];
  MPI_Irecv(recv,bytes,MPI_CHAR,recv_from_rank,recv_from_rank, communicator_halo[dir],&req[1]);
  MPI_Isend(xmit,bytes,MPI_CHAR,xmit_to_rank  ,myrank        , communicator_halo[dir],&req[0]);
  MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
  return 2.0*bytes;
 }
 }
--- a/lib/cshift/Cshift.h
+++ b/lib/cshift/Cshift.h
@@ -42,7 +42,7 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 #include <Grid/cshift/Cshift_mpi.h>
 #endif 
-#ifdef GRID_COMMS_MPI3L
+#ifdef GRID_COMMS_MPIT
 #include <Grid/cshift/Cshift_mpi.h>
 #endif 
--- a/lib/log/Log.cc
+++ b/lib/log/Log.cc
@@ -95,7 +95,7 @@ void GridLogConfigure(std::vector<std::string> &logstreams) {
 ////////////////////////////////////////////////////////////
 void Grid_quiesce_nodes(void) {
  int me = 0;
-#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPI3L)
+#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT)
  MPI_Comm_rank(MPI_COMM_WORLD, &me);
 #endif
 #ifdef GRID_COMMS_SHMEM
--- a/lib/parallelIO/BinaryIO.h
+++ b/lib/parallelIO/BinaryIO.h
@@ -29,7 +29,7 @@
 #ifndef GRID_BINARY_IO_H
 #define GRID_BINARY_IO_H
-#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) 
+#if defined(GRID_COMMS_MPI) || defined(GRID_COMMS_MPI3) || defined(GRID_COMMS_MPIT) 
 #define USE_MPI_IO
 #else
 #undef  USE_MPI_IO
--- a/lib/qcd/action/fermion/CayleyFermion5D.cc
+++ b/lib/qcd/action/fermion/CayleyFermion5D.cc
@@ -414,7 +414,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
  for(int i=0; i < Ls; i++){
    as[i] = 1.0;
    omega[i] = gamma[i]*zolo_hi; //NB reciprocal relative to Chroma NEF code
-    //    assert(fabs(omega[i])>0.0);
+    assert(omega[i]!=Coeff_t(0.0));
    bs[i] = 0.5*(bpc/omega[i] + bmc);
    cs[i] = 0.5*(bpc/omega[i] - bmc);
  }
@@ -429,7 +429,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
  for(int i=0;i<Ls;i++){
    bee[i]=as[i]*(bs[i]*(4.0-this->M5) +1.0);     
-    //    assert(fabs(bee[i])>0.0);
+    assert(bee[i]!=Coeff_t(0.0));
    cee[i]=as[i]*(1.0-cs[i]*(4.0-this->M5));
    beo[i]=as[i]*bs[i];
    ceo[i]=-as[i]*cs[i];
@@ -456,10 +456,16 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
    if ( i < Ls-1 ) {
      assert(bee[i]!=Coeff_t(0.0));
      assert(bee[0]!=Coeff_t(0.0));
      lee[i] =-cee[i+1]/bee[i]; // sub-diag entry on the ith column
      leem[i]=mass*cee[Ls-1]/bee[0];
-      for(int j=0;j<i;j++)  leem[i]*= aee[j]/bee[j+1];
+      for(int j=0;j<i;j++) {
 	assert(bee[j+1]!=Coeff_t(0.0));
 	leem[i]*= aee[j]/bee[j+1];
      }
      uee[i] =-aee[i]/bee[i];   // up-diag entry on the ith row
@@ -478,7 +484,7 @@ void CayleyFermion5D<Impl>::SetCoefficientsInternal(RealD zolo_hi,std::vector<Co
  { 
    Coeff_t delta_d=mass*cee[Ls-1];
    for(int j=0;j<Ls-1;j++) {
-      //      assert(fabs(bee[j])>0.0);
+      assert(bee[j] != Coeff_t(0.0));
      delta_d *= cee[j]/bee[j];
    }
    dee[Ls-1] += delta_d;
--- a/lib/qcd/action/fermion/WilsonCompressor.h
+++ b/lib/qcd/action/fermion/WilsonCompressor.h
@@ -238,7 +238,33 @@ template<typename HCS,typename HS,typename S> using WilsonCompressor = WilsonCom
 template<class vobj,class cobj>
 class WilsonStencil : public CartesianStencil<vobj,cobj> {
 public:
-
+  double timer0;
  double timer1;
  double timer2;
  double timer3;
  double timer4;
  double timer5;
  double timer6;
  uint64_t callsi;
  void ZeroCountersi(void)
  {
    timer0=0;
    timer1=0;
    timer2=0;
    timer3=0;
    timer4=0;
    timer5=0;
    timer6=0;
    callsi=0;
  }
  void Reporti(int calls)
  {
    if ( timer0 ) std::cout << GridLogMessage << " timer0 (HaloGatherOpt) " <<timer0/calls <<std::endl;
    if ( timer1 ) std::cout << GridLogMessage << " timer1 (Communicate)   " <<timer1/calls <<std::endl;
    if ( timer2 ) std::cout << GridLogMessage << " timer2 (CommsMerge )   " <<timer2/calls <<std::endl;
    if ( timer3 ) std::cout << GridLogMessage << " timer3 (commsMergeShm) " <<timer3/calls <<std::endl;
    if ( timer4 ) std::cout << GridLogMessage << " timer4 " <<timer4 <<std::endl;
  }
  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
  std::vector<int> same_node;
@@ -252,6 +278,7 @@ public:
    : CartesianStencil<vobj,cobj> (grid,npoints,checkerboard,directions,distances) ,
    same_node(npoints)
  { 
    ZeroCountersi();
    surface_list.resize(0);
  };
@@ -261,7 +288,6 @@ public:
    // Here we know the distance is 1 for WilsonStencil
    for(int point=0;point<this->_npoints;point++){
      same_node[point] = this->SameNode(point);
      //      std::cout << " dir " <<point<<" same_node " <<same_node[point]<<std::endl;
    }
    for(int site = 0 ;site< vol4;site++){
@@ -282,17 +308,28 @@ public:
  {
    std::vector<std::vector<CommsRequest_t> > reqs;
    this->HaloExchangeOptGather(source,compress);
-    this->CommunicateBegin(reqs);
+    double t1=usecond();
-    this->CommunicateComplete(reqs);
+    // Asynchronous MPI calls multidirectional, Isend etc...
    //    this->CommunicateBegin(reqs);
    //    this->CommunicateComplete(reqs);
    // Non-overlapped directions within a thread. Asynchronous calls except MPI3, threaded up to comm threads ways.
    this->Communicate();
    double t2=usecond(); timer1 += t2-t1;
    this->CommsMerge(compress);
    double t3=usecond(); timer2 += t3-t2;
    this->CommsMergeSHM(compress);
    double t4=usecond(); timer3 += t4-t3;
  }
  template <class compressor>
  void HaloExchangeOptGather(const Lattice<vobj> &source,compressor &compress) 
  {
    this->Prepare();
    double t0=usecond();
    this->HaloGatherOpt(source,compress);
    double t1=usecond();
    timer0 += t1-t0;
    callsi++;
  }
  template <class compressor>
@@ -304,7 +341,9 @@ public:
    typedef typename compressor::SiteHalfSpinor     SiteHalfSpinor;
    typedef typename compressor::SiteHalfCommSpinor SiteHalfCommSpinor;
    this->mpi3synctime_g-=usecond();
    this->_grid->StencilBarrier();
    this->mpi3synctime_g+=usecond();
    assert(source._grid==this->_grid);
    this->halogtime-=usecond();
@@ -323,7 +362,6 @@ public:
    int dag = compress.dag;
    int face_idx=0;
    if ( dag ) { 
      //	std::cout << " Optimised Dagger compress " <<std::endl;
      assert(same_node[Xp]==this->HaloGatherDir(source,XpCompress,Xp,face_idx));
      assert(same_node[Yp]==this->HaloGatherDir(source,YpCompress,Yp,face_idx));
      assert(same_node[Zp]==this->HaloGatherDir(source,ZpCompress,Zp,face_idx));
--- a/lib/qcd/action/fermion/WilsonFermion5D.cc
+++ b/lib/qcd/action/fermion/WilsonFermion5D.cc
@@ -123,22 +123,24 @@ WilsonFermion5D<Impl>::WilsonFermion5D(GaugeField &_Umu,
  int vol4;
  vol4=FourDimGrid.oSites();
  Stencil.BuildSurfaceList(LLs,vol4);
  vol4=FourDimRedBlackGrid.oSites();
  StencilEven.BuildSurfaceList(LLs,vol4);
   StencilOdd.BuildSurfaceList(LLs,vol4);
-  std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
+   //  std::cout << GridLogMessage << " SurfaceLists "<< Stencil.surface_list.size()
-                       <<" " << StencilEven.surface_list.size()<<std::endl;
+   //                       <<" " << StencilEven.surface_list.size()<<std::endl;
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::Report(void)
 {
-    std::vector<int> latt = GridDefaultLatt();          
+  RealD NP     = _FourDimGrid->_Nprocessors;
-    RealD volume = Ls;  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
+  RealD NN     = _FourDimGrid->NodeCount();
-    RealD NP = _FourDimGrid->_Nprocessors;
+  RealD volume = Ls;  
-    RealD NN = _FourDimGrid->NodeCount();
+  std::vector<int> latt = _FourDimGrid->GlobalDimensions();
  for(int mu=0;mu<Nd;mu++) volume=volume*latt[mu];
  if ( DhopCalls > 0 ) {
    std::cout << GridLogMessage << "#### Dhop calls report " << std::endl;
@@ -184,6 +186,11 @@ void WilsonFermion5D<Impl>::Report(void)
    std::cout << GridLogMessage << "WilsonFermion5D StencilEven"<<std::endl;  StencilEven.Report();
    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd" <<std::endl;  StencilOdd.Report();
  }
  if ( DhopCalls > 0){
    std::cout << GridLogMessage << "WilsonFermion5D Stencil     Reporti()"    <<std::endl;  Stencil.Reporti(DhopCalls);
    std::cout << GridLogMessage << "WilsonFermion5D StencilEven Reporti()"<<std::endl;  StencilEven.Reporti(DhopCalls);
    std::cout << GridLogMessage << "WilsonFermion5D StencilOdd  Reporti()" <<std::endl;  StencilOdd.Reporti(DhopCalls);
  }
 }
 template<class Impl>
@@ -203,6 +210,9 @@ void WilsonFermion5D<Impl>::ZeroCounters(void) {
  Stencil.ZeroCounters();
  StencilEven.ZeroCounters();
  StencilOdd.ZeroCounters();
  Stencil.ZeroCountersi();
  StencilEven.ZeroCountersi();
  StencilOdd.ZeroCountersi();
 }
@@ -379,7 +389,6 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 {
 #ifdef GRID_OMP
  //  assert((dag==DaggerNo) ||(dag==DaggerYes));
  typedef CartesianCommunicator::CommsRequest_t CommsRequest_t;
  Compressor compressor(dag);
@@ -388,46 +397,70 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
  DhopFaceTime-=usecond();
  st.HaloExchangeOptGather(in,compressor);
-  DhopFaceTime+=usecond();
+  st.CommsMergeSHM(compressor);// Could do this inside parallel region overlapped with comms
  std::vector<std::vector<CommsRequest_t> > reqs;
  // Rely on async comms; start comms before merge of local data
  DhopCommTime-=usecond();
  st.CommunicateBegin(reqs);
  DhopFaceTime-=usecond();
  st.CommsMergeSHM(compressor);
  DhopFaceTime+=usecond();
-  // Perhaps use omp task and region
+  double ctime=0;
-#pragma omp parallel 
+  double ptime=0;
  //////////////////////////////////////////////////////////////////////////////////////////////////////
  // Ugly explicit thread mapping introduced for OPA reasons.
  //////////////////////////////////////////////////////////////////////////////////////////////////////
 #pragma omp parallel reduction(max:ctime) reduction(max:ptime)
  { 
    int tid = omp_get_thread_num();
    int nthreads = omp_get_num_threads();
-    int me = omp_get_thread_num();
+    int ncomms = CartesianCommunicator::nCommThreads;
-    int myoff, mywork;
+    if (ncomms == -1) ncomms = 1;
    assert(nthreads > ncomms);
    if (tid >= ncomms) {
      double start = usecond();
      nthreads -= ncomms;
      int ttid = tid - ncomms;
      int n = U._grid->oSites();
      int chunk = n / nthreads;
      int rem = n % nthreads;
      int myblock, myn;
      if (ttid < rem) {
 	myblock = ttid * chunk + ttid;
 	myn = chunk+1;
      } else {
 	myblock = ttid*chunk + rem;
 	myn = chunk;
      }
-    GridThread::GetWork(len,me-1,mywork,myoff,nthreads-1);
+      // do the compute
-    int sF = LLs * myoff;
+      if (dag == DaggerYes) {
-
+	for (int ss = myblock; ss < myblock+myn; ++ss) {
-    if ( me == 0 ) {
+	  int sU = ss;
-      st.CommunicateComplete(reqs);
+	  int sF = LLs * sU;
-      DhopCommTime+=usecond();
+	  Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
-    } else { 
+	}
-      // Interior links in stencil
+      } else {
-      if ( me==1 ) DhopComputeTime-=usecond();
+	for (int ss = myblock; ss < myblock+myn; ++ss) {
-      if (dag == DaggerYes) Kernels::DhopSiteDag(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0);
+	  int sU = ss;
-      else      	    Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,myoff,LLs,mywork,in,out,1,0);
+	  int sF = LLs * sU;
-      if ( me==1 ) DhopComputeTime+=usecond();
+	  Kernels::DhopSite(st,lo,U,st.CommBuf(),sF,sU,LLs,1,in,out,1,0);
 	}
      }
 	ptime = usecond() - start;
    }
    {
      double start = usecond();
      st.CommunicateThreaded();
      ctime = usecond() - start;
    }
  }
  DhopCommTime += ctime;
  DhopComputeTime+=ptime;
  // First to enter, last to leave timing
  st.CollateThreads();
  DhopFaceTime-=usecond();
  st.CommsMerge(compressor);
  DhopFaceTime+=usecond();
  // Load imbalance alert. Should use dynamic schedule OMP for loop
  // Perhaps create a list of only those sites with face work, and 
  // load balance process the list.
  DhopComputeTime2-=usecond();
  if (dag == DaggerYes) {
    int sz=st.surface_list.size();
@@ -448,11 +481,9 @@ void WilsonFermion5D<Impl>::DhopInternalOverlappedComms(StencilImpl & st, Lebesg
 #else 
  assert(0);
 #endif
 }
 template<class Impl>
 void WilsonFermion5D<Impl>::DhopInternalSerialComms(StencilImpl & st, LebesgueOrder &lo,
 					 DoubledGaugeField & U,
--- a/lib/qcd/utils/GaugeFix.h
+++ b/lib/qcd/utils/GaugeFix.h
@@ -26,6 +26,8 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    /*  END LEGAL */
 //#include <Grid/Grid.h>
 #ifndef GRID_QCD_GAUGE_FIX_H
 #define GRID_QCD_GAUGE_FIX_H
 namespace Grid {
 namespace QCD {
@@ -188,3 +190,4 @@ class FourierAcceleratedGaugeFixer  : public Gimpl {
 }
 }
 #endif
--- a/lib/stencil/Stencil.h
+++ b/lib/stencil/Stencil.h
@@ -176,6 +176,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
  // Timing info; ugly; possibly temporary
  /////////////////////////////////////////
  double commtime;
  double mpi3synctime;
  double mpi3synctime_g;
  double shmmergetime;
  double gathertime;
  double gathermtime;
  double halogtime;
@@ -185,6 +188,10 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
  double splicetime;
  double nosplicetime;
  double calls;
  std::vector<double> comm_bytes_thr;
  std::vector<double> comm_time_thr;
  std::vector<double> comm_enter_thr;
  std::vector<double> comm_leave_thr;
  ////////////////////////////////////////
  // Stencil query
@@ -248,35 +255,120 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
  //////////////////////////////////////////
  // Comms packet queue for asynch thread
  //////////////////////////////////////////
  void CommunicateThreaded()
  {
 #ifdef GRID_OMP
    // must be called in parallel region
    int mythread = omp_get_thread_num();
    int nthreads = CartesianCommunicator::nCommThreads;
 #else
    int mythread = 0;
    int nthreads = 1;
 #endif
    if (nthreads == -1) nthreads = 1;
    if (mythread < nthreads) {
      comm_enter_thr[mythread] = usecond();
      for (int i = mythread; i < Packets.size(); i += nthreads) {
 	uint64_t bytes = _grid->StencilSendToRecvFrom(Packets[i].send_buf,
 						      Packets[i].to_rank,
 						      Packets[i].recv_buf,
 						      Packets[i].from_rank,
 						      Packets[i].bytes,i);
 	comm_bytes_thr[mythread] += bytes;
      }
      comm_leave_thr[mythread]= usecond();
      comm_time_thr[mythread] += comm_leave_thr[mythread] - comm_enter_thr[mythread];
    }
  }
  void CollateThreads(void)
  {
    int nthreads = CartesianCommunicator::nCommThreads;
    double first=0.0;
    double last =0.0;
    for(int t=0;t<nthreads;t++) {
      double t0 = comm_enter_thr[t];
      double t1 = comm_leave_thr[t];
      comms_bytes+=comm_bytes_thr[t];
      comm_enter_thr[t] = 0.0;
      comm_leave_thr[t] = 0.0;
      comm_time_thr[t]   = 0.0;
      comm_bytes_thr[t]=0;
      if ( first == 0.0 ) first = t0;                   // first is t0
      if ( (t0 > 0.0) && ( t0 < first ) ) first = t0;   // min time seen
      if ( t1 > last ) last = t1;                       // max time seen
    }
    commtime+= last-first;
  }
  void CommunicateBegin(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
    reqs.resize(Packets.size());
    commtime-=usecond();
    for(int i=0;i<Packets.size();i++){
      comms_bytes+=_grid->StencilSendToRecvFromBegin(reqs[i],
-					  Packets[i].send_buf,
+						     Packets[i].send_buf,
-					  Packets[i].to_rank,
+						     Packets[i].to_rank,
-					  Packets[i].recv_buf,
+						     Packets[i].recv_buf,
-					  Packets[i].from_rank,
+						     Packets[i].from_rank,
-					  Packets[i].bytes);
+						     Packets[i].bytes,i);
    }
  }
  void CommunicateComplete(std::vector<std::vector<CommsRequest_t> > &reqs)
  {
    for(int i=0;i<Packets.size();i++){
-      _grid->StencilSendToRecvFromComplete(reqs[i]);
+      _grid->StencilSendToRecvFromComplete(reqs[i],i);
    }
    commtime+=usecond();
  }
  void Communicate(void)
  {
 #ifdef GRID_OMP
 #pragma omp parallel 
    {
      // must be called in parallel region
      int mythread  = omp_get_thread_num();
      int maxthreads= omp_get_max_threads();
      int nthreads = CartesianCommunicator::nCommThreads;
      assert(nthreads <= maxthreads);
      if (nthreads == -1) nthreads = 1;
 #else
      int mythread = 0;
      int nthreads = 1;
 #endif
      if (mythread < nthreads) {
 	for (int i = mythread; i < Packets.size(); i += nthreads) {
 	  double start = usecond();
 	  comm_bytes_thr[mythread] += _grid->StencilSendToRecvFrom(Packets[i].send_buf,
 								   Packets[i].to_rank,
 								   Packets[i].recv_buf,
 								   Packets[i].from_rank,
 								   Packets[i].bytes,i);
 	  comm_time_thr[mythread] += usecond() - start;
 	}
      }
 #ifdef GRID_OMP
    }
 #endif
  }
  template<class compressor> void HaloExchange(const Lattice<vobj> &source,compressor &compress) 
  {
    std::vector<std::vector<CommsRequest_t> > reqs;
    Prepare();
    HaloGather(source,compress);
-    CommunicateBegin(reqs);
+    // Concurrent
-    CommunicateComplete(reqs);
+    //CommunicateBegin(reqs);
    //CommunicateComplete(reqs);
    // Sequential, possibly threaded
    Communicate();
    CommsMergeSHM(compress); 
    CommsMerge(compress); 
  }
@@ -337,7 +429,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
  template<class compressor>
  void HaloGather(const Lattice<vobj> &source,compressor &compress)
  {
    mpi3synctime_g-=usecond();
    _grid->StencilBarrier();// Synch shared memory on a single nodes
    mpi3synctime_g+=usecond();
    // conformable(source._grid,_grid);
    assert(source._grid==_grid);
@@ -397,8 +491,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
    CommsMerge(decompress,Mergers,Decompressions); 
  }
  template<class decompressor>  void CommsMergeSHM(decompressor decompress) {
    mpi3synctime-=usecond();    
    _grid->StencilBarrier();// Synch shared memory on a single nodes
    mpi3synctime+=usecond();    
    shmmergetime-=usecond();    
    CommsMerge(decompress,MergersSHM,DecompressionsSHM);
    shmmergetime+=usecond();    
  }
  template<class decompressor>
@@ -442,7 +540,12 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 		  int checkerboard,
 		  const std::vector<int> &directions,
 		  const std::vector<int> &distances) 
-   :   _permute_type(npoints), _comm_buf_size(npoints)
+   : _permute_type(npoints), 
    _comm_buf_size(npoints),
    comm_bytes_thr(npoints), 
    comm_enter_thr(npoints),
    comm_leave_thr(npoints), 
       comm_time_thr(npoints)
  {
    face_table_computed=0;
    _npoints = npoints;
@@ -996,6 +1099,15 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
  void ZeroCounters(void) {
    gathertime = 0.;
    commtime = 0.;
    mpi3synctime=0.;
    mpi3synctime_g=0.;
    shmmergetime=0.;
    for(int i=0;i<_npoints;i++){
      comm_time_thr[i]=0;
      comm_bytes_thr[i]=0;
      comm_enter_thr[i]=0;
      comm_leave_thr[i]=0;
    }
    halogtime = 0.;
    mergetime = 0.;
    decompresstime = 0.;
@@ -1011,6 +1123,18 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 #define PRINTIT(A) AVERAGE(A); std::cout << GridLogMessage << " Stencil " << #A << " "<< A/calls<<std::endl;
    RealD NP = _grid->_Nprocessors;
    RealD NN = _grid->NodeCount();
    double t = 0;
    // if comm_time_thr is set they were all done in parallel so take the max
    // but add up the bytes
    int threaded = 0 ;
    for (int i = 0; i < 8; ++i) {
      if ( comm_time_thr[i]>0.0 ) {
 	threaded = 1;
 	comms_bytes += comm_bytes_thr[i];
 	if (t < comm_time_thr[i]) t = comm_time_thr[i];
      }
    }
    if (threaded) commtime += t;
    _grid->GlobalSum(commtime);    commtime/=NP;
    if ( calls > 0. ) {
@@ -1026,6 +1150,9 @@ class CartesianStencil { // Stencil runs along coordinate axes only; NO diagonal
 	std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000. << " GB/s per rank"<<std::endl;
 	std::cout << GridLogMessage << " Stencil " << comms_bytes/commtime/1000.*NP/NN << " GB/s per node"<<std::endl;
      }
      PRINTIT(mpi3synctime);
      PRINTIT(mpi3synctime_g);
      PRINTIT(shmmergetime);
      PRINTIT(splicetime);
      PRINTIT(nosplicetime);
    }
--- a/lib/util/Init.cc
+++ b/lib/util/Init.cc
@@ -222,6 +222,11 @@ void Grid_init(int *argc,char ***argv)
    CartesianCommunicator::MAX_MPI_SHM_BYTES = MB*1024*1024;
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--shm-hugepages") ){
    CartesianCommunicator::Hugepages = 1;
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--debug-signals") ){
    Grid_debug_handler_init();
  }
@@ -304,6 +309,7 @@ void Grid_init(int *argc,char ***argv)
    std::cout<<GridLogMessage<<"  --threads n     : default number of OMP threads"<<std::endl;
    std::cout<<GridLogMessage<<"  --grid n.n.n.n  : default Grid size"<<std::endl;
    std::cout<<GridLogMessage<<"  --shm  M        : allocate M megabytes of shared memory for comms"<<std::endl;
    std::cout<<GridLogMessage<<"  --shm-hugepages : use explicit huge pages in mmap call "<<std::endl;    
    std::cout<<GridLogMessage<<std::endl;
    std::cout<<GridLogMessage<<"Verbose and debug:"<<std::endl;
    std::cout<<GridLogMessage<<std::endl;
@@ -317,7 +323,7 @@ void Grid_init(int *argc,char ***argv)
    std::cout<<GridLogMessage<<std::endl;
    std::cout<<GridLogMessage<<"  --comms-concurrent : Asynchronous MPI calls; several dirs at a time "<<std::endl;    
    std::cout<<GridLogMessage<<"  --comms-sequential : Synchronous MPI calls; one dirs at a time "<<std::endl;    
-    std::cout<<GridLogMessage<<"  --comms-overlap : Overlap comms with compute "<<std::endl;    
+    std::cout<<GridLogMessage<<"  --comms-overlap    : Overlap comms with compute "<<std::endl;    
    std::cout<<GridLogMessage<<std::endl;
    std::cout<<GridLogMessage<<"  --dslash-generic: Wilson kernel for generic Nc"<<std::endl;    
    std::cout<<GridLogMessage<<"  --dslash-unroll : Wilson kernel for Nc=3"<<std::endl;    
@@ -356,10 +362,15 @@ void Grid_init(int *argc,char ***argv)
  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-sequential") ){
    CartesianCommunicator::SetCommunicatorPolicy(CartesianCommunicator::CommunicatorPolicySequential);
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--lebesgue") ){
    LebesgueOrder::UseLebesgueOrder=1;
  }
-
+  CartesianCommunicator::nCommThreads = -1;
  if( GridCmdOptionExists(*argv,*argv+*argc,"--comms-threads") ){
    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--comms-threads");
    GridCmdOptionInt(arg,CartesianCommunicator::nCommThreads);
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--cacheblocking") ){
    arg= GridCmdOptionPayload(*argv,*argv+*argc,"--cacheblocking");
    GridCmdOptionIntVector(arg,LebesgueOrder::Block);
@@ -374,7 +385,10 @@ void Grid_init(int *argc,char ***argv)
 		  Grid_default_latt,
 		  Grid_default_mpi);
-  std::cout << GridLogDebug << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
+  std::cout << GridLogMessage << "Requesting "<< CartesianCommunicator::MAX_MPI_SHM_BYTES <<" byte stencil comms buffers "<<std::endl;
  if ( CartesianCommunicator::Hugepages) {
    std::cout << GridLogMessage << "Mapped stencil comms buffers as MAP_HUGETLB "<<std::endl;
  }
  if( GridCmdOptionExists(*argv,*argv+*argc,"--decomposition") ){
    std::cout<<GridLogMessage<<"Grid Default Decomposition patterns\n";
@@ -393,7 +407,7 @@ void Grid_init(int *argc,char ***argv)
 void Grid_finalize(void)
 {
-#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3)
+#if defined (GRID_COMMS_MPI) || defined (GRID_COMMS_MPI3) || defined (GRID_COMMS_MPIT)
  MPI_Finalize();
  Grid_unquiesce_nodes();
 #endif
--- a/tests/core/Test_fft_gfix.cc
+++ b/tests/core/Test_fft_gfix.cc
@@ -28,6 +28,9 @@ Author: Peter Boyle <paboyle@ph.ed.ac.uk>
    /*  END LEGAL */
 #include <Grid/Grid.h>
 using namespace Grid;
 using namespace Grid::QCD;
 int main (int argc, char ** argv)
 {
  std::vector<int> seeds({1,2,3,4});
@@ -82,6 +85,7 @@ int main (int argc, char ** argv)
  Uorg = Uorg - Umu;
  std::cout << " Norm Difference "<< norm2(Uorg) << std::endl;
  std::cout << " Norm "<< norm2(Umu) << std::endl;
  std::cout<< "*****************************************************************" <<std::endl;