
MPI is now working and passing basic tests. Will start to construct a more sensible test suite shortly, since testing requirements now go beyond what a single Grid_main.cc can do.

This will need a more organised src tree and a substantial reorganisation of the build system.
This commit is contained in:
Peter Boyle 2015-04-03 04:52:53 +01:00
parent 19b9069453
commit 7b97e50b7b
17 changed files with 1298 additions and 954 deletions

2
Grid.h
View File

@ -52,7 +52,7 @@
namespace dpo {
void Grid_init(void);
void Grid_init(int *argc,char ***argv);
double usecond(void);
void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr);
void Grid_debug_handler_init(void);

View File

@ -41,8 +41,8 @@ public:
std::vector<int> _simd_layout; // Which dimensions get laid out over simd lanes.
std::vector<int> _fdimensions;// Global dimensions of array with cb removed
std::vector<int> _gdimensions;// Global dimensions of array
std::vector<int> _fdimensions;// Global dimensions of array prior to cb removal
std::vector<int> _gdimensions;// Global dimensions of array after cb removal
std::vector<int> _ldimensions;// local dimensions of array with processor images removed
std::vector<int> _rdimensions;// Reduced local dimensions with simd lane images and processor images removed
@ -88,11 +88,20 @@ public:
for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(rcoor[d]/_rdimensions[d]);
return idx;
}
inline int iCoordFromIsite(int lane,int mu)
{
std::vector<int> coor(_ndimension);
for(int d=0;d<_ndimension;d++){
coor[d] = lane % _simd_layout[d];
lane = lane / _simd_layout[d];
}
return coor[mu];
}
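The lane decoding in iCoordFromIsite above is a mixed-radix decomposition: the flat SIMD lane index is peeled apart digit by digit using the per-dimension simd layout. A standalone sketch of the same arithmetic, with a hypothetical 2x2 layout and no Grid types assumed:

```cpp
#include <cstdio>
#include <vector>

// Decode a flat SIMD lane index into per-dimension coordinates, given
// how many lanes each dimension is split over (mixed-radix digits).
std::vector<int> lane_to_coor(int lane, const std::vector<int> &simd_layout) {
  std::vector<int> coor(simd_layout.size());
  for (unsigned d = 0; d < simd_layout.size(); d++) {
    coor[d] = lane % simd_layout[d]; // digit for dimension d
    lane    = lane / simd_layout[d]; // move on to the next digit
  }
  return coor;
}

int main() {
  std::vector<int> layout(4, 1);
  layout[0] = 2; layout[1] = 2;    // 2 lanes in x, 2 in y -> 4 lanes
  for (int lane = 0; lane < 4; lane++) {
    std::vector<int> c = lane_to_coor(lane, layout);
    printf("lane %d -> (%d,%d,%d,%d)\n", lane, c[0], c[1], c[2], c[3]);
  }
  return 0;
}
```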
inline int oSites(void) { return _osites; };
inline int iSites(void) { return _isites; };
inline int CheckerboardFromOsite (int Osite){
inline int CheckerBoardFromOsite (int Osite){
std::vector<int> ocoor;
CoordFromOsite(ocoor,Osite);
int ss=0;
@ -109,6 +118,7 @@ public:
}
}
virtual int CheckerBoarded(int dim)=0;
virtual int CheckerBoard(std::vector<int> site)=0;
virtual int CheckerBoardDestination(int source_cb,int shift)=0;
virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
@ -116,6 +126,9 @@ public:
class GridCartesian: public SimdGrid {
public:
virtual int CheckerBoarded(int dim){
return 0;
}
virtual int CheckerBoard(std::vector<int> site){
return 0;
}
@ -199,6 +212,10 @@ public:
class GridRedBlackCartesian : public SimdGrid
{
public:
virtual int CheckerBoarded(int dim){
if( dim==0) return 1;
else return 0;
}
virtual int CheckerBoard(std::vector<int> site){
return (site[0]+site[1]+site[2]+site[3])&0x1;
}
@ -215,11 +232,13 @@ public:
// Probably faster with table lookup;
// or by looping over x,y,z and multiply rather than computing checkerboard.
int ocb=CheckerboardFromOsite(osite);
int ocb=CheckerBoardFromOsite(osite);
if ( (source_cb+ocb)&1 ) {
printf("Checkerboard shift %d\n",(shift)/2);
return (shift)/2;
} else {
printf("Checkerboard shift %d\n",(shift+1)/2);
return (shift+1)/2;
}
}
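The halving above can be checked with plain integers: on a red-black grid the checkerboarded dimension stores every other site, so a full-lattice shift of `shift` moves a stored site by either `shift/2` or `(shift+1)/2` reduced sites, depending on the parity of the site relative to the source checkerboard. A minimal sketch of that arithmetic (no Grid types assumed):

```cpp
#include <cstdio>

// Reduced shift on a checkerboarded (halved) dimension, mirroring the
// branch in CheckerBoardShift above.
int cb_shift(int source_cb, int site_cb, int shift) {
  if ((source_cb + site_cb) & 1) return shift / 2;
  else                           return (shift + 1) / 2;
}

int main() {
  for (int shift = 0; shift < 5; shift++)
    printf("shift %d : odd-sum sites move %d reduced sites, even-sum %d\n",
           shift, cb_shift(0, 1, shift), cb_shift(0, 0, shift));
  return 0;
}
```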

View File

@ -3,12 +3,16 @@
///////////////////////////////////
// Processor layout information
///////////////////////////////////
#ifdef GRID_COMMS_MPI
#include <mpi.h>
#endif
namespace dpo {
class CartesianCommunicator {
public:
// Communicator should know nothing of the physics grid, only processor grid.
int _Nprocessors; // How many in all
std::vector<int> _processors; // Which dimensions get laid out over processor lanes.
int _processor; // linear processor rank
std::vector<int> _processor_coor; // linear processor coordinate
@ -20,7 +24,9 @@ class CartesianCommunicator {
CartesianCommunicator(std::vector<int> &pdimensions_in);
void ShiftedRanks(int dim,int shift,int & source, int & dest);
int Rank(std::vector<int> coor);
int MyRank(void);
void GlobalSumF(float &);
void GlobalSumFVector(float *,int N);
@ -28,9 +34,9 @@ class CartesianCommunicator {
void GlobalSumFVector(double *,int N);
void SendToRecvFrom(void *xmit,
std::vector<int> to_coordinate,
int xmit_to_rank,
void *recv,
std::vector<int> from_coordinate,
int recv_from_rank,
int bytes);
};
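With ranks in the interface, the calling pattern for a halo exchange becomes a ShiftedRanks/SendToRecvFrom pair. A hedged usage sketch against this class, not a test from the repository: it assumes MPI is initialised, a communicator built over a 1-d processor grid, and the dpo headers on the include path (the include name below is hypothetical).

```cpp
#include <vector>
// #include "Grid.h"  // hypothetical include; whatever declares dpo::CartesianCommunicator

// Ship `send` one processor step up dimension 0 of the processor grid,
// receiving the matching buffer from one step down into `recv`.
void halo_shift(dpo::CartesianCommunicator &comm,
                std::vector<char> &send, std::vector<char> &recv) {
  int xmit_to_rank, recv_from_rank;
  comm.ShiftedRanks(0, 1, xmit_to_rank, recv_from_rank);
  comm.SendToRecvFrom((void *)&send[0], xmit_to_rank,
                      (void *)&recv[0], recv_from_rank,
                      (int)send.size());
}
```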

View File

@ -23,7 +23,8 @@ public:
SimdGrid *_grid;
int checkerboard;
std::vector<vobj,alignedAllocator<vobj> > _odata;
typedef typename vobj::scalar_type scalar_type;
typedef typename vobj::vector_type vector_type;
public:
@ -36,6 +37,8 @@ public:
}
#include <Grid_cshift_common.h>
#ifdef GRID_COMMS_NONE
#include <Grid_none_cshift.h>
#endif

View File

@ -11,10 +11,10 @@
/* #undef AVX512 */
/* GRID_COMMS_FAKE */
#define GRID_COMMS_FAKE 1
/* #undef GRID_COMMS_FAKE */
/* GRID_COMMS_MPI */
/* #undef GRID_COMMS_MPI */
#define GRID_COMMS_MPI 1
/* GRID_COMMS_NONE */
/* #undef GRID_COMMS_NONE */
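Configure defines exactly one of the three comms macros, and headers key off it the same way Grid_Lattice.h selects its cshift include. A minimal sketch of that compile-time dispatch pattern (the macro is set by hand here instead of by Grid_config.h):

```cpp
#include <cstdio>

#define GRID_COMMS_MPI 1   // stand-in for configure choosing the MPI target

#if defined(GRID_COMMS_MPI)
static const char *comms_target = "mpi";
#elif defined(GRID_COMMS_FAKE)
static const char *comms_target = "fake";
#elif defined(GRID_COMMS_NONE)
static const char *comms_target = "none";
#else
#error "no comms target defined"
#endif

int main() { printf("comms target: %s\n", comms_target); return 0; }
```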

View File

@ -9,6 +9,15 @@
/* AVX512 */
#undef AVX512
/* GRID_COMMS_FAKE */
#undef GRID_COMMS_FAKE
/* GRID_COMMS_MPI */
#undef GRID_COMMS_MPI
/* GRID_COMMS_NONE */
#undef GRID_COMMS_NONE
/* Define to 1 if you have the `gettimeofday' function. */
#undef HAVE_GETTIMEOFDAY
@ -60,6 +69,9 @@
/* Define to the one symbol short name of this package. */
#undef PACKAGE_TARNAME
/* Define to the home page for this package. */
#undef PACKAGE_URL
/* Define to the version of this package. */
#undef PACKAGE_VERSION

384
Grid_cshift_common.h Normal file
View File

@ -0,0 +1,384 @@
#ifndef _GRID_CSHIFT_COMMON_H_
#define _GRID_CSHIFT_COMMON_H_
//////////////////////////////////////////////////////
// Gather for when there is no need to SIMD split
//////////////////////////////////////////////////////
friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer, int dimension,int plane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
// printf("Gather plane _simple mask %d\n",cbmask);
if ( !rhs._grid->CheckerBoarded(dimension) ) {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
// Simple block stride gather of SIMD objects
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
buffer[bo++]=rhs._odata[so+o+b];
}
o +=rhs._grid->_slice_stride[dimension];
}
} else {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
// int jjj=0;
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int ocb=1<<rhs._grid->CheckerBoardFromOsite(o+b);// Could easily be a table lookup
if ( ocb &cbmask ) {
buffer[bo]=rhs._odata[so+o+b];
// float * ptr = (float *)& rhs._odata[so+o+b];
// if( (cbmask!=3)&&(jjj<8)){
// printf("Gather_plane_simple %d %le bo %d\n",so+o+b,*ptr,bo);
// jjj++;
// }
bo++;
}
}
o +=rhs._grid->_slice_stride[dimension];
}
}
}
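Gather_plane_simple walks a plane as `_slice_nblock` runs of `_slice_block` contiguous sites, each run `_slice_stride` apart, starting at `plane*_ostride`. The same indexing on a plain array, with made-up strides for a 4x4 lattice, shows how the buffer comes out dense:

```cpp
#include <cstdio>
#include <vector>

// Gather one plane of a strided array into a dense buffer:
// nblock runs of block contiguous elements, each run slice_stride apart.
void gather_plane(const std::vector<int> &data, std::vector<int> &buffer,
                  int plane, int ostride, int nblock, int block, int slice_stride) {
  int so = plane * ostride; // base offset of the plane
  int o  = 0;               // offset within the plane
  int bo = 0;               // dense offset in the buffer
  for (int n = 0; n < nblock; n++) {
    for (int b = 0; b < block; b++) buffer[bo++] = data[so + o + b];
    o += slice_stride;
  }
}

int main() {
  // Hypothetical 4x4 lattice, x fastest: plane x=1 is elements 1,5,9,13.
  std::vector<int> data(16);
  for (int i = 0; i < 16; i++) data[i] = i;
  std::vector<int> buf(4);
  gather_plane(data, buf, /*plane*/1, /*ostride*/1, /*nblock*/4, /*block*/1, /*stride*/4);
  for (int i = 0; i < 4; i++) printf("%d ", buf[i]);  // prints 1 5 9 13
  printf("\n");
  return 0;
}
```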
//////////////////////////////////////////////////////
// Gather for when there *is* need to SIMD split
//////////////////////////////////////////////////////
friend void Gather_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
// Simple block stride gather of SIMD objects
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
extract(rhs._odata[so+o+b],pointers);
}
o +=rhs._grid->_slice_stride[dimension];
}
} else {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int ocb=1<<rhs._grid->CheckerBoardFromOsite(o+b);
if ( ocb & cbmask ) {
extract(rhs._odata[so+o+b],pointers);
}
}
o +=rhs._grid->_slice_stride[dimension];
}
}
}
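extract and merge split a vector object into one scalar stream per SIMD lane and recombine it; Gather_plane_extract fills Nsimd separate buffers this way so each lane's data can be shipped to a different destination. A scalar stand-in for the two primitives (plain floats instead of vector registers; the real Grid signatures differ):

```cpp
#include <cstdio>
#include <vector>

const int Nsimd = 4;
struct vobj { float lane[Nsimd]; };  // stand-in for a SIMD vector object

// Append each lane of v to its own per-lane stream.
void extract(const vobj &v, std::vector<std::vector<float> > &streams) {
  for (int i = 0; i < Nsimd; i++) streams[i].push_back(v.lane[i]);
}
// Rebuild a vobj by reading position pos from every per-lane stream.
vobj merge(const std::vector<std::vector<float> > &streams, int pos) {
  vobj v;
  for (int i = 0; i < Nsimd; i++) v.lane[i] = streams[i][pos];
  return v;
}

int main() {
  std::vector<std::vector<float> > streams(Nsimd);
  vobj a = {{0, 1, 2, 3}}, b = {{4, 5, 6, 7}};
  extract(a, streams);
  extract(b, streams);
  vobj c = merge(streams, 1);  // recombines b
  printf("%g %g %g %g\n", c.lane[0], c.lane[1], c.lane[2], c.lane[3]);
  return 0;
}
```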
//////////////////////////////////////////////////////
// Scatter for when there is no need to SIMD split
//////////////////////////////////////////////////////
friend void Scatter_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllocator<vobj> > &buffer, int dimension,int plane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
// Simple block stride gather of SIMD objects
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
rhs._odata[so+o+b]=buffer[bo++];
}
o +=rhs._grid->_slice_stride[dimension];
}
} else {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int ocb=1<<rhs._grid->CheckerBoardFromOsite(o+b);// Could easily be a table lookup
if ( ocb & cbmask ) {
rhs._odata[so+o+b]=buffer[bo++];
}
}
o +=rhs._grid->_slice_stride[dimension];
}
}
}
//////////////////////////////////////////////////////
// Scatter for when there *is* need to SIMD split
//////////////////////////////////////////////////////
friend void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
// Simple block stride gather of SIMD objects
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
merge(rhs._odata[so+o+b],pointers);
}
o +=rhs._grid->_slice_stride[dimension];
}
} else {
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
int bo = 0; // offset in buffer
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int ocb=1<<rhs._grid->CheckerBoardFromOsite(o+b);
if ( ocb&cbmask ) {
merge(rhs._odata[so+o+b],pointers);
}
}
o +=rhs._grid->_slice_stride[dimension];
}
}
}
//////////////////////////////////////////////////////
// local to node block strided copies
//////////////////////////////////////////////////////
// if lhs is odd, rhs even??
friend void Copy_plane(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
int o = 0; // relative offset to base within plane
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs._grid->_ostride[dimension]; // offset in buffer
// Simple block stride gather of SIMD objects
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
lhs._odata[lo+o+b]=rhs._odata[ro+o+b];
}
o +=rhs._grid->_slice_stride[dimension];
}
} else {
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
// int jjj=0;
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int ocb=1<<lhs._grid->CheckerBoardFromOsite(o+b);
if ( ocb&cbmask ) {
lhs._odata[lo+o+b]=rhs._odata[ro+o+b];
// float *ptr =(float *) &rhs._odata[ro+o+b];
// if((cbmask!=0x3)&&jjj<8) {
// printf("Copy_plane %d %le n,b=%d,%d mask %d ocb %d\n",ro+o+b,*ptr,n,b,cbmask,ocb);
// jjj++;
// }
}
}
o +=rhs._grid->_slice_stride[dimension];
}
}
}
friend void Copy_plane_permute(Lattice<vobj>& lhs,Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
{
int rd = rhs._grid->_rdimensions[dimension];
if ( !rhs._grid->CheckerBoarded(dimension) ) {
int o = 0; // relative offset to base within plane
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*rhs._grid->_ostride[dimension]; // offset in buffer
// Simple block stride gather of SIMD objects
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
}
o +=rhs._grid->_slice_stride[dimension];
}
} else {
int ro = rplane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int lo = lplane*lhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int ocb=1<<lhs._grid->CheckerBoardFromOsite(o+b);
if ( ocb&cbmask ) {
permute(lhs._odata[lo+o+b],rhs._odata[ro+o+b],permute_type);
}
}
o +=rhs._grid->_slice_stride[dimension];
}
}
}
//////////////////////////////////////////////////////
// Local to node Cshift
//////////////////////////////////////////////////////
// Work out whether to permute
// ABCDEFGH -> AE BF CG DH permute wrap num
//
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH 0 0
// Shift 1 BF CG DH AE 0 0 0 1 BCDEFGHA 0 1
// Shift 2 CG DH AE BF 0 0 1 1 CDEFGHAB 0 2
// Shift 3 DH AE BF CG 0 1 1 1 DEFGHABC 0 3
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD 1 0
// Shift 5 BF CG DH AE 1 1 1 0 FGHABCDE 1 1
// Shift 6 CG DH AE BF 1 1 0 0 GHABCDEF 1 2
// Shift 7 DH AE BF CG 1 0 0 0 HABCDEFG 1 3
// Suppose 4way simd in one dim.
// ABCDEFGH -> AECG BFDH permute wrap num
// Shift 0 AECG BFDH 0,00 0,00 ABCDEFGH 0 0
// Shift 1 BFDH CGEA 0,00 1,01 BCDEFGHA 0 1
// Shift 2 CGEA DHFB 1,01 1,01 CDEFGHAB 1 0
// Shift 3 DHFB EAGC 1,01 1,11 DEFGHABC 1 1
// Shift 4 EAGC FBHD 1,11 1,11 EFGHABCD 2 0
// Shift 5 FBHD GCAE 1,11 1,10 FGHABCDE 2 1
// Shift 6 GCAE HDBF 1,10 1,10 GHABCDEF 3 0
// Shift 7 HDBF AECG 1,10 0,00 HABCDEFG 3 1
// Generalisation to 8 way simd, 16 way simd required.
//
// Need log2 Nway masks. consisting of
// 1 bit 256 bit granule
// 2 bit 128 bit granule
// 4 bits 64 bit granule
// 8 bits 32 bit granules
//
// 15 bits....
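The wrap/num columns in the tables above fall out of two integer divisions, and the per-slice permute decision is the `x < rd-num` test used throughout the shift routines. A sketch that regenerates the permute column of the 2-way-SIMD table (ld=8, rd=4):

```cpp
#include <cstdio>

int main() {
  int rd = 4;               // reduced dimension: 8 sites, 2-way simd
  for (int shift = 0; shift < 8; shift++) {
    int wrap = shift / rd;  // how many times the shift wraps the reduced dim
    int num  = shift % rd;  // how many slices wrap one extra time
    printf("shift %d  wrap %d num %d  permute per slice:", shift, wrap, num);
    for (int x = 0; x < rd; x++) {
      int permute_slice = (x < rd - num) ? wrap : 1 - wrap;
      printf(" %d", permute_slice);
    }
    printf("\n");
  }
  return 0;
}
```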
friend void Cshift_local(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int shift)
{
int sshift[2];
sshift[0] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,0);
sshift[1] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,1);
if ( sshift[0] == sshift[1] ) {
Cshift_local(ret,rhs,dimension,shift,0x3);
} else {
Cshift_local(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
Cshift_local(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
}
}
friend Lattice<vobj> Cshift_local(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
int fd = rhs._grid->_fdimensions[dimension];
int rd = rhs._grid->_rdimensions[dimension];
int ld = rhs._grid->_ldimensions[dimension];
int gd = rhs._grid->_gdimensions[dimension];
// Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd;
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
// the permute type
int permute_dim =rhs._grid->_simd_layout[dimension]>1 ;
int permute_type=0;
for(int d=0;d<dimension;d++){
if (rhs._grid->_simd_layout[d]>1 ) permute_type++;
}
for(int x=0;x<rd;x++){
int o = 0;
int bo = x * rhs._grid->_ostride[dimension];
int cb= (cbmask==0x2)? 1 : 0;
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
int sx = (x+sshift)%rd;
int permute_slice=0;
if(permute_dim){
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
}
if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type);
else Copy_plane(ret,rhs,dimension,x,sx,cbmask);
}
return ret;
}
#endif

View File

@ -16,8 +16,11 @@
#undef __X86_64
namespace dpo {
void Grid_init(void)
void Grid_init(int *argc,char ***argv)
{
#ifdef GRID_COMMS_MPI
MPI_Init(argc,argv);
#endif
Grid_debug_handler_init();
}
double usecond(void) {

View File

@ -13,34 +13,28 @@ using namespace dpo::QCD;
int main (int argc, char ** argv)
{
Grid_init(&argc,&argv);
#if 0
struct sigaction sa,osa;
sigemptyset (&sa.sa_mask);
sa.sa_sigaction= sa_action;
sa.sa_flags = SA_ONSTACK|SA_SIGINFO;
sigaction(SIGSEGV,&sa,NULL);
sigaction(SIGTRAP,&sa,NULL);
#endif
std::vector<int> latt_size(4);
std::vector<int> simd_layout(4);
std::vector<int> mpi_layout(4);
for(int d=0;d<4;d++) mpi_layout[d]=1;
std::vector<int> latt_size(4);
std::vector<int> simd_layout(4);
std::vector<int> mpi_layout(4);
mpi_layout[0]=4;
mpi_layout[1]=2;
mpi_layout[2]=4;
mpi_layout[3]=2;
#ifdef AVX512
for(int omp=128;omp<236;omp+=16){
#else
for(int omp=1;omp<8;omp*=2){
for(int omp=1;omp<8;omp*=20){
#endif
#ifdef OMP
omp_set_num_threads(omp);
#endif
for(int lat=4;lat<=16;lat+=40){
for(int lat=16;lat<=16;lat+=40){
latt_size[0] = lat;
latt_size[1] = lat;
latt_size[2] = lat;
@ -234,9 +228,9 @@ int main (int argc, char ** argv)
std::cout << "Shifting both parities by "<< shift <<" direction "<< dir <<std::endl;
Shifted = Cshift(Foo,dir,shift); // Shift everything
std::cout << "Shifting even parities"<<std::endl;
std::cout << "Shifting even source parities to odd result"<<std::endl;
bShifted = Cshift(rFoo,dir,shift); // Shift red->black
std::cout << "Shifting odd parities"<<std::endl;
std::cout << "Shifting odd parities to even result"<<std::endl;
rShifted = Cshift(bFoo,dir,shift); // Shift black->red
ShiftedCheck=zero;
@ -245,19 +239,19 @@ int main (int argc, char ** argv)
// Check results
std::vector<int> coor(4);
for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){
for(coor[3]=0;coor[3]<latt_size[3]/mpi_layout[3];coor[3]++){
for(coor[2]=0;coor[2]<latt_size[2]/mpi_layout[2];coor[2]++){
for(coor[1]=0;coor[1]<latt_size[1]/mpi_layout[1];coor[1]++){
for(coor[0]=0;coor[0]<latt_size[0]/mpi_layout[0];coor[0]++){
std::complex<dpo::Real> diff;
std::vector<int> shiftcoor = coor;
shiftcoor[dir]=(shiftcoor[dir]+shift+latt_size[dir])%latt_size[dir];
shiftcoor[dir]=(shiftcoor[dir]+shift+latt_size[dir])%(latt_size[dir]/mpi_layout[dir]);
std::vector<int> rl(4);
for(int dd=0;dd<4;dd++){
rl[dd] = latt_size[dd]/simd_layout[dd];
rl[dd] = latt_size[dd]/simd_layout[dd]/mpi_layout[dd];
}
int lex = coor[0]%rl[0]
+ (coor[1]%rl[1])*rl[0]
@ -344,4 +338,6 @@ int main (int argc, char ** argv)
} // loop for lat
} // loop for omp
MPI_Finalize();
}
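The check loop above runs over local (per-rank) coordinates and wraps the shifted coordinate inside the local extent. A standalone sketch of the two index computations it relies on, with hypothetical sizes (the real test derives them from the lat and omp loops):

```cpp
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical global sizes and decomposition, mirroring the test above.
  std::vector<int> latt_size(4, 16);
  int mpi_layout[4]  = {4, 2, 4, 2};
  int simd_layout[4] = {1, 1, 2, 2};
  int dir = 0, shift = 3;

  // Local (per-rank) extent and the simd-reduced extent per dimension.
  std::vector<int> local(4), rl(4);
  for (int d = 0; d < 4; d++) {
    local[d] = latt_size[d] / mpi_layout[d];
    rl[d]    = local[d] / simd_layout[d];
  }
  // Expected source coordinate for a sample local site, wrapped locally.
  std::vector<int> coor(4); coor[0]=1; coor[1]=2; coor[2]=3; coor[3]=0;
  int shifted = (coor[dir] + shift + local[dir]) % local[dir];
  // Lexicographic index within the simd-reduced local volume.
  int lex = (coor[0] % rl[0])
          + (coor[1] % rl[1]) * rl[0]
          + (coor[2] % rl[2]) * rl[0] * rl[1]
          + (coor[3] % rl[3]) * rl[0] * rl[1] * rl[2];
  printf("shifted x %d -> %d ; lex %d\n", coor[0], shifted, lex);
  return 0;
}
```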

View File

@ -7,16 +7,23 @@ namespace dpo {
CartesianCommunicator::CartesianCommunicator(std::vector<int> &processors)
{
_ndimension = _processors.size();
_ndimension = processors.size();
std::vector<int> periodic(_ndimension,1);
_Nprocessors=1;
_processors = processors;
_processor_coords.resize(_ndimension);
MPI_Cart_create(MPI_COMM_WORLD _ndimension,&_processors[0],&periodic[0],1,&communicator);
_processor_coor.resize(_ndimension);
MPI_Cart_create(MPI_COMM_WORLD, _ndimension,&_processors[0],&periodic[0],1,&communicator);
MPI_Comm_rank(communicator,&_processor);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coords[0]);
MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coor[0]);
printf("Hello world from processor [");
for(int i=0;i<_ndimension;i++){
printf("%d ",_processor_coor[i]);
_Nprocessors*=_processors[i];
}
printf("]\n");
fflush(stdout);
}
void CartesianCommunicator::GlobalSumF(float &f){
@ -35,11 +42,9 @@ void CartesianCommunicator::GlobalSumFVector(double *d,int N)
MPI_Allreduce(d,d,N,MPI_DOUBLE,MPI_SUM,communicator);
}
int CartesianCommunicator::ShiftedRank(int dim,int shift)
void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
{
int rank;
MPI_Cart_shift(communicator,dim,shift,&_processor,&rank);
return rank;
MPI_Cart_shift(communicator,dim,shift,&source,&dest);
}
int CartesianCommunicator::Rank(std::vector<int> coor)
{
@ -50,23 +55,18 @@ int CartesianCommunicator::Rank(std::vector<int> coor)
// Basic Halo comms primitive
void CartesianCommunicator::SendToRecvFrom(void *xmit,
std::vector<int> &to_coordinate,
int dest,
void *recv,
std::vector<int> &from_coordinate,
int from,
int bytes)
{
int dest = Rank(to_coordinate);
int from = Rank(from_coordinate);
MPI_Request reqs[2];
MPI_Status OkeyDokey[2];
int rank = _processor;
MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&reqs[0]);
MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&reqs[1]);
MPI_Waitall(2,reqs,OkeyDokey);
MPI_Request x_req;
MPI_Request r_req;
MPI_Status OkeyDokey;
MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator,&x_req);
MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&r_req);
MPI_Wait(&x_req,&OkeyDokey);
MPI_Wait(&r_req,&OkeyDokey);
MPI_Barrier(communicator);
}
}
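The constructor and ShiftedRanks above are thin wrappers over MPI's Cartesian topology calls. A self-contained program covering the same three steps (periodic Cartesian communicator, own coordinates, neighbour ranks for a unit shift):

```cpp
// Compile with mpicxx; run with e.g. mpirun -np 4.
#include <mpi.h>
#include <cstdio>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int dims[1] = {0}, periodic[1] = {1};
  int nproc;
  MPI_Comm_size(MPI_COMM_WORLD, &nproc);
  MPI_Dims_create(nproc, 1, dims);            // fill in a 1-d layout
  MPI_Comm cart;
  MPI_Cart_create(MPI_COMM_WORLD, 1, dims, periodic, 1, &cart);
  int rank, coor[1];
  MPI_Comm_rank(cart, &rank);
  MPI_Cart_coords(cart, rank, 1, coor);
  int source, dest;
  MPI_Cart_shift(cart, 0, 1, &source, &dest); // neighbours for shift +1
  printf("rank %d coor %d : recv-from %d send-to %d\n",
         rank, coor[0], source, dest);
  MPI_Finalize();
  return 0;
}
```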

View File

@ -1,26 +1,9 @@
#ifndef _GRID_MPI_CSHIFT_H_
#define _GRID_MPI_CSHIFT_H_
//////////////////////////////////////////////
// Q. Split this into separate sub functions?
//////////////////////////////////////////////
// CshiftCB_comms_splice
// CshiftCB_comms
// CshiftCB_local
// CshiftCB_local_permute
// Cshift_comms_splice
// Cshift_comms
// Cshift_local
// Cshift_local_permute
// Broadly I remain annoyed that the iteration is so painful
// for red black data layout, when simple block strided descriptors suffice for non-cb.
//
// The other option is to do it table driven, or perhaps store the CB of each site in a table.
//
#define MAX(x,y) ((x)>(y)?(x):(y))
#define MIN(x,y) ((x)>(y)?(y):(x))
//////////////////////////////////////////////////////////////////////////////////////////
// Must not lose sight that goal is to be able to construct really efficient
// gather to a point stencil code. CSHIFT is not the best way, so probably need
// additional stencil support.
@ -33,79 +16,31 @@
//
// Grid could create a neighbour index table for a given stencil.
// Could also implement CovariantCshift.
//////////////////////////////////////////////////////////////////////////////////////////
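The neighbour index table suggested above is not implemented in this commit; a hypothetical sketch of what one could look like for a nearest-neighbour stencil on a periodic 1-d lattice (names and layout illustrative, not Grid API):

```cpp
#include <cstdio>
#include <vector>

// Hypothetical stencil table: for each site, precompute the indices of
// its +1 and -1 neighbours so the hot loop does no modular arithmetic.
struct StencilTable {
  std::vector<int> fwd, bwd;
  StencilTable(int L) : fwd(L), bwd(L) {
    for (int x = 0; x < L; x++) {
      fwd[x] = (x + 1) % L;       // periodic wrap forwards
      bwd[x] = (x - 1 + L) % L;   // periodic wrap backwards
    }
  }
};

int main() {
  StencilTable t(8);
  for (int x = 0; x < 8; x++)
    printf("site %d : +1 -> %d, -1 -> %d\n", x, t.fwd[x], t.bwd[x]);
  return 0;
}
```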
//////////////////////////////////////////////////////
//Non checkerboarded support functions
//////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////
// Q. Further split this into separate sub functions?
/////////////////////////////////////////////////////////////
friend void Gather_plane (Lattice<vobj> &rhs,std::vector<vobj> &buffer, int dimension,int plane)
{
const int Nsimd = vector_type::Nsimd();
int rd = rhs._grid->_rdimensions[dimension];
// CshiftCB_local
// CshiftCB_local_permute
int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
int o = 0; // relative offset to base within plane
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
int sx = (x+sshift)%rd;
int so = sx*rhs._grid->_ostride[dimension];
int permute_slice=0;
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
if ( permute_slice ) {
permute(ret._odata[ro+o+b],rhs._odata[so+o+b],permute_type);
} else {
ret._odata[ro+o+b]=rhs._odata[so+o+b];
}
}
o +=rhs._grid->_slice_stride[dimension];
}
}
}
//friend void Gather_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
//
//friend void Scatter_plane (Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
//friend void Scatter_plane_merge (Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
//
//template<int permute_type> friend void Copy_plane_permute(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
// friend void Copy_plane(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
//
//////////////////////////////////////////////////////
//Checkerboarded support functions
//////////////////////////////////////////////////////
//friend void GatherCB_plane (Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
//friend void GatherCB_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
//
//friend void ScatterCB_plane (Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
//friend void ScatterCB_plane_merge (Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
//
//template<int permute_type> friend void CopyCB_plane_permute(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
// friend void Copy_plane(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
// Cshift_comms_splice
// Cshift_comms
// Cshift_local
// Cshift_local_permute
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
const int Nsimd = vector_type::Nsimd();
Lattice<vobj> ret(rhs._grid);
int fd = rhs._grid->_fdimensions[dimension];
int rd = rhs._grid->_rdimensions[dimension];
//int ld = rhs._grid->_ldimensions[dimension];
//int gd = rhs._grid->_gdimensions[dimension];
// Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd;
@ -115,270 +50,288 @@ friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
// the permute type
int simd_layout = rhs._grid->_simd_layout[dimension];
int comm_dim = rhs._grid->_processors[dimension] >1 ;
int permute_dim = rhs._grid->_simd_layout[dimension]>1 && (!comm_dim);
int splice_dim = rhs._grid->_simd_layout[dimension]>1 && (comm_dim);
int permute_type=0;
for(int d=0;d<dimension;d++){
if (rhs._grid->_simd_layout[d]>1 ) permute_type++;
}
// Logic for non-distributed dimension
std::vector<int> comm_offnode(simd_layout);
std::vector<int> comm_to (simd_layout);
std::vector<int> comm_from (simd_layout);
std::vector<int> comm_rx (simd_layout); // reduced coordinate of neighbour plane
std::vector<int> comm_simd_lane(simd_layout);// simd lane of neighbour plane
///////////////////////////////////////////////
// Move via a fake comms buffer
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
int words = sizeof(vobj)/sizeof(vector_type);
std::vector<vobj,alignedAllocator<vobj> > comm_buf(buffer_size);
std::vector<std::vector<scalar_type> > comm_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
std::vector<scalar_type *> pointers(Nsimd);
if ( permute_dim ) {
for(int x=0;x<rd;x++){
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
int o = 0; // relative offset to base
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
int sx = (x+sshift)%rd;
int so = sx*rhs._grid->_ostride[dimension];
int permute_slice=0;
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
if ( permute_slice ) {
permute(ret._odata[ro+o+b],rhs._odata[so+o+b],permute_type);
} else {
ret._odata[ro+o+b]=rhs._odata[so+o+b];
}
}
o +=rhs._grid->_slice_stride[dimension];
}
}
if ( !comm_dim ) {
Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
} else if ( splice_dim ) {
if ( rhs._grid->_simd_layout[dimension] > 2 ) exit(-1); // use Cassert. Audit code for exit and replace
if ( rhs._grid->_simd_layout[dimension] < 1 ) exit(-1);
for(int i=0;i<vobj::vector_type::Nsimd();i++){
pointers[i] = (scalar_type *)&comm_buf_extract[i][0];
}
for(int x=0;x<rd;x++){
///////////////////////////////////////////
// Extract one orthogonal slice at a time
///////////////////////////////////////////
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
o = 0; // relative offset to base
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
int sx = (x+sshift)%rd;
// base offset for source
int so = sx*rhs._grid->_ostride[dimension];
int permute_slice=0;
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
if ( permute_slice ) {
extract(rhs._odata[so+o+b],pointers);
}
}
o +=rhs._grid->_slice_stride[dimension];
}
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
for(int s=0;s<simd_layout;s++) {
// shift to "neighbour" takes us off node
// coordinates (rx, simd_lane) of neighbour
// how many nodes away is this shift
// where we should send to
// where we should receive from
int shifted_x = x+s*rd+shift;
comm_offnode[s] = shifted_x > ld;
comm_send_rx[s] = shifted_x%rd; // which slice to get on the other node
comm_send_simd_lane [s] = shifted_x/rd; // which simd lane on the other node
comm_from[s] = shifted_x/ld;
comm_to [s] = (2*_processors[dimension]-comm_from[s]) % _processors[dimension];
comm_from[s] = (comm_from[s]+_processors[dimension]) % _processors[dimension];
}
////////////////////////////////////////////////
// Insert communication phase
////////////////////////////////////////////////
#if 0
} else if (comm_dim ) {
// Packed gather sequence is clean
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size);
std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);
// off node; communicate slice (ld==rd)
if ( x+shift > rd ) {
int sb=0;
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
send_buf[sb++]=rhs._odata[so+i];
}
so+=rhs._grid->_slice_stride[dimension];
}
// Make a comm_fake that mimics comms in the periodic case.
// scatter face
int rb=0;
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
ret._odata[so+i]=recv_buf[rb++];
}
so+=rhs._grid->_slice_stride[dimension];
}
} else {
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
ret._odata[o+i]=rhs._odata[so+i];
}
o+=rhs._grid->_slice_stride[dimension];
so+=rhs._grid->_slice_stride[dimension];
}
}
#endif
////////////////////////////////////////////////
// Pull receive buffers and permuted buffers in
////////////////////////////////////////////////
for(int i=0;i<vobj::vector_type::Nsimd();i++){
pointers[i] = (scalar_type *)&comm_buf_extract[permute_map[permute_type][i]][0];
}
o = 0; // relative offset to base
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
int sx = (x+sshift)%rd;
// base offset for source
int so = sx*rhs._grid->_ostride[dimension];
int permute_slice=0;
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
if ( permute_slice ) {
merge(ret._odata[ro+o+b],pointers);
}
}
o +=rhs._grid->_slice_stride[dimension];
}
}
} else if ( comm_dim ) {
int co; // comm offset
int o;
co=0;
for(int x=0;x<rd;x++){
o=0;
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
// This call in the inner loop is annoying but necessary for dimension=0
// in the case of RedBlack grids. Could optimise away with
// alternate code paths for all other cases.
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
int sx = (x+sshift)%rd;
int so = sx*rhs._grid->_ostride[dimension];
comm_buf[co++]=rhs._odata[so+o+b];
}
o +=rhs._grid->_slice_stride[dimension];
}
// Step through a copy into a comms buffer and pull back in.
// Genuine fake implementation could calculate if loops back
co=0; o=0;
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
ret._odata[ro+o+b]=comm_buf[co++];
}
o +=rhs._grid->_slice_stride[dimension];
}
}
} else { // Local dimension, no permute required
for(int x=0;x<rd;x++){
int o=0;
int ro = x*rhs._grid->_ostride[dimension]; // base offset for result
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
// This call in the inner loop is annoying but necessary for dimension=0
// in the case of RedBlack grids. Could optimise away with
// alternate code paths for all other cases.
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
int sx = (x+sshift)%rd;
int so = sx*rhs._grid->_ostride[dimension];
ret._odata[bo+o+b]=rhs._odata[so+o+b];
}
o +=rhs._grid->_slice_stride[dimension];
}
}
Cshift_comms_simd(ret,rhs,dimension,shift);
} else {
Cshift_comms(ret,rhs,dimension,shift);
}
return ret;
}
friend void Cshift_comms(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int shift)
{
int sshift[2];
sshift[0] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,0);
sshift[1] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,1);
if ( sshift[0] == sshift[1] ) {
// printf("Cshift_comms : single pass\n");
Cshift_comms(ret,rhs,dimension,shift,0x3);
} else {
// printf("Cshift_comms : two pass\n");
// printf("call1\n");
Cshift_comms(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
// printf("call2\n");
Cshift_comms(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
// printf("done\n");
}
}
friend void Cshift_comms_simd(Lattice<vobj>& ret,Lattice<vobj> &rhs,int dimension,int shift)
{
int sshift[2];
sshift[0] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,0);
sshift[1] = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,1);
if ( sshift[0] == sshift[1] ) {
Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
} else {
// printf("call1 0x1 cb=even\n");
Cshift_comms_simd(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
// printf("call2 0x2 cb=odd\n");
Cshift_comms_simd(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
// printf("done\n");
}
}
friend void Cshift_comms(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
SimdGrid *grid=rhs._grid;
Lattice<vobj> temp(rhs._grid);
int fd = rhs._grid->_fdimensions[dimension];
int rd = rhs._grid->_rdimensions[dimension];
int simd_layout = rhs._grid->_simd_layout[dimension];
int comm_dim = rhs._grid->_processors[dimension] >1 ;
assert(simd_layout==1);
assert(comm_dim==1);
assert(shift>=0);
assert(shift<fd);
// Packed gather sequence is clean
int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size);
std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);
// This code could be simplified by multiple calls to single routine with extra params to
// encapsulate the difference in the code paths.
int cb= (cbmask==0x2)? 1 : 0;
int sshift= rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
for(int x=0;x<rd;x++){
int offnode = ( x+sshift >= rd );
int sx = (x+sshift)%rd;
int comm_proc = (x+sshift)/rd;
if (!offnode) {
// printf("local x %d sshift %d offnode %d rd %d cb %d\n",x,sshift,offnode,rd,cb);
Copy_plane(ret,rhs,dimension,x,sx,cbmask);
} else {
int words = send_buf.size();
if (cbmask != 0x3) words=words>>1;
int bytes = words * sizeof(vobj);
// printf("nonlocal x %d sx %d sshift %d offnode %d rd %d cb %d cbmask %d rhscb %d comm_proc %d\n",
// x,sx,sshift,offnode,rd,cb,cbmask,rhs.checkerboard,comm_proc);
// Copy_plane(temp,rhs,dimension,x,sx,cbmask);
// Bug found; cbmask may differ between sx plane and rx plane.
Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
// for(int i=0;i<MIN(words,8);i++){
// float *ptr = (float *)&send_buf[i];
// printf("send buf shift %d cbmask %d i %d %le\n",sshift,cbmask,i,*ptr);
// }
// Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask^0x3);
// for(int i=0;i<MIN(words,8);i++){
// float *ptr = (float *)&send_buf[i];
// printf("send buf shift %d cbmask %d i %d %le\n",sshift,cbmask,i,*ptr);
// }
// recv_buf=send_buf;
int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
// printf("bytes %d node %d sending to %d receiving from %d\n",bytes,rank,xmit_to_rank,recv_from_rank );
grid->SendToRecvFrom((void *)&send_buf[0],
xmit_to_rank,
(void *)&recv_buf[0],
recv_from_rank,
bytes);
Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
}
}
}
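For each destination slice x, Cshift_comms asks whether the source slice x+sshift still lives on this node: if it spills past rd it sits `(x+sshift)/rd` processors away and its slice index there is the remainder. A standalone sketch of that classification with hypothetical rd and sshift:

```cpp
#include <cstdio>

int main() {
  int rd = 4, sshift = 3;  // hypothetical reduced local dim and shift
  for (int x = 0; x < rd; x++) {
    int offnode   = (x + sshift >= rd);  // source slice on another node?
    int sx        = (x + sshift) % rd;   // its slice index on that node
    int comm_proc = (x + sshift) / rd;   // how many processors away
    printf("x %d : %s, source slice %d, %d hops\n",
           x, offnode ? "off-node" : "local", sx, comm_proc);
  }
  return 0;
}
```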
friend void Cshift_comms_simd(Lattice<vobj> &ret,Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
{
const int Nsimd = vector_type::Nsimd();
SimdGrid *grid=rhs._grid;
typedef typename vobj::vector_type vector_type;
typedef typename vobj::scalar_type scalar_type;
int fd = grid->_fdimensions[dimension];
int rd = grid->_rdimensions[dimension];
int ld = grid->_ldimensions[dimension];
int simd_layout = grid->_simd_layout[dimension];
int comm_dim = grid->_processors[dimension] >1 ;
assert(comm_dim==1);
assert(simd_layout==2);
assert(shift>=0);
assert(shift<fd);
int permute_type=0;
for(int d=0;d<dimension;d++){
if (grid->_simd_layout[d]>1 ) permute_type++;
}
///////////////////////////////////////////////
// Simd direction uses an extract/merge pair
///////////////////////////////////////////////
int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
int words = sizeof(vobj)/sizeof(vector_type);
std::vector<std::vector<scalar_type> > send_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
std::vector<std::vector<scalar_type> > recv_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
int bytes = buffer_size*words*sizeof(scalar_type);
std::vector<scalar_type *> pointers(Nsimd); //
std::vector<scalar_type *> rpointers(Nsimd); // received pointers
///////////////////////////////////////////
// Work out what to send where
///////////////////////////////////////////
int cb = (cbmask==0x2)? 1 : 0;
int sshift= grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,cb);
// printf("cshift-comms-simd: shift = %d ; sshift = %d ; cbmask %d ; simd_layout %d\n",shift,sshift,cbmask,simd_layout);
std::vector<int> comm_offnode(simd_layout);
std::vector<int> comm_proc (simd_layout); //relative processor coord in dim=dimension
// Strategy
//
//* Loop over source planes
//* if any communication needed extract and send
for(int x=0;x<rd;x++){
int comm_any = 0;
for(int s=0;s<simd_layout;s++) {
// does the shift to "neighbour" take us off node?
// coordinates (reduce plane, simd_lane) of neighbour?
// how many nodes away is this shift?
// where we should send to?
// where we should receive from?
int shifted_x = x+s*rd+sshift;
comm_offnode[s] = shifted_x >= ld;
comm_any = comm_any | comm_offnode[s];
comm_proc[s] = shifted_x/ld;
// printf("rd %d x %d shifted %d s=%d comm_any %d\n",rd, x,shifted_x,s,comm_any);
}
int o = 0;
int bo = x*grid->_ostride[dimension];
int sx = (x+sshift)%rd;
// Need Convenience function in _grid. Move this in
if ( comm_any ) {
for(int i=0;i<Nsimd;i++){
pointers[i] = (scalar_type *)&send_buf_extract[i][0];
}
Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
// for(int i=0;i<Nsimd;i++){
// printf("extracted %d %le\n",i,real(send_buf_extract[i][0]));
// }
for(int i=0;i<Nsimd;i++){
int s = grid->iCoordFromIsite(i,dimension);
if(comm_offnode[s]){
int rank = grid->_processor;
int recv_from_rank;
int xmit_to_rank;
grid->ShiftedRanks(dimension,comm_proc[s],xmit_to_rank,recv_from_rank);
grid->SendToRecvFrom((void *)&send_buf_extract[i][0],
xmit_to_rank,
(void *)&recv_buf_extract[i][0],
recv_from_rank,
bytes);
// printf("Cshift_simd comms %d %le %le\n",i,real(recv_buf_extract[i][0]),real(send_buf_extract[i][0]));
rpointers[i] = (scalar_type *)&recv_buf_extract[i][0];
} else {
rpointers[i] = (scalar_type *)&send_buf_extract[i][0];
// printf("Cshift_simd local %d %le \n",i,real(send_buf_extract[i][0]));
}
}
// Permute by swizzling pointers in merge
int permute_slice=0;
int lshift=sshift%ld;
int wrap =lshift/rd;
int num =lshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
for(int i=0;i<vobj::vector_type::Nsimd();i++){
if ( permute_slice ) {
pointers[i] = rpointers[permute_map[permute_type][i]];
} else {
pointers[i] = rpointers[i];
}
// printf("Cshift_simd perm %d num %d wrap %d swiz %d %le unswiz %le\n",permute_slice,num,wrap,i,real(pointers[i][0]),real(rpointers[i][0]));
}
Scatter_plane_merge(ret,pointers,dimension,x,cbmask);
} else {
int permute_slice=0;
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type);
else Copy_plane(ret,rhs,dimension,x,sx,cbmask);
}
}
}
#endif
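The final merge in Cshift_comms_simd permutes by swizzling which lane's receive buffer each pointer refers to rather than moving data: lane i reads from lane permute_map[permute_type][i]. A scalar sketch of that idea with a made-up two-lane swap map (permute_map itself lives elsewhere in Grid and is not shown in this diff):

```cpp
#include <cstdio>

int main() {
  // Two "lanes" of received data and a hypothetical swap map.
  float lane0[2] = {0.f, 1.f}, lane1[2] = {10.f, 11.f};
  float *rpointers[2] = {lane0, lane1};
  int swap_map[2] = {1, 0};        // made-up permute_map row: swap lanes

  float *pointers[2];
  int permute_slice = 1;           // pretend this slice needs permuting
  for (int i = 0; i < 2; i++)
    pointers[i] = permute_slice ? rpointers[swap_map[i]] : rpointers[i];

  // "merge": lane i of the result reads from pointers[i].
  for (int i = 0; i < 2; i++)
    printf("result lane %d first element: %g\n", i, pointers[i][0]);
  return 0;
}
```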

View File

@ -1,313 +1,11 @@
#ifndef _GRID_NONE_CSHIFT_H_
#define _GRID_NONE_CSHIFT_H_
// Work out whether to permute
// ABCDEFGH -> AE BF CG DH permute wrap num
//
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH 0 0
// Shift 1 BF CG DH AE 0 0 0 1 BCDEFGHA 0 1
// Shift 2 CG DH AE BF 0 0 1 1 CDEFGHAB 0 2
// Shift 3 DH AE BF CG 0 1 1 1 DEFGHABC 0 3
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD 1 0
// Shift 5 BF CG DH AE 1 1 1 0 FGHABCDE 1 1
// Shift 6 CG DH AE BF 1 1 0 0 GHABCDEF 1 2
// Shift 7 DH AE BF CG 1 0 0 0 HABCDEFG 1 3
// Suppose 4way simd in one dim.
// ABCDEFGH -> AECG BFDH permute wrap num
// Shift 0 AECG BFDH 0,00 0,00 ABCDEFGH 0 0
// Shift 1 BFDH CGEA 0,00 1,01 BCDEFGHA 0 1
// Shift 2 CGEA DHFB 1,01 1,01 CDEFGHAB 1 0
// Shift 3 DHFB EAGC 1,01 1,11 DEFGHABC 1 1
// Shift 4 EAGC FBHD 1,11 1,11 EFGHABCD 2 0
// Shift 5 FBHD GCAE 1,11 1,10 FGHABCDE 2 1
// Shift 6 GCAE HDBF 1,10 1,10 GHABCDEF 3 0
// Shift 7 HDBF AECG 1,10 0,00 HABCDEFG 3 1
// Generalisation to 8 way simd, 16 way simd required.
//
// Need log2 Nway masks. consisting of
// 1 bit 256 bit granule
// 2 bit 128 bit granule
// 4 bits 64 bit granule
// 8 bits 32 bit granules
//
// 15 bits....
// For optimisation:
//
// split into Cshift_none_rb_permute
// split into Cshift_none_rb_simple
//
// split into Cshift_none_permute
// split into Cshift_none_simple
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
{
Lattice<vobj> ret(rhs._grid);
int fd = rhs._grid->_fdimensions[dimension];
int rd = rhs._grid->_rdimensions[dimension];
int ld = rhs._grid->_ldimensions[dimension];
int gd = rhs._grid->_gdimensions[dimension];
// Map to always positive shift modulo global full dimension.
shift = (shift+fd)%fd;
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
// the permute type
int permute_dim =rhs._grid->_simd_layout[dimension]>1 ;
int permute_type=0;
for(int d=0;d<dimension;d++){
if (rhs._grid->_simd_layout[d]>1 ) permute_type++;
}
for(int x=0;x<rd;x++){
int bo = x*rhs._grid->_ostride[dimension];
int o = 0;
if ( permute_dim ) {
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
int sx = (x+sshift)%rd;
int so = sx*rhs._grid->_ostride[dimension];
int permute_slice=0;
int wrap = sshift/rd;
int num = sshift%rd;
if ( x< rd-num ) permute_slice=wrap;
else permute_slice = 1-wrap;
if ( permute_slice ) {
permute(ret._odata[bo+o+b],rhs._odata[so+o+b],permute_type);
} else {
ret._odata[bo+o+b]=rhs._odata[so+o+b];
}
}
o +=rhs._grid->_slice_stride[dimension];
}
} else {
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
// This call in the inner loop is annoying but necessary for dimension=0
// in the case of RedBlack grids. Could optimise away with
// alternate code paths for all other cases.
int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
int sx = (x+sshift)%rd;
int so = sx*rhs._grid->_ostride[dimension];
ret._odata[bo+o+b]=rhs._odata[so+o+b];
}
o +=rhs._grid->_slice_stride[dimension];
}
}
}
return ret;
Cshift_local(ret,rhs,dimension,shift);
}
#if 0
// Collapse doesn't appear to work the way I think it should in icpc
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
{
Lattice<vobj> ret(rhs._grid);
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift);
int sx,so,o;
int rd = rhs._grid->_rdimensions[dimension];
int ld = rhs._grid->_dimensions[dimension];
// Map to always positive shift.
shift = (shift+ld)%ld;
// Work out whether to permute and the permute type
// ABCDEFGH -> AE BF CG DH permute
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH
// Shift 1 DH AE BF CG 1 0 0 0 HABCDEFG
// Shift 2 CG DH AE BF 1 1 0 0 GHABCDEF
// Shift 3 BF CG DH AE 1 1 1 0 FGHABCDE
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD
// Shift 5 DH AE BF CG 0 1 1 1 DEFGHABC
// Shift 6 CG DH AE BF 0 0 1 1 CDEFGHAB
// Shift 7 BF CG DH AE 0 0 0 1 BCDEFGHA
int permute_dim =rhs._grid->_layout[dimension]>1 ;
int permute_type=0;
for(int d=0;d<dimension;d++)
if (rhs._grid->_layout[d]>1 ) permute_type++;
// loop over perp slices.
// Threading considerations:
// Need to map thread_num to
//
// x_min,x_max for Loop-A
// n_min,n_max for Loop-B
// b_min,b_max for Loop-C
// In a way that maximally load balances.
//
// Optimal:
// There are rd*n_block*block items of work.
// These serialise as item "w"
// b=w%block; w=w/block
// n=w%nblock; x=w/nblock. Perhaps 20 cycles?
//
// Logic:
// x_chunk = (rd+thread)/nthreads simply divide work across nodes.
//
// rd=5 , threads = 8;
// 0 1 2 3 4 5 6 7
// 0 0 0 1 1 1 1 1
for(int x=0;x<rd;x++){ // Loop A
sx = (x-shift+ld)%rd;
o = x*rhs._grid->_ostride[dimension];
so =sx*rhs._grid->_ostride[dimension];
int permute_slice=0;
if ( permute_dim ) {
permute_slice = shift/rd;
if ( x<shift%rd ) permute_slice = 1-permute_slice;
}
#if 0
if ( permute_slice ) {
int internal=sizeof(vobj)/sizeof(vComplex);
int num =rhs._grid->_slice_block[dimension]*internal;
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
vComplex *optr = (vComplex *)&ret._odata[o];
vComplex *iptr = (vComplex *)&rhs._odata[so];
for(int b=0;b<num;b++){
permute(optr[b],iptr[b],permute_type);
}
o+=rhs._grid->_slice_stride[dimension];
so+=rhs._grid->_slice_stride[dimension];
}
} else {
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
ret._odata[o+i]=rhs._odata[so+i];
}
o+=rhs._grid->_slice_stride[dimension];
so+=rhs._grid->_slice_stride[dimension];
}
}
#else
if ( permute_slice ) {
int internal=sizeof(vobj)/sizeof(vComplex);
int num =rhs._grid->_slice_block[dimension]*internal;
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int b=0;b<num;b++){
vComplex *optr = (vComplex *)&ret._odata[o +n*rhs._grid->_slice_stride[dimension]];
vComplex *iptr = (vComplex *)&rhs._odata[so+n*rhs._grid->_slice_stride[dimension]];
permute(optr[b],iptr[b],permute_type);
}
}
} else {
#pragma omp parallel for collapse(2)
for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
int oo = o+ n*rhs._grid->_slice_stride[dimension];
int soo=so+ n*rhs._grid->_slice_stride[dimension];
ret._odata[oo+i]=rhs._odata[soo+i];
}
}
}
#endif
}
return ret;
}
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
{
Lattice<vobj> ret(rhs._grid);
ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift);
int rd = rhs._grid->_rdimensions[dimension];
int ld = rhs._grid->_dimensions[dimension];
// Map to always positive shift.
shift = (shift+ld)%ld;
// Work out whether to permute and the permute type
// ABCDEFGH -> AE BF CG DH permute
// Shift 0 AE BF CG DH 0 0 0 0 ABCDEFGH
// Shift 1 DH AE BF CG 1 0 0 0 HABCDEFG
// Shift 2 CG DH AE BF 1 1 0 0 GHABCDEF
// Shift 3 BF CG DH AE 1 1 1 0 FGHABCDE
// Shift 4 AE BF CG DH 1 1 1 1 EFGHABCD
// Shift 5 DH AE BF CG 0 1 1 1 DEFGHABC
// Shift 6 CG DH AE BF 0 0 1 1 CDEFGHAB
// Shift 7 BF CG DH AE 0 0 0 1 BCDEFGHA
int permute_dim =rhs._grid->_layout[dimension]>1 ;
int permute_type=0;
for(int d=0;d<dimension;d++)
if (rhs._grid->_layout[d]>1 ) permute_type++;
// loop over all work
int work =rd*rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
#pragma omp parallel for
for(int ww=0;ww<work;ww++){
// can optimise this if we know w moves contiguously for a given thread.
// b=(b+1);
// if (b==_slice_block) {b=0; n=n+1;}
// if (n==_slice_nblock) { n=0; x=x+1}
//
// Perhaps a five cycle iterator, or so.
int w=ww;
int b = w%rhs._grid->_slice_block[dimension] ; w=w/rhs._grid->_slice_block[dimension];
int n = w%rhs._grid->_slice_nblock[dimension]; w=w/rhs._grid->_slice_nblock[dimension];
int x = w;
int sx,so,o;
sx = (x-shift+ld)%rd;
o = x*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension]; // common sub expression alert.
so =sx*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension];
int permute_slice=0;
if ( permute_dim ) {
permute_slice = shift/rd;
if ( x<shift%rd ) permute_slice = 1-permute_slice;
}
if ( permute_slice ) {
int internal=sizeof(vobj)/sizeof(vComplex);
vComplex *optr = (vComplex *)&ret._odata[o+b];
vComplex *iptr = (vComplex *)&rhs._odata[so+b];
const char *pf = (const char *)iptr;
for(int i=0;i<sizeof(vobj);i+=64){
_mm_prefetch(pf+i,_MM_HINT_T0);
}
for(int i=0;i<internal;i++){
permute(optr[i],iptr[i],permute_type);
}
} else {
const char *pf = (const char *) &rhs._odata[so+b];
for(int i=0;i<sizeof(vobj);i+=64){
_mm_prefetch(pf+i,_MM_HINT_T0);
}
ret._odata[o+b]=rhs._odata[so+b];
}
}
return ret;
}
#endif
#endif
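The experimental `#if 0` path above flattens the three loops into one work index so OpenMP can balance rd*nblock*block items across threads; w decomposes back into (b,n,x) with two mod/div pairs. A standalone round-trip check of that decomposition with hypothetical extents:

```cpp
#include <cstdio>

int main() {
  int rd = 2, nblock = 3, block = 4;   // hypothetical loop extents
  int work = rd * nblock * block;
  for (int ww = 0; ww < work; ww++) {
    int w = ww;
    int b = w % block;  w = w / block;   // innermost index
    int n = w % nblock; w = w / nblock;  // middle index
    int x = w;                           // outermost index
    // Recover the flat index from (x, n, b) and compare.
    int check = (x * nblock + n) * block + b;
    if (check != ww) printf("mismatch at %d\n", ww);
  }
  printf("decomposition round-trips for all %d items\n", work);
  return 0;
}
```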

View File

@ -23,8 +23,20 @@ include_HEADERS = Grid.h\
# Test code
#
bin_PROGRAMS = Grid_main
extra_sources=
if BUILD_COMMS_MPI
extra_sources+=Grid_mpi.cc
endif
if BUILD_COMMS_FAKE
extra_sources+=Grid_fake.cc
endif
if BUILD_COMMS_NONE
extra_sources+=Grid_fake.cc
endif
Grid_main_SOURCES = \
Grid_main.cc\
Grid_fake.cc
$(extra_sources)
Grid_main_LDADD = libGrid.a

View File

@ -89,6 +89,9 @@ NORMAL_UNINSTALL = :
PRE_UNINSTALL = :
POST_UNINSTALL = :
bin_PROGRAMS = Grid_main$(EXEEXT)
@BUILD_COMMS_MPI_TRUE@am__append_1 = Grid_mpi.cc
@BUILD_COMMS_FAKE_TRUE@am__append_2 = Grid_fake.cc
@BUILD_COMMS_NONE_TRUE@am__append_3 = Grid_fake.cc
subdir = .
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
am__aclocal_m4_deps = $(top_srcdir)/configure.ac
@ -143,7 +146,12 @@ libGrid_a_LIBADD =
am_libGrid_a_OBJECTS = Grid_init.$(OBJEXT)
libGrid_a_OBJECTS = $(am_libGrid_a_OBJECTS)
PROGRAMS = $(bin_PROGRAMS)
am_Grid_main_OBJECTS = Grid_main.$(OBJEXT) Grid_fake.$(OBJEXT)
am__Grid_main_SOURCES_DIST = Grid_main.cc Grid_mpi.cc Grid_fake.cc
@BUILD_COMMS_MPI_TRUE@am__objects_1 = Grid_mpi.$(OBJEXT)
@BUILD_COMMS_FAKE_TRUE@am__objects_2 = Grid_fake.$(OBJEXT)
@BUILD_COMMS_NONE_TRUE@am__objects_3 = Grid_fake.$(OBJEXT)
am__objects_4 = $(am__objects_1) $(am__objects_2) $(am__objects_3)
am_Grid_main_OBJECTS = Grid_main.$(OBJEXT) $(am__objects_4)
Grid_main_OBJECTS = $(am_Grid_main_OBJECTS)
Grid_main_DEPENDENCIES = libGrid.a
AM_V_P = $(am__v_P_@AM_V@)
@ -176,7 +184,7 @@ am__v_CXXLD_ = $(am__v_CXXLD_@AM_DEFAULT_V@)
am__v_CXXLD_0 = @echo " CXXLD " $@;
am__v_CXXLD_1 =
SOURCES = $(libGrid_a_SOURCES) $(Grid_main_SOURCES)
DIST_SOURCES = $(libGrid_a_SOURCES) $(Grid_main_SOURCES)
DIST_SOURCES = $(libGrid_a_SOURCES) $(am__Grid_main_SOURCES_DIST)
am__can_run_installinfo = \
case $$AM_UPDATE_INFO_DIR in \
n|no|NO) false;; \
@ -340,9 +348,10 @@ include_HEADERS = Grid.h\
Grid_Lattice.h\
Grid_config.h
extra_sources = $(am__append_1) $(am__append_2) $(am__append_3)
Grid_main_SOURCES = \
Grid_main.cc\
Grid_fake.cc
$(extra_sources)
Grid_main_LDADD = libGrid.a
all: Grid_config.h
@ -490,6 +499,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_fake.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_init.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_main.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/Grid_mpi.Po@am__quote@
.cc.o:
@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<

705
aclocal.m4 vendored

File diff suppressed because it is too large

43
configure vendored
View File

@ -626,6 +626,12 @@ ac_subst_vars='am__EXEEXT_FALSE
am__EXEEXT_TRUE
LTLIBOBJS
LIBOBJS
BUILD_COMMS_NONE_FALSE
BUILD_COMMS_NONE_TRUE
BUILD_COMMS_FAKE_FALSE
BUILD_COMMS_FAKE_TRUE
BUILD_COMMS_MPI_FALSE
BUILD_COMMS_MPI_TRUE
EGREP
GREP
CPP
@ -5068,6 +5074,31 @@ $as_echo "#define GRID_COMMS_MPI 1" >>confdefs.h
;;
esac
if test "X${ac_COMMS}X" == "XmpiX" ; then
BUILD_COMMS_MPI_TRUE=
BUILD_COMMS_MPI_FALSE='#'
else
BUILD_COMMS_MPI_TRUE='#'
BUILD_COMMS_MPI_FALSE=
fi
if test "X${ac_COMMS}X" == "XfakeX" ; then
BUILD_COMMS_FAKE_TRUE=
BUILD_COMMS_FAKE_FALSE='#'
else
BUILD_COMMS_FAKE_TRUE='#'
BUILD_COMMS_FAKE_FALSE=
fi
if test "X${ac_COMMS}X" == "XnoneX" ; then
BUILD_COMMS_NONE_TRUE=
BUILD_COMMS_NONE_FALSE='#'
else
BUILD_COMMS_NONE_TRUE='#'
BUILD_COMMS_NONE_FALSE=
fi
ac_config_files="$ac_config_files Makefile"
@ -5208,6 +5239,18 @@ if test -z "${am__fastdepCC_TRUE}" && test -z "${am__fastdepCC_FALSE}"; then
as_fn_error $? "conditional \"am__fastdepCC\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${BUILD_COMMS_MPI_TRUE}" && test -z "${BUILD_COMMS_MPI_FALSE}"; then
as_fn_error $? "conditional \"BUILD_COMMS_MPI\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${BUILD_COMMS_FAKE_TRUE}" && test -z "${BUILD_COMMS_FAKE_FALSE}"; then
as_fn_error $? "conditional \"BUILD_COMMS_FAKE\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${BUILD_COMMS_NONE_TRUE}" && test -z "${BUILD_COMMS_NONE_FALSE}"; then
as_fn_error $? "conditional \"BUILD_COMMS_NONE\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
: "${CONFIG_STATUS=./config.status}"
ac_write_fail=0

View File

@ -71,6 +71,10 @@ case ${ac_COMMS} in
;;
esac
AM_CONDITIONAL(BUILD_COMMS_MPI,[ test "X${ac_COMMS}X" == "XmpiX" ])
AM_CONDITIONAL(BUILD_COMMS_FAKE,[ test "X${ac_COMMS}X" == "XfakeX" ])
AM_CONDITIONAL(BUILD_COMMS_NONE,[ test "X${ac_COMMS}X" == "XnoneX" ])
AC_CONFIG_FILES(Makefile)
AC_OUTPUT