Grid/Grid_mpi_cshift.h

#ifndef _GRID_MPI_CSHIFT_H_
#define _GRID_MPI_CSHIFT_H_

      
//////////////////////////////////////////////
// Q. Split this into separate sub functions?
//////////////////////////////////////////////

// CshiftCB_comms_splice
// CshiftCB_comms
// CshiftCB_local
// CshiftCB_local_permute

// Cshift_comms_splice
// Cshift_comms
// Cshift_local
// Cshift_local_permute

// Broadly I remain annoyed that the iteration is so painful
// for red black data layout, when simple block strided descriptors suffice for non-cb. 
//
// The other option is to do it table driven, or perhaps store the CB of each site in a table.
//
// Must not lose sight that goal is to be able to construct really efficient
// gather to a point stencil code. CSHIFT is not the best way, so probably need
// additional stencil support.
//
// Could still do a templated syntax tree and make CSHIFT return lattice vector.
//
// Stencil based code could pre-exchange haloes and use a table lookup for neighbours
//
// Lattice <foo> could also allocate haloes which get used for stencil code.
//
// Grid could create a neighbour index table for a given stencil.
// Could also implement CovariantCshift.

//////////////////////////////////////////////////////
//Non checkerboarded support functions
//////////////////////////////////////////////////////

// Gather_plane
//
// Copy one plane of a (non-checkerboarded) lattice, orthogonal to `dimension`,
// into a contiguous buffer.
//
//   rhs       : lattice to read from; it is not modified.
//   buffer    : destination. Grown if it is too small to hold the plane,
//               never shrunk. Site (n,b) of the plane lands in buffer[n*block+b].
//   dimension : direction orthogonal to the plane.
//   plane     : index of the plane along `dimension`; the plane starts at
//               plane*_ostride[dimension] in rhs._odata.
//
// The plane is walked as _slice_nblock[dimension] blocks of
// _slice_block[dimension] consecutive sites, successive blocks being
// _slice_stride[dimension] apart. No checkerboard shift and no SIMD permute
// is applied here.
friend void Gather_plane        (Lattice<vobj> &rhs,std::vector<vobj> &buffer,             int dimension,int plane)
{
  const int nblock = rhs._grid->_slice_nblock[dimension];
  const int block  = rhs._grid->_slice_block [dimension];
  const int stride = rhs._grid->_slice_stride[dimension];
  const int so     = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane

  // Make sure every index written below is valid.
  const int plane_sites = nblock*block;
  if ( static_cast<int>(buffer.size()) < plane_sites ) buffer.resize(plane_sites);

  // Source and destination indices are computed from (n,b) alone, so the two
  // loops are perfectly nested and the iterations are independent: this is
  // what makes the collapse(2) work-sharing legal (no shared running offset).
#pragma omp parallel for collapse(2)
  for(int n=0;n<nblock;n++){
    for(int b=0;b<block;b++){
      buffer[n*block+b] = rhs._odata[so+n*stride+b];
    }
  }
}
//friend void Gather_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
//
//friend void Scatter_plane       (Lattice<vobj> &rhs,std::vector<vobj> face,             int dimension,int plane);
//friend void Scatter_plane_merge (Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
//
//template<int permute_type> friend void Copy_plane_permute(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
//                           friend void Copy_plane(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
//

//////////////////////////////////////////////////////
//Checkerboarded support functions
//////////////////////////////////////////////////////

//friend void GatherCB_plane        (Lattice<vobj> &rhs,std::vector<vobj> face,             int dimension,int plane);
//friend void GatherCB_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
//
//friend void ScatterCB_plane       (Lattice<vobj> &rhs,std::vector<vobj> face,             int dimension,int plane);
//friend void ScatterCB_plane_merge (Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
//
//template<int permute_type> friend void CopyCB_plane_permute(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
//                           friend void Copy_plane(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);


// Cshift
//
// Circular shift of a lattice along one dimension. Returns a new lattice on
// the same grid as `rhs` in which the site at reduced coordinate x along
// `dimension` is taken from the site at (x+sshift)%rd of `rhs`, where sshift
// is the per-site shift reported by CheckerBoardShift().
//
//   rhs       : source lattice (read only).
//   dimension : direction to shift in.
//   shift     : displacement; any sign, reduced modulo the full dimension.
//
// One of four code paths is taken, chosen from the grid's _simd_layout and
// _processors entries for `dimension`:
//   permute : SIMD-packed, single processor -> copy or lane permute
//   splice  : SIMD-packed, several processors -> extract / (comms) / merge
//   comms   : not SIMD-packed, several processors -> via a "fake" comms buffer
//   local   : neither -> plain strided copy
//
// The splice path calls exit(-1) when the SIMD layout in `dimension` is
// greater than 2 or less than 1.
//
// NOTE(review): the splice path looks unfinished. extract() is handed the same
// destination pointers for every site of a slice, the routing tables computed
// in the "what to send where" step are never consumed, and sites that do not
// need a permute are never written to the result. Confirm against the
// intended design before relying on that path.
friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
{
  typedef typename vobj::vector_type vector_type;
  typedef typename vobj::scalar_type scalar_type;
  const int Nsimd = vector_type::Nsimd();

  Lattice<vobj> ret(rhs._grid);

  int fd = rhs._grid->_fdimensions[dimension];
  int rd = rhs._grid->_rdimensions[dimension];
  int ld = rhs._grid->_ldimensions[dimension]; // needed by the splice routing below
  //int gd = rhs._grid->_gdimensions[dimension];

  // Slice geometry for this dimension; loop invariant, so read it once.
  const int nblock  = rhs._grid->_slice_nblock[dimension];
  const int block   = rhs._grid->_slice_block [dimension];
  const int stride  = rhs._grid->_slice_stride[dimension];
  const int ostride = rhs._grid->_ostride     [dimension];
  const int nproc   = rhs._grid->_processors  [dimension];

  // Map to always positive shift modulo global full dimension.
  // The inner % keeps this correct for shift < -fd as well.
  shift = ((shift%fd)+fd)%fd;

  ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);

  // Which code path applies to this dimension
  int simd_layout     = rhs._grid->_simd_layout[dimension];
  int comm_dim        = nproc >1 ;
  int permute_dim     = simd_layout>1 && (!comm_dim);
  int splice_dim      = simd_layout>1 && (comm_dim);

  // The permute type: number of SIMD-packed dimensions preceding this one
  int permute_type=0;
  for(int d=0;d<dimension;d++){
    if (rhs._grid->_simd_layout[d]>1 ) permute_type++;
  }

  // Routing tables filled in by the splice path, one entry per SIMD lane
  // in this dimension.
  std::vector<int> comm_offnode(simd_layout);
  std::vector<int> comm_to     (simd_layout);
  std::vector<int> comm_from   (simd_layout);
  std::vector<int> comm_rx     (simd_layout);  // reduced coordinate of neighbour plane
  std::vector<int> comm_simd_lane(simd_layout);// simd lane of neighbour plane

  ///////////////////////////////////////////////
  // Move via a fake comms buffer
  // Simd direction uses an extract/merge pair
  ///////////////////////////////////////////////
  int buffer_size = nblock*block;
  int words = sizeof(vobj)/sizeof(vector_type);

  std::vector<vobj,alignedAllocator<vobj> > comm_buf(buffer_size);
  std::vector<std::vector<scalar_type> > comm_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
  std::vector<scalar_type *> pointers(Nsimd);


  if ( permute_dim ) {

    for(int x=0;x<rd;x++){
      int ro  = x*ostride; // base offset for result
      int o   = 0;         // relative offset to base
      for(int n=0;n<nblock;n++){
        for(int b=0;b<block;b++){

          int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
          int sx = (x+sshift)%rd;
          int so = sx*ostride;

          // wrap/num split the shift into whole multiples of rd and the
          // remainder; the permute decision flips once x+num runs past rd.
          int permute_slice=0;
          int wrap = sshift/rd;
          int  num = sshift%rd;

          if ( x< rd-num ) permute_slice=wrap;
          else permute_slice = 1-wrap;

          if ( permute_slice ) {
            permute(ret._odata[ro+o+b],rhs._odata[so+o+b],permute_type);
          } else {
            ret._odata[ro+o+b]=rhs._odata[so+o+b];
          }
        }
        o +=stride;
      }
    }

  } else if ( splice_dim ) {

    if ( simd_layout > 2 ) exit(-1); // use Cassert. Audit code for exit and replace
    if ( simd_layout < 1 ) exit(-1);

    for(int i=0;i<Nsimd;i++){
      pointers[i] = (scalar_type *)&comm_buf_extract[i][0];
    }

    for(int x=0;x<rd;x++){

      ///////////////////////////////////////////
      // Extract one orthogonal slice at a time
      ///////////////////////////////////////////
      int ro  = x*ostride; // base offset for result
      int o   = 0;         // relative offset to base
      for(int n=0;n<nblock;n++){
        for(int b=0;b<block;b++){

          int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
          int sx = (x+sshift)%rd;

          // base offset for source
          int so = sx*ostride;

          int permute_slice=0;
          int wrap = sshift/rd;
          int  num = sshift%rd;

          if ( x< rd-num ) permute_slice=wrap;
          else permute_slice = 1-wrap;

          if ( permute_slice ) {
            // NOTE(review): same `pointers` for every site, so each call
            // presumably overwrites the previous site's data - confirm.
            extract(rhs._odata[so+o+b],pointers);
          }
        }
        o +=stride;
      }

      ///////////////////////////////////////////
      // Work out what to send where
      ///////////////////////////////////////////
      for(int s=0;s<simd_layout;s++) {

        // shift to "neighbour" takes us off node
        // coordinates (rx, simd_lane) of neighbour
        // how many nodes away is this shift
        // where we should send to
        // where we should receive from
        int shifted_x = x+s*rd+shift;
        // NOTE(review): `>` kept from the original; `>=` may be intended
        // if local coordinates run 0..ld-1 - confirm.
        comm_offnode[s]     = shifted_x > ld;
        comm_rx[s]          = shifted_x%rd;     // which slice to get on the other node
        comm_simd_lane[s]   = shifted_x/rd;     // which simd lane on the other node
        comm_from[s]        = shifted_x/ld;
        comm_to  [s]        = (2*nproc-comm_from[s]) % nproc;
        comm_from[s]        = (comm_from[s]+nproc)   % nproc;

      }

      ////////////////////////////////////////////////
      // Insert communication phase
      ////////////////////////////////////////////////
      // The disabled block below is an earlier sketch of the real comms
      // phase, kept for reference only; it does not compile as it stands.
#if 0
	  } else if (comm_dim ) {
	    
	    // Packed gather sequence is clean
	    int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_nblock[dimension];
	    std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size);
	    std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);

	    // off node; communcate slice (ld==rd)
	    if ( x+shift > rd ) { 
	      int sb=0;
	      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
		for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
		  send_buf[sb++]=rhs._odata[so+i];
		}
		so+=rhs._grid->_slice_stride[dimension];
	      }	      

	      // Make a comm_fake them mimics comms in periodic case.
	      
	      // scatter face
	      int rb=0;
	      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
		for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
		  ret._odata[so+i]=recv_buf[rb++];
		}
		so+=rhs._grid->_slice_stride[dimension];
	      }	      

	    } else { 

	      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
		for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
		  ret._odata[o+i]=rhs._odata[so+i];
		}
		o+=rhs._grid->_slice_stride[dimension];
		so+=rhs._grid->_slice_stride[dimension];
	      }
	    }

#endif

      ////////////////////////////////////////////////
      // Pull receive buffers and permuted buffers in
      ////////////////////////////////////////////////
      for(int i=0;i<Nsimd;i++){
        pointers[i] = (scalar_type *)&comm_buf_extract[permute_map[permute_type][i]][0];
      }

      o   = 0;             // relative offset to base; ro is unchanged for this x
      for(int n=0;n<nblock;n++){
        for(int b=0;b<block;b++){

          int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);

          int permute_slice=0;
          int wrap = sshift/rd;
          int  num = sshift%rd;

          if ( x< rd-num ) permute_slice=wrap;
          else permute_slice = 1-wrap;

          if ( permute_slice ) {
            merge(ret._odata[ro+o+b],pointers);
          }
        }
        o +=stride;
      }
    }

  } else if ( comm_dim ) {

    for(int x=0;x<rd;x++){
      int ro  = x*ostride; // base offset for result

      // Gather the shifted slice into the comms buffer. The buffer holds
      // exactly one slice, so the fill index must restart for every x.
      int co = 0;          // comm offset
      int o  = 0;
      for(int n=0;n<nblock;n++){
        for(int b=0;b<block;b++){

          // This call in inner loop is annoying but necessary for dimension=0
          // in the case of RedBlack grids. Could optimise away with
          // alternate code paths for all other cases.
          int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
          int sx = (x+sshift)%rd;
          int so = sx*ostride;

          comm_buf[co++]=rhs._odata[so+o+b];

        }
        o +=stride;
      }

      // Step through a copy into a comms buffer and pull back in.
      // Genuine fake implementation could calculate if loops back
      co=0; o=0;
      for(int n=0;n<nblock;n++){
        for(int b=0;b<block;b++){
          ret._odata[ro+o+b]=comm_buf[co++];
        }
        o +=stride;
      }
    }

  } else { // Local dimension, no permute required

    for(int x=0;x<rd;x++){
      int o=0;
      int ro  = x*ostride; // base offset for result
      for(int n=0;n<nblock;n++){
        for(int b=0;b<block;b++){

          // This call in inner loop is annoying but necessary for dimension=0
          // in the case of RedBlack grids. Could optimise away with
          // alternate code paths for all other cases.
          int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);

          int sx = (x+sshift)%rd;
          int so = sx*ostride;
          ret._odata[ro+o+b]=rhs._odata[so+o+b];

        }
        o +=stride;
      }
    }

  }

  return ret;
}


#endif
Fixing the Checkerboarding cshift. Implemented "fake" communications in preparation for the leap to MPI. 2015-03-29 20:35:37 +01:00
			`#ifndef _GRID_MPI_CSHIFT_H_`
			`#define _GRID_MPI_CSHIFT_H_`


			`//////////////////////////////////////////////`
			`// Q. Split this into separate sub functions?`
			`//////////////////////////////////////////////`

			`// CshiftCB_comms_splice`
			`// CshiftCB_comms`
			`// CshiftCB_local`
			`// CshiftCB_local_permute`

			`// Cshift_comms_splice`
			`// Cshift_comms`
			`// Cshift_local`
			`// Cshift_local_permute`

			`// Broadly I remain annoyed that the iteration is so painful`
			`// for red black data layout, when simple block strided descriptors suffice for non-cb.`
			`//`
			`// The other option is to do it table driven, or perhaps store the CB of each site in a table.`
			`//`
			`// Must not lose sight that goal is to be able to construct really efficient`
			`// gather to a point stencil code. CSHIFT is not the best way, so probably need`
			`// additional stencil support.`
			`//`
			`// Could still do a templated syntax tree and make CSHIFT return lattice vector.`
			`//`
			`// Stencil based code could pre-exchange haloes and use a table lookup for neighbours`
			`//`
			`// Lattice <foo> could also allocate haloes which get used for stencil code.`
			`//`
			`// Grid could create a neighbour index table for a given stencil.`
			`// Could also implement CovariantCshift.`

			`//////////////////////////////////////////////////////`
			`//Non checkerboarded support functions`
			`//////////////////////////////////////////////////////`

			`friend void Gather_plane (Lattice<vobj> &rhs,std::vector<vobj> &buffer, int dimension,int plane)`
			`{`
			`const int Nsimd = vector_type::Nsimd();`
			`int rd = rhs._grid->_rdimensions[dimension];`

			`int so = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane`
			`int o = 0; // relative offset to base within plane`
			`#pragma omp parallel for collapse(2)`
			`for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){`
			`for(int b=0;b<rhs._grid->_slice_block[dimension];b++){`

			`int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);`
			`int sx = (x+sshift)%rd;`
			`int so = sx*rhs._grid->_ostride[dimension];`
			`int permute_slice=0;`
			`int wrap = sshift/rd;`
			`int num = sshift%rd;`

			`if ( x< rd-num ) permute_slice=wrap;`
			`else permute_slice = 1-wrap;`

			`if ( permute_slice ) {`
			`permute(ret._odata[ro+o+b],rhs._odata[so+o+b],permute_type);`
			`} else {`
			`ret._odata[ro+o+b]=rhs._odata[so+o+b];`
			`}`
			`}`
			`o +=rhs._grid->_slice_stride[dimension];`
			`}`
			`}`

			`}`
			`//friend void Gather_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);`
			`//`
			`//friend void Scatter_plane (Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);`
			`//friend void Scatter_plane_merge (Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);`
			`//`
			`//template<int permute_type> friend void Copy_plane_permute(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);`
			`// friend void Copy_plane(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);`
			`//`

			`//////////////////////////////////////////////////////`
			`//Checkerboarded support functions`
			`//////////////////////////////////////////////////////`

			`//friend void GatherCB_plane (Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);`
			`//friend void GatherCB_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);`
			`//`
			`//friend void ScatterCB_plane (Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);`
			`//friend void ScatterCB_plane_merge (Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);`
			`//`
			`//template<int permute_type> friend void CopyCB_plane_permute(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);`
			`// friend void Copy_plane(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);`


			`friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)`
			`{`
			`typedef typename vobj::vector_type vector_type;`
			`typedef typename vobj::scalar_type scalar_type;`
			`const int Nsimd = vector_type::Nsimd();`

			`Lattice<vobj> ret(rhs._grid);`

			`int fd = rhs._grid->_fdimensions[dimension];`
			`int rd = rhs._grid->_rdimensions[dimension];`
			`//int ld = rhs._grid->_ldimensions[dimension];`
			`//int gd = rhs._grid->_gdimensions[dimension];`


			`// Map to always positive shift modulo global full dimension.`
			`shift = (shift+fd)%fd;`

			`ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);`

			`// the permute type`
			`int simd_layout = rhs._grid->_simd_layout[dimension];`
			`int comm_dim = rhs._grid->_processors[dimension] >1 ;`
			`int permute_dim = rhs._grid->_simd_layout[dimension]>1 && (!comm_dim);`
			`int splice_dim = rhs._grid->_simd_layout[dimension]>1 && (comm_dim);`

			`int permute_type=0;`
			`for(int d=0;d<dimension;d++){`
			`if (rhs._grid->_simd_layout[d]>1 ) permute_type++;`
			`}`

			`// Logic for non-distributed dimension`
			`std::vector<int> comm_offnode(simd_layout);`
			`std::vector<int> comm_to (simd_layout);`
			`std::vector<int> comm_from (simd_layout);`
			`std::vector<int> comm_rx (simd_layout); // reduced coordinate of neighbour plane`
			`std::vector<int> comm_simd_lane(simd_layout);// simd lane of neigbour plane`

			`///////////////////////////////////////////////`
			`// Move via a fake comms buffer`
			`// Simd direction uses an extract/merge pair`
			`///////////////////////////////////////////////`
			`int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];`
			`int words = sizeof(vobj)/sizeof(vector_type);`

			`std::vector<vobj,alignedAllocator<vobj> > comm_buf(buffer_size);`
			`std::vector<std::vector<scalar_type> > comm_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );`
			`std::vector<scalar_type *> pointers(Nsimd);`



			`if ( permute_dim ) {`

			`for(int x=0;x<rd;x++){`
			`int ro = x*rhs._grid->_ostride[dimension]; // base offset for result`
			`int o = 0; // relative offset to base`
			`for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){`
			`for(int b=0;b<rhs._grid->_slice_block[dimension];b++){`

			`int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);`
			`int sx = (x+sshift)%rd;`
			`int so = sx*rhs._grid->_ostride[dimension];`
			`int permute_slice=0;`
			`int wrap = sshift/rd;`
			`int num = sshift%rd;`

			`if ( x< rd-num ) permute_slice=wrap;`
			`else permute_slice = 1-wrap;`

			`if ( permute_slice ) {`
			`permute(ret._odata[ro+o+b],rhs._odata[so+o+b],permute_type);`
			`} else {`
			`ret._odata[ro+o+b]=rhs._odata[so+o+b];`
			`}`
			`}`
			`o +=rhs._grid->_slice_stride[dimension];`
			`}`
			`}`

			`} else if ( splice_dim ) {`

			`if ( rhs._grid->_simd_layout[dimension] > 2 ) exit(-1); // use Cassert. Audit code for exit and replace`
			`if ( rhs._grid->_simd_layout[dimension] < 1 ) exit(-1);`


			`for(int i=0;i<vobj::vector_type::Nsimd();i++){`
			`pointers[i] = (scalar_type *)&comm_buf_extract[i][0];`
			`}`


			`for(int x=0;x<rd;x++){`


			`///////////////////////////////////////////`
			`// Extract one orthogonal slice at a time`
			`///////////////////////////////////////////`
			`int ro = x*rhs._grid->_ostride[dimension]; // base offset for result`

			`o = 0; // relative offset to base`
			`for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){`
			`for(int b=0;b<rhs._grid->_slice_block[dimension];b++){`

			`int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);`
			`int sx = (x+sshift)%rd;`

			`// base offset for source`
			`int so = sx*rhs._grid->_ostride[dimension];`

			`int permute_slice=0;`
			`int wrap = sshift/rd;`
			`int num = sshift%rd;`

			`if ( x< rd-num ) permute_slice=wrap;`
			`else permute_slice = 1-wrap;`

			`if ( permute_slice ) {`
			`extract(rhs._odata[so+o+b],pointers);`
			`}`
			`}`
			`o +=rhs._grid->_slice_stride[dimension];`
			`}`


			`///////////////////////////////////////////`
			`// Work out what to send where`
			`///////////////////////////////////////////`

			`for(int s=0;s<simd_layout;s++) {`


			`// shift to "neighbour" takes us off node`
			`// coordinates (rx, simd_lane) of neighbour`
			`// how many nodes away is this shift`
			`// where we should send to`
			`// where we should receive from`
			`int shifted_x = x+s*rd+shift;`
			`comm_offnode[s] = shifted_x > ld;`
			`comm_send_rx[s] = shifted_x%rd; // which slice geton the other node`
			`comm_send_simd_lane [s] = shifted_x/rd; // which slice on the other node`
			`comm_from[s] = shifted_x/ld;`
			`comm_to [s] = (2*_processors[dimension]-comm_from[s]) % _processors[dimension];`
			`comm_from[s] = (comm_from[s]+_processors[dimension]) % _processors[dimension];`

			`}`

			`////////////////////////////////////////////////`
			`// Insert communication phase`
			`////////////////////////////////////////////////`
			`#if 0`
			`} else if (comm_dim ) {`

			`// Packed gather sequence is clean`
			`int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_nblock[dimension];`
			`std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size);`
			`std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);`

			`// off node; communcate slice (ld==rd)`
			`if ( x+shift > rd ) {`
			`int sb=0;`
			`for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){`
			`for(int i=0;i<rhs._grid->_slice_block[dimension];i++){`
			`send_buf[sb++]=rhs._odata[so+i];`
			`}`
			`so+=rhs._grid->_slice_stride[dimension];`
			`}`

			`// Make a comm_fake them mimics comms in periodic case.`

			`// scatter face`
			`int rb=0;`
			`for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){`
			`for(int i=0;i<rhs._grid->_slice_block[dimension];i++){`
			`ret._odata[so+i]=recv_buf[rb++];`
			`}`
			`so+=rhs._grid->_slice_stride[dimension];`
			`}`

			`} else {`

			`for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){`
			`for(int i=0;i<rhs._grid->_slice_block[dimension];i++){`
			`ret._odata[o+i]=rhs._odata[so+i];`
			`}`
			`o+=rhs._grid->_slice_stride[dimension];`
			`so+=rhs._grid->_slice_stride[dimension];`
			`}`
			`}`

			`#endif`

			`////////////////////////////////////////////////`
			`// Pull receive buffers and permuted buffers in`
			`////////////////////////////////////////////////`
			`for(int i=0;i<vobj::vector_type::Nsimd();i++){`
			`pointers[i] = (scalar_type *)&comm_buf_extract[permute_map[permute_type][i]][0];`
			`}`

			`o = 0; // relative offset to base`
			`int ro = x*rhs._grid->_ostride[dimension]; // base offset for result`
			`for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){`
			`for(int b=0;b<rhs._grid->_slice_block[dimension];b++){`

			`int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);`
			`int sx = (x+sshift)%rd;`

			`// base offset for source`
			`int so = sx*rhs._grid->_ostride[dimension];`

			`int permute_slice=0;`
			`int wrap = sshift/rd;`
			`int num = sshift%rd;`

			`if ( x< rd-num ) permute_slice=wrap;`
			`else permute_slice = 1-wrap;`

			`if ( permute_slice ) {`
			`merge(ret._odata[ro+o+b],pointers);`
			`}`
			`}`
			`o +=rhs._grid->_slice_stride[dimension];`
			`}`
			`}`


			`} else if ( comm_dim ) {`


			`int co; // comm offset`
			`int o;`

			`co=0;`
			`for(int x=0;x<rd;x++){`
			`o=0;`
			`int ro = x*rhs._grid->_ostride[dimension]; // base offset for result`
			`for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){`
			`for(int b=0;b<rhs._grid->_slice_block[dimension];b++){`

			`// This call in inner loop is annoying but necessary for dimension=0`
			`// in the case of RedBlack grids. Could optimise away with`
			`// alternate code paths for all other cases.`
			`int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);`
			`int sx = (x+sshift)%rd;`
			`int so = sx*rhs._grid->_ostride[dimension];`

			`comm_buf[co++]=rhs._odata[so+o+b];`

			`}`
			`o +=rhs._grid->_slice_stride[dimension];`
			`}`

			`// Step through a copy into a comms buffer and pull back in.`
			`// Genuine fake implementation could calculate if loops back`
			`co=0; o=0;`
			`for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){`
			`for(int b=0;b<rhs._grid->_slice_block[dimension];b++){`
			`ret._odata[ro+o+b]=comm_buf[co++];`
			`}`
			`o +=rhs._grid->_slice_stride[dimension];`
			`}`
			`}`

			`} else { // Local dimension, no permute required`

			`for(int x=0;x<rd;x++){`
			`int o=0;`
			`int ro = x*rhs._grid->_ostride[dimension]; // base offset for result`
			`for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){`
			`for(int b=0;b<rhs._grid->_slice_block[dimension];b++){`

			`// This call in inner loop is annoying but necessary for dimension=0`
			`// in the case of RedBlack grids. Could optimise away with`
			`// alternate code paths for all other cases.`
			`int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);`

			`int sx = (x+sshift)%rd;`
			`int so = sx*rhs._grid->_ostride[dimension];`
			`ret._odata[bo+o+b]=rhs._odata[so+o+b];`

			`}`
			`o +=rhs._grid->_slice_stride[dimension];`
			`}`
			`}`

			`}`

			`return ret;`
			`}`


			`#endif`