Hadrons: moving Hadrons to root directory, build system improvements

2026-05-06 02:04:33 +01:00 · 2018-08-28 15:00:40 +01:00
parent 5f206df775
commit fb7d021b9d
499 changed files with 429 additions and 846 deletions
@@ -0,0 +1,52 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/Cshift.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef _GRID_CSHIFT_H_
+#define _GRID_CSHIFT_H_
+
+#include <Grid/cshift/Cshift_common.h>
+
+#ifdef GRID_COMMS_NONE
+#include <Grid/cshift/Cshift_none.h>
+#endif
+
+#ifdef GRID_COMMS_MPI
+#include <Grid/cshift/Cshift_mpi.h>
+#endif 
+
+#ifdef GRID_COMMS_MPI3
+#include <Grid/cshift/Cshift_mpi.h>
+#endif 
+
+#ifdef GRID_COMMS_MPIT
+#include <Grid/cshift/Cshift_mpi.h>
+#endif 
+
+#ifdef GRID_COMMS_SHMEM
+#include <Grid/cshift/Cshift_mpi.h> // uses same implementation of communicator
+#endif 
+#endif
@@ -0,0 +1,391 @@
+/*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/cshift/Cshift_common.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef _GRID_CSHIFT_COMMON_H_
+#define _GRID_CSHIFT_COMMON_H_
+
+namespace Grid {
+
+///////////////////////////////////////////////////////////////////
+// Gather for when there is no need to SIMD split 
+///////////////////////////////////////////////////////////////////
+// Packs one plane of `rhs` (fixed outer coordinate `plane` along `dimension`)
+// into `buffer`, writing from buffer index `off` onwards.
+//
+//   rhs       : source lattice, read through rhs._odata
+//   buffer    : destination communication buffer
+//   dimension : direction orthogonal to the plane
+//   plane     : outer coordinate of the plane along `dimension`
+//   cbmask    : checkerboard selection; forced to 0x3 (everything) when the grid
+//               reports that `dimension` is not checkerboarded
+//   off       : first buffer slot to use (default 0)
+//
+// With cbmask==0x3 site (n,b) lands in slot off + n*e2 + b.  Otherwise only the
+// sites whose bit (1<<CheckerBoardFromOindex) is set in cbmask are copied and
+// they are packed contiguously starting at `off`.
+//
+// The (buffer slot, lattice index) pairs are tabulated serially so that the
+// copy itself can run as a flat parallel_for over the table.
+// NOTE(review): `table` is a function-local static that is resized on every
+// call; this presumably relies on the routine never being entered
+// concurrently -- confirm.
+template<class vobj> void 
+Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
+{
+  if ( !rhs._grid->CheckerBoarded(dimension) ) {
+    cbmask = 0x3;
+  }
+
+  const int so     = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
+  const int e1     = rhs._grid->_slice_nblock[dimension];
+  const int e2     = rhs._grid->_slice_block[dimension];
+  const int stride = rhs._grid->_slice_stride[dimension];
+
+  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
+  int ent = 0;
+
+  if ( cbmask == 0x3 ) {
+    for(int n=0;n<e1;n++){
+      const int o  = n*stride; // lattice offset of row n inside the plane
+      const int bo = n*e2;     // buffer offset of row n
+      for(int b=0;b<e2;b++){
+        table[ent++] = std::pair<int,int>(off+bo+b,so+o+b);
+      }
+    }
+  } else {
+    // Dense packing: the buffer slot only advances for sites that pass the mask.
+    int bo=0;
+    for(int n=0;n<e1;n++){
+      const int o = n*stride;
+      for(int b=0;b<e2;b++){
+        const int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
+        if ( ocb & cbmask ) {
+          table[ent++]=std::pair<int,int> (off+bo++,so+o+b);
+        }
+      }
+    }
+  }
+  parallel_for(int i=0;i<ent;i++){
+    buffer[table[i].first]=rhs._odata[table[i].second];
+  }
+}
+
+///////////////////////////////////////////////////////////////////
+// Gather for when there *is* need to SIMD split 
+///////////////////////////////////////////////////////////////////
+// Gathers one plane of `rhs` through extract<vobj>() into the buffers addressed
+// by `pointers`.
+//
+//   rhs       : source lattice, read through rhs._odata
+//   pointers  : destination pointers handed unchanged to extract<vobj>()
+//               (presumably one scalar buffer per SIMD lane -- confirm)
+//   dimension : direction orthogonal to the plane
+//   plane     : outer coordinate of the plane along `dimension`
+//   cbmask    : checkerboard selection; forced to 0x3 when the grid reports that
+//               `dimension` is not checkerboarded
+//
+// Every selected site (n,b) is extracted at offset b + n*e2.  Note that the
+// masked branch uses the same offset formula, i.e. skipped sites leave holes in
+// the destination rather than being compacted.
+// NOTE(review): parallel_for_nest2 presumably collapses both loops, so the
+// loop nests below are deliberately kept perfectly nested -- confirm against
+// the macro definition before moving statements between the two `for`s.
+template<class vobj> void 
+Gather_plane_extract(const Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask)
+{
+  if ( !rhs._grid->CheckerBoarded(dimension) ) {
+    cbmask = 0x3;
+  }
+
+  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
+
+  int e1=rhs._grid->_slice_nblock[dimension];
+  int e2=rhs._grid->_slice_block[dimension];
+  int n1=rhs._grid->_slice_stride[dimension];
+
+  if ( cbmask ==0x3){
+    parallel_for_nest2(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+
+        int o      =   n*n1;   // lattice offset of row n inside the plane
+        int offset = b+n*e2;   // position inside the extracted buffers
+
+        vobj temp =rhs._odata[so+o+b];
+        extract<vobj>(temp,pointers,offset);
+
+      }
+    }
+  } else { 
+
+    // Case of SIMD split AND checker dim cannot currently be hit, except in 
+    // Test_cshift_red_black code.
+    std::cout << " Dense packed buffer WARNING " <<std::endl;
+    parallel_for_nest2(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+
+        int o=n*n1;
+        int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
+        int offset = b+n*e2;
+
+        if ( ocb & cbmask ) {
+          vobj temp =rhs._odata[so+o+b];
+          extract<vobj>(temp,pointers,offset);
+        }
+      }
+    }
+  }
+}
+
+//////////////////////////////////////////////////////
+// Scatter for when there is no need to SIMD split
+//////////////////////////////////////////////////////
+// Unpacks `buffer` into one plane of `rhs` (fixed outer coordinate `plane`
+// along `dimension`); the inverse addressing of Gather_plane_simple with a
+// zero buffer offset.
+//
+//   rhs       : destination lattice, written through rhs._odata
+//   buffer    : source communication buffer
+//   dimension : direction orthogonal to the plane
+//   plane     : outer coordinate of the plane along `dimension`
+//   cbmask    : checkerboard selection; forced to 0x3 when the grid reports that
+//               `dimension` is not checkerboarded
+//
+// With cbmask==0x3 site (n,b) is read from slot n*e2 + b.  Otherwise only sites
+// whose bit (1<<CheckerBoardFromOindex) is set in cbmask are written, consuming
+// the buffer contiguously from slot 0.
+// NOTE(review): `table` is a function-local static resized on every call; this
+// presumably relies on the routine never being entered concurrently -- confirm.
+template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
+{
+  if ( !rhs._grid->CheckerBoarded(dimension) ) {
+    cbmask=0x3;
+  }
+
+  const int so     = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
+  const int e1     = rhs._grid->_slice_nblock[dimension];
+  const int e2     = rhs._grid->_slice_block[dimension];
+  const int stride = rhs._grid->_slice_stride[dimension];
+
+  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
+  int ent    =0;
+
+  if ( cbmask ==0x3 ) {
+
+    for(int n=0;n<e1;n++){
+      const int o  = n*stride; // lattice offset of row n inside the plane
+      const int bo = n*e2;     // buffer offset of row n
+      for(int b=0;b<e2;b++){
+        table[ent++] = std::pair<int,int>(so+o+b,bo+b);
+      }
+    }
+
+  } else { 
+    // Densely packed source: the buffer slot only advances for selected sites.
+    int bo=0;
+    for(int n=0;n<e1;n++){
+      const int o = n*stride;
+      for(int b=0;b<e2;b++){
+        const int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);// Could easily be a table lookup
+        if ( ocb & cbmask ) {
+          table[ent++]=std::pair<int,int> (so+o+b,bo++);
+        }
+      }
+    }
+  }
+
+  parallel_for(int i=0;i<ent;i++){
+    rhs._odata[table[i].first]=buffer[table[i].second];
+  }
+}
+
+//////////////////////////////////////////////////////
+// Scatter for when there *is* need to SIMD split
+//////////////////////////////////////////////////////
+// Fills one plane of `rhs` by calling merge() on every selected site with the
+// buffers addressed by `pointers`.
+//
+//   rhs       : destination lattice, written through rhs._odata
+//   pointers  : source pointers handed unchanged to merge()
+//               (presumably one scalar buffer per SIMD lane -- confirm)
+//   dimension : direction orthogonal to the plane
+//   plane     : outer coordinate of the plane along `dimension`
+//   cbmask    : checkerboard selection; forced to 0x3 when the grid reports that
+//               `dimension` is not checkerboarded
+//
+// Site (n,b) is merged from offset b + n*e2 in both branches, so the masked
+// branch expects a source laid out with holes, not a compacted one.
+// NOTE(review): parallel_for_nest2 presumably collapses both loops, so the
+// threaded nest is kept perfectly nested -- confirm against the macro.
+template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,std::vector<typename vobj::scalar_object *> pointers,int dimension,int plane,int cbmask)
+{
+  if ( !rhs._grid->CheckerBoarded(dimension) ) {
+    cbmask=0x3;
+  }
+
+  const int so     = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane
+  const int e1     = rhs._grid->_slice_nblock[dimension];
+  const int e2     = rhs._grid->_slice_block[dimension];
+  const int stride = rhs._grid->_slice_stride[dimension];
+
+  if(cbmask ==0x3 ) {
+    parallel_for_nest2(int n=0;n<e1;n++){
+      for(int b=0;b<e2;b++){
+        int o      = n*stride;
+        int offset = b+n*e2;
+        merge(rhs._odata[so+o+b],pointers,offset);
+      }
+    }
+  } else { 
+
+    // Case of SIMD split AND checker dim cannot currently be hit, except in 
+    // Test_cshift_red_black code.
+    // FIXME: this path was suspected to be buggy by the original author.
+    std::cout<<" Unthreaded warning -- buffer is not densely packed ??"<<std::endl;
+    for(int n=0;n<e1;n++){
+      const int o = n*stride;
+      for(int b=0;b<e2;b++){
+        const int offset = b+n*e2;
+        const int ocb=1<<rhs._grid->CheckerBoardFromOindex(o+b);
+        if ( ocb&cbmask ) {
+          merge(rhs._odata[so+o+b],pointers,offset);
+        }
+      }
+    }
+  }
+}
+
+//////////////////////////////////////////////////////
+// local to node block strided copies
+//////////////////////////////////////////////////////
+// Copies plane `rplane` of `rhs` onto plane `lplane` of `lhs`, both taken
+// orthogonal to `dimension`, without leaving the node.
+//
+//   lhs, rhs  : destination / source lattices
+//   dimension : direction orthogonal to the planes
+//   lplane    : outer coordinate of the destination plane
+//   rplane    : outer coordinate of the source plane
+//   cbmask    : checkerboard selection; forced to 0x3 when the grid of `rhs`
+//               reports that `dimension` is not checkerboarded
+//
+// Slice extents and stride are taken from rhs._grid; the destination base
+// offset and the checkerboard test use lhs._grid.  The (dst,src) index pairs
+// are tabulated serially and the copy is a flat parallel_for over the table.
+// NOTE(review): `table` is a function-local static resized on every call; this
+// presumably relies on the routine never being entered concurrently -- confirm.
+template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask)
+{
+  if ( !rhs._grid->CheckerBoarded(dimension) ) {
+    cbmask=0x3;
+  }
+
+  const int ro = rplane*rhs._grid->_ostride[dimension]; // base offset of the source plane
+  const int lo = lplane*lhs._grid->_ostride[dimension]; // base offset of the destination plane
+
+  const int e1     = rhs._grid->_slice_nblock[dimension];
+  const int e2     = rhs._grid->_slice_block[dimension];
+  const int stride = rhs._grid->_slice_stride[dimension];
+
+  static std::vector<std::pair<int,int> > table; table.resize(e1*e2);
+  int ent=0;
+
+  const bool take_all = (cbmask == 0x3);
+  for(int n=0;n<e1;n++){
+    for(int b=0;b<e2;b++){
+      const int o = n*stride+b; // in-plane offset shared by source and destination
+      if ( !take_all ) {
+        const int ocb=1<<lhs._grid->CheckerBoardFromOindex(o);
+        if ( !(ocb&cbmask) ) continue; // site belongs to the other checkerboard
+      }
+      table[ent++] = std::pair<int,int>(lo+o,ro+o);
+    }
+  }
+
+  parallel_for(int i=0;i<ent;i++){
+    lhs._odata[table[i].first]=rhs._odata[table[i].second];
+  }
+
+}
+
+// Same plane-to-plane traversal as Copy_plane, but every site is transferred
+// with permute(dst,src,permute_type) instead of a plain assignment.
+//
+//   lhs, rhs     : destination / source lattices
+//   dimension    : direction orthogonal to the planes
+//   lplane       : outer coordinate of the destination plane
+//   rplane       : outer coordinate of the source plane
+//   cbmask       : checkerboard selection; forced to 0x3 when the grid of `rhs`
+//                  reports that `dimension` is not checkerboarded
+//   permute_type : forwarded unchanged to permute()
+//
+// NOTE(review): `table` is a function-local static resized on every call; this
+// presumably relies on the routine never being entered concurrently -- confirm.
+template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vobj> &rhs, int dimension,int lplane,int rplane,int cbmask,int permute_type)
+{
+  if ( !rhs._grid->CheckerBoarded(dimension) ) {
+    cbmask=0x3;
+  }
+
+  const int ro = rplane*rhs._grid->_ostride[dimension]; // base offset of the source plane
+  const int lo = lplane*lhs._grid->_ostride[dimension]; // base offset of the destination plane
+
+  const int e1     = rhs._grid->_slice_nblock[dimension];
+  const int e2     = rhs._grid->_slice_block [dimension];
+  const int stride = rhs._grid->_slice_stride[dimension];
+
+  static std::vector<std::pair<int,int> > table;  table.resize(e1*e2);
+  int ent=0;
+
+  const bool take_all = (cbmask == 0x3);
+  for(int n=0;n<e1;n++){
+    const int o = n*stride; // offset of row n inside the plane
+    for(int b=0;b<e2;b++){
+      if ( !take_all ) {
+        const int ocb=1<<lhs._grid->CheckerBoardFromOindex(o+b);
+        if ( !(ocb&cbmask) ) continue; // site belongs to the other checkerboard
+      }
+      table[ent++] = std::pair<int,int>(lo+o+b,ro+o+b);
+    }
+  }
+
+  parallel_for(int i=0;i<ent;i++){
+    permute(lhs._odata[table[i].first],rhs._odata[table[i].second],permute_type);
+  }
+}
+
+//////////////////////////////////////////////////////
+// Local to node Cshift
+//////////////////////////////////////////////////////
+// Node-local circular shift of `rhs` by `shift` along `dimension` into `ret`.
+//
+// Asks the grid for the effective shift seen by the Even and by the Odd
+// checkerboard.  When both agree the whole lattice is moved in a single pass
+// (cbmask 0x3); otherwise the two checkerboards are moved in separate passes
+// (cbmask 0x1, then 0x2), each with its own effective shift.
+template<class vobj> void Cshift_local(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
+{
+  int sshift[2];
+
+  sshift[0] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
+  sshift[1] = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);
+
+  if ( sshift[0] == sshift[1] ) {
+    Cshift_local(ret,rhs,dimension,shift,0x3);
+  } else {
+    Cshift_local(ret,rhs,dimension,shift,0x1);// if checkerboard is unfavourable take two passes
+    Cshift_local(ret,rhs,dimension,shift,0x2);// both with block stride loop iteration
+  }
+}
+
+// Node-local circular shift restricted to the checkerboard(s) selected by
+// `cbmask` (0x1, 0x2 or 0x3).
+//
+//   ret       : destination lattice; its checkerboard tag is set here
+//   rhs       : source lattice
+//   dimension : direction of the shift
+//   shift     : requested shift; any integer, reduced to [0,fd) below
+//   cbmask    : checkerboard selection forwarded to the plane copies
+//
+// For every outer plane x the source plane is (x+sshift)%rd.  When the grid
+// flags `dimension` as a permute dimension, planes whose source wraps around
+// the reduced extent rd additionally need a SIMD lane permutation, which is
+// delegated to Copy_plane_permute.
+template<class vobj> void Cshift_local(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
+{
+  GridBase *grid = rhs._grid;
+  const int fd = grid->_fdimensions[dimension];
+  const int rd = grid->_rdimensions[dimension];
+  const int ly = grid->_simd_layout[dimension];
+
+  // Map to always positive shift modulo global full dimension.  The double
+  // modulo keeps the result in [0,fd) even when shift < -fd.
+  shift = ((shift%fd)+fd)%fd;
+
+  // the permute type
+  ret.checkerboard = grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension);
+  const int permute_dim  = grid->PermuteDim(dimension);
+  const int permute_type = grid->PermuteType(dimension);
+
+  // Effective shift for the checkerboard being moved; independent of the plane.
+  const int cb     = (cbmask==0x2)? Odd : Even;
+  const int sshift = grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
+
+  for(int x=0;x<rd;x++){       
+
+    const int sx = (x+sshift)%rd;
+    
+    // wrap is whether sshift > rd.
+    //  num is sshift mod rd.
+    // 
+    //  shift 7
+    //
+    //  XoXo YcYc 
+    //  oXoX cYcY
+    //  XoXo YcYc
+    //  oXoX cYcY
+    //
+    //  sshift -- 
+    //
+    //  XX YY ; 3
+    //  XX YY ; 0
+    //  XX YY ; 3
+    //  XX YY ; 0
+    //
+    int permute_slice     = 0;
+    int permute_type_dist = permute_type;
+    if(permute_dim){
+      const int wrap = (sshift/rd) % ly;
+      const int num  = sshift%rd;
+
+      // Planes at or beyond rd-num take their data from one SIMD slice further on.
+      permute_slice = ( x< rd-num ) ? wrap : (wrap+1)%ly;
+
+      if ( (ly>2) && (permute_slice) ) {
+        assert(permute_type & RotateBit);
+        permute_type_dist = permute_type|permute_slice;
+      }
+    }
+
+    if ( permute_slice ) Copy_plane_permute(ret,rhs,dimension,x,sx,cbmask,permute_type_dist);
+    else                 Copy_plane(ret,rhs,dimension,x,sx,cbmask); 
+  
+  }
+}
+}
+#endif
@@ -0,0 +1,262 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/cshift/Cshift_mpi.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+Author: paboyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef _GRID_CSHIFT_MPI_H_
+#define _GRID_CSHIFT_MPI_H_
+
+
+namespace Grid { 
+
+// Circular shift of `rhs` by `shift` sites along `dimension`; returns the
+// shifted lattice by value.
+//
+// The shift is first reduced to [0,fd) and the checkerboard tag of the result
+// is set from the grid.  The work is then dispatched on the processor and SIMD
+// layout of `dimension`:
+//   - one processor in this dimension            -> Cshift_local
+//   - several processors and SIMD layout > 1     -> Cshift_comms_simd
+//   - several processors and SIMD layout of 1    -> Cshift_comms
+template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
+{
+  Lattice<vobj> ret(rhs._grid); 
+  
+  const int fd = rhs._grid->_fdimensions[dimension];
+
+  // Map to always positive shift modulo global full dimension.  The double
+  // modulo keeps the result in [0,fd) even when shift < -fd, which the
+  // communicating paths assert on.
+  shift = ((shift%fd)+fd)%fd;
+
+  ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension);
+        
+  const int comm_dim   = rhs._grid->_processors[dimension] >1 ;
+  const int splice_dim = rhs._grid->_simd_layout[dimension]>1 && (comm_dim);
+
+  if ( !comm_dim ) {
+    Cshift_local(ret,rhs,dimension,shift); // Handles checkerboarding
+  } else if ( splice_dim ) {
+    Cshift_comms_simd(ret,rhs,dimension,shift);
+  } else {
+    Cshift_comms(ret,rhs,dimension,shift);
+  }
+  return ret;
+}
+
+// Communicating shift, checkerboard dispatch.
+// Compares the effective shift the grid reports for the Even and the Odd
+// checkerboard: equal shifts are handled in one pass over everything (0x3),
+// differing shifts in two passes, Even mask (0x1) first and Odd mask (0x2) second.
+template<class vobj> void Cshift_comms(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
+{
+  const int shift_even = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
+  const int shift_odd  = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);
+
+  if ( shift_even != shift_odd ) {
+    // Unfavourable checkerboard: one block-strided pass per parity.
+    Cshift_comms(ret,rhs,dimension,shift,0x1);
+    Cshift_comms(ret,rhs,dimension,shift,0x2);
+    return;
+  }
+
+  // Both parities move by the same amount: single pass.
+  Cshift_comms(ret,rhs,dimension,shift,0x3);
+}
+
+// Communicating shift along a SIMD-split dimension, checkerboard dispatch.
+// Compares the effective shift the grid reports for the Even and the Odd
+// checkerboard: equal shifts are handled in one pass over everything (0x3),
+// differing shifts in two passes, Even mask (0x1) first and Odd mask (0x2) second.
+template<class vobj> void Cshift_comms_simd(Lattice<vobj>& ret,const Lattice<vobj> &rhs,int dimension,int shift)
+{
+  const int shift_even = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Even);
+  const int shift_odd  = rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,Odd);
+
+  if ( shift_even != shift_odd ) {
+    // Unfavourable checkerboard: one block-strided pass per parity.
+    Cshift_comms_simd(ret,rhs,dimension,shift,0x1);
+    Cshift_comms_simd(ret,rhs,dimension,shift,0x2);
+    return;
+  }
+
+  // Both parities move by the same amount: single pass.
+  Cshift_comms_simd(ret,rhs,dimension,shift,0x3);
+}
+
+// Communicating shift for a dimension that is split over processors but not
+// over SIMD lanes, restricted to the checkerboard(s) selected by `cbmask`.
+//
+//   ret       : destination lattice
+//   rhs       : source lattice
+//   dimension : direction of the shift; must have more than one processor and
+//               a SIMD layout of 1 (asserted)
+//   shift     : shift already reduced to [0,fd) (asserted)
+//   cbmask    : 0x1, 0x2 or 0x3
+//
+// For each outer plane x the source plane is (x+sshift)%rd.  Planes whose
+// source stays on this node are copied directly; all others are gathered into
+// a send buffer, exchanged with the ranks returned by ShiftedRanks, and
+// scattered into `ret`.
+template<class vobj> void Cshift_comms(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
+{
+  GridBase *grid=rhs._grid;
+
+  int fd              = rhs._grid->_fdimensions[dimension];
+  int rd              = rhs._grid->_rdimensions[dimension];
+  int pd              = rhs._grid->_processors[dimension];
+  int simd_layout     = rhs._grid->_simd_layout[dimension];
+  int comm_dim        = rhs._grid->_processors[dimension] >1 ;
+  assert(simd_layout==1);
+  assert(comm_dim==1);
+  assert(shift>=0);
+  assert(shift<fd);
+  
+  int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
+  commVector<vobj> send_buf(buffer_size);
+  commVector<vobj> recv_buf(buffer_size);
+
+  // Message size is the same for every plane: a full plane, or half of it when
+  // only one checkerboard is being moved.
+  int words = send_buf.size();
+  if (cbmask != 0x3) words=words>>1;
+  int bytes = words * sizeof(vobj);
+
+  int cb= (cbmask==0x2)? Odd : Even;
+  int sshift= rhs._grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
+
+  for(int x=0;x<rd;x++){       
+
+    int sx        =  (x+sshift)%rd;
+    int comm_proc = ((x+sshift)/rd)%pd; // how many processors away the source plane lives
+    
+    if (comm_proc==0) {
+
+      Copy_plane(ret,rhs,dimension,x,sx,cbmask); 
+
+    } else {
+
+      Gather_plane_simple (rhs,send_buf,dimension,sx,cbmask);
+
+      int recv_from_rank;
+      int xmit_to_rank;
+      grid->ShiftedRanks(dimension,comm_proc,xmit_to_rank,recv_from_rank);
+
+      grid->SendToRecvFrom((void *)&send_buf[0],
+                           xmit_to_rank,
+                           (void *)&recv_buf[0],
+                           recv_from_rank,
+                           bytes);
+      grid->Barrier();
+
+      Scatter_plane_simple (ret,recv_buf,dimension,x,cbmask);
+    }
+  }
+}
+
+// Communicating shift for a dimension that is split over processors AND over
+// SIMD lanes, restricted to the checkerboard(s) selected by `cbmask`.
+//
+//   ret       : destination lattice
+//   rhs       : source lattice
+//   dimension : direction of the shift; must have more than one processor and
+//               a SIMD layout of exactly 2 (asserted)
+//   shift     : shift already reduced to [0,fd) (asserted)
+//   cbmask    : 0x1, 0x2 or 0x3
+//
+// For each outer plane x the source plane (x+sshift)%rd is extracted into one
+// scalar buffer per SIMD lane.  For every destination lane the neighbouring
+// coordinate is computed; if it lives on another processor the matching lane
+// buffer is exchanged with that rank, otherwise the local lane buffer is used
+// directly.  The per-lane pointers are then merged back into plane x of `ret`.
+template<class vobj> void  Cshift_comms_simd(Lattice<vobj> &ret,const Lattice<vobj> &rhs,int dimension,int shift,int cbmask)
+{
+  GridBase *grid=rhs._grid;
+  const int Nsimd = grid->Nsimd();
+  typedef typename vobj::scalar_object scalar_object;
+   
+  int fd = grid->_fdimensions[dimension];
+  int rd = grid->_rdimensions[dimension];
+  int ld = grid->_ldimensions[dimension];
+  int pd = grid->_processors[dimension];
+  int simd_layout     = grid->_simd_layout[dimension];
+  int comm_dim        = grid->_processors[dimension] >1 ;
+
+  assert(comm_dim==1);
+  assert(simd_layout==2);
+  assert(shift>=0);
+  assert(shift<fd);
+
+  int permute_type=grid->PermuteType(dimension);
+
+  // Lane-index bit that selects the inner (SIMD) coordinate along `dimension`;
+  // it depends only on Nsimd and the permute type.
+  const int inner_bit = (Nsimd>>(permute_type+1));
+
+  ///////////////////////////////////////////////
+  // Simd direction uses an extract/merge pair
+  ///////////////////////////////////////////////
+  int buffer_size = grid->_slice_nblock[dimension]*grid->_slice_block[dimension];
+
+  std::vector<commVector<scalar_object> >   send_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
+  std::vector<commVector<scalar_object> >   recv_buf_extract(Nsimd,commVector<scalar_object>(buffer_size) );
+
+  int bytes = buffer_size*sizeof(scalar_object);
+
+  std::vector<scalar_object *>  pointers(Nsimd); // 
+  std::vector<scalar_object *> rpointers(Nsimd); // received pointers
+
+  ///////////////////////////////////////////
+  // Work out what to send where
+  ///////////////////////////////////////////
+  int cb    = (cbmask==0x2)? Odd : Even;
+  int sshift= grid->CheckerBoardShiftForCB(rhs.checkerboard,dimension,shift,cb);
+
+  // loop over outer coord planes orthog to dim
+  for(int x=0;x<rd;x++){       
+
+    // FIXME call local permute copy if none are offnode.
+    for(int i=0;i<Nsimd;i++){       
+      pointers[i] = &send_buf_extract[i][0];
+    }
+    int sx   = (x+sshift)%rd;
+    Gather_plane_extract(rhs,pointers,dimension,sx,cbmask);
+
+    for(int i=0;i<Nsimd;i++){
+      
+      int ic= (i&inner_bit)? 1:0;         // inner coord of this lane
+
+      int my_coor          = rd*ic + x;   // local coordinate of this lane's site
+      int nbr_coor         = my_coor+sshift;
+      int nbr_proc = ((nbr_coor)/ld) % pd;// relative shift in processors
+
+      int nbr_ic   = (nbr_coor%ld)/rd;    // inner coord of peer
+      int nbr_ox   = (nbr_coor%rd);       // outer coord of peer
+      int nbr_lane = (i&(~inner_bit));
+
+      int recv_from_rank;
+      int xmit_to_rank;
+
+      if (nbr_ic) nbr_lane|=inner_bit;
+
+      assert (sx == nbr_ox);
+
+      if(nbr_proc){
+        grid->ShiftedRanks(dimension,nbr_proc,xmit_to_rank,recv_from_rank); 
+
+        grid->SendToRecvFrom((void *)&send_buf_extract[nbr_lane][0],
+                             xmit_to_rank,
+                             (void *)&recv_buf_extract[i][0],
+                             recv_from_rank,
+                             bytes);
+        grid->Barrier();
+        rpointers[i] = &recv_buf_extract[i][0];
+      } else { 
+        // Peer lane is on this node: read straight from the extracted buffer.
+        rpointers[i] = &send_buf_extract[nbr_lane][0];
+      }
+
+    }
+    Scatter_plane_merge(ret,rpointers,dimension,x,cbmask);
+  }
+
+ }
+}
+#endif
@@ -0,0 +1,39 @@
+    /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid 
+
+    Source file: ./lib/cshift/Cshift_none.h
+
+    Copyright (C) 2015
+
+Author: Peter Boyle <paboyle@ph.ed.ac.uk>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+    *************************************************************************************/
+    /*  END LEGAL */
+#ifndef _GRID_CSHIFT_NONE_H_
+#define _GRID_CSHIFT_NONE_H_
+namespace Grid {
+// Circular shift for builds without inter-node communication: tags the result
+// with the destination checkerboard reported by the grid and performs the
+// whole shift with the node-local implementation.
+template<class vobj> Lattice<vobj> Cshift(const Lattice<vobj> &rhs,int dimension,int shift)
+{
+  Lattice<vobj> shifted(rhs._grid);
+
+  shifted.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift,dimension);
+
+  Cshift_local(shifted,rhs,dimension,shift);
+
+  return shifted;
+}
+}
+#endif