Fixing the Checkerboarding cshift.

Implemented "fake" communications in preparation for the leap to MPI.
2026-01-10 20:09:33 +00:00 · 2015-03-29 20:35:37 +01:00
parent 82eeb9d07e
commit 196fd203e2
23 changed files with 1976 additions and 343 deletions
--- a/Grid/Grid.xcodeproj/project.pbxproj
+++ b/Grid/Grid.xcodeproj/project.pbxproj
@@ -0,0 +1,239 @@
+// !$*UTF8*$!
+{
+	archiveVersion = 1;
+	classes = {
+	};
+	objectVersion = 46;
+	objects = {
+
+/* Begin PBXBuildFile section */
+		0BF5DA461AC370840045EA34 /* main.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 0BF5DA451AC370840045EA34 /* main.cpp */; };
+/* End PBXBuildFile section */
+
+/* Begin PBXCopyFilesBuildPhase section */
+		0BF5DA401AC370840045EA34 /* CopyFiles */ = {
+			isa = PBXCopyFilesBuildPhase;
+			buildActionMask = 2147483647;
+			dstPath = /usr/share/man/man1/;
+			dstSubfolderSpec = 0;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 1;
+		};
+/* End PBXCopyFilesBuildPhase section */
+
+/* Begin PBXFileReference section */
+		0BF5DA421AC370840045EA34 /* Grid */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = Grid; sourceTree = BUILT_PRODUCTS_DIR; };
+		0BF5DA451AC370840045EA34 /* main.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = main.cpp; sourceTree = "<group>"; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+		0BF5DA3F1AC370840045EA34 /* Frameworks */ = {
+			isa = PBXFrameworksBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+		0BF5DA391AC370840045EA34 = {
+			isa = PBXGroup;
+			children = (
+				0BF5DA441AC370840045EA34 /* Grid */,
+				0BF5DA431AC370840045EA34 /* Products */,
+			);
+			sourceTree = "<group>";
+		};
+		0BF5DA431AC370840045EA34 /* Products */ = {
+			isa = PBXGroup;
+			children = (
+				0BF5DA421AC370840045EA34 /* Grid */,
+			);
+			name = Products;
+			sourceTree = "<group>";
+		};
+		0BF5DA441AC370840045EA34 /* Grid */ = {
+			isa = PBXGroup;
+			children = (
+				0BF5DA451AC370840045EA34 /* main.cpp */,
+			);
+			path = Grid;
+			sourceTree = "<group>";
+		};
+/* End PBXGroup section */
+
+/* Begin PBXNativeTarget section */
+		0BF5DA411AC370840045EA34 /* Grid */ = {
+			isa = PBXNativeTarget;
+			buildConfigurationList = 0BF5DA491AC370840045EA34 /* Build configuration list for PBXNativeTarget "Grid" */;
+			buildPhases = (
+				0BF5DA3E1AC370840045EA34 /* Sources */,
+				0BF5DA3F1AC370840045EA34 /* Frameworks */,
+				0BF5DA401AC370840045EA34 /* CopyFiles */,
+			);
+			buildRules = (
+			);
+			dependencies = (
+			);
+			name = Grid;
+			productName = Grid;
+			productReference = 0BF5DA421AC370840045EA34 /* Grid */;
+			productType = "com.apple.product-type.tool";
+		};
+/* End PBXNativeTarget section */
+
+/* Begin PBXProject section */
+		0BF5DA3A1AC370840045EA34 /* Project object */ = {
+			isa = PBXProject;
+			attributes = {
+				LastUpgradeCheck = 0610;
+				ORGANIZATIONNAME = "University of Edinburgh";
+				TargetAttributes = {
+					0BF5DA411AC370840045EA34 = {
+						CreatedOnToolsVersion = 6.1.1;
+					};
+				};
+			};
+			buildConfigurationList = 0BF5DA3D1AC370840045EA34 /* Build configuration list for PBXProject "Grid" */;
+			compatibilityVersion = "Xcode 3.2";
+			developmentRegion = English;
+			hasScannedForEncodings = 0;
+			knownRegions = (
+				en,
+			);
+			mainGroup = 0BF5DA391AC370840045EA34;
+			productRefGroup = 0BF5DA431AC370840045EA34 /* Products */;
+			projectDirPath = "";
+			projectRoot = "";
+			targets = (
+				0BF5DA411AC370840045EA34 /* Grid */,
+			);
+		};
+/* End PBXProject section */
+
+/* Begin PBXSourcesBuildPhase section */
+		0BF5DA3E1AC370840045EA34 /* Sources */ = {
+			isa = PBXSourcesBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+				0BF5DA461AC370840045EA34 /* main.cpp in Sources */,
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+		};
+/* End PBXSourcesBuildPhase section */
+
+/* Begin XCBuildConfiguration section */
+		0BF5DA471AC370840045EA34 /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x";
+				CLANG_CXX_LIBRARY = "libc++";
+				CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				COPY_PHASE_STRIP = NO;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
+				GCC_C_LANGUAGE_STANDARD = gnu99;
+				GCC_DYNAMIC_NO_PIC = NO;
+				GCC_OPTIMIZATION_LEVEL = 0;
+				GCC_PREPROCESSOR_DEFINITIONS = (
+					"DEBUG=1",
+					"$(inherited)",
+				);
+				GCC_SYMBOLS_PRIVATE_EXTERN = NO;
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				MACOSX_DEPLOYMENT_TARGET = 10.10;
+				MTL_ENABLE_DEBUG_INFO = YES;
+				ONLY_ACTIVE_ARCH = YES;
+				SDKROOT = macosx;
+			};
+			name = Debug;
+		};
+		0BF5DA481AC370840045EA34 /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				ALWAYS_SEARCH_USER_PATHS = NO;
+				CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x";
+				CLANG_CXX_LIBRARY = "libc++";
+				CLANG_ENABLE_MODULES = YES;
+				CLANG_ENABLE_OBJC_ARC = YES;
+				CLANG_WARN_BOOL_CONVERSION = YES;
+				CLANG_WARN_CONSTANT_CONVERSION = YES;
+				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+				CLANG_WARN_EMPTY_BODY = YES;
+				CLANG_WARN_ENUM_CONVERSION = YES;
+				CLANG_WARN_INT_CONVERSION = YES;
+				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+				CLANG_WARN_UNREACHABLE_CODE = YES;
+				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+				COPY_PHASE_STRIP = YES;
+				DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
+				ENABLE_NS_ASSERTIONS = NO;
+				ENABLE_STRICT_OBJC_MSGSEND = YES;
+				GCC_C_LANGUAGE_STANDARD = gnu99;
+				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+				GCC_WARN_UNDECLARED_SELECTOR = YES;
+				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+				GCC_WARN_UNUSED_FUNCTION = YES;
+				GCC_WARN_UNUSED_VARIABLE = YES;
+				MACOSX_DEPLOYMENT_TARGET = 10.10;
+				MTL_ENABLE_DEBUG_INFO = NO;
+				SDKROOT = macosx;
+			};
+			name = Release;
+		};
+		0BF5DA4A1AC370840045EA34 /* Debug */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				PRODUCT_NAME = "$(TARGET_NAME)";
+			};
+			name = Debug;
+		};
+		0BF5DA4B1AC370840045EA34 /* Release */ = {
+			isa = XCBuildConfiguration;
+			buildSettings = {
+				PRODUCT_NAME = "$(TARGET_NAME)";
+			};
+			name = Release;
+		};
+/* End XCBuildConfiguration section */
+
+/* Begin XCConfigurationList section */
+		0BF5DA3D1AC370840045EA34 /* Build configuration list for PBXProject "Grid" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				0BF5DA471AC370840045EA34 /* Debug */,
+				0BF5DA481AC370840045EA34 /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+			defaultConfigurationName = Release;
+		};
+		0BF5DA491AC370840045EA34 /* Build configuration list for PBXNativeTarget "Grid" */ = {
+			isa = XCConfigurationList;
+			buildConfigurations = (
+				0BF5DA4A1AC370840045EA34 /* Debug */,
+				0BF5DA4B1AC370840045EA34 /* Release */,
+			);
+			defaultConfigurationIsVisible = 0;
+		};
+/* End XCConfigurationList section */
+	};
+	rootObject = 0BF5DA3A1AC370840045EA34 /* Project object */;
+}
--- a/Grid/Grid.xcodeproj/project.xcworkspace/contents.xcworkspacedata
+++ b/Grid/Grid.xcodeproj/project.xcworkspace/contents.xcworkspacedata
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<Workspace
+   version = "1.0">
+   <FileRef
+      location = "self:Grid.xcodeproj">
+   </FileRef>
+</Workspace>
--- a/Grid/Grid.xcodeproj/project.xcworkspace/xcuserdata/pab.xcuserdatad/UserInterfaceState.xcuserstate
+++ b/Grid/Grid.xcodeproj/project.xcworkspace/xcuserdata/pab.xcuserdatad/UserInterfaceState.xcuserstate
--- a/Grid/Grid.xcodeproj/xcuserdata/pab.xcuserdatad/xcschemes/Grid.xcscheme
+++ b/Grid/Grid.xcodeproj/xcuserdata/pab.xcuserdatad/xcschemes/Grid.xcscheme
@@ -0,0 +1,43 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<Scheme
+   LastUpgradeVersion = "0610"
+   version = "1.3">
+   <BuildAction
+      parallelizeBuildables = "YES"
+      buildImplicitDependencies = "YES">
+   </BuildAction>
+   <TestAction
+      selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
+      selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
+      shouldUseLaunchSchemeArgsEnv = "YES"
+      buildConfiguration = "Debug">
+      <Testables>
+      </Testables>
+   </TestAction>
+   <LaunchAction
+      selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
+      selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
+      launchStyle = "0"
+      useCustomWorkingDirectory = "NO"
+      buildConfiguration = "Debug"
+      ignoresPersistentStateOnLaunch = "NO"
+      debugDocumentVersioning = "YES"
+      allowLocationSimulation = "YES">
+      <AdditionalOptions>
+      </AdditionalOptions>
+   </LaunchAction>
+   <ProfileAction
+      shouldUseLaunchSchemeArgsEnv = "YES"
+      savedToolIdentifier = ""
+      useCustomWorkingDirectory = "NO"
+      buildConfiguration = "Release"
+      debugDocumentVersioning = "YES">
+   </ProfileAction>
+   <AnalyzeAction
+      buildConfiguration = "Debug">
+   </AnalyzeAction>
+   <ArchiveAction
+      buildConfiguration = "Release"
+      revealArchiveInOrganizer = "YES">
+   </ArchiveAction>
+</Scheme>
--- a/Grid/Grid.xcodeproj/xcuserdata/pab.xcuserdatad/xcschemes/xcschememanagement.plist
+++ b/Grid/Grid.xcodeproj/xcuserdata/pab.xcuserdatad/xcschemes/xcschememanagement.plist
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>SchemeUserState</key>
+	<dict>
+		<key>Grid.xcscheme</key>
+		<dict>
+			<key>orderHint</key>
+			<integer>0</integer>
+		</dict>
+	</dict>
+	<key>SuppressBuildableAutocreation</key>
+	<dict>
+		<key>0BF5DA411AC370840045EA34</key>
+		<dict>
+			<key>primary</key>
+			<true/>
+		</dict>
+	</dict>
+</dict>
+</plist>
--- a/Grid_Cartesian.h
+++ b/Grid_Cartesian.h
@@ -1,10 +1,9 @@
 #ifndef GRID_CARTESIAN_H
 #define GRID_CARTESIAN_H

-#include "Grid.h"
-
+#include <Grid.h>
+#include <Grid_Communicator.h>
 namespace dpo{
-
    
 /////////////////////////////////////////////////////////////////////////////////////////
 // Grid Support. Following will go into Grid.h.
@@ -13,28 +12,47 @@ namespace dpo{
    // dpo::Grid
    // dpo::GridCartesian
    // dpo::GridCartesianRedBlack
-    
-class Grid {
+
+
+class SimdGrid : public CartesianCommunicator {
 public:
+
+  
+ SimdGrid(std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};
+
    // Give Lattice access
    template<class object> friend class Lattice;
-
        
 //protected:
        
-    // Lattice wide random support. not yet fully implemented. Need seed strategy
-    // and one generator per site.
-    //std::default_random_engine generator;
+  // Lattice wide random support. not yet fully implemented. Need seed strategy
+  // and one generator per site.
+  //std::default_random_engine generator;
  //    static std::mt19937  generator( 9 );
-
        
    // Grid information.
-    unsigned long _ndimension;
-    std::vector<int> _layout;     // Which dimensions get relayed out over simd lanes.
-    std::vector<int> _dimensions; // Dimensions of array
-    std::vector<int> _rdimensions;// Reduced dimensions with simd lane images removed
+
+    // Commicator provides
+    //    unsigned long _ndimension;
+    //    std::vector<int> _processors; // processor grid
+    //    int              _processor;  // linear processor rank
+    //    std::vector<int> _processor_coor;  // linear processor rank
+
+    std::vector<int> _simd_layout;     // Which dimensions get relayed out over simd lanes.
+
+
+    std::vector<int> _fdimensions;// Global dimensions of array with cb removed
+    std::vector<int> _gdimensions;// Global dimensions of array
+    std::vector<int> _ldimensions;// local dimensions of array with processor images removed
+    std::vector<int> _rdimensions;// Reduced local dimensions with simd lane images and processor images removed 
+
+    //    std::vector<int> _lstart;     // local start of array in gcoors. _processor_coor[d]*_ldimensions[d]
+    //    std::vector<int> _lend;       // local end of array in gcoors    _processor_coor[d]*_ldimensions[d]+_ldimensions_[d]-1
+
+ 
    std::vector<int> _ostride;    // Outer stride for each dimension
    std::vector<int> _istride;    // Inner stride i.e. within simd lane
+
    int _osites;                  // _isites*_osites = product(dimensions).
    int _isites;
        
@@ -70,15 +88,33 @@ public:
        for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(rcoor[d]/_rdimensions[d]);
        return idx;
    }
-        
+    
    inline int oSites(void) { return _osites; };
    inline int iSites(void) { return _isites; };
+
+    inline int CheckerboardFromOsite (int Osite){
+      std::vector<int> ocoor;
+      CoordFromOsite(ocoor,Osite);
+      int ss=0;
+      for(int d=0;d<_ndimension;d++){
+	ss=ss+ocoor[d];
+      }      
+      return ss&0x1;
+    }
+    inline void CoordFromOsite (std::vector<int>& coor,int Osite){
+      coor.resize(_ndimension);
+      for(int d=0;d<_ndimension;d++){
+	coor[d] = Osite % _rdimensions[d];
+	Osite   = Osite / _rdimensions[d];
+      }
+    }
+
    virtual int CheckerBoard(std::vector<int> site)=0;
    virtual int CheckerBoardDestination(int source_cb,int shift)=0;
-    virtual int CheckerBoardShift(int source_cb,int dim,int shift)=0;
+    virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
 };

-class GridCartesian: public Grid {
+class GridCartesian: public SimdGrid {
 public:
    virtual int CheckerBoard(std::vector<int> site){
        return 0;
@@ -86,19 +122,24 @@ public:
    virtual int CheckerBoardDestination(int cb,int shift){
        return 0;
    }
-    virtual int CheckerBoardShift(int source_cb,int dim,int shift){
+    virtual int CheckerBoardShift(int source_cb,int dim,int shift, int osite){
        return shift;
    }
-    GridCartesian(std::vector<int> &dimensions,std::vector<int> layout)
+    GridCartesian(std::vector<int> &dimensions,
+		  std::vector<int> &simd_layout,
+		  std::vector<int> &processor_grid
+		  ) : SimdGrid(processor_grid)
    {
        ///////////////////////
        // Grid information
        ///////////////////////
        _ndimension = dimensions.size();
            
-        _dimensions.resize(_ndimension);
+        _fdimensions.resize(_ndimension);
+        _gdimensions.resize(_ndimension);
+        _ldimensions.resize(_ndimension);
        _rdimensions.resize(_ndimension);
-        _layout.resize(_ndimension);
+        _simd_layout.resize(_ndimension);
            
        _ostride.resize(_ndimension);
        _istride.resize(_ndimension);
@@ -106,21 +147,26 @@ public:
        _osites = 1;
        _isites = 1;
        for(int d=0;d<_ndimension;d++){
-            _dimensions[d] = dimensions[d];
-            _layout[d]     = layout[d];
-            // Use a reduced simd grid
-            _rdimensions[d]= _dimensions[d]/_layout[d]; //<-- _layout[d] is zero
-            _osites *= _rdimensions[d];
-            _isites *= _layout[d];
+	  _fdimensions[d] = dimensions[d]; // Global dimensions
+	  _gdimensions[d] = _fdimensions[d]; // Global dimensions
+	  _simd_layout[d] = simd_layout[d];
+
+	  //FIXME check for exact division
+
+	  // Use a reduced simd grid
+	  _ldimensions[d]= _gdimensions[d]/_processors[d];  //local dimensions
+	  _rdimensions[d]= _ldimensions[d]/_simd_layout[d]; //overdecomposition
+	  _osites *= _rdimensions[d];
+	  _isites *= _simd_layout[d];
                
-            // Addressing support
-            if ( d==0 ) {
-                _ostride[d] = 1;
-                _istride[d] = 1;
-            } else {
-                _ostride[d] = _ostride[d-1]*_rdimensions[d-1];
-                _istride[d] = _istride[d-1]*_layout[d-1];
-            }
+	  // Addressing support
+	  if ( d==0 ) {
+	    _ostride[d] = 1;
+	    _istride[d] = 1;
+	  } else {
+	    _ostride[d] = _ostride[d-1]*_rdimensions[d-1];
+	    _istride[d] = _istride[d-1]*_simd_layout[d-1];
+	  }
        }
        
        ///////////////////////
@@ -148,46 +194,57 @@ public:
        }
    };
 };
-
+ 
 // Specialise this for red black grids storing half the data like a chess board.
-class GridRedBlackCartesian : public Grid
+class GridRedBlackCartesian : public SimdGrid
 {
 public:
    virtual int CheckerBoard(std::vector<int> site){
-        return site[0]&0x1;
+      return (site[0]+site[1]+site[2]+site[3])&0x1;
    }
-    virtual int CheckerBoardShift(int source_cb,int dim,int shift){
-        if ( dim == 0 ){
-            int fulldim =2*_dimensions[0];
-            shift = (shift+fulldim)%fulldim;
-            if ( source_cb ) {
-                // Shift 0,1 -> 0
-                return (shift+1)/2;
-            } else {
-                // Shift 0->0, 1 -> 1, 2->1
-                return (shift)/2;
-            }
-        } else {
-            return shift;
-        }
+
+    // Depending on the cb of site, we toggle source cb.
+    // for block #b, element #e = (b, e)
+    // we need 
+    virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite){
+
+      if(dim != 0) return shift;
+
+      int fulldim =_fdimensions[0];
+      shift = (shift+fulldim)%fulldim;
+
+      // Probably faster with table lookup;
+      // or by looping over x,y,z and multiply rather than computing checkerboard.
+      int ocb=CheckerboardFromOsite(osite);
+	  
+      if ( (source_cb+ocb)&1 ) {
+	return (shift)/2;
+      } else {
+	return (shift+1)/2;
+      }
    }
+
    virtual int CheckerBoardDestination(int source_cb,int shift){
-        if ((shift+2*_dimensions[0])&0x1) {
+        if ((shift+_fdimensions[0])&0x1) {
            return 1-source_cb;
        } else {
            return source_cb;
        }
    };
-    GridRedBlackCartesian(std::vector<int> &dimensions,std::vector<int> layout)
+    GridRedBlackCartesian(std::vector<int> &dimensions,
+			  std::vector<int> &simd_layout,
+			  std::vector<int> &processor_grid) : SimdGrid(processor_grid)
    {
    ///////////////////////
    // Grid information
    ///////////////////////
        _ndimension = dimensions.size();
        
-        _dimensions.resize(_ndimension);
+        _fdimensions.resize(_ndimension);
+        _gdimensions.resize(_ndimension);
+        _ldimensions.resize(_ndimension);
        _rdimensions.resize(_ndimension);
-        _layout.resize(_ndimension);
+        _simd_layout.resize(_ndimension);
        
        _ostride.resize(_ndimension);
        _istride.resize(_ndimension);
@@ -195,14 +252,17 @@ public:
        _osites = 1;
        _isites = 1;
        for(int d=0;d<_ndimension;d++){
-            _dimensions[d] = dimensions[d];
-            _dimensions[0] = dimensions[0]/2;
-            _layout[d]     = layout[d];
+            _fdimensions[d] = dimensions[d];
+            _gdimensions[d] = _fdimensions[d];
+            if (d==0) _gdimensions[0] = _gdimensions[0]/2; // Remove a checkerboard
+            _ldimensions[d] = _gdimensions[d]/_processors[d];
                
            // Use a reduced simd grid
-            _rdimensions[d]= _dimensions[d]/_layout[d];
+            _simd_layout[d] = simd_layout[d];
+            _rdimensions[d]= _ldimensions[d]/_simd_layout[d];
+
            _osites *= _rdimensions[d];
-            _isites *= _layout[d];
+            _isites *= _simd_layout[d];
                
            // Addressing support
            if ( d==0 ) {
@@ -210,7 +270,7 @@ public:
                _istride[d] = 1;
            } else {
                _ostride[d] = _ostride[d-1]*_rdimensions[d-1];
-                _istride[d] = _istride[d-1]*_layout[d-1];
+                _istride[d] = _istride[d-1]*_simd_layout[d-1];
            }
        }
            
--- a/Grid_Communicator.h
+++ b/Grid_Communicator.h
@@ -0,0 +1,39 @@
+#ifndef GRID_COMMUNICATOR_H
+#define GRID_COMMUNICATOR_H
+///////////////////////////////////
+// Processor layout information
+///////////////////////////////////
+namespace dpo {
+class CartesianCommunicator {
+  public:    
+
+  // Communicator should know nothing of the physics grid, only processor grid.
+
+    std::vector<int> _processors;      // Which dimensions get relayed out over processors lanes.
+    int              _processor;       // linear processor rank
+    std::vector<int> _processor_coor;  // linear processor coordinate
+    unsigned long _ndimension;
+
+#ifdef GRID_COMMS_MPI
+    MPI_Comm communicator;
+#endif
+
+    CartesianCommunicator(std::vector<int> &pdimensions_in);
+
+    int  Rank(std::vector<int> coor);
+    void GlobalSumF(float &);
+    void GlobalSumFVector(float *,int N);
+
+    void GlobalSumF(double &);
+    void GlobalSumFVector(double *,int N);
+
+    void SendToRecvFrom(void *xmit,
+			std::vector<int> to_coordinate,
+			void *recv,
+			std::vector<int> from_coordinate,
+			int bytes);
+    
+}; 
+}
+
+#endif
--- a/Grid_Lattice.h
+++ b/Grid_Lattice.h
@@ -3,19 +3,31 @@

 #include "Grid.h"

+
+
 namespace dpo {

+// Permute the pointers 32bitx16 = 512
+static int permute_map[4][16] = { 
+  { 1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14},
+  { 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13},
+  { 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11},
+  { 9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8}
+};
+
+
 template<class vobj>
 class Lattice
 {
 public:
-    Grid *_grid;
+    SimdGrid *_grid;
    int checkerboard;
    std::vector<vobj,alignedAllocator<vobj> > _odata;

 public:
-    
-    Lattice(Grid *grid) : _grid(grid) {
+
+
+    Lattice(SimdGrid *grid) : _grid(grid) {
        _odata.reserve(_grid->oSites());
        if ( ((uint64_t)&_odata[0])&0xF) {
            exit(-1);
@@ -23,6 +35,19 @@ public:
        checkerboard=0;
    }
    
+
+#ifdef GRID_COMMS_NONE
+#include <Grid_none_cshift.h>
+#endif
+
+#ifdef GRID_COMMS_FAKE
+#include <Grid_fake_cshift.h>
+#endif
+
+#ifdef GRID_COMMS_MPI
+#include <Grid_mpi_cshift.h>
+#endif 
+
    
    // overloading dpo::conformable but no conformable in dpo ...?:w
    template<class obj1,class obj2>
@@ -69,212 +94,6 @@ public:
        return ret;
    };

-#if 0 
-    // Collapse doesn't appear to work the way I think it should in icpc
-    friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
-    {
-        Lattice<vobj> ret(rhs._grid);
-        
-        ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
-        shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift);
-        int sx,so,o;
-        int rd = rhs._grid->_rdimensions[dimension];
-        int ld = rhs._grid->_dimensions[dimension];
-
-        // Map to always positive shift.
-        shift = (shift+ld)%ld;
-
-        // Work out whether to permute and the permute type
-        // ABCDEFGH ->   AE BF CG DH       permute
-        // Shift 0       AE BF CG DH       0 0 0 0    ABCDEFGH
-        // Shift 1       DH AE BF CG       1 0 0 0    HABCDEFG
-        // Shift 2       CG DH AE BF       1 1 0 0    GHABCDEF
-        // Shift 3       BF CG DH AE       1 1 1 0    FGHACBDE
-        // Shift 4       AE BF CG DH       1 1 1 1    EFGHABCD
-        // Shift 5       DH AE BF CG       0 1 1 1    DEFGHABC
-        // Shift 6       CG DH AE BF       0 0 1 1    CDEFGHAB
-        // Shift 7       BF CG DH AE       0 0 0 1    BCDEFGHA
-        int permute_dim =rhs._grid->_layout[dimension]>1 ;
-        int permute_type=0;
-        for(int d=0;d<dimension;d++)
-            if (rhs._grid->_layout[d]>1 ) permute_type++;
-
-        
-        // loop over perp slices.
-        // Threading considerations:
-        //   Need to map thread_num to
-        //
-        //               x_min,x_max for Loop-A
-        //               n_min,n_max for Loop-B
-        //               b_min,b_max for Loop-C
-        //  In a way that maximally load balances.
-        //
-        //  Optimal:
-        //      There are rd*n_block*block items of work.
-        //      These serialise as item "w"
-        //      b=w%block; w=w/block
-        //      n=w%nblock; x=w/nblock. Perhaps 20 cycles?
-        //
-        //  Logic:
-        //      x_chunk = (rd+thread)/nthreads simply divide work across nodes.
-        //
-        //      rd=5 , threads = 8;
-        //      0 1 2 3 4 5 6 7
-        //      0 0 0 1 1 1 1 1
-        for(int x=0;x<rd;x++){         // Loop A
-            sx = (x-shift+ld)%rd;
-            o  = x*rhs._grid->_ostride[dimension];
-            so =sx*rhs._grid->_ostride[dimension];
-
-            int permute_slice=0;
-            if ( permute_dim ) {
-                permute_slice = shift/rd;
-                if ( x<shift%rd ) permute_slice = 1-permute_slice;
-            }
-
-
-#if 0
-            if ( permute_slice ) {
-                
-                int internal=sizeof(vobj)/sizeof(vComplex);
-                int num =rhs._grid->_slice_block[dimension]*internal;
-                
-                for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
-                    vComplex *optr = (vComplex *)&ret._odata[o];
-                    vComplex *iptr = (vComplex *)&rhs._odata[so];
-                    for(int b=0;b<num;b++){
-                        permute(optr[b],iptr[b],permute_type);
-                    }
-                    o+=rhs._grid->_slice_stride[dimension];
-                    so+=rhs._grid->_slice_stride[dimension];
-                }
-            } else {
-                for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
-                    for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
-                        ret._odata[o+i]=rhs._odata[so+i];
-                    }
-                    o+=rhs._grid->_slice_stride[dimension];
-                    so+=rhs._grid->_slice_stride[dimension];
-                }
-            
-            }
-#else
-
-            if ( permute_slice ) {
-
-	        int internal=sizeof(vobj)/sizeof(vComplex);
-                int num =rhs._grid->_slice_block[dimension]*internal;
-
-
-#pragma omp parallel for collapse(2)
-                for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
-                    for(int b=0;b<num;b++){
-		      vComplex *optr = (vComplex *)&ret._odata[o +n*rhs._grid->_slice_stride[dimension]];
-		      vComplex *iptr = (vComplex *)&rhs._odata[so+n*rhs._grid->_slice_stride[dimension]];
-                      permute(optr[b],iptr[b],permute_type);
-                    }
-                }
-
-            } else {
-
-
-#pragma omp parallel for collapse(2)
-                for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
-                    for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
-		      int oo = o+ n*rhs._grid->_slice_stride[dimension];
-		      int soo=so+ n*rhs._grid->_slice_stride[dimension];
-		      ret._odata[oo+i]=rhs._odata[soo+i];
-                    }
-                }
-            
-            }
-#endif
-        }
-        return ret;
-    }
-#else
-    friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
-    {
-        Lattice<vobj> ret(rhs._grid);
-        
-        ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
-        shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift);
-        int rd = rhs._grid->_rdimensions[dimension];
-        int ld = rhs._grid->_dimensions[dimension];
-        
-        // Map to always positive shift.
-        shift = (shift+ld)%ld;
-        
-        // Work out whether to permute and the permute type
-        // ABCDEFGH ->   AE BF CG DH       permute
-        // Shift 0       AE BF CG DH       0 0 0 0    ABCDEFGH
-        // Shift 1       DH AE BF CG       1 0 0 0    HABCDEFG
-        // Shift 2       CG DH AE BF       1 1 0 0    GHABCDEF
-        // Shift 3       BF CG DH AE       1 1 1 0    FGHACBDE
-        // Shift 4       AE BF CG DH       1 1 1 1    EFGHABCD
-        // Shift 5       DH AE BF CG       0 1 1 1    DEFGHABC
-        // Shift 6       CG DH AE BF       0 0 1 1    CDEFGHAB
-        // Shift 7       BF CG DH AE       0 0 0 1    BCDEFGHA
-        int permute_dim =rhs._grid->_layout[dimension]>1 ;
-        int permute_type=0;
-        for(int d=0;d<dimension;d++)
-            if (rhs._grid->_layout[d]>1 ) permute_type++;
-        
-        
-        // loop over all work
-        int work =rd*rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
-
-#pragma omp parallel for
-        for(int ww=0;ww<work;ww++){
-
-            
-            // can optimise this if know w moves congtiguously for a given thread.
-            // b=(b+1);
-            // if (b==_slice_block) {b=0; n=n+1;}
-            // if (n==_slice_nblock) { n=0; x=x+1}
-            //
-            // Perhaps a five cycle iterator, or so.
-            int w=ww;
-            int b = w%rhs._grid->_slice_block[dimension] ; w=w/rhs._grid->_slice_block[dimension];
-            int n = w%rhs._grid->_slice_nblock[dimension]; w=w/rhs._grid->_slice_nblock[dimension];
-            int x = w;
-
-	    int sx,so,o;
-            sx = (x-shift+ld)%rd;
-            o  = x*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension]; // common sub expression alert.
-            so =sx*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension];
-            
-            int permute_slice=0;
-            if ( permute_dim ) {
-                permute_slice = shift/rd;
-                if ( x<shift%rd ) permute_slice = 1-permute_slice;
-            }
-            
-            if ( permute_slice ) {
-                
-                int internal=sizeof(vobj)/sizeof(vComplex);
-                vComplex *optr = (vComplex *)&ret._odata[o+b];
-                vComplex *iptr = (vComplex *)&rhs._odata[so+b];
-		const char *pf = (const char *)iptr;
-		for(int i=0;i<sizeof(vobj);i+=64){
-		  _mm_prefetch(pf+i,_MM_HINT_T0);
-		}
-
-                for(int i=0;i<internal;i++){
-                    permute(optr[i],iptr[i],permute_type);
-                }
-            } else {
-	        const char *pf = (const char *) &rhs._odata[so+b];
-		for(int i=0;i<sizeof(vobj);i+=64){
-		  _mm_prefetch(pf+i,_MM_HINT_T0);
-		}
-                ret._odata[o+b]=rhs._odata[so+b];
-            }
-        }
-        return ret;
-    }
-#endif
-
    template<class sobj>
    inline Lattice<vobj> & operator = (const sobj & r){
 #pragma omp parallel for
@@ -288,14 +107,15 @@ public:
    template<class sobj>
    friend void pokeSite(const sobj &s,Lattice<vobj> &l,std::vector<int> &site){

-        if ( l._grid.checkerboard != l._grid->Checkerboard(site)){
-            printf("Poking wrong checkerboard\n");
-            exit(EXIT_FAILURE);
+        if ( l.checkerboard != l._grid->CheckerBoard(site)){
+	  printf("Poking wrong checkerboard\n");
+	  exit(EXIT_FAILURE);
        }

        int o_index = l._grid->oIndex(site);
        int i_index = l._grid->iIndex(site);
        
+	// BUGGY. This assumes complex real
        Real *v_ptr = (Real *)&l._odata[o_index];
        Real *s_ptr = (Real *)&s;
        v_ptr = v_ptr + 2*i_index;
@@ -348,6 +168,22 @@ public:
        }
    };

+    friend void lex_sites(Lattice<vobj> &l){
+
+        Real *v_ptr = (Real *)&l._odata[0];
+	size_t o_len = l._grid->oSites();
+        size_t v_len = sizeof(vobj)/sizeof(vRealF);
+	size_t vec_len = vRealF::Nsimd();
+
+        for(int i=0;i<o_len;i++){
+        for(int j=0;j<v_len;j++){
+	  for(int vv=0;vv<vec_len;vv+=2){
+            v_ptr[i*v_len*vec_len+j*vec_len+vv  ]= i+vv*500;
+            v_ptr[i*v_len*vec_len+j*vec_len+vv+1]= i+vv*500;
+	  }
+        }}
+    };
+
    friend void gaussian(Lattice<vobj> &l){
        // Zero mean, unit variance.
        std::normal_distribution<double> distribution(0.0,1.0);
@@ -427,22 +263,41 @@ public:
        return ret;
    };

-
-
    // remove and insert a half checkerboard
    friend void pickCheckerboard(int cb,Lattice<vobj> &half,const Lattice<vobj> &full){
+      half.checkerboard = cb;
+      int ssh=0;
 #pragma omp parallel for
-        for(int ss=0;ss<half._grid->oSites();ss++){
-            half._odata[ss] = full._odata[ss*2+cb];
-        }
-        half.checkerboard = cb;
+      for(int ss=0;ss<full._grid->oSites();ss++){
+	std::vector<int> coor;
+	int cbos;
+	
+	full._grid->CoordFromOsite(coor,ss);
+	cbos=half._grid->CheckerBoard(coor);
+	
+	if (cbos==cb) {
+
+	  half._odata[ssh] = full._odata[ss];
+	  ssh++;
+	}
+      }
    }
    friend void setCheckerboard(Lattice<vobj> &full,const Lattice<vobj> &half){
-        int cb = half.checkerboard;
+      int cb = half.checkerboard;
+      int ssh=0;
 #pragma omp parallel for
-        for(int ss=0;ss<half._grid->oSites();ss++){
-            full._odata[ss*2+cb]=half._odata[ss];
-        }
+      for(int ss=0;ss<full._grid->oSites();ss++){
+	std::vector<int> coor;
+	int cbos;
+	
+	full._grid->CoordFromOsite(coor,ss);
+	cbos=half._grid->CheckerBoard(coor);
+
+	if (cbos==cb) {
+	  full._odata[ss]=half._odata[ssh];
+	  ssh++;
+	}
+      }
    }
 }; // class Lattice

--- a/Grid_QCD.h
+++ b/Grid_QCD.h
@@ -5,7 +5,6 @@ namespace QCD {

    static const int Nc=3;
    static const int Ns=4;
-
    static const int CbRed  =0;
    static const int CbBlack=1;

--- a/Grid_config.h
+++ b/Grid_config.h
@@ -10,6 +10,15 @@
 /* AVX512 */
 /* #undef AVX512 */

+/* GRID_COMMS_FAKE */
+#define GRID_COMMS_FAKE 1
+
+/* GRID_COMMS_MPI */
+/* #undef GRID_COMMS_MPI */
+
+/* GRID_COMMS_NONE */
+/* #undef GRID_COMMS_NONE */
+
 /* Define to 1 if you have the `gettimeofday' function. */
 #define HAVE_GETTIMEOFDAY 1

@@ -61,6 +70,9 @@
 /* Define to the one symbol short name of this package. */
 #define PACKAGE_TARNAME "grid"

+/* Define to the home page for this package. */
+#define PACKAGE_URL ""
+
 /* Define to the version of this package. */
 #define PACKAGE_VERSION "1.0"

--- a/Grid_fake.cc
+++ b/Grid_fake.cc
@@ -0,0 +1,34 @@
+#include "Grid.h"
+namespace dpo {
+
+CartesianCommunicator::CartesianCommunicator(std::vector<int> &processors)
+{
+  _ndimension = _processors.size();
+  _processor_coor.resize(_ndimension);
+  _processors = processors;
+  
+  // Require 1^N processor grid for fake
+  for(int d=0;d<_ndimension;d++) if(_processors[d]!=1) exit(-1);
+
+  _processor = 0;// I am the one. The only one..
+  for(int d=0;d<_ndimension;d++) _processor_coor[d] = 0;
+}
+
+void CartesianCommunicator::GlobalSumF(float &){}
+void CartesianCommunicator::GlobalSumFVector(float *,int N){}
+void CartesianCommunicator::GlobalSumF(double &){}
+void CartesianCommunicator::GlobalSumFVector(double *,int N){}
+
+// Basic Halo comms primitive
+void CartesianCommunicator::SendToRecvFrom(void *xmit,
+		    std::vector<int> to_coordinate,
+		    void *recv,
+		    std::vector<int> from_coordinate,
+		    int bytes)
+{
+  exit(-1);
+  bcopy(xmit,recv,bytes);
+}
+
+}
+
--- a/Grid_fake_cshift.h
+++ b/Grid_fake_cshift.h
@@ -0,0 +1,246 @@
+#ifndef _GRID_FAKE_H_
+#define _GRID_FAKE_H_
+
+      
+
+friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
+{
+  typedef typename vobj::vector_type vector_type;
+  typedef typename vobj::scalar_type scalar_type;
+  const int Nsimd = vector_type::Nsimd();
+
+  Lattice<vobj> ret(rhs._grid);
+  
+  int fd = rhs._grid->_fdimensions[dimension];
+  int rd = rhs._grid->_rdimensions[dimension];
+  //int ld = rhs._grid->_ldimensions[dimension];
+  //int gd = rhs._grid->_gdimensions[dimension];
+  
+
+  // Map to always positive shift modulo global full dimension.
+  shift = (shift+fd)%fd;
+
+  ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
+        
+  // the permute type
+
+  int permute_dim =rhs._grid->_simd_layout[dimension]>1 ;
+  int permute_type=0;
+  for(int d=0;d<dimension;d++){
+    if (rhs._grid->_simd_layout[d]>1 ) permute_type++;
+  }
+
+  ///////////////////////////////////////////////
+  // Move via a fake comms buffer
+  // Simd direction uses an extract/merge pair
+  ///////////////////////////////////////////////
+  int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
+  int words = sizeof(vobj)/sizeof(vector_type);
+
+  std::vector<vobj,alignedAllocator<vobj> > comm_buf(buffer_size);
+  std::vector<std::vector<scalar_type> > comm_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
+  std::vector<scalar_type *> pointers(Nsimd);
+
+  for(int x=0;x<rd;x++){       
+
+    for(int i=0;i<vobj::vector_type::Nsimd();i++){
+      pointers[i] = (scalar_type *)&comm_buf_extract[i][0];
+    }
+
+    int ro  = x*rhs._grid->_ostride[dimension]; // base offset for result
+
+    if ( permute_dim ) {
+
+      int o   = 0;                                // relative offset to base
+      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
+	for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
+	  
+	  int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
+	  
+	  int sx = (x+sshift)%rd;
+	  
+	  // base offset for source
+	  int so = sx*rhs._grid->_ostride[dimension];
+
+	  int permute_slice=0;
+	  int wrap = sshift/rd;
+	  int  num = sshift%rd;
+	    
+	  if ( x< rd-num ) permute_slice=wrap;
+	  else permute_slice = 1-wrap;
+
+	  if ( permute_slice ) {
+	    extract(rhs._odata[so+o+b],pointers);
+	  } else {
+	    ret._odata[ro+o+b]=rhs._odata[so+o+b];
+	  }
+
+	}
+	o +=rhs._grid->_slice_stride[dimension];
+      }
+
+      for(int i=0;i<vobj::vector_type::Nsimd();i++){
+	pointers[i] = (scalar_type *)&comm_buf_extract[permute_map[permute_type][i]][0];
+      }
+
+      o   = 0;                                // relative offset to base
+      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
+	for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
+	  
+	  int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
+	  int sx = (x+sshift)%rd;
+	  
+	  // base offset for source
+	  int so = sx*rhs._grid->_ostride[dimension];
+
+	  int permute_slice=0;
+	  int wrap = sshift/rd;
+	  int  num = sshift%rd;
+	    
+	  if ( x< rd-num ) permute_slice=wrap;
+	  else permute_slice = 1-wrap;
+
+	  if ( permute_slice ) {
+	    merge(ret._odata[ro+o+b],pointers);
+	  }
+	}
+	o +=rhs._grid->_slice_stride[dimension];
+      }
+
+    } else {
+
+      int co; // comm offset
+      int o;
+
+      co=0; o=0;
+      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
+	for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
+
+	  // This call in inner loop is annoying but necessary for dimension=0
+	  // in the case of RedBlack grids. Could optimise away with 
+	  // alternate code paths for all other cases.
+	  int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
+	  int sx = (x+sshift)%rd;
+	  int so = sx*rhs._grid->_ostride[dimension];
+
+	  comm_buf[co++]=rhs._odata[so+o+b];
+
+	}
+	o +=rhs._grid->_slice_stride[dimension];
+      }
+
+      // Step through a copy into a comms buffer and pull back in.
+      // Genuine fake implementation could calculate if loops back
+      co=0; o=0;
+      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
+	for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
+	  ret._odata[ro+o+b]=comm_buf[co++];
+	}
+	o +=rhs._grid->_slice_stride[dimension];
+      }
+    }
+  }
+  return ret;
+}
+
+/*
+    friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
+    {
+      Lattice<vobj> ret(rhs._grid);
+        
+        int rd = rhs._grid->_rdimensions[dimension];
+        int ld = rhs._grid->_ldimensions[dimension];
+        int gd = rhs._grid->_gdimensions[dimension];
+        
+        // Map to always positive shift.
+        shift = (shift+gd)%gd;
+
+        ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
+        shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift);
+        
+        // Work out whether to permute and the permute type
+        // ABCDEFGH ->   AE BF CG DH       permute
+        // Shift 0       AE BF CG DH       0 0 0 0    ABCDEFGH
+        // Shift 1       BF CG DH AE       0 0 0 1    BCDEFGHA
+        // Shift 2       CG DH AE BF       0 0 1 1    CDEFGHAB
+        // Shift 3       DH AE BF CG       0 1 1 1    DEFGHABC
+        // Shift 4       AE BF CG DH       1 1 1 1    EFGHABCD
+        // Shift 5       BF CG DH AE       1 1 1 0    FGHACBDE
+        // Shift 6       CG DH AE BF       1 1 0 0    GHABCDEF
+        // Shift 7       DH AE BF CG       1 0 0 0    HABCDEFG
+
+        int permute_dim =rhs._grid->_simd_layout[dimension]>1 ;
+        int permute_type=0;
+        for(int d=0;d<dimension;d++)
+            if (rhs._grid->_simd_layout[d]>1 ) permute_type++;
+        
+        
+        // loop over all work
+        int work =rd*rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
+
+	// Packed gather sequence is clean
+	int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
+	
+	typedef typename vobj::scalar_type scalar_t;
+	typedef typename vobj::vector_type vector_t;
+	const int ns=sizeof(vobj)/sizeof(scalar_t);
+	const int nv=sizeof(vobj)/sizeof(vector_t);
+	std::vector<vobj,alignedAllocator<vobj> > comm_buf(buffer_size);
+
+        for(int x=0;x<rd;x++){       
+	  
+	  int sx = (x+shift)%rd;
+	  int o  = x*rhs._grid->_ostride[dimension];
+	  int so =sx*rhs._grid->_ostride[dimension];
+
+	
+	  int permute_slice=0;
+	  if ( permute_dim ) {
+	    permute_slice = shift/rd;
+	    if ( x<shift%rd ) permute_slice = 1-permute_slice;
+	  }
+
+	  if ( permute_slice ) {
+	    exit(0);
+	    // For fake communication ALWAYS extract and either merge one way or other
+	    scalar_t * bptr = (scalar_t *) &comm_buf[0];
+
+	    int bo=0;
+	    for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
+	    
+	      vector_t *optr = (vector_t *)&ret._odata[o];
+	      vector_t *iptr = (vector_t *)&rhs._odata[so];
+	      int skew    = buffer_size*ns/2;
+
+	      for(int b=0;b<rhs._grid->_slice_block[dimension];b++){ 
+		for(int n=0;n<nv;n++){// number of simd vecsscalars in a vector
+		  extract(iptr[b*nv+n],&bptr[n],skew,permute_type);
+		}
+	      }
+	      o+=rhs._grid->_slice_stride[dimension];
+	      //	      bo+=rhs._grid->_slice_stride[dimension]*ns/2;
+
+	    }
+
+	  } else {
+	    int bo=0;
+	    for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
+	      for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
+		comm_buf[bo++] =rhs._odata[so+i];
+	      }
+	      so+=rhs._grid->_slice_stride[dimension];
+	    }
+	    bo=0;
+	    for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
+	      for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
+		ret._odata[o+i]=comm_buf[bo++];
+	      }
+	      o+=rhs._grid->_slice_stride[dimension];
+	    }
+	  }
+	}
+        return ret;
+    };
+*/
+
+#endif
--- a/Grid_init.cc
+++ b/Grid_init.cc
@@ -28,13 +28,13 @@ double usecond(void) {

 void Grid_sa_signal_handler(int sig,siginfo_t *si,void * ptr)
 {
-         ucontext_t * uc= (ucontext_t *)ptr;
-
  printf("Caught signal %d\n",si->si_signo);
  printf("  mem address %llx\n",(uint64_t)si->si_addr);
  printf("         code %d\n",si->si_code);

 #ifdef __X86_64
+    ucontext_t * uc= (ucontext_t *)ptr;
+
  struct sigcontext *sc = (struct sigcontext *)&uc->uc_mcontext;
  printf("  instruction %llx\n",(uint64_t)sc->rip);

--- a/Grid_main.cc
+++ b/Grid_main.cc
@@ -14,7 +14,7 @@ using namespace dpo::QCD;
 int main (int argc, char ** argv)
 {

-#ifdef KNL
+#if 0
  struct sigaction sa,osa;
  sigemptyset (&sa.sa_mask);
  sa.sa_sigaction= sa_action;
@@ -27,6 +27,9 @@ int main (int argc, char ** argv)
    std::vector<int> latt_size(4);
    std::vector<int> simd_layout(4);

+    std::vector<int> mpi_layout(4);
+    for(int d=0;d<4;d++) mpi_layout[d]=1;
+
 #ifdef AVX512
 for(int omp=128;omp<236;omp+=16){
 #else
@@ -37,7 +40,7 @@ int main (int argc, char ** argv)
   omp_set_num_threads(omp);
 #endif 

-  for(int lat=16;lat<=20;lat+=4){
+  for(int lat=4;lat<=16;lat+=40){
    latt_size[0] = lat;
    latt_size[1] = lat;
    latt_size[2] = lat;
@@ -62,11 +65,9 @@ int main (int argc, char ** argv)
    simd_layout[2] = 1;
    simd_layout[3] = 2;
 #endif
-
    
-    GridCartesian Fine(latt_size,simd_layout);
-    GridRedBlackCartesian rbFine(latt_size,simd_layout);
-
+    GridCartesian Fine(latt_size,simd_layout,mpi_layout);
+    GridRedBlackCartesian rbFine(latt_size,simd_layout,mpi_layout);
    
    LatticeColourMatrix Foo(&Fine);
    LatticeColourMatrix Bar(&Fine);
@@ -152,19 +153,23 @@ int main (int argc, char ** argv)
    trscMat = trace(scMat); // Trace
    
    FooBar = Bar;
-    int shift=1;
-    int dir=0;
 
+    /*
+    { 
+      std::vector<int> coor(4);
+      for(int d=0;d<4;d++) coor[d] = 0;
+      peekSite(cmat,Foo,coor);
+      Foo = zero;
+      pokeSite(cmat,Foo,coor);
+    }
    random(Foo);
-    pickCheckerboard(CbRed,rFoo,Foo);    // Pick out red or black checkerboards
-    pickCheckerboard(CbBlack,bFoo,Foo);
+    */
+    lex_sites(Foo);
+
+
+    //setCheckerboard(ShiftedCheck,rFoo); 
+    //setCheckerboard(ShiftedCheck,bFoo); 

-    Shifted  = Cshift(Foo,dir,shift);    // Shift everything
-    bShifted = Cshift(rFoo,dir,shift);   // Shift red
-    rShifted = Cshift(bFoo,dir,shift);   // Shift black
-    
-    setCheckerboard(ShiftedCheck,bShifted); // Put them all together
-    setCheckerboard(ShiftedCheck,rShifted); // and check the results (later)

    // Lattice SU(3) x SU(3)
    FooBar = Foo * Bar;
@@ -211,25 +216,69 @@ int main (int argc, char ** argv)
    printf("Cshift Mult: NumThread %d , Lattice size %d , %f Mflop/s\n",omp,lat,flops/(t1-t0));
    printf("Cshift Mult: NumThread %d , Lattice size %d , %f MB/s\n",omp,lat,bytes/(t1-t0));

-    pickCheckerboard(0,rFoo,FooBar);
-    pickCheckerboard(1,bFoo,FooBar);
-    
-    setCheckerboard(FooBar,rFoo);
-    setCheckerboard(FooBar,bFoo);
+    //    pickCheckerboard(0,rFoo,FooBar);
+    //    pickCheckerboard(1,bFoo,FooBar);
+    //    setCheckerboard(FooBar,rFoo);
+    //    setCheckerboard(FooBar,bFoo);
    
    double nrm=0;
+
+    for(int dir=0;dir<4;dir++){
+      for(int shift=0;shift<latt_size[dir];shift++){
+
+
+
+	pickCheckerboard(0,rFoo,Foo);    // Pick out red or black checkerboards
+	pickCheckerboard(1,bFoo,Foo);
    
-    std::vector<int> coor(4);
-    for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){
-    for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
-    for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
-    for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
+	std::cout << "Shifting both parities by "<< shift <<" direction "<< dir <<std::endl;
+	Shifted  = Cshift(Foo,dir,shift);    // Shift everything
+
+	std::cout << "Shifting even parities"<<std::endl;
+	bShifted = Cshift(rFoo,dir,shift);   // Shift red->black
+	std::cout << "Shifting odd parities"<<std::endl;
+	rShifted = Cshift(bFoo,dir,shift);   // Shift black->red
+    
+	ShiftedCheck=zero;
+	setCheckerboard(ShiftedCheck,bShifted); // Put them all together
+	setCheckerboard(ShiftedCheck,rShifted); // and check the results (later)
+    
+	// Check results
+	std::vector<int> coor(4);
+	for(coor[3]=0;coor[3]<latt_size[3];coor[3]++){
+	for(coor[2]=0;coor[2]<latt_size[2];coor[2]++){
+	for(coor[1]=0;coor[1]<latt_size[1];coor[1]++){
+	for(coor[0]=0;coor[0]<latt_size[0];coor[0]++){
 
        std::complex<dpo::Real> diff;
                    
        std::vector<int> shiftcoor = coor;
-        shiftcoor[dir]=(shiftcoor[dir]-shift+latt_size[dir])%latt_size[dir];
-                    
+        shiftcoor[dir]=(shiftcoor[dir]+shift+latt_size[dir])%latt_size[dir];
+
+	std::vector<int> rl(4);
+	for(int dd=0;dd<4;dd++){
+	  rl[dd] = latt_size[dd]/simd_layout[dd];
+	}
+	int lex =  coor[0]%rl[0]
+	  + (coor[1]%rl[1])*rl[0]
+	  + (coor[2]%rl[2])*rl[0]*rl[1]
+	  + (coor[3]%rl[3])*rl[0]*rl[1]*rl[2];
+	lex += 
+	  +1000*(coor[0]/rl[0])
+	  +1000*(coor[1]/rl[1])*simd_layout[0]
+	  +1000*(coor[2]/rl[2])*simd_layout[0]*simd_layout[1]
+	  +1000*(coor[3]/rl[3])*simd_layout[0]*simd_layout[1]*simd_layout[2];
+
+	int lex_coor = shiftcoor[0]%rl[0]
+	  + (shiftcoor[1]%rl[1])*rl[0]
+	  + (shiftcoor[2]%rl[2])*rl[0]*rl[1]
+	  + (shiftcoor[3]%rl[3])*rl[0]*rl[1]*rl[2];
+	lex_coor += 
+	  +1000*(shiftcoor[0]/rl[0])
+	  +1000*(shiftcoor[1]/rl[1])*simd_layout[0]
+	  +1000*(shiftcoor[2]/rl[2])*simd_layout[0]*simd_layout[1]
+	  +1000*(shiftcoor[3]/rl[3])*simd_layout[0]*simd_layout[1]*simd_layout[2];
+
        ColourMatrix foo;
        ColourMatrix bar;
        ColourMatrix shifted1;
@@ -242,7 +291,6 @@ int main (int argc, char ** argv)
        peekSite(shifted1,Shifted,coor);
        peekSite(shifted2,Foo,shiftcoor);
        peekSite(shifted3,ShiftedCheck,coor);
-       
        peekSite(foo,Foo,coor);
        
        mdiff = shifted1-shifted2;
@@ -258,13 +306,13 @@ int main (int argc, char ** argv)
            diff =shifted1._internal._internal[r][c]-shifted2._internal._internal[r][c];
            nn=real(conj(diff)*diff);
            if ( nn > 0 )
-                cout<<"Shift fail "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
+                cout<<"Shift fail (shifted1/shifted2-ref) "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
                    <<shifted1._internal._internal[r][c]<<" "<<shifted2._internal._internal[r][c]
-                    << " "<< foo._internal._internal[r][c]<<endl;
+                    << " "<< foo._internal._internal[r][c]<< " lex expect " << lex_coor << " lex "<<lex<<endl;
            else if(0)
-                cout<<"Shift pass "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
+                cout<<"Shift pass 1vs2 "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
                    <<shifted1._internal._internal[r][c]<<" "<<shifted2._internal._internal[r][c]
-                    << " "<< foo._internal._internal[r][c]<<endl;
+                    << " "<< foo._internal._internal[r][c]<< " lex expect " << lex_coor << " lex "<<lex<<endl;
        }}
        
        for(int r=0;r<3;r++){
@@ -272,13 +320,13 @@ int main (int argc, char ** argv)
            diff =shifted3._internal._internal[r][c]-shifted2._internal._internal[r][c];
            nn=real(conj(diff)*diff);
            if ( nn > 0 )
-                cout<<"Shift rb fail "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
+                cout<<"Shift rb fail (shifted3/shifted2-ref) "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
                <<shifted3._internal._internal[r][c]<<" "<<shifted2._internal._internal[r][c]
-                << " "<< foo._internal._internal[r][c]<<endl;
+                << " "<< foo._internal._internal[r][c]<< " lex expect " << lex_coor << " lex "<<lex<<endl;
            else if(0)
-                cout<<"Shift rb pass "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
+                cout<<"Shift rb pass 3vs2 "<<coor[0]<<coor[1]<<coor[2]<<coor[3] <<" "
                <<shifted3._internal._internal[r][c]<<" "<<shifted2._internal._internal[r][c]
-                << " "<< foo._internal._internal[r][c]<<endl;
+                << " "<< foo._internal._internal[r][c]<< " lex expect " << lex_coor << " lex "<<lex<<endl;
        }}
        peekSite(bar,Bar,coor);
                    
@@ -290,7 +338,7 @@ int main (int argc, char ** argv)
            nrm = nrm + real(conj(diff)*diff);
        }}
    }}}}
-
+      }}
    std::cout << "LatticeColorMatrix * LatticeColorMatrix nrm diff = "<<nrm<<std::endl;

   } // loop for lat
--- a/Grid_math_types.h
+++ b/Grid_math_types.h
@@ -1,6 +1,63 @@
 #ifndef GRID_MATH_TYPES_H
 #define GRID_MATH_TYPES_H
 namespace dpo {
+
+
+
+//////////////////////////////////////////////////////////////////////////////////
+// Want to recurse: GridTypeMapper<Matrix<vComplexD> >::scalar_type == ComplexD.
+//////////////////////////////////////////////////////////////////////////////////
+
+  template <class T> class GridTypeMapper {
+  public:
+    typedef typename T::scalar_type scalar_type;
+    typedef typename T::vector_type vector_type;
+  };
+
+  template<> class GridTypeMapper<RealF> {
+  public:
+    typedef RealF scalar_type;
+    typedef RealF vector_type;
+  };
+  template<> class GridTypeMapper<RealD> {
+  public:
+    typedef RealD scalar_type;
+    typedef RealD vector_type;
+  };
+  template<> class GridTypeMapper<ComplexF> {
+  public:
+    typedef ComplexF scalar_type;
+    typedef ComplexF vector_type;
+  };
+  template<> class GridTypeMapper<ComplexD> {
+  public:
+    typedef ComplexD scalar_type;
+    typedef ComplexD vector_type;
+  };
+
+  template<> class GridTypeMapper<vRealF> {
+  public:
+    typedef RealF  scalar_type;
+    typedef vRealF vector_type;
+  };
+  template<> class GridTypeMapper<vRealD> {
+  public:
+    typedef RealD  scalar_type;
+    typedef vRealD vector_type;
+  };
+  template<> class GridTypeMapper<vComplexF> {
+  public:
+    typedef ComplexF  scalar_type;
+    typedef vComplexF vector_type;
+  };
+  template<> class GridTypeMapper<vComplexD> {
+  public:
+    typedef ComplexD  scalar_type;
+    typedef vComplexD vector_type;
+  };
+
+
+
 ///////////////////////////////////////////////////
 // Scalar, Vector, Matrix objects.
 // These can be composed to form tensor products of internal indices.
@@ -10,6 +67,10 @@ template<class vtype> class iScalar
 {
 public:
  SIMDalign vtype _internal;
+
+  typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
+  typedef typename GridTypeMapper<vtype>::vector_type vector_type;
+
    iScalar(){};
    iScalar(Zero &z){ *this = zero; };
    iScalar<vtype> & operator= (const Zero &hero){
@@ -19,6 +80,15 @@ public:
    friend void zeroit(iScalar<vtype> &that){
        zeroit(that._internal);
    }
+    friend void permute(iScalar<vtype> &out,iScalar<vtype> &in,int permutetype){
+      permute(out._internal,in._internal,permutetype);
+    }
+    friend void extract(iScalar<vtype> &in,std::vector<scalar_type *> &out){
+      extract(in._internal,out); // extract advances the pointers in out
+    }
+    friend void merge(iScalar<vtype> &in,std::vector<scalar_type *> &out){
+      merge(in._internal,out); // extract advances the pointers in out
+    }
    // Unary negation
    friend inline iScalar<vtype> operator -(const iScalar<vtype> &r) {
        iScalar<vtype> ret;
@@ -38,14 +108,16 @@ public:
        *this = (*this)+r;
        return *this;
    }
-    
-
 };
    
 template<class vtype,int N> class iVector
 {
 public:
  SIMDalign vtype _internal[N];
+
+  typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
+  typedef typename GridTypeMapper<vtype>::vector_type vector_type;
+
    iVector(Zero &z){ *this = zero; };
    iVector() {};
    iVector<vtype,N> & operator= (Zero &hero){
@@ -57,6 +129,21 @@ public:
            zeroit(that._internal[i]);
        }
    }
+    friend void permute(iVector<vtype,N> &out,iVector<vtype,N> &in,int permutetype){
+      for(int i=0;i<N;i++){
+	permute(out._internal[i],in._internal[i],permutetype);
+      }
+    }
+    friend void extract(iVector<vtype,N> &in,std::vector<scalar_type *> &out){
+      for(int i=0;i<N;i++){
+	extract(in._internal[i],out);// extract advances pointers in out
+      }
+    }
+    friend void merge(iVector<vtype,N> &in,std::vector<scalar_type *> &out){
+      for(int i=0;i<N;i++){
+	merge(in._internal[i],out);// extract advances pointers in out
+      }
+    }
    // Unary negation
    friend inline iVector<vtype,N> operator -(const iVector<vtype,N> &r) {
        iVector<vtype,N> ret;
@@ -84,6 +171,10 @@ template<class vtype,int N> class iMatrix
 {
 public:
  SIMDalign    vtype _internal[N][N];
+
+  typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
+  typedef typename GridTypeMapper<vtype>::vector_type vector_type;
+
    iMatrix(Zero &z){ *this = zero; };
    iMatrix() {};
    iMatrix<vtype,N> & operator= (Zero &hero){
@@ -96,6 +187,24 @@ public:
                zeroit(that._internal[i][j]);
        }}
    }
+    friend void permute(iMatrix<vtype,N> &out,iMatrix<vtype,N> &in,int permutetype){
+      for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
+	permute(out._internal[i][j],in._internal[i][j],permutetype);
+      }}
+    }
+    friend void extract(iMatrix<vtype,N> &in,std::vector<scalar_type *> &out){
+      for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
+	extract(in._internal[i][j],out);// extract advances pointers in out
+      }}
+    }
+    friend void merge(iMatrix<vtype,N> &in,std::vector<scalar_type *> &out){
+      for(int i=0;i<N;i++){
+      for(int j=0;j<N;j++){
+	merge(in._internal[i][j],out);// extract advances pointers in out
+      }}
+    }
    // Unary negation
    friend inline iMatrix<vtype,N> operator -(const iMatrix<vtype,N> &r) {
        iMatrix<vtype,N> ret;
--- a/Grid_mpi.cc
+++ b/Grid_mpi.cc
@@ -0,0 +1,85 @@
+#include "Grid.h"
+#include <mpi.h>
+
+namespace dpo {
+
+  // Should error check all MPI calls.
+
+CartesianCommunicator::CartesianCommunicator(std::vector<int> &processors)
+{
+  _ndimension = _processors.size();
+  std::vector<int> periodic(_ndimension,1);
+
+  _processors = processors;
+  _processor_coords.resize(_ndimension);
+
+  MPI_Cart_create(MPI_COMM_WORLD _ndimension,&_processors[0],&periodic[0],1,&communicator);
+  MPI_Comm_rank(communicator,&_processor);
+  MPI_Cart_coords(communicator,_processor,_ndimension,&_processor_coords[0]);
+
+}
+
+void CartesianCommunicator::GlobalSumF(float &f){
+  MPI_Allreduce(&f,&f,1,MPI_FLOAT,MPI_SUM,communicator);
+}
+void CartesianCommunicator::GlobalSumFVector(float *f,int N)
+{
+  MPI_Allreduce(f,f,N,MPI_FLOAT,MPI_SUM,communicator);
+}
+void CartesianCommunicator::GlobalSumF(double &d)
+{
+  MPI_Allreduce(&d,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
+}
+void CartesianCommunicator::GlobalSumFVector(double *d,int N)
+{
+  MPI_Allreduce(d,d,N,MPI_DOUBLE,MPI_SUM,communicator);
+}
+
+int CartesianCommunicator::ShiftedRank(int dim,int shift)
+{
+  int rank;
+  MPI_Cart_shift(communicator,dim,shift,&_processor,&rank);
+  return rank;
+}
+int CartesianCommunicator::Rank(std::vector<int> coor)
+{
+  int rank;
+  MPI_Cart_rank  (communicator, &coor[0], &rank);
+  return rank;
+}
+
+// Basic Halo comms primitive
+void CartesianCommunicator::SendToRecvFrom(void *xmit,
+					   std::vector<int> &to_coordinate,
+					   void *recv,
+					   std::vector<int> &from_coordinate,
+					   int bytes)
+{
+  int dest = Rank(to_coordinate);
+  int from = Rank(from_coordinate);
+
+  MPI_Request x_req;
+  MPI_Request r_req;
+  MPI_Status OkeyDokey;
+
+  MPI_Isend(xmit, bytes, MPI_CHAR,dest,dest,communicator,&x_req);
+  MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator,&r_req);
+  MPI_Wait(&x_req,&OkeyDokey);
+  MPI_Wait(&r_req,&OkeyDokey);
+  MPI_Barrier();
+}
+
+}
+
+#if 0
+
+// Could possibly do a direct block strided send?
+  int MPI_Type_vector(
+		      int count,
+		      int blocklength,
+		      int stride,
+		      MPI_Datatype old_type,
+  MPI_Datatype *newtype_p
+		      );
+
+#endif
--- a/Grid_mpi_cshift.h
+++ b/Grid_mpi_cshift.h
@@ -0,0 +1,384 @@
+#ifndef _GRID_MPI_CSHIFT_H_
+#define _GRID_MPI_CSHIFT_H_
+
+      
+//////////////////////////////////////////////
+// Q. Split this into seperate sub functions?
+//////////////////////////////////////////////
+
+// CshiftCB_comms_splice
+// CshiftCB_comms
+// CshiftCB_local
+// CshiftCB_local_permute
+
+// Cshift_comms_splice
+// Cshift_comms
+// Cshift_local
+// Cshift_local_permute
+
+// Broadly I remain annoyed that the iteration is so painful
+// for red black data layout, when simple block strided descriptors suffice for non-cb. 
+//
+// The other option is to do it table driven, or perhaps store the CB of each site in a table.
+//
+// Must not lose sight that goal is to be able to construct really efficient
+// gather to a point stencil code. CSHIFT is not the best way, so probably need
+// additional stencil support.
+//
+// Could still do a templated syntax tree and make CSHIFT return lattice vector.
+//
+// Stencil based code could pre-exchange haloes and use a table lookup for neighbours
+//
+// Lattice <foo> could also allocate haloes which get used for stencil code.
+//
+// Grid could create a neighbour index table for a given stencil.
+// Could also implement CovariantCshift.
+
+//////////////////////////////////////////////////////
+//Non checkerboarded support functions
+//////////////////////////////////////////////////////
+
+friend void Gather_plane        (Lattice<vobj> &rhs,std::vector<vobj> &buffer,             int dimension,int plane)
+{
+  const int Nsimd = vector_type::Nsimd();
+  int rd = rhs._grid->_rdimensions[dimension];
+
+  int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
+  int o   = 0;                                    // relative offset to base within plane
+#pragma omp parallel for collapse(2)
+  for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
+    for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
+	  
+	  int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
+	  int sx = (x+sshift)%rd;
+	  int so = sx*rhs._grid->_ostride[dimension];
+	  int permute_slice=0;
+	  int wrap = sshift/rd;
+	  int  num = sshift%rd;
+	  
+	  if ( x< rd-num ) permute_slice=wrap;
+	  else permute_slice = 1-wrap;
+	  
+	  if ( permute_slice ) {
+	    permute(ret._odata[ro+o+b],rhs._odata[so+o+b],permute_type);
+	  } else {
+	    ret._odata[ro+o+b]=rhs._odata[so+o+b];
+	  }
+	}
+	o +=rhs._grid->_slice_stride[dimension];
+      }
+  }
+  
+}
+//friend void Gather_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
+//
+//friend void Scatter_plane       (Lattice<vobj> &rhs,std::vector<vobj> face,             int dimension,int plane);
+//friend void Scatter_plane_merge (Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
+//
+//template<int permute_type> friend void Copy_plane_permute(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
+//                           friend void Copy_plane(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
+//
+
+//////////////////////////////////////////////////////
+//Checkerboarded support functions
+//////////////////////////////////////////////////////
+
+//friend void GatherCB_plane        (Lattice<vobj> &rhs,std::vector<vobj> face,             int dimension,int plane);
+//friend void GatherCB_plane_extract(Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
+//
+//friend void ScatterCB_plane       (Lattice<vobj> &rhs,std::vector<vobj> face,             int dimension,int plane);
+//friend void ScatterCB_plane_merge (Lattice<vobj> &rhs,std::vector<scalar_type *> pointers,int dimension,int plane);
+//
+//template<int permute_type> friend void CopyCB_plane_permute(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
+//                           friend void Copy_plane(Lattice<vobj> &rhs,std::vector<vobj> face, int dimension,int plane);
+
+
+friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
+{
+  typedef typename vobj::vector_type vector_type;
+  typedef typename vobj::scalar_type scalar_type;
+  const int Nsimd = vector_type::Nsimd();
+
+  Lattice<vobj> ret(rhs._grid);
+  
+  int fd = rhs._grid->_fdimensions[dimension];
+  int rd = rhs._grid->_rdimensions[dimension];
+  //int ld = rhs._grid->_ldimensions[dimension];
+  //int gd = rhs._grid->_gdimensions[dimension];
+  
+
+  // Map to always positive shift modulo global full dimension.
+  shift = (shift+fd)%fd;
+
+  ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
+        
+  // the permute type
+  int simd_layout     = rhs._grid->_simd_layout[dimension];
+  int comm_dim        = rhs._grid->_processors[dimension] >1 ;
+  int permute_dim     = rhs._grid->_simd_layout[dimension]>1 && (!comm_dim);
+  int splice_dim      = rhs._grid->_simd_layout[dimension]>1 && (comm_dim);
+
+  int permute_type=0;
+  for(int d=0;d<dimension;d++){
+    if (rhs._grid->_simd_layout[d]>1 ) permute_type++;
+  }
+
+  // Logic for non-distributed dimension	  
+  std::vector<int> comm_offnode(simd_layout);
+  std::vector<int> comm_to     (simd_layout);
+  std::vector<int> comm_from   (simd_layout);
+  std::vector<int> comm_rx     (simd_layout);  // reduced coordinate of neighbour plane
+  std::vector<int> comm_simd_lane(simd_layout);// simd lane of neigbour plane
+
+  ///////////////////////////////////////////////
+  // Move via a fake comms buffer
+  // Simd direction uses an extract/merge pair
+  ///////////////////////////////////////////////
+  int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
+  int words = sizeof(vobj)/sizeof(vector_type);
+
+  std::vector<vobj,alignedAllocator<vobj> > comm_buf(buffer_size);
+  std::vector<std::vector<scalar_type> > comm_buf_extract(Nsimd,std::vector<scalar_type>(buffer_size*words) );
+  std::vector<scalar_type *> pointers(Nsimd);
+
+
+
+  if ( permute_dim ) {
+
+    for(int x=0;x<rd;x++){       
+      int ro  = x*rhs._grid->_ostride[dimension]; // base offset for result
+      int o   = 0;                                // relative offset to base
+      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
+	for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
+	  
+	  int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
+	  int sx = (x+sshift)%rd;
+	  int so = sx*rhs._grid->_ostride[dimension];
+	  int permute_slice=0;
+	  int wrap = sshift/rd;
+	  int  num = sshift%rd;
+	  
+	  if ( x< rd-num ) permute_slice=wrap;
+	  else permute_slice = 1-wrap;
+	  
+	  if ( permute_slice ) {
+	    permute(ret._odata[ro+o+b],rhs._odata[so+o+b],permute_type);
+	  } else {
+	    ret._odata[ro+o+b]=rhs._odata[so+o+b];
+	  }
+	}
+	o +=rhs._grid->_slice_stride[dimension];
+      }
+    }
+
+  } else if ( splice_dim ) {
+
+    if ( rhs._grid->_simd_layout[dimension] > 2 ) exit(-1); // use Cassert. Audit code for exit and replace
+    if ( rhs._grid->_simd_layout[dimension] < 1 ) exit(-1);
+	    
+
+    for(int i=0;i<vobj::vector_type::Nsimd();i++){
+      pointers[i] = (scalar_type *)&comm_buf_extract[i][0];
+    }
+
+    
+    for(int x=0;x<rd;x++){       
+
+
+      ///////////////////////////////////////////
+      // Extract one orthogonal slice at a time
+      ///////////////////////////////////////////
+      int ro  = x*rhs._grid->_ostride[dimension]; // base offset for result
+
+      o   = 0;                                    // relative offset to base      
+      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
+	for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
+	  
+	  int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
+	  int sx = (x+sshift)%rd;
+	  
+	  // base offset for source
+	  int so = sx*rhs._grid->_ostride[dimension];
+
+	  int permute_slice=0;
+	  int wrap = sshift/rd;
+	  int  num = sshift%rd;
+	    
+	  if ( x< rd-num ) permute_slice=wrap;
+	  else permute_slice = 1-wrap;
+
+	  if ( permute_slice ) {
+	    extract(rhs._odata[so+o+b],pointers);
+	  }
+	}
+	o +=rhs._grid->_slice_stride[dimension];
+      }
+
+    
+      ///////////////////////////////////////////
+      // Work out what to send where
+      ///////////////////////////////////////////
+
+      for(int s=0;s<simd_layout;s++) {
+
+
+	// shift to "neighbour" takes us off node
+	// coordinates (rx, simd_lane) of neighbour
+	// how many nodes away is this shift
+	// where we should send to
+	// where we should receive from
+	int shifted_x = x+s*rd+shift;
+	comm_offnode[s]           = shifted_x > ld; 
+	comm_send_rx[s]           = shifted_x%rd;     // which slice geton the other node
+	comm_send_simd_lane [s]   = shifted_x/rd;     // which slice on the other node
+	comm_from[s]    = shifted_x/ld;     
+	comm_to  [s] = (2*_processors[dimension]-comm_from[s]) % _processors[dimension];
+	comm_from[s] = (comm_from[s]+_processors[dimension])   % _processors[dimension];
+	      
+      }
+
+      ////////////////////////////////////////////////
+      // Insert communication phase
+      ////////////////////////////////////////////////
+#if 0
+	  } else if (comm_dim ) {
+	    
+	    // Packed gather sequence is clean
+	    int buffer_size = rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_nblock[dimension];
+	    std::vector<vobj,alignedAllocator<vobj> > send_buf(buffer_size);
+	    std::vector<vobj,alignedAllocator<vobj> > recv_buf(buffer_size);
+
+	    // off node; communcate slice (ld==rd)
+	    if ( x+shift > rd ) { 
+	      int sb=0;
+	      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
+		for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
+		  send_buf[sb++]=rhs._odata[so+i];
+		}
+		so+=rhs._grid->_slice_stride[dimension];
+	      }	      
+
+	      // Make a comm_fake them mimics comms in periodic case.
+	      
+	      // scatter face
+	      int rb=0;
+	      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
+		for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
+		  ret._odata[so+i]=recv_buf[rb++];
+		}
+		so+=rhs._grid->_slice_stride[dimension];
+	      }	      
+
+	    } else { 
+
+	      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
+		for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
+		  ret._odata[o+i]=rhs._odata[so+i];
+		}
+		o+=rhs._grid->_slice_stride[dimension];
+		so+=rhs._grid->_slice_stride[dimension];
+	      }
+	    }
+
+#endif	
+    
+      ////////////////////////////////////////////////
+      // Pull receive buffers and permuted buffers in
+      ////////////////////////////////////////////////
+      for(int i=0;i<vobj::vector_type::Nsimd();i++){
+	pointers[i] = (scalar_type *)&comm_buf_extract[permute_map[permute_type][i]][0];
+      }
+
+      o   = 0;                                    // relative offset to base
+      int ro  = x*rhs._grid->_ostride[dimension]; // base offset for result
+      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
+	for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
+	  
+	  int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
+	  int sx = (x+sshift)%rd;
+	  
+	  // base offset for source
+	  int so = sx*rhs._grid->_ostride[dimension];
+	  
+	  int permute_slice=0;
+	  int wrap = sshift/rd;
+	  int  num = sshift%rd;
+	  
+	  if ( x< rd-num ) permute_slice=wrap;
+	  else permute_slice = 1-wrap;
+	  
+	  if ( permute_slice ) {
+	    merge(ret._odata[ro+o+b],pointers);
+	  }
+	}
+	o +=rhs._grid->_slice_stride[dimension];
+      }
+    }
+  
+
+  } else if ( comm_dim ) { 
+	
+
+    int co; // comm offset
+    int o;
+    
+    co=0;
+    for(int x=0;x<rd;x++){       
+      o=0;
+      int ro  = x*rhs._grid->_ostride[dimension]; // base offset for result
+      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
+	for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
+
+	  // This call in inner loop is annoying but necessary for dimension=0
+	  // in the case of RedBlack grids. Could optimise away with 
+	  // alternate code paths for all other cases.
+	  int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
+	  int sx = (x+sshift)%rd;
+	  int so = sx*rhs._grid->_ostride[dimension];
+
+	  comm_buf[co++]=rhs._odata[so+o+b];
+
+	}
+	o +=rhs._grid->_slice_stride[dimension];
+      }
+
+      // Step through a copy into a comms buffer and pull back in.
+      // Genuine fake implementation could calculate if loops back
+      co=0; o=0;
+      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
+	for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
+	  ret._odata[ro+o+b]=comm_buf[co++];
+	}
+	o +=rhs._grid->_slice_stride[dimension];
+      }
+    }
+
+  } else { // Local dimension, no permute required
+
+    for(int x=0;x<rd;x++){       
+      int o=0;
+      int ro  = x*rhs._grid->_ostride[dimension]; // base offset for result
+      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
+	for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
+
+	  // This call in inner loop is annoying but necessary for dimension=0
+	  // in the case of RedBlack grids. Could optimise away with 
+	  // alternate code paths for all other cases.
+	  int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
+	  
+	  int sx = (x+sshift)%rd;
+	  int so = sx*rhs._grid->_ostride[dimension];
+	  ret._odata[bo+o+b]=rhs._odata[so+o+b];
+
+	}
+	o +=rhs._grid->_slice_stride[dimension];
+      }
+    }
+
+  }
+
+  return ret;
+}
+
+
+#endif
--- a/Grid_none_cshift.h
+++ b/Grid_none_cshift.h
@@ -0,0 +1,313 @@
+#ifndef _GRID_NONE_CSHIFT_H_
+#define _GRID_NONE_CSHIFT_H_
+  // Work out whether to permute 
+  // ABCDEFGH ->   AE BF CG DH       permute              wrap num
+  //
+  // Shift 0       AE BF CG DH       0 0 0 0    ABCDEFGH   0   0
+  // Shift 1       BF CG DH AE       0 0 0 1    BCDEFGHA   0   1
+  // Shift 2       CG DH AE BF       0 0 1 1    CDEFGHAB   0   2
+  // Shift 3       DH AE BF CG       0 1 1 1    DEFGHABC   0   3
+  // Shift 4       AE BF CG DH       1 1 1 1    EFGHABCD   1   0 
+  // Shift 5       BF CG DH AE       1 1 1 0    FGHACBDE   1   1 
+  // Shift 6       CG DH AE BF       1 1 0 0    GHABCDEF   1   2
+  // Shift 7       DH AE BF CG       1 0 0 0    HABCDEFG   1   3
+
+  // Suppose 4way simd in one dim.
+  // ABCDEFGH ->   AECG BFDH      permute              wrap num
+
+  // Shift 0       AECG BFDH      0,00 0,00 ABCDEFGH         0     0
+  // Shift 1       BFDH CGEA      0,00 1,01 BCDEFGHA         0     1
+  // Shift 2       CGEA DHFB      1,01 1,01 CDEFGHAB         1     0
+  // Shift 3       DHFB EAGC      1,01 1,11 DEFGHABC         1     1
+  // Shift 4       EAGC FBHD      1,11 1,11 EFGHABCD         2     0 
+  // Shift 5       FBHD GCAE      1,11 1,10 FGHABCDE         2     1
+  // Shift 6       GCAE HDBF      1,10 1,10 GHABCDEF         3     0
+  // Shift 7       HDBF AECG      1,10 0,00 HABCDEFG         3     1
+
+  // Generalisation to 8 way simd, 16 way simd required.
+  //
+  // Need log2 Nway masks. consisting of 
+  //	    1 bit  256 bit granule
+  //	    2 bit  128 bit granule
+  //        4 bits 64  bit granule
+  //        8 bits 32  bit granules
+  //
+  //        15 bits....
+  
+// For optimisation:
+//
+// split into Cshift_none_rb_permute
+// split into Cshift_none_rb_simple
+//
+// split into Cshift_none_permute
+// split into Cshift_none_simple
+
+friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
+{
+  Lattice<vobj> ret(rhs._grid);
+  
+  int fd = rhs._grid->_fdimensions[dimension];
+  int rd = rhs._grid->_rdimensions[dimension];
+  int ld = rhs._grid->_ldimensions[dimension];
+  int gd = rhs._grid->_gdimensions[dimension];
+  
+
+  // Map to always positive shift modulo global full dimension.
+  shift = (shift+fd)%fd;
+
+  ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
+        
+  // the permute type
+
+  int permute_dim =rhs._grid->_simd_layout[dimension]>1 ;
+  int permute_type=0;
+  for(int d=0;d<dimension;d++){
+    if (rhs._grid->_simd_layout[d]>1 ) permute_type++;
+  }
+
+  for(int x=0;x<rd;x++){       
+
+    int bo  = x*rhs._grid->_ostride[dimension];
+    int o   = 0;
+
+    if ( permute_dim ) {
+
+      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
+	for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
+	  
+	  int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
+	  
+	  int sx = (x+sshift)%rd;
+	  int so = sx*rhs._grid->_ostride[dimension];
+
+	  int permute_slice=0;
+	  int wrap = sshift/rd;
+	  int  num = sshift%rd;
+	    
+	  if ( x< rd-num ) permute_slice=wrap;
+	  else permute_slice = 1-wrap;
+	  
+	  if ( permute_slice ) {
+	    permute(ret._odata[bo+o+b],rhs._odata[so+o+b],permute_type);
+	  } else {
+	    ret._odata[bo+o+b]=rhs._odata[so+o+b];
+	  }
+
+	}
+	o +=rhs._grid->_slice_stride[dimension];
+
+      }
+    } else {
+      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
+	for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
+
+	  // This call in inner loop is annoying but necessary for dimension=0
+	  // in the case of RedBlack grids. Could optimise away with 
+	  // alternate code paths for all other cases.
+	  int sshift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift,o+b);
+	  
+	  int sx = (x+sshift)%rd;
+	  int so = sx*rhs._grid->_ostride[dimension];
+	  ret._odata[bo+o+b]=rhs._odata[so+o+b];
+
+	}
+	o +=rhs._grid->_slice_stride[dimension];
+      }
+    }
+  }
+  return ret;
+}
+        
+#if 0 
+// Collapse doesn't appear to work the way I think it should in icpc
+friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
+{
+  Lattice<vobj> ret(rhs._grid);
+        
+  ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
+  shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift);
+  int sx,so,o;
+  int rd = rhs._grid->_rdimensions[dimension];
+  int ld = rhs._grid->_dimensions[dimension];
+  // Map to always positive shift.
+  shift = (shift+ld)%ld;
+  // Work out whether to permute and the permute type
+  // ABCDEFGH ->   AE BF CG DH       permute
+  // Shift 0       AE BF CG DH       0 0 0 0    ABCDEFGH
+  // Shift 1       DH AE BF CG       1 0 0 0    HABCDEFG
+  // Shift 2       CG DH AE BF       1 1 0 0    GHABCDEF
+  // Shift 3       BF CG DH AE       1 1 1 0    FGHACBDE
+  // Shift 4       AE BF CG DH       1 1 1 1    EFGHABCD
+  // Shift 5       DH AE BF CG       0 1 1 1    DEFGHABC
+  // Shift 6       CG DH AE BF       0 0 1 1    CDEFGHAB
+  // Shift 7       BF CG DH AE       0 0 0 1    BCDEFGHA
+  int permute_dim =rhs._grid->_layout[dimension]>1 ;
+  int permute_type=0;
+  for(int d=0;d<dimension;d++)
+    if (rhs._grid->_layout[d]>1 ) permute_type++;
+        
+  // loop over perp slices.
+  // Threading considerations:
+  //   Need to map thread_num to
+  //
+  //               x_min,x_max for Loop-A
+  //               n_min,n_max for Loop-B
+  //               b_min,b_max for Loop-C
+  //  In a way that maximally load balances.
+  //
+  //  Optimal:
+  //      There are rd*n_block*block items of work.
+  //      These serialise as item "w"
+  //      b=w%block; w=w/block
+  //      n=w%nblock; x=w/nblock. Perhaps 20 cycles?
+  //
+  //  Logic:
+  //      x_chunk = (rd+thread)/nthreads simply divide work across nodes.
+  //
+  //      rd=5 , threads = 8;
+  //      0 1 2 3 4 5 6 7
+  //      0 0 0 1 1 1 1 1
+  for(int x=0;x<rd;x++){         // Loop A
+    sx = (x-shift+ld)%rd;
+    o  = x*rhs._grid->_ostride[dimension];
+    so =sx*rhs._grid->_ostride[dimension];
+    int permute_slice=0;
+    if ( permute_dim ) {
+      permute_slice = shift/rd;
+      if ( x<shift%rd ) permute_slice = 1-permute_slice;
+    }
+#if 0
+    if ( permute_slice ) {
+                
+      int internal=sizeof(vobj)/sizeof(vComplex);
+      int num =rhs._grid->_slice_block[dimension]*internal;
+                
+      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
+	vComplex *optr = (vComplex *)&ret._odata[o];
+	vComplex *iptr = (vComplex *)&rhs._odata[so];
+	for(int b=0;b<num;b++){
+	  permute(optr[b],iptr[b],permute_type);
+	}
+	o+=rhs._grid->_slice_stride[dimension];
+	so+=rhs._grid->_slice_stride[dimension];
+      }
+    } else {
+      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
+	for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
+	  ret._odata[o+i]=rhs._odata[so+i];
+	}
+	o+=rhs._grid->_slice_stride[dimension];
+	so+=rhs._grid->_slice_stride[dimension];
+      }
+            
+    }
+#else
+    if ( permute_slice ) {
+      int internal=sizeof(vobj)/sizeof(vComplex);
+      int num =rhs._grid->_slice_block[dimension]*internal;
+#pragma omp parallel for collapse(2)
+      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
+	for(int b=0;b<num;b++){
+	  vComplex *optr = (vComplex *)&ret._odata[o +n*rhs._grid->_slice_stride[dimension]];
+	  vComplex *iptr = (vComplex *)&rhs._odata[so+n*rhs._grid->_slice_stride[dimension]];
+	  permute(optr[b],iptr[b],permute_type);
+	}
+      }
+    } else {
+#pragma omp parallel for collapse(2)
+      for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
+	for(int i=0;i<rhs._grid->_slice_block[dimension];i++){
+	  int oo = o+ n*rhs._grid->_slice_stride[dimension];
+	  int soo=so+ n*rhs._grid->_slice_stride[dimension];
+	  ret._odata[oo+i]=rhs._odata[soo+i];
+	}
+      }
+            
+    }
+#endif
+  }
+  return ret;
+}
+friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
+{
+  Lattice<vobj> ret(rhs._grid);
+        
+  ret.checkerboard = rhs._grid->CheckerBoardDestination(rhs.checkerboard,shift);
+  shift = rhs._grid->CheckerBoardShift(rhs.checkerboard,dimension,shift);
+  int rd = rhs._grid->_rdimensions[dimension];
+  int ld = rhs._grid->_dimensions[dimension];
+        
+  // Map to always positive shift.
+  shift = (shift+ld)%ld;
+        
+  // Work out whether to permute and the permute type
+  // ABCDEFGH ->   AE BF CG DH       permute
+  // Shift 0       AE BF CG DH       0 0 0 0    ABCDEFGH
+  // Shift 1       DH AE BF CG       1 0 0 0    HABCDEFG
+  // Shift 2       CG DH AE BF       1 1 0 0    GHABCDEF
+  // Shift 3       BF CG DH AE       1 1 1 0    FGHACBDE
+  // Shift 4       AE BF CG DH       1 1 1 1    EFGHABCD
+  // Shift 5       DH AE BF CG       0 1 1 1    DEFGHABC
+  // Shift 6       CG DH AE BF       0 0 1 1    CDEFGHAB
+  // Shift 7       BF CG DH AE       0 0 0 1    BCDEFGHA
+  int permute_dim =rhs._grid->_layout[dimension]>1 ;
+  int permute_type=0;
+  for(int d=0;d<dimension;d++)
+    if (rhs._grid->_layout[d]>1 ) permute_type++;
+        
+        
+  // loop over all work
+  int work =rd*rhs._grid->_slice_nblock[dimension]*rhs._grid->_slice_block[dimension];
+
+#pragma omp parallel for
+  for(int ww=0;ww<work;ww++){
+
+            
+    // can optimise this if know w moves congtiguously for a given thread.
+    // b=(b+1);
+    // if (b==_slice_block) {b=0; n=n+1;}
+    // if (n==_slice_nblock) { n=0; x=x+1}
+    //
+    // Perhaps a five cycle iterator, or so.
+    int w=ww;
+    int b = w%rhs._grid->_slice_block[dimension] ; w=w/rhs._grid->_slice_block[dimension];
+    int n = w%rhs._grid->_slice_nblock[dimension]; w=w/rhs._grid->_slice_nblock[dimension];
+    int x = w;
+
+    int sx,so,o;
+    sx = (x-shift+ld)%rd;
+    o  = x*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension]; // common sub expression alert.
+    so =sx*rhs._grid->_ostride[dimension]+n*rhs._grid->_slice_stride[dimension];
+            
+    int permute_slice=0;
+    if ( permute_dim ) {
+      permute_slice = shift/rd;
+      if ( x<shift%rd ) permute_slice = 1-permute_slice;
+    }
+            
+    if ( permute_slice ) {
+                
+      int internal=sizeof(vobj)/sizeof(vComplex);
+      vComplex *optr = (vComplex *)&ret._odata[o+b];
+      vComplex *iptr = (vComplex *)&rhs._odata[so+b];
+      const char *pf = (const char *)iptr;
+      for(int i=0;i<sizeof(vobj);i+=64){
+	_mm_prefetch(pf+i,_MM_HINT_T0);
+      }
+
+      for(int i=0;i<internal;i++){
+	permute(optr[i],iptr[i],permute_type);
+      }
+    } else {
+      const char *pf = (const char *) &rhs._odata[so+b];
+      for(int i=0;i<sizeof(vobj);i+=64){
+	_mm_prefetch(pf+i,_MM_HINT_T0);
+      }
+      ret._odata[o+b]=rhs._odata[so+b];
+    }
+  }
+  return ret;
+}
+#endif
+#endif
+
--- a/Grid_simd.h
+++ b/Grid_simd.h
@@ -47,7 +47,7 @@ namespace dpo {
  typedef std::complex<RealF> ComplexF;
  typedef std::complex<RealD> ComplexD;
  typedef std::complex<Real>  Complex;
-  
+

  inline RealF adj(const RealF  & r){ return r; }
  inline RealF conj(const RealF  & r){ return r; }
@@ -130,7 +130,7 @@ namespace dpo {
 #if defined (AVX1) || defined (AVX2)
    typedef __m256 fvec;
    typedef __m256d dvec;
-    typedef __m256 cvec;
+    typedef __m256  cvec;
    typedef __m256d zvec;
 #endif
 #if defined (AVX512)
--- a/Grid_vComplexD.h
+++ b/Grid_vComplexD.h
@@ -8,6 +8,9 @@ namespace dpo {
    protected:
        zvec v;
    public:
+	typedef zvec     vector_type;
+	typedef ComplexD scalar_type;
+
        vComplexD & operator = ( Zero & z){
            vzero(*this);
            return (*this);
@@ -145,14 +148,43 @@ namespace dpo {
            ymm0 =  _mm512_swizzle_pd(b.v, _MM_SWIZ_REG_CDAB); // OK
            ret.v=  _mm512_fmadd_pd(ymm0,imag,ymm1);
             /* Imag OK */
-            
-
 #endif
 #ifdef QPX
            ret.v = vec_mul(a.v,b.v);
 #endif
            return ret;
        };
+
+        /////////////////////////////////////////////////////////////////
+        // Extract
+        /////////////////////////////////////////////////////////////////
+        friend inline void extract(vComplexD &y,std::vector<ComplexD *> &extracted){
+	  // Bounce off stack is painful
+	  // temporary hack while I figure out the right interface
+	  const int Nsimd = vComplexD::Nsimd();
+	  std::vector<ComplexD> buf(Nsimd); 
+
+	  vstore(y,&buf[0]);
+
+	  for(int i=0;i<Nsimd;i++){
+	    *extracted[i] = buf[i];
+	    extracted[i]++;
+	  }
+        };
+
+        friend inline void merge(vComplexD &y,std::vector<ComplexD *> &extracted){
+	  // Bounce off stack is painful
+	  // temporary hack while I figure out the right interface
+	  const int Nsimd = vComplexD::Nsimd();
+	  std::vector<ComplexD> buf(Nsimd); 
+
+	  for(int i=0;i<Nsimd;i++){
+	    buf[i]=*extracted[i];
+	    extracted[i]++;
+	  }
+	  vset(y,&buf[0]); 
+        };
+        
        
        /////////////////////////////////////////////////////////////////
        // Permute
@@ -165,7 +197,7 @@ namespace dpo {
                // AB => BA i.e. ab cd =>cd ab
 #endif
 #ifdef SSE2
-                    //No cases here
+		  break;
 #endif
 #ifdef AVX512
                    // 4 complex=>2 permute
@@ -205,7 +237,7 @@ namespace dpo {
 #endif
        }

-        friend inline void vset(vComplexD &ret, std::complex<double> *a){
+        friend inline void vset(vComplexD &ret,ComplexD *a){
 #if defined (AVX1)|| defined (AVX2)
            ret.v = _mm256_set_pd(a[1].imag(),a[1].real(),a[0].imag(),a[0].real());
 #endif
@@ -221,7 +253,7 @@ namespace dpo {
 #endif
        }

-friend inline void vstore(vComplexD &ret, std::complex<double> *a){
+friend inline void vstore(vComplexD &ret, ComplexD *a){
 #if defined (AVX1)|| defined (AVX2)
       _mm256_store_pd((double *)a,ret.v);
 #endif
@@ -273,10 +305,10 @@ friend inline void vstore(vComplexD &ret, std::complex<double> *a){
 	 //            return std::complex<double>(_mm256_mask_reduce_add_pd(0x55, in.v),_mm256_mask_reduce_add_pd(0xAA, in.v));
 	 __attribute__ ((aligned(32))) double c_[4];
         _mm256_store_pd(c_,in.v);
-	 return std::complex<double>(c_[0]+c_[2],c_[1]+c_[3]);
+	 return ComplexD(c_[0]+c_[2],c_[1]+c_[3]);
 #endif 
 #ifdef AVX512
-            return std::complex<double>(_mm512_mask_reduce_add_pd(0x5555, in.v),_mm512_mask_reduce_add_pd(0xAAAA, in.v));
+            return ComplexD(_mm512_mask_reduce_add_pd(0x5555, in.v),_mm512_mask_reduce_add_pd(0xAAAA, in.v));
 #endif 
 #ifdef QPX
 #endif
--- a/Grid_vComplexF.h
+++ b/Grid_vComplexF.h
@@ -8,6 +8,11 @@ namespace dpo {
        cvec v;
        
    public:
+        static inline int Nsimd(void) { return sizeof(cvec)/sizeof(float)/2;}
+    public:
+	typedef cvec     vector_type;
+	typedef ComplexF scalar_type;
+
        vComplexF & operator = ( Zero & z){
            vzero(*this);
            return (*this);
@@ -125,6 +130,39 @@ namespace dpo {
            return ret;
        };

+        /////////////////////////////////////////////////////////////////
+        // Extract
+        /////////////////////////////////////////////////////////////////
+        friend inline void extract(vComplexF &y,std::vector<ComplexF *> &extracted){
+	  // Bounce off heap is painful
+	  // temporary hack while I figure out the right interface
+            vComplexF vbuf;
+            ComplexF *buf = (ComplexF *)&vbuf;
+            
+	  vstore(y,&buf[0]);
+	  for(int i=0;i<vComplexF::Nsimd();i++){
+	    *extracted[i] = buf[i];
+	    extracted[i]++;
+	  }
+
+        };
+
+        friend inline void merge(vComplexF &y,std::vector<ComplexF *> &extracted){
+	  // Bounce off stack is painful
+	  // temporary hack while I figure out the right interface
+	  const int Nsimd = vComplexF::Nsimd();
+            vComplexF vbuf;
+            ComplexF *buf = (ComplexF *)&vbuf;
+
+	  for(int i=0;i<Nsimd;i++){
+	    buf[i]=*extracted[i];
+	    extracted[i]++;
+	  }
+	  vset(y,&buf[0]); 
+        };
+        
+
+
        /////////////////////////////////////////////////////////////////
        // Permute
        /////////////////////////////////////////////////////////////////
@@ -185,7 +223,7 @@ namespace dpo {
            vsplat(ret,a,b);
        }

-friend inline void vstore(vComplexF &ret, std::complex<float> *a){
+friend inline void vstore(vComplexF &ret, ComplexF *a){
 #if defined (AVX1)|| defined (AVX2)
        _mm256_store_ps((float *)a,ret.v);
 #endif
@@ -225,11 +263,11 @@ exit(-1);
 #if defined (AVX1) || defined(AVX2)
         __attribute__ ((aligned(32))) float c_[8];
         _mm256_store_ps(c_,in.v);
-         return std::complex<float>(c_[0]+c_[2]+c_[4]+c_[6],c_[1]+c_[3]+c_[5]+c_[7]);
+         return ComplexF(c_[0]+c_[2]+c_[4]+c_[6],c_[1]+c_[3]+c_[5]+c_[7]);

 #endif
 #ifdef AVX512
-            return std::complex<float>(_mm512_mask_reduce_add_ps(0x5555, in.v),_mm512_mask_reduce_add_ps(0xAAAA, in.v));
+            return ComplexF(_mm512_mask_reduce_add_ps(0x5555, in.v),_mm512_mask_reduce_add_ps(0xAAAA, in.v));
 #endif
 #ifdef QPX
 #endif
@@ -321,9 +359,6 @@ exit(-1);
            return *this;
        }

-        
-    public:
-        static int Nsimd(void) { return sizeof(cvec)/sizeof(float)/2;}
    };

    inline vComplexF localInnerProduct(const vComplexF & l, const vComplexF & r) { return conj(l)*r; }
--- a/Grid_vRealD.h
+++ b/Grid_vRealD.h
@@ -7,7 +7,11 @@ namespace dpo{
    class vRealD  {
    protected:
        dvec v; // dvec is double precision vector
+
    public:
+	typedef dvec  vector_type;
+	typedef RealD scalar_type;
+
        vRealD(){};

        friend inline void mult(vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) * (*r);}
@@ -94,6 +98,37 @@ namespace dpo{
 #endif
            return ret;
        };
+
+        /////////////////////////////////////////////////////////////////
+        // Extract
+        /////////////////////////////////////////////////////////////////
+        friend inline void extract(vRealD &y,std::vector<RealD *> &extracted){
+	  // Bounce off stack is painful
+	  // temporary hack while I figure out the right interface
+	  const int Nsimd = vRealD::Nsimd();
+	  RealD buf[Nsimd]; 
+
+	  vstore(y,buf);
+
+	  for(int i=0;i<Nsimd;i++){
+	    *extracted[i] = buf[i];
+	    extracted[i]++;
+	  }
+        };
+
+        friend inline void merge(vRealD &y,std::vector<RealD *> &extracted){
+	  // Bounce off stack is painful
+	  // temporary hack while I figure out the right interface
+	  const int Nsimd = vRealD::Nsimd();
+	  RealD buf[Nsimd]; 
+
+	  for(int i=0;i<Nsimd;i++){
+	    buf[i]=*extracted[i];
+	    extracted[i]++;
+	  }
+	  vset(y,buf); 
+        };
+
        
        // Permute plans
        // Permute 0 every ABCDEFGH -> BA DC FE HG
--- a/Grid_vRealF.h
+++ b/Grid_vRealF.h
@@ -7,7 +7,12 @@ namespace dpo {
    class vRealF  {
    protected:
        fvec v;
+
    public:
+
+	typedef fvec  vector_type;
+	typedef RealF scalar_type;
+
        vRealF(){};
        ////////////////////////////////////
        // Arithmetic operator overloads +,-,*
@@ -113,6 +118,37 @@ namespace dpo {
        //////////////////////////////////
        friend inline void vone (vRealF &ret){vsplat(ret,1.0);}
        friend inline void vzero(vRealF &ret){vsplat(ret,0.0);}
+
+
+        /////////////////////////////////////////////////////////////////
+        // Extract
+        /////////////////////////////////////////////////////////////////
+        friend inline void extract(vRealF &y,std::vector<RealF *> &extracted){
+	  // Bounce off stack is painful
+	  // temporary hack while I figure out the right interface
+	  const int Nsimd = vRealF::Nsimd();
+	  RealF buf[Nsimd]; 
+
+	  vstore(y,buf);
+
+	  for(int i=0;i<Nsimd;i++){
+	    *extracted[i] = buf[i];
+	    extracted[i]++;
+	  }
+        };
+
+        friend inline void merge(vRealF &y,std::vector<RealF *> &extracted){
+	  // Bounce off stack is painful
+	  // temporary hack while I figure out the right interface
+	  const int Nsimd = vRealF::Nsimd();
+	  RealF buf[Nsimd]; 
+
+	  for(int i=0;i<Nsimd;i++){
+	    buf[i]=*extracted[i];
+	    extracted[i]++;
+	  }
+	  vset(y,buf); 
+        };
        
        //////////////////////////////////////////////////////////
        // Permute
@@ -256,7 +292,7 @@ friend inline void vstore(vRealF &ret, float *a){
            return *this;
        }
    public:
-        static int Nsimd(void) { return sizeof(fvec)/sizeof(float);}
+        static inline int Nsimd(void) { return sizeof(fvec)/sizeof(float);}
    };
    inline vRealF localInnerProduct(const vRealF & l, const vRealF & r) { return conj(l)*r; }
    inline void  zeroit(vRealF &z){ vzero(z);}