mirror of https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00

Reorganise to keep files smaller

This commit is contained in:
parent f7d80aac7f
commit 8195d302dc

TODO (129 lines changed)
@@ -4,6 +4,70 @@ FUNCTIONALITY:

* Coordinate information, integers etc...                 ----- DONE
* Integer type padding/union to vector.                   ----- DONE
* LatticeCoordinate[mu]                                    ----- DONE
* Expose traceIndex, peekIndex, transposeIndex etc. at the Lattice level -- DONE
* TraceColor, TraceSpin.                                   ----- DONE (traceIndex<1>, traceIndex<2>, transposeIndex<1>, transposeIndex<2>)
      ----- Implement mapping between traceColour/traceSpin and traceIndex<1/2>.
* How to do U[mu]: Lorentz part of the type structure or not? More like Chroma if not. -- DONE

* Subdirs lib, tests ??                                    ----- DONE

Not done, or incomplete:

* Consider switching std::vector to boost arrays:
      boost::multi_array<type, 3> A()... to replace multi1d, multi2d etc.

* How to define simple matrix operations, such as flavour matrices?

* Dirac, Pauli, SU subgroup, etc.  * Gamma/Dirac structures
* Four-spin to two-spin projection

* su3 exponentiation, log etc. [Jamie's code?]

* Stencil operator support                 ----- Initial thoughts, trial implementation DONE.
      ----- Some simple tests that Stencil matches Cshift (see the sketch after this list).
      ----- Do all permutes in the comms phase, so that copy-permute cases move into a buffer.
      ----- Allow transforming in/out buffers (spproj).

* CovariantShift support                   ----- Use a class to store the gauge field? (parallel transport?)

* Subset support, slice sums etc.          ----- Only need slice sum?
      ----- Generic cartesian subslicing?
      ----- Array ranges / boost extents?
      ----- Multigrid grid transferral?
      ----- Suggests generalised cartesian subblocking sums, returning a modified grid?
      ----- What should the interface be?

* Grid transferral
  * pickCheckerboard, pickSubPlane, pickSubBlock,
  * sumSubPlane, sumSubBlocks

* rb4d support.

* Check for missing functionality - partially audited against QDP++ layout

* Optimise the extract/merge SIMD routines; Azusa??
  - I have collated these into a single location at least.
  - Need to use the _mm_*insert/extract routines.

* Conformable test in Cshift routines.

* Broadcast, reduction tests. innerProduct, localInnerProduct

* QDP++ regression suite and comparative benchmark

* I/O support

* NERSC lattice loading, plaquette test
  - MPI IO?
  - BinaryWriter, TextWriter etc.
  - protocol buffers?
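For the "Stencil matches Cshift" item above, a minimal self-contained sketch of the kind of test meant: gather neighbour values through an explicit offset table (the stencil) and compare against a direct circular shift. The one-dimensional lattice and the table-driven gather are illustrative stand-ins, not Grid's actual Stencil/Cshift API.

// Standalone sketch: stencil gather must agree with a direct circular shift.
#include <cassert>
#include <vector>

int main(void)
{
  const int L = 8, shift = 3;
  std::vector<double> lat(L), cshift(L), stencil(L);
  for (int x = 0; x < L; x++) lat[x] = 1.0 + x;

  // Direct circular shift: site x reads from x+shift (periodic wrap).
  for (int x = 0; x < L; x++) cshift[x] = lat[(x + shift) % L];

  // Stencil version: precompute the gather table, then apply it.
  std::vector<int> table(L);
  for (int x = 0; x < L; x++) table[x] = (x + shift) % L;
  for (int x = 0; x < L; x++) stencil[x] = lat[table[x]];

  for (int x = 0; x < L; x++) assert(stencil[x] == cshift[x]);
  return 0;
}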
AUDITS:

// Lattice support audit; tested in Grid_main.cc
@@ -18,7 +82,7 @@ AUDITS:
//
// transposeIndex      Y
// traceIndex          Y
-// peekIndex           N; #args
+// peekIndex           Y
//
// real, imag missing; semantic thought needed on real/imag support.
// Perhaps I just keep everything complex?
@@ -29,47 +93,15 @@

* Replace vset with a call to merge.
* Take care in Gmerge, Gextract over vset.
* Remove the extra extract/merge implementations.

BUILD:

* Test infrastructure
* Subdirs lib, tests ??

* How to do U[mu]: Lorentz part of the type structure or not? More like Chroma if not.
* Passing a Grid into construct iVector<LatticeColourMatrix,4>??
*

* Stencil operator support                 ----- Initial thoughts, trial implementation DONE.
      ----- Some simple tests that Stencil matches Cshift.
      ----- Do all permutes in the comms phase, so that copy-permute cases move into a buffer.
      ----- Allow transforming in/out buffers (spproj).

* CovariantShift support                   ----- Use a class to store the gauge field? (parallel transport?)

POST-SFW call April 16:

* TraceColor, TraceSpin.                   ----- DONE (traceIndex<1>, traceIndex<2>, transposeIndex<1>, transposeIndex<2>)
      ----- Implement mapping between traceColour/traceSpin and traceIndex<1/2>.

* Expose traceIndex, peekIndex, transposeIndex etc. at the Lattice level -- DONE

* How to define simple matrix operations, such as flavour matrices?

* Subset support, slice sums etc.          ----- Only need slice sum?
      ----- Generic cartesian subslicing?
      ----- Array ranges / boost extents?
      ----- Multigrid grid transferral?
      ----- Suggests generalised cartesian subblocking sums, returning a modified grid?
      ----- What should the interface be?

-i)  Two classes of subset: red-black parity subsetting (pick checkerboard).
+[ More on subsets and grid transfers ]
+i)  Three classes of subset: red-black parity subsetting (pick checkerboard),
+    cartesian sub-block subsetting,
+    rbNd.

ii) Need to be able to project one Grid to another Grid.

Interface: (?)

   Lattice<vobj> coarse_data SubBlockSum (GridBase *CoarseGrid, Lattice<vobj> &fine_data)

Operation ensures either:
@@ -89,35 +121,12 @@ Instead of subsetting

iii) No general permutation map.
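The SubBlockSum interface above is only sketched. Here is a self-contained toy of the intended coarse-grid block sum on a one-dimensional array; plain std::vector stands in for Lattice<vobj>, a block size stands in for the fine/coarse grid ratio, and the names are illustrative only.

// Toy block-sum projection: coarse[b] = sum of the fine sites in block b.
#include <cassert>
#include <vector>

std::vector<double> SubBlockSum(int block, const std::vector<double> &fine)
{
  assert(fine.size() % block == 0);
  std::vector<double> coarse(fine.size() / block, 0.0);
  for (std::size_t x = 0; x < fine.size(); x++) coarse[x / block] += fine[x];
  return coarse;
}

int main(void)
{
  std::vector<double> fine = {1, 2, 3, 4, 5, 6, 7, 8};
  std::vector<double> coarse = SubBlockSum(4, fine); // {10, 26}
  assert(coarse[0] == 10 && coarse[1] == 26);
  return 0;
}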

* Consider switching std::vector to boost arrays:
      boost::multi_array<type, 3> A()... to replace multi1d, multi2d etc.

*? Cell definition <-> sliceSum.
 ? Cell definition <-> sliceSum.
 ? Replicated arrays.

* Check for missing functionality - partially audited against QDP++ layout

* Optimise the extract/merge SIMD routines; Azusa??
  - I have collated these into a single location at least.
  - Need to use the _mm_*insert/extract routines.

* Conformable test in Cshift routines.

* Gamma/Dirac structures

* Four-spin to two-spin projection

* Broadcast, reduction tests. innerProduct, localInnerProduct

* QDP++ regression suite and comparative benchmark

* NERSC lattice loading, plaquette test

* I/O support
  - MPI IO?
  - BinaryWriter, TextWriter etc.
  - protocol buffers?

// Cartesian grid inheritance
// Grid::GridBase
@@ -7,8 +7,8 @@
//

-#ifndef GRID_V3_H
-#define GRID_V3_H
+#ifndef GRID_H
+#define GRID_H

#include <stdio.h>
#include <complex>
@@ -43,7 +43,7 @@

#include <Grid_aligned_allocator.h>
#include <Grid_simd.h>
-#include <Grid_math_types.h>
+#include <Grid_math.h>
#include <Grid_cartesian.h>
#include <Grid_lattice.h>
#include <Grid_comparison.h>
@@ -1,400 +1,8 @@
#ifndef GRID_CARTESIAN_H
#define GRID_CARTESIAN_H

#include <Grid.h>
#include <Grid_communicator.h>
#include <Grid_cartesian_base.h>
#include <Grid_cartesian_full.h>
#include <Grid_cartesian_red_black.h>

// (The GridBase, GridCartesian and GridRedBlackCartesian class bodies formerly
//  defined in this header are removed; they reappear verbatim in the three new
//  headers below.)

#endif
lib/Grid_cartesian_base.h (new file, 200 lines)
@@ -0,0 +1,200 @@
#ifndef GRID_CARTESIAN_BASE_H
#define GRID_CARTESIAN_BASE_H

#include <Grid.h>
#include <Grid_communicator.h>

namespace Grid{

class GridBase : public CartesianCommunicator {
public:

    // Give Lattice access
    template<class object> friend class Lattice;

    GridBase(std::vector<int> & processor_grid) : CartesianCommunicator(processor_grid) {};

    //FIXME
    // protected:
    // Lattice wide random support. not yet fully implemented. Need seed strategy
    // and one generator per site.
    // std::default_random_engine generator;
    // static std::mt19937 generator( 9 );

    //////////////////////////////////////////////////////////////////////
    // Communicator provides information on the processor grid
    //////////////////////////////////////////////////////////////////////
    // unsigned long _ndimension;
    // std::vector<int> _processors;     // processor grid
    // int              _processor;      // linear processor rank
    // std::vector<int> _processor_coor; // processor coordinate
    //////////////////////////////////////////////////////////////////////

    // Physics Grid information.
    std::vector<int> _simd_layout; // Which dimensions get laid out over simd lanes.
    std::vector<int> _fdimensions; // Global dimensions of array prior to cb removal
    std::vector<int> _gdimensions; // Global dimensions of array after cb removal
    std::vector<int> _ldimensions; // local dimensions of array with processor images removed
    std::vector<int> _rdimensions; // Reduced local dimensions with simd lane images and processor images removed
    std::vector<int> _ostride;     // Outer stride for each dimension
    std::vector<int> _istride;     // Inner stride i.e. within simd lane
    int _osites;                   // _isites*_osites = product(dimensions).
    int _isites;
    std::vector<int> _slice_block; // subslice information
    std::vector<int> _slice_stride;
    std::vector<int> _slice_nblock;

    // Might need these at some point
    // std::vector<int> _lstart; // local start of array in gcoors. _processor_coor[d]*_ldimensions[d]
    // std::vector<int> _lend;   // local end of array in gcoors.  _processor_coor[d]*_ldimensions[d]+_ldimensions[d]-1

public:

    ////////////////////////////////////////////////////////////////
    // Checkerboarding interface is virtual and overridden by
    // GridCartesian / GridRedBlackCartesian
    ////////////////////////////////////////////////////////////////
    virtual int CheckerBoarded(int dim)=0;
    virtual int CheckerBoard(std::vector<int> site)=0;
    virtual int CheckerBoardDestination(int source_cb,int shift)=0;
    virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite)=0;
    inline int CheckerBoardFromOindex (int Oindex){
      std::vector<int> ocoor;
      oCoorFromOindex(ocoor,Oindex);
      int ss=0;
      for(int d=0;d<_ndimension;d++){
        ss=ss+ocoor[d];
      }
      return ss&0x1;
    }

    //////////////////////////////////////////////////////////////////////////////////////////////
    // Local layout calculations
    //////////////////////////////////////////////////////////////////////////////////////////////
    // These routines are key. Subdivide the linearised cartesian index into
    //      "inner" index identifying which simd lane of object<vFcomplex> is associated with coord
    //      "outer" index identifying which element of _odata in class "Lattice" is associated with coord.
    //
    // Compared to, say, Blitz++ we simply need to store BOTH an inner stride and an outer
    // stride per dimension. The cost of evaluating the indexing information is doubled for an n-dimensional
    // coordinate. Note, however, for data parallel operations the "inner" indexing cost is not paid and all
    // lanes are operated upon simultaneously.

    virtual int oIndex(std::vector<int> &coor)
    {
        int idx=0;
        // Works with either global or local coordinates
        for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
        return idx;
    }
    inline int oIndexReduced(std::vector<int> &ocoor)
    {
        int idx=0;
        // ocoor is already reduced so we can eliminate the modulo operation
        // for fast indexing and inline the routine
        for(int d=0;d<_ndimension;d++) idx+=_ostride[d]*ocoor[d];
        return idx;
    }
    inline void oCoorFromOindex (std::vector<int>& coor,int Oindex){
        coor.resize(_ndimension);
        for(int d=0;d<_ndimension;d++){
          coor[d] = Oindex % _rdimensions[d];
          Oindex  = Oindex / _rdimensions[d];
        }
    }

    //////////////////////////////////////////////////////////
    // SIMD lane addressing
    //////////////////////////////////////////////////////////
    inline int iIndex(std::vector<int> &lcoor)
    {
        int idx=0;
        for(int d=0;d<_ndimension;d++) idx+=_istride[d]*(lcoor[d]/_rdimensions[d]);
        return idx;
    }
    inline void iCoorFromIindex(std::vector<int> &coor,int lane)
    {
      coor.resize(_ndimension);
      for(int d=0;d<_ndimension;d++){
        coor[d] = lane % _simd_layout[d];
        lane    = lane / _simd_layout[d];
      }
    }
    inline int PermuteDim(int dimension){
      return _simd_layout[dimension]>1;
    }
    inline int PermuteType(int dimension){
      int permute_type=0;
      for(int d=_ndimension-1;d>dimension;d--){
        if (_simd_layout[d]>1 ) permute_type++;
      }
      return permute_type;
    }

    ////////////////////////////////////////////////////////////////
    // Array sizing queries
    ////////////////////////////////////////////////////////////////

    inline int iSites(void) { return _isites; };
    inline int Nsimd(void)  { return _isites; };// Synonymous with iSites
    inline int oSites(void) { return _osites; };
    inline int lSites(void) { return _isites*_osites; };
    inline int gSites(void) { return _isites*_osites*_Nprocessors; };
    inline int Nd    (void) { return _ndimension;};
    inline const std::vector<int> &FullDimensions(void)         { return _fdimensions;};
    inline const std::vector<int> &GlobalDimensions(void)       { return _gdimensions;};
    inline const std::vector<int> &LocalDimensions(void)        { return _ldimensions;};
    inline const std::vector<int> &VirtualLocalDimensions(void) { return _ldimensions;};

    ////////////////////////////////////////////////////////////////
    // Global addressing
    ////////////////////////////////////////////////////////////////
    void RankIndexToGlobalCoor(int rank, int o_idx, int i_idx , std::vector<int> &gcoor)
    {
      gcoor.resize(_ndimension);
      std::vector<int> coor(_ndimension);

      // global coordinate = processor coordinate * local extent,
      // plus the simd-lane offset, plus the outer offset
      ProcessorCoorFromRank(rank,coor);
      for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = _ldimensions[mu]*coor[mu];

      iCoorFromIindex(coor,i_idx);
      for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += _rdimensions[mu]*coor[mu];

      oCoorFromOindex (coor,o_idx);
      for(int mu=0;mu<_ndimension;mu++) gcoor[mu] += coor[mu];
    }
    void RankIndexCbToFullGlobalCoor(int rank, int o_idx, int i_idx, int cb,std::vector<int> &fcoor)
    {
      RankIndexToGlobalCoor(rank,o_idx,i_idx ,fcoor);
      if(CheckerBoarded(0)){
        fcoor[0] = fcoor[0]*2+cb;
      }
    }
    void ProcessorCoorLocalCoorToGlobalCoor(std::vector<int> &Pcoor,std::vector<int> &Lcoor,std::vector<int> &gcoor)
    {
      gcoor.resize(_ndimension);
      for(int mu=0;mu<_ndimension;mu++) gcoor[mu] = Pcoor[mu]*_ldimensions[mu]+Lcoor[mu];
    }
    void GlobalCoorToProcessorCoorLocalCoor(std::vector<int> &pcoor,std::vector<int> &lcoor,const std::vector<int> &gcoor)
    {
      pcoor.resize(_ndimension);
      lcoor.resize(_ndimension);
      for(int mu=0;mu<_ndimension;mu++){
        pcoor[mu] = gcoor[mu]/_ldimensions[mu];
        lcoor[mu] = gcoor[mu]%_ldimensions[mu];
      }
    }
    void GlobalCoorToRankIndex(int &rank, int &o_idx, int &i_idx ,const std::vector<int> &gcoor)
    {
      std::vector<int> pcoor;
      std::vector<int> lcoor;
      GlobalCoorToProcessorCoorLocalCoor(pcoor,lcoor,gcoor);
      rank = RankFromProcessorCoor(pcoor);
      i_idx= iIndex(lcoor);
      o_idx= oIndex(lcoor);
    }

};
}
#endif
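The inner/outer decomposition above is compact but easy to get wrong. Here is a self-contained round-trip check of the same stride arithmetic on a tiny fixed layout; plain local variables stand in for the GridBase members, with rdims = ldims/simd exactly as in the constructors.

// Standalone check: (oIndex, iIndex) of a local coordinate inverts cleanly.
#include <cassert>
#include <vector>

int main(void)
{
  // Local lattice 8x8, simd layout 2x1 => reduced (outer) grid 4x8, 2 lanes.
  std::vector<int> ldims = {8, 8}, simd = {2, 1}, rdims = {4, 8};
  std::vector<int> ostride = {1, 4}; // _ostride[d] = product of rdims below d
  std::vector<int> istride = {1, 2}; // _istride[d] = product of simd  below d

  for (int y = 0; y < ldims[1]; y++) {
  for (int x = 0; x < ldims[0]; x++) {
    // oIndex / iIndex as in GridBase
    int o = ostride[0]*(x % rdims[0]) + ostride[1]*(y % rdims[1]);
    int i = istride[0]*(x / rdims[0]) + istride[1]*(y / rdims[1]);

    // Invert: oCoorFromOindex / iCoorFromIindex, then recombine
    int ox = o % rdims[0], oy = (o / rdims[0]) % rdims[1];
    int ix = i % simd[0],  iy = (i / simd[0]) % simd[1];
    assert(ox + rdims[0]*ix == x);
    assert(oy + rdims[1]*iy == y);
  }}
  return 0;
}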
lib/Grid_cartesian_full.h (new file, 95 lines)
@@ -0,0 +1,95 @@
#ifndef GRID_CARTESIAN_FULL_H
#define GRID_CARTESIAN_FULL_H

namespace Grid{

/////////////////////////////////////////////////////////////////////////////////////////
// Grid Support.
/////////////////////////////////////////////////////////////////////////////////////////

class GridCartesian: public GridBase {

public:

    virtual int CheckerBoarded(int dim){
      return 0;
    }
    virtual int CheckerBoard(std::vector<int> site){
        return 0;
    }
    virtual int CheckerBoardDestination(int cb,int shift){
        return 0;
    }
    virtual int CheckerBoardShift(int source_cb,int dim,int shift, int osite){
        return shift;
    }
    GridCartesian(std::vector<int> &dimensions,
                  std::vector<int> &simd_layout,
                  std::vector<int> &processor_grid
                  ) : GridBase(processor_grid)
    {
        ///////////////////////
        // Grid information
        ///////////////////////
        _ndimension = dimensions.size();

        _fdimensions.resize(_ndimension);
        _gdimensions.resize(_ndimension);
        _ldimensions.resize(_ndimension);
        _rdimensions.resize(_ndimension);
        _simd_layout.resize(_ndimension);

        _ostride.resize(_ndimension);
        _istride.resize(_ndimension);

        _osites = 1;
        _isites = 1;
        for(int d=0;d<_ndimension;d++){
            _fdimensions[d] = dimensions[d];   // Global dimensions
            _gdimensions[d] = _fdimensions[d]; // Global dimensions
            _simd_layout[d] = simd_layout[d];

            //FIXME check for exact division

            // Use a reduced simd grid
            _ldimensions[d] = _gdimensions[d]/_processors[d];  // local dimensions
            _rdimensions[d] = _ldimensions[d]/_simd_layout[d]; // overdecomposition
            _osites *= _rdimensions[d];
            _isites *= _simd_layout[d];

            // Addressing support
            if ( d==0 ) {
              _ostride[d] = 1;
              _istride[d] = 1;
            } else {
              _ostride[d] = _ostride[d-1]*_rdimensions[d-1];
              _istride[d] = _istride[d-1]*_simd_layout[d-1];
            }
        }

        ///////////////////////
        // subplane information
        ///////////////////////
        _slice_block.resize(_ndimension);
        _slice_stride.resize(_ndimension);
        _slice_nblock.resize(_ndimension);

        int block =1;
        int nblock=1;
        for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];

        for(int d=0;d<_ndimension;d++){
            nblock/=_rdimensions[d];
            _slice_block[d] =block;
            _slice_stride[d]=_ostride[d]*_rdimensions[d];
            _slice_nblock[d]=nblock;
            block = block*_rdimensions[d];
        }

    };
};

}
#endif
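The (_slice_nblock, _slice_stride, _slice_block) triple computed in both constructors encodes how to walk every site of a fixed-coordinate plane in linearised storage. A self-contained sketch of that traversal on a 4x4x4 array follows; this is standalone code mirroring the constructor's arithmetic, not the Grid class itself.

// Standalone sketch: visit all sites with coor[d] == x via nblock/stride/block.
#include <cassert>
#include <vector>

int main(void)
{
  // 3-d reduced grid, lexicographic storage, strides {1, 4, 16}.
  std::vector<int> rdims = {4, 4, 4}, ostride = {1, 4, 16};
  const int d = 1, x = 2; // visit the plane coor[d] == x

  // As in the constructors: block = prod(rdims below d), nblock = prod(rdims above d).
  int block = 1, nblock = 1;
  for (int k = 0;   k < d; k++) block  *= rdims[k];
  for (int k = d+1; k < 3; k++) nblock *= rdims[k];
  int stride = ostride[d]*rdims[d];

  int visited = 0;
  for (int n = 0; n < nblock; n++){
  for (int b = 0; b < block;  b++){
    int site = n*stride + x*ostride[d] + b;
    assert((site / ostride[d]) % rdims[d] == x); // really on the plane
    visited++;
  }}
  assert(visited == 16); // 4x4 sites in the plane
  return 0;
}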
lib/Grid_cartesian_red_black.h (new file, 121 lines)
@@ -0,0 +1,121 @@
#ifndef GRID_CARTESIAN_RED_BLACK_H
#define GRID_CARTESIAN_RED_BLACK_H

namespace Grid {

// Specialise this for red black grids storing half the data like a chess board.
class GridRedBlackCartesian : public GridBase
{
public:
    virtual int CheckerBoarded(int dim){
      if( dim==0) return 1;
      else return 0;
    }
    virtual int CheckerBoard(std::vector<int> site){
      return (site[0]+site[1]+site[2]+site[3])&0x1;
    }

    // Depending on the cb of site, we toggle source cb.
    // for block #b, element #e = (b, e)
    // we need
    virtual int CheckerBoardShift(int source_cb,int dim,int shift,int osite){

      if(dim != 0) return shift;

      int fulldim =_fdimensions[0];
      shift = (shift+fulldim)%fulldim;

      // Probably faster with table lookup;
      // or by looping over x,y,z and multiplying rather than computing the checkerboard.
      int ocb=CheckerBoardFromOindex(osite);

      if ( (source_cb+ocb)&1 ) {
        return (shift)/2;
      } else {
        return (shift+1)/2;
      }
    }

    virtual int CheckerBoardDestination(int source_cb,int shift){
      if ((shift+_fdimensions[0])&0x1) {
        return 1-source_cb;
      } else {
        return source_cb;
      }
    };
    GridRedBlackCartesian(std::vector<int> &dimensions,
                          std::vector<int> &simd_layout,
                          std::vector<int> &processor_grid) : GridBase(processor_grid)
    {
        ///////////////////////
        // Grid information
        ///////////////////////
        _ndimension = dimensions.size();

        _fdimensions.resize(_ndimension);
        _gdimensions.resize(_ndimension);
        _ldimensions.resize(_ndimension);
        _rdimensions.resize(_ndimension);
        _simd_layout.resize(_ndimension);

        _ostride.resize(_ndimension);
        _istride.resize(_ndimension);

        _osites = 1;
        _isites = 1;
        for(int d=0;d<_ndimension;d++){
            _fdimensions[d] = dimensions[d];
            _gdimensions[d] = _fdimensions[d];
            if (d==0) _gdimensions[0] = _gdimensions[0]/2; // Remove a checkerboard
            _ldimensions[d] = _gdimensions[d]/_processors[d];

            // Use a reduced simd grid
            _simd_layout[d] = simd_layout[d];
            _rdimensions[d] = _ldimensions[d]/_simd_layout[d];

            _osites *= _rdimensions[d];
            _isites *= _simd_layout[d];

            // Addressing support
            if ( d==0 ) {
              _ostride[d] = 1;
              _istride[d] = 1;
            } else {
              _ostride[d] = _ostride[d-1]*_rdimensions[d-1];
              _istride[d] = _istride[d-1]*_simd_layout[d-1];
            }
        }

        ////////////////////////////////////////////////////////////////////////////////////////////
        // subplane information
        ////////////////////////////////////////////////////////////////////////////////////////////
        _slice_block.resize(_ndimension);
        _slice_stride.resize(_ndimension);
        _slice_nblock.resize(_ndimension);

        int block =1;
        int nblock=1;
        for(int d=0;d<_ndimension;d++) nblock*=_rdimensions[d];

        for(int d=0;d<_ndimension;d++){
            nblock/=_rdimensions[d];
            _slice_block[d] =block;
            _slice_stride[d]=_ostride[d]*_rdimensions[d];
            _slice_nblock[d]=nblock;
            block = block*_rdimensions[d];
        }

    };
protected:
    virtual int oIndex(std::vector<int> &coor)
    {
        int idx=_ostride[0]*((coor[0]/2)%_rdimensions[0]);
        for(int d=1;d<_ndimension;d++) idx+=_ostride[d]*(coor[d]%_rdimensions[d]);
        return idx;
    };

};

}
#endif
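CheckerBoardShift compresses a full-lattice shift into a half-lattice one, with the halving rounding up or down according to parity. A self-contained check of that parity rule on a one-dimensional even lattice follows; the storage convention (cb = x&1, half-coordinate x/2) is the toy's own, so the up/down assignment differs in detail from the class above.

// Standalone sketch: halved shifts land on the right half-lattice site.
#include <cassert>

int main(void)
{
  const int L = 8; // even 1-d lattice; site x lives at (cb = x&1, half = x/2)
  for (int x = 0; x < L; x++){
  for (int s = 0; s < L; s++){
    int y = (x + s) % L;                      // site we want to read from
    int hshift = (x & 1) ? (s + 1)/2 : s/2;   // halved shift, parity dependent
    assert(y/2 == (x/2 + hshift) % (L/2));    // lands on the right half-site
    assert((y & 1) == ((x + s) & 1));         // destination cb flips iff s is odd
  }}
  return 0;
}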
@@ -1,5 +1,5 @@
-#ifndef _GRID_MPI_CSHIFT_H_
-#define _GRID_MPI_CSHIFT_H_
+#ifndef _GRID_CSHIFT_MPI_H_
+#define _GRID_CSHIFT_MPI_H_

#ifndef MAX
#define MAX(x,y) ((x)>(y)?(x):(y))

@@ -1,5 +1,5 @@
-#ifndef _GRID_NONE_CSHIFT_H_
-#define _GRID_NONE_CSHIFT_H_
+#ifndef _GRID_CSHIFT_NONE_H_
+#define _GRID_CSHIFT_NONE_H_

friend Lattice<vobj> Cshift(Lattice<vobj> &rhs,int dimension,int shift)
{
@@ -1,9 +1,6 @@
#ifndef GRID_LATTICE_H
#define GRID_LATTICE_H

#include "Grid.h"

namespace Grid {

// TODO: Indexing ()
lib/Grid_math.h (new file, 649 lines)
@@ -0,0 +1,649 @@
#ifndef GRID_MATH_H
#define GRID_MATH_H

#include <Grid_math_traits.h>
#include <Grid_math_tensors.h>
#include <Grid_math_arith.h>

//
// Indexing; want to be able to dereference and
// obtain either an lvalue or an rvalue.
//
namespace Grid {

///////////////////////////////////////////////////////////////////////////////////////
// innerProduct Scalar x Scalar -> Scalar
// innerProduct Vector x Vector -> Scalar
// innerProduct Matrix x Matrix -> Scalar
///////////////////////////////////////////////////////////////////////////////////////
template<class l,class r,int N> inline
auto innerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(innerProduct(lhs._internal[0],rhs._internal[0]))>
{
    typedef decltype(innerProduct(lhs._internal[0],rhs._internal[0])) ret_t;
    iScalar<ret_t> ret=zero;
    for(int c1=0;c1<N;c1++){
        ret._internal += innerProduct(lhs._internal[c1],rhs._internal[c1]);
    }
    return ret;
}
template<class l,class r,int N> inline
auto innerProduct (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iScalar<decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0]))>
{
    typedef decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t;
    iScalar<ret_t> ret=zero;
    iScalar<ret_t> tmp;
    for(int c1=0;c1<N;c1++){
    for(int c2=0;c2<N;c2++){
        ret._internal+=innerProduct(lhs._internal[c1][c2],rhs._internal[c1][c2]);
    }}
    return ret;
}
template<class l,class r> inline
auto innerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(innerProduct(lhs._internal,rhs._internal))>
{
    typedef decltype(innerProduct(lhs._internal,rhs._internal)) ret_t;
    iScalar<ret_t> ret;
    ret._internal = innerProduct(lhs._internal,rhs._internal);
    return ret;
}

///////////////////////////////////////////////////////////////////////////////////////
// outerProduct Scalar x Scalar -> Scalar
//              Vector x Vector -> Matrix
///////////////////////////////////////////////////////////////////////////////////////

template<class l,class r,int N> inline
auto outerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iMatrix<decltype(outerProduct(lhs._internal[0],rhs._internal[0])),N>
{
    typedef decltype(outerProduct(lhs._internal[0],rhs._internal[0])) ret_t;
    iMatrix<ret_t,N> ret;
    for(int c1=0;c1<N;c1++){
    for(int c2=0;c2<N;c2++){
        ret._internal[c1][c2] = outerProduct(lhs._internal[c1],rhs._internal[c2]);
    }}
    return ret;
}
template<class l,class r> inline
auto outerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(outerProduct(lhs._internal,rhs._internal))>
{
    typedef decltype(outerProduct(lhs._internal,rhs._internal)) ret_t;
    iScalar<ret_t> ret;
    ret._internal = outerProduct(lhs._internal,rhs._internal);
    return ret;
}

inline ComplexF outerProduct(const ComplexF &l, const ComplexF& r)
{
    return l*r;
}
inline ComplexD outerProduct(const ComplexD &l, const ComplexD& r)
{
    return l*r;
}
inline RealF outerProduct(const RealF &l, const RealF& r)
{
    return l*r;
}
inline RealD outerProduct(const RealD &l, const RealD& r)
{
    return l*r;
}
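To make the shape rules in the two comment banners concrete, a self-contained numeric sketch with plain std::complex 2-vectors. Whether the scalar base case of innerProduct conjugates the left argument is fixed elsewhere (Grid_math_arith.h is not part of this diff), so the conjugating convention below is an assumption; the outerProduct base case shown above is plain l*r.

// Standalone sketch of the reduction/promotion shapes.
#include <cassert>
#include <complex>

typedef std::complex<double> Cplx;

int main(void)
{
  Cplx u[2] = {Cplx(1,2), Cplx(3,0)};
  Cplx v[2] = {Cplx(0,1), Cplx(1,1)};

  // innerProduct: Vector x Vector -> Scalar (assumed conjugate on the left)
  Cplx ip(0,0);
  for (int i = 0; i < 2; i++) ip += std::conj(u[i])*v[i];
  assert(ip == Cplx(5, 4)); // conj(1+2i)*i + conj(3)*(1+i) = (2+i) + (3+3i)

  // outerProduct: Vector x Vector -> Matrix, m[i][j] = u[i]*v[j], no conjugate
  Cplx m[2][2];
  for (int i = 0; i < 2; i++)
    for (int j = 0; j < 2; j++) m[i][j] = u[i]*v[j];
  assert(m[1][0] == Cplx(0, 3)); // 3 * i
  return 0;
}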
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////// CONJ ///////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////

// Conj function for scalar, vector, matrix
template<class vtype> inline iScalar<vtype> conj(const iScalar<vtype>&r)
{
    iScalar<vtype> ret;
    ret._internal = conj(r._internal);
    return ret;
}
template<class vtype,int N> inline iVector<vtype,N> conj(const iVector<vtype,N>&r)
{
    iVector<vtype,N> ret;
    for(int i=0;i<N;i++){
        ret._internal[i] = conj(r._internal[i]);
    }
    return ret;
}
template<class vtype,int N> inline iMatrix<vtype,N> conj(const iMatrix<vtype,N>&r)
{
    iMatrix<vtype,N> ret;
    for(int i=0;i<N;i++){
    for(int j=0;j<N;j++){
        ret._internal[i][j] = conj(r._internal[i][j]);
    }}
    return ret;
}

// Adj function for scalar, vector, matrix
template<class vtype> inline iScalar<vtype> adj(const iScalar<vtype>&r)
{
    iScalar<vtype> ret;
    ret._internal = adj(r._internal);
    return ret;
}
template<class vtype,int N> inline iVector<vtype,N> adj(const iVector<vtype,N>&r)
{
    iVector<vtype,N> ret;
    for(int i=0;i<N;i++){
        ret._internal[i] = adj(r._internal[i]);
    }
    return ret;
}
template<class vtype,int N> inline iMatrix<vtype,N> adj(const iMatrix<vtype,N> &arg)
{
    iMatrix<vtype,N> ret;
    for(int c1=0;c1<N;c1++){
    for(int c2=0;c2<N;c2++){
        ret._internal[c1][c2]=adj(arg._internal[c2][c1]);
    }}
    return ret;
}

/////////////////////////////////////////////////////////////////
// Transpose all indices
/////////////////////////////////////////////////////////////////

inline ComplexD transpose(ComplexD &rhs){ return rhs;}
inline ComplexF transpose(ComplexF &rhs){ return rhs;}
inline RealD transpose(RealD &rhs){ return rhs;}
inline RealF transpose(RealF &rhs){ return rhs;}

template<class vtype,int N>
inline typename std::enable_if<isGridTensor<vtype>::value, iMatrix<vtype,N> >::type
transpose(iMatrix<vtype,N> arg)
{
    iMatrix<vtype,N> ret;
    for(int i=0;i<N;i++){
    for(int j=0;j<N;j++){
        ret._internal[i][j] = transpose(arg._internal[j][i]); // NB recurses
    }}
    return ret;
}
template<class vtype,int N>
inline typename std::enable_if<isGridTensor<vtype>::notvalue, iMatrix<vtype,N> >::type
transpose(iMatrix<vtype,N> arg)
{
    iMatrix<vtype,N> ret;
    for(int i=0;i<N;i++){
    for(int j=0;j<N;j++){
        ret._internal[i][j] = arg._internal[j][i]; // Stop recursion if not a tensor type
    }}
    return ret;
}

template<class vtype>
inline typename std::enable_if<isGridTensor<vtype>::value, iScalar<vtype> >::type
transpose(iScalar<vtype> arg)
{
    iScalar<vtype> ret;
    ret._internal = transpose(arg._internal); // NB recurses
    return ret;
}

template<class vtype>
inline typename std::enable_if<isGridTensor<vtype>::notvalue, iScalar<vtype> >::type
transpose(iScalar<vtype> arg)
{
    iScalar<vtype> ret;
    ret._internal = arg._internal; // NB recursion stops
    return ret;
}

////////////////////////////////////////////////////////////////////////////////////////////
// Transpose a specific index; instructive to compare this style of recursion termination
// to that of adj; which is easier?
////////////////////////////////////////////////////////////////////////////////////////////
template<int Level,class vtype,int N> inline
typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,Level>::value, iMatrix<vtype,N> >::type
transposeIndex (const iMatrix<vtype,N> &arg)
{
    iMatrix<vtype,N> ret;
    for(int i=0;i<N;i++){
    for(int j=0;j<N;j++){
        ret._internal[i][j] = arg._internal[j][i];
    }}
    return ret;
}
// or not
template<int Level,class vtype,int N> inline
typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,Level>::notvalue, iMatrix<vtype,N> >::type
transposeIndex (const iMatrix<vtype,N> &arg)
{
    iMatrix<vtype,N> ret;
    for(int i=0;i<N;i++){
    for(int j=0;j<N;j++){
        ret._internal[i][j] = transposeIndex<Level>(arg._internal[i][j]);
    }}
    return ret;
}
template<int Level,class vtype> inline
typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,Level>::notvalue, iScalar<vtype> >::type
transposeIndex (const iScalar<vtype> &arg)
{
    iScalar<vtype> ret;
    ret._internal=transposeIndex<Level>(arg._internal);
    return ret;
}
template<int Level,class vtype> inline
typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,Level>::value, iScalar<vtype> >::type
transposeIndex (const iScalar<vtype> &arg)
{
    return arg;
}
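The comment above asks which termination style is easier. For comparison, a self-contained miniature of the Level-matched dispatch: a two-level nest and a levelof trait stand in for iMatrix and matchGridTensorIndex (which live in Grid_math_traits.h, not shown in this diff).

// Standalone miniature of transposeIndex<Level>: enable_if picks
// "act at this level" vs "recurse into the elements".
#include <cassert>
#include <type_traits>

struct Mat0 { double e[2][2]; };   // innermost index (level 0)
struct Mat1 { Mat0   e[2][2]; };   // outermost index (level 1)

template<class T> struct levelof;
template<> struct levelof<Mat0> { static const int value = 0; };
template<> struct levelof<Mat1> { static const int value = 1; };

// Index matches: transpose this level, copy the payload whole.
template<int Level,class T>
typename std::enable_if<levelof<T>::value==Level, T>::type
transposeIndex(const T &a)
{
  T r;
  for(int i=0;i<2;i++) for(int j=0;j<2;j++) r.e[i][j]=a.e[j][i];
  return r;
}
// No match: recurse into each element.
template<int Level,class T>
typename std::enable_if<levelof<T>::value!=Level, T>::type
transposeIndex(const T &a)
{
  T r;
  for(int i=0;i<2;i++) for(int j=0;j<2;j++) r.e[i][j]=transposeIndex<Level>(a.e[i][j]);
  return r;
}

int main(void)
{
  Mat1 m;
  for(int i=0;i<2;i++) for(int j=0;j<2;j++)
  for(int k=0;k<2;k++) for(int l=0;l<2;l++)
    m.e[i][j].e[k][l] = 1000*i+100*j+10*k+l;

  Mat1 t0 = transposeIndex<0>(m); // swaps only the inner (k,l) index
  Mat1 t1 = transposeIndex<1>(m); // swaps only the outer (i,j) index
  assert(t0.e[0][1].e[1][0] == m.e[0][1].e[0][1]);
  assert(t1.e[0][1].e[1][0] == m.e[1][0].e[1][0]);
  return 0;
}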
//////////////////////////////////////////////////////////////////
// Traces: both all indices and a specific index
//////////////////////////////////////////////////////////////////

inline ComplexF trace( const ComplexF &arg){ return arg;}
inline ComplexD trace( const ComplexD &arg){ return arg;}
inline RealF trace( const RealF &arg){ return arg;}
inline RealD trace( const RealD &arg){ return arg;}

template<int Level> inline ComplexF traceIndex(const ComplexF arg) { return arg;}
template<int Level> inline ComplexD traceIndex(const ComplexD arg) { return arg;}
template<int Level> inline RealF traceIndex(const RealF arg) { return arg;}
template<int Level> inline RealD traceIndex(const RealD arg) { return arg;}

template<class vtype,int N>
inline auto trace(const iMatrix<vtype,N> &arg) -> iScalar<decltype(trace(arg._internal[0][0]))>
{
    iScalar<decltype( trace(arg._internal[0][0] )) > ret;
    zeroit(ret._internal);
    for(int i=0;i<N;i++){
        ret._internal=ret._internal+trace(arg._internal[i][i]);
    }
    return ret;
}
template<class vtype>
inline auto trace(const iScalar<vtype> &arg) -> iScalar<decltype(trace(arg._internal))>
{
    iScalar<decltype(trace(arg._internal))> ret;
    ret._internal=trace(arg._internal);
    return ret;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////
// Trace specific indices.
////////////////////////////////////////////////////////////////////////////////////////////////////////
/*
template<int Level,class vtype> inline
auto traceIndex(const iScalar<vtype> &arg) -> iScalar<decltype(traceIndex<Level>(arg._internal)) >
{
    iScalar<decltype(traceIndex<Level>(arg._internal))> ret;
    ret._internal = traceIndex<Level>(arg._internal);
    return ret;
}
*/
template<int Level,class vtype> inline auto
traceIndex (const iScalar<vtype> &arg) ->
typename
std::enable_if<matchGridTensorIndex<iScalar<vtype>,Level>::notvalue,
               iScalar<decltype(traceIndex<Level>(arg._internal))> >::type
{
    iScalar<decltype(traceIndex<Level>(arg._internal))> ret;
    ret._internal=traceIndex<Level>(arg._internal);
    return ret;
}
template<int Level,class vtype> inline auto
traceIndex (const iScalar<vtype> &arg) ->
typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,Level>::value,
                        iScalar<vtype> >::type
{
    return arg;
}

// If we hit the right index, return a scalar and trace it with no further recursion
template<int Level,class vtype,int N> inline
auto traceIndex(const iMatrix<vtype,N> &arg) ->
typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,Level>::value, // Index matches
                        iScalar<vtype> >::type                             // return scalar
{
    iScalar<vtype> ret;
    zeroit(ret._internal);
    for(int i=0;i<N;i++){
        ret._internal = ret._internal + arg._internal[i][i];
    }
    return ret;
}

// not this level, so recurse
template<int Level,class vtype,int N> inline
auto traceIndex(const iMatrix<vtype,N> &arg) ->
typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,Level>::notvalue,             // No index match
                        iMatrix<decltype(traceIndex<Level>(arg._internal[0][0])),N> >::type // return matrix
{
    iMatrix<decltype(traceIndex<Level>(arg._internal[0][0])),N> ret;
    for(int i=0;i<N;i++){
    for(int j=0;j<N;j++){
        ret._internal[i][j] = traceIndex<Level>(arg._internal[i][j]);
    }}
    return ret;
}

//////////////////////////////////////////////////////////////////////////////
// Peek on a specific index; returns a scalar in that index, tensor inherits rest
//////////////////////////////////////////////////////////////////////////////
// If we hit the right index, return a scalar with no further recursion

//template<int Level> inline ComplexF peekIndex(const ComplexF arg) { return arg;}
//template<int Level> inline ComplexD peekIndex(const ComplexD arg) { return arg;}
//template<int Level> inline RealF peekIndex(const RealF arg) { return arg;}
//template<int Level> inline RealD peekIndex(const RealD arg) { return arg;}

// Scalar peek, no indices
template<int Level,class vtype> inline
auto peekIndex(const iScalar<vtype> &arg) ->
typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,Level>::value, // Index matches
                        iScalar<vtype> >::type                             // return scalar
{
    return arg;
}
// Vector peek, one index
template<int Level,class vtype,int N> inline
auto peekIndex(const iVector<vtype,N> &arg,int i) ->
typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,Level>::value, // Index matches
                        iScalar<vtype> >::type                               // return scalar
{
    iScalar<vtype> ret;                // return scalar
    ret._internal = arg._internal[i];
    return ret;
}
// Matrix peek, two indices
template<int Level,class vtype,int N> inline
auto peekIndex(const iMatrix<vtype,N> &arg,int i,int j) ->
typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,Level>::value, // Index matches
                        iScalar<vtype> >::type                               // return scalar
{
    iScalar<vtype> ret;                   // return scalar
    ret._internal = arg._internal[i][j];
    return ret;
}
/////////////
|
||||
// No match peek for scalar,vector,matrix must forward on either 0,1,2 args. Must have 9 routines with notvalue
|
||||
/////////////
|
||||
// scalar
|
||||
template<int Level,class vtype> inline
|
||||
auto peekIndex(const iScalar<vtype> &arg) -> // Scalar 0 index
|
||||
  typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,Level>::notvalue,  // Index does NOT match
    iScalar<decltype(peekIndex<Level>(arg._internal))> >::type
{
  iScalar<decltype(peekIndex<Level>(arg._internal))> ret;
  ret._internal = peekIndex<Level>(arg._internal);
  return ret;
}
template<int Level,class vtype> inline
auto peekIndex(const iScalar<vtype> &arg,int i) ->                              // Scalar, 1 index
  typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,Level>::notvalue, // Index does NOT match
    iScalar<decltype(peekIndex<Level>(arg._internal,i))> >::type
{
  iScalar<decltype(peekIndex<Level>(arg._internal,i))> ret;
  ret._internal = peekIndex<Level>(arg._internal,i);
  return ret;
}
template<int Level,class vtype> inline
auto peekIndex(const iScalar<vtype> &arg,int i,int j) ->                        // Scalar, 2 index
  typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,Level>::notvalue, // Index does NOT match
    iScalar<decltype(peekIndex<Level>(arg._internal,i,j))> >::type
{
  iScalar<decltype(peekIndex<Level>(arg._internal,i,j))> ret;
  ret._internal = peekIndex<Level>(arg._internal,i,j);
  return ret;
}

// vector
template<int Level,class vtype,int N> inline
auto peekIndex(const iVector<vtype,N> &arg) ->
  typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,Level>::notvalue, // Index does not match
    iVector<decltype(peekIndex<Level>(arg._internal[0])),N> >::type
{
  iVector<decltype(peekIndex<Level>(arg._internal[0])),N> ret;
  for(int ii=0;ii<N;ii++){
    ret._internal[ii] = peekIndex<Level>(arg._internal[ii]);
  }
  return ret;
}
template<int Level,class vtype,int N> inline
auto peekIndex(const iVector<vtype,N> &arg,int i) ->
  typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,Level>::notvalue, // Index does not match
    iVector<decltype(peekIndex<Level>(arg._internal[0],i)),N> >::type
{
  iVector<decltype(peekIndex<Level>(arg._internal[0],i)),N> ret;
  for(int ii=0;ii<N;ii++){
    ret._internal[ii] = peekIndex<Level>(arg._internal[ii],i);
  }
  return ret;
}
template<int Level,class vtype,int N> inline
auto peekIndex(const iVector<vtype,N> &arg,int i,int j) ->
  typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,Level>::notvalue, // Index does not match
    iVector<decltype(peekIndex<Level>(arg._internal[0],i,j)),N> >::type
{
  iVector<decltype(peekIndex<Level>(arg._internal[0],i,j)),N> ret;
  for(int ii=0;ii<N;ii++){
    ret._internal[ii] = peekIndex<Level>(arg._internal[ii],i,j);
  }
  return ret;
}

// matrix
template<int Level,class vtype,int N> inline
auto peekIndex(const iMatrix<vtype,N> &arg) ->
  typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,Level>::notvalue, // Index does not match
    iMatrix<decltype(peekIndex<Level>(arg._internal[0][0])),N> >::type
{
  iMatrix<decltype(peekIndex<Level>(arg._internal[0][0])),N> ret;
  for(int ii=0;ii<N;ii++){
  for(int jj=0;jj<N;jj++){
    ret._internal[ii][jj] = peekIndex<Level>(arg._internal[ii][jj]); // Could avoid this because peeking a scalar is dumb
  }}
  return ret;
}
template<int Level,class vtype,int N> inline
auto peekIndex(const iMatrix<vtype,N> &arg,int i) ->
  typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,Level>::notvalue, // Index does not match
    iMatrix<decltype(peekIndex<Level>(arg._internal[0],i)),N> >::type
{
  iMatrix<decltype(peekIndex<Level>(arg._internal[0],i)),N> ret;
  for(int ii=0;ii<N;ii++){
  for(int jj=0;jj<N;jj++){
    ret._internal[ii][jj] = peekIndex<Level>(arg._internal[ii][jj],i);
  }}
  return ret;
}
template<int Level,class vtype,int N> inline
auto peekIndex(const iMatrix<vtype,N> &arg,int i,int j) ->
  typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,Level>::notvalue, // Index does not match
    iMatrix<decltype(peekIndex<Level>(arg._internal[0][0],i,j)),N> >::type
{
  iMatrix<decltype(peekIndex<Level>(arg._internal[0][0],i,j)),N> ret;
  for(int ii=0;ii<N;ii++){
  for(int jj=0;jj<N;jj++){
    ret._internal[ii][jj] = peekIndex<Level>(arg._internal[ii][jj],i,j);
  }}
  return ret;
}
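
A usage sketch for orientation (editorial, not part of the commit; the nest and names below are illustrative, not Grid typedefs). The template parameter Level selects which index of the nest the peek applies to:

// Hypothetical two-level nest: spin (iVector, level 2) outside colour (iMatrix, level 1).
void peek_sketch(void) {
  iVector<iMatrix<ComplexD,3>,4> sc;
  auto colourElem = peekIndex<1>(sc,0,1); // iVector<iScalar<ComplexD>,4>: colour (0,1) of every spin
  auto spinSlice  = peekIndex<2>(sc,2);   // iScalar<iMatrix<ComplexD,3> >: spin slot 2, colour intact
}
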
//////////////////////////////////////////////////////////////////////////////
// Poke a specific index
//////////////////////////////////////////////////////////////////////////////

// Scalar poke
template<int Level,class vtype> inline
void pokeIndex(iScalar<vtype> &ret,
               const typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,Level>::value,iScalar<vtype> >::type &arg)
{
  ret._internal = arg._internal;
}
// Vector poke, one index
template<int Level,class vtype,int N> inline
void pokeIndex(iVector<vtype,N> &ret,
               const typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,Level>::value,iScalar<vtype> >::type &arg,int i)
{
  ret._internal[i] = arg._internal;
}
// Matrix poke, two indices
template<int Level,class vtype,int N> inline
void pokeIndex(iMatrix<vtype,N> &ret,
               const typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,Level>::value,iScalar<vtype> >::type &arg,int i,int j)
{
  ret._internal[i][j] = arg._internal;
}

/////////////
// No-match poke for scalar,vector,matrix must forward on either 0,1,2 args. Must have 9 routines with notvalue
/////////////
// scalar
template<int Level,class vtype> inline
void pokeIndex(iScalar<vtype> &ret,
               const typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,Level>::notvalue,iScalar<decltype(peekIndex<Level>(ret._internal))> >::type &arg)
{
  pokeIndex<Level>(ret._internal,arg._internal);
}
template<int Level,class vtype> inline
void pokeIndex(iScalar<vtype> &ret,
               const typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,Level>::notvalue,iScalar<decltype(peekIndex<Level>(ret._internal,0))> >::type &arg,
               int i)
{
  pokeIndex<Level>(ret._internal,arg._internal,i);
}
template<int Level,class vtype> inline
void pokeIndex(iScalar<vtype> &ret,
               const typename std::enable_if<matchGridTensorIndex<iScalar<vtype>,Level>::notvalue,iScalar<decltype(peekIndex<Level>(ret._internal,0,0))> >::type &arg,
               int i,int j)
{
  pokeIndex<Level>(ret._internal,arg._internal,i,j);
}

// Vector
template<int Level,class vtype,int N> inline
void pokeIndex(iVector<vtype,N> &ret,
               const typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,Level>::notvalue,iVector<decltype(peekIndex<Level>(ret._internal[0])),N> >::type &arg)
{
  for(int ii=0;ii<N;ii++){
    pokeIndex<Level>(ret._internal[ii],arg._internal[ii]);
  }
}
template<int Level,class vtype,int N> inline
void pokeIndex(iVector<vtype,N> &ret,
               const typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,Level>::notvalue,iVector<decltype(peekIndex<Level>(ret._internal[0],0)),N> >::type &arg,
               int i)
{
  for(int ii=0;ii<N;ii++){
    pokeIndex<Level>(ret._internal[ii],arg._internal[ii],i);
  }
}
template<int Level,class vtype,int N> inline
void pokeIndex(iVector<vtype,N> &ret,
               const typename std::enable_if<matchGridTensorIndex<iVector<vtype,N>,Level>::notvalue,iVector<decltype(peekIndex<Level>(ret._internal[0],0,0)),N> >::type &arg,
               int i,int j)
{
  for(int ii=0;ii<N;ii++){
    pokeIndex<Level>(ret._internal[ii],arg._internal[ii],i,j);
  }
}

// Matrix
template<int Level,class vtype,int N> inline
void pokeIndex(iMatrix<vtype,N> &ret,
               const typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,Level>::notvalue,iMatrix<decltype(peekIndex<Level>(ret._internal[0][0])),N> >::type &arg)
{
  for(int ii=0;ii<N;ii++){
  for(int jj=0;jj<N;jj++){
    pokeIndex<Level>(ret._internal[ii][jj],arg._internal[ii][jj]);
  }}
}
template<int Level,class vtype,int N> inline
void pokeIndex(iMatrix<vtype,N> &ret,
               const typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,Level>::notvalue,iMatrix<decltype(peekIndex<Level>(ret._internal[0][0],0)),N> >::type &arg,
               int i)
{
  for(int ii=0;ii<N;ii++){
  for(int jj=0;jj<N;jj++){
    pokeIndex<Level>(ret._internal[ii][jj],arg._internal[ii][jj],i);
  }}
}
template<int Level,class vtype,int N> inline
void pokeIndex(iMatrix<vtype,N> &ret,
               const typename std::enable_if<matchGridTensorIndex<iMatrix<vtype,N>,Level>::notvalue,iMatrix<decltype(peekIndex<Level>(ret._internal[0][0],0,0)),N> >::type &arg,
               int i,int j)
{
  for(int ii=0;ii<N;ii++){
  for(int jj=0;jj<N;jj++){
    pokeIndex<Level>(ret._internal[ii][jj],arg._internal[ii][jj],i,j);
  }}
}
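
A matching sketch for poke (editorial, not part of the commit): pokeIndex writes back exactly what peekIndex extracts at the same Level, so the pair round-trips:

void poke_sketch(void) {
  iVector<iMatrix<ComplexD,3>,4> sc;    // illustrative nest, as above
  auto slice = peekIndex<2>(sc,1);      // iScalar<iMatrix<ComplexD,3> >
  pokeIndex<2>(sc,slice,3);             // copy spin slot 1 into spin slot 3
}
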
/////////////////////////////////////////////////////////////////
// Can only take the real/imag part of scalar objects, since
// lattice objects of different complex nature are non-conformable.
/////////////////////////////////////////////////////////////////
template<class itype> inline auto real(const iScalar<itype> &z) -> iScalar<decltype(real(z._internal))>
{
  iScalar<decltype(real(z._internal))> ret;
  ret._internal = real(z._internal);
  return ret;
}
template<class itype,int N> inline auto real(const iMatrix<itype,N> &z) -> iMatrix<decltype(real(z._internal[0][0])),N>
{
  iMatrix<decltype(real(z._internal[0][0])),N> ret;
  for(int c1=0;c1<N;c1++){
  for(int c2=0;c2<N;c2++){
    ret._internal[c1][c2] = real(z._internal[c1][c2]);
  }}
  return ret;
}
template<class itype,int N> inline auto real(const iVector<itype,N> &z) -> iVector<decltype(real(z._internal[0])),N>
{
  iVector<decltype(real(z._internal[0])),N> ret;
  for(int c1=0;c1<N;c1++){
    ret._internal[c1] = real(z._internal[c1]);
  }
  return ret;
}

template<class itype> inline auto imag(const iScalar<itype> &z) -> iScalar<decltype(imag(z._internal))>
{
  iScalar<decltype(imag(z._internal))> ret;
  ret._internal = imag(z._internal);
  return ret;
}
template<class itype,int N> inline auto imag(const iMatrix<itype,N> &z) -> iMatrix<decltype(imag(z._internal[0][0])),N>
{
  iMatrix<decltype(imag(z._internal[0][0])),N> ret;
  for(int c1=0;c1<N;c1++){
  for(int c2=0;c2<N;c2++){
    ret._internal[c1][c2] = imag(z._internal[c1][c2]);
  }}
  return ret;
}
template<class itype,int N> inline auto imag(const iVector<itype,N> &z) -> iVector<decltype(imag(z._internal[0])),N>
{
  iVector<decltype(imag(z._internal[0])),N> ret;
  for(int c1=0;c1<N;c1++){
    ret._internal[c1] = imag(z._internal[c1]);
  }
  return ret;
}

}

#endif
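
A short sketch of the real/imag recursion (editorial, not part of the commit): the part is taken element by element, so the real part of a matrix is again a matrix of the same rank:

void real_imag_sketch(void) {
  iMatrix<ComplexD,3> m;
  auto re = real(m);   // iMatrix of real parts
  auto im = imag(m);   // iMatrix of imaginary parts
}
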
745  lib/Grid_math_arith.h  Normal file
@@ -0,0 +1,745 @@
#ifndef GRID_MATH_ARITH_H
#define GRID_MATH_ARITH_H

namespace Grid {

///////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////   ADD   ///////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////

// ADD is simple for now; cannot mix types, so the templates are straightforward
// Scalar +/- Scalar
// Vector +/- Vector
// Matrix +/- Matrix
template<class vtype,class ltype,class rtype> inline void add(iScalar<vtype> * __restrict__ ret,
                                                              const iScalar<ltype> * __restrict__ lhs,
                                                              const iScalar<rtype> * __restrict__ rhs)
{
  add(&ret->_internal,&lhs->_internal,&rhs->_internal);
}
template<class vtype,class ltype,class rtype,int N> inline void add(iVector<vtype,N> * __restrict__ ret,
                                                                    const iVector<ltype,N> * __restrict__ lhs,
                                                                    const iVector<rtype,N> * __restrict__ rhs)
{
  for(int c=0;c<N;c++){
    ret->_internal[c] = lhs->_internal[c]+rhs->_internal[c];
  }
  return;
}

template<class vtype,class ltype,class rtype, int N> inline void add(iMatrix<vtype,N> * __restrict__ ret,
                                                                     const iMatrix<ltype,N> * __restrict__ lhs,
                                                                     const iMatrix<rtype,N> * __restrict__ rhs)
{
  for(int c2=0;c2<N;c2++){
  for(int c1=0;c1<N;c1++){
    add(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal[c1][c2]);
  }}
  return;
}
template<class vtype,class ltype,class rtype, int N> inline void add(iMatrix<vtype,N> * __restrict__ ret,
                                                                     const iScalar<ltype> * __restrict__ lhs,
                                                                     const iMatrix<rtype,N> * __restrict__ rhs)
{
  for(int c2=0;c2<N;c2++){
  for(int c1=0;c1<N;c1++){
    if ( c1==c2)                // scalar acts as a multiple of the identity
      add(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]);
    else
      ret->_internal[c1][c2] = rhs->_internal[c1][c2];
  }}
  return;
}
template<class vtype,class ltype,class rtype, int N> inline void add(iMatrix<vtype,N> * __restrict__ ret,
                                                                     const iMatrix<ltype,N> * __restrict__ lhs,
                                                                     const iScalar<rtype> * __restrict__ rhs)
{
  for(int c2=0;c2<N;c2++){
  for(int c1=0;c1<N;c1++){
    if ( c1==c2)
      add(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal);
    else
      ret->_internal[c1][c2] = lhs->_internal[c1][c2];
  }}
  return;
}

// Need to figure out multi-precision.
template<class Mytype> Mytype timesI(Mytype &r)
{
  iScalar<Complex> i;
  i._internal = Complex(0,1);
  return r*i;
}

// + operator for scalar, vector, matrix
template<class ltype,class rtype>
//inline auto operator + (iScalar<ltype>& lhs,iScalar<rtype>&& rhs) -> iScalar<decltype(lhs._internal + rhs._internal)>
inline auto operator + (const iScalar<ltype>& lhs,const iScalar<rtype>& rhs) -> iScalar<decltype(lhs._internal + rhs._internal)>
{
  typedef iScalar<decltype(lhs._internal+rhs._internal)> ret_t;
  ret_t ret;
  add(&ret,&lhs,&rhs);
  return ret;
}
template<class ltype,class rtype,int N>
inline auto operator + (const iVector<ltype,N>& lhs,const iVector<rtype,N>& rhs) -> iVector<decltype(lhs._internal[0]+rhs._internal[0]),N>
{
  typedef iVector<decltype(lhs._internal[0]+rhs._internal[0]),N> ret_t;
  ret_t ret;
  add(&ret,&lhs,&rhs);
  return ret;
}
template<class ltype,class rtype,int N>
inline auto operator + (const iMatrix<ltype,N>& lhs,const iMatrix<rtype,N>& rhs) -> iMatrix<decltype(lhs._internal[0][0]+rhs._internal[0][0]),N>
{
  typedef iMatrix<decltype(lhs._internal[0][0]+rhs._internal[0][0]),N> ret_t;
  ret_t ret;
  add(&ret,&lhs,&rhs);
  return ret;
}
template<class ltype,class rtype,int N>
inline auto operator + (const iScalar<ltype>& lhs,const iMatrix<rtype,N>& rhs) -> iMatrix<decltype(lhs._internal+rhs._internal[0][0]),N>
{
  typedef iMatrix<decltype(lhs._internal+rhs._internal[0][0]),N> ret_t;
  ret_t ret;
  add(&ret,&lhs,&rhs);
  return ret;
}

template<class ltype,class rtype,int N>
inline auto operator + (const iMatrix<ltype,N>& lhs,const iScalar<rtype>& rhs) -> iMatrix<decltype(lhs._internal[0][0]+rhs._internal),N>
{
  typedef iMatrix<decltype(lhs._internal[0][0]+rhs._internal),N> ret_t;
  ret_t ret;
  add(&ret,&lhs,&rhs);
  return ret;
}
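
An editorial sketch (not part of the commit) of the mixed add semantics above: the iScalar operand is treated as a multiple of the identity, so only the diagonal is shifted:

void add_sketch(void) {
  iMatrix<ComplexD,3> M;
  iScalar<ComplexD>   s;
  auto A = M + s;   // A[i][j] = M[i][j] + (i==j ? s : 0)
  auto B = s + M;   // the commuted form shifts the same diagonal
}
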
///////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////   SUB   ///////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////

// SUB is simple for now; cannot mix types, so the templates are straightforward
// Scalar +/- Scalar
// Vector +/- Vector
// Matrix +/- Matrix
// Matrix +/- Scalar
template<class vtype,class ltype,class rtype> inline void sub(iScalar<vtype> * __restrict__ ret,
                                                              const iScalar<ltype> * __restrict__ lhs,
                                                              const iScalar<rtype> * __restrict__ rhs)
{
  sub(&ret->_internal,&lhs->_internal,&rhs->_internal);
}

template<class vtype,class ltype,class rtype,int N> inline void sub(iVector<vtype,N> * __restrict__ ret,
                                                                    const iVector<ltype,N> * __restrict__ lhs,
                                                                    const iVector<rtype,N> * __restrict__ rhs)
{
  for(int c=0;c<N;c++){
    ret->_internal[c] = lhs->_internal[c]-rhs->_internal[c];
  }
  return;
}
template<class vtype,class ltype,class rtype, int N> inline void sub(iMatrix<vtype,N> * __restrict__ ret,
                                                                     const iMatrix<ltype,N> * __restrict__ lhs,
                                                                     const iMatrix<rtype,N> * __restrict__ rhs){
  for(int c2=0;c2<N;c2++){
  for(int c1=0;c1<N;c1++){
    sub(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal[c1][c2]);
  }}
  return;
}
template<class vtype,class ltype,class rtype, int N> inline void sub(iMatrix<vtype,N> * __restrict__ ret,
                                                                     const iScalar<ltype> * __restrict__ lhs,
                                                                     const iMatrix<rtype,N> * __restrict__ rhs){
  for(int c2=0;c2<N;c2++){
  for(int c1=0;c1<N;c1++){
    if ( c1==c2) {              // scalar acts as a multiple of the identity
      sub(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]);
    } else {
      // Fails -- need unary minus. Catalogue other unops?
      ret->_internal[c1][c2] = zero;
      ret->_internal[c1][c2] = ret->_internal[c1][c2]-rhs->_internal[c1][c2];
    }
  }}
  return;
}
template<class vtype,class ltype,class rtype, int N> inline void sub(iMatrix<vtype,N> * __restrict__ ret,
                                                                     const iMatrix<ltype,N> * __restrict__ lhs,
                                                                     const iScalar<rtype> * __restrict__ rhs){
  for(int c2=0;c2<N;c2++){
  for(int c1=0;c1<N;c1++){
    if ( c1==c2)
      sub(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal);
    else
      ret->_internal[c1][c2] = lhs->_internal[c1][c2];
  }}
  return;
}

template<class v> void vprefetch(const iScalar<v> &vv)
{
  vprefetch(vv._internal);
}
template<class v,int N> void vprefetch(const iVector<v,N> &vv)
{
  for(int i=0;i<N;i++){
    vprefetch(vv._internal[i]);
  }
}
template<class v,int N> void vprefetch(const iMatrix<v,N> &vv)
{
  for(int i=0;i<N;i++){
  for(int j=0;j<N;j++){
    vprefetch(vv._internal[i][j]);
  }}
}

// - operator for scalar, vector, matrix
template<class ltype,class rtype> inline auto
operator - (const iScalar<ltype>& lhs, const iScalar<rtype>& rhs) -> iScalar<decltype(lhs._internal - rhs._internal)>
{
  typedef iScalar<decltype(lhs._internal-rhs._internal)> ret_t;
  ret_t ret;
  sub(&ret,&lhs,&rhs);
  return ret;
}
template<class ltype,class rtype,int N>
inline auto operator - (const iVector<ltype,N>& lhs,const iVector<rtype,N>& rhs) -> iVector<decltype(lhs._internal[0]-rhs._internal[0]),N>
{
  typedef iVector<decltype(lhs._internal[0]-rhs._internal[0]),N> ret_t;
  ret_t ret;
  sub(&ret,&lhs,&rhs);
  return ret;
}
template<class ltype,class rtype,int N>
inline auto operator - (const iMatrix<ltype,N>& lhs,const iMatrix<rtype,N>& rhs) -> iMatrix<decltype(lhs._internal[0][0]-rhs._internal[0][0]),N>
{
  typedef iMatrix<decltype(lhs._internal[0][0]-rhs._internal[0][0]),N> ret_t;
  ret_t ret;
  sub(&ret,&lhs,&rhs);
  return ret;
}
template<class ltype,class rtype,int N>
inline auto operator - (const iScalar<ltype>& lhs,const iMatrix<rtype,N>& rhs) -> iMatrix<decltype(lhs._internal-rhs._internal[0][0]),N>
{
  typedef iMatrix<decltype(lhs._internal-rhs._internal[0][0]),N> ret_t;
  ret_t ret;
  sub(&ret,&lhs,&rhs);
  return ret;
}
template<class ltype,class rtype,int N>
inline auto operator - (const iMatrix<ltype,N>& lhs,const iScalar<rtype>& rhs) -> iMatrix<decltype(lhs._internal[0][0]-rhs._internal),N>
{
  typedef iMatrix<decltype(lhs._internal[0][0]-rhs._internal),N> ret_t;
  ret_t ret;
  sub(&ret,&lhs,&rhs);
  return ret;
}
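
Likewise for subtraction (editorial sketch, not part of the commit): iScalar - iMatrix behaves as s*1 - M, with the off-diagonal sign flip built from "zero" because unary minus is not assumed on the element type (see the comment in sub() above):

void sub_sketch(void) {
  iMatrix<ComplexD,3> M;
  iScalar<ComplexD>   s;
  auto D = s - M;   // D[i][j] = (i==j ? s : 0) - M[i][j]
}
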
///////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////   MAC   ///////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////

///////////////////////////
// Legal multiplication table
///////////////////////////
// scal x scal = scal
// mat  x mat  = mat
// mat  x scal = mat
// scal x mat  = mat
// mat  x vec  = vec
// vec  x scal = vec
// scal x vec  = vec
///////////////////////////
template<class rtype,class vtype,class mtype>
inline void mac(iScalar<rtype> * __restrict__ ret,const iScalar<vtype> * __restrict__ lhs,const iScalar<mtype> * __restrict__ rhs)
{
  mac(&ret->_internal,&lhs->_internal,&rhs->_internal);
}
template<class rrtype,class ltype,class rtype,int N>
inline void mac(iMatrix<rrtype,N> * __restrict__ ret,const iMatrix<ltype,N> * __restrict__ lhs,const iMatrix<rtype,N> * __restrict__ rhs){
  for(int c2=0;c2<N;c2++){
  for(int c1=0;c1<N;c1++){
  for(int c3=0;c3<N;c3++){
    mac(&ret->_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]);
  }}}
  return;
}
template<class rrtype,class ltype,class rtype,int N>
inline void mac(iMatrix<rrtype,N> * __restrict__ ret,const iMatrix<ltype,N> * __restrict__ lhs,const iScalar<rtype> * __restrict__ rhs){
  for(int c1=0;c1<N;c1++){
  for(int c2=0;c2<N;c2++){
    mac(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal);
  }}
  return;
}
template<class rrtype,class ltype,class rtype,int N>
inline void mac(iMatrix<rrtype,N> * __restrict__ ret,const iScalar<ltype> * __restrict__ lhs,const iMatrix<rtype,N> * __restrict__ rhs){
  for(int c1=0;c1<N;c1++){
  for(int c2=0;c2<N;c2++){
    mac(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]);
  }}
  return;
}
template<class rrtype,class ltype,class rtype,int N>
inline void mac(iVector<rrtype,N> * __restrict__ ret,const iMatrix<ltype,N> * __restrict__ lhs,const iVector<rtype,N> * __restrict__ rhs)
{
  for(int c1=0;c1<N;c1++){
  for(int c2=0;c2<N;c2++){
    mac(&ret->_internal[c1],&lhs->_internal[c1][c2],&rhs->_internal[c2]);
  }}
  return;
}
template<class rrtype,class ltype,class rtype,int N>
inline void mac(iVector<rrtype,N> * __restrict__ ret,const iScalar<ltype> * __restrict__ lhs,const iVector<rtype,N> * __restrict__ rhs)
{
  for(int c1=0;c1<N;c1++){
    mac(&ret->_internal[c1],&lhs->_internal,&rhs->_internal[c1]);
  }
  return;
}
template<class rrtype,class ltype,class rtype,int N>
inline void mac(iVector<rrtype,N> * __restrict__ ret,const iVector<ltype,N> * __restrict__ lhs,const iScalar<rtype> * __restrict__ rhs)
{
  for(int c1=0;c1<N;c1++){
    mac(&ret->_internal[c1],&lhs->_internal[c1],&rhs->_internal);
  }
  return;
}
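
An editorial sketch of mac (not part of the commit; assumes a leaf-level mac for the element type is supplied with the SIMD routines): mac accumulates ret += lhs*rhs, recursing down the nest per the table above:

void mac_sketch(void) {
  iMatrix<ComplexD,3> A,B,C;
  zeroit(C);        // mac accumulates, so C must start defined
  mac(&C,&A,&B);    // C += A*B
}
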
///////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////   MUL   ///////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////

template<class rtype,class vtype,class mtype>
inline void mult(iScalar<rtype> * __restrict__ ret,const iScalar<mtype> * __restrict__ lhs,const iScalar<vtype> * __restrict__ rhs){
  mult(&ret->_internal,&lhs->_internal,&rhs->_internal);
}

template<class rrtype,class ltype,class rtype,int N>
inline void mult(iMatrix<rrtype,N> * __restrict__ ret,const iMatrix<ltype,N> * __restrict__ lhs,const iMatrix<rtype,N> * __restrict__ rhs){
  for(int c2=0;c2<N;c2++){
  for(int c1=0;c1<N;c1++){
    mult(&ret->_internal[c1][c2],&lhs->_internal[c1][0],&rhs->_internal[0][c2]);
    for(int c3=1;c3<N;c3++){
      mac(&ret->_internal[c1][c2],&lhs->_internal[c1][c3],&rhs->_internal[c3][c2]);
    }
  }}
  return;
}
template<class rrtype,class ltype,class rtype,int N>
inline void mult(iMatrix<rrtype,N> * __restrict__ ret,const iMatrix<ltype,N> * __restrict__ lhs,const iScalar<rtype> * __restrict__ rhs){
  for(int c2=0;c2<N;c2++){
  for(int c1=0;c1<N;c1++){
    mult(&ret->_internal[c1][c2],&lhs->_internal[c1][c2],&rhs->_internal);
  }}
  return;
}

template<class rrtype,class ltype,class rtype, int N>
inline void mult(iMatrix<rrtype,N> * __restrict__ ret,const iScalar<ltype> * __restrict__ lhs,const iMatrix<rtype,N> * __restrict__ rhs){
  for(int c2=0;c2<N;c2++){
  for(int c1=0;c1<N;c1++){
    mult(&ret->_internal[c1][c2],&lhs->_internal,&rhs->_internal[c1][c2]);
  }}
  return;
}
// Matrix left multiplies vector
template<class rtype,class vtype,class mtype,int N>
inline void mult(iVector<rtype,N> * __restrict__ ret,const iMatrix<mtype,N> * __restrict__ lhs,const iVector<vtype,N> * __restrict__ rhs)
{
  for(int c1=0;c1<N;c1++){
    mult(&ret->_internal[c1],&lhs->_internal[c1][0],&rhs->_internal[0]);
    for(int c2=1;c2<N;c2++){
      mac(&ret->_internal[c1],&lhs->_internal[c1][c2],&rhs->_internal[c2]);
    }
  }
  return;
}
template<class rtype,class vtype,class mtype,int N>
inline void mult(iVector<rtype,N> * __restrict__ ret,
                 const iScalar<mtype> * __restrict__ lhs,
                 const iVector<vtype,N> * __restrict__ rhs){
  for(int c1=0;c1<N;c1++){
    mult(&ret->_internal[c1],&lhs->_internal,&rhs->_internal[c1]);
  }
}
template<class rtype,class vtype,class mtype,int N>
inline void mult(iVector<rtype,N> * __restrict__ ret,
                 const iVector<vtype,N> * __restrict__ rhs,
                 const iScalar<mtype> * __restrict__ lhs){
  mult(ret,lhs,rhs);   // forward vector x scalar to the scalar-first form
}

// NB: rtype below is not deducible from the arguments, so these three operators
// are superseded by the decltype forms further down.
template<class rtype,class vtype,class mtype,int N> inline
iVector<rtype,N> operator * (const iMatrix<mtype,N>& lhs,const iVector<vtype,N>& rhs)
{
  iVector<rtype,N> ret;
  mult(&ret,&lhs,&rhs);
  return ret;
}

template<class rtype,class vtype,class mtype,int N> inline
iVector<rtype,N> operator * (const iScalar<mtype>& lhs,const iVector<vtype,N>& rhs)
{
  iVector<rtype,N> ret;
  mult(&ret,&lhs,&rhs);
  return ret;
}

template<class rtype,class vtype,class mtype,int N> inline
iVector<rtype,N> operator * (const iVector<mtype,N>& lhs,const iScalar<vtype>& rhs)
{
  iVector<rtype,N> ret;
  mult(&ret,&lhs,&rhs);
  return ret;
}

//////////////////////////////////////////////////////////////////
// Glue operators to mult routines. Must resolve return type cleverly from typeof(internal)
// since nesting matrix<scalar> x matrix<matrix> -> matrix<matrix>
// while   matrix<scalar> x matrix<scalar> -> matrix<scalar>
// so the return type depends on the argument types in a nasty way.
//////////////////////////////////////////////////////////////////
// scal x scal = scal
// mat  x mat  = mat
// mat  x scal = mat
// scal x mat  = mat
// mat  x vec  = vec
// vec  x scal = vec
// scal x vec  = vec
//
// We can special case scalar_type ??
template<class l,class r>
inline auto operator * (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(lhs._internal * rhs._internal)>
{
  typedef iScalar<decltype(lhs._internal*rhs._internal)> ret_t;
  ret_t ret;
  mult(&ret,&lhs,&rhs);
  return ret;
}
template<class l,class r,int N> inline
auto operator * (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iMatrix<decltype(lhs._internal[0][0]*rhs._internal[0][0]),N>
{
  typedef decltype(lhs._internal[0][0]*rhs._internal[0][0]) ret_t;
  iMatrix<ret_t,N> ret;
  mult(&ret,&lhs,&rhs);
  return ret;
}
template<class l,class r, int N> inline
auto operator * (const iMatrix<r,N>& lhs,const iScalar<l>& rhs) -> iMatrix<decltype(lhs._internal[0][0]*rhs._internal),N>
{
  typedef decltype(lhs._internal[0][0]*rhs._internal) ret_t;

  iMatrix<ret_t,N> ret;
  for(int c1=0;c1<N;c1++){
  for(int c2=0;c2<N;c2++){
    mult(&ret._internal[c1][c2],&lhs._internal[c1][c2],&rhs._internal);
  }}
  return ret;
}
template<class l,class r,int N> inline
auto operator * (const iScalar<l>& lhs,const iMatrix<r,N>& rhs) -> iMatrix<decltype(lhs._internal*rhs._internal[0][0]),N>
{
  typedef decltype(lhs._internal*rhs._internal[0][0]) ret_t;
  iMatrix<ret_t,N> ret;
  for(int c1=0;c1<N;c1++){
  for(int c2=0;c2<N;c2++){
    mult(&ret._internal[c1][c2],&lhs._internal,&rhs._internal[c1][c2]);
  }}
  return ret;
}
template<class l,class r,int N> inline
auto operator * (const iMatrix<l,N>& lhs,const iVector<r,N>& rhs) -> iVector<decltype(lhs._internal[0][0]*rhs._internal[0]),N>
{
  typedef decltype(lhs._internal[0][0]*rhs._internal[0]) ret_t;
  iVector<ret_t,N> ret;
  for(int c1=0;c1<N;c1++){
    mult(&ret._internal[c1],&lhs._internal[c1][0],&rhs._internal[0]);
    for(int c2=1;c2<N;c2++){
      mac(&ret._internal[c1],&lhs._internal[c1][c2],&rhs._internal[c2]);
    }
  }
  return ret;
}
template<class l,class r,int N> inline
auto operator * (const iScalar<l>& lhs,const iVector<r,N>& rhs) -> iVector<decltype(lhs._internal*rhs._internal[0]),N>
{
  typedef decltype(lhs._internal*rhs._internal[0]) ret_t;
  iVector<ret_t,N> ret;
  for(int c1=0;c1<N;c1++){
    mult(&ret._internal[c1],&lhs._internal,&rhs._internal[c1]);
  }
  return ret;
}
template<class l,class r,int N> inline
auto operator * (const iVector<l,N>& lhs,const iScalar<r>& rhs) -> iVector<decltype(lhs._internal[0]*rhs._internal),N>
{
  typedef decltype(lhs._internal[0]*rhs._internal) ret_t;
  iVector<ret_t,N> ret;
  for(int c1=0;c1<N;c1++){
    mult(&ret._internal[c1],&lhs._internal[c1],&rhs._internal);
  }
  return ret;
}
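
An editorial sketch (not part of the commit) of the return-type deduction these glue operators perform: the result type follows the element product, so nesting resolves automatically:

void mult_sketch(void) {
  iMatrix<ComplexD,4>            Ms;  // matrix of scalars
  iMatrix<iMatrix<ComplexD,3>,4> Mm;  // matrix of matrices
  auto P = Ms * Ms;  // iMatrix<ComplexD,4>
  auto Q = Mm * Mm;  // iMatrix<iMatrix<ComplexD,3>,4>
}
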
//////////////////////////////////////////////////////////////////////////////////////////
// Must support native C++ types Integer, Complex, Real
//////////////////////////////////////////////////////////////////////////////////////////

// multiplication by fundamental scalar type
template<class l> inline iScalar<l> operator * (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs)
{
  typename iScalar<l>::tensor_reduced srhs(rhs);
  return lhs*srhs;
}
template<class l> inline iScalar<l> operator * (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) { return rhs*lhs; }

template<class l,int N> inline iVector<l,N> operator * (const iVector<l,N>& lhs,const typename iScalar<l>::scalar_type rhs)
{
  typename iVector<l,N>::tensor_reduced srhs(rhs);
  return lhs*srhs;
}
template<class l,int N> inline iVector<l,N> operator * (const typename iScalar<l>::scalar_type lhs,const iVector<l,N>& rhs) { return rhs*lhs; }

template<class l,int N> inline iMatrix<l,N> operator * (const iMatrix<l,N>& lhs,const typename iScalar<l>::scalar_type &rhs)
{
  typename iMatrix<l,N>::tensor_reduced srhs(rhs);
  return lhs*srhs;
}
template<class l,int N> inline iMatrix<l,N> operator * (const typename iScalar<l>::scalar_type & lhs,const iMatrix<l,N>& rhs) { return rhs*lhs; }

////////////////////////////////////////////////////////////////////
// Double support; cast to "scalar_type" through constructor
////////////////////////////////////////////////////////////////////
template<class l> inline iScalar<l> operator * (const iScalar<l>& lhs,double rhs)
{
  typename iScalar<l>::scalar_type t(rhs);
  typename iScalar<l>::tensor_reduced srhs(t);
  return lhs*srhs;
}
template<class l> inline iScalar<l> operator * (double lhs,const iScalar<l>& rhs) { return rhs*lhs; }

template<class l,int N> inline iVector<l,N> operator * (const iVector<l,N>& lhs,double rhs)
{
  typename iScalar<l>::scalar_type t(rhs);
  typename iScalar<l>::tensor_reduced srhs(t);
  return lhs*srhs;
}
template<class l,int N> inline iVector<l,N> operator * (double lhs,const iVector<l,N>& rhs) { return rhs*lhs; }

template<class l,int N> inline iMatrix<l,N> operator * (const iMatrix<l,N>& lhs,double rhs)
{
  typename iScalar<l>::scalar_type t(rhs);
  typename iScalar<l>::tensor_reduced srhs(t);
  return lhs*srhs;
}
template<class l,int N> inline iMatrix<l,N> operator * (double lhs,const iMatrix<l,N>& rhs) { return rhs*lhs; }

////////////////////////////////////////////////////////////////////
// Complex support; cast to "scalar_type" through constructor
////////////////////////////////////////////////////////////////////
template<class l> inline iScalar<l> operator * (const iScalar<l>& lhs,ComplexD rhs)
{
  typename iScalar<l>::scalar_type t(rhs);
  typename iScalar<l>::tensor_reduced srhs(t);
  return lhs*srhs;
}
template<class l> inline iScalar<l> operator * (ComplexD lhs,const iScalar<l>& rhs) { return rhs*lhs; }

template<class l,int N> inline iVector<l,N> operator * (const iVector<l,N>& lhs,ComplexD rhs)
{
  typename iScalar<l>::scalar_type t(rhs);
  typename iScalar<l>::tensor_reduced srhs(t);
  return lhs*srhs;
}
template<class l,int N> inline iVector<l,N> operator * (ComplexD lhs,const iVector<l,N>& rhs) { return rhs*lhs; }

template<class l,int N> inline iMatrix<l,N> operator * (const iMatrix<l,N>& lhs,ComplexD rhs)
{
  typename iScalar<l>::scalar_type t(rhs);
  typename iScalar<l>::tensor_reduced srhs(t);
  return lhs*srhs;
}
template<class l,int N> inline iMatrix<l,N> operator * (ComplexD lhs,const iMatrix<l,N>& rhs) { return rhs*lhs; }

////////////////////////////////////////////////////////////////////
// Integer support; cast to "scalar_type" through constructor
////////////////////////////////////////////////////////////////////
template<class l> inline iScalar<l> operator * (const iScalar<l>& lhs,Integer rhs)
{
  typename iScalar<l>::scalar_type t(rhs);
  typename iScalar<l>::tensor_reduced srhs(t);
  return lhs*srhs;
}
template<class l> inline iScalar<l> operator * (Integer lhs,const iScalar<l>& rhs) { return rhs*lhs; }

template<class l,int N> inline iVector<l,N> operator * (const iVector<l,N>& lhs,Integer rhs)
{
  typename iScalar<l>::scalar_type t(rhs);
  typename iScalar<l>::tensor_reduced srhs(t);
  return lhs*srhs;
}
template<class l,int N> inline iVector<l,N> operator * (Integer lhs,const iVector<l,N>& rhs) { return rhs*lhs; }

template<class l,int N> inline iMatrix<l,N> operator * (const iMatrix<l,N>& lhs,Integer rhs)
{
  typename iScalar<l>::scalar_type t(rhs);
  typename iScalar<l>::tensor_reduced srhs(t);
  return lhs*srhs;
}
template<class l,int N> inline iMatrix<l,N> operator * (Integer lhs,const iMatrix<l,N>& rhs) { return rhs*lhs; }
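
An editorial sketch (not part of the commit): fundamental scalars are promoted through scalar_type/tensor_reduced, so plain doubles and complex numbers scale whole tensors:

void scale_sketch(void) {
  iMatrix<ComplexD,3> M;
  auto half  = 0.5 * M;             // double -> scalar_type -> tensor_reduced
  auto phase = ComplexD(0,1) * M;   // same route for a complex factor
}
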
///////////////////////////////////////////////////////////////////////////////////////////////
// addition by fundamental scalar type applies to matrix (down the diagonal) and scalar
///////////////////////////////////////////////////////////////////////////////////////////////
template<class l> inline iScalar<l> operator + (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs)
{
  typename iScalar<l>::tensor_reduced srhs(rhs);
  return lhs+srhs;
}
template<class l> inline iScalar<l> operator + (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs) { return rhs+lhs; }

template<class l,int N> inline iMatrix<l,N> operator + (const iMatrix<l,N>& lhs,const typename iScalar<l>::scalar_type rhs)
{
  typename iMatrix<l,N>::tensor_reduced srhs(rhs);
  return lhs+srhs;
}
template<class l,int N> inline iMatrix<l,N> operator + (const typename iScalar<l>::scalar_type lhs,const iMatrix<l,N>& rhs) { return rhs+lhs; }

////////////////////////////////////////////////////////////////////
// Double support; cast to "scalar_type" through constructor
////////////////////////////////////////////////////////////////////
template<class l> inline iScalar<l> operator + (const iScalar<l>& lhs,double rhs)
{
  typename iScalar<l>::scalar_type t(rhs);
  typename iScalar<l>::tensor_reduced srhs(t);
  return lhs+srhs;
}
template<class l> inline iScalar<l> operator + (double lhs,const iScalar<l>& rhs) { return rhs+lhs; }

template<class l,int N> inline iMatrix<l,N> operator + (const iMatrix<l,N>& lhs,double rhs)
{
  typename iScalar<l>::scalar_type t(rhs);
  typename iScalar<l>::tensor_reduced srhs(t);
  return lhs+srhs;
}
template<class l,int N> inline iMatrix<l,N> operator + (double lhs,const iMatrix<l,N>& rhs) { return rhs+lhs; }

////////////////////////////////////////////////////////////////////
// Integer support; cast to "scalar_type" through constructor
////////////////////////////////////////////////////////////////////
template<class l> inline iScalar<l> operator + (const iScalar<l>& lhs,Integer rhs)
{
  typename iScalar<l>::scalar_type t(rhs);
  typename iScalar<l>::tensor_reduced srhs(t);
  return lhs+srhs;
}

template<class l> inline iScalar<l> operator + (Integer lhs,const iScalar<l>& rhs) { return rhs+lhs; }

template<class l,int N> inline iMatrix<l,N> operator + (const iMatrix<l,N>& lhs,Integer rhs)
{
  typename iScalar<l>::scalar_type t(rhs);
  typename iScalar<l>::tensor_reduced srhs(t);
  return lhs+srhs;
}
template<class l,int N> inline iMatrix<l,N> operator + (Integer lhs,const iMatrix<l,N>& rhs) { return rhs+lhs; }
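
An editorial sketch (not part of the commit): as with the tensor-level add, a fundamental scalar added to a matrix shifts the diagonal only, i.e. M + 2.0 behaves as M + 2*1:

void shift_sketch(void) {
  iMatrix<ComplexD,3> M;
  auto S1 = M + 2.0;   // diagonal shift
  auto S2 = 2.0 + M;   // forwards to the same overload
}
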
///////////////////////////////////////////////////////////////////////////////////////////////
// subtraction of fundamental scalar type applies to matrix (down the diagonal) and scalar
///////////////////////////////////////////////////////////////////////////////////////////////
template<class l> inline iScalar<l> operator - (const iScalar<l>& lhs,const typename iScalar<l>::scalar_type rhs)
{
  typename iScalar<l>::tensor_reduced srhs(rhs);
  return lhs-srhs;
}
template<class l> inline iScalar<l> operator - (const typename iScalar<l>::scalar_type lhs,const iScalar<l>& rhs)
{
  typename iScalar<l>::tensor_reduced slhs(lhs);
  return slhs-rhs;
}

template<class l,int N> inline iMatrix<l,N> operator - (const iMatrix<l,N>& lhs,const typename iScalar<l>::scalar_type rhs)
{
  typename iScalar<l>::tensor_reduced srhs(rhs);
  return lhs-srhs;
}
template<class l,int N> inline iMatrix<l,N> operator - (const typename iScalar<l>::scalar_type lhs,const iMatrix<l,N>& rhs)
{
  typename iScalar<l>::tensor_reduced slhs(lhs);
  return slhs-rhs;
}

////////////////////////////////////////////////////////////////////
// Double support; cast to "scalar_type" through constructor
////////////////////////////////////////////////////////////////////
template<class l> inline iScalar<l> operator - (const iScalar<l>& lhs,double rhs)
{
  typename iScalar<l>::scalar_type t(rhs);
  typename iScalar<l>::tensor_reduced srhs(t);
  return lhs-srhs;
}
template<class l> inline iScalar<l> operator - (double lhs,const iScalar<l>& rhs)
{
  typename iScalar<l>::scalar_type t(lhs);
  typename iScalar<l>::tensor_reduced slhs(t);
  return slhs-rhs;
}

template<class l,int N> inline iMatrix<l,N> operator - (const iMatrix<l,N>& lhs,double rhs)
{
  typename iScalar<l>::scalar_type t(rhs);
  typename iScalar<l>::tensor_reduced srhs(t);
  return lhs-srhs;
}
template<class l,int N> inline iMatrix<l,N> operator - (double lhs,const iMatrix<l,N>& rhs)
{
  typename iScalar<l>::scalar_type t(lhs);
  typename iScalar<l>::tensor_reduced slhs(t);
  return slhs-rhs;
}

////////////////////////////////////////////////////////////////////
// Integer support; cast to "scalar_type" through constructor
////////////////////////////////////////////////////////////////////
template<class l> inline iScalar<l> operator - (const iScalar<l>& lhs,Integer rhs)
{
  typename iScalar<l>::scalar_type t(rhs);
  typename iScalar<l>::tensor_reduced srhs(t);
  return lhs-srhs;
}
template<class l> inline iScalar<l> operator - (Integer lhs,const iScalar<l>& rhs)
{
  typename iScalar<l>::scalar_type t(lhs);
  typename iScalar<l>::tensor_reduced slhs(t);
  return slhs-rhs;
}
template<class l,int N> inline iMatrix<l,N> operator - (const iMatrix<l,N>& lhs,Integer rhs)
{
  typename iScalar<l>::scalar_type t(rhs);
  typename iScalar<l>::tensor_reduced srhs(t);
  return lhs-srhs;
}
template<class l,int N> inline iMatrix<l,N> operator - (Integer lhs,const iMatrix<l,N>& rhs)
{
  typename iScalar<l>::scalar_type t(lhs);
  typename iScalar<l>::tensor_reduced slhs(t);
  return slhs-rhs;
}

}

#endif
224  lib/Grid_math_tensors.h  Normal file
@@ -0,0 +1,224 @@
#ifndef GRID_MATH_TENSORS_H
#define GRID_MATH_TENSORS_H

namespace Grid {

///////////////////////////////////////////////////
// Scalar, Vector, Matrix objects.
// These can be composed to form tensor products of internal indices.
///////////////////////////////////////////////////

template<class vtype> class iScalar
{
public:
  vtype _internal;

  typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
  typedef typename GridTypeMapper<vtype>::vector_type vector_type;
  typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;
  typedef iScalar<tensor_reduced_v> tensor_reduced;

  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1 };

  // Scalar: no action
  // template<int Level> using tensor_reduce_level = typename iScalar<GridTypeMapper<vtype>::tensor_reduce_level<Level> >;

  iScalar(){}

  iScalar(scalar_type s) : _internal(s) {} // recurse down and hit the constructor for vector_type

  iScalar(Zero &z){ *this = zero; }

  iScalar<vtype> & operator= (const Zero &hero){
    zeroit(*this);
    return *this;
  }
  friend void zeroit(iScalar<vtype> &that){
    zeroit(that._internal);
  }
  friend void permute(iScalar<vtype> &out,const iScalar<vtype> &in,int permutetype){
    permute(out._internal,in._internal,permutetype);
  }
  friend void extract(const iScalar<vtype> &in,std::vector<scalar_type *> &out){
    extract(in._internal,out); // extract advances the pointers in out
  }
  friend void merge(iScalar<vtype> &in,std::vector<scalar_type *> &out){
    merge(in._internal,out);   // merge advances the pointers in out
  }

  // Unary negation
  friend inline iScalar<vtype> operator -(const iScalar<vtype> &r) {
    iScalar<vtype> ret;
    ret._internal = -r._internal;
    return ret;
  }
  // *=,+=,-= operators inherit from the corresponding "*,-,+" behaviour
  inline iScalar<vtype> &operator *=(const iScalar<vtype> &r) {
    *this = (*this)*r;
    return *this;
  }
  inline iScalar<vtype> &operator -=(const iScalar<vtype> &r) {
    *this = (*this)-r;
    return *this;
  }
  inline iScalar<vtype> &operator +=(const iScalar<vtype> &r) {
    *this = (*this)+r;
    return *this;
  }

  inline vtype & operator ()(void) {
    return _internal;
  }

  operator ComplexD () const { return(TensorRemove(_internal)); }
  operator RealD () const { return(real(TensorRemove(_internal))); }

};
///////////////////////////////////////////////////////////
// Allows turning scalar<scalar<scalar<double>>> back into double.
///////////////////////////////////////////////////////////
template<class T> inline typename std::enable_if<isGridTensor<T>::notvalue, T>::type TensorRemove(T arg) { return arg; }
template<class vtype> inline auto TensorRemove(iScalar<vtype> arg) -> decltype(TensorRemove(arg._internal))
{
  return TensorRemove(arg._internal);
}
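
An editorial sketch (not part of the commit): TensorRemove peels iScalar wrappers until a non-tensor type remains, which is what the ComplexD/RealD conversion operators above rely on:

void tensor_remove_sketch(void) {
  iScalar<iScalar<ComplexD> > s;
  ComplexD c = TensorRemove(s);   // both wrappers stripped
}
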
template<class vtype,int N> class iVector
{
public:
  vtype _internal[N];

  typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
  typedef typename GridTypeMapper<vtype>::vector_type vector_type;
  typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;

  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1 };
  typedef iScalar<tensor_reduced_v> tensor_reduced;

  iVector(Zero &z){ *this = zero; }
  iVector() {} // Empty constructor

  iVector<vtype,N> & operator= (Zero &hero){
    zeroit(*this);
    return *this;
  }
  friend void zeroit(iVector<vtype,N> &that){
    for(int i=0;i<N;i++){
      zeroit(that._internal[i]);
    }
  }
  friend void permute(iVector<vtype,N> &out,const iVector<vtype,N> &in,int permutetype){
    for(int i=0;i<N;i++){
      permute(out._internal[i],in._internal[i],permutetype);
    }
  }
  friend void extract(const iVector<vtype,N> &in,std::vector<scalar_type *> &out){
    for(int i=0;i<N;i++){
      extract(in._internal[i],out); // extract advances pointers in out
    }
  }
  friend void merge(iVector<vtype,N> &in,std::vector<scalar_type *> &out){
    for(int i=0;i<N;i++){
      merge(in._internal[i],out);   // merge advances pointers in out
    }
  }
  // Unary negation
  friend inline iVector<vtype,N> operator -(const iVector<vtype,N> &r) {
    iVector<vtype,N> ret;
    for(int i=0;i<N;i++) ret._internal[i] = -r._internal[i];
    return ret;
  }
  // *=,+=,-= operators inherit from the corresponding "*,-,+" behaviour
  inline iVector<vtype,N> &operator *=(const iScalar<vtype> &r) {
    *this = (*this)*r;
    return *this;
  }
  inline iVector<vtype,N> &operator -=(const iVector<vtype,N> &r) {
    *this = (*this)-r;
    return *this;
  }
  inline iVector<vtype,N> &operator +=(const iVector<vtype,N> &r) {
    *this = (*this)+r;
    return *this;
  }
  inline vtype & operator ()(int i) {
    return _internal[i];
  }
};
template<class vtype,int N> class iMatrix
{
public:
  vtype _internal[N][N];

  typedef typename GridTypeMapper<vtype>::scalar_type scalar_type;
  typedef typename GridTypeMapper<vtype>::vector_type vector_type;

  typedef typename GridTypeMapper<vtype>::tensor_reduced tensor_reduced_v;

  enum { TensorLevel = GridTypeMapper<vtype>::TensorLevel + 1 };
  typedef iScalar<tensor_reduced_v> tensor_reduced;

  iMatrix(Zero &z){ *this = zero; }
  iMatrix() {}
  iMatrix<vtype,N> & operator= (Zero &hero){
    zeroit(*this);
    return *this;
  }
  friend void zeroit(iMatrix<vtype,N> &that){
    for(int i=0;i<N;i++){
    for(int j=0;j<N;j++){
      zeroit(that._internal[i][j]);
    }}
  }
  friend void permute(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in,int permutetype){
    for(int i=0;i<N;i++){
    for(int j=0;j<N;j++){
      permute(out._internal[i][j],in._internal[i][j],permutetype);
    }}
  }
  friend void extract(const iMatrix<vtype,N> &in,std::vector<scalar_type *> &out){
    for(int i=0;i<N;i++){
    for(int j=0;j<N;j++){
      extract(in._internal[i][j],out); // extract advances pointers in out
    }}
  }
  friend void merge(iMatrix<vtype,N> &in,std::vector<scalar_type *> &out){
    for(int i=0;i<N;i++){
    for(int j=0;j<N;j++){
      merge(in._internal[i][j],out);   // merge advances pointers in out
    }}
  }
  // Unary negation
  friend inline iMatrix<vtype,N> operator -(const iMatrix<vtype,N> &r) {
    iMatrix<vtype,N> ret;
    for(int i=0;i<N;i++){
    for(int j=0;j<N;j++){
      ret._internal[i][j] = -r._internal[i][j];
    }}
    return ret;
  }
  // *=,+=,-= operators inherit from the corresponding "*,-,+" behaviour
  template<class T>
  inline iMatrix<vtype,N> &operator *=(const T &r) {
    *this = (*this)*r;
    return *this;
  }
  template<class T>
  inline iMatrix<vtype,N> &operator -=(const T &r) {
    *this = (*this)-r;
    return *this;
  }
  template<class T>
  inline iMatrix<vtype,N> &operator +=(const T &r) {
    *this = (*this)+r;
    return *this;
  }
  inline vtype & operator ()(int i,int j) {
    return _internal[i][j];
  }

};

}
#endif
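
An editorial sketch of the tensor classes in use (not part of the commit; assumes the global "zero" and the leaf-level zeroit defined with the rest of the library):

void tensor_sketch(void) {
  iMatrix<ComplexD,3> M(zero);   // zero-initialised via the iMatrix(Zero&) constructor
  M(0,0) = ComplexD(1.0,0.0);    // raw element access through operator()
  M *= M;                        // *= forwards to the operator* in Grid_math_arith.h
}
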
165  lib/Grid_math_traits.h  Normal file
@@ -0,0 +1,165 @@
#ifndef GRID_MATH_TRAITS_H
#define GRID_MATH_TRAITS_H

#include <type_traits>

namespace Grid {

//////////////////////////////////////////////////////////////////////////////////
// Want to recurse: GridTypeMapper<Matrix<vComplexD> >::scalar_type == ComplexD.
// Use of a helper class like this allows us to template specialise and "dress"
// other classes such as RealD == double, ComplexD == std::complex<double> with
// these traits.
//
// It is possible that we could do this more elegantly if I introduced a
// queryable trait in iScalar, iMatrix and iVector and used the query on vtype in
// place of the type mapper?
//
// Not sure how to do this, but probably could be done with a research effort
// to study C++11's type_traits.h file. (std::enable_if<isGridTensorType<vtype> >)
//
//////////////////////////////////////////////////////////////////////////////////

template <class T> class GridTypeMapper {
public:
  typedef typename T::scalar_type scalar_type;
  typedef typename T::vector_type vector_type;
  typedef typename T::tensor_reduced tensor_reduced;
  enum { TensorLevel = T::TensorLevel };
};

//////////////////////////////////////////////////////////////////////////////////
// Recursion stops with these template specialisations
//////////////////////////////////////////////////////////////////////////////////
template<> class GridTypeMapper<RealF> {
public:
  typedef RealF scalar_type;
  typedef RealF vector_type;
  typedef RealF tensor_reduced;
  enum { TensorLevel = 0 };
};
template<> class GridTypeMapper<RealD> {
public:
  typedef RealD scalar_type;
  typedef RealD vector_type;
  typedef RealD tensor_reduced;
  enum { TensorLevel = 0 };
};
template<> class GridTypeMapper<ComplexF> {
public:
  typedef ComplexF scalar_type;
  typedef ComplexF vector_type;
  typedef ComplexF tensor_reduced;
  enum { TensorLevel = 0 };
};
template<> class GridTypeMapper<ComplexD> {
public:
  typedef ComplexD scalar_type;
  typedef ComplexD vector_type;
  typedef ComplexD tensor_reduced;
  enum { TensorLevel = 0 };
};

template<> class GridTypeMapper<vRealF> {
public:
  typedef RealF  scalar_type;
  typedef vRealF vector_type;
  typedef vRealF tensor_reduced;
  enum { TensorLevel = 0 };
};
template<> class GridTypeMapper<vRealD> {
public:
  typedef RealD  scalar_type;
  typedef vRealD vector_type;
  typedef vRealD tensor_reduced;
  enum { TensorLevel = 0 };
};
template<> class GridTypeMapper<vComplexF> {
public:
  typedef ComplexF  scalar_type;
  typedef vComplexF vector_type;
  typedef vComplexF tensor_reduced;
  enum { TensorLevel = 0 };
};
template<> class GridTypeMapper<vComplexD> {
public:
  typedef ComplexD  scalar_type;
  typedef vComplexD vector_type;
  typedef vComplexD tensor_reduced;
  enum { TensorLevel = 0 };
};
template<> class GridTypeMapper<vInteger> {
public:
  typedef Integer  scalar_type;
  typedef vInteger vector_type;
  typedef vInteger tensor_reduced;
  enum { TensorLevel = 0 };
};
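
An editorial sketch (not part of the commit) of what the mapper guarantees, written as compile-time checks:

static_assert(std::is_same<GridTypeMapper<vComplexD>::scalar_type, ComplexD>::value,
              "recursion stops at the SIMD leaf");
static_assert(GridTypeMapper<vComplexD>::TensorLevel == 0,
              "leaf types sit at tensor level 0");
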
// First some of my own traits
template<typename T> struct isGridTensor {
  static const bool value    = true;
  static const bool notvalue = false;
};

template<> struct isGridTensor<RealD > {
  static const bool value    = false;
  static const bool notvalue = true;
};
template<> struct isGridTensor<RealF > {
  static const bool value    = false;
  static const bool notvalue = true;
};
template<> struct isGridTensor<ComplexD > {
  static const bool value    = false;
  static const bool notvalue = true;
};
template<> struct isGridTensor<ComplexF > {
  static const bool value    = false;
  static const bool notvalue = true;
};
template<> struct isGridTensor<Integer > {
  static const bool value    = false;
  static const bool notvalue = true;
};
template<> struct isGridTensor<vRealD > {
  static const bool value    = false;
  static const bool notvalue = true;
};
template<> struct isGridTensor<vRealF > {
  static const bool value    = false;
  static const bool notvalue = true;
};
template<> struct isGridTensor<vComplexD > {
  static const bool value    = false;
  static const bool notvalue = true;
};
template<> struct isGridTensor<vComplexF > {
  static const bool value    = false;
  static const bool notvalue = true;
};
template<> struct isGridTensor<vInteger > {
  static const bool value    = false;
  static const bool notvalue = true;
};

// Match the index
template<typename T,int Level> struct matchGridTensorIndex {
  static const bool value    = (Level==T::TensorLevel);
  static const bool notvalue = (Level!=T::TensorLevel);
};
// What is the vtype
template<typename T> struct isComplex {
  static const bool value = false;
};
template<> struct isComplex<ComplexF> {
  static const bool value = true;
};
template<> struct isComplex<ComplexD> {
  static const bool value = true;
};

}

#endif
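
An editorial sketch (not part of the commit; assumes the tensor classes of Grid_math_tensors.h are in scope): matchGridTensorIndex is the switch behind the peekIndex/pokeIndex enable_if dispatch seen earlier:

static_assert(matchGridTensorIndex<iScalar<vComplexD>,1>::value,    "level 1 matches");
static_assert(matchGridTensorIndex<iScalar<vComplexD>,2>::notvalue, "level 2 does not");
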
lib/Grid_vComplexD.h
@@ -1,7 +1,5 @@
-#ifndef VCOMPLEXD_H
-#define VCOMPLEXD_H
-#include "Grid.h"
-#include "Grid_vComplexF.h"
+#ifndef GRID_VCOMPLEXD_H
+#define GRID_VCOMPLEXD_H
 
 namespace Grid {
 class vComplexD {
lib/Grid_vComplexF.h
@@ -1,6 +1,5 @@
-#ifndef VCOMPLEXF
-#define VCOMPLEXF
-#include "Grid.h"
+#ifndef GRID_VCOMPLEXF
+#define GRID_VCOMPLEXF
 
 namespace Grid {
 
lib/Grid_vInteger.h
@@ -1,7 +1,5 @@
-#ifndef VINTEGER_H
-#define VINTEGER_H
-
-#include "Grid.h"
+#ifndef GRID_VINTEGER_H
+#define GRID_VINTEGER_H
 
 namespace Grid {
 
lib/Grid_vRealD.h
@@ -1,7 +1,5 @@
-#ifndef VREALD_H
-#define VREALD_H
-
-#include "Grid.h"
+#ifndef GRID_VREALD_H
+#define GRID_VREALD_H
 
 namespace Grid {
 class vRealD {
lib/Grid_vRealF.h
@@ -1,7 +1,6 @@
-#ifndef VREALF_H
-#define VREALF_H
+#ifndef GRID_VREALF_H
+#define GRID_VREALF_H
 
 #include "Grid.h"
 
 namespace Grid {
 class vRealF {
lib/Makefile.am
@@ -23,22 +23,33 @@ libGrid_a_SOURCES = Grid_init.cc $(extra_sources)
 #
 include_HEADERS = Grid_config.h\
 Grid.h\
-Grid_simd.h\
-Grid_vComplexD.h\
-Grid_vComplexF.h\
-Grid_vRealD.h\
-Grid_vRealF.h\
-Grid_Cartesian.h\
-Grid_Lattice.h\
-Grid_Communicator.h\
-Grid_QCD.h\
+Grid_aligned_allocator.h\
+Grid_cartesian.h\
+Grid_cartesian_base.h\
+Grid_cartesian_full.h\
+Grid_cartesian_red_black.h\
+Grid_communicator.h\
+Grid_comparison.h\
+Grid_config.h\
+Grid_cshift.h\
+Grid_cshift_common.h\
+Grid_cshift_mpi.h\
+Grid_cshift_none.h\
+Grid_lattice.h\
+Grid_math.h\
+Grid_math_arith.h\
+Grid_math_tensors.h\
+Grid_math_traits.h\
+Grid_predicated.h\
+Grid_simd.h\
 Grid_stencil.h\
-Grid_math_types.h
+Grid_summation.h\
+Grid_vComplexD.h\
+Grid_vComplexF.h\
+Grid_vInteger.h\
+Grid_vRealD.h\
+Grid_vRealF.h
 