diff --git a/Grid_Communicator.h b/Grid_Communicator.h
index c7e71f12..0e15765c 100644
--- a/Grid_Communicator.h
+++ b/Grid_Communicator.h
@@ -42,13 +42,31 @@ class CartesianCommunicator {
     ////////////////////////////////////////////////////////////
     // Reduction
     ////////////////////////////////////////////////////////////
-    void GlobalSum(float &);
-    void GlobalSumVector(float *,int N);
+    void GlobalSum(RealF &);
+    void GlobalSumVector(RealF *,int N);
 
-    void GlobalSum(double &);
-    void GlobalSumVector(double *,int N);
+    void GlobalSum(RealD &);
+    void GlobalSumVector(RealD *,int N);
+
+    void GlobalSum(ComplexF &c)
+    {
+      GlobalSumVector((float *)&c,2);
+    }
+    void GlobalSumVector(ComplexF *c,int N)
+    {
+      GlobalSumVector((float *)c,2*N);
+    }
+
+    void GlobalSum(ComplexD &c)
+    {
+      GlobalSumVector((double *)&c,2);
+    }
+    void GlobalSumVector(ComplexD *c,int N)
+    {
+      GlobalSumVector((double *)c,2*N);
+    }
     
-    template<class obj> void GlobalSumObj(obj &o){
+    template<class obj> void GlobalSum(obj &o){
 
       typedef typename obj::scalar_type scalar_type;
       int words = sizeof(obj)/sizeof(scalar_type);
diff --git a/Grid_Lattice.h b/Grid_Lattice.h
index 1e2c820e..a94a13a4 100644
--- a/Grid_Lattice.h
+++ b/Grid_Lattice.h
@@ -407,18 +407,83 @@ public:
         return ret;
     }
 
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
+    // Reduction operations
+    ////////////////////////////////////////////////////////////////////////////////////////////////////
     template<class vobj>
     inline RealD norm2(const Lattice<vobj> &arg){
 
       typedef typename vobj::scalar_type scalar;
-      decltype(localInnerProduct(arg._odata[0],arg._odata[0])) vnrm=zero;
+      decltype(innerProduct(arg._odata[0],arg._odata[0])) vnrm=zero;
 
       scalar nrm;
+      //FIXME make this loop parallelisable
       for(int ss=0;ss<arg._grid->oSites(); ss++){
-	vnrm = vnrm + localInnerProduct(arg._odata[ss],arg._odata[ss]);
+	vnrm = vnrm + innerProduct(arg._odata[ss],arg._odata[ss]);
       }
       nrm = Reduce(TensorRemove(vnrm));
+      arg._grid->GlobalSum(nrm);
       return real(nrm);
     }
+
+    template<class vobj>
+    inline auto innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right) ->decltype(innerProduct(left._odata[0],right._odata[0]))
+    {
+      typedef typename vobj::scalar_type scalar;
+      decltype(innerProduct(left._odata[0],right._odata[0])) vnrm=zero;
+
+      scalar nrm;
+      //FIXME make this loop parallelisable
+      for(int ss=0;ss<left._grid->oSites(); ss++){
+	vnrm = vnrm + innerProduct(left._odata[ss],right._odata[ss]);
+      }
+      nrm = Reduce(TensorRemove(vnrm));
+      right._grid->GlobalSum(nrm);
+      return nrm;
+    }
+
+    /////////////////////////////////////////////////////
+    // Non site reduced routines
+    /////////////////////////////////////////////////////
+
+    // localNorm2,
+    template<class vobj>
+    inline auto localNorm2 (const Lattice<vobj> &rhs)-> Lattice<typename vobj::tensor_reduced>
+    {
+      Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
+#pragma omp parallel for
+        for(int ss=0;ss<rhs._grid->oSites(); ss++){
+	  ret._odata[ss]=innerProduct(rhs,rhs);
+        }
+        return ret;
+    }
+
+    // localInnerProduct
+    template<class vobj>
+    inline auto localInnerProduct (const Lattice<vobj> &lhs,const Lattice<vobj> &rhs)
+      -> Lattice<typename vobj::tensor_reduced>
+    {
+      Lattice<typename vobj::tensor_reduced> ret(rhs._grid);
+#pragma omp parallel for
+      for(int ss=0;ss<rhs._grid->oSites(); ss++){
+	ret._odata[ss]=innerProduct(lhs._odata[ss],rhs._odata[ss]);
+      }
+      return ret;
+    }
+    
+    // outerProduct Scalar x Scalar -> Scalar
+    //              Vector x Vector -> Matrix
+    template<class ll,class rr>
+    inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
+    {
+        Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
+#pragma omp parallel for
+        for(int ss=0;ss<rhs._grid->oSites(); ss++){
+            ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
+        }
+        return ret;
+     }
+
+
 }
 #endif
diff --git a/Grid_QCD.h b/Grid_QCD.h
index 1d52f911..72a8ee67 100644
--- a/Grid_QCD.h
+++ b/Grid_QCD.h
@@ -57,41 +57,6 @@ namespace QCD {
     typedef Lattice<vSpinVector>       LatticeSpinVector;
     typedef Lattice<vColourVector>     LatticeColourVector;
 
-    // localNorm2,
-    template<class tt>
-    inline LatticeComplex localNorm2 (const Lattice<tt> &rhs)
-    {
-        LatticeComplex ret(rhs._grid);
-#pragma omp parallel for
-        for(int ss=0;ss<rhs._grid->oSites(); ss++){
-            ret._odata[ss]=trace(adj(rhs)*rhs);
-        }
-        return ret;
-    }
-    // localInnerProduct
-    template<class tt>
-    inline LatticeComplex localInnerProduct (const Lattice<tt> &lhs,const Lattice<tt> &rhs)
-    {
-        LatticeComplex ret(rhs._grid);
-#pragma omp parallel for
-        for(int ss=0;ss<rhs._grid->oSites(); ss++){
-            ret._odata[ss]=localInnerProduct(lhs._odata[ss],rhs._odata[ss]);
-        }
-        return ret;
-    }
-    
-    // outerProduct Scalar x Scalar -> Scalar
-    //              Vector x Vector -> Matrix
-    template<class ll,class rr>
-    inline auto outerProduct (const Lattice<ll> &lhs,const Lattice<rr> &rhs) -> Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))>
-    {
-        Lattice<decltype(outerProduct(lhs._odata[0],rhs._odata[0]))> ret(rhs._grid);
-#pragma omp parallel for
-        for(int ss=0;ss<rhs._grid->oSites(); ss++){
-            ret._odata[ss]=outerProduct(lhs._odata[ss],rhs._odata[ss]);
-        }
-        return ret;
-     }
 
     // FIXME for debug; deprecate this
    inline void LatticeCoordinate(LatticeInteger &l,int mu){
diff --git a/Grid_communicator_mpi.cc b/Grid_communicator_mpi.cc
index a4943478..dba5f039 100644
--- a/Grid_communicator_mpi.cc
+++ b/Grid_communicator_mpi.cc
@@ -29,21 +29,20 @@ CartesianCommunicator::CartesianCommunicator(std::vector<int> &processors)
 }
 
 void CartesianCommunicator::GlobalSum(float &f){
-  MPI_Allreduce(&f,&f,1,MPI_FLOAT,MPI_SUM,communicator);
+  MPI_Allreduce(MPI_IN_PLACE,&f,1,MPI_FLOAT,MPI_SUM,communicator);
 }
 void CartesianCommunicator::GlobalSumVector(float *f,int N)
 {
-  MPI_Allreduce(f,f,N,MPI_FLOAT,MPI_SUM,communicator);
+  MPI_Allreduce(MPI_IN_PLACE,f,N,MPI_FLOAT,MPI_SUM,communicator);
 }
 void CartesianCommunicator::GlobalSum(double &d)
 {
-  MPI_Allreduce(&d,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
+  MPI_Allreduce(MPI_IN_PLACE,&d,1,MPI_DOUBLE,MPI_SUM,communicator);
 }
 void CartesianCommunicator::GlobalSumVector(double *d,int N)
 {
-  MPI_Allreduce(d,d,N,MPI_DOUBLE,MPI_SUM,communicator);
+  MPI_Allreduce(MPI_IN_PLACE,d,N,MPI_DOUBLE,MPI_SUM,communicator);
 }
-
 void CartesianCommunicator::ShiftedRanks(int dim,int shift,int &source,int &dest)
 {
   MPI_Cart_shift(communicator,dim,shift,&source,&dest);
diff --git a/Grid_cshift_common.h b/Grid_cshift_common.h
index e962f6ea..2910151c 100644
--- a/Grid_cshift_common.h
+++ b/Grid_cshift_common.h
@@ -8,7 +8,6 @@ friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllo
 {
   int rd = rhs._grid->_rdimensions[dimension];
 
-  printf("Gather_plane_simple buf size %d rhs size %d \n",buffer.size(),rhs._odata.size());fflush(stdout);
   if ( !rhs._grid->CheckerBoarded(dimension) ) {
 
     int so  = plane*rhs._grid->_ostride[dimension]; // base offset for start of plane 
@@ -19,7 +18,6 @@ friend void Gather_plane_simple (Lattice<vobj> &rhs,std::vector<vobj,alignedAllo
 #pragma omp parallel for collapse(2)
     for(int n=0;n<rhs._grid->_slice_nblock[dimension];n++){
       for(int b=0;b<rhs._grid->_slice_block[dimension];b++){
-	printf("Gather_plane_simple %d ; %d %d %d\n",bo,so,o,b);fflush(stdout);
 	buffer[bo++]=rhs._odata[so+o+b];
       }
       o +=rhs._grid->_slice_stride[dimension];
diff --git a/Grid_math_types.h b/Grid_math_types.h
index e3432645..01fc7bf2 100644
--- a/Grid_math_types.h
+++ b/Grid_math_types.h
@@ -903,38 +903,38 @@ template<class l,int N> inline iMatrix<l,N> operator - (Integer lhs,const iMatri
 
 
     ///////////////////////////////////////////////////////////////////////////////////////
-    // localInnerProduct Scalar x Scalar -> Scalar
-    // localInnerProduct Vector x Vector -> Scalar
-    // localInnerProduct Matrix x Matrix -> Scalar
+    // innerProduct Scalar x Scalar -> Scalar
+    // innerProduct Vector x Vector -> Scalar
+    // innerProduct Matrix x Matrix -> Scalar
     ///////////////////////////////////////////////////////////////////////////////////////
     template<class l,class r,int N> inline
-    auto localInnerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal[0],rhs._internal[0]))>
+    auto innerProduct (const iVector<l,N>& lhs,const iVector<r,N>& rhs) -> iScalar<decltype(innerProduct(lhs._internal[0],rhs._internal[0]))>
     {
-        typedef decltype(localInnerProduct(lhs._internal[0],rhs._internal[0])) ret_t;
+        typedef decltype(innerProduct(lhs._internal[0],rhs._internal[0])) ret_t;
         iScalar<ret_t> ret=zero;
         for(int c1=0;c1<N;c1++){
-            ret._internal += localInnerProduct(lhs._internal[c1],rhs._internal[c1]);
+            ret._internal += innerProduct(lhs._internal[c1],rhs._internal[c1]);
         }
         return ret;
     }
     template<class l,class r,int N> inline
-    auto localInnerProduct (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal[0][0],rhs._internal[0][0]))>
+    auto innerProduct (const iMatrix<l,N>& lhs,const iMatrix<r,N>& rhs) -> iScalar<decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0]))>
     {
-        typedef decltype(localInnerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t;
+        typedef decltype(innerProduct(lhs._internal[0][0],rhs._internal[0][0])) ret_t;
         iScalar<ret_t> ret=zero;
         iScalar<ret_t> tmp;
         for(int c1=0;c1<N;c1++){
         for(int c2=0;c2<N;c2++){
-	  ret._internal+=localInnerProduct(lhs._internal[c1][c2],rhs._internal[c1][c2]);
+	  ret._internal+=innerProduct(lhs._internal[c1][c2],rhs._internal[c1][c2]);
         }}
         return ret;
     }
     template<class l,class r> inline
-    auto localInnerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(localInnerProduct(lhs._internal,rhs._internal))>
+    auto innerProduct (const iScalar<l>& lhs,const iScalar<r>& rhs) -> iScalar<decltype(innerProduct(lhs._internal,rhs._internal))>
     {
-        typedef decltype(localInnerProduct(lhs._internal,rhs._internal)) ret_t;
+        typedef decltype(innerProduct(lhs._internal,rhs._internal)) ret_t;
         iScalar<ret_t> ret;
-        ret._internal = localInnerProduct(lhs._internal,rhs._internal);
+        ret._internal = innerProduct(lhs._internal,rhs._internal);
         return ret;
     }
 
diff --git a/Grid_simd.h b/Grid_simd.h
index 23061632..4f9f4b0d 100644
--- a/Grid_simd.h
+++ b/Grid_simd.h
@@ -33,10 +33,10 @@ namespace Grid {
 
   inline RealF adj(const RealF  & r){ return r; }
   inline RealF conj(const RealF  & r){ return r; }
-  inline ComplexD localInnerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
-  inline ComplexF localInnerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; }
-  inline RealD localInnerProduct(const RealD & l, const RealD & r) { return l*r; }
-  inline RealF localInnerProduct(const RealF & l, const RealF & r) { return l*r; }
+  inline ComplexD innerProduct(const ComplexD & l, const ComplexD & r) { return conj(l)*r; }
+  inline ComplexF innerProduct(const ComplexF & l, const ComplexF & r) { return conj(l)*r; }
+  inline RealD innerProduct(const RealD & l, const RealD & r) { return l*r; }
+  inline RealF innerProduct(const RealF & l, const RealF & r) { return l*r; }
 
     ////////////////////////////////////////////////////////////////////////////////
     //Provide support functions for basic real and complex data types required by Grid
diff --git a/Grid_vComplexD.h b/Grid_vComplexD.h
index f1c335e8..133e354b 100644
--- a/Grid_vComplexD.h
+++ b/Grid_vComplexD.h
@@ -313,7 +313,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
     };
 
 
-    inline vComplexD localInnerProduct(const vComplexD & l, const vComplexD & r) { return conj(l)*r; }
+    inline vComplexD innerProduct(const vComplexD & l, const vComplexD & r) { return conj(l)*r; }
 
 
     typedef  vComplexD vDComplex;
diff --git a/Grid_vComplexF.h b/Grid_vComplexF.h
index 18cf1749..9dc5f051 100644
--- a/Grid_vComplexF.h
+++ b/Grid_vComplexF.h
@@ -362,7 +362,7 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
 
     };
 
-    inline vComplexF localInnerProduct(const vComplexF & l, const vComplexF & r) 
+    inline vComplexF innerProduct(const vComplexF & l, const vComplexF & r) 
     {
       return conj(l)*r; 
     }
diff --git a/Grid_vRealD.h b/Grid_vRealD.h
index c51e41cf..84c80698 100644
--- a/Grid_vRealD.h
+++ b/Grid_vRealD.h
@@ -240,7 +240,7 @@ namespace Grid {
         static int Nsimd(void) { return sizeof(dvec)/sizeof(double);}
     };
 
-   inline vRealD localInnerProduct(const vRealD & l, const vRealD & r) { return conj(l)*r; }
+   inline vRealD innerProduct(const vRealD & l, const vRealD & r) { return conj(l)*r; }
     inline void zeroit(vRealD &z){ vzero(z);}
 
     inline vRealD outerProduct(const vRealD &l, const vRealD& r)
diff --git a/Grid_vRealF.h b/Grid_vRealF.h
index 0fe68f43..aeb454f0 100644
--- a/Grid_vRealF.h
+++ b/Grid_vRealF.h
@@ -260,7 +260,7 @@ friend inline void vstore(const vRealF &ret, float *a){
     public:
         static inline int Nsimd(void) { return sizeof(fvec)/sizeof(float);}
     };
-    inline vRealF localInnerProduct(const vRealF & l, const vRealF & r) { return conj(l)*r; }
+    inline vRealF innerProduct(const vRealF & l, const vRealF & r) { return conj(l)*r; }
     inline void  zeroit(vRealF &z){ vzero(z);}
 
     inline vRealF outerProduct(const vRealF &l, const vRealF& r)
diff --git a/TODO b/TODO
index 2c8fdcf8..010a06b7 100644
--- a/TODO
+++ b/TODO
@@ -1,15 +1,14 @@
 
+AUDITS:
 * FIXME audit
-
+* const audit
 * Replace vset with a call to merge.; 
-
 * care in Gmerge,Gextract over vset .
-
-* Const audit
-
 * extract / merge extra implementation removal      
 
 
+FUNCTIONALITY:
+
 * Conditional execution, where etc...                -----DONE, simple test
 
 * Integer relational support                         -----DONE
@@ -22,6 +21,11 @@
 
 * Stencil operator support                           -----Initial thoughts, trial implementation DONE.
                                                      -----some simple tests that Stencil matches Cshift.
+                                                     -----do all permute in comms phase, so that copy permute
+						     -----cases move into a buffer.
+
+* CovariantShift support                             -----Use a class to store gauge field? (parallel transport?)
+
 
 * Subset support, slice sums etc...                  -----Only need slice sum?
                                                      -----Generic cartesian subslicing?