diff --git a/extras/Hadrons/AllToAllReduction.hpp b/extras/Hadrons/AllToAllReduction.hpp
new file mode 100644
index 00000000..7b2d2c1a
--- /dev/null
+++ b/extras/Hadrons/AllToAllReduction.hpp
@@ -0,0 +1,146 @@
+#ifndef A2A_Reduction_hpp_
+#define A2A_Reduction_hpp_
+
+#include <Grid/Hadrons/Global.hpp>
+#include <Grid/Hadrons/Environment.hpp>
+#include <Grid/Hadrons/Solver.hpp>
+
+BEGIN_HADRONS_NAMESPACE
+
+////////////////////////////////////////////
+// A2A Meson Field Inner Product
+////////////////////////////////////////////
+
+template <class FermionField>
+void sliceInnerProductMesonField(std::vector<std::vector<ComplexD>> &mat,
+                                 const std::vector<Lattice<FermionField>> &lhs,
+                                 const std::vector<Lattice<FermionField>> &rhs,
+                                 int orthogdim)
+{
+    typedef typename FermionField::scalar_type scalar_type;
+    typedef typename FermionField::vector_type vector_type;
+
+    int Lblock = lhs.size();
+    int Rblock = rhs.size();
+
+    GridBase *grid = lhs[0]._grid;
+
+    const int Nd = grid->_ndimension;
+    const int Nsimd = grid->Nsimd();
+    int Nt = grid->GlobalDimensions()[orthogdim];
+
+    assert(mat.size() == Lblock * Rblock);
+    for (int t = 0; t < mat.size(); t++)
+    {
+        assert(mat[t].size() == Nt);
+    }
+
+    int fd = grid->_fdimensions[orthogdim];
+    int ld = grid->_ldimensions[orthogdim];
+    int rd = grid->_rdimensions[orthogdim];
+
+    // will locally sum vectors first
+    // sum across these down to scalars
+    // splitting the SIMD
+    std::vector<vector_type, alignedAllocator<vector_type>> lvSum(rd * Lblock * Rblock);
+    for (int r = 0; r < rd * Lblock * Rblock; r++)
+    {
+        lvSum[r] = zero;
+    }
+    std::vector<scalar_type> lsSum(ld * Lblock * Rblock, scalar_type(0.0));
+
+    int e1 = grid->_slice_nblock[orthogdim];
+    int e2 = grid->_slice_block[orthogdim];
+    int stride = grid->_slice_stride[orthogdim];
+
+    // std::cout << GridLogMessage << " Entering first parallel loop " << std::endl;
+    // Parallelise over t-direction doesn't expose as much parallelism as needed for KNL
+    parallel_for(int r = 0; r < rd; r++)
+    {
+        int so = r * grid->_ostride[orthogdim]; // base offset for start of plane
+        for (int n = 0; n < e1; n++)
+        {
+            for (int b = 0; b < e2; b++)
+            {
+                int ss = so + n * stride + b;
+                for (int i = 0; i < Lblock; i++)
+                {
+                    auto left = conjugate(lhs[i]._odata[ss]);
+                    for (int j = 0; j < Rblock; j++)
+                    {
+                        int idx = i + Lblock * j + Lblock * Rblock * r;
+                        auto right = rhs[j]._odata[ss];
+                        vector_type vv = left()(0)(0) * right()(0)(0)
+                                       + left()(0)(1) * right()(0)(1)
+                                       + left()(0)(2) * right()(0)(2)
+                                       + left()(1)(0) * right()(1)(0)
+                                       + left()(1)(1) * right()(1)(1)
+                                       + left()(1)(2) * right()(1)(2)
+                                       + left()(2)(0) * right()(2)(0)
+                                       + left()(2)(1) * right()(2)(1)
+                                       + left()(2)(2) * right()(2)(2)
+                                       + left()(3)(0) * right()(3)(0)
+                                       + left()(3)(1) * right()(3)(1)
+                                       + left()(3)(2) * right()(3)(2);
+
+                        lvSum[idx] = lvSum[idx] + vv;
+                    }
+                }
+            }
+        }
+    }
+
+    // std::cout << GridLogMessage << " Entering second parallel loop " << std::endl;
+    // Sum across simd lanes in the plane, breaking out orthog dir.
+    parallel_for(int rt = 0; rt < rd; rt++)
+    {
+        std::vector<int> icoor(Nd);
+        for (int i = 0; i < Lblock; i++)
+        {
+            for (int j = 0; j < Rblock; j++)
+            {
+                iScalar<vector_type> temp;
+                std::vector<iScalar<scalar_type>> extracted(Nsimd);
+                temp._internal = lvSum[i + Lblock * j + Lblock * Rblock * rt];
+                extract(temp, extracted);
+                for (int idx = 0; idx < Nsimd; idx++)
+                {
+                    grid->iCoorFromIindex(icoor, idx);
+                    int ldx = rt + icoor[orthogdim] * rd;
+                    int ij_dx = i + Lblock * j + Lblock * Rblock * ldx;
+                    lsSum[ij_dx] = lsSum[ij_dx] + extracted[idx]._internal;
+                }
+            }
+        }
+    }
+
+    // std::cout << GridLogMessage << " Entering non parallel loop " << std::endl;
+    for (int t = 0; t < fd; t++)
+    {
+        int pt = t / ld; // processor plane
+        int lt = t % ld;
+        for (int i = 0; i < Lblock; i++)
+        {
+            for (int j = 0; j < Rblock; j++)
+            {
+                if (pt == grid->_processor_coor[orthogdim])
+                {
+                    int ij_dx = i + Lblock * j + Lblock * Rblock * lt;
+                    mat[i + j * Lblock][t] = lsSum[ij_dx];
+                }
+                else
+                {
+                    mat[i + j * Lblock][t] = scalar_type(0.0);
+                }
+            }
+        }
+    }
+    // std::cout << GridLogMessage << " Done " << std::endl;
+    // defer sum over nodes.
+    return;
+}
+
+END_HADRONS_NAMESPACE
+
+#endif // A2A_Reduction_hpp_
\ No newline at end of file
diff --git a/extras/Hadrons/AllToAllVectors.hpp b/extras/Hadrons/AllToAllVectors.hpp
index a514beac..24f5397e 100644
--- a/extras/Hadrons/AllToAllVectors.hpp
+++ b/extras/Hadrons/AllToAllVectors.hpp
@@ -182,6 +182,7 @@ class A2AModesSchurDiagTwo
         assert(sol_o.checkerboard == Odd);
 
         action.DminusDag(tmp_wout, wout_5d);
+        action.ExportPhysicalFermionSolution(wout_5d, wout_4d);
     }
 
diff --git a/extras/Hadrons/Makefile.am b/extras/Hadrons/Makefile.am
index 7240be82..abfcf9a8 100644
--- a/extras/Hadrons/Makefile.am
+++ b/extras/Hadrons/Makefile.am
@@ -15,6 +15,7 @@ libHadrons_adir = $(pkgincludedir)/Hadrons
 nobase_libHadrons_a_HEADERS = \
     $(modules_hpp) \
     AllToAllVectors.hpp \
+    AllToAllReduction.hpp \
     Application.hpp \
     EigenPack.hpp \
     Environment.hpp \
diff --git a/extras/Hadrons/Modules/MContraction/A2AMeson.hpp b/extras/Hadrons/Modules/MContraction/A2AMeson.hpp
index fa1cb82e..45fb4db9 100644
--- a/extras/Hadrons/Modules/MContraction/A2AMeson.hpp
+++ b/extras/Hadrons/Modules/MContraction/A2AMeson.hpp
@@ -174,8 +174,8 @@ void TA2AMeson<FImpl>::execute(void)
     {
         for (unsigned int j = 0; j < N; j++)
         {
-            sliceInnerProductVector(MF_x, w1[i], v1[j], Tp);
-            sliceInnerProductVector(MF_y, w1[j], v1[i], Tp);
+            mySliceInnerProductVector(MF_x, w1[i], v1[j], Tp);
+            mySliceInnerProductVector(MF_y, w1[j], v1[i], Tp);
             for (unsigned int t = 0; t < nt; ++t)
             {
                 for (unsigned int tx = 0; tx < nt; tx++)
diff --git a/extras/Hadrons/Modules/MContraction/MesonFieldGamma.hpp b/extras/Hadrons/Modules/MContraction/MesonFieldGamma.hpp
index 0b81f607..eefd5e64 100644
--- a/extras/Hadrons/Modules/MContraction/MesonFieldGamma.hpp
+++ b/extras/Hadrons/Modules/MContraction/MesonFieldGamma.hpp
@@ -5,6 +5,9 @@
 #include <Grid/Hadrons/Module.hpp>
 #include <Grid/Hadrons/ModuleFactory.hpp>
 #include <Grid/Hadrons/AllToAllVectors.hpp>
+#include <Grid/Hadrons/AllToAllReduction.hpp>
+#include <Grid/Grid_Eigen_Dense.h>
+#include <fstream>
 
 BEGIN_HADRONS_NAMESPACE
 
@@ -19,6 +22,7 @@ class MesonFieldPar : Serializable
     GRID_SERIALIZABLE_CLASS_MEMBERS(MesonFieldPar,
                                     int, Nl,
                                     int, N,
+                                    int, Nblock,
                                     std::string, A2A1,
                                     std::string, A2A2,
                                     std::string, gammas,
@@ -40,6 +44,7 @@ class TMesonFieldGamma : public Module<MesonFieldPar>
         GRID_SERIALIZABLE_CLASS_MEMBERS(Result,
                                         Gamma::Algebra, gamma,
                                         std::vector<std::vector<std::vector<ComplexD>>>, MesonField,
+                                        std::vector<std::vector<ComplexD>>, MesonFiield,
                                         ComplexD, last);
     };
 
@@ -52,6 +57,9 @@ class TMesonFieldGamma : public Module<MesonFieldPar>
    virtual std::vector<std::string> getInput(void);
    virtual std::vector<std::string> getOutput(void);
    virtual void parseGammaString(std::vector<Gamma::Algebra> &gammaList);
+    virtual void vectorOfWs(std::vector<FermionField> &w, int i, int Nblock, FermionField &tmpw_5d, std::vector<FermionField> &vec_w);
+    virtual void vectorOfVs(std::vector<FermionField> &v, int j, int Nblock, FermionField &tmpv_5d, std::vector<FermionField> &vec_v);
+    virtual void gammaMult(std::vector<FermionField> &v, Gamma gamma);
    // setup
    virtual void setup(void);
    // execution
@@ -107,19 +115,54 @@ void TMesonFieldGamma<FImpl>::parseGammaString(std::vector<Gamma::Algebra> &gamm
     }
 }
 
+template <typename FImpl>
+void TMesonFieldGamma<FImpl>::vectorOfWs(std::vector<FermionField> &w, int i, int Nblock, FermionField &tmpw_5d, std::vector<FermionField> &vec_w)
+{
+    for (unsigned int ni = 0; ni < Nblock; ni++)
+    {
+        vec_w[ni] = w[i + ni];
+    }
+}
+
+template <typename FImpl>
+void TMesonFieldGamma<FImpl>::vectorOfVs(std::vector<FermionField> &v, int j, int Nblock, FermionField &tmpv_5d, std::vector<FermionField> &vec_v)
+{
+    for (unsigned int nj = 0; nj < Nblock; nj++)
+    {
+        vec_v[nj] = v[j + nj];
+    }
+}
+
+template <typename FImpl>
+void TMesonFieldGamma<FImpl>::gammaMult(std::vector<FermionField> &v, Gamma gamma)
+{
+    int Nblock = v.size();
+    for (unsigned int nj = 0; nj < Nblock; nj++)
+    {
+        v[nj] = gamma * v[nj];
+    }
+}
+
 // setup ///////////////////////////////////////////////////////////////////////
 template <typename FImpl>
 void TMesonFieldGamma<FImpl>::setup(void)
 {
     int nt = env().getDim(Tp);
     int N = par().N;
+    int Nblock = par().Nblock;
 
     int Ls_ = env().getObjectLs(par().A2A1 + "_class");
 
-    envTmpLat(FermionField, "w", Ls_);
-    envTmpLat(FermionField, "v", Ls_);
     envTmpLat(FermionField, "tmpv_5d", Ls_);
     envTmpLat(FermionField, "tmpw_5d", Ls_);
+
+    envTmp(std::vector<FermionField>, "w", 1, N, FermionField(env().getGrid(1)));
+    envTmp(std::vector<FermionField>, "v", 1, N, FermionField(env().getGrid(1)));
+
+    envTmp(Eigen::MatrixXcd, "MF", 1, Eigen::MatrixXcd::Zero(nt, N * N));
+
+    envTmp(std::vector<FermionField>, "w_block", 1, Nblock, FermionField(env().getGrid(1)));
+    envTmp(std::vector<FermionField>, "v_block", 1, Nblock, FermionField(env().getGrid(1)));
 }
 
 // execution ///////////////////////////////////////////////////////////////////
@@ -130,6 +173,7 @@ void TMesonFieldGamma<FImpl>::execute(void)
 
     int N = par().N;
     int nt = env().getDim(Tp);
+    int Nblock = par().Nblock;
 
     std::vector<Result> result;
     std::vector<Gamma::Algebra> gammaResultList;
@@ -145,33 +189,54 @@
     {
         result[i].gamma = gammaResultList[i];
         result[i].MesonField.resize(N, std::vector<std::vector<ComplexD>>(N, std::vector<ComplexD>(nt)));
+        result[i].MesonFiield.resize(N, std::vector<ComplexD>(nt));
 
         Gamma gamma(gammaResultList[i]);
         gammaList[i] = gamma;
     }
 
-    std::vector<ComplexD> MesonField_ij;
-    MesonField_ij.resize(nt);
-
     auto &a2a1 = envGet(A2ABase, par().A2A1 + "_class");
     auto &a2a2 = envGet(A2ABase, par().A2A2 + "_class");
 
-    envGetTmp(FermionField, w);
-    envGetTmp(FermionField, v);
     envGetTmp(FermionField, tmpv_5d);
     envGetTmp(FermionField, tmpw_5d);
 
-    for (unsigned int i = 0; i < N; i++)
+    envGetTmp(std::vector<FermionField>, v);
+    envGetTmp(std::vector<FermionField>, w);
+    LOG(Message) << "Finding v and w vectors for N = " << N << std::endl;
+    for (int i = 0; i < N; i++)
     {
-        a2a1.return_w(i, tmpw_5d, w);
-        for (unsigned int j = 0; j < N; j++)
+        a2a2.return_v(i, tmpv_5d, v[i]);
+        a2a1.return_w(i, tmpw_5d, w[i]);
+    }
+    LOG(Message) << "Found v and w vectors for N = " << N << std::endl;
+
+    std::vector<std::vector<ComplexD>> MesonField_ij;
+    LOG(Message) << "Before blocked MFs, Nblock = " << Nblock << std::endl;
+    envGetTmp(std::vector<FermionField>, v_block);
+    envGetTmp(std::vector<FermionField>, w_block);
+    MesonField_ij.resize(Nblock * Nblock, std::vector<ComplexD>(nt));
+
+    envGetTmp(Eigen::MatrixXcd, MF);
+
+    LOG(Message) << "Before blocked MFs, Nblock = " << Nblock << std::endl;
+    for (unsigned int i = 0; i < N; i += Nblock)
+    {
+        vectorOfWs(w, i, Nblock, tmpw_5d, w_block);
+        for (unsigned int j = 0; j < N; j += Nblock)
         {
-            a2a2.return_v(j, tmpv_5d, v);
+            vectorOfVs(v, j, Nblock, tmpv_5d, v_block);
             for (unsigned int k = 0; k < result.size(); k++)
             {
-                v = gammaList[k]*v;
-                sliceInnerProductVector(MesonField_ij, w, v, Tp);
-                result[k].MesonField[i][j] = MesonField_ij;
+                gammaMult(v_block, gammaList[k]);
+                sliceInnerProductMesonField(MesonField_ij, w_block, v_block, Tp);
+                for (unsigned int nj = 0; nj < Nblock; nj++)
+                {
+                    for (unsigned int ni = 0; ni < Nblock; ni++)
+                    {
+                        MF.col((i + ni) + (j + nj) * N) = Eigen::VectorXcd::Map(&MesonField_ij[nj * Nblock + ni][0], MesonField_ij[nj * Nblock + ni].size());
+                    }
+                }
             }
         }
         if (i % 10 == 0)
@@ -179,7 +244,22 @@
         {
             LOG(Message) << "MF for i = " << i << " of " << N << std::endl;
         }
     }
-    result[0].last = MesonField_ij[7];
+    LOG(Message) << "Before Global sum, Nblock = " << Nblock << std::endl;
+    v_block[0]._grid->GlobalSumVector(MF.data(), MF.size());
+    LOG(Message) << "After Global sum, Nblock = " << Nblock << std::endl;
+    for (unsigned int i = 0; i < N; i++)
+    {
+        for (unsigned int j = 0; j < N; j++)
+        {
+            for (unsigned int k = 0; k < result.size(); k++)
+            {
+                for (unsigned int t = 0; t < nt; t++)
+                {
+                    result[k].MesonField[i][j][t] = MF.col(i + N * j)[t];
+                }
+            }
+        }
+    }
 
     saveResult(par().output, "meson", result);
 }
diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h
index 12bfebed..b8d13b47 100644
--- a/lib/lattice/Lattice_reduction.h
+++ b/lib/lattice/Lattice_reduction.h
@@ -41,7 +41,6 @@ template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
 template<class vobj>
 inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
 {
-  std::cout << GridLogMessage << "Start alloc innerProduct" << std::endl;
   typedef typename vobj::scalar_type scalar_type;
   typedef typename vobj::vector_typeD vector_type;
   GridBase *grid = left._grid;
@@ -50,8 +49,6 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
   ComplexD inner;
   Vector<ComplexD> sumarray(grid->SumArraySize()*pad);
 
-  std::cout << GridLogMessage << "End alloc innerProduct" << std::endl;
-  std::cout << GridLogMessage << "Start parallel for innerProduct" << std::endl;
   parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
     int nwork, mywork, myoff;
     GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
@@ -65,18 +62,12 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
     ComplexD tmp = Reduce(TensorRemove(vinner)) ;
     vstream(sumarray[thr*pad],tmp);
   }
-  std::cout << GridLogMessage << "End parallel for innerProduct" << std::endl;
-
-  std::cout << GridLogMessage << "Start inner sum innerProduct" << std::endl;
   inner=0.0;
   for(int i=0;i<grid->SumArraySize();i++){
     inner = inner+sumarray[i*pad];
   }
   right._grid->GlobalSum(inner);
   return inner;
-  std::cout << GridLogMessage << "End inner sum innerProduct" << std::endl;
-
-  std::cout << GridLogMessage << "End innerProduct" << std::endl;
 }
 
 /////////////////////////
@@ -285,7 +276,7 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
 template <class vobj>
 static void mySliceInnerProductVector( std::vector<ComplexD> & result, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int orthogdim)
 {
-  std::cout << GridLogMessage << "Start mySsliceInnerProductVector" << std::endl;
+  std::cout << GridLogMessage << "Start mySliceInnerProductVector" << std::endl;
   typedef typename vobj::scalar_type scalar_type;
   std::vector<scalar_type> lsSum;
@@ -313,12 +304,12 @@ static void localSliceInnerProductVector(std::vector<ComplexD> &result, const La
   int fd=grid->_fdimensions[orthogdim];
   int ld=grid->_ldimensions[orthogdim];
   int rd=grid->_rdimensions[orthogdim];
-  std::cout << GridLogMessage << "Start alloc" << std::endl;
+  // std::cout << GridLogMessage << "Start alloc" << std::endl;
GridLogMessage << "Start alloc" << std::endl; std::vector > lvSum(rd); // will locally sum vectors first lsSum.resize(ld,scalar_type(0.0)); // sum across these down to scalars std::vector> extracted(Nsimd); // splitting the SIMD - std::cout << GridLogMessage << "End alloc" << std::endl; + // std::cout << GridLogMessage << "End alloc" << std::endl; result.resize(fd); // And then global sum to return the same vector to every node for IO to file for(int r=0;r &result, const La int e1= grid->_slice_nblock[orthogdim]; int e2= grid->_slice_block [orthogdim]; int stride=grid->_slice_stride[orthogdim]; - std::cout << GridLogMessage << "End prep" << std::endl; - std::cout << GridLogMessage << "Start parallel inner product, _rd = " << rd << std::endl; + // std::cout << GridLogMessage << "End prep" << std::endl; + // std::cout << GridLogMessage << "Start parallel inner product, _rd = " << rd << std::endl; vector_type vv; parallel_for(int r=0;r &result, const La for(int n=0;n icoor(Nd); @@ -364,7 +355,7 @@ static void localSliceInnerProductVector(std::vector &result, const La } } - std::cout << GridLogMessage << "End sum over simd lanes" << std::endl; + // std::cout << GridLogMessage << "End sum over simd lanes" << std::endl; } template static void globalSliceInnerProductVector(std::vector &result, const Lattice &lhs, std::vector &lsSum, int orthogdim) @@ -376,7 +367,7 @@ static void globalSliceInnerProductVector(std::vector &result, const L // sum over nodes. std::vector gsum; gsum.resize(fd, scalar_type(0.0)); - std::cout << GridLogMessage << "Start of gsum[t] creation:" << std::endl; + // std::cout << GridLogMessage << "Start of gsum[t] creation:" << std::endl; for(int t=0;t &result, const L gsum[t]=lsSum[lt]; } } - std::cout << GridLogMessage << "End of gsum[t] creation:" << std::endl; - std::cout << GridLogMessage << "Start of GlobalSumVector:" << std::endl; + // std::cout << GridLogMessage << "End of gsum[t] creation:" << std::endl; + // std::cout << GridLogMessage << "Start of GlobalSumVector:" << std::endl; grid->GlobalSumVector(&gsum[0], fd); - std::cout << GridLogMessage << "End of GlobalSumVector:" << std::endl; + // std::cout << GridLogMessage << "End of GlobalSumVector:" << std::endl; result = gsum; } diff --git a/lib/tensors/Tensor_inner.h b/lib/tensors/Tensor_inner.h index b7a8417d..46185652 100644 --- a/lib/tensors/Tensor_inner.h +++ b/lib/tensors/Tensor_inner.h @@ -106,7 +106,6 @@ inline vRealD innerProductD(const vRealF &l,const vRealF &r){ typedef decltype(innerProduct(lhs._internal[0],rhs._internal[0])) ret_t; iScalar ret; ret=zero; - // std::cout << GridLogMessage << "innerProduct iVector" << std::endl; for(int c1=0;c1 ret; - // std::cout << GridLogMessage << "innerProduct iScalar" << std::endl; - ret._internal = innerProduct(lhs._internal,rhs._internal); return ret; } - template inline - auto myInnerProduct (const iVector& lhs,const iVector& rhs) -> iScalar - { - typedef decltype(innerProduct(lhs._internal[0],rhs._internal[0])) ret_t; - iScalar ret; - ret=zero; - std::cout << GridLogMessage << "myInnerProduct iVector, N = " << N << std::endl; - for(int c1=0;c1 inline - auto myInnerProduct (const iScalar& lhs,const iScalar& rhs) -> iScalar - { - typedef decltype(innerProduct(lhs._internal,rhs._internal)) ret_t; - iScalar ret; - std::cout << GridLogMessage << "myInnerProduct iScalar" << std::endl; - - ret._internal = myInnerProduct(lhs._internal,rhs._internal); - return ret; - } } #endif