From 7ede6961269516638aa21560f9511ec5140fbcb2 Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Sun, 16 Apr 2017 23:40:00 +0100
Subject: [PATCH] Non compile of tests fixed

---
 TODO                                          |  17 +-
 .../iterative/BlockConjugateGradient.h        | 201 ------------------
 lib/lattice/Lattice_reduction.h               | 159 ++++++++++++++
 lib/simd/Grid_vector_types.h                  |   2 +-
 lib/tensors/Tensor_traits.h                   |   1 +
 5 files changed, 170 insertions(+), 210 deletions(-)
diff --git a/TODO b/TODO
index 91034f20..27579ad3 100644
--- a/TODO
+++ b/TODO
@@ -3,19 +3,20 @@ TODO:
 
 Peter's work list:
 
--- Merge high precision reduction into develop
+-- Remove DenseVector, DenseMatrix; Use Eigen instead. <-- started
+-- Merge high precision reduction into develop         <-- done
+-- Precision conversion and sort out localConvert      <-- 
 -- Physical propagator interface
--- Precision conversion and sort out localConvert
--- slice* linalg routines for multiRHS, BlockCG
+
+-- multiRHS DWF; benchmark on Cori/BNL for comms elimination
+   -- slice* linalg routines for multiRHS, BlockCG        <-- started
+
 -- Profile CG, BlockCG, etc... Flop count/rate
 -- Binary I/O speed up & x-strips
--- Half-precision comms
--- multiRHS DWF; benchmark on Cori/BNL for comms elimination
+-- Half-precision comms                                <-- started
 -- GaugeFix into central location
--- Help Julia with NPR code
--- Switch to measurements
+-- FFTfix in sensible place
 -- Multigrid Wilson and DWF, compare to other Multigrid implementations
--- Remove DenseVector, DenseMatrix; Use Eigen instead.
 -- quaternions                 -- Might not need
 
 
diff --git a/lib/algorithms/iterative/BlockConjugateGradient.h b/lib/algorithms/iterative/BlockConjugateGradient.h
index 0f4a3a80..1db89512 100644
--- a/lib/algorithms/iterative/BlockConjugateGradient.h
+++ b/lib/algorithms/iterative/BlockConjugateGradient.h
@@ -30,210 +30,9 @@ directory
 #ifndef GRID_BLOCK_CONJUGATE_GRADIENT_H
 #define GRID_BLOCK_CONJUGATE_GRADIENT_H
 
-#include <Grid/Eigen/Dense>
 
 namespace Grid {
 
-GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
-{
-  int NN    = BlockSolverGrid->_ndimension;
-  int nsimd = BlockSolverGrid->Nsimd();
-
-  std::vector<int> latt_phys(0);
-  std::vector<int> simd_phys(0);
-  std::vector<int>  mpi_phys(0);
-  
-  for(int d=0;d<NN;d++){
-    if( d!=Orthog ) { 
-    latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);
-    simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);
-     mpi_phys.push_back(BlockSolverGrid->_processors[d]);
-    }
-  }
-  return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); 
-}
-  //////////////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Need to move sliceInnerProduct, sliceAxpy, sliceNorm etc... into lattice sector along with sliceSum
-  //////////////////////////////////////////////////////////////////////////////////////////////////////////////
-template<class vobj>
-static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
-{    
-  typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_type vector_type;
-
-  int Nblock = X._grid->GlobalDimensions()[Orthog];
-    
-  GridBase *FullGrid  = X._grid;
-  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
-  
-  Lattice<vobj> Xslice(SliceGrid);
-  Lattice<vobj> Rslice(SliceGrid);
-  // FIXME: Implementation is slow
-  // If we based this on Cshift it would work for spread out
-  // but it would be even slower
-  //
-  // Repeated extract slice is inefficient
-  //
-  // Best base the linear combination by constructing a 
-  // set of vectors of size grid->_rdimensions[Orthog].
-  for(int i=0;i<Nblock;i++){
-    ExtractSlice(Rslice,Y,i,Orthog);
-    for(int j=0;j<Nblock;j++){
-      ExtractSlice(Xslice,X,j,Orthog);
-      Rslice = Rslice + Xslice*(scale*aa(j,i));
-    }
-    InsertSlice(Rslice,R,i,Orthog);
-  }
-};
-template<class vobj>
-static void sliceMaddVector (Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
-			     int Orthog,RealD scale=1.0) 
-{    
-  // FIXME: Implementation is slow
-  // Best base the linear combination by constructing a 
-  // set of vectors of size grid->_rdimensions[Orthog].
-  typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_type vector_type;
-
-  int Nblock = X._grid->GlobalDimensions()[Orthog];
-    
-  GridBase *FullGrid  = X._grid;
-  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
-  
-  Lattice<vobj> Xslice(SliceGrid);
-  Lattice<vobj> Rslice(SliceGrid);
-  // If we based this on Cshift it would work for spread out
-  // but it would be even slower
-  for(int i=0;i<Nblock;i++){
-    ExtractSlice(Rslice,Y,i,Orthog);
-    ExtractSlice(Xslice,X,i,Orthog);
-    Rslice = Rslice + Xslice*(scale*a[i]);
-    InsertSlice(Rslice,R,i,Orthog);
-  }
-};
-template<class vobj>
-static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
-{
-  // FIXME: Implementation is slow
-  // Not sure of best solution.. think about it
-  typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_type vector_type;
-
-  GridBase *FullGrid  = lhs._grid;
-  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
-
-  int Nblock = FullGrid->GlobalDimensions()[Orthog];
-  
-  Lattice<vobj> Lslice(SliceGrid);
-  Lattice<vobj> Rslice(SliceGrid);
-
-  mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-
-  for(int i=0;i<Nblock;i++){
-    ExtractSlice(Lslice,lhs,i,Orthog);
-    for(int j=0;j<Nblock;j++){
-      ExtractSlice(Rslice,rhs,j,Orthog);
-      mat(i,j) = innerProduct(Lslice,Rslice);
-    }
-  }
-#undef FORCE_DIAG
-#ifdef FORCE_DIAG
-  for(int i=0;i<Nblock;i++){
-    for(int j=0;j<Nblock;j++){
-      if ( i != j ) mat(i,j)=0.0;
-    }
-  }
-#endif
-  return;
-}
-template<class vobj>
-static void sliceInnerProductVector( std::vector<ComplexD> & vec, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
-{
-  // FIXME: Implementation is slow
-  // Look at localInnerProduct implementation,
-  // and do inside a site loop with block strided iterators
-  typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_type vector_type;
-  typedef typename vobj::tensor_reduced scalar;
-  typedef typename scalar::scalar_object  scomplex;
-  
-  int Nblock = lhs._grid->GlobalDimensions()[Orthog];
-
-  vec.resize(Nblock);
-  std::vector<scomplex> sip(Nblock);
-  Lattice<scalar> IP(lhs._grid); 
-
-  IP=localInnerProduct(lhs,rhs);
-  sliceSum(IP,sip,Orthog);
-  
-  for(int ss=0;ss<Nblock;ss++){
-    vec[ss] = TensorRemove(sip[ss]);
-  }
-}
-template<class vobj>
-static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Orthog) {
-
-  typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_type vector_type;
-  
-  int Nblock = rhs._grid->GlobalDimensions()[Orthog];
-  std::vector<ComplexD> ip(Nblock);
-  sn.resize(Nblock);
-  
-  sliceInnerProductVector(ip,rhs,rhs,Orthog);
-  for(int ss=0;ss<Nblock;ss++){
-    sn[ss] = real(ip[ss]);
-  }
-};
-/*
-template<class vobj>
-static void sliceInnerProductMatrixOld(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
-{
-  typedef typename vobj::scalar_object  sobj;
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_type vector_type;
-  typedef typename vobj::tensor_reduced scalar;
-  typedef typename scalar::scalar_object  scomplex;
-
-  int Nblock = lhs._grid->GlobalDimensions()[Orthog];
-
-  std::cout << " sliceInnerProductMatrix Dim "<<Orthog<<" Nblock " << Nblock<<std::endl;
-
-  Lattice<scalar> IP(lhs._grid); 
-  std::vector<scomplex> sip(Nblock);
-    
-  mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-
-  Lattice<vobj> tmp = rhs;
-  
-  for(int s1=0;s1<Nblock;s1++){
-    
-    IP=localInnerProduct(lhs,tmp);
-    sliceSum(IP,sip,Orthog);
-
-    std::cout << "InnerProductMatrix ["<<s1<<"] = ";
-    for(int ss=0;ss<Nblock;ss++){
-      std::cout << TensorRemove(sip[ss])<<" ";
-    }
-    std::cout << std::endl;
-
-    for(int ss=0;ss<Nblock;ss++){
-      mat(ss,(s1+ss)%Nblock) = TensorRemove(sip[ss]);
-    }
-    if ( s1!=(Nblock-1) ) { 
-      tmp = Cshift(tmp,Orthog,1);
-    }
-  }
-}
-*/
-
-
-
 //////////////////////////////////////////////////////////////////////////
 // Block conjugate gradient. Dimension zero should be the block direction
 //////////////////////////////////////////////////////////////////////////
diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h
index e12bf0dd..d5de4bc3 100644
--- a/lib/lattice/Lattice_reduction.h
+++ b/lib/lattice/Lattice_reduction.h
@@ -30,6 +30,8 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_LATTICE_REDUCTION_H
 #define GRID_LATTICE_REDUCTION_H
 
+#include <Grid/Eigen/Dense>
+
 namespace Grid {
 #ifdef GRID_WARN_SUBOPTIMAL
 #warning "Optimisation alert all these reduction loops are NOT threaded "
@@ -215,6 +217,163 @@ template<class vobj> inline void sliceSum(const Lattice<vobj> &Data,std::vector<
   }
 }
 
+// Builds a new GridCartesian from BlockSolverGrid with dimension Orthog dropped.
+// The grid is allocated with new and returned as a raw pointer; nothing here frees it.
+inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
+ {
+   const int NN = BlockSolverGrid->_ndimension;
+   std::vector<int> latt_phys;
+   std::vector<int> simd_phys;
+   std::vector<int>  mpi_phys;
+   // Keep the _fdimensions, _simd_layout and _processors entry of every dimension but Orthog.
+   for(int d=0;d<NN;d++){
+     if( d!=Orthog ) {
+       latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);
+       simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);
+       mpi_phys.push_back(BlockSolverGrid->_processors[d]);
+     }
+   }
+   return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys);
+ }
+ //////////////////////////////////////////////////////////////////////////////////////////////////////////////
+ // Slice linear algebra (sliceMadd*, sliceInnerProduct*, sliceNorm) moved here from BlockConjugateGradient.h, alongside sliceSum
+ //////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// For every slice i along Orthog: R_i = Y_i + scale * sum_j aa(j,i) * X_j.
+// aa is read at (j,i) for 0 <= i,j < Nblock, the global extent of X's grid along Orthog.
+// NOTE(review): the grid from makeSubSliceGrid is allocated per call and never freed here.
+template<class vobj>
+  static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
+  {    
+    int Nblock = X._grid->GlobalDimensions()[Orthog];
+    
+    GridBase *FullGrid  = X._grid;
+    GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+  
+    Lattice<vobj> Xslice(SliceGrid);
+    Lattice<vobj> Rslice(SliceGrid);
+    // FIXME: Implementation is slow
+    // If we based this on Cshift it would work for spread out
+    // but it would be even slower
+    //
+    // Repeated extract slice is inefficient
+    //
+    // Best base the linear combination by constructing a 
+    // set of vectors of size grid->_rdimensions[Orthog].
+    // Start from slice i of Y, accumulate the weighted X slices, then store into slice i of R.
+    for(int i=0;i<Nblock;i++){
+      ExtractSlice(Rslice,Y,i,Orthog);
+      for(int j=0;j<Nblock;j++){
+        ExtractSlice(Xslice,X,j,Orthog);
+        Rslice = Rslice + Xslice*(scale*aa(j,i));
+      }
+      InsertSlice(Rslice,R,i,Orthog);
+    }
+  }
+// For every slice i along Orthog: R_i = Y_i + scale * a[i] * X_i.
+// a is read at indices 0..Nblock-1 with no size check (Nblock = global extent along Orthog).
+// NOTE(review): the grid from makeSubSliceGrid is allocated per call and never freed here.
+template<class vobj>
+  static void sliceMaddVector (Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
+                               int Orthog,RealD scale=1.0) 
+  {    
+    // FIXME: Implementation is slow
+    // Best base the linear combination by constructing a 
+    // set of vectors of size grid->_rdimensions[Orthog].
+
+    int Nblock = X._grid->GlobalDimensions()[Orthog];
+    
+    GridBase *FullGrid  = X._grid;
+    GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+  
+    Lattice<vobj> Xslice(SliceGrid);
+    Lattice<vobj> Rslice(SliceGrid);
+    // If we based this on Cshift it would work for spread out
+    // but it would be even slower
+    for(int i=0;i<Nblock;i++){
+      ExtractSlice(Rslice,Y,i,Orthog);
+      ExtractSlice(Xslice,X,i,Orthog);
+      Rslice = Rslice + Xslice*(scale*a[i]);
+      InsertSlice(Rslice,R,i,Orthog);
+    }
+  }
+// Sets mat to an Nblock x Nblock matrix with mat(i,j) = innerProduct(slice i of lhs, slice j of rhs),
+// where slices are taken along Orthog and Nblock is the global extent of lhs's grid in that dimension.
+// NOTE(review): the grid from makeSubSliceGrid is allocated per call and never freed here.
+template<class vobj>
+  static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
+  {
+    // FIXME: Implementation is slow
+    // Not sure of best solution.. think about it
+
+    GridBase *FullGrid  = lhs._grid;
+    GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
+
+    int Nblock = FullGrid->GlobalDimensions()[Orthog];
+  
+    Lattice<vobj> Lslice(SliceGrid);
+    Lattice<vobj> Rslice(SliceGrid);
+
+    mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
+
+    for(int i=0;i<Nblock;i++){
+      ExtractSlice(Lslice,lhs,i,Orthog);
+      for(int j=0;j<Nblock;j++){
+        ExtractSlice(Rslice,rhs,j,Orthog);
+        mat(i,j) = innerProduct(Lslice,Rslice);
+      }
+    }
+    // FORCE_DIAG is undefined on the next line, so the off-diagonal zeroing below is compiled out.
+#undef FORCE_DIAG
+#ifdef FORCE_DIAG
+    for(int i=0;i<Nblock;i++){
+      for(int j=0;j<Nblock;j++){
+        if ( i != j ) mat(i,j)=0.0;
+      }
+    }
+#endif
+  }
+// Fills vec[s], for each s along Orthog, with the slice sum of localInnerProduct(lhs,rhs).
+// vec is resized to Nblock, the global extent of lhs's grid along Orthog.
+// Assumes rhs lives on the same grid as lhs -- TODO confirm; this function does not check it.
+template<class vobj>
+  static void sliceInnerProductVector( std::vector<ComplexD> & vec, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
+  {
+    // FIXME: Implementation is slow
+    // Look at localInnerProduct implementation,
+    // and do inside a site loop with block strided iterators
+    typedef typename vobj::tensor_reduced scalar;
+    typedef typename scalar::scalar_object  scomplex;
+  
+    int Nblock = lhs._grid->GlobalDimensions()[Orthog];
+
+    vec.resize(Nblock);
+    std::vector<scomplex> sip(Nblock);
+    Lattice<scalar> IP(lhs._grid); 
+
+    IP=localInnerProduct(lhs,rhs);
+    sliceSum(IP,sip,Orthog);
+  
+    for(int ss=0;ss<Nblock;ss++){
+      vec[ss] = TensorRemove(sip[ss]);
+    }
+  }
+// Fills sn[s] with the real part of the slice-s inner product of rhs with itself (slices along Orthog).
+// sn is resized to Nblock, the global extent of rhs's grid along Orthog.
+// NOTE(review): no square root is taken, so this looks like a squared norm -- confirm callers expect that.
+template<class vobj>
+  static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Orthog) {
+
+    int Nblock = rhs._grid->GlobalDimensions()[Orthog];
+    std::vector<ComplexD> ip(Nblock);
+    sn.resize(Nblock);
+
+    // Per-slice inner product of rhs with itself; only the real part is kept.
+    sliceInnerProductVector(ip,rhs,rhs,Orthog);
+    for(int ss=0;ss<Nblock;ss++){
+      sn[ss] = real(ip[ss]);
+    }
+  }
+
 
 }
 #endif
diff --git a/lib/simd/Grid_vector_types.h b/lib/simd/Grid_vector_types.h
index 248a625c..8559a611 100644
--- a/lib/simd/Grid_vector_types.h
+++ b/lib/simd/Grid_vector_types.h
@@ -2,7 +2,7 @@
 
 Grid physics library, www.github.com/paboyle/Grid
 
-Source file: ./lib/simd/Grid_vector_types.h
+Source file: ./lib/simd/Grid_vector_types.h
 
 Copyright (C) 2015
 
diff --git a/lib/tensors/Tensor_traits.h b/lib/tensors/Tensor_traits.h
index 4dcfd9b1..e630c217 100644
--- a/lib/tensors/Tensor_traits.h
+++ b/lib/tensors/Tensor_traits.h
@@ -112,6 +112,7 @@ namespace Grid {
   public:
     typedef Integer scalar_type;
     typedef Integer vector_type;
+    typedef Integer vector_typeD;
     typedef Integer tensor_reduced;
     typedef Integer scalar_object;
     typedef void Complexified;