From 733f8ff0b2de7210763a341ca454a870dc10ebeb Mon Sep 17 00:00:00 2001
From: paboyle <paboyle@ph.ed.ac.uk>
Date: Wed, 24 Jan 2018 13:38:13 +0000
Subject: [PATCH] Still using parallel_for -- don't know how to implement
 reduction on GPU yet. Look at some sample code is best.

---
 lib/lattice/Lattice_reduction.h | 205 +-------------------------------
 1 file changed, 4 insertions(+), 201 deletions(-)
diff --git a/lib/lattice/Lattice_reduction.h b/lib/lattice/Lattice_reduction.h
index 3e0a36fd..c6c44706 100644
--- a/lib/lattice/Lattice_reduction.h
+++ b/lib/lattice/Lattice_reduction.h
@@ -22,8 +22,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_LATTICE_REDUCTION_H
 #define GRID_LATTICE_REDUCTION_H
 
-#include <Grid/Grid_Eigen_Dense.h>
-
 #ifdef GRID_WARN_SUBOPTIMAL
 #warning "Optimisation alert all these reduction loops are NOT threaded "
 #endif     
@@ -35,7 +33,7 @@ NAMESPACE_BEGIN(Grid);
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
   ComplexD nrm = innerProduct(arg,arg);
-  return std::real(nrm); 
+  return real(nrm); 
 }
 
 // Double inner product
@@ -51,7 +49,7 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
   std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize());
   
   parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
-    int nwork, mywork, myoff;
+    int mywork, myoff;
     GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
     
     decltype(innerProductD(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation
@@ -107,7 +105,7 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
   }
   
   parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
-    int nwork, mywork, myoff;
+    int mywork, myoff;
     GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
     
     vobj vvsum=zero;
@@ -123,7 +121,7 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
   } 
   
   typedef typename vobj::scalar_object sobj;
-  sobj ssum=zero;
+  sobj ssum; zeroit(ssum);
   
   std::vector<sobj>               buf(Nsimd);
   extract(vsum,buf);
@@ -370,201 +368,6 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
   }
 };
 
-/*
-  inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
-  {
-  int NN    = BlockSolverGrid->_ndimension;
-  int nsimd = BlockSolverGrid->Nsimd();
-  
-  std::vector<int> latt_phys(0);
-  std::vector<int> simd_phys(0);
-  std::vector<int>  mpi_phys(0);
-  
-  for(int d=0;d<NN;d++){
-  if( d!=Orthog ) { 
-  latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);
-  simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);
-  mpi_phys.push_back(BlockSolverGrid->_processors[d]);
-  }
-  }
-  return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); 
-  }
-*/
-
-template<class vobj>
-static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
-{    
-  typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_type vector_type;
-
-  int Nblock = X._grid->GlobalDimensions()[Orthog];
-
-  GridBase *FullGrid  = X._grid;
-  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
-
-  //  Lattice<vobj> Xslice(SliceGrid);
-  //  Lattice<vobj> Rslice(SliceGrid);
-
-  assert( FullGrid->_simd_layout[Orthog]==1);
-  int nh =  FullGrid->_ndimension;
-  //  int nl = SliceGrid->_ndimension;
-  int nl = nh-1;
-
-  //FIXME package in a convenient iterator
-  //Should loop over a plane orthogonal to direction "Orthog"
-  int stride=FullGrid->_slice_stride[Orthog];
-  int block =FullGrid->_slice_block [Orthog];
-  int nblock=FullGrid->_slice_nblock[Orthog];
-  int ostride=FullGrid->_ostride[Orthog];
-#pragma omp parallel 
-  {
-    std::vector<vobj> s_x(Nblock);
-
-#pragma omp for collapse(2)
-    for(int n=0;n<nblock;n++){
-      for(int b=0;b<block;b++){
-	int o  = n*stride + b;
-
-	for(int i=0;i<Nblock;i++){
-	  s_x[i] = X[o+i*ostride];
-	}
-
-	vobj dot;
-	for(int i=0;i<Nblock;i++){
-	  dot = Y[o+i*ostride];
-	  for(int j=0;j<Nblock;j++){
-	    dot = dot + s_x[j]*(scale*aa(j,i));
-	  }
-	  R[o+i*ostride]=dot;
-	}
-      }}
-  }
-};
-
-template<class vobj>
-static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0) 
-{    
-  typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_type vector_type;
-
-  int Nblock = X._grid->GlobalDimensions()[Orthog];
-
-  GridBase *FullGrid  = X._grid;
-  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
-  //  Lattice<vobj> Xslice(SliceGrid);
-  //  Lattice<vobj> Rslice(SliceGrid);
-
-  assert( FullGrid->_simd_layout[Orthog]==1);
-  int nh =  FullGrid->_ndimension;
-  //  int nl = SliceGrid->_ndimension;
-  int nl=1;
-
-  //FIXME package in a convenient iterator
-  //Should loop over a plane orthogonal to direction "Orthog"
-  int stride=FullGrid->_slice_stride[Orthog];
-  int block =FullGrid->_slice_block [Orthog];
-  int nblock=FullGrid->_slice_nblock[Orthog];
-  int ostride=FullGrid->_ostride[Orthog];
-#pragma omp parallel 
-  {
-    std::vector<vobj> s_x(Nblock);
-
-#pragma omp for collapse(2)
-    for(int n=0;n<nblock;n++){
-      for(int b=0;b<block;b++){
-	int o  = n*stride + b;
-
-	for(int i=0;i<Nblock;i++){
-	  s_x[i] = X[o+i*ostride];
-	}
-
-	vobj dot;
-	for(int i=0;i<Nblock;i++){
-	  dot = s_x[0]*(scale*aa(0,i));
-	  for(int j=1;j<Nblock;j++){
-	    dot = dot + s_x[j]*(scale*aa(j,i));
-	  }
-	  R[o+i*ostride]=dot;
-	}
-      }}
-  }
-
-};
-
-
-template<class vobj>
-static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
-{
-  typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_type vector_type;
-  
-  GridBase *FullGrid  = lhs._grid;
-  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
-  
-  int Nblock = FullGrid->GlobalDimensions()[Orthog];
-  
-  //  Lattice<vobj> Lslice(SliceGrid);
-  //  Lattice<vobj> Rslice(SliceGrid);
-  
-  mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-
-  assert( FullGrid->_simd_layout[Orthog]==1);
-  int nh =  FullGrid->_ndimension;
-  //  int nl = SliceGrid->_ndimension;
-  int nl = nh-1;
-
-  //FIXME package in a convenient iterator
-  //Should loop over a plane orthogonal to direction "Orthog"
-  int stride=FullGrid->_slice_stride[Orthog];
-  int block =FullGrid->_slice_block [Orthog];
-  int nblock=FullGrid->_slice_nblock[Orthog];
-  int ostride=FullGrid->_ostride[Orthog];
-
-  typedef typename vobj::vector_typeD vector_typeD;
-
-#pragma omp parallel 
-  {
-    std::vector<vobj> Left(Nblock);
-    std::vector<vobj> Right(Nblock);
-    Eigen::MatrixXcd  mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
-
-#pragma omp for collapse(2)
-    for(int n=0;n<nblock;n++){
-      for(int b=0;b<block;b++){
-
-	int o  = n*stride + b;
-
-	for(int i=0;i<Nblock;i++){
-	  Left [i] = lhs[o+i*ostride];
-	  Right[i] = rhs[o+i*ostride];
-	}
-
-	for(int i=0;i<Nblock;i++){
-	  for(int j=0;j<Nblock;j++){
-	    auto tmp = innerProduct(Left[i],Right[j]);
-	    auto rtmp = TensorRemove(tmp);
-	    mat_thread(i,j) += Reduce(rtmp);
-	  }}
-      }}
-#pragma omp critical
-    {
-      mat += mat_thread;
-    }  
-  }
-
-  for(int i=0;i<Nblock;i++){
-    for(int j=0;j<Nblock;j++){
-      ComplexD sum = mat(i,j);
-      FullGrid->GlobalSum(sum);
-      mat(i,j)=sum;
-    }}
-
-  return;
-}
-
 NAMESPACE_END(Grid);
 #endif