Still using parallel_for -- don't know how to implement reduction on GPU yet. Look at some sample code is best.

2026-05-24 02:54:16 +01:00 · 2018-01-24 13:38:13 +00:00
parent 0bfa5bb213
commit 733f8ff0b2
1 changed files with 4 additions and 201 deletions
@@ -22,8 +22,6 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
 #ifndef GRID_LATTICE_REDUCTION_H
 #define GRID_LATTICE_REDUCTION_H
 #include <Grid/Grid_Eigen_Dense.h>
 #ifdef GRID_WARN_SUBOPTIMAL
 #warning "Optimisation alert all these reduction loops are NOT threaded "
 #endif     
@@ -35,7 +33,7 @@ NAMESPACE_BEGIN(Grid);
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 template<class vobj> inline RealD norm2(const Lattice<vobj> &arg){
  ComplexD nrm = innerProduct(arg,arg);
-  return std::real(nrm); 
+  return real(nrm); 
 }
 // Double inner product
@@ -51,7 +49,7 @@ inline ComplexD innerProduct(const Lattice<vobj> &left,const Lattice<vobj> &righ
  std::vector<vector_type,alignedAllocator<vector_type> > sumarray(grid->SumArraySize());
  parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
-    int nwork, mywork, myoff;
+    int mywork, myoff;
    GridThread::GetWork(left._grid->oSites(),thr,mywork,myoff);
    decltype(innerProductD(left._odata[0],right._odata[0])) vnrm=zero; // private to thread; sub summation
@@ -107,7 +105,7 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
  }
  parallel_for(int thr=0;thr<grid->SumArraySize();thr++){
-    int nwork, mywork, myoff;
+    int mywork, myoff;
    GridThread::GetWork(grid->oSites(),thr,mywork,myoff);
    vobj vvsum=zero;
@@ -123,7 +121,7 @@ inline typename vobj::scalar_object sum(const Lattice<vobj> &arg)
  } 
  typedef typename vobj::scalar_object sobj;
-  sobj ssum=zero;
+  sobj ssum; zeroit(ssum);
  std::vector<sobj>               buf(Nsimd);
  extract(vsum,buf);
@@ -370,201 +368,6 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
  }
 };
 /*
  inline GridBase         *makeSubSliceGrid(const GridBase *BlockSolverGrid,int Orthog)
  {
  int NN    = BlockSolverGrid->_ndimension;
  int nsimd = BlockSolverGrid->Nsimd();
  std::vector<int> latt_phys(0);
  std::vector<int> simd_phys(0);
  std::vector<int>  mpi_phys(0);
  for(int d=0;d<NN;d++){
  if( d!=Orthog ) { 
  latt_phys.push_back(BlockSolverGrid->_fdimensions[d]);
  simd_phys.push_back(BlockSolverGrid->_simd_layout[d]);
  mpi_phys.push_back(BlockSolverGrid->_processors[d]);
  }
  }
  return (GridBase *)new GridCartesian(latt_phys,simd_phys,mpi_phys); 
  }
 */
 template<class vobj>
 static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0) 
 {    
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  int Nblock = X._grid->GlobalDimensions()[Orthog];
  GridBase *FullGrid  = X._grid;
  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
  //  Lattice<vobj> Xslice(SliceGrid);
  //  Lattice<vobj> Rslice(SliceGrid);
  assert( FullGrid->_simd_layout[Orthog]==1);
  int nh =  FullGrid->_ndimension;
  //  int nl = SliceGrid->_ndimension;
  int nl = nh-1;
  //FIXME package in a convenient iterator
  //Should loop over a plane orthogonal to direction "Orthog"
  int stride=FullGrid->_slice_stride[Orthog];
  int block =FullGrid->_slice_block [Orthog];
  int nblock=FullGrid->_slice_nblock[Orthog];
  int ostride=FullGrid->_ostride[Orthog];
 #pragma omp parallel 
  {
    std::vector<vobj> s_x(Nblock);
 #pragma omp for collapse(2)
    for(int n=0;n<nblock;n++){
      for(int b=0;b<block;b++){
 	int o  = n*stride + b;
 	for(int i=0;i<Nblock;i++){
 	  s_x[i] = X[o+i*ostride];
 	}
 	vobj dot;
 	for(int i=0;i<Nblock;i++){
 	  dot = Y[o+i*ostride];
 	  for(int j=0;j<Nblock;j++){
 	    dot = dot + s_x[j]*(scale*aa(j,i));
 	  }
 	  R[o+i*ostride]=dot;
 	}
      }}
  }
 };
 template<class vobj>
 static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0) 
 {    
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  int Nblock = X._grid->GlobalDimensions()[Orthog];
  GridBase *FullGrid  = X._grid;
  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
  //  Lattice<vobj> Xslice(SliceGrid);
  //  Lattice<vobj> Rslice(SliceGrid);
  assert( FullGrid->_simd_layout[Orthog]==1);
  int nh =  FullGrid->_ndimension;
  //  int nl = SliceGrid->_ndimension;
  int nl=1;
  //FIXME package in a convenient iterator
  //Should loop over a plane orthogonal to direction "Orthog"
  int stride=FullGrid->_slice_stride[Orthog];
  int block =FullGrid->_slice_block [Orthog];
  int nblock=FullGrid->_slice_nblock[Orthog];
  int ostride=FullGrid->_ostride[Orthog];
 #pragma omp parallel 
  {
    std::vector<vobj> s_x(Nblock);
 #pragma omp for collapse(2)
    for(int n=0;n<nblock;n++){
      for(int b=0;b<block;b++){
 	int o  = n*stride + b;
 	for(int i=0;i<Nblock;i++){
 	  s_x[i] = X[o+i*ostride];
 	}
 	vobj dot;
 	for(int i=0;i<Nblock;i++){
 	  dot = s_x[0]*(scale*aa(0,i));
 	  for(int j=1;j<Nblock;j++){
 	    dot = dot + s_x[j]*(scale*aa(j,i));
 	  }
 	  R[o+i*ostride]=dot;
 	}
      }}
  }
 };
 template<class vobj>
 static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog) 
 {
  typedef typename vobj::scalar_object sobj;
  typedef typename vobj::scalar_type scalar_type;
  typedef typename vobj::vector_type vector_type;
  GridBase *FullGrid  = lhs._grid;
  //  GridBase *SliceGrid = makeSubSliceGrid(FullGrid,Orthog);
  int Nblock = FullGrid->GlobalDimensions()[Orthog];
  //  Lattice<vobj> Lslice(SliceGrid);
  //  Lattice<vobj> Rslice(SliceGrid);
  mat = Eigen::MatrixXcd::Zero(Nblock,Nblock);
  assert( FullGrid->_simd_layout[Orthog]==1);
  int nh =  FullGrid->_ndimension;
  //  int nl = SliceGrid->_ndimension;
  int nl = nh-1;
  //FIXME package in a convenient iterator
  //Should loop over a plane orthogonal to direction "Orthog"
  int stride=FullGrid->_slice_stride[Orthog];
  int block =FullGrid->_slice_block [Orthog];
  int nblock=FullGrid->_slice_nblock[Orthog];
  int ostride=FullGrid->_ostride[Orthog];
  typedef typename vobj::vector_typeD vector_typeD;
 #pragma omp parallel 
  {
    std::vector<vobj> Left(Nblock);
    std::vector<vobj> Right(Nblock);
    Eigen::MatrixXcd  mat_thread = Eigen::MatrixXcd::Zero(Nblock,Nblock);
 #pragma omp for collapse(2)
    for(int n=0;n<nblock;n++){
      for(int b=0;b<block;b++){
 	int o  = n*stride + b;
 	for(int i=0;i<Nblock;i++){
 	  Left [i] = lhs[o+i*ostride];
 	  Right[i] = rhs[o+i*ostride];
 	}
 	for(int i=0;i<Nblock;i++){
 	  for(int j=0;j<Nblock;j++){
 	    auto tmp = innerProduct(Left[i],Right[j]);
 	    auto rtmp = TensorRemove(tmp);
 	    mat_thread(i,j) += Reduce(rtmp);
 	  }}
      }}
 #pragma omp critical
    {
      mat += mat_thread;
    }  
  }
  for(int i=0;i<Nblock;i++){
    for(int j=0;j<Nblock;j++){
      ComplexD sum = mat(i,j);
      FullGrid->GlobalSum(sum);
      mat(i,j)=sum;
    }}
  return;
 }
 NAMESPACE_END(Grid);
 #endif