mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 05:54:32 +00:00 
			
		
		
		
	Compare commits
	
		
			1 Commits
		
	
	
		
			aff3d50bae
			...
			feature/fe
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 
						 | 
					8b91b61b61 | 
							
								
								
									
										126
									
								
								Grid/lattice/Lattice_slice_gpu.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										126
									
								
								Grid/lattice/Lattice_slice_gpu.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,126 @@
 | 
			
		||||
NAMESPACE_BEGIN(Grid);
 | 
			
		||||
 | 
			
		||||
// If NOT CUDA or HIP -- we should provide
 | 
			
		||||
// -- atomicAdd(float *,float)
 | 
			
		||||
// -- atomicAdd(double *,double)
 | 
			
		||||
// 
 | 
			
		||||
// Augment CUDA with complex atomics
 | 
			
		||||
#if !defined(GRID_HIP) || !defined(GRID_CUDA)
 | 
			
		||||
inline void atomicAdd(float *acc,float elem)
 | 
			
		||||
{
 | 
			
		||||
  *acc += elem;
 | 
			
		||||
}
 | 
			
		||||
inline void atomicAdd(double *acc,double elem)
 | 
			
		||||
{
 | 
			
		||||
  *acc += elem;
 | 
			
		||||
}
 | 
			
		||||
#endif
 | 
			
		||||
inline void atomicAdd(ComplexD *accum,ComplexD & elem)
 | 
			
		||||
{
 | 
			
		||||
  double *a_p = (double *)accum;
 | 
			
		||||
  double *e_p = (double *)&elem;
 | 
			
		||||
  for(int w=0;w<2;w++){
 | 
			
		||||
    atomicAdd(&a_p[w],e_p[w]);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
inline void atomicAdd(ComplexF *accum,ComplexF & elem)
 | 
			
		||||
{
 | 
			
		||||
  float *a_p = (float *)accum;
 | 
			
		||||
  float *e_p = (float *)&elem;
 | 
			
		||||
  for(int w=0;w<2;w++){
 | 
			
		||||
    atomicAdd(&a_p[w],e_p[w]);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
// Augment CUDA with vobj atomics
 | 
			
		||||
template<class vobj> accelerator_inline void atomicAdd(vobj *accum, vobj & elem)
 | 
			
		||||
{
 | 
			
		||||
  typedef typename vobj::scalar_type scalar_type;
 | 
			
		||||
  scalar_type *a_p= (scalar_type *)accum;
 | 
			
		||||
  scalar_type *e_p= (scalar_type *)& elem;
 | 
			
		||||
  for(int w=0;w<vobj::Nsimd();w++){
 | 
			
		||||
    atomicAdd(&a_p[w],e_p[w]);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
// Atomics based slice sum
 | 
			
		||||
template<class vobj> inline void sliceSumGpu(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
 | 
			
		||||
{
 | 
			
		||||
  typedef typename vobj::scalar_object sobj;
 | 
			
		||||
  typedef typename vobj::scalar_object::scalar_type scalar_type;
 | 
			
		||||
  GridBase  *grid = Data.Grid();
 | 
			
		||||
  assert(grid!=NULL);
 | 
			
		||||
 | 
			
		||||
  const int    Nd = grid->_ndimension;
 | 
			
		||||
  const int Nsimd = grid->Nsimd();
 | 
			
		||||
 | 
			
		||||
  assert(orthogdim >= 0);
 | 
			
		||||
  assert(orthogdim < Nd);
 | 
			
		||||
 | 
			
		||||
  int fd=grid->_fdimensions[orthogdim];
 | 
			
		||||
  int ld=grid->_ldimensions[orthogdim];
 | 
			
		||||
  int rd=grid->_rdimensions[orthogdim];
 | 
			
		||||
 | 
			
		||||
  // Move to device memory and copy in / out
 | 
			
		||||
  Vector<vobj> lvSum(rd); // will locally sum vectors first
 | 
			
		||||
  Vector<sobj> lsSum(ld,Zero());                    // sum across these down to scalars
 | 
			
		||||
  ExtractBuffer<sobj> extracted(Nsimd);                  // splitting the SIMD
 | 
			
		||||
 | 
			
		||||
  result.resize(fd); // And then global sum to return the same vector to every node 
 | 
			
		||||
  for(int r=0;r<rd;r++){
 | 
			
		||||
    lvSum[r]=Zero();
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  int e1=    grid->_slice_nblock[orthogdim];
 | 
			
		||||
  int e2=    grid->_slice_block [orthogdim];
 | 
			
		||||
  int stride=grid->_slice_stride[orthogdim];
 | 
			
		||||
 | 
			
		||||
  // sum over reduced dimension planes, breaking out orthog dir
 | 
			
		||||
  // Parallel over orthog direction
 | 
			
		||||
  autoView( Data_v, Data, AcceleratorRead);
 | 
			
		||||
  auto lvSum_p=&lvSum[0];
 | 
			
		||||
  int ostride = grid->_ostride[orthogdim]; 
 | 
			
		||||
  accelerator_for( ree,rd*e1*e2,1, {
 | 
			
		||||
    int b = ree%e2;
 | 
			
		||||
    int re= ree/e2;
 | 
			
		||||
    int n=re%e1;
 | 
			
		||||
    int r=re/e1;
 | 
			
		||||
    int so=r*ostride;
 | 
			
		||||
    int ss=so+n*stride+b;
 | 
			
		||||
    atomicAdd(&lvSum_p[r],Data_v[ss]);
 | 
			
		||||
  });
 | 
			
		||||
 | 
			
		||||
  // Sum across simd lanes in the plane, breaking out orthog dir.
 | 
			
		||||
  Coordinate icoor(Nd);
 | 
			
		||||
 | 
			
		||||
  for(int rt=0;rt<rd;rt++){
 | 
			
		||||
 | 
			
		||||
    extract(lvSum[rt],extracted);
 | 
			
		||||
 | 
			
		||||
    for(int idx=0;idx<Nsimd;idx++){
 | 
			
		||||
 | 
			
		||||
      grid->iCoorFromIindex(icoor,idx);
 | 
			
		||||
 | 
			
		||||
      int ldx =rt+icoor[orthogdim]*rd;
 | 
			
		||||
 | 
			
		||||
      lsSum[ldx]=lsSum[ldx]+extracted[idx];
 | 
			
		||||
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  // sum over nodes.
 | 
			
		||||
  for(int t=0;t<fd;t++){
 | 
			
		||||
    int pt = t/ld; // processor plane
 | 
			
		||||
    int lt = t%ld;
 | 
			
		||||
    if ( pt == grid->_processor_coor[orthogdim] ) {
 | 
			
		||||
      result[t]=lsSum[lt];
 | 
			
		||||
    } else {
 | 
			
		||||
      result[t]=Zero();
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
  }
 | 
			
		||||
  scalar_type * ptr = (scalar_type *) &result[0];
 | 
			
		||||
  int words = fd*sizeof(sobj)/sizeof(scalar_type);
 | 
			
		||||
  grid->GlobalSumVector(ptr, words);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
NAMESPACE_END(Grid);
 | 
			
		||||
							
								
								
									
										73
									
								
								tests/core/Test_slicesum.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										73
									
								
								tests/core/Test_slicesum.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,73 @@
 | 
			
		||||
    /*************************************************************************************
 | 
			
		||||
 | 
			
		||||
    Grid physics library, www.github.com/paboyle/Grid 
 | 
			
		||||
 | 
			
		||||
    Source file: ./tests/Test_poisson_fft.cc
 | 
			
		||||
 | 
			
		||||
    Copyright (C) 2015
 | 
			
		||||
 | 
			
		||||
Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 | 
			
		||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
			
		||||
 | 
			
		||||
    This program is free software; you can redistribute it and/or modify
 | 
			
		||||
    it under the terms of the GNU General Public License as published by
 | 
			
		||||
    the Free Software Foundation; either version 2 of the License, or
 | 
			
		||||
    (at your option) any later version.
 | 
			
		||||
 | 
			
		||||
    This program is distributed in the hope that it will be useful,
 | 
			
		||||
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
			
		||||
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
			
		||||
    GNU General Public License for more details.
 | 
			
		||||
 | 
			
		||||
    You should have received a copy of the GNU General Public License along
 | 
			
		||||
    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
			
		||||
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
			
		||||
 | 
			
		||||
    See the full license in the file "LICENSE" in the top level distribution directory
 | 
			
		||||
    *************************************************************************************/
 | 
			
		||||
    /*  END LEGAL */
 | 
			
		||||
#include <Grid/Grid.h>
 | 
			
		||||
#include <Grid/lattice/Lattice_slice_gpu.h>
 | 
			
		||||
 | 
			
		||||
using namespace Grid;
 | 
			
		||||
 | 
			
		||||
int main (int argc, char ** argv)
 | 
			
		||||
{
 | 
			
		||||
  Grid_init(&argc,&argv);
 | 
			
		||||
 | 
			
		||||
  int N=16;
 | 
			
		||||
  std::vector<int> latt_size  ({N,N,N,N});
 | 
			
		||||
  std::vector<int> simd_layout({vComplexD::Nsimd(),1,1,1});
 | 
			
		||||
  std::vector<int> mpi_layout ({1,1,1,1});
 | 
			
		||||
 | 
			
		||||
  GridCartesian         GRID(latt_size,simd_layout,mpi_layout);
 | 
			
		||||
 | 
			
		||||
  LatticeComplexD  rn(&GRID);
 | 
			
		||||
 | 
			
		||||
  GridParallelRNG RNG(&GRID);
 | 
			
		||||
  RNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));  
 | 
			
		||||
  gaussian(RNG,rn);
 | 
			
		||||
 | 
			
		||||
  std::vector<TComplex> reduced_ref;
 | 
			
		||||
  std::vector<TComplex> reduced_gpu;
 | 
			
		||||
  for(int d=0;d<4;d++){
 | 
			
		||||
    {
 | 
			
		||||
      RealD t=-usecond();
 | 
			
		||||
      sliceSum(rn,reduced_ref,d);
 | 
			
		||||
      t+=usecond();
 | 
			
		||||
      std::cout << " sliceSum took "<<t<<" usecs"<<std::endl;
 | 
			
		||||
    }
 | 
			
		||||
    {
 | 
			
		||||
      RealD t=-usecond();
 | 
			
		||||
      sliceSumGpu(rn,reduced_gpu,d);
 | 
			
		||||
      t+=usecond();
 | 
			
		||||
      std::cout << " sliceSumGpu took "<<t<<" usecs"<<std::endl;
 | 
			
		||||
    }
 | 
			
		||||
    for(int t=0;t<reduced_ref.size();t++){
 | 
			
		||||
      std::cout << t<<" ref "<< reduced_ref[t] <<" opt " << reduced_gpu[t] << " diff "<<reduced_ref[t]-reduced_gpu[t]<<std::endl;
 | 
			
		||||
      TComplex diff = reduced_ref[t]-reduced_gpu[t];
 | 
			
		||||
      assert(abs(TensorRemove(diff)) < 1e-8 );
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
  Grid_finalize();
 | 
			
		||||
}
 | 
			
		||||
		Reference in New Issue
	
	Block a user