mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-04 05:54:32 +00:00 
			
		
		
		
	Compare commits
	
		
			1 Commits
		
	
	
		
			b58fd80379
			...
			feature/fe
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 
						 | 
					8b91b61b61 | 
							
								
								
									
										126
									
								
								Grid/lattice/Lattice_slice_gpu.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										126
									
								
								Grid/lattice/Lattice_slice_gpu.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,126 @@
 | 
				
			|||||||
 | 
					NAMESPACE_BEGIN(Grid);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					// If NOT CUDA or HIP -- we should provide
 | 
				
			||||||
 | 
					// -- atomicAdd(float *,float)
 | 
				
			||||||
 | 
					// -- atomicAdd(double *,double)
 | 
				
			||||||
 | 
					// 
 | 
				
			||||||
 | 
					// Augment CUDA with complex atomics
 | 
				
			||||||
 | 
					#if !defined(GRID_HIP) && !defined(GRID_CUDA)
					// Fallback atomicAdd overloads, compiled only when NEITHER GRID_CUDA nor
					// GRID_HIP is defined (the previous "||" guard was true unless both were
					// defined, so the fallbacks were also emitted on CUDA-only / HIP-only builds).
					//
					// These are plain read-modify-write updates, not hardware atomics.
					// NOTE(review): if the caller runs them concurrently from several host
					// threads on the same address the update can race -- confirm how
					// accelerator_for executes on this target.

					// Add elem into *acc (single precision).
					inline void atomicAdd(float *acc,float elem)
					{
					  *acc += elem;
					}

					// Add elem into *acc (double precision).
					inline void atomicAdd(double *acc,double elem)
					{
					  *acc += elem;
					}
					#endif
 | 
				
			||||||
 | 
					// Accumulate a double precision complex value into *accum by issuing one
					// scalar atomicAdd per double word of the complex number (two words).
					// The two component updates are separate operations, so the complex
					// value as a whole is not updated in a single atomic step.
					inline void atomicAdd(ComplexD *accum,ComplexD & elem)
					{
					  double *dst = reinterpret_cast<double *>(accum);
					  double *src = reinterpret_cast<double *>(&elem);
					  atomicAdd(dst  ,src[0]); // first word  (presumably the real part)
					  atomicAdd(dst+1,src[1]); // second word (presumably the imaginary part)
					}
 | 
				
			||||||
 | 
					// Accumulate a single precision complex value into *accum by issuing one
					// scalar atomicAdd per float word of the complex number (two words).
					// The two component updates are separate operations, so the complex
					// value as a whole is not updated in a single atomic step.
					inline void atomicAdd(ComplexF *accum,ComplexF & elem)
					{
					  float *dst = reinterpret_cast<float *>(accum);
					  float *src = reinterpret_cast<float *>(&elem);
					  atomicAdd(dst  ,src[0]); // first word  (presumably the real part)
					  atomicAdd(dst+1,src[1]); // second word (presumably the imaginary part)
					}
 | 
				
			||||||
 | 
					// Augment CUDA with vobj atomics
 | 
				
			||||||
 | 
					// Accumulate a whole vectorised object into *accum, one scalar word at a time.
					//
					// Both objects are viewed as flat arrays of vobj::scalar_type and every word
					// is added with the matching scalar atomicAdd overload.  The word count is
					// taken from sizeof(vobj) so that objects holding more than one SIMD vector
					// are accumulated completely; looping to vobj::Nsimd() only covered the
					// first Nsimd() words.  For an object that is a single SIMD vector the two
					// bounds coincide.
					//
					// Each word is updated independently, so the object as a whole is not
					// updated in a single atomic step.
					template<class vobj> accelerator_inline void atomicAdd(vobj *accum, vobj & elem)
					{
					  typedef typename vobj::scalar_type scalar_type;
					  scalar_type *a_p= (scalar_type *)accum;
					  scalar_type *e_p= (scalar_type *)& elem;
					  const int words = sizeof(vobj)/sizeof(scalar_type); // scalar words in one vobj
					  for(int w=0;w<words;w++){
					    atomicAdd(&a_p[w],e_p[w]);
					  }
					}
 | 
				
			||||||
 | 
					// Atomics based slice sum
 | 
				
			||||||
 | 
					template<class vobj> inline void sliceSumGpu(const Lattice<vobj> &Data,std::vector<typename vobj::scalar_object> &result,int orthogdim)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  typedef typename vobj::scalar_object sobj;
 | 
				
			||||||
 | 
					  typedef typename vobj::scalar_object::scalar_type scalar_type;
 | 
				
			||||||
 | 
					  GridBase  *grid = Data.Grid();
 | 
				
			||||||
 | 
					  assert(grid!=NULL);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  const int    Nd = grid->_ndimension;
 | 
				
			||||||
 | 
					  const int Nsimd = grid->Nsimd();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  assert(orthogdim >= 0);
 | 
				
			||||||
 | 
					  assert(orthogdim < Nd);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  int fd=grid->_fdimensions[orthogdim];
 | 
				
			||||||
 | 
					  int ld=grid->_ldimensions[orthogdim];
 | 
				
			||||||
 | 
					  int rd=grid->_rdimensions[orthogdim];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Move to device memory and copy in / out
 | 
				
			||||||
 | 
					  Vector<vobj> lvSum(rd); // will locally sum vectors first
 | 
				
			||||||
 | 
					  Vector<sobj> lsSum(ld,Zero());                    // sum across these down to scalars
 | 
				
			||||||
 | 
					  ExtractBuffer<sobj> extracted(Nsimd);                  // splitting the SIMD
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  result.resize(fd); // And then global sum to return the same vector to every node 
 | 
				
			||||||
 | 
					  for(int r=0;r<rd;r++){
 | 
				
			||||||
 | 
					    lvSum[r]=Zero();
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  int e1=    grid->_slice_nblock[orthogdim];
 | 
				
			||||||
 | 
					  int e2=    grid->_slice_block [orthogdim];
 | 
				
			||||||
 | 
					  int stride=grid->_slice_stride[orthogdim];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // sum over reduced dimension planes, breaking out orthog dir
 | 
				
			||||||
 | 
					  // Parallel over orthog direction
 | 
				
			||||||
 | 
					  autoView( Data_v, Data, AcceleratorRead);
 | 
				
			||||||
 | 
					  auto lvSum_p=&lvSum[0];
 | 
				
			||||||
 | 
					  int ostride = grid->_ostride[orthogdim]; 
 | 
				
			||||||
 | 
					  accelerator_for( ree,rd*e1*e2,1, {
 | 
				
			||||||
 | 
					    int b = ree%e2;
 | 
				
			||||||
 | 
					    int re= ree/e2;
 | 
				
			||||||
 | 
					    int n=re%e1;
 | 
				
			||||||
 | 
					    int r=re/e1;
 | 
				
			||||||
 | 
					    int so=r*ostride;
 | 
				
			||||||
 | 
					    int ss=so+n*stride+b;
 | 
				
			||||||
 | 
					    atomicAdd(&lvSum_p[r],Data_v[ss]);
 | 
				
			||||||
 | 
					  });
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Sum across simd lanes in the plane, breaking out orthog dir.
 | 
				
			||||||
 | 
					  Coordinate icoor(Nd);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  for(int rt=0;rt<rd;rt++){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    extract(lvSum[rt],extracted);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    for(int idx=0;idx<Nsimd;idx++){
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      grid->iCoorFromIindex(icoor,idx);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      int ldx =rt+icoor[orthogdim]*rd;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      lsSum[ldx]=lsSum[ldx]+extracted[idx];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  
 | 
				
			||||||
 | 
					  // sum over nodes.
 | 
				
			||||||
 | 
					  for(int t=0;t<fd;t++){
 | 
				
			||||||
 | 
					    int pt = t/ld; // processor plane
 | 
				
			||||||
 | 
					    int lt = t%ld;
 | 
				
			||||||
 | 
					    if ( pt == grid->_processor_coor[orthogdim] ) {
 | 
				
			||||||
 | 
					      result[t]=lsSum[lt];
 | 
				
			||||||
 | 
					    } else {
 | 
				
			||||||
 | 
					      result[t]=Zero();
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  scalar_type * ptr = (scalar_type *) &result[0];
 | 
				
			||||||
 | 
					  int words = fd*sizeof(sobj)/sizeof(scalar_type);
 | 
				
			||||||
 | 
					  grid->GlobalSumVector(ptr, words);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					NAMESPACE_END(Grid);
 | 
				
			||||||
							
								
								
									
										73
									
								
								tests/core/Test_slicesum.cc
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										73
									
								
								tests/core/Test_slicesum.cc
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,73 @@
 | 
				
			|||||||
 | 
					    /*************************************************************************************
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Grid physics library, www.github.com/paboyle/Grid 
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Source file: ./tests/core/Test_slicesum.cc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Copyright (C) 2015
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Author: Azusa Yamaguchi <ayamaguc@staffmail.ed.ac.uk>
 | 
				
			||||||
 | 
					Author: Peter Boyle <paboyle@ph.ed.ac.uk>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is free software; you can redistribute it and/or modify
 | 
				
			||||||
 | 
					    it under the terms of the GNU General Public License as published by
 | 
				
			||||||
 | 
					    the Free Software Foundation; either version 2 of the License, or
 | 
				
			||||||
 | 
					    (at your option) any later version.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This program is distributed in the hope that it will be useful,
 | 
				
			||||||
 | 
					    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
				
			||||||
 | 
					    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
				
			||||||
 | 
					    GNU General Public License for more details.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    You should have received a copy of the GNU General Public License along
 | 
				
			||||||
 | 
					    with this program; if not, write to the Free Software Foundation, Inc.,
 | 
				
			||||||
 | 
					    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    See the full license in the file "LICENSE" in the top level distribution directory
 | 
				
			||||||
 | 
					    *************************************************************************************/
 | 
				
			||||||
 | 
					    /*  END LEGAL */
 | 
				
			||||||
 | 
					#include <Grid/Grid.h>
 | 
				
			||||||
 | 
					#include <Grid/lattice/Lattice_slice_gpu.h>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					using namespace Grid;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					int main (int argc, char ** argv)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
					  Grid_init(&argc,&argv);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  int N=16;
 | 
				
			||||||
 | 
					  std::vector<int> latt_size  ({N,N,N,N});
 | 
				
			||||||
 | 
					  std::vector<int> simd_layout({vComplexD::Nsimd(),1,1,1});
 | 
				
			||||||
 | 
					  std::vector<int> mpi_layout ({1,1,1,1});
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  GridCartesian         GRID(latt_size,simd_layout,mpi_layout);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  LatticeComplexD  rn(&GRID);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  GridParallelRNG RNG(&GRID);
 | 
				
			||||||
 | 
					  RNG.SeedFixedIntegers(std::vector<int>({45,12,81,9}));  
 | 
				
			||||||
 | 
					  gaussian(RNG,rn);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  std::vector<TComplex> reduced_ref;
 | 
				
			||||||
 | 
					  std::vector<TComplex> reduced_gpu;
 | 
				
			||||||
 | 
					  for(int d=0;d<4;d++){
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					      RealD t=-usecond();
 | 
				
			||||||
 | 
					      sliceSum(rn,reduced_ref,d);
 | 
				
			||||||
 | 
					      t+=usecond();
 | 
				
			||||||
 | 
					      std::cout << " sliceSum took "<<t<<" usecs"<<std::endl;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    {
 | 
				
			||||||
 | 
					      RealD t=-usecond();
 | 
				
			||||||
 | 
					      sliceSumGpu(rn,reduced_gpu,d);
 | 
				
			||||||
 | 
					      t+=usecond();
 | 
				
			||||||
 | 
					      std::cout << " sliceSumGpu took "<<t<<" usecs"<<std::endl;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    for(int t=0;t<reduced_ref.size();t++){
 | 
				
			||||||
 | 
					      std::cout << t<<" ref "<< reduced_ref[t] <<" opt " << reduced_gpu[t] << " diff "<<reduced_ref[t]-reduced_gpu[t]<<std::endl;
 | 
				
			||||||
 | 
					      TComplex diff = reduced_ref[t]-reduced_gpu[t];
 | 
				
			||||||
 | 
					      assert(abs(TensorRemove(diff)) < 1e-8 );
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					  Grid_finalize();
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
		Reference in New Issue
	
	Block a user