diff --git a/Grid/lattice/Lattice_slice_gpu.h b/Grid/lattice/Lattice_slice_gpu.h new file mode 100644 index 00000000..d081dc70 --- /dev/null +++ b/Grid/lattice/Lattice_slice_gpu.h @@ -0,0 +1,126 @@ +NAMESPACE_BEGIN(Grid); + +// If NOT CUDA or HIP -- we should provide +// -- atomicAdd(float *,float) +// -- atomicAdd(double *,double) +// +// Augment CUDA with complex atomics +#if !defined(GRID_HIP) || !defined(GRID_CUDA) +inline void atomicAdd(float *acc,float elem) +{ + *acc += elem; +} +inline void atomicAdd(double *acc,double elem) +{ + *acc += elem; +} +#endif +inline void atomicAdd(ComplexD *accum,ComplexD & elem) +{ + double *a_p = (double *)accum; + double *e_p = (double *)&elem; + for(int w=0;w<2;w++){ + atomicAdd(&a_p[w],e_p[w]); + } +} +inline void atomicAdd(ComplexF *accum,ComplexF & elem) +{ + float *a_p = (float *)accum; + float *e_p = (float *)&elem; + for(int w=0;w<2;w++){ + atomicAdd(&a_p[w],e_p[w]); + } +} +// Augment CUDA with vobj atomics +template accelerator_inline void atomicAdd(vobj *accum, vobj & elem) +{ + typedef typename vobj::scalar_type scalar_type; + scalar_type *a_p= (scalar_type *)accum; + scalar_type *e_p= (scalar_type *)& elem; + for(int w=0;w inline void sliceSumGpu(const Lattice &Data,std::vector &result,int orthogdim) +{ + typedef typename vobj::scalar_object sobj; + typedef typename vobj::scalar_object::scalar_type scalar_type; + GridBase *grid = Data.Grid(); + assert(grid!=NULL); + + const int Nd = grid->_ndimension; + const int Nsimd = grid->Nsimd(); + + assert(orthogdim >= 0); + assert(orthogdim < Nd); + + int fd=grid->_fdimensions[orthogdim]; + int ld=grid->_ldimensions[orthogdim]; + int rd=grid->_rdimensions[orthogdim]; + + // Move to device memory and copy in / out + Vector lvSum(rd); // will locally sum vectors first + Vector lsSum(ld,Zero()); // sum across these down to scalars + ExtractBuffer extracted(Nsimd); // splitting the SIMD + + result.resize(fd); // And then global sum to return the same vector to every node + for(int r=0;r_slice_nblock[orthogdim]; + int e2= grid->_slice_block [orthogdim]; + int stride=grid->_slice_stride[orthogdim]; + + // sum over reduced dimension planes, breaking out orthog dir + // Parallel over orthog direction + autoView( Data_v, Data, AcceleratorRead); + auto lvSum_p=&lvSum[0]; + int ostride = grid->_ostride[orthogdim]; + accelerator_for( ree,rd*e1*e2,1, { + int b = ree%e2; + int re= ree/e2; + int n=re%e1; + int r=re/e1; + int so=r*ostride; + int ss=so+n*stride+b; + atomicAdd(&lvSum_p[r],Data_v[ss]); + }); + + // Sum across simd lanes in the plane, breaking out orthog dir. + Coordinate icoor(Nd); + + for(int rt=0;rtiCoorFromIindex(icoor,idx); + + int ldx =rt+icoor[orthogdim]*rd; + + lsSum[ldx]=lsSum[ldx]+extracted[idx]; + + } + } + + // sum over nodes. + for(int t=0;t_processor_coor[orthogdim] ) { + result[t]=lsSum[lt]; + } else { + result[t]=Zero(); + } + + } + scalar_type * ptr = (scalar_type *) &result[0]; + int words = fd*sizeof(sobj)/sizeof(scalar_type); + grid->GlobalSumVector(ptr, words); +} + + +NAMESPACE_END(Grid); diff --git a/tests/core/Test_slicesum.cc b/tests/core/Test_slicesum.cc new file mode 100644 index 00000000..a345ffdc --- /dev/null +++ b/tests/core/Test_slicesum.cc @@ -0,0 +1,73 @@ + /************************************************************************************* + + Grid physics library, www.github.com/paboyle/Grid + + Source file: ./tests/Test_poisson_fft.cc + + Copyright (C) 2015 + +Author: Azusa Yamaguchi +Author: Peter Boyle + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + See the full license in the file "LICENSE" in the top level distribution directory + *************************************************************************************/ + /* END LEGAL */ +#include +#include + +using namespace Grid; + +int main (int argc, char ** argv) +{ + Grid_init(&argc,&argv); + + int N=16; + std::vector latt_size ({N,N,N,N}); + std::vector simd_layout({vComplexD::Nsimd(),1,1,1}); + std::vector mpi_layout ({1,1,1,1}); + + GridCartesian GRID(latt_size,simd_layout,mpi_layout); + + LatticeComplexD rn(&GRID); + + GridParallelRNG RNG(&GRID); + RNG.SeedFixedIntegers(std::vector({45,12,81,9})); + gaussian(RNG,rn); + + std::vector reduced_ref; + std::vector reduced_gpu; + for(int d=0;d<4;d++){ + { + RealD t=-usecond(); + sliceSum(rn,reduced_ref,d); + t+=usecond(); + std::cout << " sliceSum took "<