From 551a5f8dc8bb356b475f6f075902964429df93ba Mon Sep 17 00:00:00 2001
From: Peter Boyle
Date: Tue, 11 Oct 2022 14:44:55 -0400
Subject: [PATCH] RRII gpu option

---
 Grid/communicator/Communicator_base.h   |   2 +-
 Grid/lattice/Lattice_ET.h               |   2 +-
 Grid/lattice/Lattice_matrix_reduction.h |   3 -
 Grid/lattice/Lattice_peekpoke.h         |  14 +-
 Grid/lattice/Lattice_reduction.h        |  12 +-
 Grid/lattice/Lattice_reduction_gpu.h    |   2 -
 Grid/lattice/Lattice_transfer.h         |   6 +-
 .../CayleyFermion5DImplementation.h     |  82 --
 Grid/qcd/utils/SUn.h                    |   1 -
 Grid/simd/Grid_a64fx-2.h                |   4 +-
 Grid/simd/Grid_a64fx-fixedsize.h        |   8 +-
 Grid/simd/Grid_avx.h                    |   8 +-
 Grid/simd/Grid_avx512.h                 |   9 +-
 Grid/simd/Grid_gpu_rrii.h               | 878 ++++++++++++++++++
 Grid/simd/Grid_gpu_vec.h                |  10 +-
 Grid/simd/Grid_qpx.h                    |   4 +-
 Grid/simd/Grid_sse4.h                   |  10 +-
 Grid/simd/Grid_vector_types.h           | 170 ++--
 Grid/simd/Simd.h                        |   2 +
 Grid/stencil/Stencil.h                  |   2 -
 Grid/tensors/Tensor_class.h             |   3 +
 Grid/tensors/Tensor_extract_merge.h     |  49 +-
 configure.ac                            |  84 +-
 systems/mac-arm/config-command-mpi      |   4 +-
 24 files changed, 1099 insertions(+), 270 deletions(-)
 create mode 100644 Grid/simd/Grid_gpu_rrii.h

diff --git a/Grid/communicator/Communicator_base.h b/Grid/communicator/Communicator_base.h
index cb3b9f0e..68cd36cc 100644
--- a/Grid/communicator/Communicator_base.h
+++ b/Grid/communicator/Communicator_base.h
@@ -131,7 +131,7 @@ public:
   template<class obj> void GlobalSum(obj &o){
     typedef typename obj::scalar_type scalar_type;
     int words = sizeof(obj)/sizeof(scalar_type);
-    scalar_type * ptr = (scalar_type *)& o;
+    scalar_type * ptr = (scalar_type *)& o; // Safe alias
     GlobalSumVector(ptr,words);
   }

diff --git a/Grid/lattice/Lattice_ET.h b/Grid/lattice/Lattice_ET.h
index 4a8a7423..fdd31b28 100644
--- a/Grid/lattice/Lattice_ET.h
+++ b/Grid/lattice/Lattice_ET.h
@@ -63,7 +63,7 @@ accelerator_inline vobj predicatedWhere(const iobj &predicate,
   typename std::remove_const<vobj>::type ret;

   typedef typename vobj::scalar_object scalar_object;
-  typedef typename vobj::scalar_type scalar_type;
+  // typedef typename vobj::scalar_type scalar_type;
   typedef typename vobj::vector_type vector_type;

   const int Nsimd = vobj::vector_type::Nsimd();

diff --git a/Grid/lattice/Lattice_matrix_reduction.h b/Grid/lattice/Lattice_matrix_reduction.h
index 7c470fef..abebbfd6 100644
--- a/Grid/lattice/Lattice_matrix_reduction.h
+++ b/Grid/lattice/Lattice_matrix_reduction.h
@@ -32,7 +32,6 @@ template<class vobj>
 static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
 {
   typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::scalar_type scalar_type;
   typedef typename vobj::vector_type vector_type;

   int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@@ -82,7 +81,6 @@ template<class vobj>
 static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
 {
   typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::scalar_type scalar_type;
   typedef typename vobj::vector_type vector_type;

   int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@@ -130,7 +128,6 @@ template<class vobj>
 static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
 {
   typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::scalar_type scalar_type;
   typedef typename vobj::vector_type vector_type;

   GridBase *FullGrid  = lhs.Grid();
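A note on the "Safe alias" comment above: GlobalSum reduces every scalar word of an object, so the result does not depend on whether complex data is stored interleaved (RIRI) or split (RRII); that is why this particular cast stays valid under the new layout while lane-indexed casts elsewhere do not. A minimal standalone sketch with toy types (not Grid's API):

    #include <complex>
    #include <cassert>

    struct obj { std::complex<double> c[2]; };   // toy object, 4 scalar words

    // Stand-in for an MPI allreduce over 2 ranks: sums word by word,
    // so the memory order of real/imaginary parts is irrelevant.
    void fakeGlobalSumVector(double* ptr, int words) {
      for (int w = 0; w < words; w++) ptr[w] *= 2;
    }

    int main() {
      obj o{{{1,2},{3,4}}};
      fakeGlobalSumVector((double*)&o, sizeof(o)/sizeof(double));
      assert(o.c[1] == std::complex<double>(6,8));
    }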
diff --git a/Grid/lattice/Lattice_peekpoke.h b/Grid/lattice/Lattice_peekpoke.h
index f3b485a4..b6a36b11 100644
--- a/Grid/lattice/Lattice_peekpoke.h
+++ b/Grid/lattice/Lattice_peekpoke.h
@@ -96,9 +96,6 @@ void pokeSite(const sobj &s,Lattice<vobj> &l,const Coordinate &site){

   GridBase *grid=l.Grid();

-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_type vector_type;
-
   int Nsimd = grid->Nsimd();

   assert( l.Checkerboard()== l.Grid()->CheckerBoard(site));
@@ -136,9 +133,6 @@ void peekSite(sobj &s,const Lattice<vobj> &l,const Coordinate &site){

   GridBase *grid=l.Grid();

-  typedef typename vobj::scalar_type scalar_type;
-  typedef typename vobj::vector_type vector_type;
-
   int Nsimd = grid->Nsimd();

   assert( l.Checkerboard() == l.Grid()->CheckerBoard(site));
@@ -179,11 +173,11 @@ inline void peekLocalSite(sobj &s,const LatticeView<vobj> &l,Coordinate &site)
   idx= grid->iIndex(site);
   odx= grid->oIndex(site);

-  scalar_type * vp = (scalar_type *)&l[odx];
+  const vector_type *vp = (const vector_type *) &l[odx];
   scalar_type * pt = (scalar_type *)&s;
   for(int w=0;w<words;w++){
@@ ... @@ inline void pokeLocalSite(const sobj &s,LatticeView<vobj> &l,Coordinate &site)
   idx= grid->iIndex(site);
   odx= grid->oIndex(site);

-  scalar_type * vp = (scalar_type *)&l[odx];
+  vector_type * vp = (vector_type *)&l[odx];
   scalar_type * pt = (scalar_type *)&s;
   for(int w=0;w<words;w++){

diff --git a/Grid/lattice/Lattice_reduction.h b/Grid/lattice/Lattice_reduction.h
--- a/Grid/lattice/Lattice_reduction.h
+++ b/Grid/lattice/Lattice_reduction.h
@@ ... @@ inline RealD maxLocalNorm2(const Lattice<vobj> &arg)
 template<class vobj>
 inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &right)
 {
-  typedef typename vobj::scalar_type scalar_type;
   typedef typename vobj::vector_typeD vector_type;
   ComplexD  nrm;
@@ -296,7 +295,6 @@ axpby_norm_fast(Lattice<vobj> &z,sobj a,sobj b,const Lattice<vobj> &x,const Latt
   conformable(z,x);
   conformable(x,y);

-  typedef typename vobj::scalar_type scalar_type;
   // typedef typename vobj::vector_typeD vector_type;
   RealD  nrm;
@@ -341,7 +339,6 @@ innerProductNorm(ComplexD& ip, RealD &nrm, const Lattice<vobj> &left,const Latti
 {
   conformable(left,right);

-  typedef typename vobj::scalar_type scalar_type;
   typedef typename vobj::vector_typeD vector_type;

   Vector<ComplexD> tmp(2);
@@ -597,7 +594,8 @@ static void sliceNorm (std::vector<RealD> &sn,const Lattice<vobj> &rhs,int Ortho
 template<class vobj>
 static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice<vobj> &X,const Lattice<vobj> &Y,
                             int orthogdim,RealD scale=1.0)
-{
+{
+  // perhaps easier to just promote A to a field and use regular madd
   typedef typename vobj::scalar_object sobj;
   typedef typename vobj::scalar_type scalar_type;
   typedef typename vobj::vector_type vector_type;
@@ -628,8 +626,7 @@ static void sliceMaddVector(Lattice<vobj> &R,std::vector<RealD> &a,const Lattice
       for(int l=0;l<Nsimd;l++){
         grid->iCoorFromIindex(icoor,l);
         int ldx =r+icoor[orthogdim]*rd;
-        scalar_type *as =(scalar_type *)&av;
-        as[l] = scalar_type(a[ldx])*zscale;
+        av.putlane(scalar_type(a[ldx])*zscale,l);
       }
       tensor_reduced at;
       at=av;
@@ -669,7 +666,6 @@ template<class vobj>
 static void sliceMaddMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,const Lattice<vobj> &Y,int Orthog,RealD scale=1.0)
 {
   typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::scalar_type scalar_type;
   typedef typename vobj::vector_type vector_type;

   int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@@ -723,7 +719,6 @@ template<class vobj>
 static void sliceMulMatrix (Lattice<vobj> &R,Eigen::MatrixXcd &aa,const Lattice<vobj> &X,int Orthog,RealD scale=1.0)
 {
   typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::scalar_type scalar_type;
   typedef typename vobj::vector_type vector_type;

   int Nblock = X.Grid()->GlobalDimensions()[Orthog];
@@ -777,7 +772,6 @@ template<class vobj>
 static void sliceInnerProductMatrix(  Eigen::MatrixXcd &mat, const Lattice<vobj> &lhs,const Lattice<vobj> &rhs,int Orthog)
 {
   typedef typename vobj::scalar_object sobj;
-  typedef typename vobj::scalar_type scalar_type;
   typedef typename vobj::vector_type vector_type;

   GridBase *FullGrid  = lhs.Grid();
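The sliceMaddVector hunk above is the recurring pattern of this patch: a `(scalar_type*)&av` pointer pun, which silently assumes interleaved storage, is replaced by the layout-agnostic putlane/getlane API. A minimal sketch of that API with toy types (names are illustrative, not Grid's):

    #include <array>
    #include <complex>
    #include <cstdio>

    // Toy 4-lane complex SIMD word with a lane API instead of pointer punning.
    struct vComplexToy {
      std::array<float,4> re, im;                         // RRII storage
      std::complex<float> getlane(int l) const { return {re[l], im[l]}; }
      void putlane(const std::complex<float>& s, int l) { re[l] = s.real(); im[l] = s.imag(); }
    };

    int main() {
      vComplexToy av{};
      for (int l = 0; l < 4; l++) av.putlane({float(l), 0.f}, l);  // was: as[l] = ...
      std::printf("lane 2 = %g\n", av.getlane(2).real());
    }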
diff --git a/Grid/lattice/Lattice_reduction_gpu.h b/Grid/lattice/Lattice_reduction_gpu.h
index bad86d2a..5f5c6cc0 100644
--- a/Grid/lattice/Lattice_reduction_gpu.h
+++ b/Grid/lattice/Lattice_reduction_gpu.h
@@ -250,8 +250,6 @@ inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osi
 template <class vobj>
 inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
 {
-  typedef typename vobj::vector_type vector;
-  typedef typename vobj::scalar_typeD scalarD;
   typedef typename vobj::scalar_objectD sobj;
   sobj ret;

diff --git a/Grid/lattice/Lattice_transfer.h b/Grid/lattice/Lattice_transfer.h
index aee55e93..d9b7a704 100644
--- a/Grid/lattice/Lattice_transfer.h
+++ b/Grid/lattice/Lattice_transfer.h
@@ -677,10 +677,10 @@ void localCopyRegion(const Lattice<vobj> &From,Lattice<vobj> & To,Coordinate Fro
   Integer idx_t = 0;
   for(int d=0;d
@@ ... @@ void CayleyFermion5D<Impl>::SeqConservedCurrent(PropagatorField &q_in,

 #undef TopRowWithSource

-
-#if 0
-template<class Impl>
-void CayleyFermion5D<Impl>::MooeeInternalCompute(int dag, int inv,
-                                        Vector<iSinglet<Simd> > & Matp,
-                                        Vector<iSinglet<Simd> > & Matm)
-{
-  int Ls=this->Ls;
-
-  GridBase *grid = this->FermionRedBlackGrid();
-  int LLs = grid->_rdimensions[0];
-
-  if ( LLs == Ls ) {
-    return; // Not vectorised in 5th direction
-  }
-
-  Eigen::MatrixXcd Pplus  = Eigen::MatrixXcd::Zero(Ls,Ls);
-  Eigen::MatrixXcd Pminus = Eigen::MatrixXcd::Zero(Ls,Ls);
-
-  for(int s=0;s
-      if ( switcheroo<Coeff_t>::iscomplex() ) {
-        sp[l] = PplusMat (l*istride+s1*ostride,s2);
-        sm[l] = PminusMat(l*istride+s1*ostride,s2);
-      } else {
-        // if real
-        scalar_type tmp;
-        tmp = PplusMat (l*istride+s1*ostride,s2);
-        sp[l] = scalar_type(tmp.real(),tmp.real());
-        tmp = PminusMat(l*istride+s1*ostride,s2);
-        sm[l] = scalar_type(tmp.real(),tmp.real());
-      }
-    }
-    Matp[LLs*s2+s1] = Vp;
-    Matm[LLs*s2+s1] = Vm;
-  }}
-}
-#endif

 NAMESPACE_END(Grid);

diff --git a/Grid/qcd/utils/SUn.h b/Grid/qcd/utils/SUn.h
index b9660c65..23eceea3 100644
--- a/Grid/qcd/utils/SUn.h
+++ b/Grid/qcd/utils/SUn.h
@@ -615,7 +615,6 @@ public:
     GridBase *grid = out.Grid();

     typedef typename LatticeMatrixType::vector_type vector_type;
-    typedef typename LatticeMatrixType::scalar_type scalar_type;

     typedef iSinglet<vector_type> vTComplexType;

diff --git a/Grid/simd/Grid_a64fx-2.h b/Grid/simd/Grid_a64fx-2.h
index 2ad8591c..a4ef70ae 100644
--- a/Grid/simd/Grid_a64fx-2.h
+++ b/Grid/simd/Grid_a64fx-2.h
@@ -501,7 +501,7 @@ struct Conj{
 struct TimesMinusI{
   // Complex
   template <typename T>
-  inline vec<T> operator()(vec<T> a, vec<T> b){
+  inline vec<T> operator()(vec<T> a){
     vec<T> out;
     const vec<typename acle<T>::uint> tbl_swap = acle<T>::tbl_swap();
     svbool_t pg1 = acle<T>::pg1();
@@ -520,7 +520,7 @@ struct TimesI{
   // Complex
   template <typename T>
-  inline vec<T> operator()(vec<T> a, vec<T> b){
+  inline vec<T> operator()(vec<T> a){
     vec<T> out;
     const vec<typename acle<T>::uint> tbl_swap = acle<T>::tbl_swap();
     svbool_t pg1 = acle<T>::pg1();
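Across all the SIMD backends in this patch, TimesI and TimesMinusI drop their unused second argument and become unary functors. The per-lane semantics they implement, checked with plain scalars (a sketch, not Grid code):

    #include <complex>
    #include <cassert>

    // timesI(r + I*m)      = -m + I*r
    // timesMinusI(r + I*m) =  m - I*r
    struct TimesI      { std::complex<double> operator()(std::complex<double> a) const { return {-a.imag(),  a.real()}; } };
    struct TimesMinusI { std::complex<double> operator()(std::complex<double> a) const { return { a.imag(), -a.real()}; } };

    int main() {
      std::complex<double> z(3, 4), I(0, 1);
      assert(TimesI()(z) == I * z);
      assert(TimesMinusI()(z) == -I * z);
    }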
diff --git a/Grid/simd/Grid_a64fx-fixedsize.h b/Grid/simd/Grid_a64fx-fixedsize.h
index 6b450012..5bf1b0a3 100644
--- a/Grid/simd/Grid_a64fx-fixedsize.h
+++ b/Grid/simd/Grid_a64fx-fixedsize.h
@@ -418,7 +418,7 @@ struct Conj{
 struct TimesMinusI{
   // Complex float
-  inline vecf operator()(vecf a, vecf b){
+  inline vecf operator()(vecf a){
     lutf tbl_swap = acle<float>::tbl_swap();
     pred pg1 = acle<float>::pg1();
     pred pg_odd = acle<float>::pg_odd();
@@ -428,7 +428,7 @@ struct TimesMinusI{
     return svneg_m(a_v, pg_odd, a_v);
   }
   // Complex double
-  inline vecd operator()(vecd a, vecd b){
+  inline vecd operator()(vecd a){
     lutd tbl_swap = acle<double>::tbl_swap();
     pred pg1 = acle<double>::pg1();
     pred pg_odd = acle<double>::pg_odd();
@@ -441,7 +441,7 @@ struct TimesI{
   // Complex float
-  inline vecf operator()(vecf a, vecf b){
+  inline vecf operator()(vecf a){
     lutf tbl_swap = acle<float>::tbl_swap();
     pred pg1 = acle<float>::pg1();
     pred pg_even = acle<float>::pg_even();
@@ -451,7 +451,7 @@ struct TimesI{
     return svneg_m(a_v, pg_even, a_v);
   }
   // Complex double
-  inline vecd operator()(vecd a, vecd b){
+  inline vecd operator()(vecd a){
     lutd tbl_swap = acle<double>::tbl_swap();
     pred pg1 = acle<double>::pg1();
     pred pg_even = acle<double>::pg_even();

diff --git a/Grid/simd/Grid_avx.h b/Grid/simd/Grid_avx.h
index ad9800fb..f8962714 100644
--- a/Grid/simd/Grid_avx.h
+++ b/Grid/simd/Grid_avx.h
@@ -405,12 +405,12 @@ struct Conj{
 struct TimesMinusI{
   //Complex single
-  inline __m256 operator()(__m256 in, __m256 ret){
+  inline __m256 operator()(__m256 in){
     __m256 tmp =_mm256_addsub_ps(_mm256_setzero_ps(),in); // r,-i
     return _mm256_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,0,1)); //-i,r
   }
   //Complex double
-  inline __m256d operator()(__m256d in, __m256d ret){
+  inline __m256d operator()(__m256d in){
     __m256d tmp = _mm256_addsub_pd(_mm256_setzero_pd(),in); // r,-i
     return _mm256_shuffle_pd(tmp,tmp,0x5);
   }
 };

@@ -418,12 +418,12 @@ struct TimesI{
   //Complex single
-  inline __m256 operator()(__m256 in, __m256 ret){
+  inline __m256 operator()(__m256 in){
     __m256 tmp =_mm256_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1)); // i,r
     return _mm256_addsub_ps(_mm256_setzero_ps(),tmp);  // i,-r
   }
   //Complex double
-  inline __m256d operator()(__m256d in, __m256d ret){
+  inline __m256d operator()(__m256d in){
     __m256d tmp = _mm256_shuffle_pd(in,in,0x5);
     return _mm256_addsub_pd(_mm256_setzero_pd(),tmp); // i,-r
   }
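For reference, the two-instruction AVX sequence above works because addsub negates even lanes and passes odd lanes, and the shuffle swaps real/imaginary within each pair. A scalar model of TimesMinusI on one (re,im) pair (a sketch of the lane arithmetic, not intrinsics):

    #include <cassert>

    // addsub(0, in): even lane 0-re, odd lane 0+im  ->  (-re,  im)
    // swap pairs   :                                ->  ( im, -re)  ==  -i*(re + i*im)
    int main() {
      float re = 3.f, im = 4.f;
      float t0 = 0.f - re, t1 = 0.f + im;   // _mm256_addsub_ps(setzero, in)
      float r0 = t1,       r1 = t0;         // _mm256_shuffle_ps swap within the pair
      assert(r0 == 4.f && r1 == -3.f);      // -i*(3+4i) = 4 - 3i
    }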
diff --git a/Grid/simd/Grid_avx512.h b/Grid/simd/Grid_avx512.h
index 839d4554..95b96143 100644
--- a/Grid/simd/Grid_avx512.h
+++ b/Grid/simd/Grid_avx512.h
@@ -271,14 +271,14 @@ struct Conj{
 struct TimesMinusI{
   //Complex single
-  inline __m512 operator()(__m512 in, __m512 ret){
+  inline __m512 operator()(__m512 in){
     //__m512 tmp = _mm512_mask_sub_ps(in,0xaaaa,_mm512_setzero_ps(),in); // real -imag
     //return _mm512_shuffle_ps(tmp,tmp,_MM_SELECT_FOUR_FOUR(2,3,1,0));   // 0x4E??
     __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
     return _mm512_mask_sub_ps(tmp,0xaaaa,_mm512_setzero_ps(),tmp);
   }
   //Complex double
-  inline __m512d operator()(__m512d in, __m512d ret){
+  inline __m512d operator()(__m512d in){
     //__m512d tmp = _mm512_mask_sub_pd(in,0xaa,_mm512_setzero_pd(),in); // real -imag
     //return _mm512_shuffle_pd(tmp,tmp,0x55);
     __m512d tmp = _mm512_shuffle_pd(in,in,0x55);
@@ -288,17 +288,16 @@ struct TimesMinusI{

 struct TimesI{
   //Complex single
-  inline __m512 operator()(__m512 in, __m512 ret){
+  inline __m512 operator()(__m512 in){
     __m512 tmp = _mm512_shuffle_ps(in,in,_MM_SELECT_FOUR_FOUR(2,3,0,1));
     return _mm512_mask_sub_ps(tmp,0x5555,_mm512_setzero_ps(),tmp);
   }
   //Complex double
-  inline __m512d operator()(__m512d in, __m512d ret){
+  inline __m512d operator()(__m512d in){
     __m512d tmp = _mm512_shuffle_pd(in,in,0x55);
     return _mm512_mask_sub_pd(tmp,0x55,_mm512_setzero_pd(),tmp);
   }
-
 };

 // Gpermute utilities consider coalescing into 1 Gpermute
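The new file below is the core of the patch: it stores a vector of complex numbers as two real arrays (all real parts, then all imaginary parts), the "RRII" layout, instead of the interleaved "RIRI" of Grid_gpu_vec.h. A sketch of the layout and of the lane-wise complex product it enables without any cross-lane shuffles (the multiply body is my reading of the standard complex product; parts of the listing below were lost in extraction):

    #include <cstdio>

    template<int N, class datum>
    struct GpuComplexVectorSketch {   // "RRII": struct-of-arrays complex storage
      datum rrrr[N];                  // all real parts
      datum iiii[N];                  // all imaginary parts
    };

    // (a+bi)(c+di) = (ac - bd) + (ad + bc)i, lane by lane; maps cleanly to FMAs.
    template<int N, class datum>
    GpuComplexVectorSketch<N,datum> mul(const GpuComplexVectorSketch<N,datum>& l,
                                        const GpuComplexVectorSketch<N,datum>& r) {
      GpuComplexVectorSketch<N,datum> ret;
      for (int i = 0; i < N; i++) {
        ret.rrrr[i] = l.rrrr[i]*r.rrrr[i] - l.iiii[i]*r.iiii[i];
        ret.iiii[i] = l.rrrr[i]*r.iiii[i] + l.iiii[i]*r.rrrr[i];
      }
      return ret;
    }

    int main() {
      GpuComplexVectorSketch<2,float> a{{3,1},{4,0}}, b{{1,2},{2,0}};
      auto c = mul(a,b);
      std::printf("lane0 = (%g,%g)\n", c.rrrr[0], c.iiii[0]); // (3+4i)(1+2i) = -5+10i
    }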
diff --git a/Grid/simd/Grid_gpu_rrii.h b/Grid/simd/Grid_gpu_rrii.h
new file mode 100644
index 00000000..36f343e4
--- /dev/null
+++ b/Grid/simd/Grid_gpu_rrii.h
@@ -0,0 +1,878 @@
+ /*************************************************************************************
+
+    Grid physics library, www.github.com/paboyle/Grid
+
+    Source file: ./lib/simd/Grid_gpu.h
+
+    Copyright (C) 2021
+
+Author: Peter Boyle
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    See the full license in the file "LICENSE" in the top level distribution directory
+ *************************************************************************************/
+ /*  END LEGAL */
+//----------------------------------------------------------------------
+/*! @file Grid_gpu_rrii.h*/
+//----------------------------------------------------------------------
+
+//////////////////////////////
+// fp16
+//////////////////////////////
+#ifdef GRID_CUDA
+#include <cuda_fp16.h>
+#endif
+#ifdef GRID_HIP
+#include <hip/hip_fp16.h>
+#endif
+#if !defined(GRID_HIP) && !defined(GRID_CUDA)
+namespace Grid {
+  typedef struct { uint16_t x;} half;
+}
+#endif
+namespace Grid {
+  accelerator_inline float half2float(half h)
+  {
+    float f;
+#if defined(GRID_CUDA) || defined(GRID_HIP)
+    f = __half2float(h);
+#else
+    Grid_half hh;
+    hh.x = h.x;
+    f= sfw_half_to_float(hh);
+#endif
+    return f;
+  }
+  accelerator_inline half float2half(float f)
+  {
+    half h;
+#if defined(GRID_CUDA) || defined(GRID_HIP)
+    h = __float2half(f);
+#else
+    Grid_half hh = sfw_float_to_half(f);
+    h.x = hh.x;
+#endif
+    return h;
+  }
+}
+
+#define COALESCE_GRANULARITY ( GEN_SIMD_WIDTH )
+
+namespace Grid {
+
+////////////////////////////////////////////////////////////////////////
+// Real vector
+////////////////////////////////////////////////////////////////////////
+template<int _N, class _datum>
+struct GpuVector {
+  _datum rrrr[_N];
+  static const int N = _N;
+  typedef _datum datum;
+};
+template<int N,class datum>
+inline accelerator GpuVector<N,datum> operator*(const GpuVector<N,datum> l,const GpuVector<N,datum> r) {
+  GpuVector<N,datum> ret;
+  for(int i=0;i<N;i++){
+    ret.rrrr[i] = l.rrrr[i]*r.rrrr[i];
+  }
+  return ret;
+}
+template<int N,class datum>
+inline accelerator GpuVector<N,datum> operator-(const GpuVector<N,datum> l,const GpuVector<N,datum> r) {
+  GpuVector<N,datum> ret;
+  for(int i=0;i<N;i++){
+    ret.rrrr[i] = l.rrrr[i]-r.rrrr[i];
+  }
+  return ret;
+}
+template<int N,class datum>
+inline accelerator GpuVector<N,datum> operator+(const GpuVector<N,datum> l,const GpuVector<N,datum> r) {
+  GpuVector<N,datum> ret;
+  for(int i=0;i<N;i++){
+    ret.rrrr[i] = l.rrrr[i]+r.rrrr[i];
+  }
+  return ret;
+}
+template<int N,class datum>
+inline accelerator GpuVector<N,datum> operator/(const GpuVector<N,datum> l,const GpuVector<N,datum> r) {
+  GpuVector<N,datum> ret;
+  for(int i=0;i<N;i++){
+    ret.rrrr[i] = l.rrrr[i]/r.rrrr[i];
+  }
+  return ret;
+}
+template<int _N, class _datum>
+struct GpuComplexVector {
+  _datum rrrr[_N];
+  _datum iiii[_N];
+  static const int N = _N;
+  typedef _datum datum;
+};
+template<int N,class datum>
+inline accelerator GpuComplexVector<N,datum> operator*(const GpuComplexVector<N,datum> l,const GpuComplexVector<N,datum> r) {
+  GpuComplexVector<N,datum> ret;
+  for(int i=0;i<N;i++){
+    ret.rrrr[i] = l.rrrr[i]*r.rrrr[i] - l.iiii[i]*r.iiii[i];
+    ret.iiii[i] = l.rrrr[i]*r.iiii[i] + l.iiii[i]*r.rrrr[i];
+  }
+  return ret;
+}
+template<int N,class datum>
+inline accelerator GpuComplexVector<N,datum> operator-(const GpuComplexVector<N,datum> l,const GpuComplexVector<N,datum> r) {
+  GpuComplexVector<N,datum> ret;
+  for(int i=0;i<N;i++){
+    ret.rrrr[i] = l.rrrr[i]-r.rrrr[i];
+    ret.iiii[i] = l.iiii[i]-r.iiii[i];
+  }
+  return ret;
+}
+template<int N,class datum>
+inline accelerator GpuComplexVector<N,datum> operator+(const GpuComplexVector<N,datum> l,const GpuComplexVector<N,datum> r) {
+  GpuComplexVector<N,datum> ret;
+  for(int i=0;i<N;i++){
+    ret.rrrr[i] = l.rrrr[i]+r.rrrr[i];
+    ret.iiii[i] = l.iiii[i]+r.iiii[i];
+  }
+  return ret;
+}
+template<int N,class datum>
+inline accelerator GpuComplexVector<N,datum> operator/(const GpuComplexVector<N,datum> l,const GpuComplexVector<N,datum> r) {
+  GpuComplexVector<N,datum> ret;
+  for(int i=0;i<N;i++){
+    ret.rrrr[i] = l.rrrr[i]/r.rrrr[i];
+    ret.iiii[i] = l.iiii[i]/r.iiii[i];
+  }
+  return ret;
+}
+
+typedef GpuVector GpuVectorRH;
+typedef GpuComplexVector GpuVectorCH;
+typedef GpuVector GpuVectorRF;
+typedef GpuComplexVector GpuVectorCF;
+typedef GpuVector GpuVectorRD;
+typedef GpuComplexVector GpuVectorCD;
+typedef GpuVector GpuVectorI;
+
+namespace Optimization {
+
+  struct Vsplat{
+    //Complex float
+    accelerator_inline GpuVectorCF operator()(float a, float b){
+      GpuVectorCF ret;
+      for(int i=0;i<GpuVectorCF::N;i++){
+        ret.rrrr[i] = a;
+        ret.iiii[i] = b;
+      }
+      return ret;
+    }
+  };
+
+  struct Vstore{
+    template<int N,class datum,class P>
+    accelerator_inline void operator()(GpuVector<N,datum> a, P* Fp){
+      GpuVector<N,datum> *vF = (GpuVector<N,datum> *)Fp;
+      *vF = a;
+    }
+    template<int N,class datum,class P>
+    accelerator_inline void operator()(GpuComplexVector<N,datum> a, P* Fp){
+      GpuComplexVector<N,datum> *vF = (GpuComplexVector<N,datum> *)Fp;
+      *vF = a;
+    }
+  };
+
+  struct Vstream{
+    template<int N,class datum,class P>
+    accelerator_inline void operator()(P* F,GpuVector<N,datum> a){
+      GpuVector<N,datum> *vF = (GpuVector<N,datum> *)F;
+      *vF = a;
+    }
+    template<int N,class datum,class P>
+    accelerator_inline void operator()(P* F,GpuComplexVector<N,datum> a){
+      GpuComplexVector<N,datum> *vF = (GpuComplexVector<N,datum> *)F;
+      *vF = a;
+    }
+  };
+
+  struct Vset{
+    // Complex float
+    accelerator_inline GpuVectorCF operator()(Grid::ComplexF *a){
+      typedef GpuVectorCF vec;
+      vec ret;
+      for(int i=0;i<vec::N;i++){
+        ret.rrrr[i] = a[i].real();
+        ret.iiii[i] = a[i].imag();
+      }
+      return ret;
+    }
+  };
printf("Error, using wrong Reduce function\n"); + exit(1); + return 0; + } + }; + + ///////////////////////////////////////////////////// + // Arithmetic operations + ///////////////////////////////////////////////////// + struct Sum{ + //Real float + accelerator_inline GpuVectorRF operator()(GpuVectorRF a,GpuVectorRF b){ + return a+b; + } + accelerator_inline GpuVectorRD operator()(GpuVectorRD a,GpuVectorRD b){ + return a+b; + } + accelerator_inline GpuVectorCF operator()(GpuVectorCF a,GpuVectorCF b){ + return a+b; + } + accelerator_inline GpuVectorCD operator()(GpuVectorCD a,GpuVectorCD b){ + return a+b; + } + accelerator_inline GpuVectorI operator()(GpuVectorI a,GpuVectorI b){ + return a+b; + } + }; + + struct Sub{ + accelerator_inline GpuVectorRF operator()(GpuVectorRF a,GpuVectorRF b){ + return a-b; + } + accelerator_inline GpuVectorRD operator()(GpuVectorRD a,GpuVectorRD b){ + return a-b; + } + accelerator_inline GpuVectorCF operator()(GpuVectorCF a,GpuVectorCF b){ + return a-b; + } + accelerator_inline GpuVectorCD operator()(GpuVectorCD a,GpuVectorCD b){ + return a-b; + } + accelerator_inline GpuVectorI operator()(GpuVectorI a,GpuVectorI b){ + return a-b; + } + }; + + struct MultRealPart{ + accelerator_inline GpuVectorCF operator()(GpuVectorCF a,GpuVectorCF b){ + typedef GpuVectorCF vec; + vec ret; + for(int i=0;i + static accelerator_inline GpuVector<_N,_datum> PermuteN(GpuVector<_N,_datum> &in) { + typedef GpuVector<_N,_datum> vec; + vec out; + unsigned int _mask = vec::N >> (n + 1); + for(int i=0;i + static accelerator_inline GpuComplexVector<_N,_datum> PermuteN(GpuComplexVector<_N,_datum> &in) { + typedef GpuComplexVector<_N,_datum> vec; + vec out; + unsigned int _mask = vec::N >> (n + 1); + for(int i=0;i static accelerator_inline vec Permute0(vec in) { return PermuteN<0,vec::N,typename vec::datum>(in); } + template static accelerator_inline vec Permute1(vec in) { return PermuteN<1,vec::N,typename vec::datum>(in); } + template static accelerator_inline vec Permute2(vec in) { return PermuteN<2,vec::N,typename vec::datum>(in); } + template static accelerator_inline vec Permute3(vec in) { return PermuteN<3,vec::N,typename vec::datum>(in); } + + }; + + struct PrecisionChange { + + //////////////////////////////////////////////////////////////////////////////////// + // Single / Half + //////////////////////////////////////////////////////////////////////////////////// + static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) { + int N = GpuVectorCF::N; + GpuVectorCH h; + for(int i=0;i + static accelerator_inline void ExchangeN(GpuVector<_N,_datum> &out1, + GpuVector<_N,_datum> &out2, + GpuVector<_N,_datum> &in1, + GpuVector<_N,_datum> &in2 ) + { + typedef GpuVector<_N,_datum> vec; + unsigned int mask = vec::N >> (n + 1); + for(int i=0;i + static accelerator_inline void ExchangeN(GpuComplexVector<_N,_datum> &out1, + GpuComplexVector<_N,_datum> &out2, + GpuComplexVector<_N,_datum> &in1, + GpuComplexVector<_N,_datum> &in2 ) + { + typedef GpuComplexVector<_N,_datum> vec; + unsigned int mask = vec::N >> (n + 1); + for(int i=0;i + static accelerator_inline void Exchange0(vec &out1,vec &out2,vec &in1,vec &in2){ + ExchangeN<0>(out1,out2,in1,in2); + }; + template + static accelerator_inline void Exchange1(vec &out1,vec &out2,vec &in1,vec &in2){ + ExchangeN<1>(out1,out2,in1,in2); + }; + template + static accelerator_inline void Exchange2(vec &out1,vec &out2,vec &in1,vec &in2){ + ExchangeN<2>(out1,out2,in1,in2); + }; + template + static accelerator_inline void Exchange3(vec 
&out1,vec &out2,vec &in1,vec &in2){ + ExchangeN<3>(out1,out2,in1,in2); + }; + +}; + +struct Rotate{ + + template static accelerator_inline vec tRotate(vec in){ + return rotate(in, n); + } + + template + static accelerator_inline GpuComplexVector<_N,_datum> rotate_template(GpuComplexVector<_N,_datum> &in, int n) + { + typedef GpuComplexVector<_N,_datum> vec; + vec out; + for(int i=0;i + static accelerator_inline GpuVector<_N,_datum> rotate_template(GpuVector<_N,_datum> &in, int n) + { + typedef GpuVector<_N,_datum> vec; + vec out; + for(int i=0;i + accelerator_inline Grid::ComplexF + Reduce::operator()(GpuVectorCF in) + { + Grid::ComplexF greduce(in.rrrr[0],in.iiii[0]); + for(int i=1;i + accelerator_inline Grid::ComplexD + Reduce::operator()(GpuVectorCD in) + { + Grid::ComplexD greduce(in.rrrr[0],in.iiii[0]); + for(int i=1;i + accelerator_inline Grid::RealF + Reduce::operator()(GpuVectorRF in) + { + RealF ret = in.rrrr[0]; + for(int i=1;i + accelerator_inline Grid::RealD + Reduce::operator()(GpuVectorRD in) + { + RealD ret = in.rrrr[0]; + for(int i=1;i + accelerator_inline Integer + Reduce::operator()(GpuVectorI in) + { + Integer ret = in.rrrr[0]; + for(int i=1;i using ReduceSIMD = Optimization::Reduce; + + // Arithmetic operations + typedef Optimization::Sum SumSIMD; + typedef Optimization::Sub SubSIMD; + typedef Optimization::Div DivSIMD; + typedef Optimization::Mult MultSIMD; + typedef Optimization::MultComplex MultComplexSIMD; + typedef Optimization::MultRealPart MultRealPartSIMD; + typedef Optimization::MaddRealPart MaddRealPartSIMD; + typedef Optimization::Conj ConjSIMD; + typedef Optimization::TimesMinusI TimesMinusISIMD; + typedef Optimization::TimesI TimesISIMD; + +} diff --git a/Grid/simd/Grid_gpu_vec.h b/Grid/simd/Grid_gpu_vec.h index b2c7588f..6f4528c7 100644 --- a/Grid/simd/Grid_gpu_vec.h +++ b/Grid/simd/Grid_gpu_vec.h @@ -38,7 +38,7 @@ Author: Peter Boyle #ifdef GRID_HIP #include #endif -#ifdef GRID_SYCL +#if !defined(GRID_CUDA) && !defined(GRID_HIP) namespace Grid { typedef struct { uint16_t x;} half; typedef struct { half x; half y;} half2; @@ -486,7 +486,7 @@ namespace Optimization { struct TimesMinusI{ //Complex single - accelerator_inline GpuVectorCF operator()(GpuVectorCF in,GpuVectorCF dummy){ + accelerator_inline GpuVectorCF operator()(GpuVectorCF in){ typedef GpuVectorCF vec; vec ret; for(int i=0;i friend accelerator_inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) { Grid_simd ret; - Grid_simd::conv_t conv; Grid_simd::scalar_type s; - conv.v = v.v; for (int i = 0; i < Nsimd(); i++) { - s = conv.s[i]; - conv.s[i] = func(s); + s = v.getlane(i); + s = func(s); + ret.putlane(s,i); } - ret.v = conv.v; return ret; } template @@ -571,18 +569,14 @@ public: const Grid_simd &x, const Grid_simd &y) { Grid_simd ret; - Grid_simd::conv_t cx; - Grid_simd::conv_t cy; Grid_simd::scalar_type sx,sy; - cx.v = x.v; - cy.v = y.v; for (int i = 0; i < Nsimd(); i++) { - sx = cx.s[i]; - sy = cy.s[i]; - cx.s[i] = func(sx,sy); + sx = x.getlane(i); + sy = y.getlane(i); + sx = func(sx,sy); + ret.putlane(sx,i); } - ret.v = cx.v; return ret; } /////////////////////// @@ -645,15 +639,36 @@ public: /////////////////////////////// // Getting single lanes /////////////////////////////// - accelerator_inline Scalar_type getlane(int lane) { +#ifdef GPU_RRII + template = 0> + accelerator_inline Scalar_type getlane(int lane) const { + return Scalar_type(v.rrrr[lane],v.iiii[lane]); + } + template = 0> + accelerator_inline void putlane(const Scalar_type &_S, int lane){ + v.rrrr[lane] = 
diff --git a/Grid/simd/Grid_gpu_vec.h b/Grid/simd/Grid_gpu_vec.h
index b2c7588f..6f4528c7 100644
--- a/Grid/simd/Grid_gpu_vec.h
+++ b/Grid/simd/Grid_gpu_vec.h
@@ -38,7 +38,7 @@ Author: Peter Boyle
 #ifdef GRID_HIP
 #include <hip/hip_fp16.h>
 #endif
-#ifdef GRID_SYCL
+#if !defined(GRID_CUDA) && !defined(GRID_HIP)
 namespace Grid {
   typedef struct { uint16_t x;} half;
   typedef struct { half x; half y;} half2;
@@ -486,7 +486,7 @@ namespace Optimization {

 struct TimesMinusI{
   //Complex single
-  accelerator_inline GpuVectorCF operator()(GpuVectorCF in,GpuVectorCF dummy){
+  accelerator_inline GpuVectorCF operator()(GpuVectorCF in){
     typedef GpuVectorCF vec;
     vec ret;
     for(int i=0;i<vec::N;i++){

diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h
--- a/Grid/simd/Grid_vector_types.h
+++ b/Grid/simd/Grid_vector_types.h
@@ ... @@
   friend accelerator_inline Grid_simd SimdApply(const functor &func, const Grid_simd &v) {
     Grid_simd ret;
-    Grid_simd::conv_t conv;
     Grid_simd::scalar_type s;

-    conv.v = v.v;
     for (int i = 0; i < Nsimd(); i++) {
-      s = conv.s[i];
-      conv.s[i] = func(s);
+      s = v.getlane(i);
+      s = func(s);
+      ret.putlane(s,i);
     }
-    ret.v = conv.v;
     return ret;
   }
@@ -571,18 +569,14 @@
                                                  const Grid_simd &x, const Grid_simd &y) {
     Grid_simd ret;
-    Grid_simd::conv_t cx;
-    Grid_simd::conv_t cy;
     Grid_simd::scalar_type sx,sy;

-    cx.v = x.v;
-    cy.v = y.v;
     for (int i = 0; i < Nsimd(); i++) {
-      sx = cx.s[i];
-      sy = cy.s[i];
-      cx.s[i] = func(sx,sy);
+      sx = x.getlane(i);
+      sy = y.getlane(i);
+      sx = func(sx,sy);
+      ret.putlane(sx,i);
     }
-    ret.v = cx.v;
     return ret;
   }
   ///////////////////////
@@ -645,15 +639,36 @@ public:
   ///////////////////////////////
   // Getting single lanes
   ///////////////////////////////
-  accelerator_inline Scalar_type getlane(int lane) {
+#ifdef GPU_RRII
+  template <class S = Scalar_type,IfComplex<S> = 0>
+  accelerator_inline Scalar_type getlane(int lane) const {
+    return Scalar_type(v.rrrr[lane],v.iiii[lane]);
+  }
+  template <class S = Scalar_type,IfComplex<S> = 0>
+  accelerator_inline void putlane(const Scalar_type &_S, int lane){
+    v.rrrr[lane] = real(_S);
+    v.iiii[lane] = imag(_S);
+  }
+  template <class S = Scalar_type,IfNotComplex<S> = 0>
+  accelerator_inline Scalar_type getlane(int lane) const {
+    return ((S*)&v)[lane];
+  }
+  template <class S = Scalar_type,IfNotComplex<S> = 0>
+  accelerator_inline void putlane(const S &_S, int lane){
+    ((Scalar_type*)&v)[lane] = _S;
+  }
+#else // Can pun to an array of complex
+  accelerator_inline Scalar_type getlane(int lane) const {
     return ((Scalar_type*)&v)[lane];
   }
-  accelerator_inline void putlane(const Scalar_type &S, int lane){
+  accelerator_inline void putlane(const Scalar_type &S, int lane){
     ((Scalar_type*)&v)[lane] = S;
   }
+#endif
+
+}; // end of Grid_simd class definition

 ///////////////////////////////
 // Define available types
 ///////////////////////////////
@@ -663,7 +678,7 @@
 typedef Grid_simd<double  , SIMD_Dtype> vRealD;
 typedef Grid_simd<Integer , SIMD_Itype> vInteger;
 typedef Grid_simd<uint16_t, SIMD_Htype> vRealH;

-#ifdef GPU_VEC
+#if defined(GPU_VEC) || defined(GPU_RRII)
 typedef Grid_simd, SIMD_CHtype> vComplexH;
 typedef Grid_simd<complex<float>  , SIMD_CFtype> vComplexF;
 typedef Grid_simd<complex<double> , SIMD_CDtype> vComplexD;
@@ -763,6 +778,7 @@ accelerator_inline void vsplat(Grid_simd<S, V> &ret, NotEnableIf<is_complex<S>,
 }
 //////////////////////////
+
 ///////////////////////////////////////////////
 // Initialise to 1,0,i for the correct types
 ///////////////////////////////////////////////
@@ -907,34 +923,6 @@ accelerator_inline Grid_simd<S, V> fxmac(Grid_simd<S, V> a, Grid_simd<S, V> b, G

 // ----------------------------------------------

-// Distinguish between complex types and others
-template <class S, class V, IfComplex<S> = 0>
-accelerator_inline Grid_simd<S, V> operator/(Grid_simd<S, V> a, Grid_simd<S, V> b) {
-  typedef Grid_simd<S, V> simd;
-
-  simd ret;
-  simd den;
-  typename simd::conv_t conv;
-
-  ret = a * conjugate(b) ;
-  den = b * conjugate(b) ;
-
-  // duplicates real part
-  auto real_den = toReal(den);
-  simd zden;
-  memcpy((void *)&zden.v,(void *)&real_den.v,sizeof(zden));
-  ret.v=binary(ret.v, zden.v, DivSIMD());
-  return ret;
-};
-
-// Real/Integer types
-template <class S, class V, IfNotComplex<S> = 0>
-accelerator_inline Grid_simd<S, V> operator/(Grid_simd<S, V> a, Grid_simd<S, V> b) {
-  Grid_simd<S, V> ret;
-  ret.v = binary(a.v, b.v, DivSIMD());
-  return ret;
-};
-
 ///////////////////////
 // Conjugate
 ///////////////////////
@@ -959,30 +947,29 @@ accelerator_inline Grid_simd<S, V> adj(const Grid_simd<S, V> &in) {
 ///////////////////////
 template <class S, class V, IfComplex<S> = 0>
 accelerator_inline void timesMinusI(Grid_simd<S, V> &ret, const Grid_simd<S, V> &in) {
-  ret.v = binary(in.v, ret.v, TimesMinusISIMD());
+  ret.v = unary(in.v, TimesMinusISIMD());
 }
 template <class S, class V, IfComplex<S> = 0>
 accelerator_inline Grid_simd<S, V> timesMinusI(const Grid_simd<S, V> &in) {
   Grid_simd<S, V> ret;
-  timesMinusI(ret, in);
+  ret.v=unary(in.v, TimesMinusISIMD());
   return ret;
 }
 template <class S, class V, IfNotComplex<S> = 0>
 accelerator_inline Grid_simd<S, V> timesMinusI(const Grid_simd<S, V> &in) {
   return in;
 }
-
 ///////////////////////
 // timesI
 ///////////////////////
 template <class S, class V, IfComplex<S> = 0>
 accelerator_inline void timesI(Grid_simd<S, V> &ret, const Grid_simd<S, V> &in) {
-  ret.v = binary(in.v, ret.v, TimesISIMD());
+  ret.v = unary(in.v, TimesISIMD());
 }
 template <class S, class V, IfComplex<S> = 0>
 accelerator_inline Grid_simd<S, V> timesI(const Grid_simd<S, V> &in) {
   Grid_simd<S, V> ret;
-  timesI(ret, in);
+  ret.v= unary(in.v, TimesISIMD());
   return ret;
 }
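The SimdApply rewrite above swaps union-based punning (conv_t) for the lane API, so the same code works for interleaved and RRII storage. A standalone sketch of the pattern with a toy vector type (not Grid's):

    #include <array>
    #include <cmath>
    #include <cstdio>

    // Toy 4-lane vector: applying a scalar functor lane by lane makes no
    // assumption about how lanes are laid out in memory.
    struct vRealToy {
      std::array<double,4> lanes;
      double getlane(int l) const { return lanes[l]; }
      void putlane(double s, int l) { lanes[l] = s; }
    };

    template <class functor>
    vRealToy SimdApply(const functor& func, const vRealToy& v) {
      vRealToy ret{};
      for (int i = 0; i < 4; i++) ret.putlane(func(v.getlane(i)), i);
      return ret;
    }

    int main() {
      vRealToy v{{1, 4, 9, 16}};
      auto r = SimdApply([](double x) { return std::sqrt(x); }, v);
      std::printf("%g %g %g %g\n", r.getlane(0), r.getlane(1), r.getlane(2), r.getlane(3));
    }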
@@ -990,6 +977,35 @@ accelerator_inline Grid_simd<S, V> timesI(const Grid_simd<S, V> &in) {
   return in;
 }

+
+// Distinguish between complex types and others
+template <class S, class V, IfComplex<S> = 0>
+accelerator_inline Grid_simd<S, V> operator/(Grid_simd<S, V> a, Grid_simd<S, V> b) {
+  typedef Grid_simd<S, V> simd;
+
+  simd ret;
+  simd den;
+
+  ret = a * conjugate(b) ;
+  den = b * conjugate(b) ;
+
+  // duplicates real part
+  auto real_den = toReal(den);
+  simd zden;
+  memcpy((void *)&zden.v,(void *)&real_den.v,sizeof(zden));
+  ret.v=binary(ret.v, zden.v, DivSIMD());
+  return ret;
+};
+
+// Real/Integer types
+template <class S, class V, IfNotComplex<S> = 0>
+accelerator_inline Grid_simd<S, V> operator/(Grid_simd<S, V> a, Grid_simd<S, V> b) {
+  Grid_simd<S, V> ret;
+  ret.v = binary(a.v, b.v, DivSIMD());
+  return ret;
+};
+
+
 /////////////////////
 // Inner, outer
 /////////////////////
@@ -1021,12 +1037,12 @@ template <class Csimd>  // must be a real arg
 accelerator_inline typename toRealMapper<Csimd>::Realified toReal(const Csimd &in) {
   typedef typename toRealMapper<Csimd>::Realified Rsimd;
   Rsimd ret;
-  typename Rsimd::conv_t conv;
-  memcpy((void *)&conv.v,(void *)&in.v,sizeof(conv.v));
+  int j=0;
   for (int i = 0; i < Rsimd::Nsimd(); i += 2) {
-    conv.s[i + 1] = conv.s[i];  // duplicate (r,r);(r,r);(r,r); etc...
+    auto s = real(in.getlane(j++));
+    ret.putlane(s,i);
+    ret.putlane(s,i+1);
   }
-  memcpy((void *)&ret.v,(void *)&conv.v,sizeof(ret.v));
   return ret;
 }
@@ -1039,18 +1055,19 @@ template <class Rsimd>  // must be a real arg
 accelerator_inline typename toComplexMapper<Rsimd>::Complexified toComplex(const Rsimd &in) {
   typedef typename toComplexMapper<Rsimd>::Complexified Csimd;
-  typename Rsimd::conv_t conv;  // address as real
-
-  conv.v = in.v;
+  typedef typename Csimd::scalar_type scalar_type;
+  int j=0;
+  Csimd ret;
   for (int i = 0; i < Rsimd::Nsimd(); i += 2) {
-    assert(conv.s[i + 1] == conv.s[i]);
+    auto rr = in.getlane(i);
+    auto ri = in.getlane(i+1);
+    assert(rr==ri);
     // trap any cases where real was not duplicated
     // indicating the SIMD grids of real and imag assignment did not correctly
     // match
-    conv.s[i + 1] = 0.0;  // zero imaginary parts
+    scalar_type s(rr,0.0);
+    ret.putlane(s,j++);
   }
-  Csimd ret;
-  memcpy((void *)&ret.v,(void *)&conv.v,sizeof(ret.v));
   return ret;
 }
@@ -1146,6 +1163,27 @@ template <> struct is_simd<vRealD> : public std::true_type {};
 template <typename T> using IfSimd    = Invoke<std::enable_if<is_simd<T>::value, int> >;
 template <typename T> using IfNotSimd = Invoke<std::enable_if<!is_simd<T>::value, unsigned> >;

+///////////////////////////////////////////////
+// Convenience insert / extract with complex support
+///////////////////////////////////////////////
+template <class S, class V>
+accelerator_inline S getlane(const Grid_simd<S, V> &in,int lane) {
+  return in.getlane(lane);
+}
+template <class S, class V>
+accelerator_inline void putlane(Grid_simd<S, V> &vec,const S &_S, int lane){
+  vec.putlane(_S,lane);
+}
+template <class S,IfNotSimd<S> = 0 >
+accelerator_inline S getlane(const S &in,int lane) {
+  return in;
+}
+template <class S,IfNotSimd<S> = 0 >
+accelerator_inline void putlane(S &vec,const S &_S, int lane){
+  vec = _S;
+}
+
+
 NAMESPACE_END(Grid);

 #endif

diff --git a/Grid/simd/Simd.h b/Grid/simd/Simd.h
index 76ca3bef..9de192bb 100644
--- a/Grid/simd/Simd.h
+++ b/Grid/simd/Simd.h
@@ -69,6 +69,7 @@
 typedef RealF Real;
 typedef thrust::complex<RealF> ComplexF;
 typedef thrust::complex<RealD> ComplexD;
 typedef thrust::complex<Real>  Complex;
+typedef thrust::complex<uint16_t> ComplexH;
 template<class T> using complex = thrust::complex<T>;

 accelerator_inline ComplexD pow(const ComplexD& r,RealD y){ return(thrust::pow(r,(double)y)); }
@@ -77,6 +78,7 @@
 typedef std::complex<RealF> ComplexF;
 typedef std::complex<RealD> ComplexD;
 typedef std::complex<Real>  Complex;
+typedef std::complex<uint16_t> ComplexH; // Hack
 template<class T> using complex = std::complex<T>;

 accelerator_inline ComplexD pow(const ComplexD& r,RealD y){ return(std::pow(r,y)); }
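The relocated complex operator/ above computes a/b as a*conj(b), then divides element-wise by the duplicated real |b|^2 produced by toReal. A scalar check of the identity it relies on (a sketch, not Grid code):

    #include <complex>
    #include <cassert>

    // a/b == a*conj(b) / (b*conj(b)); b*conj(b) is purely real (|b|^2),
    // so the SIMD code may divide element-wise by a real-duplicated vector.
    int main() {
      std::complex<double> a(1,2), b(3,-4);
      std::complex<double> num = a * std::conj(b);
      double den = (b * std::conj(b)).real();          // |b|^2 = 25
      std::complex<double> q(num.real()/den, num.imag()/den);
      assert(std::abs(q - a/b) < 1e-15);
    }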
diff --git a/Grid/stencil/Stencil.h b/Grid/stencil/Stencil.h
index 19b5e6ea..e23ff258 100644
--- a/Grid/stencil/Stencil.h
+++ b/Grid/stencil/Stencil.h
@@ -216,7 +216,6 @@ class CartesianStencil : public CartesianStencilAccelerator<vobj,cobj,Parameters>
 typedef CartesianStencilView<vobj,cobj,Parameters> View_type;
 typedef typename View_type::StencilVector StencilVector;
@@ -1014,7 +1013,6 @@ public:
   int Gather(const Lattice<vobj> &rhs,int dimension,int shift,int cbmask,compressor & compress,int &face_idx, int point)
   {
     typedef typename cobj::vector_type vector_type;
-    typedef typename cobj::scalar_type scalar_type;

     int comms_send = this->_comms_send[point] ;
     int comms_recv = this->_comms_recv[point] ;

diff --git a/Grid/tensors/Tensor_class.h b/Grid/tensors/Tensor_class.h
index be045ede..f3114cb5 100644
--- a/Grid/tensors/Tensor_class.h
+++ b/Grid/tensors/Tensor_class.h
@@ -178,6 +178,7 @@ public:
     stream << "S {" << o._internal << "}";
     return stream;
   };
+  // FIXME These will break with change of data layout
   strong_inline const scalar_type * begin() const { return reinterpret_cast<const scalar_type *>(&_internal); }
   strong_inline       scalar_type * begin()       { return reinterpret_cast<scalar_type *>(&_internal); }
   strong_inline const scalar_type * end() const { return begin() + Traits::count; }
@@ -288,6 +289,7 @@ public:
   //  return _internal[i];
   // }

+  // FIXME These will break with change of data layout
   strong_inline const scalar_type * begin() const { return reinterpret_cast<const scalar_type *>(_internal); }
   strong_inline       scalar_type * begin()       { return reinterpret_cast<scalar_type *>(_internal); }
   strong_inline const scalar_type * end() const { return begin() + Traits::count; }
@@ -430,6 +432,7 @@ public:
   //  return _internal[i][j];
   // }

+  // FIXME These will break with change of data layout
   strong_inline const scalar_type * begin() const { return reinterpret_cast<const scalar_type *>(_internal[0]); }
   strong_inline       scalar_type * begin()       { return reinterpret_cast<scalar_type *>(_internal[0]); }
   strong_inline const scalar_type * end() const { return begin() + Traits::count; }

diff --git a/Grid/tensors/Tensor_extract_merge.h b/Grid/tensors/Tensor_extract_merge.h
index ab14f81f..c63a2439 100644
--- a/Grid/tensors/Tensor_extract_merge.h
+++ b/Grid/tensors/Tensor_extract_merge.h
@@ -1,5 +1,5 @@
 /*************************************************************************************
-n
+
 Grid physics library, www.github.com/paboyle/Grid

 Source file: ./lib/tensors/Tensor_extract_merge.h
@@ -62,8 +62,18 @@ void extract(const vobj &vec,ExtractBuffer<sobj> &extracted)
 const int words=sizeof(vobj)/sizeof(vector_type);
 const int Nsimd=vector_type::Nsimd();
 const int Nextr=extracted.size();
+vector_type * vp = (vector_type *)&vec;
 const int s=Nsimd/Nextr;
 sobj_scalar_type *sp = (sobj_scalar_type *) &extracted[0];
+sobj_scalar_type stmp;
+for(int w=0;w<words;w++){
       memcpy((char *)&sp[i*words+w],(char *)&stmp,sizeof(stmp));
     }
   }
+  */
+  return;
 }
@@ -93,7 +105,7 @@ void merge(vobj &vec,ExtractBuffer<sobj> &extracted)
   const int s=Nsimd/Nextr;

   sobj_scalar_type *sp = (sobj_scalar_type *)&extracted[0];
-  scalar_type *vp = (scalar_type *)&vec;
+  vector_type *vp = (vector_type *)&vec;
   scalar_type vtmp;
   sobj_scalar_type stmp;
   for(int w=0;w<words;w++){
@@ ... @@ &extracted, int off
   const int Nextr=extracted.size();
   const int s = Nsimd/Nextr;

-  scalar_type * vp = (scalar_type *)&vec;
+  vector_type * vp = (vector_type *)&vec;
   scalar_type vtmp;
   sobj_scalar_type stmp;
   for(int w=0;w<words;w++){
@@ ... @@ &extracted, int offset)
   const int Nextr=extracted.size();
   const int s = Nsimd/Nextr;

-  scalar_type * vp = (scalar_type *)&vec;
+  vector_type * vp = (vector_type *)&vec;
   scalar_type vtmp;
   sobj_scalar_type stmp;
   for(int w=0;w
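The extract/merge changes above follow the same rule as the rest of the patch: address the object as an array of vector_type words and move single lanes with getlane/putlane, never as a flat scalar_type array. A toy model of extract (illustrative names, not Grid's):

    #include <array>
    #include <complex>
    #include <cassert>

    using scalar = std::complex<float>;

    struct vtype {                                // toy 4-lane vector word
      std::array<float,4> re, im;                 // RRII storage
      scalar getlane(int l) const { return {re[l], im[l]}; }
      void putlane(const scalar& s, int l) { re[l] = s.real(); im[l] = s.imag(); }
    };

    struct vobj { vtype v[2]; };                  // "words" of vector data
    struct sobj { scalar s[2]; };                 // matching scalar object

    // extract: lane l of every vector word -> scalar object l (merge inverts this)
    void extract(const vobj& in, std::array<sobj,4>& out) {
      for (int w = 0; w < 2; w++)
        for (int l = 0; l < 4; l++)
          out[l].s[w] = in.v[w].getlane(l);
    }

    int main() {
      vobj in{};
      in.v[1].putlane({7,-7}, 2);
      std::array<sobj,4> out{};
      extract(in, out);
      assert(out[2].s[1] == scalar(7,-7));
    }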