mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-04 19:25:56 +01:00
Optional GPU target SIMD types; work in progress and trying experiments
This commit is contained in:
parent
7a4c142955
commit
066be31a3b
652
lib/simd/Grid_gpu.h
Normal file
652
lib/simd/Grid_gpu.h
Normal file
@ -0,0 +1,652 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/simd/Grid_gpu.h
|
||||
|
||||
Copyright (C) 2018
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
//----------------------------------------------------------------------
|
||||
/*! @file Grid_gpu.h
|
||||
@brief Optimization libraries for GPU
|
||||
Use float4, double2
|
||||
*/
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
namespace Grid {
|
||||
|
||||
// re im, re, im, re, im etc..
|
||||
struct half8 {
|
||||
half ax, ay, az, aw, bx, by, bz, bw;
|
||||
};
|
||||
accelerator_inline float half2float(half h)
|
||||
{
|
||||
float f;
|
||||
#ifdef __CUDA_ARCH__
|
||||
f = __half2float(h);
|
||||
#else
|
||||
//f = __half2float(h);
|
||||
__half_raw hr(h);
|
||||
Grid_half hh;
|
||||
hh.x = hr.x;
|
||||
f= sfw_half_to_float(hh);
|
||||
#endif
|
||||
return f;
|
||||
}
|
||||
accelerator_inline half float2half(float f)
|
||||
{
|
||||
half h;
|
||||
#ifdef __CUDA_ARCH__
|
||||
h = __float2half(f);
|
||||
#else
|
||||
Grid_half hh = sfw_float_to_half(f);
|
||||
__half_raw hr;
|
||||
hr.x = hh.x;
|
||||
h = __half(hr);
|
||||
#endif
|
||||
return h;
|
||||
}
|
||||
|
||||
namespace Optimization {
|
||||
|
||||
inline accelerator float4 operator*(float4 a,float4 b) {return make_float4(a.x*b.x,a.y*b.y,a.z*b.z,a.w*b.w);}
|
||||
inline accelerator float4 operator+(float4 a,float4 b) {return make_float4(a.x+b.x,a.y+b.y,a.z+b.z,a.w+b.w);}
|
||||
inline accelerator float4 operator-(float4 a,float4 b) {return make_float4(a.x-b.x,a.y-b.y,a.z-b.z,a.w-b.w);}
|
||||
inline accelerator float4 operator/(float4 a,float4 b) {return make_float4(a.x/b.x,a.y/b.y,a.z/b.z,a.w/b.w);}
|
||||
|
||||
inline accelerator double2 operator*(double2 a,double2 b) {return make_double2(a.x*b.x,a.y*b.y);}
|
||||
inline accelerator double2 operator+(double2 a,double2 b) {return make_double2(a.x+b.x,a.y+b.y);}
|
||||
inline accelerator double2 operator-(double2 a,double2 b) {return make_double2(a.x-b.x,a.y-b.y);}
|
||||
inline accelerator double2 operator/(double2 a,double2 b) {return make_double2(a.x/b.x,a.y/b.y);}
|
||||
|
||||
inline accelerator int4 operator*(int4 a,int4 b) {return make_int4(a.x*b.x,a.y*b.y,a.z*b.z,a.w*b.w);}
|
||||
inline accelerator int4 operator+(int4 a,int4 b) {return make_int4(a.x+b.x,a.y+b.y,a.z+b.z,a.w+b.w);}
|
||||
inline accelerator int4 operator-(int4 a,int4 b) {return make_int4(a.x-b.x,a.y-b.y,a.z-b.z,a.w-b.w);}
|
||||
inline accelerator int4 operator/(int4 a,int4 b) {return make_int4(a.x/b.x,a.y/b.y,a.z/b.z,a.w/b.w);}
|
||||
|
||||
struct Vsplat{
|
||||
//Complex float
|
||||
accelerator_inline float4 operator()(float a, float b){
|
||||
float4 ret;
|
||||
ret.x=ret.z=a;
|
||||
ret.y=ret.w=b;
|
||||
return ret;
|
||||
}
|
||||
// Real float
|
||||
accelerator_inline float4 operator()(float a){
|
||||
float4 ret;
|
||||
ret.x=ret.y=ret.z=ret.w = a;
|
||||
return ret;
|
||||
}
|
||||
//Complex double
|
||||
accelerator_inline double2 operator()(double a, double b){
|
||||
double2 ret;
|
||||
ret.x=a;
|
||||
ret.y=b;
|
||||
return ret;
|
||||
}
|
||||
//Real double
|
||||
accelerator_inline double2 operator()(double a){
|
||||
double2 ret;
|
||||
ret.x = ret.y = a;
|
||||
return ret;
|
||||
}
|
||||
//Integer
|
||||
accelerator_inline int4 operator()(Integer a){
|
||||
int4 ret;
|
||||
ret.x=ret.y=ret.z=ret.w=a;
|
||||
return ret;
|
||||
}
|
||||
};
|
||||
|
||||
struct Vstore{
|
||||
//Float
|
||||
accelerator_inline void operator()(float4 a, float* F){
|
||||
float4 *F4 = (float4 *)F;
|
||||
*F4 = a;
|
||||
}
|
||||
//Double
|
||||
accelerator_inline void operator()(double2 a, double* D){
|
||||
double2 *D2 = (double2 *)D;
|
||||
*D2 = a;
|
||||
}
|
||||
//Integer
|
||||
accelerator_inline void operator()(int4 a, Integer* I){
|
||||
int4 *I4 = (int4 *)I;
|
||||
*I4 = a;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
struct Vstream{
|
||||
//Float
|
||||
accelerator_inline void operator()(float * a, float4 b){
|
||||
float4 * a4 = (float4 *)a;
|
||||
*a4 = b;
|
||||
}
|
||||
//Double
|
||||
accelerator_inline void operator()(double * a, double2 b){
|
||||
double2 * a2 = (double2 *)a;
|
||||
*a2 = b;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
struct Vset{
|
||||
// Complex float
|
||||
accelerator_inline float4 operator()(Grid::ComplexF *a){
|
||||
float4 ret;
|
||||
ret.x = a[0].real();
|
||||
ret.y = a[0].imag();
|
||||
ret.z = a[1].real();
|
||||
ret.w = a[1].imag();
|
||||
return ret;
|
||||
}
|
||||
// Complex double
|
||||
accelerator_inline double2 operator()(Grid::ComplexD *a){
|
||||
double2 ret;
|
||||
ret.x = a[0].real();
|
||||
ret.y = a[0].imag();
|
||||
return ret;
|
||||
}
|
||||
// Real float
|
||||
accelerator_inline float4 operator()(float *a){
|
||||
float4 ret;
|
||||
ret.x = a[0];
|
||||
ret.y = a[1];
|
||||
ret.z = a[2];
|
||||
ret.w = a[3];
|
||||
return ret;
|
||||
}
|
||||
// Real double
|
||||
accelerator_inline double2 operator()(double *a){
|
||||
double2 ret;
|
||||
ret.x = a[0];
|
||||
ret.y = a[1];
|
||||
return ret;
|
||||
}
|
||||
// Integer
|
||||
accelerator_inline int4 operator()(Integer *a){
|
||||
int4 ret;
|
||||
ret.x = a[0];
|
||||
ret.y = a[1];
|
||||
ret.z = a[2];
|
||||
ret.w = a[3];
|
||||
return ret;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
template <typename Out_type, typename In_type>
|
||||
struct Reduce{
|
||||
//Need templated class to overload output type
|
||||
//General form must generate error if compiled
|
||||
accelerator_inline Out_type operator()(In_type in){
|
||||
printf("Error, using wrong Reduce function\n");
|
||||
exit(1);
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////
|
||||
// Arithmetic operations
|
||||
/////////////////////////////////////////////////////
|
||||
struct Sum{
|
||||
//Complex/Real float
|
||||
accelerator_inline float4 operator()(float4 a, float4 b){
|
||||
return a+b;
|
||||
}
|
||||
//Complex/Real double
|
||||
accelerator_inline double2 operator()(double2 a, double2 b){
|
||||
return a+b;
|
||||
}
|
||||
//Integer
|
||||
accelerator_inline int4 operator()(int4 a,int4 b){
|
||||
return a+b;
|
||||
}
|
||||
};
|
||||
|
||||
struct Sub{
|
||||
//Complex/Real float
|
||||
accelerator_inline float4 operator()(float4 a, float4 b){
|
||||
return a-b;
|
||||
}
|
||||
//Complex/Real double
|
||||
accelerator_inline double2 operator()(double2 a, double2 b){
|
||||
return a-b;
|
||||
}
|
||||
//Integer
|
||||
accelerator_inline int4 operator()(int4 a, int4 b){
|
||||
return a-b;
|
||||
}
|
||||
};
|
||||
|
||||
struct MultRealPart{
|
||||
accelerator_inline float4 operator()(float4 a, float4 b){
|
||||
float4 ymm0;
|
||||
ymm0.x = a.y;
|
||||
ymm0.y = a.y;
|
||||
ymm0.z = a.w;
|
||||
ymm0.w = a.w;
|
||||
return ymm0*b;
|
||||
// ymm0 = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
|
||||
// return _mm_mul_ps(ymm0,b); // ymm0 <- ar bi, ar br
|
||||
}
|
||||
accelerator_inline double2 operator()(double2 a, double2 b){
|
||||
double2 ymm0;
|
||||
ymm0.x = a.y;
|
||||
ymm0.y = a.y;
|
||||
return ymm0*b;
|
||||
// ymm0 = _mm_shuffle_pd(a,a,0x0); // ymm0 <- ar ar, ar,ar b'00,00
|
||||
// return _mm_mul_pd(ymm0,b); // ymm0 <- ar bi, ar br
|
||||
}
|
||||
};
|
||||
struct MaddRealPart{
|
||||
accelerator_inline float4 operator()(float4 a, float4 b, float4 c){
|
||||
float4 ymm0; // = _mm_shuffle_ps(a,a,_MM_SELECT_FOUR_FOUR(2,2,0,0)); // ymm0 <- ar ar,
|
||||
ymm0.x = a.y;
|
||||
ymm0.y = a.y;
|
||||
ymm0.z = a.w;
|
||||
ymm0.w = a.w;
|
||||
return c+ymm0*b;
|
||||
}
|
||||
accelerator_inline double2 operator()(double2 a, double2 b, double2 c){
|
||||
// ymm0 = _mm_shuffle_pd( a, a, 0x0 );
|
||||
double2 ymm0;
|
||||
ymm0.x = a.y;
|
||||
ymm0.y = a.y;
|
||||
return c+ymm0*b;
|
||||
}
|
||||
};
|
||||
|
||||
struct MultComplex{
|
||||
// Complex float
|
||||
accelerator_inline float4 operator()(float4 a, float4 b){
|
||||
float4 ymm0;
|
||||
ymm0.x = a.x*b.x - a.y*b.y ; // rr - ii
|
||||
ymm0.y = a.x*b.y + a.y*b.x ; // ir + ri
|
||||
ymm0.z = a.z*b.z - a.w*b.w ; // rr - ii
|
||||
ymm0.w = a.w*b.z + a.z*b.w ; // ir + ri
|
||||
return ymm0;
|
||||
}
|
||||
// Complex double
|
||||
accelerator_inline double2 operator()(double2 a, double2 b){
|
||||
double2 ymm0;
|
||||
ymm0.x = a.x*b.x - a.y*b.y ; // rr - ii
|
||||
ymm0.y = a.x*b.y + a.y*b.x ; // ir + ri
|
||||
return ymm0;
|
||||
}
|
||||
};
|
||||
|
||||
struct Mult{
|
||||
|
||||
accelerator_inline void mac(float4 &a, float4 b, float4 c){
|
||||
a= a+b*c;
|
||||
}
|
||||
|
||||
accelerator_inline void mac(double2 &a, double2 b, double2 c){
|
||||
a= a+b*c;
|
||||
}
|
||||
|
||||
// Real float
|
||||
accelerator_inline float4 operator()(float4 a, float4 b){
|
||||
return a*b;
|
||||
}
|
||||
// Real double
|
||||
accelerator_inline double2 operator()(double2 a, double2 b){
|
||||
return a*b;
|
||||
}
|
||||
// Integer
|
||||
accelerator_inline int4 operator()(int4 a, int4 b){
|
||||
return a*b;
|
||||
}
|
||||
};
|
||||
|
||||
struct Div{
|
||||
// Real float
|
||||
accelerator_inline float4 operator()(float4 a, float4 b){
|
||||
return a/b;
|
||||
}
|
||||
// Real double
|
||||
accelerator_inline double2 operator()(double2 a, double2 b){
|
||||
return a/b;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
struct Conj{
|
||||
// Complex single
|
||||
accelerator_inline float4 operator()(float4 in){
|
||||
float4 ret;
|
||||
ret.x = in.x;
|
||||
ret.y = - in.y;
|
||||
ret.z = in.z;
|
||||
ret.w = - in.w;
|
||||
return ret;
|
||||
}
|
||||
// Complex double
|
||||
accelerator_inline double2 operator()(double2 in){
|
||||
double2 ret;
|
||||
ret.x = in.x;
|
||||
ret.y = - in.y;
|
||||
return ret;
|
||||
}
|
||||
// do not define for integer input
|
||||
};
|
||||
|
||||
struct TimesMinusI{
|
||||
//Complex single
|
||||
accelerator_inline float4 operator()(float4 in, float4 ret){
|
||||
float4 tmp;
|
||||
tmp.x = in.y;
|
||||
tmp.y = - in.x;
|
||||
tmp.z = in.w;
|
||||
tmp.w = - in.z;
|
||||
return tmp;
|
||||
}
|
||||
//Complex double
|
||||
accelerator_inline double2 operator()(double2 in, double2 ret){
|
||||
double2 tmp;
|
||||
tmp.x = in.y;
|
||||
tmp.y = - in.x;
|
||||
return tmp;
|
||||
}
|
||||
};
|
||||
|
||||
struct TimesI{
|
||||
//Complex single
|
||||
accelerator_inline float4 operator()(float4 in, float4 ret){
|
||||
float4 tmp;
|
||||
tmp.x = - in.y;
|
||||
tmp.y = in.x;
|
||||
tmp.z = - in.w;
|
||||
tmp.w = in.z;
|
||||
return tmp;
|
||||
}
|
||||
//Complex double
|
||||
accelerator_inline double2 operator()(double2 in, double2 ret){
|
||||
double2 tmp ;
|
||||
tmp.x = - in.y;
|
||||
tmp.y = in.x;
|
||||
return tmp;
|
||||
}
|
||||
};
|
||||
|
||||
struct Permute{
|
||||
|
||||
static accelerator_inline float4 Permute0(float4 in){
|
||||
float4 tmp;
|
||||
tmp.x = in.z;
|
||||
tmp.y = in.w;
|
||||
tmp.z = in.x;
|
||||
tmp.w = in.y;
|
||||
return tmp;
|
||||
};
|
||||
static accelerator_inline float4 Permute1(float4 in){
|
||||
float4 tmp;
|
||||
tmp.x = in.y;
|
||||
tmp.y = in.x;
|
||||
tmp.z = in.w;
|
||||
tmp.w = in.z;
|
||||
return tmp;
|
||||
};
|
||||
static accelerator_inline float4 Permute2(float4 in){
|
||||
return in;
|
||||
};
|
||||
static accelerator_inline float4 Permute3(float4 in){
|
||||
return in;
|
||||
};
|
||||
|
||||
static accelerator_inline double2 Permute0(double2 in){ //AB -> BA
|
||||
double2 tmp;
|
||||
tmp.x = in.y;
|
||||
tmp.y = in.x;
|
||||
return tmp;
|
||||
};
|
||||
static accelerator_inline double2 Permute1(double2 in){
|
||||
return in;
|
||||
};
|
||||
static accelerator_inline double2 Permute2(double2 in){
|
||||
return in;
|
||||
};
|
||||
static accelerator_inline double2 Permute3(double2 in){
|
||||
return in;
|
||||
};
|
||||
};
|
||||
|
||||
struct PrecisionChange {
|
||||
static accelerator_inline half8 StoH (float4 a,float4 b) {
|
||||
half8 h;
|
||||
h.ax = float2half(a.x);
|
||||
h.ay = float2half(a.y);
|
||||
h.az = float2half(a.z);
|
||||
h.aw = float2half(a.w);
|
||||
h.bx = float2half(b.x);
|
||||
h.by = float2half(b.y);
|
||||
h.bz = float2half(b.z);
|
||||
h.bw = float2half(b.w);
|
||||
return h;
|
||||
}
|
||||
static accelerator_inline void HtoS (half8 h,float4 &sa,float4 &sb) {
|
||||
sa.x = half2float(h.ax);
|
||||
sa.y = half2float(h.ay);
|
||||
sa.z = half2float(h.az);
|
||||
sa.w = half2float(h.aw);
|
||||
sb.x = half2float(h.bx);
|
||||
sb.y = half2float(h.by);
|
||||
sb.z = half2float(h.bz);
|
||||
sb.w = half2float(h.bw);
|
||||
}
|
||||
static accelerator_inline float4 DtoS (double2 a,double2 b) {
|
||||
float4 s;
|
||||
s.x = a.x;
|
||||
s.y = a.y;
|
||||
s.z = b.x;
|
||||
s.w = b.y;
|
||||
return s;
|
||||
}
|
||||
static accelerator_inline void StoD (float4 s,double2 &a,double2 &b) {
|
||||
a.x = s.x;
|
||||
a.y = s.y;
|
||||
b.x = s.z;
|
||||
b.y = s.w;
|
||||
}
|
||||
static accelerator_inline half8 DtoH (double2 a,double2 b,double2 c,double2 d) {
|
||||
float4 sa,sb;
|
||||
sa = DtoS(a,b);
|
||||
sb = DtoS(c,d);
|
||||
return StoH(sa,sb);
|
||||
}
|
||||
static accelerator_inline void HtoD (half8 h,double2 &a,double2 &b,double2 &c,double2 &d) {
|
||||
float4 sa,sb;
|
||||
HtoS(h,sa,sb);
|
||||
StoD(sa,a,b);
|
||||
StoD(sb,c,d);
|
||||
}
|
||||
};
|
||||
|
||||
struct Exchange{
|
||||
// 3210 ordering
|
||||
|
||||
static accelerator_inline void Exchange0(float4 &out1,float4 &out2,float4 in1,float4 in2){
|
||||
out1.x = in1.x;
|
||||
out1.y = in1.y;
|
||||
out1.z = in2.x;
|
||||
out1.w = in2.y;
|
||||
|
||||
out2.x = in1.z;
|
||||
out2.y = in1.w;
|
||||
out2.z = in2.z;
|
||||
out2.w = in2.w;
|
||||
|
||||
// out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(1,0,1,0));
|
||||
// out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,2,3,2));
|
||||
return;
|
||||
};
|
||||
|
||||
static accelerator_inline void Exchange1(float4 &out1,float4 &out2,float4 in1,float4 in2){
|
||||
|
||||
out1.x = in1.x;
|
||||
out1.y = in2.x;
|
||||
out1.z = in1.z;
|
||||
out1.w = in2.z;
|
||||
|
||||
out2.x = in1.y;
|
||||
out2.y = in2.y;
|
||||
out2.z = in1.w;
|
||||
out2.w = in2.w;
|
||||
|
||||
// out1= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(2,0,2,0)); /*ACEG*/
|
||||
// out2= _mm_shuffle_ps(in1,in2,_MM_SELECT_FOUR_FOUR(3,1,3,1)); /*BDFH*/
|
||||
// out1= _mm_shuffle_ps(out1,out1,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
|
||||
// out2= _mm_shuffle_ps(out2,out2,_MM_SELECT_FOUR_FOUR(3,1,2,0)); /*AECG*/
|
||||
};
|
||||
static accelerator_inline void Exchange2(float4 &out1,float4 &out2,float4 in1,float4 in2){
|
||||
assert(0);
|
||||
return;
|
||||
};
|
||||
static accelerator_inline void Exchange3(float4 &out1,float4 &out2,float4 in1,float4 in2){
|
||||
assert(0);
|
||||
return;
|
||||
};
|
||||
|
||||
static accelerator_inline void Exchange0(double2 &out1,double2 &out2,double2 in1,double2 in2){
|
||||
out1.x = in1.x;
|
||||
out1.y = in2.x;
|
||||
out2.x = in1.y;
|
||||
out2.y = in2.y;
|
||||
// out1= _mm_shuffle_pd(in1,in2,0x0);
|
||||
// out2= _mm_shuffle_pd(in1,in2,0x3);
|
||||
};
|
||||
static accelerator_inline void Exchange1(double2 &out1,double2 &out2,double2 in1,double2 in2){
|
||||
assert(0);
|
||||
return;
|
||||
};
|
||||
static accelerator_inline void Exchange2(double2 &out1,double2 &out2,double2 in1,double2 in2){
|
||||
assert(0);
|
||||
return;
|
||||
};
|
||||
static accelerator_inline void Exchange3(double2 &out1,double2 &out2,double2 in1,double2 in2){
|
||||
assert(0);
|
||||
return;
|
||||
};
|
||||
};
|
||||
|
||||
struct Rotate{
|
||||
|
||||
static accelerator_inline float4 rotate(float4 in,int n){
|
||||
float4 ret;
|
||||
switch(n){
|
||||
case 0: ret = in ; break;
|
||||
case 1: ret.x = in.y; ret.y = in.z ; ret.z = in.w ; ret.w = in.x; break;
|
||||
case 2: ret.x = in.z; ret.y = in.w ; ret.z = in.x ; ret.w = in.y; break;
|
||||
case 3: ret.x = in.w; ret.y = in.x ; ret.z = in.y ; ret.w = in.z; break;
|
||||
default: break;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
static accelerator_inline double2 rotate(double2 in,int n){
|
||||
double2 ret;
|
||||
switch(n){
|
||||
case 0: ret = in; break;
|
||||
case 1: ret.x = in.y; ret.y = in.x ; break;
|
||||
default: break;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
template<int n> static accelerator_inline float4 tRotate(float4 in){
|
||||
return rotate(in,n);
|
||||
};
|
||||
template<int n> static accelerator_inline double2 tRotate(double2 in){
|
||||
return rotate(in,n);
|
||||
};
|
||||
|
||||
};
|
||||
//////////////////////////////////////////////
|
||||
// Some Template specialization
|
||||
|
||||
//Complex float Reduce
|
||||
template<>
|
||||
accelerator_inline Grid::ComplexF Reduce<Grid::ComplexF, float4>::operator()(float4 in){
|
||||
Grid::ComplexF ret(in.x+in.z,in.y+in.w);
|
||||
return ret;
|
||||
}
|
||||
//Real float Reduce
|
||||
template<>
|
||||
accelerator_inline Grid::RealF Reduce<Grid::RealF, float4>::operator()(float4 in){
|
||||
return in.x+in.y+in.z+in.w;
|
||||
}
|
||||
|
||||
//Complex double Reduce
|
||||
template<>
|
||||
accelerator_inline Grid::ComplexD Reduce<Grid::ComplexD, double2>::operator()(double2 in){
|
||||
return Grid::ComplexD(in.x,in.y);
|
||||
}
|
||||
|
||||
//Real double Reduce
|
||||
template<>
|
||||
accelerator_inline Grid::RealD Reduce<Grid::RealD, double2>::operator()(double2 in){
|
||||
return in.x+in.y;
|
||||
}
|
||||
|
||||
//Integer Reduce
|
||||
template<>
|
||||
accelerator_inline Integer Reduce<Integer, int4>::operator()(int4 in){
|
||||
return in.x+in.y+in.z+in.w;
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
// Here assign types
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
typedef half8 SIMD_Htype; // Single precision type
|
||||
typedef float4 SIMD_Ftype; // Single precision type
|
||||
typedef double2 SIMD_Dtype; // Double precision type
|
||||
typedef int4 SIMD_Itype; // Integer type
|
||||
|
||||
// prefetch utilities
|
||||
accelerator_inline void v_prefetch0(int size, const char *ptr){};
|
||||
accelerator_inline void prefetch_HINT_T0(const char *ptr){};
|
||||
|
||||
// Function name aliases
|
||||
typedef Optimization::Vsplat VsplatSIMD;
|
||||
typedef Optimization::Vstore VstoreSIMD;
|
||||
typedef Optimization::Vset VsetSIMD;
|
||||
typedef Optimization::Vstream VstreamSIMD;
|
||||
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
|
||||
|
||||
// Arithmetic operations
|
||||
typedef Optimization::Sum SumSIMD;
|
||||
typedef Optimization::Sub SubSIMD;
|
||||
typedef Optimization::Div DivSIMD;
|
||||
typedef Optimization::Mult MultSIMD;
|
||||
typedef Optimization::MultComplex MultComplexSIMD;
|
||||
typedef Optimization::MultRealPart MultRealPartSIMD;
|
||||
typedef Optimization::MaddRealPart MaddRealPartSIMD;
|
||||
typedef Optimization::Conj ConjSIMD;
|
||||
typedef Optimization::TimesMinusI TimesMinusISIMD;
|
||||
typedef Optimization::TimesI TimesISIMD;
|
||||
|
||||
}
|
793
lib/simd/Grid_gpu_vec.h
Normal file
793
lib/simd/Grid_gpu_vec.h
Normal file
@ -0,0 +1,793 @@
|
||||
/*************************************************************************************
|
||||
|
||||
Grid physics library, www.github.com/paboyle/Grid
|
||||
|
||||
Source file: ./lib/simd/Grid_gpu.h
|
||||
|
||||
Copyright (C) 2018
|
||||
|
||||
Author: Peter Boyle <paboyle@ph.ed.ac.uk>
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
See the full license in the file "LICENSE" in the top level distribution directory
|
||||
*************************************************************************************/
|
||||
/* END LEGAL */
|
||||
//----------------------------------------------------------------------
|
||||
/*! @file Grid_gpu.h
|
||||
@brief Optimization libraries for GPU
|
||||
Use float4, double2
|
||||
*/
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
namespace Grid {
|
||||
|
||||
|
||||
template<class pair>
|
||||
class GpuComplex {
|
||||
public:
|
||||
pair z;
|
||||
typedef decltype(z.x) real;
|
||||
public:
|
||||
accelerator_inline GpuComplex() = default;
|
||||
accelerator_inline GpuComplex(real re,real im) { z.x=re; z.y=im; };
|
||||
accelerator_inline GpuComplex(const GpuComplex &zz) { z = zz.z;};
|
||||
friend accelerator_inline GpuComplex operator+(const GpuComplex &lhs,const GpuComplex &rhs) {
|
||||
GpuComplex r ;
|
||||
r.z.x = lhs.z.x + rhs.z.x;
|
||||
r.z.y = lhs.z.y + rhs.z.y;
|
||||
return r;
|
||||
}
|
||||
friend accelerator_inline GpuComplex operator-(const GpuComplex &lhs,const GpuComplex &rhs) {
|
||||
GpuComplex r ;
|
||||
r.z.x = lhs.z.x - rhs.z.x;
|
||||
r.z.y = lhs.z.y - rhs.z.y;
|
||||
return r;
|
||||
}
|
||||
friend accelerator_inline GpuComplex operator*(const GpuComplex &lhs,const GpuComplex &rhs) {
|
||||
GpuComplex r ;
|
||||
r.z.x= lhs.z.x*rhs.z.x - lhs.z.y*rhs.z.y; // rr-ii
|
||||
r.z.y= lhs.z.x*rhs.z.y + lhs.z.y*rhs.z.x; // ri+ir
|
||||
return r;
|
||||
}
|
||||
friend accelerator_inline GpuComplex real_mult(const GpuComplex &l,const GpuComplex &r)
|
||||
{
|
||||
GpuComplex ret;
|
||||
ret.z.x = l.z.x*r.z.x;
|
||||
ret.z.y = l.z.x*r.z.y;
|
||||
return ret;
|
||||
}
|
||||
friend std::ostream& operator<< (std::ostream& stream, const GpuComplex o){
|
||||
stream << "("<< o.z.x << ","<< o.z.y <<")";
|
||||
return stream;
|
||||
}
|
||||
};
|
||||
|
||||
template<int _N, class _datum>
|
||||
struct GpuVector {
|
||||
_datum v[_N];
|
||||
static const int N = _N;
|
||||
typedef _datum datum;
|
||||
};
|
||||
|
||||
|
||||
template<int N,class datum>
|
||||
inline accelerator GpuVector<N,datum> operator*(const GpuVector<N,datum> l,const GpuVector<N,datum> r) {
|
||||
GpuVector<N,datum> ret;
|
||||
for(int i=0;i<N;i++) {
|
||||
ret.v[i] = l.v[i]*r.v[i];
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
template<int N,class datum>
|
||||
inline accelerator GpuVector<N,datum> operator-(const GpuVector<N,datum> l,const GpuVector<N,datum> r) {
|
||||
GpuVector<N,datum> ret;
|
||||
for(int i=0;i<N;i++) {
|
||||
ret.v[i] = l.v[i]-r.v[i];
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
template<int N,class datum>
|
||||
inline accelerator GpuVector<N,datum> operator+(const GpuVector<N,datum> l,const GpuVector<N,datum> r) {
|
||||
GpuVector<N,datum> ret;
|
||||
for(int i=0;i<N;i++) {
|
||||
ret.v[i] = l.v[i]+r.v[i];
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
template<int N,class datum>
|
||||
inline accelerator GpuVector<N,datum> operator/(const GpuVector<N,datum> l,const GpuVector<N,datum> r) {
|
||||
GpuVector<N,datum> ret;
|
||||
for(int i=0;i<N;i++) {
|
||||
ret.v[i] = l.v[i]/r.v[i];
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
#define COALESCE_GRANULARITY (64) // 512 bit coalesce size target
|
||||
|
||||
constexpr int NSIMD_RealH = COALESCE_GRANULARITY / sizeof(half);
|
||||
constexpr int NSIMD_ComplexH = COALESCE_GRANULARITY / sizeof(half2);
|
||||
constexpr int NSIMD_RealF = COALESCE_GRANULARITY / sizeof(float);
|
||||
constexpr int NSIMD_ComplexF = COALESCE_GRANULARITY / sizeof(float2);
|
||||
constexpr int NSIMD_RealD = COALESCE_GRANULARITY / sizeof(double);
|
||||
constexpr int NSIMD_ComplexD = COALESCE_GRANULARITY / sizeof(double2);
|
||||
constexpr int NSIMD_Integer = COALESCE_GRANULARITY / sizeof(Integer);
|
||||
|
||||
typedef GpuComplex<half2 > GpuComplexH;
|
||||
typedef GpuComplex<float2 > GpuComplexF;
|
||||
typedef GpuComplex<double2> GpuComplexD;
|
||||
|
||||
typedef GpuVector<NSIMD_RealH , half > GpuVectorRH;
|
||||
typedef GpuVector<NSIMD_ComplexH, GpuComplexH > GpuVectorCH;
|
||||
typedef GpuVector<NSIMD_RealF, float > GpuVectorRF;
|
||||
typedef GpuVector<NSIMD_ComplexF, GpuComplexF > GpuVectorCF;
|
||||
typedef GpuVector<NSIMD_RealD, double > GpuVectorRD;
|
||||
typedef GpuVector<NSIMD_ComplexD, GpuComplexD > GpuVectorCD;
|
||||
typedef GpuVector<NSIMD_Integer, Integer > GpuVectorI;
|
||||
|
||||
accelerator_inline float half2float(half h)
|
||||
{
|
||||
float f;
|
||||
#ifdef __CUDA_ARCH__
|
||||
f = __half2float(h);
|
||||
#else
|
||||
//f = __half2float(h);
|
||||
__half_raw hr(h);
|
||||
Grid_half hh;
|
||||
hh.x = hr.x;
|
||||
f= sfw_half_to_float(hh);
|
||||
#endif
|
||||
return f;
|
||||
}
|
||||
accelerator_inline half float2half(float f)
|
||||
{
|
||||
half h;
|
||||
#ifdef __CUDA_ARCH__
|
||||
h = __float2half(f);
|
||||
#else
|
||||
Grid_half hh = sfw_float_to_half(f);
|
||||
__half_raw hr;
|
||||
hr.x = hh.x;
|
||||
h = __half(hr);
|
||||
#endif
|
||||
return h;
|
||||
}
|
||||
|
||||
namespace Optimization {
|
||||
|
||||
struct Vsplat{
|
||||
//Complex float
|
||||
accelerator_inline GpuVectorCF operator()(float a, float b){
|
||||
GpuVectorCF ret;
|
||||
for(int i=0;i<GpuVectorCF::N;i++){
|
||||
ret.v[i] = typename GpuVectorCF::datum(a,b);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
// Real float
|
||||
accelerator_inline GpuVectorRF operator()(float a){
|
||||
GpuVectorRF ret;
|
||||
for(int i=0;i<GpuVectorRF::N;i++){
|
||||
ret.v[i] = typename GpuVectorRF::datum(a);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
//Complex double
|
||||
accelerator_inline GpuVectorCD operator()(double a, double b){
|
||||
GpuVectorCD ret;
|
||||
for(int i=0;i<GpuVectorCD::N;i++){
|
||||
ret.v[i] = typename GpuVectorCD::datum(a,b);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
//Real double
|
||||
accelerator_inline GpuVectorRD operator()(double a){
|
||||
GpuVectorRD ret;
|
||||
for(int i=0;i<GpuVectorRD::N;i++){
|
||||
ret.v[i] = typename GpuVectorRD::datum(a);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
//Integer
|
||||
accelerator_inline GpuVectorI operator()(Integer a){
|
||||
GpuVectorI ret;
|
||||
for(int i=0;i<GpuVectorI::N;i++){
|
||||
ret.v[i] = typename GpuVectorI::datum(a);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
};
|
||||
|
||||
struct Vstore{
|
||||
template<int N,class datum,class P>
|
||||
accelerator_inline void operator()(GpuVector<N,datum> a, P* Fp){
|
||||
GpuVector<N,datum> *vF = (GpuVector<N,datum> *)Fp;
|
||||
*vF = a;
|
||||
}
|
||||
};
|
||||
|
||||
struct Vstream{
|
||||
template<int N,class datum, class P>
|
||||
accelerator_inline void operator()(P* F,GpuVector<N,datum> a){
|
||||
GpuVector<N,datum> *vF = (GpuVector<N,datum> *)F;
|
||||
*vF = a;
|
||||
}
|
||||
};
|
||||
|
||||
struct Vset{
|
||||
// Complex float
|
||||
accelerator_inline GpuVectorCF operator()(Grid::ComplexF *a){
|
||||
typedef GpuVectorCF vec;
|
||||
vec ret;
|
||||
for(int i=0;i<vec::N;i++){
|
||||
ret.v[i] = vec::datum(a[i].real(),a[i].imag());
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
// Complex double
|
||||
accelerator_inline GpuVectorCD operator()(Grid::ComplexD *a){
|
||||
typedef GpuVectorCD vec;
|
||||
vec ret;
|
||||
for(int i=0;i<vec::N;i++){
|
||||
ret.v[i] = vec::datum(a[i].real(),a[i].imag());
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
// Real float
|
||||
accelerator_inline GpuVectorRF operator()(float *a){
|
||||
typedef GpuVectorRF vec;
|
||||
vec ret;
|
||||
for(int i=0;i<vec::N;i++){
|
||||
ret.v[i] = vec::datum(a[i]);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
// Real double
|
||||
accelerator_inline GpuVectorRD operator()(double *a){
|
||||
typedef GpuVectorRD vec;
|
||||
vec ret;
|
||||
for(int i=0;i<vec::N;i++){
|
||||
ret.v[i] = vec::datum(a[i]);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
// Integer
|
||||
accelerator_inline GpuVectorI operator()(Integer *a){
|
||||
typedef GpuVectorI vec;
|
||||
vec ret;
|
||||
for(int i=0;i<vec::N;i++){
|
||||
ret.v[i] = vec::datum(a[i]);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Out_type, typename In_type>
|
||||
struct Reduce{
|
||||
//Need templated class to overload output type
|
||||
//General form must generate error if compiled
|
||||
accelerator_inline Out_type operator()(In_type in){
|
||||
printf("Error, using wrong Reduce function\n");
|
||||
exit(1);
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
/////////////////////////////////////////////////////
|
||||
// Arithmetic operations
|
||||
/////////////////////////////////////////////////////
|
||||
struct Sum{
|
||||
//Real float
|
||||
accelerator_inline GpuVectorRF operator()(GpuVectorRF a,GpuVectorRF b){
|
||||
return a+b;
|
||||
}
|
||||
accelerator_inline GpuVectorRD operator()(GpuVectorRD a,GpuVectorRD b){
|
||||
return a+b;
|
||||
}
|
||||
accelerator_inline GpuVectorCF operator()(GpuVectorCF a,GpuVectorCF b){
|
||||
return a+b;
|
||||
}
|
||||
accelerator_inline GpuVectorCD operator()(GpuVectorCD a,GpuVectorCD b){
|
||||
return a+b;
|
||||
}
|
||||
accelerator_inline GpuVectorI operator()(GpuVectorI a,GpuVectorI b){
|
||||
return a+b;
|
||||
}
|
||||
};
|
||||
|
||||
struct Sub{
|
||||
accelerator_inline GpuVectorRF operator()(GpuVectorRF a,GpuVectorRF b){
|
||||
return a-b;
|
||||
}
|
||||
accelerator_inline GpuVectorRD operator()(GpuVectorRD a,GpuVectorRD b){
|
||||
return a-b;
|
||||
}
|
||||
accelerator_inline GpuVectorCF operator()(GpuVectorCF a,GpuVectorCF b){
|
||||
return a-b;
|
||||
}
|
||||
accelerator_inline GpuVectorCD operator()(GpuVectorCD a,GpuVectorCD b){
|
||||
return a-b;
|
||||
}
|
||||
accelerator_inline GpuVectorI operator()(GpuVectorI a,GpuVectorI b){
|
||||
return a-b;
|
||||
}
|
||||
};
|
||||
|
||||
struct MultRealPart{
|
||||
accelerator_inline GpuVectorCF operator()(GpuVectorCF a,GpuVectorCF b){
|
||||
typedef GpuVectorCF vec;
|
||||
vec ret;
|
||||
for(int i=0;i<vec::N;i++){
|
||||
ret.v[i] = real_mult(a.v[i],b.v[i]);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
accelerator_inline GpuVectorCD operator()(GpuVectorCD a,GpuVectorCD b){
|
||||
typedef GpuVectorCD vec;
|
||||
vec ret;
|
||||
for(int i=0;i<vec::N;i++){
|
||||
ret.v[i] = real_mult(a.v[i],b.v[i]);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
};
|
||||
|
||||
struct MaddRealPart{
|
||||
accelerator_inline GpuVectorCF operator()(GpuVectorCF a,GpuVectorCF b,GpuVectorCF c){
|
||||
typedef GpuVectorCF vec;
|
||||
vec ret;
|
||||
for(int i=0;i<vec::N;i++){
|
||||
ret.v[i] = real_mult(a.v[i],b.v[i]) +c.v[i];
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
accelerator_inline GpuVectorCD operator()(GpuVectorCD a,GpuVectorCD b,GpuVectorCD c){
|
||||
typedef GpuVectorCD vec;
|
||||
vec ret;
|
||||
for(int i=0;i<vec::N;i++){
|
||||
ret.v[i] = real_mult(a.v[i],b.v[i]) +c.v[i];
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
};
|
||||
|
||||
struct MultComplex{
|
||||
|
||||
accelerator_inline GpuVectorCF operator()(GpuVectorCF a,GpuVectorCF b){
|
||||
return a*b;
|
||||
}
|
||||
accelerator_inline GpuVectorCD operator()(GpuVectorCD a,GpuVectorCD b){
|
||||
return a*b;
|
||||
}
|
||||
};
|
||||
|
||||
struct Mult{
|
||||
accelerator_inline void mac(GpuVectorRF &a, GpuVectorRF b, GpuVectorRF c){
|
||||
a= a+b*c;
|
||||
}
|
||||
accelerator_inline void mac(GpuVectorRD &a, GpuVectorRD b, GpuVectorRD c){
|
||||
a= a+b*c;
|
||||
}
|
||||
// Real float
|
||||
accelerator_inline GpuVectorRF operator()(GpuVectorRF a, GpuVectorRF b){
|
||||
return a*b;
|
||||
}
|
||||
// Real double
|
||||
accelerator_inline GpuVectorRD operator()(GpuVectorRD a, GpuVectorRD b){
|
||||
return a*b;
|
||||
}
|
||||
accelerator_inline GpuVectorI operator()(GpuVectorI a, GpuVectorI b){
|
||||
return a*b;
|
||||
}
|
||||
};
|
||||
|
||||
struct Div{
|
||||
// Real float
|
||||
accelerator_inline GpuVectorRF operator()(GpuVectorRF a, GpuVectorRF b){
|
||||
return a/b;
|
||||
}
|
||||
accelerator_inline GpuVectorRD operator()(GpuVectorRD a, GpuVectorRD b){
|
||||
return a/b;
|
||||
}
|
||||
// Danger -- element wise divide fro complex, not complex div.
|
||||
// See Grid_vector_types.h lines around 735, applied after "toReal"
|
||||
accelerator_inline GpuVectorCF operator()(GpuVectorCF a, GpuVectorCF b){
|
||||
GpuVectorCF ret;
|
||||
for(int i=0;i< GpuVectorCF::N;i++){
|
||||
ret.v[i].z.x = a.v[i].z.x / b.v[i].z.x;
|
||||
ret.v[i].z.y = a.v[i].z.y / b.v[i].z.y;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
accelerator_inline GpuVectorCD operator()(GpuVectorCD a, GpuVectorCD b){
|
||||
GpuVectorCD ret;
|
||||
for(int i=0;i< GpuVectorCD::N;i++){
|
||||
ret.v[i].z.x = a.v[i].z.x / b.v[i].z.x;
|
||||
ret.v[i].z.y = a.v[i].z.y / b.v[i].z.y;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
struct Conj{
|
||||
// Complex single
|
||||
accelerator_inline GpuVectorCF operator()(GpuVectorCF in){
|
||||
typedef GpuVectorCF vec;
|
||||
vec ret;
|
||||
for(int i=0;i<vec::N;i++){
|
||||
ret.v[i].z.x = in.v[i].z.x;
|
||||
ret.v[i].z.y =-in.v[i].z.y;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
accelerator_inline GpuVectorCD operator()(GpuVectorCD in){
|
||||
typedef GpuVectorCD vec;
|
||||
vec ret;
|
||||
for(int i=0;i<vec::N;i++){
|
||||
ret.v[i].z.x = in.v[i].z.x;
|
||||
ret.v[i].z.y =-in.v[i].z.y;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
};
|
||||
|
||||
struct TimesMinusI{
|
||||
//Complex single
|
||||
accelerator_inline GpuVectorCF operator()(GpuVectorCF in,GpuVectorCF dummy){
|
||||
typedef GpuVectorCF vec;
|
||||
vec ret;
|
||||
for(int i=0;i<vec::N;i++){
|
||||
ret.v[i].z.x = in.v[i].z.y;
|
||||
ret.v[i].z.y =-in.v[i].z.x;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
accelerator_inline GpuVectorCD operator()(GpuVectorCD in,GpuVectorCD dummy){
|
||||
typedef GpuVectorCD vec;
|
||||
vec ret;
|
||||
for(int i=0;i<vec::N;i++){
|
||||
ret.v[i].z.x = in.v[i].z.y;
|
||||
ret.v[i].z.y =-in.v[i].z.x;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
};
|
||||
|
||||
struct TimesI{
|
||||
//Complex single
|
||||
accelerator_inline GpuVectorCF operator()(GpuVectorCF in,GpuVectorCF dummy){
|
||||
typedef GpuVectorCF vec;
|
||||
vec ret;
|
||||
for(int i=0;i<vec::N;i++){
|
||||
ret.v[i].z.x =-in.v[i].z.y;
|
||||
ret.v[i].z.y = in.v[i].z.x;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
accelerator_inline GpuVectorCD operator()(GpuVectorCD in,GpuVectorCD dummy){
|
||||
typedef GpuVectorCD vec;
|
||||
vec ret;
|
||||
for(int i=0;i<vec::N;i++){
|
||||
ret.v[i].z.x =-in.v[i].z.y;
|
||||
ret.v[i].z.y = in.v[i].z.x;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
};
|
||||
|
||||
struct Permute{
|
||||
|
||||
template <int n,typename vec>
|
||||
static accelerator_inline vec PermuteN(vec in) {
|
||||
vec out;
|
||||
unsigned int _mask = vec::N >> (n + 1);
|
||||
for(int i=0;i<vec::N;i++) {
|
||||
out.v[i] = in.v[i^_mask];
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
template <typename vec> static accelerator_inline vec Permute0(vec in) { return PermuteN<0,vec>(in); }
|
||||
template <typename vec> static accelerator_inline vec Permute1(vec in) { return PermuteN<1,vec>(in); }
|
||||
template <typename vec> static accelerator_inline vec Permute2(vec in) { return PermuteN<2,vec>(in); }
|
||||
template <typename vec> static accelerator_inline vec Permute3(vec in) { return PermuteN<3,vec>(in); }
|
||||
|
||||
};
|
||||
|
||||
struct PrecisionChange {
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
// Single / Half
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
static accelerator_inline GpuVectorCH StoH (GpuVectorCF a,GpuVectorCF b) {
|
||||
int N = GpuVectorCF::N;
|
||||
GpuVectorCH h;
|
||||
for(int i=0;i<N;i++) {
|
||||
h.v[i ].z.x = float2half(a.v[i].z.x);
|
||||
h.v[i ].z.y = float2half(a.v[i].z.y);
|
||||
h.v[i+N].z.x = float2half(b.v[i].z.x);
|
||||
h.v[i+N].z.y = float2half(b.v[i].z.y);
|
||||
}
|
||||
return h;
|
||||
}
|
||||
static accelerator_inline void HtoS (GpuVectorCH h,GpuVectorCF &sa,GpuVectorCF &sb) {
|
||||
int N = GpuVectorCF::N;
|
||||
for(int i=0;i<N;i++) {
|
||||
sa.v[i].z.x = half2float(h.v[i ].z.x);
|
||||
sa.v[i].z.y = half2float(h.v[i ].z.y);
|
||||
sb.v[i].z.x = half2float(h.v[i+N].z.x);
|
||||
sb.v[i].z.y = half2float(h.v[i+N].z.y);
|
||||
}
|
||||
}
|
||||
static accelerator_inline GpuVectorRH StoH (GpuVectorRF a,GpuVectorRF b) {
|
||||
int N = GpuVectorRF::N;
|
||||
GpuVectorRH h;
|
||||
for(int i=0;i<N;i++) {
|
||||
h.v[i ] = float2half(a.v[i]);
|
||||
h.v[i+N] = float2half(b.v[i]);
|
||||
}
|
||||
return h;
|
||||
}
|
||||
static accelerator_inline void HtoS (GpuVectorRH h,GpuVectorRF &sa,GpuVectorRF &sb) {
|
||||
int N = GpuVectorRF::N;
|
||||
for(int i=0;i<N;i++) {
|
||||
sa.v[i] = half2float(h.v[i ]);
|
||||
sb.v[i] = half2float(h.v[i+N]);
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
// Double Single
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
static accelerator_inline GpuVectorCF DtoS (GpuVectorCD a,GpuVectorCD b) {
|
||||
int N = GpuVectorCD::N;
|
||||
GpuVectorCF h;
|
||||
for(int i=0;i<N;i++) {
|
||||
h.v[i ].z.x = a.v[i].z.x;
|
||||
h.v[i ].z.y = a.v[i].z.y;
|
||||
h.v[i+N].z.x = b.v[i].z.x;
|
||||
h.v[i+N].z.y = b.v[i].z.y;
|
||||
}
|
||||
return h;
|
||||
}
|
||||
|
||||
static accelerator_inline void StoD (GpuVectorCF h,GpuVectorCD &sa,GpuVectorCD &sb) {
|
||||
int N = GpuVectorCD::N;
|
||||
for(int i=0;i<N;i++) {
|
||||
sa.v[i].z.x = h.v[i ].z.x;
|
||||
sa.v[i].z.y = h.v[i ].z.y;
|
||||
sb.v[i].z.x = h.v[i+N].z.x;
|
||||
sb.v[i].z.y = h.v[i+N].z.y;
|
||||
}
|
||||
}
|
||||
|
||||
static accelerator_inline GpuVectorRF DtoS (GpuVectorRD a,GpuVectorRD b) {
|
||||
int N = GpuVectorRD::N;
|
||||
GpuVectorRF h;
|
||||
for(int i=0;i<N;i++) {
|
||||
h.v[i ] = a.v[i];
|
||||
h.v[i+N] = b.v[i];
|
||||
}
|
||||
return h;
|
||||
}
|
||||
|
||||
static accelerator_inline void StoD (GpuVectorRF h,GpuVectorRD &sa,GpuVectorRD &sb) {
|
||||
int N = GpuVectorRD::N;
|
||||
for(int i=0;i<N;i++) {
|
||||
sa.v[i] = h.v[i ];
|
||||
sb.v[i] = h.v[i+N];
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
// Double Half
|
||||
////////////////////////////////////////////////////////////////////////////////////
|
||||
static accelerator_inline GpuVectorCH DtoH (GpuVectorCD a,GpuVectorCD b,GpuVectorCD c,GpuVectorCD d) {
|
||||
GpuVectorCF sa,sb;
|
||||
sa = DtoS(a,b);
|
||||
sb = DtoS(c,d);
|
||||
return StoH(sa,sb);
|
||||
}
|
||||
static accelerator_inline void HtoD (GpuVectorCH h,GpuVectorCD &a,GpuVectorCD &b,GpuVectorCD &c,GpuVectorCD &d) {
|
||||
GpuVectorCF sa,sb;
|
||||
HtoS(h,sa,sb);
|
||||
StoD(sa,a,b);
|
||||
StoD(sb,c,d);
|
||||
}
|
||||
static accelerator_inline GpuVectorRH DtoH (GpuVectorRD a,GpuVectorRD b,GpuVectorRD c,GpuVectorRD d) {
|
||||
GpuVectorRF sa,sb;
|
||||
sa = DtoS(a,b);
|
||||
sb = DtoS(c,d);
|
||||
return StoH(sa,sb);
|
||||
}
|
||||
static accelerator_inline void HtoD (GpuVectorRH h,GpuVectorRD &a,GpuVectorRD &b,GpuVectorRD &c,GpuVectorRD &d) {
|
||||
GpuVectorRF sa,sb;
|
||||
HtoS(h,sa,sb);
|
||||
StoD(sa,a,b);
|
||||
StoD(sb,c,d);
|
||||
}
|
||||
};
|
||||
|
||||
struct Exchange{
|
||||
|
||||
template <typename vec,int n>
|
||||
static accelerator_inline void ExchangeN(vec &out1,vec &out2,vec &in1,vec &in2){
|
||||
unsigned int mask = vec::N >> (n + 1);
|
||||
for(int i=0;i<vec::N;i++) {
|
||||
int j1 = i&(~mask);
|
||||
if ( (i&mask) == 0 ) { out1.v[i]=in1.v[j1];}
|
||||
else { out1.v[i]=in2.v[j1];}
|
||||
int j2 = i|mask;
|
||||
if ( (i&mask) == 0 ) { out2.v[i]=in1.v[j2];}
|
||||
else { out2.v[i]=in2.v[j2];}
|
||||
}
|
||||
}
|
||||
template <typename vec>
|
||||
static accelerator_inline void Exchange0(vec &out1,vec &out2,vec &in1,vec &in2){
|
||||
ExchangeN<vec,0>(out1,out2,in1,in2);
|
||||
};
|
||||
template <typename vec>
|
||||
static accelerator_inline void Exchange1(vec &out1,vec &out2,vec &in1,vec &in2){
|
||||
ExchangeN<vec,1>(out1,out2,in1,in2);
|
||||
};
|
||||
template <typename vec>
|
||||
static accelerator_inline void Exchange2(vec &out1,vec &out2,vec &in1,vec &in2){
|
||||
ExchangeN<vec,2>(out1,out2,in1,in2);
|
||||
};
|
||||
template <typename vec>
|
||||
static accelerator_inline void Exchange3(vec &out1,vec &out2,vec &in1,vec &in2){
|
||||
ExchangeN<vec,3>(out1,out2,in1,in2);
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
struct Rotate{
|
||||
|
||||
template <int n, typename vec> static accelerator_inline vec tRotate(vec in){
|
||||
return rotate(in, n);
|
||||
}
|
||||
|
||||
template <typename vec>
|
||||
static accelerator_inline vec rotate_template(vec in, int n){
|
||||
vec out;
|
||||
for(int i=0;i<vec::N;i++){
|
||||
out.v[i] = in.v[(i + n)%vec::N];
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
typedef GpuVectorRH SIMD_Htype; // Single precision type
|
||||
typedef GpuVectorRF SIMD_Ftype; // Single precision type
|
||||
typedef GpuVectorRD SIMD_Dtype; // Double precision type
|
||||
typedef GpuVectorI SIMD_Itype; // Integer type
|
||||
|
||||
typedef GpuVectorCH SIMD_CHtype; // Single precision type
|
||||
typedef GpuVectorCF SIMD_CFtype; // Single precision type
|
||||
typedef GpuVectorCD SIMD_CDtype; // Double precision type
|
||||
|
||||
static accelerator_inline GpuVectorRH rotate(GpuVectorRH in, int n){ return rotate_template(in,n);}
|
||||
static accelerator_inline GpuVectorRF rotate(GpuVectorRF in, int n){ return rotate_template(in,n);}
|
||||
static accelerator_inline GpuVectorRD rotate(GpuVectorRD in, int n){ return rotate_template(in,n);}
|
||||
static accelerator_inline GpuVectorI rotate(GpuVectorI in, int n){ return rotate_template(in,n);}
|
||||
static accelerator_inline GpuVectorCH rotate(GpuVectorCH in, int n){ return rotate_template(in,n/2);} // Measure in complex not float
|
||||
static accelerator_inline GpuVectorCF rotate(GpuVectorCF in, int n){ return rotate_template(in,n/2);}
|
||||
static accelerator_inline GpuVectorCD rotate(GpuVectorCD in, int n){ return rotate_template(in,n/2);}
|
||||
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////
|
||||
// Some Template specialization
|
||||
|
||||
//Complex float Reduce
|
||||
template<>
|
||||
accelerator_inline Grid::ComplexF
|
||||
Reduce<Grid::ComplexF, GpuVectorCF>::operator()(GpuVectorCF in)
|
||||
{
|
||||
GpuComplexF greduce = in.v[0];
|
||||
for(int i=1;i<GpuVectorCF::N;i++) {
|
||||
greduce = greduce+in.v[i];
|
||||
}
|
||||
Grid::ComplexF ret(greduce.z.x,greduce.z.y);
|
||||
return ret;
|
||||
}
|
||||
|
||||
template<>
|
||||
accelerator_inline Grid::ComplexD
|
||||
Reduce<Grid::ComplexD, GpuVectorCD>::operator()(GpuVectorCD in)
|
||||
{
|
||||
GpuComplexD greduce = in.v[0];
|
||||
for(int i=1;i<GpuVectorCD::N;i++) {
|
||||
greduce = greduce+in.v[i];
|
||||
}
|
||||
Grid::ComplexD ret(greduce.z.x,greduce.z.y);
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Real
|
||||
template<>
|
||||
accelerator_inline Grid::RealF
|
||||
Reduce<RealF, GpuVectorRF>::operator()(GpuVectorRF in)
|
||||
{
|
||||
RealF ret = in.v[0];
|
||||
for(int i=1;i<GpuVectorRF::N;i++) {
|
||||
ret = ret+in.v[i];
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
template<>
|
||||
accelerator_inline Grid::RealD
|
||||
Reduce<RealD, GpuVectorRD>::operator()(GpuVectorRD in)
|
||||
{
|
||||
RealD ret = in.v[0];
|
||||
for(int i=1;i<GpuVectorRD::N;i++) {
|
||||
ret = ret+in.v[i];
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
template<>
|
||||
accelerator_inline Integer
|
||||
Reduce<Integer, GpuVectorI>::operator()(GpuVectorI in)
|
||||
{
|
||||
Integer ret = in.v[0];
|
||||
for(int i=1;i<GpuVectorI::N;i++) {
|
||||
ret = ret+in.v[i];
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
}// End optimizatoin
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
// Here assign types
|
||||
//////////////////////////////////////////////////////////////////////////////////////
|
||||
typedef GpuVectorRH SIMD_Htype; // Single precision type
|
||||
typedef GpuVectorRF SIMD_Ftype; // Single precision type
|
||||
typedef GpuVectorRD SIMD_Dtype; // Double precision type
|
||||
typedef GpuVectorI SIMD_Itype; // Integer type
|
||||
|
||||
typedef GpuVectorCH SIMD_CHtype; // Single precision type
|
||||
typedef GpuVectorCF SIMD_CFtype; // Single precision type
|
||||
typedef GpuVectorCD SIMD_CDtype; // Double precision type
|
||||
|
||||
// prefetch utilities
|
||||
accelerator_inline void v_prefetch0(int size, const char *ptr){};
|
||||
accelerator_inline void prefetch_HINT_T0(const char *ptr){};
|
||||
|
||||
// Function name aliases
|
||||
typedef Optimization::Vsplat VsplatSIMD;
|
||||
typedef Optimization::Vstore VstoreSIMD;
|
||||
typedef Optimization::Vset VsetSIMD;
|
||||
typedef Optimization::Vstream VstreamSIMD;
|
||||
template <typename S, typename T> using ReduceSIMD = Optimization::Reduce<S,T>;
|
||||
|
||||
// Arithmetic operations
|
||||
typedef Optimization::Sum SumSIMD;
|
||||
typedef Optimization::Sub SubSIMD;
|
||||
typedef Optimization::Div DivSIMD;
|
||||
typedef Optimization::Mult MultSIMD;
|
||||
typedef Optimization::MultComplex MultComplexSIMD;
|
||||
typedef Optimization::MultRealPart MultRealPartSIMD;
|
||||
typedef Optimization::MaddRealPart MaddRealPartSIMD;
|
||||
typedef Optimization::Conj ConjSIMD;
|
||||
typedef Optimization::TimesMinusI TimesMinusISIMD;
|
||||
typedef Optimization::TimesI TimesISIMD;
|
||||
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user