1
0
mirror of https://github.com/paboyle/Grid.git synced 2025-06-13 12:47:05 +01:00

Got unpreconditioned conjugate gradient to run and converge on a random (uniform random,

not even SU(3) for now) gauge field. Convergence history is correctly independent of decomposition
on 1,2,4,8,16 mpi tasks.
Found a couple of SIMD bugs which required fixing, and enhanced the Grid_simd.cc test suite.
Implemented the Mdag, M, MdagM, Meooe, and Mooee Schur-type routines in the Wilson dop.
This commit is contained in:
Peter Boyle
2015-05-19 13:57:35 +01:00
parent 6f387b4916
commit a6e1ea216d
33 changed files with 566 additions and 316 deletions

View File

@ -32,7 +32,7 @@ namespace Grid {
friend inline void mult(vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) * (*r);}
friend inline void sub (vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) - (*r);}
friend inline void add (vComplexD * __restrict__ y,const vComplexD * __restrict__ l,const vComplexD *__restrict__ r) {*y = (*l) + (*r);}
friend inline vComplexD adj(const vComplexD &in){ return conj(in); }
friend inline vComplexD adj(const vComplexD &in){ return conjugate(in); }
//////////////////////////////////
// Initialise to 1,0,i
@ -166,11 +166,11 @@ namespace Grid {
// all subtypes; may not be a good assumption, but could
// add the vector width as a template param for BG/Q for example
////////////////////////////////////////////////////////////////////
/*
friend inline void permute(vComplexD &y,vComplexD b,int perm)
{
Gpermute<vComplexD>(y,b,perm);
}
/*
friend inline void merge(vComplexD &y,std::vector<ComplexD *> &extracted)
{
Gmerge<vComplexD,ComplexD >(y,extracted);
@ -269,7 +269,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
////////////////////////
// Conjugate
////////////////////////
friend inline vComplexD conj(const vComplexD &in){
friend inline vComplexD conjugate(const vComplexD &in){
vComplexD ret ; vzero(ret);
#if defined (AVX1)|| defined (AVX2)
// addsubps 0, inv=>0+in.v[3] 0-in.v[2], 0+in.v[1], 0-in.v[0], ...
@ -345,17 +345,17 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
// REDUCE FIXME must be a cleaner implementation
friend inline ComplexD Reduce(const vComplexD & in)
{
#if defined SSE4
return ComplexD(in.v[0], in.v[1]); // inefficient
#ifdef SSE4
return ComplexD(in.v[0],in.v[1]);
#endif
#if defined(AVX1) || defined (AVX2)
vComplexD v1;
permute(v1,in,0); // avx 256; paired complex double
v1=v1+in;
return ComplexD(v1.v[0],v1.v[1]);
#endif
#if defined (AVX1) || defined(AVX2)
// return std::complex<double>(_mm256_mask_reduce_add_pd(0x55, in.v),_mm256_mask_reduce_add_pd(0xAA, in.v));
__attribute__ ((aligned(32))) double c_[4];
_mm256_store_pd(c_,in.v);
return ComplexD(c_[0]+c_[2],c_[1]+c_[3]);
#endif
#ifdef AVX512
return ComplexD(_mm512_mask_reduce_add_pd(0x55, in.v),_mm512_mask_reduce_add_pd(0xAA, in.v));
return ComplexD(_mm512_mask_reduce_add_pd(0x55, in.v),_mm512_mask_reduce_add_pd(0xAA, in.v));
#endif
#ifdef QPX
#endif
@ -387,7 +387,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
};
inline vComplexD innerProduct(const vComplexD & l, const vComplexD & r) { return conj(l)*r; }
inline vComplexD innerProduct(const vComplexD & l, const vComplexD & r) { return conjugate(l)*r; }
typedef vComplexD vDComplex;

View File

@ -47,7 +47,7 @@ namespace Grid {
friend inline void mult(vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) * (*r); }
friend inline void sub (vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) - (*r); }
friend inline void add (vComplexF * __restrict__ y,const vComplexF * __restrict__ l,const vComplexF *__restrict__ r){ *y = (*l) + (*r); }
friend inline vComplexF adj(const vComplexF &in){ return conj(in); }
friend inline vComplexF adj(const vComplexF &in){ return conjugate(in); }
//////////////////////////////////
// Initialise to 1,0,i
@ -228,42 +228,25 @@ namespace Grid {
ret.v = {a,b,a,b};
#endif
}
friend inline ComplexF Reduce(const vComplexF & in)
{
friend inline void permute(vComplexF &y,vComplexF b,int perm)
{
Gpermute<vComplexF>(y,b,perm);
}
friend inline ComplexF Reduce(const vComplexF & in)
{
#ifdef SSE4
union {
cvec v1; // SSE 4 x float vector
float f[4]; // scalar array of 4 floats
} u128;
u128.v1= _mm_add_ps(in.v, _mm_shuffle_ps(in.v,in.v, 0b01001110)); // FIXME Prefer to use _MM_SHUFFLE macros
return ComplexF(u128.f[0], u128.f[1]);
vComplexF v1;
permute(v1,in,0); // sse 128; paired complex single
v1=v1+in;
return ComplexF(v1.v[0],v1.v[1]);
#endif
#ifdef AVX1
//it would be better passing 2 arguments to saturate the vector lanes
union {
__m256 v1;
float f[8];
} u256;
//SWAP lanes
// FIXME .. icc complains with lib/lattice/Grid_lattice_reduction.h (49): (col. 20) warning #13211: Immediate parameter to intrinsic call too large
__m256 t0 = _mm256_permute2f128_ps(in.v, in.v, 1);
__m256 t1 = _mm256_permute_ps(in.v , 0b11011000);//real (0,2,1,3)
__m256 t2 = _mm256_permute_ps(t0 , 0b10001101);//imag (1,3,0,2)
t0 = _mm256_blend_ps(t1, t2, 0b0101000001010000);// (0,0,1,1,0,0,1,1)
t1 = _mm256_hadd_ps(t0,t0);
u256.v1 = _mm256_hadd_ps(t1, t1);
return ComplexF(u256.f[0], u256.f[4]);
#endif
#ifdef AVX2
union {
__m256 v1;
float f[8];
} u256;
const __m256i mask= _mm256_set_epi32( 7, 5, 3, 1, 6, 4, 2, 0);
__m256 tmp1 = _mm256_permutevar8x32_ps(in.v, mask);
__m256 tmp2 = _mm256_hadd_ps(tmp1, tmp1);
u256.v1 = _mm256_hadd_ps(tmp2, tmp2);
return ComplexF(u256.f[0], u256.f[4]);
#if defined(AVX1) || defined (AVX2)
vComplexF v1,v2;
permute(v1,in,0); // sse 128; paired complex single
v1=v1+in;
permute(v2,v1,1); // avx 256; quad complex single
v1=v1+v2;
return ComplexF(v1.v[0],v1.v[1]);
#endif
#ifdef AVX512
return ComplexF(_mm512_mask_reduce_add_ps(0x5555, in.v),_mm512_mask_reduce_add_ps(0xAAAA, in.v));
@ -345,13 +328,10 @@ namespace Grid {
// Conjugate
///////////////////////
friend inline vComplexF conj(const vComplexF &in){
friend inline vComplexF conjugate(const vComplexF &in){
vComplexF ret ; vzero(ret);
#if defined (AVX1)|| defined (AVX2)
cvec tmp;
tmp = _mm256_addsub_ps(ret.v,_mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1))); // ymm1 <- br,bi
ret.v=_mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
ret.v = _mm256_xor_ps(_mm256_addsub_ps(ret.v,in.v), _mm256_set1_ps(-0.f));
#endif
#ifdef SSE4
ret.v = _mm_xor_ps(_mm_addsub_ps(ret.v,in.v), _mm_set1_ps(-0.f));
@ -433,10 +413,6 @@ namespace Grid {
return *this;
}
friend inline void permute(vComplexF &y,vComplexF b,int perm)
{
Gpermute<vComplexF>(y,b,perm);
}
/*
friend inline void merge(vComplexF &y,std::vector<ComplexF *> &extracted)
{
@ -460,7 +436,7 @@ namespace Grid {
inline vComplexF innerProduct(const vComplexF & l, const vComplexF & r)
{
return conj(l)*r;
return conjugate(l)*r;
}
inline void zeroit(vComplexF &z){ vzero(z);}

View File

@ -117,7 +117,7 @@ namespace Grid {
};
///////////////////////////////////////////////
// mult, sub, add, adj,conj, mac functions
// mult, sub, add, adj,conjugate, mac functions
///////////////////////////////////////////////
friend inline void mult(vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) * (*r);}
friend inline void sub (vInteger * __restrict__ y,const vInteger * __restrict__ l,const vInteger *__restrict__ r) {*y = (*l) - (*r);}

View File

@ -26,7 +26,7 @@ namespace Grid {
friend inline void sub (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) - (*r);}
friend inline void add (vRealD * __restrict__ y,const vRealD * __restrict__ l,const vRealD *__restrict__ r) {*y = (*l) + (*r);}
friend inline vRealD adj(const vRealD &in) { return in; }
friend inline vRealD conj(const vRealD &in){ return in; }
friend inline vRealD conjugate(const vRealD &in){ return in; }
friend inline void mac (vRealD &y,const vRealD a,const vRealD x){
#if defined (AVX1) || defined (SSE4)
@ -112,11 +112,12 @@ namespace Grid {
// all subtypes; may not be a good assumption, but could
// add the vector width as a template param for BG/Q for example
////////////////////////////////////////////////////////////////////
/*
friend inline void permute(vRealD &y,vRealD b,int perm)
{
Gpermute<vRealD>(y,b,perm);
}
/*
friend inline void merge(vRealD &y,std::vector<RealD *> &extracted)
{
Gmerge<vRealD,RealD >(y,extracted);
@ -209,48 +210,26 @@ namespace Grid {
friend inline RealD Reduce(const vRealD & in)
{
#if defined (SSE4)
// FIXME Hack
const RealD * ptr =(const RealD *) &in;
RealD ret = 0;
for(int i=0;i<vRealD::Nsimd();i++){
ret = ret+ptr[i];
}
return ret;
#ifdef SSE4
vRealD v1;
permute(v1,in,0); // sse 128; paired real double
v1=v1+in;
return RealD(v1.v[0]);
#endif
#if defined (AVX1) || defined(AVX2)
typedef union {
uint64_t l;
double d;
} my_conv_t;
my_conv_t converter;
// more reduce_add
/*
__attribute__ ((aligned(32))) double c_[16];
__m256d tmp = _mm256_permute2f128_pd(in.v,in.v,0x01); // tmp 1032; in= 3210
__m256d hadd = _mm256_hadd_pd(in.v,tmp); // hadd = 1+0,3+2,3+2,1+0
tmp = _mm256_permute2f128_pd(hadd,hadd,0x01);// tmp = 3+2,1+0,1+0,3+2
hadd = _mm256_hadd_pd(tmp,tmp); // tmp = 3+2+1+0,3+2+1+0,1+0+3+2,1+0+3+2
_mm256_store_pd(c_,hadd);
return c[0]
*/
__m256d tmp = _mm256_permute2f128_pd(in.v,in.v,0x01); // tmp 1032; in= 3210
__m256d hadd = _mm256_hadd_pd(in.v,tmp); // hadd = 1+0,3+2,3+2,1+0
hadd = _mm256_hadd_pd(hadd,hadd); // hadd = 1+0+3+2...
converter.l = _mm256_extract_epi64((ivec)hadd,0);
return converter.d;
#if defined(AVX1) || defined (AVX2)
vRealD v1,v2;
permute(v1,in,0); // avx 256; quad double
v1=v1+in;
permute(v2,v1,1);
v1=v1+v2;
return v1.v[0];
#endif
#ifdef AVX512
return _mm512_reduce_add_pd(in.v);
/*
__attribute__ ((aligned(32))) double c_[8];
_mm512_store_pd(c_,in.v);
return c_[0]+c_[1]+c_[2]+c_[3]+c_[4]+c_[5]+c_[6]+c_[7];
*/
#endif
#ifdef QPX
#endif
}
}
// *=,+=,-= operators
inline vRealD &operator *=(const vRealD &r) {
@ -270,7 +249,7 @@ namespace Grid {
static int Nsimd(void) { return sizeof(dvec)/sizeof(double);}
};
inline vRealD innerProduct(const vRealD & l, const vRealD & r) { return conj(l)*r; }
inline vRealD innerProduct(const vRealD & l, const vRealD & r) { return conjugate(l)*r; }
inline void zeroit(vRealD &z){ vzero(z);}
inline vRealD outerProduct(const vRealD &l, const vRealD& r)

View File

@ -92,13 +92,13 @@ namespace Grid {
};
///////////////////////////////////////////////
// mult, sub, add, adj,conj, mac functions
// mult, sub, add, adj,conjugate, mac functions
///////////////////////////////////////////////
friend inline void mult(vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) * (*r);}
friend inline void sub (vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) - (*r);}
friend inline void add (vRealF * __restrict__ y,const vRealF * __restrict__ l,const vRealF *__restrict__ r) {*y = (*l) + (*r);}
friend inline vRealF adj(const vRealF &in) { return in; }
friend inline vRealF conj(const vRealF &in){ return in; }
friend inline vRealF conjugate(const vRealF &in){ return in; }
friend inline void mac (vRealF &y,const vRealF a,const vRealF x){
#if defined (AVX1) || defined (SSE4)
@ -133,11 +133,12 @@ namespace Grid {
// all subtypes; may not be a good assumption, but could
// add the vector width as a template param for BG/Q for example
////////////////////////////////////////////////////////////////////
/*
friend inline void permute(vRealF &y,vRealF b,int perm)
{
Gpermute<vRealF>(y,b,perm);
}
/*
friend inline void merge(vRealF &y,std::vector<RealF *> &extracted)
{
Gmerge<vRealF,RealF >(y,extracted);
@ -155,7 +156,6 @@ namespace Grid {
Gextract<vRealF,RealF>(y,extracted);
}
*/
/////////////////////////////////////////////////////
// Broadcast a value across Nsimd copies.
@ -243,33 +243,26 @@ friend inline void vstore(const vRealF &ret, float *a){
}
friend inline RealF Reduce(const vRealF & in)
{
#if defined (SSE4)
// FIXME Hack
const RealF * ptr = (const RealF *) &in;
RealF ret = 0;
for(int i=0;i<vRealF::Nsimd();i++){
ret = ret+ptr[i];
}
return ret;
#ifdef SSE4
vRealF v1,v2;
permute(v1,in,0); // sse 128; quad single
v1=v1+in;
permute(v2,v1,1);
v1=v1+v2;
return v1.v[0];
#endif
#if defined (AVX1) || defined(AVX2)
__attribute__ ((aligned(32))) float c_[16];
__m256 tmp = _mm256_permute2f128_ps(in.v,in.v,0x01);
__m256 hadd = _mm256_hadd_ps(in.v,tmp);
tmp = _mm256_permute2f128_ps(hadd,hadd,0x01);
hadd = _mm256_hadd_ps(tmp,tmp);
_mm256_store_ps(c_,hadd);
return (float)c_[0];
#if defined(AVX1) || defined (AVX2)
vRealF v1,v2;
permute(v1,in,0); // avx 256; octo-single
v1=v1+in;
permute(v2,v1,1);
v1=v1+v2;
permute(v2,v1,2);
v1=v1+v2;
return v1.v[0];
#endif
#ifdef AVX512
return _mm512_reduce_add_ps(in.v);
/*
__attribute__ ((aligned(64))) float c_[16];
_mm512_store_ps(c_,in.v);
return c_[0]+c_[1]+c_[2]+c_[3]+c_[4]+c_[5]+c_[6]+c_[7]
+c_[8]+c_[9]+c_[10]+c_[11]+c_[12]+c_[13]+c_[14]+c_[15];
*/
#endif
#ifdef QPX
#endif
@ -291,7 +284,7 @@ friend inline void vstore(const vRealF &ret, float *a){
public:
static inline int Nsimd(void) { return sizeof(fvec)/sizeof(float);}
};
inline vRealF innerProduct(const vRealF & l, const vRealF & r) { return conj(l)*r; }
inline vRealF innerProduct(const vRealF & l, const vRealF & r) { return conjugate(l)*r; }
inline void zeroit(vRealF &z){ vzero(z);}
inline vRealF outerProduct(const vRealF &l, const vRealF& r)

View File

@ -79,7 +79,7 @@ namespace Grid {
friend inline void mult(Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) * (*r); }
friend inline void sub (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) - (*r); }
friend inline void add (Grid_simd * __restrict__ y,const Grid_simd * __restrict__ l,const Grid_simd *__restrict__ r){ *y = (*l) + (*r); }
friend inline Grid_simd adj(const Grid_simd &in){ return conj(in); }
friend inline Grid_simd adj(const Grid_simd &in){ return conjugate(in); }
//////////////////////////////////
// Initialise to 1,0,i
@ -193,7 +193,7 @@ namespace Grid {
// Conjugate
///////////////////////
friend inline Grid_simd conj(const Grid_simd &in){
friend inline Grid_simd conjugate(const Grid_simd &in){
Grid_simd ret ; vzero(ret);
// FIXME add operator
return ret;
@ -265,7 +265,7 @@ namespace Grid {
template<class scalar_type, class vector_type >
inline Grid_simd< scalar_type, vector_type> innerProduct(const Grid_simd< scalar_type, vector_type> & l, const Grid_simd< scalar_type, vector_type> & r)
{
return conj(l)*r;
return conjugate(l)*r;
}
template<class scalar_type, class vector_type >