diff --git a/benchmarks/Makefile.am b/benchmarks/Makefile.am index 77ce7663..ae09a6db 100644 --- a/benchmarks/Makefile.am +++ b/benchmarks/Makefile.am @@ -16,7 +16,7 @@ Grid_wilson_cg_unprec_LDADD = -lGrid Grid_comms_SOURCES = Grid_comms.cc Grid_comms_LDADD = -lGrid -Grid_su3_SOURCES = Grid_su3.cc +Grid_su3_SOURCES = Grid_su3.cc Grid_su3_test.cc Grid_su3_LDADD = -lGrid Grid_memory_bandwidth_SOURCES = Grid_memory_bandwidth.cc diff --git a/lib/algorithms/LinearOperator.h b/lib/algorithms/LinearOperator.h index bdb6c387..167213ec 100644 --- a/lib/algorithms/LinearOperator.h +++ b/lib/algorithms/LinearOperator.h @@ -25,7 +25,7 @@ namespace Grid { ///////////////////////////////////////////////////////////////////////////////////////////// template class HermitianOperatorBase : public LinearOperatorBase { public: - virtual void OpAndNorm(const Field &in, Field &out,double &n1,double &n2); + virtual void OpAndNorm(const Field &in, Field &out,double &n1,double &n2)=0; void AdjOp(const Field &in, Field &out) { Op(in,out); }; diff --git a/lib/algorithms/approx/bigfloat.h b/lib/algorithms/approx/bigfloat.h index aee71d03..a749fa81 100755 --- a/lib/algorithms/approx/bigfloat.h +++ b/lib/algorithms/approx/bigfloat.h @@ -31,7 +31,7 @@ public: bigfloat(const double d) { mpf_init_set_d(x, d); } bigfloat(const char *str) { mpf_init_set_str(x, (char*)str, 10); } ~bigfloat(void) { mpf_clear(x); } - operator const double (void) const { return (double)mpf_get_d(x); } + operator double (void) const { return (double)mpf_get_d(x); } static void setDefaultPrecision(unsigned long dprec) { unsigned long bprec = (unsigned long)(3.321928094 * (double)dprec); mpf_set_default_prec(bprec); diff --git a/lib/simd/Grid_vComplexD.h b/lib/simd/Grid_vComplexD.h index 2b03c825..116c2866 100644 --- a/lib/simd/Grid_vComplexD.h +++ b/lib/simd/Grid_vComplexD.h @@ -345,20 +345,30 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){ // REDUCE FIXME must be a cleaner implementation friend inline ComplexD Reduce(const vComplexD & in) { + vComplexD v1,v2; + union { + zvec v; + double f[sizeof(zvec)/sizeof(double)]; + } conv; + #ifdef SSE4 - return ComplexD(in.v[0],in.v[1]); + v1=in; #endif #if defined(AVX1) || defined (AVX2) - vComplexD v1; permute(v1,in,0); // sse 128; paired complex single v1=v1+in; - return ComplexD(v1.v[0],v1.v[1]); #endif #ifdef AVX512 - return ComplexD(_mm512_mask_reduce_add_pd(0x55, in.v),_mm512_mask_reduce_add_pd(0xAA, in.v)); + permute(v1,in,0); // sse 128; paired complex single + v1=v1+in; + permute(v2,v1,1); // avx 256; quad complex single + v1=v1+v2; #endif #ifdef QPX +#error #endif + conv.v = v1.v; + return ComplexD(conv.f[0],conv.f[1]); } // Unary negation diff --git a/lib/simd/Grid_vComplexF.h b/lib/simd/Grid_vComplexF.h index 713bbdd8..cf74ff03 100644 --- a/lib/simd/Grid_vComplexF.h +++ b/lib/simd/Grid_vComplexF.h @@ -234,26 +234,34 @@ namespace Grid { } friend inline ComplexF Reduce(const vComplexF & in) { + vComplexF v1,v2; + union { + cvec v; + float f[sizeof(cvec)/sizeof(float)]; + } conv; #ifdef SSE4 - vComplexF v1; permute(v1,in,0); // sse 128; paired complex single v1=v1+in; - return ComplexF(v1.v[0],v1.v[1]); #endif #if defined(AVX1) || defined (AVX2) - vComplexF v1,v2; permute(v1,in,0); // sse 128; paired complex single v1=v1+in; permute(v2,v1,1); // avx 256; quad complex single v1=v1+v2; - return ComplexF(v1.v[0],v1.v[1]); #endif #ifdef AVX512 - return ComplexF(_mm512_mask_reduce_add_ps(0x5555, in.v),_mm512_mask_reduce_add_ps(0xAAAA, in.v)); + permute(v1,in,0); // avx512 octo-complex single + v1=v1+in; + permute(v2,v1,1); + v1=v1+v2; + permute(v2,v1,2); + v1=v1+v2; #endif #ifdef QPX #error #endif + conv.v = v1.v; + return ComplexF(conv.f[0],conv.f[1]); } friend inline vComplexF operator * (const ComplexF &a, vComplexF b){ diff --git a/lib/simd/Grid_vRealD.h b/lib/simd/Grid_vRealD.h index 83979d14..a32e579e 100644 --- a/lib/simd/Grid_vRealD.h +++ b/lib/simd/Grid_vRealD.h @@ -210,25 +210,33 @@ namespace Grid { friend inline RealD Reduce(const vRealD & in) { + vRealD v1,v2; + union { + dvec v; + double f[sizeof(dvec)/sizeof(double)]; + } conv; #ifdef SSE4 - vRealD v1; permute(v1,in,0); // sse 128; paired real double v1=v1+in; - return RealD(v1.v[0]); #endif #if defined(AVX1) || defined (AVX2) - vRealD v1,v2; permute(v1,in,0); // avx 256; quad double v1=v1+in; permute(v2,v1,1); v1=v1+v2; - return v1.v[0]; #endif #ifdef AVX512 - return _mm512_reduce_add_pd(in.v); + permute(v1,in,0); // avx 512; octo-double + v1=v1+in; + permute(v2,v1,1); + v1=v1+v2; + permute(v2,v1,2); + v1=v1+v2; #endif #ifdef QPX #endif + conv.v=v1.v; + return conv.f[0]; } // *=,+=,-= operators diff --git a/lib/simd/Grid_vRealF.h b/lib/simd/Grid_vRealF.h index 754c5121..05bfa2f5 100644 --- a/lib/simd/Grid_vRealF.h +++ b/lib/simd/Grid_vRealF.h @@ -243,29 +243,39 @@ friend inline void vstore(const vRealF &ret, float *a){ } friend inline RealF Reduce(const vRealF & in) { -#ifdef SSE4 vRealF v1,v2; + union { + fvec v; + float f[sizeof(fvec)/sizeof(double)]; + } conv; +#ifdef SSE4 permute(v1,in,0); // sse 128; quad single v1=v1+in; permute(v2,v1,1); v1=v1+v2; - return v1.v[0]; #endif #if defined(AVX1) || defined (AVX2) - vRealF v1,v2; permute(v1,in,0); // avx 256; octo-double v1=v1+in; permute(v2,v1,1); v1=v1+v2; permute(v2,v1,2); v1=v1+v2; - return v1.v[0]; #endif #ifdef AVX512 - return _mm512_reduce_add_ps(in.v); + permute(v1,in,0); // avx 256; octo-double + v1=v1+in; + permute(v2,v1,1); + v1=v1+v2; + permute(v2,v1,2); + v1=v1+v2; + permute(v2,v1,3); + v1=v1+v2; #endif #ifdef QPX #endif + conv.v=v1.v; + return conv.f[0]; } // *=,+=,-= operators diff --git a/tests/Grid_nersc_io.cc b/tests/Grid_nersc_io.cc index fbef3cb1..6fe587a6 100644 --- a/tests/Grid_nersc_io.cc +++ b/tests/Grid_nersc_io.cc @@ -13,7 +13,7 @@ int main (int argc, char ** argv) std::vector simd_layout = GridDefaultSimd(4,vComplexF::Nsimd()); std::vector mpi_layout = GridDefaultMpi(); - std::vector latt_size ({16,16,16,32}); + std::vector latt_size = GridDefaultLatt(); std::vector clatt_size ({4,4,4,8}); int orthodir=3; int orthosz =latt_size[orthodir]; @@ -44,13 +44,15 @@ int main (int argc, char ** argv) // (1+2+3)=6 = N(N-1)/2 terms LatticeComplex Plaq(&Fine); LatticeComplex cPlaq(&Coarse); + Plaq = zero; +#if 1 for(int mu=1;mu