mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-03 21:44:33 +00:00 
			
		
		
		
	Big updates with progress towards wilson matrix
This commit is contained in:
		@@ -251,13 +251,13 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
 | 
			
		||||
        friend inline vComplexD conj(const vComplexD &in){
 | 
			
		||||
            vComplexD ret ; vzero(ret);
 | 
			
		||||
#if defined (AVX1)|| defined (AVX2)
 | 
			
		||||
            // addsubps 0, inv=>0+in.v[3] 0-in.v[2], 0+in.v[1], 0-in.v[0], ...
 | 
			
		||||
	    //            __m256d tmp = _mm256_addsub_pd(ret.v,_mm256_shuffle_pd(in.v,in.v,0x5));
 | 
			
		||||
	    //             ret.v=_mm256_shuffle_pd(tmp,tmp,0x5);
 | 
			
		||||
            ret.v = _mm256_addsub_pd(ret.v,in.v);
 | 
			
		||||
	    //	    addsubps 0, inv=>0+in.v[3] 0-in.v[2], 0+in.v[1], 0-in.v[0], ...
 | 
			
		||||
	    zvec tmp = _mm256_addsub_pd(ret.v,_mm256_shuffle_pd(in.v,in.v,0x5));
 | 
			
		||||
	    ret.v    =_mm256_shuffle_pd(tmp,tmp,0x5);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
            ret.v = _mm_addsub_pd(ret.v,in.v);
 | 
			
		||||
	    zvec tmp = _mm_addsub_pd(ret.v,_mm_shuffle_pd(in.v,in.v,0x1));
 | 
			
		||||
	    ret.v    = _mm_shuffle_pd(tmp,tmp,0x1);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
	    ret.v = _mm512_mask_sub_pd(in.v, 0xaaaa,ret.v, in.v);             
 | 
			
		||||
@@ -268,48 +268,41 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
 | 
			
		||||
            return ret;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        friend inline vComplexD timesI(const vComplexD &in){
 | 
			
		||||
        friend inline vComplexD timesMinusI(const vComplexD &in){
 | 
			
		||||
	  vComplexD ret; vzero(ret);
 | 
			
		||||
	  vComplexD tmp;
 | 
			
		||||
#if defined (AVX1)|| defined (AVX2)
 | 
			
		||||
	  cvec tmp =_mm256_addsub_ps(ret.v,in.v); // r,-i
 | 
			
		||||
	  /*
 | 
			
		||||
             IF IMM0[0] = 0
 | 
			
		||||
             THEN DEST[63:0]=SRC1[63:0] ELSE DEST[63:0]=SRC1[127:64] FI;
 | 
			
		||||
             IF IMM0[1] = 0
 | 
			
		||||
             THEN DEST[127:64]=SRC2[63:0] ELSE DEST[127:64]=SRC2[127:64] FI;
 | 
			
		||||
             IF IMM0[2] = 0
 | 
			
		||||
             THEN DEST[191:128]=SRC1[191:128] ELSE DEST[191:128]=SRC1[255:192] FI;
 | 
			
		||||
             IF IMM0[3] = 0
 | 
			
		||||
             THEN DEST[255:192]=SRC2[191:128] ELSE DEST[255:192]=SRC2[255:192] FI;
 | 
			
		||||
	  */
 | 
			
		||||
          ret.v    =_mm256_shuffle_ps(tmp,tmp,0x5);
 | 
			
		||||
	  tmp.v    =_mm256_addsub_pd(ret.v,in.v); // r,-i
 | 
			
		||||
          ret.v    =_mm256_shuffle_pd(tmp.v,tmp.v,0x5);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
	  cvec tmp =_mm_addsub_ps(ret.v,in.v); // r,-i
 | 
			
		||||
          ret.v    =_mm_shuffle_ps(tmp,tmp,0x5);
 | 
			
		||||
	  tmp.v    =_mm_addsub_pd(ret.v,in.v); // r,-i
 | 
			
		||||
          ret.v    =_mm_shuffle_pd(tmp.v,tmp.v,0x1);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
          ret.v = _mm512_mask_sub_ps(in.v,0xaaaa,ret.v,in.v); // real -imag 
 | 
			
		||||
	  ret.v = _mm512_swizzle_ps(ret.v, _MM_SWIZ_REG_CDAB);// OK
 | 
			
		||||
          ret.v = _mm512_mask_sub_pd(in.v,0xaaaa,ret.v,in.v); // real -imag 
 | 
			
		||||
	  ret.v = _mm512_swizzle_pd(ret.v, _MM_SWIZ_REG_CDAB);// OK
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef QPX
 | 
			
		||||
            assert(0);
 | 
			
		||||
#endif
 | 
			
		||||
	  return ret;
 | 
			
		||||
	}
 | 
			
		||||
        friend inline vComplexD timesMinusI(const vComplexD &in){
 | 
			
		||||
 | 
			
		||||
        friend inline vComplexD timesI(const vComplexD &in){
 | 
			
		||||
	  vComplexD ret; vzero(ret);
 | 
			
		||||
	  vComplexD tmp;
 | 
			
		||||
#if defined (AVX1)|| defined (AVX2)
 | 
			
		||||
	  cvec tmp =_mm256_shuffle_ps(in.v,in.v,0x5);
 | 
			
		||||
          ret.v    =_mm256_addsub_ps(ret.v,tmp); // i,-r
 | 
			
		||||
	  tmp.v    =_mm256_shuffle_pd(in.v,in.v,0x5);
 | 
			
		||||
          ret.v    =_mm256_addsub_pd(ret.v,tmp.v); // i,-r
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
	  cvec tmp =_mm_shuffle_ps(in.v,in.v,0x5);
 | 
			
		||||
          ret.v    =_mm_addsub_ps(ret.v,tmp); // r,-i
 | 
			
		||||
	  tmp.v    =_mm_shuffle_pd(in.v,in.v,0x1);
 | 
			
		||||
          ret.v    =_mm_addsub_pd(ret.v,tmp.v); // r,-i
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
          cvec tmp = _mm512_swizzle_ps(in.v, _MM_SWIZ_REG_CDAB);// OK
 | 
			
		||||
	  ret.v    = _mm512_mask_sub_ps(tmp,0xaaaa,ret.v,tmp); // real -imag 
 | 
			
		||||
          tmp.v    = _mm512_swizzle_pd(in.v, _MM_SWIZ_REG_CDAB);// OK
 | 
			
		||||
	  ret.v    = _mm512_mask_sub_pd(tmp.v,0xaaaa,ret.v,tmp.v); // real -imag 
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef QPX
 | 
			
		||||
            assert(0);
 | 
			
		||||
 
 | 
			
		||||
@@ -214,10 +214,10 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
 | 
			
		||||
       {
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
	   union {
 | 
			
		||||
	     __m128 v1;    // SSE 4 x float vector
 | 
			
		||||
	     cvec v1;    // SSE 4 x float vector
 | 
			
		||||
	     float f[4];  // scalar array of 4 floats
 | 
			
		||||
	   } u128;
 | 
			
		||||
	   u128.v1= _mm_add_ps(v, _mm_shuffle_ps(v, v, 0b01001110)); // FIXME Prefer to use _MM_SHUFFLE macros
 | 
			
		||||
	   u128.v1= _mm_add_ps(in.v, _mm_shuffle_ps(in.v,in.v, 0b01001110)); // FIXME Prefer to use _MM_SHUFFLE macros
 | 
			
		||||
	   return ComplexF(u128.f[0], u128.f[1]);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef AVX1
 | 
			
		||||
@@ -329,13 +329,15 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
 | 
			
		||||
        friend inline vComplexF conj(const vComplexF &in){
 | 
			
		||||
            vComplexF ret ; vzero(ret);
 | 
			
		||||
#if defined (AVX1)|| defined (AVX2)
 | 
			
		||||
	    //             cvec tmp;
 | 
			
		||||
	    //             tmp = _mm256_addsub_ps(ret.v,_mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1))); // ymm1 <- br,bi
 | 
			
		||||
	    //             ret.v=_mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
 | 
			
		||||
	    ret.v = _mm256_addsub_ps(ret.v,in.v);
 | 
			
		||||
	    cvec tmp;
 | 
			
		||||
	    tmp = _mm256_addsub_ps(ret.v,_mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1))); // ymm1 <- br,bi
 | 
			
		||||
	    ret.v=_mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
            ret.v = _mm_addsub_ps(ret.v,in.v);
 | 
			
		||||
	    cvec tmp;
 | 
			
		||||
	    tmp = _mm_addsub_ps(ret.v,_mm_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1))); // ymm1 <- br,bi
 | 
			
		||||
	    ret.v=_mm_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
            ret.v = _mm512_mask_sub_ps(in.v,0xaaaa,ret.v,in.v); // Zero out 0+real 0-imag 
 | 
			
		||||
@@ -345,15 +347,16 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
 | 
			
		||||
#endif
 | 
			
		||||
            return ret;
 | 
			
		||||
        }
 | 
			
		||||
        friend inline vComplexF timesI(const vComplexF &in){
 | 
			
		||||
	  vComplexF ret; vzero(ret);
 | 
			
		||||
        friend inline vComplexF timesMinusI(const vComplexF &in){
 | 
			
		||||
	  vComplexF ret; 
 | 
			
		||||
	  vzero(ret);
 | 
			
		||||
#if defined (AVX1)|| defined (AVX2)
 | 
			
		||||
	  cvec tmp =_mm256_addsub_ps(ret.v,in.v); // r,-i
 | 
			
		||||
          ret.v = _mm256_shuffle_ps(tmp,tmp,0x5);
 | 
			
		||||
          ret.v = _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1)); //-i,r
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
	  cvec tmp =_mm_addsub_ps(ret.v,in.v); // r,-i
 | 
			
		||||
          ret.v = _mm_shuffle_ps(tmp,tmp,0x5);
 | 
			
		||||
          ret.v = _mm_shuffle_ps(tmp,tmp,_MM_SHUFFLE(2,3,0,1));
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
          ret.v = _mm512_mask_sub_ps(in.v,0xaaaa,ret.v,in.v); // real -imag 
 | 
			
		||||
@@ -364,14 +367,14 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
 | 
			
		||||
#endif
 | 
			
		||||
	  return ret;
 | 
			
		||||
	}
 | 
			
		||||
        friend inline vComplexF timesMinusI(const vComplexF &in){
 | 
			
		||||
        friend inline vComplexF timesI(const vComplexF &in){
 | 
			
		||||
	  vComplexF ret; vzero(ret);
 | 
			
		||||
#if defined (AVX1)|| defined (AVX2)
 | 
			
		||||
	  cvec tmp =_mm256_shuffle_ps(in.v,in.v,0x5);
 | 
			
		||||
          ret.v = _mm256_addsub_ps(ret.v,tmp); // i,-r
 | 
			
		||||
	  cvec tmp =_mm256_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1));//i,r
 | 
			
		||||
          ret.v    =_mm256_addsub_ps(ret.v,tmp);     //i,-r
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
	  cvec tmp =_mm_shuffle_ps(in.v,in.v,0x5);
 | 
			
		||||
	  cvec tmp =_mm_shuffle_ps(in.v,in.v,_MM_SHUFFLE(2,3,0,1));
 | 
			
		||||
          ret.v = _mm_addsub_ps(ret.v,tmp); // r,-i
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
@@ -443,5 +446,8 @@ friend inline void vstore(const vComplexF &ret, ComplexF *a){
 | 
			
		||||
    inline vComplexF trace(const vComplexF &arg){
 | 
			
		||||
        return arg;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
@@ -146,7 +146,7 @@ namespace Grid {
 | 
			
		||||
            ret.v = _mm256_set_pd(a[3],a[2],a[1],a[0]);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
            ret.v = _mm_set_pd(a[0],a[1]);
 | 
			
		||||
            ret.v = _mm_set_pd(a[1],a[0]);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
            ret.v = _mm512_set_pd(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
 | 
			
		||||
@@ -186,6 +186,15 @@ namespace Grid {
 | 
			
		||||
 | 
			
		||||
       friend inline RealD Reduce(const vRealD & in)
 | 
			
		||||
       {
 | 
			
		||||
#if defined (SSE4)
 | 
			
		||||
	 // FIXME Hack
 | 
			
		||||
	 const RealD * ptr =(const RealD *)  &in;
 | 
			
		||||
	 RealD ret = 0; 
 | 
			
		||||
	 for(int i=0;i<vRealD::Nsimd();i++){
 | 
			
		||||
	   ret = ret+ptr[i];
 | 
			
		||||
	 }
 | 
			
		||||
	 return ret;
 | 
			
		||||
#endif
 | 
			
		||||
#if defined (AVX1) || defined(AVX2)
 | 
			
		||||
	 typedef union  {
 | 
			
		||||
	   uint64_t l;
 | 
			
		||||
 
 | 
			
		||||
@@ -175,7 +175,7 @@ namespace Grid {
 | 
			
		||||
            ret.v = _mm256_set_ps(a[7],a[6],a[5],a[4],a[3],a[2],a[1],a[0]);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef SSE4
 | 
			
		||||
            ret.v = _mm_set_ps(a[0],a[1],a[2],a[3]);
 | 
			
		||||
            ret.v = _mm_set_ps(a[3],a[2],a[1],a[0]);
 | 
			
		||||
#endif
 | 
			
		||||
#ifdef AVX512
 | 
			
		||||
            ret.v = _mm512_set_ps( a[15],a[14],a[13],a[12],a[11],a[10],a[9],a[8],
 | 
			
		||||
@@ -220,6 +220,15 @@ friend inline void vstore(const vRealF &ret, float *a){
 | 
			
		||||
        }
 | 
			
		||||
       friend inline RealF Reduce(const vRealF & in)
 | 
			
		||||
       {
 | 
			
		||||
#if defined (SSE4)
 | 
			
		||||
	 // FIXME Hack
 | 
			
		||||
	 const RealF * ptr = (const RealF *) &in;
 | 
			
		||||
	 RealF ret = 0; 
 | 
			
		||||
	 for(int i=0;i<vRealF::Nsimd();i++){
 | 
			
		||||
	   ret = ret+ptr[i];
 | 
			
		||||
	 }
 | 
			
		||||
	 return ret;
 | 
			
		||||
#endif
 | 
			
		||||
#if defined (AVX1) || defined(AVX2)
 | 
			
		||||
            __attribute__ ((aligned(32))) float c_[16];
 | 
			
		||||
            __m256 tmp = _mm256_permute2f128_ps(in.v,in.v,0x01);
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user