mirror of
				https://github.com/paboyle/Grid.git
				synced 2025-11-02 21:14:32 +00:00 
			
		
		
		
	Wilson perf improvements with Gauge prefetching
This commit is contained in:
		@@ -11,21 +11,21 @@
 | 
			
		||||
/* #undef AVX512 */
 | 
			
		||||
 | 
			
		||||
/* GRID_COMMS_MPI */
 | 
			
		||||
/* #undef GRID_COMMS_MPI */
 | 
			
		||||
#define GRID_COMMS_MPI 1
 | 
			
		||||
 | 
			
		||||
/* GRID_COMMS_NONE */
 | 
			
		||||
#define GRID_COMMS_NONE 1
 | 
			
		||||
/* #undef GRID_COMMS_NONE */
 | 
			
		||||
 | 
			
		||||
/* Define to 1 if you have the declaration of `be64toh', and to 0 if you
 | 
			
		||||
   don't. */
 | 
			
		||||
#define HAVE_DECL_BE64TOH 1
 | 
			
		||||
#define HAVE_DECL_BE64TOH 0
 | 
			
		||||
 | 
			
		||||
/* Define to 1 if you have the declaration of `ntohll', and to 0 if you don't.
 | 
			
		||||
   */
 | 
			
		||||
#define HAVE_DECL_NTOHLL 0
 | 
			
		||||
#define HAVE_DECL_NTOHLL 1
 | 
			
		||||
 | 
			
		||||
/* Define to 1 if you have the <endian.h> header file. */
 | 
			
		||||
#define HAVE_ENDIAN_H 1
 | 
			
		||||
/* #undef HAVE_ENDIAN_H */
 | 
			
		||||
 | 
			
		||||
/* Define to 1 if you have the `gettimeofday' function. */
 | 
			
		||||
#define HAVE_GETTIMEOFDAY 1
 | 
			
		||||
@@ -34,10 +34,10 @@
 | 
			
		||||
#define HAVE_INTTYPES_H 1
 | 
			
		||||
 | 
			
		||||
/* Define to 1 if you have the <malloc.h> header file. */
 | 
			
		||||
#define HAVE_MALLOC_H 1
 | 
			
		||||
/* #undef HAVE_MALLOC_H */
 | 
			
		||||
 | 
			
		||||
/* Define to 1 if you have the <malloc/malloc.h> header file. */
 | 
			
		||||
/* #undef HAVE_MALLOC_MALLOC_H */
 | 
			
		||||
#define HAVE_MALLOC_MALLOC_H 1
 | 
			
		||||
 | 
			
		||||
/* Define to 1 if you have the <memory.h> header file. */
 | 
			
		||||
#define HAVE_MEMORY_H 1
 | 
			
		||||
@@ -78,6 +78,9 @@
 | 
			
		||||
/* Define to the one symbol short name of this package. */
 | 
			
		||||
#define PACKAGE_TARNAME "grid"
 | 
			
		||||
 | 
			
		||||
/* Define to the home page for this package. */
 | 
			
		||||
#define PACKAGE_URL ""
 | 
			
		||||
 | 
			
		||||
/* Define to the version of this package. */
 | 
			
		||||
#define PACKAGE_VERSION "1.0"
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -77,6 +77,9 @@
 | 
			
		||||
/* Define to the one symbol short name of this package. */
 | 
			
		||||
#undef PACKAGE_TARNAME
 | 
			
		||||
 | 
			
		||||
/* Define to the home page for this package. */
 | 
			
		||||
#undef PACKAGE_URL
 | 
			
		||||
 | 
			
		||||
/* Define to the version of this package. */
 | 
			
		||||
#undef PACKAGE_VERSION
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -57,6 +57,9 @@ public:
 | 
			
		||||
    friend void zeroit(iScalar<vtype> &that){
 | 
			
		||||
        zeroit(that._internal);
 | 
			
		||||
    }
 | 
			
		||||
    friend void prefetch(iScalar<vtype> &that){
 | 
			
		||||
      prefetch(that._internal);
 | 
			
		||||
    }
 | 
			
		||||
    friend void permute(iScalar<vtype> &out,const iScalar<vtype> &in,int permutetype){
 | 
			
		||||
      permute(out._internal,in._internal,permutetype);
 | 
			
		||||
    }
 | 
			
		||||
@@ -141,6 +144,9 @@ public:
 | 
			
		||||
            zeroit(that._internal[i]);
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
    friend void prefetch(iVector<vtype,N> &that){
 | 
			
		||||
      for(int i=0;i<N;i++) prefetch(that._internal[i]);
 | 
			
		||||
    }
 | 
			
		||||
    friend void vstream(iVector<vtype,N> &out,const iVector<vtype,N> &in){
 | 
			
		||||
      for(int i=0;i<N;i++){
 | 
			
		||||
	vstream(out._internal[i],in._internal[i]);
 | 
			
		||||
@@ -219,6 +225,11 @@ public:
 | 
			
		||||
	zeroit(that._internal[i][j]);
 | 
			
		||||
    }}
 | 
			
		||||
  }
 | 
			
		||||
  friend void prefetch(iMatrix<vtype,N> &that){
 | 
			
		||||
    for(int i=0;i<N;i++) 
 | 
			
		||||
    for(int j=0;j<N;j++) 
 | 
			
		||||
      prefetch(that._internal[i][j]);
 | 
			
		||||
  }
 | 
			
		||||
  friend void vstream(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in){
 | 
			
		||||
      for(int i=0;i<N;i++){
 | 
			
		||||
      for(int j=0;j<N;j++){
 | 
			
		||||
 
 | 
			
		||||
@@ -106,7 +106,9 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
  for(int sss=0;sss<grid->oSites();sss++){
 | 
			
		||||
 | 
			
		||||
    int ss = sss;
 | 
			
		||||
	//int ss = Stencil._LebesgueReorder[sss];
 | 
			
		||||
    int ssu= sss;
 | 
			
		||||
    //int ss = 0;
 | 
			
		||||
    //int ss = Stencil._LebesgueReorder[sss];
 | 
			
		||||
 | 
			
		||||
    // Xp
 | 
			
		||||
    offset = Stencil._offsets [Xp][ss];
 | 
			
		||||
@@ -123,7 +125,8 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ss](Xp),&chi());
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Xp),&chi());
 | 
			
		||||
    prefetch(Umu._odata[ssu](Yp));
 | 
			
		||||
    spReconXp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Yp
 | 
			
		||||
@@ -141,7 +144,8 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ss](Yp),&chi());
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Yp),&chi());
 | 
			
		||||
    prefetch(Umu._odata[ssu](Zp));
 | 
			
		||||
    accumReconYp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Zp
 | 
			
		||||
@@ -159,7 +163,8 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ss](Zp),&chi());
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Zp),&chi());
 | 
			
		||||
    prefetch(Umu._odata[ssu](Tp));
 | 
			
		||||
    accumReconZp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Tp
 | 
			
		||||
@@ -177,7 +182,8 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ss](Tp),&chi());
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Tp),&chi());
 | 
			
		||||
    prefetch(Umu._odata[ssu](Xm));
 | 
			
		||||
    accumReconTp(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Xm
 | 
			
		||||
@@ -195,7 +201,8 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ss](Xm),&chi());
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Xm),&chi());
 | 
			
		||||
    prefetch(Umu._odata[ssu](Ym));
 | 
			
		||||
    accumReconXm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -214,7 +221,8 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ss](Ym),&chi());
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Ym),&chi());
 | 
			
		||||
    prefetch(Umu._odata[ssu](Zm));
 | 
			
		||||
    accumReconYm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Zm
 | 
			
		||||
@@ -232,7 +240,8 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ss](Zm),&chi());
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Zm),&chi());
 | 
			
		||||
    prefetch(Umu._odata[ssu](Tm));
 | 
			
		||||
    accumReconZm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    // Tm
 | 
			
		||||
@@ -250,7 +259,7 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
 | 
			
		||||
    } else { 
 | 
			
		||||
      chi=comm_buf[offset];
 | 
			
		||||
    }
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ss](Tm),&chi());
 | 
			
		||||
    mult(&Uchi(),&Umu._odata[ssu](Tm),&chi());
 | 
			
		||||
    accumReconTm(result,Uchi);
 | 
			
		||||
 | 
			
		||||
    vstream(out._odata[ss],result);
 | 
			
		||||
 
 | 
			
		||||
@@ -257,7 +257,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
 | 
			
		||||
	  assert(0);
 | 
			
		||||
#endif
 | 
			
		||||
	}
 | 
			
		||||
      friend inline void vprefetch(const vComplexD &v)
 | 
			
		||||
      friend inline void prefetch(const vComplexD &v)
 | 
			
		||||
        {
 | 
			
		||||
            _mm_prefetch((const char*)&v.v,_MM_HINT_T0);
 | 
			
		||||
        }
 | 
			
		||||
 
 | 
			
		||||
@@ -206,7 +206,7 @@ namespace Grid {
 | 
			
		||||
	  assert(0);
 | 
			
		||||
#endif
 | 
			
		||||
	}
 | 
			
		||||
      friend inline void vprefetch(const vComplexF &v)
 | 
			
		||||
      friend inline void prefetch(const vComplexF &v)
 | 
			
		||||
        {
 | 
			
		||||
            _mm_prefetch((const char*)&v.v,_MM_HINT_T0);
 | 
			
		||||
        }
 | 
			
		||||
 
 | 
			
		||||
@@ -190,7 +190,7 @@ namespace Grid {
 | 
			
		||||
	  out=in;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        friend inline void vprefetch(const vInteger &v)
 | 
			
		||||
        friend inline void prefetch(const vInteger &v)
 | 
			
		||||
        {
 | 
			
		||||
            _mm_prefetch((const char*)&v.v,_MM_HINT_T0);
 | 
			
		||||
        }
 | 
			
		||||
 
 | 
			
		||||
@@ -191,7 +191,7 @@ namespace Grid {
 | 
			
		||||
	  assert(0);
 | 
			
		||||
#endif
 | 
			
		||||
	}
 | 
			
		||||
        friend inline void vprefetch(const vRealD &v)
 | 
			
		||||
        friend inline void prefetch(const vRealD &v)
 | 
			
		||||
        {
 | 
			
		||||
            _mm_prefetch((const char*)&v.v,_MM_HINT_T0);
 | 
			
		||||
        }
 | 
			
		||||
 
 | 
			
		||||
@@ -225,7 +225,7 @@ friend inline void vstore(const vRealF &ret, float *a){
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
        friend inline void vprefetch(const vRealF &v)
 | 
			
		||||
        friend inline void prefetch(const vRealF &v)
 | 
			
		||||
        {
 | 
			
		||||
            _mm_prefetch((const char*)&v.v,_MM_HINT_T0);
 | 
			
		||||
        }
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user