mirror of
https://github.com/paboyle/Grid.git
synced 2025-06-12 20:27:06 +01:00
Wilson perf improvements with Gauge prefetching
This commit is contained in:
@ -11,21 +11,21 @@
|
||||
/* #undef AVX512 */
|
||||
|
||||
/* GRID_COMMS_MPI */
|
||||
/* #undef GRID_COMMS_MPI */
|
||||
#define GRID_COMMS_MPI 1
|
||||
|
||||
/* GRID_COMMS_NONE */
|
||||
#define GRID_COMMS_NONE 1
|
||||
/* #undef GRID_COMMS_NONE */
|
||||
|
||||
/* Define to 1 if you have the declaration of `be64toh', and to 0 if you
|
||||
don't. */
|
||||
#define HAVE_DECL_BE64TOH 1
|
||||
#define HAVE_DECL_BE64TOH 0
|
||||
|
||||
/* Define to 1 if you have the declaration of `ntohll', and to 0 if you don't.
|
||||
*/
|
||||
#define HAVE_DECL_NTOHLL 0
|
||||
#define HAVE_DECL_NTOHLL 1
|
||||
|
||||
/* Define to 1 if you have the <endian.h> header file. */
|
||||
#define HAVE_ENDIAN_H 1
|
||||
/* #undef HAVE_ENDIAN_H */
|
||||
|
||||
/* Define to 1 if you have the `gettimeofday' function. */
|
||||
#define HAVE_GETTIMEOFDAY 1
|
||||
@ -34,10 +34,10 @@
|
||||
#define HAVE_INTTYPES_H 1
|
||||
|
||||
/* Define to 1 if you have the <malloc.h> header file. */
|
||||
#define HAVE_MALLOC_H 1
|
||||
/* #undef HAVE_MALLOC_H */
|
||||
|
||||
/* Define to 1 if you have the <malloc/malloc.h> header file. */
|
||||
/* #undef HAVE_MALLOC_MALLOC_H */
|
||||
#define HAVE_MALLOC_MALLOC_H 1
|
||||
|
||||
/* Define to 1 if you have the <memory.h> header file. */
|
||||
#define HAVE_MEMORY_H 1
|
||||
@ -78,6 +78,9 @@
|
||||
/* Define to the one symbol short name of this package. */
|
||||
#define PACKAGE_TARNAME "grid"
|
||||
|
||||
/* Define to the home page for this package. */
|
||||
#define PACKAGE_URL ""
|
||||
|
||||
/* Define to the version of this package. */
|
||||
#define PACKAGE_VERSION "1.0"
|
||||
|
||||
|
@ -77,6 +77,9 @@
|
||||
/* Define to the one symbol short name of this package. */
|
||||
#undef PACKAGE_TARNAME
|
||||
|
||||
/* Define to the home page for this package. */
|
||||
#undef PACKAGE_URL
|
||||
|
||||
/* Define to the version of this package. */
|
||||
#undef PACKAGE_VERSION
|
||||
|
||||
|
@ -57,6 +57,9 @@ public:
|
||||
friend void zeroit(iScalar<vtype> &that){
|
||||
zeroit(that._internal);
|
||||
}
|
||||
friend void prefetch(iScalar<vtype> &that){
|
||||
prefetch(that._internal);
|
||||
}
|
||||
friend void permute(iScalar<vtype> &out,const iScalar<vtype> &in,int permutetype){
|
||||
permute(out._internal,in._internal,permutetype);
|
||||
}
|
||||
@ -141,6 +144,9 @@ public:
|
||||
zeroit(that._internal[i]);
|
||||
}
|
||||
}
|
||||
friend void prefetch(iVector<vtype,N> &that){
|
||||
for(int i=0;i<N;i++) prefetch(that._internal[i]);
|
||||
}
|
||||
friend void vstream(iVector<vtype,N> &out,const iVector<vtype,N> &in){
|
||||
for(int i=0;i<N;i++){
|
||||
vstream(out._internal[i],in._internal[i]);
|
||||
@ -219,6 +225,11 @@ public:
|
||||
zeroit(that._internal[i][j]);
|
||||
}}
|
||||
}
|
||||
friend void prefetch(iMatrix<vtype,N> &that){
|
||||
for(int i=0;i<N;i++)
|
||||
for(int j=0;j<N;j++)
|
||||
prefetch(that._internal[i][j]);
|
||||
}
|
||||
friend void vstream(iMatrix<vtype,N> &out,const iMatrix<vtype,N> &in){
|
||||
for(int i=0;i<N;i++){
|
||||
for(int j=0;j<N;j++){
|
||||
|
@ -106,7 +106,9 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
|
||||
for(int sss=0;sss<grid->oSites();sss++){
|
||||
|
||||
int ss = sss;
|
||||
//int ss = Stencil._LebesgueReorder[sss];
|
||||
int ssu= sss;
|
||||
//int ss = 0;
|
||||
//int ss = Stencil._LebesgueReorder[sss];
|
||||
|
||||
// Xp
|
||||
offset = Stencil._offsets [Xp][ss];
|
||||
@ -123,7 +125,8 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
|
||||
} else {
|
||||
chi=comm_buf[offset];
|
||||
}
|
||||
mult(&Uchi(),&Umu._odata[ss](Xp),&chi());
|
||||
mult(&Uchi(),&Umu._odata[ssu](Xp),&chi());
|
||||
prefetch(Umu._odata[ssu](Yp));
|
||||
spReconXp(result,Uchi);
|
||||
|
||||
// Yp
|
||||
@ -141,7 +144,8 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
|
||||
} else {
|
||||
chi=comm_buf[offset];
|
||||
}
|
||||
mult(&Uchi(),&Umu._odata[ss](Yp),&chi());
|
||||
mult(&Uchi(),&Umu._odata[ssu](Yp),&chi());
|
||||
prefetch(Umu._odata[ssu](Zp));
|
||||
accumReconYp(result,Uchi);
|
||||
|
||||
// Zp
|
||||
@ -159,7 +163,8 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
|
||||
} else {
|
||||
chi=comm_buf[offset];
|
||||
}
|
||||
mult(&Uchi(),&Umu._odata[ss](Zp),&chi());
|
||||
mult(&Uchi(),&Umu._odata[ssu](Zp),&chi());
|
||||
prefetch(Umu._odata[ssu](Tp));
|
||||
accumReconZp(result,Uchi);
|
||||
|
||||
// Tp
|
||||
@ -177,7 +182,8 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
|
||||
} else {
|
||||
chi=comm_buf[offset];
|
||||
}
|
||||
mult(&Uchi(),&Umu._odata[ss](Tp),&chi());
|
||||
mult(&Uchi(),&Umu._odata[ssu](Tp),&chi());
|
||||
prefetch(Umu._odata[ssu](Xm));
|
||||
accumReconTp(result,Uchi);
|
||||
|
||||
// Xm
|
||||
@ -195,7 +201,8 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
|
||||
} else {
|
||||
chi=comm_buf[offset];
|
||||
}
|
||||
mult(&Uchi(),&Umu._odata[ss](Xm),&chi());
|
||||
mult(&Uchi(),&Umu._odata[ssu](Xm),&chi());
|
||||
prefetch(Umu._odata[ssu](Ym));
|
||||
accumReconXm(result,Uchi);
|
||||
|
||||
|
||||
@ -214,7 +221,8 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
|
||||
} else {
|
||||
chi=comm_buf[offset];
|
||||
}
|
||||
mult(&Uchi(),&Umu._odata[ss](Ym),&chi());
|
||||
mult(&Uchi(),&Umu._odata[ssu](Ym),&chi());
|
||||
prefetch(Umu._odata[ssu](Zm));
|
||||
accumReconYm(result,Uchi);
|
||||
|
||||
// Zm
|
||||
@ -232,7 +240,8 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
|
||||
} else {
|
||||
chi=comm_buf[offset];
|
||||
}
|
||||
mult(&Uchi(),&Umu._odata[ss](Zm),&chi());
|
||||
mult(&Uchi(),&Umu._odata[ssu](Zm),&chi());
|
||||
prefetch(Umu._odata[ssu](Tm));
|
||||
accumReconZm(result,Uchi);
|
||||
|
||||
// Tm
|
||||
@ -250,7 +259,7 @@ void WilsonMatrix::Dhop(const LatticeFermion &in, LatticeFermion &out)
|
||||
} else {
|
||||
chi=comm_buf[offset];
|
||||
}
|
||||
mult(&Uchi(),&Umu._odata[ss](Tm),&chi());
|
||||
mult(&Uchi(),&Umu._odata[ssu](Tm),&chi());
|
||||
accumReconTm(result,Uchi);
|
||||
|
||||
vstream(out._odata[ss],result);
|
||||
|
@ -257,7 +257,7 @@ friend inline void vstore(const vComplexD &ret, ComplexD *a){
|
||||
assert(0);
|
||||
#endif
|
||||
}
|
||||
friend inline void vprefetch(const vComplexD &v)
|
||||
friend inline void prefetch(const vComplexD &v)
|
||||
{
|
||||
_mm_prefetch((const char*)&v.v,_MM_HINT_T0);
|
||||
}
|
||||
|
@ -206,7 +206,7 @@ namespace Grid {
|
||||
assert(0);
|
||||
#endif
|
||||
}
|
||||
friend inline void vprefetch(const vComplexF &v)
|
||||
friend inline void prefetch(const vComplexF &v)
|
||||
{
|
||||
_mm_prefetch((const char*)&v.v,_MM_HINT_T0);
|
||||
}
|
||||
|
@ -190,7 +190,7 @@ namespace Grid {
|
||||
out=in;
|
||||
}
|
||||
|
||||
friend inline void vprefetch(const vInteger &v)
|
||||
friend inline void prefetch(const vInteger &v)
|
||||
{
|
||||
_mm_prefetch((const char*)&v.v,_MM_HINT_T0);
|
||||
}
|
||||
|
@ -191,7 +191,7 @@ namespace Grid {
|
||||
assert(0);
|
||||
#endif
|
||||
}
|
||||
friend inline void vprefetch(const vRealD &v)
|
||||
friend inline void prefetch(const vRealD &v)
|
||||
{
|
||||
_mm_prefetch((const char*)&v.v,_MM_HINT_T0);
|
||||
}
|
||||
|
@ -225,7 +225,7 @@ friend inline void vstore(const vRealF &ret, float *a){
|
||||
}
|
||||
|
||||
|
||||
friend inline void vprefetch(const vRealF &v)
|
||||
friend inline void prefetch(const vRealF &v)
|
||||
{
|
||||
_mm_prefetch((const char*)&v.v,_MM_HINT_T0);
|
||||
}
|
||||
|
Reference in New Issue
Block a user