mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-09 21:50:45 +01:00
round-up after testing of prefetches in stencil close
This commit is contained in:
parent
5050833b42
commit
5ee3ea2144
@ -164,12 +164,7 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
|||||||
if((!local)&&(!st.same_node[Dir]) ) { \
|
if((!local)&&(!st.same_node[Dir]) ) { \
|
||||||
LOAD_CHI(base); \
|
LOAD_CHI(base); \
|
||||||
MULT_2SPIN_1(Dir); \
|
MULT_2SPIN_1(Dir); \
|
||||||
PREFETCH_CHIMU(base); \
|
|
||||||
/* PREFETCH_GAUGE_L1(NxtDir); */ \
|
|
||||||
MULT_2SPIN_2; \
|
MULT_2SPIN_2; \
|
||||||
if (s == 0) { \
|
|
||||||
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
|
|
||||||
} \
|
|
||||||
RECON; \
|
RECON; \
|
||||||
nmu++; \
|
nmu++; \
|
||||||
}
|
}
|
||||||
@ -180,12 +175,7 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
|
|||||||
if((!local)&&(!st.same_node[Dir]) ) { \
|
if((!local)&&(!st.same_node[Dir]) ) { \
|
||||||
LOAD_CHI(base); \
|
LOAD_CHI(base); \
|
||||||
MULT_2SPIN_1(Dir); \
|
MULT_2SPIN_1(Dir); \
|
||||||
PREFETCH_CHIMU(base); \
|
|
||||||
/* PREFETCH_GAUGE_L1(NxtDir); */ \
|
|
||||||
MULT_2SPIN_2; \
|
MULT_2SPIN_2; \
|
||||||
if (s == 0) { \
|
|
||||||
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
|
|
||||||
} \
|
|
||||||
RECON; \
|
RECON; \
|
||||||
nmu++; \
|
nmu++; \
|
||||||
}
|
}
|
||||||
|
@ -445,18 +445,21 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
|
|||||||
#ifndef GRID_NVCC
|
#ifndef GRID_NVCC
|
||||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
|
||||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); /* printf("."); */ return;}
|
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); /* printf("."); */ return;}
|
||||||
|
//if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); printf("."); return;}
|
||||||
#endif
|
#endif
|
||||||
} else if( interior ) {
|
} else if( interior ) {
|
||||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;}
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;}
|
||||||
#ifndef GRID_NVCC
|
#ifndef GRID_NVCC
|
||||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;}
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;}
|
||||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); /* printf("-"); */ return;}
|
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); /* printf("-"); */ return;}
|
||||||
|
//if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); printf("-"); return;}
|
||||||
#endif
|
#endif
|
||||||
} else if( exterior ) {
|
} else if( exterior ) {
|
||||||
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;}
|
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;}
|
||||||
#ifndef GRID_NVCC
|
#ifndef GRID_NVCC
|
||||||
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
|
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
|
||||||
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); /* printf("+"); */ return;}
|
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); /* printf("+"); */ return;}
|
||||||
|
//if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); printf("+"); return;}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
assert(0 && " Kernel optimisation case not covered ");
|
assert(0 && " Kernel optimisation case not covered ");
|
||||||
|
@ -799,6 +799,25 @@ typedef veci SIMD_Itype; // Integer type
|
|||||||
|
|
||||||
// prefetch utilities
|
// prefetch utilities
|
||||||
inline void v_prefetch0(int size, const char *ptr){};
|
inline void v_prefetch0(int size, const char *ptr){};
|
||||||
|
|
||||||
|
/* PF 256 worse than PF 64
|
||||||
|
inline void prefetch_HINT_T0(const char *ptr){
|
||||||
|
static int64_t last_ptr;
|
||||||
|
int64_t vptr = reinterpret_cast<std::intptr_t>(ptr) & 0x7fffffffffffff00ll;
|
||||||
|
if (last_ptr != vptr) {
|
||||||
|
last_ptr = vptr;
|
||||||
|
pred pg1 = Optimization::acle<double>::pg1();
|
||||||
|
svprfd(pg1, reinterpret_cast<int64_t*>(ptr), SV_PLDL1STRM);
|
||||||
|
svprfd(pg1, ptr, SV_PLDL1STRM);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
*/
|
||||||
|
/* beneficial for operators?
|
||||||
|
inline void prefetch_HINT_T0(const char *ptr){
|
||||||
|
pred pg1 = Optimization::acle<double>::pg1();
|
||||||
|
svprfd(pg1, ptr, SV_PLDL1STRM);
|
||||||
|
};
|
||||||
|
*/
|
||||||
inline void prefetch_HINT_T0(const char *ptr){};
|
inline void prefetch_HINT_T0(const char *ptr){};
|
||||||
|
|
||||||
// Function name aliases
|
// Function name aliases
|
||||||
|
@ -68,8 +68,27 @@ void Gather_plane_simple_table (Vector<std::pair<int,int> >& table,const Lattice
|
|||||||
int num=table.size();
|
int num=table.size();
|
||||||
std::pair<int,int> *table_v = & table[0];
|
std::pair<int,int> *table_v = & table[0];
|
||||||
auto rhs_v = rhs.View();
|
auto rhs_v = rhs.View();
|
||||||
|
|
||||||
|
// main loop
|
||||||
accelerator_forNB( i,num, vobj::Nsimd(), {
|
accelerator_forNB( i,num, vobj::Nsimd(), {
|
||||||
typedef decltype(coalescedRead(buffer[0])) compressed_t;
|
typedef decltype(coalescedRead(buffer[0])) compressed_t;
|
||||||
|
// prefetching:
|
||||||
|
// +1% performance for Wilson on 32**4
|
||||||
|
// -2% performance for DW on 24**4 x 12
|
||||||
|
/*
|
||||||
|
const int dist = 2;
|
||||||
|
if (i+dist < num){
|
||||||
|
svbool_t pg1 = svptrue_b64();
|
||||||
|
|
||||||
|
// prefetch input
|
||||||
|
auto in = rhs_v(so+table_v[i+dist].second);
|
||||||
|
svprfd(pg1, (char*)&in, SV_PLDL1STRM);
|
||||||
|
|
||||||
|
// prefetch store buffer
|
||||||
|
uint64_t o = table_v[i+dist].first;
|
||||||
|
svprfd(pg1, (char*)&buffer[off+o], SV_PSTL1STRM);
|
||||||
|
}
|
||||||
|
*/
|
||||||
compressed_t tmp_c;
|
compressed_t tmp_c;
|
||||||
uint64_t o = table_v[i].first;
|
uint64_t o = table_v[i].first;
|
||||||
compress.Compress(&tmp_c,0,rhs_v(so+table_v[i].second));
|
compress.Compress(&tmp_c,0,rhs_v(so+table_v[i].second));
|
||||||
|
Loading…
x
Reference in New Issue
Block a user