Mirror of https://github.com/paboyle/Grid.git

revert Add/SubTimesI and prefetching in stencil

This reverts commit 9b2699226c7a3ca8d45f843f4f8e4658fa082163.
nmeyer-ur 2020-06-08 12:02:53 +02:00
parent 93a37c8f68
commit 433766ac62
7 changed files with 83 additions and 344 deletions
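
For orientation: addTimesI and subTimesI are the fused SVE operations being removed, while timesI plus an explicit add/subtract is what the revert restores. A minimal scalar sketch of the semantics, using std::complex instead of Grid's SIMD types (illustrative only, not Grid's API):

#include <complex>
using cd = std::complex<double>;

// Fused forms removed by this revert: one fcadd instruction each on SVE.
inline cd addTimesI(const cd &a, const cd &b) { return a + cd(0, 1) * b; } // a + i*b
inline cd subTimesI(const cd &a, const cd &b) { return a - cd(0, 1) * b; } // a - i*b

// Restored composition: a separate timesI, then an ordinary add or subtract.
inline cd timesI(const cd &a) { return cd(-a.imag(), a.real()); }          // i*a
// e.g. hspin(0) = fspin(0) + timesI(fspin(3)) instead of addTimesI(fspin(0), fspin(3))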

View File

@@ -164,7 +164,12 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
if((!local)&&(!st.same_node[Dir]) ) { \
LOAD_CHI(base); \
MULT_2SPIN_1(Dir); \
PREFETCH_CHIMU(base); \
/* PREFETCH_GAUGE_L1(NxtDir); */ \
MULT_2SPIN_2; \
if (s == 0) { \
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
} \
RECON; \
nmu++; \
}
@@ -175,7 +180,12 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
if((!local)&&(!st.same_node[Dir]) ) { \
LOAD_CHI(base); \
MULT_2SPIN_1(Dir); \
PREFETCH_CHIMU(base); \
/* PREFETCH_GAUGE_L1(NxtDir); */ \
MULT_2SPIN_2; \
if (s == 0) { \
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
} \
RECON; \
nmu++; \
}

View File

@@ -445,21 +445,18 @@ void WilsonKernels<Impl>::DhopKernel(int Opt,StencilImpl &st, DoubledGaugeField
#ifndef GRID_NVCC
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSite); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); /* printf("."); */ return;}
//if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSite); printf("."); return;}
#endif
} else if( interior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALLNB(GenericDhopSiteInt); return;}
#ifndef GRID_NVCC
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALLNB(HandDhopSiteInt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); /* printf("-"); */ return;}
//if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteInt); printf("-"); return;}
#endif
} else if( exterior ) {
if (Opt == WilsonKernelsStatic::OptGeneric ) { KERNEL_CALL(GenericDhopSiteExt); return;}
#ifndef GRID_NVCC
if (Opt == WilsonKernelsStatic::OptHandUnroll ) { KERNEL_CALL(HandDhopSiteExt); return;}
if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); /* printf("+"); */ return;}
//if (Opt == WilsonKernelsStatic::OptInlineAsm ) { ASM_CALL(AsmDhopSiteExt); printf("+"); return;}
#endif
}
assert(0 && " Kernel optimisation case not covered ");

View File

@@ -74,17 +74,13 @@ NAMESPACE_BEGIN(Grid);
// To fail is not to err (Cryptic clue: suggest to Google SFINAE ;) )
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProjXp (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
{
//hspin(0)=fspin(0)+timesI(fspin(3));
//hspin(1)=fspin(1)+timesI(fspin(2));
hspin(0)=addTimesI(fspin(0), fspin(3));
hspin(1)=addTimesI(fspin(1), fspin(2));
hspin(0)=fspin(0)+timesI(fspin(3));
hspin(1)=fspin(1)+timesI(fspin(2));
}
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProjXm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
{
//hspin(0)=fspin(0)-timesI(fspin(3));
//hspin(1)=fspin(1)-timesI(fspin(2));
hspin(0)=subTimesI(fspin(0), fspin(3));
hspin(1)=subTimesI(fspin(1), fspin(2));
hspin(0)=fspin(0)-timesI(fspin(3));
hspin(1)=fspin(1)-timesI(fspin(2));
}
// 0 0 0 -1 [0] -+ [3]
@@ -109,18 +105,14 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void s
*/
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProjZp (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
{
//hspin(0)=fspin(0)+timesI(fspin(2));
//hspin(1)=fspin(1)-timesI(fspin(3));
hspin(0)=addTimesI(fspin(0), fspin(2));
hspin(1)=subTimesI(fspin(1), fspin(3));
hspin(0)=fspin(0)+timesI(fspin(2));
hspin(1)=fspin(1)-timesI(fspin(3));
}
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void spProjZm (iVector<vtype,Nhs> &hspin,const iVector<vtype,Ns> &fspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
//hspin(0)=fspin(0)-timesI(fspin(2));
//hspin(1)=fspin(1)+timesI(fspin(3));
hspin(0)=subTimesI(fspin(0), fspin(2));
hspin(1)=addTimesI(fspin(1), fspin(3));
hspin(0)=fspin(0)-timesI(fspin(2));
hspin(1)=fspin(1)+timesI(fspin(3));
}
/*Gt
* 0 0 1 0 [0]+-[2]
@@ -210,20 +202,16 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)+=hspin(0);
fspin(1)+=hspin(1);
//fspin(2)-=timesI(hspin(1));
//fspin(3)-=timesI(hspin(0));
fspin(2)=subTimesI(fspin(2), hspin(1));
fspin(3)=subTimesI(fspin(3), hspin(0));
fspin(2)-=timesI(hspin(1));
fspin(3)-=timesI(hspin(0));
}
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconXm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)+=hspin(0);
fspin(1)+=hspin(1);
//fspin(2)+=timesI(hspin(1));
//fspin(3)+=timesI(hspin(0));
fspin(2)=addTimesI(fspin(2), hspin(1));
fspin(3)=addTimesI(fspin(3), hspin(0));
fspin(2)+=timesI(hspin(1));
fspin(3)+=timesI(hspin(0));
}
// 0 0 0 -1 [0] -+ [3]
@@ -291,20 +279,16 @@ template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void a
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)+=hspin(0);
fspin(1)+=hspin(1);
//fspin(2)-=timesI(hspin(0));
//fspin(3)+=timesI(hspin(1));
fspin(2)=subTimesI(fspin(2), hspin(0));
fspin(3)=addTimesI(fspin(3), hspin(1));
fspin(2)-=timesI(hspin(0));
fspin(3)+=timesI(hspin(1));
}
template<class vtype,IfSpinor<iVector<vtype,Ns> > = 0> accelerator_inline void accumReconZm (iVector<vtype,Ns> &fspin,const iVector<vtype,Nhs> &hspin)
{
//typename std::enable_if<matchGridTensorIndex<iVector<vtype,Ns>,SpinorIndex>::value,iVector<vtype,Ns> >::type *SFINAE;
fspin(0)+=hspin(0);
fspin(1)+=hspin(1);
//fspin(2)+=timesI(hspin(0));
//fspin(3)-=timesI(hspin(1));
fspin(2)=addTimesI(fspin(2), hspin(0));
fspin(3)=subTimesI(fspin(3), hspin(1));
fspin(2)+=timesI(hspin(0));
fspin(3)-=timesI(hspin(1));
}
/*Gt
* 0 0 1 0 [0]+-[2]

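For reference, a hedged scalar model of the projector arithmetic restored above, for one colour component and with hypothetical FourSpinor/HalfSpinor structs over std::complex (these are not Grid's types): spProjXp keeps the two independent rows of the (1 + gamma_x) projection, and accumReconXp folds the half spinor back with the matching signs.

#include <complex>
using cd = std::complex<double>;
struct FourSpinor { cd s[4]; }; // hypothetical stand-in for iVector<vtype,Ns>
struct HalfSpinor { cd h[2]; }; // hypothetical stand-in for iVector<vtype,Nhs>

// (1 + gamma_x) projection: the rows reduce to h0 = f0 + i*f3, h1 = f1 + i*f2,
// exactly the restored hspin(0)/hspin(1) assignments above.
inline HalfSpinor spProjXp(const FourSpinor &f) {
  return { { f.s[0] + cd(0, 1) * f.s[3], f.s[1] + cd(0, 1) * f.s[2] } };
}
// Reconstruction accumulates with the conjugate signs: f2 -= i*h1, f3 -= i*h0.
inline void accumReconXp(FourSpinor &f, const HalfSpinor &h) {
  f.s[0] += h.h[0];             f.s[1] += h.h[1];
  f.s[2] -= cd(0, 1) * h.h[1];  f.s[3] -= cd(0, 1) * h.h[0];
}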
View File

@@ -442,59 +442,6 @@ struct TimesMinusI{
}
};
// alternative implementation using fcadd
// this is not optimal because we have op1 = op2 + TimesMinusI(op3) = op2 - TimesI(op3) etc
// but ideally we have op1 = SubTimesI(op2,op3)
//
// makes performance worse in Benchmark_wilson using MPI
// increases halogtime and gathertime
/*
struct TimesMinusI{
// Complex float
inline vecf operator()(vecf a, vecf b){
pred pg1 = acle<float>::pg1();
vecf z_v = acle<float>::zero();
return svcadd_x(pg1, z_v, a, 270);
}
// Complex double
inline vecd operator()(vecd a, vecd b){
pred pg1 = acle<double>::pg1();
vecd z_v = acle<double>::zero();
return svcadd_x(pg1, z_v, a, 270);
}
};
*/
// SVE only, fcadd returns a +- i*b
// a + i * b
struct AddTimesI{
// Complex float
inline vecf operator()(vecf a, vecf b){
pred pg1 = acle<float>::pg1();
return svcadd_x(pg1, a, b, 90);
}
// Complex double
inline vecd operator()(vecd a, vecd b){
pred pg1 = acle<double>::pg1();
return svcadd_x(pg1, a, b, 90);
}
};
// a - i * b
struct SubTimesI{
// Complex float
inline vecf operator()(vecf a, vecf b){
pred pg1 = acle<float>::pg1();
return svcadd_x(pg1, a, b, 270);
}
// Complex double
inline vecd operator()(vecd a, vecd b){
pred pg1 = acle<double>::pg1();
return svcadd_x(pg1, a, b, 270);
}
};
struct TimesI{
// Complex float
inline vecf operator()(vecf a, vecf b){
@@ -518,33 +465,6 @@ struct TimesI{
}
};
// alternative implementation using fcadd
// this is not optimal because we have op1 = op2 + TimesI(op3) etc
// ideally we have op1 = AddTimesI(op2,op3)
//
// makes performance worse in Benchmark_wilson using MPI
// increases halogtime and gathertime
/*
struct TimesI{
// Complex float
inline vecf operator()(vecf a, vecf b){
pred pg1 = acle<float>::pg1();
vecf z_v = acle<float>::zero();
return svcadd_x(pg1, z_v, a, 90);
}
// Complex double
inline vecd operator()(vecd a, vecd b){
pred pg1 = acle<double>::pg1();
vecd z_v = acle<double>::zero();
return svcadd_x(pg1, z_v, a, 90);
}
};
*/
struct PrecisionChange {
static inline vech StoH (vecf sa, vecf sb) {
pred pg1s = acle<float>::pg1();
@@ -827,25 +747,6 @@ typedef veci SIMD_Itype; // Integer type
// prefetch utilities
inline void v_prefetch0(int size, const char *ptr){};
/* PF 256
inline void prefetch_HINT_T0(const char *ptr){
static int64_t last_ptr;
int64_t vptr = reinterpret_cast<std::intptr_t>(ptr) & 0x7fffffffffffff00ll;
if (last_ptr != vptr) {
last_ptr = vptr;
pred pg1 = Optimization::acle<double>::pg1();
svprfd(pg1, reinterpret_cast<int64_t*>(ptr), SV_PLDL1STRM);
svprfd(pg1, ptr, SV_PLDL1STRM);
}
};
*/
/* PF 64
inline void prefetch_HINT_T0(const char *ptr){
pred pg1 = Optimization::acle<double>::pg1();
svprfd(pg1, ptr, SV_PLDL1STRM);
};
*/
inline void prefetch_HINT_T0(const char *ptr){};
// Function name aliases
@@ -867,8 +768,5 @@ typedef Optimization::MaddRealPart MaddRealPartSIMD;
typedef Optimization::Conj ConjSIMD;
typedef Optimization::TimesMinusI TimesMinusISIMD;
typedef Optimization::TimesI TimesISIMD;
typedef Optimization::AddTimesI AddTimesISIMD;
typedef Optimization::SubTimesI SubTimesISIMD;
NAMESPACE_END(Grid);

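To make the rotation arguments concrete: per complex element of an interleaved SVE vector, fcadd with rotation 90 computes a + i*b and rotation 270 computes a - i*b (my scalar reading of the ACLE semantics, sketched below). This is why AddTimesI/SubTimesI above cost a single instruction, while the commented-out fcadd-based TimesI/TimesMinusI must burn the addend on a zero vector and were measured as slower.

#include <complex>
using cd = std::complex<double>;

// Scalar model of svcadd_x(pg, a, b, rot) acting on one complex element.
inline cd fcadd(cd a, cd b, int rot) {
  if (rot == 90) return { a.real() - b.imag(), a.imag() + b.real() };  // a + i*b
  return { a.real() + b.imag(), a.imag() - b.real() };                 // rot == 270: a - i*b
}
// AddTimesI(a, b)  ~ fcadd(a,    b, 90)  -- fused, no wasted operand
// TimesI(b)        ~ fcadd(zero, b, 90)  -- same instruction, addend wasted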
View File

@@ -298,7 +298,7 @@ public:
// FIXME -- alias this to an accelerator_inline MAC struct.
// specialize mac for A64FX
// FIXME VLA build error
#if defined(A64FX) || defined(A64FXFIXEDSIZE)
friend accelerator_inline void mac(Grid_simd *__restrict__ y,
const Grid_simd *__restrict__ a,
@@ -894,47 +894,6 @@ accelerator_inline Grid_simd<S, V> timesI(const Grid_simd<S, V> &in) {
return in;
}
// -----------------------------------------------------------------------------
// SVE only
///////////////////////
// AddTimesI
///////////////////////
template <class S, class V, IfComplex<S> = 0>
accelerator_inline void addTimesI(Grid_simd<S, V> &ret, const Grid_simd<S, V> &in1, const Grid_simd<S, V> &in2) {
ret.v = binary<V>(in1.v, in2.v, AddTimesISIMD());
}
template <class S, class V, IfComplex<S> = 0>
accelerator_inline Grid_simd<S, V> addTimesI(const Grid_simd<S, V> &in1, const Grid_simd<S, V> &in2) {
Grid_simd<S, V> ret;
addTimesI(ret, in1, in2);
return ret;
}
template <class S, class V, IfNotComplex<S> = 0>
accelerator_inline Grid_simd<S, V> addTimesI(const Grid_simd<S, V> &in1, const Grid_simd<S, V> &in2) {
return in1;
}
///////////////////////
// SubTimesI
///////////////////////
template <class S, class V, IfComplex<S> = 0>
accelerator_inline void subTimesI(Grid_simd<S, V> &ret, const Grid_simd<S, V> &in1, const Grid_simd<S, V> &in2) {
ret.v = binary<V>(in1.v, in2.v, SubTimesISIMD());
}
template <class S, class V, IfComplex<S> = 0>
accelerator_inline Grid_simd<S, V> subTimesI(const Grid_simd<S, V> &in1, const Grid_simd<S, V> &in2) {
Grid_simd<S, V> ret;
subTimesI(ret, in1, in2);
return ret;
}
template <class S, class V, IfNotComplex<S> = 0>
accelerator_inline Grid_simd<S, V> subTimesI(const Grid_simd<S, V> &in1, const Grid_simd<S, V> &in2) {
return in1;
}
// end SVE
// -----------------------------------------------------------------------------
/////////////////////
// Inner, outer
/////////////////////

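The removed Grid_simd overloads follow Grid's usual IfComplex/IfNotComplex dispatch: complex element types route through binary<V> to the SIMD functor, real types fall through as the identity. A compressed sketch with std::enable_if standing in for Grid's traits (illustrative, not the library's definitions):

#include <complex>
#include <type_traits>

template <class T> struct is_cplx : std::false_type {};
template <class T> struct is_cplx<std::complex<T>> : std::true_type {};

// Complex elements: compute a + i*b (stands in for binary<V>(..., AddTimesISIMD())).
template <class T, std::enable_if_t<is_cplx<T>::value, int> = 0>
T addTimesI(const T &a, const T &b) { return a + T(0, 1) * b; }

// Real elements: i*b contributes nothing representable, so return a unchanged,
// mirroring the IfNotComplex overload above.
template <class T, std::enable_if_t<!is_cplx<T>::value, int> = 0>
T addTimesI(const T &a, const T &) { return a; }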
View File

@@ -68,27 +68,8 @@ void Gather_plane_simple_table (Vector<std::pair<int,int> >& table,const Lattice
int num=table.size();
std::pair<int,int> *table_v = & table[0];
auto rhs_v = rhs.View();
// main loop
accelerator_forNB( i,num, vobj::Nsimd(), {
typedef decltype(coalescedRead(buffer[0])) compressed_t;
// prefetching:
// +1% performance for Wilson on 32**4
// -2% performance for DW on 24**4 x 12
const int dist = 7;
if (i+dist < num){
svbool_t pg1 = svptrue_b64();
// prefetch input
auto in = rhs_v(so+table_v[i+dist].second);
svprfd(pg1, (char*)&in, SV_PLDL2STRM);
// prefetch store buffer
uint64_t o = table_v[i+dist].first;
svprfd(pg1, (char*)&buffer[off+o], SV_PSTL2STRM);
}
compressed_t tmp_c;
uint64_t o = table_v[i].first;
compress.Compress(&tmp_c,0,rhs_v(so+table_v[i].second));

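What the gather loop loses is a textbook prefetch-distance pattern; the comments record it as a wash overall (+1% Wilson on 32^4, -2% domain wall on 24^4 x 12). A generic sketch of the pattern with GCC/Clang's __builtin_prefetch standing in for svprfd, and a caller-supplied compress functor in place of the stencil compressor (names here are hypothetical):

#include <utility>
#include <vector>

template <class In, class Out, class Compress>
void gather_with_prefetch(const std::vector<std::pair<int,int>> &table,
                          const In *in, Out *buffer, int off, int so,
                          Compress compress) {
  const int dist = 7; // lookahead: large enough to cover memory latency,
                      // small enough not to evict lines still in flight
  const int num = (int)table.size();
  for (int i = 0; i < num; ++i) {
    if (i + dist < num) {
      __builtin_prefetch(&in[so + table[i + dist].second], 0, 1);     // future read
      __builtin_prefetch(&buffer[off + table[i + dist].first], 1, 1); // future write
    }
    buffer[off + table[i].first] = compress(in[so + table[i].second]);
  }
}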
View File

@@ -120,96 +120,6 @@ template<class vtype,int N> accelerator_inline void timesMinusI(iMatrix<vtype,N
}
// -----------------------------------------------------------------------------
// SVE
template<class vtype> accelerator_inline iScalar<vtype> addTimesI(const iScalar<vtype>&r1, const iScalar<vtype>&r2)
{
iScalar<vtype> ret;
addTimesI(ret._internal,r1._internal,r2._internal);
return ret;
}
template<class vtype,int N> accelerator_inline iVector<vtype,N> addTimesI(const iVector<vtype,N>&r1, const iVector<vtype,N>&r2)
{
iVector<vtype,N> ret;
for(int i=0;i<N;i++){
addTimesI(ret._internal[i],r1._internal[i],r2._internal[i]);
}
return ret;
}
template<class vtype,int N> accelerator_inline iMatrix<vtype,N> addTimesI(const iMatrix<vtype,N>&r1, const iMatrix<vtype,N>&r2)
{
iMatrix<vtype,N> ret;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
addTimesI(ret._internal[i][j],r1._internal[i][j],r2._internal[i][j]);
}}
return ret;
}
template<class vtype> accelerator_inline void addTimesI(iScalar<vtype> &ret,const iScalar<vtype>&r1,const iScalar<vtype>&r2)
{
addTimesI(ret._internal,r1._internal,r2._internal);
}
template<class vtype,int N> accelerator_inline void addTimesI(iVector<vtype,N> &ret,const iVector<vtype,N>&r1,const iVector<vtype,N>&r2)
{
for(int i=0;i<N;i++){
addTimesI(ret._internal[i],r1._internal[i],r2._internal[i]);
}
}
template<class vtype,int N> accelerator_inline void addTimesI(iMatrix<vtype,N> &ret,const iMatrix<vtype,N>&r1,const iMatrix<vtype,N>&r2)
{
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
addTimesI(ret._internal[i][j],r1._internal[i][j],r2._internal[i][j]);
}}
}
template<class vtype> accelerator_inline iScalar<vtype> subTimesI(const iScalar<vtype>&r1, const iScalar<vtype>&r2)
{
iScalar<vtype> ret;
subTimesI(ret._internal,r1._internal,r2._internal);
return ret;
}
template<class vtype,int N> accelerator_inline iVector<vtype,N> subTimesI(const iVector<vtype,N>&r1, const iVector<vtype,N>&r2)
{
iVector<vtype,N> ret;
for(int i=0;i<N;i++){
subTimesI(ret._internal[i],r1._internal[i],r2._internal[i]);
}
return ret;
}
template<class vtype,int N> accelerator_inline iMatrix<vtype,N> subTimesI(const iMatrix<vtype,N>&r1, const iMatrix<vtype,N>&r2)
{
iMatrix<vtype,N> ret;
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
subTimesI(ret._internal[i][j],r1._internal[i][j],r2._internal[i][j]);
}}
return ret;
}
template<class vtype> accelerator_inline void subTimesI(iScalar<vtype> &ret,const iScalar<vtype>&r1,const iScalar<vtype>&r2)
{
subTimesI(ret._internal,r1._internal,r2._internal);
}
template<class vtype,int N> accelerator_inline void subTimesI(iVector<vtype,N> &ret,const iVector<vtype,N>&r1,const iVector<vtype,N>&r2)
{
for(int i=0;i<N;i++){
subTimesI(ret._internal[i],r1._internal[i],r2._internal[i]);
}
}
template<class vtype,int N> accelerator_inline void subTimesI(iMatrix<vtype,N> &ret,const iMatrix<vtype,N>&r1,const iMatrix<vtype,N>&r2)
{
for(int i=0;i<N;i++){
for(int j=0;j<N;j++){
subTimesI(ret._internal[i][j],r1._internal[i][j],r2._internal[i][j]);
}}
}
// -----------------------------------------------------------------------------
// end SVE
///////////////////////////////////////////////
// Conj function for scalar, vector, matrix
///////////////////////////////////////////////
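
The deleted templates are the standard Grid tensor recursion: each wrapper peels one level (iScalar forwards its element, iVector/iMatrix loop over theirs) until the Grid_simd overload terminates the chain. A minimal two-level sketch of that pattern with toy wrappers (not Grid's classes):

#include <complex>
using cd = std::complex<double>;

// Leaf: terminates the recursion (stands in for the Grid_simd overload).
inline void addTimesI(cd &ret, const cd &a, const cd &b) { ret = a + cd(0, 1) * b; }

template <class vtype> struct Scalar { vtype _internal; };
template <class vtype, int N> struct Vector { vtype _internal[N]; };

// One level peeled per overload, exactly as in the removed code.
template <class vtype>
inline void addTimesI(Scalar<vtype> &ret, const Scalar<vtype> &a, const Scalar<vtype> &b) {
  addTimesI(ret._internal, a._internal, b._internal);
}
template <class vtype, int N>
inline void addTimesI(Vector<vtype, N> &ret, const Vector<vtype, N> &a, const Vector<vtype, N> &b) {
  for (int i = 0; i < N; i++) addTimesI(ret._internal[i], a._internal[i], b._internal[i]);
}
// e.g. a colour vector Vector<Scalar<cd>, 3> recurses Vector -> Scalar -> cd.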