1
0
mirror of https://github.com/paboyle/Grid.git synced 2024-11-10 07:55:35 +00:00
This commit is contained in:
Nils Meyer 2020-12-19 03:35:18 +01:00
parent 6013183361
commit 45d49d8648

View File

@ -26,9 +26,9 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
*************************************************************************************/ *************************************************************************************/
/* END LEGAL */ /* END LEGAL */
// GCC 10 messes up SVE instruction scheduling using -O3 only, // GCC 10 messes up SVE instruction scheduling using -O3, but
// using -O3 -fno-schedule-insns -fno-schedule-insns2 does wonders // -O3 -fno-schedule-insns -fno-schedule-insns2 does wonders
// performance is better than armclang 20.2 // performance now is better than armclang 20.2
#ifdef KERNEL_DAG #ifdef KERNEL_DAG
#define DIR0_PROJ XP_PROJ #define DIR0_PROJ XP_PROJ
@ -118,10 +118,6 @@ Author: Nils Meyer <nils.meyer@ur.de> Regensburg University
/* /*
NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty
though I expected that it would improve on performance though I expected that it would improve on performance
if (s == 0) { \
if ((Dir == 0) || (Dir == 4)) { PREFETCH_GAUGE_L2(Dir); } \
} \
*/ */
#define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \ #define ASM_LEG_XP(Dir,NxtDir,PERMUTE_DIR,PROJ,RECON) \
@ -149,7 +145,7 @@ NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty
if ( local || st.same_node[Dir] ) { \ if ( local || st.same_node[Dir] ) { \
MULT_2SPIN_1(Dir); \ MULT_2SPIN_1(Dir); \
MULT_2SPIN_2; \ MULT_2SPIN_2; \
RECON; \ RECON; \
} \ } \
base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \ base = st.GetInfo(ptype,local,perm,NxtDir,ent,plocal); ent++; \
PREFETCH_CHIMU(base); \ PREFETCH_CHIMU(base); \
@ -300,7 +296,7 @@ NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty
// DC ZVA test // DC ZVA test
// { uint64_t basestore = (uint64_t)&out[ss]; // { uint64_t basestore = (uint64_t)&out[ss];
// PREFETCH_RESULT_L2_STORE(basestore); } // PREFETCH_RESULT_L2_STORE(basestore); }
ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON); ASM_LEG(Ym,Zm,PERMUTE_DIR2,DIR5_PROJ,DIR5_RECON);
@ -336,8 +332,8 @@ NB: picking PREFETCH_GAUGE_L2(Dir+4); here results in performance penalty
// DC ZVA test // DC ZVA test
//{ uint64_t basestore = (uint64_t)&out[ss]; //{ uint64_t basestore = (uint64_t)&out[ss];
// PREFETCH_RESULT_L2_STORE(basestore); // PREFETCH_RESULT_L2_STORE(basestore); }
//}
ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON); ASM_LEG(Tm,Xp,PERMUTE_DIR0,DIR7_PROJ,DIR7_RECON);