A64FX prefetch macros: switch from svprfd with byte offsets to svprfd_vnum.

Each prefetch now passes the unmodified base address plus a vector-count
offset (vnum) instead of a pre-offset pointer.  The old byte offsets map to
vnum as offset / 64 (0 -> 0, 256 -> 4, 512 -> 8, ..., 1792 -> 28, -256 -> -4),
i.e. one vector is taken to be 64 bytes -- this assumes 512-bit SVE vectors;
confirm for the target.

The vnum argument is written as (int64_t)(N) in both headers so that the
parentheses of the svprfd_vnum call balance: the form "(int64_t)N), OP)"
closes the call before the prefetch-op argument and does not compile.

diff --git a/Grid/simd/Fujitsu_A64FX_intrin_double.h b/Grid/simd/Fujitsu_A64FX_intrin_double.h
index 361246fc..f195e3c5 100644
--- a/Grid/simd/Fujitsu_A64FX_intrin_double.h
+++ b/Grid/simd/Fujitsu_A64FX_intrin_double.h
@@ -144,38 +144,38 @@ Author: Nils Meyer
 // PREFETCH_CHIMU_L2 (prefetch to L2)
 #define PREFETCH_CHIMU_L2_INTERNAL_A64FXd(base) \
 { \
-    svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL2STRM); \
 }
 // PREFETCH_CHIMU_L1 (prefetch to L1)
 #define PREFETCH_CHIMU_L1_INTERNAL_A64FXd(base) \
 { \
-    svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \
-    svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \
-    svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \
+    svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL1STRM); \
+    svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL1STRM); \
+    svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL1STRM); \
 }
 // PREFETCH_GAUGE_L2 (prefetch to L2)
 #define PREFETCH_GAUGE_L2_INTERNAL_A64FXd(A) \
 { \
     const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \
-    svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(-4), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(12), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(16), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(20), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(24), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(28), SV_PLDL2STRM); \
 }
 // PREFETCH_GAUGE_L1 (prefetch to L1)
 #define PREFETCH_GAUGE_L1_INTERNAL_A64FXd(A) \
 { \
     const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \
-    svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL1STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL1STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL1STRM); \
 }
 // LOAD_CHI
 #define LOAD_CHI_A64FXd(base) \
diff --git a/Grid/simd/Fujitsu_A64FX_intrin_single.h b/Grid/simd/Fujitsu_A64FX_intrin_single.h
index 30273b6e..0b874f02 100644
--- a/Grid/simd/Fujitsu_A64FX_intrin_single.h
+++ b/Grid/simd/Fujitsu_A64FX_intrin_single.h
@@ -144,38 +144,38 @@ Author: Nils Meyer
 // PREFETCH_CHIMU_L2 (prefetch to L2)
 #define PREFETCH_CHIMU_L2_INTERNAL_A64FXf(base) \
 { \
-    svprfd(pg1, (int64_t*)(base + 0), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(base + 256), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(base + 512), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL2STRM); \
 }
 // PREFETCH_CHIMU_L1 (prefetch to L1)
 #define PREFETCH_CHIMU_L1_INTERNAL_A64FXf(base) \
 { \
-    svprfd(pg1, (int64_t*)(base + 0), SV_PLDL1STRM); \
-    svprfd(pg1, (int64_t*)(base + 256), SV_PLDL1STRM); \
-    svprfd(pg1, (int64_t*)(base + 512), SV_PLDL1STRM); \
+    svprfd_vnum(pg1, (void*)(base), (int64_t)(0), SV_PLDL1STRM); \
+    svprfd_vnum(pg1, (void*)(base), (int64_t)(4), SV_PLDL1STRM); \
+    svprfd_vnum(pg1, (void*)(base), (int64_t)(8), SV_PLDL1STRM); \
 }
 // PREFETCH_GAUGE_L2 (prefetch to L2)
 #define PREFETCH_GAUGE_L2_INTERNAL_A64FXf(A) \
 { \
     const auto & ref(U[sUn](A)); baseU = (uint64_t)&ref + 3 * 3 * 64; \
-    svprfd(pg1, (int64_t*)(baseU + -256), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 768), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 1024), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 1280), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 1536), SV_PLDL2STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 1792), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(-4), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(12), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(16), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(20), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(24), SV_PLDL2STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(28), SV_PLDL2STRM); \
 }
 // PREFETCH_GAUGE_L1 (prefetch to L1)
 #define PREFETCH_GAUGE_L1_INTERNAL_A64FXf(A) \
 { \
     const auto & ref(U[sU](A)); baseU = (uint64_t)&ref; \
-    svprfd(pg1, (int64_t*)(baseU + 0), SV_PLDL1STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 256), SV_PLDL1STRM); \
-    svprfd(pg1, (int64_t*)(baseU + 512), SV_PLDL1STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(0), SV_PLDL1STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(4), SV_PLDL1STRM); \
+    svprfd_vnum(pg1, (void*)(baseU), (int64_t)(8), SV_PLDL1STRM); \
 }
 // LOAD_CHI
 #define LOAD_CHI_A64FXf(base) \