diff --git a/Grid/simd/Grid_vector_types.h b/Grid/simd/Grid_vector_types.h index c203cd9e..61f19a15 100644 --- a/Grid/simd/Grid_vector_types.h +++ b/Grid/simd/Grid_vector_types.h @@ -118,7 +118,7 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) { #ifdef GEN #if defined(A64FX) // breakout A64FX SVE ACLE here //#pragma message("building for A64FX / SVE ACLE") - //#define ARMCLANGHOTFIX + #define ARMCLANGHOTFIX #include "Grid_a64fx-2.h" #else #include "Grid_generic.h" diff --git a/Grid/simd/gridverter.py b/Grid/simd/gridverter.py index 415f5578..137471cd 100755 --- a/Grid/simd/gridverter.py +++ b/Grid/simd/gridverter.py @@ -47,7 +47,7 @@ ALTERNATIVE_LOADS = False # must use with my_wilson4.h and my_wilson4pf.h ALTERNATIVE_REGISTER_MAPPING = False -ALTERNATIVE_REGISTER_MAPPING = not ALTERNATIVE_REGISTER_MAPPING +#ALTERNATIVE_REGISTER_MAPPING = not ALTERNATIVE_REGISTER_MAPPING if ALTERNATIVE_REGISTER_MAPPING == True: ALTERNATIVE_LOADS = False @@ -229,15 +229,25 @@ class Register: gpr = d['asmtableptr'] cast = 'uint64_t' - asm_opcode = 'ld1d' + #asm_opcode = 'ld1d' + #if PRECISION == 'single': + # asm_opcode = 'ld1w' + # cast = 'uint32_t' + asm_opcode = 'ldr' if PRECISION == 'single': - asm_opcode = 'ld1w' + asm_opcode = 'ldr' cast = 'uint32_t' d['I'] += F' {self.name} = svld1(pg1, ({cast}*)&lut[{t}]); \\\n' # using immediate index break-out works - d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}/z, [%[tableptr], %[index], mul vl] \\n\\t" \\\n' + if asm_opcode == 'ldr': + # ldr version + d['A'] += F' "{asm_opcode} {self.asmreg}, [%[tableptr], %[index], mul vl] \\n\\t" \\\n' + else: + # ld1 version + d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}/z, [%[tableptr], %[index], mul vl] \\n\\t" \\\n' + d['asminput'].append(F'[tableptr] "r" (&lut[0])') d['asminput'].append(F'[index] "i" ({t})') d['asmclobber'].append(F'"memory"') @@ -249,9 +259,14 @@ class Register: indices = re.findall(r'\d+', address) index = (int(indices[0]) - offset) * colors + int(indices[1]) - asm_opcode = 'ld1d' + #asm_opcode = 'ld1d' + #if PRECISION == 'single': + #asm_opcode = 'ld1w' + # cast = 'float32_t' + + asm_opcode = 'ldr' if PRECISION == 'single': - asm_opcode = 'ld1w' + asm_opcode = 'ldr' cast = 'float32_t' gpr = d['asmfetchbaseptr'] @@ -259,9 +274,13 @@ class Register: if (target in ['ALL', 'C']): d['C'] += F' {self.name} = {address}; \\\n' if (target in ['ALL', 'I']): +# d['I'] += F' {self.name} = svldnt1(pg1, ({cast}*)({intrinfetchbase} + {index} * 64)); \\\n' d['I'] += F' {self.name} = svld1(pg1, ({cast}*)({intrinfetchbase} + {index} * 64)); \\\n' if (target in ['ALL', 'A']): - d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}/z, [%[fetchptr], {index}, mul vl] \\n\\t" \\\n' + if asm_opcode == 'ldr': + d['A'] += F' "{asm_opcode} {self.asmreg}, [%[fetchptr], {index}, mul vl] \\n\\t" \\\n' + else: + d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}/z, [%[fetchptr], {index}, mul vl] \\n\\t" \\\n' def store(self, address, cast='float64_t', colors=3, offset=STORE_BASE_PTR_COLOR_OFFSET): global d @@ -269,16 +288,24 @@ class Register: indices = re.findall(r'\d+', address) index = (int(indices[0]) - offset) * colors + int(indices[1]) - asm_opcode = 'stnt1d' + #asm_opcode = 'stnt1d' + #if PRECISION == 'single': + # asm_opcode = 'stnt1w' + # cast = 'float32_t' + asm_opcode = 'str' if PRECISION == 'single': - asm_opcode = 'stnt1w' + asm_opcode = 'str' cast = 'float32_t' intrinstorebase = d['intrinstorebase'] d['C'] += F' {address} = {self.name}; \\\n' - d['I'] += F' svstnt1(pg1, ({cast}*)({intrinstorebase} + {index} * 64), {self.name}); \\\n' - d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}, [%[storeptr], {index}, mul vl] \\n\\t" \\\n' + #d['I'] += F' svstnt1(pg1, ({cast}*)({intrinstorebase} + {index} * 64), {self.name}); \\\n' + d['I'] += F' svst1(pg1, ({cast}*)({intrinstorebase} + {index} * 64), {self.name}); \\\n' + if asm_opcode == 'str': + d['A'] += F' "{asm_opcode} {self.asmreg}, [%[storeptr], {index}, mul vl] \\n\\t" \\\n' + else: + d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}, [%[storeptr], {index}, mul vl] \\n\\t" \\\n' def movestr(self, str): global d @@ -621,7 +648,16 @@ def prefetch_L2_store(address, offset): d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' - #d['A'] += + +def prefetch_L1_store(address, offset): + global d + multiplier = 4 # offset in CL, have to multiply by 4 + policy = "PSTL1STRM" # weak + #policy = "PSTL2KEEP" # strong + + d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n' + d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n' + def asmopen(): #write('asm volatile ( \\', target='A') @@ -878,9 +914,11 @@ if PREFETCH: define(F'PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_{PRECSUFFIX}(A)') define(F'PF_GAUGE(A)') define(F'PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_{PRECSUFFIX}(A)') + define(F'PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_{PRECSUFFIX}(A)') define(F'PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A)') # define(F'PREFETCH1_CHIMU(A)') define(F'PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A)') +# define(F'PREFETCH_CHIMU(A)') else: define(F'PREFETCH_CHIMU_L1(A)') define(F'PREFETCH_GAUGE_L1(A)') @@ -897,8 +935,9 @@ define(F'UNLOCK_GAUGE(A)') define(F'MASK_REGS DECLARATIONS_{PRECSUFFIX}') define(F'COMPLEX_SIGNS(A)') define(F'LOAD64(A,B)') -#define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A); PREFETCH_RESULT_L2_STORE(B);') -define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A); PREFETCH_CHIMU_L1(B);') +# prefetch chimu here is useless, because already done in last leg +#define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A);') +define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A); PREFETCH_RESULT_L2_STORE(B);') if PREFETCH: definemultiline(F'MULT_2SPIN_DIR_PF(A,B) ') write (F' MULT_2SPIN_{PRECSUFFIX}(A); \\') @@ -2156,8 +2195,7 @@ asmclose() #debugall('ZERO_PSI', group='result') newline() -d['factor'] = 0 -# prefetch store spinors into L2 cache +# prefetch store spinors to L2 cache d['factor'] = 0 d['cycles_PREFETCH_L2'] += 0 * d['factor'] write('// PREFETCH_RESULT_L2_STORE (prefetch store to L2)') @@ -2173,6 +2211,23 @@ asmclose() curlyclose() newline() +# prefetch store spinors to L1 cache +d['factor'] = 0 +d['cycles_PREFETCH_L1'] += 0 * d['factor'] +write('// PREFETCH_RESULT_L1_STORE (prefetch store to L1)') +definemultiline(F'PREFETCH_RESULT_L1_STORE_INTERNAL_{PRECSUFFIX}(base)') +curlyopen() +fetch_base_ptr(F"base") +asmopen() +fetch_base_ptr(F"base", target='A') +prefetch_L1_store(F"base", 0) +prefetch_L1_store(F"base", 1) +prefetch_L1_store(F"base", 2) +asmclose() +curlyclose() +newline() + + d['factor'] = 0 write('// ADD_RESULT_INTERNAL') definemultiline(F'ADD_RESULT_INTERNAL_{PRECSUFFIX}')