mirror of
https://github.com/paboyle/Grid.git
synced 2024-11-10 07:55:35 +00:00
re-introduced HOTFIX because Grid binaries give wrong results otherwise; checked in a good gridverter.py
This commit is contained in:
parent
6504a098cc
commit
852db4626a
@ -118,7 +118,7 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) {
|
|||||||
#ifdef GEN
|
#ifdef GEN
|
||||||
#if defined(A64FX) // breakout A64FX SVE ACLE here
|
#if defined(A64FX) // breakout A64FX SVE ACLE here
|
||||||
//#pragma message("building for A64FX / SVE ACLE")
|
//#pragma message("building for A64FX / SVE ACLE")
|
||||||
//#define ARMCLANGHOTFIX
|
#define ARMCLANGHOTFIX
|
||||||
#include "Grid_a64fx-2.h"
|
#include "Grid_a64fx-2.h"
|
||||||
#else
|
#else
|
||||||
#include "Grid_generic.h"
|
#include "Grid_generic.h"
|
||||||
|
@ -47,7 +47,7 @@ ALTERNATIVE_LOADS = False
|
|||||||
# must use with my_wilson4.h and my_wilson4pf.h
|
# must use with my_wilson4.h and my_wilson4pf.h
|
||||||
|
|
||||||
ALTERNATIVE_REGISTER_MAPPING = False
|
ALTERNATIVE_REGISTER_MAPPING = False
|
||||||
ALTERNATIVE_REGISTER_MAPPING = not ALTERNATIVE_REGISTER_MAPPING
|
#ALTERNATIVE_REGISTER_MAPPING = not ALTERNATIVE_REGISTER_MAPPING
|
||||||
|
|
||||||
if ALTERNATIVE_REGISTER_MAPPING == True:
|
if ALTERNATIVE_REGISTER_MAPPING == True:
|
||||||
ALTERNATIVE_LOADS = False
|
ALTERNATIVE_LOADS = False
|
||||||
@ -229,15 +229,25 @@ class Register:
|
|||||||
gpr = d['asmtableptr']
|
gpr = d['asmtableptr']
|
||||||
|
|
||||||
cast = 'uint64_t'
|
cast = 'uint64_t'
|
||||||
asm_opcode = 'ld1d'
|
#asm_opcode = 'ld1d'
|
||||||
|
#if PRECISION == 'single':
|
||||||
|
# asm_opcode = 'ld1w'
|
||||||
|
# cast = 'uint32_t'
|
||||||
|
asm_opcode = 'ldr'
|
||||||
if PRECISION == 'single':
|
if PRECISION == 'single':
|
||||||
asm_opcode = 'ld1w'
|
asm_opcode = 'ldr'
|
||||||
cast = 'uint32_t'
|
cast = 'uint32_t'
|
||||||
|
|
||||||
d['I'] += F' {self.name} = svld1(pg1, ({cast}*)&lut[{t}]); \\\n'
|
d['I'] += F' {self.name} = svld1(pg1, ({cast}*)&lut[{t}]); \\\n'
|
||||||
|
|
||||||
# using immediate index break-out works
|
# using immediate index break-out works
|
||||||
|
if asm_opcode == 'ldr':
|
||||||
|
# ldr version
|
||||||
|
d['A'] += F' "{asm_opcode} {self.asmreg}, [%[tableptr], %[index], mul vl] \\n\\t" \\\n'
|
||||||
|
else:
|
||||||
|
# ld1 version
|
||||||
d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}/z, [%[tableptr], %[index], mul vl] \\n\\t" \\\n'
|
d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}/z, [%[tableptr], %[index], mul vl] \\n\\t" \\\n'
|
||||||
|
|
||||||
d['asminput'].append(F'[tableptr] "r" (&lut[0])')
|
d['asminput'].append(F'[tableptr] "r" (&lut[0])')
|
||||||
d['asminput'].append(F'[index] "i" ({t})')
|
d['asminput'].append(F'[index] "i" ({t})')
|
||||||
d['asmclobber'].append(F'"memory"')
|
d['asmclobber'].append(F'"memory"')
|
||||||
@ -249,9 +259,14 @@ class Register:
|
|||||||
indices = re.findall(r'\d+', address)
|
indices = re.findall(r'\d+', address)
|
||||||
index = (int(indices[0]) - offset) * colors + int(indices[1])
|
index = (int(indices[0]) - offset) * colors + int(indices[1])
|
||||||
|
|
||||||
asm_opcode = 'ld1d'
|
#asm_opcode = 'ld1d'
|
||||||
|
#if PRECISION == 'single':
|
||||||
|
#asm_opcode = 'ld1w'
|
||||||
|
# cast = 'float32_t'
|
||||||
|
|
||||||
|
asm_opcode = 'ldr'
|
||||||
if PRECISION == 'single':
|
if PRECISION == 'single':
|
||||||
asm_opcode = 'ld1w'
|
asm_opcode = 'ldr'
|
||||||
cast = 'float32_t'
|
cast = 'float32_t'
|
||||||
|
|
||||||
gpr = d['asmfetchbaseptr']
|
gpr = d['asmfetchbaseptr']
|
||||||
@ -259,8 +274,12 @@ class Register:
|
|||||||
if (target in ['ALL', 'C']):
|
if (target in ['ALL', 'C']):
|
||||||
d['C'] += F' {self.name} = {address}; \\\n'
|
d['C'] += F' {self.name} = {address}; \\\n'
|
||||||
if (target in ['ALL', 'I']):
|
if (target in ['ALL', 'I']):
|
||||||
|
# d['I'] += F' {self.name} = svldnt1(pg1, ({cast}*)({intrinfetchbase} + {index} * 64)); \\\n'
|
||||||
d['I'] += F' {self.name} = svld1(pg1, ({cast}*)({intrinfetchbase} + {index} * 64)); \\\n'
|
d['I'] += F' {self.name} = svld1(pg1, ({cast}*)({intrinfetchbase} + {index} * 64)); \\\n'
|
||||||
if (target in ['ALL', 'A']):
|
if (target in ['ALL', 'A']):
|
||||||
|
if asm_opcode == 'ldr':
|
||||||
|
d['A'] += F' "{asm_opcode} {self.asmreg}, [%[fetchptr], {index}, mul vl] \\n\\t" \\\n'
|
||||||
|
else:
|
||||||
d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}/z, [%[fetchptr], {index}, mul vl] \\n\\t" \\\n'
|
d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}/z, [%[fetchptr], {index}, mul vl] \\n\\t" \\\n'
|
||||||
|
|
||||||
def store(self, address, cast='float64_t', colors=3, offset=STORE_BASE_PTR_COLOR_OFFSET):
|
def store(self, address, cast='float64_t', colors=3, offset=STORE_BASE_PTR_COLOR_OFFSET):
|
||||||
@ -269,15 +288,23 @@ class Register:
|
|||||||
indices = re.findall(r'\d+', address)
|
indices = re.findall(r'\d+', address)
|
||||||
index = (int(indices[0]) - offset) * colors + int(indices[1])
|
index = (int(indices[0]) - offset) * colors + int(indices[1])
|
||||||
|
|
||||||
asm_opcode = 'stnt1d'
|
#asm_opcode = 'stnt1d'
|
||||||
|
#if PRECISION == 'single':
|
||||||
|
# asm_opcode = 'stnt1w'
|
||||||
|
# cast = 'float32_t'
|
||||||
|
asm_opcode = 'str'
|
||||||
if PRECISION == 'single':
|
if PRECISION == 'single':
|
||||||
asm_opcode = 'stnt1w'
|
asm_opcode = 'str'
|
||||||
cast = 'float32_t'
|
cast = 'float32_t'
|
||||||
|
|
||||||
intrinstorebase = d['intrinstorebase']
|
intrinstorebase = d['intrinstorebase']
|
||||||
|
|
||||||
d['C'] += F' {address} = {self.name}; \\\n'
|
d['C'] += F' {address} = {self.name}; \\\n'
|
||||||
d['I'] += F' svstnt1(pg1, ({cast}*)({intrinstorebase} + {index} * 64), {self.name}); \\\n'
|
#d['I'] += F' svstnt1(pg1, ({cast}*)({intrinstorebase} + {index} * 64), {self.name}); \\\n'
|
||||||
|
d['I'] += F' svst1(pg1, ({cast}*)({intrinstorebase} + {index} * 64), {self.name}); \\\n'
|
||||||
|
if asm_opcode == 'str':
|
||||||
|
d['A'] += F' "{asm_opcode} {self.asmreg}, [%[storeptr], {index}, mul vl] \\n\\t" \\\n'
|
||||||
|
else:
|
||||||
d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}, [%[storeptr], {index}, mul vl] \\n\\t" \\\n'
|
d['A'] += F' "{asm_opcode} {{ {self.asmregwithsuffix} }}, {pg1.asmreg}, [%[storeptr], {index}, mul vl] \\n\\t" \\\n'
|
||||||
|
|
||||||
def movestr(self, str):
|
def movestr(self, str):
|
||||||
@ -621,7 +648,16 @@ def prefetch_L2_store(address, offset):
|
|||||||
|
|
||||||
d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n'
|
d['I'] += F' svprfd(pg1, (int64_t*)({address} + {offset * multiplier * 64}), SV_{policy}); \\\n'
|
||||||
d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n'
|
d['A'] += F' "prfd {policy}, {pg1.asmreg}, [%[fetchptr], {offset * multiplier}, mul vl] \\n\\t" \\\n'
|
||||||
#d['A'] +=
|
|
||||||
|
def prefetch_L1_store(address, offset):
    """Emit one L1 store-prefetch into the intrinsics and assembly outputs.

    Appends an ``svprfd`` line to ``d['I']`` and a ``prfd`` line to ``d['A']``.

    address -- base expression pasted verbatim into the intrinsics line; the
               assembly line does not use it and always addresses %[fetchptr]
    offset  -- offset in cache lines (per the original comment); it is scaled
               by 4 to get the ``mul vl`` index and by a further 64 for the
               byte offset used in the intrinsics line
    """
    global d
    scale = 4  # offset in CL, have to multiply by 4
    hint = "PSTL1STRM"  # weak
    # hint = "PSTL2KEEP"  # strong

    vl_index = offset * scale

    # Intrinsics flavour: explicit byte offset added to the address expression.
    intrin_line = F' svprfd(pg1, (int64_t*)({address} + {vl_index * 64}), SV_{hint}); \\\n'
    d['I'] += intrin_line

    # Assembly flavour: immediate index in multiples of the vector length.
    asm_line = F' "prfd {hint}, {pg1.asmreg}, [%[fetchptr], {vl_index}, mul vl] \\n\\t" \\\n'
    d['A'] += asm_line
|
||||||
|
|
||||||
|
|
||||||
def asmopen():
|
def asmopen():
|
||||||
#write('asm volatile ( \\', target='A')
|
#write('asm volatile ( \\', target='A')
|
||||||
@ -878,9 +914,11 @@ if PREFETCH:
|
|||||||
define(F'PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_{PRECSUFFIX}(A)')
|
define(F'PREFETCH_GAUGE_L2(A) PREFETCH_GAUGE_L2_INTERNAL_{PRECSUFFIX}(A)')
|
||||||
define(F'PF_GAUGE(A)')
|
define(F'PF_GAUGE(A)')
|
||||||
define(F'PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_{PRECSUFFIX}(A)')
|
define(F'PREFETCH_RESULT_L2_STORE(A) PREFETCH_RESULT_L2_STORE_INTERNAL_{PRECSUFFIX}(A)')
|
||||||
|
define(F'PREFETCH_RESULT_L1_STORE(A) PREFETCH_RESULT_L1_STORE_INTERNAL_{PRECSUFFIX}(A)')
|
||||||
define(F'PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A)')
|
define(F'PREFETCH1_CHIMU(A) PREFETCH_CHIMU_L1(A)')
|
||||||
# define(F'PREFETCH1_CHIMU(A)')
|
# define(F'PREFETCH1_CHIMU(A)')
|
||||||
define(F'PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A)')
|
define(F'PREFETCH_CHIMU(A) PREFETCH_CHIMU_L1(A)')
|
||||||
|
# define(F'PREFETCH_CHIMU(A)')
|
||||||
else:
|
else:
|
||||||
define(F'PREFETCH_CHIMU_L1(A)')
|
define(F'PREFETCH_CHIMU_L1(A)')
|
||||||
define(F'PREFETCH_GAUGE_L1(A)')
|
define(F'PREFETCH_GAUGE_L1(A)')
|
||||||
@ -897,8 +935,9 @@ define(F'UNLOCK_GAUGE(A)')
|
|||||||
define(F'MASK_REGS DECLARATIONS_{PRECSUFFIX}')
|
define(F'MASK_REGS DECLARATIONS_{PRECSUFFIX}')
|
||||||
define(F'COMPLEX_SIGNS(A)')
|
define(F'COMPLEX_SIGNS(A)')
|
||||||
define(F'LOAD64(A,B)')
|
define(F'LOAD64(A,B)')
|
||||||
#define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A); PREFETCH_RESULT_L2_STORE(B);')
|
# prefetching chimu here is useless because it is already done in the last leg
|
||||||
define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A); PREFETCH_CHIMU_L1(B);')
|
#define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A);')
|
||||||
|
define(F'SAVE_RESULT(A,B) RESULT_{PRECSUFFIX}(A); PREFETCH_RESULT_L2_STORE(B);')
|
||||||
if PREFETCH:
|
if PREFETCH:
|
||||||
definemultiline(F'MULT_2SPIN_DIR_PF(A,B) ')
|
definemultiline(F'MULT_2SPIN_DIR_PF(A,B) ')
|
||||||
write (F' MULT_2SPIN_{PRECSUFFIX}(A); \\')
|
write (F' MULT_2SPIN_{PRECSUFFIX}(A); \\')
|
||||||
@ -2156,8 +2195,7 @@ asmclose()
|
|||||||
#debugall('ZERO_PSI', group='result')
|
#debugall('ZERO_PSI', group='result')
|
||||||
newline()
|
newline()
|
||||||
|
|
||||||
d['factor'] = 0
|
# prefetch store spinors to L2 cache
|
||||||
# prefetch store spinors into L2 cache
|
|
||||||
d['factor'] = 0
|
d['factor'] = 0
|
||||||
d['cycles_PREFETCH_L2'] += 0 * d['factor']
|
d['cycles_PREFETCH_L2'] += 0 * d['factor']
|
||||||
write('// PREFETCH_RESULT_L2_STORE (prefetch store to L2)')
|
write('// PREFETCH_RESULT_L2_STORE (prefetch store to L2)')
|
||||||
@ -2173,6 +2211,23 @@ asmclose()
|
|||||||
curlyclose()
|
curlyclose()
|
||||||
newline()
|
newline()
|
||||||
|
|
||||||
|
# prefetch store spinors to L1 cache
|
||||||
|
d['factor'] = 0
|
||||||
|
d['cycles_PREFETCH_L1'] += 0 * d['factor']
|
||||||
|
write('// PREFETCH_RESULT_L1_STORE (prefetch store to L1)')
|
||||||
|
definemultiline(F'PREFETCH_RESULT_L1_STORE_INTERNAL_{PRECSUFFIX}(base)')
|
||||||
|
curlyopen()
|
||||||
|
fetch_base_ptr(F"base")
|
||||||
|
asmopen()
|
||||||
|
fetch_base_ptr(F"base", target='A')
|
||||||
|
prefetch_L1_store(F"base", 0)
|
||||||
|
prefetch_L1_store(F"base", 1)
|
||||||
|
prefetch_L1_store(F"base", 2)
|
||||||
|
asmclose()
|
||||||
|
curlyclose()
|
||||||
|
newline()
|
||||||
|
|
||||||
|
|
||||||
d['factor'] = 0
|
d['factor'] = 0
|
||||||
write('// ADD_RESULT_INTERNAL')
|
write('// ADD_RESULT_INTERNAL')
|
||||||
definemultiline(F'ADD_RESULT_INTERNAL_{PRECSUFFIX}')
|
definemultiline(F'ADD_RESULT_INTERNAL_{PRECSUFFIX}')
|
||||||
|
Loading…
Reference in New Issue
Block a user