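Added real FMA (mac) for float and double; corrected typos in the tbl1/tbl2 lookup tables; integrated the header into Grid_vector_types.h. A64FXGCC must be supplied together with GEN in configure.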

2025-06-18 15:57:05 +01:00 · 2020-05-08 10:20:19 +02:00
parent b338719bc8
commit 3417147b11
2 changed files with 23 additions and 10 deletions
--- a/Grid/simd/Grid_a64fx-fixedsize.h
+++ b/Grid/simd/Grid_a64fx-fixedsize.h
@@ -2,7 +2,7 @@
    Grid physics library, www.github.com/paboyle/Grid
-    Source file: Grid_a64fx-2.h
+    Source file: Grid_a64fx-fixedsize.h
    Copyright (C) 2020
@@ -30,11 +30,11 @@
 // Using SVE ACLE
 /////////////////////////////////////////////////////
-#ifndef GEN_SIMD_WIDTH
+//#ifndef GEN_SIMD_WIDTH
-#define GEN_SIMD_WIDTH 64u
+//#define GEN_SIMD_WIDTH 64u
-#endif
+//#endif
-static_assert(GEN_SIMD_WIDTH % 64u == 0, "A64FX SIMD vector size is 64 bytes");
+//static_assert(GEN_SIMD_WIDTH % 64u == 0, "A64FX SIMD vector size is 64 bytes");
 #ifdef __ARM_FEATURE_SVE
  #include <arm_sve.h>
@@ -100,13 +100,13 @@ struct acle<float>{
      pred pg1 = svptrue_b32();
      return svld1(pg1, t);
  }
-  static inline vec<uint32_t> tbl1(){
+  static inline lutf tbl1(){
-      const lutf = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11};
+      const uint32_t t[16] = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11};
      pred pg1 = svptrue_b32();
      return svld1(pg1, t);
  }
-  static inline vec<uint32_t> tbl2(){
+  static inline lutf tbl2(){
-      const lutf = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13};
+      const uint32_t t[16] = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13};
      pred pg1 = svptrue_b32();
      return svld1(pg1, t);
  }
@@ -264,6 +264,16 @@ struct Sub{
 };
 struct Mult{
  // Real float fma
  inline void mac(vecf &a, vecf b, vecf c){
    pred pg1 = acle<float>::pg1();
    a = svmad_x(pg1, b, c, a);
  }
  // Real double fma
  inline void mac(vecd &a, vecd b, vecd c){
    pred pg1 = acle<double>::pg1();
    a = svmad_x(pg1, b, c, a);
  }
  // Real float
  inline vecf operator()(vecf a, vecf b){
    pred pg1 = acle<float>::pg1();
--- a/Grid/simd/Grid_vector_types.h
+++ b/Grid/simd/Grid_vector_types.h
@@ -129,7 +129,10 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) {
    #include "Grid_generic.h"
  #endif
 #endif
-
+// A64FX with gcc 10
 #ifdef A64FXGCC
 #include "Grid_a64fx-fixedsize.h"
 #endif
 #ifdef SSE4
 #include "Grid_sse4.h"
 #endif