mirror of
https://github.com/paboyle/Grid.git
synced 2025-04-09 21:50:45 +01:00
testing gcc 10.0.1: build errors in Exchange1 using -DA64FX and in Lattice_base.h building Dslash only
This commit is contained in:
parent
6fdce60492
commit
64b72fc17f
@ -32,7 +32,12 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
|
|||||||
|
|
||||||
#if defined(A64FXASM)
|
#if defined(A64FXASM)
|
||||||
|
|
||||||
#pragma message("invoking A64FX Dslash")
|
// include here if A64FX was not defined
|
||||||
|
#ifndef A64FX
|
||||||
|
#include <arm_sve.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#pragma message("specialize A64FX Dslash")
|
||||||
|
|
||||||
// undefine everything
|
// undefine everything
|
||||||
#include <simd/Fujitsu_A64FX_undef.h>
|
#include <simd/Fujitsu_A64FX_undef.h>
|
||||||
|
@ -34,6 +34,8 @@
|
|||||||
#define DIR7_RECON TP_RECON_ACCUM
|
#define DIR7_RECON TP_RECON_ACCUM
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#pragma message("this should not happen")
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Comms then compute kernel
|
// Comms then compute kernel
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
@ -38,8 +38,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
#define LOCK_GAUGE(A)
|
#define LOCK_GAUGE(A)
|
||||||
#define UNLOCK_GAUGE(A)
|
#define UNLOCK_GAUGE(A)
|
||||||
#define MASK_REGS DECLARATIONS_A64FXd
|
#define MASK_REGS DECLARATIONS_A64FXd
|
||||||
#define COMPLEX_SIGNS(A)
|
|
||||||
#define LOAD64(A,B)
|
|
||||||
#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B)
|
#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B)
|
||||||
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir)
|
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir)
|
||||||
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd
|
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd
|
||||||
|
@ -38,8 +38,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
#define LOCK_GAUGE(A)
|
#define LOCK_GAUGE(A)
|
||||||
#define UNLOCK_GAUGE(A)
|
#define UNLOCK_GAUGE(A)
|
||||||
#define MASK_REGS DECLARATIONS_A64FXf
|
#define MASK_REGS DECLARATIONS_A64FXf
|
||||||
#define COMPLEX_SIGNS(A)
|
|
||||||
#define LOAD64(A,B)
|
|
||||||
#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B)
|
#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B)
|
||||||
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir)
|
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir)
|
||||||
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf
|
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf
|
||||||
|
@ -38,8 +38,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
#define LOCK_GAUGE(A)
|
#define LOCK_GAUGE(A)
|
||||||
#define UNLOCK_GAUGE(A)
|
#define UNLOCK_GAUGE(A)
|
||||||
#define MASK_REGS DECLARATIONS_A64FXd
|
#define MASK_REGS DECLARATIONS_A64FXd
|
||||||
#define COMPLEX_SIGNS(A)
|
|
||||||
#define LOAD64(A,B)
|
|
||||||
#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B)
|
#define SAVE_RESULT(A,B) RESULT_A64FXd(A); PREFETCH_RESULT_L2_STORE(B)
|
||||||
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir)
|
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXd(Dir)
|
||||||
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd
|
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXd
|
||||||
@ -111,7 +109,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
pg1 = svptrue_b64(); \
|
pg1 = svptrue_b64(); \
|
||||||
svuint64_t table0; \
|
svuint64_t table0; \
|
||||||
svfloat64_t zero0; \
|
svfloat64_t zero0; \
|
||||||
zero0 = __svzero(zero0);
|
zero0 = svdup_f64(0.);
|
||||||
|
|
||||||
#define Chimu_00 Chi_00
|
#define Chimu_00 Chi_00
|
||||||
#define Chimu_01 Chi_01
|
#define Chimu_01 Chi_01
|
||||||
@ -559,18 +557,18 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
|
|
||||||
// ZERO_PSI
|
// ZERO_PSI
|
||||||
#define ZERO_PSI_A64FXd \
|
#define ZERO_PSI_A64FXd \
|
||||||
result_00 = __svzero(result_00); \
|
result_00 = svdup_f64(0.); \
|
||||||
result_01 = __svzero(result_01); \
|
result_01 = svdup_f64(0.); \
|
||||||
result_02 = __svzero(result_02); \
|
result_02 = svdup_f64(0.); \
|
||||||
result_10 = __svzero(result_10); \
|
result_10 = svdup_f64(0.); \
|
||||||
result_11 = __svzero(result_11); \
|
result_11 = svdup_f64(0.); \
|
||||||
result_12 = __svzero(result_12); \
|
result_12 = svdup_f64(0.); \
|
||||||
result_20 = __svzero(result_20); \
|
result_20 = svdup_f64(0.); \
|
||||||
result_21 = __svzero(result_21); \
|
result_21 = svdup_f64(0.); \
|
||||||
result_22 = __svzero(result_22); \
|
result_22 = svdup_f64(0.); \
|
||||||
result_30 = __svzero(result_30); \
|
result_30 = svdup_f64(0.); \
|
||||||
result_31 = __svzero(result_31); \
|
result_31 = svdup_f64(0.); \
|
||||||
result_32 = __svzero(result_32);
|
result_32 = svdup_f64(0.);
|
||||||
|
|
||||||
// PREFETCH_RESULT_L2_STORE (prefetch store to L2)
|
// PREFETCH_RESULT_L2_STORE (prefetch store to L2)
|
||||||
#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \
|
#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXd(base) \
|
||||||
|
@ -38,8 +38,6 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
#define LOCK_GAUGE(A)
|
#define LOCK_GAUGE(A)
|
||||||
#define UNLOCK_GAUGE(A)
|
#define UNLOCK_GAUGE(A)
|
||||||
#define MASK_REGS DECLARATIONS_A64FXf
|
#define MASK_REGS DECLARATIONS_A64FXf
|
||||||
#define COMPLEX_SIGNS(A)
|
|
||||||
#define LOAD64(A,B)
|
|
||||||
#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B)
|
#define SAVE_RESULT(A,B) RESULT_A64FXf(A); PREFETCH_RESULT_L2_STORE(B)
|
||||||
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir)
|
#define MULT_2SPIN_1(Dir) MULT_2SPIN_1_A64FXf(Dir)
|
||||||
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf
|
#define MULT_2SPIN_2 MULT_2SPIN_2_A64FXf
|
||||||
@ -111,7 +109,7 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
pg1 = svptrue_b32(); \
|
pg1 = svptrue_b32(); \
|
||||||
svuint32_t table0; \
|
svuint32_t table0; \
|
||||||
svfloat32_t zero0; \
|
svfloat32_t zero0; \
|
||||||
zero0 = __svzero(zero0);
|
zero0 = svdup_f32(0.);
|
||||||
|
|
||||||
#define Chimu_00 Chi_00
|
#define Chimu_00 Chi_00
|
||||||
#define Chimu_01 Chi_01
|
#define Chimu_01 Chi_01
|
||||||
@ -559,18 +557,18 @@ Author: Nils Meyer <nils.meyer@ur.de>
|
|||||||
|
|
||||||
// ZERO_PSI
|
// ZERO_PSI
|
||||||
#define ZERO_PSI_A64FXf \
|
#define ZERO_PSI_A64FXf \
|
||||||
result_00 = __svzero(result_00); \
|
result_00 = svdup_f32(0.); \
|
||||||
result_01 = __svzero(result_01); \
|
result_01 = svdup_f32(0.); \
|
||||||
result_02 = __svzero(result_02); \
|
result_02 = svdup_f32(0.); \
|
||||||
result_10 = __svzero(result_10); \
|
result_10 = svdup_f32(0.); \
|
||||||
result_11 = __svzero(result_11); \
|
result_11 = svdup_f32(0.); \
|
||||||
result_12 = __svzero(result_12); \
|
result_12 = svdup_f32(0.); \
|
||||||
result_20 = __svzero(result_20); \
|
result_20 = svdup_f32(0.); \
|
||||||
result_21 = __svzero(result_21); \
|
result_21 = svdup_f32(0.); \
|
||||||
result_22 = __svzero(result_22); \
|
result_22 = svdup_f32(0.); \
|
||||||
result_30 = __svzero(result_30); \
|
result_30 = svdup_f32(0.); \
|
||||||
result_31 = __svzero(result_31); \
|
result_31 = svdup_f32(0.); \
|
||||||
result_32 = __svzero(result_32);
|
result_32 = svdup_f32(0.);
|
||||||
|
|
||||||
// PREFETCH_RESULT_L2_STORE (prefetch store to L2)
|
// PREFETCH_RESULT_L2_STORE (prefetch store to L2)
|
||||||
#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base) \
|
#define PREFETCH_RESULT_L2_STORE_INTERNAL_A64FXf(base) \
|
||||||
|
@ -385,7 +385,7 @@ struct MultComplex{
|
|||||||
svbool_t pg1 = acle<T>::pg1();
|
svbool_t pg1 = acle<T>::pg1();
|
||||||
typename acle<T>::vt a_v = svld1(pg1, a.v);
|
typename acle<T>::vt a_v = svld1(pg1, a.v);
|
||||||
typename acle<T>::vt b_v = svld1(pg1, b.v);
|
typename acle<T>::vt b_v = svld1(pg1, b.v);
|
||||||
typename acle<T>::vt z_v = __svzero(z_v);
|
typename acle<T>::vt z_v = acle<T>::zero();
|
||||||
|
|
||||||
// using FCMLA
|
// using FCMLA
|
||||||
typename acle<T>::vt r_v = svcmla_x(pg1, z_v, a_v, b_v, 90);
|
typename acle<T>::vt r_v = svcmla_x(pg1, z_v, a_v, b_v, 90);
|
||||||
|
@ -118,7 +118,10 @@ accelerator_inline Grid_half sfw_float_to_half(float ff) {
|
|||||||
#ifdef GEN
|
#ifdef GEN
|
||||||
#if defined(A64FX) // breakout A64FX SVE ACLE here
|
#if defined(A64FX) // breakout A64FX SVE ACLE here
|
||||||
//#pragma message("building for A64FX / SVE ACLE")
|
//#pragma message("building for A64FX / SVE ACLE")
|
||||||
#define ARMCLANGHOTFIX
|
#if defined(clang)
|
||||||
|
#define ARMCLANGHOTFIX // armclang 20.0 compiles, but binaries give wrong results without hotfix
|
||||||
|
#endif
|
||||||
|
#include <arm_sve.h>
|
||||||
#include "Grid_a64fx-2.h"
|
#include "Grid_a64fx-2.h"
|
||||||
#else
|
#else
|
||||||
#include "Grid_generic.h"
|
#include "Grid_generic.h"
|
||||||
|
@ -201,12 +201,20 @@ int main (int argc, char ** argv)
|
|||||||
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
double volume=Ls; for(int mu=0;mu<Nd;mu++) volume=volume*latt4[mu];
|
||||||
double flops=single_site_flops*volume*ncall;
|
double flops=single_site_flops*volume*ncall;
|
||||||
|
|
||||||
|
// RF/L1: 4d Wilson
|
||||||
|
double data_L1 = (volume * 180 * 64 / 4 * ncall) / (1024.*1024.*1024.);
|
||||||
|
|
||||||
|
// L2 throughput
|
||||||
|
double data_L2 = (volume * 9 * 12 * 64 / 4 * ncall + (volume/Ls) * 8*9 * 64/4) / (1024.*1024.*1024.);
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
|
std::cout<<GridLogMessage << "Called Dw "<<ncall<<" times in "<<t1-t0<<" us"<<std::endl;
|
||||||
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
// std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
||||||
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
|
// std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
|
||||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||||
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
|
std::cout<<GridLogMessage << "mflop/s per rank = "<< flops/(t1-t0)/NP<<std::endl;
|
||||||
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
|
std::cout<<GridLogMessage << "mflop/s per node = "<< flops/(t1-t0)/NN<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "RF/L1 GiB/s (base 2) = "<< 1000000. * data_L1/((t1-t0))<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "L2 GiB/s (base 2) = "<< 1000000. * data_L2/((t1-t0))<<std::endl;
|
||||||
err = ref-result;
|
err = ref-result;
|
||||||
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
||||||
//exit(0);
|
//exit(0);
|
||||||
|
@ -152,6 +152,7 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
|
std::cout<<GridLogMessage << "Calling Dw"<<std::endl;
|
||||||
int ncall=1000;
|
int ncall=1000;
|
||||||
|
//int ncall=1;
|
||||||
double t0=usecond();
|
double t0=usecond();
|
||||||
for(int i=0;i<ncall;i++){
|
for(int i=0;i<ncall;i++){
|
||||||
Dw.Dhop(src,result,0);
|
Dw.Dhop(src,result,0);
|
||||||
@ -173,12 +174,14 @@ int main (int argc, char ** argv)
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
double data = (volume * 180 * 64 / 4 * ncall) / (1024.*1024.*1024.);
|
||||||
|
|
||||||
std::cout<<GridLogMessage << "Called Dw"<<std::endl;
|
std::cout<<GridLogMessage << "Called Dw"<<std::endl;
|
||||||
std::cout<<GridLogMessage << "flops per site " << single_site_flops << std::endl;
|
std::cout<<GridLogMessage << "flops per site " << single_site_flops << std::endl;
|
||||||
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
std::cout<<GridLogMessage << "norm result "<< norm2(result)<<std::endl;
|
||||||
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
|
std::cout<<GridLogMessage << "norm ref "<< norm2(ref)<<std::endl;
|
||||||
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
std::cout<<GridLogMessage << "mflop/s = "<< flops/(t1-t0)<<std::endl;
|
||||||
|
std::cout<<GridLogMessage << "GiB/s (base 2) = "<< 1000000. * data/((t1-t0))<<std::endl;
|
||||||
err = ref-result;
|
err = ref-result;
|
||||||
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
std::cout<<GridLogMessage << "norm diff "<< norm2(err)<<std::endl;
|
||||||
|
|
||||||
|
@ -132,7 +132,10 @@ void bench_wilson (
|
|||||||
for(int i=0; i<ncall; i++) { Dw.Dhop(src,result,dag); }
|
for(int i=0; i<ncall; i++) { Dw.Dhop(src,result,dag); }
|
||||||
double t1 = usecond();
|
double t1 = usecond();
|
||||||
double flops = single_site_flops * volume * ncall;
|
double flops = single_site_flops * volume * ncall;
|
||||||
|
double data_tp = (volume * 180 * 64 * ncall) / 1000.; // / (1024.*1024.*1024.);
|
||||||
|
//std::cout << flops/(t1-t0) << " (" << data_tp/(t1-t0) << " MB/s) \t";
|
||||||
std::cout << flops/(t1-t0) << "\t\t";
|
std::cout << flops/(t1-t0) << "\t\t";
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void bench_wilson_eo (
|
void bench_wilson_eo (
|
||||||
|
Loading…
x
Reference in New Issue
Block a user