From de2e1ca9d3d87e74c0c20c1e4ad3c32b31a5875b Mon Sep 17 00:00:00 2001 From: Mike Pall Date: Wed, 7 Dec 2022 18:38:22 +0100 Subject: Disable FMA by default. Use -Ofma or jit.opt.start("+fma") to enable. See the discussion in #918 for the rationale. --- doc/running.html | 8 ++++++++ src/lj_asm_arm.h | 6 +++++- src/lj_asm_arm64.h | 3 ++- src/lj_asm_ppc.h | 3 ++- src/lj_jit.h | 4 +++- src/lj_vmmath.c | 13 ++++++++++++- src/vm_arm64.dasc | 4 +++- 7 files changed, 35 insertions(+), 6 deletions(-) diff --git a/doc/running.html b/doc/running.html index 9979d223..edc049fb 100644 --- a/doc/running.html +++ b/doc/running.html @@ -220,6 +220,12 @@ mix the three forms, but note that setting an optimization level overrides all earlier flags.

+Note that -Ofma is not enabled by default at any level, +because it affects floating-point result accuracy. Only enable this +if you fully understand the trade-offs of FMA for performance (higher), +determinism (lower) and numerical accuracy (higher). +

+

Here are the available flags and at what optimization levels they are enabled:

@@ -251,6 +257,8 @@ are enabled: sink  •Allocation/Store Sinking fuse  •Fusion of operands into instructions + +fma    Fused multiply-add

Here are the parameters and their default settings: diff --git a/src/lj_asm_arm.h b/src/lj_asm_arm.h index 326330f4..ba6267ec 100644 --- a/src/lj_asm_arm.h +++ b/src/lj_asm_arm.h @@ -313,7 +313,11 @@ static void asm_fusexref(ASMState *as, ARMIns ai, Reg rd, IRRef ref, } #if !LJ_SOFTFP -/* Fuse to multiply-add/sub instruction. */ +/* +** Fuse to multiply-add/sub instruction. +** VMLA rounds twice (UMA, not FMA) -- no need to check for JIT_F_OPT_FMA. +** VFMA needs VFPv4, which is uncommon on the remaining ARM32 targets. +*/ static int asm_fusemadd(ASMState *as, IRIns *ir, ARMIns ai, ARMIns air) { IRRef lref = ir->op1, rref = ir->op2; diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h index 4e34b3be..805ea54b 100644 --- a/src/lj_asm_arm64.h +++ b/src/lj_asm_arm64.h @@ -337,7 +337,8 @@ static int asm_fusemadd(ASMState *as, IRIns *ir, A64Ins ai, A64Ins air) { IRRef lref = ir->op1, rref = ir->op2; IRIns *irm; - if (lref != rref && + if ((as->flags & JIT_F_OPT_FMA) && + lref != rref && ((mayfuse(as, lref) && (irm = IR(lref), irm->o == IR_MUL) && ra_noreg(irm->r)) || (mayfuse(as, rref) && (irm = IR(rref), irm->o == IR_MUL) && diff --git a/src/lj_asm_ppc.h b/src/lj_asm_ppc.h index 546b8e5d..aa818745 100644 --- a/src/lj_asm_ppc.h +++ b/src/lj_asm_ppc.h @@ -235,7 +235,8 @@ static int asm_fusemadd(ASMState *as, IRIns *ir, PPCIns pi, PPCIns pir) { IRRef lref = ir->op1, rref = ir->op2; IRIns *irm; - if (lref != rref && + if ((as->flags & JIT_F_OPT_FMA) && + lref != rref && ((mayfuse(as, lref) && (irm = IR(lref), irm->o == IR_MUL) && ra_noreg(irm->r)) || (mayfuse(as, rref) && (irm = IR(rref), irm->o == IR_MUL) && diff --git a/src/lj_jit.h b/src/lj_jit.h index 32b3861a..7f081730 100644 --- a/src/lj_jit.h +++ b/src/lj_jit.h @@ -87,10 +87,11 @@ #define JIT_F_OPT_ABC (JIT_F_OPT << 7) #define JIT_F_OPT_SINK (JIT_F_OPT << 8) #define JIT_F_OPT_FUSE (JIT_F_OPT << 9) +#define JIT_F_OPT_FMA (JIT_F_OPT << 10) /* Optimizations names for -O. Must match the order above. 
*/ #define JIT_F_OPTSTRING \ - "\4fold\3cse\3dce\3fwd\3dse\6narrow\4loop\3abc\4sink\4fuse" + "\4fold\3cse\3dce\3fwd\3dse\6narrow\4loop\3abc\4sink\4fuse\3fma" /* Optimization levels set a fixed combination of flags. */ #define JIT_F_OPT_0 0 @@ -99,6 +100,7 @@ #define JIT_F_OPT_3 (JIT_F_OPT_2|\ JIT_F_OPT_FWD|JIT_F_OPT_DSE|JIT_F_OPT_ABC|JIT_F_OPT_SINK|JIT_F_OPT_FUSE) #define JIT_F_OPT_DEFAULT JIT_F_OPT_3 +/* Note: FMA is not set by default. */ /* -- JIT engine parameters ----------------------------------------------- */ diff --git a/src/lj_vmmath.c b/src/lj_vmmath.c index b6cc60ba..d0febd81 100644 --- a/src/lj_vmmath.c +++ b/src/lj_vmmath.c @@ -36,6 +36,17 @@ LJ_FUNCA double lj_wrap_fmod(double x, double y) { return fmod(x, y); } /* -- Helper functions ---------------------------------------------------- */ +/* Required to prevent the C compiler from applying FMA optimizations. +** +** Yes, there's -ffp-contract and the FP_CONTRACT pragma ... in theory. +** But the current state of C compilers is a mess in this regard. +** Also, this function is not performance sensitive at all. 
+*/ +LJ_NOINLINE static double lj_vm_floormul(double x, double y) +{ + return lj_vm_floor(x / y) * y; +} + double lj_vm_foldarith(double x, double y, int op) { switch (op) { @@ -43,7 +54,7 @@ double lj_vm_foldarith(double x, double y, int op) case IR_SUB - IR_ADD: return x-y; break; case IR_MUL - IR_ADD: return x*y; break; case IR_DIV - IR_ADD: return x/y; break; - case IR_MOD - IR_ADD: return x-lj_vm_floor(x/y)*y; break; + case IR_MOD - IR_ADD: return x-lj_vm_floormul(x, y); break; case IR_POW - IR_ADD: return pow(x, y); break; case IR_NEG - IR_ADD: return -x; break; case IR_ABS - IR_ADD: return fabs(x); break; diff --git a/src/vm_arm64.dasc b/src/vm_arm64.dasc index 3448d0d2..36a036ae 100644 --- a/src/vm_arm64.dasc +++ b/src/vm_arm64.dasc @@ -2636,7 +2636,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop) |.macro ins_arithmod, res, reg1, reg2 | fdiv d2, reg1, reg2 | frintm d2, d2 - | fmsub res, d2, reg2, reg1 + | // Cannot use fmsub, because FMA is not enabled by default. + | fmul d2, d2, reg2 + | fsub res, reg1, d2 |.endmacro | |.macro ins_arithdn, intins, fpins -- cgit v1.2.1