summaryrefslogtreecommitdiff
path: root/src/VBox/Runtime/common/math/sin.asm
diff options
context:
space:
mode:
authorvboxsync <vboxsync@cfe28804-0f27-0410-a406-dd0f0b0b656f>2022-08-17 00:59:31 +0000
committervboxsync <vboxsync@cfe28804-0f27-0410-a406-dd0f0b0b656f>2022-08-17 00:59:31 +0000
commit70c69d24b92dc0d0cb460a8295cb471123ea8275 (patch)
treeb80eaa5a93dcecda36981119b4e2cfe87dceaf80 /src/VBox/Runtime/common/math/sin.asm
parent638829feab70a2544096126da97cbb008b26f87f (diff)
downloadVirtualBox-svn-70c69d24b92dc0d0cb460a8295cb471123ea8275.tar.gz
IPRT/nocrt: Reworking the sin and cos code to take into account which ranges FCOS and FSIN instructions are expected to deliver reasonable results and handle extreme values better. bugref:10261
git-svn-id: https://www.virtualbox.org/svn/vbox/trunk@96240 cfe28804-0f27-0410-a406-dd0f0b0b656f
Diffstat (limited to 'src/VBox/Runtime/common/math/sin.asm')
-rw-r--r--src/VBox/Runtime/common/math/sin.asm473
1 files changed, 442 insertions, 31 deletions
diff --git a/src/VBox/Runtime/common/math/sin.asm b/src/VBox/Runtime/common/math/sin.asm
index 7178a7696f7..b8a6b31581b 100644
--- a/src/VBox/Runtime/common/math/sin.asm
+++ b/src/VBox/Runtime/common/math/sin.asm
@@ -24,48 +24,459 @@
; terms and conditions of either the GPL or the CDDL or both.
;
+
+%define RT_ASM_WITH_SEH64
%include "iprt/asmdefs.mac"
+%include "iprt/x86.mac"
+
BEGINCODE
;;
-; Compute the sine of rd
-; @returns st(0)/xmm0
-; @param rd [xSP + xCB*2] / xmm0
+; Internal sine and cosine worker that calculates the sine of st0 returning
+; it in st0.
+;
+; When called by a sine function, fabs(st0) >= pi/2.
+; When called by a cosine function, fabs(original input value) >= 3pi/8.
+;
+; That the input isn't a tiny number close to zero, means that we can do a bit
+; cruder rounding when operating close to a pi/2 boundrary. The value in the
+; ecx register indicates the input precision and controls the crudeness of the
+; rounding.
+;
+; @returns st0 = sine
+; @param st0 A finite number to calucate sine of.
+; @param ecx Set to 0 if original input was a 32-bit float.
+; Set to 1 if original input was a 64-bit double.
+; set to 2 if original input was a 80-bit long double.
+;
+BEGINPROC rtNoCrtMathSinCore
+ push xBP
+ SEH64_PUSH_xBP
+ mov xBP, xSP
+ SEH64_SET_FRAME_xBP 0
+ SEH64_END_PROLOGUE
+
+ ;
+ ; Load the pointer to the rounding crudeness factor into xDX.
+ ;
+ lea xDX, [.s_ar64NearZero xWrtRIP]
+ lea xDX, [xDX + xCX * xCB]
+
+ ;
+ ; Finite number. We want it in the range [0,2pi] and will preform
+ ; a remainder division if it isn't.
+ ;
+ fcom qword [.s_r64Max xWrtRIP] ; compares st0 and 2*pi
+ fnstsw ax
+ test ax, X86_FSW_C3 | X86_FSW_C0 | X86_FSW_C2 ; C3 := st0 == mem; C0 := st0 < mem; C2 := unordered (should be the case);
+ jz .reduce_st0 ; Jump if st0 > mem
+
+ fcom qword [.s_r64Min xWrtRIP] ; compares st0 and 0.0
+ fnstsw ax
+ test ax, X86_FSW_C3 | X86_FSW_C0
+ jnz .reduce_st0 ; Jump if st0 <= mem
+
+ ;
+ ; We get here if st0 is in the [0,2pi] range.
+ ;
+ ; Now, FSIN is documented to be reasonably accurate for the range
+ ; -3pi/4 to +3pi/4, so we have to make some more effort to calculate
+ ; in that range only.
+ ;
+.in_range:
+ ; if (st0 < pi)
+ fldpi
+ fcom st1 ; compares st0 (pi) with st1 (the normalized value)
+ fnstsw ax
+ test ax, X86_FSW_C0 ; st1 > pi
+ jnz .larger_than_pi
+ test ax, X86_FSW_C3
+ jnz .equals_pi
+
+ ;
+ ; input in the range [0,pi[
+ ;
+.smaller_than_pi:
+ fdiv qword [.s_r64Two xWrtRIP] ; st0 = pi/2
+
+ ; if (st0 < pi/2)
+ fcom st1 ; compares st0 (pi/2) with st1
+ fnstsw ax
+ test ax, X86_FSW_C0 ; st1 > pi
+ jnz .between_half_pi_and_pi
+ test ax, X86_FSW_C3
+ jnz .equals_half_pi
+
+ ;
+ ; The value is between zero and half pi, including the zero value.
+ ;
+ ; This is in range where FSIN works reasonably reliably. So drop the
+ ; half pi in st0 and do the calculation.
+ ;
+.between_zero_and_half_pi:
+ ; Check if we're so close to pi/2 that it makes no difference.
+ fsub st0, st1 ; st0 = pi/2 - st1
+ fcom qword [xDX]
+ fnstsw ax
+ test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
+ jnz .equals_half_pi
+ ffreep st0
+
+ ; Check if we're so close to zero that it makes no difference given the
+ ; internal accuracy of the FPU.
+ fcom qword [xDX]
+ fnstsw ax
+ test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
+ jnz .equals_zero_popped_one
+
+ ; Ok, calculate sine.
+ fsin
+ jmp .return
+
+ ;
+ ; The value is in the range ]pi/2,pi[
+ ;
+ ; This is outside the comfortable FSIN range, but if we subtract PI and
+ ; move to the ]-pi/2,0[ range we just have to change the sign to get
+ ; the value we want.
+ ;
+.between_half_pi_and_pi:
+ ; Check if we're so close to pi/2 that it makes no difference.
+ fsubr st0, st1 ; st0 = st1 - st0
+ fcom qword [xDX]
+ fnstsw ax
+ test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
+ jnz .equals_half_pi
+ ffreep st0
+
+ ; Check if we're so close to pi that it makes no difference.
+ fldpi
+ fsub st0, st1 ; st0 = st0 - st1
+ fcom qword [xDX]
+ fnstsw ax
+ test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
+ jnz .equals_pi
+ ffreep st0
+
+ ; Ok, transform the value and calculate sine.
+ fldpi
+ fsubp st1, st0
+
+ fsin
+ fchs
+ jmp .return
+
+ ;
+ ; input in the range ]pi,2pi[
+ ;
+.larger_than_pi:
+ fsub st1, st0 ; st1 -= pi
+ fdiv qword [.s_r64Two xWrtRIP] ; st0 = pi/2
+
+ ; if (st0 < pi/2)
+ fcom st1 ; compares st0 (pi/2) with reduced st1
+ fnstsw ax
+ test ax, X86_FSW_C0 ; st1 > pi
+ jnz .between_3_half_pi_and_2pi
+ test ax, X86_FSW_C3
+ jnz .equals_3_half_pi
+
+ ;
+ ; The value is in the the range: ]pi,3pi/2[
+ ;
+ ; The actual st0 is in the range ]pi,pi/2[ where FSIN is performing okay
+ ; and we can get the desired result by changing the sign (-FSIN).
+ ;
+.between_pi_and_3_half_pi:
+ ; Check if we're so close to pi/2 that it makes no difference.
+ fsub st0, st1 ; st0 = pi/2 - st1
+ fcom qword [xDX]
+ fnstsw ax
+ test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
+ jnz .equals_3_half_pi
+ ffreep st0
+
+ ; Check if we're so close to zero that it makes no difference given the
+ ; internal accuracy of the FPU.
+ fcom qword [xDX]
+ fnstsw ax
+ test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
+ jnz .equals_pi_popped
+
+ ; Ok, calculate sine and flip the sign.
+ fsin
+ fchs
+ jmp .return
+
+ ;
+ ; The value is in the last pi/2 of the range: ]3pi/2,2pi[
+ ;
+ ; Since FSIN should work reasonably well for ]-pi/2,pi], we can just
+ ; subtract pi again (we subtracted pi at .larger_than_pi above) and
+ ; run FSIN on it. (st1 is currently in the range ]pi/2,pi[.)
+ ;
+.between_3_half_pi_and_2pi:
+ ; Check if we're so close to pi/2 that it makes no difference.
+ fsubr st0, st1 ; st0 = st1 - st0
+ fcom qword [xDX]
+ fnstsw ax
+ test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
+ jnz .equals_3_half_pi
+ ffreep st0
+
+ ; Check if we're so close to pi that it makes no difference.
+ fldpi
+ fsub st0, st1 ; st0 = st0 - st1
+ fcom qword [xDX]
+ fnstsw ax
+ test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
+ jnz .equals_2pi
+ ffreep st0
+
+ ; Ok, adjust input and calculate sine.
+ fldpi
+ fsubp st1, st0
+ fsin
+ jmp .return
+
+ ;
+ ; sin(0) = 0
+ ; sin(pi) = 0
+ ;
+.equals_zero:
+.equals_pi:
+.equals_2pi:
+ ffreep st0
+.equals_zero_popped_one:
+.equals_pi_popped:
+ ffreep st0
+ fldz
+ jmp .return
+
+ ;
+ ; sin(pi/2) = 1
+ ;
+.equals_half_pi:
+ ffreep st0
+ ffreep st0
+ fld1
+ jmp .return
+
+ ;
+ ; sin(3*pi/2) = -1
+ ;
+.equals_3_half_pi:
+ ffreep st0
+ ffreep st0
+ fld1
+ fchs
+ jmp .return
+
+ ;
+ ; Return.
+ ;
+.return:
+ leave
+ ret
+
+ ;
+ ; Reduce st0 by reminder division by PI*2. The result should be positive here.
+ ;
+ ;; @todo this is one of our weak spots (really any calculation involving PI is).
+.reduce_st0:
+ fldpi
+ fadd st0, st0
+ fxch st1 ; st0=input (dividend) st1=2pi (divisor)
+.again:
+ fprem1
+ fnstsw ax
+ test ah, (X86_FSW_C2 >> 8) ; C2 is set if partial result.
+ jnz .again ; Loop till C2 == 0 and we have a final result.
+
+ ;
+ ; Make sure the result is positive.
+ ;
+ fxam
+ fnstsw ax
+ test ax, X86_FSW_C1 ; The sign bit
+ jz .reduced_to_positive
+
+ fadd st0, st1 ; st0 += 2pi, which should make it positive
+
+%ifdef RT_STRICT
+ fxam
+ fnstsw ax
+ test ax, X86_FSW_C1
+ jz .reduced_to_positive
+ int3
+%endif
+
+.reduced_to_positive:
+ fstp st1 ; Get rid of the 2pi value.
+ jmp .in_range
+
+ALIGNCODE(8)
+.s_r64Max:
+ dq +6.28318530717958647692 ; 2*pi
+.s_r64Min:
+ dq 0.0
+.s_r64Two:
+ dq 2.0
+ ;;
+ ; Close to 2/pi rounding limits for 32-bit, 64-bit and 80-bit floating point operations.
+ ; Given that the original input is at least +/-3pi/8 (1.178) and that precision of the
+ ; PI constant used during reduction/whatever, I think we can round to a whole pi/2
+ ; step when we get close enough.
+ ;
+ ; Look to RTFLOAT64U for the format details, but 52 is the shift for the exponent field
+ ; and 1023 is the exponent bias. Since the format uses an implied 1 in the mantissa,
+ ; we only have to set the exponent to get a valid number.
+ ;
+.s_ar64NearZero:
+ dq (-18 + 1023) << 52 ; float / 32-bit / single precision input
+ dq (-40 + 1023) << 52 ; double / 64-bit / double precision input
+ dq (-52 + 1023) << 52 ; long double / 80-bit / extended precision input
+ENDPROC rtNoCrtMathSinCore
+
+
+;;
+; Compute the sine of rd, measured in radians.
+;
+; @returns st(0) / xmm0
+; @param rd [rbp + xCB*2] / xmm0
+;
RT_NOCRT_BEGINPROC sin
- push xBP
- mov xBP, xSP
+ push xBP
+ SEH64_PUSH_xBP
+ mov xBP, xSP
+ SEH64_SET_FRAME_xBP 0
+ sub xSP, 20h
+ SEH64_ALLOCATE_STACK 20h
+ SEH64_END_PROLOGUE
-%ifdef RT_ARCH_AMD64
- sub xSP, 10h
+%ifdef RT_OS_WINDOWS
+ ;
+ ; Make sure we use full precision and not the windows default of 53 bits.
+ ;
+ fnstcw [xBP - 20h]
+ mov ax, [xBP - 20h]
+ or ax, X86_FCW_PC_64 ; includes both bits, so no need to clear the mask.
+ mov [xBP - 1ch], ax
+ fldcw [xBP - 1ch]
+%endif
- movsd [xSP], xmm0
- fld qword [xSP]
+ ;
+ ; Load the input into st0.
+ ;
+%ifdef RT_ARCH_AMD64
+ movsd [xBP - 10h], xmm0
+ fld qword [xBP - 10h]
%else
- fld qword [xBP + xCB*2]
+ fld qword [xBP + xCB*2]
%endif
- fsin
- fnstsw ax
- test ah, 04h
- jz .done
-
- fldpi
- fadd st0
- fxch st1
-.again:
- fprem1
- fnstsw ax
- test ah, 04h
- jnz .again
- fstp st1
- fsin
-
-.done:
+
+ ;
+ ; We examin the input and weed out non-finit numbers first.
+ ;
+ fxam
+ fnstsw ax
+ and ax, X86_FSW_C3 | X86_FSW_C2 | X86_FSW_C0
+ cmp ax, X86_FSW_C2 ; Normal finite number (excluding zero)
+ je .finite
+ cmp ax, X86_FSW_C3 ; Zero
+ je .zero
+ cmp ax, X86_FSW_C3 | X86_FSW_C2 ; Denormals - treat them as zero.
+ je .zero
+ cmp ax, X86_FSW_C0 ; NaN - must handle it special,
+ je .nan
+
+ ; Pass infinities and unsupported inputs to fsin, assuming it does the right thing.
+.do_sin:
+ fsin
+ jmp .return_val
+
+ ;
+ ; Finite number.
+ ;
+.finite:
+ ; For very tiny numbers, 0 < abs(input) < 2**-25, we can return the
+ ; input value directly.
+ fld st0 ; duplicate st0
+ fabs ; make it an absolute (positive) value.
+ fld qword [.s_r64Tiny xWrtRIP]
+ fcomip st1 ; compare s_r64Tiny and fabs(input)
+ ja .return_tiny_number_as_is ; jump if fabs(input) is smaller
+
+ ; FSIN is documented to be reasonable for the range ]-3pi/4,3pi/4[, so
+ ; while we have fabs(input) loaded already, check for that here and
+ ; allow rtNoCrtMathSinCore to assume it won't see values very close to
+ ; zero, except by cos -> sin conversion where they won't be relevant to
+ ; any assumpttions about precision approximation.
+ fld qword [.s_r64FSinOkay xWrtRIP]
+ fcomip st1
+ ffreep st0 ; drop the fabs(input) value
+ ja .do_sin
+
+ ;
+ ; Call common sine/cos worker.
+ ;
+ mov ecx, 1 ; double
+ extern NAME(rtNoCrtMathSinCore)
+ call NAME(rtNoCrtMathSinCore)
+
+ ;
+ ; Run st0.
+ ;
+.return_val:
%ifdef RT_ARCH_AMD64
- fstp qword [xSP]
- movsd xmm0, [xSP]
+ fstp qword [xBP - 10h]
+ movsd xmm0, [xBP - 10h]
+%endif
+%ifdef RT_OS_WINDOWS
+ fldcw [xBP - 20h] ; restore original
%endif
- leave
- ret
+.return:
+ leave
+ ret
+
+ ;
+ ; As explained already, we can return tiny numbers directly too as the
+ ; output from sin(input) = input given our precision.
+ ; We can skip the st0 -> xmm0 translation here, so follow the same path
+ ; as .zero & .nan, after we've removed the fabs(input) value.
+ ;
+.return_tiny_number_as_is:
+ ffreep st0
+
+ ;
+ ; sin(+/-0.0) = +/-0.0 (preserve the sign)
+ ; We can skip the st0 -> xmm0 translation here, so follow the .nan code path.
+ ;
+.zero:
+
+ ;
+ ; Input is NaN, output it unmodified as far as we can (FLD changes SNaN
+ ; to QNaN when masked).
+ ;
+.nan:
+%ifdef RT_ARCH_AMD64
+ ffreep st0
+%endif
+ jmp .return
+
+ALIGNCODE(8)
+ ; Ca. 2**-26, absolute value. Inputs closer to zero than this can be
+ ; returns directly as the sin(input) value should be basically the same
+ ; given the precision we're working with and FSIN probably won't even
+ ; manage that.
+ ;; @todo experiment when FSIN gets better than this.
+.s_r64Tiny:
+ dq 1.49011612e-8
+ ; The absolute limit of FSIN "good" range.
+.s_r64FSinOkay:
+ dq 2.356194490192344928845 ; 3pi/4
+ ;dq 1.57079632679489661923 ; pi/2 - alternative.
+
ENDPROC RT_NOCRT(sin)