diff options
author | vboxsync <vboxsync@cfe28804-0f27-0410-a406-dd0f0b0b656f> | 2022-08-17 00:59:31 +0000 |
---|---|---|
committer | vboxsync <vboxsync@cfe28804-0f27-0410-a406-dd0f0b0b656f> | 2022-08-17 00:59:31 +0000 |
commit | 70c69d24b92dc0d0cb460a8295cb471123ea8275 (patch) | |
tree | b80eaa5a93dcecda36981119b4e2cfe87dceaf80 /src/VBox/Runtime/common/math/sin.asm | |
parent | 638829feab70a2544096126da97cbb008b26f87f (diff) | |
download | VirtualBox-svn-70c69d24b92dc0d0cb460a8295cb471123ea8275.tar.gz |
IPRT/nocrt: Reworking the sin and cos code to take into account which ranges FCOS and FSIN instructions are expected to deliver reasonable results and handle extreme values better. bugref:10261
git-svn-id: https://www.virtualbox.org/svn/vbox/trunk@96240 cfe28804-0f27-0410-a406-dd0f0b0b656f
Diffstat (limited to 'src/VBox/Runtime/common/math/sin.asm')
-rw-r--r-- | src/VBox/Runtime/common/math/sin.asm | 473 |
1 files changed, 442 insertions, 31 deletions
diff --git a/src/VBox/Runtime/common/math/sin.asm b/src/VBox/Runtime/common/math/sin.asm index 7178a7696f7..b8a6b31581b 100644 --- a/src/VBox/Runtime/common/math/sin.asm +++ b/src/VBox/Runtime/common/math/sin.asm @@ -24,48 +24,459 @@ ; terms and conditions of either the GPL or the CDDL or both. ; + +%define RT_ASM_WITH_SEH64 %include "iprt/asmdefs.mac" +%include "iprt/x86.mac" + BEGINCODE ;; -; Compute the sine of rd -; @returns st(0)/xmm0 -; @param rd [xSP + xCB*2] / xmm0 +; Internal sine and cosine worker that calculates the sine of st0 returning +; it in st0. +; +; When called by a sine function, fabs(st0) >= pi/2. +; When called by a cosine function, fabs(original input value) >= 3pi/8. +; +; That the input isn't a tiny number close to zero, means that we can do a bit +; cruder rounding when operating close to a pi/2 boundrary. The value in the +; ecx register indicates the input precision and controls the crudeness of the +; rounding. +; +; @returns st0 = sine +; @param st0 A finite number to calucate sine of. +; @param ecx Set to 0 if original input was a 32-bit float. +; Set to 1 if original input was a 64-bit double. +; set to 2 if original input was a 80-bit long double. +; +BEGINPROC rtNoCrtMathSinCore + push xBP + SEH64_PUSH_xBP + mov xBP, xSP + SEH64_SET_FRAME_xBP 0 + SEH64_END_PROLOGUE + + ; + ; Load the pointer to the rounding crudeness factor into xDX. + ; + lea xDX, [.s_ar64NearZero xWrtRIP] + lea xDX, [xDX + xCX * xCB] + + ; + ; Finite number. We want it in the range [0,2pi] and will preform + ; a remainder division if it isn't. + ; + fcom qword [.s_r64Max xWrtRIP] ; compares st0 and 2*pi + fnstsw ax + test ax, X86_FSW_C3 | X86_FSW_C0 | X86_FSW_C2 ; C3 := st0 == mem; C0 := st0 < mem; C2 := unordered (should be the case); + jz .reduce_st0 ; Jump if st0 > mem + + fcom qword [.s_r64Min xWrtRIP] ; compares st0 and 0.0 + fnstsw ax + test ax, X86_FSW_C3 | X86_FSW_C0 + jnz .reduce_st0 ; Jump if st0 <= mem + + ; + ; We get here if st0 is in the [0,2pi] range. + ; + ; Now, FSIN is documented to be reasonably accurate for the range + ; -3pi/4 to +3pi/4, so we have to make some more effort to calculate + ; in that range only. + ; +.in_range: + ; if (st0 < pi) + fldpi + fcom st1 ; compares st0 (pi) with st1 (the normalized value) + fnstsw ax + test ax, X86_FSW_C0 ; st1 > pi + jnz .larger_than_pi + test ax, X86_FSW_C3 + jnz .equals_pi + + ; + ; input in the range [0,pi[ + ; +.smaller_than_pi: + fdiv qword [.s_r64Two xWrtRIP] ; st0 = pi/2 + + ; if (st0 < pi/2) + fcom st1 ; compares st0 (pi/2) with st1 + fnstsw ax + test ax, X86_FSW_C0 ; st1 > pi + jnz .between_half_pi_and_pi + test ax, X86_FSW_C3 + jnz .equals_half_pi + + ; + ; The value is between zero and half pi, including the zero value. + ; + ; This is in range where FSIN works reasonably reliably. So drop the + ; half pi in st0 and do the calculation. + ; +.between_zero_and_half_pi: + ; Check if we're so close to pi/2 that it makes no difference. + fsub st0, st1 ; st0 = pi/2 - st1 + fcom qword [xDX] + fnstsw ax + test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number. + jnz .equals_half_pi + ffreep st0 + + ; Check if we're so close to zero that it makes no difference given the + ; internal accuracy of the FPU. + fcom qword [xDX] + fnstsw ax + test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number. + jnz .equals_zero_popped_one + + ; Ok, calculate sine. + fsin + jmp .return + + ; + ; The value is in the range ]pi/2,pi[ + ; + ; This is outside the comfortable FSIN range, but if we subtract PI and + ; move to the ]-pi/2,0[ range we just have to change the sign to get + ; the value we want. + ; +.between_half_pi_and_pi: + ; Check if we're so close to pi/2 that it makes no difference. + fsubr st0, st1 ; st0 = st1 - st0 + fcom qword [xDX] + fnstsw ax + test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number. + jnz .equals_half_pi + ffreep st0 + + ; Check if we're so close to pi that it makes no difference. + fldpi + fsub st0, st1 ; st0 = st0 - st1 + fcom qword [xDX] + fnstsw ax + test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number. + jnz .equals_pi + ffreep st0 + + ; Ok, transform the value and calculate sine. + fldpi + fsubp st1, st0 + + fsin + fchs + jmp .return + + ; + ; input in the range ]pi,2pi[ + ; +.larger_than_pi: + fsub st1, st0 ; st1 -= pi + fdiv qword [.s_r64Two xWrtRIP] ; st0 = pi/2 + + ; if (st0 < pi/2) + fcom st1 ; compares st0 (pi/2) with reduced st1 + fnstsw ax + test ax, X86_FSW_C0 ; st1 > pi + jnz .between_3_half_pi_and_2pi + test ax, X86_FSW_C3 + jnz .equals_3_half_pi + + ; + ; The value is in the the range: ]pi,3pi/2[ + ; + ; The actual st0 is in the range ]pi,pi/2[ where FSIN is performing okay + ; and we can get the desired result by changing the sign (-FSIN). + ; +.between_pi_and_3_half_pi: + ; Check if we're so close to pi/2 that it makes no difference. + fsub st0, st1 ; st0 = pi/2 - st1 + fcom qword [xDX] + fnstsw ax + test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number. + jnz .equals_3_half_pi + ffreep st0 + + ; Check if we're so close to zero that it makes no difference given the + ; internal accuracy of the FPU. + fcom qword [xDX] + fnstsw ax + test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number. + jnz .equals_pi_popped + + ; Ok, calculate sine and flip the sign. + fsin + fchs + jmp .return + + ; + ; The value is in the last pi/2 of the range: ]3pi/2,2pi[ + ; + ; Since FSIN should work reasonably well for ]-pi/2,pi], we can just + ; subtract pi again (we subtracted pi at .larger_than_pi above) and + ; run FSIN on it. (st1 is currently in the range ]pi/2,pi[.) + ; +.between_3_half_pi_and_2pi: + ; Check if we're so close to pi/2 that it makes no difference. + fsubr st0, st1 ; st0 = st1 - st0 + fcom qword [xDX] + fnstsw ax + test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number. + jnz .equals_3_half_pi + ffreep st0 + + ; Check if we're so close to pi that it makes no difference. + fldpi + fsub st0, st1 ; st0 = st0 - st1 + fcom qword [xDX] + fnstsw ax + test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number. + jnz .equals_2pi + ffreep st0 + + ; Ok, adjust input and calculate sine. + fldpi + fsubp st1, st0 + fsin + jmp .return + + ; + ; sin(0) = 0 + ; sin(pi) = 0 + ; +.equals_zero: +.equals_pi: +.equals_2pi: + ffreep st0 +.equals_zero_popped_one: +.equals_pi_popped: + ffreep st0 + fldz + jmp .return + + ; + ; sin(pi/2) = 1 + ; +.equals_half_pi: + ffreep st0 + ffreep st0 + fld1 + jmp .return + + ; + ; sin(3*pi/2) = -1 + ; +.equals_3_half_pi: + ffreep st0 + ffreep st0 + fld1 + fchs + jmp .return + + ; + ; Return. + ; +.return: + leave + ret + + ; + ; Reduce st0 by reminder division by PI*2. The result should be positive here. + ; + ;; @todo this is one of our weak spots (really any calculation involving PI is). +.reduce_st0: + fldpi + fadd st0, st0 + fxch st1 ; st0=input (dividend) st1=2pi (divisor) +.again: + fprem1 + fnstsw ax + test ah, (X86_FSW_C2 >> 8) ; C2 is set if partial result. + jnz .again ; Loop till C2 == 0 and we have a final result. + + ; + ; Make sure the result is positive. + ; + fxam + fnstsw ax + test ax, X86_FSW_C1 ; The sign bit + jz .reduced_to_positive + + fadd st0, st1 ; st0 += 2pi, which should make it positive + +%ifdef RT_STRICT + fxam + fnstsw ax + test ax, X86_FSW_C1 + jz .reduced_to_positive + int3 +%endif + +.reduced_to_positive: + fstp st1 ; Get rid of the 2pi value. + jmp .in_range + +ALIGNCODE(8) +.s_r64Max: + dq +6.28318530717958647692 ; 2*pi +.s_r64Min: + dq 0.0 +.s_r64Two: + dq 2.0 + ;; + ; Close to 2/pi rounding limits for 32-bit, 64-bit and 80-bit floating point operations. + ; Given that the original input is at least +/-3pi/8 (1.178) and that precision of the + ; PI constant used during reduction/whatever, I think we can round to a whole pi/2 + ; step when we get close enough. + ; + ; Look to RTFLOAT64U for the format details, but 52 is the shift for the exponent field + ; and 1023 is the exponent bias. Since the format uses an implied 1 in the mantissa, + ; we only have to set the exponent to get a valid number. + ; +.s_ar64NearZero: + dq (-18 + 1023) << 52 ; float / 32-bit / single precision input + dq (-40 + 1023) << 52 ; double / 64-bit / double precision input + dq (-52 + 1023) << 52 ; long double / 80-bit / extended precision input +ENDPROC rtNoCrtMathSinCore + + +;; +; Compute the sine of rd, measured in radians. +; +; @returns st(0) / xmm0 +; @param rd [rbp + xCB*2] / xmm0 +; RT_NOCRT_BEGINPROC sin - push xBP - mov xBP, xSP + push xBP + SEH64_PUSH_xBP + mov xBP, xSP + SEH64_SET_FRAME_xBP 0 + sub xSP, 20h + SEH64_ALLOCATE_STACK 20h + SEH64_END_PROLOGUE -%ifdef RT_ARCH_AMD64 - sub xSP, 10h +%ifdef RT_OS_WINDOWS + ; + ; Make sure we use full precision and not the windows default of 53 bits. + ; + fnstcw [xBP - 20h] + mov ax, [xBP - 20h] + or ax, X86_FCW_PC_64 ; includes both bits, so no need to clear the mask. + mov [xBP - 1ch], ax + fldcw [xBP - 1ch] +%endif - movsd [xSP], xmm0 - fld qword [xSP] + ; + ; Load the input into st0. + ; +%ifdef RT_ARCH_AMD64 + movsd [xBP - 10h], xmm0 + fld qword [xBP - 10h] %else - fld qword [xBP + xCB*2] + fld qword [xBP + xCB*2] %endif - fsin - fnstsw ax - test ah, 04h - jz .done - - fldpi - fadd st0 - fxch st1 -.again: - fprem1 - fnstsw ax - test ah, 04h - jnz .again - fstp st1 - fsin - -.done: + + ; + ; We examin the input and weed out non-finit numbers first. + ; + fxam + fnstsw ax + and ax, X86_FSW_C3 | X86_FSW_C2 | X86_FSW_C0 + cmp ax, X86_FSW_C2 ; Normal finite number (excluding zero) + je .finite + cmp ax, X86_FSW_C3 ; Zero + je .zero + cmp ax, X86_FSW_C3 | X86_FSW_C2 ; Denormals - treat them as zero. + je .zero + cmp ax, X86_FSW_C0 ; NaN - must handle it special, + je .nan + + ; Pass infinities and unsupported inputs to fsin, assuming it does the right thing. +.do_sin: + fsin + jmp .return_val + + ; + ; Finite number. + ; +.finite: + ; For very tiny numbers, 0 < abs(input) < 2**-25, we can return the + ; input value directly. + fld st0 ; duplicate st0 + fabs ; make it an absolute (positive) value. + fld qword [.s_r64Tiny xWrtRIP] + fcomip st1 ; compare s_r64Tiny and fabs(input) + ja .return_tiny_number_as_is ; jump if fabs(input) is smaller + + ; FSIN is documented to be reasonable for the range ]-3pi/4,3pi/4[, so + ; while we have fabs(input) loaded already, check for that here and + ; allow rtNoCrtMathSinCore to assume it won't see values very close to + ; zero, except by cos -> sin conversion where they won't be relevant to + ; any assumpttions about precision approximation. + fld qword [.s_r64FSinOkay xWrtRIP] + fcomip st1 + ffreep st0 ; drop the fabs(input) value + ja .do_sin + + ; + ; Call common sine/cos worker. + ; + mov ecx, 1 ; double + extern NAME(rtNoCrtMathSinCore) + call NAME(rtNoCrtMathSinCore) + + ; + ; Run st0. + ; +.return_val: %ifdef RT_ARCH_AMD64 - fstp qword [xSP] - movsd xmm0, [xSP] + fstp qword [xBP - 10h] + movsd xmm0, [xBP - 10h] +%endif +%ifdef RT_OS_WINDOWS + fldcw [xBP - 20h] ; restore original %endif - leave - ret +.return: + leave + ret + + ; + ; As explained already, we can return tiny numbers directly too as the + ; output from sin(input) = input given our precision. + ; We can skip the st0 -> xmm0 translation here, so follow the same path + ; as .zero & .nan, after we've removed the fabs(input) value. + ; +.return_tiny_number_as_is: + ffreep st0 + + ; + ; sin(+/-0.0) = +/-0.0 (preserve the sign) + ; We can skip the st0 -> xmm0 translation here, so follow the .nan code path. + ; +.zero: + + ; + ; Input is NaN, output it unmodified as far as we can (FLD changes SNaN + ; to QNaN when masked). + ; +.nan: +%ifdef RT_ARCH_AMD64 + ffreep st0 +%endif + jmp .return + +ALIGNCODE(8) + ; Ca. 2**-26, absolute value. Inputs closer to zero than this can be + ; returns directly as the sin(input) value should be basically the same + ; given the precision we're working with and FSIN probably won't even + ; manage that. + ;; @todo experiment when FSIN gets better than this. +.s_r64Tiny: + dq 1.49011612e-8 + ; The absolute limit of FSIN "good" range. +.s_r64FSinOkay: + dq 2.356194490192344928845 ; 3pi/4 + ;dq 1.57079632679489661923 ; pi/2 - alternative. + ENDPROC RT_NOCRT(sin) |