VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/math/sincore.asm

Last change on this file was 106061, checked in by vboxsync, 8 weeks ago

Copyright year updates by scm.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 11.2 KB
Line 
1; $Id: sincore.asm 106061 2024-09-16 14:03:52Z vboxsync $
2;; @file
3; IPRT - No-CRT common sin & cos - AMD64 & X86.
4;
5
6;
7; Copyright (C) 2022-2024 Oracle and/or its affiliates.
8;
9; This file is part of VirtualBox base platform packages, as
10; available from https://www.virtualbox.org.
11;
12; This program is free software; you can redistribute it and/or
13; modify it under the terms of the GNU General Public License
14; as published by the Free Software Foundation, in version 3 of the
15; License.
16;
17; This program is distributed in the hope that it will be useful, but
18; WITHOUT ANY WARRANTY; without even the implied warranty of
19; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20; General Public License for more details.
21;
22; You should have received a copy of the GNU General Public License
23; along with this program; if not, see <https://www.gnu.org/licenses>.
24;
25; The contents of this file may alternatively be used under the terms
26; of the Common Development and Distribution License Version 1.0
27; (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
28; in the VirtualBox distribution, in which case the provisions of the
29; CDDL are applicable instead of those of the GPL.
30;
31; You may elect to license modified versions of this file under the
32; terms and conditions of either the GPL or the CDDL or both.
33;
34; SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
35;
36
37
38%define RT_ASM_WITH_SEH64
39%include "iprt/asmdefs.mac"
40%include "iprt/x86.mac"
41
42
43BEGINCODE
44
45;;
46; Internal sine and cosine worker that calculates the sine of st0 returning
47; it in st0.
48;
49; When called by a sine function, fabs(st0) >= pi/2.
50; When called by a cosine function, fabs(original input value) >= 3pi/8.
51;
52; That the input isn't a tiny number close to zero, means that we can do a bit
53; cruder rounding when operating close to a pi/2 boundrary. The value in the
54; ecx register indicates the input precision and controls the crudeness of the
55; rounding.
56;
57; @returns st0 = sine
58; @param st0 A finite number to calucate sine of.
59; @param ecx Set to 0 if original input was a 32-bit float.
60; Set to 1 if original input was a 64-bit double.
61; set to 2 if original input was a 80-bit long double.
62;
63BEGINPROC rtNoCrtMathSinCore
64 push xBP
65 SEH64_PUSH_xBP
66 mov xBP, xSP
67 SEH64_SET_FRAME_xBP 0
68 SEH64_END_PROLOGUE
69
70 ;
71 ; Load the pointer to the rounding crudeness factor into xDX.
72 ;
73 lea xDX, [.s_ar64NearZero xWrtRIP]
74 lea xDX, [xDX + xCX * xCB]
75
76 ;
77 ; Finite number. We want it in the range [0,2pi] and will preform
78 ; a remainder division if it isn't.
79 ;
80 fcom qword [.s_r64Max xWrtRIP] ; compares st0 and 2*pi
81 fnstsw ax
82 test ax, X86_FSW_C3 | X86_FSW_C0 | X86_FSW_C2 ; C3 := st0 == mem; C0 := st0 < mem; C2 := unordered (should be the case);
83 jz .reduce_st0 ; Jump if st0 > mem
84
85 fcom qword [.s_r64Min xWrtRIP] ; compares st0 and 0.0
86 fnstsw ax
87 test ax, X86_FSW_C3 | X86_FSW_C0
88 jnz .reduce_st0 ; Jump if st0 <= mem
89
90 ;
91 ; We get here if st0 is in the [0,2pi] range.
92 ;
93 ; Now, FSIN is documented to be reasonably accurate for the range
94 ; -3pi/4 to +3pi/4, so we have to make some more effort to calculate
95 ; in that range only.
96 ;
97.in_range:
98 ; if (st0 < pi)
99 fldpi
100 fcom st1 ; compares st0 (pi) with st1 (the normalized value)
101 fnstsw ax
102 test ax, X86_FSW_C0 ; st1 > pi
103 jnz .larger_than_pi
104 test ax, X86_FSW_C3
105 jnz .equals_pi
106
107 ;
108 ; input in the range [0,pi[
109 ;
110.smaller_than_pi:
111 fdiv qword [.s_r64Two xWrtRIP] ; st0 = pi/2
112
113 ; if (st0 < pi/2)
114 fcom st1 ; compares st0 (pi/2) with st1
115 fnstsw ax
116 test ax, X86_FSW_C0 ; st1 > pi
117 jnz .between_half_pi_and_pi
118 test ax, X86_FSW_C3
119 jnz .equals_half_pi
120
121 ;
122 ; The value is between zero and half pi, including the zero value.
123 ;
124 ; This is in range where FSIN works reasonably reliably. So drop the
125 ; half pi in st0 and do the calculation.
126 ;
127.between_zero_and_half_pi:
128 ; Check if we're so close to pi/2 that it makes no difference.
129 fsub st0, st1 ; st0 = pi/2 - st1
130 fcom qword [xDX]
131 fnstsw ax
132 test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
133 jnz .equals_half_pi
134 ffreep st0
135
136 ; Check if we're so close to zero that it makes no difference given the
137 ; internal accuracy of the FPU.
138 fcom qword [xDX]
139 fnstsw ax
140 test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
141 jnz .equals_zero_popped_one
142
143 ; Ok, calculate sine.
144 fsin
145 jmp .return
146
147 ;
148 ; The value is in the range ]pi/2,pi[
149 ;
150 ; This is outside the comfortable FSIN range, but if we subtract PI and
151 ; move to the ]-pi/2,0[ range we just have to change the sign to get
152 ; the value we want.
153 ;
154.between_half_pi_and_pi:
155 ; Check if we're so close to pi/2 that it makes no difference.
156 fsubr st0, st1 ; st0 = st1 - st0
157 fcom qword [xDX]
158 fnstsw ax
159 test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
160 jnz .equals_half_pi
161 ffreep st0
162
163 ; Check if we're so close to pi that it makes no difference.
164 fldpi
165 fsub st0, st1 ; st0 = st0 - st1
166 fcom qword [xDX]
167 fnstsw ax
168 test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
169 jnz .equals_pi
170 ffreep st0
171
172 ; Ok, transform the value and calculate sine.
173 fldpi
174 fsubp st1, st0
175
176 fsin
177 fchs
178 jmp .return
179
180 ;
181 ; input in the range ]pi,2pi[
182 ;
183.larger_than_pi:
184 fsub st1, st0 ; st1 -= pi
185 fdiv qword [.s_r64Two xWrtRIP] ; st0 = pi/2
186
187 ; if (st0 < pi/2)
188 fcom st1 ; compares st0 (pi/2) with reduced st1
189 fnstsw ax
190 test ax, X86_FSW_C0 ; st1 > pi
191 jnz .between_3_half_pi_and_2pi
192 test ax, X86_FSW_C3
193 jnz .equals_3_half_pi
194
195 ;
196 ; The value is in the the range: ]pi,3pi/2[
197 ;
198 ; The actual st0 is in the range ]pi,pi/2[ where FSIN is performing okay
199 ; and we can get the desired result by changing the sign (-FSIN).
200 ;
201.between_pi_and_3_half_pi:
202 ; Check if we're so close to pi/2 that it makes no difference.
203 fsub st0, st1 ; st0 = pi/2 - st1
204 fcom qword [xDX]
205 fnstsw ax
206 test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
207 jnz .equals_3_half_pi
208 ffreep st0
209
210 ; Check if we're so close to zero that it makes no difference given the
211 ; internal accuracy of the FPU.
212 fcom qword [xDX]
213 fnstsw ax
214 test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
215 jnz .equals_pi_popped
216
217 ; Ok, calculate sine and flip the sign.
218 fsin
219 fchs
220 jmp .return
221
222 ;
223 ; The value is in the last pi/2 of the range: ]3pi/2,2pi[
224 ;
225 ; Since FSIN should work reasonably well for ]-pi/2,pi], we can just
226 ; subtract pi again (we subtracted pi at .larger_than_pi above) and
227 ; run FSIN on it. (st1 is currently in the range ]pi/2,pi[.)
228 ;
229.between_3_half_pi_and_2pi:
230 ; Check if we're so close to pi/2 that it makes no difference.
231 fsubr st0, st1 ; st0 = st1 - st0
232 fcom qword [xDX]
233 fnstsw ax
234 test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
235 jnz .equals_3_half_pi
236 ffreep st0
237
238 ; Check if we're so close to pi that it makes no difference.
239 fldpi
240 fsub st0, st1 ; st0 = st0 - st1
241 fcom qword [xDX]
242 fnstsw ax
243 test ax, X86_FSW_C0 | X86_FSW_C3 ; st0 <= very small positive number.
244 jnz .equals_2pi
245 ffreep st0
246
247 ; Ok, adjust input and calculate sine.
248 fldpi
249 fsubp st1, st0
250 fsin
251 jmp .return
252
253 ;
254 ; sin(0) = 0
255 ; sin(pi) = 0
256 ;
257.equals_zero:
258.equals_pi:
259.equals_2pi:
260 ffreep st0
261.equals_zero_popped_one:
262.equals_pi_popped:
263 ffreep st0
264 fldz
265 jmp .return
266
267 ;
268 ; sin(pi/2) = 1
269 ;
270.equals_half_pi:
271 ffreep st0
272 ffreep st0
273 fld1
274 jmp .return
275
276 ;
277 ; sin(3*pi/2) = -1
278 ;
279.equals_3_half_pi:
280 ffreep st0
281 ffreep st0
282 fld1
283 fchs
284 jmp .return
285
286 ;
287 ; Return.
288 ;
289.return:
290 leave
291 ret
292
293 ;
294 ; Reduce st0 by reminder division by PI*2. The result should be positive here.
295 ;
296 ;; @todo this is one of our weak spots (really any calculation involving PI is).
297.reduce_st0:
298 fldpi
299 fadd st0, st0
300 fxch st1 ; st0=input (dividend) st1=2pi (divisor)
301.again:
302 fprem1
303 fnstsw ax
304 test ah, (X86_FSW_C2 >> 8) ; C2 is set if partial result.
305 jnz .again ; Loop till C2 == 0 and we have a final result.
306
307 ;
308 ; Make sure the result is positive.
309 ;
310 fxam
311 fnstsw ax
312 test ax, X86_FSW_C1 ; The sign bit
313 jz .reduced_to_positive
314
315 fadd st0, st1 ; st0 += 2pi, which should make it positive
316
317%ifdef RT_STRICT
318 fxam
319 fnstsw ax
320 test ax, X86_FSW_C1
321 jz .reduced_to_positive
322 int3
323%endif
324
325.reduced_to_positive:
326 fstp st1 ; Get rid of the 2pi value.
327 jmp .in_range
328
329ALIGNCODE(8)
330.s_r64Max:
331 dq +6.28318530717958647692 ; 2*pi
332.s_r64Min:
333 dq 0.0
334.s_r64Two:
335 dq 2.0
336 ;;
337 ; Close to 2/pi rounding limits for 32-bit, 64-bit and 80-bit floating point operations.
338 ; Given that the original input is at least +/-3pi/8 (1.178) and that precision of the
339 ; PI constant used during reduction/whatever, I think we can round to a whole pi/2
340 ; step when we get close enough.
341 ;
342 ; Look to RTFLOAT64U for the format details, but 52 is the shift for the exponent field
343 ; and 1023 is the exponent bias. Since the format uses an implied 1 in the mantissa,
344 ; we only have to set the exponent to get a valid number.
345 ;
346.s_ar64NearZero:
347;; @todo check how sensible these really are...
348 dq (-18 + 1023) << 52 ; float / 32-bit / single precision input
349 dq (-40 + 1023) << 52 ; double / 64-bit / double precision input
350 dq (-52 + 1023) << 52 ; long double / 80-bit / extended precision input
351ENDPROC rtNoCrtMathSinCore
352
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette