sincore.asm

Last change on this file was 106061, checked in by vboxsync, 8 weeks ago
Copyright year updates by scm.
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 11.2 KB

Line
1	; $Id: sincore.asm 106061 2024-09-16 14:03:52Z vboxsync $
2	;; @file
3	; IPRT - No-CRT common sin & cos - AMD64 & X86.
4	;
5
6	;
7	; Copyright (C) 2022-2024 Oracle and/or its affiliates.
8	;
9	; This file is part of VirtualBox base platform packages, as
10	; available from https://www.virtualbox.org.
11	;
12	; This program is free software; you can redistribute it and/or
13	; modify it under the terms of the GNU General Public License
14	; as published by the Free Software Foundation, in version 3 of the
15	; License.
16	;
17	; This program is distributed in the hope that it will be useful, but
18	; WITHOUT ANY WARRANTY; without even the implied warranty of
19	; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20	; General Public License for more details.
21	;
22	; You should have received a copy of the GNU General Public License
23	; along with this program; if not, see <https://www.gnu.org/licenses>.
24	;
25	; The contents of this file may alternatively be used under the terms
26	; of the Common Development and Distribution License Version 1.0
27	; (CDDL), a copy of it is provided in the "COPYING.CDDL" file included
28	; in the VirtualBox distribution, in which case the provisions of the
29	; CDDL are applicable instead of those of the GPL.
30	;
31	; You may elect to license modified versions of this file under the
32	; terms and conditions of either the GPL or the CDDL or both.
33	;
34	; SPDX-License-Identifier: GPL-3.0-only OR CDDL-1.0
35	;
36
37
38	%define RT_ASM_WITH_SEH64
39	%include "iprt/asmdefs.mac"
40	%include "iprt/x86.mac"
41
42
43	BEGINCODE
44
;;
; Internal sine and cosine worker that calculates the sine of st0, returning
; it in st0.
;
; When called by a sine function, fabs(st0) >= pi/2.
; When called by a cosine function, fabs(original input value) >= 3pi/8.
;
; That the input isn't a tiny number close to zero means that we can do a bit
; cruder rounding when operating close to a pi/2 boundary.  The value in the
; ecx register indicates the input precision and controls the crudeness of the
; rounding.
;
; Outline of the code below:
;   1. Input outside ]0,2pi] is reduced with FPREM1 against 2*pi and, if the
;      remainder is negative, shifted up by 2*pi (.reduce_st0).
;   2. The value is classified into one of four pi/2 wide quadrants and moved
;      into ]0,pi/2[ or ]-pi/2,0[ before FSIN is executed; the result sign is
;      flipped with FCHS where the transformation requires it.
;   3. When the value is within the selected .s_ar64NearZero threshold of a
;      pi/2 multiple, the exact result (0, +1 or -1) is loaded instead of
;      executing FSIN.
;
; @returns  st0 = sine
; @param    st0     A finite number to calculate sine of.
; @param    ecx     Set to 0 if original input was a 32-bit float.
;                   Set to 1 if original input was a 64-bit double.
;                   Set to 2 if original input was a 80-bit long double.
;                   NOTE(review): the full xCX register is used as table index,
;                   so on AMD64 the upper half of rcx is presumably expected to
;                   be zero - confirm against the callers.
; @uses     ax (x87 status word), xDX (threshold pointer), eflags, and one
;           x87 stack register in addition to the input.
;
BEGINPROC   rtNoCrtMathSinCore
        push    xBP
        SEH64_PUSH_xBP
        mov     xBP, xSP
        SEH64_SET_FRAME_xBP 0
        SEH64_END_PROLOGUE

        ;
        ; Load the pointer to the rounding crudeness factor into xDX.
        ;
        ; The table entries are 64-bit doubles (dq), so the index must be
        ; scaled by the element size (8) on every target.  Scaling by the
        ; native register size (xCB) is only correct when that happens to be
        ; 8; with a 4 byte xCB (32-bit x86) indexes 1 and 2 would select the
        ; wrong data.
        ;
        lea     xDX, [.s_ar64NearZero xWrtRIP]
        lea     xDX, [xDX + xCX * 8]

        ;
        ; Finite number.  We want it in the range ]0,2pi] and will perform
        ; a remainder division if it isn't.
        ;
        fcom    qword [.s_r64Max xWrtRIP]       ; compares st0 and 2*pi
        fnstsw  ax
        test    ax, X86_FSW_C3 | X86_FSW_C0 | X86_FSW_C2 ; C3 := st0 == mem; C0 := st0 < mem; C2 := unordered (should not be the case for finite input);
        jz      .reduce_st0                     ; Jump if st0 > mem

        fcom    qword [.s_r64Min xWrtRIP]       ; compares st0 and 0.0
        fnstsw  ax
        test    ax, X86_FSW_C3 | X86_FSW_C0
        jnz     .reduce_st0                     ; Jump if st0 <= mem

        ;
        ; We get here if st0 is in the ]0,2pi] range (the reduction code also
        ; jumps here, in which case the value may be zero too).
        ;
        ; Now, FSIN is documented to be reasonably accurate for the range
        ; -3pi/4 to +3pi/4, so we have to make some more effort to calculate
        ; in that range only.
        ;
.in_range:
        ; if (st0 < pi)
        fldpi
        fcom    st1                             ; compares st0 (pi) with st1 (the normalized value)
        fnstsw  ax
        test    ax, X86_FSW_C0                  ; st1 > pi
        jnz     .larger_than_pi
        test    ax, X86_FSW_C3                  ; st1 == pi
        jnz     .equals_pi

        ;
        ; input in the range [0,pi[
        ;
.smaller_than_pi:
        fdiv    qword [.s_r64Two xWrtRIP]       ; st0 = pi/2

        ; if (st0 < pi/2)
        fcom    st1                             ; compares st0 (pi/2) with st1
        fnstsw  ax
        test    ax, X86_FSW_C0                  ; st1 > pi/2
        jnz     .between_half_pi_and_pi
        test    ax, X86_FSW_C3                  ; st1 == pi/2
        jnz     .equals_half_pi

        ;
        ; The value is between zero and half pi, including the zero value.
        ;
        ; This is in range where FSIN works reasonably reliably. So drop the
        ; half pi in st0 and do the calculation.
        ;
.between_zero_and_half_pi:
        ; Check if we're so close to pi/2 that it makes no difference.
        fsub    st0, st1                        ; st0 = pi/2 - st1
        fcom    qword [xDX]
        fnstsw  ax
        test    ax, X86_FSW_C0 | X86_FSW_C3     ; st0 <= very small positive number.
        jnz     .equals_half_pi
        ffreep  st0                             ; drop the difference; st0 = input value again

        ; Check if we're so close to zero that it makes no difference given the
        ; internal accuracy of the FPU.
        fcom    qword [xDX]
        fnstsw  ax
        test    ax, X86_FSW_C0 | X86_FSW_C3     ; st0 <= very small positive number.
        jnz     .equals_zero_popped_one

        ; Ok, calculate sine.
        fsin
        jmp     .return

        ;
        ; The value is in the range ]pi/2,pi[
        ;
        ; This is outside the comfortable FSIN range, but if we subtract PI and
        ; move to the ]-pi/2,0[ range we just have to change the sign to get
        ; the value we want.
        ;
.between_half_pi_and_pi:
        ; Check if we're so close to pi/2 that it makes no difference.
        fsubr   st0, st1                        ; st0 = st1 - st0
        fcom    qword [xDX]
        fnstsw  ax
        test    ax, X86_FSW_C0 | X86_FSW_C3     ; st0 <= very small positive number.
        jnz     .equals_half_pi
        ffreep  st0                             ; drop the difference; st0 = input value again

        ; Check if we're so close to pi that it makes no difference.
        fldpi
        fsub    st0, st1                        ; st0 = st0 - st1
        fcom    qword [xDX]
        fnstsw  ax
        test    ax, X86_FSW_C0 | X86_FSW_C3     ; st0 <= very small positive number.
        jnz     .equals_pi
        ffreep  st0                             ; drop the difference; st0 = input value again

        ; Ok, transform the value and calculate sine.
        fldpi
        fsubp   st1, st0                        ; st0 = value - pi (after the pop)

        fsin
        fchs                                    ; sin(x) = -sin(x - pi)
        jmp     .return

        ;
        ; input in the range ]pi,2pi[
        ;
.larger_than_pi:
        fsub    st1, st0                        ; st1 -= pi
        fdiv    qword [.s_r64Two xWrtRIP]       ; st0 = pi/2

        ; if (st0 < pi/2)
        fcom    st1                             ; compares st0 (pi/2) with reduced st1
        fnstsw  ax
        test    ax, X86_FSW_C0                  ; st1 > pi/2
        jnz     .between_3_half_pi_and_2pi
        test    ax, X86_FSW_C3                  ; st1 == pi/2
        jnz     .equals_3_half_pi

        ;
        ; The value is in the range: ]pi,3pi/2[
        ;
        ; The actual st1 (pi was subtracted above) is in the range ]0,pi/2[
        ; where FSIN is performing okay and we can get the desired result by
        ; changing the sign (-FSIN).
        ;
.between_pi_and_3_half_pi:
        ; Check if we're so close to pi/2 that it makes no difference.
        fsub    st0, st1                        ; st0 = pi/2 - st1
        fcom    qword [xDX]
        fnstsw  ax
        test    ax, X86_FSW_C0 | X86_FSW_C3     ; st0 <= very small positive number.
        jnz     .equals_3_half_pi
        ffreep  st0                             ; drop the difference; st0 = value - pi

        ; Check if we're so close to zero that it makes no difference given the
        ; internal accuracy of the FPU.
        fcom    qword [xDX]
        fnstsw  ax
        test    ax, X86_FSW_C0 | X86_FSW_C3     ; st0 <= very small positive number.
        jnz     .equals_pi_popped

        ; Ok, calculate sine and flip the sign.
        fsin
        fchs
        jmp     .return

        ;
        ; The value is in the last pi/2 of the range: ]3pi/2,2pi[
        ;
        ; Since FSIN should work reasonably well for ]-pi/2,pi], we can just
        ; subtract pi again (we subtracted pi at .larger_than_pi above) and
        ; run FSIN on it.  (st1 is currently in the range ]pi/2,pi[.)
        ;
.between_3_half_pi_and_2pi:
        ; Check if we're so close to pi/2 that it makes no difference.
        fsubr   st0, st1                        ; st0 = st1 - st0
        fcom    qword [xDX]
        fnstsw  ax
        test    ax, X86_FSW_C0 | X86_FSW_C3     ; st0 <= very small positive number.
        jnz     .equals_3_half_pi
        ffreep  st0                             ; drop the difference; st0 = value - pi

        ; Check if we're so close to pi that it makes no difference.
        fldpi
        fsub    st0, st1                        ; st0 = st0 - st1
        fcom    qword [xDX]
        fnstsw  ax
        test    ax, X86_FSW_C0 | X86_FSW_C3     ; st0 <= very small positive number.
        jnz     .equals_2pi
        ffreep  st0                             ; drop the difference; st0 = value - pi

        ; Ok, adjust input and calculate sine.
        fldpi
        fsubp   st1, st0                        ; st0 = value - 2pi (after the pop)
        fsin
        jmp     .return

        ;
        ; sin(0)    = 0
        ; sin(pi)   = 0
        ; sin(2pi)  = 0
        ;
        ; The first group of labels is entered with two values on the FPU
        ; stack, the second group with only one.
        ;
.equals_zero:
.equals_pi:
.equals_2pi:
        ffreep  st0
.equals_zero_popped_one:
.equals_pi_popped:
        ffreep  st0
        fldz
        jmp     .return

        ;
        ; sin(pi/2) = 1
        ;
.equals_half_pi:
        ffreep  st0
        ffreep  st0
        fld1
        jmp     .return

        ;
        ; sin(3*pi/2) = -1
        ;
.equals_3_half_pi:
        ffreep  st0
        ffreep  st0
        fld1
        fchs
        jmp     .return

        ;
        ; Return.
        ;
.return:
        leave
        ret

        ;
        ; Reduce st0 by remainder division by PI*2.  The result should be
        ; positive once we are done here.
        ;
        ;; @todo this is one of our weak spots (really any calculation involving PI is).
.reduce_st0:
        fldpi
        fadd    st0, st0                        ; st0 = 2pi
        fxch    st1                             ; st0=input (dividend) st1=2pi (divisor)
.again:
        fprem1
        fnstsw  ax
        test    ah, (X86_FSW_C2 >> 8)           ; C2 is set if partial result.
        jnz     .again                          ; Loop till C2 == 0 and we have a final result.

        ;
        ; Make sure the result is positive.
        ;
        fxam
        fnstsw  ax
        test    ax, X86_FSW_C1                  ; The sign bit
        jz      .reduced_to_positive

        fadd    st0, st1                        ; st0 += 2pi, which should make it positive

%ifdef RT_STRICT
        ; Strict builds: trap if the value is still negative after the adjustment.
        fxam
        fnstsw  ax
        test    ax, X86_FSW_C1
        jz      .reduced_to_positive
        int3
%endif

.reduced_to_positive:
        fstp    st1                             ; Get rid of the 2pi value.
        jmp     .in_range

ALIGNCODE(8)
.s_r64Max:
        dq      +6.28318530717958647692         ; 2*pi
.s_r64Min:
        dq      0.0
.s_r64Two:
        dq      2.0
        ;;
        ; Close to pi/2 rounding limits for 32-bit, 64-bit and 80-bit floating point operations.
        ; Given that the original input is at least +/-3pi/8 (1.178) and that precision of the
        ; PI constant used during reduction/whatever, I think we can round to a whole pi/2
        ; step when we get close enough.
        ;
        ; Look to RTFLOAT64U for the format details, but 52 is the shift for the exponent field
        ; and 1023 is the exponent bias.  Since the format uses an implied 1 in the mantissa,
        ; we only have to set the exponent to get a valid number.
        ;
        ; Each entry is 8 bytes wide and is indexed by the precision selector passed in ecx.
        ;
.s_ar64NearZero:
        ;; @todo check how sensible these really are...
        dq      (-18 + 1023) << 52              ; float / 32-bit / single precision input
        dq      (-40 + 1023) << 52              ; double / 64-bit / double precision input
        dq      (-52 + 1023) << 52              ; long double / 80-bit / extended precision input
ENDPROC     rtNoCrtMathSinCore
352

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/Runtime/common/math/sincore.asm

Download in other formats: