softfloat.c@ 69879

Last change on this file since 69879 was 69465, checked in by vboxsync, 7 years ago
recompiler: scm updates
Property svn:eol-style set to `native` Property svn:keywords set to `Id Revision`
File size: 204.6 KB

Line
1
2	/*============================================================================
3
4	This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5	Package, Release 2b.
6
7	Written by John R. Hauser. This work was made possible in part by the
8	International Computer Science Institute, located at Suite 600, 1947 Center
9	Street, Berkeley, California 94704. Funding was partially provided by the
10	National Science Foundation under grant MIP-9311980. The original version
11	of this code was written as part of a project to build a fixed-point vector
12	processor in collaboration with the University of California at Berkeley,
13	overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14	is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15	arithmetic/SoftFloat.html'.
16
17	THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
18	been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19	RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20	AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21	COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22	EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23	INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24	OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
25
26	Derivative works are acceptable, even for commercial purposes, so long as
27	(1) the source code for the derivative work includes prominent notice that
28	the work is derivative, and (2) the source code includes prominent notice with
29	these four paragraphs for those parts of this code that are retained.
30
31	=============================================================================*/
32
/* FIXME: Flush-To-Zero only affects results. Denormal inputs should also
   be flushed to zero. */
35	#include "softfloat.h"
36
/*----------------------------------------------------------------------------
| Primitive arithmetic functions, including multi-word arithmetic, and
| division and square root approximations. (Can be specialized to target if
| desired.)
*----------------------------------------------------------------------------*/
42	#include "softfloat-macros.h"
43
/*----------------------------------------------------------------------------
| Functions and definitions to determine: (1) whether tininess for underflow
| is detected before or after rounding by default, (2) what (if anything)
| happens when exceptions are raised, (3) how signaling NaNs are distinguished
| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
| are propagated from function inputs to output. These details are target-
| specific.
*----------------------------------------------------------------------------*/
52	#include "softfloat-specialize.h"
53
54	void set_float_rounding_mode(int val STATUS_PARAM)
55	{
56	STATUS(float_rounding_mode) = val;
57	}
58
59	void set_float_exception_flags(int val STATUS_PARAM)
60	{
61	STATUS(float_exception_flags) = val;
62	}
63
#ifdef FLOATX80
/*----------------------------------------------------------------------------
| Stores `val' into the `floatx80_rounding_precision' field of the floating-
| point status selected by the STATUS() macro. Only compiled when FLOATX80
| is defined.
*----------------------------------------------------------------------------*/

void set_floatx80_rounding_precision( int val STATUS_PARAM )
{

    STATUS( floatx80_rounding_precision ) = val;

}
#endif
70
71	/*----------------------------------------------------------------------------
72	\| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
73	\| and 7, and returns the properly rounded 32-bit integer corresponding to the
74	\| input. If `zSign' is 1, the input is negated before being converted to an
75	\| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
76	\| is simply rounded to an integer, with the inexact exception raised if the
77	\| input cannot be represented exactly as an integer. However, if the fixed-
78	\| point input is too large, the invalid exception is raised and the largest
79	\| positive or negative integer is returned.
80	----------------------------------------------------------------------------/
81
82	static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
83	{
84	int8 roundingMode;
85	flag roundNearestEven;
86	int8 roundIncrement, roundBits;
87	int32 z;
88
89	roundingMode = STATUS(float_rounding_mode);
90	roundNearestEven = ( roundingMode == float_round_nearest_even );
91	roundIncrement = 0x40;
92	if ( ! roundNearestEven ) {
93	if ( roundingMode == float_round_to_zero ) {
94	roundIncrement = 0;
95	}
96	else {
97	roundIncrement = 0x7F;
98	if ( zSign ) {
99	if ( roundingMode == float_round_up ) roundIncrement = 0;
100	}
101	else {
102	if ( roundingMode == float_round_down ) roundIncrement = 0;
103	}
104	}
105	}
106	roundBits = absZ & 0x7F;
107	absZ = ( absZ + roundIncrement )>>7;
108	absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
109	z = absZ;
110	if ( zSign ) z = - z;
111	if ( ( absZ>>32 ) \|\| ( z && ( ( z < 0 ) ^ zSign ) ) ) {
112	float_raise( float_flag_invalid STATUS_VAR);
113	return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
114	}
115	if ( roundBits ) STATUS(float_exception_flags) \|= float_flag_inexact;
116	return z;
117
118	}
119
120	/*----------------------------------------------------------------------------
121	\| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
122	\| `absZ1', with binary point between bits 63 and 64 (between the input words),
123	\| and returns the properly rounded 64-bit integer corresponding to the input.
124	\| If `zSign' is 1, the input is negated before being converted to an integer.
125	\| Ordinarily, the fixed-point input is simply rounded to an integer, with
126	\| the inexact exception raised if the input cannot be represented exactly as
127	\| an integer. However, if the fixed-point input is too large, the invalid
128	\| exception is raised and the largest positive or negative integer is
129	\| returned.
130	----------------------------------------------------------------------------/
131
132	static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
133	{
134	int8 roundingMode;
135	flag roundNearestEven, increment;
136	int64 z;
137
138	roundingMode = STATUS(float_rounding_mode);
139	roundNearestEven = ( roundingMode == float_round_nearest_even );
140	increment = ( (sbits64) absZ1 < 0 );
141	if ( ! roundNearestEven ) {
142	if ( roundingMode == float_round_to_zero ) {
143	increment = 0;
144	}
145	else {
146	if ( zSign ) {
147	increment = ( roundingMode == float_round_down ) && absZ1;
148	}
149	else {
150	increment = ( roundingMode == float_round_up ) && absZ1;
151	}
152	}
153	}
154	if ( increment ) {
155	++absZ0;
156	if ( absZ0 == 0 ) goto overflow;
157	absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
158	}
159	z = absZ0;
160	if ( zSign ) z = - z;
161	if ( z && ( ( z < 0 ) ^ zSign ) ) {
162	overflow:
163	float_raise( float_flag_invalid STATUS_VAR);
164	return
165	zSign ? (sbits64) LIT64( 0x8000000000000000 )
166	: LIT64( 0x7FFFFFFFFFFFFFFF );
167	}
168	if ( absZ1 ) STATUS(float_exception_flags) \|= float_flag_inexact;
169	return z;
170
171	}
172
173	/*----------------------------------------------------------------------------
174	\| Returns the fraction bits of the single-precision floating-point value `a'.
175	----------------------------------------------------------------------------/
176
177	INLINE bits32 extractFloat32Frac( float32 a )
178	{
179
180	return float32_val(a) & 0x007FFFFF;
181
182	}
183
184	/*----------------------------------------------------------------------------
185	\| Returns the exponent bits of the single-precision floating-point value `a'.
186	----------------------------------------------------------------------------/
187
188	INLINE int16 extractFloat32Exp( float32 a )
189	{
190
191	return ( float32_val(a)>>23 ) & 0xFF;
192
193	}
194
195	/*----------------------------------------------------------------------------
196	\| Returns the sign bit of the single-precision floating-point value `a'.
197	----------------------------------------------------------------------------/
198
199	INLINE flag extractFloat32Sign( float32 a )
200	{
201
202	return float32_val(a)>>31;
203
204	}
205
206	/*----------------------------------------------------------------------------
207	\| Normalizes the subnormal single-precision floating-point value represented
208	\| by the denormalized significand `aSig'. The normalized exponent and
209	\| significand are stored at the locations pointed to by `zExpPtr' and
210	\| `zSigPtr', respectively.
211	----------------------------------------------------------------------------/
212
213	static void
214	normalizeFloat32Subnormal( bits32 aSig, int16 zExpPtr, bits32 zSigPtr )
215	{
216	int8 shiftCount;
217
218	shiftCount = countLeadingZeros32( aSig ) - 8;
219	*zSigPtr = aSig<<shiftCount;
220	*zExpPtr = 1 - shiftCount;
221
222	}
223
224	/*----------------------------------------------------------------------------
225	\| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
226	\| single-precision floating-point value, returning the result. After being
227	\| shifted into the proper positions, the three fields are simply added
228	\| together to form the result. This means that any integer portion of `zSig'
229	\| will be added into the exponent. Since a properly normalized significand
230	\| will have an integer portion equal to 1, the `zExp' input should be 1 less
231	\| than the desired result exponent whenever `zSig' is a complete, normalized
232	\| significand.
233	----------------------------------------------------------------------------/
234
235	INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
236	{
237
238	return make_float32(
239	( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
240
241	}
242
243	/*----------------------------------------------------------------------------
244	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
245	\| and significand `zSig', and returns the proper single-precision floating-
246	\| point value corresponding to the abstract input. Ordinarily, the abstract
247	\| value is simply rounded and packed into the single-precision format, with
248	\| the inexact exception raised if the abstract input cannot be represented
249	\| exactly. However, if the abstract value is too large, the overflow and
250	\| inexact exceptions are raised and an infinity or maximal finite value is
251	\| returned. If the abstract value is too small, the input value is rounded to
252	\| a subnormal number, and the underflow and inexact exceptions are raised if
253	\| the abstract input cannot be represented exactly as a subnormal single-
254	\| precision floating-point number.
255	\| The input significand `zSig' has its binary point between bits 30
256	\| and 29, which is 7 bits to the left of the usual location. This shifted
257	\| significand must be normalized or smaller. If `zSig' is not normalized,
258	\| `zExp' must be 0; in that case, the result returned is a subnormal number,
259	\| and it must not require rounding. In the usual case that `zSig' is
260	\| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
261	\| The handling of underflow and overflow follows the IEC/IEEE Standard for
262	\| Binary Floating-Point Arithmetic.
263	----------------------------------------------------------------------------/
264
265	static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
266	{
267	int8 roundingMode;
268	flag roundNearestEven;
269	int8 roundIncrement, roundBits;
270	flag isTiny;
271
272	roundingMode = STATUS(float_rounding_mode);
273	roundNearestEven = ( roundingMode == float_round_nearest_even );
274	roundIncrement = 0x40;
275	if ( ! roundNearestEven ) {
276	if ( roundingMode == float_round_to_zero ) {
277	roundIncrement = 0;
278	}
279	else {
280	roundIncrement = 0x7F;
281	if ( zSign ) {
282	if ( roundingMode == float_round_up ) roundIncrement = 0;
283	}
284	else {
285	if ( roundingMode == float_round_down ) roundIncrement = 0;
286	}
287	}
288	}
289	roundBits = zSig & 0x7F;
290	if ( 0xFD <= (bits16) zExp ) {
291	if ( ( 0xFD < zExp )
292	\|\| ( ( zExp == 0xFD )
293	&& ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
294	) {
295	float_raise( float_flag_overflow \| float_flag_inexact STATUS_VAR);
296	return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
297	}
298	if ( zExp < 0 ) {
299	if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
300	isTiny =
301	( STATUS(float_detect_tininess) == float_tininess_before_rounding )
302	\|\| ( zExp < -1 )
303	\|\| ( zSig + roundIncrement < 0x80000000 );
304	shift32RightJamming( zSig, - zExp, &zSig );
305	zExp = 0;
306	roundBits = zSig & 0x7F;
307	if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
308	}
309	}
310	if ( roundBits ) STATUS(float_exception_flags) \|= float_flag_inexact;
311	zSig = ( zSig + roundIncrement )>>7;
312	zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
313	if ( zSig == 0 ) zExp = 0;
314	return packFloat32( zSign, zExp, zSig );
315
316	}
317
318	/*----------------------------------------------------------------------------
319	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
320	\| and significand `zSig', and returns the proper single-precision floating-
321	\| point value corresponding to the abstract input. This routine is just like
322	\| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
323	\| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
324	\| floating-point exponent.
325	----------------------------------------------------------------------------/
326
327	static float32
328	normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
329	{
330	int8 shiftCount;
331
332	shiftCount = countLeadingZeros32( zSig ) - 1;
333	return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
334
335	}
336
337	/*----------------------------------------------------------------------------
338	\| Returns the fraction bits of the double-precision floating-point value `a'.
339	----------------------------------------------------------------------------/
340
341	INLINE bits64 extractFloat64Frac( float64 a )
342	{
343
344	return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
345
346	}
347
348	/*----------------------------------------------------------------------------
349	\| Returns the exponent bits of the double-precision floating-point value `a'.
350	----------------------------------------------------------------------------/
351
352	INLINE int16 extractFloat64Exp( float64 a )
353	{
354
355	return ( float64_val(a)>>52 ) & 0x7FF;
356
357	}
358
359	/*----------------------------------------------------------------------------
360	\| Returns the sign bit of the double-precision floating-point value `a'.
361	----------------------------------------------------------------------------/
362
363	INLINE flag extractFloat64Sign( float64 a )
364	{
365
366	return float64_val(a)>>63;
367
368	}
369
370	/*----------------------------------------------------------------------------
371	\| Normalizes the subnormal double-precision floating-point value represented
372	\| by the denormalized significand `aSig'. The normalized exponent and
373	\| significand are stored at the locations pointed to by `zExpPtr' and
374	\| `zSigPtr', respectively.
375	----------------------------------------------------------------------------/
376
377	static void
378	normalizeFloat64Subnormal( bits64 aSig, int16 zExpPtr, bits64 zSigPtr )
379	{
380	int8 shiftCount;
381
382	shiftCount = countLeadingZeros64( aSig ) - 11;
383	*zSigPtr = aSig<<shiftCount;
384	*zExpPtr = 1 - shiftCount;
385
386	}
387
388	/*----------------------------------------------------------------------------
389	\| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
390	\| double-precision floating-point value, returning the result. After being
391	\| shifted into the proper positions, the three fields are simply added
392	\| together to form the result. This means that any integer portion of `zSig'
393	\| will be added into the exponent. Since a properly normalized significand
394	\| will have an integer portion equal to 1, the `zExp' input should be 1 less
395	\| than the desired result exponent whenever `zSig' is a complete, normalized
396	\| significand.
397	----------------------------------------------------------------------------/
398
399	INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
400	{
401
402	return make_float64(
403	( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
404
405	}
406
407	/*----------------------------------------------------------------------------
408	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
409	\| and significand `zSig', and returns the proper double-precision floating-
410	\| point value corresponding to the abstract input. Ordinarily, the abstract
411	\| value is simply rounded and packed into the double-precision format, with
412	\| the inexact exception raised if the abstract input cannot be represented
413	\| exactly. However, if the abstract value is too large, the overflow and
414	\| inexact exceptions are raised and an infinity or maximal finite value is
415	\| returned. If the abstract value is too small, the input value is rounded
416	\| to a subnormal number, and the underflow and inexact exceptions are raised
417	\| if the abstract input cannot be represented exactly as a subnormal double-
418	\| precision floating-point number.
419	\| The input significand `zSig' has its binary point between bits 62
420	\| and 61, which is 10 bits to the left of the usual location. This shifted
421	\| significand must be normalized or smaller. If `zSig' is not normalized,
422	\| `zExp' must be 0; in that case, the result returned is a subnormal number,
423	\| and it must not require rounding. In the usual case that `zSig' is
424	\| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
425	\| The handling of underflow and overflow follows the IEC/IEEE Standard for
426	\| Binary Floating-Point Arithmetic.
427	----------------------------------------------------------------------------/
428
429	static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
430	{
431	int8 roundingMode;
432	flag roundNearestEven;
433	int16 roundIncrement, roundBits;
434	flag isTiny;
435
436	roundingMode = STATUS(float_rounding_mode);
437	roundNearestEven = ( roundingMode == float_round_nearest_even );
438	roundIncrement = 0x200;
439	if ( ! roundNearestEven ) {
440	if ( roundingMode == float_round_to_zero ) {
441	roundIncrement = 0;
442	}
443	else {
444	roundIncrement = 0x3FF;
445	if ( zSign ) {
446	if ( roundingMode == float_round_up ) roundIncrement = 0;
447	}
448	else {
449	if ( roundingMode == float_round_down ) roundIncrement = 0;
450	}
451	}
452	}
453	roundBits = zSig & 0x3FF;
454	if ( 0x7FD <= (bits16) zExp ) {
455	if ( ( 0x7FD < zExp )
456	\|\| ( ( zExp == 0x7FD )
457	&& ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
458	) {
459	float_raise( float_flag_overflow \| float_flag_inexact STATUS_VAR);
460	return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
461	}
462	if ( zExp < 0 ) {
463	if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
464	isTiny =
465	( STATUS(float_detect_tininess) == float_tininess_before_rounding )
466	\|\| ( zExp < -1 )
467	\|\| ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
468	shift64RightJamming( zSig, - zExp, &zSig );
469	zExp = 0;
470	roundBits = zSig & 0x3FF;
471	if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
472	}
473	}
474	if ( roundBits ) STATUS(float_exception_flags) \|= float_flag_inexact;
475	zSig = ( zSig + roundIncrement )>>10;
476	zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
477	if ( zSig == 0 ) zExp = 0;
478	return packFloat64( zSign, zExp, zSig );
479
480	}
481
482	/*----------------------------------------------------------------------------
483	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
484	\| and significand `zSig', and returns the proper double-precision floating-
485	\| point value corresponding to the abstract input. This routine is just like
486	\| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
487	\| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
488	\| floating-point exponent.
489	----------------------------------------------------------------------------/
490
491	static float64
492	normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
493	{
494	int8 shiftCount;
495
496	shiftCount = countLeadingZeros64( zSig ) - 1;
497	return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
498
499	}
500
501	#ifdef FLOATX80
502
503	/*----------------------------------------------------------------------------
504	\| Returns the fraction bits of the extended double-precision floating-point
505	\| value `a'.
506	----------------------------------------------------------------------------/
507
508	INLINE bits64 extractFloatx80Frac( floatx80 a )
509	{
510
511	return a.low;
512
513	}
514
515	/*----------------------------------------------------------------------------
516	\| Returns the exponent bits of the extended double-precision floating-point
517	\| value `a'.
518	----------------------------------------------------------------------------/
519
520	INLINE int32 extractFloatx80Exp( floatx80 a )
521	{
522
523	return a.high & 0x7FFF;
524
525	}
526
527	/*----------------------------------------------------------------------------
528	\| Returns the sign bit of the extended double-precision floating-point value
529	\| `a'.
530	----------------------------------------------------------------------------/
531
532	INLINE flag extractFloatx80Sign( floatx80 a )
533	{
534
535	return a.high>>15;
536
537	}
538
539	/*----------------------------------------------------------------------------
540	\| Normalizes the subnormal extended double-precision floating-point value
541	\| represented by the denormalized significand `aSig'. The normalized exponent
542	\| and significand are stored at the locations pointed to by `zExpPtr' and
543	\| `zSigPtr', respectively.
544	----------------------------------------------------------------------------/
545
546	static void
547	normalizeFloatx80Subnormal( bits64 aSig, int32 zExpPtr, bits64 zSigPtr )
548	{
549	int8 shiftCount;
550
551	shiftCount = countLeadingZeros64( aSig );
552	*zSigPtr = aSig<<shiftCount;
553	*zExpPtr = 1 - shiftCount;
554
555	}
556
557	/*----------------------------------------------------------------------------
558	\| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
559	\| extended double-precision floating-point value, returning the result.
560	----------------------------------------------------------------------------/
561
562	INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
563	{
564	floatx80 z;
565
566	z.low = zSig;
567	z.high = ( ( (bits16) zSign )<<15 ) + zExp;
568	return z;
569
570	}
571
572	/*----------------------------------------------------------------------------
573	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
574	\| and extended significand formed by the concatenation of `zSig0' and `zSig1',
575	\| and returns the proper extended double-precision floating-point value
576	\| corresponding to the abstract input. Ordinarily, the abstract value is
577	\| rounded and packed into the extended double-precision format, with the
578	\| inexact exception raised if the abstract input cannot be represented
579	\| exactly. However, if the abstract value is too large, the overflow and
580	\| inexact exceptions are raised and an infinity or maximal finite value is
581	\| returned. If the abstract value is too small, the input value is rounded to
582	\| a subnormal number, and the underflow and inexact exceptions are raised if
583	\| the abstract input cannot be represented exactly as a subnormal extended
584	\| double-precision floating-point number.
585	\| If `roundingPrecision' is 32 or 64, the result is rounded to the same
586	\| number of bits as single or double precision, respectively. Otherwise, the
587	\| result is rounded to the full precision of the extended double-precision
588	\| format.
589	\| The input significand must be normalized or smaller. If the input
590	\| significand is not normalized, `zExp' must be 0; in that case, the result
591	\| returned is a subnormal number, and it must not require rounding. The
592	\| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
593	\| Floating-Point Arithmetic.
594	----------------------------------------------------------------------------/
595
596	static floatx80
597	roundAndPackFloatx80(
598	int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
599	STATUS_PARAM)
600	{
601	int8 roundingMode;
602	flag roundNearestEven, increment, isTiny;
603	int64 roundIncrement, roundMask, roundBits;
604
605	roundingMode = STATUS(float_rounding_mode);
606	roundNearestEven = ( roundingMode == float_round_nearest_even );
607	if ( roundingPrecision == 80 ) goto precision80;
608	if ( roundingPrecision == 64 ) {
609	roundIncrement = LIT64( 0x0000000000000400 );
610	roundMask = LIT64( 0x00000000000007FF );
611	}
612	else if ( roundingPrecision == 32 ) {
613	roundIncrement = LIT64( 0x0000008000000000 );
614	roundMask = LIT64( 0x000000FFFFFFFFFF );
615	}
616	else {
617	goto precision80;
618	}
619	zSig0 \|= ( zSig1 != 0 );
620	if ( ! roundNearestEven ) {
621	if ( roundingMode == float_round_to_zero ) {
622	roundIncrement = 0;
623	}
624	else {
625	roundIncrement = roundMask;
626	if ( zSign ) {
627	if ( roundingMode == float_round_up ) roundIncrement = 0;
628	}
629	else {
630	if ( roundingMode == float_round_down ) roundIncrement = 0;
631	}
632	}
633	}
634	roundBits = zSig0 & roundMask;
635	if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
636	if ( ( 0x7FFE < zExp )
637	\|\| ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
638	) {
639	goto overflow;
640	}
641	if ( zExp <= 0 ) {
642	if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
643	isTiny =
644	( STATUS(float_detect_tininess) == float_tininess_before_rounding )
645	\|\| ( zExp < 0 )
646	\|\| ( zSig0 <= zSig0 + roundIncrement );
647	shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
648	zExp = 0;
649	roundBits = zSig0 & roundMask;
650	if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
651	if ( roundBits ) STATUS(float_exception_flags) \|= float_flag_inexact;
652	zSig0 += roundIncrement;
653	if ( (sbits64) zSig0 < 0 ) zExp = 1;
654	roundIncrement = roundMask + 1;
655	if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
656	roundMask \|= roundIncrement;
657	}
658	zSig0 &= ~ roundMask;
659	return packFloatx80( zSign, zExp, zSig0 );
660	}
661	}
662	if ( roundBits ) STATUS(float_exception_flags) \|= float_flag_inexact;
663	zSig0 += roundIncrement;
664	if ( zSig0 < roundIncrement ) {
665	++zExp;
666	zSig0 = LIT64( 0x8000000000000000 );
667	}
668	roundIncrement = roundMask + 1;
669	if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
670	roundMask \|= roundIncrement;
671	}
672	zSig0 &= ~ roundMask;
673	if ( zSig0 == 0 ) zExp = 0;
674	return packFloatx80( zSign, zExp, zSig0 );
675	precision80:
676	increment = ( (sbits64) zSig1 < 0 );
677	if ( ! roundNearestEven ) {
678	if ( roundingMode == float_round_to_zero ) {
679	increment = 0;
680	}
681	else {
682	if ( zSign ) {
683	increment = ( roundingMode == float_round_down ) && zSig1;
684	}
685	else {
686	increment = ( roundingMode == float_round_up ) && zSig1;
687	}
688	}
689	}
690	if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
691	if ( ( 0x7FFE < zExp )
692	\|\| ( ( zExp == 0x7FFE )
693	&& ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
694	&& increment
695	)
696	) {
697	roundMask = 0;
698	overflow:
699	float_raise( float_flag_overflow \| float_flag_inexact STATUS_VAR);
700	if ( ( roundingMode == float_round_to_zero )
701	\|\| ( zSign && ( roundingMode == float_round_up ) )
702	\|\| ( ! zSign && ( roundingMode == float_round_down ) )
703	) {
704	return packFloatx80( zSign, 0x7FFE, ~ roundMask );
705	}
706	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
707	}
708	if ( zExp <= 0 ) {
709	isTiny =
710	( STATUS(float_detect_tininess) == float_tininess_before_rounding )
711	\|\| ( zExp < 0 )
712	\|\| ! increment
713	\|\| ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
714	shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
715	zExp = 0;
716	if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
717	if ( zSig1 ) STATUS(float_exception_flags) \|= float_flag_inexact;
718	if ( roundNearestEven ) {
719	increment = ( (sbits64) zSig1 < 0 );
720	}
721	else {
722	if ( zSign ) {
723	increment = ( roundingMode == float_round_down ) && zSig1;
724	}
725	else {
726	increment = ( roundingMode == float_round_up ) && zSig1;
727	}
728	}
729	if ( increment ) {
730	++zSig0;
731	zSig0 &=
732	~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
733	if ( (sbits64) zSig0 < 0 ) zExp = 1;
734	}
735	return packFloatx80( zSign, zExp, zSig0 );
736	}
737	}
738	if ( zSig1 ) STATUS(float_exception_flags) \|= float_flag_inexact;
739	if ( increment ) {
740	++zSig0;
741	if ( zSig0 == 0 ) {
742	++zExp;
743	zSig0 = LIT64( 0x8000000000000000 );
744	}
745	else {
746	zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
747	}
748	}
749	else {
750	if ( zSig0 == 0 ) zExp = 0;
751	}
752	return packFloatx80( zSign, zExp, zSig0 );
753
754	}
755
756	/*----------------------------------------------------------------------------
757	\| Takes an abstract floating-point value having sign `zSign', exponent
758	\| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
759	\| and returns the proper extended double-precision floating-point value
760	\| corresponding to the abstract input. This routine is just like
761	\| `roundAndPackFloatx80' except that the input significand does not have to be
762	\| normalized.
763	----------------------------------------------------------------------------/
764
765	static floatx80
766	normalizeRoundAndPackFloatx80(
767	int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
768	STATUS_PARAM)
769	{
770	int8 shiftCount;
771
772	if ( zSig0 == 0 ) {
773	zSig0 = zSig1;
774	zSig1 = 0;
775	zExp -= 64;
776	}
777	shiftCount = countLeadingZeros64( zSig0 );
778	shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
779	zExp -= shiftCount;
780	return
781	roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
782
783	}
784
785	#endif
786
787	#ifdef FLOAT128
788
789	/*----------------------------------------------------------------------------
790	\| Returns the least-significant 64 fraction bits of the quadruple-precision
791	\| floating-point value `a'.
792	----------------------------------------------------------------------------/
793
794	INLINE bits64 extractFloat128Frac1( float128 a )
795	{
796
797	return a.low;
798
799	}
800
801	/*----------------------------------------------------------------------------
802	\| Returns the most-significant 48 fraction bits of the quadruple-precision
803	\| floating-point value `a'.
804	----------------------------------------------------------------------------/
805
806	INLINE bits64 extractFloat128Frac0( float128 a )
807	{
808
809	return a.high & LIT64( 0x0000FFFFFFFFFFFF );
810
811	}
812
813	/*----------------------------------------------------------------------------
814	\| Returns the exponent bits of the quadruple-precision floating-point value
815	\| `a'.
816	----------------------------------------------------------------------------/
817
818	INLINE int32 extractFloat128Exp( float128 a )
819	{
820
821	return ( a.high>>48 ) & 0x7FFF;
822
823	}
824
825	/*----------------------------------------------------------------------------
826	\| Returns the sign bit of the quadruple-precision floating-point value `a'.
827	----------------------------------------------------------------------------/
828
829	INLINE flag extractFloat128Sign( float128 a )
830	{
831
832	return a.high>>63;
833
834	}
835
836	/*----------------------------------------------------------------------------
837	\| Normalizes the subnormal quadruple-precision floating-point value
838	\| represented by the denormalized significand formed by the concatenation of
839	\| `aSig0' and `aSig1'. The normalized exponent is stored at the location
840	\| pointed to by `zExpPtr'. The most significant 49 bits of the normalized
841	\| significand are stored at the location pointed to by `zSig0Ptr', and the
842	\| least significant 64 bits of the normalized significand are stored at the
843	\| location pointed to by `zSig1Ptr'.
844	----------------------------------------------------------------------------/
845
846	static void
847	normalizeFloat128Subnormal(
848	bits64 aSig0,
849	bits64 aSig1,
850	int32 *zExpPtr,
851	bits64 *zSig0Ptr,
852	bits64 *zSig1Ptr
853	)
854	{
855	int8 shiftCount;
856
857	if ( aSig0 == 0 ) {
858	shiftCount = countLeadingZeros64( aSig1 ) - 15;
859	if ( shiftCount < 0 ) {
860	*zSig0Ptr = aSig1>>( - shiftCount );
861	*zSig1Ptr = aSig1<<( shiftCount & 63 );
862	}
863	else {
864	*zSig0Ptr = aSig1<<shiftCount;
865	*zSig1Ptr = 0;
866	}
867	*zExpPtr = - shiftCount - 63;
868	}
869	else {
870	shiftCount = countLeadingZeros64( aSig0 ) - 15;
871	shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
872	*zExpPtr = 1 - shiftCount;
873	}
874
875	}
876
877	/*----------------------------------------------------------------------------
878	\| Packs the sign `zSign', the exponent `zExp', and the significand formed
879	\| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
880	\| floating-point value, returning the result. After being shifted into the
881	\| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
882	\| added together to form the most significant 32 bits of the result. This
883	\| means that any integer portion of `zSig0' will be added into the exponent.
884	\| Since a properly normalized significand will have an integer portion equal
885	\| to 1, the `zExp' input should be 1 less than the desired result exponent
886	\| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
887	\| significand.
888	----------------------------------------------------------------------------/
889
890	INLINE float128
891	packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
892	{
893	float128 z;
894
895	z.low = zSig1;
896	z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
897	return z;
898
899	}
900
901	/*----------------------------------------------------------------------------
902	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
903	\| and extended significand formed by the concatenation of `zSig0', `zSig1',
904	\| and `zSig2', and returns the proper quadruple-precision floating-point value
905	\| corresponding to the abstract input. Ordinarily, the abstract value is
906	\| simply rounded and packed into the quadruple-precision format, with the
907	\| inexact exception raised if the abstract input cannot be represented
908	\| exactly. However, if the abstract value is too large, the overflow and
909	\| inexact exceptions are raised and an infinity or maximal finite value is
910	\| returned. If the abstract value is too small, the input value is rounded to
911	\| a subnormal number, and the underflow and inexact exceptions are raised if
912	\| the abstract input cannot be represented exactly as a subnormal quadruple-
913	\| precision floating-point number.
914	\| The input significand must be normalized or smaller. If the input
915	\| significand is not normalized, `zExp' must be 0; in that case, the result
916	\| returned is a subnormal number, and it must not require rounding. In the
917	\| usual case that the input significand is normalized, `zExp' must be 1 less
918	\| than the ``true'' floating-point exponent. The handling of underflow and
919	\| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
920	----------------------------------------------------------------------------/
921
922	static float128
923	roundAndPackFloat128(
924	flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
925	{
926	int8 roundingMode;
927	flag roundNearestEven, increment, isTiny;
928
929	roundingMode = STATUS(float_rounding_mode);
930	roundNearestEven = ( roundingMode == float_round_nearest_even );
931	increment = ( (sbits64) zSig2 < 0 );
932	if ( ! roundNearestEven ) {
933	if ( roundingMode == float_round_to_zero ) {
934	increment = 0;
935	}
936	else {
937	if ( zSign ) {
938	increment = ( roundingMode == float_round_down ) && zSig2;
939	}
940	else {
941	increment = ( roundingMode == float_round_up ) && zSig2;
942	}
943	}
944	}
945	if ( 0x7FFD <= (bits32) zExp ) {
946	if ( ( 0x7FFD < zExp )
947	\|\| ( ( zExp == 0x7FFD )
948	&& eq128(
949	LIT64( 0x0001FFFFFFFFFFFF ),
950	LIT64( 0xFFFFFFFFFFFFFFFF ),
951	zSig0,
952	zSig1
953	)
954	&& increment
955	)
956	) {
957	float_raise( float_flag_overflow \| float_flag_inexact STATUS_VAR);
958	if ( ( roundingMode == float_round_to_zero )
959	\|\| ( zSign && ( roundingMode == float_round_up ) )
960	\|\| ( ! zSign && ( roundingMode == float_round_down ) )
961	) {
962	return
963	packFloat128(
964	zSign,
965	0x7FFE,
966	LIT64( 0x0000FFFFFFFFFFFF ),
967	LIT64( 0xFFFFFFFFFFFFFFFF )
968	);
969	}
970	return packFloat128( zSign, 0x7FFF, 0, 0 );
971	}
972	if ( zExp < 0 ) {
973	if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
974	isTiny =
975	( STATUS(float_detect_tininess) == float_tininess_before_rounding )
976	\|\| ( zExp < -1 )
977	\|\| ! increment
978	\|\| lt128(
979	zSig0,
980	zSig1,
981	LIT64( 0x0001FFFFFFFFFFFF ),
982	LIT64( 0xFFFFFFFFFFFFFFFF )
983	);
984	shift128ExtraRightJamming(
985	zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
986	zExp = 0;
987	if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
988	if ( roundNearestEven ) {
989	increment = ( (sbits64) zSig2 < 0 );
990	}
991	else {
992	if ( zSign ) {
993	increment = ( roundingMode == float_round_down ) && zSig2;
994	}
995	else {
996	increment = ( roundingMode == float_round_up ) && zSig2;
997	}
998	}
999	}
1000	}
1001	if ( zSig2 ) STATUS(float_exception_flags) \|= float_flag_inexact;
1002	if ( increment ) {
1003	add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1004	zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1005	}
1006	else {
1007	if ( ( zSig0 \| zSig1 ) == 0 ) zExp = 0;
1008	}
1009	return packFloat128( zSign, zExp, zSig0, zSig1 );
1010
1011	}
1012
1013	/*----------------------------------------------------------------------------
1014	\| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1015	\| and significand formed by the concatenation of `zSig0' and `zSig1', and
1016	\| returns the proper quadruple-precision floating-point value corresponding
1017	\| to the abstract input. This routine is just like `roundAndPackFloat128'
1018	\| except that the input significand has fewer bits and does not have to be
1019	\| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1020	\| point exponent.
1021	----------------------------------------------------------------------------/
1022
1023	static float128
1024	normalizeRoundAndPackFloat128(
1025	flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1026	{
1027	int8 shiftCount;
1028	bits64 zSig2;
1029
1030	if ( zSig0 == 0 ) {
1031	zSig0 = zSig1;
1032	zSig1 = 0;
1033	zExp -= 64;
1034	}
1035	shiftCount = countLeadingZeros64( zSig0 ) - 15;
1036	if ( 0 <= shiftCount ) {
1037	zSig2 = 0;
1038	shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1039	}
1040	else {
1041	shift128ExtraRightJamming(
1042	zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1043	}
1044	zExp -= shiftCount;
1045	return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1046
1047	}
1048
1049	#endif
1050
1051	/*----------------------------------------------------------------------------
1052	\| Returns the result of converting the 32-bit two's complement integer `a'
1053	\| to the single-precision floating-point format. The conversion is performed
1054	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1055	----------------------------------------------------------------------------/
1056
1057	float32 int32_to_float32( int32 a STATUS_PARAM )
1058	{
1059	flag zSign;
1060
1061	if ( a == 0 ) return float32_zero;
1062	if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1063	zSign = ( a < 0 );
1064	return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1065
1066	}
1067
1068	/*----------------------------------------------------------------------------
1069	\| Returns the result of converting the 32-bit two's complement integer `a'
1070	\| to the double-precision floating-point format. The conversion is performed
1071	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1072	----------------------------------------------------------------------------/
1073
1074	float64 int32_to_float64( int32 a STATUS_PARAM )
1075	{
1076	flag zSign;
1077	uint32 absA;
1078	int8 shiftCount;
1079	bits64 zSig;
1080
1081	if ( a == 0 ) return float64_zero;
1082	zSign = ( a < 0 );
1083	absA = zSign ? - a : a;
1084	shiftCount = countLeadingZeros32( absA ) + 21;
1085	zSig = absA;
1086	return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1087
1088	}
1089
1090	#ifdef FLOATX80
1091
1092	/*----------------------------------------------------------------------------
1093	\| Returns the result of converting the 32-bit two's complement integer `a'
1094	\| to the extended double-precision floating-point format. The conversion
1095	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1096	\| Arithmetic.
1097	----------------------------------------------------------------------------/
1098
1099	floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1100	{
1101	flag zSign;
1102	uint32 absA;
1103	int8 shiftCount;
1104	bits64 zSig;
1105
1106	if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1107	zSign = ( a < 0 );
1108	absA = zSign ? - a : a;
1109	shiftCount = countLeadingZeros32( absA ) + 32;
1110	zSig = absA;
1111	return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1112
1113	}
1114
1115	#endif
1116
1117	#ifdef FLOAT128
1118
1119	/*----------------------------------------------------------------------------
1120	\| Returns the result of converting the 32-bit two's complement integer `a' to
1121	\| the quadruple-precision floating-point format. The conversion is performed
1122	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1123	----------------------------------------------------------------------------/
1124
1125	float128 int32_to_float128( int32 a STATUS_PARAM )
1126	{
1127	flag zSign;
1128	uint32 absA;
1129	int8 shiftCount;
1130	bits64 zSig0;
1131
1132	if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1133	zSign = ( a < 0 );
1134	absA = zSign ? - a : a;
1135	shiftCount = countLeadingZeros32( absA ) + 17;
1136	zSig0 = absA;
1137	return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1138
1139	}
1140
1141	#endif
1142
1143	/*----------------------------------------------------------------------------
1144	\| Returns the result of converting the 64-bit two's complement integer `a'
1145	\| to the single-precision floating-point format. The conversion is performed
1146	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1147	----------------------------------------------------------------------------/
1148
1149	float32 int64_to_float32( int64 a STATUS_PARAM )
1150	{
1151	flag zSign;
1152	uint64 absA;
1153	int8 shiftCount;
1154
1155	if ( a == 0 ) return float32_zero;
1156	zSign = ( a < 0 );
1157	absA = zSign ? - a : a;
1158	shiftCount = countLeadingZeros64( absA ) - 40;
1159	if ( 0 <= shiftCount ) {
1160	return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1161	}
1162	else {
1163	shiftCount += 7;
1164	if ( shiftCount < 0 ) {
1165	shift64RightJamming( absA, - shiftCount, &absA );
1166	}
1167	else {
1168	absA <<= shiftCount;
1169	}
1170	return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1171	}
1172
1173	}
1174
1175	float32 uint64_to_float32( uint64 a STATUS_PARAM )
1176	{
1177	int8 shiftCount;
1178
1179	if ( a == 0 ) return float32_zero;
1180	shiftCount = countLeadingZeros64( a ) - 40;
1181	if ( 0 <= shiftCount ) {
1182	return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1183	}
1184	else {
1185	shiftCount += 7;
1186	if ( shiftCount < 0 ) {
1187	shift64RightJamming( a, - shiftCount, &a );
1188	}
1189	else {
1190	a <<= shiftCount;
1191	}
1192	return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1193	}
1194	}
1195
1196	/*----------------------------------------------------------------------------
1197	\| Returns the result of converting the 64-bit two's complement integer `a'
1198	\| to the double-precision floating-point format. The conversion is performed
1199	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1200	----------------------------------------------------------------------------/
1201
1202	float64 int64_to_float64( int64 a STATUS_PARAM )
1203	{
1204	flag zSign;
1205
1206	if ( a == 0 ) return float64_zero;
1207	if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1208	return packFloat64( 1, 0x43E, 0 );
1209	}
1210	zSign = ( a < 0 );
1211	return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1212
1213	}
1214
1215	float64 uint64_to_float64( uint64 a STATUS_PARAM )
1216	{
1217	if ( a == 0 ) return float64_zero;
1218	return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1219
1220	}
1221
1222	#ifdef FLOATX80
1223
1224	/*----------------------------------------------------------------------------
1225	\| Returns the result of converting the 64-bit two's complement integer `a'
1226	\| to the extended double-precision floating-point format. The conversion
1227	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1228	\| Arithmetic.
1229	----------------------------------------------------------------------------/
1230
1231	floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1232	{
1233	flag zSign;
1234	uint64 absA;
1235	int8 shiftCount;
1236
1237	if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1238	zSign = ( a < 0 );
1239	absA = zSign ? - a : a;
1240	shiftCount = countLeadingZeros64( absA );
1241	return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1242
1243	}
1244
1245	#endif
1246
1247	#ifdef FLOAT128
1248
1249	/*----------------------------------------------------------------------------
1250	\| Returns the result of converting the 64-bit two's complement integer `a' to
1251	\| the quadruple-precision floating-point format. The conversion is performed
1252	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1253	----------------------------------------------------------------------------/
1254
1255	float128 int64_to_float128( int64 a STATUS_PARAM )
1256	{
1257	flag zSign;
1258	uint64 absA;
1259	int8 shiftCount;
1260	int32 zExp;
1261	bits64 zSig0, zSig1;
1262
1263	if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1264	zSign = ( a < 0 );
1265	absA = zSign ? - a : a;
1266	shiftCount = countLeadingZeros64( absA ) + 49;
1267	zExp = 0x406E - shiftCount;
1268	if ( 64 <= shiftCount ) {
1269	zSig1 = 0;
1270	zSig0 = absA;
1271	shiftCount -= 64;
1272	}
1273	else {
1274	zSig1 = absA;
1275	zSig0 = 0;
1276	}
1277	shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1278	return packFloat128( zSign, zExp, zSig0, zSig1 );
1279
1280	}
1281
1282	#endif
1283
1284	/*----------------------------------------------------------------------------
1285	\| Returns the result of converting the single-precision floating-point value
1286	\| `a' to the 32-bit two's complement integer format. The conversion is
1287	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
1288	\| Arithmetic---which means in particular that the conversion is rounded
1289	\| according to the current rounding mode. If `a' is a NaN, the largest
1290	\| positive integer is returned. Otherwise, if the conversion overflows, the
1291	\| largest integer with the same sign as `a' is returned.
1292	----------------------------------------------------------------------------/
1293
1294	int32 float32_to_int32( float32 a STATUS_PARAM )
1295	{
1296	flag aSign;
1297	int16 aExp, shiftCount;
1298	bits32 aSig;
1299	bits64 aSig64;
1300
1301	aSig = extractFloat32Frac( a );
1302	aExp = extractFloat32Exp( a );
1303	aSign = extractFloat32Sign( a );
1304	if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1305	if ( aExp ) aSig \|= 0x00800000;
1306	shiftCount = 0xAF - aExp;
1307	aSig64 = aSig;
1308	aSig64 <<= 32;
1309	if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1310	return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1311
1312	}
1313
1314	/*----------------------------------------------------------------------------
1315	\| Returns the result of converting the single-precision floating-point value
1316	\| `a' to the 32-bit two's complement integer format. The conversion is
1317	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
1318	\| Arithmetic, except that the conversion is always rounded toward zero.
1319	\| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1320	\| the conversion overflows, the largest integer with the same sign as `a' is
1321	\| returned.
1322	----------------------------------------------------------------------------/
1323
1324	int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1325	{
1326	flag aSign;
1327	int16 aExp, shiftCount;
1328	bits32 aSig;
1329	int32 z;
1330
1331	aSig = extractFloat32Frac( a );
1332	aExp = extractFloat32Exp( a );
1333	aSign = extractFloat32Sign( a );
1334	shiftCount = aExp - 0x9E;
1335	if ( 0 <= shiftCount ) {
1336	if ( float32_val(a) != 0xCF000000 ) {
1337	float_raise( float_flag_invalid STATUS_VAR);
1338	if ( ! aSign \|\| ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1339	}
1340	return (sbits32) 0x80000000;
1341	}
1342	else if ( aExp <= 0x7E ) {
1343	if ( aExp \| aSig ) STATUS(float_exception_flags) \|= float_flag_inexact;
1344	return 0;
1345	}
1346	aSig = ( aSig \| 0x00800000 )<<8;
1347	z = aSig>>( - shiftCount );
1348	if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1349	STATUS(float_exception_flags) \|= float_flag_inexact;
1350	}
1351	if ( aSign ) z = - z;
1352	return z;
1353
1354	}
1355
1356	/*----------------------------------------------------------------------------
1357	\| Returns the result of converting the single-precision floating-point value
1358	\| `a' to the 64-bit two's complement integer format. The conversion is
1359	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
1360	\| Arithmetic---which means in particular that the conversion is rounded
1361	\| according to the current rounding mode. If `a' is a NaN, the largest
1362	\| positive integer is returned. Otherwise, if the conversion overflows, the
1363	\| largest integer with the same sign as `a' is returned.
1364	----------------------------------------------------------------------------/
1365
1366	int64 float32_to_int64( float32 a STATUS_PARAM )
1367	{
1368	flag aSign;
1369	int16 aExp, shiftCount;
1370	bits32 aSig;
1371	bits64 aSig64, aSigExtra;
1372
1373	aSig = extractFloat32Frac( a );
1374	aExp = extractFloat32Exp( a );
1375	aSign = extractFloat32Sign( a );
1376	shiftCount = 0xBE - aExp;
1377	if ( shiftCount < 0 ) {
1378	float_raise( float_flag_invalid STATUS_VAR);
1379	if ( ! aSign \|\| ( ( aExp == 0xFF ) && aSig ) ) {
1380	return LIT64( 0x7FFFFFFFFFFFFFFF );
1381	}
1382	return (sbits64) LIT64( 0x8000000000000000 );
1383	}
1384	if ( aExp ) aSig \|= 0x00800000;
1385	aSig64 = aSig;
1386	aSig64 <<= 40;
1387	shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1388	return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1389
1390	}
1391
1392	/*----------------------------------------------------------------------------
1393	\| Returns the result of converting the single-precision floating-point value
1394	\| `a' to the 64-bit two's complement integer format. The conversion is
1395	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
1396	\| Arithmetic, except that the conversion is always rounded toward zero. If
1397	\| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1398	\| conversion overflows, the largest integer with the same sign as `a' is
1399	\| returned.
1400	----------------------------------------------------------------------------/
1401
1402	int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1403	{
1404	flag aSign;
1405	int16 aExp, shiftCount;
1406	bits32 aSig;
1407	bits64 aSig64;
1408	int64 z;
1409
1410	aSig = extractFloat32Frac( a );
1411	aExp = extractFloat32Exp( a );
1412	aSign = extractFloat32Sign( a );
1413	shiftCount = aExp - 0xBE;
1414	if ( 0 <= shiftCount ) {
1415	if ( float32_val(a) != 0xDF000000 ) {
1416	float_raise( float_flag_invalid STATUS_VAR);
1417	if ( ! aSign \|\| ( ( aExp == 0xFF ) && aSig ) ) {
1418	return LIT64( 0x7FFFFFFFFFFFFFFF );
1419	}
1420	}
1421	return (sbits64) LIT64( 0x8000000000000000 );
1422	}
1423	else if ( aExp <= 0x7E ) {
1424	if ( aExp \| aSig ) STATUS(float_exception_flags) \|= float_flag_inexact;
1425	return 0;
1426	}
1427	aSig64 = aSig \| 0x00800000;
1428	aSig64 <<= 40;
1429	z = aSig64>>( - shiftCount );
1430	if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1431	STATUS(float_exception_flags) \|= float_flag_inexact;
1432	}
1433	if ( aSign ) z = - z;
1434	return z;
1435
1436	}
1437
1438	/*----------------------------------------------------------------------------
1439	\| Returns the result of converting the single-precision floating-point value
1440	\| `a' to the double-precision floating-point format. The conversion is
1441	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
1442	\| Arithmetic.
1443	----------------------------------------------------------------------------/
1444
1445	float64 float32_to_float64( float32 a STATUS_PARAM )
1446	{
1447	flag aSign;
1448	int16 aExp;
1449	bits32 aSig;
1450
1451	aSig = extractFloat32Frac( a );
1452	aExp = extractFloat32Exp( a );
1453	aSign = extractFloat32Sign( a );
1454	if ( aExp == 0xFF ) {
1455	if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1456	return packFloat64( aSign, 0x7FF, 0 );
1457	}
1458	if ( aExp == 0 ) {
1459	if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1460	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1461	--aExp;
1462	}
1463	return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1464
1465	}
1466
1467	#ifdef FLOATX80
1468
1469	/*----------------------------------------------------------------------------
1470	\| Returns the result of converting the single-precision floating-point value
1471	\| `a' to the extended double-precision floating-point format. The conversion
1472	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1473	\| Arithmetic.
1474	----------------------------------------------------------------------------/
1475
1476	floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1477	{
1478	flag aSign;
1479	int16 aExp;
1480	bits32 aSig;
1481
1482	aSig = extractFloat32Frac( a );
1483	aExp = extractFloat32Exp( a );
1484	aSign = extractFloat32Sign( a );
1485	if ( aExp == 0xFF ) {
1486	if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1487	return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1488	}
1489	if ( aExp == 0 ) {
1490	if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1491	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1492	}
1493	aSig \|= 0x00800000;
1494	return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1495
1496	}
1497
1498	#endif
1499
1500	#ifdef FLOAT128
1501
/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the quadruple-precision floating-point format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/
1508
1509	float128 float32_to_float128( float32 a STATUS_PARAM )
1510	{
1511	flag aSign;
1512	int16 aExp;
1513	bits32 aSig;
1514
1515	aSig = extractFloat32Frac( a );
1516	aExp = extractFloat32Exp( a );
1517	aSign = extractFloat32Sign( a );
1518	if ( aExp == 0xFF ) {
1519	if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1520	return packFloat128( aSign, 0x7FFF, 0, 0 );
1521	}
1522	if ( aExp == 0 ) {
1523	if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1524	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1525	--aExp;
1526	}
1527	return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1528
1529	}
1530
1531	#endif
1532
1533	/*----------------------------------------------------------------------------
1534	\| Rounds the single-precision floating-point value `a' to an integer, and
1535	\| returns the result as a single-precision floating-point value. The
1536	\| operation is performed according to the IEC/IEEE Standard for Binary
1537	\| Floating-Point Arithmetic.
1538	----------------------------------------------------------------------------/
1539
1540	float32 float32_round_to_int( float32 a STATUS_PARAM)
1541	{
1542	flag aSign;
1543	int16 aExp;
1544	bits32 lastBitMask, roundBitsMask;
1545	int8 roundingMode;
1546	bits32 z;
1547
1548	aExp = extractFloat32Exp( a );
1549	if ( 0x96 <= aExp ) {
1550	if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1551	return propagateFloat32NaN( a, a STATUS_VAR );
1552	}
1553	return a;
1554	}
1555	if ( aExp <= 0x7E ) {
1556	if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
1557	STATUS(float_exception_flags) \|= float_flag_inexact;
1558	aSign = extractFloat32Sign( a );
1559	switch ( STATUS(float_rounding_mode) ) {
1560	case float_round_nearest_even:
1561	if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1562	return packFloat32( aSign, 0x7F, 0 );
1563	}
1564	break;
1565	case float_round_down:
1566	return make_float32(aSign ? 0xBF800000 : 0);
1567	case float_round_up:
1568	return make_float32(aSign ? 0x80000000 : 0x3F800000);
1569	}
1570	return packFloat32( aSign, 0, 0 );
1571	}
1572	lastBitMask = 1;
1573	lastBitMask <<= 0x96 - aExp;
1574	roundBitsMask = lastBitMask - 1;
1575	z = float32_val(a);
1576	roundingMode = STATUS(float_rounding_mode);
1577	if ( roundingMode == float_round_nearest_even ) {
1578	z += lastBitMask>>1;
1579	if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1580	}
1581	else if ( roundingMode != float_round_to_zero ) {
1582	if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1583	z += roundBitsMask;
1584	}
1585	}
1586	z &= ~ roundBitsMask;
1587	if ( z != float32_val(a) ) STATUS(float_exception_flags) \|= float_flag_inexact;
1588	return make_float32(z);
1589
1590	}
1591
1592	/*----------------------------------------------------------------------------
1593	\| Returns the result of adding the absolute values of the single-precision
1594	\| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1595	\| before being returned. `zSign' is ignored if the result is a NaN.
1596	\| The addition is performed according to the IEC/IEEE Standard for Binary
1597	\| Floating-Point Arithmetic.
1598	----------------------------------------------------------------------------/
1599
1600	static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1601	{
1602	int16 aExp, bExp, zExp;
1603	bits32 aSig, bSig, zSig;
1604	int16 expDiff;
1605
1606	aSig = extractFloat32Frac( a );
1607	aExp = extractFloat32Exp( a );
1608	bSig = extractFloat32Frac( b );
1609	bExp = extractFloat32Exp( b );
1610	expDiff = aExp - bExp;
1611	aSig <<= 6;
1612	bSig <<= 6;
1613	if ( 0 < expDiff ) {
1614	if ( aExp == 0xFF ) {
1615	if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1616	return a;
1617	}
1618	if ( bExp == 0 ) {
1619	--expDiff;
1620	}
1621	else {
1622	bSig \|= 0x20000000;
1623	}
1624	shift32RightJamming( bSig, expDiff, &bSig );
1625	zExp = aExp;
1626	}
1627	else if ( expDiff < 0 ) {
1628	if ( bExp == 0xFF ) {
1629	if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1630	return packFloat32( zSign, 0xFF, 0 );
1631	}
1632	if ( aExp == 0 ) {
1633	++expDiff;
1634	}
1635	else {
1636	aSig \|= 0x20000000;
1637	}
1638	shift32RightJamming( aSig, - expDiff, &aSig );
1639	zExp = bExp;
1640	}
1641	else {
1642	if ( aExp == 0xFF ) {
1643	if ( aSig \| bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1644	return a;
1645	}
1646	if ( aExp == 0 ) {
1647	if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1648	return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1649	}
1650	zSig = 0x40000000 + aSig + bSig;
1651	zExp = aExp;
1652	goto roundAndPack;
1653	}
1654	aSig \|= 0x20000000;
1655	zSig = ( aSig + bSig )<<1;
1656	--zExp;
1657	if ( (sbits32) zSig < 0 ) {
1658	zSig = aSig + bSig;
1659	++zExp;
1660	}
1661	roundAndPack:
1662	return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1663
1664	}
1665
1666	/*----------------------------------------------------------------------------
1667	\| Returns the result of subtracting the absolute values of the single-
1668	\| precision floating-point values `a' and `b'. If `zSign' is 1, the
1669	\| difference is negated before being returned. `zSign' is ignored if the
1670	\| result is a NaN. The subtraction is performed according to the IEC/IEEE
1671	\| Standard for Binary Floating-Point Arithmetic.
1672	----------------------------------------------------------------------------/
1673
1674	static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1675	{
1676	int16 aExp, bExp, zExp;
1677	bits32 aSig, bSig, zSig;
1678	int16 expDiff;
1679
1680	aSig = extractFloat32Frac( a );
1681	aExp = extractFloat32Exp( a );
1682	bSig = extractFloat32Frac( b );
1683	bExp = extractFloat32Exp( b );
1684	expDiff = aExp - bExp;
1685	aSig <<= 7;
1686	bSig <<= 7;
1687	if ( 0 < expDiff ) goto aExpBigger;
1688	if ( expDiff < 0 ) goto bExpBigger;
1689	if ( aExp == 0xFF ) {
1690	if ( aSig \| bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1691	float_raise( float_flag_invalid STATUS_VAR);
1692	return float32_default_nan;
1693	}
1694	if ( aExp == 0 ) {
1695	aExp = 1;
1696	bExp = 1;
1697	}
1698	if ( bSig < aSig ) goto aBigger;
1699	if ( aSig < bSig ) goto bBigger;
1700	return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1701	bExpBigger:
1702	if ( bExp == 0xFF ) {
1703	if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1704	return packFloat32( zSign ^ 1, 0xFF, 0 );
1705	}
1706	if ( aExp == 0 ) {
1707	++expDiff;
1708	}
1709	else {
1710	aSig \|= 0x40000000;
1711	}
1712	shift32RightJamming( aSig, - expDiff, &aSig );
1713	bSig \|= 0x40000000;
1714	bBigger:
1715	zSig = bSig - aSig;
1716	zExp = bExp;
1717	zSign ^= 1;
1718	goto normalizeRoundAndPack;
1719	aExpBigger:
1720	if ( aExp == 0xFF ) {
1721	if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1722	return a;
1723	}
1724	if ( bExp == 0 ) {
1725	--expDiff;
1726	}
1727	else {
1728	bSig \|= 0x40000000;
1729	}
1730	shift32RightJamming( bSig, expDiff, &bSig );
1731	aSig \|= 0x40000000;
1732	aBigger:
1733	zSig = aSig - bSig;
1734	zExp = aExp;
1735	normalizeRoundAndPack:
1736	--zExp;
1737	return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1738
1739	}
1740
1741	/*----------------------------------------------------------------------------
1742	\| Returns the result of adding the single-precision floating-point values `a'
1743	\| and `b'. The operation is performed according to the IEC/IEEE Standard for
1744	\| Binary Floating-Point Arithmetic.
1745	----------------------------------------------------------------------------/
1746
1747	float32 float32_add( float32 a, float32 b STATUS_PARAM )
1748	{
1749	flag aSign, bSign;
1750
1751	aSign = extractFloat32Sign( a );
1752	bSign = extractFloat32Sign( b );
1753	if ( aSign == bSign ) {
1754	return addFloat32Sigs( a, b, aSign STATUS_VAR);
1755	}
1756	else {
1757	return subFloat32Sigs( a, b, aSign STATUS_VAR );
1758	}
1759
1760	}
1761
1762	/*----------------------------------------------------------------------------
1763	\| Returns the result of subtracting the single-precision floating-point values
1764	\| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1765	\| for Binary Floating-Point Arithmetic.
1766	----------------------------------------------------------------------------/
1767
1768	float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1769	{
1770	flag aSign, bSign;
1771
1772	aSign = extractFloat32Sign( a );
1773	bSign = extractFloat32Sign( b );
1774	if ( aSign == bSign ) {
1775	return subFloat32Sigs( a, b, aSign STATUS_VAR );
1776	}
1777	else {
1778	return addFloat32Sigs( a, b, aSign STATUS_VAR );
1779	}
1780
1781	}
1782
1783	/*----------------------------------------------------------------------------
1784	\| Returns the result of multiplying the single-precision floating-point values
1785	\| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1786	\| for Binary Floating-Point Arithmetic.
1787	----------------------------------------------------------------------------/
1788
1789	float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1790	{
1791	flag aSign, bSign, zSign;
1792	int16 aExp, bExp, zExp;
1793	bits32 aSig, bSig;
1794	bits64 zSig64;
1795	bits32 zSig;
1796
1797	aSig = extractFloat32Frac( a );
1798	aExp = extractFloat32Exp( a );
1799	aSign = extractFloat32Sign( a );
1800	bSig = extractFloat32Frac( b );
1801	bExp = extractFloat32Exp( b );
1802	bSign = extractFloat32Sign( b );
1803	zSign = aSign ^ bSign;
1804	if ( aExp == 0xFF ) {
1805	if ( aSig \|\| ( ( bExp == 0xFF ) && bSig ) ) {
1806	return propagateFloat32NaN( a, b STATUS_VAR );
1807	}
1808	if ( ( bExp \| bSig ) == 0 ) {
1809	float_raise( float_flag_invalid STATUS_VAR);
1810	return float32_default_nan;
1811	}
1812	return packFloat32( zSign, 0xFF, 0 );
1813	}
1814	if ( bExp == 0xFF ) {
1815	if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1816	if ( ( aExp \| aSig ) == 0 ) {
1817	float_raise( float_flag_invalid STATUS_VAR);
1818	return float32_default_nan;
1819	}
1820	return packFloat32( zSign, 0xFF, 0 );
1821	}
1822	if ( aExp == 0 ) {
1823	if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1824	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1825	}
1826	if ( bExp == 0 ) {
1827	if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1828	normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1829	}
1830	zExp = aExp + bExp - 0x7F;
1831	aSig = ( aSig \| 0x00800000 )<<7;
1832	bSig = ( bSig \| 0x00800000 )<<8;
1833	shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1834	zSig = zSig64;
1835	if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1836	zSig <<= 1;
1837	--zExp;
1838	}
1839	return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1840
1841	}
1842
1843	/*----------------------------------------------------------------------------
1844	\| Returns the result of dividing the single-precision floating-point value `a'
1845	\| by the corresponding value `b'. The operation is performed according to the
1846	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1847	----------------------------------------------------------------------------/
1848
1849	float32 float32_div( float32 a, float32 b STATUS_PARAM )
1850	{
1851	flag aSign, bSign, zSign;
1852	int16 aExp, bExp, zExp;
1853	bits32 aSig, bSig, zSig;
1854
1855	aSig = extractFloat32Frac( a );
1856	aExp = extractFloat32Exp( a );
1857	aSign = extractFloat32Sign( a );
1858	bSig = extractFloat32Frac( b );
1859	bExp = extractFloat32Exp( b );
1860	bSign = extractFloat32Sign( b );
1861	zSign = aSign ^ bSign;
1862	if ( aExp == 0xFF ) {
1863	if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1864	if ( bExp == 0xFF ) {
1865	if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1866	float_raise( float_flag_invalid STATUS_VAR);
1867	return float32_default_nan;
1868	}
1869	return packFloat32( zSign, 0xFF, 0 );
1870	}
1871	if ( bExp == 0xFF ) {
1872	if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1873	return packFloat32( zSign, 0, 0 );
1874	}
1875	if ( bExp == 0 ) {
1876	if ( bSig == 0 ) {
1877	if ( ( aExp \| aSig ) == 0 ) {
1878	float_raise( float_flag_invalid STATUS_VAR);
1879	return float32_default_nan;
1880	}
1881	float_raise( float_flag_divbyzero STATUS_VAR);
1882	return packFloat32( zSign, 0xFF, 0 );
1883	}
1884	normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1885	}
1886	if ( aExp == 0 ) {
1887	if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1888	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1889	}
1890	zExp = aExp - bExp + 0x7D;
1891	aSig = ( aSig \| 0x00800000 )<<7;
1892	bSig = ( bSig \| 0x00800000 )<<8;
1893	if ( bSig <= ( aSig + aSig ) ) {
1894	aSig >>= 1;
1895	++zExp;
1896	}
1897	zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1898	if ( ( zSig & 0x3F ) == 0 ) {
1899	zSig \|= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1900	}
1901	return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1902
1903	}
1904
1905	/*----------------------------------------------------------------------------
1906	\| Returns the remainder of the single-precision floating-point value `a'
1907	\| with respect to the corresponding value `b'. The operation is performed
1908	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1909	----------------------------------------------------------------------------/
1910
1911	float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1912	{
1913	flag aSign, zSign;
1914	int16 aExp, bExp, expDiff;
1915	bits32 aSig, bSig;
1916	bits32 q;
1917	bits64 aSig64, bSig64, q64;
1918	bits32 alternateASig;
1919	sbits32 sigMean;
1920
1921	aSig = extractFloat32Frac( a );
1922	aExp = extractFloat32Exp( a );
1923	aSign = extractFloat32Sign( a );
1924	bSig = extractFloat32Frac( b );
1925	bExp = extractFloat32Exp( b );
1926	if ( aExp == 0xFF ) {
1927	if ( aSig \|\| ( ( bExp == 0xFF ) && bSig ) ) {
1928	return propagateFloat32NaN( a, b STATUS_VAR );
1929	}
1930	float_raise( float_flag_invalid STATUS_VAR);
1931	return float32_default_nan;
1932	}
1933	if ( bExp == 0xFF ) {
1934	if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1935	return a;
1936	}
1937	if ( bExp == 0 ) {
1938	if ( bSig == 0 ) {
1939	float_raise( float_flag_invalid STATUS_VAR);
1940	return float32_default_nan;
1941	}
1942	normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1943	}
1944	if ( aExp == 0 ) {
1945	if ( aSig == 0 ) return a;
1946	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1947	}
1948	expDiff = aExp - bExp;
1949	aSig \|= 0x00800000;
1950	bSig \|= 0x00800000;
1951	if ( expDiff < 32 ) {
1952	aSig <<= 8;
1953	bSig <<= 8;
1954	if ( expDiff < 0 ) {
1955	if ( expDiff < -1 ) return a;
1956	aSig >>= 1;
1957	}
1958	q = ( bSig <= aSig );
1959	if ( q ) aSig -= bSig;
1960	if ( 0 < expDiff ) {
1961	q = ( ( (bits64) aSig )<<32 ) / bSig;
1962	q >>= 32 - expDiff;
1963	bSig >>= 2;
1964	aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1965	}
1966	else {
1967	aSig >>= 2;
1968	bSig >>= 2;
1969	}
1970	}
1971	else {
1972	if ( bSig <= aSig ) aSig -= bSig;
1973	aSig64 = ( (bits64) aSig )<<40;
1974	bSig64 = ( (bits64) bSig )<<40;
1975	expDiff -= 64;
1976	while ( 0 < expDiff ) {
1977	q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1978	q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1979	aSig64 = - ( ( bSig * q64 )<<38 );
1980	expDiff -= 62;
1981	}
1982	expDiff += 64;
1983	q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1984	q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1985	q = q64>>( 64 - expDiff );
1986	bSig <<= 6;
1987	aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1988	}
1989	do {
1990	alternateASig = aSig;
1991	++q;
1992	aSig -= bSig;
1993	} while ( 0 <= (sbits32) aSig );
1994	sigMean = aSig + alternateASig;
1995	if ( ( sigMean < 0 ) \|\| ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1996	aSig = alternateASig;
1997	}
1998	zSign = ( (sbits32) aSig < 0 );
1999	if ( zSign ) aSig = - aSig;
2000	return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2001
2002	}
2003
2004	/*----------------------------------------------------------------------------
2005	\| Returns the square root of the single-precision floating-point value `a'.
2006	\| The operation is performed according to the IEC/IEEE Standard for Binary
2007	\| Floating-Point Arithmetic.
2008	----------------------------------------------------------------------------/
2009
2010	float32 float32_sqrt( float32 a STATUS_PARAM )
2011	{
2012	flag aSign;
2013	int16 aExp, zExp;
2014	bits32 aSig, zSig;
2015	bits64 rem, term;
2016
2017	aSig = extractFloat32Frac( a );
2018	aExp = extractFloat32Exp( a );
2019	aSign = extractFloat32Sign( a );
2020	if ( aExp == 0xFF ) {
2021	if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2022	if ( ! aSign ) return a;
2023	float_raise( float_flag_invalid STATUS_VAR);
2024	return float32_default_nan;
2025	}
2026	if ( aSign ) {
2027	if ( ( aExp \| aSig ) == 0 ) return a;
2028	float_raise( float_flag_invalid STATUS_VAR);
2029	return float32_default_nan;
2030	}
2031	if ( aExp == 0 ) {
2032	if ( aSig == 0 ) return float32_zero;
2033	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2034	}
2035	zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2036	aSig = ( aSig \| 0x00800000 )<<8;
2037	zSig = estimateSqrt32( aExp, aSig ) + 2;
2038	if ( ( zSig & 0x7F ) <= 5 ) {
2039	if ( zSig < 2 ) {
2040	zSig = 0x7FFFFFFF;
2041	goto roundAndPack;
2042	}
2043	aSig >>= aExp & 1;
2044	term = ( (bits64) zSig ) * zSig;
2045	rem = ( ( (bits64) aSig )<<32 ) - term;
2046	while ( (sbits64) rem < 0 ) {
2047	--zSig;
2048	rem += ( ( (bits64) zSig )<<1 ) \| 1;
2049	}
2050	zSig \|= ( rem != 0 );
2051	}
2052	shift32RightJamming( zSig, 1, &zSig );
2053	roundAndPack:
2054	return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2055
2056	}
2057
/*----------------------------------------------------------------------------
| Returns the binary exponential of the single-precision floating-point value
| `a'. The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| Uses the following identities:
|
| 1. -------------------------------------------------------------------------
|      x    x*ln(2)
|     2  = e
|
| 2. -------------------------------------------------------------------------
|                      2     3     4     5           n
|      x        x     x     x     x     x           x
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
|               1!    2!    3!    4!    5!          n!
*----------------------------------------------------------------------------*/
2075
2076	static const float64 float32_exp2_coefficients[15] =
2077	{
2078	make_float64( 0x3ff0000000000000ll ), /* 1 */
2079	make_float64( 0x3fe0000000000000ll ), /* 2 */
2080	make_float64( 0x3fc5555555555555ll ), /* 3 */
2081	make_float64( 0x3fa5555555555555ll ), /* 4 */
2082	make_float64( 0x3f81111111111111ll ), /* 5 */
2083	make_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2084	make_float64( 0x3f2a01a01a01a01all ), /* 7 */
2085	make_float64( 0x3efa01a01a01a01all ), /* 8 */
2086	make_float64( 0x3ec71de3a556c734ll ), /* 9 */
2087	make_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2088	make_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2089	make_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2090	make_float64( 0x3de6124613a86d09ll ), /* 13 */
2091	make_float64( 0x3da93974a8c07c9dll ), /* 14 */
2092	make_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2093	};
2094
2095	float32 float32_exp2( float32 a STATUS_PARAM )
2096	{
2097	flag aSign;
2098	int16 aExp;
2099	bits32 aSig;
2100	float64 r, x, xn;
2101	int i;
2102
2103	aSig = extractFloat32Frac( a );
2104	aExp = extractFloat32Exp( a );
2105	aSign = extractFloat32Sign( a );
2106
2107	if ( aExp == 0xFF) {
2108	if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2109	return (aSign) ? float32_zero : a;
2110	}
2111	if (aExp == 0) {
2112	if (aSig == 0) return float32_one;
2113	}
2114
2115	float_raise( float_flag_inexact STATUS_VAR);
2116
2117	/* ******************************* */
2118	/* using float64 for approximation */
2119	/* ******************************* */
2120	x = float32_to_float64(a STATUS_VAR);
2121	x = float64_mul(x, float64_ln2 STATUS_VAR);
2122
2123	xn = x;
2124	r = float64_one;
2125	for (i = 0 ; i < 15 ; i++) {
2126	float64 f;
2127
2128	f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2129	r = float64_add(r, f STATUS_VAR);
2130
2131	xn = float64_mul(xn, x STATUS_VAR);
2132	}
2133
2134	return float64_to_float32(r, status);
2135	}
2136
2137	/*----------------------------------------------------------------------------
2138	\| Returns the binary log of the single-precision floating-point value `a'.
2139	\| The operation is performed according to the IEC/IEEE Standard for Binary
2140	\| Floating-Point Arithmetic.
2141	----------------------------------------------------------------------------/
2142	float32 float32_log2( float32 a STATUS_PARAM )
2143	{
2144	flag aSign, zSign;
2145	int16 aExp;
2146	bits32 aSig, zSig, i;
2147
2148	aSig = extractFloat32Frac( a );
2149	aExp = extractFloat32Exp( a );
2150	aSign = extractFloat32Sign( a );
2151
2152	if ( aExp == 0 ) {
2153	if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2154	normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2155	}
2156	if ( aSign ) {
2157	float_raise( float_flag_invalid STATUS_VAR);
2158	return float32_default_nan;
2159	}
2160	if ( aExp == 0xFF ) {
2161	if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2162	return a;
2163	}
2164
2165	aExp -= 0x7F;
2166	aSig \|= 0x00800000;
2167	zSign = aExp < 0;
2168	zSig = aExp << 23;
2169
2170	for (i = 1 << 22; i > 0; i >>= 1) {
2171	aSig = ( (bits64)aSig * aSig ) >> 23;
2172	if ( aSig & 0x01000000 ) {
2173	aSig >>= 1;
2174	zSig \|= i;
2175	}
2176	}
2177
2178	if ( zSign )
2179	zSig = -zSig;
2180
2181	return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2182	}
2183
2184	/*----------------------------------------------------------------------------
2185	\| Returns 1 if the single-precision floating-point value `a' is equal to
2186	\| the corresponding value `b', and 0 otherwise. The comparison is performed
2187	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2188	----------------------------------------------------------------------------/
2189
2190	int float32_eq( float32 a, float32 b STATUS_PARAM )
2191	{
2192
2193	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2194	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2195	) {
2196	if ( float32_is_signaling_nan( a ) \|\| float32_is_signaling_nan( b ) ) {
2197	float_raise( float_flag_invalid STATUS_VAR);
2198	}
2199	return 0;
2200	}
2201	return ( float32_val(a) == float32_val(b) ) \|\|
2202	( (bits32) ( ( float32_val(a) \| float32_val(b) )<<1 ) == 0 );
2203
2204	}
2205
2206	/*----------------------------------------------------------------------------
2207	\| Returns 1 if the single-precision floating-point value `a' is less than
2208	\| or equal to the corresponding value `b', and 0 otherwise. The comparison
2209	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2210	\| Arithmetic.
2211	----------------------------------------------------------------------------/
2212
2213	int float32_le( float32 a, float32 b STATUS_PARAM )
2214	{
2215	flag aSign, bSign;
2216	bits32 av, bv;
2217
2218	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2219	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2220	) {
2221	float_raise( float_flag_invalid STATUS_VAR);
2222	return 0;
2223	}
2224	aSign = extractFloat32Sign( a );
2225	bSign = extractFloat32Sign( b );
2226	av = float32_val(a);
2227	bv = float32_val(b);
2228	if ( aSign != bSign ) return aSign \|\| ( (bits32) ( ( av \| bv )<<1 ) == 0 );
2229	return ( av == bv ) \|\| ( aSign ^ ( av < bv ) );
2230
2231	}
2232
2233	/*----------------------------------------------------------------------------
2234	\| Returns 1 if the single-precision floating-point value `a' is less than
2235	\| the corresponding value `b', and 0 otherwise. The comparison is performed
2236	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2237	----------------------------------------------------------------------------/
2238
2239	int float32_lt( float32 a, float32 b STATUS_PARAM )
2240	{
2241	flag aSign, bSign;
2242	bits32 av, bv;
2243
2244	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2245	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2246	) {
2247	float_raise( float_flag_invalid STATUS_VAR);
2248	return 0;
2249	}
2250	aSign = extractFloat32Sign( a );
2251	bSign = extractFloat32Sign( b );
2252	av = float32_val(a);
2253	bv = float32_val(b);
2254	if ( aSign != bSign ) return aSign && ( (bits32) ( ( av \| bv )<<1 ) != 0 );
2255	return ( av != bv ) && ( aSign ^ ( av < bv ) );
2256
2257	}
2258
2259	/*----------------------------------------------------------------------------
2260	\| Returns 1 if the single-precision floating-point value `a' is equal to
2261	\| the corresponding value `b', and 0 otherwise. The invalid exception is
2262	\| raised if either operand is a NaN. Otherwise, the comparison is performed
2263	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2264	----------------------------------------------------------------------------/
2265
2266	int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2267	{
2268	bits32 av, bv;
2269
2270	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2271	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2272	) {
2273	float_raise( float_flag_invalid STATUS_VAR);
2274	return 0;
2275	}
2276	av = float32_val(a);
2277	bv = float32_val(b);
2278	return ( av == bv ) \|\| ( (bits32) ( ( av \| bv )<<1 ) == 0 );
2279
2280	}
2281
2282	/*----------------------------------------------------------------------------
2283	\| Returns 1 if the single-precision floating-point value `a' is less than or
2284	\| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2285	\| cause an exception. Otherwise, the comparison is performed according to the
2286	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2287	----------------------------------------------------------------------------/
2288
2289	int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2290	{
2291	flag aSign, bSign;
2292	bits32 av, bv;
2293
2294	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2295	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2296	) {
2297	if ( float32_is_signaling_nan( a ) \|\| float32_is_signaling_nan( b ) ) {
2298	float_raise( float_flag_invalid STATUS_VAR);
2299	}
2300	return 0;
2301	}
2302	aSign = extractFloat32Sign( a );
2303	bSign = extractFloat32Sign( b );
2304	av = float32_val(a);
2305	bv = float32_val(b);
2306	if ( aSign != bSign ) return aSign \|\| ( (bits32) ( ( av \| bv )<<1 ) == 0 );
2307	return ( av == bv ) \|\| ( aSign ^ ( av < bv ) );
2308
2309	}
2310
2311	/*----------------------------------------------------------------------------
2312	\| Returns 1 if the single-precision floating-point value `a' is less than
2313	\| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2314	\| exception. Otherwise, the comparison is performed according to the IEC/IEEE
2315	\| Standard for Binary Floating-Point Arithmetic.
2316	----------------------------------------------------------------------------/
2317
2318	int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2319	{
2320	flag aSign, bSign;
2321	bits32 av, bv;
2322
2323	if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2324	\|\| ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2325	) {
2326	if ( float32_is_signaling_nan( a ) \|\| float32_is_signaling_nan( b ) ) {
2327	float_raise( float_flag_invalid STATUS_VAR);
2328	}
2329	return 0;
2330	}
2331	aSign = extractFloat32Sign( a );
2332	bSign = extractFloat32Sign( b );
2333	av = float32_val(a);
2334	bv = float32_val(b);
2335	if ( aSign != bSign ) return aSign && ( (bits32) ( ( av \| bv )<<1 ) != 0 );
2336	return ( av != bv ) && ( aSign ^ ( av < bv ) );
2337
2338	}
2339
2340	/*----------------------------------------------------------------------------
2341	\| Returns the result of converting the double-precision floating-point value
2342	\| `a' to the 32-bit two's complement integer format. The conversion is
2343	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
2344	\| Arithmetic---which means in particular that the conversion is rounded
2345	\| according to the current rounding mode. If `a' is a NaN, the largest
2346	\| positive integer is returned. Otherwise, if the conversion overflows, the
2347	\| largest integer with the same sign as `a' is returned.
2348	----------------------------------------------------------------------------/
2349
2350	int32 float64_to_int32( float64 a STATUS_PARAM )
2351	{
2352	flag aSign;
2353	int16 aExp, shiftCount;
2354	bits64 aSig;
2355
2356	aSig = extractFloat64Frac( a );
2357	aExp = extractFloat64Exp( a );
2358	aSign = extractFloat64Sign( a );
2359	if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2360	if ( aExp ) aSig \|= LIT64( 0x0010000000000000 );
2361	shiftCount = 0x42C - aExp;
2362	if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2363	return roundAndPackInt32( aSign, aSig STATUS_VAR );
2364
2365	}
2366
2367	/*----------------------------------------------------------------------------
2368	\| Returns the result of converting the double-precision floating-point value
2369	\| `a' to the 32-bit two's complement integer format. The conversion is
2370	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
2371	\| Arithmetic, except that the conversion is always rounded toward zero.
2372	\| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2373	\| the conversion overflows, the largest integer with the same sign as `a' is
2374	\| returned.
2375	----------------------------------------------------------------------------/
2376
2377	int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2378	{
2379	flag aSign;
2380	int16 aExp, shiftCount;
2381	bits64 aSig, savedASig;
2382	int32 z;
2383
2384	aSig = extractFloat64Frac( a );
2385	aExp = extractFloat64Exp( a );
2386	aSign = extractFloat64Sign( a );
2387	if ( 0x41E < aExp ) {
2388	if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2389	goto invalid;
2390	}
2391	else if ( aExp < 0x3FF ) {
2392	if ( aExp \|\| aSig ) STATUS(float_exception_flags) \|= float_flag_inexact;
2393	return 0;
2394	}
2395	aSig \|= LIT64( 0x0010000000000000 );
2396	shiftCount = 0x433 - aExp;
2397	savedASig = aSig;
2398	aSig >>= shiftCount;
2399	z = aSig;
2400	if ( aSign ) z = - z;
2401	if ( ( z < 0 ) ^ aSign ) {
2402	invalid:
2403	float_raise( float_flag_invalid STATUS_VAR);
2404	return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2405	}
2406	if ( ( aSig<<shiftCount ) != savedASig ) {
2407	STATUS(float_exception_flags) \|= float_flag_inexact;
2408	}
2409	return z;
2410
2411	}
2412
2413	/*----------------------------------------------------------------------------
2414	\| Returns the result of converting the double-precision floating-point value
2415	\| `a' to the 64-bit two's complement integer format. The conversion is
2416	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
2417	\| Arithmetic---which means in particular that the conversion is rounded
2418	\| according to the current rounding mode. If `a' is a NaN, the largest
2419	\| positive integer is returned. Otherwise, if the conversion overflows, the
2420	\| largest integer with the same sign as `a' is returned.
2421	----------------------------------------------------------------------------/
2422
2423	int64 float64_to_int64( float64 a STATUS_PARAM )
2424	{
2425	flag aSign;
2426	int16 aExp, shiftCount;
2427	bits64 aSig, aSigExtra;
2428
2429	aSig = extractFloat64Frac( a );
2430	aExp = extractFloat64Exp( a );
2431	aSign = extractFloat64Sign( a );
2432	if ( aExp ) aSig \|= LIT64( 0x0010000000000000 );
2433	shiftCount = 0x433 - aExp;
2434	if ( shiftCount <= 0 ) {
2435	if ( 0x43E < aExp ) {
2436	float_raise( float_flag_invalid STATUS_VAR);
2437	if ( ! aSign
2438	\|\| ( ( aExp == 0x7FF )
2439	&& ( aSig != LIT64( 0x0010000000000000 ) ) )
2440	) {
2441	return LIT64( 0x7FFFFFFFFFFFFFFF );
2442	}
2443	return (sbits64) LIT64( 0x8000000000000000 );
2444	}
2445	aSigExtra = 0;
2446	aSig <<= - shiftCount;
2447	}
2448	else {
2449	shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2450	}
2451	return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2452
2453	}
2454
2455	/*----------------------------------------------------------------------------
2456	\| Returns the result of converting the double-precision floating-point value
2457	\| `a' to the 64-bit two's complement integer format. The conversion is
2458	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
2459	\| Arithmetic, except that the conversion is always rounded toward zero.
2460	\| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2461	\| the conversion overflows, the largest integer with the same sign as `a' is
2462	\| returned.
2463	----------------------------------------------------------------------------/
2464
2465	int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2466	{
2467	flag aSign;
2468	int16 aExp, shiftCount;
2469	bits64 aSig;
2470	int64 z;
2471
2472	aSig = extractFloat64Frac( a );
2473	aExp = extractFloat64Exp( a );
2474	aSign = extractFloat64Sign( a );
2475	if ( aExp ) aSig \|= LIT64( 0x0010000000000000 );
2476	shiftCount = aExp - 0x433;
2477	if ( 0 <= shiftCount ) {
2478	if ( 0x43E <= aExp ) {
2479	if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2480	float_raise( float_flag_invalid STATUS_VAR);
2481	if ( ! aSign
2482	\|\| ( ( aExp == 0x7FF )
2483	&& ( aSig != LIT64( 0x0010000000000000 ) ) )
2484	) {
2485	return LIT64( 0x7FFFFFFFFFFFFFFF );
2486	}
2487	}
2488	return (sbits64) LIT64( 0x8000000000000000 );
2489	}
2490	z = aSig<<shiftCount;
2491	}
2492	else {
2493	if ( aExp < 0x3FE ) {
2494	if ( aExp \| aSig ) STATUS(float_exception_flags) \|= float_flag_inexact;
2495	return 0;
2496	}
2497	z = aSig>>( - shiftCount );
2498	if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2499	STATUS(float_exception_flags) \|= float_flag_inexact;
2500	}
2501	}
2502	if ( aSign ) z = - z;
2503	return z;
2504
2505	}
2506
2507	/*----------------------------------------------------------------------------
2508	\| Returns the result of converting the double-precision floating-point value
2509	\| `a' to the single-precision floating-point format. The conversion is
2510	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
2511	\| Arithmetic.
2512	----------------------------------------------------------------------------/
2513
2514	float32 float64_to_float32( float64 a STATUS_PARAM )
2515	{
2516	flag aSign;
2517	int16 aExp;
2518	bits64 aSig;
2519	bits32 zSig;
2520
2521	aSig = extractFloat64Frac( a );
2522	aExp = extractFloat64Exp( a );
2523	aSign = extractFloat64Sign( a );
2524	if ( aExp == 0x7FF ) {
2525	if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2526	return packFloat32( aSign, 0xFF, 0 );
2527	}
2528	shift64RightJamming( aSig, 22, &aSig );
2529	zSig = aSig;
2530	if ( aExp \|\| zSig ) {
2531	zSig \|= 0x40000000;
2532	aExp -= 0x381;
2533	}
2534	return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2535
2536	}
2537
2538
2539	/*----------------------------------------------------------------------------
2540	\| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2541	\| half-precision floating-point value, returning the result. After being
2542	\| shifted into the proper positions, the three fields are simply added
2543	\| together to form the result. This means that any integer portion of `zSig'
2544	\| will be added into the exponent. Since a properly normalized significand
2545	\| will have an integer portion equal to 1, the `zExp' input should be 1 less
2546	\| than the desired result exponent whenever `zSig' is a complete, normalized
2547	\| significand.
2548	----------------------------------------------------------------------------/
2549	static bits16 packFloat16(flag zSign, int16 zExp, bits16 zSig)
2550	{
2551	return (((bits32)zSign) << 15) + (((bits32)zExp) << 10) + zSig;
2552	}
2553
2554	/* Half precision floats come in two formats: standard IEEE and "ARM" format.
2555	The latter gains extra exponent range by omitting the NaN/Inf encodings. */
2556
2557	float32 float16_to_float32( bits16 a, flag ieee STATUS_PARAM )
2558	{
2559	flag aSign;
2560	int16 aExp;
2561	bits32 aSig;
2562
2563	aSign = a >> 15;
2564	aExp = (a >> 10) & 0x1f;
2565	aSig = a & 0x3ff;
2566
2567	if (aExp == 0x1f && ieee) {
2568	if (aSig) {
2569	/* Make sure correct exceptions are raised. */
2570	float32ToCommonNaN(a STATUS_VAR);
2571	aSig \|= 0x200;
2572	}
2573	return packFloat32(aSign, 0xff, aSig << 13);
2574	}
2575	if (aExp == 0) {
2576	int8 shiftCount;
2577
2578	if (aSig == 0) {
2579	return packFloat32(aSign, 0, 0);
2580	}
2581
2582	shiftCount = countLeadingZeros32( aSig ) - 21;
2583	aSig = aSig << shiftCount;
2584	aExp = -shiftCount;
2585	}
2586	return packFloat32( aSign, aExp + 0x70, aSig << 13);
2587	}
2588
2589	bits16 float32_to_float16( float32 a, flag ieee STATUS_PARAM)
2590	{
2591	flag aSign;
2592	int16 aExp;
2593	bits32 aSig;
2594	bits32 mask;
2595	bits32 increment;
2596	int8 roundingMode;
2597
2598	aSig = extractFloat32Frac( a );
2599	aExp = extractFloat32Exp( a );
2600	aSign = extractFloat32Sign( a );
2601	if ( aExp == 0xFF ) {
2602	if (aSig) {
2603	/* Make sure correct exceptions are raised. */
2604	float32ToCommonNaN(a STATUS_VAR);
2605	aSig \|= 0x00400000;
2606	}
2607	return packFloat16(aSign, 0x1f, aSig >> 13);
2608	}
2609	if (aExp == 0 && aSign == 0) {
2610	return packFloat16(aSign, 0, 0);
2611	}
2612	/* Decimal point between bits 22 and 23. */
2613	aSig \|= 0x00800000;
2614	aExp -= 0x7f;
2615	if (aExp < -14) {
2616	mask = 0x007fffff;
2617	if (aExp < -24) {
2618	aExp = -25;
2619	} else {
2620	mask >>= 24 + aExp;
2621	}
2622	} else {
2623	mask = 0x00001fff;
2624	}
2625	if (aSig & mask) {
2626	float_raise( float_flag_underflow STATUS_VAR );
2627	roundingMode = STATUS(float_rounding_mode);
2628	switch (roundingMode) {
2629	case float_round_nearest_even:
2630	increment = (mask + 1) >> 1;
2631	if ((aSig & mask) == increment) {
2632	increment = aSig & (increment << 1);
2633	}
2634	break;
2635	case float_round_up:
2636	increment = aSign ? 0 : mask;
2637	break;
2638	case float_round_down:
2639	increment = aSign ? mask : 0;
2640	break;
2641	default: /* round_to_zero */
2642	increment = 0;
2643	break;
2644	}
2645	aSig += increment;
2646	if (aSig >= 0x01000000) {
2647	aSig >>= 1;
2648	aExp++;
2649	}
2650	} else if (aExp < -14
2651	&& STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2652	float_raise( float_flag_underflow STATUS_VAR);
2653	}
2654
2655	if (ieee) {
2656	if (aExp > 15) {
2657	float_raise( float_flag_overflow \| float_flag_inexact STATUS_VAR);
2658	return packFloat16(aSign, 0x1f, 0);
2659	}
2660	} else {
2661	if (aExp > 16) {
2662	float_raise( float_flag_overflow \| float_flag_inexact STATUS_VAR);
2663	return packFloat16(aSign, 0x1f, 0x3ff);
2664	}
2665	}
2666	if (aExp < -24) {
2667	return packFloat16(aSign, 0, 0);
2668	}
2669	if (aExp < -14) {
2670	aSig >>= -14 - aExp;
2671	aExp = -14;
2672	}
2673	return packFloat16(aSign, aExp + 14, aSig >> 13);
2674	}
2675
2676	#ifdef FLOATX80
2677
2678	/*----------------------------------------------------------------------------
2679	\| Returns the result of converting the double-precision floating-point value
2680	\| `a' to the extended double-precision floating-point format. The conversion
2681	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2682	\| Arithmetic.
2683	----------------------------------------------------------------------------/
2684
2685	floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2686	{
2687	flag aSign;
2688	int16 aExp;
2689	bits64 aSig;
2690
2691	aSig = extractFloat64Frac( a );
2692	aExp = extractFloat64Exp( a );
2693	aSign = extractFloat64Sign( a );
2694	if ( aExp == 0x7FF ) {
2695	if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2696	return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2697	}
2698	if ( aExp == 0 ) {
2699	if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2700	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2701	}
2702	return
2703	packFloatx80(
2704	aSign, aExp + 0x3C00, ( aSig \| LIT64( 0x0010000000000000 ) )<<11 );
2705
2706	}
2707
2708	#endif
2709
2710	#ifdef FLOAT128
2711
2712	/*----------------------------------------------------------------------------
2713	\| Returns the result of converting the double-precision floating-point value
2714	\| `a' to the quadruple-precision floating-point format. The conversion is
2715	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
2716	\| Arithmetic.
2717	----------------------------------------------------------------------------/
2718
2719	float128 float64_to_float128( float64 a STATUS_PARAM )
2720	{
2721	flag aSign;
2722	int16 aExp;
2723	bits64 aSig, zSig0, zSig1;
2724
2725	aSig = extractFloat64Frac( a );
2726	aExp = extractFloat64Exp( a );
2727	aSign = extractFloat64Sign( a );
2728	if ( aExp == 0x7FF ) {
2729	if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2730	return packFloat128( aSign, 0x7FFF, 0, 0 );
2731	}
2732	if ( aExp == 0 ) {
2733	if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2734	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2735	--aExp;
2736	}
2737	shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2738	return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2739
2740	}
2741
2742	#endif
2743
2744	/*----------------------------------------------------------------------------
2745	\| Rounds the double-precision floating-point value `a' to an integer, and
2746	\| returns the result as a double-precision floating-point value. The
2747	\| operation is performed according to the IEC/IEEE Standard for Binary
2748	\| Floating-Point Arithmetic.
2749	----------------------------------------------------------------------------/
2750
2751	float64 float64_round_to_int( float64 a STATUS_PARAM )
2752	{
2753	flag aSign;
2754	int16 aExp;
2755	bits64 lastBitMask, roundBitsMask;
2756	int8 roundingMode;
2757	bits64 z;
2758
2759	aExp = extractFloat64Exp( a );
2760	if ( 0x433 <= aExp ) {
2761	if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2762	return propagateFloat64NaN( a, a STATUS_VAR );
2763	}
2764	return a;
2765	}
2766	if ( aExp < 0x3FF ) {
2767	if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
2768	STATUS(float_exception_flags) \|= float_flag_inexact;
2769	aSign = extractFloat64Sign( a );
2770	switch ( STATUS(float_rounding_mode) ) {
2771	case float_round_nearest_even:
2772	if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2773	return packFloat64( aSign, 0x3FF, 0 );
2774	}
2775	break;
2776	case float_round_down:
2777	return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2778	case float_round_up:
2779	return make_float64(
2780	aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2781	}
2782	return packFloat64( aSign, 0, 0 );
2783	}
2784	lastBitMask = 1;
2785	lastBitMask <<= 0x433 - aExp;
2786	roundBitsMask = lastBitMask - 1;
2787	z = float64_val(a);
2788	roundingMode = STATUS(float_rounding_mode);
2789	if ( roundingMode == float_round_nearest_even ) {
2790	z += lastBitMask>>1;
2791	if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2792	}
2793	else if ( roundingMode != float_round_to_zero ) {
2794	if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
2795	z += roundBitsMask;
2796	}
2797	}
2798	z &= ~ roundBitsMask;
2799	if ( z != float64_val(a) )
2800	STATUS(float_exception_flags) \|= float_flag_inexact;
2801	return make_float64(z);
2802
2803	}
2804
2805	float64 float64_trunc_to_int( float64 a STATUS_PARAM)
2806	{
2807	int oldmode;
2808	float64 res;
2809	oldmode = STATUS(float_rounding_mode);
2810	STATUS(float_rounding_mode) = float_round_to_zero;
2811	res = float64_round_to_int(a STATUS_VAR);
2812	STATUS(float_rounding_mode) = oldmode;
2813	return res;
2814	}
2815
2816	/*----------------------------------------------------------------------------
2817	\| Returns the result of adding the absolute values of the double-precision
2818	\| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2819	\| before being returned. `zSign' is ignored if the result is a NaN.
2820	\| The addition is performed according to the IEC/IEEE Standard for Binary
2821	\| Floating-Point Arithmetic.
2822	----------------------------------------------------------------------------/
2823
2824	static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2825	{
2826	int16 aExp, bExp, zExp;
2827	bits64 aSig, bSig, zSig;
2828	int16 expDiff;
2829
2830	aSig = extractFloat64Frac( a );
2831	aExp = extractFloat64Exp( a );
2832	bSig = extractFloat64Frac( b );
2833	bExp = extractFloat64Exp( b );
2834	expDiff = aExp - bExp;
2835	aSig <<= 9;
2836	bSig <<= 9;
2837	if ( 0 < expDiff ) {
2838	if ( aExp == 0x7FF ) {
2839	if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2840	return a;
2841	}
2842	if ( bExp == 0 ) {
2843	--expDiff;
2844	}
2845	else {
2846	bSig \|= LIT64( 0x2000000000000000 );
2847	}
2848	shift64RightJamming( bSig, expDiff, &bSig );
2849	zExp = aExp;
2850	}
2851	else if ( expDiff < 0 ) {
2852	if ( bExp == 0x7FF ) {
2853	if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2854	return packFloat64( zSign, 0x7FF, 0 );
2855	}
2856	if ( aExp == 0 ) {
2857	++expDiff;
2858	}
2859	else {
2860	aSig \|= LIT64( 0x2000000000000000 );
2861	}
2862	shift64RightJamming( aSig, - expDiff, &aSig );
2863	zExp = bExp;
2864	}
2865	else {
2866	if ( aExp == 0x7FF ) {
2867	if ( aSig \| bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2868	return a;
2869	}
2870	if ( aExp == 0 ) {
2871	if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
2872	return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2873	}
2874	zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2875	zExp = aExp;
2876	goto roundAndPack;
2877	}
2878	aSig \|= LIT64( 0x2000000000000000 );
2879	zSig = ( aSig + bSig )<<1;
2880	--zExp;
2881	if ( (sbits64) zSig < 0 ) {
2882	zSig = aSig + bSig;
2883	++zExp;
2884	}
2885	roundAndPack:
2886	return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2887
2888	}
2889
2890	/*----------------------------------------------------------------------------
2891	\| Returns the result of subtracting the absolute values of the double-
2892	\| precision floating-point values `a' and `b'. If `zSign' is 1, the
2893	\| difference is negated before being returned. `zSign' is ignored if the
2894	\| result is a NaN. The subtraction is performed according to the IEC/IEEE
2895	\| Standard for Binary Floating-Point Arithmetic.
2896	----------------------------------------------------------------------------/
2897
2898	static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2899	{
2900	int16 aExp, bExp, zExp;
2901	bits64 aSig, bSig, zSig;
2902	int16 expDiff;
2903
2904	aSig = extractFloat64Frac( a );
2905	aExp = extractFloat64Exp( a );
2906	bSig = extractFloat64Frac( b );
2907	bExp = extractFloat64Exp( b );
2908	expDiff = aExp - bExp;
2909	aSig <<= 10;
2910	bSig <<= 10;
2911	if ( 0 < expDiff ) goto aExpBigger;
2912	if ( expDiff < 0 ) goto bExpBigger;
2913	if ( aExp == 0x7FF ) {
2914	if ( aSig \| bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2915	float_raise( float_flag_invalid STATUS_VAR);
2916	return float64_default_nan;
2917	}
2918	if ( aExp == 0 ) {
2919	aExp = 1;
2920	bExp = 1;
2921	}
2922	if ( bSig < aSig ) goto aBigger;
2923	if ( aSig < bSig ) goto bBigger;
2924	return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2925	bExpBigger:
2926	if ( bExp == 0x7FF ) {
2927	if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2928	return packFloat64( zSign ^ 1, 0x7FF, 0 );
2929	}
2930	if ( aExp == 0 ) {
2931	++expDiff;
2932	}
2933	else {
2934	aSig \|= LIT64( 0x4000000000000000 );
2935	}
2936	shift64RightJamming( aSig, - expDiff, &aSig );
2937	bSig \|= LIT64( 0x4000000000000000 );
2938	bBigger:
2939	zSig = bSig - aSig;
2940	zExp = bExp;
2941	zSign ^= 1;
2942	goto normalizeRoundAndPack;
2943	aExpBigger:
2944	if ( aExp == 0x7FF ) {
2945	if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2946	return a;
2947	}
2948	if ( bExp == 0 ) {
2949	--expDiff;
2950	}
2951	else {
2952	bSig \|= LIT64( 0x4000000000000000 );
2953	}
2954	shift64RightJamming( bSig, expDiff, &bSig );
2955	aSig \|= LIT64( 0x4000000000000000 );
2956	aBigger:
2957	zSig = aSig - bSig;
2958	zExp = aExp;
2959	normalizeRoundAndPack:
2960	--zExp;
2961	return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2962
2963	}
2964
2965	/*----------------------------------------------------------------------------
2966	\| Returns the result of adding the double-precision floating-point values `a'
2967	\| and `b'. The operation is performed according to the IEC/IEEE Standard for
2968	\| Binary Floating-Point Arithmetic.
2969	----------------------------------------------------------------------------/
2970
2971	float64 float64_add( float64 a, float64 b STATUS_PARAM )
2972	{
2973	flag aSign, bSign;
2974
2975	aSign = extractFloat64Sign( a );
2976	bSign = extractFloat64Sign( b );
2977	if ( aSign == bSign ) {
2978	return addFloat64Sigs( a, b, aSign STATUS_VAR );
2979	}
2980	else {
2981	return subFloat64Sigs( a, b, aSign STATUS_VAR );
2982	}
2983
2984	}
2985
2986	/*----------------------------------------------------------------------------
2987	\| Returns the result of subtracting the double-precision floating-point values
2988	\| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2989	\| for Binary Floating-Point Arithmetic.
2990	----------------------------------------------------------------------------/
2991
2992	float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2993	{
2994	flag aSign, bSign;
2995
2996	aSign = extractFloat64Sign( a );
2997	bSign = extractFloat64Sign( b );
2998	if ( aSign == bSign ) {
2999	return subFloat64Sigs( a, b, aSign STATUS_VAR );
3000	}
3001	else {
3002	return addFloat64Sigs( a, b, aSign STATUS_VAR );
3003	}
3004
3005	}
3006
3007	/*----------------------------------------------------------------------------
3008	\| Returns the result of multiplying the double-precision floating-point values
3009	\| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3010	\| for Binary Floating-Point Arithmetic.
3011	----------------------------------------------------------------------------/
3012
3013	float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3014	{
3015	flag aSign, bSign, zSign;
3016	int16 aExp, bExp, zExp;
3017	bits64 aSig, bSig, zSig0, zSig1;
3018
3019	aSig = extractFloat64Frac( a );
3020	aExp = extractFloat64Exp( a );
3021	aSign = extractFloat64Sign( a );
3022	bSig = extractFloat64Frac( b );
3023	bExp = extractFloat64Exp( b );
3024	bSign = extractFloat64Sign( b );
3025	zSign = aSign ^ bSign;
3026	if ( aExp == 0x7FF ) {
3027	if ( aSig \|\| ( ( bExp == 0x7FF ) && bSig ) ) {
3028	return propagateFloat64NaN( a, b STATUS_VAR );
3029	}
3030	if ( ( bExp \| bSig ) == 0 ) {
3031	float_raise( float_flag_invalid STATUS_VAR);
3032	return float64_default_nan;
3033	}
3034	return packFloat64( zSign, 0x7FF, 0 );
3035	}
3036	if ( bExp == 0x7FF ) {
3037	if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3038	if ( ( aExp \| aSig ) == 0 ) {
3039	float_raise( float_flag_invalid STATUS_VAR);
3040	return float64_default_nan;
3041	}
3042	return packFloat64( zSign, 0x7FF, 0 );
3043	}
3044	if ( aExp == 0 ) {
3045	if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3046	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3047	}
3048	if ( bExp == 0 ) {
3049	if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3050	normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3051	}
3052	zExp = aExp + bExp - 0x3FF;
3053	aSig = ( aSig \| LIT64( 0x0010000000000000 ) )<<10;
3054	bSig = ( bSig \| LIT64( 0x0010000000000000 ) )<<11;
3055	mul64To128( aSig, bSig, &zSig0, &zSig1 );
3056	zSig0 \|= ( zSig1 != 0 );
3057	if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
3058	zSig0 <<= 1;
3059	--zExp;
3060	}
3061	return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3062
3063	}
3064
3065	/*----------------------------------------------------------------------------
3066	\| Returns the result of dividing the double-precision floating-point value `a'
3067	\| by the corresponding value `b'. The operation is performed according to
3068	\| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3069	----------------------------------------------------------------------------/
3070
3071	float64 float64_div( float64 a, float64 b STATUS_PARAM )
3072	{
3073	flag aSign, bSign, zSign;
3074	int16 aExp, bExp, zExp;
3075	bits64 aSig, bSig, zSig;
3076	bits64 rem0, rem1;
3077	bits64 term0, term1;
3078
3079	aSig = extractFloat64Frac( a );
3080	aExp = extractFloat64Exp( a );
3081	aSign = extractFloat64Sign( a );
3082	bSig = extractFloat64Frac( b );
3083	bExp = extractFloat64Exp( b );
3084	bSign = extractFloat64Sign( b );
3085	zSign = aSign ^ bSign;
3086	if ( aExp == 0x7FF ) {
3087	if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3088	if ( bExp == 0x7FF ) {
3089	if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3090	float_raise( float_flag_invalid STATUS_VAR);
3091	return float64_default_nan;
3092	}
3093	return packFloat64( zSign, 0x7FF, 0 );
3094	}
3095	if ( bExp == 0x7FF ) {
3096	if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3097	return packFloat64( zSign, 0, 0 );
3098	}
3099	if ( bExp == 0 ) {
3100	if ( bSig == 0 ) {
3101	if ( ( aExp \| aSig ) == 0 ) {
3102	float_raise( float_flag_invalid STATUS_VAR);
3103	return float64_default_nan;
3104	}
3105	float_raise( float_flag_divbyzero STATUS_VAR);
3106	return packFloat64( zSign, 0x7FF, 0 );
3107	}
3108	normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3109	}
3110	if ( aExp == 0 ) {
3111	if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3112	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3113	}
3114	zExp = aExp - bExp + 0x3FD;
3115	aSig = ( aSig \| LIT64( 0x0010000000000000 ) )<<10;
3116	bSig = ( bSig \| LIT64( 0x0010000000000000 ) )<<11;
3117	if ( bSig <= ( aSig + aSig ) ) {
3118	aSig >>= 1;
3119	++zExp;
3120	}
3121	zSig = estimateDiv128To64( aSig, 0, bSig );
3122	if ( ( zSig & 0x1FF ) <= 2 ) {
3123	mul64To128( bSig, zSig, &term0, &term1 );
3124	sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3125	while ( (sbits64) rem0 < 0 ) {
3126	--zSig;
3127	add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3128	}
3129	zSig \|= ( rem1 != 0 );
3130	}
3131	return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3132
3133	}
3134
3135	/*----------------------------------------------------------------------------
3136	\| Returns the remainder of the double-precision floating-point value `a'
3137	\| with respect to the corresponding value `b'. The operation is performed
3138	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3139	----------------------------------------------------------------------------/
3140
3141	float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3142	{
3143	flag aSign, zSign;
3144	int16 aExp, bExp, expDiff;
3145	bits64 aSig, bSig;
3146	bits64 q, alternateASig;
3147	sbits64 sigMean;
3148
3149	aSig = extractFloat64Frac( a );
3150	aExp = extractFloat64Exp( a );
3151	aSign = extractFloat64Sign( a );
3152	bSig = extractFloat64Frac( b );
3153	bExp = extractFloat64Exp( b );
3154	if ( aExp == 0x7FF ) {
3155	if ( aSig \|\| ( ( bExp == 0x7FF ) && bSig ) ) {
3156	return propagateFloat64NaN( a, b STATUS_VAR );
3157	}
3158	float_raise( float_flag_invalid STATUS_VAR);
3159	return float64_default_nan;
3160	}
3161	if ( bExp == 0x7FF ) {
3162	if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3163	return a;
3164	}
3165	if ( bExp == 0 ) {
3166	if ( bSig == 0 ) {
3167	float_raise( float_flag_invalid STATUS_VAR);
3168	return float64_default_nan;
3169	}
3170	normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3171	}
3172	if ( aExp == 0 ) {
3173	if ( aSig == 0 ) return a;
3174	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3175	}
3176	expDiff = aExp - bExp;
3177	aSig = ( aSig \| LIT64( 0x0010000000000000 ) )<<11;
3178	bSig = ( bSig \| LIT64( 0x0010000000000000 ) )<<11;
3179	if ( expDiff < 0 ) {
3180	if ( expDiff < -1 ) return a;
3181	aSig >>= 1;
3182	}
3183	q = ( bSig <= aSig );
3184	if ( q ) aSig -= bSig;
3185	expDiff -= 64;
3186	while ( 0 < expDiff ) {
3187	q = estimateDiv128To64( aSig, 0, bSig );
3188	q = ( 2 < q ) ? q - 2 : 0;
3189	aSig = - ( ( bSig>>2 ) * q );
3190	expDiff -= 62;
3191	}
3192	expDiff += 64;
3193	if ( 0 < expDiff ) {
3194	q = estimateDiv128To64( aSig, 0, bSig );
3195	q = ( 2 < q ) ? q - 2 : 0;
3196	q >>= 64 - expDiff;
3197	bSig >>= 2;
3198	aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3199	}
3200	else {
3201	aSig >>= 2;
3202	bSig >>= 2;
3203	}
3204	do {
3205	alternateASig = aSig;
3206	++q;
3207	aSig -= bSig;
3208	} while ( 0 <= (sbits64) aSig );
3209	sigMean = aSig + alternateASig;
3210	if ( ( sigMean < 0 ) \|\| ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3211	aSig = alternateASig;
3212	}
3213	zSign = ( (sbits64) aSig < 0 );
3214	if ( zSign ) aSig = - aSig;
3215	return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3216
3217	}
3218
3219	/*----------------------------------------------------------------------------
3220	\| Returns the square root of the double-precision floating-point value `a'.
3221	\| The operation is performed according to the IEC/IEEE Standard for Binary
3222	\| Floating-Point Arithmetic.
3223	----------------------------------------------------------------------------/
3224
3225	float64 float64_sqrt( float64 a STATUS_PARAM )
3226	{
3227	flag aSign;
3228	int16 aExp, zExp;
3229	bits64 aSig, zSig, doubleZSig;
3230	bits64 rem0, rem1, term0, term1;
3231
3232	aSig = extractFloat64Frac( a );
3233	aExp = extractFloat64Exp( a );
3234	aSign = extractFloat64Sign( a );
3235	if ( aExp == 0x7FF ) {
3236	if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3237	if ( ! aSign ) return a;
3238	float_raise( float_flag_invalid STATUS_VAR);
3239	return float64_default_nan;
3240	}
3241	if ( aSign ) {
3242	if ( ( aExp \| aSig ) == 0 ) return a;
3243	float_raise( float_flag_invalid STATUS_VAR);
3244	return float64_default_nan;
3245	}
3246	if ( aExp == 0 ) {
3247	if ( aSig == 0 ) return float64_zero;
3248	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3249	}
3250	zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3251	aSig \|= LIT64( 0x0010000000000000 );
3252	zSig = estimateSqrt32( aExp, aSig>>21 );
3253	aSig <<= 9 - ( aExp & 1 );
3254	zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3255	if ( ( zSig & 0x1FF ) <= 5 ) {
3256	doubleZSig = zSig<<1;
3257	mul64To128( zSig, zSig, &term0, &term1 );
3258	sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3259	while ( (sbits64) rem0 < 0 ) {
3260	--zSig;
3261	doubleZSig -= 2;
3262	add128( rem0, rem1, zSig>>63, doubleZSig \| 1, &rem0, &rem1 );
3263	}
3264	zSig \|= ( ( rem0 \| rem1 ) != 0 );
3265	}
3266	return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3267
3268	}
3269
3270	/*----------------------------------------------------------------------------
3271	\| Returns the binary log of the double-precision floating-point value `a'.
3272	\| The operation is performed according to the IEC/IEEE Standard for Binary
3273	\| Floating-Point Arithmetic.
3274	----------------------------------------------------------------------------/
3275	float64 float64_log2( float64 a STATUS_PARAM )
3276	{
3277	flag aSign, zSign;
3278	int16 aExp;
3279	bits64 aSig, aSig0, aSig1, zSig, i;
3280
3281	aSig = extractFloat64Frac( a );
3282	aExp = extractFloat64Exp( a );
3283	aSign = extractFloat64Sign( a );
3284
3285	if ( aExp == 0 ) {
3286	if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3287	normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3288	}
3289	if ( aSign ) {
3290	float_raise( float_flag_invalid STATUS_VAR);
3291	return float64_default_nan;
3292	}
3293	if ( aExp == 0x7FF ) {
3294	if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3295	return a;
3296	}
3297
3298	aExp -= 0x3FF;
3299	aSig \|= LIT64( 0x0010000000000000 );
3300	zSign = aExp < 0;
3301	zSig = (bits64)aExp << 52;
3302	for (i = 1LL << 51; i > 0; i >>= 1) {
3303	mul64To128( aSig, aSig, &aSig0, &aSig1 );
3304	aSig = ( aSig0 << 12 ) \| ( aSig1 >> 52 );
3305	if ( aSig & LIT64( 0x0020000000000000 ) ) {
3306	aSig >>= 1;
3307	zSig \|= i;
3308	}
3309	}
3310
3311	if ( zSign )
3312	zSig = -zSig;
3313	return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3314	}
3315
3316	/*----------------------------------------------------------------------------
3317	\| Returns 1 if the double-precision floating-point value `a' is equal to the
3318	\| corresponding value `b', and 0 otherwise. The comparison is performed
3319	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3320	----------------------------------------------------------------------------/
3321
3322	int float64_eq( float64 a, float64 b STATUS_PARAM )
3323	{
3324	bits64 av, bv;
3325
3326	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3327	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3328	) {
3329	if ( float64_is_signaling_nan( a ) \|\| float64_is_signaling_nan( b ) ) {
3330	float_raise( float_flag_invalid STATUS_VAR);
3331	}
3332	return 0;
3333	}
3334	av = float64_val(a);
3335	bv = float64_val(b);
3336	return ( av == bv ) \|\| ( (bits64) ( ( av \| bv )<<1 ) == 0 );
3337
3338	}
3339
3340	/*----------------------------------------------------------------------------
3341	\| Returns 1 if the double-precision floating-point value `a' is less than or
3342	\| equal to the corresponding value `b', and 0 otherwise. The comparison is
3343	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
3344	\| Arithmetic.
3345	----------------------------------------------------------------------------/
3346
3347	int float64_le( float64 a, float64 b STATUS_PARAM )
3348	{
3349	flag aSign, bSign;
3350	bits64 av, bv;
3351
3352	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3353	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3354	) {
3355	float_raise( float_flag_invalid STATUS_VAR);
3356	return 0;
3357	}
3358	aSign = extractFloat64Sign( a );
3359	bSign = extractFloat64Sign( b );
3360	av = float64_val(a);
3361	bv = float64_val(b);
3362	if ( aSign != bSign ) return aSign \|\| ( (bits64) ( ( av \| bv )<<1 ) == 0 );
3363	return ( av == bv ) \|\| ( aSign ^ ( av < bv ) );
3364
3365	}
3366
3367	/*----------------------------------------------------------------------------
3368	\| Returns 1 if the double-precision floating-point value `a' is less than
3369	\| the corresponding value `b', and 0 otherwise. The comparison is performed
3370	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3371	----------------------------------------------------------------------------/
3372
3373	int float64_lt( float64 a, float64 b STATUS_PARAM )
3374	{
3375	flag aSign, bSign;
3376	bits64 av, bv;
3377
3378	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3379	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3380	) {
3381	float_raise( float_flag_invalid STATUS_VAR);
3382	return 0;
3383	}
3384	aSign = extractFloat64Sign( a );
3385	bSign = extractFloat64Sign( b );
3386	av = float64_val(a);
3387	bv = float64_val(b);
3388	if ( aSign != bSign ) return aSign && ( (bits64) ( ( av \| bv )<<1 ) != 0 );
3389	return ( av != bv ) && ( aSign ^ ( av < bv ) );
3390
3391	}
3392
3393	/*----------------------------------------------------------------------------
3394	\| Returns 1 if the double-precision floating-point value `a' is equal to the
3395	\| corresponding value `b', and 0 otherwise. The invalid exception is raised
3396	\| if either operand is a NaN. Otherwise, the comparison is performed
3397	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3398	----------------------------------------------------------------------------/
3399
3400	int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3401	{
3402	bits64 av, bv;
3403
3404	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3405	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3406	) {
3407	float_raise( float_flag_invalid STATUS_VAR);
3408	return 0;
3409	}
3410	av = float64_val(a);
3411	bv = float64_val(b);
3412	return ( av == bv ) \|\| ( (bits64) ( ( av \| bv )<<1 ) == 0 );
3413
3414	}
3415
3416	/*----------------------------------------------------------------------------
3417	\| Returns 1 if the double-precision floating-point value `a' is less than or
3418	\| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3419	\| cause an exception. Otherwise, the comparison is performed according to the
3420	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3421	----------------------------------------------------------------------------/
3422
3423	int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3424	{
3425	flag aSign, bSign;
3426	bits64 av, bv;
3427
3428	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3429	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3430	) {
3431	if ( float64_is_signaling_nan( a ) \|\| float64_is_signaling_nan( b ) ) {
3432	float_raise( float_flag_invalid STATUS_VAR);
3433	}
3434	return 0;
3435	}
3436	aSign = extractFloat64Sign( a );
3437	bSign = extractFloat64Sign( b );
3438	av = float64_val(a);
3439	bv = float64_val(b);
3440	if ( aSign != bSign ) return aSign \|\| ( (bits64) ( ( av \| bv )<<1 ) == 0 );
3441	return ( av == bv ) \|\| ( aSign ^ ( av < bv ) );
3442
3443	}
3444
3445	/*----------------------------------------------------------------------------
3446	\| Returns 1 if the double-precision floating-point value `a' is less than
3447	\| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3448	\| exception. Otherwise, the comparison is performed according to the IEC/IEEE
3449	\| Standard for Binary Floating-Point Arithmetic.
3450	----------------------------------------------------------------------------/
3451
3452	int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3453	{
3454	flag aSign, bSign;
3455	bits64 av, bv;
3456
3457	if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3458	\|\| ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3459	) {
3460	if ( float64_is_signaling_nan( a ) \|\| float64_is_signaling_nan( b ) ) {
3461	float_raise( float_flag_invalid STATUS_VAR);
3462	}
3463	return 0;
3464	}
3465	aSign = extractFloat64Sign( a );
3466	bSign = extractFloat64Sign( b );
3467	av = float64_val(a);
3468	bv = float64_val(b);
3469	if ( aSign != bSign ) return aSign && ( (bits64) ( ( av \| bv )<<1 ) != 0 );
3470	return ( av != bv ) && ( aSign ^ ( av < bv ) );
3471
3472	}
3473
3474	#ifdef FLOATX80
3475
3476	/*----------------------------------------------------------------------------
3477	\| Returns the result of converting the extended double-precision floating-
3478	\| point value `a' to the 32-bit two's complement integer format. The
3479	\| conversion is performed according to the IEC/IEEE Standard for Binary
3480	\| Floating-Point Arithmetic---which means in particular that the conversion
3481	\| is rounded according to the current rounding mode. If `a' is a NaN, the
3482	\| largest positive integer is returned. Otherwise, if the conversion
3483	\| overflows, the largest integer with the same sign as `a' is returned.
3484	----------------------------------------------------------------------------/
3485
3486	int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3487	{
3488	flag aSign;
3489	int32 aExp, shiftCount;
3490	bits64 aSig;
3491
3492	aSig = extractFloatx80Frac( a );
3493	aExp = extractFloatx80Exp( a );
3494	aSign = extractFloatx80Sign( a );
3495	if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3496	shiftCount = 0x4037 - aExp;
3497	if ( shiftCount <= 0 ) shiftCount = 1;
3498	shift64RightJamming( aSig, shiftCount, &aSig );
3499	return roundAndPackInt32( aSign, aSig STATUS_VAR );
3500
3501	}
3502
3503	/*----------------------------------------------------------------------------
3504	\| Returns the result of converting the extended double-precision floating-
3505	\| point value `a' to the 32-bit two's complement integer format. The
3506	\| conversion is performed according to the IEC/IEEE Standard for Binary
3507	\| Floating-Point Arithmetic, except that the conversion is always rounded
3508	\| toward zero. If `a' is a NaN, the largest positive integer is returned.
3509	\| Otherwise, if the conversion overflows, the largest integer with the same
3510	\| sign as `a' is returned.
3511	----------------------------------------------------------------------------/
3512
3513	int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3514	{
3515	flag aSign;
3516	int32 aExp, shiftCount;
3517	bits64 aSig, savedASig;
3518	int32 z;
3519
3520	aSig = extractFloatx80Frac( a );
3521	aExp = extractFloatx80Exp( a );
3522	aSign = extractFloatx80Sign( a );
3523	if ( 0x401E < aExp ) {
3524	if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3525	goto invalid;
3526	}
3527	else if ( aExp < 0x3FFF ) {
3528	if ( aExp \|\| aSig ) STATUS(float_exception_flags) \|= float_flag_inexact;
3529	return 0;
3530	}
3531	shiftCount = 0x403E - aExp;
3532	savedASig = aSig;
3533	aSig >>= shiftCount;
3534	z = aSig;
3535	if ( aSign ) z = - z;
3536	if ( ( z < 0 ) ^ aSign ) {
3537	invalid:
3538	float_raise( float_flag_invalid STATUS_VAR);
3539	return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3540	}
3541	if ( ( aSig<<shiftCount ) != savedASig ) {
3542	STATUS(float_exception_flags) \|= float_flag_inexact;
3543	}
3544	return z;
3545
3546	}
3547
3548	/*----------------------------------------------------------------------------
3549	\| Returns the result of converting the extended double-precision floating-
3550	\| point value `a' to the 64-bit two's complement integer format. The
3551	\| conversion is performed according to the IEC/IEEE Standard for Binary
3552	\| Floating-Point Arithmetic---which means in particular that the conversion
3553	\| is rounded according to the current rounding mode. If `a' is a NaN,
3554	\| the largest positive integer is returned. Otherwise, if the conversion
3555	\| overflows, the largest integer with the same sign as `a' is returned.
3556	----------------------------------------------------------------------------/
3557
3558	int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3559	{
3560	flag aSign;
3561	int32 aExp, shiftCount;
3562	bits64 aSig, aSigExtra;
3563
3564	aSig = extractFloatx80Frac( a );
3565	aExp = extractFloatx80Exp( a );
3566	aSign = extractFloatx80Sign( a );
3567	shiftCount = 0x403E - aExp;
3568	if ( shiftCount <= 0 ) {
3569	if ( shiftCount ) {
3570	float_raise( float_flag_invalid STATUS_VAR);
3571	if ( ! aSign
3572	\|\| ( ( aExp == 0x7FFF )
3573	&& ( aSig != LIT64( 0x8000000000000000 ) ) )
3574	) {
3575	return LIT64( 0x7FFFFFFFFFFFFFFF );
3576	}
3577	return (sbits64) LIT64( 0x8000000000000000 );
3578	}
3579	aSigExtra = 0;
3580	}
3581	else {
3582	shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3583	}
3584	return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3585
3586	}
3587
3588	/*----------------------------------------------------------------------------
3589	\| Returns the result of converting the extended double-precision floating-
3590	\| point value `a' to the 64-bit two's complement integer format. The
3591	\| conversion is performed according to the IEC/IEEE Standard for Binary
3592	\| Floating-Point Arithmetic, except that the conversion is always rounded
3593	\| toward zero. If `a' is a NaN, the largest positive integer is returned.
3594	\| Otherwise, if the conversion overflows, the largest integer with the same
3595	\| sign as `a' is returned.
3596	----------------------------------------------------------------------------/
3597
3598	int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3599	{
3600	flag aSign;
3601	int32 aExp, shiftCount;
3602	bits64 aSig;
3603	int64 z;
3604
3605	aSig = extractFloatx80Frac( a );
3606	aExp = extractFloatx80Exp( a );
3607	aSign = extractFloatx80Sign( a );
3608	shiftCount = aExp - 0x403E;
3609	if ( 0 <= shiftCount ) {
3610	aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3611	if ( ( a.high != 0xC03E ) \|\| aSig ) {
3612	float_raise( float_flag_invalid STATUS_VAR);
3613	if ( ! aSign \|\| ( ( aExp == 0x7FFF ) && aSig ) ) {
3614	return LIT64( 0x7FFFFFFFFFFFFFFF );
3615	}
3616	}
3617	return (sbits64) LIT64( 0x8000000000000000 );
3618	}
3619	else if ( aExp < 0x3FFF ) {
3620	if ( aExp \| aSig ) STATUS(float_exception_flags) \|= float_flag_inexact;
3621	return 0;
3622	}
3623	z = aSig>>( - shiftCount );
3624	if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3625	STATUS(float_exception_flags) \|= float_flag_inexact;
3626	}
3627	if ( aSign ) z = - z;
3628	return z;
3629
3630	}
3631
3632	/*----------------------------------------------------------------------------
3633	\| Returns the result of converting the extended double-precision floating-
3634	\| point value `a' to the single-precision floating-point format. The
3635	\| conversion is performed according to the IEC/IEEE Standard for Binary
3636	\| Floating-Point Arithmetic.
3637	----------------------------------------------------------------------------/
3638
3639	float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3640	{
3641	flag aSign;
3642	int32 aExp;
3643	bits64 aSig;
3644
3645	aSig = extractFloatx80Frac( a );
3646	aExp = extractFloatx80Exp( a );
3647	aSign = extractFloatx80Sign( a );
3648	if ( aExp == 0x7FFF ) {
3649	if ( (bits64) ( aSig<<1 ) ) {
3650	return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3651	}
3652	return packFloat32( aSign, 0xFF, 0 );
3653	}
3654	shift64RightJamming( aSig, 33, &aSig );
3655	if ( aExp \|\| aSig ) aExp -= 0x3F81;
3656	return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3657
3658	}
3659
3660	/*----------------------------------------------------------------------------
3661	\| Returns the result of converting the extended double-precision floating-
3662	\| point value `a' to the double-precision floating-point format. The
3663	\| conversion is performed according to the IEC/IEEE Standard for Binary
3664	\| Floating-Point Arithmetic.
3665	----------------------------------------------------------------------------/
3666
3667	float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3668	{
3669	flag aSign;
3670	int32 aExp;
3671	bits64 aSig, zSig;
3672
3673	aSig = extractFloatx80Frac( a );
3674	aExp = extractFloatx80Exp( a );
3675	aSign = extractFloatx80Sign( a );
3676	if ( aExp == 0x7FFF ) {
3677	if ( (bits64) ( aSig<<1 ) ) {
3678	return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3679	}
3680	return packFloat64( aSign, 0x7FF, 0 );
3681	}
3682	shift64RightJamming( aSig, 1, &zSig );
3683	if ( aExp \|\| aSig ) aExp -= 0x3C01;
3684	return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3685
3686	}
3687
#ifdef FLOAT128

/*----------------------------------------------------------------------------
| Converts the extended double-precision floating-point value `a' to the
| quadruple-precision floating-point format.  NaNs go through the common NaN
| representation.  For every other operand the exponent field is reused
| unchanged and the significand, with its explicit integer bit dropped, is
| repositioned into the two quadruple-precision fraction words; no rounding
| routine is involved.
*----------------------------------------------------------------------------*/

float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
{
    bits64 sig = extractFloatx80Frac( a );
    int16 exp = extractFloatx80Exp( a );
    flag negative = extractFloatx80Sign( a );
    bits64 fraction = sig<<1;       /* significand without the integer bit */
    bits64 fracHi, fracLo;

    if ( ( exp == 0x7FFF ) && fraction ) {
        return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
    }
    shift128Right( fraction, 0, 16, &fracHi, &fracLo );
    return packFloat128( negative, exp, fracHi, fracLo );

}

#endif
3715
3716	/*----------------------------------------------------------------------------
3717	\| Rounds the extended double-precision floating-point value `a' to an integer,
3718	\| and returns the result as an extended quadruple-precision floating-point
3719	\| value. The operation is performed according to the IEC/IEEE Standard for
3720	\| Binary Floating-Point Arithmetic.
3721	----------------------------------------------------------------------------/
3722
3723	floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3724	{
3725	flag aSign;
3726	int32 aExp;
3727	bits64 lastBitMask, roundBitsMask;
3728	int8 roundingMode;
3729	floatx80 z;
3730
3731	aExp = extractFloatx80Exp( a );
3732	if ( 0x403E <= aExp ) {
3733	if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3734	return propagateFloatx80NaN( a, a STATUS_VAR );
3735	}
3736	return a;
3737	}
3738	if ( aExp < 0x3FFF ) {
3739	if ( ( aExp == 0 )
3740	&& ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3741	return a;
3742	}
3743	STATUS(float_exception_flags) \|= float_flag_inexact;
3744	aSign = extractFloatx80Sign( a );
3745	switch ( STATUS(float_rounding_mode) ) {
3746	case float_round_nearest_even:
3747	if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3748	) {
3749	return
3750	packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3751	}
3752	break;
3753	case float_round_down:
3754	return
3755	aSign ?
3756	packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3757	: packFloatx80( 0, 0, 0 );
3758	case float_round_up:
3759	return
3760	aSign ? packFloatx80( 1, 0, 0 )
3761	: packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3762	}
3763	return packFloatx80( aSign, 0, 0 );
3764	}
3765	lastBitMask = 1;
3766	lastBitMask <<= 0x403E - aExp;
3767	roundBitsMask = lastBitMask - 1;
3768	z = a;
3769	roundingMode = STATUS(float_rounding_mode);
3770	if ( roundingMode == float_round_nearest_even ) {
3771	z.low += lastBitMask>>1;
3772	if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3773	}
3774	else if ( roundingMode != float_round_to_zero ) {
3775	if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3776	z.low += roundBitsMask;
3777	}
3778	}
3779	z.low &= ~ roundBitsMask;
3780	if ( z.low == 0 ) {
3781	++z.high;
3782	z.low = LIT64( 0x8000000000000000 );
3783	}
3784	if ( z.low != a.low ) STATUS(float_exception_flags) \|= float_flag_inexact;
3785	return z;
3786
3787	}
3788
3789	/*----------------------------------------------------------------------------
3790	\| Returns the result of adding the absolute values of the extended double-
3791	\| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3792	\| negated before being returned. `zSign' is ignored if the result is a NaN.
3793	\| The addition is performed according to the IEC/IEEE Standard for Binary
3794	\| Floating-Point Arithmetic.
3795	----------------------------------------------------------------------------/
3796
3797	static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3798	{
3799	int32 aExp, bExp, zExp;
3800	bits64 aSig, bSig, zSig0, zSig1;
3801	int32 expDiff;
3802
3803	aSig = extractFloatx80Frac( a );
3804	aExp = extractFloatx80Exp( a );
3805	bSig = extractFloatx80Frac( b );
3806	bExp = extractFloatx80Exp( b );
3807	expDiff = aExp - bExp;
3808	if ( 0 < expDiff ) {
3809	if ( aExp == 0x7FFF ) {
3810	if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3811	return a;
3812	}
3813	if ( bExp == 0 ) --expDiff;
3814	shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3815	zExp = aExp;
3816	}
3817	else if ( expDiff < 0 ) {
3818	if ( bExp == 0x7FFF ) {
3819	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3820	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3821	}
3822	if ( aExp == 0 ) ++expDiff;
3823	shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3824	zExp = bExp;
3825	}
3826	else {
3827	if ( aExp == 0x7FFF ) {
3828	if ( (bits64) ( ( aSig \| bSig )<<1 ) ) {
3829	return propagateFloatx80NaN( a, b STATUS_VAR );
3830	}
3831	return a;
3832	}
3833	zSig1 = 0;
3834	zSig0 = aSig + bSig;
3835	if ( aExp == 0 ) {
3836	normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3837	goto roundAndPack;
3838	}
3839	zExp = aExp;
3840	goto shiftRight1;
3841	}
3842	zSig0 = aSig + bSig;
3843	if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3844	shiftRight1:
3845	shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3846	zSig0 \|= LIT64( 0x8000000000000000 );
3847	++zExp;
3848	roundAndPack:
3849	return
3850	roundAndPackFloatx80(
3851	STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3852
3853	}
3854
3855	/*----------------------------------------------------------------------------
3856	\| Returns the result of subtracting the absolute values of the extended
3857	\| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3858	\| difference is negated before being returned. `zSign' is ignored if the
3859	\| result is a NaN. The subtraction is performed according to the IEC/IEEE
3860	\| Standard for Binary Floating-Point Arithmetic.
3861	----------------------------------------------------------------------------/
3862
3863	static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3864	{
3865	int32 aExp, bExp, zExp;
3866	bits64 aSig, bSig, zSig0, zSig1;
3867	int32 expDiff;
3868	floatx80 z;
3869
3870	aSig = extractFloatx80Frac( a );
3871	aExp = extractFloatx80Exp( a );
3872	bSig = extractFloatx80Frac( b );
3873	bExp = extractFloatx80Exp( b );
3874	expDiff = aExp - bExp;
3875	if ( 0 < expDiff ) goto aExpBigger;
3876	if ( expDiff < 0 ) goto bExpBigger;
3877	if ( aExp == 0x7FFF ) {
3878	if ( (bits64) ( ( aSig \| bSig )<<1 ) ) {
3879	return propagateFloatx80NaN( a, b STATUS_VAR );
3880	}
3881	float_raise( float_flag_invalid STATUS_VAR);
3882	z.low = floatx80_default_nan_low;
3883	z.high = floatx80_default_nan_high;
3884	return z;
3885	}
3886	if ( aExp == 0 ) {
3887	aExp = 1;
3888	bExp = 1;
3889	}
3890	zSig1 = 0;
3891	if ( bSig < aSig ) goto aBigger;
3892	if ( aSig < bSig ) goto bBigger;
3893	return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3894	bExpBigger:
3895	if ( bExp == 0x7FFF ) {
3896	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3897	return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3898	}
3899	if ( aExp == 0 ) ++expDiff;
3900	shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3901	bBigger:
3902	sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3903	zExp = bExp;
3904	zSign ^= 1;
3905	goto normalizeRoundAndPack;
3906	aExpBigger:
3907	if ( aExp == 0x7FFF ) {
3908	if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3909	return a;
3910	}
3911	if ( bExp == 0 ) --expDiff;
3912	shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3913	aBigger:
3914	sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3915	zExp = aExp;
3916	normalizeRoundAndPack:
3917	return
3918	normalizeRoundAndPackFloatx80(
3919	STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3920
3921	}
3922
3923	/*----------------------------------------------------------------------------
3924	\| Returns the result of adding the extended double-precision floating-point
3925	\| values `a' and `b'. The operation is performed according to the IEC/IEEE
3926	\| Standard for Binary Floating-Point Arithmetic.
3927	----------------------------------------------------------------------------/
3928
3929	floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3930	{
3931	flag aSign, bSign;
3932
3933	aSign = extractFloatx80Sign( a );
3934	bSign = extractFloatx80Sign( b );
3935	if ( aSign == bSign ) {
3936	return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3937	}
3938	else {
3939	return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3940	}
3941
3942	}
3943
3944	/*----------------------------------------------------------------------------
3945	\| Returns the result of subtracting the extended double-precision floating-
3946	\| point values `a' and `b'. The operation is performed according to the
3947	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3948	----------------------------------------------------------------------------/
3949
3950	floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3951	{
3952	flag aSign, bSign;
3953
3954	aSign = extractFloatx80Sign( a );
3955	bSign = extractFloatx80Sign( b );
3956	if ( aSign == bSign ) {
3957	return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3958	}
3959	else {
3960	return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3961	}
3962
3963	}
3964
3965	/*----------------------------------------------------------------------------
3966	\| Returns the result of multiplying the extended double-precision floating-
3967	\| point values `a' and `b'. The operation is performed according to the
3968	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3969	----------------------------------------------------------------------------/
3970
3971	floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3972	{
3973	flag aSign, bSign, zSign;
3974	int32 aExp, bExp, zExp;
3975	bits64 aSig, bSig, zSig0, zSig1;
3976	floatx80 z;
3977
3978	aSig = extractFloatx80Frac( a );
3979	aExp = extractFloatx80Exp( a );
3980	aSign = extractFloatx80Sign( a );
3981	bSig = extractFloatx80Frac( b );
3982	bExp = extractFloatx80Exp( b );
3983	bSign = extractFloatx80Sign( b );
3984	zSign = aSign ^ bSign;
3985	if ( aExp == 0x7FFF ) {
3986	if ( (bits64) ( aSig<<1 )
3987	\|\| ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3988	return propagateFloatx80NaN( a, b STATUS_VAR );
3989	}
3990	if ( ( bExp \| bSig ) == 0 ) goto invalid;
3991	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3992	}
3993	if ( bExp == 0x7FFF ) {
3994	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3995	if ( ( aExp \| aSig ) == 0 ) {
3996	invalid:
3997	float_raise( float_flag_invalid STATUS_VAR);
3998	z.low = floatx80_default_nan_low;
3999	z.high = floatx80_default_nan_high;
4000	return z;
4001	}
4002	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4003	}
4004	if ( aExp == 0 ) {
4005	if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4006	normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4007	}
4008	if ( bExp == 0 ) {
4009	if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4010	normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4011	}
4012	zExp = aExp + bExp - 0x3FFE;
4013	mul64To128( aSig, bSig, &zSig0, &zSig1 );
4014	if ( 0 < (sbits64) zSig0 ) {
4015	shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4016	--zExp;
4017	}
4018	return
4019	roundAndPackFloatx80(
4020	STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4021
4022	}
4023
4024	/*----------------------------------------------------------------------------
4025	\| Returns the result of dividing the extended double-precision floating-point
4026	\| value `a' by the corresponding value `b'. The operation is performed
4027	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4028	----------------------------------------------------------------------------/
4029
4030	floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4031	{
4032	flag aSign, bSign, zSign;
4033	int32 aExp, bExp, zExp;
4034	bits64 aSig, bSig, zSig0, zSig1;
4035	bits64 rem0, rem1, rem2, term0, term1, term2;
4036	floatx80 z;
4037
4038	aSig = extractFloatx80Frac( a );
4039	aExp = extractFloatx80Exp( a );
4040	aSign = extractFloatx80Sign( a );
4041	bSig = extractFloatx80Frac( b );
4042	bExp = extractFloatx80Exp( b );
4043	bSign = extractFloatx80Sign( b );
4044	zSign = aSign ^ bSign;
4045	if ( aExp == 0x7FFF ) {
4046	if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4047	if ( bExp == 0x7FFF ) {
4048	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4049	goto invalid;
4050	}
4051	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4052	}
4053	if ( bExp == 0x7FFF ) {
4054	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4055	return packFloatx80( zSign, 0, 0 );
4056	}
4057	if ( bExp == 0 ) {
4058	if ( bSig == 0 ) {
4059	if ( ( aExp \| aSig ) == 0 ) {
4060	invalid:
4061	float_raise( float_flag_invalid STATUS_VAR);
4062	z.low = floatx80_default_nan_low;
4063	z.high = floatx80_default_nan_high;
4064	return z;
4065	}
4066	float_raise( float_flag_divbyzero STATUS_VAR);
4067	return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4068	}
4069	normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4070	}
4071	if ( aExp == 0 ) {
4072	if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4073	normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4074	}
4075	zExp = aExp - bExp + 0x3FFE;
4076	rem1 = 0;
4077	if ( bSig <= aSig ) {
4078	shift128Right( aSig, 0, 1, &aSig, &rem1 );
4079	++zExp;
4080	}
4081	zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4082	mul64To128( bSig, zSig0, &term0, &term1 );
4083	sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4084	while ( (sbits64) rem0 < 0 ) {
4085	--zSig0;
4086	add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4087	}
4088	zSig1 = estimateDiv128To64( rem1, 0, bSig );
4089	if ( (bits64) ( zSig1<<1 ) <= 8 ) {
4090	mul64To128( bSig, zSig1, &term1, &term2 );
4091	sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4092	while ( (sbits64) rem1 < 0 ) {
4093	--zSig1;
4094	add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4095	}
4096	zSig1 \|= ( ( rem1 \| rem2 ) != 0 );
4097	}
4098	return
4099	roundAndPackFloatx80(
4100	STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4101
4102	}
4103
4104	/*----------------------------------------------------------------------------
4105	\| Returns the remainder of the extended double-precision floating-point value
4106	\| `a' with respect to the corresponding value `b'. The operation is performed
4107	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4108	----------------------------------------------------------------------------/
4109
4110	floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4111	{
4112	flag aSign, zSign;
4113	int32 aExp, bExp, expDiff;
4114	bits64 aSig0, aSig1, bSig;
4115	bits64 q, term0, term1, alternateASig0, alternateASig1;
4116	floatx80 z;
4117
4118	aSig0 = extractFloatx80Frac( a );
4119	aExp = extractFloatx80Exp( a );
4120	aSign = extractFloatx80Sign( a );
4121	bSig = extractFloatx80Frac( b );
4122	bExp = extractFloatx80Exp( b );
4123	if ( aExp == 0x7FFF ) {
4124	if ( (bits64) ( aSig0<<1 )
4125	\|\| ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4126	return propagateFloatx80NaN( a, b STATUS_VAR );
4127	}
4128	goto invalid;
4129	}
4130	if ( bExp == 0x7FFF ) {
4131	if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4132	return a;
4133	}
4134	if ( bExp == 0 ) {
4135	if ( bSig == 0 ) {
4136	invalid:
4137	float_raise( float_flag_invalid STATUS_VAR);
4138	z.low = floatx80_default_nan_low;
4139	z.high = floatx80_default_nan_high;
4140	return z;
4141	}
4142	normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4143	}
4144	if ( aExp == 0 ) {
4145	if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
4146	normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4147	}
4148	bSig \|= LIT64( 0x8000000000000000 );
4149	zSign = aSign;
4150	expDiff = aExp - bExp;
4151	aSig1 = 0;
4152	if ( expDiff < 0 ) {
4153	if ( expDiff < -1 ) return a;
4154	shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4155	expDiff = 0;
4156	}
4157	q = ( bSig <= aSig0 );
4158	if ( q ) aSig0 -= bSig;
4159	expDiff -= 64;
4160	while ( 0 < expDiff ) {
4161	q = estimateDiv128To64( aSig0, aSig1, bSig );
4162	q = ( 2 < q ) ? q - 2 : 0;
4163	mul64To128( bSig, q, &term0, &term1 );
4164	sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4165	shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4166	expDiff -= 62;
4167	}
4168	expDiff += 64;
4169	if ( 0 < expDiff ) {
4170	q = estimateDiv128To64( aSig0, aSig1, bSig );
4171	q = ( 2 < q ) ? q - 2 : 0;
4172	q >>= 64 - expDiff;
4173	mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4174	sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4175	shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4176	while ( le128( term0, term1, aSig0, aSig1 ) ) {
4177	++q;
4178	sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4179	}
4180	}
4181	else {
4182	term1 = 0;
4183	term0 = bSig;
4184	}
4185	sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4186	if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4187	\|\| ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4188	&& ( q & 1 ) )
4189	) {
4190	aSig0 = alternateASig0;
4191	aSig1 = alternateASig1;
4192	zSign = ! zSign;
4193	}
4194	return
4195	normalizeRoundAndPackFloatx80(
4196	80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4197
4198	}
4199
4200	/*----------------------------------------------------------------------------
4201	\| Returns the square root of the extended double-precision floating-point
4202	\| value `a'. The operation is performed according to the IEC/IEEE Standard
4203	\| for Binary Floating-Point Arithmetic.
4204	----------------------------------------------------------------------------/
4205
4206	floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4207	{
4208	flag aSign;
4209	int32 aExp, zExp;
4210	bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4211	bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4212	floatx80 z;
4213
4214	aSig0 = extractFloatx80Frac( a );
4215	aExp = extractFloatx80Exp( a );
4216	aSign = extractFloatx80Sign( a );
4217	if ( aExp == 0x7FFF ) {
4218	if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4219	if ( ! aSign ) return a;
4220	goto invalid;
4221	}
4222	if ( aSign ) {
4223	if ( ( aExp \| aSig0 ) == 0 ) return a;
4224	invalid:
4225	float_raise( float_flag_invalid STATUS_VAR);
4226	z.low = floatx80_default_nan_low;
4227	z.high = floatx80_default_nan_high;
4228	return z;
4229	}
4230	if ( aExp == 0 ) {
4231	if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4232	normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4233	}
4234	zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4235	zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4236	shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4237	zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4238	doubleZSig0 = zSig0<<1;
4239	mul64To128( zSig0, zSig0, &term0, &term1 );
4240	sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4241	while ( (sbits64) rem0 < 0 ) {
4242	--zSig0;
4243	doubleZSig0 -= 2;
4244	add128( rem0, rem1, zSig0>>63, doubleZSig0 \| 1, &rem0, &rem1 );
4245	}
4246	zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4247	if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4248	if ( zSig1 == 0 ) zSig1 = 1;
4249	mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4250	sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4251	mul64To128( zSig1, zSig1, &term2, &term3 );
4252	sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4253	while ( (sbits64) rem1 < 0 ) {
4254	--zSig1;
4255	shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4256	term3 \|= 1;
4257	term2 \|= doubleZSig0;
4258	add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4259	}
4260	zSig1 \|= ( ( rem1 \| rem2 \| rem3 ) != 0 );
4261	}
4262	shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4263	zSig0 \|= doubleZSig0;
4264	return
4265	roundAndPackFloatx80(
4266	STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4267
4268	}
4269
4270	/*----------------------------------------------------------------------------
4271	\| Returns 1 if the extended double-precision floating-point value `a' is
4272	\| equal to the corresponding value `b', and 0 otherwise. The comparison is
4273	\| performed according to the IEC/IEEE Standard for Binary Floating-Point
4274	\| Arithmetic.
4275	----------------------------------------------------------------------------/
4276
4277	int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4278	{
4279
4280	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4281	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
4282	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
4283	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
4284	) {
4285	if ( floatx80_is_signaling_nan( a )
4286	\|\| floatx80_is_signaling_nan( b ) ) {
4287	float_raise( float_flag_invalid STATUS_VAR);
4288	}
4289	return 0;
4290	}
4291	return
4292	( a.low == b.low )
4293	&& ( ( a.high == b.high )
4294	\|\| ( ( a.low == 0 )
4295	&& ( (bits16) ( ( a.high \| b.high )<<1 ) == 0 ) )
4296	);
4297
4298	}
4299
4300	/*----------------------------------------------------------------------------
4301	\| Returns 1 if the extended double-precision floating-point value `a' is
4302	\| less than or equal to the corresponding value `b', and 0 otherwise. The
4303	\| comparison is performed according to the IEC/IEEE Standard for Binary
4304	\| Floating-Point Arithmetic.
4305	----------------------------------------------------------------------------/
4306
4307	int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4308	{
4309	flag aSign, bSign;
4310
4311	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4312	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
4313	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
4314	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
4315	) {
4316	float_raise( float_flag_invalid STATUS_VAR);
4317	return 0;
4318	}
4319	aSign = extractFloatx80Sign( a );
4320	bSign = extractFloatx80Sign( b );
4321	if ( aSign != bSign ) {
4322	return
4323	aSign
4324	\|\| ( ( ( (bits16) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
4325	== 0 );
4326	}
4327	return
4328	aSign ? le128( b.high, b.low, a.high, a.low )
4329	: le128( a.high, a.low, b.high, b.low );
4330
4331	}
4332
4333	/*----------------------------------------------------------------------------
4334	\| Returns 1 if the extended double-precision floating-point value `a' is
4335	\| less than the corresponding value `b', and 0 otherwise. The comparison
4336	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4337	\| Arithmetic.
4338	----------------------------------------------------------------------------/
4339
4340	int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4341	{
4342	flag aSign, bSign;
4343
4344	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4345	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
4346	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
4347	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
4348	) {
4349	float_raise( float_flag_invalid STATUS_VAR);
4350	return 0;
4351	}
4352	aSign = extractFloatx80Sign( a );
4353	bSign = extractFloatx80Sign( b );
4354	if ( aSign != bSign ) {
4355	return
4356	aSign
4357	&& ( ( ( (bits16) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
4358	!= 0 );
4359	}
4360	return
4361	aSign ? lt128( b.high, b.low, a.high, a.low )
4362	: lt128( a.high, a.low, b.high, b.low );
4363
4364	}
4365
4366	/*----------------------------------------------------------------------------
4367	\| Returns 1 if the extended double-precision floating-point value `a' is equal
4368	\| to the corresponding value `b', and 0 otherwise. The invalid exception is
4369	\| raised if either operand is a NaN. Otherwise, the comparison is performed
4370	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4371	----------------------------------------------------------------------------/
4372
4373	int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4374	{
4375
4376	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4377	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
4378	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
4379	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
4380	) {
4381	float_raise( float_flag_invalid STATUS_VAR);
4382	return 0;
4383	}
4384	return
4385	( a.low == b.low )
4386	&& ( ( a.high == b.high )
4387	\|\| ( ( a.low == 0 )
4388	&& ( (bits16) ( ( a.high \| b.high )<<1 ) == 0 ) )
4389	);
4390
4391	}
4392
4393	/*----------------------------------------------------------------------------
4394	\| Returns 1 if the extended double-precision floating-point value `a' is less
4395	\| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4396	\| do not cause an exception. Otherwise, the comparison is performed according
4397	\| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4398	----------------------------------------------------------------------------/
4399
4400	int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4401	{
4402	flag aSign, bSign;
4403
4404	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4405	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
4406	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
4407	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
4408	) {
4409	if ( floatx80_is_signaling_nan( a )
4410	\|\| floatx80_is_signaling_nan( b ) ) {
4411	float_raise( float_flag_invalid STATUS_VAR);
4412	}
4413	return 0;
4414	}
4415	aSign = extractFloatx80Sign( a );
4416	bSign = extractFloatx80Sign( b );
4417	if ( aSign != bSign ) {
4418	return
4419	aSign
4420	\|\| ( ( ( (bits16) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
4421	== 0 );
4422	}
4423	return
4424	aSign ? le128( b.high, b.low, a.high, a.low )
4425	: le128( a.high, a.low, b.high, b.low );
4426
4427	}
4428
4429	/*----------------------------------------------------------------------------
4430	\| Returns 1 if the extended double-precision floating-point value `a' is less
4431	\| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4432	\| an exception. Otherwise, the comparison is performed according to the
4433	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4434	----------------------------------------------------------------------------/
4435
4436	int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4437	{
4438	flag aSign, bSign;
4439
4440	if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4441	&& (bits64) ( extractFloatx80Frac( a )<<1 ) )
4442	\|\| ( ( extractFloatx80Exp( b ) == 0x7FFF )
4443	&& (bits64) ( extractFloatx80Frac( b )<<1 ) )
4444	) {
4445	if ( floatx80_is_signaling_nan( a )
4446	\|\| floatx80_is_signaling_nan( b ) ) {
4447	float_raise( float_flag_invalid STATUS_VAR);
4448	}
4449	return 0;
4450	}
4451	aSign = extractFloatx80Sign( a );
4452	bSign = extractFloatx80Sign( b );
4453	if ( aSign != bSign ) {
4454	return
4455	aSign
4456	&& ( ( ( (bits16) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
4457	!= 0 );
4458	}
4459	return
4460	aSign ? lt128( b.high, b.low, a.high, a.low )
4461	: lt128( a.high, a.low, b.high, b.low );
4462
4463	}
4464
4465	#endif
4466
4467	#ifdef FLOAT128
4468
4469	/*----------------------------------------------------------------------------
4470	\| Returns the result of converting the quadruple-precision floating-point
4471	\| value `a' to the 32-bit two's complement integer format. The conversion
4472	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4473	\| Arithmetic---which means in particular that the conversion is rounded
4474	\| according to the current rounding mode. If `a' is a NaN, the largest
4475	\| positive integer is returned. Otherwise, if the conversion overflows, the
4476	\| largest integer with the same sign as `a' is returned.
4477	----------------------------------------------------------------------------/
4478
4479	int32 float128_to_int32( float128 a STATUS_PARAM )
4480	{
4481	flag aSign;
4482	int32 aExp, shiftCount;
4483	bits64 aSig0, aSig1;
4484
4485	aSig1 = extractFloat128Frac1( a );
4486	aSig0 = extractFloat128Frac0( a );
4487	aExp = extractFloat128Exp( a );
4488	aSign = extractFloat128Sign( a );
4489	if ( ( aExp == 0x7FFF ) && ( aSig0 \| aSig1 ) ) aSign = 0;
4490	if ( aExp ) aSig0 \|= LIT64( 0x0001000000000000 );
4491	aSig0 \|= ( aSig1 != 0 );
4492	shiftCount = 0x4028 - aExp;
4493	if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4494	return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4495
4496	}
4497
4498	/*----------------------------------------------------------------------------
4499	\| Returns the result of converting the quadruple-precision floating-point
4500	\| value `a' to the 32-bit two's complement integer format. The conversion
4501	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4502	\| Arithmetic, except that the conversion is always rounded toward zero. If
4503	\| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4504	\| conversion overflows, the largest integer with the same sign as `a' is
4505	\| returned.
4506	----------------------------------------------------------------------------/
4507
4508	int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4509	{
4510	flag aSign;
4511	int32 aExp, shiftCount;
4512	bits64 aSig0, aSig1, savedASig;
4513	int32 z;
4514
4515	aSig1 = extractFloat128Frac1( a );
4516	aSig0 = extractFloat128Frac0( a );
4517	aExp = extractFloat128Exp( a );
4518	aSign = extractFloat128Sign( a );
4519	aSig0 \|= ( aSig1 != 0 );
4520	if ( 0x401E < aExp ) {
4521	if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4522	goto invalid;
4523	}
4524	else if ( aExp < 0x3FFF ) {
4525	if ( aExp \|\| aSig0 ) STATUS(float_exception_flags) \|= float_flag_inexact;
4526	return 0;
4527	}
4528	aSig0 \|= LIT64( 0x0001000000000000 );
4529	shiftCount = 0x402F - aExp;
4530	savedASig = aSig0;
4531	aSig0 >>= shiftCount;
4532	z = aSig0;
4533	if ( aSign ) z = - z;
4534	if ( ( z < 0 ) ^ aSign ) {
4535	invalid:
4536	float_raise( float_flag_invalid STATUS_VAR);
4537	return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4538	}
4539	if ( ( aSig0<<shiftCount ) != savedASig ) {
4540	STATUS(float_exception_flags) \|= float_flag_inexact;
4541	}
4542	return z;
4543
4544	}
4545
4546	/*----------------------------------------------------------------------------
4547	\| Returns the result of converting the quadruple-precision floating-point
4548	\| value `a' to the 64-bit two's complement integer format. The conversion
4549	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4550	\| Arithmetic---which means in particular that the conversion is rounded
4551	\| according to the current rounding mode. If `a' is a NaN, the largest
4552	\| positive integer is returned. Otherwise, if the conversion overflows, the
4553	\| largest integer with the same sign as `a' is returned.
4554	----------------------------------------------------------------------------/
4555
4556	int64 float128_to_int64( float128 a STATUS_PARAM )
4557	{
4558	flag aSign;
4559	int32 aExp, shiftCount;
4560	bits64 aSig0, aSig1;
4561
4562	aSig1 = extractFloat128Frac1( a );
4563	aSig0 = extractFloat128Frac0( a );
4564	aExp = extractFloat128Exp( a );
4565	aSign = extractFloat128Sign( a );
4566	if ( aExp ) aSig0 \|= LIT64( 0x0001000000000000 );
4567	shiftCount = 0x402F - aExp;
4568	if ( shiftCount <= 0 ) {
4569	if ( 0x403E < aExp ) {
4570	float_raise( float_flag_invalid STATUS_VAR);
4571	if ( ! aSign
4572	\|\| ( ( aExp == 0x7FFF )
4573	&& ( aSig1 \|\| ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4574	)
4575	) {
4576	return LIT64( 0x7FFFFFFFFFFFFFFF );
4577	}
4578	return (sbits64) LIT64( 0x8000000000000000 );
4579	}
4580	shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4581	}
4582	else {
4583	shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4584	}
4585	return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4586
4587	}
4588
4589	/*----------------------------------------------------------------------------
4590	\| Returns the result of converting the quadruple-precision floating-point
4591	\| value `a' to the 64-bit two's complement integer format. The conversion
4592	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4593	\| Arithmetic, except that the conversion is always rounded toward zero.
4594	\| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4595	\| the conversion overflows, the largest integer with the same sign as `a' is
4596	\| returned.
4597	----------------------------------------------------------------------------/
4598
4599	int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4600	{
4601	flag aSign;
4602	int32 aExp, shiftCount;
4603	bits64 aSig0, aSig1;
4604	int64 z;
4605
4606	aSig1 = extractFloat128Frac1( a );
4607	aSig0 = extractFloat128Frac0( a );
4608	aExp = extractFloat128Exp( a );
4609	aSign = extractFloat128Sign( a );
4610	if ( aExp ) aSig0 \|= LIT64( 0x0001000000000000 );
4611	shiftCount = aExp - 0x402F;
4612	if ( 0 < shiftCount ) {
4613	if ( 0x403E <= aExp ) {
4614	aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4615	if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4616	&& ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4617	if ( aSig1 ) STATUS(float_exception_flags) \|= float_flag_inexact;
4618	}
4619	else {
4620	float_raise( float_flag_invalid STATUS_VAR);
4621	if ( ! aSign \|\| ( ( aExp == 0x7FFF ) && ( aSig0 \| aSig1 ) ) ) {
4622	return LIT64( 0x7FFFFFFFFFFFFFFF );
4623	}
4624	}
4625	return (sbits64) LIT64( 0x8000000000000000 );
4626	}
4627	z = ( aSig0<<shiftCount ) \| ( aSig1>>( ( - shiftCount ) & 63 ) );
4628	if ( (bits64) ( aSig1<<shiftCount ) ) {
4629	STATUS(float_exception_flags) \|= float_flag_inexact;
4630	}
4631	}
4632	else {
4633	if ( aExp < 0x3FFF ) {
4634	if ( aExp \| aSig0 \| aSig1 ) {
4635	STATUS(float_exception_flags) \|= float_flag_inexact;
4636	}
4637	return 0;
4638	}
4639	z = aSig0>>( - shiftCount );
4640	if ( aSig1
4641	\|\| ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4642	STATUS(float_exception_flags) \|= float_flag_inexact;
4643	}
4644	}
4645	if ( aSign ) z = - z;
4646	return z;
4647
4648	}
4649
4650	/*----------------------------------------------------------------------------
4651	\| Returns the result of converting the quadruple-precision floating-point
4652	\| value `a' to the single-precision floating-point format. The conversion
4653	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4654	\| Arithmetic.
4655	----------------------------------------------------------------------------/
4656
4657	float32 float128_to_float32( float128 a STATUS_PARAM )
4658	{
4659	flag aSign;
4660	int32 aExp;
4661	bits64 aSig0, aSig1;
4662	bits32 zSig;
4663
4664	aSig1 = extractFloat128Frac1( a );
4665	aSig0 = extractFloat128Frac0( a );
4666	aExp = extractFloat128Exp( a );
4667	aSign = extractFloat128Sign( a );
4668	if ( aExp == 0x7FFF ) {
4669	if ( aSig0 \| aSig1 ) {
4670	return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4671	}
4672	return packFloat32( aSign, 0xFF, 0 );
4673	}
4674	aSig0 \|= ( aSig1 != 0 );
4675	shift64RightJamming( aSig0, 18, &aSig0 );
4676	zSig = aSig0;
4677	if ( aExp \|\| zSig ) {
4678	zSig \|= 0x40000000;
4679	aExp -= 0x3F81;
4680	}
4681	return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4682
4683	}
4684
4685	/*----------------------------------------------------------------------------
4686	\| Returns the result of converting the quadruple-precision floating-point
4687	\| value `a' to the double-precision floating-point format. The conversion
4688	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4689	\| Arithmetic.
4690	----------------------------------------------------------------------------/
4691
4692	float64 float128_to_float64( float128 a STATUS_PARAM )
4693	{
4694	flag aSign;
4695	int32 aExp;
4696	bits64 aSig0, aSig1;
4697
4698	aSig1 = extractFloat128Frac1( a );
4699	aSig0 = extractFloat128Frac0( a );
4700	aExp = extractFloat128Exp( a );
4701	aSign = extractFloat128Sign( a );
4702	if ( aExp == 0x7FFF ) {
4703	if ( aSig0 \| aSig1 ) {
4704	return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4705	}
4706	return packFloat64( aSign, 0x7FF, 0 );
4707	}
4708	shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4709	aSig0 \|= ( aSig1 != 0 );
4710	if ( aExp \|\| aSig0 ) {
4711	aSig0 \|= LIT64( 0x4000000000000000 );
4712	aExp -= 0x3C01;
4713	}
4714	return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4715
4716	}
4717
4718	#ifdef FLOATX80
4719
4720	/*----------------------------------------------------------------------------
4721	\| Returns the result of converting the quadruple-precision floating-point
4722	\| value `a' to the extended double-precision floating-point format. The
4723	\| conversion is performed according to the IEC/IEEE Standard for Binary
4724	\| Floating-Point Arithmetic.
4725	----------------------------------------------------------------------------/
4726
4727	floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4728	{
4729	flag aSign;
4730	int32 aExp;
4731	bits64 aSig0, aSig1;
4732
4733	aSig1 = extractFloat128Frac1( a );
4734	aSig0 = extractFloat128Frac0( a );
4735	aExp = extractFloat128Exp( a );
4736	aSign = extractFloat128Sign( a );
4737	if ( aExp == 0x7FFF ) {
4738	if ( aSig0 \| aSig1 ) {
4739	return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4740	}
4741	return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4742	}
4743	if ( aExp == 0 ) {
4744	if ( ( aSig0 \| aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4745	normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4746	}
4747	else {
4748	aSig0 \|= LIT64( 0x0001000000000000 );
4749	}
4750	shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4751	return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4752
4753	}
4754
4755	#endif
4756
4757	/*----------------------------------------------------------------------------
4758	\| Rounds the quadruple-precision floating-point value `a' to an integer, and
4759	\| returns the result as a quadruple-precision floating-point value. The
4760	\| operation is performed according to the IEC/IEEE Standard for Binary
4761	\| Floating-Point Arithmetic.
4762	----------------------------------------------------------------------------/
4763
4764	float128 float128_round_to_int( float128 a STATUS_PARAM )
4765	{
4766	flag aSign;
4767	int32 aExp;
4768	bits64 lastBitMask, roundBitsMask;
4769	int8 roundingMode;
4770	float128 z;
4771
4772	aExp = extractFloat128Exp( a );
4773	if ( 0x402F <= aExp ) {
4774	if ( 0x406F <= aExp ) {
4775	if ( ( aExp == 0x7FFF )
4776	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) )
4777	) {
4778	return propagateFloat128NaN( a, a STATUS_VAR );
4779	}
4780	return a;
4781	}
4782	lastBitMask = 1;
4783	lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4784	roundBitsMask = lastBitMask - 1;
4785	z = a;
4786	roundingMode = STATUS(float_rounding_mode);
4787	if ( roundingMode == float_round_nearest_even ) {
4788	if ( lastBitMask ) {
4789	add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4790	if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4791	}
4792	else {
4793	if ( (sbits64) z.low < 0 ) {
4794	++z.high;
4795	if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4796	}
4797	}
4798	}
4799	else if ( roundingMode != float_round_to_zero ) {
4800	if ( extractFloat128Sign( z )
4801	^ ( roundingMode == float_round_up ) ) {
4802	add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4803	}
4804	}
4805	z.low &= ~ roundBitsMask;
4806	}
4807	else {
4808	if ( aExp < 0x3FFF ) {
4809	if ( ( ( (bits64) ( a.high<<1 ) ) \| a.low ) == 0 ) return a;
4810	STATUS(float_exception_flags) \|= float_flag_inexact;
4811	aSign = extractFloat128Sign( a );
4812	switch ( STATUS(float_rounding_mode) ) {
4813	case float_round_nearest_even:
4814	if ( ( aExp == 0x3FFE )
4815	&& ( extractFloat128Frac0( a )
4816	\| extractFloat128Frac1( a ) )
4817	) {
4818	return packFloat128( aSign, 0x3FFF, 0, 0 );
4819	}
4820	break;
4821	case float_round_down:
4822	return
4823	aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4824	: packFloat128( 0, 0, 0, 0 );
4825	case float_round_up:
4826	return
4827	aSign ? packFloat128( 1, 0, 0, 0 )
4828	: packFloat128( 0, 0x3FFF, 0, 0 );
4829	}
4830	return packFloat128( aSign, 0, 0, 0 );
4831	}
4832	lastBitMask = 1;
4833	lastBitMask <<= 0x402F - aExp;
4834	roundBitsMask = lastBitMask - 1;
4835	z.low = 0;
4836	z.high = a.high;
4837	roundingMode = STATUS(float_rounding_mode);
4838	if ( roundingMode == float_round_nearest_even ) {
4839	z.high += lastBitMask>>1;
4840	if ( ( ( z.high & roundBitsMask ) \| a.low ) == 0 ) {
4841	z.high &= ~ lastBitMask;
4842	}
4843	}
4844	else if ( roundingMode != float_round_to_zero ) {
4845	if ( extractFloat128Sign( z )
4846	^ ( roundingMode == float_round_up ) ) {
4847	z.high \|= ( a.low != 0 );
4848	z.high += roundBitsMask;
4849	}
4850	}
4851	z.high &= ~ roundBitsMask;
4852	}
4853	if ( ( z.low != a.low ) \|\| ( z.high != a.high ) ) {
4854	STATUS(float_exception_flags) \|= float_flag_inexact;
4855	}
4856	return z;
4857
4858	}
4859
4860	/*----------------------------------------------------------------------------
4861	\| Returns the result of adding the absolute values of the quadruple-precision
4862	\| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4863	\| before being returned. `zSign' is ignored if the result is a NaN.
4864	\| The addition is performed according to the IEC/IEEE Standard for Binary
4865	\| Floating-Point Arithmetic.
4866	----------------------------------------------------------------------------/
4867
4868	static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4869	{
4870	int32 aExp, bExp, zExp;
4871	bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4872	int32 expDiff;
4873
4874	aSig1 = extractFloat128Frac1( a );
4875	aSig0 = extractFloat128Frac0( a );
4876	aExp = extractFloat128Exp( a );
4877	bSig1 = extractFloat128Frac1( b );
4878	bSig0 = extractFloat128Frac0( b );
4879	bExp = extractFloat128Exp( b );
4880	expDiff = aExp - bExp;
4881	if ( 0 < expDiff ) {
4882	if ( aExp == 0x7FFF ) {
4883	if ( aSig0 \| aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4884	return a;
4885	}
4886	if ( bExp == 0 ) {
4887	--expDiff;
4888	}
4889	else {
4890	bSig0 \|= LIT64( 0x0001000000000000 );
4891	}
4892	shift128ExtraRightJamming(
4893	bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4894	zExp = aExp;
4895	}
4896	else if ( expDiff < 0 ) {
4897	if ( bExp == 0x7FFF ) {
4898	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4899	return packFloat128( zSign, 0x7FFF, 0, 0 );
4900	}
4901	if ( aExp == 0 ) {
4902	++expDiff;
4903	}
4904	else {
4905	aSig0 \|= LIT64( 0x0001000000000000 );
4906	}
4907	shift128ExtraRightJamming(
4908	aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4909	zExp = bExp;
4910	}
4911	else {
4912	if ( aExp == 0x7FFF ) {
4913	if ( aSig0 \| aSig1 \| bSig0 \| bSig1 ) {
4914	return propagateFloat128NaN( a, b STATUS_VAR );
4915	}
4916	return a;
4917	}
4918	add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4919	if ( aExp == 0 ) {
4920	if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
4921	return packFloat128( zSign, 0, zSig0, zSig1 );
4922	}
4923	zSig2 = 0;
4924	zSig0 \|= LIT64( 0x0002000000000000 );
4925	zExp = aExp;
4926	goto shiftRight1;
4927	}
4928	aSig0 \|= LIT64( 0x0001000000000000 );
4929	add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4930	--zExp;
4931	if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4932	++zExp;
4933	shiftRight1:
4934	shift128ExtraRightJamming(
4935	zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4936	roundAndPack:
4937	return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4938
4939	}
4940
4941	/*----------------------------------------------------------------------------
4942	\| Returns the result of subtracting the absolute values of the quadruple-
4943	\| precision floating-point values `a' and `b'. If `zSign' is 1, the
4944	\| difference is negated before being returned. `zSign' is ignored if the
4945	\| result is a NaN. The subtraction is performed according to the IEC/IEEE
4946	\| Standard for Binary Floating-Point Arithmetic.
4947	----------------------------------------------------------------------------/
4948
4949	static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4950	{
4951	int32 aExp, bExp, zExp;
4952	bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4953	int32 expDiff;
4954	float128 z;
4955
4956	aSig1 = extractFloat128Frac1( a );
4957	aSig0 = extractFloat128Frac0( a );
4958	aExp = extractFloat128Exp( a );
4959	bSig1 = extractFloat128Frac1( b );
4960	bSig0 = extractFloat128Frac0( b );
4961	bExp = extractFloat128Exp( b );
4962	expDiff = aExp - bExp;
4963	shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4964	shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4965	if ( 0 < expDiff ) goto aExpBigger;
4966	if ( expDiff < 0 ) goto bExpBigger;
4967	if ( aExp == 0x7FFF ) {
4968	if ( aSig0 \| aSig1 \| bSig0 \| bSig1 ) {
4969	return propagateFloat128NaN( a, b STATUS_VAR );
4970	}
4971	float_raise( float_flag_invalid STATUS_VAR);
4972	z.low = float128_default_nan_low;
4973	z.high = float128_default_nan_high;
4974	return z;
4975	}
4976	if ( aExp == 0 ) {
4977	aExp = 1;
4978	bExp = 1;
4979	}
4980	if ( bSig0 < aSig0 ) goto aBigger;
4981	if ( aSig0 < bSig0 ) goto bBigger;
4982	if ( bSig1 < aSig1 ) goto aBigger;
4983	if ( aSig1 < bSig1 ) goto bBigger;
4984	return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
4985	bExpBigger:
4986	if ( bExp == 0x7FFF ) {
4987	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4988	return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4989	}
4990	if ( aExp == 0 ) {
4991	++expDiff;
4992	}
4993	else {
4994	aSig0 \|= LIT64( 0x4000000000000000 );
4995	}
4996	shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4997	bSig0 \|= LIT64( 0x4000000000000000 );
4998	bBigger:
4999	sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5000	zExp = bExp;
5001	zSign ^= 1;
5002	goto normalizeRoundAndPack;
5003	aExpBigger:
5004	if ( aExp == 0x7FFF ) {
5005	if ( aSig0 \| aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5006	return a;
5007	}
5008	if ( bExp == 0 ) {
5009	--expDiff;
5010	}
5011	else {
5012	bSig0 \|= LIT64( 0x4000000000000000 );
5013	}
5014	shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5015	aSig0 \|= LIT64( 0x4000000000000000 );
5016	aBigger:
5017	sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5018	zExp = aExp;
5019	normalizeRoundAndPack:
5020	--zExp;
5021	return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5022
5023	}
5024
5025	/*----------------------------------------------------------------------------
5026	\| Returns the result of adding the quadruple-precision floating-point values
5027	\| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
5028	\| for Binary Floating-Point Arithmetic.
5029	----------------------------------------------------------------------------/
5030
5031	float128 float128_add( float128 a, float128 b STATUS_PARAM )
5032	{
5033	flag aSign, bSign;
5034
5035	aSign = extractFloat128Sign( a );
5036	bSign = extractFloat128Sign( b );
5037	if ( aSign == bSign ) {
5038	return addFloat128Sigs( a, b, aSign STATUS_VAR );
5039	}
5040	else {
5041	return subFloat128Sigs( a, b, aSign STATUS_VAR );
5042	}
5043
5044	}
5045
5046	/*----------------------------------------------------------------------------
5047	\| Returns the result of subtracting the quadruple-precision floating-point
5048	\| values `a' and `b'. The operation is performed according to the IEC/IEEE
5049	\| Standard for Binary Floating-Point Arithmetic.
5050	----------------------------------------------------------------------------/
5051
5052	float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5053	{
5054	flag aSign, bSign;
5055
5056	aSign = extractFloat128Sign( a );
5057	bSign = extractFloat128Sign( b );
5058	if ( aSign == bSign ) {
5059	return subFloat128Sigs( a, b, aSign STATUS_VAR );
5060	}
5061	else {
5062	return addFloat128Sigs( a, b, aSign STATUS_VAR );
5063	}
5064
5065	}
5066
5067	/*----------------------------------------------------------------------------
5068	\| Returns the result of multiplying the quadruple-precision floating-point
5069	\| values `a' and `b'. The operation is performed according to the IEC/IEEE
5070	\| Standard for Binary Floating-Point Arithmetic.
5071	----------------------------------------------------------------------------/
5072
5073	float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5074	{
5075	flag aSign, bSign, zSign;
5076	int32 aExp, bExp, zExp;
5077	bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5078	float128 z;
5079
5080	aSig1 = extractFloat128Frac1( a );
5081	aSig0 = extractFloat128Frac0( a );
5082	aExp = extractFloat128Exp( a );
5083	aSign = extractFloat128Sign( a );
5084	bSig1 = extractFloat128Frac1( b );
5085	bSig0 = extractFloat128Frac0( b );
5086	bExp = extractFloat128Exp( b );
5087	bSign = extractFloat128Sign( b );
5088	zSign = aSign ^ bSign;
5089	if ( aExp == 0x7FFF ) {
5090	if ( ( aSig0 \| aSig1 )
5091	\|\| ( ( bExp == 0x7FFF ) && ( bSig0 \| bSig1 ) ) ) {
5092	return propagateFloat128NaN( a, b STATUS_VAR );
5093	}
5094	if ( ( bExp \| bSig0 \| bSig1 ) == 0 ) goto invalid;
5095	return packFloat128( zSign, 0x7FFF, 0, 0 );
5096	}
5097	if ( bExp == 0x7FFF ) {
5098	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5099	if ( ( aExp \| aSig0 \| aSig1 ) == 0 ) {
5100	invalid:
5101	float_raise( float_flag_invalid STATUS_VAR);
5102	z.low = float128_default_nan_low;
5103	z.high = float128_default_nan_high;
5104	return z;
5105	}
5106	return packFloat128( zSign, 0x7FFF, 0, 0 );
5107	}
5108	if ( aExp == 0 ) {
5109	if ( ( aSig0 \| aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5110	normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5111	}
5112	if ( bExp == 0 ) {
5113	if ( ( bSig0 \| bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5114	normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5115	}
5116	zExp = aExp + bExp - 0x4000;
5117	aSig0 \|= LIT64( 0x0001000000000000 );
5118	shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5119	mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5120	add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5121	zSig2 \|= ( zSig3 != 0 );
5122	if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5123	shift128ExtraRightJamming(
5124	zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5125	++zExp;
5126	}
5127	return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5128
5129	}
5130
5131	/*----------------------------------------------------------------------------
5132	\| Returns the result of dividing the quadruple-precision floating-point value
5133	\| `a' by the corresponding value `b'. The operation is performed according to
5134	\| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5135	----------------------------------------------------------------------------/
5136
5137	float128 float128_div( float128 a, float128 b STATUS_PARAM )
5138	{
5139	flag aSign, bSign, zSign;
5140	int32 aExp, bExp, zExp;
5141	bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5142	bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5143	float128 z;
5144
5145	aSig1 = extractFloat128Frac1( a );
5146	aSig0 = extractFloat128Frac0( a );
5147	aExp = extractFloat128Exp( a );
5148	aSign = extractFloat128Sign( a );
5149	bSig1 = extractFloat128Frac1( b );
5150	bSig0 = extractFloat128Frac0( b );
5151	bExp = extractFloat128Exp( b );
5152	bSign = extractFloat128Sign( b );
5153	zSign = aSign ^ bSign;
5154	if ( aExp == 0x7FFF ) {
5155	if ( aSig0 \| aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5156	if ( bExp == 0x7FFF ) {
5157	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5158	goto invalid;
5159	}
5160	return packFloat128( zSign, 0x7FFF, 0, 0 );
5161	}
5162	if ( bExp == 0x7FFF ) {
5163	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5164	return packFloat128( zSign, 0, 0, 0 );
5165	}
5166	if ( bExp == 0 ) {
5167	if ( ( bSig0 \| bSig1 ) == 0 ) {
5168	if ( ( aExp \| aSig0 \| aSig1 ) == 0 ) {
5169	invalid:
5170	float_raise( float_flag_invalid STATUS_VAR);
5171	z.low = float128_default_nan_low;
5172	z.high = float128_default_nan_high;
5173	return z;
5174	}
5175	float_raise( float_flag_divbyzero STATUS_VAR);
5176	return packFloat128( zSign, 0x7FFF, 0, 0 );
5177	}
5178	normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5179	}
5180	if ( aExp == 0 ) {
5181	if ( ( aSig0 \| aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5182	normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5183	}
5184	zExp = aExp - bExp + 0x3FFD;
5185	shortShift128Left(
5186	aSig0 \| LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5187	shortShift128Left(
5188	bSig0 \| LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5189	if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5190	shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5191	++zExp;
5192	}
5193	zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5194	mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5195	sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5196	while ( (sbits64) rem0 < 0 ) {
5197	--zSig0;
5198	add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5199	}
5200	zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5201	if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5202	mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5203	sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5204	while ( (sbits64) rem1 < 0 ) {
5205	--zSig1;
5206	add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5207	}
5208	zSig1 \|= ( ( rem1 \| rem2 \| rem3 ) != 0 );
5209	}
5210	shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5211	return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5212
5213	}
5214
5215	/*----------------------------------------------------------------------------
5216	\| Returns the remainder of the quadruple-precision floating-point value `a'
5217	\| with respect to the corresponding value `b'. The operation is performed
5218	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5219	----------------------------------------------------------------------------/
5220
5221	float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5222	{
5223	flag aSign, zSign;
5224	int32 aExp, bExp, expDiff;
5225	bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5226	bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5227	sbits64 sigMean0;
5228	float128 z;
5229
5230	aSig1 = extractFloat128Frac1( a );
5231	aSig0 = extractFloat128Frac0( a );
5232	aExp = extractFloat128Exp( a );
5233	aSign = extractFloat128Sign( a );
5234	bSig1 = extractFloat128Frac1( b );
5235	bSig0 = extractFloat128Frac0( b );
5236	bExp = extractFloat128Exp( b );
5237	if ( aExp == 0x7FFF ) {
5238	if ( ( aSig0 \| aSig1 )
5239	\|\| ( ( bExp == 0x7FFF ) && ( bSig0 \| bSig1 ) ) ) {
5240	return propagateFloat128NaN( a, b STATUS_VAR );
5241	}
5242	goto invalid;
5243	}
5244	if ( bExp == 0x7FFF ) {
5245	if ( bSig0 \| bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5246	return a;
5247	}
5248	if ( bExp == 0 ) {
5249	if ( ( bSig0 \| bSig1 ) == 0 ) {
5250	invalid:
5251	float_raise( float_flag_invalid STATUS_VAR);
5252	z.low = float128_default_nan_low;
5253	z.high = float128_default_nan_high;
5254	return z;
5255	}
5256	normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5257	}
5258	if ( aExp == 0 ) {
5259	if ( ( aSig0 \| aSig1 ) == 0 ) return a;
5260	normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5261	}
5262	expDiff = aExp - bExp;
5263	if ( expDiff < -1 ) return a;
5264	shortShift128Left(
5265	aSig0 \| LIT64( 0x0001000000000000 ),
5266	aSig1,
5267	15 - ( expDiff < 0 ),
5268	&aSig0,
5269	&aSig1
5270	);
5271	shortShift128Left(
5272	bSig0 \| LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5273	q = le128( bSig0, bSig1, aSig0, aSig1 );
5274	if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5275	expDiff -= 64;
5276	while ( 0 < expDiff ) {
5277	q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5278	q = ( 4 < q ) ? q - 4 : 0;
5279	mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5280	shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5281	shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5282	sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5283	expDiff -= 61;
5284	}
5285	if ( -64 < expDiff ) {
5286	q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5287	q = ( 4 < q ) ? q - 4 : 0;
5288	q >>= - expDiff;
5289	shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5290	expDiff += 52;
5291	if ( expDiff < 0 ) {
5292	shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5293	}
5294	else {
5295	shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5296	}
5297	mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5298	sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5299	}
5300	else {
5301	shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5302	shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5303	}
5304	do {
5305	alternateASig0 = aSig0;
5306	alternateASig1 = aSig1;
5307	++q;
5308	sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5309	} while ( 0 <= (sbits64) aSig0 );
5310	add128(
5311	aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5312	if ( ( sigMean0 < 0 )
5313	\|\| ( ( ( sigMean0 \| sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5314	aSig0 = alternateASig0;
5315	aSig1 = alternateASig1;
5316	}
5317	zSign = ( (sbits64) aSig0 < 0 );
5318	if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5319	return
5320	normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5321
5322	}
5323
5324	/*----------------------------------------------------------------------------
5325	\| Returns the square root of the quadruple-precision floating-point value `a'.
5326	\| The operation is performed according to the IEC/IEEE Standard for Binary
5327	\| Floating-Point Arithmetic.
5328	----------------------------------------------------------------------------/
5329
5330	float128 float128_sqrt( float128 a STATUS_PARAM )
5331	{
5332	flag aSign;
5333	int32 aExp, zExp;
5334	bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5335	bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5336	float128 z;
5337
5338	aSig1 = extractFloat128Frac1( a );
5339	aSig0 = extractFloat128Frac0( a );
5340	aExp = extractFloat128Exp( a );
5341	aSign = extractFloat128Sign( a );
5342	if ( aExp == 0x7FFF ) {
5343	if ( aSig0 \| aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5344	if ( ! aSign ) return a;
5345	goto invalid;
5346	}
5347	if ( aSign ) {
5348	if ( ( aExp \| aSig0 \| aSig1 ) == 0 ) return a;
5349	invalid:
5350	float_raise( float_flag_invalid STATUS_VAR);
5351	z.low = float128_default_nan_low;
5352	z.high = float128_default_nan_high;
5353	return z;
5354	}
5355	if ( aExp == 0 ) {
5356	if ( ( aSig0 \| aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5357	normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5358	}
5359	zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5360	aSig0 \|= LIT64( 0x0001000000000000 );
5361	zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5362	shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5363	zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5364	doubleZSig0 = zSig0<<1;
5365	mul64To128( zSig0, zSig0, &term0, &term1 );
5366	sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5367	while ( (sbits64) rem0 < 0 ) {
5368	--zSig0;
5369	doubleZSig0 -= 2;
5370	add128( rem0, rem1, zSig0>>63, doubleZSig0 \| 1, &rem0, &rem1 );
5371	}
5372	zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5373	if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5374	if ( zSig1 == 0 ) zSig1 = 1;
5375	mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5376	sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5377	mul64To128( zSig1, zSig1, &term2, &term3 );
5378	sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5379	while ( (sbits64) rem1 < 0 ) {
5380	--zSig1;
5381	shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5382	term3 \|= 1;
5383	term2 \|= doubleZSig0;
5384	add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5385	}
5386	zSig1 \|= ( ( rem1 \| rem2 \| rem3 ) != 0 );
5387	}
5388	shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5389	return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5390
5391	}
5392
5393	/*----------------------------------------------------------------------------
5394	\| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5395	\| the corresponding value `b', and 0 otherwise. The comparison is performed
5396	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5397	----------------------------------------------------------------------------/
5398
5399	int float128_eq( float128 a, float128 b STATUS_PARAM )
5400	{
5401
5402	if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5403	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
5404	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF )
5405	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
5406	) {
5407	if ( float128_is_signaling_nan( a )
5408	\|\| float128_is_signaling_nan( b ) ) {
5409	float_raise( float_flag_invalid STATUS_VAR);
5410	}
5411	return 0;
5412	}
5413	return
5414	( a.low == b.low )
5415	&& ( ( a.high == b.high )
5416	\|\| ( ( a.low == 0 )
5417	&& ( (bits64) ( ( a.high \| b.high )<<1 ) == 0 ) )
5418	);
5419
5420	}
5421
5422	/*----------------------------------------------------------------------------
5423	\| Returns 1 if the quadruple-precision floating-point value `a' is less than
5424	\| or equal to the corresponding value `b', and 0 otherwise. The comparison
5425	\| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5426	\| Arithmetic.
5427	----------------------------------------------------------------------------/
5428
5429	int float128_le( float128 a, float128 b STATUS_PARAM )
5430	{
5431	flag aSign, bSign;
5432
5433	if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5434	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
5435	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF )
5436	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
5437	) {
5438	float_raise( float_flag_invalid STATUS_VAR);
5439	return 0;
5440	}
5441	aSign = extractFloat128Sign( a );
5442	bSign = extractFloat128Sign( b );
5443	if ( aSign != bSign ) {
5444	return
5445	aSign
5446	\|\| ( ( ( (bits64) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
5447	== 0 );
5448	}
5449	return
5450	aSign ? le128( b.high, b.low, a.high, a.low )
5451	: le128( a.high, a.low, b.high, b.low );
5452
5453	}
5454
5455	/*----------------------------------------------------------------------------
5456	\| Returns 1 if the quadruple-precision floating-point value `a' is less than
5457	\| the corresponding value `b', and 0 otherwise. The comparison is performed
5458	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5459	----------------------------------------------------------------------------/
5460
5461	int float128_lt( float128 a, float128 b STATUS_PARAM )
5462	{
5463	flag aSign, bSign;
5464
5465	if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5466	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
5467	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF )
5468	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
5469	) {
5470	float_raise( float_flag_invalid STATUS_VAR);
5471	return 0;
5472	}
5473	aSign = extractFloat128Sign( a );
5474	bSign = extractFloat128Sign( b );
5475	if ( aSign != bSign ) {
5476	return
5477	aSign
5478	&& ( ( ( (bits64) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
5479	!= 0 );
5480	}
5481	return
5482	aSign ? lt128( b.high, b.low, a.high, a.low )
5483	: lt128( a.high, a.low, b.high, b.low );
5484
5485	}
5486
5487	/*----------------------------------------------------------------------------
5488	\| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5489	\| the corresponding value `b', and 0 otherwise. The invalid exception is
5490	\| raised if either operand is a NaN. Otherwise, the comparison is performed
5491	\| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5492	----------------------------------------------------------------------------/
5493
5494	int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5495	{
5496
5497	if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5498	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
5499	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF )
5500	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
5501	) {
5502	float_raise( float_flag_invalid STATUS_VAR);
5503	return 0;
5504	}
5505	return
5506	( a.low == b.low )
5507	&& ( ( a.high == b.high )
5508	\|\| ( ( a.low == 0 )
5509	&& ( (bits64) ( ( a.high \| b.high )<<1 ) == 0 ) )
5510	);
5511
5512	}
5513
5514	/*----------------------------------------------------------------------------
5515	\| Returns 1 if the quadruple-precision floating-point value `a' is less than
5516	\| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5517	\| cause an exception. Otherwise, the comparison is performed according to the
5518	\| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5519	----------------------------------------------------------------------------/
5520
5521	int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5522	{
5523	flag aSign, bSign;
5524
5525	if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5526	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
5527	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF )
5528	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
5529	) {
5530	if ( float128_is_signaling_nan( a )
5531	\|\| float128_is_signaling_nan( b ) ) {
5532	float_raise( float_flag_invalid STATUS_VAR);
5533	}
5534	return 0;
5535	}
5536	aSign = extractFloat128Sign( a );
5537	bSign = extractFloat128Sign( b );
5538	if ( aSign != bSign ) {
5539	return
5540	aSign
5541	\|\| ( ( ( (bits64) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
5542	== 0 );
5543	}
5544	return
5545	aSign ? le128( b.high, b.low, a.high, a.low )
5546	: le128( a.high, a.low, b.high, b.low );
5547
5548	}
5549
5550	/*----------------------------------------------------------------------------
5551	\| Returns 1 if the quadruple-precision floating-point value `a' is less than
5552	\| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5553	\| exception. Otherwise, the comparison is performed according to the IEC/IEEE
5554	\| Standard for Binary Floating-Point Arithmetic.
5555	----------------------------------------------------------------------------/
5556
5557	int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5558	{
5559	flag aSign, bSign;
5560
5561	if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5562	&& ( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) )
5563	\|\| ( ( extractFloat128Exp( b ) == 0x7FFF )
5564	&& ( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )
5565	) {
5566	if ( float128_is_signaling_nan( a )
5567	\|\| float128_is_signaling_nan( b ) ) {
5568	float_raise( float_flag_invalid STATUS_VAR);
5569	}
5570	return 0;
5571	}
5572	aSign = extractFloat128Sign( a );
5573	bSign = extractFloat128Sign( b );
5574	if ( aSign != bSign ) {
5575	return
5576	aSign
5577	&& ( ( ( (bits64) ( ( a.high \| b.high )<<1 ) ) \| a.low \| b.low )
5578	!= 0 );
5579	}
5580	return
5581	aSign ? lt128( b.high, b.low, a.high, a.low )
5582	: lt128( a.high, a.low, b.high, b.low );
5583
5584	}
5585
5586	#endif
5587
5588	/* misc functions */
5589	float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5590	{
5591	return int64_to_float32(a STATUS_VAR);
5592	}
5593
5594	float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5595	{
5596	return int64_to_float64(a STATUS_VAR);
5597	}
5598
5599	unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5600	{
5601	int64_t v;
5602	unsigned int res;
5603
5604	v = float32_to_int64(a STATUS_VAR);
5605	if (v < 0) {
5606	res = 0;
5607	float_raise( float_flag_invalid STATUS_VAR);
5608	} else if (v > 0xffffffff) {
5609	res = 0xffffffff;
5610	float_raise( float_flag_invalid STATUS_VAR);
5611	} else {
5612	res = v;
5613	}
5614	return res;
5615	}
5616
5617	unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5618	{
5619	int64_t v;
5620	unsigned int res;
5621
5622	v = float32_to_int64_round_to_zero(a STATUS_VAR);
5623	if (v < 0) {
5624	res = 0;
5625	float_raise( float_flag_invalid STATUS_VAR);
5626	} else if (v > 0xffffffff) {
5627	res = 0xffffffff;
5628	float_raise( float_flag_invalid STATUS_VAR);
5629	} else {
5630	res = v;
5631	}
5632	return res;
5633	}
5634
5635	unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5636	{
5637	int64_t v;
5638	unsigned int res;
5639
5640	v = float64_to_int64(a STATUS_VAR);
5641	if (v < 0) {
5642	res = 0;
5643	float_raise( float_flag_invalid STATUS_VAR);
5644	} else if (v > 0xffffffff) {
5645	res = 0xffffffff;
5646	float_raise( float_flag_invalid STATUS_VAR);
5647	} else {
5648	res = v;
5649	}
5650	return res;
5651	}
5652
5653	unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5654	{
5655	int64_t v;
5656	unsigned int res;
5657
5658	v = float64_to_int64_round_to_zero(a STATUS_VAR);
5659	if (v < 0) {
5660	res = 0;
5661	float_raise( float_flag_invalid STATUS_VAR);
5662	} else if (v > 0xffffffff) {
5663	res = 0xffffffff;
5664	float_raise( float_flag_invalid STATUS_VAR);
5665	} else {
5666	res = v;
5667	}
5668	return res;
5669	}
5670
5671	/* FIXME: This looks broken. */
5672	uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5673	{
5674	int64_t v;
5675
5676	v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5677	v += float64_val(a);
5678	v = float64_to_int64(make_float64(v) STATUS_VAR);
5679
5680	return v - INT64_MIN;
5681	}
5682
5683	uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5684	{
5685	int64_t v;
5686
5687	v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5688	v += float64_val(a);
5689	v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5690
5691	return v - INT64_MIN;
5692	}
5693
5694	#define COMPARE(s, nan_exp) \
5695	INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
5696	int is_quiet STATUS_PARAM ) \
5697	{ \
5698	flag aSign, bSign; \
5699	bits ## s av, bv; \
5700	\
5701	if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
5702	extractFloat ## s ## Frac( a ) ) \|\| \
5703	( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
5704	extractFloat ## s ## Frac( b ) )) { \
5705	if (!is_quiet \|\| \
5706	float ## s ## _is_signaling_nan( a ) \|\| \
5707	float ## s ## _is_signaling_nan( b ) ) { \
5708	float_raise( float_flag_invalid STATUS_VAR); \
5709	} \
5710	return float_relation_unordered; \
5711	} \
5712	aSign = extractFloat ## s ## Sign( a ); \
5713	bSign = extractFloat ## s ## Sign( b ); \
5714	av = float ## s ## _val(a); \
5715	bv = float ## s ## _val(b); \
5716	if ( aSign != bSign ) { \
5717	if ( (bits ## s) ( ( av \| bv )<<1 ) == 0 ) { \
5718	/* zero case */ \
5719	return float_relation_equal; \
5720	} else { \
5721	return 1 - (2 * aSign); \
5722	} \
5723	} else { \
5724	if (av == bv) { \
5725	return float_relation_equal; \
5726	} else { \
5727	return 1 - 2 * (aSign ^ ( av < bv )); \
5728	} \
5729	} \
5730	} \
5731	\
5732	int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
5733	{ \
5734	return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
5735	} \
5736	\
5737	int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
5738	{ \
5739	return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
5740	}
5741
5742	COMPARE(32, 0xff)
5743	COMPARE(64, 0x7ff)
5744
5745	INLINE int float128_compare_internal( float128 a, float128 b,
5746	int is_quiet STATUS_PARAM )
5747	{
5748	flag aSign, bSign;
5749
5750	if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
5751	( extractFloat128Frac0( a ) \| extractFloat128Frac1( a ) ) ) \|\|
5752	( ( extractFloat128Exp( b ) == 0x7fff ) &&
5753	( extractFloat128Frac0( b ) \| extractFloat128Frac1( b ) ) )) {
5754	if (!is_quiet \|\|
5755	float128_is_signaling_nan( a ) \|\|
5756	float128_is_signaling_nan( b ) ) {
5757	float_raise( float_flag_invalid STATUS_VAR);
5758	}
5759	return float_relation_unordered;
5760	}
5761	aSign = extractFloat128Sign( a );
5762	bSign = extractFloat128Sign( b );
5763	if ( aSign != bSign ) {
5764	if ( ( ( ( a.high \| b.high )<<1 ) \| a.low \| b.low ) == 0 ) {
5765	/* zero case */
5766	return float_relation_equal;
5767	} else {
5768	return 1 - (2 * aSign);
5769	}
5770	} else {
5771	if (a.low == b.low && a.high == b.high) {
5772	return float_relation_equal;
5773	} else {
5774	return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
5775	}
5776	}
5777	}
5778
5779	int float128_compare( float128 a, float128 b STATUS_PARAM )
5780	{
5781	return float128_compare_internal(a, b, 0 STATUS_VAR);
5782	}
5783
5784	int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
5785	{
5786	return float128_compare_internal(a, b, 1 STATUS_VAR);
5787	}
5788
5789	/* Multiply A by 2 raised to the power N. */
5790	float32 float32_scalbn( float32 a, int n STATUS_PARAM )
5791	{
5792	flag aSign;
5793	int16 aExp;
5794	bits32 aSig;
5795
5796	aSig = extractFloat32Frac( a );
5797	aExp = extractFloat32Exp( a );
5798	aSign = extractFloat32Sign( a );
5799
5800	if ( aExp == 0xFF ) {
5801	return a;
5802	}
5803	if ( aExp != 0 )
5804	aSig \|= 0x00800000;
5805	else if ( aSig == 0 )
5806	return a;
5807
5808	aExp += n - 1;
5809	aSig <<= 7;
5810	return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
5811	}
5812
5813	float64 float64_scalbn( float64 a, int n STATUS_PARAM )
5814	{
5815	flag aSign;
5816	int16 aExp;
5817	bits64 aSig;
5818
5819	aSig = extractFloat64Frac( a );
5820	aExp = extractFloat64Exp( a );
5821	aSign = extractFloat64Sign( a );
5822
5823	if ( aExp == 0x7FF ) {
5824	return a;
5825	}
5826	if ( aExp != 0 )
5827	aSig \|= LIT64( 0x0010000000000000 );
5828	else if ( aSig == 0 )
5829	return a;
5830
5831	aExp += n - 1;
5832	aSig <<= 10;
5833	return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
5834	}
5835
5836	#ifdef FLOATX80
5837	floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
5838	{
5839	flag aSign;
5840	int16 aExp;
5841	bits64 aSig;
5842
5843	aSig = extractFloatx80Frac( a );
5844	aExp = extractFloatx80Exp( a );
5845	aSign = extractFloatx80Sign( a );
5846
5847	if ( aExp == 0x7FF ) {
5848	return a;
5849	}
5850	if (aExp == 0 && aSig == 0)
5851	return a;
5852
5853	aExp += n;
5854	return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
5855	aSign, aExp, aSig, 0 STATUS_VAR );
5856	}
5857	#endif
5858
5859	#ifdef FLOAT128
5860	float128 float128_scalbn( float128 a, int n STATUS_PARAM )
5861	{
5862	flag aSign;
5863	int32 aExp;
5864	bits64 aSig0, aSig1;
5865
5866	aSig1 = extractFloat128Frac1( a );
5867	aSig0 = extractFloat128Frac0( a );
5868	aExp = extractFloat128Exp( a );
5869	aSign = extractFloat128Sign( a );
5870	if ( aExp == 0x7FFF ) {
5871	return a;
5872	}
5873	if ( aExp != 0 )
5874	aSig0 \|= LIT64( 0x0001000000000000 );
5875	else if ( aSig0 == 0 && aSig1 == 0 )
5876	return a;
5877
5878	aExp += n - 1;
5879	return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
5880	STATUS_VAR );
5881
5882	}
5883	#endif

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/recompiler/fpu/softfloat.c@ 69879

Download in other formats: