VirtualBox

source: vbox/trunk/src/recompiler/fpu/softfloat.c@ 18208

Last change on this file since 18208 was 1, checked in by vboxsync, 55 years ago

import

File size: 188.5 KB
Line 
1
2/*============================================================================
3
4This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5Package, Release 2b.
6
7Written by John R. Hauser. This work was made possible in part by the
8International Computer Science Institute, located at Suite 600, 1947 Center
9Street, Berkeley, California 94704. Funding was partially provided by the
10National Science Foundation under grant MIP-9311980. The original version
11of this code was written as part of a project to build a fixed-point vector
12processor in collaboration with the University of California at Berkeley,
13overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15arithmetic/SoftFloat.html'.
16
17THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
18been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
25
26Derivative works are acceptable, even for commercial purposes, so long as
27(1) the source code for the derivative work includes prominent notice that
28the work is derivative, and (2) the source code includes prominent notice with
29these four paragraphs for those parts of this code that are retained.
30
31=============================================================================*/
32
33#include "softfloat.h"
34
35/*----------------------------------------------------------------------------
36| Primitive arithmetic functions, including multi-word arithmetic, and
37| division and square root approximations. (Can be specialized to target if
38| desired.)
39*----------------------------------------------------------------------------*/
40#include "softfloat-macros.h"
41
42/*----------------------------------------------------------------------------
43| Functions and definitions to determine: (1) whether tininess for underflow
44| is detected before or after rounding by default, (2) what (if anything)
45| happens when exceptions are raised, (3) how signaling NaNs are distinguished
46| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
47| are propagated from function inputs to output. These details are target-
48| specific.
49*----------------------------------------------------------------------------*/
50#include "softfloat-specialize.h"
51
52void set_float_rounding_mode(int val STATUS_PARAM)
53{
54 STATUS(float_rounding_mode) = val;
55}
56
57void set_float_exception_flags(int val STATUS_PARAM)
58{
59 STATUS(float_exception_flags) = val;
60}
61
62#ifdef FLOATX80
63void set_floatx80_rounding_precision(int val STATUS_PARAM)
64{
65 STATUS(floatx80_rounding_precision) = val;
66}
67#endif
68
69/*----------------------------------------------------------------------------
70| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
71| and 7, and returns the properly rounded 32-bit integer corresponding to the
72| input. If `zSign' is 1, the input is negated before being converted to an
73| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
74| is simply rounded to an integer, with the inexact exception raised if the
75| input cannot be represented exactly as an integer. However, if the fixed-
76| point input is too large, the invalid exception is raised and the largest
77| positive or negative integer is returned.
78*----------------------------------------------------------------------------*/
79
80static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
81{
82 int8 roundingMode;
83 flag roundNearestEven;
84 int8 roundIncrement, roundBits;
85 int32 z;
86
87 roundingMode = STATUS(float_rounding_mode);
88 roundNearestEven = ( roundingMode == float_round_nearest_even );
89 roundIncrement = 0x40;
90 if ( ! roundNearestEven ) {
91 if ( roundingMode == float_round_to_zero ) {
92 roundIncrement = 0;
93 }
94 else {
95 roundIncrement = 0x7F;
96 if ( zSign ) {
97 if ( roundingMode == float_round_up ) roundIncrement = 0;
98 }
99 else {
100 if ( roundingMode == float_round_down ) roundIncrement = 0;
101 }
102 }
103 }
104 roundBits = absZ & 0x7F;
105 absZ = ( absZ + roundIncrement )>>7;
106 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
107 z = absZ;
108 if ( zSign ) z = - z;
109 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
110 float_raise( float_flag_invalid STATUS_VAR);
111 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
112 }
113 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
114 return z;
115
116}
117
118/*----------------------------------------------------------------------------
119| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
120| `absZ1', with binary point between bits 63 and 64 (between the input words),
121| and returns the properly rounded 64-bit integer corresponding to the input.
122| If `zSign' is 1, the input is negated before being converted to an integer.
123| Ordinarily, the fixed-point input is simply rounded to an integer, with
124| the inexact exception raised if the input cannot be represented exactly as
125| an integer. However, if the fixed-point input is too large, the invalid
126| exception is raised and the largest positive or negative integer is
127| returned.
128*----------------------------------------------------------------------------*/
129
130static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
131{
132 int8 roundingMode;
133 flag roundNearestEven, increment;
134 int64 z;
135
136 roundingMode = STATUS(float_rounding_mode);
137 roundNearestEven = ( roundingMode == float_round_nearest_even );
138 increment = ( (sbits64) absZ1 < 0 );
139 if ( ! roundNearestEven ) {
140 if ( roundingMode == float_round_to_zero ) {
141 increment = 0;
142 }
143 else {
144 if ( zSign ) {
145 increment = ( roundingMode == float_round_down ) && absZ1;
146 }
147 else {
148 increment = ( roundingMode == float_round_up ) && absZ1;
149 }
150 }
151 }
152 if ( increment ) {
153 ++absZ0;
154 if ( absZ0 == 0 ) goto overflow;
155 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
156 }
157 z = absZ0;
158 if ( zSign ) z = - z;
159 if ( z && ( ( z < 0 ) ^ zSign ) ) {
160 overflow:
161 float_raise( float_flag_invalid STATUS_VAR);
162 return
163 zSign ? (sbits64) LIT64( 0x8000000000000000 )
164 : LIT64( 0x7FFFFFFFFFFFFFFF );
165 }
166 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
167 return z;
168
169}
170
171/*----------------------------------------------------------------------------
172| Returns the fraction bits of the single-precision floating-point value `a'.
173*----------------------------------------------------------------------------*/
174
175INLINE bits32 extractFloat32Frac( float32 a )
176{
177
178 return a & 0x007FFFFF;
179
180}
181
182/*----------------------------------------------------------------------------
183| Returns the exponent bits of the single-precision floating-point value `a'.
184*----------------------------------------------------------------------------*/
185
186INLINE int16 extractFloat32Exp( float32 a )
187{
188
189 return ( a>>23 ) & 0xFF;
190
191}
192
193/*----------------------------------------------------------------------------
194| Returns the sign bit of the single-precision floating-point value `a'.
195*----------------------------------------------------------------------------*/
196
197INLINE flag extractFloat32Sign( float32 a )
198{
199
200 return a>>31;
201
202}
203
204/*----------------------------------------------------------------------------
205| Normalizes the subnormal single-precision floating-point value represented
206| by the denormalized significand `aSig'. The normalized exponent and
207| significand are stored at the locations pointed to by `zExpPtr' and
208| `zSigPtr', respectively.
209*----------------------------------------------------------------------------*/
210
211static void
212 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
213{
214 int8 shiftCount;
215
216 shiftCount = countLeadingZeros32( aSig ) - 8;
217 *zSigPtr = aSig<<shiftCount;
218 *zExpPtr = 1 - shiftCount;
219
220}
221
222/*----------------------------------------------------------------------------
223| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
224| single-precision floating-point value, returning the result. After being
225| shifted into the proper positions, the three fields are simply added
226| together to form the result. This means that any integer portion of `zSig'
227| will be added into the exponent. Since a properly normalized significand
228| will have an integer portion equal to 1, the `zExp' input should be 1 less
229| than the desired result exponent whenever `zSig' is a complete, normalized
230| significand.
231*----------------------------------------------------------------------------*/
232
233INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
234{
235
236 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
237
238}
239
240/*----------------------------------------------------------------------------
241| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
242| and significand `zSig', and returns the proper single-precision floating-
243| point value corresponding to the abstract input. Ordinarily, the abstract
244| value is simply rounded and packed into the single-precision format, with
245| the inexact exception raised if the abstract input cannot be represented
246| exactly. However, if the abstract value is too large, the overflow and
247| inexact exceptions are raised and an infinity or maximal finite value is
248| returned. If the abstract value is too small, the input value is rounded to
249| a subnormal number, and the underflow and inexact exceptions are raised if
250| the abstract input cannot be represented exactly as a subnormal single-
251| precision floating-point number.
252| The input significand `zSig' has its binary point between bits 30
253| and 29, which is 7 bits to the left of the usual location. This shifted
254| significand must be normalized or smaller. If `zSig' is not normalized,
255| `zExp' must be 0; in that case, the result returned is a subnormal number,
256| and it must not require rounding. In the usual case that `zSig' is
257| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
258| The handling of underflow and overflow follows the IEC/IEEE Standard for
259| Binary Floating-Point Arithmetic.
260*----------------------------------------------------------------------------*/
261
262static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
263{
264 int8 roundingMode;
265 flag roundNearestEven;
266 int8 roundIncrement, roundBits;
267 flag isTiny;
268
269 roundingMode = STATUS(float_rounding_mode);
270 roundNearestEven = ( roundingMode == float_round_nearest_even );
271 roundIncrement = 0x40;
272 if ( ! roundNearestEven ) {
273 if ( roundingMode == float_round_to_zero ) {
274 roundIncrement = 0;
275 }
276 else {
277 roundIncrement = 0x7F;
278 if ( zSign ) {
279 if ( roundingMode == float_round_up ) roundIncrement = 0;
280 }
281 else {
282 if ( roundingMode == float_round_down ) roundIncrement = 0;
283 }
284 }
285 }
286 roundBits = zSig & 0x7F;
287 if ( 0xFD <= (bits16) zExp ) {
288 if ( ( 0xFD < zExp )
289 || ( ( zExp == 0xFD )
290 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
291 ) {
292 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
293 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
294 }
295 if ( zExp < 0 ) {
296 isTiny =
297 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
298 || ( zExp < -1 )
299 || ( zSig + roundIncrement < 0x80000000 );
300 shift32RightJamming( zSig, - zExp, &zSig );
301 zExp = 0;
302 roundBits = zSig & 0x7F;
303 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
304 }
305 }
306 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
307 zSig = ( zSig + roundIncrement )>>7;
308 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
309 if ( zSig == 0 ) zExp = 0;
310 return packFloat32( zSign, zExp, zSig );
311
312}
313
314/*----------------------------------------------------------------------------
315| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
316| and significand `zSig', and returns the proper single-precision floating-
317| point value corresponding to the abstract input. This routine is just like
318| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
319| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
320| floating-point exponent.
321*----------------------------------------------------------------------------*/
322
323static float32
324 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
325{
326 int8 shiftCount;
327
328 shiftCount = countLeadingZeros32( zSig ) - 1;
329 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
330
331}
332
333/*----------------------------------------------------------------------------
334| Returns the fraction bits of the double-precision floating-point value `a'.
335*----------------------------------------------------------------------------*/
336
337INLINE bits64 extractFloat64Frac( float64 a )
338{
339
340 return a & LIT64( 0x000FFFFFFFFFFFFF );
341
342}
343
344/*----------------------------------------------------------------------------
345| Returns the exponent bits of the double-precision floating-point value `a'.
346*----------------------------------------------------------------------------*/
347
348INLINE int16 extractFloat64Exp( float64 a )
349{
350
351 return ( a>>52 ) & 0x7FF;
352
353}
354
355/*----------------------------------------------------------------------------
356| Returns the sign bit of the double-precision floating-point value `a'.
357*----------------------------------------------------------------------------*/
358
359INLINE flag extractFloat64Sign( float64 a )
360{
361
362 return a>>63;
363
364}
365
366/*----------------------------------------------------------------------------
367| Normalizes the subnormal double-precision floating-point value represented
368| by the denormalized significand `aSig'. The normalized exponent and
369| significand are stored at the locations pointed to by `zExpPtr' and
370| `zSigPtr', respectively.
371*----------------------------------------------------------------------------*/
372
373static void
374 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
375{
376 int8 shiftCount;
377
378 shiftCount = countLeadingZeros64( aSig ) - 11;
379 *zSigPtr = aSig<<shiftCount;
380 *zExpPtr = 1 - shiftCount;
381
382}
383
384/*----------------------------------------------------------------------------
385| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
386| double-precision floating-point value, returning the result. After being
387| shifted into the proper positions, the three fields are simply added
388| together to form the result. This means that any integer portion of `zSig'
389| will be added into the exponent. Since a properly normalized significand
390| will have an integer portion equal to 1, the `zExp' input should be 1 less
391| than the desired result exponent whenever `zSig' is a complete, normalized
392| significand.
393*----------------------------------------------------------------------------*/
394
395INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
396{
397
398 return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
399
400}
401
402/*----------------------------------------------------------------------------
403| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
404| and significand `zSig', and returns the proper double-precision floating-
405| point value corresponding to the abstract input. Ordinarily, the abstract
406| value is simply rounded and packed into the double-precision format, with
407| the inexact exception raised if the abstract input cannot be represented
408| exactly. However, if the abstract value is too large, the overflow and
409| inexact exceptions are raised and an infinity or maximal finite value is
410| returned. If the abstract value is too small, the input value is rounded
411| to a subnormal number, and the underflow and inexact exceptions are raised
412| if the abstract input cannot be represented exactly as a subnormal double-
413| precision floating-point number.
414| The input significand `zSig' has its binary point between bits 62
415| and 61, which is 10 bits to the left of the usual location. This shifted
416| significand must be normalized or smaller. If `zSig' is not normalized,
417| `zExp' must be 0; in that case, the result returned is a subnormal number,
418| and it must not require rounding. In the usual case that `zSig' is
419| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
420| The handling of underflow and overflow follows the IEC/IEEE Standard for
421| Binary Floating-Point Arithmetic.
422*----------------------------------------------------------------------------*/
423
424static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
425{
426 int8 roundingMode;
427 flag roundNearestEven;
428 int16 roundIncrement, roundBits;
429 flag isTiny;
430
431 roundingMode = STATUS(float_rounding_mode);
432 roundNearestEven = ( roundingMode == float_round_nearest_even );
433 roundIncrement = 0x200;
434 if ( ! roundNearestEven ) {
435 if ( roundingMode == float_round_to_zero ) {
436 roundIncrement = 0;
437 }
438 else {
439 roundIncrement = 0x3FF;
440 if ( zSign ) {
441 if ( roundingMode == float_round_up ) roundIncrement = 0;
442 }
443 else {
444 if ( roundingMode == float_round_down ) roundIncrement = 0;
445 }
446 }
447 }
448 roundBits = zSig & 0x3FF;
449 if ( 0x7FD <= (bits16) zExp ) {
450 if ( ( 0x7FD < zExp )
451 || ( ( zExp == 0x7FD )
452 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
453 ) {
454 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
455 return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
456 }
457 if ( zExp < 0 ) {
458 isTiny =
459 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
460 || ( zExp < -1 )
461 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
462 shift64RightJamming( zSig, - zExp, &zSig );
463 zExp = 0;
464 roundBits = zSig & 0x3FF;
465 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
466 }
467 }
468 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
469 zSig = ( zSig + roundIncrement )>>10;
470 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
471 if ( zSig == 0 ) zExp = 0;
472 return packFloat64( zSign, zExp, zSig );
473
474}
475
476/*----------------------------------------------------------------------------
477| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
478| and significand `zSig', and returns the proper double-precision floating-
479| point value corresponding to the abstract input. This routine is just like
480| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
481| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
482| floating-point exponent.
483*----------------------------------------------------------------------------*/
484
485static float64
486 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
487{
488 int8 shiftCount;
489
490 shiftCount = countLeadingZeros64( zSig ) - 1;
491 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
492
493}
494
495#ifdef FLOATX80
496
497/*----------------------------------------------------------------------------
498| Returns the fraction bits of the extended double-precision floating-point
499| value `a'.
500*----------------------------------------------------------------------------*/
501
502INLINE bits64 extractFloatx80Frac( floatx80 a )
503{
504
505 return a.low;
506
507}
508
509/*----------------------------------------------------------------------------
510| Returns the exponent bits of the extended double-precision floating-point
511| value `a'.
512*----------------------------------------------------------------------------*/
513
514INLINE int32 extractFloatx80Exp( floatx80 a )
515{
516
517 return a.high & 0x7FFF;
518
519}
520
521/*----------------------------------------------------------------------------
522| Returns the sign bit of the extended double-precision floating-point value
523| `a'.
524*----------------------------------------------------------------------------*/
525
526INLINE flag extractFloatx80Sign( floatx80 a )
527{
528
529 return a.high>>15;
530
531}
532
533/*----------------------------------------------------------------------------
534| Normalizes the subnormal extended double-precision floating-point value
535| represented by the denormalized significand `aSig'. The normalized exponent
536| and significand are stored at the locations pointed to by `zExpPtr' and
537| `zSigPtr', respectively.
538*----------------------------------------------------------------------------*/
539
540static void
541 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
542{
543 int8 shiftCount;
544
545 shiftCount = countLeadingZeros64( aSig );
546 *zSigPtr = aSig<<shiftCount;
547 *zExpPtr = 1 - shiftCount;
548
549}
550
551/*----------------------------------------------------------------------------
552| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
553| extended double-precision floating-point value, returning the result.
554*----------------------------------------------------------------------------*/
555
556INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
557{
558 floatx80 z;
559
560 z.low = zSig;
561 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
562 return z;
563
564}
565
566/*----------------------------------------------------------------------------
567| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
568| and extended significand formed by the concatenation of `zSig0' and `zSig1',
569| and returns the proper extended double-precision floating-point value
570| corresponding to the abstract input. Ordinarily, the abstract value is
571| rounded and packed into the extended double-precision format, with the
572| inexact exception raised if the abstract input cannot be represented
573| exactly. However, if the abstract value is too large, the overflow and
574| inexact exceptions are raised and an infinity or maximal finite value is
575| returned. If the abstract value is too small, the input value is rounded to
576| a subnormal number, and the underflow and inexact exceptions are raised if
577| the abstract input cannot be represented exactly as a subnormal extended
578| double-precision floating-point number.
579| If `roundingPrecision' is 32 or 64, the result is rounded to the same
580| number of bits as single or double precision, respectively. Otherwise, the
581| result is rounded to the full precision of the extended double-precision
582| format.
583| The input significand must be normalized or smaller. If the input
584| significand is not normalized, `zExp' must be 0; in that case, the result
585| returned is a subnormal number, and it must not require rounding. The
586| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
587| Floating-Point Arithmetic.
588*----------------------------------------------------------------------------*/
589
590static floatx80
591 roundAndPackFloatx80(
592 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
593 STATUS_PARAM)
594{
595 int8 roundingMode;
596 flag roundNearestEven, increment, isTiny;
597 int64 roundIncrement, roundMask, roundBits;
598
599 roundingMode = STATUS(float_rounding_mode);
600 roundNearestEven = ( roundingMode == float_round_nearest_even );
601 if ( roundingPrecision == 80 ) goto precision80;
602 if ( roundingPrecision == 64 ) {
603 roundIncrement = LIT64( 0x0000000000000400 );
604 roundMask = LIT64( 0x00000000000007FF );
605 }
606 else if ( roundingPrecision == 32 ) {
607 roundIncrement = LIT64( 0x0000008000000000 );
608 roundMask = LIT64( 0x000000FFFFFFFFFF );
609 }
610 else {
611 goto precision80;
612 }
613 zSig0 |= ( zSig1 != 0 );
614 if ( ! roundNearestEven ) {
615 if ( roundingMode == float_round_to_zero ) {
616 roundIncrement = 0;
617 }
618 else {
619 roundIncrement = roundMask;
620 if ( zSign ) {
621 if ( roundingMode == float_round_up ) roundIncrement = 0;
622 }
623 else {
624 if ( roundingMode == float_round_down ) roundIncrement = 0;
625 }
626 }
627 }
628 roundBits = zSig0 & roundMask;
629 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
630 if ( ( 0x7FFE < zExp )
631 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
632 ) {
633 goto overflow;
634 }
635 if ( zExp <= 0 ) {
636 isTiny =
637 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
638 || ( zExp < 0 )
639 || ( zSig0 <= zSig0 + roundIncrement );
640 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
641 zExp = 0;
642 roundBits = zSig0 & roundMask;
643 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
644 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
645 zSig0 += roundIncrement;
646 if ( (sbits64) zSig0 < 0 ) zExp = 1;
647 roundIncrement = roundMask + 1;
648 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
649 roundMask |= roundIncrement;
650 }
651 zSig0 &= ~ roundMask;
652 return packFloatx80( zSign, zExp, zSig0 );
653 }
654 }
655 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
656 zSig0 += roundIncrement;
657 if ( zSig0 < roundIncrement ) {
658 ++zExp;
659 zSig0 = LIT64( 0x8000000000000000 );
660 }
661 roundIncrement = roundMask + 1;
662 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
663 roundMask |= roundIncrement;
664 }
665 zSig0 &= ~ roundMask;
666 if ( zSig0 == 0 ) zExp = 0;
667 return packFloatx80( zSign, zExp, zSig0 );
668 precision80:
669 increment = ( (sbits64) zSig1 < 0 );
670 if ( ! roundNearestEven ) {
671 if ( roundingMode == float_round_to_zero ) {
672 increment = 0;
673 }
674 else {
675 if ( zSign ) {
676 increment = ( roundingMode == float_round_down ) && zSig1;
677 }
678 else {
679 increment = ( roundingMode == float_round_up ) && zSig1;
680 }
681 }
682 }
683 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
684 if ( ( 0x7FFE < zExp )
685 || ( ( zExp == 0x7FFE )
686 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
687 && increment
688 )
689 ) {
690 roundMask = 0;
691 overflow:
692 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
693 if ( ( roundingMode == float_round_to_zero )
694 || ( zSign && ( roundingMode == float_round_up ) )
695 || ( ! zSign && ( roundingMode == float_round_down ) )
696 ) {
697 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
698 }
699 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
700 }
701 if ( zExp <= 0 ) {
702 isTiny =
703 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
704 || ( zExp < 0 )
705 || ! increment
706 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
707 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
708 zExp = 0;
709 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
710 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
711 if ( roundNearestEven ) {
712 increment = ( (sbits64) zSig1 < 0 );
713 }
714 else {
715 if ( zSign ) {
716 increment = ( roundingMode == float_round_down ) && zSig1;
717 }
718 else {
719 increment = ( roundingMode == float_round_up ) && zSig1;
720 }
721 }
722 if ( increment ) {
723 ++zSig0;
724 zSig0 &=
725 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
726 if ( (sbits64) zSig0 < 0 ) zExp = 1;
727 }
728 return packFloatx80( zSign, zExp, zSig0 );
729 }
730 }
731 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
732 if ( increment ) {
733 ++zSig0;
734 if ( zSig0 == 0 ) {
735 ++zExp;
736 zSig0 = LIT64( 0x8000000000000000 );
737 }
738 else {
739 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
740 }
741 }
742 else {
743 if ( zSig0 == 0 ) zExp = 0;
744 }
745 return packFloatx80( zSign, zExp, zSig0 );
746
747}
748
749/*----------------------------------------------------------------------------
750| Takes an abstract floating-point value having sign `zSign', exponent
751| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
752| and returns the proper extended double-precision floating-point value
753| corresponding to the abstract input. This routine is just like
754| `roundAndPackFloatx80' except that the input significand does not have to be
755| normalized.
756*----------------------------------------------------------------------------*/
757
758static floatx80
759 normalizeRoundAndPackFloatx80(
760 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
761 STATUS_PARAM)
762{
763 int8 shiftCount;
764
765 if ( zSig0 == 0 ) {
766 zSig0 = zSig1;
767 zSig1 = 0;
768 zExp -= 64;
769 }
770 shiftCount = countLeadingZeros64( zSig0 );
771 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
772 zExp -= shiftCount;
773 return
774 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
775
776}
777
778#endif
779
780#ifdef FLOAT128
781
782/*----------------------------------------------------------------------------
783| Returns the least-significant 64 fraction bits of the quadruple-precision
784| floating-point value `a'.
785*----------------------------------------------------------------------------*/
786
787INLINE bits64 extractFloat128Frac1( float128 a )
788{
789
790 return a.low;
791
792}
793
794/*----------------------------------------------------------------------------
795| Returns the most-significant 48 fraction bits of the quadruple-precision
796| floating-point value `a'.
797*----------------------------------------------------------------------------*/
798
799INLINE bits64 extractFloat128Frac0( float128 a )
800{
801
802 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
803
804}
805
806/*----------------------------------------------------------------------------
807| Returns the exponent bits of the quadruple-precision floating-point value
808| `a'.
809*----------------------------------------------------------------------------*/
810
811INLINE int32 extractFloat128Exp( float128 a )
812{
813
814 return ( a.high>>48 ) & 0x7FFF;
815
816}
817
818/*----------------------------------------------------------------------------
819| Returns the sign bit of the quadruple-precision floating-point value `a'.
820*----------------------------------------------------------------------------*/
821
822INLINE flag extractFloat128Sign( float128 a )
823{
824
825 return a.high>>63;
826
827}
828
829/*----------------------------------------------------------------------------
830| Normalizes the subnormal quadruple-precision floating-point value
831| represented by the denormalized significand formed by the concatenation of
832| `aSig0' and `aSig1'. The normalized exponent is stored at the location
833| pointed to by `zExpPtr'. The most significant 49 bits of the normalized
834| significand are stored at the location pointed to by `zSig0Ptr', and the
835| least significant 64 bits of the normalized significand are stored at the
836| location pointed to by `zSig1Ptr'.
837*----------------------------------------------------------------------------*/
838
839static void
840 normalizeFloat128Subnormal(
841 bits64 aSig0,
842 bits64 aSig1,
843 int32 *zExpPtr,
844 bits64 *zSig0Ptr,
845 bits64 *zSig1Ptr
846 )
847{
848 int8 shiftCount;
849
850 if ( aSig0 == 0 ) {
851 shiftCount = countLeadingZeros64( aSig1 ) - 15;
852 if ( shiftCount < 0 ) {
853 *zSig0Ptr = aSig1>>( - shiftCount );
854 *zSig1Ptr = aSig1<<( shiftCount & 63 );
855 }
856 else {
857 *zSig0Ptr = aSig1<<shiftCount;
858 *zSig1Ptr = 0;
859 }
860 *zExpPtr = - shiftCount - 63;
861 }
862 else {
863 shiftCount = countLeadingZeros64( aSig0 ) - 15;
864 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
865 *zExpPtr = 1 - shiftCount;
866 }
867
868}
869
870/*----------------------------------------------------------------------------
871| Packs the sign `zSign', the exponent `zExp', and the significand formed
872| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
873| floating-point value, returning the result. After being shifted into the
874| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
875| added together to form the most significant 32 bits of the result. This
876| means that any integer portion of `zSig0' will be added into the exponent.
877| Since a properly normalized significand will have an integer portion equal
878| to 1, the `zExp' input should be 1 less than the desired result exponent
879| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
880| significand.
881*----------------------------------------------------------------------------*/
882
883INLINE float128
884 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
885{
886 float128 z;
887
888 z.low = zSig1;
889 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
890 return z;
891
892}
893
894/*----------------------------------------------------------------------------
895| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
896| and extended significand formed by the concatenation of `zSig0', `zSig1',
897| and `zSig2', and returns the proper quadruple-precision floating-point value
898| corresponding to the abstract input. Ordinarily, the abstract value is
899| simply rounded and packed into the quadruple-precision format, with the
900| inexact exception raised if the abstract input cannot be represented
901| exactly. However, if the abstract value is too large, the overflow and
902| inexact exceptions are raised and an infinity or maximal finite value is
903| returned. If the abstract value is too small, the input value is rounded to
904| a subnormal number, and the underflow and inexact exceptions are raised if
905| the abstract input cannot be represented exactly as a subnormal quadruple-
906| precision floating-point number.
907| The input significand must be normalized or smaller. If the input
908| significand is not normalized, `zExp' must be 0; in that case, the result
909| returned is a subnormal number, and it must not require rounding. In the
910| usual case that the input significand is normalized, `zExp' must be 1 less
911| than the ``true'' floating-point exponent. The handling of underflow and
912| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
913*----------------------------------------------------------------------------*/
914
915static float128
916 roundAndPackFloat128(
917 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
918{
919 int8 roundingMode;
920 flag roundNearestEven, increment, isTiny;
921
922 roundingMode = STATUS(float_rounding_mode);
923 roundNearestEven = ( roundingMode == float_round_nearest_even );
924 increment = ( (sbits64) zSig2 < 0 );
925 if ( ! roundNearestEven ) {
926 if ( roundingMode == float_round_to_zero ) {
927 increment = 0;
928 }
929 else {
930 if ( zSign ) {
931 increment = ( roundingMode == float_round_down ) && zSig2;
932 }
933 else {
934 increment = ( roundingMode == float_round_up ) && zSig2;
935 }
936 }
937 }
938 if ( 0x7FFD <= (bits32) zExp ) {
939 if ( ( 0x7FFD < zExp )
940 || ( ( zExp == 0x7FFD )
941 && eq128(
942 LIT64( 0x0001FFFFFFFFFFFF ),
943 LIT64( 0xFFFFFFFFFFFFFFFF ),
944 zSig0,
945 zSig1
946 )
947 && increment
948 )
949 ) {
950 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
951 if ( ( roundingMode == float_round_to_zero )
952 || ( zSign && ( roundingMode == float_round_up ) )
953 || ( ! zSign && ( roundingMode == float_round_down ) )
954 ) {
955 return
956 packFloat128(
957 zSign,
958 0x7FFE,
959 LIT64( 0x0000FFFFFFFFFFFF ),
960 LIT64( 0xFFFFFFFFFFFFFFFF )
961 );
962 }
963 return packFloat128( zSign, 0x7FFF, 0, 0 );
964 }
965 if ( zExp < 0 ) {
966 isTiny =
967 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
968 || ( zExp < -1 )
969 || ! increment
970 || lt128(
971 zSig0,
972 zSig1,
973 LIT64( 0x0001FFFFFFFFFFFF ),
974 LIT64( 0xFFFFFFFFFFFFFFFF )
975 );
976 shift128ExtraRightJamming(
977 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
978 zExp = 0;
979 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
980 if ( roundNearestEven ) {
981 increment = ( (sbits64) zSig2 < 0 );
982 }
983 else {
984 if ( zSign ) {
985 increment = ( roundingMode == float_round_down ) && zSig2;
986 }
987 else {
988 increment = ( roundingMode == float_round_up ) && zSig2;
989 }
990 }
991 }
992 }
993 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
994 if ( increment ) {
995 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
996 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
997 }
998 else {
999 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1000 }
1001 return packFloat128( zSign, zExp, zSig0, zSig1 );
1002
1003}
1004
1005/*----------------------------------------------------------------------------
1006| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1007| and significand formed by the concatenation of `zSig0' and `zSig1', and
1008| returns the proper quadruple-precision floating-point value corresponding
1009| to the abstract input. This routine is just like `roundAndPackFloat128'
1010| except that the input significand has fewer bits and does not have to be
1011| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1012| point exponent.
1013*----------------------------------------------------------------------------*/
1014
1015static float128
1016 normalizeRoundAndPackFloat128(
1017 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1018{
1019 int8 shiftCount;
1020 bits64 zSig2;
1021
1022 if ( zSig0 == 0 ) {
1023 zSig0 = zSig1;
1024 zSig1 = 0;
1025 zExp -= 64;
1026 }
1027 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1028 if ( 0 <= shiftCount ) {
1029 zSig2 = 0;
1030 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1031 }
1032 else {
1033 shift128ExtraRightJamming(
1034 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1035 }
1036 zExp -= shiftCount;
1037 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1038
1039}
1040
1041#endif
1042
1043/*----------------------------------------------------------------------------
1044| Returns the result of converting the 32-bit two's complement integer `a'
1045| to the single-precision floating-point format. The conversion is performed
1046| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1047*----------------------------------------------------------------------------*/
1048
1049float32 int32_to_float32( int32 a STATUS_PARAM )
1050{
1051 flag zSign;
1052
1053 if ( a == 0 ) return 0;
1054 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1055 zSign = ( a < 0 );
1056 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1057
1058}
1059
1060/*----------------------------------------------------------------------------
1061| Returns the result of converting the 32-bit two's complement integer `a'
1062| to the double-precision floating-point format. The conversion is performed
1063| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1064*----------------------------------------------------------------------------*/
1065
1066float64 int32_to_float64( int32 a STATUS_PARAM )
1067{
1068 flag zSign;
1069 uint32 absA;
1070 int8 shiftCount;
1071 bits64 zSig;
1072
1073 if ( a == 0 ) return 0;
1074 zSign = ( a < 0 );
1075 absA = zSign ? - a : a;
1076 shiftCount = countLeadingZeros32( absA ) + 21;
1077 zSig = absA;
1078 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1079
1080}
1081
1082#ifdef FLOATX80
1083
1084/*----------------------------------------------------------------------------
1085| Returns the result of converting the 32-bit two's complement integer `a'
1086| to the extended double-precision floating-point format. The conversion
1087| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1088| Arithmetic.
1089*----------------------------------------------------------------------------*/
1090
1091floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1092{
1093 flag zSign;
1094 uint32 absA;
1095 int8 shiftCount;
1096 bits64 zSig;
1097
1098 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1099 zSign = ( a < 0 );
1100 absA = zSign ? - a : a;
1101 shiftCount = countLeadingZeros32( absA ) + 32;
1102 zSig = absA;
1103 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1104
1105}
1106
1107#endif
1108
1109#ifdef FLOAT128
1110
1111/*----------------------------------------------------------------------------
1112| Returns the result of converting the 32-bit two's complement integer `a' to
1113| the quadruple-precision floating-point format. The conversion is performed
1114| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115*----------------------------------------------------------------------------*/
1116
1117float128 int32_to_float128( int32 a STATUS_PARAM )
1118{
1119 flag zSign;
1120 uint32 absA;
1121 int8 shiftCount;
1122 bits64 zSig0;
1123
1124 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1125 zSign = ( a < 0 );
1126 absA = zSign ? - a : a;
1127 shiftCount = countLeadingZeros32( absA ) + 17;
1128 zSig0 = absA;
1129 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1130
1131}
1132
1133#endif
1134
1135/*----------------------------------------------------------------------------
1136| Returns the result of converting the 64-bit two's complement integer `a'
1137| to the single-precision floating-point format. The conversion is performed
1138| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139*----------------------------------------------------------------------------*/
1140
1141float32 int64_to_float32( int64 a STATUS_PARAM )
1142{
1143 flag zSign;
1144 uint64 absA;
1145 int8 shiftCount;
1146
1147 if ( a == 0 ) return 0;
1148 zSign = ( a < 0 );
1149 absA = zSign ? - a : a;
1150 shiftCount = countLeadingZeros64( absA ) - 40;
1151 if ( 0 <= shiftCount ) {
1152 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1153 }
1154 else {
1155 shiftCount += 7;
1156 if ( shiftCount < 0 ) {
1157 shift64RightJamming( absA, - shiftCount, &absA );
1158 }
1159 else {
1160 absA <<= shiftCount;
1161 }
1162 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1163 }
1164
1165}
1166
1167/*----------------------------------------------------------------------------
1168| Returns the result of converting the 64-bit two's complement integer `a'
1169| to the double-precision floating-point format. The conversion is performed
1170| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1171*----------------------------------------------------------------------------*/
1172
1173float64 int64_to_float64( int64 a STATUS_PARAM )
1174{
1175 flag zSign;
1176
1177 if ( a == 0 ) return 0;
1178 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1179 return packFloat64( 1, 0x43E, 0 );
1180 }
1181 zSign = ( a < 0 );
1182 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1183
1184}
1185
1186#ifdef FLOATX80
1187
1188/*----------------------------------------------------------------------------
1189| Returns the result of converting the 64-bit two's complement integer `a'
1190| to the extended double-precision floating-point format. The conversion
1191| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1192| Arithmetic.
1193*----------------------------------------------------------------------------*/
1194
1195floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1196{
1197 flag zSign;
1198 uint64 absA;
1199 int8 shiftCount;
1200
1201 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1202 zSign = ( a < 0 );
1203 absA = zSign ? - a : a;
1204 shiftCount = countLeadingZeros64( absA );
1205 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1206
1207}
1208
1209#endif
1210
1211#ifdef FLOAT128
1212
1213/*----------------------------------------------------------------------------
1214| Returns the result of converting the 64-bit two's complement integer `a' to
1215| the quadruple-precision floating-point format. The conversion is performed
1216| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1217*----------------------------------------------------------------------------*/
1218
1219float128 int64_to_float128( int64 a STATUS_PARAM )
1220{
1221 flag zSign;
1222 uint64 absA;
1223 int8 shiftCount;
1224 int32 zExp;
1225 bits64 zSig0, zSig1;
1226
1227 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1228 zSign = ( a < 0 );
1229 absA = zSign ? - a : a;
1230 shiftCount = countLeadingZeros64( absA ) + 49;
1231 zExp = 0x406E - shiftCount;
1232 if ( 64 <= shiftCount ) {
1233 zSig1 = 0;
1234 zSig0 = absA;
1235 shiftCount -= 64;
1236 }
1237 else {
1238 zSig1 = absA;
1239 zSig0 = 0;
1240 }
1241 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1242 return packFloat128( zSign, zExp, zSig0, zSig1 );
1243
1244}
1245
1246#endif
1247
1248/*----------------------------------------------------------------------------
1249| Returns the result of converting the single-precision floating-point value
1250| `a' to the 32-bit two's complement integer format. The conversion is
1251| performed according to the IEC/IEEE Standard for Binary Floating-Point
1252| Arithmetic---which means in particular that the conversion is rounded
1253| according to the current rounding mode. If `a' is a NaN, the largest
1254| positive integer is returned. Otherwise, if the conversion overflows, the
1255| largest integer with the same sign as `a' is returned.
1256*----------------------------------------------------------------------------*/
1257
1258int32 float32_to_int32( float32 a STATUS_PARAM )
1259{
1260 flag aSign;
1261 int16 aExp, shiftCount;
1262 bits32 aSig;
1263 bits64 aSig64;
1264
1265 aSig = extractFloat32Frac( a );
1266 aExp = extractFloat32Exp( a );
1267 aSign = extractFloat32Sign( a );
1268 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1269 if ( aExp ) aSig |= 0x00800000;
1270 shiftCount = 0xAF - aExp;
1271 aSig64 = aSig;
1272 aSig64 <<= 32;
1273 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1274 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1275
1276}
1277
1278/*----------------------------------------------------------------------------
1279| Returns the result of converting the single-precision floating-point value
1280| `a' to the 32-bit two's complement integer format. The conversion is
1281| performed according to the IEC/IEEE Standard for Binary Floating-Point
1282| Arithmetic, except that the conversion is always rounded toward zero.
1283| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1284| the conversion overflows, the largest integer with the same sign as `a' is
1285| returned.
1286*----------------------------------------------------------------------------*/
1287
1288int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1289{
1290 flag aSign;
1291 int16 aExp, shiftCount;
1292 bits32 aSig;
1293 int32 z;
1294
1295 aSig = extractFloat32Frac( a );
1296 aExp = extractFloat32Exp( a );
1297 aSign = extractFloat32Sign( a );
1298 shiftCount = aExp - 0x9E;
1299 if ( 0 <= shiftCount ) {
1300 if ( a != 0xCF000000 ) {
1301 float_raise( float_flag_invalid STATUS_VAR);
1302 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1303 }
1304 return (sbits32) 0x80000000;
1305 }
1306 else if ( aExp <= 0x7E ) {
1307 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1308 return 0;
1309 }
1310 aSig = ( aSig | 0x00800000 )<<8;
1311 z = aSig>>( - shiftCount );
1312 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1313 STATUS(float_exception_flags) |= float_flag_inexact;
1314 }
1315 if ( aSign ) z = - z;
1316 return z;
1317
1318}
1319
1320/*----------------------------------------------------------------------------
1321| Returns the result of converting the single-precision floating-point value
1322| `a' to the 64-bit two's complement integer format. The conversion is
1323| performed according to the IEC/IEEE Standard for Binary Floating-Point
1324| Arithmetic---which means in particular that the conversion is rounded
1325| according to the current rounding mode. If `a' is a NaN, the largest
1326| positive integer is returned. Otherwise, if the conversion overflows, the
1327| largest integer with the same sign as `a' is returned.
1328*----------------------------------------------------------------------------*/
1329
1330int64 float32_to_int64( float32 a STATUS_PARAM )
1331{
1332 flag aSign;
1333 int16 aExp, shiftCount;
1334 bits32 aSig;
1335 bits64 aSig64, aSigExtra;
1336
1337 aSig = extractFloat32Frac( a );
1338 aExp = extractFloat32Exp( a );
1339 aSign = extractFloat32Sign( a );
1340 shiftCount = 0xBE - aExp;
1341 if ( shiftCount < 0 ) {
1342 float_raise( float_flag_invalid STATUS_VAR);
1343 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1344 return LIT64( 0x7FFFFFFFFFFFFFFF );
1345 }
1346 return (sbits64) LIT64( 0x8000000000000000 );
1347 }
1348 if ( aExp ) aSig |= 0x00800000;
1349 aSig64 = aSig;
1350 aSig64 <<= 40;
1351 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1352 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1353
1354}
1355
1356/*----------------------------------------------------------------------------
1357| Returns the result of converting the single-precision floating-point value
1358| `a' to the 64-bit two's complement integer format. The conversion is
1359| performed according to the IEC/IEEE Standard for Binary Floating-Point
1360| Arithmetic, except that the conversion is always rounded toward zero. If
1361| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1362| conversion overflows, the largest integer with the same sign as `a' is
1363| returned.
1364*----------------------------------------------------------------------------*/
1365
1366int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1367{
1368 flag aSign;
1369 int16 aExp, shiftCount;
1370 bits32 aSig;
1371 bits64 aSig64;
1372 int64 z;
1373
1374 aSig = extractFloat32Frac( a );
1375 aExp = extractFloat32Exp( a );
1376 aSign = extractFloat32Sign( a );
1377 shiftCount = aExp - 0xBE;
1378 if ( 0 <= shiftCount ) {
1379 if ( a != 0xDF000000 ) {
1380 float_raise( float_flag_invalid STATUS_VAR);
1381 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1382 return LIT64( 0x7FFFFFFFFFFFFFFF );
1383 }
1384 }
1385 return (sbits64) LIT64( 0x8000000000000000 );
1386 }
1387 else if ( aExp <= 0x7E ) {
1388 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1389 return 0;
1390 }
1391 aSig64 = aSig | 0x00800000;
1392 aSig64 <<= 40;
1393 z = aSig64>>( - shiftCount );
1394 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1395 STATUS(float_exception_flags) |= float_flag_inexact;
1396 }
1397 if ( aSign ) z = - z;
1398 return z;
1399
1400}
1401
1402/*----------------------------------------------------------------------------
1403| Returns the result of converting the single-precision floating-point value
1404| `a' to the double-precision floating-point format. The conversion is
1405| performed according to the IEC/IEEE Standard for Binary Floating-Point
1406| Arithmetic.
1407*----------------------------------------------------------------------------*/
1408
1409float64 float32_to_float64( float32 a STATUS_PARAM )
1410{
1411 flag aSign;
1412 int16 aExp;
1413 bits32 aSig;
1414
1415 aSig = extractFloat32Frac( a );
1416 aExp = extractFloat32Exp( a );
1417 aSign = extractFloat32Sign( a );
1418 if ( aExp == 0xFF ) {
1419 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1420 return packFloat64( aSign, 0x7FF, 0 );
1421 }
1422 if ( aExp == 0 ) {
1423 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1424 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1425 --aExp;
1426 }
1427 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1428
1429}
1430
1431#ifdef FLOATX80
1432
1433/*----------------------------------------------------------------------------
1434| Returns the result of converting the single-precision floating-point value
1435| `a' to the extended double-precision floating-point format. The conversion
1436| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1437| Arithmetic.
1438*----------------------------------------------------------------------------*/
1439
1440floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1441{
1442 flag aSign;
1443 int16 aExp;
1444 bits32 aSig;
1445
1446 aSig = extractFloat32Frac( a );
1447 aExp = extractFloat32Exp( a );
1448 aSign = extractFloat32Sign( a );
1449 if ( aExp == 0xFF ) {
1450 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1451 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1452 }
1453 if ( aExp == 0 ) {
1454 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1455 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1456 }
1457 aSig |= 0x00800000;
1458 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1459
1460}
1461
1462#endif
1463
1464#ifdef FLOAT128
1465
1466/*----------------------------------------------------------------------------
1467| Returns the result of converting the single-precision floating-point value
1468| `a' to the double-precision floating-point format. The conversion is
1469| performed according to the IEC/IEEE Standard for Binary Floating-Point
1470| Arithmetic.
1471*----------------------------------------------------------------------------*/
1472
1473float128 float32_to_float128( float32 a STATUS_PARAM )
1474{
1475 flag aSign;
1476 int16 aExp;
1477 bits32 aSig;
1478
1479 aSig = extractFloat32Frac( a );
1480 aExp = extractFloat32Exp( a );
1481 aSign = extractFloat32Sign( a );
1482 if ( aExp == 0xFF ) {
1483 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1484 return packFloat128( aSign, 0x7FFF, 0, 0 );
1485 }
1486 if ( aExp == 0 ) {
1487 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1488 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1489 --aExp;
1490 }
1491 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1492
1493}
1494
1495#endif
1496
1497/*----------------------------------------------------------------------------
1498| Rounds the single-precision floating-point value `a' to an integer, and
1499| returns the result as a single-precision floating-point value. The
1500| operation is performed according to the IEC/IEEE Standard for Binary
1501| Floating-Point Arithmetic.
1502*----------------------------------------------------------------------------*/
1503
1504float32 float32_round_to_int( float32 a STATUS_PARAM)
1505{
1506 flag aSign;
1507 int16 aExp;
1508 bits32 lastBitMask, roundBitsMask;
1509 int8 roundingMode;
1510 float32 z;
1511
1512 aExp = extractFloat32Exp( a );
1513 if ( 0x96 <= aExp ) {
1514 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1515 return propagateFloat32NaN( a, a STATUS_VAR );
1516 }
1517 return a;
1518 }
1519 if ( aExp <= 0x7E ) {
1520 if ( (bits32) ( a<<1 ) == 0 ) return a;
1521 STATUS(float_exception_flags) |= float_flag_inexact;
1522 aSign = extractFloat32Sign( a );
1523 switch ( STATUS(float_rounding_mode) ) {
1524 case float_round_nearest_even:
1525 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1526 return packFloat32( aSign, 0x7F, 0 );
1527 }
1528 break;
1529 case float_round_down:
1530 return aSign ? 0xBF800000 : 0;
1531 case float_round_up:
1532 return aSign ? 0x80000000 : 0x3F800000;
1533 }
1534 return packFloat32( aSign, 0, 0 );
1535 }
1536 lastBitMask = 1;
1537 lastBitMask <<= 0x96 - aExp;
1538 roundBitsMask = lastBitMask - 1;
1539 z = a;
1540 roundingMode = STATUS(float_rounding_mode);
1541 if ( roundingMode == float_round_nearest_even ) {
1542 z += lastBitMask>>1;
1543 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1544 }
1545 else if ( roundingMode != float_round_to_zero ) {
1546 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1547 z += roundBitsMask;
1548 }
1549 }
1550 z &= ~ roundBitsMask;
1551 if ( z != a ) STATUS(float_exception_flags) |= float_flag_inexact;
1552 return z;
1553
1554}
1555
1556/*----------------------------------------------------------------------------
1557| Returns the result of adding the absolute values of the single-precision
1558| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1559| before being returned. `zSign' is ignored if the result is a NaN.
1560| The addition is performed according to the IEC/IEEE Standard for Binary
1561| Floating-Point Arithmetic.
1562*----------------------------------------------------------------------------*/
1563
1564static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1565{
1566 int16 aExp, bExp, zExp;
1567 bits32 aSig, bSig, zSig;
1568 int16 expDiff;
1569
1570 aSig = extractFloat32Frac( a );
1571 aExp = extractFloat32Exp( a );
1572 bSig = extractFloat32Frac( b );
1573 bExp = extractFloat32Exp( b );
1574 expDiff = aExp - bExp;
1575 aSig <<= 6;
1576 bSig <<= 6;
1577 if ( 0 < expDiff ) {
1578 if ( aExp == 0xFF ) {
1579 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1580 return a;
1581 }
1582 if ( bExp == 0 ) {
1583 --expDiff;
1584 }
1585 else {
1586 bSig |= 0x20000000;
1587 }
1588 shift32RightJamming( bSig, expDiff, &bSig );
1589 zExp = aExp;
1590 }
1591 else if ( expDiff < 0 ) {
1592 if ( bExp == 0xFF ) {
1593 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1594 return packFloat32( zSign, 0xFF, 0 );
1595 }
1596 if ( aExp == 0 ) {
1597 ++expDiff;
1598 }
1599 else {
1600 aSig |= 0x20000000;
1601 }
1602 shift32RightJamming( aSig, - expDiff, &aSig );
1603 zExp = bExp;
1604 }
1605 else {
1606 if ( aExp == 0xFF ) {
1607 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1608 return a;
1609 }
1610 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1611 zSig = 0x40000000 + aSig + bSig;
1612 zExp = aExp;
1613 goto roundAndPack;
1614 }
1615 aSig |= 0x20000000;
1616 zSig = ( aSig + bSig )<<1;
1617 --zExp;
1618 if ( (sbits32) zSig < 0 ) {
1619 zSig = aSig + bSig;
1620 ++zExp;
1621 }
1622 roundAndPack:
1623 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1624
1625}
1626
1627/*----------------------------------------------------------------------------
1628| Returns the result of subtracting the absolute values of the single-
1629| precision floating-point values `a' and `b'. If `zSign' is 1, the
1630| difference is negated before being returned. `zSign' is ignored if the
1631| result is a NaN. The subtraction is performed according to the IEC/IEEE
1632| Standard for Binary Floating-Point Arithmetic.
1633*----------------------------------------------------------------------------*/
1634
1635static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1636{
1637 int16 aExp, bExp, zExp;
1638 bits32 aSig, bSig, zSig;
1639 int16 expDiff;
1640
1641 aSig = extractFloat32Frac( a );
1642 aExp = extractFloat32Exp( a );
1643 bSig = extractFloat32Frac( b );
1644 bExp = extractFloat32Exp( b );
1645 expDiff = aExp - bExp;
1646 aSig <<= 7;
1647 bSig <<= 7;
1648 if ( 0 < expDiff ) goto aExpBigger;
1649 if ( expDiff < 0 ) goto bExpBigger;
1650 if ( aExp == 0xFF ) {
1651 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1652 float_raise( float_flag_invalid STATUS_VAR);
1653 return float32_default_nan;
1654 }
1655 if ( aExp == 0 ) {
1656 aExp = 1;
1657 bExp = 1;
1658 }
1659 if ( bSig < aSig ) goto aBigger;
1660 if ( aSig < bSig ) goto bBigger;
1661 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1662 bExpBigger:
1663 if ( bExp == 0xFF ) {
1664 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1665 return packFloat32( zSign ^ 1, 0xFF, 0 );
1666 }
1667 if ( aExp == 0 ) {
1668 ++expDiff;
1669 }
1670 else {
1671 aSig |= 0x40000000;
1672 }
1673 shift32RightJamming( aSig, - expDiff, &aSig );
1674 bSig |= 0x40000000;
1675 bBigger:
1676 zSig = bSig - aSig;
1677 zExp = bExp;
1678 zSign ^= 1;
1679 goto normalizeRoundAndPack;
1680 aExpBigger:
1681 if ( aExp == 0xFF ) {
1682 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1683 return a;
1684 }
1685 if ( bExp == 0 ) {
1686 --expDiff;
1687 }
1688 else {
1689 bSig |= 0x40000000;
1690 }
1691 shift32RightJamming( bSig, expDiff, &bSig );
1692 aSig |= 0x40000000;
1693 aBigger:
1694 zSig = aSig - bSig;
1695 zExp = aExp;
1696 normalizeRoundAndPack:
1697 --zExp;
1698 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1699
1700}
1701
1702/*----------------------------------------------------------------------------
1703| Returns the result of adding the single-precision floating-point values `a'
1704| and `b'. The operation is performed according to the IEC/IEEE Standard for
1705| Binary Floating-Point Arithmetic.
1706*----------------------------------------------------------------------------*/
1707
1708float32 float32_add( float32 a, float32 b STATUS_PARAM )
1709{
1710 flag aSign, bSign;
1711
1712 aSign = extractFloat32Sign( a );
1713 bSign = extractFloat32Sign( b );
1714 if ( aSign == bSign ) {
1715 return addFloat32Sigs( a, b, aSign STATUS_VAR);
1716 }
1717 else {
1718 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1719 }
1720
1721}
1722
1723/*----------------------------------------------------------------------------
1724| Returns the result of subtracting the single-precision floating-point values
1725| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1726| for Binary Floating-Point Arithmetic.
1727*----------------------------------------------------------------------------*/
1728
1729float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1730{
1731 flag aSign, bSign;
1732
1733 aSign = extractFloat32Sign( a );
1734 bSign = extractFloat32Sign( b );
1735 if ( aSign == bSign ) {
1736 return subFloat32Sigs( a, b, aSign STATUS_VAR );
1737 }
1738 else {
1739 return addFloat32Sigs( a, b, aSign STATUS_VAR );
1740 }
1741
1742}
1743
1744/*----------------------------------------------------------------------------
1745| Returns the result of multiplying the single-precision floating-point values
1746| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1747| for Binary Floating-Point Arithmetic.
1748*----------------------------------------------------------------------------*/
1749
1750float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1751{
1752 flag aSign, bSign, zSign;
1753 int16 aExp, bExp, zExp;
1754 bits32 aSig, bSig;
1755 bits64 zSig64;
1756 bits32 zSig;
1757
1758 aSig = extractFloat32Frac( a );
1759 aExp = extractFloat32Exp( a );
1760 aSign = extractFloat32Sign( a );
1761 bSig = extractFloat32Frac( b );
1762 bExp = extractFloat32Exp( b );
1763 bSign = extractFloat32Sign( b );
1764 zSign = aSign ^ bSign;
1765 if ( aExp == 0xFF ) {
1766 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1767 return propagateFloat32NaN( a, b STATUS_VAR );
1768 }
1769 if ( ( bExp | bSig ) == 0 ) {
1770 float_raise( float_flag_invalid STATUS_VAR);
1771 return float32_default_nan;
1772 }
1773 return packFloat32( zSign, 0xFF, 0 );
1774 }
1775 if ( bExp == 0xFF ) {
1776 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1777 if ( ( aExp | aSig ) == 0 ) {
1778 float_raise( float_flag_invalid STATUS_VAR);
1779 return float32_default_nan;
1780 }
1781 return packFloat32( zSign, 0xFF, 0 );
1782 }
1783 if ( aExp == 0 ) {
1784 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1785 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1786 }
1787 if ( bExp == 0 ) {
1788 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1789 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1790 }
1791 zExp = aExp + bExp - 0x7F;
1792 aSig = ( aSig | 0x00800000 )<<7;
1793 bSig = ( bSig | 0x00800000 )<<8;
1794 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1795 zSig = zSig64;
1796 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1797 zSig <<= 1;
1798 --zExp;
1799 }
1800 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1801
1802}
1803
1804/*----------------------------------------------------------------------------
1805| Returns the result of dividing the single-precision floating-point value `a'
1806| by the corresponding value `b'. The operation is performed according to the
1807| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1808*----------------------------------------------------------------------------*/
1809
1810float32 float32_div( float32 a, float32 b STATUS_PARAM )
1811{
1812 flag aSign, bSign, zSign;
1813 int16 aExp, bExp, zExp;
1814 bits32 aSig, bSig, zSig;
1815
1816 aSig = extractFloat32Frac( a );
1817 aExp = extractFloat32Exp( a );
1818 aSign = extractFloat32Sign( a );
1819 bSig = extractFloat32Frac( b );
1820 bExp = extractFloat32Exp( b );
1821 bSign = extractFloat32Sign( b );
1822 zSign = aSign ^ bSign;
1823 if ( aExp == 0xFF ) {
1824 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1825 if ( bExp == 0xFF ) {
1826 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1827 float_raise( float_flag_invalid STATUS_VAR);
1828 return float32_default_nan;
1829 }
1830 return packFloat32( zSign, 0xFF, 0 );
1831 }
1832 if ( bExp == 0xFF ) {
1833 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1834 return packFloat32( zSign, 0, 0 );
1835 }
1836 if ( bExp == 0 ) {
1837 if ( bSig == 0 ) {
1838 if ( ( aExp | aSig ) == 0 ) {
1839 float_raise( float_flag_invalid STATUS_VAR);
1840 return float32_default_nan;
1841 }
1842 float_raise( float_flag_divbyzero STATUS_VAR);
1843 return packFloat32( zSign, 0xFF, 0 );
1844 }
1845 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1846 }
1847 if ( aExp == 0 ) {
1848 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1849 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1850 }
1851 zExp = aExp - bExp + 0x7D;
1852 aSig = ( aSig | 0x00800000 )<<7;
1853 bSig = ( bSig | 0x00800000 )<<8;
1854 if ( bSig <= ( aSig + aSig ) ) {
1855 aSig >>= 1;
1856 ++zExp;
1857 }
1858 zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1859 if ( ( zSig & 0x3F ) == 0 ) {
1860 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1861 }
1862 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1863
1864}
1865
1866/*----------------------------------------------------------------------------
1867| Returns the remainder of the single-precision floating-point value `a'
1868| with respect to the corresponding value `b'. The operation is performed
1869| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1870*----------------------------------------------------------------------------*/
1871
1872float32 float32_rem( float32 a, float32 b STATUS_PARAM )
1873{
1874 flag aSign, bSign, zSign;
1875 int16 aExp, bExp, expDiff;
1876 bits32 aSig, bSig;
1877 bits32 q;
1878 bits64 aSig64, bSig64, q64;
1879 bits32 alternateASig;
1880 sbits32 sigMean;
1881
1882 aSig = extractFloat32Frac( a );
1883 aExp = extractFloat32Exp( a );
1884 aSign = extractFloat32Sign( a );
1885 bSig = extractFloat32Frac( b );
1886 bExp = extractFloat32Exp( b );
1887 bSign = extractFloat32Sign( b );
1888 if ( aExp == 0xFF ) {
1889 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1890 return propagateFloat32NaN( a, b STATUS_VAR );
1891 }
1892 float_raise( float_flag_invalid STATUS_VAR);
1893 return float32_default_nan;
1894 }
1895 if ( bExp == 0xFF ) {
1896 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1897 return a;
1898 }
1899 if ( bExp == 0 ) {
1900 if ( bSig == 0 ) {
1901 float_raise( float_flag_invalid STATUS_VAR);
1902 return float32_default_nan;
1903 }
1904 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1905 }
1906 if ( aExp == 0 ) {
1907 if ( aSig == 0 ) return a;
1908 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1909 }
1910 expDiff = aExp - bExp;
1911 aSig |= 0x00800000;
1912 bSig |= 0x00800000;
1913 if ( expDiff < 32 ) {
1914 aSig <<= 8;
1915 bSig <<= 8;
1916 if ( expDiff < 0 ) {
1917 if ( expDiff < -1 ) return a;
1918 aSig >>= 1;
1919 }
1920 q = ( bSig <= aSig );
1921 if ( q ) aSig -= bSig;
1922 if ( 0 < expDiff ) {
1923 q = ( ( (bits64) aSig )<<32 ) / bSig;
1924 q >>= 32 - expDiff;
1925 bSig >>= 2;
1926 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1927 }
1928 else {
1929 aSig >>= 2;
1930 bSig >>= 2;
1931 }
1932 }
1933 else {
1934 if ( bSig <= aSig ) aSig -= bSig;
1935 aSig64 = ( (bits64) aSig )<<40;
1936 bSig64 = ( (bits64) bSig )<<40;
1937 expDiff -= 64;
1938 while ( 0 < expDiff ) {
1939 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1940 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1941 aSig64 = - ( ( bSig * q64 )<<38 );
1942 expDiff -= 62;
1943 }
1944 expDiff += 64;
1945 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1946 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1947 q = q64>>( 64 - expDiff );
1948 bSig <<= 6;
1949 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1950 }
1951 do {
1952 alternateASig = aSig;
1953 ++q;
1954 aSig -= bSig;
1955 } while ( 0 <= (sbits32) aSig );
1956 sigMean = aSig + alternateASig;
1957 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1958 aSig = alternateASig;
1959 }
1960 zSign = ( (sbits32) aSig < 0 );
1961 if ( zSign ) aSig = - aSig;
1962 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
1963
1964}
1965
1966/*----------------------------------------------------------------------------
1967| Returns the square root of the single-precision floating-point value `a'.
1968| The operation is performed according to the IEC/IEEE Standard for Binary
1969| Floating-Point Arithmetic.
1970*----------------------------------------------------------------------------*/
1971
1972float32 float32_sqrt( float32 a STATUS_PARAM )
1973{
1974 flag aSign;
1975 int16 aExp, zExp;
1976 bits32 aSig, zSig;
1977 bits64 rem, term;
1978
1979 aSig = extractFloat32Frac( a );
1980 aExp = extractFloat32Exp( a );
1981 aSign = extractFloat32Sign( a );
1982 if ( aExp == 0xFF ) {
1983 if ( aSig ) return propagateFloat32NaN( a, 0 STATUS_VAR );
1984 if ( ! aSign ) return a;
1985 float_raise( float_flag_invalid STATUS_VAR);
1986 return float32_default_nan;
1987 }
1988 if ( aSign ) {
1989 if ( ( aExp | aSig ) == 0 ) return a;
1990 float_raise( float_flag_invalid STATUS_VAR);
1991 return float32_default_nan;
1992 }
1993 if ( aExp == 0 ) {
1994 if ( aSig == 0 ) return 0;
1995 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1996 }
1997 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1998 aSig = ( aSig | 0x00800000 )<<8;
1999 zSig = estimateSqrt32( aExp, aSig ) + 2;
2000 if ( ( zSig & 0x7F ) <= 5 ) {
2001 if ( zSig < 2 ) {
2002 zSig = 0x7FFFFFFF;
2003 goto roundAndPack;
2004 }
2005 aSig >>= aExp & 1;
2006 term = ( (bits64) zSig ) * zSig;
2007 rem = ( ( (bits64) aSig )<<32 ) - term;
2008 while ( (sbits64) rem < 0 ) {
2009 --zSig;
2010 rem += ( ( (bits64) zSig )<<1 ) | 1;
2011 }
2012 zSig |= ( rem != 0 );
2013 }
2014 shift32RightJamming( zSig, 1, &zSig );
2015 roundAndPack:
2016 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2017
2018}
2019
2020/*----------------------------------------------------------------------------
2021| Returns 1 if the single-precision floating-point value `a' is equal to
2022| the corresponding value `b', and 0 otherwise. The comparison is performed
2023| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2024*----------------------------------------------------------------------------*/
2025
2026int float32_eq( float32 a, float32 b STATUS_PARAM )
2027{
2028
2029 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2030 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2031 ) {
2032 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2033 float_raise( float_flag_invalid STATUS_VAR);
2034 }
2035 return 0;
2036 }
2037 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2038
2039}
2040
2041/*----------------------------------------------------------------------------
2042| Returns 1 if the single-precision floating-point value `a' is less than
2043| or equal to the corresponding value `b', and 0 otherwise. The comparison
2044| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2045| Arithmetic.
2046*----------------------------------------------------------------------------*/
2047
2048int float32_le( float32 a, float32 b STATUS_PARAM )
2049{
2050 flag aSign, bSign;
2051
2052 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2053 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2054 ) {
2055 float_raise( float_flag_invalid STATUS_VAR);
2056 return 0;
2057 }
2058 aSign = extractFloat32Sign( a );
2059 bSign = extractFloat32Sign( b );
2060 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2061 return ( a == b ) || ( aSign ^ ( a < b ) );
2062
2063}
2064
2065/*----------------------------------------------------------------------------
2066| Returns 1 if the single-precision floating-point value `a' is less than
2067| the corresponding value `b', and 0 otherwise. The comparison is performed
2068| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2069*----------------------------------------------------------------------------*/
2070
2071int float32_lt( float32 a, float32 b STATUS_PARAM )
2072{
2073 flag aSign, bSign;
2074
2075 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2076 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2077 ) {
2078 float_raise( float_flag_invalid STATUS_VAR);
2079 return 0;
2080 }
2081 aSign = extractFloat32Sign( a );
2082 bSign = extractFloat32Sign( b );
2083 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2084 return ( a != b ) && ( aSign ^ ( a < b ) );
2085
2086}
2087
2088/*----------------------------------------------------------------------------
2089| Returns 1 if the single-precision floating-point value `a' is equal to
2090| the corresponding value `b', and 0 otherwise. The invalid exception is
2091| raised if either operand is a NaN. Otherwise, the comparison is performed
2092| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2093*----------------------------------------------------------------------------*/
2094
2095int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2096{
2097
2098 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2099 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2100 ) {
2101 float_raise( float_flag_invalid STATUS_VAR);
2102 return 0;
2103 }
2104 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2105
2106}
2107
2108/*----------------------------------------------------------------------------
2109| Returns 1 if the single-precision floating-point value `a' is less than or
2110| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2111| cause an exception. Otherwise, the comparison is performed according to the
2112| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2113*----------------------------------------------------------------------------*/
2114
2115int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2116{
2117 flag aSign, bSign;
2118
2119 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2120 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2121 ) {
2122 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2123 float_raise( float_flag_invalid STATUS_VAR);
2124 }
2125 return 0;
2126 }
2127 aSign = extractFloat32Sign( a );
2128 bSign = extractFloat32Sign( b );
2129 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2130 return ( a == b ) || ( aSign ^ ( a < b ) );
2131
2132}
2133
2134/*----------------------------------------------------------------------------
2135| Returns 1 if the single-precision floating-point value `a' is less than
2136| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2137| exception. Otherwise, the comparison is performed according to the IEC/IEEE
2138| Standard for Binary Floating-Point Arithmetic.
2139*----------------------------------------------------------------------------*/
2140
2141int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2142{
2143 flag aSign, bSign;
2144
2145 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2146 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2147 ) {
2148 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2149 float_raise( float_flag_invalid STATUS_VAR);
2150 }
2151 return 0;
2152 }
2153 aSign = extractFloat32Sign( a );
2154 bSign = extractFloat32Sign( b );
2155 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2156 return ( a != b ) && ( aSign ^ ( a < b ) );
2157
2158}
2159
2160/*----------------------------------------------------------------------------
2161| Returns the result of converting the double-precision floating-point value
2162| `a' to the 32-bit two's complement integer format. The conversion is
2163| performed according to the IEC/IEEE Standard for Binary Floating-Point
2164| Arithmetic---which means in particular that the conversion is rounded
2165| according to the current rounding mode. If `a' is a NaN, the largest
2166| positive integer is returned. Otherwise, if the conversion overflows, the
2167| largest integer with the same sign as `a' is returned.
2168*----------------------------------------------------------------------------*/
2169
2170int32 float64_to_int32( float64 a STATUS_PARAM )
2171{
2172 flag aSign;
2173 int16 aExp, shiftCount;
2174 bits64 aSig;
2175
2176 aSig = extractFloat64Frac( a );
2177 aExp = extractFloat64Exp( a );
2178 aSign = extractFloat64Sign( a );
2179 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2180 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2181 shiftCount = 0x42C - aExp;
2182 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2183 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2184
2185}
2186
2187/*----------------------------------------------------------------------------
2188| Returns the result of converting the double-precision floating-point value
2189| `a' to the 32-bit two's complement integer format. The conversion is
2190| performed according to the IEC/IEEE Standard for Binary Floating-Point
2191| Arithmetic, except that the conversion is always rounded toward zero.
2192| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2193| the conversion overflows, the largest integer with the same sign as `a' is
2194| returned.
2195*----------------------------------------------------------------------------*/
2196
2197int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2198{
2199 flag aSign;
2200 int16 aExp, shiftCount;
2201 bits64 aSig, savedASig;
2202 int32 z;
2203
2204 aSig = extractFloat64Frac( a );
2205 aExp = extractFloat64Exp( a );
2206 aSign = extractFloat64Sign( a );
2207 if ( 0x41E < aExp ) {
2208 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2209 goto invalid;
2210 }
2211 else if ( aExp < 0x3FF ) {
2212 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2213 return 0;
2214 }
2215 aSig |= LIT64( 0x0010000000000000 );
2216 shiftCount = 0x433 - aExp;
2217 savedASig = aSig;
2218 aSig >>= shiftCount;
2219 z = aSig;
2220 if ( aSign ) z = - z;
2221 if ( ( z < 0 ) ^ aSign ) {
2222 invalid:
2223 float_raise( float_flag_invalid STATUS_VAR);
2224 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2225 }
2226 if ( ( aSig<<shiftCount ) != savedASig ) {
2227 STATUS(float_exception_flags) |= float_flag_inexact;
2228 }
2229 return z;
2230
2231}
2232
2233/*----------------------------------------------------------------------------
2234| Returns the result of converting the double-precision floating-point value
2235| `a' to the 64-bit two's complement integer format. The conversion is
2236| performed according to the IEC/IEEE Standard for Binary Floating-Point
2237| Arithmetic---which means in particular that the conversion is rounded
2238| according to the current rounding mode. If `a' is a NaN, the largest
2239| positive integer is returned. Otherwise, if the conversion overflows, the
2240| largest integer with the same sign as `a' is returned.
2241*----------------------------------------------------------------------------*/
2242
2243int64 float64_to_int64( float64 a STATUS_PARAM )
2244{
2245 flag aSign;
2246 int16 aExp, shiftCount;
2247 bits64 aSig, aSigExtra;
2248
2249 aSig = extractFloat64Frac( a );
2250 aExp = extractFloat64Exp( a );
2251 aSign = extractFloat64Sign( a );
2252 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2253 shiftCount = 0x433 - aExp;
2254 if ( shiftCount <= 0 ) {
2255 if ( 0x43E < aExp ) {
2256 float_raise( float_flag_invalid STATUS_VAR);
2257 if ( ! aSign
2258 || ( ( aExp == 0x7FF )
2259 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2260 ) {
2261 return LIT64( 0x7FFFFFFFFFFFFFFF );
2262 }
2263 return (sbits64) LIT64( 0x8000000000000000 );
2264 }
2265 aSigExtra = 0;
2266 aSig <<= - shiftCount;
2267 }
2268 else {
2269 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2270 }
2271 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2272
2273}
2274
2275/*----------------------------------------------------------------------------
2276| Returns the result of converting the double-precision floating-point value
2277| `a' to the 64-bit two's complement integer format. The conversion is
2278| performed according to the IEC/IEEE Standard for Binary Floating-Point
2279| Arithmetic, except that the conversion is always rounded toward zero.
2280| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2281| the conversion overflows, the largest integer with the same sign as `a' is
2282| returned.
2283*----------------------------------------------------------------------------*/
2284
2285int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2286{
2287 flag aSign;
2288 int16 aExp, shiftCount;
2289 bits64 aSig;
2290 int64 z;
2291
2292 aSig = extractFloat64Frac( a );
2293 aExp = extractFloat64Exp( a );
2294 aSign = extractFloat64Sign( a );
2295 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2296 shiftCount = aExp - 0x433;
2297 if ( 0 <= shiftCount ) {
2298 if ( 0x43E <= aExp ) {
2299 if ( a != LIT64( 0xC3E0000000000000 ) ) {
2300 float_raise( float_flag_invalid STATUS_VAR);
2301 if ( ! aSign
2302 || ( ( aExp == 0x7FF )
2303 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2304 ) {
2305 return LIT64( 0x7FFFFFFFFFFFFFFF );
2306 }
2307 }
2308 return (sbits64) LIT64( 0x8000000000000000 );
2309 }
2310 z = aSig<<shiftCount;
2311 }
2312 else {
2313 if ( aExp < 0x3FE ) {
2314 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2315 return 0;
2316 }
2317 z = aSig>>( - shiftCount );
2318 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2319 STATUS(float_exception_flags) |= float_flag_inexact;
2320 }
2321 }
2322 if ( aSign ) z = - z;
2323 return z;
2324
2325}
2326
2327/*----------------------------------------------------------------------------
2328| Returns the result of converting the double-precision floating-point value
2329| `a' to the single-precision floating-point format. The conversion is
2330| performed according to the IEC/IEEE Standard for Binary Floating-Point
2331| Arithmetic.
2332*----------------------------------------------------------------------------*/
2333
2334float32 float64_to_float32( float64 a STATUS_PARAM )
2335{
2336 flag aSign;
2337 int16 aExp;
2338 bits64 aSig;
2339 bits32 zSig;
2340
2341 aSig = extractFloat64Frac( a );
2342 aExp = extractFloat64Exp( a );
2343 aSign = extractFloat64Sign( a );
2344 if ( aExp == 0x7FF ) {
2345 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2346 return packFloat32( aSign, 0xFF, 0 );
2347 }
2348 shift64RightJamming( aSig, 22, &aSig );
2349 zSig = aSig;
2350 if ( aExp || zSig ) {
2351 zSig |= 0x40000000;
2352 aExp -= 0x381;
2353 }
2354 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2355
2356}
2357
2358#ifdef FLOATX80
2359
2360/*----------------------------------------------------------------------------
2361| Returns the result of converting the double-precision floating-point value
2362| `a' to the extended double-precision floating-point format. The conversion
2363| is performed according to the IEC/IEEE Standard for Binary Floating-Point
2364| Arithmetic.
2365*----------------------------------------------------------------------------*/
2366
2367floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2368{
2369 flag aSign;
2370 int16 aExp;
2371 bits64 aSig;
2372
2373 aSig = extractFloat64Frac( a );
2374 aExp = extractFloat64Exp( a );
2375 aSign = extractFloat64Sign( a );
2376 if ( aExp == 0x7FF ) {
2377 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2378 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2379 }
2380 if ( aExp == 0 ) {
2381 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2382 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2383 }
2384 return
2385 packFloatx80(
2386 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2387
2388}
2389
2390#endif
2391
2392#ifdef FLOAT128
2393
2394/*----------------------------------------------------------------------------
2395| Returns the result of converting the double-precision floating-point value
2396| `a' to the quadruple-precision floating-point format. The conversion is
2397| performed according to the IEC/IEEE Standard for Binary Floating-Point
2398| Arithmetic.
2399*----------------------------------------------------------------------------*/
2400
2401float128 float64_to_float128( float64 a STATUS_PARAM )
2402{
2403 flag aSign;
2404 int16 aExp;
2405 bits64 aSig, zSig0, zSig1;
2406
2407 aSig = extractFloat64Frac( a );
2408 aExp = extractFloat64Exp( a );
2409 aSign = extractFloat64Sign( a );
2410 if ( aExp == 0x7FF ) {
2411 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2412 return packFloat128( aSign, 0x7FFF, 0, 0 );
2413 }
2414 if ( aExp == 0 ) {
2415 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2416 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2417 --aExp;
2418 }
2419 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2420 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2421
2422}
2423
2424#endif
2425
2426/*----------------------------------------------------------------------------
2427| Rounds the double-precision floating-point value `a' to an integer, and
2428| returns the result as a double-precision floating-point value. The
2429| operation is performed according to the IEC/IEEE Standard for Binary
2430| Floating-Point Arithmetic.
2431*----------------------------------------------------------------------------*/
2432
2433float64 float64_round_to_int( float64 a STATUS_PARAM )
2434{
2435 flag aSign;
2436 int16 aExp;
2437 bits64 lastBitMask, roundBitsMask;
2438 int8 roundingMode;
2439 float64 z;
2440
2441 aExp = extractFloat64Exp( a );
2442 if ( 0x433 <= aExp ) {
2443 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2444 return propagateFloat64NaN( a, a STATUS_VAR );
2445 }
2446 return a;
2447 }
2448 if ( aExp < 0x3FF ) {
2449 if ( (bits64) ( a<<1 ) == 0 ) return a;
2450 STATUS(float_exception_flags) |= float_flag_inexact;
2451 aSign = extractFloat64Sign( a );
2452 switch ( STATUS(float_rounding_mode) ) {
2453 case float_round_nearest_even:
2454 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2455 return packFloat64( aSign, 0x3FF, 0 );
2456 }
2457 break;
2458 case float_round_down:
2459 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2460 case float_round_up:
2461 return
2462 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2463 }
2464 return packFloat64( aSign, 0, 0 );
2465 }
2466 lastBitMask = 1;
2467 lastBitMask <<= 0x433 - aExp;
2468 roundBitsMask = lastBitMask - 1;
2469 z = a;
2470 roundingMode = STATUS(float_rounding_mode);
2471 if ( roundingMode == float_round_nearest_even ) {
2472 z += lastBitMask>>1;
2473 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2474 }
2475 else if ( roundingMode != float_round_to_zero ) {
2476 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2477 z += roundBitsMask;
2478 }
2479 }
2480 z &= ~ roundBitsMask;
2481 if ( z != a ) STATUS(float_exception_flags) |= float_flag_inexact;
2482 return z;
2483
2484}
2485
2486float64 float64_trunc_to_int( float64 a STATUS_PARAM)
2487{
2488 int oldmode;
2489 float64 res;
2490 oldmode = STATUS(float_rounding_mode);
2491 STATUS(float_rounding_mode) = float_round_to_zero;
2492 res = float64_round_to_int(a STATUS_VAR);
2493 STATUS(float_rounding_mode) = oldmode;
2494 return res;
2495}
2496
2497/*----------------------------------------------------------------------------
2498| Returns the result of adding the absolute values of the double-precision
2499| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2500| before being returned. `zSign' is ignored if the result is a NaN.
2501| The addition is performed according to the IEC/IEEE Standard for Binary
2502| Floating-Point Arithmetic.
2503*----------------------------------------------------------------------------*/
2504
2505static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2506{
2507 int16 aExp, bExp, zExp;
2508 bits64 aSig, bSig, zSig;
2509 int16 expDiff;
2510
2511 aSig = extractFloat64Frac( a );
2512 aExp = extractFloat64Exp( a );
2513 bSig = extractFloat64Frac( b );
2514 bExp = extractFloat64Exp( b );
2515 expDiff = aExp - bExp;
2516 aSig <<= 9;
2517 bSig <<= 9;
2518 if ( 0 < expDiff ) {
2519 if ( aExp == 0x7FF ) {
2520 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2521 return a;
2522 }
2523 if ( bExp == 0 ) {
2524 --expDiff;
2525 }
2526 else {
2527 bSig |= LIT64( 0x2000000000000000 );
2528 }
2529 shift64RightJamming( bSig, expDiff, &bSig );
2530 zExp = aExp;
2531 }
2532 else if ( expDiff < 0 ) {
2533 if ( bExp == 0x7FF ) {
2534 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2535 return packFloat64( zSign, 0x7FF, 0 );
2536 }
2537 if ( aExp == 0 ) {
2538 ++expDiff;
2539 }
2540 else {
2541 aSig |= LIT64( 0x2000000000000000 );
2542 }
2543 shift64RightJamming( aSig, - expDiff, &aSig );
2544 zExp = bExp;
2545 }
2546 else {
2547 if ( aExp == 0x7FF ) {
2548 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2549 return a;
2550 }
2551 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2552 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2553 zExp = aExp;
2554 goto roundAndPack;
2555 }
2556 aSig |= LIT64( 0x2000000000000000 );
2557 zSig = ( aSig + bSig )<<1;
2558 --zExp;
2559 if ( (sbits64) zSig < 0 ) {
2560 zSig = aSig + bSig;
2561 ++zExp;
2562 }
2563 roundAndPack:
2564 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2565
2566}
2567
2568/*----------------------------------------------------------------------------
2569| Returns the result of subtracting the absolute values of the double-
2570| precision floating-point values `a' and `b'. If `zSign' is 1, the
2571| difference is negated before being returned. `zSign' is ignored if the
2572| result is a NaN. The subtraction is performed according to the IEC/IEEE
2573| Standard for Binary Floating-Point Arithmetic.
2574*----------------------------------------------------------------------------*/
2575
2576static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2577{
2578 int16 aExp, bExp, zExp;
2579 bits64 aSig, bSig, zSig;
2580 int16 expDiff;
2581
2582 aSig = extractFloat64Frac( a );
2583 aExp = extractFloat64Exp( a );
2584 bSig = extractFloat64Frac( b );
2585 bExp = extractFloat64Exp( b );
2586 expDiff = aExp - bExp;
2587 aSig <<= 10;
2588 bSig <<= 10;
2589 if ( 0 < expDiff ) goto aExpBigger;
2590 if ( expDiff < 0 ) goto bExpBigger;
2591 if ( aExp == 0x7FF ) {
2592 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2593 float_raise( float_flag_invalid STATUS_VAR);
2594 return float64_default_nan;
2595 }
2596 if ( aExp == 0 ) {
2597 aExp = 1;
2598 bExp = 1;
2599 }
2600 if ( bSig < aSig ) goto aBigger;
2601 if ( aSig < bSig ) goto bBigger;
2602 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2603 bExpBigger:
2604 if ( bExp == 0x7FF ) {
2605 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2606 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2607 }
2608 if ( aExp == 0 ) {
2609 ++expDiff;
2610 }
2611 else {
2612 aSig |= LIT64( 0x4000000000000000 );
2613 }
2614 shift64RightJamming( aSig, - expDiff, &aSig );
2615 bSig |= LIT64( 0x4000000000000000 );
2616 bBigger:
2617 zSig = bSig - aSig;
2618 zExp = bExp;
2619 zSign ^= 1;
2620 goto normalizeRoundAndPack;
2621 aExpBigger:
2622 if ( aExp == 0x7FF ) {
2623 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2624 return a;
2625 }
2626 if ( bExp == 0 ) {
2627 --expDiff;
2628 }
2629 else {
2630 bSig |= LIT64( 0x4000000000000000 );
2631 }
2632 shift64RightJamming( bSig, expDiff, &bSig );
2633 aSig |= LIT64( 0x4000000000000000 );
2634 aBigger:
2635 zSig = aSig - bSig;
2636 zExp = aExp;
2637 normalizeRoundAndPack:
2638 --zExp;
2639 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2640
2641}
2642
2643/*----------------------------------------------------------------------------
2644| Returns the result of adding the double-precision floating-point values `a'
2645| and `b'. The operation is performed according to the IEC/IEEE Standard for
2646| Binary Floating-Point Arithmetic.
2647*----------------------------------------------------------------------------*/
2648
2649float64 float64_add( float64 a, float64 b STATUS_PARAM )
2650{
2651 flag aSign, bSign;
2652
2653 aSign = extractFloat64Sign( a );
2654 bSign = extractFloat64Sign( b );
2655 if ( aSign == bSign ) {
2656 return addFloat64Sigs( a, b, aSign STATUS_VAR );
2657 }
2658 else {
2659 return subFloat64Sigs( a, b, aSign STATUS_VAR );
2660 }
2661
2662}
2663
2664/*----------------------------------------------------------------------------
2665| Returns the result of subtracting the double-precision floating-point values
2666| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2667| for Binary Floating-Point Arithmetic.
2668*----------------------------------------------------------------------------*/
2669
2670float64 float64_sub( float64 a, float64 b STATUS_PARAM )
2671{
2672 flag aSign, bSign;
2673
2674 aSign = extractFloat64Sign( a );
2675 bSign = extractFloat64Sign( b );
2676 if ( aSign == bSign ) {
2677 return subFloat64Sigs( a, b, aSign STATUS_VAR );
2678 }
2679 else {
2680 return addFloat64Sigs( a, b, aSign STATUS_VAR );
2681 }
2682
2683}
2684
2685/*----------------------------------------------------------------------------
2686| Returns the result of multiplying the double-precision floating-point values
2687| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2688| for Binary Floating-Point Arithmetic.
2689*----------------------------------------------------------------------------*/
2690
2691float64 float64_mul( float64 a, float64 b STATUS_PARAM )
2692{
2693 flag aSign, bSign, zSign;
2694 int16 aExp, bExp, zExp;
2695 bits64 aSig, bSig, zSig0, zSig1;
2696
2697 aSig = extractFloat64Frac( a );
2698 aExp = extractFloat64Exp( a );
2699 aSign = extractFloat64Sign( a );
2700 bSig = extractFloat64Frac( b );
2701 bExp = extractFloat64Exp( b );
2702 bSign = extractFloat64Sign( b );
2703 zSign = aSign ^ bSign;
2704 if ( aExp == 0x7FF ) {
2705 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2706 return propagateFloat64NaN( a, b STATUS_VAR );
2707 }
2708 if ( ( bExp | bSig ) == 0 ) {
2709 float_raise( float_flag_invalid STATUS_VAR);
2710 return float64_default_nan;
2711 }
2712 return packFloat64( zSign, 0x7FF, 0 );
2713 }
2714 if ( bExp == 0x7FF ) {
2715 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2716 if ( ( aExp | aSig ) == 0 ) {
2717 float_raise( float_flag_invalid STATUS_VAR);
2718 return float64_default_nan;
2719 }
2720 return packFloat64( zSign, 0x7FF, 0 );
2721 }
2722 if ( aExp == 0 ) {
2723 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2724 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2725 }
2726 if ( bExp == 0 ) {
2727 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2728 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2729 }
2730 zExp = aExp + bExp - 0x3FF;
2731 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2732 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2733 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2734 zSig0 |= ( zSig1 != 0 );
2735 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2736 zSig0 <<= 1;
2737 --zExp;
2738 }
2739 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
2740
2741}
2742
2743/*----------------------------------------------------------------------------
2744| Returns the result of dividing the double-precision floating-point value `a'
2745| by the corresponding value `b'. The operation is performed according to
2746| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2747*----------------------------------------------------------------------------*/
2748
2749float64 float64_div( float64 a, float64 b STATUS_PARAM )
2750{
2751 flag aSign, bSign, zSign;
2752 int16 aExp, bExp, zExp;
2753 bits64 aSig, bSig, zSig;
2754 bits64 rem0, rem1;
2755 bits64 term0, term1;
2756
2757 aSig = extractFloat64Frac( a );
2758 aExp = extractFloat64Exp( a );
2759 aSign = extractFloat64Sign( a );
2760 bSig = extractFloat64Frac( b );
2761 bExp = extractFloat64Exp( b );
2762 bSign = extractFloat64Sign( b );
2763 zSign = aSign ^ bSign;
2764 if ( aExp == 0x7FF ) {
2765 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2766 if ( bExp == 0x7FF ) {
2767 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2768 float_raise( float_flag_invalid STATUS_VAR);
2769 return float64_default_nan;
2770 }
2771 return packFloat64( zSign, 0x7FF, 0 );
2772 }
2773 if ( bExp == 0x7FF ) {
2774 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2775 return packFloat64( zSign, 0, 0 );
2776 }
2777 if ( bExp == 0 ) {
2778 if ( bSig == 0 ) {
2779 if ( ( aExp | aSig ) == 0 ) {
2780 float_raise( float_flag_invalid STATUS_VAR);
2781 return float64_default_nan;
2782 }
2783 float_raise( float_flag_divbyzero STATUS_VAR);
2784 return packFloat64( zSign, 0x7FF, 0 );
2785 }
2786 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2787 }
2788 if ( aExp == 0 ) {
2789 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2790 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2791 }
2792 zExp = aExp - bExp + 0x3FD;
2793 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2794 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2795 if ( bSig <= ( aSig + aSig ) ) {
2796 aSig >>= 1;
2797 ++zExp;
2798 }
2799 zSig = estimateDiv128To64( aSig, 0, bSig );
2800 if ( ( zSig & 0x1FF ) <= 2 ) {
2801 mul64To128( bSig, zSig, &term0, &term1 );
2802 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2803 while ( (sbits64) rem0 < 0 ) {
2804 --zSig;
2805 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2806 }
2807 zSig |= ( rem1 != 0 );
2808 }
2809 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
2810
2811}
2812
2813/*----------------------------------------------------------------------------
2814| Returns the remainder of the double-precision floating-point value `a'
2815| with respect to the corresponding value `b'. The operation is performed
2816| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2817*----------------------------------------------------------------------------*/
2818
2819float64 float64_rem( float64 a, float64 b STATUS_PARAM )
2820{
2821 flag aSign, bSign, zSign;
2822 int16 aExp, bExp, expDiff;
2823 bits64 aSig, bSig;
2824 bits64 q, alternateASig;
2825 sbits64 sigMean;
2826
2827 aSig = extractFloat64Frac( a );
2828 aExp = extractFloat64Exp( a );
2829 aSign = extractFloat64Sign( a );
2830 bSig = extractFloat64Frac( b );
2831 bExp = extractFloat64Exp( b );
2832 bSign = extractFloat64Sign( b );
2833 if ( aExp == 0x7FF ) {
2834 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2835 return propagateFloat64NaN( a, b STATUS_VAR );
2836 }
2837 float_raise( float_flag_invalid STATUS_VAR);
2838 return float64_default_nan;
2839 }
2840 if ( bExp == 0x7FF ) {
2841 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
2842 return a;
2843 }
2844 if ( bExp == 0 ) {
2845 if ( bSig == 0 ) {
2846 float_raise( float_flag_invalid STATUS_VAR);
2847 return float64_default_nan;
2848 }
2849 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2850 }
2851 if ( aExp == 0 ) {
2852 if ( aSig == 0 ) return a;
2853 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2854 }
2855 expDiff = aExp - bExp;
2856 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2857 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2858 if ( expDiff < 0 ) {
2859 if ( expDiff < -1 ) return a;
2860 aSig >>= 1;
2861 }
2862 q = ( bSig <= aSig );
2863 if ( q ) aSig -= bSig;
2864 expDiff -= 64;
2865 while ( 0 < expDiff ) {
2866 q = estimateDiv128To64( aSig, 0, bSig );
2867 q = ( 2 < q ) ? q - 2 : 0;
2868 aSig = - ( ( bSig>>2 ) * q );
2869 expDiff -= 62;
2870 }
2871 expDiff += 64;
2872 if ( 0 < expDiff ) {
2873 q = estimateDiv128To64( aSig, 0, bSig );
2874 q = ( 2 < q ) ? q - 2 : 0;
2875 q >>= 64 - expDiff;
2876 bSig >>= 2;
2877 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2878 }
2879 else {
2880 aSig >>= 2;
2881 bSig >>= 2;
2882 }
2883 do {
2884 alternateASig = aSig;
2885 ++q;
2886 aSig -= bSig;
2887 } while ( 0 <= (sbits64) aSig );
2888 sigMean = aSig + alternateASig;
2889 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2890 aSig = alternateASig;
2891 }
2892 zSign = ( (sbits64) aSig < 0 );
2893 if ( zSign ) aSig = - aSig;
2894 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
2895
2896}
2897
2898/*----------------------------------------------------------------------------
2899| Returns the square root of the double-precision floating-point value `a'.
2900| The operation is performed according to the IEC/IEEE Standard for Binary
2901| Floating-Point Arithmetic.
2902*----------------------------------------------------------------------------*/
2903
2904float64 float64_sqrt( float64 a STATUS_PARAM )
2905{
2906 flag aSign;
2907 int16 aExp, zExp;
2908 bits64 aSig, zSig, doubleZSig;
2909 bits64 rem0, rem1, term0, term1;
2910
2911 aSig = extractFloat64Frac( a );
2912 aExp = extractFloat64Exp( a );
2913 aSign = extractFloat64Sign( a );
2914 if ( aExp == 0x7FF ) {
2915 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
2916 if ( ! aSign ) return a;
2917 float_raise( float_flag_invalid STATUS_VAR);
2918 return float64_default_nan;
2919 }
2920 if ( aSign ) {
2921 if ( ( aExp | aSig ) == 0 ) return a;
2922 float_raise( float_flag_invalid STATUS_VAR);
2923 return float64_default_nan;
2924 }
2925 if ( aExp == 0 ) {
2926 if ( aSig == 0 ) return 0;
2927 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2928 }
2929 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2930 aSig |= LIT64( 0x0010000000000000 );
2931 zSig = estimateSqrt32( aExp, aSig>>21 );
2932 aSig <<= 9 - ( aExp & 1 );
2933 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
2934 if ( ( zSig & 0x1FF ) <= 5 ) {
2935 doubleZSig = zSig<<1;
2936 mul64To128( zSig, zSig, &term0, &term1 );
2937 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2938 while ( (sbits64) rem0 < 0 ) {
2939 --zSig;
2940 doubleZSig -= 2;
2941 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
2942 }
2943 zSig |= ( ( rem0 | rem1 ) != 0 );
2944 }
2945 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
2946
2947}
2948
2949/*----------------------------------------------------------------------------
2950| Returns 1 if the double-precision floating-point value `a' is equal to the
2951| corresponding value `b', and 0 otherwise. The comparison is performed
2952| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2953*----------------------------------------------------------------------------*/
2954
2955int float64_eq( float64 a, float64 b STATUS_PARAM )
2956{
2957
2958 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2959 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2960 ) {
2961 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2962 float_raise( float_flag_invalid STATUS_VAR);
2963 }
2964 return 0;
2965 }
2966 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2967
2968}
2969
2970/*----------------------------------------------------------------------------
2971| Returns 1 if the double-precision floating-point value `a' is less than or
2972| equal to the corresponding value `b', and 0 otherwise. The comparison is
2973| performed according to the IEC/IEEE Standard for Binary Floating-Point
2974| Arithmetic.
2975*----------------------------------------------------------------------------*/
2976
2977int float64_le( float64 a, float64 b STATUS_PARAM )
2978{
2979 flag aSign, bSign;
2980
2981 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2982 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2983 ) {
2984 float_raise( float_flag_invalid STATUS_VAR);
2985 return 0;
2986 }
2987 aSign = extractFloat64Sign( a );
2988 bSign = extractFloat64Sign( b );
2989 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2990 return ( a == b ) || ( aSign ^ ( a < b ) );
2991
2992}
2993
2994/*----------------------------------------------------------------------------
2995| Returns 1 if the double-precision floating-point value `a' is less than
2996| the corresponding value `b', and 0 otherwise. The comparison is performed
2997| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2998*----------------------------------------------------------------------------*/
2999
3000int float64_lt( float64 a, float64 b STATUS_PARAM )
3001{
3002 flag aSign, bSign;
3003
3004 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3005 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3006 ) {
3007 float_raise( float_flag_invalid STATUS_VAR);
3008 return 0;
3009 }
3010 aSign = extractFloat64Sign( a );
3011 bSign = extractFloat64Sign( b );
3012 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3013 return ( a != b ) && ( aSign ^ ( a < b ) );
3014
3015}
3016
3017/*----------------------------------------------------------------------------
3018| Returns 1 if the double-precision floating-point value `a' is equal to the
3019| corresponding value `b', and 0 otherwise. The invalid exception is raised
3020| if either operand is a NaN. Otherwise, the comparison is performed
3021| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3022*----------------------------------------------------------------------------*/
3023
3024int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3025{
3026
3027 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3028 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3029 ) {
3030 float_raise( float_flag_invalid STATUS_VAR);
3031 return 0;
3032 }
3033 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3034
3035}
3036
3037/*----------------------------------------------------------------------------
3038| Returns 1 if the double-precision floating-point value `a' is less than or
3039| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3040| cause an exception. Otherwise, the comparison is performed according to the
3041| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3042*----------------------------------------------------------------------------*/
3043
3044int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3045{
3046 flag aSign, bSign;
3047
3048 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3049 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3050 ) {
3051 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3052 float_raise( float_flag_invalid STATUS_VAR);
3053 }
3054 return 0;
3055 }
3056 aSign = extractFloat64Sign( a );
3057 bSign = extractFloat64Sign( b );
3058 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3059 return ( a == b ) || ( aSign ^ ( a < b ) );
3060
3061}
3062
3063/*----------------------------------------------------------------------------
3064| Returns 1 if the double-precision floating-point value `a' is less than
3065| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3066| exception. Otherwise, the comparison is performed according to the IEC/IEEE
3067| Standard for Binary Floating-Point Arithmetic.
3068*----------------------------------------------------------------------------*/
3069
3070int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3071{
3072 flag aSign, bSign;
3073
3074 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3075 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3076 ) {
3077 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3078 float_raise( float_flag_invalid STATUS_VAR);
3079 }
3080 return 0;
3081 }
3082 aSign = extractFloat64Sign( a );
3083 bSign = extractFloat64Sign( b );
3084 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3085 return ( a != b ) && ( aSign ^ ( a < b ) );
3086
3087}
3088
3089#ifdef FLOATX80
3090
3091/*----------------------------------------------------------------------------
3092| Returns the result of converting the extended double-precision floating-
3093| point value `a' to the 32-bit two's complement integer format. The
3094| conversion is performed according to the IEC/IEEE Standard for Binary
3095| Floating-Point Arithmetic---which means in particular that the conversion
3096| is rounded according to the current rounding mode. If `a' is a NaN, the
3097| largest positive integer is returned. Otherwise, if the conversion
3098| overflows, the largest integer with the same sign as `a' is returned.
3099*----------------------------------------------------------------------------*/
3100
3101int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3102{
3103 flag aSign;
3104 int32 aExp, shiftCount;
3105 bits64 aSig;
3106
3107 aSig = extractFloatx80Frac( a );
3108 aExp = extractFloatx80Exp( a );
3109 aSign = extractFloatx80Sign( a );
3110 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3111 shiftCount = 0x4037 - aExp;
3112 if ( shiftCount <= 0 ) shiftCount = 1;
3113 shift64RightJamming( aSig, shiftCount, &aSig );
3114 return roundAndPackInt32( aSign, aSig STATUS_VAR );
3115
3116}
3117
3118/*----------------------------------------------------------------------------
3119| Returns the result of converting the extended double-precision floating-
3120| point value `a' to the 32-bit two's complement integer format. The
3121| conversion is performed according to the IEC/IEEE Standard for Binary
3122| Floating-Point Arithmetic, except that the conversion is always rounded
3123| toward zero. If `a' is a NaN, the largest positive integer is returned.
3124| Otherwise, if the conversion overflows, the largest integer with the same
3125| sign as `a' is returned.
3126*----------------------------------------------------------------------------*/
3127
3128int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3129{
3130 flag aSign;
3131 int32 aExp, shiftCount;
3132 bits64 aSig, savedASig;
3133 int32 z;
3134
3135 aSig = extractFloatx80Frac( a );
3136 aExp = extractFloatx80Exp( a );
3137 aSign = extractFloatx80Sign( a );
3138 if ( 0x401E < aExp ) {
3139 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3140 goto invalid;
3141 }
3142 else if ( aExp < 0x3FFF ) {
3143 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3144 return 0;
3145 }
3146 shiftCount = 0x403E - aExp;
3147 savedASig = aSig;
3148 aSig >>= shiftCount;
3149 z = aSig;
3150 if ( aSign ) z = - z;
3151 if ( ( z < 0 ) ^ aSign ) {
3152 invalid:
3153 float_raise( float_flag_invalid STATUS_VAR);
3154 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3155 }
3156 if ( ( aSig<<shiftCount ) != savedASig ) {
3157 STATUS(float_exception_flags) |= float_flag_inexact;
3158 }
3159 return z;
3160
3161}
3162
3163/*----------------------------------------------------------------------------
3164| Returns the result of converting the extended double-precision floating-
3165| point value `a' to the 64-bit two's complement integer format. The
3166| conversion is performed according to the IEC/IEEE Standard for Binary
3167| Floating-Point Arithmetic---which means in particular that the conversion
3168| is rounded according to the current rounding mode. If `a' is a NaN,
3169| the largest positive integer is returned. Otherwise, if the conversion
3170| overflows, the largest integer with the same sign as `a' is returned.
3171*----------------------------------------------------------------------------*/
3172
3173int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3174{
3175 flag aSign;
3176 int32 aExp, shiftCount;
3177 bits64 aSig, aSigExtra;
3178
3179 aSig = extractFloatx80Frac( a );
3180 aExp = extractFloatx80Exp( a );
3181 aSign = extractFloatx80Sign( a );
3182 shiftCount = 0x403E - aExp;
3183 if ( shiftCount <= 0 ) {
3184 if ( shiftCount ) {
3185 float_raise( float_flag_invalid STATUS_VAR);
3186 if ( ! aSign
3187 || ( ( aExp == 0x7FFF )
3188 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3189 ) {
3190 return LIT64( 0x7FFFFFFFFFFFFFFF );
3191 }
3192 return (sbits64) LIT64( 0x8000000000000000 );
3193 }
3194 aSigExtra = 0;
3195 }
3196 else {
3197 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3198 }
3199 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3200
3201}
3202
3203/*----------------------------------------------------------------------------
3204| Returns the result of converting the extended double-precision floating-
3205| point value `a' to the 64-bit two's complement integer format. The
3206| conversion is performed according to the IEC/IEEE Standard for Binary
3207| Floating-Point Arithmetic, except that the conversion is always rounded
3208| toward zero. If `a' is a NaN, the largest positive integer is returned.
3209| Otherwise, if the conversion overflows, the largest integer with the same
3210| sign as `a' is returned.
3211*----------------------------------------------------------------------------*/
3212
3213int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3214{
3215 flag aSign;
3216 int32 aExp, shiftCount;
3217 bits64 aSig;
3218 int64 z;
3219
3220 aSig = extractFloatx80Frac( a );
3221 aExp = extractFloatx80Exp( a );
3222 aSign = extractFloatx80Sign( a );
3223 shiftCount = aExp - 0x403E;
3224 if ( 0 <= shiftCount ) {
3225 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3226 if ( ( a.high != 0xC03E ) || aSig ) {
3227 float_raise( float_flag_invalid STATUS_VAR);
3228 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3229 return LIT64( 0x7FFFFFFFFFFFFFFF );
3230 }
3231 }
3232 return (sbits64) LIT64( 0x8000000000000000 );
3233 }
3234 else if ( aExp < 0x3FFF ) {
3235 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3236 return 0;
3237 }
3238 z = aSig>>( - shiftCount );
3239 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3240 STATUS(float_exception_flags) |= float_flag_inexact;
3241 }
3242 if ( aSign ) z = - z;
3243 return z;
3244
3245}
3246
3247/*----------------------------------------------------------------------------
3248| Returns the result of converting the extended double-precision floating-
3249| point value `a' to the single-precision floating-point format. The
3250| conversion is performed according to the IEC/IEEE Standard for Binary
3251| Floating-Point Arithmetic.
3252*----------------------------------------------------------------------------*/
3253
3254float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3255{
3256 flag aSign;
3257 int32 aExp;
3258 bits64 aSig;
3259
3260 aSig = extractFloatx80Frac( a );
3261 aExp = extractFloatx80Exp( a );
3262 aSign = extractFloatx80Sign( a );
3263 if ( aExp == 0x7FFF ) {
3264 if ( (bits64) ( aSig<<1 ) ) {
3265 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3266 }
3267 return packFloat32( aSign, 0xFF, 0 );
3268 }
3269 shift64RightJamming( aSig, 33, &aSig );
3270 if ( aExp || aSig ) aExp -= 0x3F81;
3271 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3272
3273}
3274
3275/*----------------------------------------------------------------------------
3276| Returns the result of converting the extended double-precision floating-
3277| point value `a' to the double-precision floating-point format. The
3278| conversion is performed according to the IEC/IEEE Standard for Binary
3279| Floating-Point Arithmetic.
3280*----------------------------------------------------------------------------*/
3281
3282float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3283{
3284 flag aSign;
3285 int32 aExp;
3286 bits64 aSig, zSig;
3287
3288 aSig = extractFloatx80Frac( a );
3289 aExp = extractFloatx80Exp( a );
3290 aSign = extractFloatx80Sign( a );
3291 if ( aExp == 0x7FFF ) {
3292 if ( (bits64) ( aSig<<1 ) ) {
3293 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3294 }
3295 return packFloat64( aSign, 0x7FF, 0 );
3296 }
3297 shift64RightJamming( aSig, 1, &zSig );
3298 if ( aExp || aSig ) aExp -= 0x3C01;
3299 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3300
3301}
3302
3303#ifdef FLOAT128
3304
3305/*----------------------------------------------------------------------------
3306| Returns the result of converting the extended double-precision floating-
3307| point value `a' to the quadruple-precision floating-point format. The
3308| conversion is performed according to the IEC/IEEE Standard for Binary
3309| Floating-Point Arithmetic.
3310*----------------------------------------------------------------------------*/
3311
3312float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3313{
3314 flag aSign;
3315 int16 aExp;
3316 bits64 aSig, zSig0, zSig1;
3317
3318 aSig = extractFloatx80Frac( a );
3319 aExp = extractFloatx80Exp( a );
3320 aSign = extractFloatx80Sign( a );
3321 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3322 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3323 }
3324 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3325 return packFloat128( aSign, aExp, zSig0, zSig1 );
3326
3327}
3328
3329#endif
3330
3331/*----------------------------------------------------------------------------
3332| Rounds the extended double-precision floating-point value `a' to an integer,
3333| and returns the result as an extended quadruple-precision floating-point
3334| value. The operation is performed according to the IEC/IEEE Standard for
3335| Binary Floating-Point Arithmetic.
3336*----------------------------------------------------------------------------*/
3337
3338floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3339{
3340 flag aSign;
3341 int32 aExp;
3342 bits64 lastBitMask, roundBitsMask;
3343 int8 roundingMode;
3344 floatx80 z;
3345
3346 aExp = extractFloatx80Exp( a );
3347 if ( 0x403E <= aExp ) {
3348 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3349 return propagateFloatx80NaN( a, a STATUS_VAR );
3350 }
3351 return a;
3352 }
3353 if ( aExp < 0x3FFF ) {
3354 if ( ( aExp == 0 )
3355 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3356 return a;
3357 }
3358 STATUS(float_exception_flags) |= float_flag_inexact;
3359 aSign = extractFloatx80Sign( a );
3360 switch ( STATUS(float_rounding_mode) ) {
3361 case float_round_nearest_even:
3362 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3363 ) {
3364 return
3365 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3366 }
3367 break;
3368 case float_round_down:
3369 return
3370 aSign ?
3371 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3372 : packFloatx80( 0, 0, 0 );
3373 case float_round_up:
3374 return
3375 aSign ? packFloatx80( 1, 0, 0 )
3376 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3377 }
3378 return packFloatx80( aSign, 0, 0 );
3379 }
3380 lastBitMask = 1;
3381 lastBitMask <<= 0x403E - aExp;
3382 roundBitsMask = lastBitMask - 1;
3383 z = a;
3384 roundingMode = STATUS(float_rounding_mode);
3385 if ( roundingMode == float_round_nearest_even ) {
3386 z.low += lastBitMask>>1;
3387 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3388 }
3389 else if ( roundingMode != float_round_to_zero ) {
3390 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3391 z.low += roundBitsMask;
3392 }
3393 }
3394 z.low &= ~ roundBitsMask;
3395 if ( z.low == 0 ) {
3396 ++z.high;
3397 z.low = LIT64( 0x8000000000000000 );
3398 }
3399 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
3400 return z;
3401
3402}
3403
3404/*----------------------------------------------------------------------------
3405| Returns the result of adding the absolute values of the extended double-
3406| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3407| negated before being returned. `zSign' is ignored if the result is a NaN.
3408| The addition is performed according to the IEC/IEEE Standard for Binary
3409| Floating-Point Arithmetic.
3410*----------------------------------------------------------------------------*/
3411
3412static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3413{
3414 int32 aExp, bExp, zExp;
3415 bits64 aSig, bSig, zSig0, zSig1;
3416 int32 expDiff;
3417
3418 aSig = extractFloatx80Frac( a );
3419 aExp = extractFloatx80Exp( a );
3420 bSig = extractFloatx80Frac( b );
3421 bExp = extractFloatx80Exp( b );
3422 expDiff = aExp - bExp;
3423 if ( 0 < expDiff ) {
3424 if ( aExp == 0x7FFF ) {
3425 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3426 return a;
3427 }
3428 if ( bExp == 0 ) --expDiff;
3429 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3430 zExp = aExp;
3431 }
3432 else if ( expDiff < 0 ) {
3433 if ( bExp == 0x7FFF ) {
3434 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3435 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3436 }
3437 if ( aExp == 0 ) ++expDiff;
3438 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3439 zExp = bExp;
3440 }
3441 else {
3442 if ( aExp == 0x7FFF ) {
3443 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3444 return propagateFloatx80NaN( a, b STATUS_VAR );
3445 }
3446 return a;
3447 }
3448 zSig1 = 0;
3449 zSig0 = aSig + bSig;
3450 if ( aExp == 0 ) {
3451 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3452 goto roundAndPack;
3453 }
3454 zExp = aExp;
3455 goto shiftRight1;
3456 }
3457 zSig0 = aSig + bSig;
3458 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3459 shiftRight1:
3460 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3461 zSig0 |= LIT64( 0x8000000000000000 );
3462 ++zExp;
3463 roundAndPack:
3464 return
3465 roundAndPackFloatx80(
3466 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3467
3468}
3469
3470/*----------------------------------------------------------------------------
3471| Returns the result of subtracting the absolute values of the extended
3472| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3473| difference is negated before being returned. `zSign' is ignored if the
3474| result is a NaN. The subtraction is performed according to the IEC/IEEE
3475| Standard for Binary Floating-Point Arithmetic.
3476*----------------------------------------------------------------------------*/
3477
3478static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
3479{
3480 int32 aExp, bExp, zExp;
3481 bits64 aSig, bSig, zSig0, zSig1;
3482 int32 expDiff;
3483 floatx80 z;
3484
3485 aSig = extractFloatx80Frac( a );
3486 aExp = extractFloatx80Exp( a );
3487 bSig = extractFloatx80Frac( b );
3488 bExp = extractFloatx80Exp( b );
3489 expDiff = aExp - bExp;
3490 if ( 0 < expDiff ) goto aExpBigger;
3491 if ( expDiff < 0 ) goto bExpBigger;
3492 if ( aExp == 0x7FFF ) {
3493 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3494 return propagateFloatx80NaN( a, b STATUS_VAR );
3495 }
3496 float_raise( float_flag_invalid STATUS_VAR);
3497 z.low = floatx80_default_nan_low;
3498 z.high = floatx80_default_nan_high;
3499 return z;
3500 }
3501 if ( aExp == 0 ) {
3502 aExp = 1;
3503 bExp = 1;
3504 }
3505 zSig1 = 0;
3506 if ( bSig < aSig ) goto aBigger;
3507 if ( aSig < bSig ) goto bBigger;
3508 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3509 bExpBigger:
3510 if ( bExp == 0x7FFF ) {
3511 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3512 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3513 }
3514 if ( aExp == 0 ) ++expDiff;
3515 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3516 bBigger:
3517 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3518 zExp = bExp;
3519 zSign ^= 1;
3520 goto normalizeRoundAndPack;
3521 aExpBigger:
3522 if ( aExp == 0x7FFF ) {
3523 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3524 return a;
3525 }
3526 if ( bExp == 0 ) --expDiff;
3527 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3528 aBigger:
3529 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3530 zExp = aExp;
3531 normalizeRoundAndPack:
3532 return
3533 normalizeRoundAndPackFloatx80(
3534 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3535
3536}
3537
3538/*----------------------------------------------------------------------------
3539| Returns the result of adding the extended double-precision floating-point
3540| values `a' and `b'. The operation is performed according to the IEC/IEEE
3541| Standard for Binary Floating-Point Arithmetic.
3542*----------------------------------------------------------------------------*/
3543
3544floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
3545{
3546 flag aSign, bSign;
3547
3548 aSign = extractFloatx80Sign( a );
3549 bSign = extractFloatx80Sign( b );
3550 if ( aSign == bSign ) {
3551 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3552 }
3553 else {
3554 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3555 }
3556
3557}
3558
3559/*----------------------------------------------------------------------------
3560| Returns the result of subtracting the extended double-precision floating-
3561| point values `a' and `b'. The operation is performed according to the
3562| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3563*----------------------------------------------------------------------------*/
3564
3565floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
3566{
3567 flag aSign, bSign;
3568
3569 aSign = extractFloatx80Sign( a );
3570 bSign = extractFloatx80Sign( b );
3571 if ( aSign == bSign ) {
3572 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
3573 }
3574 else {
3575 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
3576 }
3577
3578}
3579
3580/*----------------------------------------------------------------------------
3581| Returns the result of multiplying the extended double-precision floating-
3582| point values `a' and `b'. The operation is performed according to the
3583| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3584*----------------------------------------------------------------------------*/
3585
3586floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
3587{
3588 flag aSign, bSign, zSign;
3589 int32 aExp, bExp, zExp;
3590 bits64 aSig, bSig, zSig0, zSig1;
3591 floatx80 z;
3592
3593 aSig = extractFloatx80Frac( a );
3594 aExp = extractFloatx80Exp( a );
3595 aSign = extractFloatx80Sign( a );
3596 bSig = extractFloatx80Frac( b );
3597 bExp = extractFloatx80Exp( b );
3598 bSign = extractFloatx80Sign( b );
3599 zSign = aSign ^ bSign;
3600 if ( aExp == 0x7FFF ) {
3601 if ( (bits64) ( aSig<<1 )
3602 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3603 return propagateFloatx80NaN( a, b STATUS_VAR );
3604 }
3605 if ( ( bExp | bSig ) == 0 ) goto invalid;
3606 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3607 }
3608 if ( bExp == 0x7FFF ) {
3609 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3610 if ( ( aExp | aSig ) == 0 ) {
3611 invalid:
3612 float_raise( float_flag_invalid STATUS_VAR);
3613 z.low = floatx80_default_nan_low;
3614 z.high = floatx80_default_nan_high;
3615 return z;
3616 }
3617 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3618 }
3619 if ( aExp == 0 ) {
3620 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3621 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3622 }
3623 if ( bExp == 0 ) {
3624 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3625 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3626 }
3627 zExp = aExp + bExp - 0x3FFE;
3628 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3629 if ( 0 < (sbits64) zSig0 ) {
3630 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3631 --zExp;
3632 }
3633 return
3634 roundAndPackFloatx80(
3635 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3636
3637}
3638
3639/*----------------------------------------------------------------------------
3640| Returns the result of dividing the extended double-precision floating-point
3641| value `a' by the corresponding value `b'. The operation is performed
3642| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3643*----------------------------------------------------------------------------*/
3644
3645floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
3646{
3647 flag aSign, bSign, zSign;
3648 int32 aExp, bExp, zExp;
3649 bits64 aSig, bSig, zSig0, zSig1;
3650 bits64 rem0, rem1, rem2, term0, term1, term2;
3651 floatx80 z;
3652
3653 aSig = extractFloatx80Frac( a );
3654 aExp = extractFloatx80Exp( a );
3655 aSign = extractFloatx80Sign( a );
3656 bSig = extractFloatx80Frac( b );
3657 bExp = extractFloatx80Exp( b );
3658 bSign = extractFloatx80Sign( b );
3659 zSign = aSign ^ bSign;
3660 if ( aExp == 0x7FFF ) {
3661 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3662 if ( bExp == 0x7FFF ) {
3663 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3664 goto invalid;
3665 }
3666 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3667 }
3668 if ( bExp == 0x7FFF ) {
3669 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3670 return packFloatx80( zSign, 0, 0 );
3671 }
3672 if ( bExp == 0 ) {
3673 if ( bSig == 0 ) {
3674 if ( ( aExp | aSig ) == 0 ) {
3675 invalid:
3676 float_raise( float_flag_invalid STATUS_VAR);
3677 z.low = floatx80_default_nan_low;
3678 z.high = floatx80_default_nan_high;
3679 return z;
3680 }
3681 float_raise( float_flag_divbyzero STATUS_VAR);
3682 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3683 }
3684 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3685 }
3686 if ( aExp == 0 ) {
3687 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3688 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3689 }
3690 zExp = aExp - bExp + 0x3FFE;
3691 rem1 = 0;
3692 if ( bSig <= aSig ) {
3693 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3694 ++zExp;
3695 }
3696 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3697 mul64To128( bSig, zSig0, &term0, &term1 );
3698 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3699 while ( (sbits64) rem0 < 0 ) {
3700 --zSig0;
3701 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3702 }
3703 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3704 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3705 mul64To128( bSig, zSig1, &term1, &term2 );
3706 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3707 while ( (sbits64) rem1 < 0 ) {
3708 --zSig1;
3709 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3710 }
3711 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3712 }
3713 return
3714 roundAndPackFloatx80(
3715 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
3716
3717}
3718
3719/*----------------------------------------------------------------------------
3720| Returns the remainder of the extended double-precision floating-point value
3721| `a' with respect to the corresponding value `b'. The operation is performed
3722| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3723*----------------------------------------------------------------------------*/
3724
3725floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
3726{
3727 flag aSign, bSign, zSign;
3728 int32 aExp, bExp, expDiff;
3729 bits64 aSig0, aSig1, bSig;
3730 bits64 q, term0, term1, alternateASig0, alternateASig1;
3731 floatx80 z;
3732
3733 aSig0 = extractFloatx80Frac( a );
3734 aExp = extractFloatx80Exp( a );
3735 aSign = extractFloatx80Sign( a );
3736 bSig = extractFloatx80Frac( b );
3737 bExp = extractFloatx80Exp( b );
3738 bSign = extractFloatx80Sign( b );
3739 if ( aExp == 0x7FFF ) {
3740 if ( (bits64) ( aSig0<<1 )
3741 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3742 return propagateFloatx80NaN( a, b STATUS_VAR );
3743 }
3744 goto invalid;
3745 }
3746 if ( bExp == 0x7FFF ) {
3747 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
3748 return a;
3749 }
3750 if ( bExp == 0 ) {
3751 if ( bSig == 0 ) {
3752 invalid:
3753 float_raise( float_flag_invalid STATUS_VAR);
3754 z.low = floatx80_default_nan_low;
3755 z.high = floatx80_default_nan_high;
3756 return z;
3757 }
3758 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3759 }
3760 if ( aExp == 0 ) {
3761 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3762 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3763 }
3764 bSig |= LIT64( 0x8000000000000000 );
3765 zSign = aSign;
3766 expDiff = aExp - bExp;
3767 aSig1 = 0;
3768 if ( expDiff < 0 ) {
3769 if ( expDiff < -1 ) return a;
3770 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3771 expDiff = 0;
3772 }
3773 q = ( bSig <= aSig0 );
3774 if ( q ) aSig0 -= bSig;
3775 expDiff -= 64;
3776 while ( 0 < expDiff ) {
3777 q = estimateDiv128To64( aSig0, aSig1, bSig );
3778 q = ( 2 < q ) ? q - 2 : 0;
3779 mul64To128( bSig, q, &term0, &term1 );
3780 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3781 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3782 expDiff -= 62;
3783 }
3784 expDiff += 64;
3785 if ( 0 < expDiff ) {
3786 q = estimateDiv128To64( aSig0, aSig1, bSig );
3787 q = ( 2 < q ) ? q - 2 : 0;
3788 q >>= 64 - expDiff;
3789 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3790 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3791 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3792 while ( le128( term0, term1, aSig0, aSig1 ) ) {
3793 ++q;
3794 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3795 }
3796 }
3797 else {
3798 term1 = 0;
3799 term0 = bSig;
3800 }
3801 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3802 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3803 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3804 && ( q & 1 ) )
3805 ) {
3806 aSig0 = alternateASig0;
3807 aSig1 = alternateASig1;
3808 zSign = ! zSign;
3809 }
3810 return
3811 normalizeRoundAndPackFloatx80(
3812 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
3813
3814}
3815
3816/*----------------------------------------------------------------------------
3817| Returns the square root of the extended double-precision floating-point
3818| value `a'. The operation is performed according to the IEC/IEEE Standard
3819| for Binary Floating-Point Arithmetic.
3820*----------------------------------------------------------------------------*/
3821
3822floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
3823{
3824 flag aSign;
3825 int32 aExp, zExp;
3826 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3827 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3828 floatx80 z;
3829
3830 aSig0 = extractFloatx80Frac( a );
3831 aExp = extractFloatx80Exp( a );
3832 aSign = extractFloatx80Sign( a );
3833 if ( aExp == 0x7FFF ) {
3834 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
3835 if ( ! aSign ) return a;
3836 goto invalid;
3837 }
3838 if ( aSign ) {
3839 if ( ( aExp | aSig0 ) == 0 ) return a;
3840 invalid:
3841 float_raise( float_flag_invalid STATUS_VAR);
3842 z.low = floatx80_default_nan_low;
3843 z.high = floatx80_default_nan_high;
3844 return z;
3845 }
3846 if ( aExp == 0 ) {
3847 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3848 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3849 }
3850 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3851 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3852 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3853 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3854 doubleZSig0 = zSig0<<1;
3855 mul64To128( zSig0, zSig0, &term0, &term1 );
3856 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3857 while ( (sbits64) rem0 < 0 ) {
3858 --zSig0;
3859 doubleZSig0 -= 2;
3860 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
3861 }
3862 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3863 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3864 if ( zSig1 == 0 ) zSig1 = 1;
3865 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3866 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3867 mul64To128( zSig1, zSig1, &term2, &term3 );
3868 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3869 while ( (sbits64) rem1 < 0 ) {
3870 --zSig1;
3871 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
3872 term3 |= 1;
3873 term2 |= doubleZSig0;
3874 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
3875 }
3876 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3877 }
3878 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
3879 zSig0 |= doubleZSig0;
3880 return
3881 roundAndPackFloatx80(
3882 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
3883
3884}
3885
3886/*----------------------------------------------------------------------------
3887| Returns 1 if the extended double-precision floating-point value `a' is
3888| equal to the corresponding value `b', and 0 otherwise. The comparison is
3889| performed according to the IEC/IEEE Standard for Binary Floating-Point
3890| Arithmetic.
3891*----------------------------------------------------------------------------*/
3892
3893int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
3894{
3895
3896 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3897 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3898 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3899 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3900 ) {
3901 if ( floatx80_is_signaling_nan( a )
3902 || floatx80_is_signaling_nan( b ) ) {
3903 float_raise( float_flag_invalid STATUS_VAR);
3904 }
3905 return 0;
3906 }
3907 return
3908 ( a.low == b.low )
3909 && ( ( a.high == b.high )
3910 || ( ( a.low == 0 )
3911 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3912 );
3913
3914}
3915
3916/*----------------------------------------------------------------------------
3917| Returns 1 if the extended double-precision floating-point value `a' is
3918| less than or equal to the corresponding value `b', and 0 otherwise. The
3919| comparison is performed according to the IEC/IEEE Standard for Binary
3920| Floating-Point Arithmetic.
3921*----------------------------------------------------------------------------*/
3922
3923int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
3924{
3925 flag aSign, bSign;
3926
3927 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3928 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3929 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3930 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3931 ) {
3932 float_raise( float_flag_invalid STATUS_VAR);
3933 return 0;
3934 }
3935 aSign = extractFloatx80Sign( a );
3936 bSign = extractFloatx80Sign( b );
3937 if ( aSign != bSign ) {
3938 return
3939 aSign
3940 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3941 == 0 );
3942 }
3943 return
3944 aSign ? le128( b.high, b.low, a.high, a.low )
3945 : le128( a.high, a.low, b.high, b.low );
3946
3947}
3948
3949/*----------------------------------------------------------------------------
3950| Returns 1 if the extended double-precision floating-point value `a' is
3951| less than the corresponding value `b', and 0 otherwise. The comparison
3952| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3953| Arithmetic.
3954*----------------------------------------------------------------------------*/
3955
3956int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
3957{
3958 flag aSign, bSign;
3959
3960 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3961 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3962 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3963 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3964 ) {
3965 float_raise( float_flag_invalid STATUS_VAR);
3966 return 0;
3967 }
3968 aSign = extractFloatx80Sign( a );
3969 bSign = extractFloatx80Sign( b );
3970 if ( aSign != bSign ) {
3971 return
3972 aSign
3973 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3974 != 0 );
3975 }
3976 return
3977 aSign ? lt128( b.high, b.low, a.high, a.low )
3978 : lt128( a.high, a.low, b.high, b.low );
3979
3980}
3981
3982/*----------------------------------------------------------------------------
3983| Returns 1 if the extended double-precision floating-point value `a' is equal
3984| to the corresponding value `b', and 0 otherwise. The invalid exception is
3985| raised if either operand is a NaN. Otherwise, the comparison is performed
3986| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3987*----------------------------------------------------------------------------*/
3988
3989int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
3990{
3991
3992 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3993 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3994 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3995 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3996 ) {
3997 float_raise( float_flag_invalid STATUS_VAR);
3998 return 0;
3999 }
4000 return
4001 ( a.low == b.low )
4002 && ( ( a.high == b.high )
4003 || ( ( a.low == 0 )
4004 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4005 );
4006
4007}
4008
4009/*----------------------------------------------------------------------------
4010| Returns 1 if the extended double-precision floating-point value `a' is less
4011| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4012| do not cause an exception. Otherwise, the comparison is performed according
4013| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4014*----------------------------------------------------------------------------*/
4015
4016int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4017{
4018 flag aSign, bSign;
4019
4020 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4021 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4022 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4023 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4024 ) {
4025 if ( floatx80_is_signaling_nan( a )
4026 || floatx80_is_signaling_nan( b ) ) {
4027 float_raise( float_flag_invalid STATUS_VAR);
4028 }
4029 return 0;
4030 }
4031 aSign = extractFloatx80Sign( a );
4032 bSign = extractFloatx80Sign( b );
4033 if ( aSign != bSign ) {
4034 return
4035 aSign
4036 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4037 == 0 );
4038 }
4039 return
4040 aSign ? le128( b.high, b.low, a.high, a.low )
4041 : le128( a.high, a.low, b.high, b.low );
4042
4043}
4044
4045/*----------------------------------------------------------------------------
4046| Returns 1 if the extended double-precision floating-point value `a' is less
4047| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4048| an exception. Otherwise, the comparison is performed according to the
4049| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4050*----------------------------------------------------------------------------*/
4051
4052int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4053{
4054 flag aSign, bSign;
4055
4056 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4057 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4058 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4059 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4060 ) {
4061 if ( floatx80_is_signaling_nan( a )
4062 || floatx80_is_signaling_nan( b ) ) {
4063 float_raise( float_flag_invalid STATUS_VAR);
4064 }
4065 return 0;
4066 }
4067 aSign = extractFloatx80Sign( a );
4068 bSign = extractFloatx80Sign( b );
4069 if ( aSign != bSign ) {
4070 return
4071 aSign
4072 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4073 != 0 );
4074 }
4075 return
4076 aSign ? lt128( b.high, b.low, a.high, a.low )
4077 : lt128( a.high, a.low, b.high, b.low );
4078
4079}
4080
4081#endif
4082
4083#ifdef FLOAT128
4084
4085/*----------------------------------------------------------------------------
4086| Returns the result of converting the quadruple-precision floating-point
4087| value `a' to the 32-bit two's complement integer format. The conversion
4088| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4089| Arithmetic---which means in particular that the conversion is rounded
4090| according to the current rounding mode. If `a' is a NaN, the largest
4091| positive integer is returned. Otherwise, if the conversion overflows, the
4092| largest integer with the same sign as `a' is returned.
4093*----------------------------------------------------------------------------*/
4094
4095int32 float128_to_int32( float128 a STATUS_PARAM )
4096{
4097 flag aSign;
4098 int32 aExp, shiftCount;
4099 bits64 aSig0, aSig1;
4100
4101 aSig1 = extractFloat128Frac1( a );
4102 aSig0 = extractFloat128Frac0( a );
4103 aExp = extractFloat128Exp( a );
4104 aSign = extractFloat128Sign( a );
4105 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4106 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4107 aSig0 |= ( aSig1 != 0 );
4108 shiftCount = 0x4028 - aExp;
4109 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4110 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4111
4112}
4113
4114/*----------------------------------------------------------------------------
4115| Returns the result of converting the quadruple-precision floating-point
4116| value `a' to the 32-bit two's complement integer format. The conversion
4117| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4118| Arithmetic, except that the conversion is always rounded toward zero. If
4119| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4120| conversion overflows, the largest integer with the same sign as `a' is
4121| returned.
4122*----------------------------------------------------------------------------*/
4123
4124int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4125{
4126 flag aSign;
4127 int32 aExp, shiftCount;
4128 bits64 aSig0, aSig1, savedASig;
4129 int32 z;
4130
4131 aSig1 = extractFloat128Frac1( a );
4132 aSig0 = extractFloat128Frac0( a );
4133 aExp = extractFloat128Exp( a );
4134 aSign = extractFloat128Sign( a );
4135 aSig0 |= ( aSig1 != 0 );
4136 if ( 0x401E < aExp ) {
4137 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4138 goto invalid;
4139 }
4140 else if ( aExp < 0x3FFF ) {
4141 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4142 return 0;
4143 }
4144 aSig0 |= LIT64( 0x0001000000000000 );
4145 shiftCount = 0x402F - aExp;
4146 savedASig = aSig0;
4147 aSig0 >>= shiftCount;
4148 z = aSig0;
4149 if ( aSign ) z = - z;
4150 if ( ( z < 0 ) ^ aSign ) {
4151 invalid:
4152 float_raise( float_flag_invalid STATUS_VAR);
4153 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4154 }
4155 if ( ( aSig0<<shiftCount ) != savedASig ) {
4156 STATUS(float_exception_flags) |= float_flag_inexact;
4157 }
4158 return z;
4159
4160}
4161
4162/*----------------------------------------------------------------------------
4163| Returns the result of converting the quadruple-precision floating-point
4164| value `a' to the 64-bit two's complement integer format. The conversion
4165| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4166| Arithmetic---which means in particular that the conversion is rounded
4167| according to the current rounding mode. If `a' is a NaN, the largest
4168| positive integer is returned. Otherwise, if the conversion overflows, the
4169| largest integer with the same sign as `a' is returned.
4170*----------------------------------------------------------------------------*/
4171
4172int64 float128_to_int64( float128 a STATUS_PARAM )
4173{
4174 flag aSign;
4175 int32 aExp, shiftCount;
4176 bits64 aSig0, aSig1;
4177
4178 aSig1 = extractFloat128Frac1( a );
4179 aSig0 = extractFloat128Frac0( a );
4180 aExp = extractFloat128Exp( a );
4181 aSign = extractFloat128Sign( a );
4182 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4183 shiftCount = 0x402F - aExp;
4184 if ( shiftCount <= 0 ) {
4185 if ( 0x403E < aExp ) {
4186 float_raise( float_flag_invalid STATUS_VAR);
4187 if ( ! aSign
4188 || ( ( aExp == 0x7FFF )
4189 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4190 )
4191 ) {
4192 return LIT64( 0x7FFFFFFFFFFFFFFF );
4193 }
4194 return (sbits64) LIT64( 0x8000000000000000 );
4195 }
4196 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4197 }
4198 else {
4199 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4200 }
4201 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4202
4203}
4204
4205/*----------------------------------------------------------------------------
4206| Returns the result of converting the quadruple-precision floating-point
4207| value `a' to the 64-bit two's complement integer format. The conversion
4208| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4209| Arithmetic, except that the conversion is always rounded toward zero.
4210| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4211| the conversion overflows, the largest integer with the same sign as `a' is
4212| returned.
4213*----------------------------------------------------------------------------*/
4214
4215int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4216{
4217 flag aSign;
4218 int32 aExp, shiftCount;
4219 bits64 aSig0, aSig1;
4220 int64 z;
4221
4222 aSig1 = extractFloat128Frac1( a );
4223 aSig0 = extractFloat128Frac0( a );
4224 aExp = extractFloat128Exp( a );
4225 aSign = extractFloat128Sign( a );
4226 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4227 shiftCount = aExp - 0x402F;
4228 if ( 0 < shiftCount ) {
4229 if ( 0x403E <= aExp ) {
4230 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4231 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4232 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4233 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4234 }
4235 else {
4236 float_raise( float_flag_invalid STATUS_VAR);
4237 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4238 return LIT64( 0x7FFFFFFFFFFFFFFF );
4239 }
4240 }
4241 return (sbits64) LIT64( 0x8000000000000000 );
4242 }
4243 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4244 if ( (bits64) ( aSig1<<shiftCount ) ) {
4245 STATUS(float_exception_flags) |= float_flag_inexact;
4246 }
4247 }
4248 else {
4249 if ( aExp < 0x3FFF ) {
4250 if ( aExp | aSig0 | aSig1 ) {
4251 STATUS(float_exception_flags) |= float_flag_inexact;
4252 }
4253 return 0;
4254 }
4255 z = aSig0>>( - shiftCount );
4256 if ( aSig1
4257 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4258 STATUS(float_exception_flags) |= float_flag_inexact;
4259 }
4260 }
4261 if ( aSign ) z = - z;
4262 return z;
4263
4264}
4265
4266/*----------------------------------------------------------------------------
4267| Returns the result of converting the quadruple-precision floating-point
4268| value `a' to the single-precision floating-point format. The conversion
4269| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4270| Arithmetic.
4271*----------------------------------------------------------------------------*/
4272
4273float32 float128_to_float32( float128 a STATUS_PARAM )
4274{
4275 flag aSign;
4276 int32 aExp;
4277 bits64 aSig0, aSig1;
4278 bits32 zSig;
4279
4280 aSig1 = extractFloat128Frac1( a );
4281 aSig0 = extractFloat128Frac0( a );
4282 aExp = extractFloat128Exp( a );
4283 aSign = extractFloat128Sign( a );
4284 if ( aExp == 0x7FFF ) {
4285 if ( aSig0 | aSig1 ) {
4286 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4287 }
4288 return packFloat32( aSign, 0xFF, 0 );
4289 }
4290 aSig0 |= ( aSig1 != 0 );
4291 shift64RightJamming( aSig0, 18, &aSig0 );
4292 zSig = aSig0;
4293 if ( aExp || zSig ) {
4294 zSig |= 0x40000000;
4295 aExp -= 0x3F81;
4296 }
4297 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4298
4299}
4300
4301/*----------------------------------------------------------------------------
4302| Returns the result of converting the quadruple-precision floating-point
4303| value `a' to the double-precision floating-point format. The conversion
4304| is performed according to the IEC/IEEE Standard for Binary Floating-Point
4305| Arithmetic.
4306*----------------------------------------------------------------------------*/
4307
4308float64 float128_to_float64( float128 a STATUS_PARAM )
4309{
4310 flag aSign;
4311 int32 aExp;
4312 bits64 aSig0, aSig1;
4313
4314 aSig1 = extractFloat128Frac1( a );
4315 aSig0 = extractFloat128Frac0( a );
4316 aExp = extractFloat128Exp( a );
4317 aSign = extractFloat128Sign( a );
4318 if ( aExp == 0x7FFF ) {
4319 if ( aSig0 | aSig1 ) {
4320 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4321 }
4322 return packFloat64( aSign, 0x7FF, 0 );
4323 }
4324 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4325 aSig0 |= ( aSig1 != 0 );
4326 if ( aExp || aSig0 ) {
4327 aSig0 |= LIT64( 0x4000000000000000 );
4328 aExp -= 0x3C01;
4329 }
4330 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4331
4332}
4333
4334#ifdef FLOATX80
4335
4336/*----------------------------------------------------------------------------
4337| Returns the result of converting the quadruple-precision floating-point
4338| value `a' to the extended double-precision floating-point format. The
4339| conversion is performed according to the IEC/IEEE Standard for Binary
4340| Floating-Point Arithmetic.
4341*----------------------------------------------------------------------------*/
4342
4343floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4344{
4345 flag aSign;
4346 int32 aExp;
4347 bits64 aSig0, aSig1;
4348
4349 aSig1 = extractFloat128Frac1( a );
4350 aSig0 = extractFloat128Frac0( a );
4351 aExp = extractFloat128Exp( a );
4352 aSign = extractFloat128Sign( a );
4353 if ( aExp == 0x7FFF ) {
4354 if ( aSig0 | aSig1 ) {
4355 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4356 }
4357 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4358 }
4359 if ( aExp == 0 ) {
4360 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4361 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4362 }
4363 else {
4364 aSig0 |= LIT64( 0x0001000000000000 );
4365 }
4366 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4367 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4368
4369}
4370
4371#endif
4372
4373/*----------------------------------------------------------------------------
4374| Rounds the quadruple-precision floating-point value `a' to an integer, and
4375| returns the result as a quadruple-precision floating-point value. The
4376| operation is performed according to the IEC/IEEE Standard for Binary
4377| Floating-Point Arithmetic.
4378*----------------------------------------------------------------------------*/
4379
4380float128 float128_round_to_int( float128 a STATUS_PARAM )
4381{
4382 flag aSign;
4383 int32 aExp;
4384 bits64 lastBitMask, roundBitsMask;
4385 int8 roundingMode;
4386 float128 z;
4387
4388 aExp = extractFloat128Exp( a );
4389 if ( 0x402F <= aExp ) {
4390 if ( 0x406F <= aExp ) {
4391 if ( ( aExp == 0x7FFF )
4392 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4393 ) {
4394 return propagateFloat128NaN( a, a STATUS_VAR );
4395 }
4396 return a;
4397 }
4398 lastBitMask = 1;
4399 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4400 roundBitsMask = lastBitMask - 1;
4401 z = a;
4402 roundingMode = STATUS(float_rounding_mode);
4403 if ( roundingMode == float_round_nearest_even ) {
4404 if ( lastBitMask ) {
4405 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4406 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4407 }
4408 else {
4409 if ( (sbits64) z.low < 0 ) {
4410 ++z.high;
4411 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4412 }
4413 }
4414 }
4415 else if ( roundingMode != float_round_to_zero ) {
4416 if ( extractFloat128Sign( z )
4417 ^ ( roundingMode == float_round_up ) ) {
4418 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4419 }
4420 }
4421 z.low &= ~ roundBitsMask;
4422 }
4423 else {
4424 if ( aExp < 0x3FFF ) {
4425 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4426 STATUS(float_exception_flags) |= float_flag_inexact;
4427 aSign = extractFloat128Sign( a );
4428 switch ( STATUS(float_rounding_mode) ) {
4429 case float_round_nearest_even:
4430 if ( ( aExp == 0x3FFE )
4431 && ( extractFloat128Frac0( a )
4432 | extractFloat128Frac1( a ) )
4433 ) {
4434 return packFloat128( aSign, 0x3FFF, 0, 0 );
4435 }
4436 break;
4437 case float_round_down:
4438 return
4439 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4440 : packFloat128( 0, 0, 0, 0 );
4441 case float_round_up:
4442 return
4443 aSign ? packFloat128( 1, 0, 0, 0 )
4444 : packFloat128( 0, 0x3FFF, 0, 0 );
4445 }
4446 return packFloat128( aSign, 0, 0, 0 );
4447 }
4448 lastBitMask = 1;
4449 lastBitMask <<= 0x402F - aExp;
4450 roundBitsMask = lastBitMask - 1;
4451 z.low = 0;
4452 z.high = a.high;
4453 roundingMode = STATUS(float_rounding_mode);
4454 if ( roundingMode == float_round_nearest_even ) {
4455 z.high += lastBitMask>>1;
4456 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4457 z.high &= ~ lastBitMask;
4458 }
4459 }
4460 else if ( roundingMode != float_round_to_zero ) {
4461 if ( extractFloat128Sign( z )
4462 ^ ( roundingMode == float_round_up ) ) {
4463 z.high |= ( a.low != 0 );
4464 z.high += roundBitsMask;
4465 }
4466 }
4467 z.high &= ~ roundBitsMask;
4468 }
4469 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4470 STATUS(float_exception_flags) |= float_flag_inexact;
4471 }
4472 return z;
4473
4474}
4475
4476/*----------------------------------------------------------------------------
4477| Returns the result of adding the absolute values of the quadruple-precision
4478| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4479| before being returned. `zSign' is ignored if the result is a NaN.
4480| The addition is performed according to the IEC/IEEE Standard for Binary
4481| Floating-Point Arithmetic.
4482*----------------------------------------------------------------------------*/
4483
4484static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4485{
4486 int32 aExp, bExp, zExp;
4487 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4488 int32 expDiff;
4489
4490 aSig1 = extractFloat128Frac1( a );
4491 aSig0 = extractFloat128Frac0( a );
4492 aExp = extractFloat128Exp( a );
4493 bSig1 = extractFloat128Frac1( b );
4494 bSig0 = extractFloat128Frac0( b );
4495 bExp = extractFloat128Exp( b );
4496 expDiff = aExp - bExp;
4497 if ( 0 < expDiff ) {
4498 if ( aExp == 0x7FFF ) {
4499 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4500 return a;
4501 }
4502 if ( bExp == 0 ) {
4503 --expDiff;
4504 }
4505 else {
4506 bSig0 |= LIT64( 0x0001000000000000 );
4507 }
4508 shift128ExtraRightJamming(
4509 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4510 zExp = aExp;
4511 }
4512 else if ( expDiff < 0 ) {
4513 if ( bExp == 0x7FFF ) {
4514 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4515 return packFloat128( zSign, 0x7FFF, 0, 0 );
4516 }
4517 if ( aExp == 0 ) {
4518 ++expDiff;
4519 }
4520 else {
4521 aSig0 |= LIT64( 0x0001000000000000 );
4522 }
4523 shift128ExtraRightJamming(
4524 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4525 zExp = bExp;
4526 }
4527 else {
4528 if ( aExp == 0x7FFF ) {
4529 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4530 return propagateFloat128NaN( a, b STATUS_VAR );
4531 }
4532 return a;
4533 }
4534 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4535 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4536 zSig2 = 0;
4537 zSig0 |= LIT64( 0x0002000000000000 );
4538 zExp = aExp;
4539 goto shiftRight1;
4540 }
4541 aSig0 |= LIT64( 0x0001000000000000 );
4542 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4543 --zExp;
4544 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4545 ++zExp;
4546 shiftRight1:
4547 shift128ExtraRightJamming(
4548 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4549 roundAndPack:
4550 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4551
4552}
4553
4554/*----------------------------------------------------------------------------
4555| Returns the result of subtracting the absolute values of the quadruple-
4556| precision floating-point values `a' and `b'. If `zSign' is 1, the
4557| difference is negated before being returned. `zSign' is ignored if the
4558| result is a NaN. The subtraction is performed according to the IEC/IEEE
4559| Standard for Binary Floating-Point Arithmetic.
4560*----------------------------------------------------------------------------*/
4561
4562static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
4563{
4564 int32 aExp, bExp, zExp;
4565 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4566 int32 expDiff;
4567 float128 z;
4568
4569 aSig1 = extractFloat128Frac1( a );
4570 aSig0 = extractFloat128Frac0( a );
4571 aExp = extractFloat128Exp( a );
4572 bSig1 = extractFloat128Frac1( b );
4573 bSig0 = extractFloat128Frac0( b );
4574 bExp = extractFloat128Exp( b );
4575 expDiff = aExp - bExp;
4576 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4577 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4578 if ( 0 < expDiff ) goto aExpBigger;
4579 if ( expDiff < 0 ) goto bExpBigger;
4580 if ( aExp == 0x7FFF ) {
4581 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4582 return propagateFloat128NaN( a, b STATUS_VAR );
4583 }
4584 float_raise( float_flag_invalid STATUS_VAR);
4585 z.low = float128_default_nan_low;
4586 z.high = float128_default_nan_high;
4587 return z;
4588 }
4589 if ( aExp == 0 ) {
4590 aExp = 1;
4591 bExp = 1;
4592 }
4593 if ( bSig0 < aSig0 ) goto aBigger;
4594 if ( aSig0 < bSig0 ) goto bBigger;
4595 if ( bSig1 < aSig1 ) goto aBigger;
4596 if ( aSig1 < bSig1 ) goto bBigger;
4597 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
4598 bExpBigger:
4599 if ( bExp == 0x7FFF ) {
4600 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4601 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4602 }
4603 if ( aExp == 0 ) {
4604 ++expDiff;
4605 }
4606 else {
4607 aSig0 |= LIT64( 0x4000000000000000 );
4608 }
4609 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4610 bSig0 |= LIT64( 0x4000000000000000 );
4611 bBigger:
4612 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4613 zExp = bExp;
4614 zSign ^= 1;
4615 goto normalizeRoundAndPack;
4616 aExpBigger:
4617 if ( aExp == 0x7FFF ) {
4618 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4619 return a;
4620 }
4621 if ( bExp == 0 ) {
4622 --expDiff;
4623 }
4624 else {
4625 bSig0 |= LIT64( 0x4000000000000000 );
4626 }
4627 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4628 aSig0 |= LIT64( 0x4000000000000000 );
4629 aBigger:
4630 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4631 zExp = aExp;
4632 normalizeRoundAndPack:
4633 --zExp;
4634 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
4635
4636}
4637
4638/*----------------------------------------------------------------------------
4639| Returns the result of adding the quadruple-precision floating-point values
4640| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4641| for Binary Floating-Point Arithmetic.
4642*----------------------------------------------------------------------------*/
4643
4644float128 float128_add( float128 a, float128 b STATUS_PARAM )
4645{
4646 flag aSign, bSign;
4647
4648 aSign = extractFloat128Sign( a );
4649 bSign = extractFloat128Sign( b );
4650 if ( aSign == bSign ) {
4651 return addFloat128Sigs( a, b, aSign STATUS_VAR );
4652 }
4653 else {
4654 return subFloat128Sigs( a, b, aSign STATUS_VAR );
4655 }
4656
4657}
4658
4659/*----------------------------------------------------------------------------
4660| Returns the result of subtracting the quadruple-precision floating-point
4661| values `a' and `b'. The operation is performed according to the IEC/IEEE
4662| Standard for Binary Floating-Point Arithmetic.
4663*----------------------------------------------------------------------------*/
4664
4665float128 float128_sub( float128 a, float128 b STATUS_PARAM )
4666{
4667 flag aSign, bSign;
4668
4669 aSign = extractFloat128Sign( a );
4670 bSign = extractFloat128Sign( b );
4671 if ( aSign == bSign ) {
4672 return subFloat128Sigs( a, b, aSign STATUS_VAR );
4673 }
4674 else {
4675 return addFloat128Sigs( a, b, aSign STATUS_VAR );
4676 }
4677
4678}
4679
4680/*----------------------------------------------------------------------------
4681| Returns the result of multiplying the quadruple-precision floating-point
4682| values `a' and `b'. The operation is performed according to the IEC/IEEE
4683| Standard for Binary Floating-Point Arithmetic.
4684*----------------------------------------------------------------------------*/
4685
4686float128 float128_mul( float128 a, float128 b STATUS_PARAM )
4687{
4688 flag aSign, bSign, zSign;
4689 int32 aExp, bExp, zExp;
4690 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4691 float128 z;
4692
4693 aSig1 = extractFloat128Frac1( a );
4694 aSig0 = extractFloat128Frac0( a );
4695 aExp = extractFloat128Exp( a );
4696 aSign = extractFloat128Sign( a );
4697 bSig1 = extractFloat128Frac1( b );
4698 bSig0 = extractFloat128Frac0( b );
4699 bExp = extractFloat128Exp( b );
4700 bSign = extractFloat128Sign( b );
4701 zSign = aSign ^ bSign;
4702 if ( aExp == 0x7FFF ) {
4703 if ( ( aSig0 | aSig1 )
4704 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4705 return propagateFloat128NaN( a, b STATUS_VAR );
4706 }
4707 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4708 return packFloat128( zSign, 0x7FFF, 0, 0 );
4709 }
4710 if ( bExp == 0x7FFF ) {
4711 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4712 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4713 invalid:
4714 float_raise( float_flag_invalid STATUS_VAR);
4715 z.low = float128_default_nan_low;
4716 z.high = float128_default_nan_high;
4717 return z;
4718 }
4719 return packFloat128( zSign, 0x7FFF, 0, 0 );
4720 }
4721 if ( aExp == 0 ) {
4722 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4723 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4724 }
4725 if ( bExp == 0 ) {
4726 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4727 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4728 }
4729 zExp = aExp + bExp - 0x4000;
4730 aSig0 |= LIT64( 0x0001000000000000 );
4731 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4732 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4733 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4734 zSig2 |= ( zSig3 != 0 );
4735 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4736 shift128ExtraRightJamming(
4737 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4738 ++zExp;
4739 }
4740 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4741
4742}
4743
4744/*----------------------------------------------------------------------------
4745| Returns the result of dividing the quadruple-precision floating-point value
4746| `a' by the corresponding value `b'. The operation is performed according to
4747| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4748*----------------------------------------------------------------------------*/
4749
4750float128 float128_div( float128 a, float128 b STATUS_PARAM )
4751{
4752 flag aSign, bSign, zSign;
4753 int32 aExp, bExp, zExp;
4754 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4755 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4756 float128 z;
4757
4758 aSig1 = extractFloat128Frac1( a );
4759 aSig0 = extractFloat128Frac0( a );
4760 aExp = extractFloat128Exp( a );
4761 aSign = extractFloat128Sign( a );
4762 bSig1 = extractFloat128Frac1( b );
4763 bSig0 = extractFloat128Frac0( b );
4764 bExp = extractFloat128Exp( b );
4765 bSign = extractFloat128Sign( b );
4766 zSign = aSign ^ bSign;
4767 if ( aExp == 0x7FFF ) {
4768 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4769 if ( bExp == 0x7FFF ) {
4770 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4771 goto invalid;
4772 }
4773 return packFloat128( zSign, 0x7FFF, 0, 0 );
4774 }
4775 if ( bExp == 0x7FFF ) {
4776 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4777 return packFloat128( zSign, 0, 0, 0 );
4778 }
4779 if ( bExp == 0 ) {
4780 if ( ( bSig0 | bSig1 ) == 0 ) {
4781 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4782 invalid:
4783 float_raise( float_flag_invalid STATUS_VAR);
4784 z.low = float128_default_nan_low;
4785 z.high = float128_default_nan_high;
4786 return z;
4787 }
4788 float_raise( float_flag_divbyzero STATUS_VAR);
4789 return packFloat128( zSign, 0x7FFF, 0, 0 );
4790 }
4791 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4792 }
4793 if ( aExp == 0 ) {
4794 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4795 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4796 }
4797 zExp = aExp - bExp + 0x3FFD;
4798 shortShift128Left(
4799 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4800 shortShift128Left(
4801 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4802 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4803 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4804 ++zExp;
4805 }
4806 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4807 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4808 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4809 while ( (sbits64) rem0 < 0 ) {
4810 --zSig0;
4811 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4812 }
4813 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4814 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4815 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4816 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4817 while ( (sbits64) rem1 < 0 ) {
4818 --zSig1;
4819 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4820 }
4821 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4822 }
4823 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4824 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
4825
4826}
4827
4828/*----------------------------------------------------------------------------
4829| Returns the remainder of the quadruple-precision floating-point value `a'
4830| with respect to the corresponding value `b'. The operation is performed
4831| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4832*----------------------------------------------------------------------------*/
4833
4834float128 float128_rem( float128 a, float128 b STATUS_PARAM )
4835{
4836 flag aSign, bSign, zSign;
4837 int32 aExp, bExp, expDiff;
4838 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4839 bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4840 sbits64 sigMean0;
4841 float128 z;
4842
4843 aSig1 = extractFloat128Frac1( a );
4844 aSig0 = extractFloat128Frac0( a );
4845 aExp = extractFloat128Exp( a );
4846 aSign = extractFloat128Sign( a );
4847 bSig1 = extractFloat128Frac1( b );
4848 bSig0 = extractFloat128Frac0( b );
4849 bExp = extractFloat128Exp( b );
4850 bSign = extractFloat128Sign( b );
4851 if ( aExp == 0x7FFF ) {
4852 if ( ( aSig0 | aSig1 )
4853 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4854 return propagateFloat128NaN( a, b STATUS_VAR );
4855 }
4856 goto invalid;
4857 }
4858 if ( bExp == 0x7FFF ) {
4859 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
4860 return a;
4861 }
4862 if ( bExp == 0 ) {
4863 if ( ( bSig0 | bSig1 ) == 0 ) {
4864 invalid:
4865 float_raise( float_flag_invalid STATUS_VAR);
4866 z.low = float128_default_nan_low;
4867 z.high = float128_default_nan_high;
4868 return z;
4869 }
4870 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4871 }
4872 if ( aExp == 0 ) {
4873 if ( ( aSig0 | aSig1 ) == 0 ) return a;
4874 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4875 }
4876 expDiff = aExp - bExp;
4877 if ( expDiff < -1 ) return a;
4878 shortShift128Left(
4879 aSig0 | LIT64( 0x0001000000000000 ),
4880 aSig1,
4881 15 - ( expDiff < 0 ),
4882 &aSig0,
4883 &aSig1
4884 );
4885 shortShift128Left(
4886 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4887 q = le128( bSig0, bSig1, aSig0, aSig1 );
4888 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4889 expDiff -= 64;
4890 while ( 0 < expDiff ) {
4891 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4892 q = ( 4 < q ) ? q - 4 : 0;
4893 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4894 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4895 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4896 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4897 expDiff -= 61;
4898 }
4899 if ( -64 < expDiff ) {
4900 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4901 q = ( 4 < q ) ? q - 4 : 0;
4902 q >>= - expDiff;
4903 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4904 expDiff += 52;
4905 if ( expDiff < 0 ) {
4906 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4907 }
4908 else {
4909 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4910 }
4911 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4912 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4913 }
4914 else {
4915 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4916 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4917 }
4918 do {
4919 alternateASig0 = aSig0;
4920 alternateASig1 = aSig1;
4921 ++q;
4922 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4923 } while ( 0 <= (sbits64) aSig0 );
4924 add128(
4925 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
4926 if ( ( sigMean0 < 0 )
4927 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
4928 aSig0 = alternateASig0;
4929 aSig1 = alternateASig1;
4930 }
4931 zSign = ( (sbits64) aSig0 < 0 );
4932 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
4933 return
4934 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
4935
4936}
4937
4938/*----------------------------------------------------------------------------
4939| Returns the square root of the quadruple-precision floating-point value `a'.
4940| The operation is performed according to the IEC/IEEE Standard for Binary
4941| Floating-Point Arithmetic.
4942*----------------------------------------------------------------------------*/
4943
4944float128 float128_sqrt( float128 a STATUS_PARAM )
4945{
4946 flag aSign;
4947 int32 aExp, zExp;
4948 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
4949 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4950 float128 z;
4951
4952 aSig1 = extractFloat128Frac1( a );
4953 aSig0 = extractFloat128Frac0( a );
4954 aExp = extractFloat128Exp( a );
4955 aSign = extractFloat128Sign( a );
4956 if ( aExp == 0x7FFF ) {
4957 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
4958 if ( ! aSign ) return a;
4959 goto invalid;
4960 }
4961 if ( aSign ) {
4962 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
4963 invalid:
4964 float_raise( float_flag_invalid STATUS_VAR);
4965 z.low = float128_default_nan_low;
4966 z.high = float128_default_nan_high;
4967 return z;
4968 }
4969 if ( aExp == 0 ) {
4970 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
4971 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4972 }
4973 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
4974 aSig0 |= LIT64( 0x0001000000000000 );
4975 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
4976 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
4977 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4978 doubleZSig0 = zSig0<<1;
4979 mul64To128( zSig0, zSig0, &term0, &term1 );
4980 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4981 while ( (sbits64) rem0 < 0 ) {
4982 --zSig0;
4983 doubleZSig0 -= 2;
4984 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4985 }
4986 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4987 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
4988 if ( zSig1 == 0 ) zSig1 = 1;
4989 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4990 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4991 mul64To128( zSig1, zSig1, &term2, &term3 );
4992 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4993 while ( (sbits64) rem1 < 0 ) {
4994 --zSig1;
4995 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4996 term3 |= 1;
4997 term2 |= doubleZSig0;
4998 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4999 }
5000 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5001 }
5002 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5003 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5004
5005}
5006
5007/*----------------------------------------------------------------------------
5008| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5009| the corresponding value `b', and 0 otherwise. The comparison is performed
5010| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5011*----------------------------------------------------------------------------*/
5012
5013int float128_eq( float128 a, float128 b STATUS_PARAM )
5014{
5015
5016 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5017 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5018 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5019 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5020 ) {
5021 if ( float128_is_signaling_nan( a )
5022 || float128_is_signaling_nan( b ) ) {
5023 float_raise( float_flag_invalid STATUS_VAR);
5024 }
5025 return 0;
5026 }
5027 return
5028 ( a.low == b.low )
5029 && ( ( a.high == b.high )
5030 || ( ( a.low == 0 )
5031 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5032 );
5033
5034}
5035
5036/*----------------------------------------------------------------------------
5037| Returns 1 if the quadruple-precision floating-point value `a' is less than
5038| or equal to the corresponding value `b', and 0 otherwise. The comparison
5039| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5040| Arithmetic.
5041*----------------------------------------------------------------------------*/
5042
5043int float128_le( float128 a, float128 b STATUS_PARAM )
5044{
5045 flag aSign, bSign;
5046
5047 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5048 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5049 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5050 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5051 ) {
5052 float_raise( float_flag_invalid STATUS_VAR);
5053 return 0;
5054 }
5055 aSign = extractFloat128Sign( a );
5056 bSign = extractFloat128Sign( b );
5057 if ( aSign != bSign ) {
5058 return
5059 aSign
5060 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5061 == 0 );
5062 }
5063 return
5064 aSign ? le128( b.high, b.low, a.high, a.low )
5065 : le128( a.high, a.low, b.high, b.low );
5066
5067}
5068
5069/*----------------------------------------------------------------------------
5070| Returns 1 if the quadruple-precision floating-point value `a' is less than
5071| the corresponding value `b', and 0 otherwise. The comparison is performed
5072| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5073*----------------------------------------------------------------------------*/
5074
5075int float128_lt( float128 a, float128 b STATUS_PARAM )
5076{
5077 flag aSign, bSign;
5078
5079 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5080 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5081 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5082 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5083 ) {
5084 float_raise( float_flag_invalid STATUS_VAR);
5085 return 0;
5086 }
5087 aSign = extractFloat128Sign( a );
5088 bSign = extractFloat128Sign( b );
5089 if ( aSign != bSign ) {
5090 return
5091 aSign
5092 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5093 != 0 );
5094 }
5095 return
5096 aSign ? lt128( b.high, b.low, a.high, a.low )
5097 : lt128( a.high, a.low, b.high, b.low );
5098
5099}
5100
5101/*----------------------------------------------------------------------------
5102| Returns 1 if the quadruple-precision floating-point value `a' is equal to
5103| the corresponding value `b', and 0 otherwise. The invalid exception is
5104| raised if either operand is a NaN. Otherwise, the comparison is performed
5105| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5106*----------------------------------------------------------------------------*/
5107
5108int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5109{
5110
5111 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5112 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5113 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5114 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5115 ) {
5116 float_raise( float_flag_invalid STATUS_VAR);
5117 return 0;
5118 }
5119 return
5120 ( a.low == b.low )
5121 && ( ( a.high == b.high )
5122 || ( ( a.low == 0 )
5123 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5124 );
5125
5126}
5127
5128/*----------------------------------------------------------------------------
5129| Returns 1 if the quadruple-precision floating-point value `a' is less than
5130| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5131| cause an exception. Otherwise, the comparison is performed according to the
5132| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5133*----------------------------------------------------------------------------*/
5134
5135int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5136{
5137 flag aSign, bSign;
5138
5139 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5140 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5141 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5142 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5143 ) {
5144 if ( float128_is_signaling_nan( a )
5145 || float128_is_signaling_nan( b ) ) {
5146 float_raise( float_flag_invalid STATUS_VAR);
5147 }
5148 return 0;
5149 }
5150 aSign = extractFloat128Sign( a );
5151 bSign = extractFloat128Sign( b );
5152 if ( aSign != bSign ) {
5153 return
5154 aSign
5155 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5156 == 0 );
5157 }
5158 return
5159 aSign ? le128( b.high, b.low, a.high, a.low )
5160 : le128( a.high, a.low, b.high, b.low );
5161
5162}
5163
5164/*----------------------------------------------------------------------------
5165| Returns 1 if the quadruple-precision floating-point value `a' is less than
5166| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5167| exception. Otherwise, the comparison is performed according to the IEC/IEEE
5168| Standard for Binary Floating-Point Arithmetic.
5169*----------------------------------------------------------------------------*/
5170
5171int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5172{
5173 flag aSign, bSign;
5174
5175 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5176 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5177 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5178 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5179 ) {
5180 if ( float128_is_signaling_nan( a )
5181 || float128_is_signaling_nan( b ) ) {
5182 float_raise( float_flag_invalid STATUS_VAR);
5183 }
5184 return 0;
5185 }
5186 aSign = extractFloat128Sign( a );
5187 bSign = extractFloat128Sign( b );
5188 if ( aSign != bSign ) {
5189 return
5190 aSign
5191 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5192 != 0 );
5193 }
5194 return
5195 aSign ? lt128( b.high, b.low, a.high, a.low )
5196 : lt128( a.high, a.low, b.high, b.low );
5197
5198}
5199
5200#endif
5201
5202/* misc functions */
5203float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5204{
5205 return int64_to_float32(a STATUS_VAR);
5206}
5207
5208float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5209{
5210 return int64_to_float64(a STATUS_VAR);
5211}
5212
5213unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5214{
5215 int64_t v;
5216 unsigned int res;
5217
5218 v = float32_to_int64(a STATUS_VAR);
5219 if (v < 0) {
5220 res = 0;
5221 float_raise( float_flag_invalid STATUS_VAR);
5222 } else if (v > 0xffffffff) {
5223 res = 0xffffffff;
5224 float_raise( float_flag_invalid STATUS_VAR);
5225 } else {
5226 res = v;
5227 }
5228 return res;
5229}
5230
5231unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5232{
5233 int64_t v;
5234 unsigned int res;
5235
5236 v = float32_to_int64_round_to_zero(a STATUS_VAR);
5237 if (v < 0) {
5238 res = 0;
5239 float_raise( float_flag_invalid STATUS_VAR);
5240 } else if (v > 0xffffffff) {
5241 res = 0xffffffff;
5242 float_raise( float_flag_invalid STATUS_VAR);
5243 } else {
5244 res = v;
5245 }
5246 return res;
5247}
5248
5249unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5250{
5251 int64_t v;
5252 unsigned int res;
5253
5254 v = float64_to_int64(a STATUS_VAR);
5255 if (v < 0) {
5256 res = 0;
5257 float_raise( float_flag_invalid STATUS_VAR);
5258 } else if (v > 0xffffffff) {
5259 res = 0xffffffff;
5260 float_raise( float_flag_invalid STATUS_VAR);
5261 } else {
5262 res = v;
5263 }
5264 return res;
5265}
5266
5267unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5268{
5269 int64_t v;
5270 unsigned int res;
5271
5272 v = float64_to_int64_round_to_zero(a STATUS_VAR);
5273 if (v < 0) {
5274 res = 0;
5275 float_raise( float_flag_invalid STATUS_VAR);
5276 } else if (v > 0xffffffff) {
5277 res = 0xffffffff;
5278 float_raise( float_flag_invalid STATUS_VAR);
5279 } else {
5280 res = v;
5281 }
5282 return res;
5283}
5284
5285#define COMPARE(s, nan_exp) \
5286INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
5287 int is_quiet STATUS_PARAM ) \
5288{ \
5289 flag aSign, bSign; \
5290 \
5291 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
5292 extractFloat ## s ## Frac( a ) ) || \
5293 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
5294 extractFloat ## s ## Frac( b ) )) { \
5295 if (!is_quiet || \
5296 float ## s ## _is_signaling_nan( a ) || \
5297 float ## s ## _is_signaling_nan( b ) ) { \
5298 float_raise( float_flag_invalid STATUS_VAR); \
5299 } \
5300 return float_relation_unordered; \
5301 } \
5302 aSign = extractFloat ## s ## Sign( a ); \
5303 bSign = extractFloat ## s ## Sign( b ); \
5304 if ( aSign != bSign ) { \
5305 if ( (bits ## s) ( ( a | b )<<1 ) == 0 ) { \
5306 /* zero case */ \
5307 return float_relation_equal; \
5308 } else { \
5309 return 1 - (2 * aSign); \
5310 } \
5311 } else { \
5312 if (a == b) { \
5313 return float_relation_equal; \
5314 } else { \
5315 return 1 - 2 * (aSign ^ ( a < b )); \
5316 } \
5317 } \
5318} \
5319 \
5320int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
5321{ \
5322 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
5323} \
5324 \
5325int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
5326{ \
5327 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
5328}
5329
5330COMPARE(32, 0xff)
5331COMPARE(64, 0x7ff)
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette