File: /usr/src/linux/arch/arm/nwfpe/softfloat.c
1 /*
2 ===============================================================================
3
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point
5 Arithmetic Package, Release 2.
6
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
15 arithmetic/softfloat.html'.
16
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
18 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
19 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
20 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
21 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
22
23 Derivative works are acceptable, even for commercial purposes, so long as
24 (1) they include prominent notice that the work is derivative, and (2) they
25 include prominent notice akin to these three paragraphs for those parts of
26 this code that are retained.
27
28 ===============================================================================
29 */
30
31 #include "milieu.h"
32 #include "softfloat.h"
33
34 /*
35 -------------------------------------------------------------------------------
36 Floating-point rounding mode, extended double-precision rounding precision,
37 and exception flags.
38 -------------------------------------------------------------------------------
39 */
40 int8 float_rounding_mode = float_round_nearest_even;
41 int8 floatx80_rounding_precision = 80;
42 int8 float_exception_flags;
43
44 /*
45 -------------------------------------------------------------------------------
46 Primitive arithmetic functions, including multi-word arithmetic, and
47 division and square root approximations. (Can be specialized to target if
48 desired.)
49 -------------------------------------------------------------------------------
50 */
51 #include "softfloat-macros"
52
53 /*
54 -------------------------------------------------------------------------------
55 Functions and definitions to determine: (1) whether tininess for underflow
56 is detected before or after rounding by default, (2) what (if anything)
57 happens when exceptions are raised, (3) how signaling NaNs are distinguished
58 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
59 are propagated from function inputs to output. These details are target-
60 specific.
61 -------------------------------------------------------------------------------
62 */
63 #include "softfloat-specialize"
64
65 /*
66 -------------------------------------------------------------------------------
67 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
68 and 7, and returns the properly rounded 32-bit integer corresponding to the
69 input. If `zSign' is nonzero, the input is negated before being converted
70 to an integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point
71 input is simply rounded to an integer, with the inexact exception raised if
72 the input cannot be represented exactly as an integer. If the fixed-point
73 input is too large, however, the invalid exception is raised and the largest
74 positive or negative integer is returned.
75 -------------------------------------------------------------------------------
76 */
77 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
78 {
79 int8 roundingMode;
80 flag roundNearestEven;
81 int8 roundIncrement, roundBits;
82 int32 z;
83
84 roundingMode = float_rounding_mode;
85 roundNearestEven = ( roundingMode == float_round_nearest_even );
86 roundIncrement = 0x40;
87 if ( ! roundNearestEven ) {
88 if ( roundingMode == float_round_to_zero ) {
89 roundIncrement = 0;
90 }
91 else {
92 roundIncrement = 0x7F;
93 if ( zSign ) {
94 if ( roundingMode == float_round_up ) roundIncrement = 0;
95 }
96 else {
97 if ( roundingMode == float_round_down ) roundIncrement = 0;
98 }
99 }
100 }
101 roundBits = absZ & 0x7F;
102 absZ = ( absZ + roundIncrement )>>7;
103 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
104 z = absZ;
105 if ( zSign ) z = - z;
106 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
107 float_exception_flags |= float_flag_invalid;
108 return zSign ? 0x80000000 : 0x7FFFFFFF;
109 }
110 if ( roundBits ) float_exception_flags |= float_flag_inexact;
111 return z;
112
113 }
114
115 /*
116 -------------------------------------------------------------------------------
117 Returns the fraction bits of the single-precision floating-point value `a'.
118 -------------------------------------------------------------------------------
119 */
120 INLINE bits32 extractFloat32Frac( float32 a )
121 {
122
123 return a & 0x007FFFFF;
124
125 }
126
127 /*
128 -------------------------------------------------------------------------------
129 Returns the exponent bits of the single-precision floating-point value `a'.
130 -------------------------------------------------------------------------------
131 */
132 INLINE int16 extractFloat32Exp( float32 a )
133 {
134
135 return ( a>>23 ) & 0xFF;
136
137 }
138
139 /*
140 -------------------------------------------------------------------------------
141 Returns the sign bit of the single-precision floating-point value `a'.
142 -------------------------------------------------------------------------------
143 */
144 INLINE flag extractFloat32Sign( float32 a )
145 {
146
147 return a>>31;
148
149 }
150
151 /*
152 -------------------------------------------------------------------------------
153 Normalizes the subnormal single-precision floating-point value represented
154 by the denormalized significand `aSig'. The normalized exponent and
155 significand are stored at the locations pointed to by `zExpPtr' and
156 `zSigPtr', respectively.
157 -------------------------------------------------------------------------------
158 */
159 static void
160 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
161 {
162 int8 shiftCount;
163
164 shiftCount = countLeadingZeros32( aSig ) - 8;
165 *zSigPtr = aSig<<shiftCount;
166 *zExpPtr = 1 - shiftCount;
167
168 }
169
170 /*
171 -------------------------------------------------------------------------------
172 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
173 single-precision floating-point value, returning the result. After being
174 shifted into the proper positions, the three fields are simply added
175 together to form the result. This means that any integer portion of `zSig'
176 will be added into the exponent. Since a properly normalized significand
177 will have an integer portion equal to 1, the `zExp' input should be 1 less
178 than the desired result exponent whenever `zSig' is a complete, normalized
179 significand.
180 -------------------------------------------------------------------------------
181 */
182 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
183 {
184 #if 0
185 float32 f;
186 __asm__("@ packFloat32;
187 mov %0, %1, asl #31;
188 orr %0, %2, asl #23;
189 orr %0, %3"
190 : /* no outputs */
191 : "g" (f), "g" (zSign), "g" (zExp), "g" (zSig)
192 : "cc");
193 return f;
194 #else
195 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
196 #endif
197 }
198
199 /*
200 -------------------------------------------------------------------------------
201 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
202 and significand `zSig', and returns the proper single-precision floating-
203 point value corresponding to the abstract input. Ordinarily, the abstract
204 value is simply rounded and packed into the single-precision format, with
205 the inexact exception raised if the abstract input cannot be represented
206 exactly. If the abstract value is too large, however, the overflow and
207 inexact exceptions are raised and an infinity or maximal finite value is
208 returned. If the abstract value is too small, the input value is rounded to
209 a subnormal number, and the underflow and inexact exceptions are raised if
210 the abstract input cannot be represented exactly as a subnormal single-
211 precision floating-point number.
212 The input significand `zSig' has its binary point between bits 30
213 and 29, which is 7 bits to the left of the usual location. This shifted
214 significand must be normalized or smaller. If `zSig' is not normalized,
215 `zExp' must be 0; in that case, the result returned is a subnormal number,
216 and it must not require rounding. In the usual case that `zSig' is
217 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
218 The handling of underflow and overflow follows the IEC/IEEE Standard for
219 Binary Floating-point Arithmetic.
220 -------------------------------------------------------------------------------
221 */
222 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
223 {
224 int8 roundingMode;
225 flag roundNearestEven;
226 int8 roundIncrement, roundBits;
227 flag isTiny;
228
229 roundingMode = float_rounding_mode;
230 roundNearestEven = ( roundingMode == float_round_nearest_even );
231 roundIncrement = 0x40;
232 if ( ! roundNearestEven ) {
233 if ( roundingMode == float_round_to_zero ) {
234 roundIncrement = 0;
235 }
236 else {
237 roundIncrement = 0x7F;
238 if ( zSign ) {
239 if ( roundingMode == float_round_up ) roundIncrement = 0;
240 }
241 else {
242 if ( roundingMode == float_round_down ) roundIncrement = 0;
243 }
244 }
245 }
246 roundBits = zSig & 0x7F;
247 if ( 0xFD <= (bits16) zExp ) {
248 if ( ( 0xFD < zExp )
249 || ( ( zExp == 0xFD )
250 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
251 ) {
252 float_raise( float_flag_overflow | float_flag_inexact );
253 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
254 }
255 if ( zExp < 0 ) {
256 isTiny =
257 ( float_detect_tininess == float_tininess_before_rounding )
258 || ( zExp < -1 )
259 || ( zSig + roundIncrement < 0x80000000 );
260 shift32RightJamming( zSig, - zExp, &zSig );
261 zExp = 0;
262 roundBits = zSig & 0x7F;
263 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
264 }
265 }
266 if ( roundBits ) float_exception_flags |= float_flag_inexact;
267 zSig = ( zSig + roundIncrement )>>7;
268 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
269 if ( zSig == 0 ) zExp = 0;
270 return packFloat32( zSign, zExp, zSig );
271
272 }
273
274 /*
275 -------------------------------------------------------------------------------
276 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
277 and significand `zSig', and returns the proper single-precision floating-
278 point value corresponding to the abstract input. This routine is just like
279 `roundAndPackFloat32' except that `zSig' does not have to be normalized in
280 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
281 point exponent.
282 -------------------------------------------------------------------------------
283 */
284 static float32
285 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
286 {
287 int8 shiftCount;
288
289 shiftCount = countLeadingZeros32( zSig ) - 1;
290 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
291
292 }
293
294 /*
295 -------------------------------------------------------------------------------
296 Returns the fraction bits of the double-precision floating-point value `a'.
297 -------------------------------------------------------------------------------
298 */
299 INLINE bits64 extractFloat64Frac( float64 a )
300 {
301
302 return a & LIT64( 0x000FFFFFFFFFFFFF );
303
304 }
305
306 /*
307 -------------------------------------------------------------------------------
308 Returns the exponent bits of the double-precision floating-point value `a'.
309 -------------------------------------------------------------------------------
310 */
311 INLINE int16 extractFloat64Exp( float64 a )
312 {
313
314 return ( a>>52 ) & 0x7FF;
315
316 }
317
318 /*
319 -------------------------------------------------------------------------------
320 Returns the sign bit of the double-precision floating-point value `a'.
321 -------------------------------------------------------------------------------
322 */
323 INLINE flag extractFloat64Sign( float64 a )
324 {
325
326 return a>>63;
327
328 }
329
330 /*
331 -------------------------------------------------------------------------------
332 Normalizes the subnormal double-precision floating-point value represented
333 by the denormalized significand `aSig'. The normalized exponent and
334 significand are stored at the locations pointed to by `zExpPtr' and
335 `zSigPtr', respectively.
336 -------------------------------------------------------------------------------
337 */
338 static void
339 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
340 {
341 int8 shiftCount;
342
343 shiftCount = countLeadingZeros64( aSig ) - 11;
344 *zSigPtr = aSig<<shiftCount;
345 *zExpPtr = 1 - shiftCount;
346
347 }
348
349 /*
350 -------------------------------------------------------------------------------
351 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
352 double-precision floating-point value, returning the result. After being
353 shifted into the proper positions, the three fields are simply added
354 together to form the result. This means that any integer portion of `zSig'
355 will be added into the exponent. Since a properly normalized significand
356 will have an integer portion equal to 1, the `zExp' input should be 1 less
357 than the desired result exponent whenever `zSig' is a complete, normalized
358 significand.
359 -------------------------------------------------------------------------------
360 */
361 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
362 {
363
364 return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
365
366 }
367
368 /*
369 -------------------------------------------------------------------------------
370 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
371 and significand `zSig', and returns the proper double-precision floating-
372 point value corresponding to the abstract input. Ordinarily, the abstract
373 value is simply rounded and packed into the double-precision format, with
374 the inexact exception raised if the abstract input cannot be represented
375 exactly. If the abstract value is too large, however, the overflow and
376 inexact exceptions are raised and an infinity or maximal finite value is
377 returned. If the abstract value is too small, the input value is rounded to
378 a subnormal number, and the underflow and inexact exceptions are raised if
379 the abstract input cannot be represented exactly as a subnormal double-
380 precision floating-point number.
381 The input significand `zSig' has its binary point between bits 62
382 and 61, which is 10 bits to the left of the usual location. This shifted
383 significand must be normalized or smaller. If `zSig' is not normalized,
384 `zExp' must be 0; in that case, the result returned is a subnormal number,
385 and it must not require rounding. In the usual case that `zSig' is
386 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
387 The handling of underflow and overflow follows the IEC/IEEE Standard for
388 Binary Floating-point Arithmetic.
389 -------------------------------------------------------------------------------
390 */
391 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
392 {
393 int8 roundingMode;
394 flag roundNearestEven;
395 int16 roundIncrement, roundBits;
396 flag isTiny;
397
398 roundingMode = float_rounding_mode;
399 roundNearestEven = ( roundingMode == float_round_nearest_even );
400 roundIncrement = 0x200;
401 if ( ! roundNearestEven ) {
402 if ( roundingMode == float_round_to_zero ) {
403 roundIncrement = 0;
404 }
405 else {
406 roundIncrement = 0x3FF;
407 if ( zSign ) {
408 if ( roundingMode == float_round_up ) roundIncrement = 0;
409 }
410 else {
411 if ( roundingMode == float_round_down ) roundIncrement = 0;
412 }
413 }
414 }
415 roundBits = zSig & 0x3FF;
416 if ( 0x7FD <= (bits16) zExp ) {
417 if ( ( 0x7FD < zExp )
418 || ( ( zExp == 0x7FD )
419 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
420 ) {
421 //register int lr = __builtin_return_address(0);
422 //printk("roundAndPackFloat64 called from 0x%08x\n",lr);
423 float_raise( float_flag_overflow | float_flag_inexact );
424 return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
425 }
426 if ( zExp < 0 ) {
427 isTiny =
428 ( float_detect_tininess == float_tininess_before_rounding )
429 || ( zExp < -1 )
430 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
431 shift64RightJamming( zSig, - zExp, &zSig );
432 zExp = 0;
433 roundBits = zSig & 0x3FF;
434 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
435 }
436 }
437 if ( roundBits ) float_exception_flags |= float_flag_inexact;
438 zSig = ( zSig + roundIncrement )>>10;
439 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
440 if ( zSig == 0 ) zExp = 0;
441 return packFloat64( zSign, zExp, zSig );
442
443 }
444
445 /*
446 -------------------------------------------------------------------------------
447 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
448 and significand `zSig', and returns the proper double-precision floating-
449 point value corresponding to the abstract input. This routine is just like
450 `roundAndPackFloat64' except that `zSig' does not have to be normalized in
451 any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
452 point exponent.
453 -------------------------------------------------------------------------------
454 */
455 static float64
456 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
457 {
458 int8 shiftCount;
459
460 shiftCount = countLeadingZeros64( zSig ) - 1;
461 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
462
463 }
464
465 #ifdef FLOATX80
466
467 /*
468 -------------------------------------------------------------------------------
469 Returns the fraction bits of the extended double-precision floating-point
470 value `a'.
471 -------------------------------------------------------------------------------
472 */
473 INLINE bits64 extractFloatx80Frac( floatx80 a )
474 {
475
476 return a.low;
477
478 }
479
480 /*
481 -------------------------------------------------------------------------------
482 Returns the exponent bits of the extended double-precision floating-point
483 value `a'.
484 -------------------------------------------------------------------------------
485 */
486 INLINE int32 extractFloatx80Exp( floatx80 a )
487 {
488
489 return a.high & 0x7FFF;
490
491 }
492
493 /*
494 -------------------------------------------------------------------------------
495 Returns the sign bit of the extended double-precision floating-point value
496 `a'.
497 -------------------------------------------------------------------------------
498 */
499 INLINE flag extractFloatx80Sign( floatx80 a )
500 {
501
502 return a.high>>15;
503
504 }
505
506 /*
507 -------------------------------------------------------------------------------
508 Normalizes the subnormal extended double-precision floating-point value
509 represented by the denormalized significand `aSig'. The normalized exponent
510 and significand are stored at the locations pointed to by `zExpPtr' and
511 `zSigPtr', respectively.
512 -------------------------------------------------------------------------------
513 */
514 static void
515 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
516 {
517 int8 shiftCount;
518
519 shiftCount = countLeadingZeros64( aSig );
520 *zSigPtr = aSig<<shiftCount;
521 *zExpPtr = 1 - shiftCount;
522
523 }
524
525 /*
526 -------------------------------------------------------------------------------
527 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
528 extended double-precision floating-point value, returning the result.
529 -------------------------------------------------------------------------------
530 */
531 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
532 {
533 floatx80 z;
534
535 z.low = zSig;
536 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
537 return z;
538
539 }
540
541 /*
542 -------------------------------------------------------------------------------
543 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
544 and extended significand formed by the concatenation of `zSig0' and `zSig1',
545 and returns the proper extended double-precision floating-point value
546 corresponding to the abstract input. Ordinarily, the abstract value is
547 rounded and packed into the extended double-precision format, with the
548 inexact exception raised if the abstract input cannot be represented
549 exactly. If the abstract value is too large, however, the overflow and
550 inexact exceptions are raised and an infinity or maximal finite value is
551 returned. If the abstract value is too small, the input value is rounded to
552 a subnormal number, and the underflow and inexact exceptions are raised if
553 the abstract input cannot be represented exactly as a subnormal extended
554 double-precision floating-point number.
555 If `roundingPrecision' is 32 or 64, the result is rounded to the same
556 number of bits as single or double precision, respectively. Otherwise, the
557 result is rounded to the full precision of the extended double-precision
558 format.
559 The input significand must be normalized or smaller. If the input
560 significand is not normalized, `zExp' must be 0; in that case, the result
561 returned is a subnormal number, and it must not require rounding. The
562 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
563 Floating-point Arithmetic.
564 -------------------------------------------------------------------------------
565 */
566 static floatx80
567 roundAndPackFloatx80(
568 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
569 )
570 {
571 int8 roundingMode;
572 flag roundNearestEven, increment, isTiny;
573 int64 roundIncrement, roundMask, roundBits;
574
575 roundingMode = float_rounding_mode;
576 roundNearestEven = ( roundingMode == float_round_nearest_even );
577 if ( roundingPrecision == 80 ) goto precision80;
578 if ( roundingPrecision == 64 ) {
579 roundIncrement = LIT64( 0x0000000000000400 );
580 roundMask = LIT64( 0x00000000000007FF );
581 }
582 else if ( roundingPrecision == 32 ) {
583 roundIncrement = LIT64( 0x0000008000000000 );
584 roundMask = LIT64( 0x000000FFFFFFFFFF );
585 }
586 else {
587 goto precision80;
588 }
589 zSig0 |= ( zSig1 != 0 );
590 if ( ! roundNearestEven ) {
591 if ( roundingMode == float_round_to_zero ) {
592 roundIncrement = 0;
593 }
594 else {
595 roundIncrement = roundMask;
596 if ( zSign ) {
597 if ( roundingMode == float_round_up ) roundIncrement = 0;
598 }
599 else {
600 if ( roundingMode == float_round_down ) roundIncrement = 0;
601 }
602 }
603 }
604 roundBits = zSig0 & roundMask;
605 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
606 if ( ( 0x7FFE < zExp )
607 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
608 ) {
609 goto overflow;
610 }
611 if ( zExp <= 0 ) {
612 isTiny =
613 ( float_detect_tininess == float_tininess_before_rounding )
614 || ( zExp < 0 )
615 || ( zSig0 <= zSig0 + roundIncrement );
616 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
617 zExp = 0;
618 roundBits = zSig0 & roundMask;
619 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
620 if ( roundBits ) float_exception_flags |= float_flag_inexact;
621 zSig0 += roundIncrement;
622 if ( (sbits64) zSig0 < 0 ) zExp = 1;
623 roundIncrement = roundMask + 1;
624 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
625 roundMask |= roundIncrement;
626 }
627 zSig0 &= ~ roundMask;
628 return packFloatx80( zSign, zExp, zSig0 );
629 }
630 }
631 if ( roundBits ) float_exception_flags |= float_flag_inexact;
632 zSig0 += roundIncrement;
633 if ( zSig0 < roundIncrement ) {
634 ++zExp;
635 zSig0 = LIT64( 0x8000000000000000 );
636 }
637 roundIncrement = roundMask + 1;
638 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
639 roundMask |= roundIncrement;
640 }
641 zSig0 &= ~ roundMask;
642 if ( zSig0 == 0 ) zExp = 0;
643 return packFloatx80( zSign, zExp, zSig0 );
644 precision80:
645 increment = ( (sbits64) zSig1 < 0 );
646 if ( ! roundNearestEven ) {
647 if ( roundingMode == float_round_to_zero ) {
648 increment = 0;
649 }
650 else {
651 if ( zSign ) {
652 increment = ( roundingMode == float_round_down ) && zSig1;
653 }
654 else {
655 increment = ( roundingMode == float_round_up ) && zSig1;
656 }
657 }
658 }
659 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
660 if ( ( 0x7FFE < zExp )
661 || ( ( zExp == 0x7FFE )
662 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
663 && increment
664 )
665 ) {
666 roundMask = 0;
667 overflow:
668 float_raise( float_flag_overflow | float_flag_inexact );
669 if ( ( roundingMode == float_round_to_zero )
670 || ( zSign && ( roundingMode == float_round_up ) )
671 || ( ! zSign && ( roundingMode == float_round_down ) )
672 ) {
673 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
674 }
675 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
676 }
677 if ( zExp <= 0 ) {
678 isTiny =
679 ( float_detect_tininess == float_tininess_before_rounding )
680 || ( zExp < 0 )
681 || ! increment
682 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
683 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
684 zExp = 0;
685 if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
686 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
687 if ( roundNearestEven ) {
688 increment = ( (sbits64) zSig1 < 0 );
689 }
690 else {
691 if ( zSign ) {
692 increment = ( roundingMode == float_round_down ) && zSig1;
693 }
694 else {
695 increment = ( roundingMode == float_round_up ) && zSig1;
696 }
697 }
698 if ( increment ) {
699 ++zSig0;
700 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
701 if ( (sbits64) zSig0 < 0 ) zExp = 1;
702 }
703 return packFloatx80( zSign, zExp, zSig0 );
704 }
705 }
706 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
707 if ( increment ) {
708 ++zSig0;
709 if ( zSig0 == 0 ) {
710 ++zExp;
711 zSig0 = LIT64( 0x8000000000000000 );
712 }
713 else {
714 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
715 }
716 }
717 else {
718 if ( zSig0 == 0 ) zExp = 0;
719 }
720
721 return packFloatx80( zSign, zExp, zSig0 );
722 }
723
724 /*
725 -------------------------------------------------------------------------------
726 Takes an abstract floating-point value having sign `zSign', exponent
727 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
728 and returns the proper extended double-precision floating-point value
729 corresponding to the abstract input. This routine is just like
730 `roundAndPackFloatx80' except that the input significand does not have to be
731 normalized.
732 -------------------------------------------------------------------------------
733 */
734 static floatx80
735 normalizeRoundAndPackFloatx80(
736 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
737 )
738 {
739 int8 shiftCount;
740
741 if ( zSig0 == 0 ) {
742 zSig0 = zSig1;
743 zSig1 = 0;
744 zExp -= 64;
745 }
746 shiftCount = countLeadingZeros64( zSig0 );
747 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
748 zExp -= shiftCount;
749 return
750 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
751
752 }
753
754 #endif
755
756 #ifdef FLOAT128
757
758 /*
759 -------------------------------------------------------------------------------
760 Returns the least-significant 64 fraction bits of the quadruple-precision
761 floating-point value `a'.
762 -------------------------------------------------------------------------------
763 */
764 INLINE bits64 extractFloat128Frac1( float128 a )
765 {
766
767 return a.low;
768
769 }
770
771 /*
772 -------------------------------------------------------------------------------
773 Returns the most-significant 48 fraction bits of the quadruple-precision
774 floating-point value `a'.
775 -------------------------------------------------------------------------------
776 */
777 INLINE bits64 extractFloat128Frac0( float128 a )
778 {
779
780 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
781
782 }
783
784 /*
785 -------------------------------------------------------------------------------
786 Returns the exponent bits of the quadruple-precision floating-point value
787 `a'.
788 -------------------------------------------------------------------------------
789 */
790 INLINE int32 extractFloat128Exp( float128 a )
791 {
792
793 return ( a.high>>48 ) & 0x7FFF;
794
795 }
796
797 /*
798 -------------------------------------------------------------------------------
799 Returns the sign bit of the quadruple-precision floating-point value `a'.
800 -------------------------------------------------------------------------------
801 */
802 INLINE flag extractFloat128Sign( float128 a )
803 {
804
805 return a.high>>63;
806
807 }
808
809 /*
810 -------------------------------------------------------------------------------
811 Normalizes the subnormal quadruple-precision floating-point value
812 represented by the denormalized significand formed by the concatenation of
813 `aSig0' and `aSig1'. The normalized exponent is stored at the location
814 pointed to by `zExpPtr'. The most significant 49 bits of the normalized
815 significand are stored at the location pointed to by `zSig0Ptr', and the
816 least significant 64 bits of the normalized significand are stored at the
817 location pointed to by `zSig1Ptr'.
818 -------------------------------------------------------------------------------
819 */
820 static void
821 normalizeFloat128Subnormal(
822 bits64 aSig0,
823 bits64 aSig1,
824 int32 *zExpPtr,
825 bits64 *zSig0Ptr,
826 bits64 *zSig1Ptr
827 )
828 {
829 int8 shiftCount;
830
831 if ( aSig0 == 0 ) {
832 shiftCount = countLeadingZeros64( aSig1 ) - 15;
833 if ( shiftCount < 0 ) {
834 *zSig0Ptr = aSig1>>( - shiftCount );
835 *zSig1Ptr = aSig1<<( shiftCount & 63 );
836 }
837 else {
838 *zSig0Ptr = aSig1<<shiftCount;
839 *zSig1Ptr = 0;
840 }
841 *zExpPtr = - shiftCount - 63;
842 }
843 else {
844 shiftCount = countLeadingZeros64( aSig0 ) - 15;
845 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
846 *zExpPtr = 1 - shiftCount;
847 }
848
849 }
850
851 /*
852 -------------------------------------------------------------------------------
853 Packs the sign `zSign', the exponent `zExp', and the significand formed
854 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
855 floating-point value, returning the result. After being shifted into the
856 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
857 added together to form the most significant 32 bits of the result. This
858 means that any integer portion of `zSig0' will be added into the exponent.
859 Since a properly normalized significand will have an integer portion equal
860 to 1, the `zExp' input should be 1 less than the desired result exponent
861 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
862 significand.
863 -------------------------------------------------------------------------------
864 */
865 INLINE float128
866 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
867 {
868 float128 z;
869
870 z.low = zSig1;
871 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
872 return z;
873
874 }
875
876 /*
877 -------------------------------------------------------------------------------
878 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
879 and extended significand formed by the concatenation of `zSig0', `zSig1',
880 and `zSig2', and returns the proper quadruple-precision floating-point value
881 corresponding to the abstract input. Ordinarily, the abstract value is
882 simply rounded and packed into the quadruple-precision format, with the
883 inexact exception raised if the abstract input cannot be represented
884 exactly. If the abstract value is too large, however, the overflow and
885 inexact exceptions are raised and an infinity or maximal finite value is
886 returned. If the abstract value is too small, the input value is rounded to
887 a subnormal number, and the underflow and inexact exceptions are raised if
888 the abstract input cannot be represented exactly as a subnormal quadruple-
889 precision floating-point number.
890 The input significand must be normalized or smaller. If the input
891 significand is not normalized, `zExp' must be 0; in that case, the result
892 returned is a subnormal number, and it must not require rounding. In the
893 usual case that the input significand is normalized, `zExp' must be 1 less
894 than the ``true'' floating-point exponent. The handling of underflow and
895 overflow follows the IEC/IEEE Standard for Binary Floating-point Arithmetic.
896 -------------------------------------------------------------------------------
897 */
898 static float128
899 roundAndPackFloat128(
900 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
901 {
902 int8 roundingMode;
903 flag roundNearestEven, increment, isTiny;
904
905 roundingMode = float_rounding_mode;
906 roundNearestEven = ( roundingMode == float_round_nearest_even );
907 increment = ( (sbits64) zSig2 < 0 );
908 if ( ! roundNearestEven ) {
909 if ( roundingMode == float_round_to_zero ) {
910 increment = 0;
911 }
912 else {
913 if ( zSign ) {
914 increment = ( roundingMode == float_round_down ) && zSig2;
915 }
916 else {
917 increment = ( roundingMode == float_round_up ) && zSig2;
918 }
919 }
920 }
921 if ( 0x7FFD <= (bits32) zExp ) {
922 if ( ( 0x7FFD < zExp )
923 || ( ( zExp == 0x7FFD )
924 && eq128(
925 LIT64( 0x0001FFFFFFFFFFFF ),
926 LIT64( 0xFFFFFFFFFFFFFFFF ),
927 zSig0,
928 zSig1
929 )
930 && increment
931 )
932 ) {
933 float_raise( float_flag_overflow | float_flag_inexact );
934 if ( ( roundingMode == float_round_to_zero )
935 || ( zSign && ( roundingMode == float_round_up ) )
936 || ( ! zSign && ( roundingMode == float_round_down ) )
937 ) {
938 return
939 packFloat128(
940 zSign,
941 0x7FFE,
942 LIT64( 0x0000FFFFFFFFFFFF ),
943 LIT64( 0xFFFFFFFFFFFFFFFF )
944 );
945 }
946 return packFloat128( zSign, 0x7FFF, 0, 0 );
947 }
948 if ( zExp < 0 ) {
949 isTiny =
950 ( float_detect_tininess == float_tininess_before_rounding )
951 || ( zExp < -1 )
952 || ! increment
953 || lt128(
954 zSig0,
955 zSig1,
956 LIT64( 0x0001FFFFFFFFFFFF ),
957 LIT64( 0xFFFFFFFFFFFFFFFF )
958 );
959 shift128ExtraRightJamming(
960 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
961 zExp = 0;
962 if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
963 if ( roundNearestEven ) {
964 increment = ( (sbits64) zSig2 < 0 );
965 }
966 else {
967 if ( zSign ) {
968 increment = ( roundingMode == float_round_down ) && zSig2;
969 }
970 else {
971 increment = ( roundingMode == float_round_up ) && zSig2;
972 }
973 }
974 }
975 }
976 if ( zSig2 ) float_exception_flags |= float_flag_inexact;
977 if ( increment ) {
978 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
979 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
980 }
981 else {
982 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
983 }
984 return packFloat128( zSign, zExp, zSig0, zSig1 );
985
986 }
987
988 /*
989 -------------------------------------------------------------------------------
990 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
991 and significand formed by the concatenation of `zSig0' and `zSig1', and
992 returns the proper quadruple-precision floating-point value corresponding to
993 the abstract input. This routine is just like `roundAndPackFloat128' except
994 that the input significand has fewer bits and does not have to be normalized
995 in any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
996 point exponent.
997 -------------------------------------------------------------------------------
998 */
999 static float128
1000 normalizeRoundAndPackFloat128(
1001 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1002 {
1003 int8 shiftCount;
1004 bits64 zSig2;
1005
1006 if ( zSig0 == 0 ) {
1007 zSig0 = zSig1;
1008 zSig1 = 0;
1009 zExp -= 64;
1010 }
1011 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1012 if ( 0 <= shiftCount ) {
1013 zSig2 = 0;
1014 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1015 }
1016 else {
1017 shift128ExtraRightJamming(
1018 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1019 }
1020 zExp -= shiftCount;
1021 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1022
1023 }
1024
1025 #endif
1026
1027 /*
1028 -------------------------------------------------------------------------------
1029 Returns the result of converting the 32-bit two's complement integer `a' to
1030 the single-precision floating-point format. The conversion is performed
1031 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1032 -------------------------------------------------------------------------------
1033 */
1034 float32 int32_to_float32( int32 a )
1035 {
1036 flag zSign;
1037
1038 if ( a == 0 ) return 0;
1039 if ( a == 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1040 zSign = ( a < 0 );
1041 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1042
1043 }
1044
1045 /*
1046 -------------------------------------------------------------------------------
1047 Returns the result of converting the 32-bit two's complement integer `a' to
1048 the double-precision floating-point format. The conversion is performed
1049 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1050 -------------------------------------------------------------------------------
1051 */
1052 float64 int32_to_float64( int32 a )
1053 {
1054 flag aSign;
1055 uint32 absA;
1056 int8 shiftCount;
1057 bits64 zSig;
1058
1059 if ( a == 0 ) return 0;
1060 aSign = ( a < 0 );
1061 absA = aSign ? - a : a;
1062 shiftCount = countLeadingZeros32( absA ) + 21;
1063 zSig = absA;
1064 return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount );
1065
1066 }
1067
1068 #ifdef FLOATX80
1069
1070 /*
1071 -------------------------------------------------------------------------------
1072 Returns the result of converting the 32-bit two's complement integer `a'
1073 to the extended double-precision floating-point format. The conversion
1074 is performed according to the IEC/IEEE Standard for Binary Floating-point
1075 Arithmetic.
1076 -------------------------------------------------------------------------------
1077 */
1078 floatx80 int32_to_floatx80( int32 a )
1079 {
1080 flag zSign;
1081 uint32 absA;
1082 int8 shiftCount;
1083 bits64 zSig;
1084
1085 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1086 zSign = ( a < 0 );
1087 absA = zSign ? - a : a;
1088 shiftCount = countLeadingZeros32( absA ) + 32;
1089 zSig = absA;
1090 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1091
1092 }
1093
1094 #endif
1095
1096 #ifdef FLOAT128
1097
1098 /*
1099 -------------------------------------------------------------------------------
1100 Returns the result of converting the 32-bit two's complement integer `a' to
1101 the quadruple-precision floating-point format. The conversion is performed
1102 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1103 -------------------------------------------------------------------------------
1104 */
1105 float128 int32_to_float128( int32 a )
1106 {
1107 flag zSign;
1108 uint32 absA;
1109 int8 shiftCount;
1110 bits64 zSig0;
1111
1112 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1113 zSign = ( a < 0 );
1114 absA = zSign ? - a : a;
1115 shiftCount = countLeadingZeros32( absA ) + 17;
1116 zSig0 = absA;
1117 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1118
1119 }
1120
1121 #endif
1122
1123 /*
1124 -------------------------------------------------------------------------------
1125 Returns the result of converting the single-precision floating-point value
1126 `a' to the 32-bit two's complement integer format. The conversion is
1127 performed according to the IEC/IEEE Standard for Binary Floating-point
1128 Arithmetic---which means in particular that the conversion is rounded
1129 according to the current rounding mode. If `a' is a NaN, the largest
1130 positive integer is returned. Otherwise, if the conversion overflows, the
1131 largest integer with the same sign as `a' is returned.
1132 -------------------------------------------------------------------------------
1133 */
1134 int32 float32_to_int32( float32 a )
1135 {
1136 flag aSign;
1137 int16 aExp, shiftCount;
1138 bits32 aSig;
1139 bits64 zSig;
1140
1141 aSig = extractFloat32Frac( a );
1142 aExp = extractFloat32Exp( a );
1143 aSign = extractFloat32Sign( a );
1144 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1145 if ( aExp ) aSig |= 0x00800000;
1146 shiftCount = 0xAF - aExp;
1147 zSig = aSig;
1148 zSig <<= 32;
1149 if ( 0 < shiftCount ) shift64RightJamming( zSig, shiftCount, &zSig );
1150 return roundAndPackInt32( aSign, zSig );
1151
1152 }
1153
1154 /*
1155 -------------------------------------------------------------------------------
1156 Returns the result of converting the single-precision floating-point value
1157 `a' to the 32-bit two's complement integer format. The conversion is
1158 performed according to the IEC/IEEE Standard for Binary Floating-point
1159 Arithmetic, except that the conversion is always rounded toward zero. If
1160 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1161 conversion overflows, the largest integer with the same sign as `a' is
1162 returned.
1163 -------------------------------------------------------------------------------
1164 */
1165 int32 float32_to_int32_round_to_zero( float32 a )
1166 {
1167 flag aSign;
1168 int16 aExp, shiftCount;
1169 bits32 aSig;
1170 int32 z;
1171
1172 aSig = extractFloat32Frac( a );
1173 aExp = extractFloat32Exp( a );
1174 aSign = extractFloat32Sign( a );
1175 shiftCount = aExp - 0x9E;
1176 if ( 0 <= shiftCount ) {
1177 if ( a == 0xCF000000 ) return 0x80000000;
1178 float_raise( float_flag_invalid );
1179 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1180 return 0x80000000;
1181 }
1182 else if ( aExp <= 0x7E ) {
1183 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1184 return 0;
1185 }
1186 aSig = ( aSig | 0x00800000 )<<8;
1187 z = aSig>>( - shiftCount );
1188 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1189 float_exception_flags |= float_flag_inexact;
1190 }
1191 return aSign ? - z : z;
1192
1193 }
1194
1195 /*
1196 -------------------------------------------------------------------------------
1197 Returns the result of converting the single-precision floating-point value
1198 `a' to the double-precision floating-point format. The conversion is
1199 performed according to the IEC/IEEE Standard for Binary Floating-point
1200 Arithmetic.
1201 -------------------------------------------------------------------------------
1202 */
1203 float64 float32_to_float64( float32 a )
1204 {
1205 flag aSign;
1206 int16 aExp;
1207 bits32 aSig;
1208
1209 aSig = extractFloat32Frac( a );
1210 aExp = extractFloat32Exp( a );
1211 aSign = extractFloat32Sign( a );
1212 if ( aExp == 0xFF ) {
1213 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1214 return packFloat64( aSign, 0x7FF, 0 );
1215 }
1216 if ( aExp == 0 ) {
1217 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1218 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1219 --aExp;
1220 }
1221 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1222
1223 }
1224
1225 #ifdef FLOATX80
1226
1227 /*
1228 -------------------------------------------------------------------------------
1229 Returns the result of converting the single-precision floating-point value
1230 `a' to the extended double-precision floating-point format. The conversion
1231 is performed according to the IEC/IEEE Standard for Binary Floating-point
1232 Arithmetic.
1233 -------------------------------------------------------------------------------
1234 */
1235 floatx80 float32_to_floatx80( float32 a )
1236 {
1237 flag aSign;
1238 int16 aExp;
1239 bits32 aSig;
1240
1241 aSig = extractFloat32Frac( a );
1242 aExp = extractFloat32Exp( a );
1243 aSign = extractFloat32Sign( a );
1244 if ( aExp == 0xFF ) {
1245 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1246 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1247 }
1248 if ( aExp == 0 ) {
1249 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1250 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1251 }
1252 aSig |= 0x00800000;
1253 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1254
1255 }
1256
1257 #endif
1258
1259 #ifdef FLOAT128
1260
1261 /*
1262 -------------------------------------------------------------------------------
1263 Returns the result of converting the single-precision floating-point value
1264 `a' to the double-precision floating-point format. The conversion is
1265 performed according to the IEC/IEEE Standard for Binary Floating-point
1266 Arithmetic.
1267 -------------------------------------------------------------------------------
1268 */
1269 float128 float32_to_float128( float32 a )
1270 {
1271 flag aSign;
1272 int16 aExp;
1273 bits32 aSig;
1274
1275 aSig = extractFloat32Frac( a );
1276 aExp = extractFloat32Exp( a );
1277 aSign = extractFloat32Sign( a );
1278 if ( aExp == 0xFF ) {
1279 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1280 return packFloat128( aSign, 0x7FFF, 0, 0 );
1281 }
1282 if ( aExp == 0 ) {
1283 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1284 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1285 --aExp;
1286 }
1287 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1288
1289 }
1290
1291 #endif
1292
1293 /*
1294 -------------------------------------------------------------------------------
1295 Rounds the single-precision floating-point value `a' to an integer, and
1296 returns the result as a single-precision floating-point value. The
1297 operation is performed according to the IEC/IEEE Standard for Binary
1298 Floating-point Arithmetic.
1299 -------------------------------------------------------------------------------
1300 */
1301 float32 float32_round_to_int( float32 a )
1302 {
1303 flag aSign;
1304 int16 aExp;
1305 bits32 lastBitMask, roundBitsMask;
1306 int8 roundingMode;
1307 float32 z;
1308
1309 aExp = extractFloat32Exp( a );
1310 if ( 0x96 <= aExp ) {
1311 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1312 return propagateFloat32NaN( a, a );
1313 }
1314 return a;
1315 }
1316 if ( aExp <= 0x7E ) {
1317 if ( (bits32) ( a<<1 ) == 0 ) return a;
1318 float_exception_flags |= float_flag_inexact;
1319 aSign = extractFloat32Sign( a );
1320 switch ( float_rounding_mode ) {
1321 case float_round_nearest_even:
1322 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1323 return packFloat32( aSign, 0x7F, 0 );
1324 }
1325 break;
1326 case float_round_down:
1327 return aSign ? 0xBF800000 : 0;
1328 case float_round_up:
1329 return aSign ? 0x80000000 : 0x3F800000;
1330 }
1331 return packFloat32( aSign, 0, 0 );
1332 }
1333 lastBitMask = 1;
1334 lastBitMask <<= 0x96 - aExp;
1335 roundBitsMask = lastBitMask - 1;
1336 z = a;
1337 roundingMode = float_rounding_mode;
1338 if ( roundingMode == float_round_nearest_even ) {
1339 z += lastBitMask>>1;
1340 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1341 }
1342 else if ( roundingMode != float_round_to_zero ) {
1343 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1344 z += roundBitsMask;
1345 }
1346 }
1347 z &= ~ roundBitsMask;
1348 if ( z != a ) float_exception_flags |= float_flag_inexact;
1349 return z;
1350
1351 }
1352
1353 /*
1354 -------------------------------------------------------------------------------
1355 Returns the result of adding the absolute values of the single-precision
1356 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1357 before being returned. `zSign' is ignored if the result is a NaN. The
1358 addition is performed according to the IEC/IEEE Standard for Binary
1359 Floating-point Arithmetic.
1360 -------------------------------------------------------------------------------
1361 */
1362 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1363 {
1364 int16 aExp, bExp, zExp;
1365 bits32 aSig, bSig, zSig;
1366 int16 expDiff;
1367
1368 aSig = extractFloat32Frac( a );
1369 aExp = extractFloat32Exp( a );
1370 bSig = extractFloat32Frac( b );
1371 bExp = extractFloat32Exp( b );
1372 expDiff = aExp - bExp;
1373 aSig <<= 6;
1374 bSig <<= 6;
1375 if ( 0 < expDiff ) {
1376 if ( aExp == 0xFF ) {
1377 if ( aSig ) return propagateFloat32NaN( a, b );
1378 return a;
1379 }
1380 if ( bExp == 0 ) {
1381 --expDiff;
1382 }
1383 else {
1384 bSig |= 0x20000000;
1385 }
1386 shift32RightJamming( bSig, expDiff, &bSig );
1387 zExp = aExp;
1388 }
1389 else if ( expDiff < 0 ) {
1390 if ( bExp == 0xFF ) {
1391 if ( bSig ) return propagateFloat32NaN( a, b );
1392 return packFloat32( zSign, 0xFF, 0 );
1393 }
1394 if ( aExp == 0 ) {
1395 ++expDiff;
1396 }
1397 else {
1398 aSig |= 0x20000000;
1399 }
1400 shift32RightJamming( aSig, - expDiff, &aSig );
1401 zExp = bExp;
1402 }
1403 else {
1404 if ( aExp == 0xFF ) {
1405 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1406 return a;
1407 }
1408 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1409 zSig = 0x40000000 + aSig + bSig;
1410 zExp = aExp;
1411 goto roundAndPack;
1412 }
1413 aSig |= 0x20000000;
1414 zSig = ( aSig + bSig )<<1;
1415 --zExp;
1416 if ( (sbits32) zSig < 0 ) {
1417 zSig = aSig + bSig;
1418 ++zExp;
1419 }
1420 roundAndPack:
1421 return roundAndPackFloat32( zSign, zExp, zSig );
1422
1423 }
1424
1425 /*
1426 -------------------------------------------------------------------------------
1427 Returns the result of subtracting the absolute values of the single-
1428 precision floating-point values `a' and `b'. If `zSign' is true, the
1429 difference is negated before being returned. `zSign' is ignored if the
1430 result is a NaN. The subtraction is performed according to the IEC/IEEE
1431 Standard for Binary Floating-point Arithmetic.
1432 -------------------------------------------------------------------------------
1433 */
1434 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1435 {
1436 int16 aExp, bExp, zExp;
1437 bits32 aSig, bSig, zSig;
1438 int16 expDiff;
1439
1440 aSig = extractFloat32Frac( a );
1441 aExp = extractFloat32Exp( a );
1442 bSig = extractFloat32Frac( b );
1443 bExp = extractFloat32Exp( b );
1444 expDiff = aExp - bExp;
1445 aSig <<= 7;
1446 bSig <<= 7;
1447 if ( 0 < expDiff ) goto aExpBigger;
1448 if ( expDiff < 0 ) goto bExpBigger;
1449 if ( aExp == 0xFF ) {
1450 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1451 float_raise( float_flag_invalid );
1452 return float32_default_nan;
1453 }
1454 if ( aExp == 0 ) {
1455 aExp = 1;
1456 bExp = 1;
1457 }
1458 if ( bSig < aSig ) goto aBigger;
1459 if ( aSig < bSig ) goto bBigger;
1460 return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1461 bExpBigger:
1462 if ( bExp == 0xFF ) {
1463 if ( bSig ) return propagateFloat32NaN( a, b );
1464 return packFloat32( zSign ^ 1, 0xFF, 0 );
1465 }
1466 if ( aExp == 0 ) {
1467 ++expDiff;
1468 }
1469 else {
1470 aSig |= 0x40000000;
1471 }
1472 shift32RightJamming( aSig, - expDiff, &aSig );
1473 bSig |= 0x40000000;
1474 bBigger:
1475 zSig = bSig - aSig;
1476 zExp = bExp;
1477 zSign ^= 1;
1478 goto normalizeRoundAndPack;
1479 aExpBigger:
1480 if ( aExp == 0xFF ) {
1481 if ( aSig ) return propagateFloat32NaN( a, b );
1482 return a;
1483 }
1484 if ( bExp == 0 ) {
1485 --expDiff;
1486 }
1487 else {
1488 bSig |= 0x40000000;
1489 }
1490 shift32RightJamming( bSig, expDiff, &bSig );
1491 aSig |= 0x40000000;
1492 aBigger:
1493 zSig = aSig - bSig;
1494 zExp = aExp;
1495 normalizeRoundAndPack:
1496 --zExp;
1497 return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1498
1499 }
1500
1501 /*
1502 -------------------------------------------------------------------------------
1503 Returns the result of adding the single-precision floating-point values `a'
1504 and `b'. The operation is performed according to the IEC/IEEE Standard for
1505 Binary Floating-point Arithmetic.
1506 -------------------------------------------------------------------------------
1507 */
1508 float32 float32_add( float32 a, float32 b )
1509 {
1510 flag aSign, bSign;
1511
1512 aSign = extractFloat32Sign( a );
1513 bSign = extractFloat32Sign( b );
1514 if ( aSign == bSign ) {
1515 return addFloat32Sigs( a, b, aSign );
1516 }
1517 else {
1518 return subFloat32Sigs( a, b, aSign );
1519 }
1520
1521 }
1522
1523 /*
1524 -------------------------------------------------------------------------------
1525 Returns the result of subtracting the single-precision floating-point values
1526 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1527 for Binary Floating-point Arithmetic.
1528 -------------------------------------------------------------------------------
1529 */
1530 float32 float32_sub( float32 a, float32 b )
1531 {
1532 flag aSign, bSign;
1533
1534 aSign = extractFloat32Sign( a );
1535 bSign = extractFloat32Sign( b );
1536 if ( aSign == bSign ) {
1537 return subFloat32Sigs( a, b, aSign );
1538 }
1539 else {
1540 return addFloat32Sigs( a, b, aSign );
1541 }
1542
1543 }
1544
1545 /*
1546 -------------------------------------------------------------------------------
1547 Returns the result of multiplying the single-precision floating-point values
1548 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1549 for Binary Floating-point Arithmetic.
1550 -------------------------------------------------------------------------------
1551 */
1552 float32 float32_mul( float32 a, float32 b )
1553 {
1554 flag aSign, bSign, zSign;
1555 int16 aExp, bExp, zExp;
1556 bits32 aSig, bSig;
1557 bits64 zSig64;
1558 bits32 zSig;
1559
1560 aSig = extractFloat32Frac( a );
1561 aExp = extractFloat32Exp( a );
1562 aSign = extractFloat32Sign( a );
1563 bSig = extractFloat32Frac( b );
1564 bExp = extractFloat32Exp( b );
1565 bSign = extractFloat32Sign( b );
1566 zSign = aSign ^ bSign;
1567 if ( aExp == 0xFF ) {
1568 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1569 return propagateFloat32NaN( a, b );
1570 }
1571 if ( ( bExp | bSig ) == 0 ) {
1572 float_raise( float_flag_invalid );
1573 return float32_default_nan;
1574 }
1575 return packFloat32( zSign, 0xFF, 0 );
1576 }
1577 if ( bExp == 0xFF ) {
1578 if ( bSig ) return propagateFloat32NaN( a, b );
1579 if ( ( aExp | aSig ) == 0 ) {
1580 float_raise( float_flag_invalid );
1581 return float32_default_nan;
1582 }
1583 return packFloat32( zSign, 0xFF, 0 );
1584 }
1585 if ( aExp == 0 ) {
1586 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1587 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1588 }
1589 if ( bExp == 0 ) {
1590 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1591 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1592 }
1593 zExp = aExp + bExp - 0x7F;
1594 aSig = ( aSig | 0x00800000 )<<7;
1595 bSig = ( bSig | 0x00800000 )<<8;
1596 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1597 zSig = zSig64;
1598 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1599 zSig <<= 1;
1600 --zExp;
1601 }
1602 return roundAndPackFloat32( zSign, zExp, zSig );
1603
1604 }
1605
1606 /*
1607 -------------------------------------------------------------------------------
1608 Returns the result of dividing the single-precision floating-point value `a'
1609 by the corresponding value `b'. The operation is performed according to the
1610 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1611 -------------------------------------------------------------------------------
1612 */
1613 float32 float32_div( float32 a, float32 b )
1614 {
1615 flag aSign, bSign, zSign;
1616 int16 aExp, bExp, zExp;
1617 bits32 aSig, bSig, zSig;
1618
1619 aSig = extractFloat32Frac( a );
1620 aExp = extractFloat32Exp( a );
1621 aSign = extractFloat32Sign( a );
1622 bSig = extractFloat32Frac( b );
1623 bExp = extractFloat32Exp( b );
1624 bSign = extractFloat32Sign( b );
1625 zSign = aSign ^ bSign;
1626 if ( aExp == 0xFF ) {
1627 if ( aSig ) return propagateFloat32NaN( a, b );
1628 if ( bExp == 0xFF ) {
1629 if ( bSig ) return propagateFloat32NaN( a, b );
1630 float_raise( float_flag_invalid );
1631 return float32_default_nan;
1632 }
1633 return packFloat32( zSign, 0xFF, 0 );
1634 }
1635 if ( bExp == 0xFF ) {
1636 if ( bSig ) return propagateFloat32NaN( a, b );
1637 return packFloat32( zSign, 0, 0 );
1638 }
1639 if ( bExp == 0 ) {
1640 if ( bSig == 0 ) {
1641 if ( ( aExp | aSig ) == 0 ) {
1642 float_raise( float_flag_invalid );
1643 return float32_default_nan;
1644 }
1645 float_raise( float_flag_divbyzero );
1646 return packFloat32( zSign, 0xFF, 0 );
1647 }
1648 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1649 }
1650 if ( aExp == 0 ) {
1651 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1652 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1653 }
1654 zExp = aExp - bExp + 0x7D;
1655 aSig = ( aSig | 0x00800000 )<<7;
1656 bSig = ( bSig | 0x00800000 )<<8;
1657 if ( bSig <= ( aSig + aSig ) ) {
1658 aSig >>= 1;
1659 ++zExp;
1660 }
1661 zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1662 if ( ( zSig & 0x3F ) == 0 ) {
1663 zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 );
1664 }
1665 return roundAndPackFloat32( zSign, zExp, zSig );
1666
1667 }
1668
1669 /*
1670 -------------------------------------------------------------------------------
1671 Returns the remainder of the single-precision floating-point value `a'
1672 with respect to the corresponding value `b'. The operation is performed
1673 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1674 -------------------------------------------------------------------------------
1675 */
1676 float32 float32_rem( float32 a, float32 b )
1677 {
1678 flag aSign, bSign, zSign;
1679 int16 aExp, bExp, expDiff;
1680 bits32 aSig, bSig;
1681 bits32 q;
1682 bits64 aSig64, bSig64, q64;
1683 bits32 alternateASig;
1684 sbits32 sigMean;
1685
1686 aSig = extractFloat32Frac( a );
1687 aExp = extractFloat32Exp( a );
1688 aSign = extractFloat32Sign( a );
1689 bSig = extractFloat32Frac( b );
1690 bExp = extractFloat32Exp( b );
1691 bSign = extractFloat32Sign( b );
1692 if ( aExp == 0xFF ) {
1693 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1694 return propagateFloat32NaN( a, b );
1695 }
1696 float_raise( float_flag_invalid );
1697 return float32_default_nan;
1698 }
1699 if ( bExp == 0xFF ) {
1700 if ( bSig ) return propagateFloat32NaN( a, b );
1701 return a;
1702 }
1703 if ( bExp == 0 ) {
1704 if ( bSig == 0 ) {
1705 float_raise( float_flag_invalid );
1706 return float32_default_nan;
1707 }
1708 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1709 }
1710 if ( aExp == 0 ) {
1711 if ( aSig == 0 ) return a;
1712 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1713 }
1714 expDiff = aExp - bExp;
1715 aSig |= 0x00800000;
1716 bSig |= 0x00800000;
1717 if ( expDiff < 32 ) {
1718 aSig <<= 8;
1719 bSig <<= 8;
1720 if ( expDiff < 0 ) {
1721 if ( expDiff < -1 ) return a;
1722 aSig >>= 1;
1723 }
1724 q = ( bSig <= aSig );
1725 if ( q ) aSig -= bSig;
1726 if ( 0 < expDiff ) {
1727 q = ( ( (bits64) aSig )<<32 ) / bSig;
1728 q >>= 32 - expDiff;
1729 bSig >>= 2;
1730 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1731 }
1732 else {
1733 aSig >>= 2;
1734 bSig >>= 2;
1735 }
1736 }
1737 else {
1738 if ( bSig <= aSig ) aSig -= bSig;
1739 aSig64 = ( (bits64) aSig )<<40;
1740 bSig64 = ( (bits64) bSig )<<40;
1741 expDiff -= 64;
1742 while ( 0 < expDiff ) {
1743 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1744 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1745 aSig64 = - ( ( bSig * q64 )<<38 );
1746 expDiff -= 62;
1747 }
1748 expDiff += 64;
1749 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1750 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1751 q = q64>>( 64 - expDiff );
1752 bSig <<= 6;
1753 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1754 }
1755 do {
1756 alternateASig = aSig;
1757 ++q;
1758 aSig -= bSig;
1759 } while ( 0 <= (sbits32) aSig );
1760 sigMean = aSig + alternateASig;
1761 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1762 aSig = alternateASig;
1763 }
1764 zSign = ( (sbits32) aSig < 0 );
1765 if ( zSign ) aSig = - aSig;
1766 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1767
1768 }
1769
1770 /*
1771 -------------------------------------------------------------------------------
1772 Returns the square root of the single-precision floating-point value `a'.
1773 The operation is performed according to the IEC/IEEE Standard for Binary
1774 Floating-point Arithmetic.
1775 -------------------------------------------------------------------------------
1776 */
1777 float32 float32_sqrt( float32 a )
1778 {
1779 flag aSign;
1780 int16 aExp, zExp;
1781 bits32 aSig, zSig;
1782 bits64 rem, term;
1783
1784 aSig = extractFloat32Frac( a );
1785 aExp = extractFloat32Exp( a );
1786 aSign = extractFloat32Sign( a );
1787 if ( aExp == 0xFF ) {
1788 if ( aSig ) return propagateFloat32NaN( a, 0 );
1789 if ( ! aSign ) return a;
1790 float_raise( float_flag_invalid );
1791 return float32_default_nan;
1792 }
1793 if ( aSign ) {
1794 if ( ( aExp | aSig ) == 0 ) return a;
1795 float_raise( float_flag_invalid );
1796 return float32_default_nan;
1797 }
1798 if ( aExp == 0 ) {
1799 if ( aSig == 0 ) return 0;
1800 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1801 }
1802 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1803 aSig = ( aSig | 0x00800000 )<<8;
1804 zSig = estimateSqrt32( aExp, aSig ) + 2;
1805 if ( ( zSig & 0x7F ) <= 5 ) {
1806 if ( zSig < 2 ) {
1807 zSig = 0xFFFFFFFF;
1808 }
1809 else {
1810 aSig >>= aExp & 1;
1811 term = ( (bits64) zSig ) * zSig;
1812 rem = ( ( (bits64) aSig )<<32 ) - term;
1813 while ( (sbits64) rem < 0 ) {
1814 --zSig;
1815 rem += ( ( (bits64) zSig )<<1 ) | 1;
1816 }
1817 zSig |= ( rem != 0 );
1818 }
1819 }
1820 shift32RightJamming( zSig, 1, &zSig );
1821 return roundAndPackFloat32( 0, zExp, zSig );
1822
1823 }
1824
1825 /*
1826 -------------------------------------------------------------------------------
1827 Returns 1 if the single-precision floating-point value `a' is equal to the
1828 corresponding value `b', and 0 otherwise. The comparison is performed
1829 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1830 -------------------------------------------------------------------------------
1831 */
1832 flag float32_eq( float32 a, float32 b )
1833 {
1834
1835 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1836 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1837 ) {
1838 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1839 float_raise( float_flag_invalid );
1840 }
1841 return 0;
1842 }
1843 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1844
1845 }
1846
1847 /*
1848 -------------------------------------------------------------------------------
1849 Returns 1 if the single-precision floating-point value `a' is less than or
1850 equal to the corresponding value `b', and 0 otherwise. The comparison is
1851 performed according to the IEC/IEEE Standard for Binary Floating-point
1852 Arithmetic.
1853 -------------------------------------------------------------------------------
1854 */
1855 flag float32_le( float32 a, float32 b )
1856 {
1857 flag aSign, bSign;
1858
1859 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1860 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1861 ) {
1862 float_raise( float_flag_invalid );
1863 return 0;
1864 }
1865 aSign = extractFloat32Sign( a );
1866 bSign = extractFloat32Sign( b );
1867 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1868 return ( a == b ) || ( aSign ^ ( a < b ) );
1869
1870 }
1871
1872 /*
1873 -------------------------------------------------------------------------------
1874 Returns 1 if the single-precision floating-point value `a' is less than
1875 the corresponding value `b', and 0 otherwise. The comparison is performed
1876 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1877 -------------------------------------------------------------------------------
1878 */
1879 flag float32_lt( float32 a, float32 b )
1880 {
1881 flag aSign, bSign;
1882
1883 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1884 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1885 ) {
1886 float_raise( float_flag_invalid );
1887 return 0;
1888 }
1889 aSign = extractFloat32Sign( a );
1890 bSign = extractFloat32Sign( b );
1891 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1892 return ( a != b ) && ( aSign ^ ( a < b ) );
1893
1894 }
1895
1896 /*
1897 -------------------------------------------------------------------------------
1898 Returns 1 if the single-precision floating-point value `a' is equal to the
1899 corresponding value `b', and 0 otherwise. The invalid exception is raised
1900 if either operand is a NaN. Otherwise, the comparison is performed
1901 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1902 -------------------------------------------------------------------------------
1903 */
1904 flag float32_eq_signaling( float32 a, float32 b )
1905 {
1906
1907 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1908 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1909 ) {
1910 float_raise( float_flag_invalid );
1911 return 0;
1912 }
1913 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1914
1915 }
1916
1917 /*
1918 -------------------------------------------------------------------------------
1919 Returns 1 if the single-precision floating-point value `a' is less than or
1920 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1921 cause an exception. Otherwise, the comparison is performed according to the
1922 IEC/IEEE Standard for Binary Floating-point Arithmetic.
1923 -------------------------------------------------------------------------------
1924 */
1925 flag float32_le_quiet( float32 a, float32 b )
1926 {
1927 flag aSign, bSign;
1928 //int16 aExp, bExp;
1929
1930 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1931 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1932 ) {
1933 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1934 float_raise( float_flag_invalid );
1935 }
1936 return 0;
1937 }
1938 aSign = extractFloat32Sign( a );
1939 bSign = extractFloat32Sign( b );
1940 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1941 return ( a == b ) || ( aSign ^ ( a < b ) );
1942
1943 }
1944
1945 /*
1946 -------------------------------------------------------------------------------
1947 Returns 1 if the single-precision floating-point value `a' is less than
1948 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1949 exception. Otherwise, the comparison is performed according to the IEC/IEEE
1950 Standard for Binary Floating-point Arithmetic.
1951 -------------------------------------------------------------------------------
1952 */
1953 flag float32_lt_quiet( float32 a, float32 b )
1954 {
1955 flag aSign, bSign;
1956
1957 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1958 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1959 ) {
1960 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1961 float_raise( float_flag_invalid );
1962 }
1963 return 0;
1964 }
1965 aSign = extractFloat32Sign( a );
1966 bSign = extractFloat32Sign( b );
1967 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1968 return ( a != b ) && ( aSign ^ ( a < b ) );
1969
1970 }
1971
1972 /*
1973 -------------------------------------------------------------------------------
1974 Returns the result of converting the double-precision floating-point value
1975 `a' to the 32-bit two's complement integer format. The conversion is
1976 performed according to the IEC/IEEE Standard for Binary Floating-point
1977 Arithmetic---which means in particular that the conversion is rounded
1978 according to the current rounding mode. If `a' is a NaN, the largest
1979 positive integer is returned. Otherwise, if the conversion overflows, the
1980 largest integer with the same sign as `a' is returned.
1981 -------------------------------------------------------------------------------
1982 */
1983 int32 float64_to_int32( float64 a )
1984 {
1985 flag aSign;
1986 int16 aExp, shiftCount;
1987 bits64 aSig;
1988
1989 aSig = extractFloat64Frac( a );
1990 aExp = extractFloat64Exp( a );
1991 aSign = extractFloat64Sign( a );
1992 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1993 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1994 shiftCount = 0x42C - aExp;
1995 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1996 return roundAndPackInt32( aSign, aSig );
1997
1998 }
1999
2000 /*
2001 -------------------------------------------------------------------------------
2002 Returns the result of converting the double-precision floating-point value
2003 `a' to the 32-bit two's complement integer format. The conversion is
2004 performed according to the IEC/IEEE Standard for Binary Floating-point
2005 Arithmetic, except that the conversion is always rounded toward zero. If
2006 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
2007 conversion overflows, the largest integer with the same sign as `a' is
2008 returned.
2009 -------------------------------------------------------------------------------
2010 */
2011 int32 float64_to_int32_round_to_zero( float64 a )
2012 {
2013 flag aSign;
2014 int16 aExp, shiftCount;
2015 bits64 aSig, savedASig;
2016 int32 z;
2017
2018 aSig = extractFloat64Frac( a );
2019 aExp = extractFloat64Exp( a );
2020 aSign = extractFloat64Sign( a );
2021 shiftCount = 0x433 - aExp;
2022 if ( shiftCount < 21 ) {
2023 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2024 goto invalid;
2025 }
2026 else if ( 52 < shiftCount ) {
2027 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2028 return 0;
2029 }
2030 aSig |= LIT64( 0x0010000000000000 );
2031 savedASig = aSig;
2032 aSig >>= shiftCount;
2033 z = aSig;
2034 if ( aSign ) z = - z;
2035 if ( ( z < 0 ) ^ aSign ) {
2036 invalid:
2037 float_exception_flags |= float_flag_invalid;
2038 return aSign ? 0x80000000 : 0x7FFFFFFF;
2039 }
2040 if ( ( aSig<<shiftCount ) != savedASig ) {
2041 float_exception_flags |= float_flag_inexact;
2042 }
2043 return z;
2044
2045 }
2046
2047 /*
2048 -------------------------------------------------------------------------------
2049 Returns the result of converting the double-precision floating-point value
2050 `a' to the 32-bit two's complement unsigned integer format. The conversion
2051 is performed according to the IEC/IEEE Standard for Binary Floating-point
2052 Arithmetic---which means in particular that the conversion is rounded
2053 according to the current rounding mode. If `a' is a NaN, the largest
2054 positive integer is returned. Otherwise, if the conversion overflows, the
2055 largest positive integer is returned.
2056 -------------------------------------------------------------------------------
2057 */
2058 int32 float64_to_uint32( float64 a )
2059 {
2060 flag aSign;
2061 int16 aExp, shiftCount;
2062 bits64 aSig;
2063
2064 aSig = extractFloat64Frac( a );
2065 aExp = extractFloat64Exp( a );
2066 aSign = 0; //extractFloat64Sign( a );
2067 //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2068 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2069 shiftCount = 0x42C - aExp;
2070 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2071 return roundAndPackInt32( aSign, aSig );
2072 }
2073
2074 /*
2075 -------------------------------------------------------------------------------
2076 Returns the result of converting the double-precision floating-point value
2077 `a' to the 32-bit two's complement integer format. The conversion is
2078 performed according to the IEC/IEEE Standard for Binary Floating-point
2079 Arithmetic, except that the conversion is always rounded toward zero. If
2080 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
2081 conversion overflows, the largest positive integer is returned.
2082 -------------------------------------------------------------------------------
2083 */
2084 int32 float64_to_uint32_round_to_zero( float64 a )
2085 {
2086 flag aSign;
2087 int16 aExp, shiftCount;
2088 bits64 aSig, savedASig;
2089 int32 z;
2090
2091 aSig = extractFloat64Frac( a );
2092 aExp = extractFloat64Exp( a );
2093 aSign = extractFloat64Sign( a );
2094 shiftCount = 0x433 - aExp;
2095 if ( shiftCount < 21 ) {
2096 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2097 goto invalid;
2098 }
2099 else if ( 52 < shiftCount ) {
2100 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2101 return 0;
2102 }
2103 aSig |= LIT64( 0x0010000000000000 );
2104 savedASig = aSig;
2105 aSig >>= shiftCount;
2106 z = aSig;
2107 if ( aSign ) z = - z;
2108 if ( ( z < 0 ) ^ aSign ) {
2109 invalid:
2110 float_exception_flags |= float_flag_invalid;
2111 return aSign ? 0x80000000 : 0x7FFFFFFF;
2112 }
2113 if ( ( aSig<<shiftCount ) != savedASig ) {
2114 float_exception_flags |= float_flag_inexact;
2115 }
2116 return z;
2117 }
2118
2119 /*
2120 -------------------------------------------------------------------------------
2121 Returns the result of converting the double-precision floating-point value
2122 `a' to the single-precision floating-point format. The conversion is
2123 performed according to the IEC/IEEE Standard for Binary Floating-point
2124 Arithmetic.
2125 -------------------------------------------------------------------------------
2126 */
2127 float32 float64_to_float32( float64 a )
2128 {
2129 flag aSign;
2130 int16 aExp;
2131 bits64 aSig;
2132 bits32 zSig;
2133
2134 aSig = extractFloat64Frac( a );
2135 aExp = extractFloat64Exp( a );
2136 aSign = extractFloat64Sign( a );
2137 if ( aExp == 0x7FF ) {
2138 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2139 return packFloat32( aSign, 0xFF, 0 );
2140 }
2141 shift64RightJamming( aSig, 22, &aSig );
2142 zSig = aSig;
2143 if ( aExp || zSig ) {
2144 zSig |= 0x40000000;
2145 aExp -= 0x381;
2146 }
2147 return roundAndPackFloat32( aSign, aExp, zSig );
2148
2149 }
2150
2151 #ifdef FLOATX80
2152
2153 /*
2154 -------------------------------------------------------------------------------
2155 Returns the result of converting the double-precision floating-point value
2156 `a' to the extended double-precision floating-point format. The conversion
2157 is performed according to the IEC/IEEE Standard for Binary Floating-point
2158 Arithmetic.
2159 -------------------------------------------------------------------------------
2160 */
2161 floatx80 float64_to_floatx80( float64 a )
2162 {
2163 flag aSign;
2164 int16 aExp;
2165 bits64 aSig;
2166
2167 aSig = extractFloat64Frac( a );
2168 aExp = extractFloat64Exp( a );
2169 aSign = extractFloat64Sign( a );
2170 if ( aExp == 0x7FF ) {
2171 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2172 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2173 }
2174 if ( aExp == 0 ) {
2175 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2176 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2177 }
2178 return
2179 packFloatx80(
2180 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2181
2182 }
2183
2184 #endif
2185
2186 #ifdef FLOAT128
2187
2188 /*
2189 -------------------------------------------------------------------------------
2190 Returns the result of converting the double-precision floating-point value
2191 `a' to the quadruple-precision floating-point format. The conversion is
2192 performed according to the IEC/IEEE Standard for Binary Floating-point
2193 Arithmetic.
2194 -------------------------------------------------------------------------------
2195 */
2196 float128 float64_to_float128( float64 a )
2197 {
2198 flag aSign;
2199 int16 aExp;
2200 bits64 aSig, zSig0, zSig1;
2201
2202 aSig = extractFloat64Frac( a );
2203 aExp = extractFloat64Exp( a );
2204 aSign = extractFloat64Sign( a );
2205 if ( aExp == 0x7FF ) {
2206 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2207 return packFloat128( aSign, 0x7FFF, 0, 0 );
2208 }
2209 if ( aExp == 0 ) {
2210 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2211 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2212 --aExp;
2213 }
2214 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2215 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2216
2217 }
2218
2219 #endif
2220
2221 /*
2222 -------------------------------------------------------------------------------
2223 Rounds the double-precision floating-point value `a' to an integer, and
2224 returns the result as a double-precision floating-point value. The
2225 operation is performed according to the IEC/IEEE Standard for Binary
2226 Floating-point Arithmetic.
2227 -------------------------------------------------------------------------------
2228 */
2229 float64 float64_round_to_int( float64 a )
2230 {
2231 flag aSign;
2232 int16 aExp;
2233 bits64 lastBitMask, roundBitsMask;
2234 int8 roundingMode;
2235 float64 z;
2236
2237 aExp = extractFloat64Exp( a );
2238 if ( 0x433 <= aExp ) {
2239 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2240 return propagateFloat64NaN( a, a );
2241 }
2242 return a;
2243 }
2244 if ( aExp <= 0x3FE ) {
2245 if ( (bits64) ( a<<1 ) == 0 ) return a;
2246 float_exception_flags |= float_flag_inexact;
2247 aSign = extractFloat64Sign( a );
2248 switch ( float_rounding_mode ) {
2249 case float_round_nearest_even:
2250 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2251 return packFloat64( aSign, 0x3FF, 0 );
2252 }
2253 break;
2254 case float_round_down:
2255 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2256 case float_round_up:
2257 return
2258 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2259 }
2260 return packFloat64( aSign, 0, 0 );
2261 }
2262 lastBitMask = 1;
2263 lastBitMask <<= 0x433 - aExp;
2264 roundBitsMask = lastBitMask - 1;
2265 z = a;
2266 roundingMode = float_rounding_mode;
2267 if ( roundingMode == float_round_nearest_even ) {
2268 z += lastBitMask>>1;
2269 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2270 }
2271 else if ( roundingMode != float_round_to_zero ) {
2272 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2273 z += roundBitsMask;
2274 }
2275 }
2276 z &= ~ roundBitsMask;
2277 if ( z != a ) float_exception_flags |= float_flag_inexact;
2278 return z;
2279
2280 }
2281
2282 /*
2283 -------------------------------------------------------------------------------
2284 Returns the result of adding the absolute values of the double-precision
2285 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
2286 before being returned. `zSign' is ignored if the result is a NaN. The
2287 addition is performed according to the IEC/IEEE Standard for Binary
2288 Floating-point Arithmetic.
2289 -------------------------------------------------------------------------------
2290 */
2291 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2292 {
2293 int16 aExp, bExp, zExp;
2294 bits64 aSig, bSig, zSig;
2295 int16 expDiff;
2296
2297 aSig = extractFloat64Frac( a );
2298 aExp = extractFloat64Exp( a );
2299 bSig = extractFloat64Frac( b );
2300 bExp = extractFloat64Exp( b );
2301 expDiff = aExp - bExp;
2302 aSig <<= 9;
2303 bSig <<= 9;
2304 if ( 0 < expDiff ) {
2305 if ( aExp == 0x7FF ) {
2306 if ( aSig ) return propagateFloat64NaN( a, b );
2307 return a;
2308 }
2309 if ( bExp == 0 ) {
2310 --expDiff;
2311 }
2312 else {
2313 bSig |= LIT64( 0x2000000000000000 );
2314 }
2315 shift64RightJamming( bSig, expDiff, &bSig );
2316 zExp = aExp;
2317 }
2318 else if ( expDiff < 0 ) {
2319 if ( bExp == 0x7FF ) {
2320 if ( bSig ) return propagateFloat64NaN( a, b );
2321 return packFloat64( zSign, 0x7FF, 0 );
2322 }
2323 if ( aExp == 0 ) {
2324 ++expDiff;
2325 }
2326 else {
2327 aSig |= LIT64( 0x2000000000000000 );
2328 }
2329 shift64RightJamming( aSig, - expDiff, &aSig );
2330 zExp = bExp;
2331 }
2332 else {
2333 if ( aExp == 0x7FF ) {
2334 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2335 return a;
2336 }
2337 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2338 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2339 zExp = aExp;
2340 goto roundAndPack;
2341 }
2342 aSig |= LIT64( 0x2000000000000000 );
2343 zSig = ( aSig + bSig )<<1;
2344 --zExp;
2345 if ( (sbits64) zSig < 0 ) {
2346 zSig = aSig + bSig;
2347 ++zExp;
2348 }
2349 roundAndPack:
2350 return roundAndPackFloat64( zSign, zExp, zSig );
2351
2352 }
2353
2354 /*
2355 -------------------------------------------------------------------------------
2356 Returns the result of subtracting the absolute values of the double-
2357 precision floating-point values `a' and `b'. If `zSign' is true, the
2358 difference is negated before being returned. `zSign' is ignored if the
2359 result is a NaN. The subtraction is performed according to the IEC/IEEE
2360 Standard for Binary Floating-point Arithmetic.
2361 -------------------------------------------------------------------------------
2362 */
2363 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2364 {
2365 int16 aExp, bExp, zExp;
2366 bits64 aSig, bSig, zSig;
2367 int16 expDiff;
2368
2369 aSig = extractFloat64Frac( a );
2370 aExp = extractFloat64Exp( a );
2371 bSig = extractFloat64Frac( b );
2372 bExp = extractFloat64Exp( b );
2373 expDiff = aExp - bExp;
2374 aSig <<= 10;
2375 bSig <<= 10;
2376 if ( 0 < expDiff ) goto aExpBigger;
2377 if ( expDiff < 0 ) goto bExpBigger;
2378 if ( aExp == 0x7FF ) {
2379 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2380 float_raise( float_flag_invalid );
2381 return float64_default_nan;
2382 }
2383 if ( aExp == 0 ) {
2384 aExp = 1;
2385 bExp = 1;
2386 }
2387 if ( bSig < aSig ) goto aBigger;
2388 if ( aSig < bSig ) goto bBigger;
2389 return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2390 bExpBigger:
2391 if ( bExp == 0x7FF ) {
2392 if ( bSig ) return propagateFloat64NaN( a, b );
2393 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2394 }
2395 if ( aExp == 0 ) {
2396 ++expDiff;
2397 }
2398 else {
2399 aSig |= LIT64( 0x4000000000000000 );
2400 }
2401 shift64RightJamming( aSig, - expDiff, &aSig );
2402 bSig |= LIT64( 0x4000000000000000 );
2403 bBigger:
2404 zSig = bSig - aSig;
2405 zExp = bExp;
2406 zSign ^= 1;
2407 goto normalizeRoundAndPack;
2408 aExpBigger:
2409 if ( aExp == 0x7FF ) {
2410 if ( aSig ) return propagateFloat64NaN( a, b );
2411 return a;
2412 }
2413 if ( bExp == 0 ) {
2414 --expDiff;
2415 }
2416 else {
2417 bSig |= LIT64( 0x4000000000000000 );
2418 }
2419 shift64RightJamming( bSig, expDiff, &bSig );
2420 aSig |= LIT64( 0x4000000000000000 );
2421 aBigger:
2422 zSig = aSig - bSig;
2423 zExp = aExp;
2424 normalizeRoundAndPack:
2425 --zExp;
2426 return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2427
2428 }
2429
2430 /*
2431 -------------------------------------------------------------------------------
2432 Returns the result of adding the double-precision floating-point values `a'
2433 and `b'. The operation is performed according to the IEC/IEEE Standard for
2434 Binary Floating-point Arithmetic.
2435 -------------------------------------------------------------------------------
2436 */
2437 float64 float64_add( float64 a, float64 b )
2438 {
2439 flag aSign, bSign;
2440
2441 aSign = extractFloat64Sign( a );
2442 bSign = extractFloat64Sign( b );
2443 if ( aSign == bSign ) {
2444 return addFloat64Sigs( a, b, aSign );
2445 }
2446 else {
2447 return subFloat64Sigs( a, b, aSign );
2448 }
2449
2450 }
2451
2452 /*
2453 -------------------------------------------------------------------------------
2454 Returns the result of subtracting the double-precision floating-point values
2455 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2456 for Binary Floating-point Arithmetic.
2457 -------------------------------------------------------------------------------
2458 */
2459 float64 float64_sub( float64 a, float64 b )
2460 {
2461 flag aSign, bSign;
2462
2463 aSign = extractFloat64Sign( a );
2464 bSign = extractFloat64Sign( b );
2465 if ( aSign == bSign ) {
2466 return subFloat64Sigs( a, b, aSign );
2467 }
2468 else {
2469 return addFloat64Sigs( a, b, aSign );
2470 }
2471
2472 }
2473
2474 /*
2475 -------------------------------------------------------------------------------
2476 Returns the result of multiplying the double-precision floating-point values
2477 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2478 for Binary Floating-point Arithmetic.
2479 -------------------------------------------------------------------------------
2480 */
2481 float64 float64_mul( float64 a, float64 b )
2482 {
2483 flag aSign, bSign, zSign;
2484 int16 aExp, bExp, zExp;
2485 bits64 aSig, bSig, zSig0, zSig1;
2486
2487 aSig = extractFloat64Frac( a );
2488 aExp = extractFloat64Exp( a );
2489 aSign = extractFloat64Sign( a );
2490 bSig = extractFloat64Frac( b );
2491 bExp = extractFloat64Exp( b );
2492 bSign = extractFloat64Sign( b );
2493 zSign = aSign ^ bSign;
2494 if ( aExp == 0x7FF ) {
2495 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2496 return propagateFloat64NaN( a, b );
2497 }
2498 if ( ( bExp | bSig ) == 0 ) {
2499 float_raise( float_flag_invalid );
2500 return float64_default_nan;
2501 }
2502 return packFloat64( zSign, 0x7FF, 0 );
2503 }
2504 if ( bExp == 0x7FF ) {
2505 if ( bSig ) return propagateFloat64NaN( a, b );
2506 if ( ( aExp | aSig ) == 0 ) {
2507 float_raise( float_flag_invalid );
2508 return float64_default_nan;
2509 }
2510 return packFloat64( zSign, 0x7FF, 0 );
2511 }
2512 if ( aExp == 0 ) {
2513 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2514 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2515 }
2516 if ( bExp == 0 ) {
2517 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2518 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2519 }
2520 zExp = aExp + bExp - 0x3FF;
2521 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2522 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2523 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2524 zSig0 |= ( zSig1 != 0 );
2525 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2526 zSig0 <<= 1;
2527 --zExp;
2528 }
2529 return roundAndPackFloat64( zSign, zExp, zSig0 );
2530
2531 }
2532
2533 /*
2534 -------------------------------------------------------------------------------
2535 Returns the result of dividing the double-precision floating-point value `a'
2536 by the corresponding value `b'. The operation is performed according to
2537 the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2538 -------------------------------------------------------------------------------
2539 */
2540 float64 float64_div( float64 a, float64 b )
2541 {
2542 flag aSign, bSign, zSign;
2543 int16 aExp, bExp, zExp;
2544 bits64 aSig, bSig, zSig;
2545 bits64 rem0, rem1;
2546 bits64 term0, term1;
2547
2548 aSig = extractFloat64Frac( a );
2549 aExp = extractFloat64Exp( a );
2550 aSign = extractFloat64Sign( a );
2551 bSig = extractFloat64Frac( b );
2552 bExp = extractFloat64Exp( b );
2553 bSign = extractFloat64Sign( b );
2554 zSign = aSign ^ bSign;
2555 if ( aExp == 0x7FF ) {
2556 if ( aSig ) return propagateFloat64NaN( a, b );
2557 if ( bExp == 0x7FF ) {
2558 if ( bSig ) return propagateFloat64NaN( a, b );
2559 float_raise( float_flag_invalid );
2560 return float64_default_nan;
2561 }
2562 return packFloat64( zSign, 0x7FF, 0 );
2563 }
2564 if ( bExp == 0x7FF ) {
2565 if ( bSig ) return propagateFloat64NaN( a, b );
2566 return packFloat64( zSign, 0, 0 );
2567 }
2568 if ( bExp == 0 ) {
2569 if ( bSig == 0 ) {
2570 if ( ( aExp | aSig ) == 0 ) {
2571 float_raise( float_flag_invalid );
2572 return float64_default_nan;
2573 }
2574 float_raise( float_flag_divbyzero );
2575 return packFloat64( zSign, 0x7FF, 0 );
2576 }
2577 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2578 }
2579 if ( aExp == 0 ) {
2580 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2581 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2582 }
2583 zExp = aExp - bExp + 0x3FD;
2584 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2585 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2586 if ( bSig <= ( aSig + aSig ) ) {
2587 aSig >>= 1;
2588 ++zExp;
2589 }
2590 zSig = estimateDiv128To64( aSig, 0, bSig );
2591 if ( ( zSig & 0x1FF ) <= 2 ) {
2592 mul64To128( bSig, zSig, &term0, &term1 );
2593 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2594 while ( (sbits64) rem0 < 0 ) {
2595 --zSig;
2596 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2597 }
2598 zSig |= ( rem1 != 0 );
2599 }
2600 return roundAndPackFloat64( zSign, zExp, zSig );
2601
2602 }
2603
2604 /*
2605 -------------------------------------------------------------------------------
2606 Returns the remainder of the double-precision floating-point value `a'
2607 with respect to the corresponding value `b'. The operation is performed
2608 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2609 -------------------------------------------------------------------------------
2610 */
2611 float64 float64_rem( float64 a, float64 b )
2612 {
2613 flag aSign, bSign, zSign;
2614 int16 aExp, bExp, expDiff;
2615 bits64 aSig, bSig;
2616 bits64 q, alternateASig;
2617 sbits64 sigMean;
2618
2619 aSig = extractFloat64Frac( a );
2620 aExp = extractFloat64Exp( a );
2621 aSign = extractFloat64Sign( a );
2622 bSig = extractFloat64Frac( b );
2623 bExp = extractFloat64Exp( b );
2624 bSign = extractFloat64Sign( b );
2625 if ( aExp == 0x7FF ) {
2626 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2627 return propagateFloat64NaN( a, b );
2628 }
2629 float_raise( float_flag_invalid );
2630 return float64_default_nan;
2631 }
2632 if ( bExp == 0x7FF ) {
2633 if ( bSig ) return propagateFloat64NaN( a, b );
2634 return a;
2635 }
2636 if ( bExp == 0 ) {
2637 if ( bSig == 0 ) {
2638 float_raise( float_flag_invalid );
2639 return float64_default_nan;
2640 }
2641 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2642 }
2643 if ( aExp == 0 ) {
2644 if ( aSig == 0 ) return a;
2645 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2646 }
2647 expDiff = aExp - bExp;
2648 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2649 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2650 if ( expDiff < 0 ) {
2651 if ( expDiff < -1 ) return a;
2652 aSig >>= 1;
2653 }
2654 q = ( bSig <= aSig );
2655 if ( q ) aSig -= bSig;
2656 expDiff -= 64;
2657 while ( 0 < expDiff ) {
2658 q = estimateDiv128To64( aSig, 0, bSig );
2659 q = ( 2 < q ) ? q - 2 : 0;
2660 aSig = - ( ( bSig>>2 ) * q );
2661 expDiff -= 62;
2662 }
2663 expDiff += 64;
2664 if ( 0 < expDiff ) {
2665 q = estimateDiv128To64( aSig, 0, bSig );
2666 q = ( 2 < q ) ? q - 2 : 0;
2667 q >>= 64 - expDiff;
2668 bSig >>= 2;
2669 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2670 }
2671 else {
2672 aSig >>= 2;
2673 bSig >>= 2;
2674 }
2675 do {
2676 alternateASig = aSig;
2677 ++q;
2678 aSig -= bSig;
2679 } while ( 0 <= (sbits64) aSig );
2680 sigMean = aSig + alternateASig;
2681 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2682 aSig = alternateASig;
2683 }
2684 zSign = ( (sbits64) aSig < 0 );
2685 if ( zSign ) aSig = - aSig;
2686 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
2687
2688 }
2689
2690 /*
2691 -------------------------------------------------------------------------------
2692 Returns the square root of the double-precision floating-point value `a'.
2693 The operation is performed according to the IEC/IEEE Standard for Binary
2694 Floating-point Arithmetic.
2695 -------------------------------------------------------------------------------
2696 */
2697 float64 float64_sqrt( float64 a )
2698 {
2699 flag aSign;
2700 int16 aExp, zExp;
2701 bits64 aSig, zSig;
2702 bits64 rem0, rem1, term0, term1; //, shiftedRem;
2703 //float64 z;
2704
2705 aSig = extractFloat64Frac( a );
2706 aExp = extractFloat64Exp( a );
2707 aSign = extractFloat64Sign( a );
2708 if ( aExp == 0x7FF ) {
2709 if ( aSig ) return propagateFloat64NaN( a, a );
2710 if ( ! aSign ) return a;
2711 float_raise( float_flag_invalid );
2712 return float64_default_nan;
2713 }
2714 if ( aSign ) {
2715 if ( ( aExp | aSig ) == 0 ) return a;
2716 float_raise( float_flag_invalid );
2717 return float64_default_nan;
2718 }
2719 if ( aExp == 0 ) {
2720 if ( aSig == 0 ) return 0;
2721 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2722 }
2723 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2724 aSig |= LIT64( 0x0010000000000000 );
2725 zSig = estimateSqrt32( aExp, aSig>>21 );
2726 zSig <<= 31;
2727 aSig <<= 9 - ( aExp & 1 );
2728 zSig = estimateDiv128To64( aSig, 0, zSig ) + zSig + 2;
2729 if ( ( zSig & 0x3FF ) <= 5 ) {
2730 if ( zSig < 2 ) {
2731 zSig = LIT64( 0xFFFFFFFFFFFFFFFF );
2732 }
2733 else {
2734 aSig <<= 2;
2735 mul64To128( zSig, zSig, &term0, &term1 );
2736 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2737 while ( (sbits64) rem0 < 0 ) {
2738 --zSig;
2739 shortShift128Left( 0, zSig, 1, &term0, &term1 );
2740 term1 |= 1;
2741 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
2742 }
2743 zSig |= ( ( rem0 | rem1 ) != 0 );
2744 }
2745 }
2746 shift64RightJamming( zSig, 1, &zSig );
2747 return roundAndPackFloat64( 0, zExp, zSig );
2748
2749 }
2750
2751 /*
2752 -------------------------------------------------------------------------------
2753 Returns 1 if the double-precision floating-point value `a' is equal to the
2754 corresponding value `b', and 0 otherwise. The comparison is performed
2755 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2756 -------------------------------------------------------------------------------
2757 */
2758 flag float64_eq( float64 a, float64 b )
2759 {
2760
2761 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2762 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2763 ) {
2764 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2765 float_raise( float_flag_invalid );
2766 }
2767 return 0;
2768 }
2769 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2770
2771 }
2772
2773 /*
2774 -------------------------------------------------------------------------------
2775 Returns 1 if the double-precision floating-point value `a' is less than or
2776 equal to the corresponding value `b', and 0 otherwise. The comparison is
2777 performed according to the IEC/IEEE Standard for Binary Floating-point
2778 Arithmetic.
2779 -------------------------------------------------------------------------------
2780 */
2781 flag float64_le( float64 a, float64 b )
2782 {
2783 flag aSign, bSign;
2784
2785 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2786 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2787 ) {
2788 float_raise( float_flag_invalid );
2789 return 0;
2790 }
2791 aSign = extractFloat64Sign( a );
2792 bSign = extractFloat64Sign( b );
2793 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2794 return ( a == b ) || ( aSign ^ ( a < b ) );
2795
2796 }
2797
2798 /*
2799 -------------------------------------------------------------------------------
2800 Returns 1 if the double-precision floating-point value `a' is less than
2801 the corresponding value `b', and 0 otherwise. The comparison is performed
2802 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2803 -------------------------------------------------------------------------------
2804 */
2805 flag float64_lt( float64 a, float64 b )
2806 {
2807 flag aSign, bSign;
2808
2809 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2810 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2811 ) {
2812 float_raise( float_flag_invalid );
2813 return 0;
2814 }
2815 aSign = extractFloat64Sign( a );
2816 bSign = extractFloat64Sign( b );
2817 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2818 return ( a != b ) && ( aSign ^ ( a < b ) );
2819
2820 }
2821
2822 /*
2823 -------------------------------------------------------------------------------
2824 Returns 1 if the double-precision floating-point value `a' is equal to the
2825 corresponding value `b', and 0 otherwise. The invalid exception is raised
2826 if either operand is a NaN. Otherwise, the comparison is performed
2827 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2828 -------------------------------------------------------------------------------
2829 */
2830 flag float64_eq_signaling( float64 a, float64 b )
2831 {
2832
2833 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2834 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2835 ) {
2836 float_raise( float_flag_invalid );
2837 return 0;
2838 }
2839 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2840
2841 }
2842
2843 /*
2844 -------------------------------------------------------------------------------
2845 Returns 1 if the double-precision floating-point value `a' is less than or
2846 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2847 cause an exception. Otherwise, the comparison is performed according to the
2848 IEC/IEEE Standard for Binary Floating-point Arithmetic.
2849 -------------------------------------------------------------------------------
2850 */
2851 flag float64_le_quiet( float64 a, float64 b )
2852 {
2853 flag aSign, bSign;
2854 //int16 aExp, bExp;
2855
2856 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2857 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2858 ) {
2859 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2860 float_raise( float_flag_invalid );
2861 }
2862 return 0;
2863 }
2864 aSign = extractFloat64Sign( a );
2865 bSign = extractFloat64Sign( b );
2866 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2867 return ( a == b ) || ( aSign ^ ( a < b ) );
2868
2869 }
2870
2871 /*
2872 -------------------------------------------------------------------------------
2873 Returns 1 if the double-precision floating-point value `a' is less than
2874 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2875 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2876 Standard for Binary Floating-point Arithmetic.
2877 -------------------------------------------------------------------------------
2878 */
2879 flag float64_lt_quiet( float64 a, float64 b )
2880 {
2881 flag aSign, bSign;
2882
2883 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2884 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2885 ) {
2886 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2887 float_raise( float_flag_invalid );
2888 }
2889 return 0;
2890 }
2891 aSign = extractFloat64Sign( a );
2892 bSign = extractFloat64Sign( b );
2893 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2894 return ( a != b ) && ( aSign ^ ( a < b ) );
2895
2896 }
2897
2898 #ifdef FLOATX80
2899
2900 /*
2901 -------------------------------------------------------------------------------
2902 Returns the result of converting the extended double-precision floating-
2903 point value `a' to the 32-bit two's complement integer format. The
2904 conversion is performed according to the IEC/IEEE Standard for Binary
2905 Floating-point Arithmetic---which means in particular that the conversion
2906 is rounded according to the current rounding mode. If `a' is a NaN, the
2907 largest positive integer is returned. Otherwise, if the conversion
2908 overflows, the largest integer with the same sign as `a' is returned.
2909 -------------------------------------------------------------------------------
2910 */
2911 int32 floatx80_to_int32( floatx80 a )
2912 {
2913 flag aSign;
2914 int32 aExp, shiftCount;
2915 bits64 aSig;
2916
2917 aSig = extractFloatx80Frac( a );
2918 aExp = extractFloatx80Exp( a );
2919 aSign = extractFloatx80Sign( a );
2920 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2921 shiftCount = 0x4037 - aExp;
2922 if ( shiftCount <= 0 ) shiftCount = 1;
2923 shift64RightJamming( aSig, shiftCount, &aSig );
2924 return roundAndPackInt32( aSign, aSig );
2925
2926 }
2927
2928 /*
2929 -------------------------------------------------------------------------------
2930 Returns the result of converting the extended double-precision floating-
2931 point value `a' to the 32-bit two's complement integer format. The
2932 conversion is performed according to the IEC/IEEE Standard for Binary
2933 Floating-point Arithmetic, except that the conversion is always rounded
2934 toward zero. If `a' is a NaN, the largest positive integer is returned.
2935 Otherwise, if the conversion overflows, the largest integer with the same
2936 sign as `a' is returned.
2937 -------------------------------------------------------------------------------
2938 */
2939 int32 floatx80_to_int32_round_to_zero( floatx80 a )
2940 {
2941 flag aSign;
2942 int32 aExp, shiftCount;
2943 bits64 aSig, savedASig;
2944 int32 z;
2945
2946 aSig = extractFloatx80Frac( a );
2947 aExp = extractFloatx80Exp( a );
2948 aSign = extractFloatx80Sign( a );
2949 shiftCount = 0x403E - aExp;
2950 if ( shiftCount < 32 ) {
2951 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2952 goto invalid;
2953 }
2954 else if ( 63 < shiftCount ) {
2955 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2956 return 0;
2957 }
2958 savedASig = aSig;
2959 aSig >>= shiftCount;
2960 z = aSig;
2961 if ( aSign ) z = - z;
2962 if ( ( z < 0 ) ^ aSign ) {
2963 invalid:
2964 float_exception_flags |= float_flag_invalid;
2965 return aSign ? 0x80000000 : 0x7FFFFFFF;
2966 }
2967 if ( ( aSig<<shiftCount ) != savedASig ) {
2968 float_exception_flags |= float_flag_inexact;
2969 }
2970 return z;
2971
2972 }
2973
2974 /*
2975 -------------------------------------------------------------------------------
2976 Returns the result of converting the extended double-precision floating-
2977 point value `a' to the single-precision floating-point format. The
2978 conversion is performed according to the IEC/IEEE Standard for Binary
2979 Floating-point Arithmetic.
2980 -------------------------------------------------------------------------------
2981 */
2982 float32 floatx80_to_float32( floatx80 a )
2983 {
2984 flag aSign;
2985 int32 aExp;
2986 bits64 aSig;
2987
2988 aSig = extractFloatx80Frac( a );
2989 aExp = extractFloatx80Exp( a );
2990 aSign = extractFloatx80Sign( a );
2991 if ( aExp == 0x7FFF ) {
2992 if ( (bits64) ( aSig<<1 ) ) {
2993 return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
2994 }
2995 return packFloat32( aSign, 0xFF, 0 );
2996 }
2997 shift64RightJamming( aSig, 33, &aSig );
2998 if ( aExp || aSig ) aExp -= 0x3F81;
2999 return roundAndPackFloat32( aSign, aExp, aSig );
3000
3001 }
3002
3003 /*
3004 -------------------------------------------------------------------------------
3005 Returns the result of converting the extended double-precision floating-
3006 point value `a' to the double-precision floating-point format. The
3007 conversion is performed according to the IEC/IEEE Standard for Binary
3008 Floating-point Arithmetic.
3009 -------------------------------------------------------------------------------
3010 */
3011 float64 floatx80_to_float64( floatx80 a )
3012 {
3013 flag aSign;
3014 int32 aExp;
3015 bits64 aSig, zSig;
3016
3017 aSig = extractFloatx80Frac( a );
3018 aExp = extractFloatx80Exp( a );
3019 aSign = extractFloatx80Sign( a );
3020 if ( aExp == 0x7FFF ) {
3021 if ( (bits64) ( aSig<<1 ) ) {
3022 return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3023 }
3024 return packFloat64( aSign, 0x7FF, 0 );
3025 }
3026 shift64RightJamming( aSig, 1, &zSig );
3027 if ( aExp || aSig ) aExp -= 0x3C01;
3028 return roundAndPackFloat64( aSign, aExp, zSig );
3029
3030 }
3031
3032 #ifdef FLOAT128
3033
3034 /*
3035 -------------------------------------------------------------------------------
3036 Returns the result of converting the extended double-precision floating-
3037 point value `a' to the quadruple-precision floating-point format. The
3038 conversion is performed according to the IEC/IEEE Standard for Binary
3039 Floating-point Arithmetic.
3040 -------------------------------------------------------------------------------
3041 */
3042 float128 floatx80_to_float128( floatx80 a )
3043 {
3044 flag aSign;
3045 int16 aExp;
3046 bits64 aSig, zSig0, zSig1;
3047
3048 aSig = extractFloatx80Frac( a );
3049 aExp = extractFloatx80Exp( a );
3050 aSign = extractFloatx80Sign( a );
3051 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3052 return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3053 }
3054 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3055 return packFloat128( aSign, aExp, zSig0, zSig1 );
3056
3057 }
3058
3059 #endif
3060
3061 /*
3062 -------------------------------------------------------------------------------
3063 Rounds the extended double-precision floating-point value `a' to an integer,
3064 and returns the result as an extended quadruple-precision floating-point
3065 value. The operation is performed according to the IEC/IEEE Standard for
3066 Binary Floating-point Arithmetic.
3067 -------------------------------------------------------------------------------
3068 */
3069 floatx80 floatx80_round_to_int( floatx80 a )
3070 {
3071 flag aSign;
3072 int32 aExp;
3073 bits64 lastBitMask, roundBitsMask;
3074 int8 roundingMode;
3075 floatx80 z;
3076
3077 aExp = extractFloatx80Exp( a );
3078 if ( 0x403E <= aExp ) {
3079 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3080 return propagateFloatx80NaN( a, a );
3081 }
3082 return a;
3083 }
3084 if ( aExp <= 0x3FFE ) {
3085 if ( ( aExp == 0 )
3086 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3087 return a;
3088 }
3089 float_exception_flags |= float_flag_inexact;
3090 aSign = extractFloatx80Sign( a );
3091 switch ( float_rounding_mode ) {
3092 case float_round_nearest_even:
3093 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3094 ) {
3095 return
3096 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3097 }
3098 break;
3099 case float_round_down:
3100 return
3101 aSign ?
3102 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3103 : packFloatx80( 0, 0, 0 );
3104 case float_round_up:
3105 return
3106 aSign ? packFloatx80( 1, 0, 0 )
3107 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3108 }
3109 return packFloatx80( aSign, 0, 0 );
3110 }
3111 lastBitMask = 1;
3112 lastBitMask <<= 0x403E - aExp;
3113 roundBitsMask = lastBitMask - 1;
3114 z = a;
3115 roundingMode = float_rounding_mode;
3116 if ( roundingMode == float_round_nearest_even ) {
3117 z.low += lastBitMask>>1;
3118 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3119 }
3120 else if ( roundingMode != float_round_to_zero ) {
3121 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3122 z.low += roundBitsMask;
3123 }
3124 }
3125 z.low &= ~ roundBitsMask;
3126 if ( z.low == 0 ) {
3127 ++z.high;
3128 z.low = LIT64( 0x8000000000000000 );
3129 }
3130 if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3131 return z;
3132
3133 }
3134
3135 /*
3136 -------------------------------------------------------------------------------
3137 Returns the result of adding the absolute values of the extended double-
3138 precision floating-point values `a' and `b'. If `zSign' is true, the sum is
3139 negated before being returned. `zSign' is ignored if the result is a NaN.
3140 The addition is performed according to the IEC/IEEE Standard for Binary
3141 Floating-point Arithmetic.
3142 -------------------------------------------------------------------------------
3143 */
3144 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3145 {
3146 int32 aExp, bExp, zExp;
3147 bits64 aSig, bSig, zSig0, zSig1;
3148 int32 expDiff;
3149
3150 aSig = extractFloatx80Frac( a );
3151 aExp = extractFloatx80Exp( a );
3152 bSig = extractFloatx80Frac( b );
3153 bExp = extractFloatx80Exp( b );
3154 expDiff = aExp - bExp;
3155 if ( 0 < expDiff ) {
3156 if ( aExp == 0x7FFF ) {
3157 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3158 return a;
3159 }
3160 if ( bExp == 0 ) --expDiff;
3161 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3162 zExp = aExp;
3163 }
3164 else if ( expDiff < 0 ) {
3165 if ( bExp == 0x7FFF ) {
3166 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3167 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3168 }
3169 if ( aExp == 0 ) ++expDiff;
3170 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3171 zExp = bExp;
3172 }
3173 else {
3174 if ( aExp == 0x7FFF ) {
3175 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3176 return propagateFloatx80NaN( a, b );
3177 }
3178 return a;
3179 }
3180 zSig1 = 0;
3181 zSig0 = aSig + bSig;
3182 if ( aExp == 0 ) {
3183 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3184 goto roundAndPack;
3185 }
3186 zExp = aExp;
3187 goto shiftRight1;
3188 }
3189
3190 zSig0 = aSig + bSig;
3191
3192 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3193 shiftRight1:
3194 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3195 zSig0 |= LIT64( 0x8000000000000000 );
3196 ++zExp;
3197 roundAndPack:
3198 return
3199 roundAndPackFloatx80(
3200 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3201
3202 }
3203
3204 /*
3205 -------------------------------------------------------------------------------
3206 Returns the result of subtracting the absolute values of the extended
3207 double-precision floating-point values `a' and `b'. If `zSign' is true,
3208 the difference is negated before being returned. `zSign' is ignored if the
3209 result is a NaN. The subtraction is performed according to the IEC/IEEE
3210 Standard for Binary Floating-point Arithmetic.
3211 -------------------------------------------------------------------------------
3212 */
3213 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3214 {
3215 int32 aExp, bExp, zExp;
3216 bits64 aSig, bSig, zSig0, zSig1;
3217 int32 expDiff;
3218 floatx80 z;
3219
3220 aSig = extractFloatx80Frac( a );
3221 aExp = extractFloatx80Exp( a );
3222 bSig = extractFloatx80Frac( b );
3223 bExp = extractFloatx80Exp( b );
3224 expDiff = aExp - bExp;
3225 if ( 0 < expDiff ) goto aExpBigger;
3226 if ( expDiff < 0 ) goto bExpBigger;
3227 if ( aExp == 0x7FFF ) {
3228 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3229 return propagateFloatx80NaN( a, b );
3230 }
3231 float_raise( float_flag_invalid );
3232 z.low = floatx80_default_nan_low;
3233 z.high = floatx80_default_nan_high;
3234 return z;
3235 }
3236 if ( aExp == 0 ) {
3237 aExp = 1;
3238 bExp = 1;
3239 }
3240 zSig1 = 0;
3241 if ( bSig < aSig ) goto aBigger;
3242 if ( aSig < bSig ) goto bBigger;
3243 return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3244 bExpBigger:
3245 if ( bExp == 0x7FFF ) {
3246 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3247 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3248 }
3249 if ( aExp == 0 ) ++expDiff;
3250 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3251 bBigger:
3252 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3253 zExp = bExp;
3254 zSign ^= 1;
3255 goto normalizeRoundAndPack;
3256 aExpBigger:
3257 if ( aExp == 0x7FFF ) {
3258 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3259 return a;
3260 }
3261 if ( bExp == 0 ) --expDiff;
3262 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3263 aBigger:
3264 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3265 zExp = aExp;
3266 normalizeRoundAndPack:
3267 return
3268 normalizeRoundAndPackFloatx80(
3269 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3270
3271 }
3272
3273 /*
3274 -------------------------------------------------------------------------------
3275 Returns the result of adding the extended double-precision floating-point
3276 values `a' and `b'. The operation is performed according to the IEC/IEEE
3277 Standard for Binary Floating-point Arithmetic.
3278 -------------------------------------------------------------------------------
3279 */
3280 floatx80 floatx80_add( floatx80 a, floatx80 b )
3281 {
3282 flag aSign, bSign;
3283
3284 aSign = extractFloatx80Sign( a );
3285 bSign = extractFloatx80Sign( b );
3286 if ( aSign == bSign ) {
3287 return addFloatx80Sigs( a, b, aSign );
3288 }
3289 else {
3290 return subFloatx80Sigs( a, b, aSign );
3291 }
3292
3293 }
3294
3295 /*
3296 -------------------------------------------------------------------------------
3297 Returns the result of subtracting the extended double-precision floating-
3298 point values `a' and `b'. The operation is performed according to the
3299 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3300 -------------------------------------------------------------------------------
3301 */
3302 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3303 {
3304 flag aSign, bSign;
3305
3306 aSign = extractFloatx80Sign( a );
3307 bSign = extractFloatx80Sign( b );
3308 if ( aSign == bSign ) {
3309 return subFloatx80Sigs( a, b, aSign );
3310 }
3311 else {
3312 return addFloatx80Sigs( a, b, aSign );
3313 }
3314
3315 }
3316
3317 /*
3318 -------------------------------------------------------------------------------
3319 Returns the result of multiplying the extended double-precision floating-
3320 point values `a' and `b'. The operation is performed according to the
3321 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3322 -------------------------------------------------------------------------------
3323 */
3324 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3325 {
3326 flag aSign, bSign, zSign;
3327 int32 aExp, bExp, zExp;
3328 bits64 aSig, bSig, zSig0, zSig1;
3329 floatx80 z;
3330
3331 aSig = extractFloatx80Frac( a );
3332 aExp = extractFloatx80Exp( a );
3333 aSign = extractFloatx80Sign( a );
3334 bSig = extractFloatx80Frac( b );
3335 bExp = extractFloatx80Exp( b );
3336 bSign = extractFloatx80Sign( b );
3337 zSign = aSign ^ bSign;
3338 if ( aExp == 0x7FFF ) {
3339 if ( (bits64) ( aSig<<1 )
3340 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3341 return propagateFloatx80NaN( a, b );
3342 }
3343 if ( ( bExp | bSig ) == 0 ) goto invalid;
3344 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3345 }
3346 if ( bExp == 0x7FFF ) {
3347 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3348 if ( ( aExp | aSig ) == 0 ) {
3349 invalid:
3350 float_raise( float_flag_invalid );
3351 z.low = floatx80_default_nan_low;
3352 z.high = floatx80_default_nan_high;
3353 return z;
3354 }
3355 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3356 }
3357 if ( aExp == 0 ) {
3358 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3359 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3360 }
3361 if ( bExp == 0 ) {
3362 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3363 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3364 }
3365 zExp = aExp + bExp - 0x3FFE;
3366 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3367 if ( 0 < (sbits64) zSig0 ) {
3368 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3369 --zExp;
3370 }
3371 return
3372 roundAndPackFloatx80(
3373 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3374
3375 }
3376
3377 /*
3378 -------------------------------------------------------------------------------
3379 Returns the result of dividing the extended double-precision floating-point
3380 value `a' by the corresponding value `b'. The operation is performed
3381 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3382 -------------------------------------------------------------------------------
3383 */
3384 floatx80 floatx80_div( floatx80 a, floatx80 b )
3385 {
3386 flag aSign, bSign, zSign;
3387 int32 aExp, bExp, zExp;
3388 bits64 aSig, bSig, zSig0, zSig1;
3389 bits64 rem0, rem1, rem2, term0, term1, term2;
3390 floatx80 z;
3391
3392 aSig = extractFloatx80Frac( a );
3393 aExp = extractFloatx80Exp( a );
3394 aSign = extractFloatx80Sign( a );
3395 bSig = extractFloatx80Frac( b );
3396 bExp = extractFloatx80Exp( b );
3397 bSign = extractFloatx80Sign( b );
3398 zSign = aSign ^ bSign;
3399 if ( aExp == 0x7FFF ) {
3400 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3401 if ( bExp == 0x7FFF ) {
3402 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3403 goto invalid;
3404 }
3405 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3406 }
3407 if ( bExp == 0x7FFF ) {
3408 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3409 return packFloatx80( zSign, 0, 0 );
3410 }
3411 if ( bExp == 0 ) {
3412 if ( bSig == 0 ) {
3413 if ( ( aExp | aSig ) == 0 ) {
3414 invalid:
3415 float_raise( float_flag_invalid );
3416 z.low = floatx80_default_nan_low;
3417 z.high = floatx80_default_nan_high;
3418 return z;
3419 }
3420 float_raise( float_flag_divbyzero );
3421 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3422 }
3423 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3424 }
3425 if ( aExp == 0 ) {
3426 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3427 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3428 }
3429 zExp = aExp - bExp + 0x3FFE;
3430 rem1 = 0;
3431 if ( bSig <= aSig ) {
3432 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3433 ++zExp;
3434 }
3435 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3436 mul64To128( bSig, zSig0, &term0, &term1 );
3437 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3438 while ( (sbits64) rem0 < 0 ) {
3439 --zSig0;
3440 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3441 }
3442 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3443 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3444 mul64To128( bSig, zSig1, &term1, &term2 );
3445 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3446 while ( (sbits64) rem1 < 0 ) {
3447 --zSig1;
3448 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3449 }
3450 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3451 }
3452 return
3453 roundAndPackFloatx80(
3454 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3455
3456 }
3457
3458 /*
3459 -------------------------------------------------------------------------------
3460 Returns the remainder of the extended double-precision floating-point value
3461 `a' with respect to the corresponding value `b'. The operation is performed
3462 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3463 -------------------------------------------------------------------------------
3464 */
3465 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3466 {
3467 flag aSign, bSign, zSign;
3468 int32 aExp, bExp, expDiff;
3469 bits64 aSig0, aSig1, bSig;
3470 bits64 q, term0, term1, alternateASig0, alternateASig1;
3471 floatx80 z;
3472
3473 aSig0 = extractFloatx80Frac( a );
3474 aExp = extractFloatx80Exp( a );
3475 aSign = extractFloatx80Sign( a );
3476 bSig = extractFloatx80Frac( b );
3477 bExp = extractFloatx80Exp( b );
3478 bSign = extractFloatx80Sign( b );
3479 if ( aExp == 0x7FFF ) {
3480 if ( (bits64) ( aSig0<<1 )
3481 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3482 return propagateFloatx80NaN( a, b );
3483 }
3484 goto invalid;
3485 }
3486 if ( bExp == 0x7FFF ) {
3487 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3488 return a;
3489 }
3490 if ( bExp == 0 ) {
3491 if ( bSig == 0 ) {
3492 invalid:
3493 float_raise( float_flag_invalid );
3494 z.low = floatx80_default_nan_low;
3495 z.high = floatx80_default_nan_high;
3496 return z;
3497 }
3498 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3499 }
3500 if ( aExp == 0 ) {
3501 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3502 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3503 }
3504 bSig |= LIT64( 0x8000000000000000 );
3505 zSign = aSign;
3506 expDiff = aExp - bExp;
3507 aSig1 = 0;
3508 if ( expDiff < 0 ) {
3509 if ( expDiff < -1 ) return a;
3510 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3511 expDiff = 0;
3512 }
3513 q = ( bSig <= aSig0 );
3514 if ( q ) aSig0 -= bSig;
3515 expDiff -= 64;
3516 while ( 0 < expDiff ) {
3517 q = estimateDiv128To64( aSig0, aSig1, bSig );
3518 q = ( 2 < q ) ? q - 2 : 0;
3519 mul64To128( bSig, q, &term0, &term1 );
3520 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3521 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3522 expDiff -= 62;
3523 }
3524 expDiff += 64;
3525 if ( 0 < expDiff ) {
3526 q = estimateDiv128To64( aSig0, aSig1, bSig );
3527 q = ( 2 < q ) ? q - 2 : 0;
3528 q >>= 64 - expDiff;
3529 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3530 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3531 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3532 while ( le128( term0, term1, aSig0, aSig1 ) ) {
3533 ++q;
3534 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3535 }
3536 }
3537 else {
3538 term1 = 0;
3539 term0 = bSig;
3540 }
3541 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3542 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3543 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3544 && ( q & 1 ) )
3545 ) {
3546 aSig0 = alternateASig0;
3547 aSig1 = alternateASig1;
3548 zSign = ! zSign;
3549 }
3550 return
3551 normalizeRoundAndPackFloatx80(
3552 80, zSign, bExp + expDiff, aSig0, aSig1 );
3553
3554 }
3555
3556 /*
3557 -------------------------------------------------------------------------------
3558 Returns the square root of the extended double-precision floating-point
3559 value `a'. The operation is performed according to the IEC/IEEE Standard
3560 for Binary Floating-point Arithmetic.
3561 -------------------------------------------------------------------------------
3562 */
3563 floatx80 floatx80_sqrt( floatx80 a )
3564 {
3565 flag aSign;
3566 int32 aExp, zExp;
3567 bits64 aSig0, aSig1, zSig0, zSig1;
3568 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3569 bits64 shiftedRem0, shiftedRem1;
3570 floatx80 z;
3571
3572 aSig0 = extractFloatx80Frac( a );
3573 aExp = extractFloatx80Exp( a );
3574 aSign = extractFloatx80Sign( a );
3575 if ( aExp == 0x7FFF ) {
3576 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3577 if ( ! aSign ) return a;
3578 goto invalid;
3579 }
3580 if ( aSign ) {
3581 if ( ( aExp | aSig0 ) == 0 ) return a;
3582 invalid:
3583 float_raise( float_flag_invalid );
3584 z.low = floatx80_default_nan_low;
3585 z.high = floatx80_default_nan_high;
3586 return z;
3587 }
3588 if ( aExp == 0 ) {
3589 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3590 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3591 }
3592 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3593 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3594 zSig0 <<= 31;
3595 aSig1 = 0;
3596 shift128Right( aSig0, 0, ( aExp & 1 ) + 2, &aSig0, &aSig1 );
3597 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
3598 if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
3599 shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
3600 mul64To128( zSig0, zSig0, &term0, &term1 );
3601 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3602 while ( (sbits64) rem0 < 0 ) {
3603 --zSig0;
3604 shortShift128Left( 0, zSig0, 1, &term0, &term1 );
3605 term1 |= 1;
3606 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
3607 }
3608 shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
3609 zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
3610 if ( (bits64) ( zSig1<<1 ) <= 10 ) {
3611 if ( zSig1 == 0 ) zSig1 = 1;
3612 mul64To128( zSig0, zSig1, &term1, &term2 );
3613 shortShift128Left( term1, term2, 1, &term1, &term2 );
3614 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3615 mul64To128( zSig1, zSig1, &term2, &term3 );
3616 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3617 while ( (sbits64) rem1 < 0 ) {
3618 --zSig1;
3619 shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
3620 term3 |= 1;
3621 add192(
3622 rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
3623 }
3624 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3625 }
3626 return
3627 roundAndPackFloatx80(
3628 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
3629
3630 }
3631
3632 /*
3633 -------------------------------------------------------------------------------
3634 Returns 1 if the extended double-precision floating-point value `a' is
3635 equal to the corresponding value `b', and 0 otherwise. The comparison is
3636 performed according to the IEC/IEEE Standard for Binary Floating-point
3637 Arithmetic.
3638 -------------------------------------------------------------------------------
3639 */
3640 flag floatx80_eq( floatx80 a, floatx80 b )
3641 {
3642
3643 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3644 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3645 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3646 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3647 ) {
3648 if ( floatx80_is_signaling_nan( a )
3649 || floatx80_is_signaling_nan( b ) ) {
3650 float_raise( float_flag_invalid );
3651 }
3652 return 0;
3653 }
3654 return
3655 ( a.low == b.low )
3656 && ( ( a.high == b.high )
3657 || ( ( a.low == 0 )
3658 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3659 );
3660
3661 }
3662
3663 /*
3664 -------------------------------------------------------------------------------
3665 Returns 1 if the extended double-precision floating-point value `a' is
3666 less than or equal to the corresponding value `b', and 0 otherwise. The
3667 comparison is performed according to the IEC/IEEE Standard for Binary
3668 Floating-point Arithmetic.
3669 -------------------------------------------------------------------------------
3670 */
3671 flag floatx80_le( floatx80 a, floatx80 b )
3672 {
3673 flag aSign, bSign;
3674
3675 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3676 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3677 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3678 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3679 ) {
3680 float_raise( float_flag_invalid );
3681 return 0;
3682 }
3683 aSign = extractFloatx80Sign( a );
3684 bSign = extractFloatx80Sign( b );
3685 if ( aSign != bSign ) {
3686 return
3687 aSign
3688 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3689 == 0 );
3690 }
3691 return
3692 aSign ? le128( b.high, b.low, a.high, a.low )
3693 : le128( a.high, a.low, b.high, b.low );
3694
3695 }
3696
3697 /*
3698 -------------------------------------------------------------------------------
3699 Returns 1 if the extended double-precision floating-point value `a' is
3700 less than the corresponding value `b', and 0 otherwise. The comparison
3701 is performed according to the IEC/IEEE Standard for Binary Floating-point
3702 Arithmetic.
3703 -------------------------------------------------------------------------------
3704 */
3705 flag floatx80_lt( floatx80 a, floatx80 b )
3706 {
3707 flag aSign, bSign;
3708
3709 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3710 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3711 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3712 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3713 ) {
3714 float_raise( float_flag_invalid );
3715 return 0;
3716 }
3717 aSign = extractFloatx80Sign( a );
3718 bSign = extractFloatx80Sign( b );
3719 if ( aSign != bSign ) {
3720 return
3721 aSign
3722 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3723 != 0 );
3724 }
3725 return
3726 aSign ? lt128( b.high, b.low, a.high, a.low )
3727 : lt128( a.high, a.low, b.high, b.low );
3728
3729 }
3730
3731 /*
3732 -------------------------------------------------------------------------------
3733 Returns 1 if the extended double-precision floating-point value `a' is equal
3734 to the corresponding value `b', and 0 otherwise. The invalid exception is
3735 raised if either operand is a NaN. Otherwise, the comparison is performed
3736 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3737 -------------------------------------------------------------------------------
3738 */
3739 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3740 {
3741
3742 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3743 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3744 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3745 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3746 ) {
3747 float_raise( float_flag_invalid );
3748 return 0;
3749 }
3750 return
3751 ( a.low == b.low )
3752 && ( ( a.high == b.high )
3753 || ( ( a.low == 0 )
3754 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3755 );
3756
3757 }
3758
3759 /*
3760 -------------------------------------------------------------------------------
3761 Returns 1 if the extended double-precision floating-point value `a' is less
3762 than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
3763 do not cause an exception. Otherwise, the comparison is performed according
3764 to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3765 -------------------------------------------------------------------------------
3766 */
3767 flag floatx80_le_quiet( floatx80 a, floatx80 b )
3768 {
3769 flag aSign, bSign;
3770
3771 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3772 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3773 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3774 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3775 ) {
3776 if ( floatx80_is_signaling_nan( a )
3777 || floatx80_is_signaling_nan( b ) ) {
3778 float_raise( float_flag_invalid );
3779 }
3780 return 0;
3781 }
3782 aSign = extractFloatx80Sign( a );
3783 bSign = extractFloatx80Sign( b );
3784 if ( aSign != bSign ) {
3785 return
3786 aSign
3787 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3788 == 0 );
3789 }
3790 return
3791 aSign ? le128( b.high, b.low, a.high, a.low )
3792 : le128( a.high, a.low, b.high, b.low );
3793
3794 }
3795
3796 /*
3797 -------------------------------------------------------------------------------
3798 Returns 1 if the extended double-precision floating-point value `a' is less
3799 than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
3800 an exception. Otherwise, the comparison is performed according to the
3801 IEC/IEEE Standard for Binary Floating-point Arithmetic.
3802 -------------------------------------------------------------------------------
3803 */
3804 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
3805 {
3806 flag aSign, bSign;
3807
3808 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3809 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3810 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3811 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3812 ) {
3813 if ( floatx80_is_signaling_nan( a )
3814 || floatx80_is_signaling_nan( b ) ) {
3815 float_raise( float_flag_invalid );
3816 }
3817 return 0;
3818 }
3819 aSign = extractFloatx80Sign( a );
3820 bSign = extractFloatx80Sign( b );
3821 if ( aSign != bSign ) {
3822 return
3823 aSign
3824 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3825 != 0 );
3826 }
3827 return
3828 aSign ? lt128( b.high, b.low, a.high, a.low )
3829 : lt128( a.high, a.low, b.high, b.low );
3830
3831 }
3832
3833 #endif
3834
3835 #ifdef FLOAT128
3836
3837 /*
3838 -------------------------------------------------------------------------------
3839 Returns the result of converting the quadruple-precision floating-point
3840 value `a' to the 32-bit two's complement integer format. The conversion
3841 is performed according to the IEC/IEEE Standard for Binary Floating-point
3842 Arithmetic---which means in particular that the conversion is rounded
3843 according to the current rounding mode. If `a' is a NaN, the largest
3844 positive integer is returned. Otherwise, if the conversion overflows, the
3845 largest integer with the same sign as `a' is returned.
3846 -------------------------------------------------------------------------------
3847 */
3848 int32 float128_to_int32( float128 a )
3849 {
3850 flag aSign;
3851 int32 aExp, shiftCount;
3852 bits64 aSig0, aSig1;
3853
3854 aSig1 = extractFloat128Frac1( a );
3855 aSig0 = extractFloat128Frac0( a );
3856 aExp = extractFloat128Exp( a );
3857 aSign = extractFloat128Sign( a );
3858 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
3859 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
3860 aSig0 |= ( aSig1 != 0 );
3861 shiftCount = 0x4028 - aExp;
3862 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
3863 return roundAndPackInt32( aSign, aSig0 );
3864
3865 }
3866
3867 /*
3868 -------------------------------------------------------------------------------
3869 Returns the result of converting the quadruple-precision floating-point
3870 value `a' to the 32-bit two's complement integer format. The conversion
3871 is performed according to the IEC/IEEE Standard for Binary Floating-point
3872 Arithmetic, except that the conversion is always rounded toward zero. If
3873 `a' is a NaN, the largest positive integer is returned. Otherwise, if the
3874 conversion overflows, the largest integer with the same sign as `a' is
3875 returned.
3876 -------------------------------------------------------------------------------
3877 */
3878 int32 float128_to_int32_round_to_zero( float128 a )
3879 {
3880 flag aSign;
3881 int32 aExp, shiftCount;
3882 bits64 aSig0, aSig1, savedASig;
3883 int32 z;
3884
3885 aSig1 = extractFloat128Frac1( a );
3886 aSig0 = extractFloat128Frac0( a );
3887 aExp = extractFloat128Exp( a );
3888 aSign = extractFloat128Sign( a );
3889 aSig0 |= ( aSig1 != 0 );
3890 shiftCount = 0x402F - aExp;
3891 if ( shiftCount < 17 ) {
3892 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
3893 goto invalid;
3894 }
3895 else if ( 48 < shiftCount ) {
3896 if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
3897 return 0;
3898 }
3899 aSig0 |= LIT64( 0x0001000000000000 );
3900 savedASig = aSig0;
3901 aSig0 >>= shiftCount;
3902 z = aSig0;
3903 if ( aSign ) z = - z;
3904 if ( ( z < 0 ) ^ aSign ) {
3905 invalid:
3906 float_exception_flags |= float_flag_invalid;
3907 return aSign ? 0x80000000 : 0x7FFFFFFF;
3908 }
3909 if ( ( aSig0<<shiftCount ) != savedASig ) {
3910 float_exception_flags |= float_flag_inexact;
3911 }
3912 return z;
3913
3914 }
3915
3916 /*
3917 -------------------------------------------------------------------------------
3918 Returns the result of converting the quadruple-precision floating-point
3919 value `a' to the single-precision floating-point format. The conversion
3920 is performed according to the IEC/IEEE Standard for Binary Floating-point
3921 Arithmetic.
3922 -------------------------------------------------------------------------------
3923 */
3924 float32 float128_to_float32( float128 a )
3925 {
3926 flag aSign;
3927 int32 aExp;
3928 bits64 aSig0, aSig1;
3929 bits32 zSig;
3930
3931 aSig1 = extractFloat128Frac1( a );
3932 aSig0 = extractFloat128Frac0( a );
3933 aExp = extractFloat128Exp( a );
3934 aSign = extractFloat128Sign( a );
3935 if ( aExp == 0x7FFF ) {
3936 if ( aSig0 | aSig1 ) {
3937 return commonNaNToFloat32( float128ToCommonNaN( a ) );
3938 }
3939 return packFloat32( aSign, 0xFF, 0 );
3940 }
3941 aSig0 |= ( aSig1 != 0 );
3942 shift64RightJamming( aSig0, 18, &aSig0 );
3943 zSig = aSig0;
3944 if ( aExp || zSig ) {
3945 zSig |= 0x40000000;
3946 aExp -= 0x3F81;
3947 }
3948 return roundAndPackFloat32( aSign, aExp, zSig );
3949
3950 }
3951
3952 /*
3953 -------------------------------------------------------------------------------
3954 Returns the result of converting the quadruple-precision floating-point
3955 value `a' to the double-precision floating-point format. The conversion
3956 is performed according to the IEC/IEEE Standard for Binary Floating-point
3957 Arithmetic.
3958 -------------------------------------------------------------------------------
3959 */
3960 float64 float128_to_float64( float128 a )
3961 {
3962 flag aSign;
3963 int32 aExp;
3964 bits64 aSig0, aSig1;
3965
3966 aSig1 = extractFloat128Frac1( a );
3967 aSig0 = extractFloat128Frac0( a );
3968 aExp = extractFloat128Exp( a );
3969 aSign = extractFloat128Sign( a );
3970 if ( aExp == 0x7FFF ) {
3971 if ( aSig0 | aSig1 ) {
3972 return commonNaNToFloat64( float128ToCommonNaN( a ) );
3973 }
3974 return packFloat64( aSign, 0x7FF, 0 );
3975 }
3976 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
3977 aSig0 |= ( aSig1 != 0 );
3978 if ( aExp || aSig0 ) {
3979 aSig0 |= LIT64( 0x4000000000000000 );
3980 aExp -= 0x3C01;
3981 }
3982 return roundAndPackFloat64( aSign, aExp, aSig0 );
3983
3984 }
3985
3986 #ifdef FLOATX80
3987
3988 /*
3989 -------------------------------------------------------------------------------
3990 Returns the result of converting the quadruple-precision floating-point
3991 value `a' to the extended double-precision floating-point format. The
3992 conversion is performed according to the IEC/IEEE Standard for Binary
3993 Floating-point Arithmetic.
3994 -------------------------------------------------------------------------------
3995 */
3996 floatx80 float128_to_floatx80( float128 a )
3997 {
3998 flag aSign;
3999 int32 aExp;
4000 bits64 aSig0, aSig1;
4001
4002 aSig1 = extractFloat128Frac1( a );
4003 aSig0 = extractFloat128Frac0( a );
4004 aExp = extractFloat128Exp( a );
4005 aSign = extractFloat128Sign( a );
4006 if ( aExp == 0x7FFF ) {
4007 if ( aSig0 | aSig1 ) {
4008 return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4009 }
4010 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4011 }
4012 if ( aExp == 0 ) {
4013 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4014 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4015 }
4016 else {
4017 aSig0 |= LIT64( 0x0001000000000000 );
4018 }
4019 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4020 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4021
4022 }
4023
4024 #endif
4025
4026 /*
4027 -------------------------------------------------------------------------------
4028 Rounds the quadruple-precision floating-point value `a' to an integer, and
4029 returns the result as a quadruple-precision floating-point value. The
4030 operation is performed according to the IEC/IEEE Standard for Binary
4031 Floating-point Arithmetic.
4032 -------------------------------------------------------------------------------
4033 */
4034 float128 float128_round_to_int( float128 a )
4035 {
4036 flag aSign;
4037 int32 aExp;
4038 bits64 lastBitMask, roundBitsMask;
4039 int8 roundingMode;
4040 float128 z;
4041
4042 aExp = extractFloat128Exp( a );
4043 if ( 0x402F <= aExp ) {
4044 if ( 0x406F <= aExp ) {
4045 if ( ( aExp == 0x7FFF )
4046 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4047 ) {
4048 return propagateFloat128NaN( a, a );
4049 }
4050 return a;
4051 }
4052 lastBitMask = 1;
4053 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4054 roundBitsMask = lastBitMask - 1;
4055 z = a;
4056 roundingMode = float_rounding_mode;
4057 if ( roundingMode == float_round_nearest_even ) {
4058 if ( lastBitMask ) {
4059 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4060 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4061 }
4062 else {
4063 if ( (sbits64) z.low < 0 ) {
4064 ++z.high;
4065 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4066 }
4067 }
4068 }
4069 else if ( roundingMode != float_round_to_zero ) {
4070 if ( extractFloat128Sign( z )
4071 ^ ( roundingMode == float_round_up ) ) {
4072 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4073 }
4074 }
4075 z.low &= ~ roundBitsMask;
4076 }
4077 else {
4078 if ( aExp <= 0x3FFE ) {
4079 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4080 float_exception_flags |= float_flag_inexact;
4081 aSign = extractFloat128Sign( a );
4082 switch ( float_rounding_mode ) {
4083 case float_round_nearest_even:
4084 if ( ( aExp == 0x3FFE )
4085 && ( extractFloat128Frac0( a )
4086 | extractFloat128Frac1( a ) )
4087 ) {
4088 return packFloat128( aSign, 0x3FFF, 0, 0 );
4089 }
4090 break;
4091 case float_round_down:
4092 return
4093 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4094 : packFloat128( 0, 0, 0, 0 );
4095 case float_round_up:
4096 return
4097 aSign ? packFloat128( 1, 0, 0, 0 )
4098 : packFloat128( 0, 0x3FFF, 0, 0 );
4099 }
4100 return packFloat128( aSign, 0, 0, 0 );
4101 }
4102 lastBitMask = 1;
4103 lastBitMask <<= 0x402F - aExp;
4104 roundBitsMask = lastBitMask - 1;
4105 z.low = 0;
4106 z.high = a.high;
4107 roundingMode = float_rounding_mode;
4108 if ( roundingMode == float_round_nearest_even ) {
4109 z.high += lastBitMask>>1;
4110 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4111 z.high &= ~ lastBitMask;
4112 }
4113 }
4114 else if ( roundingMode != float_round_to_zero ) {
4115 if ( extractFloat128Sign( z )
4116 ^ ( roundingMode == float_round_up ) ) {
4117 z.high |= ( a.low != 0 );
4118 z.high += roundBitsMask;
4119 }
4120 }
4121 z.high &= ~ roundBitsMask;
4122 }
4123 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4124 float_exception_flags |= float_flag_inexact;
4125 }
4126 return z;
4127
4128 }
4129
4130 /*
4131 -------------------------------------------------------------------------------
4132 Returns the result of adding the absolute values of the quadruple-precision
4133 floating-point values `a' and `b'. If `zSign' is true, the sum is negated
4134 before being returned. `zSign' is ignored if the result is a NaN. The
4135 addition is performed according to the IEC/IEEE Standard for Binary
4136 Floating-point Arithmetic.
4137 -------------------------------------------------------------------------------
4138 */
4139 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4140 {
4141 int32 aExp, bExp, zExp;
4142 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4143 int32 expDiff;
4144
4145 aSig1 = extractFloat128Frac1( a );
4146 aSig0 = extractFloat128Frac0( a );
4147 aExp = extractFloat128Exp( a );
4148 bSig1 = extractFloat128Frac1( b );
4149 bSig0 = extractFloat128Frac0( b );
4150 bExp = extractFloat128Exp( b );
4151 expDiff = aExp - bExp;
4152 if ( 0 < expDiff ) {
4153 if ( aExp == 0x7FFF ) {
4154 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4155 return a;
4156 }
4157 if ( bExp == 0 ) {
4158 --expDiff;
4159 }
4160 else {
4161 bSig0 |= LIT64( 0x0001000000000000 );
4162 }
4163 shift128ExtraRightJamming(
4164 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4165 zExp = aExp;
4166 }
4167 else if ( expDiff < 0 ) {
4168 if ( bExp == 0x7FFF ) {
4169 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4170 return packFloat128( zSign, 0x7FFF, 0, 0 );
4171 }
4172 if ( aExp == 0 ) {
4173 ++expDiff;
4174 }
4175 else {
4176 aSig0 |= LIT64( 0x0001000000000000 );
4177 }
4178 shift128ExtraRightJamming(
4179 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4180 zExp = bExp;
4181 }
4182 else {
4183 if ( aExp == 0x7FFF ) {
4184 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4185 return propagateFloat128NaN( a, b );
4186 }
4187 return a;
4188 }
4189 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4190 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4191 zSig2 = 0;
4192 zSig0 |= LIT64( 0x0002000000000000 );
4193 zExp = aExp;
4194 goto shiftRight1;
4195 }
4196 aSig0 |= LIT64( 0x0001000000000000 );
4197 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4198 --zExp;
4199 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4200 ++zExp;
4201 shiftRight1:
4202 shift128ExtraRightJamming(
4203 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4204 roundAndPack:
4205 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4206
4207 }
4208
4209 /*
4210 -------------------------------------------------------------------------------
4211 Returns the result of subtracting the absolute values of the quadruple-
4212 precision floating-point values `a' and `b'. If `zSign' is true, the
4213 difference is negated before being returned. `zSign' is ignored if the
4214 result is a NaN. The subtraction is performed according to the IEC/IEEE
4215 Standard for Binary Floating-point Arithmetic.
4216 -------------------------------------------------------------------------------
4217 */
4218 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4219 {
4220 int32 aExp, bExp, zExp;
4221 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4222 int32 expDiff;
4223 float128 z;
4224
4225 aSig1 = extractFloat128Frac1( a );
4226 aSig0 = extractFloat128Frac0( a );
4227 aExp = extractFloat128Exp( a );
4228 bSig1 = extractFloat128Frac1( b );
4229 bSig0 = extractFloat128Frac0( b );
4230 bExp = extractFloat128Exp( b );
4231 expDiff = aExp - bExp;
4232 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4233 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4234 if ( 0 < expDiff ) goto aExpBigger;
4235 if ( expDiff < 0 ) goto bExpBigger;
4236 if ( aExp == 0x7FFF ) {
4237 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4238 return propagateFloat128NaN( a, b );
4239 }
4240 float_raise( float_flag_invalid );
4241 z.low = float128_default_nan_low;
4242 z.high = float128_default_nan_high;
4243 return z;
4244 }
4245 if ( aExp == 0 ) {
4246 aExp = 1;
4247 bExp = 1;
4248 }
4249 if ( bSig0 < aSig0 ) goto aBigger;
4250 if ( aSig0 < bSig0 ) goto bBigger;
4251 if ( bSig1 < aSig1 ) goto aBigger;
4252 if ( aSig1 < bSig1 ) goto bBigger;
4253 return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4254 bExpBigger:
4255 if ( bExp == 0x7FFF ) {
4256 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4257 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4258 }
4259 if ( aExp == 0 ) {
4260 ++expDiff;
4261 }
4262 else {
4263 aSig0 |= LIT64( 0x4000000000000000 );
4264 }
4265 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4266 bSig0 |= LIT64( 0x4000000000000000 );
4267 bBigger:
4268 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4269 zExp = bExp;
4270 zSign ^= 1;
4271 goto normalizeRoundAndPack;
4272 aExpBigger:
4273 if ( aExp == 0x7FFF ) {
4274 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4275 return a;
4276 }
4277 if ( bExp == 0 ) {
4278 --expDiff;
4279 }
4280 else {
4281 bSig0 |= LIT64( 0x4000000000000000 );
4282 }
4283 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4284 aSig0 |= LIT64( 0x4000000000000000 );
4285 aBigger:
4286 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4287 zExp = aExp;
4288 normalizeRoundAndPack:
4289 --zExp;
4290 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4291
4292 }
4293
4294 /*
4295 -------------------------------------------------------------------------------
4296 Returns the result of adding the quadruple-precision floating-point values
4297 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4298 for Binary Floating-point Arithmetic.
4299 -------------------------------------------------------------------------------
4300 */
4301 float128 float128_add( float128 a, float128 b )
4302 {
4303 flag aSign, bSign;
4304
4305 aSign = extractFloat128Sign( a );
4306 bSign = extractFloat128Sign( b );
4307 if ( aSign == bSign ) {
4308 return addFloat128Sigs( a, b, aSign );
4309 }
4310 else {
4311 return subFloat128Sigs( a, b, aSign );
4312 }
4313
4314 }
4315
4316 /*
4317 -------------------------------------------------------------------------------
4318 Returns the result of subtracting the quadruple-precision floating-point
4319 values `a' and `b'. The operation is performed according to the IEC/IEEE
4320 Standard for Binary Floating-point Arithmetic.
4321 -------------------------------------------------------------------------------
4322 */
4323 float128 float128_sub( float128 a, float128 b )
4324 {
4325 flag aSign, bSign;
4326
4327 aSign = extractFloat128Sign( a );
4328 bSign = extractFloat128Sign( b );
4329 if ( aSign == bSign ) {
4330 return subFloat128Sigs( a, b, aSign );
4331 }
4332 else {
4333 return addFloat128Sigs( a, b, aSign );
4334 }
4335
4336 }
4337
4338 /*
4339 -------------------------------------------------------------------------------
4340 Returns the result of multiplying the quadruple-precision floating-point
4341 values `a' and `b'. The operation is performed according to the IEC/IEEE
4342 Standard for Binary Floating-point Arithmetic.
4343 -------------------------------------------------------------------------------
4344 */
4345 float128 float128_mul( float128 a, float128 b )
4346 {
4347 flag aSign, bSign, zSign;
4348 int32 aExp, bExp, zExp;
4349 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4350 float128 z;
4351
4352 aSig1 = extractFloat128Frac1( a );
4353 aSig0 = extractFloat128Frac0( a );
4354 aExp = extractFloat128Exp( a );
4355 aSign = extractFloat128Sign( a );
4356 bSig1 = extractFloat128Frac1( b );
4357 bSig0 = extractFloat128Frac0( b );
4358 bExp = extractFloat128Exp( b );
4359 bSign = extractFloat128Sign( b );
4360 zSign = aSign ^ bSign;
4361 if ( aExp == 0x7FFF ) {
4362 if ( ( aSig0 | aSig1 )
4363 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4364 return propagateFloat128NaN( a, b );
4365 }
4366 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4367 return packFloat128( zSign, 0x7FFF, 0, 0 );
4368 }
4369 if ( bExp == 0x7FFF ) {
4370 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4371 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4372 invalid:
4373 float_raise( float_flag_invalid );
4374 z.low = float128_default_nan_low;
4375 z.high = float128_default_nan_high;
4376 return z;
4377 }
4378 return packFloat128( zSign, 0x7FFF, 0, 0 );
4379 }
4380 if ( aExp == 0 ) {
4381 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4382 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4383 }
4384 if ( bExp == 0 ) {
4385 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4386 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4387 }
4388 zExp = aExp + bExp - 0x4000;
4389 aSig0 |= LIT64( 0x0001000000000000 );
4390 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4391 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4392 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4393 zSig2 |= ( zSig3 != 0 );
4394 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4395 shift128ExtraRightJamming(
4396 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4397 ++zExp;
4398 }
4399 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4400
4401 }
4402
4403 /*
4404 -------------------------------------------------------------------------------
4405 Returns the result of dividing the quadruple-precision floating-point value
4406 `a' by the corresponding value `b'. The operation is performed according to
4407 the IEC/IEEE Standard for Binary Floating-point Arithmetic.
4408 -------------------------------------------------------------------------------
4409 */
4410 float128 float128_div( float128 a, float128 b )
4411 {
4412 flag aSign, bSign, zSign;
4413 int32 aExp, bExp, zExp;
4414 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4415 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4416 float128 z;
4417
4418 aSig1 = extractFloat128Frac1( a );
4419 aSig0 = extractFloat128Frac0( a );
4420 aExp = extractFloat128Exp( a );
4421 aSign = extractFloat128Sign( a );
4422 bSig1 = extractFloat128Frac1( b );
4423 bSig0 = extractFloat128Frac0( b );
4424 bExp = extractFloat128Exp( b );
4425 bSign = extractFloat128Sign( b );
4426 zSign = aSign ^ bSign;
4427 if ( aExp == 0x7FFF ) {
4428 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4429 if ( bExp == 0x7FFF ) {
4430 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4431 goto invalid;
4432 }
4433 return packFloat128( zSign, 0x7FFF, 0, 0 );
4434 }
4435 if ( bExp == 0x7FFF ) {
4436 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4437 return packFloat128( zSign, 0, 0, 0 );
4438 }
4439 if ( bExp == 0 ) {
4440 if ( ( bSig0 | bSig1 ) == 0 ) {
4441 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4442 invalid:
4443 float_raise( float_flag_invalid );
4444 z.low = float128_default_nan_low;
4445 z.high = float128_default_nan_high;
4446 return z;
4447 }
4448 float_raise( float_flag_divbyzero );
4449 return packFloat128( zSign, 0x7FFF, 0, 0 );
4450 }
4451 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4452 }
4453 if ( aExp == 0 ) {
4454 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4455 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4456 }
4457 zExp = aExp - bExp + 0x3FFD;
4458 shortShift128Left(
4459 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4460 shortShift128Left(
4461 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4462 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4463 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4464 ++zExp;
4465 }
4466 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4467 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4468 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4469 while ( (sbits64) rem0 < 0 ) {
4470 --zSig0;
4471 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4472 }
4473 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4474 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4475 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4476 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4477 while ( (sbits64) rem1 < 0 ) {
4478 --zSig1;
4479 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4480 }
4481 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4482 }
4483 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4484 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4485
4486 }
4487
4488 /*
4489 -------------------------------------------------------------------------------
4490 Returns the remainder of the quadruple-precision floating-point value `a'
4491 with respect to the corresponding value `b'. The operation is performed
4492 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
4493 -------------------------------------------------------------------------------
4494 */
4495 float128 float128_rem( float128 a, float128 b )
4496 {
4497 flag aSign, bSign, zSign;
4498 int32 aExp, bExp, expDiff;
4499 bits64 aSig0, aSig1, bSig0, bSig1;
4500 bits64 q, term0, term1, term2, allZero, alternateASig0, alternateASig1;
4501 bits64 sigMean1;
4502 sbits64 sigMean0;
4503 float128 z;
4504
4505 aSig1 = extractFloat128Frac1( a );
4506 aSig0 = extractFloat128Frac0( a );
4507 aExp = extractFloat128Exp( a );
4508 aSign = extractFloat128Sign( a );
4509 bSig1 = extractFloat128Frac1( b );
4510 bSig0 = extractFloat128Frac0( b );
4511 bExp = extractFloat128Exp( b );
4512 bSign = extractFloat128Sign( b );
4513 if ( aExp == 0x7FFF ) {
4514 if ( ( aSig0 | aSig1 )
4515 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4516 return propagateFloat128NaN( a, b );
4517 }
4518 goto invalid;
4519 }
4520 if ( bExp == 0x7FFF ) {
4521 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4522 return a;
4523 }
4524 if ( bExp == 0 ) {
4525 if ( ( bSig0 | bSig1 ) == 0 ) {
4526 invalid:
4527 float_raise( float_flag_invalid );
4528 z.low = float128_default_nan_low;
4529 z.high = float128_default_nan_high;
4530 return z;
4531 }
4532 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4533 }
4534 if ( aExp == 0 ) {
4535 if ( ( aSig0 | aSig1 ) == 0 ) return a;
4536 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4537 }
4538 expDiff = aExp - bExp;
4539 if ( expDiff < -1 ) return a;
4540 shortShift128Left(
4541 aSig0 | LIT64( 0x0001000000000000 ),
4542 aSig1,
4543 15 - ( expDiff < 0 ),
4544 &aSig0,
4545 &aSig1
4546 );
4547 shortShift128Left(
4548 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4549 q = le128( bSig0, bSig1, aSig0, aSig1 );
4550 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4551 expDiff -= 64;
4552 while ( 0 < expDiff ) {
4553 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4554 q = ( 4 < q ) ? q - 4 : 0;
4555 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4556 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4557 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4558 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4559 expDiff -= 61;
4560 }
4561 if ( -64 < expDiff ) {
4562 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4563 q = ( 4 < q ) ? q - 4 : 0;
4564 q >>= - expDiff;
4565 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4566 expDiff += 52;
4567 if ( expDiff < 0 ) {
4568 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4569 }
4570 else {
4571 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4572 }
4573 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4574 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4575 }
4576 else {
4577 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4578 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4579 }
4580 do {
4581 alternateASig0 = aSig0;
4582 alternateASig1 = aSig1;
4583 ++q;
4584 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4585 } while ( 0 <= (sbits64) aSig0 );
4586 add128(
4587 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
4588 if ( ( sigMean0 < 0 )
4589 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
4590 aSig0 = alternateASig0;
4591 aSig1 = alternateASig1;
4592 }
4593 zSign = ( (sbits64) aSig0 < 0 );
4594 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
4595 return
4596 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
4597
4598 }
4599
4600 /*
4601 -------------------------------------------------------------------------------
4602 Returns the square root of the quadruple-precision floating-point value `a'.
4603 The operation is performed according to the IEC/IEEE Standard for Binary
4604 Floating-point Arithmetic.
4605 -------------------------------------------------------------------------------
4606 */
4607 float128 float128_sqrt( float128 a )
4608 {
4609 flag aSign;
4610 int32 aExp, zExp;
4611 bits64 aSig0, aSig1, zSig0, zSig1, zSig2;
4612 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4613 bits64 shiftedRem0, shiftedRem1;
4614 float128 z;
4615
4616 aSig1 = extractFloat128Frac1( a );
4617 aSig0 = extractFloat128Frac0( a );
4618 aExp = extractFloat128Exp( a );
4619 aSign = extractFloat128Sign( a );
4620 if ( aExp == 0x7FFF ) {
4621 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
4622 if ( ! aSign ) return a;
4623 goto invalid;
4624 }
4625 if ( aSign ) {
4626 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
4627 invalid:
4628 float_raise( float_flag_invalid );
4629 z.low = float128_default_nan_low;
4630 z.high = float128_default_nan_high;
4631 return z;
4632 }
4633 if ( aExp == 0 ) {
4634 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
4635 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4636 }
4637 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
4638 aSig0 |= LIT64( 0x0001000000000000 );
4639 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
4640 zSig0 <<= 31;
4641 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
4642 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
4643 if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
4644 shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
4645 mul64To128( zSig0, zSig0, &term0, &term1 );
4646 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4647 while ( (sbits64) rem0 < 0 ) {
4648 --zSig0;
4649 shortShift128Left( 0, zSig0, 1, &term0, &term1 );
4650 term1 |= 1;
4651 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
4652 }
4653 shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
4654 zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
4655 if ( ( zSig1 & 0x3FFF ) <= 5 ) {
4656 if ( zSig1 == 0 ) zSig1 = 1;
4657 mul64To128( zSig0, zSig1, &term1, &term2 );
4658 shortShift128Left( term1, term2, 1, &term1, &term2 );
4659 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4660 mul64To128( zSig1, zSig1, &term2, &term3 );
4661 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4662 while ( (sbits64) rem1 < 0 ) {
4663 --zSig1;
4664 shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
4665 term3 |= 1;
4666 add192(
4667 rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
4668 }
4669 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4670 }
4671 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4672 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
4673
4674 }
4675
4676 /*
4677 -------------------------------------------------------------------------------
4678 Returns 1 if the quadruple-precision floating-point value `a' is equal to
4679 the corresponding value `b', and 0 otherwise. The comparison is performed
4680 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
4681 -------------------------------------------------------------------------------
4682 */
4683 flag float128_eq( float128 a, float128 b )
4684 {
4685
4686 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
4687 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4688 || ( ( extractFloat128Exp( b ) == 0x7FFF )
4689 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4690 ) {
4691 if ( float128_is_signaling_nan( a )
4692 || float128_is_signaling_nan( b ) ) {
4693 float_raise( float_flag_invalid );
4694 }
4695 return 0;
4696 }
4697 return
4698 ( a.low == b.low )
4699 && ( ( a.high == b.high )
4700 || ( ( a.low == 0 )
4701 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
4702 );
4703
4704 }
4705
4706 /*
4707 -------------------------------------------------------------------------------
4708 Returns 1 if the quadruple-precision floating-point value `a' is less than
4709 or equal to the corresponding value `b', and 0 otherwise. The comparison
4710 is performed according to the IEC/IEEE Standard for Binary Floating-point
4711 Arithmetic.
4712 -------------------------------------------------------------------------------
4713 */
4714 flag float128_le( float128 a, float128 b )
4715 {
4716 flag aSign, bSign;
4717
4718 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
4719 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4720 || ( ( extractFloat128Exp( b ) == 0x7FFF )
4721 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4722 ) {
4723 float_raise( float_flag_invalid );
4724 return 0;
4725 }
4726 aSign = extractFloat128Sign( a );
4727 bSign = extractFloat128Sign( b );
4728 if ( aSign != bSign ) {
4729 return
4730 aSign
4731 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4732 == 0 );
4733 }
4734 return
4735 aSign ? le128( b.high, b.low, a.high, a.low )
4736 : le128( a.high, a.low, b.high, b.low );
4737
4738 }
4739
4740 /*
4741 -------------------------------------------------------------------------------
4742 Returns 1 if the quadruple-precision floating-point value `a' is less than
4743 the corresponding value `b', and 0 otherwise. The comparison is performed
4744 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
4745 -------------------------------------------------------------------------------
4746 */
4747 flag float128_lt( float128 a, float128 b )
4748 {
4749 flag aSign, bSign;
4750
4751 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
4752 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4753 || ( ( extractFloat128Exp( b ) == 0x7FFF )
4754 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4755 ) {
4756 float_raise( float_flag_invalid );
4757 return 0;
4758 }
4759 aSign = extractFloat128Sign( a );
4760 bSign = extractFloat128Sign( b );
4761 if ( aSign != bSign ) {
4762 return
4763 aSign
4764 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4765 != 0 );
4766 }
4767 return
4768 aSign ? lt128( b.high, b.low, a.high, a.low )
4769 : lt128( a.high, a.low, b.high, b.low );
4770
4771 }
4772
4773 /*
4774 -------------------------------------------------------------------------------
4775 Returns 1 if the quadruple-precision floating-point value `a' is equal to
4776 the corresponding value `b', and 0 otherwise. The invalid exception is
4777 raised if either operand is a NaN. Otherwise, the comparison is performed
4778 according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
4779 -------------------------------------------------------------------------------
4780 */
4781 flag float128_eq_signaling( float128 a, float128 b )
4782 {
4783
4784 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
4785 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4786 || ( ( extractFloat128Exp( b ) == 0x7FFF )
4787 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4788 ) {
4789 float_raise( float_flag_invalid );
4790 return 0;
4791 }
4792 return
4793 ( a.low == b.low )
4794 && ( ( a.high == b.high )
4795 || ( ( a.low == 0 )
4796 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
4797 );
4798
4799 }
4800
4801 /*
4802 -------------------------------------------------------------------------------
4803 Returns 1 if the quadruple-precision floating-point value `a' is less than
4804 or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4805 cause an exception. Otherwise, the comparison is performed according to the
4806 IEC/IEEE Standard for Binary Floating-point Arithmetic.
4807 -------------------------------------------------------------------------------
4808 */
4809 flag float128_le_quiet( float128 a, float128 b )
4810 {
4811 flag aSign, bSign;
4812
4813 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
4814 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4815 || ( ( extractFloat128Exp( b ) == 0x7FFF )
4816 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4817 ) {
4818 if ( float128_is_signaling_nan( a )
4819 || float128_is_signaling_nan( b ) ) {
4820 float_raise( float_flag_invalid );
4821 }
4822 return 0;
4823 }
4824 aSign = extractFloat128Sign( a );
4825 bSign = extractFloat128Sign( b );
4826 if ( aSign != bSign ) {
4827 return
4828 aSign
4829 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4830 == 0 );
4831 }
4832 return
4833 aSign ? le128( b.high, b.low, a.high, a.low )
4834 : le128( a.high, a.low, b.high, b.low );
4835
4836 }
4837
4838 /*
4839 -------------------------------------------------------------------------------
4840 Returns 1 if the quadruple-precision floating-point value `a' is less than
4841 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4842 exception. Otherwise, the comparison is performed according to the IEC/IEEE
4843 Standard for Binary Floating-point Arithmetic.
4844 -------------------------------------------------------------------------------
4845 */
4846 flag float128_lt_quiet( float128 a, float128 b )
4847 {
4848 flag aSign, bSign;
4849
4850 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
4851 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4852 || ( ( extractFloat128Exp( b ) == 0x7FFF )
4853 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4854 ) {
4855 if ( float128_is_signaling_nan( a )
4856 || float128_is_signaling_nan( b ) ) {
4857 float_raise( float_flag_invalid );
4858 }
4859 return 0;
4860 }
4861 aSign = extractFloat128Sign( a );
4862 bSign = extractFloat128Sign( b );
4863 if ( aSign != bSign ) {
4864 return
4865 aSign
4866 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4867 != 0 );
4868 }
4869 return
4870 aSign ? lt128( b.high, b.low, a.high, a.low )
4871 : lt128( a.high, a.low, b.high, b.low );
4872
4873 }
4874
4875 #endif
4876
4877