File: /usr/src/linux/arch/arm/nwfpe/softfloat.c

1     /*
2     ===============================================================================
3     
4     This C source file is part of the SoftFloat IEC/IEEE Floating-point
5     Arithmetic Package, Release 2.
6     
7     Written by John R. Hauser.  This work was made possible in part by the
8     International Computer Science Institute, located at Suite 600, 1947 Center
9     Street, Berkeley, California 94704.  Funding was partially provided by the
10     National Science Foundation under grant MIP-9311980.  The original version
11     of this code was written as part of a project to build a fixed-point vector
12     processor in collaboration with the University of California at Berkeley,
13     overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
14     is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
15     arithmetic/softfloat.html'.
16     
17     THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
18     has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
19     TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
20     PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
21     AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
22     
23     Derivative works are acceptable, even for commercial purposes, so long as
24     (1) they include prominent notice that the work is derivative, and (2) they
25     include prominent notice akin to these three paragraphs for those parts of
26     this code that are retained.
27     
28     ===============================================================================
29     */
30     
31     #include "milieu.h"
32     #include "softfloat.h"
33     
34     /*
35     -------------------------------------------------------------------------------
36     Floating-point rounding mode, extended double-precision rounding precision,
37     and exception flags.
38     -------------------------------------------------------------------------------
39     */
40     int8 float_rounding_mode = float_round_nearest_even;
41     int8 floatx80_rounding_precision = 80;
42     int8 float_exception_flags;
43     
44     /*
45     -------------------------------------------------------------------------------
46     Primitive arithmetic functions, including multi-word arithmetic, and
47     division and square root approximations.  (Can be specialized to target if
48     desired.)
49     -------------------------------------------------------------------------------
50     */
51     #include "softfloat-macros"
52     
53     /*
54     -------------------------------------------------------------------------------
55     Functions and definitions to determine:  (1) whether tininess for underflow
56     is detected before or after rounding by default, (2) what (if anything)
57     happens when exceptions are raised, (3) how signaling NaNs are distinguished
58     from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
59     are propagated from function inputs to output.  These details are target-
60     specific.
61     -------------------------------------------------------------------------------
62     */
63     #include "softfloat-specialize"
64     
65     /*
66     -------------------------------------------------------------------------------
67     Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
68     and 7, and returns the properly rounded 32-bit integer corresponding to the
69     input.  If `zSign' is nonzero, the input is negated before being converted
70     to an integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point
71     input is simply rounded to an integer, with the inexact exception raised if
72     the input cannot be represented exactly as an integer.  If the fixed-point
73     input is too large, however, the invalid exception is raised and the largest
74     positive or negative integer is returned.
75     -------------------------------------------------------------------------------
76     */
77     static int32 roundAndPackInt32( flag zSign, bits64 absZ )
78     {
79         int8 roundingMode;
80         flag roundNearestEven;
81         int8 roundIncrement, roundBits;
82         int32 z;
83     
84         roundingMode = float_rounding_mode;
85         roundNearestEven = ( roundingMode == float_round_nearest_even );
86         roundIncrement = 0x40;
87         if ( ! roundNearestEven ) {
88             if ( roundingMode == float_round_to_zero ) {
89                 roundIncrement = 0;
90             }
91             else {
92                 roundIncrement = 0x7F;
93                 if ( zSign ) {
94                     if ( roundingMode == float_round_up ) roundIncrement = 0;
95                 }
96                 else {
97                     if ( roundingMode == float_round_down ) roundIncrement = 0;
98                 }
99             }
100         }
101         roundBits = absZ & 0x7F;
102         absZ = ( absZ + roundIncrement )>>7;
103         absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
104         z = absZ;
105         if ( zSign ) z = - z;
106         if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
107             float_exception_flags |= float_flag_invalid;
108             return zSign ? 0x80000000 : 0x7FFFFFFF;
109         }
110         if ( roundBits ) float_exception_flags |= float_flag_inexact;
111         return z;
112     
113     }
114     
115     /*
116     -------------------------------------------------------------------------------
117     Returns the fraction bits of the single-precision floating-point value `a'.
118     -------------------------------------------------------------------------------
119     */
120     INLINE bits32 extractFloat32Frac( float32 a )
121     {
122     
123         return a & 0x007FFFFF;
124     
125     }
126     
127     /*
128     -------------------------------------------------------------------------------
129     Returns the exponent bits of the single-precision floating-point value `a'.
130     -------------------------------------------------------------------------------
131     */
132     INLINE int16 extractFloat32Exp( float32 a )
133     {
134     
135         return ( a>>23 ) & 0xFF;
136     
137     }
138     
139     /*
140     -------------------------------------------------------------------------------
141     Returns the sign bit of the single-precision floating-point value `a'.
142     -------------------------------------------------------------------------------
143     */
144     INLINE flag extractFloat32Sign( float32 a )
145     {
146     
147         return a>>31;
148     
149     }
150     
151     /*
152     -------------------------------------------------------------------------------
153     Normalizes the subnormal single-precision floating-point value represented
154     by the denormalized significand `aSig'.  The normalized exponent and
155     significand are stored at the locations pointed to by `zExpPtr' and
156     `zSigPtr', respectively.
157     -------------------------------------------------------------------------------
158     */
159     static void
160      normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
161     {
162         int8 shiftCount;
163     
164         shiftCount = countLeadingZeros32( aSig ) - 8;
165         *zSigPtr = aSig<<shiftCount;
166         *zExpPtr = 1 - shiftCount;
167     
168     }
169     
170     /*
171     -------------------------------------------------------------------------------
172     Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
173     single-precision floating-point value, returning the result.  After being
174     shifted into the proper positions, the three fields are simply added
175     together to form the result.  This means that any integer portion of `zSig'
176     will be added into the exponent.  Since a properly normalized significand
177     will have an integer portion equal to 1, the `zExp' input should be 1 less
178     than the desired result exponent whenever `zSig' is a complete, normalized
179     significand.
180     -------------------------------------------------------------------------------
181     */
182     INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
183     {
184     #if 0
185        float32 f;
186        __asm__("@ packFloat32;
187        	    mov %0, %1, asl #31;
188        	    orr %0, %2, asl #23;
189        	    orr %0, %3"
190        	    : /* no outputs */
191        	    : "g" (f), "g" (zSign), "g" (zExp), "g" (zSig)
192        	    : "cc");
193        return f;
194     #else
195         return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
196     #endif 
197     }
198     
199     /*
200     -------------------------------------------------------------------------------
201     Takes an abstract floating-point value having sign `zSign', exponent `zExp',
202     and significand `zSig', and returns the proper single-precision floating-
203     point value corresponding to the abstract input.  Ordinarily, the abstract
204     value is simply rounded and packed into the single-precision format, with
205     the inexact exception raised if the abstract input cannot be represented
206     exactly.  If the abstract value is too large, however, the overflow and
207     inexact exceptions are raised and an infinity or maximal finite value is
208     returned.  If the abstract value is too small, the input value is rounded to
209     a subnormal number, and the underflow and inexact exceptions are raised if
210     the abstract input cannot be represented exactly as a subnormal single-
211     precision floating-point number.
212         The input significand `zSig' has its binary point between bits 30
213     and 29, which is 7 bits to the left of the usual location.  This shifted
214     significand must be normalized or smaller.  If `zSig' is not normalized,
215     `zExp' must be 0; in that case, the result returned is a subnormal number,
216     and it must not require rounding.  In the usual case that `zSig' is
217     normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
218     The handling of underflow and overflow follows the IEC/IEEE Standard for
219     Binary Floating-point Arithmetic.
220     -------------------------------------------------------------------------------
221     */
222     static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
223     {
224         int8 roundingMode;
225         flag roundNearestEven;
226         int8 roundIncrement, roundBits;
227         flag isTiny;
228     
229         roundingMode = float_rounding_mode;
230         roundNearestEven = ( roundingMode == float_round_nearest_even );
231         roundIncrement = 0x40;
232         if ( ! roundNearestEven ) {
233             if ( roundingMode == float_round_to_zero ) {
234                 roundIncrement = 0;
235             }
236             else {
237                 roundIncrement = 0x7F;
238                 if ( zSign ) {
239                     if ( roundingMode == float_round_up ) roundIncrement = 0;
240                 }
241                 else {
242                     if ( roundingMode == float_round_down ) roundIncrement = 0;
243                 }
244             }
245         }
246         roundBits = zSig & 0x7F;
247         if ( 0xFD <= (bits16) zExp ) {
248             if (    ( 0xFD < zExp )
249                  || (    ( zExp == 0xFD )
250                       && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
251                ) {
252                 float_raise( float_flag_overflow | float_flag_inexact );
253                 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
254             }
255             if ( zExp < 0 ) {
256                 isTiny =
257                        ( float_detect_tininess == float_tininess_before_rounding )
258                     || ( zExp < -1 )
259                     || ( zSig + roundIncrement < 0x80000000 );
260                 shift32RightJamming( zSig, - zExp, &zSig );
261                 zExp = 0;
262                 roundBits = zSig & 0x7F;
263                 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
264             }
265         }
266         if ( roundBits ) float_exception_flags |= float_flag_inexact;
267         zSig = ( zSig + roundIncrement )>>7;
268         zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
269         if ( zSig == 0 ) zExp = 0;
270         return packFloat32( zSign, zExp, zSig );
271     
272     }
273     
274     /*
275     -------------------------------------------------------------------------------
276     Takes an abstract floating-point value having sign `zSign', exponent `zExp',
277     and significand `zSig', and returns the proper single-precision floating-
278     point value corresponding to the abstract input.  This routine is just like
279     `roundAndPackFloat32' except that `zSig' does not have to be normalized in
280     any way.  In all cases, `zExp' must be 1 less than the ``true'' floating-
281     point exponent.
282     -------------------------------------------------------------------------------
283     */
284     static float32
285      normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
286     {
287         int8 shiftCount;
288     
289         shiftCount = countLeadingZeros32( zSig ) - 1;
290         return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
291     
292     }
293     
294     /*
295     -------------------------------------------------------------------------------
296     Returns the fraction bits of the double-precision floating-point value `a'.
297     -------------------------------------------------------------------------------
298     */
299     INLINE bits64 extractFloat64Frac( float64 a )
300     {
301     
302         return a & LIT64( 0x000FFFFFFFFFFFFF );
303     
304     }
305     
306     /*
307     -------------------------------------------------------------------------------
308     Returns the exponent bits of the double-precision floating-point value `a'.
309     -------------------------------------------------------------------------------
310     */
311     INLINE int16 extractFloat64Exp( float64 a )
312     {
313     
314         return ( a>>52 ) & 0x7FF;
315     
316     }
317     
318     /*
319     -------------------------------------------------------------------------------
320     Returns the sign bit of the double-precision floating-point value `a'.
321     -------------------------------------------------------------------------------
322     */
323     INLINE flag extractFloat64Sign( float64 a )
324     {
325     
326         return a>>63;
327     
328     }
329     
330     /*
331     -------------------------------------------------------------------------------
332     Normalizes the subnormal double-precision floating-point value represented
333     by the denormalized significand `aSig'.  The normalized exponent and
334     significand are stored at the locations pointed to by `zExpPtr' and
335     `zSigPtr', respectively.
336     -------------------------------------------------------------------------------
337     */
338     static void
339      normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
340     {
341         int8 shiftCount;
342     
343         shiftCount = countLeadingZeros64( aSig ) - 11;
344         *zSigPtr = aSig<<shiftCount;
345         *zExpPtr = 1 - shiftCount;
346     
347     }
348     
349     /*
350     -------------------------------------------------------------------------------
351     Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
352     double-precision floating-point value, returning the result.  After being
353     shifted into the proper positions, the three fields are simply added
354     together to form the result.  This means that any integer portion of `zSig'
355     will be added into the exponent.  Since a properly normalized significand
356     will have an integer portion equal to 1, the `zExp' input should be 1 less
357     than the desired result exponent whenever `zSig' is a complete, normalized
358     significand.
359     -------------------------------------------------------------------------------
360     */
361     INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
362     {
363     
364         return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
365     
366     }
367     
368     /*
369     -------------------------------------------------------------------------------
370     Takes an abstract floating-point value having sign `zSign', exponent `zExp',
371     and significand `zSig', and returns the proper double-precision floating-
372     point value corresponding to the abstract input.  Ordinarily, the abstract
373     value is simply rounded and packed into the double-precision format, with
374     the inexact exception raised if the abstract input cannot be represented
375     exactly.  If the abstract value is too large, however, the overflow and
376     inexact exceptions are raised and an infinity or maximal finite value is
377     returned.  If the abstract value is too small, the input value is rounded to
378     a subnormal number, and the underflow and inexact exceptions are raised if
379     the abstract input cannot be represented exactly as a subnormal double-
380     precision floating-point number.
381         The input significand `zSig' has its binary point between bits 62
382     and 61, which is 10 bits to the left of the usual location.  This shifted
383     significand must be normalized or smaller.  If `zSig' is not normalized,
384     `zExp' must be 0; in that case, the result returned is a subnormal number,
385     and it must not require rounding.  In the usual case that `zSig' is
386     normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
387     The handling of underflow and overflow follows the IEC/IEEE Standard for
388     Binary Floating-point Arithmetic.
389     -------------------------------------------------------------------------------
390     */
391     static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
392     {
393         int8 roundingMode;
394         flag roundNearestEven;
395         int16 roundIncrement, roundBits;
396         flag isTiny;
397     
398         roundingMode = float_rounding_mode;
399         roundNearestEven = ( roundingMode == float_round_nearest_even );
400         roundIncrement = 0x200;
401         if ( ! roundNearestEven ) {
402             if ( roundingMode == float_round_to_zero ) {
403                 roundIncrement = 0;
404             }
405             else {
406                 roundIncrement = 0x3FF;
407                 if ( zSign ) {
408                     if ( roundingMode == float_round_up ) roundIncrement = 0;
409                 }
410                 else {
411                     if ( roundingMode == float_round_down ) roundIncrement = 0;
412                 }
413             }
414         }
415         roundBits = zSig & 0x3FF;
416         if ( 0x7FD <= (bits16) zExp ) {
417             if (    ( 0x7FD < zExp )
418                  || (    ( zExp == 0x7FD )
419                       && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
420                ) {
421                 //register int lr = __builtin_return_address(0);
422                 //printk("roundAndPackFloat64 called from 0x%08x\n",lr);
423                 float_raise( float_flag_overflow | float_flag_inexact );
424                 return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
425             }
426             if ( zExp < 0 ) {
427                 isTiny =
428                        ( float_detect_tininess == float_tininess_before_rounding )
429                     || ( zExp < -1 )
430                     || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
431                 shift64RightJamming( zSig, - zExp, &zSig );
432                 zExp = 0;
433                 roundBits = zSig & 0x3FF;
434                 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
435             }
436         }
437         if ( roundBits ) float_exception_flags |= float_flag_inexact;
438         zSig = ( zSig + roundIncrement )>>10;
439         zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
440         if ( zSig == 0 ) zExp = 0;
441         return packFloat64( zSign, zExp, zSig );
442     
443     }
444     
445     /*
446     -------------------------------------------------------------------------------
447     Takes an abstract floating-point value having sign `zSign', exponent `zExp',
448     and significand `zSig', and returns the proper double-precision floating-
449     point value corresponding to the abstract input.  This routine is just like
450     `roundAndPackFloat64' except that `zSig' does not have to be normalized in
451     any way.  In all cases, `zExp' must be 1 less than the ``true'' floating-
452     point exponent.
453     -------------------------------------------------------------------------------
454     */
455     static float64
456      normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
457     {
458         int8 shiftCount;
459     
460         shiftCount = countLeadingZeros64( zSig ) - 1;
461         return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
462     
463     }
464     
465     #ifdef FLOATX80
466     
467     /*
468     -------------------------------------------------------------------------------
469     Returns the fraction bits of the extended double-precision floating-point
470     value `a'.
471     -------------------------------------------------------------------------------
472     */
473     INLINE bits64 extractFloatx80Frac( floatx80 a )
474     {
475     
476         return a.low;
477     
478     }
479     
480     /*
481     -------------------------------------------------------------------------------
482     Returns the exponent bits of the extended double-precision floating-point
483     value `a'.
484     -------------------------------------------------------------------------------
485     */
486     INLINE int32 extractFloatx80Exp( floatx80 a )
487     {
488     
489         return a.high & 0x7FFF;
490     
491     }
492     
493     /*
494     -------------------------------------------------------------------------------
495     Returns the sign bit of the extended double-precision floating-point value
496     `a'.
497     -------------------------------------------------------------------------------
498     */
499     INLINE flag extractFloatx80Sign( floatx80 a )
500     {
501     
502         return a.high>>15;
503     
504     }
505     
506     /*
507     -------------------------------------------------------------------------------
508     Normalizes the subnormal extended double-precision floating-point value
509     represented by the denormalized significand `aSig'.  The normalized exponent
510     and significand are stored at the locations pointed to by `zExpPtr' and
511     `zSigPtr', respectively.
512     -------------------------------------------------------------------------------
513     */
514     static void
515      normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
516     {
517         int8 shiftCount;
518     
519         shiftCount = countLeadingZeros64( aSig );
520         *zSigPtr = aSig<<shiftCount;
521         *zExpPtr = 1 - shiftCount;
522     
523     }
524     
525     /*
526     -------------------------------------------------------------------------------
527     Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
528     extended double-precision floating-point value, returning the result.
529     -------------------------------------------------------------------------------
530     */
531     INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
532     {
533         floatx80 z;
534     
535         z.low = zSig;
536         z.high = ( ( (bits16) zSign )<<15 ) + zExp;
537         return z;
538     
539     }
540     
541     /*
542     -------------------------------------------------------------------------------
543     Takes an abstract floating-point value having sign `zSign', exponent `zExp',
544     and extended significand formed by the concatenation of `zSig0' and `zSig1',
545     and returns the proper extended double-precision floating-point value
546     corresponding to the abstract input.  Ordinarily, the abstract value is
547     rounded and packed into the extended double-precision format, with the
548     inexact exception raised if the abstract input cannot be represented
549     exactly.  If the abstract value is too large, however, the overflow and
550     inexact exceptions are raised and an infinity or maximal finite value is
551     returned.  If the abstract value is too small, the input value is rounded to
552     a subnormal number, and the underflow and inexact exceptions are raised if
553     the abstract input cannot be represented exactly as a subnormal extended
554     double-precision floating-point number.
555         If `roundingPrecision' is 32 or 64, the result is rounded to the same
556     number of bits as single or double precision, respectively.  Otherwise, the
557     result is rounded to the full precision of the extended double-precision
558     format.
559         The input significand must be normalized or smaller.  If the input
560     significand is not normalized, `zExp' must be 0; in that case, the result
561     returned is a subnormal number, and it must not require rounding.  The
562     handling of underflow and overflow follows the IEC/IEEE Standard for Binary
563     Floating-point Arithmetic.
564     -------------------------------------------------------------------------------
565     */
566     static floatx80
567      roundAndPackFloatx80(
568          int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
569      )
570     {
571         int8 roundingMode;
572         flag roundNearestEven, increment, isTiny;
573         int64 roundIncrement, roundMask, roundBits;
574     
575         roundingMode = float_rounding_mode;
576         roundNearestEven = ( roundingMode == float_round_nearest_even );
577         if ( roundingPrecision == 80 ) goto precision80;
578         if ( roundingPrecision == 64 ) {
579             roundIncrement = LIT64( 0x0000000000000400 );
580             roundMask = LIT64( 0x00000000000007FF );
581         }
582         else if ( roundingPrecision == 32 ) {
583             roundIncrement = LIT64( 0x0000008000000000 );
584             roundMask = LIT64( 0x000000FFFFFFFFFF );
585         }
586         else {
587             goto precision80;
588         }
589         zSig0 |= ( zSig1 != 0 );
590         if ( ! roundNearestEven ) {
591             if ( roundingMode == float_round_to_zero ) {
592                 roundIncrement = 0;
593             }
594             else {
595                 roundIncrement = roundMask;
596                 if ( zSign ) {
597                     if ( roundingMode == float_round_up ) roundIncrement = 0;
598                 }
599                 else {
600                     if ( roundingMode == float_round_down ) roundIncrement = 0;
601                 }
602             }
603         }
604         roundBits = zSig0 & roundMask;
605         if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
606             if (    ( 0x7FFE < zExp )
607                  || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
608                ) {
609                 goto overflow;
610             }
611             if ( zExp <= 0 ) {
612                 isTiny =
613                        ( float_detect_tininess == float_tininess_before_rounding )
614                     || ( zExp < 0 )
615                     || ( zSig0 <= zSig0 + roundIncrement );
616                 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
617                 zExp = 0;
618                 roundBits = zSig0 & roundMask;
619                 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
620                 if ( roundBits ) float_exception_flags |= float_flag_inexact;
621                 zSig0 += roundIncrement;
622                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
623                 roundIncrement = roundMask + 1;
624                 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
625                     roundMask |= roundIncrement;
626                 }
627                 zSig0 &= ~ roundMask;
628                 return packFloatx80( zSign, zExp, zSig0 );
629             }
630         }
631         if ( roundBits ) float_exception_flags |= float_flag_inexact;
632         zSig0 += roundIncrement;
633         if ( zSig0 < roundIncrement ) {
634             ++zExp;
635             zSig0 = LIT64( 0x8000000000000000 );
636         }
637         roundIncrement = roundMask + 1;
638         if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
639             roundMask |= roundIncrement;
640         }
641         zSig0 &= ~ roundMask;
642         if ( zSig0 == 0 ) zExp = 0;
643         return packFloatx80( zSign, zExp, zSig0 );
644      precision80:
645         increment = ( (sbits64) zSig1 < 0 );
646         if ( ! roundNearestEven ) {
647             if ( roundingMode == float_round_to_zero ) {
648                 increment = 0;
649             }
650             else {
651                 if ( zSign ) {
652                     increment = ( roundingMode == float_round_down ) && zSig1;
653                 }
654                 else {
655                     increment = ( roundingMode == float_round_up ) && zSig1;
656                 }
657             }
658         }
659         if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
660             if (    ( 0x7FFE < zExp )
661                  || (    ( zExp == 0x7FFE )
662                       && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
663                       && increment
664                     )
665                ) {
666                 roundMask = 0;
667      overflow:
668                 float_raise( float_flag_overflow | float_flag_inexact );
669                 if (    ( roundingMode == float_round_to_zero )
670                      || ( zSign && ( roundingMode == float_round_up ) )
671                      || ( ! zSign && ( roundingMode == float_round_down ) )
672                    ) {
673                     return packFloatx80( zSign, 0x7FFE, ~ roundMask );
674                 }
675                 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
676             }
677             if ( zExp <= 0 ) {
678                 isTiny =
679                        ( float_detect_tininess == float_tininess_before_rounding )
680                     || ( zExp < 0 )
681                     || ! increment
682                     || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
683                 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
684                 zExp = 0;
685                 if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
686                 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
687                 if ( roundNearestEven ) {
688                     increment = ( (sbits64) zSig1 < 0 );
689                 }
690                 else {
691                     if ( zSign ) {
692                         increment = ( roundingMode == float_round_down ) && zSig1;
693                     }
694                     else {
695                         increment = ( roundingMode == float_round_up ) && zSig1;
696                     }
697                 }
698                 if ( increment ) {
699                     ++zSig0;
700                     zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
701                     if ( (sbits64) zSig0 < 0 ) zExp = 1;
702                 }
703                 return packFloatx80( zSign, zExp, zSig0 );
704             }
705         }
706         if ( zSig1 ) float_exception_flags |= float_flag_inexact;
707         if ( increment ) {
708             ++zSig0;
709             if ( zSig0 == 0 ) {
710                 ++zExp;
711                 zSig0 = LIT64( 0x8000000000000000 );
712             }
713             else {
714                 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
715             }
716         }
717         else {
718             if ( zSig0 == 0 ) zExp = 0;
719         }
720         
721         return packFloatx80( zSign, zExp, zSig0 );
722     }
723     
724     /*
725     -------------------------------------------------------------------------------
726     Takes an abstract floating-point value having sign `zSign', exponent
727     `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
728     and returns the proper extended double-precision floating-point value
729     corresponding to the abstract input.  This routine is just like
730     `roundAndPackFloatx80' except that the input significand does not have to be
731     normalized.
732     -------------------------------------------------------------------------------
733     */
734     static floatx80
735      normalizeRoundAndPackFloatx80(
736          int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
737      )
738     {
739         int8 shiftCount;
740     
741         if ( zSig0 == 0 ) {
742             zSig0 = zSig1;
743             zSig1 = 0;
744             zExp -= 64;
745         }
746         shiftCount = countLeadingZeros64( zSig0 );
747         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
748         zExp -= shiftCount;
749         return
750             roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
751     
752     }
753     
754     #endif
755     
756     #ifdef FLOAT128
757     
758     /*
759     -------------------------------------------------------------------------------
760     Returns the least-significant 64 fraction bits of the quadruple-precision
761     floating-point value `a'.
762     -------------------------------------------------------------------------------
763     */
764     INLINE bits64 extractFloat128Frac1( float128 a )
765     {
766     
767         return a.low;
768     
769     }
770     
771     /*
772     -------------------------------------------------------------------------------
773     Returns the most-significant 48 fraction bits of the quadruple-precision
774     floating-point value `a'.
775     -------------------------------------------------------------------------------
776     */
777     INLINE bits64 extractFloat128Frac0( float128 a )
778     {
779     
780         return a.high & LIT64( 0x0000FFFFFFFFFFFF );
781     
782     }
783     
784     /*
785     -------------------------------------------------------------------------------
786     Returns the exponent bits of the quadruple-precision floating-point value
787     `a'.
788     -------------------------------------------------------------------------------
789     */
790     INLINE int32 extractFloat128Exp( float128 a )
791     {
792     
793         return ( a.high>>48 ) & 0x7FFF;
794     
795     }
796     
797     /*
798     -------------------------------------------------------------------------------
799     Returns the sign bit of the quadruple-precision floating-point value `a'.
800     -------------------------------------------------------------------------------
801     */
802     INLINE flag extractFloat128Sign( float128 a )
803     {
804     
805         return a.high>>63;
806     
807     }
808     
809     /*
810     -------------------------------------------------------------------------------
811     Normalizes the subnormal quadruple-precision floating-point value
812     represented by the denormalized significand formed by the concatenation of
813     `aSig0' and `aSig1'.  The normalized exponent is stored at the location
814     pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
815     significand are stored at the location pointed to by `zSig0Ptr', and the
816     least significant 64 bits of the normalized significand are stored at the
817     location pointed to by `zSig1Ptr'.
818     -------------------------------------------------------------------------------
819     */
820     static void
821      normalizeFloat128Subnormal(
822          bits64 aSig0,
823          bits64 aSig1,
824          int32 *zExpPtr,
825          bits64 *zSig0Ptr,
826          bits64 *zSig1Ptr
827      )
828     {
829         int8 shiftCount;
830     
831         if ( aSig0 == 0 ) {
832             shiftCount = countLeadingZeros64( aSig1 ) - 15;
833             if ( shiftCount < 0 ) {
834                 *zSig0Ptr = aSig1>>( - shiftCount );
835                 *zSig1Ptr = aSig1<<( shiftCount & 63 );
836             }
837             else {
838                 *zSig0Ptr = aSig1<<shiftCount;
839                 *zSig1Ptr = 0;
840             }
841             *zExpPtr = - shiftCount - 63;
842         }
843         else {
844             shiftCount = countLeadingZeros64( aSig0 ) - 15;
845             shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
846             *zExpPtr = 1 - shiftCount;
847         }
848     
849     }
850     
851     /*
852     -------------------------------------------------------------------------------
853     Packs the sign `zSign', the exponent `zExp', and the significand formed
854     by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
855     floating-point value, returning the result.  After being shifted into the
856     proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
857     added together to form the most significant 32 bits of the result.  This
858     means that any integer portion of `zSig0' will be added into the exponent.
859     Since a properly normalized significand will have an integer portion equal
860     to 1, the `zExp' input should be 1 less than the desired result exponent
861     whenever `zSig0' and `zSig1' concatenated form a complete, normalized
862     significand.
863     -------------------------------------------------------------------------------
864     */
865     INLINE float128
866      packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
867     {
868         float128 z;
869     
870         z.low = zSig1;
871         z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
872         return z;
873     
874     }
875     
876     /*
877     -------------------------------------------------------------------------------
878     Takes an abstract floating-point value having sign `zSign', exponent `zExp',
879     and extended significand formed by the concatenation of `zSig0', `zSig1',
880     and `zSig2', and returns the proper quadruple-precision floating-point value
881     corresponding to the abstract input.  Ordinarily, the abstract value is
882     simply rounded and packed into the quadruple-precision format, with the
883     inexact exception raised if the abstract input cannot be represented
884     exactly.  If the abstract value is too large, however, the overflow and
885     inexact exceptions are raised and an infinity or maximal finite value is
886     returned.  If the abstract value is too small, the input value is rounded to
887     a subnormal number, and the underflow and inexact exceptions are raised if
888     the abstract input cannot be represented exactly as a subnormal quadruple-
889     precision floating-point number.
890         The input significand must be normalized or smaller.  If the input
891     significand is not normalized, `zExp' must be 0; in that case, the result
892     returned is a subnormal number, and it must not require rounding.  In the
893     usual case that the input significand is normalized, `zExp' must be 1 less
894     than the ``true'' floating-point exponent.  The handling of underflow and
895     overflow follows the IEC/IEEE Standard for Binary Floating-point Arithmetic.
896     -------------------------------------------------------------------------------
897     */
898     static float128
899      roundAndPackFloat128(
900          flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
901     {
902         int8 roundingMode;
903         flag roundNearestEven, increment, isTiny;
904     
905         roundingMode = float_rounding_mode;
906         roundNearestEven = ( roundingMode == float_round_nearest_even );
907         increment = ( (sbits64) zSig2 < 0 );
908         if ( ! roundNearestEven ) {
909             if ( roundingMode == float_round_to_zero ) {
910                 increment = 0;
911             }
912             else {
913                 if ( zSign ) {
914                     increment = ( roundingMode == float_round_down ) && zSig2;
915                 }
916                 else {
917                     increment = ( roundingMode == float_round_up ) && zSig2;
918                 }
919             }
920         }
921         if ( 0x7FFD <= (bits32) zExp ) {
922             if (    ( 0x7FFD < zExp )
923                  || (    ( zExp == 0x7FFD )
924                       && eq128(
925                              LIT64( 0x0001FFFFFFFFFFFF ),
926                              LIT64( 0xFFFFFFFFFFFFFFFF ),
927                              zSig0,
928                              zSig1
929                          )
930                       && increment
931                     )
932                ) {
933                 float_raise( float_flag_overflow | float_flag_inexact );
934                 if (    ( roundingMode == float_round_to_zero )
935                      || ( zSign && ( roundingMode == float_round_up ) )
936                      || ( ! zSign && ( roundingMode == float_round_down ) )
937                    ) {
938                     return
939                         packFloat128(
940                             zSign,
941                             0x7FFE,
942                             LIT64( 0x0000FFFFFFFFFFFF ),
943                             LIT64( 0xFFFFFFFFFFFFFFFF )
944                         );
945                 }
946                 return packFloat128( zSign, 0x7FFF, 0, 0 );
947             }
948             if ( zExp < 0 ) {
949                 isTiny =
950                        ( float_detect_tininess == float_tininess_before_rounding )
951                     || ( zExp < -1 )
952                     || ! increment
953                     || lt128(
954                            zSig0,
955                            zSig1,
956                            LIT64( 0x0001FFFFFFFFFFFF ),
957                            LIT64( 0xFFFFFFFFFFFFFFFF )
958                        );
959                 shift128ExtraRightJamming(
960                     zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
961                 zExp = 0;
962                 if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
963                 if ( roundNearestEven ) {
964                     increment = ( (sbits64) zSig2 < 0 );
965                 }
966                 else {
967                     if ( zSign ) {
968                         increment = ( roundingMode == float_round_down ) && zSig2;
969                     }
970                     else {
971                         increment = ( roundingMode == float_round_up ) && zSig2;
972                     }
973                 }
974             }
975         }
976         if ( zSig2 ) float_exception_flags |= float_flag_inexact;
977         if ( increment ) {
978             add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
979             zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
980         }
981         else {
982             if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
983         }
984         return packFloat128( zSign, zExp, zSig0, zSig1 );
985     
986     }
987     
988     /*
989     -------------------------------------------------------------------------------
990     Takes an abstract floating-point value having sign `zSign', exponent `zExp',
991     and significand formed by the concatenation of `zSig0' and `zSig1', and
992     returns the proper quadruple-precision floating-point value corresponding to
993     the abstract input.  This routine is just like `roundAndPackFloat128' except
994     that the input significand has fewer bits and does not have to be normalized
995     in any way.  In all cases, `zExp' must be 1 less than the ``true'' floating-
996     point exponent.
997     -------------------------------------------------------------------------------
998     */
999     static float128
1000      normalizeRoundAndPackFloat128(
1001          flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1002     {
1003         int8 shiftCount;
1004         bits64 zSig2;
1005     
1006         if ( zSig0 == 0 ) {
1007             zSig0 = zSig1;
1008             zSig1 = 0;
1009             zExp -= 64;
1010         }
1011         shiftCount = countLeadingZeros64( zSig0 ) - 15;
1012         if ( 0 <= shiftCount ) {
1013             zSig2 = 0;
1014             shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1015         }
1016         else {
1017             shift128ExtraRightJamming(
1018                 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1019         }
1020         zExp -= shiftCount;
1021         return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1022     
1023     }
1024     
1025     #endif
1026     
1027     /*
1028     -------------------------------------------------------------------------------
1029     Returns the result of converting the 32-bit two's complement integer `a' to
1030     the single-precision floating-point format.  The conversion is performed
1031     according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1032     -------------------------------------------------------------------------------
1033     */
1034     float32 int32_to_float32( int32 a )
1035     {
1036         flag zSign;
1037     
1038         if ( a == 0 ) return 0;
1039         if ( a == 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1040         zSign = ( a < 0 );
1041         return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1042     
1043     }
1044     
1045     /*
1046     -------------------------------------------------------------------------------
1047     Returns the result of converting the 32-bit two's complement integer `a' to
1048     the double-precision floating-point format.  The conversion is performed
1049     according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1050     -------------------------------------------------------------------------------
1051     */
1052     float64 int32_to_float64( int32 a )
1053     {
1054         flag aSign;
1055         uint32 absA;
1056         int8 shiftCount;
1057         bits64 zSig;
1058     
1059         if ( a == 0 ) return 0;
1060         aSign = ( a < 0 );
1061         absA = aSign ? - a : a;
1062         shiftCount = countLeadingZeros32( absA ) + 21;
1063         zSig = absA;
1064         return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount );
1065     
1066     }
1067     
1068     #ifdef FLOATX80
1069     
1070     /*
1071     -------------------------------------------------------------------------------
1072     Returns the result of converting the 32-bit two's complement integer `a'
1073     to the extended double-precision floating-point format.  The conversion
1074     is performed according to the IEC/IEEE Standard for Binary Floating-point
1075     Arithmetic.
1076     -------------------------------------------------------------------------------
1077     */
1078     floatx80 int32_to_floatx80( int32 a )
1079     {
1080         flag zSign;
1081         uint32 absA;
1082         int8 shiftCount;
1083         bits64 zSig;
1084     
1085         if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1086         zSign = ( a < 0 );
1087         absA = zSign ? - a : a;
1088         shiftCount = countLeadingZeros32( absA ) + 32;
1089         zSig = absA;
1090         return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1091     
1092     }
1093     
1094     #endif
1095     
1096     #ifdef FLOAT128
1097     
1098     /*
1099     -------------------------------------------------------------------------------
1100     Returns the result of converting the 32-bit two's complement integer `a' to
1101     the quadruple-precision floating-point format.  The conversion is performed
1102     according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1103     -------------------------------------------------------------------------------
1104     */
1105     float128 int32_to_float128( int32 a )
1106     {
1107         flag zSign;
1108         uint32 absA;
1109         int8 shiftCount;
1110         bits64 zSig0;
1111     
1112         if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1113         zSign = ( a < 0 );
1114         absA = zSign ? - a : a;
1115         shiftCount = countLeadingZeros32( absA ) + 17;
1116         zSig0 = absA;
1117         return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1118     
1119     }
1120     
1121     #endif
1122     
1123     /*
1124     -------------------------------------------------------------------------------
1125     Returns the result of converting the single-precision floating-point value
1126     `a' to the 32-bit two's complement integer format.  The conversion is
1127     performed according to the IEC/IEEE Standard for Binary Floating-point
1128     Arithmetic---which means in particular that the conversion is rounded
1129     according to the current rounding mode.  If `a' is a NaN, the largest
1130     positive integer is returned.  Otherwise, if the conversion overflows, the
1131     largest integer with the same sign as `a' is returned.
1132     -------------------------------------------------------------------------------
1133     */
1134     int32 float32_to_int32( float32 a )
1135     {
1136         flag aSign;
1137         int16 aExp, shiftCount;
1138         bits32 aSig;
1139         bits64 zSig;
1140     
1141         aSig = extractFloat32Frac( a );
1142         aExp = extractFloat32Exp( a );
1143         aSign = extractFloat32Sign( a );
1144         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1145         if ( aExp ) aSig |= 0x00800000;
1146         shiftCount = 0xAF - aExp;
1147         zSig = aSig;
1148         zSig <<= 32;
1149         if ( 0 < shiftCount ) shift64RightJamming( zSig, shiftCount, &zSig );
1150         return roundAndPackInt32( aSign, zSig );
1151     
1152     }
1153     
1154     /*
1155     -------------------------------------------------------------------------------
1156     Returns the result of converting the single-precision floating-point value
1157     `a' to the 32-bit two's complement integer format.  The conversion is
1158     performed according to the IEC/IEEE Standard for Binary Floating-point
1159     Arithmetic, except that the conversion is always rounded toward zero.  If
1160     `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1161     conversion overflows, the largest integer with the same sign as `a' is
1162     returned.
1163     -------------------------------------------------------------------------------
1164     */
1165     int32 float32_to_int32_round_to_zero( float32 a )
1166     {
1167         flag aSign;
1168         int16 aExp, shiftCount;
1169         bits32 aSig;
1170         int32 z;
1171     
1172         aSig = extractFloat32Frac( a );
1173         aExp = extractFloat32Exp( a );
1174         aSign = extractFloat32Sign( a );
1175         shiftCount = aExp - 0x9E;
1176         if ( 0 <= shiftCount ) {
1177             if ( a == 0xCF000000 ) return 0x80000000;
1178             float_raise( float_flag_invalid );
1179             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1180             return 0x80000000;
1181         }
1182         else if ( aExp <= 0x7E ) {
1183             if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1184             return 0;
1185         }
1186         aSig = ( aSig | 0x00800000 )<<8;
1187         z = aSig>>( - shiftCount );
1188         if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1189             float_exception_flags |= float_flag_inexact;
1190         }
1191         return aSign ? - z : z;
1192     
1193     }
1194     
1195     /*
1196     -------------------------------------------------------------------------------
1197     Returns the result of converting the single-precision floating-point value
1198     `a' to the double-precision floating-point format.  The conversion is
1199     performed according to the IEC/IEEE Standard for Binary Floating-point
1200     Arithmetic.
1201     -------------------------------------------------------------------------------
1202     */
1203     float64 float32_to_float64( float32 a )
1204     {
1205         flag aSign;
1206         int16 aExp;
1207         bits32 aSig;
1208     
1209         aSig = extractFloat32Frac( a );
1210         aExp = extractFloat32Exp( a );
1211         aSign = extractFloat32Sign( a );
1212         if ( aExp == 0xFF ) {
1213             if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1214             return packFloat64( aSign, 0x7FF, 0 );
1215         }
1216         if ( aExp == 0 ) {
1217             if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1218             normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1219             --aExp;
1220         }
1221         return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1222     
1223     }
1224     
1225     #ifdef FLOATX80
1226     
1227     /*
1228     -------------------------------------------------------------------------------
1229     Returns the result of converting the single-precision floating-point value
1230     `a' to the extended double-precision floating-point format.  The conversion
1231     is performed according to the IEC/IEEE Standard for Binary Floating-point
1232     Arithmetic.
1233     -------------------------------------------------------------------------------
1234     */
1235     floatx80 float32_to_floatx80( float32 a )
1236     {
1237         flag aSign;
1238         int16 aExp;
1239         bits32 aSig;
1240     
1241         aSig = extractFloat32Frac( a );
1242         aExp = extractFloat32Exp( a );
1243         aSign = extractFloat32Sign( a );
1244         if ( aExp == 0xFF ) {
1245             if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1246             return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1247         }
1248         if ( aExp == 0 ) {
1249             if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1250             normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1251         }
1252         aSig |= 0x00800000;
1253         return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1254     
1255     }
1256     
1257     #endif
1258     
1259     #ifdef FLOAT128
1260     
1261     /*
1262     -------------------------------------------------------------------------------
1263     Returns the result of converting the single-precision floating-point value
1264     `a' to the double-precision floating-point format.  The conversion is
1265     performed according to the IEC/IEEE Standard for Binary Floating-point
1266     Arithmetic.
1267     -------------------------------------------------------------------------------
1268     */
1269     float128 float32_to_float128( float32 a )
1270     {
1271         flag aSign;
1272         int16 aExp;
1273         bits32 aSig;
1274     
1275         aSig = extractFloat32Frac( a );
1276         aExp = extractFloat32Exp( a );
1277         aSign = extractFloat32Sign( a );
1278         if ( aExp == 0xFF ) {
1279             if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1280             return packFloat128( aSign, 0x7FFF, 0, 0 );
1281         }
1282         if ( aExp == 0 ) {
1283             if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1284             normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1285             --aExp;
1286         }
1287         return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1288     
1289     }
1290     
1291     #endif
1292     
1293     /*
1294     -------------------------------------------------------------------------------
1295     Rounds the single-precision floating-point value `a' to an integer, and
1296     returns the result as a single-precision floating-point value.  The
1297     operation is performed according to the IEC/IEEE Standard for Binary
1298     Floating-point Arithmetic.
1299     -------------------------------------------------------------------------------
1300     */
1301     float32 float32_round_to_int( float32 a )
1302     {
1303         flag aSign;
1304         int16 aExp;
1305         bits32 lastBitMask, roundBitsMask;
1306         int8 roundingMode;
1307         float32 z;
1308     
1309         aExp = extractFloat32Exp( a );
1310         if ( 0x96 <= aExp ) {
1311             if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1312                 return propagateFloat32NaN( a, a );
1313             }
1314             return a;
1315         }
1316         if ( aExp <= 0x7E ) {
1317             if ( (bits32) ( a<<1 ) == 0 ) return a;
1318             float_exception_flags |= float_flag_inexact;
1319             aSign = extractFloat32Sign( a );
1320             switch ( float_rounding_mode ) {
1321              case float_round_nearest_even:
1322                 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1323                     return packFloat32( aSign, 0x7F, 0 );
1324                 }
1325                 break;
1326              case float_round_down:
1327                 return aSign ? 0xBF800000 : 0;
1328              case float_round_up:
1329                 return aSign ? 0x80000000 : 0x3F800000;
1330             }
1331             return packFloat32( aSign, 0, 0 );
1332         }
1333         lastBitMask = 1;
1334         lastBitMask <<= 0x96 - aExp;
1335         roundBitsMask = lastBitMask - 1;
1336         z = a;
1337         roundingMode = float_rounding_mode;
1338         if ( roundingMode == float_round_nearest_even ) {
1339             z += lastBitMask>>1;
1340             if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1341         }
1342         else if ( roundingMode != float_round_to_zero ) {
1343             if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1344                 z += roundBitsMask;
1345             }
1346         }
1347         z &= ~ roundBitsMask;
1348         if ( z != a ) float_exception_flags |= float_flag_inexact;
1349         return z;
1350     
1351     }
1352     
1353     /*
1354     -------------------------------------------------------------------------------
1355     Returns the result of adding the absolute values of the single-precision
1356     floating-point values `a' and `b'.  If `zSign' is true, the sum is negated
1357     before being returned.  `zSign' is ignored if the result is a NaN.  The
1358     addition is performed according to the IEC/IEEE Standard for Binary
1359     Floating-point Arithmetic.
1360     -------------------------------------------------------------------------------
1361     */
1362     static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1363     {
1364         int16 aExp, bExp, zExp;
1365         bits32 aSig, bSig, zSig;
1366         int16 expDiff;
1367     
1368         aSig = extractFloat32Frac( a );
1369         aExp = extractFloat32Exp( a );
1370         bSig = extractFloat32Frac( b );
1371         bExp = extractFloat32Exp( b );
1372         expDiff = aExp - bExp;
1373         aSig <<= 6;
1374         bSig <<= 6;
1375         if ( 0 < expDiff ) {
1376             if ( aExp == 0xFF ) {
1377                 if ( aSig ) return propagateFloat32NaN( a, b );
1378                 return a;
1379             }
1380             if ( bExp == 0 ) {
1381                 --expDiff;
1382             }
1383             else {
1384                 bSig |= 0x20000000;
1385             }
1386             shift32RightJamming( bSig, expDiff, &bSig );
1387             zExp = aExp;
1388         }
1389         else if ( expDiff < 0 ) {
1390             if ( bExp == 0xFF ) {
1391                 if ( bSig ) return propagateFloat32NaN( a, b );
1392                 return packFloat32( zSign, 0xFF, 0 );
1393             }
1394             if ( aExp == 0 ) {
1395                 ++expDiff;
1396             }
1397             else {
1398                 aSig |= 0x20000000;
1399             }
1400             shift32RightJamming( aSig, - expDiff, &aSig );
1401             zExp = bExp;
1402         }
1403         else {
1404             if ( aExp == 0xFF ) {
1405                 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1406                 return a;
1407             }
1408             if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1409             zSig = 0x40000000 + aSig + bSig;
1410             zExp = aExp;
1411             goto roundAndPack;
1412         }
1413         aSig |= 0x20000000;
1414         zSig = ( aSig + bSig )<<1;
1415         --zExp;
1416         if ( (sbits32) zSig < 0 ) {
1417             zSig = aSig + bSig;
1418             ++zExp;
1419         }
1420      roundAndPack:
1421         return roundAndPackFloat32( zSign, zExp, zSig );
1422     
1423     }
1424     
1425     /*
1426     -------------------------------------------------------------------------------
1427     Returns the result of subtracting the absolute values of the single-
1428     precision floating-point values `a' and `b'.  If `zSign' is true, the
1429     difference is negated before being returned.  `zSign' is ignored if the
1430     result is a NaN.  The subtraction is performed according to the IEC/IEEE
1431     Standard for Binary Floating-point Arithmetic.
1432     -------------------------------------------------------------------------------
1433     */
1434     static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1435     {
1436         int16 aExp, bExp, zExp;
1437         bits32 aSig, bSig, zSig;
1438         int16 expDiff;
1439     
1440         aSig = extractFloat32Frac( a );
1441         aExp = extractFloat32Exp( a );
1442         bSig = extractFloat32Frac( b );
1443         bExp = extractFloat32Exp( b );
1444         expDiff = aExp - bExp;
1445         aSig <<= 7;
1446         bSig <<= 7;
1447         if ( 0 < expDiff ) goto aExpBigger;
1448         if ( expDiff < 0 ) goto bExpBigger;
1449         if ( aExp == 0xFF ) {
1450             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1451             float_raise( float_flag_invalid );
1452             return float32_default_nan;
1453         }
1454         if ( aExp == 0 ) {
1455             aExp = 1;
1456             bExp = 1;
1457         }
1458         if ( bSig < aSig ) goto aBigger;
1459         if ( aSig < bSig ) goto bBigger;
1460         return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1461      bExpBigger:
1462         if ( bExp == 0xFF ) {
1463             if ( bSig ) return propagateFloat32NaN( a, b );
1464             return packFloat32( zSign ^ 1, 0xFF, 0 );
1465         }
1466         if ( aExp == 0 ) {
1467             ++expDiff;
1468         }
1469         else {
1470             aSig |= 0x40000000;
1471         }
1472         shift32RightJamming( aSig, - expDiff, &aSig );
1473         bSig |= 0x40000000;
1474      bBigger:
1475         zSig = bSig - aSig;
1476         zExp = bExp;
1477         zSign ^= 1;
1478         goto normalizeRoundAndPack;
1479      aExpBigger:
1480         if ( aExp == 0xFF ) {
1481             if ( aSig ) return propagateFloat32NaN( a, b );
1482             return a;
1483         }
1484         if ( bExp == 0 ) {
1485             --expDiff;
1486         }
1487         else {
1488             bSig |= 0x40000000;
1489         }
1490         shift32RightJamming( bSig, expDiff, &bSig );
1491         aSig |= 0x40000000;
1492      aBigger:
1493         zSig = aSig - bSig;
1494         zExp = aExp;
1495      normalizeRoundAndPack:
1496         --zExp;
1497         return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1498     
1499     }
1500     
1501     /*
1502     -------------------------------------------------------------------------------
1503     Returns the result of adding the single-precision floating-point values `a'
1504     and `b'.  The operation is performed according to the IEC/IEEE Standard for
1505     Binary Floating-point Arithmetic.
1506     -------------------------------------------------------------------------------
1507     */
1508     float32 float32_add( float32 a, float32 b )
1509     {
1510         flag aSign, bSign;
1511     
1512         aSign = extractFloat32Sign( a );
1513         bSign = extractFloat32Sign( b );
1514         if ( aSign == bSign ) {
1515             return addFloat32Sigs( a, b, aSign );
1516         }
1517         else {
1518             return subFloat32Sigs( a, b, aSign );
1519         }
1520     
1521     }
1522     
1523     /*
1524     -------------------------------------------------------------------------------
1525     Returns the result of subtracting the single-precision floating-point values
1526     `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1527     for Binary Floating-point Arithmetic.
1528     -------------------------------------------------------------------------------
1529     */
1530     float32 float32_sub( float32 a, float32 b )
1531     {
1532         flag aSign, bSign;
1533     
1534         aSign = extractFloat32Sign( a );
1535         bSign = extractFloat32Sign( b );
1536         if ( aSign == bSign ) {
1537             return subFloat32Sigs( a, b, aSign );
1538         }
1539         else {
1540             return addFloat32Sigs( a, b, aSign );
1541         }
1542     
1543     }
1544     
1545     /*
1546     -------------------------------------------------------------------------------
1547     Returns the result of multiplying the single-precision floating-point values
1548     `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1549     for Binary Floating-point Arithmetic.
1550     -------------------------------------------------------------------------------
1551     */
1552     float32 float32_mul( float32 a, float32 b )
1553     {
1554         flag aSign, bSign, zSign;
1555         int16 aExp, bExp, zExp;
1556         bits32 aSig, bSig;
1557         bits64 zSig64;
1558         bits32 zSig;
1559     
1560         aSig = extractFloat32Frac( a );
1561         aExp = extractFloat32Exp( a );
1562         aSign = extractFloat32Sign( a );
1563         bSig = extractFloat32Frac( b );
1564         bExp = extractFloat32Exp( b );
1565         bSign = extractFloat32Sign( b );
1566         zSign = aSign ^ bSign;
1567         if ( aExp == 0xFF ) {
1568             if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1569                 return propagateFloat32NaN( a, b );
1570             }
1571             if ( ( bExp | bSig ) == 0 ) {
1572                 float_raise( float_flag_invalid );
1573                 return float32_default_nan;
1574             }
1575             return packFloat32( zSign, 0xFF, 0 );
1576         }
1577         if ( bExp == 0xFF ) {
1578             if ( bSig ) return propagateFloat32NaN( a, b );
1579             if ( ( aExp | aSig ) == 0 ) {
1580                 float_raise( float_flag_invalid );
1581                 return float32_default_nan;
1582             }
1583             return packFloat32( zSign, 0xFF, 0 );
1584         }
1585         if ( aExp == 0 ) {
1586             if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1587             normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1588         }
1589         if ( bExp == 0 ) {
1590             if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1591             normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1592         }
1593         zExp = aExp + bExp - 0x7F;
1594         aSig = ( aSig | 0x00800000 )<<7;
1595         bSig = ( bSig | 0x00800000 )<<8;
1596         shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1597         zSig = zSig64;
1598         if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1599             zSig <<= 1;
1600             --zExp;
1601         }
1602         return roundAndPackFloat32( zSign, zExp, zSig );
1603     
1604     }
1605     
1606     /*
1607     -------------------------------------------------------------------------------
1608     Returns the result of dividing the single-precision floating-point value `a'
1609     by the corresponding value `b'.  The operation is performed according to the
1610     IEC/IEEE Standard for Binary Floating-point Arithmetic.
1611     -------------------------------------------------------------------------------
1612     */
1613     float32 float32_div( float32 a, float32 b )
1614     {
1615         flag aSign, bSign, zSign;
1616         int16 aExp, bExp, zExp;
1617         bits32 aSig, bSig, zSig;
1618     
1619         aSig = extractFloat32Frac( a );
1620         aExp = extractFloat32Exp( a );
1621         aSign = extractFloat32Sign( a );
1622         bSig = extractFloat32Frac( b );
1623         bExp = extractFloat32Exp( b );
1624         bSign = extractFloat32Sign( b );
1625         zSign = aSign ^ bSign;
1626         if ( aExp == 0xFF ) {
1627             if ( aSig ) return propagateFloat32NaN( a, b );
1628             if ( bExp == 0xFF ) {
1629                 if ( bSig ) return propagateFloat32NaN( a, b );
1630                 float_raise( float_flag_invalid );
1631                 return float32_default_nan;
1632             }
1633             return packFloat32( zSign, 0xFF, 0 );
1634         }
1635         if ( bExp == 0xFF ) {
1636             if ( bSig ) return propagateFloat32NaN( a, b );
1637             return packFloat32( zSign, 0, 0 );
1638         }
1639         if ( bExp == 0 ) {
1640             if ( bSig == 0 ) {
1641                 if ( ( aExp | aSig ) == 0 ) {
1642                     float_raise( float_flag_invalid );
1643                     return float32_default_nan;
1644                 }
1645                 float_raise( float_flag_divbyzero );
1646                 return packFloat32( zSign, 0xFF, 0 );
1647             }
1648             normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1649         }
1650         if ( aExp == 0 ) {
1651             if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1652             normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1653         }
1654         zExp = aExp - bExp + 0x7D;
1655         aSig = ( aSig | 0x00800000 )<<7;
1656         bSig = ( bSig | 0x00800000 )<<8;
1657         if ( bSig <= ( aSig + aSig ) ) {
1658             aSig >>= 1;
1659             ++zExp;
1660         }
1661         zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1662         if ( ( zSig & 0x3F ) == 0 ) {
1663             zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 );
1664         }
1665         return roundAndPackFloat32( zSign, zExp, zSig );
1666     
1667     }
1668     
1669     /*
1670     -------------------------------------------------------------------------------
1671     Returns the remainder of the single-precision floating-point value `a'
1672     with respect to the corresponding value `b'.  The operation is performed
1673     according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1674     -------------------------------------------------------------------------------
1675     */
1676     float32 float32_rem( float32 a, float32 b )
1677     {
1678         flag aSign, bSign, zSign;
1679         int16 aExp, bExp, expDiff;
1680         bits32 aSig, bSig;
1681         bits32 q;
1682         bits64 aSig64, bSig64, q64;
1683         bits32 alternateASig;
1684         sbits32 sigMean;
1685     
1686         aSig = extractFloat32Frac( a );
1687         aExp = extractFloat32Exp( a );
1688         aSign = extractFloat32Sign( a );
1689         bSig = extractFloat32Frac( b );
1690         bExp = extractFloat32Exp( b );
1691         bSign = extractFloat32Sign( b );
1692         if ( aExp == 0xFF ) {
1693             if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1694                 return propagateFloat32NaN( a, b );
1695             }
1696             float_raise( float_flag_invalid );
1697             return float32_default_nan;
1698         }
1699         if ( bExp == 0xFF ) {
1700             if ( bSig ) return propagateFloat32NaN( a, b );
1701             return a;
1702         }
1703         if ( bExp == 0 ) {
1704             if ( bSig == 0 ) {
1705                 float_raise( float_flag_invalid );
1706                 return float32_default_nan;
1707             }
1708             normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1709         }
1710         if ( aExp == 0 ) {
1711             if ( aSig == 0 ) return a;
1712             normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1713         }
1714         expDiff = aExp - bExp;
1715         aSig |= 0x00800000;
1716         bSig |= 0x00800000;
1717         if ( expDiff < 32 ) {
1718             aSig <<= 8;
1719             bSig <<= 8;
1720             if ( expDiff < 0 ) {
1721                 if ( expDiff < -1 ) return a;
1722                 aSig >>= 1;
1723             }
1724             q = ( bSig <= aSig );
1725             if ( q ) aSig -= bSig;
1726             if ( 0 < expDiff ) {
1727                 q = ( ( (bits64) aSig )<<32 ) / bSig;
1728                 q >>= 32 - expDiff;
1729                 bSig >>= 2;
1730                 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1731             }
1732             else {
1733                 aSig >>= 2;
1734                 bSig >>= 2;
1735             }
1736         }
1737         else {
1738             if ( bSig <= aSig ) aSig -= bSig;
1739             aSig64 = ( (bits64) aSig )<<40;
1740             bSig64 = ( (bits64) bSig )<<40;
1741             expDiff -= 64;
1742             while ( 0 < expDiff ) {
1743                 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1744                 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1745                 aSig64 = - ( ( bSig * q64 )<<38 );
1746                 expDiff -= 62;
1747             }
1748             expDiff += 64;
1749             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1750             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1751             q = q64>>( 64 - expDiff );
1752             bSig <<= 6;
1753             aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1754         }
1755         do {
1756             alternateASig = aSig;
1757             ++q;
1758             aSig -= bSig;
1759         } while ( 0 <= (sbits32) aSig );
1760         sigMean = aSig + alternateASig;
1761         if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1762             aSig = alternateASig;
1763         }
1764         zSign = ( (sbits32) aSig < 0 );
1765         if ( zSign ) aSig = - aSig;
1766         return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1767     
1768     }
1769     
1770     /*
1771     -------------------------------------------------------------------------------
1772     Returns the square root of the single-precision floating-point value `a'.
1773     The operation is performed according to the IEC/IEEE Standard for Binary
1774     Floating-point Arithmetic.
1775     -------------------------------------------------------------------------------
1776     */
1777     float32 float32_sqrt( float32 a )
1778     {
1779         flag aSign;
1780         int16 aExp, zExp;
1781         bits32 aSig, zSig;
1782         bits64 rem, term;
1783     
1784         aSig = extractFloat32Frac( a );
1785         aExp = extractFloat32Exp( a );
1786         aSign = extractFloat32Sign( a );
1787         if ( aExp == 0xFF ) {
1788             if ( aSig ) return propagateFloat32NaN( a, 0 );
1789             if ( ! aSign ) return a;
1790             float_raise( float_flag_invalid );
1791             return float32_default_nan;
1792         }
1793         if ( aSign ) {
1794             if ( ( aExp | aSig ) == 0 ) return a;
1795             float_raise( float_flag_invalid );
1796             return float32_default_nan;
1797         }
1798         if ( aExp == 0 ) {
1799             if ( aSig == 0 ) return 0;
1800             normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1801         }
1802         zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1803         aSig = ( aSig | 0x00800000 )<<8;
1804         zSig = estimateSqrt32( aExp, aSig ) + 2;
1805         if ( ( zSig & 0x7F ) <= 5 ) {
1806             if ( zSig < 2 ) {
1807                 zSig = 0xFFFFFFFF;
1808             }
1809             else {
1810                 aSig >>= aExp & 1;
1811                 term = ( (bits64) zSig ) * zSig;
1812                 rem = ( ( (bits64) aSig )<<32 ) - term;
1813                 while ( (sbits64) rem < 0 ) {
1814                     --zSig;
1815                     rem += ( ( (bits64) zSig )<<1 ) | 1;
1816                 }
1817                 zSig |= ( rem != 0 );
1818             }
1819         }
1820         shift32RightJamming( zSig, 1, &zSig );
1821         return roundAndPackFloat32( 0, zExp, zSig );
1822     
1823     }
1824     
1825     /*
1826     -------------------------------------------------------------------------------
1827     Returns 1 if the single-precision floating-point value `a' is equal to the
1828     corresponding value `b', and 0 otherwise.  The comparison is performed
1829     according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1830     -------------------------------------------------------------------------------
1831     */
1832     flag float32_eq( float32 a, float32 b )
1833     {
1834     
1835         if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1836              || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1837            ) {
1838             if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1839                 float_raise( float_flag_invalid );
1840             }
1841             return 0;
1842         }
1843         return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1844     
1845     }
1846     
1847     /*
1848     -------------------------------------------------------------------------------
1849     Returns 1 if the single-precision floating-point value `a' is less than or
1850     equal to the corresponding value `b', and 0 otherwise.  The comparison is
1851     performed according to the IEC/IEEE Standard for Binary Floating-point
1852     Arithmetic.
1853     -------------------------------------------------------------------------------
1854     */
1855     flag float32_le( float32 a, float32 b )
1856     {
1857         flag aSign, bSign;
1858     
1859         if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1860              || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1861            ) {
1862             float_raise( float_flag_invalid );
1863             return 0;
1864         }
1865         aSign = extractFloat32Sign( a );
1866         bSign = extractFloat32Sign( b );
1867         if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1868         return ( a == b ) || ( aSign ^ ( a < b ) );
1869     
1870     }
1871     
1872     /*
1873     -------------------------------------------------------------------------------
1874     Returns 1 if the single-precision floating-point value `a' is less than
1875     the corresponding value `b', and 0 otherwise.  The comparison is performed
1876     according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1877     -------------------------------------------------------------------------------
1878     */
1879     flag float32_lt( float32 a, float32 b )
1880     {
1881         flag aSign, bSign;
1882     
1883         if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1884              || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1885            ) {
1886             float_raise( float_flag_invalid );
1887             return 0;
1888         }
1889         aSign = extractFloat32Sign( a );
1890         bSign = extractFloat32Sign( b );
1891         if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1892         return ( a != b ) && ( aSign ^ ( a < b ) );
1893     
1894     }
1895     
1896     /*
1897     -------------------------------------------------------------------------------
1898     Returns 1 if the single-precision floating-point value `a' is equal to the
1899     corresponding value `b', and 0 otherwise.  The invalid exception is raised
1900     if either operand is a NaN.  Otherwise, the comparison is performed
1901     according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1902     -------------------------------------------------------------------------------
1903     */
1904     flag float32_eq_signaling( float32 a, float32 b )
1905     {
1906     
1907         if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1908              || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1909            ) {
1910             float_raise( float_flag_invalid );
1911             return 0;
1912         }
1913         return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1914     
1915     }
1916     
1917     /*
1918     -------------------------------------------------------------------------------
1919     Returns 1 if the single-precision floating-point value `a' is less than or
1920     equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
1921     cause an exception.  Otherwise, the comparison is performed according to the
1922     IEC/IEEE Standard for Binary Floating-point Arithmetic.
1923     -------------------------------------------------------------------------------
1924     */
1925     flag float32_le_quiet( float32 a, float32 b )
1926     {
1927         flag aSign, bSign;
1928         //int16 aExp, bExp;
1929     
1930         if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1931              || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1932            ) {
1933             if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1934                 float_raise( float_flag_invalid );
1935             }
1936             return 0;
1937         }
1938         aSign = extractFloat32Sign( a );
1939         bSign = extractFloat32Sign( b );
1940         if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1941         return ( a == b ) || ( aSign ^ ( a < b ) );
1942     
1943     }
1944     
1945     /*
1946     -------------------------------------------------------------------------------
1947     Returns 1 if the single-precision floating-point value `a' is less than
1948     the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
1949     exception.  Otherwise, the comparison is performed according to the IEC/IEEE
1950     Standard for Binary Floating-point Arithmetic.
1951     -------------------------------------------------------------------------------
1952     */
1953     flag float32_lt_quiet( float32 a, float32 b )
1954     {
1955         flag aSign, bSign;
1956     
1957         if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1958              || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1959            ) {
1960             if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1961                 float_raise( float_flag_invalid );
1962             }
1963             return 0;
1964         }
1965         aSign = extractFloat32Sign( a );
1966         bSign = extractFloat32Sign( b );
1967         if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1968         return ( a != b ) && ( aSign ^ ( a < b ) );
1969     
1970     }
1971     
1972     /*
1973     -------------------------------------------------------------------------------
1974     Returns the result of converting the double-precision floating-point value
1975     `a' to the 32-bit two's complement integer format.  The conversion is
1976     performed according to the IEC/IEEE Standard for Binary Floating-point
1977     Arithmetic---which means in particular that the conversion is rounded
1978     according to the current rounding mode.  If `a' is a NaN, the largest
1979     positive integer is returned.  Otherwise, if the conversion overflows, the
1980     largest integer with the same sign as `a' is returned.
1981     -------------------------------------------------------------------------------
1982     */
1983     int32 float64_to_int32( float64 a )
1984     {
1985         flag aSign;
1986         int16 aExp, shiftCount;
1987         bits64 aSig;
1988     
1989         aSig = extractFloat64Frac( a );
1990         aExp = extractFloat64Exp( a );
1991         aSign = extractFloat64Sign( a );
1992         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1993         if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1994         shiftCount = 0x42C - aExp;
1995         if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1996         return roundAndPackInt32( aSign, aSig );
1997     
1998     }
1999     
2000     /*
2001     -------------------------------------------------------------------------------
2002     Returns the result of converting the double-precision floating-point value
2003     `a' to the 32-bit two's complement integer format.  The conversion is
2004     performed according to the IEC/IEEE Standard for Binary Floating-point
2005     Arithmetic, except that the conversion is always rounded toward zero.  If
2006     `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
2007     conversion overflows, the largest integer with the same sign as `a' is
2008     returned.
2009     -------------------------------------------------------------------------------
2010     */
2011     int32 float64_to_int32_round_to_zero( float64 a )
2012     {
2013         flag aSign;
2014         int16 aExp, shiftCount;
2015         bits64 aSig, savedASig;
2016         int32 z;
2017     
2018         aSig = extractFloat64Frac( a );
2019         aExp = extractFloat64Exp( a );
2020         aSign = extractFloat64Sign( a );
2021         shiftCount = 0x433 - aExp;
2022         if ( shiftCount < 21 ) {
2023             if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2024             goto invalid;
2025         }
2026         else if ( 52 < shiftCount ) {
2027             if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2028             return 0;
2029         }
2030         aSig |= LIT64( 0x0010000000000000 );
2031         savedASig = aSig;
2032         aSig >>= shiftCount;
2033         z = aSig;
2034         if ( aSign ) z = - z;
2035         if ( ( z < 0 ) ^ aSign ) {
2036      invalid:
2037             float_exception_flags |= float_flag_invalid;
2038             return aSign ? 0x80000000 : 0x7FFFFFFF;
2039         }
2040         if ( ( aSig<<shiftCount ) != savedASig ) {
2041             float_exception_flags |= float_flag_inexact;
2042         }
2043         return z;
2044     
2045     }
2046     
2047     /*
2048     -------------------------------------------------------------------------------
2049     Returns the result of converting the double-precision floating-point value
2050     `a' to the 32-bit two's complement unsigned integer format.  The conversion
2051     is performed according to the IEC/IEEE Standard for Binary Floating-point
2052     Arithmetic---which means in particular that the conversion is rounded
2053     according to the current rounding mode.  If `a' is a NaN, the largest
2054     positive integer is returned.  Otherwise, if the conversion overflows, the
2055     largest positive integer is returned.
2056     -------------------------------------------------------------------------------
2057     */
2058     int32 float64_to_uint32( float64 a )
2059     {
2060         flag aSign;
2061         int16 aExp, shiftCount;
2062         bits64 aSig;
2063     
2064         aSig = extractFloat64Frac( a );
2065         aExp = extractFloat64Exp( a );
2066         aSign = 0; //extractFloat64Sign( a );
2067         //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2068         if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2069         shiftCount = 0x42C - aExp;
2070         if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2071         return roundAndPackInt32( aSign, aSig );
2072     }
2073     
2074     /*
2075     -------------------------------------------------------------------------------
2076     Returns the result of converting the double-precision floating-point value
2077     `a' to the 32-bit two's complement integer format.  The conversion is
2078     performed according to the IEC/IEEE Standard for Binary Floating-point
2079     Arithmetic, except that the conversion is always rounded toward zero.  If
2080     `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
2081     conversion overflows, the largest positive integer is returned.
2082     -------------------------------------------------------------------------------
2083     */
2084     int32 float64_to_uint32_round_to_zero( float64 a )
2085     {
2086         flag aSign;
2087         int16 aExp, shiftCount;
2088         bits64 aSig, savedASig;
2089         int32 z;
2090     
2091         aSig = extractFloat64Frac( a );
2092         aExp = extractFloat64Exp( a );
2093         aSign = extractFloat64Sign( a );
2094         shiftCount = 0x433 - aExp;
2095         if ( shiftCount < 21 ) {
2096             if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2097             goto invalid;
2098         }
2099         else if ( 52 < shiftCount ) {
2100             if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2101             return 0;
2102         }
2103         aSig |= LIT64( 0x0010000000000000 );
2104         savedASig = aSig;
2105         aSig >>= shiftCount;
2106         z = aSig;
2107         if ( aSign ) z = - z;
2108         if ( ( z < 0 ) ^ aSign ) {
2109      invalid:
2110             float_exception_flags |= float_flag_invalid;
2111             return aSign ? 0x80000000 : 0x7FFFFFFF;
2112         }
2113         if ( ( aSig<<shiftCount ) != savedASig ) {
2114             float_exception_flags |= float_flag_inexact;
2115         }
2116         return z;
2117     }
2118     
2119     /*
2120     -------------------------------------------------------------------------------
2121     Returns the result of converting the double-precision floating-point value
2122     `a' to the single-precision floating-point format.  The conversion is
2123     performed according to the IEC/IEEE Standard for Binary Floating-point
2124     Arithmetic.
2125     -------------------------------------------------------------------------------
2126     */
2127     float32 float64_to_float32( float64 a )
2128     {
2129         flag aSign;
2130         int16 aExp;
2131         bits64 aSig;
2132         bits32 zSig;
2133     
2134         aSig = extractFloat64Frac( a );
2135         aExp = extractFloat64Exp( a );
2136         aSign = extractFloat64Sign( a );
2137         if ( aExp == 0x7FF ) {
2138             if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2139             return packFloat32( aSign, 0xFF, 0 );
2140         }
2141         shift64RightJamming( aSig, 22, &aSig );
2142         zSig = aSig;
2143         if ( aExp || zSig ) {
2144             zSig |= 0x40000000;
2145             aExp -= 0x381;
2146         }
2147         return roundAndPackFloat32( aSign, aExp, zSig );
2148     
2149     }
2150     
2151     #ifdef FLOATX80
2152     
2153     /*
2154     -------------------------------------------------------------------------------
2155     Returns the result of converting the double-precision floating-point value
2156     `a' to the extended double-precision floating-point format.  The conversion
2157     is performed according to the IEC/IEEE Standard for Binary Floating-point
2158     Arithmetic.
2159     -------------------------------------------------------------------------------
2160     */
2161     floatx80 float64_to_floatx80( float64 a )
2162     {
2163         flag aSign;
2164         int16 aExp;
2165         bits64 aSig;
2166     
2167         aSig = extractFloat64Frac( a );
2168         aExp = extractFloat64Exp( a );
2169         aSign = extractFloat64Sign( a );
2170         if ( aExp == 0x7FF ) {
2171             if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2172             return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2173         }
2174         if ( aExp == 0 ) {
2175             if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2176             normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2177         }
2178         return
2179             packFloatx80(
2180                 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2181     
2182     }
2183     
2184     #endif
2185     
2186     #ifdef FLOAT128
2187     
2188     /*
2189     -------------------------------------------------------------------------------
2190     Returns the result of converting the double-precision floating-point value
2191     `a' to the quadruple-precision floating-point format.  The conversion is
2192     performed according to the IEC/IEEE Standard for Binary Floating-point
2193     Arithmetic.
2194     -------------------------------------------------------------------------------
2195     */
2196     float128 float64_to_float128( float64 a )
2197     {
2198         flag aSign;
2199         int16 aExp;
2200         bits64 aSig, zSig0, zSig1;
2201     
2202         aSig = extractFloat64Frac( a );
2203         aExp = extractFloat64Exp( a );
2204         aSign = extractFloat64Sign( a );
2205         if ( aExp == 0x7FF ) {
2206             if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2207             return packFloat128( aSign, 0x7FFF, 0, 0 );
2208         }
2209         if ( aExp == 0 ) {
2210             if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2211             normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2212             --aExp;
2213         }
2214         shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2215         return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2216     
2217     }
2218     
2219     #endif
2220     
2221     /*
2222     -------------------------------------------------------------------------------
2223     Rounds the double-precision floating-point value `a' to an integer, and
2224     returns the result as a double-precision floating-point value.  The
2225     operation is performed according to the IEC/IEEE Standard for Binary
2226     Floating-point Arithmetic.
2227     -------------------------------------------------------------------------------
2228     */
2229     float64 float64_round_to_int( float64 a )
2230     {
2231         flag aSign;
2232         int16 aExp;
2233         bits64 lastBitMask, roundBitsMask;
2234         int8 roundingMode;
2235         float64 z;
2236     
2237         aExp = extractFloat64Exp( a );
2238         if ( 0x433 <= aExp ) {
2239             if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2240                 return propagateFloat64NaN( a, a );
2241             }
2242             return a;
2243         }
2244         if ( aExp <= 0x3FE ) {
2245             if ( (bits64) ( a<<1 ) == 0 ) return a;
2246             float_exception_flags |= float_flag_inexact;
2247             aSign = extractFloat64Sign( a );
2248             switch ( float_rounding_mode ) {
2249              case float_round_nearest_even:
2250                 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2251                     return packFloat64( aSign, 0x3FF, 0 );
2252                 }
2253                 break;
2254              case float_round_down:
2255                 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2256              case float_round_up:
2257                 return
2258                 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2259             }
2260             return packFloat64( aSign, 0, 0 );
2261         }
2262         lastBitMask = 1;
2263         lastBitMask <<= 0x433 - aExp;
2264         roundBitsMask = lastBitMask - 1;
2265         z = a;
2266         roundingMode = float_rounding_mode;
2267         if ( roundingMode == float_round_nearest_even ) {
2268             z += lastBitMask>>1;
2269             if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2270         }
2271         else if ( roundingMode != float_round_to_zero ) {
2272             if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2273                 z += roundBitsMask;
2274             }
2275         }
2276         z &= ~ roundBitsMask;
2277         if ( z != a ) float_exception_flags |= float_flag_inexact;
2278         return z;
2279     
2280     }
2281     
2282     /*
2283     -------------------------------------------------------------------------------
2284     Returns the result of adding the absolute values of the double-precision
2285     floating-point values `a' and `b'.  If `zSign' is true, the sum is negated
2286     before being returned.  `zSign' is ignored if the result is a NaN.  The
2287     addition is performed according to the IEC/IEEE Standard for Binary
2288     Floating-point Arithmetic.
2289     -------------------------------------------------------------------------------
2290     */
2291     static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2292     {
2293         int16 aExp, bExp, zExp;
2294         bits64 aSig, bSig, zSig;
2295         int16 expDiff;
2296     
2297         aSig = extractFloat64Frac( a );
2298         aExp = extractFloat64Exp( a );
2299         bSig = extractFloat64Frac( b );
2300         bExp = extractFloat64Exp( b );
2301         expDiff = aExp - bExp;
2302         aSig <<= 9;
2303         bSig <<= 9;
2304         if ( 0 < expDiff ) {
2305             if ( aExp == 0x7FF ) {
2306                 if ( aSig ) return propagateFloat64NaN( a, b );
2307                 return a;
2308             }
2309             if ( bExp == 0 ) {
2310                 --expDiff;
2311             }
2312             else {
2313                 bSig |= LIT64( 0x2000000000000000 );
2314             }
2315             shift64RightJamming( bSig, expDiff, &bSig );
2316             zExp = aExp;
2317         }
2318         else if ( expDiff < 0 ) {
2319             if ( bExp == 0x7FF ) {
2320                 if ( bSig ) return propagateFloat64NaN( a, b );
2321                 return packFloat64( zSign, 0x7FF, 0 );
2322             }
2323             if ( aExp == 0 ) {
2324                 ++expDiff;
2325             }
2326             else {
2327                 aSig |= LIT64( 0x2000000000000000 );
2328             }
2329             shift64RightJamming( aSig, - expDiff, &aSig );
2330             zExp = bExp;
2331         }
2332         else {
2333             if ( aExp == 0x7FF ) {
2334                 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2335                 return a;
2336             }
2337             if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2338             zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2339             zExp = aExp;
2340             goto roundAndPack;
2341         }
2342         aSig |= LIT64( 0x2000000000000000 );
2343         zSig = ( aSig + bSig )<<1;
2344         --zExp;
2345         if ( (sbits64) zSig < 0 ) {
2346             zSig = aSig + bSig;
2347             ++zExp;
2348         }
2349      roundAndPack:
2350         return roundAndPackFloat64( zSign, zExp, zSig );
2351     
2352     }
2353     
2354     /*
2355     -------------------------------------------------------------------------------
2356     Returns the result of subtracting the absolute values of the double-
2357     precision floating-point values `a' and `b'.  If `zSign' is true, the
2358     difference is negated before being returned.  `zSign' is ignored if the
2359     result is a NaN.  The subtraction is performed according to the IEC/IEEE
2360     Standard for Binary Floating-point Arithmetic.
2361     -------------------------------------------------------------------------------
2362     */
2363     static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2364     {
2365         int16 aExp, bExp, zExp;
2366         bits64 aSig, bSig, zSig;
2367         int16 expDiff;
2368     
2369         aSig = extractFloat64Frac( a );
2370         aExp = extractFloat64Exp( a );
2371         bSig = extractFloat64Frac( b );
2372         bExp = extractFloat64Exp( b );
2373         expDiff = aExp - bExp;
2374         aSig <<= 10;
2375         bSig <<= 10;
2376         if ( 0 < expDiff ) goto aExpBigger;
2377         if ( expDiff < 0 ) goto bExpBigger;
2378         if ( aExp == 0x7FF ) {
2379             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2380             float_raise( float_flag_invalid );
2381             return float64_default_nan;
2382         }
2383         if ( aExp == 0 ) {
2384             aExp = 1;
2385             bExp = 1;
2386         }
2387         if ( bSig < aSig ) goto aBigger;
2388         if ( aSig < bSig ) goto bBigger;
2389         return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2390      bExpBigger:
2391         if ( bExp == 0x7FF ) {
2392             if ( bSig ) return propagateFloat64NaN( a, b );
2393             return packFloat64( zSign ^ 1, 0x7FF, 0 );
2394         }
2395         if ( aExp == 0 ) {
2396             ++expDiff;
2397         }
2398         else {
2399             aSig |= LIT64( 0x4000000000000000 );
2400         }
2401         shift64RightJamming( aSig, - expDiff, &aSig );
2402         bSig |= LIT64( 0x4000000000000000 );
2403      bBigger:
2404         zSig = bSig - aSig;
2405         zExp = bExp;
2406         zSign ^= 1;
2407         goto normalizeRoundAndPack;
2408      aExpBigger:
2409         if ( aExp == 0x7FF ) {
2410             if ( aSig ) return propagateFloat64NaN( a, b );
2411             return a;
2412         }
2413         if ( bExp == 0 ) {
2414             --expDiff;
2415         }
2416         else {
2417             bSig |= LIT64( 0x4000000000000000 );
2418         }
2419         shift64RightJamming( bSig, expDiff, &bSig );
2420         aSig |= LIT64( 0x4000000000000000 );
2421      aBigger:
2422         zSig = aSig - bSig;
2423         zExp = aExp;
2424      normalizeRoundAndPack:
2425         --zExp;
2426         return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2427     
2428     }
2429     
2430     /*
2431     -------------------------------------------------------------------------------
2432     Returns the result of adding the double-precision floating-point values `a'
2433     and `b'.  The operation is performed according to the IEC/IEEE Standard for
2434     Binary Floating-point Arithmetic.
2435     -------------------------------------------------------------------------------
2436     */
2437     float64 float64_add( float64 a, float64 b )
2438     {
2439         flag aSign, bSign;
2440     
2441         aSign = extractFloat64Sign( a );
2442         bSign = extractFloat64Sign( b );
2443         if ( aSign == bSign ) {
2444             return addFloat64Sigs( a, b, aSign );
2445         }
2446         else {
2447             return subFloat64Sigs( a, b, aSign );
2448         }
2449     
2450     }
2451     
2452     /*
2453     -------------------------------------------------------------------------------
2454     Returns the result of subtracting the double-precision floating-point values
2455     `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2456     for Binary Floating-point Arithmetic.
2457     -------------------------------------------------------------------------------
2458     */
2459     float64 float64_sub( float64 a, float64 b )
2460     {
2461         flag aSign, bSign;
2462     
2463         aSign = extractFloat64Sign( a );
2464         bSign = extractFloat64Sign( b );
2465         if ( aSign == bSign ) {
2466             return subFloat64Sigs( a, b, aSign );
2467         }
2468         else {
2469             return addFloat64Sigs( a, b, aSign );
2470         }
2471     
2472     }
2473     
2474     /*
2475     -------------------------------------------------------------------------------
2476     Returns the result of multiplying the double-precision floating-point values
2477     `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2478     for Binary Floating-point Arithmetic.
2479     -------------------------------------------------------------------------------
2480     */
2481     float64 float64_mul( float64 a, float64 b )
2482     {
2483         flag aSign, bSign, zSign;
2484         int16 aExp, bExp, zExp;
2485         bits64 aSig, bSig, zSig0, zSig1;
2486     
2487         aSig = extractFloat64Frac( a );
2488         aExp = extractFloat64Exp( a );
2489         aSign = extractFloat64Sign( a );
2490         bSig = extractFloat64Frac( b );
2491         bExp = extractFloat64Exp( b );
2492         bSign = extractFloat64Sign( b );
2493         zSign = aSign ^ bSign;
2494         if ( aExp == 0x7FF ) {
2495             if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2496                 return propagateFloat64NaN( a, b );
2497             }
2498             if ( ( bExp | bSig ) == 0 ) {
2499                 float_raise( float_flag_invalid );
2500                 return float64_default_nan;
2501             }
2502             return packFloat64( zSign, 0x7FF, 0 );
2503         }
2504         if ( bExp == 0x7FF ) {
2505             if ( bSig ) return propagateFloat64NaN( a, b );
2506             if ( ( aExp | aSig ) == 0 ) {
2507                 float_raise( float_flag_invalid );
2508                 return float64_default_nan;
2509             }
2510             return packFloat64( zSign, 0x7FF, 0 );
2511         }
2512         if ( aExp == 0 ) {
2513             if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2514             normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2515         }
2516         if ( bExp == 0 ) {
2517             if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2518             normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2519         }
2520         zExp = aExp + bExp - 0x3FF;
2521         aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2522         bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2523         mul64To128( aSig, bSig, &zSig0, &zSig1 );
2524         zSig0 |= ( zSig1 != 0 );
2525         if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2526             zSig0 <<= 1;
2527             --zExp;
2528         }
2529         return roundAndPackFloat64( zSign, zExp, zSig0 );
2530     
2531     }
2532     
2533     /*
2534     -------------------------------------------------------------------------------
2535     Returns the result of dividing the double-precision floating-point value `a'
2536     by the corresponding value `b'.  The operation is performed according to
2537     the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2538     -------------------------------------------------------------------------------
2539     */
2540     float64 float64_div( float64 a, float64 b )
2541     {
2542         flag aSign, bSign, zSign;
2543         int16 aExp, bExp, zExp;
2544         bits64 aSig, bSig, zSig;
2545         bits64 rem0, rem1;
2546         bits64 term0, term1;
2547     
2548         aSig = extractFloat64Frac( a );
2549         aExp = extractFloat64Exp( a );
2550         aSign = extractFloat64Sign( a );
2551         bSig = extractFloat64Frac( b );
2552         bExp = extractFloat64Exp( b );
2553         bSign = extractFloat64Sign( b );
2554         zSign = aSign ^ bSign;
2555         if ( aExp == 0x7FF ) {
2556             if ( aSig ) return propagateFloat64NaN( a, b );
2557             if ( bExp == 0x7FF ) {
2558                 if ( bSig ) return propagateFloat64NaN( a, b );
2559                 float_raise( float_flag_invalid );
2560                 return float64_default_nan;
2561             }
2562             return packFloat64( zSign, 0x7FF, 0 );
2563         }
2564         if ( bExp == 0x7FF ) {
2565             if ( bSig ) return propagateFloat64NaN( a, b );
2566             return packFloat64( zSign, 0, 0 );
2567         }
2568         if ( bExp == 0 ) {
2569             if ( bSig == 0 ) {
2570                 if ( ( aExp | aSig ) == 0 ) {
2571                     float_raise( float_flag_invalid );
2572                     return float64_default_nan;
2573                 }
2574                 float_raise( float_flag_divbyzero );
2575                 return packFloat64( zSign, 0x7FF, 0 );
2576             }
2577             normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2578         }
2579         if ( aExp == 0 ) {
2580             if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2581             normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2582         }
2583         zExp = aExp - bExp + 0x3FD;
2584         aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2585         bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2586         if ( bSig <= ( aSig + aSig ) ) {
2587             aSig >>= 1;
2588             ++zExp;
2589         }
2590         zSig = estimateDiv128To64( aSig, 0, bSig );
2591         if ( ( zSig & 0x1FF ) <= 2 ) {
2592             mul64To128( bSig, zSig, &term0, &term1 );
2593             sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2594             while ( (sbits64) rem0 < 0 ) {
2595                 --zSig;
2596                 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2597             }
2598             zSig |= ( rem1 != 0 );
2599         }
2600         return roundAndPackFloat64( zSign, zExp, zSig );
2601     
2602     }
2603     
2604     /*
2605     -------------------------------------------------------------------------------
2606     Returns the remainder of the double-precision floating-point value `a'
2607     with respect to the corresponding value `b'.  The operation is performed
2608     according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2609     -------------------------------------------------------------------------------
2610     */
2611     float64 float64_rem( float64 a, float64 b )
2612     {
2613         flag aSign, bSign, zSign;
2614         int16 aExp, bExp, expDiff;
2615         bits64 aSig, bSig;
2616         bits64 q, alternateASig;
2617         sbits64 sigMean;
2618     
2619         aSig = extractFloat64Frac( a );
2620         aExp = extractFloat64Exp( a );
2621         aSign = extractFloat64Sign( a );
2622         bSig = extractFloat64Frac( b );
2623         bExp = extractFloat64Exp( b );
2624         bSign = extractFloat64Sign( b );
2625         if ( aExp == 0x7FF ) {
2626             if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2627                 return propagateFloat64NaN( a, b );
2628             }
2629             float_raise( float_flag_invalid );
2630             return float64_default_nan;
2631         }
2632         if ( bExp == 0x7FF ) {
2633             if ( bSig ) return propagateFloat64NaN( a, b );
2634             return a;
2635         }
2636         if ( bExp == 0 ) {
2637             if ( bSig == 0 ) {
2638                 float_raise( float_flag_invalid );
2639                 return float64_default_nan;
2640             }
2641             normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2642         }
2643         if ( aExp == 0 ) {
2644             if ( aSig == 0 ) return a;
2645             normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2646         }
2647         expDiff = aExp - bExp;
2648         aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2649         bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2650         if ( expDiff < 0 ) {
2651             if ( expDiff < -1 ) return a;
2652             aSig >>= 1;
2653         }
2654         q = ( bSig <= aSig );
2655         if ( q ) aSig -= bSig;
2656         expDiff -= 64;
2657         while ( 0 < expDiff ) {
2658             q = estimateDiv128To64( aSig, 0, bSig );
2659             q = ( 2 < q ) ? q - 2 : 0;
2660             aSig = - ( ( bSig>>2 ) * q );
2661             expDiff -= 62;
2662         }
2663         expDiff += 64;
2664         if ( 0 < expDiff ) {
2665             q = estimateDiv128To64( aSig, 0, bSig );
2666             q = ( 2 < q ) ? q - 2 : 0;
2667             q >>= 64 - expDiff;
2668             bSig >>= 2;
2669             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2670         }
2671         else {
2672             aSig >>= 2;
2673             bSig >>= 2;
2674         }
2675         do {
2676             alternateASig = aSig;
2677             ++q;
2678             aSig -= bSig;
2679         } while ( 0 <= (sbits64) aSig );
2680         sigMean = aSig + alternateASig;
2681         if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2682             aSig = alternateASig;
2683         }
2684         zSign = ( (sbits64) aSig < 0 );
2685         if ( zSign ) aSig = - aSig;
2686         return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
2687     
2688     }
2689     
2690     /*
2691     -------------------------------------------------------------------------------
2692     Returns the square root of the double-precision floating-point value `a'.
2693     The operation is performed according to the IEC/IEEE Standard for Binary
2694     Floating-point Arithmetic.
2695     -------------------------------------------------------------------------------
2696     */
2697     float64 float64_sqrt( float64 a )
2698     {
2699         flag aSign;
2700         int16 aExp, zExp;
2701         bits64 aSig, zSig;
2702         bits64 rem0, rem1, term0, term1; //, shiftedRem;
2703         //float64 z;
2704     
2705         aSig = extractFloat64Frac( a );
2706         aExp = extractFloat64Exp( a );
2707         aSign = extractFloat64Sign( a );
2708         if ( aExp == 0x7FF ) {
2709             if ( aSig ) return propagateFloat64NaN( a, a );
2710             if ( ! aSign ) return a;
2711             float_raise( float_flag_invalid );
2712             return float64_default_nan;
2713         }
2714         if ( aSign ) {
2715             if ( ( aExp | aSig ) == 0 ) return a;
2716             float_raise( float_flag_invalid );
2717             return float64_default_nan;
2718         }
2719         if ( aExp == 0 ) {
2720             if ( aSig == 0 ) return 0;
2721             normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2722         }
2723         zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2724         aSig |= LIT64( 0x0010000000000000 );
2725         zSig = estimateSqrt32( aExp, aSig>>21 );
2726         zSig <<= 31;
2727         aSig <<= 9 - ( aExp & 1 );
2728         zSig = estimateDiv128To64( aSig, 0, zSig ) + zSig + 2;
2729         if ( ( zSig & 0x3FF ) <= 5 ) {
2730             if ( zSig < 2 ) {
2731                 zSig = LIT64( 0xFFFFFFFFFFFFFFFF );
2732             }
2733             else {
2734                 aSig <<= 2;
2735                 mul64To128( zSig, zSig, &term0, &term1 );
2736                 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2737                 while ( (sbits64) rem0 < 0 ) {
2738                     --zSig;
2739                     shortShift128Left( 0, zSig, 1, &term0, &term1 );
2740                     term1 |= 1;
2741                     add128( rem0, rem1, term0, term1, &rem0, &rem1 );
2742                 }
2743                 zSig |= ( ( rem0 | rem1 ) != 0 );
2744             }
2745         }
2746         shift64RightJamming( zSig, 1, &zSig );
2747         return roundAndPackFloat64( 0, zExp, zSig );
2748     
2749     }
2750     
2751     /*
2752     -------------------------------------------------------------------------------
2753     Returns 1 if the double-precision floating-point value `a' is equal to the
2754     corresponding value `b', and 0 otherwise.  The comparison is performed
2755     according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2756     -------------------------------------------------------------------------------
2757     */
2758     flag float64_eq( float64 a, float64 b )
2759     {
2760     
2761         if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2762              || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2763            ) {
2764             if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2765                 float_raise( float_flag_invalid );
2766             }
2767             return 0;
2768         }
2769         return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2770     
2771     }
2772     
2773     /*
2774     -------------------------------------------------------------------------------
2775     Returns 1 if the double-precision floating-point value `a' is less than or
2776     equal to the corresponding value `b', and 0 otherwise.  The comparison is
2777     performed according to the IEC/IEEE Standard for Binary Floating-point
2778     Arithmetic.
2779     -------------------------------------------------------------------------------
2780     */
2781     flag float64_le( float64 a, float64 b )
2782     {
2783         flag aSign, bSign;
2784     
2785         if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2786              || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2787            ) {
2788             float_raise( float_flag_invalid );
2789             return 0;
2790         }
2791         aSign = extractFloat64Sign( a );
2792         bSign = extractFloat64Sign( b );
2793         if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2794         return ( a == b ) || ( aSign ^ ( a < b ) );
2795     
2796     }
2797     
2798     /*
2799     -------------------------------------------------------------------------------
2800     Returns 1 if the double-precision floating-point value `a' is less than
2801     the corresponding value `b', and 0 otherwise.  The comparison is performed
2802     according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2803     -------------------------------------------------------------------------------
2804     */
2805     flag float64_lt( float64 a, float64 b )
2806     {
2807         flag aSign, bSign;
2808     
2809         if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2810              || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2811            ) {
2812             float_raise( float_flag_invalid );
2813             return 0;
2814         }
2815         aSign = extractFloat64Sign( a );
2816         bSign = extractFloat64Sign( b );
2817         if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2818         return ( a != b ) && ( aSign ^ ( a < b ) );
2819     
2820     }
2821     
2822     /*
2823     -------------------------------------------------------------------------------
2824     Returns 1 if the double-precision floating-point value `a' is equal to the
2825     corresponding value `b', and 0 otherwise.  The invalid exception is raised
2826     if either operand is a NaN.  Otherwise, the comparison is performed
2827     according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2828     -------------------------------------------------------------------------------
2829     */
2830     flag float64_eq_signaling( float64 a, float64 b )
2831     {
2832     
2833         if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2834              || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2835            ) {
2836             float_raise( float_flag_invalid );
2837             return 0;
2838         }
2839         return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2840     
2841     }
2842     
2843     /*
2844     -------------------------------------------------------------------------------
2845     Returns 1 if the double-precision floating-point value `a' is less than or
2846     equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2847     cause an exception.  Otherwise, the comparison is performed according to the
2848     IEC/IEEE Standard for Binary Floating-point Arithmetic.
2849     -------------------------------------------------------------------------------
2850     */
2851     flag float64_le_quiet( float64 a, float64 b )
2852     {
2853         flag aSign, bSign;
2854         //int16 aExp, bExp;
2855     
2856         if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2857              || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2858            ) {
2859             if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2860                 float_raise( float_flag_invalid );
2861             }
2862             return 0;
2863         }
2864         aSign = extractFloat64Sign( a );
2865         bSign = extractFloat64Sign( b );
2866         if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2867         return ( a == b ) || ( aSign ^ ( a < b ) );
2868     
2869     }
2870     
2871     /*
2872     -------------------------------------------------------------------------------
2873     Returns 1 if the double-precision floating-point value `a' is less than
2874     the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2875     exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2876     Standard for Binary Floating-point Arithmetic.
2877     -------------------------------------------------------------------------------
2878     */
2879     flag float64_lt_quiet( float64 a, float64 b )
2880     {
2881         flag aSign, bSign;
2882     
2883         if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2884              || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2885            ) {
2886             if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2887                 float_raise( float_flag_invalid );
2888             }
2889             return 0;
2890         }
2891         aSign = extractFloat64Sign( a );
2892         bSign = extractFloat64Sign( b );
2893         if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2894         return ( a != b ) && ( aSign ^ ( a < b ) );
2895     
2896     }
2897     
2898     #ifdef FLOATX80
2899     
2900     /*
2901     -------------------------------------------------------------------------------
2902     Returns the result of converting the extended double-precision floating-
2903     point value `a' to the 32-bit two's complement integer format.  The
2904     conversion is performed according to the IEC/IEEE Standard for Binary
2905     Floating-point Arithmetic---which means in particular that the conversion
2906     is rounded according to the current rounding mode.  If `a' is a NaN, the
2907     largest positive integer is returned.  Otherwise, if the conversion
2908     overflows, the largest integer with the same sign as `a' is returned.
2909     -------------------------------------------------------------------------------
2910     */
2911     int32 floatx80_to_int32( floatx80 a )
2912     {
2913         flag aSign;
2914         int32 aExp, shiftCount;
2915         bits64 aSig;
2916     
2917         aSig = extractFloatx80Frac( a );
2918         aExp = extractFloatx80Exp( a );
2919         aSign = extractFloatx80Sign( a );
2920         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2921         shiftCount = 0x4037 - aExp;
2922         if ( shiftCount <= 0 ) shiftCount = 1;
2923         shift64RightJamming( aSig, shiftCount, &aSig );
2924         return roundAndPackInt32( aSign, aSig );
2925     
2926     }
2927     
2928     /*
2929     -------------------------------------------------------------------------------
2930     Returns the result of converting the extended double-precision floating-
2931     point value `a' to the 32-bit two's complement integer format.  The
2932     conversion is performed according to the IEC/IEEE Standard for Binary
2933     Floating-point Arithmetic, except that the conversion is always rounded
2934     toward zero.  If `a' is a NaN, the largest positive integer is returned.
2935     Otherwise, if the conversion overflows, the largest integer with the same
2936     sign as `a' is returned.
2937     -------------------------------------------------------------------------------
2938     */
2939     int32 floatx80_to_int32_round_to_zero( floatx80 a )
2940     {
2941         flag aSign;
2942         int32 aExp, shiftCount;
2943         bits64 aSig, savedASig;
2944         int32 z;
2945     
2946         aSig = extractFloatx80Frac( a );
2947         aExp = extractFloatx80Exp( a );
2948         aSign = extractFloatx80Sign( a );
2949         shiftCount = 0x403E - aExp;
2950         if ( shiftCount < 32 ) {
2951             if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2952             goto invalid;
2953         }
2954         else if ( 63 < shiftCount ) {
2955             if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2956             return 0;
2957         }
2958         savedASig = aSig;
2959         aSig >>= shiftCount;
2960         z = aSig;
2961         if ( aSign ) z = - z;
2962         if ( ( z < 0 ) ^ aSign ) {
2963      invalid:
2964             float_exception_flags |= float_flag_invalid;
2965             return aSign ? 0x80000000 : 0x7FFFFFFF;
2966         }
2967         if ( ( aSig<<shiftCount ) != savedASig ) {
2968             float_exception_flags |= float_flag_inexact;
2969         }
2970         return z;
2971     
2972     }
2973     
2974     /*
2975     -------------------------------------------------------------------------------
2976     Returns the result of converting the extended double-precision floating-
2977     point value `a' to the single-precision floating-point format.  The
2978     conversion is performed according to the IEC/IEEE Standard for Binary
2979     Floating-point Arithmetic.
2980     -------------------------------------------------------------------------------
2981     */
2982     float32 floatx80_to_float32( floatx80 a )
2983     {
2984         flag aSign;
2985         int32 aExp;
2986         bits64 aSig;
2987     
2988         aSig = extractFloatx80Frac( a );
2989         aExp = extractFloatx80Exp( a );
2990         aSign = extractFloatx80Sign( a );
2991         if ( aExp == 0x7FFF ) {
2992             if ( (bits64) ( aSig<<1 ) ) {
2993                 return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
2994             }
2995             return packFloat32( aSign, 0xFF, 0 );
2996         }
2997         shift64RightJamming( aSig, 33, &aSig );
2998         if ( aExp || aSig ) aExp -= 0x3F81;
2999         return roundAndPackFloat32( aSign, aExp, aSig );
3000     
3001     }
3002     
3003     /*
3004     -------------------------------------------------------------------------------
3005     Returns the result of converting the extended double-precision floating-
3006     point value `a' to the double-precision floating-point format.  The
3007     conversion is performed according to the IEC/IEEE Standard for Binary
3008     Floating-point Arithmetic.
3009     -------------------------------------------------------------------------------
3010     */
3011     float64 floatx80_to_float64( floatx80 a )
3012     {
3013         flag aSign;
3014         int32 aExp;
3015         bits64 aSig, zSig;
3016     
3017         aSig = extractFloatx80Frac( a );
3018         aExp = extractFloatx80Exp( a );
3019         aSign = extractFloatx80Sign( a );
3020         if ( aExp == 0x7FFF ) {
3021             if ( (bits64) ( aSig<<1 ) ) {
3022                 return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3023             }
3024             return packFloat64( aSign, 0x7FF, 0 );
3025         }
3026         shift64RightJamming( aSig, 1, &zSig );
3027         if ( aExp || aSig ) aExp -= 0x3C01;
3028         return roundAndPackFloat64( aSign, aExp, zSig );
3029     
3030     }
3031     
3032     #ifdef FLOAT128
3033     
3034     /*
3035     -------------------------------------------------------------------------------
3036     Returns the result of converting the extended double-precision floating-
3037     point value `a' to the quadruple-precision floating-point format.  The
3038     conversion is performed according to the IEC/IEEE Standard for Binary
3039     Floating-point Arithmetic.
3040     -------------------------------------------------------------------------------
3041     */
3042     float128 floatx80_to_float128( floatx80 a )
3043     {
3044         flag aSign;
3045         int16 aExp;
3046         bits64 aSig, zSig0, zSig1;
3047     
3048         aSig = extractFloatx80Frac( a );
3049         aExp = extractFloatx80Exp( a );
3050         aSign = extractFloatx80Sign( a );
3051         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3052             return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3053         }
3054         shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3055         return packFloat128( aSign, aExp, zSig0, zSig1 );
3056     
3057     }
3058     
3059     #endif
3060     
3061     /*
3062     -------------------------------------------------------------------------------
3063     Rounds the extended double-precision floating-point value `a' to an integer,
3064     and returns the result as an extended quadruple-precision floating-point
3065     value.  The operation is performed according to the IEC/IEEE Standard for
3066     Binary Floating-point Arithmetic.
3067     -------------------------------------------------------------------------------
3068     */
3069     floatx80 floatx80_round_to_int( floatx80 a )
3070     {
3071         flag aSign;
3072         int32 aExp;
3073         bits64 lastBitMask, roundBitsMask;
3074         int8 roundingMode;
3075         floatx80 z;
3076     
3077         aExp = extractFloatx80Exp( a );
3078         if ( 0x403E <= aExp ) {
3079             if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3080                 return propagateFloatx80NaN( a, a );
3081             }
3082             return a;
3083         }
3084         if ( aExp <= 0x3FFE ) {
3085             if (    ( aExp == 0 )
3086                  && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3087                 return a;
3088             }
3089             float_exception_flags |= float_flag_inexact;
3090             aSign = extractFloatx80Sign( a );
3091             switch ( float_rounding_mode ) {
3092              case float_round_nearest_even:
3093                 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3094                    ) {
3095                     return
3096                         packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3097                 }
3098                 break;
3099              case float_round_down:
3100                 return
3101                       aSign ?
3102                           packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3103                     : packFloatx80( 0, 0, 0 );
3104              case float_round_up:
3105                 return
3106                       aSign ? packFloatx80( 1, 0, 0 )
3107                     : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3108             }
3109             return packFloatx80( aSign, 0, 0 );
3110         }
3111         lastBitMask = 1;
3112         lastBitMask <<= 0x403E - aExp;
3113         roundBitsMask = lastBitMask - 1;
3114         z = a;
3115         roundingMode = float_rounding_mode;
3116         if ( roundingMode == float_round_nearest_even ) {
3117             z.low += lastBitMask>>1;
3118             if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3119         }
3120         else if ( roundingMode != float_round_to_zero ) {
3121             if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3122                 z.low += roundBitsMask;
3123             }
3124         }
3125         z.low &= ~ roundBitsMask;
3126         if ( z.low == 0 ) {
3127             ++z.high;
3128             z.low = LIT64( 0x8000000000000000 );
3129         }
3130         if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3131         return z;
3132     
3133     }
3134     
3135     /*
3136     -------------------------------------------------------------------------------
3137     Returns the result of adding the absolute values of the extended double-
3138     precision floating-point values `a' and `b'.  If `zSign' is true, the sum is
3139     negated before being returned.  `zSign' is ignored if the result is a NaN.
3140     The addition is performed according to the IEC/IEEE Standard for Binary
3141     Floating-point Arithmetic.
3142     -------------------------------------------------------------------------------
3143     */
3144     static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3145     {
3146         int32 aExp, bExp, zExp;
3147         bits64 aSig, bSig, zSig0, zSig1;
3148         int32 expDiff;
3149     
3150         aSig = extractFloatx80Frac( a );
3151         aExp = extractFloatx80Exp( a );
3152         bSig = extractFloatx80Frac( b );
3153         bExp = extractFloatx80Exp( b );
3154         expDiff = aExp - bExp;
3155         if ( 0 < expDiff ) {
3156             if ( aExp == 0x7FFF ) {
3157                 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3158                 return a;
3159             }
3160             if ( bExp == 0 ) --expDiff;
3161             shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3162             zExp = aExp;
3163         }
3164         else if ( expDiff < 0 ) {
3165             if ( bExp == 0x7FFF ) {
3166                 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3167                 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3168             }
3169             if ( aExp == 0 ) ++expDiff;
3170             shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3171             zExp = bExp;
3172         }
3173         else {
3174             if ( aExp == 0x7FFF ) {
3175                 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3176                     return propagateFloatx80NaN( a, b );
3177                 }
3178                 return a;
3179             }
3180             zSig1 = 0;
3181             zSig0 = aSig + bSig;
3182             if ( aExp == 0 ) {
3183                 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3184                 goto roundAndPack;
3185             }
3186             zExp = aExp;
3187             goto shiftRight1;
3188         }
3189         
3190         zSig0 = aSig + bSig;
3191     
3192         if ( (sbits64) zSig0 < 0 ) goto roundAndPack; 
3193      shiftRight1:
3194         shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3195         zSig0 |= LIT64( 0x8000000000000000 );
3196         ++zExp;
3197      roundAndPack:
3198         return
3199             roundAndPackFloatx80(
3200                 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3201     
3202     }
3203     
3204     /*
3205     -------------------------------------------------------------------------------
3206     Returns the result of subtracting the absolute values of the extended
3207     double-precision floating-point values `a' and `b'.  If `zSign' is true,
3208     the difference is negated before being returned.  `zSign' is ignored if the
3209     result is a NaN.  The subtraction is performed according to the IEC/IEEE
3210     Standard for Binary Floating-point Arithmetic.
3211     -------------------------------------------------------------------------------
3212     */
3213     static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3214     {
3215         int32 aExp, bExp, zExp;
3216         bits64 aSig, bSig, zSig0, zSig1;
3217         int32 expDiff;
3218         floatx80 z;
3219     
3220         aSig = extractFloatx80Frac( a );
3221         aExp = extractFloatx80Exp( a );
3222         bSig = extractFloatx80Frac( b );
3223         bExp = extractFloatx80Exp( b );
3224         expDiff = aExp - bExp;
3225         if ( 0 < expDiff ) goto aExpBigger;
3226         if ( expDiff < 0 ) goto bExpBigger;
3227         if ( aExp == 0x7FFF ) {
3228             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3229                 return propagateFloatx80NaN( a, b );
3230             }
3231             float_raise( float_flag_invalid );
3232             z.low = floatx80_default_nan_low;
3233             z.high = floatx80_default_nan_high;
3234             return z;
3235         }
3236         if ( aExp == 0 ) {
3237             aExp = 1;
3238             bExp = 1;
3239         }
3240         zSig1 = 0;
3241         if ( bSig < aSig ) goto aBigger;
3242         if ( aSig < bSig ) goto bBigger;
3243         return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3244      bExpBigger:
3245         if ( bExp == 0x7FFF ) {
3246             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3247             return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3248         }
3249         if ( aExp == 0 ) ++expDiff;
3250         shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3251      bBigger:
3252         sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3253         zExp = bExp;
3254         zSign ^= 1;
3255         goto normalizeRoundAndPack;
3256      aExpBigger:
3257         if ( aExp == 0x7FFF ) {
3258             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3259             return a;
3260         }
3261         if ( bExp == 0 ) --expDiff;
3262         shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3263      aBigger:
3264         sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3265         zExp = aExp;
3266      normalizeRoundAndPack:
3267         return
3268             normalizeRoundAndPackFloatx80(
3269                 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3270     
3271     }
3272     
3273     /*
3274     -------------------------------------------------------------------------------
3275     Returns the result of adding the extended double-precision floating-point
3276     values `a' and `b'.  The operation is performed according to the IEC/IEEE
3277     Standard for Binary Floating-point Arithmetic.
3278     -------------------------------------------------------------------------------
3279     */
3280     floatx80 floatx80_add( floatx80 a, floatx80 b )
3281     {
3282         flag aSign, bSign;
3283         
3284         aSign = extractFloatx80Sign( a );
3285         bSign = extractFloatx80Sign( b );
3286         if ( aSign == bSign ) {
3287             return addFloatx80Sigs( a, b, aSign );
3288         }
3289         else {
3290             return subFloatx80Sigs( a, b, aSign );
3291         }
3292         
3293     }
3294     
3295     /*
3296     -------------------------------------------------------------------------------
3297     Returns the result of subtracting the extended double-precision floating-
3298     point values `a' and `b'.  The operation is performed according to the
3299     IEC/IEEE Standard for Binary Floating-point Arithmetic.
3300     -------------------------------------------------------------------------------
3301     */
3302     floatx80 floatx80_sub( floatx80 a, floatx80 b )
3303     {
3304         flag aSign, bSign;
3305     
3306         aSign = extractFloatx80Sign( a );
3307         bSign = extractFloatx80Sign( b );
3308         if ( aSign == bSign ) {
3309             return subFloatx80Sigs( a, b, aSign );
3310         }
3311         else {
3312             return addFloatx80Sigs( a, b, aSign );
3313         }
3314     
3315     }
3316     
3317     /*
3318     -------------------------------------------------------------------------------
3319     Returns the result of multiplying the extended double-precision floating-
3320     point values `a' and `b'.  The operation is performed according to the
3321     IEC/IEEE Standard for Binary Floating-point Arithmetic.
3322     -------------------------------------------------------------------------------
3323     */
3324     floatx80 floatx80_mul( floatx80 a, floatx80 b )
3325     {
3326         flag aSign, bSign, zSign;
3327         int32 aExp, bExp, zExp;
3328         bits64 aSig, bSig, zSig0, zSig1;
3329         floatx80 z;
3330     
3331         aSig = extractFloatx80Frac( a );
3332         aExp = extractFloatx80Exp( a );
3333         aSign = extractFloatx80Sign( a );
3334         bSig = extractFloatx80Frac( b );
3335         bExp = extractFloatx80Exp( b );
3336         bSign = extractFloatx80Sign( b );
3337         zSign = aSign ^ bSign;
3338         if ( aExp == 0x7FFF ) {
3339             if (    (bits64) ( aSig<<1 )
3340                  || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3341                 return propagateFloatx80NaN( a, b );
3342             }
3343             if ( ( bExp | bSig ) == 0 ) goto invalid;
3344             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3345         }
3346         if ( bExp == 0x7FFF ) {
3347             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3348             if ( ( aExp | aSig ) == 0 ) {
3349      invalid:
3350                 float_raise( float_flag_invalid );
3351                 z.low = floatx80_default_nan_low;
3352                 z.high = floatx80_default_nan_high;
3353                 return z;
3354             }
3355             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3356         }
3357         if ( aExp == 0 ) {
3358             if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3359             normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3360         }
3361         if ( bExp == 0 ) {
3362             if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3363             normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3364         }
3365         zExp = aExp + bExp - 0x3FFE;
3366         mul64To128( aSig, bSig, &zSig0, &zSig1 );
3367         if ( 0 < (sbits64) zSig0 ) {
3368             shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3369             --zExp;
3370         }
3371         return
3372             roundAndPackFloatx80(
3373                 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3374     
3375     }
3376     
3377     /*
3378     -------------------------------------------------------------------------------
3379     Returns the result of dividing the extended double-precision floating-point
3380     value `a' by the corresponding value `b'.  The operation is performed
3381     according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3382     -------------------------------------------------------------------------------
3383     */
3384     floatx80 floatx80_div( floatx80 a, floatx80 b )
3385     {
3386         flag aSign, bSign, zSign;
3387         int32 aExp, bExp, zExp;
3388         bits64 aSig, bSig, zSig0, zSig1;
3389         bits64 rem0, rem1, rem2, term0, term1, term2;
3390         floatx80 z;
3391     
3392         aSig = extractFloatx80Frac( a );
3393         aExp = extractFloatx80Exp( a );
3394         aSign = extractFloatx80Sign( a );
3395         bSig = extractFloatx80Frac( b );
3396         bExp = extractFloatx80Exp( b );
3397         bSign = extractFloatx80Sign( b );
3398         zSign = aSign ^ bSign;
3399         if ( aExp == 0x7FFF ) {
3400             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3401             if ( bExp == 0x7FFF ) {
3402                 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3403                 goto invalid;
3404             }
3405             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3406         }
3407         if ( bExp == 0x7FFF ) {
3408             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3409             return packFloatx80( zSign, 0, 0 );
3410         }
3411         if ( bExp == 0 ) {
3412             if ( bSig == 0 ) {
3413                 if ( ( aExp | aSig ) == 0 ) {
3414      invalid:
3415                     float_raise( float_flag_invalid );
3416                     z.low = floatx80_default_nan_low;
3417                     z.high = floatx80_default_nan_high;
3418                     return z;
3419                 }
3420                 float_raise( float_flag_divbyzero );
3421                 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3422             }
3423             normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3424         }
3425         if ( aExp == 0 ) {
3426             if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3427             normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3428         }
3429         zExp = aExp - bExp + 0x3FFE;
3430         rem1 = 0;
3431         if ( bSig <= aSig ) {
3432             shift128Right( aSig, 0, 1, &aSig, &rem1 );
3433             ++zExp;
3434         }
3435         zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3436         mul64To128( bSig, zSig0, &term0, &term1 );
3437         sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3438         while ( (sbits64) rem0 < 0 ) {
3439             --zSig0;
3440             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3441         }
3442         zSig1 = estimateDiv128To64( rem1, 0, bSig );
3443         if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3444             mul64To128( bSig, zSig1, &term1, &term2 );
3445             sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3446             while ( (sbits64) rem1 < 0 ) {
3447                 --zSig1;
3448                 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3449             }
3450             zSig1 |= ( ( rem1 | rem2 ) != 0 );
3451         }
3452         return
3453             roundAndPackFloatx80(
3454                 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3455     
3456     }
3457     
3458     /*
3459     -------------------------------------------------------------------------------
3460     Returns the remainder of the extended double-precision floating-point value
3461     `a' with respect to the corresponding value `b'.  The operation is performed
3462     according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3463     -------------------------------------------------------------------------------
3464     */
3465     floatx80 floatx80_rem( floatx80 a, floatx80 b )
3466     {
3467         flag aSign, bSign, zSign;
3468         int32 aExp, bExp, expDiff;
3469         bits64 aSig0, aSig1, bSig;
3470         bits64 q, term0, term1, alternateASig0, alternateASig1;
3471         floatx80 z;
3472     
3473         aSig0 = extractFloatx80Frac( a );
3474         aExp = extractFloatx80Exp( a );
3475         aSign = extractFloatx80Sign( a );
3476         bSig = extractFloatx80Frac( b );
3477         bExp = extractFloatx80Exp( b );
3478         bSign = extractFloatx80Sign( b );
3479         if ( aExp == 0x7FFF ) {
3480             if (    (bits64) ( aSig0<<1 )
3481                  || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3482                 return propagateFloatx80NaN( a, b );
3483             }
3484             goto invalid;
3485         }
3486         if ( bExp == 0x7FFF ) {
3487             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3488             return a;
3489         }
3490         if ( bExp == 0 ) {
3491             if ( bSig == 0 ) {
3492      invalid:
3493                 float_raise( float_flag_invalid );
3494                 z.low = floatx80_default_nan_low;
3495                 z.high = floatx80_default_nan_high;
3496                 return z;
3497             }
3498             normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3499         }
3500         if ( aExp == 0 ) {
3501             if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3502             normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3503         }
3504         bSig |= LIT64( 0x8000000000000000 );
3505         zSign = aSign;
3506         expDiff = aExp - bExp;
3507         aSig1 = 0;
3508         if ( expDiff < 0 ) {
3509             if ( expDiff < -1 ) return a;
3510             shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3511             expDiff = 0;
3512         }
3513         q = ( bSig <= aSig0 );
3514         if ( q ) aSig0 -= bSig;
3515         expDiff -= 64;
3516         while ( 0 < expDiff ) {
3517             q = estimateDiv128To64( aSig0, aSig1, bSig );
3518             q = ( 2 < q ) ? q - 2 : 0;
3519             mul64To128( bSig, q, &term0, &term1 );
3520             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3521             shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3522             expDiff -= 62;
3523         }
3524         expDiff += 64;
3525         if ( 0 < expDiff ) {
3526             q = estimateDiv128To64( aSig0, aSig1, bSig );
3527             q = ( 2 < q ) ? q - 2 : 0;
3528             q >>= 64 - expDiff;
3529             mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3530             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3531             shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3532             while ( le128( term0, term1, aSig0, aSig1 ) ) {
3533                 ++q;
3534                 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3535             }
3536         }
3537         else {
3538             term1 = 0;
3539             term0 = bSig;
3540         }
3541         sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3542         if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3543              || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3544                   && ( q & 1 ) )
3545            ) {
3546             aSig0 = alternateASig0;
3547             aSig1 = alternateASig1;
3548             zSign = ! zSign;
3549         }
3550         return
3551             normalizeRoundAndPackFloatx80(
3552                 80, zSign, bExp + expDiff, aSig0, aSig1 );
3553     
3554     }
3555     
3556     /*
3557     -------------------------------------------------------------------------------
3558     Returns the square root of the extended double-precision floating-point
3559     value `a'.  The operation is performed according to the IEC/IEEE Standard
3560     for Binary Floating-point Arithmetic.
3561     -------------------------------------------------------------------------------
3562     */
3563     floatx80 floatx80_sqrt( floatx80 a )
3564     {
3565         flag aSign;
3566         int32 aExp, zExp;
3567         bits64 aSig0, aSig1, zSig0, zSig1;
3568         bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3569         bits64 shiftedRem0, shiftedRem1;
3570         floatx80 z;
3571     
3572         aSig0 = extractFloatx80Frac( a );
3573         aExp = extractFloatx80Exp( a );
3574         aSign = extractFloatx80Sign( a );
3575         if ( aExp == 0x7FFF ) {
3576             if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3577             if ( ! aSign ) return a;
3578             goto invalid;
3579         }
3580         if ( aSign ) {
3581             if ( ( aExp | aSig0 ) == 0 ) return a;
3582      invalid:
3583             float_raise( float_flag_invalid );
3584             z.low = floatx80_default_nan_low;
3585             z.high = floatx80_default_nan_high;
3586             return z;
3587         }
3588         if ( aExp == 0 ) {
3589             if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3590             normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3591         }
3592         zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3593         zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3594         zSig0 <<= 31;
3595         aSig1 = 0;
3596         shift128Right( aSig0, 0, ( aExp & 1 ) + 2, &aSig0, &aSig1 );
3597         zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
3598         if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
3599         shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
3600         mul64To128( zSig0, zSig0, &term0, &term1 );
3601         sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3602         while ( (sbits64) rem0 < 0 ) {
3603             --zSig0;
3604             shortShift128Left( 0, zSig0, 1, &term0, &term1 );
3605             term1 |= 1;
3606             add128( rem0, rem1, term0, term1, &rem0, &rem1 );
3607         }
3608         shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
3609         zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
3610         if ( (bits64) ( zSig1<<1 ) <= 10 ) {
3611             if ( zSig1 == 0 ) zSig1 = 1;
3612             mul64To128( zSig0, zSig1, &term1, &term2 );
3613             shortShift128Left( term1, term2, 1, &term1, &term2 );
3614             sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3615             mul64To128( zSig1, zSig1, &term2, &term3 );
3616             sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3617             while ( (sbits64) rem1 < 0 ) {
3618                 --zSig1;
3619                 shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
3620                 term3 |= 1;
3621                 add192(
3622                     rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
3623             }
3624             zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3625         }
3626         return
3627             roundAndPackFloatx80(
3628                 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
3629     
3630     }
3631     
3632     /*
3633     -------------------------------------------------------------------------------
3634     Returns 1 if the extended double-precision floating-point value `a' is
3635     equal to the corresponding value `b', and 0 otherwise.  The comparison is
3636     performed according to the IEC/IEEE Standard for Binary Floating-point
3637     Arithmetic.
3638     -------------------------------------------------------------------------------
3639     */
3640     flag floatx80_eq( floatx80 a, floatx80 b )
3641     {
3642     
3643         if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3644                   && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3645              || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3646                   && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3647            ) {
3648             if (    floatx80_is_signaling_nan( a )
3649                  || floatx80_is_signaling_nan( b ) ) {
3650                 float_raise( float_flag_invalid );
3651             }
3652             return 0;
3653         }
3654         return
3655                ( a.low == b.low )
3656             && (    ( a.high == b.high )
3657                  || (    ( a.low == 0 )
3658                       && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3659                );
3660     
3661     }
3662     
3663     /*
3664     -------------------------------------------------------------------------------
3665     Returns 1 if the extended double-precision floating-point value `a' is
3666     less than or equal to the corresponding value `b', and 0 otherwise.  The
3667     comparison is performed according to the IEC/IEEE Standard for Binary
3668     Floating-point Arithmetic.
3669     -------------------------------------------------------------------------------
3670     */
3671     flag floatx80_le( floatx80 a, floatx80 b )
3672     {
3673         flag aSign, bSign;
3674     
3675         if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3676                   && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3677              || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3678                   && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3679            ) {
3680             float_raise( float_flag_invalid );
3681             return 0;
3682         }
3683         aSign = extractFloatx80Sign( a );
3684         bSign = extractFloatx80Sign( b );
3685         if ( aSign != bSign ) {
3686             return
3687                    aSign
3688                 || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3689                      == 0 );
3690         }
3691         return
3692               aSign ? le128( b.high, b.low, a.high, a.low )
3693             : le128( a.high, a.low, b.high, b.low );
3694     
3695     }
3696     
3697     /*
3698     -------------------------------------------------------------------------------
3699     Returns 1 if the extended double-precision floating-point value `a' is
3700     less than the corresponding value `b', and 0 otherwise.  The comparison
3701     is performed according to the IEC/IEEE Standard for Binary Floating-point
3702     Arithmetic.
3703     -------------------------------------------------------------------------------
3704     */
3705     flag floatx80_lt( floatx80 a, floatx80 b )
3706     {
3707         flag aSign, bSign;
3708     
3709         if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3710                   && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3711              || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3712                   && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3713            ) {
3714             float_raise( float_flag_invalid );
3715             return 0;
3716         }
3717         aSign = extractFloatx80Sign( a );
3718         bSign = extractFloatx80Sign( b );
3719         if ( aSign != bSign ) {
3720             return
3721                    aSign
3722                 && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3723                      != 0 );
3724         }
3725         return
3726               aSign ? lt128( b.high, b.low, a.high, a.low )
3727             : lt128( a.high, a.low, b.high, b.low );
3728     
3729     }
3730     
3731     /*
3732     -------------------------------------------------------------------------------
3733     Returns 1 if the extended double-precision floating-point value `a' is equal
3734     to the corresponding value `b', and 0 otherwise.  The invalid exception is
3735     raised if either operand is a NaN.  Otherwise, the comparison is performed
3736     according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3737     -------------------------------------------------------------------------------
3738     */
3739     flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3740     {
3741     
3742         if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3743                   && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3744              || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3745                   && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3746            ) {
3747             float_raise( float_flag_invalid );
3748             return 0;
3749         }
3750         return
3751                ( a.low == b.low )
3752             && (    ( a.high == b.high )
3753                  || (    ( a.low == 0 )
3754                       && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3755                );
3756     
3757     }
3758     
3759     /*
3760     -------------------------------------------------------------------------------
3761     Returns 1 if the extended double-precision floating-point value `a' is less
3762     than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
3763     do not cause an exception.  Otherwise, the comparison is performed according
3764     to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3765     -------------------------------------------------------------------------------
3766     */
3767     flag floatx80_le_quiet( floatx80 a, floatx80 b )
3768     {
3769         flag aSign, bSign;
3770     
3771         if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3772                   && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3773              || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3774                   && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3775            ) {
3776             if (    floatx80_is_signaling_nan( a )
3777                  || floatx80_is_signaling_nan( b ) ) {
3778                 float_raise( float_flag_invalid );
3779             }
3780             return 0;
3781         }
3782         aSign = extractFloatx80Sign( a );
3783         bSign = extractFloatx80Sign( b );
3784         if ( aSign != bSign ) {
3785             return
3786                    aSign
3787                 || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3788                      == 0 );
3789         }
3790         return
3791               aSign ? le128( b.high, b.low, a.high, a.low )
3792             : le128( a.high, a.low, b.high, b.low );
3793     
3794     }
3795     
3796     /*
3797     -------------------------------------------------------------------------------
3798     Returns 1 if the extended double-precision floating-point value `a' is less
3799     than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
3800     an exception.  Otherwise, the comparison is performed according to the
3801     IEC/IEEE Standard for Binary Floating-point Arithmetic.
3802     -------------------------------------------------------------------------------
3803     */
3804     flag floatx80_lt_quiet( floatx80 a, floatx80 b )
3805     {
3806         flag aSign, bSign;
3807     
3808         if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3809                   && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3810              || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3811                   && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3812            ) {
3813             if (    floatx80_is_signaling_nan( a )
3814                  || floatx80_is_signaling_nan( b ) ) {
3815                 float_raise( float_flag_invalid );
3816             }
3817             return 0;
3818         }
3819         aSign = extractFloatx80Sign( a );
3820         bSign = extractFloatx80Sign( b );
3821         if ( aSign != bSign ) {
3822             return
3823                    aSign
3824                 && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3825                      != 0 );
3826         }
3827         return
3828               aSign ? lt128( b.high, b.low, a.high, a.low )
3829             : lt128( a.high, a.low, b.high, b.low );
3830     
3831     }
3832     
3833     #endif
3834     
3835     #ifdef FLOAT128
3836     
3837     /*
3838     -------------------------------------------------------------------------------
3839     Returns the result of converting the quadruple-precision floating-point
3840     value `a' to the 32-bit two's complement integer format.  The conversion
3841     is performed according to the IEC/IEEE Standard for Binary Floating-point
3842     Arithmetic---which means in particular that the conversion is rounded
3843     according to the current rounding mode.  If `a' is a NaN, the largest
3844     positive integer is returned.  Otherwise, if the conversion overflows, the
3845     largest integer with the same sign as `a' is returned.
3846     -------------------------------------------------------------------------------
3847     */
3848     int32 float128_to_int32( float128 a )
3849     {
3850         flag aSign;
3851         int32 aExp, shiftCount;
3852         bits64 aSig0, aSig1;
3853     
3854         aSig1 = extractFloat128Frac1( a );
3855         aSig0 = extractFloat128Frac0( a );
3856         aExp = extractFloat128Exp( a );
3857         aSign = extractFloat128Sign( a );
3858         if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
3859         if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
3860         aSig0 |= ( aSig1 != 0 );
3861         shiftCount = 0x4028 - aExp;
3862         if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
3863         return roundAndPackInt32( aSign, aSig0 );
3864     
3865     }
3866     
3867     /*
3868     -------------------------------------------------------------------------------
3869     Returns the result of converting the quadruple-precision floating-point
3870     value `a' to the 32-bit two's complement integer format.  The conversion
3871     is performed according to the IEC/IEEE Standard for Binary Floating-point
3872     Arithmetic, except that the conversion is always rounded toward zero.  If
3873     `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
3874     conversion overflows, the largest integer with the same sign as `a' is
3875     returned.
3876     -------------------------------------------------------------------------------
3877     */
3878     int32 float128_to_int32_round_to_zero( float128 a )
3879     {
3880         flag aSign;
3881         int32 aExp, shiftCount;
3882         bits64 aSig0, aSig1, savedASig;
3883         int32 z;
3884     
3885         aSig1 = extractFloat128Frac1( a );
3886         aSig0 = extractFloat128Frac0( a );
3887         aExp = extractFloat128Exp( a );
3888         aSign = extractFloat128Sign( a );
3889         aSig0 |= ( aSig1 != 0 );
3890         shiftCount = 0x402F - aExp;
3891         if ( shiftCount < 17 ) {
3892             if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
3893             goto invalid;
3894         }
3895         else if ( 48 < shiftCount ) {
3896             if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
3897             return 0;
3898         }
3899         aSig0 |= LIT64( 0x0001000000000000 );
3900         savedASig = aSig0;
3901         aSig0 >>= shiftCount;
3902         z = aSig0;
3903         if ( aSign ) z = - z;
3904         if ( ( z < 0 ) ^ aSign ) {
3905      invalid:
3906             float_exception_flags |= float_flag_invalid;
3907             return aSign ? 0x80000000 : 0x7FFFFFFF;
3908         }
3909         if ( ( aSig0<<shiftCount ) != savedASig ) {
3910             float_exception_flags |= float_flag_inexact;
3911         }
3912         return z;
3913     
3914     }
3915     
3916     /*
3917     -------------------------------------------------------------------------------
3918     Returns the result of converting the quadruple-precision floating-point
3919     value `a' to the single-precision floating-point format.  The conversion
3920     is performed according to the IEC/IEEE Standard for Binary Floating-point
3921     Arithmetic.
3922     -------------------------------------------------------------------------------
3923     */
3924     float32 float128_to_float32( float128 a )
3925     {
3926         flag aSign;
3927         int32 aExp;
3928         bits64 aSig0, aSig1;
3929         bits32 zSig;
3930     
3931         aSig1 = extractFloat128Frac1( a );
3932         aSig0 = extractFloat128Frac0( a );
3933         aExp = extractFloat128Exp( a );
3934         aSign = extractFloat128Sign( a );
3935         if ( aExp == 0x7FFF ) {
3936             if ( aSig0 | aSig1 ) {
3937                 return commonNaNToFloat32( float128ToCommonNaN( a ) );
3938             }
3939             return packFloat32( aSign, 0xFF, 0 );
3940         }
3941         aSig0 |= ( aSig1 != 0 );
3942         shift64RightJamming( aSig0, 18, &aSig0 );
3943         zSig = aSig0;
3944         if ( aExp || zSig ) {
3945             zSig |= 0x40000000;
3946             aExp -= 0x3F81;
3947         }
3948         return roundAndPackFloat32( aSign, aExp, zSig );
3949     
3950     }
3951     
3952     /*
3953     -------------------------------------------------------------------------------
3954     Returns the result of converting the quadruple-precision floating-point
3955     value `a' to the double-precision floating-point format.  The conversion
3956     is performed according to the IEC/IEEE Standard for Binary Floating-point
3957     Arithmetic.
3958     -------------------------------------------------------------------------------
3959     */
3960     float64 float128_to_float64( float128 a )
3961     {
3962         flag aSign;
3963         int32 aExp;
3964         bits64 aSig0, aSig1;
3965     
3966         aSig1 = extractFloat128Frac1( a );
3967         aSig0 = extractFloat128Frac0( a );
3968         aExp = extractFloat128Exp( a );
3969         aSign = extractFloat128Sign( a );
3970         if ( aExp == 0x7FFF ) {
3971             if ( aSig0 | aSig1 ) {
3972                 return commonNaNToFloat64( float128ToCommonNaN( a ) );
3973             }
3974             return packFloat64( aSign, 0x7FF, 0 );
3975         }
3976         shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
3977         aSig0 |= ( aSig1 != 0 );
3978         if ( aExp || aSig0 ) {
3979             aSig0 |= LIT64( 0x4000000000000000 );
3980             aExp -= 0x3C01;
3981         }
3982         return roundAndPackFloat64( aSign, aExp, aSig0 );
3983     
3984     }
3985     
3986     #ifdef FLOATX80
3987     
3988     /*
3989     -------------------------------------------------------------------------------
3990     Returns the result of converting the quadruple-precision floating-point
3991     value `a' to the extended double-precision floating-point format.  The
3992     conversion is performed according to the IEC/IEEE Standard for Binary
3993     Floating-point Arithmetic.
3994     -------------------------------------------------------------------------------
3995     */
3996     floatx80 float128_to_floatx80( float128 a )
3997     {
3998         flag aSign;
3999         int32 aExp;
4000         bits64 aSig0, aSig1;
4001     
4002         aSig1 = extractFloat128Frac1( a );
4003         aSig0 = extractFloat128Frac0( a );
4004         aExp = extractFloat128Exp( a );
4005         aSign = extractFloat128Sign( a );
4006         if ( aExp == 0x7FFF ) {
4007             if ( aSig0 | aSig1 ) {
4008                 return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4009             }
4010             return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4011         }
4012         if ( aExp == 0 ) {
4013             if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4014             normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4015         }
4016         else {
4017             aSig0 |= LIT64( 0x0001000000000000 );
4018         }
4019         shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4020         return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4021     
4022     }
4023     
4024     #endif
4025     
4026     /*
4027     -------------------------------------------------------------------------------
4028     Rounds the quadruple-precision floating-point value `a' to an integer, and
4029     returns the result as a quadruple-precision floating-point value.  The
4030     operation is performed according to the IEC/IEEE Standard for Binary
4031     Floating-point Arithmetic.
4032     -------------------------------------------------------------------------------
4033     */
4034     float128 float128_round_to_int( float128 a )
4035     {
4036         flag aSign;
4037         int32 aExp;
4038         bits64 lastBitMask, roundBitsMask;
4039         int8 roundingMode;
4040         float128 z;
4041     
4042         aExp = extractFloat128Exp( a );
4043         if ( 0x402F <= aExp ) {
4044             if ( 0x406F <= aExp ) {
4045                 if (    ( aExp == 0x7FFF )
4046                      && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4047                    ) {
4048                     return propagateFloat128NaN( a, a );
4049                 }
4050                 return a;
4051             }
4052             lastBitMask = 1;
4053             lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4054             roundBitsMask = lastBitMask - 1;
4055             z = a;
4056             roundingMode = float_rounding_mode;
4057             if ( roundingMode == float_round_nearest_even ) {
4058                 if ( lastBitMask ) {
4059                     add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4060                     if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4061                 }
4062                 else {
4063                     if ( (sbits64) z.low < 0 ) {
4064                         ++z.high;
4065                         if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4066                     }
4067                 }
4068             }
4069             else if ( roundingMode != float_round_to_zero ) {
4070                 if (   extractFloat128Sign( z )
4071                      ^ ( roundingMode == float_round_up ) ) {
4072                     add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4073                 }
4074             }
4075             z.low &= ~ roundBitsMask;
4076         }
4077         else {
4078             if ( aExp <= 0x3FFE ) {
4079                 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4080                 float_exception_flags |= float_flag_inexact;
4081                 aSign = extractFloat128Sign( a );
4082                 switch ( float_rounding_mode ) {
4083                  case float_round_nearest_even:
4084                     if (    ( aExp == 0x3FFE )
4085                          && (   extractFloat128Frac0( a )
4086                               | extractFloat128Frac1( a ) )
4087                        ) {
4088                         return packFloat128( aSign, 0x3FFF, 0, 0 );
4089                     }
4090                     break;
4091                  case float_round_down:
4092                     return
4093                           aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4094                         : packFloat128( 0, 0, 0, 0 );
4095                  case float_round_up:
4096                     return
4097                           aSign ? packFloat128( 1, 0, 0, 0 )
4098                         : packFloat128( 0, 0x3FFF, 0, 0 );
4099                 }
4100                 return packFloat128( aSign, 0, 0, 0 );
4101             }
4102             lastBitMask = 1;
4103             lastBitMask <<= 0x402F - aExp;
4104             roundBitsMask = lastBitMask - 1;
4105             z.low = 0;
4106             z.high = a.high;
4107             roundingMode = float_rounding_mode;
4108             if ( roundingMode == float_round_nearest_even ) {
4109                 z.high += lastBitMask>>1;
4110                 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4111                     z.high &= ~ lastBitMask;
4112                 }
4113             }
4114             else if ( roundingMode != float_round_to_zero ) {
4115                 if (   extractFloat128Sign( z )
4116                      ^ ( roundingMode == float_round_up ) ) {
4117                     z.high |= ( a.low != 0 );
4118                     z.high += roundBitsMask;
4119                 }
4120             }
4121             z.high &= ~ roundBitsMask;
4122         }
4123         if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4124             float_exception_flags |= float_flag_inexact;
4125         }
4126         return z;
4127     
4128     }
4129     
4130     /*
4131     -------------------------------------------------------------------------------
4132     Returns the result of adding the absolute values of the quadruple-precision
4133     floating-point values `a' and `b'.  If `zSign' is true, the sum is negated
4134     before being returned.  `zSign' is ignored if the result is a NaN.  The
4135     addition is performed according to the IEC/IEEE Standard for Binary
4136     Floating-point Arithmetic.
4137     -------------------------------------------------------------------------------
4138     */
4139     static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4140     {
4141         int32 aExp, bExp, zExp;
4142         bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4143         int32 expDiff;
4144     
4145         aSig1 = extractFloat128Frac1( a );
4146         aSig0 = extractFloat128Frac0( a );
4147         aExp = extractFloat128Exp( a );
4148         bSig1 = extractFloat128Frac1( b );
4149         bSig0 = extractFloat128Frac0( b );
4150         bExp = extractFloat128Exp( b );
4151         expDiff = aExp - bExp;
4152         if ( 0 < expDiff ) {
4153             if ( aExp == 0x7FFF ) {
4154                 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4155                 return a;
4156             }
4157             if ( bExp == 0 ) {
4158                 --expDiff;
4159             }
4160             else {
4161                 bSig0 |= LIT64( 0x0001000000000000 );
4162             }
4163             shift128ExtraRightJamming(
4164                 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4165             zExp = aExp;
4166         }
4167         else if ( expDiff < 0 ) {
4168             if ( bExp == 0x7FFF ) {
4169                 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4170                 return packFloat128( zSign, 0x7FFF, 0, 0 );
4171             }
4172             if ( aExp == 0 ) {
4173                 ++expDiff;
4174             }
4175             else {
4176                 aSig0 |= LIT64( 0x0001000000000000 );
4177             }
4178             shift128ExtraRightJamming(
4179                 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4180             zExp = bExp;
4181         }
4182         else {
4183             if ( aExp == 0x7FFF ) {
4184                 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4185                     return propagateFloat128NaN( a, b );
4186                 }
4187                 return a;
4188             }
4189             add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4190             if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4191             zSig2 = 0;
4192             zSig0 |= LIT64( 0x0002000000000000 );
4193             zExp = aExp;
4194             goto shiftRight1;
4195         }
4196         aSig0 |= LIT64( 0x0001000000000000 );
4197         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4198         --zExp;
4199         if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4200         ++zExp;
4201      shiftRight1:
4202         shift128ExtraRightJamming(
4203             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4204      roundAndPack:
4205         return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4206     
4207     }
4208     
4209     /*
4210     -------------------------------------------------------------------------------
4211     Returns the result of subtracting the absolute values of the quadruple-
4212     precision floating-point values `a' and `b'.  If `zSign' is true, the
4213     difference is negated before being returned.  `zSign' is ignored if the
4214     result is a NaN.  The subtraction is performed according to the IEC/IEEE
4215     Standard for Binary Floating-point Arithmetic.
4216     -------------------------------------------------------------------------------
4217     */
4218     static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4219     {
4220         int32 aExp, bExp, zExp;
4221         bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4222         int32 expDiff;
4223         float128 z;
4224     
4225         aSig1 = extractFloat128Frac1( a );
4226         aSig0 = extractFloat128Frac0( a );
4227         aExp = extractFloat128Exp( a );
4228         bSig1 = extractFloat128Frac1( b );
4229         bSig0 = extractFloat128Frac0( b );
4230         bExp = extractFloat128Exp( b );
4231         expDiff = aExp - bExp;
4232         shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4233         shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4234         if ( 0 < expDiff ) goto aExpBigger;
4235         if ( expDiff < 0 ) goto bExpBigger;
4236         if ( aExp == 0x7FFF ) {
4237             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4238                 return propagateFloat128NaN( a, b );
4239             }
4240             float_raise( float_flag_invalid );
4241             z.low = float128_default_nan_low;
4242             z.high = float128_default_nan_high;
4243             return z;
4244         }
4245         if ( aExp == 0 ) {
4246             aExp = 1;
4247             bExp = 1;
4248         }
4249         if ( bSig0 < aSig0 ) goto aBigger;
4250         if ( aSig0 < bSig0 ) goto bBigger;
4251         if ( bSig1 < aSig1 ) goto aBigger;
4252         if ( aSig1 < bSig1 ) goto bBigger;
4253         return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4254      bExpBigger:
4255         if ( bExp == 0x7FFF ) {
4256             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4257             return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4258         }
4259         if ( aExp == 0 ) {
4260             ++expDiff;
4261         }
4262         else {
4263             aSig0 |= LIT64( 0x4000000000000000 );
4264         }
4265         shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4266         bSig0 |= LIT64( 0x4000000000000000 );
4267      bBigger:
4268         sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4269         zExp = bExp;
4270         zSign ^= 1;
4271         goto normalizeRoundAndPack;
4272      aExpBigger:
4273         if ( aExp == 0x7FFF ) {
4274             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4275             return a;
4276         }
4277         if ( bExp == 0 ) {
4278             --expDiff;
4279         }
4280         else {
4281             bSig0 |= LIT64( 0x4000000000000000 );
4282         }
4283         shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4284         aSig0 |= LIT64( 0x4000000000000000 );
4285      aBigger:
4286         sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4287         zExp = aExp;
4288      normalizeRoundAndPack:
4289         --zExp;
4290         return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4291     
4292     }
4293     
4294     /*
4295     -------------------------------------------------------------------------------
4296     Returns the result of adding the quadruple-precision floating-point values
4297     `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4298     for Binary Floating-point Arithmetic.
4299     -------------------------------------------------------------------------------
4300     */
4301     float128 float128_add( float128 a, float128 b )
4302     {
4303         flag aSign, bSign;
4304     
4305         aSign = extractFloat128Sign( a );
4306         bSign = extractFloat128Sign( b );
4307         if ( aSign == bSign ) {
4308             return addFloat128Sigs( a, b, aSign );
4309         }
4310         else {
4311             return subFloat128Sigs( a, b, aSign );
4312         }
4313     
4314     }
4315     
4316     /*
4317     -------------------------------------------------------------------------------
4318     Returns the result of subtracting the quadruple-precision floating-point
4319     values `a' and `b'.  The operation is performed according to the IEC/IEEE
4320     Standard for Binary Floating-point Arithmetic.
4321     -------------------------------------------------------------------------------
4322     */
4323     float128 float128_sub( float128 a, float128 b )
4324     {
4325         flag aSign, bSign;
4326     
4327         aSign = extractFloat128Sign( a );
4328         bSign = extractFloat128Sign( b );
4329         if ( aSign == bSign ) {
4330             return subFloat128Sigs( a, b, aSign );
4331         }
4332         else {
4333             return addFloat128Sigs( a, b, aSign );
4334         }
4335     
4336     }
4337     
4338     /*
4339     -------------------------------------------------------------------------------
4340     Returns the result of multiplying the quadruple-precision floating-point
4341     values `a' and `b'.  The operation is performed according to the IEC/IEEE
4342     Standard for Binary Floating-point Arithmetic.
4343     -------------------------------------------------------------------------------
4344     */
4345     float128 float128_mul( float128 a, float128 b )
4346     {
4347         flag aSign, bSign, zSign;
4348         int32 aExp, bExp, zExp;
4349         bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4350         float128 z;
4351     
4352         aSig1 = extractFloat128Frac1( a );
4353         aSig0 = extractFloat128Frac0( a );
4354         aExp = extractFloat128Exp( a );
4355         aSign = extractFloat128Sign( a );
4356         bSig1 = extractFloat128Frac1( b );
4357         bSig0 = extractFloat128Frac0( b );
4358         bExp = extractFloat128Exp( b );
4359         bSign = extractFloat128Sign( b );
4360         zSign = aSign ^ bSign;
4361         if ( aExp == 0x7FFF ) {
4362             if (    ( aSig0 | aSig1 )
4363                  || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4364                 return propagateFloat128NaN( a, b );
4365             }
4366             if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4367             return packFloat128( zSign, 0x7FFF, 0, 0 );
4368         }
4369         if ( bExp == 0x7FFF ) {
4370             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4371             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4372      invalid:
4373                 float_raise( float_flag_invalid );
4374                 z.low = float128_default_nan_low;
4375                 z.high = float128_default_nan_high;
4376                 return z;
4377             }
4378             return packFloat128( zSign, 0x7FFF, 0, 0 );
4379         }
4380         if ( aExp == 0 ) {
4381             if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4382             normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4383         }
4384         if ( bExp == 0 ) {
4385             if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4386             normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4387         }
4388         zExp = aExp + bExp - 0x4000;
4389         aSig0 |= LIT64( 0x0001000000000000 );
4390         shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4391         mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4392         add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4393         zSig2 |= ( zSig3 != 0 );
4394         if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4395             shift128ExtraRightJamming(
4396                 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4397             ++zExp;
4398         }
4399         return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4400     
4401     }
4402     
4403     /*
4404     -------------------------------------------------------------------------------
4405     Returns the result of dividing the quadruple-precision floating-point value
4406     `a' by the corresponding value `b'.  The operation is performed according to
4407     the IEC/IEEE Standard for Binary Floating-point Arithmetic.
4408     -------------------------------------------------------------------------------
4409     */
4410     float128 float128_div( float128 a, float128 b )
4411     {
4412         flag aSign, bSign, zSign;
4413         int32 aExp, bExp, zExp;
4414         bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4415         bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4416         float128 z;
4417     
4418         aSig1 = extractFloat128Frac1( a );
4419         aSig0 = extractFloat128Frac0( a );
4420         aExp = extractFloat128Exp( a );
4421         aSign = extractFloat128Sign( a );
4422         bSig1 = extractFloat128Frac1( b );
4423         bSig0 = extractFloat128Frac0( b );
4424         bExp = extractFloat128Exp( b );
4425         bSign = extractFloat128Sign( b );
4426         zSign = aSign ^ bSign;
4427         if ( aExp == 0x7FFF ) {
4428             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4429             if ( bExp == 0x7FFF ) {
4430                 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4431                 goto invalid;
4432             }
4433             return packFloat128( zSign, 0x7FFF, 0, 0 );
4434         }
4435         if ( bExp == 0x7FFF ) {
4436             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4437             return packFloat128( zSign, 0, 0, 0 );
4438         }
4439         if ( bExp == 0 ) {
4440             if ( ( bSig0 | bSig1 ) == 0 ) {
4441                 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4442      invalid:
4443                     float_raise( float_flag_invalid );
4444                     z.low = float128_default_nan_low;
4445                     z.high = float128_default_nan_high;
4446                     return z;
4447                 }
4448                 float_raise( float_flag_divbyzero );
4449                 return packFloat128( zSign, 0x7FFF, 0, 0 );
4450             }
4451             normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4452         }
4453         if ( aExp == 0 ) {
4454             if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4455             normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4456         }
4457         zExp = aExp - bExp + 0x3FFD;
4458         shortShift128Left(
4459             aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4460         shortShift128Left(
4461             bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4462         if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4463             shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4464             ++zExp;
4465         }
4466         zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4467         mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4468         sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4469         while ( (sbits64) rem0 < 0 ) {
4470             --zSig0;
4471             add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4472         }
4473         zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4474         if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4475             mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4476             sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4477             while ( (sbits64) rem1 < 0 ) {
4478                 --zSig1;
4479                 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4480             }
4481             zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4482         }
4483         shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4484         return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4485     
4486     }
4487     
4488     /*
4489     -------------------------------------------------------------------------------
4490     Returns the remainder of the quadruple-precision floating-point value `a'
4491     with respect to the corresponding value `b'.  The operation is performed
4492     according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
4493     -------------------------------------------------------------------------------
4494     */
4495     float128 float128_rem( float128 a, float128 b )
4496     {
4497         flag aSign, bSign, zSign;
4498         int32 aExp, bExp, expDiff;
4499         bits64 aSig0, aSig1, bSig0, bSig1;
4500         bits64 q, term0, term1, term2, allZero, alternateASig0, alternateASig1;
4501         bits64 sigMean1;
4502         sbits64 sigMean0;
4503         float128 z;
4504     
4505         aSig1 = extractFloat128Frac1( a );
4506         aSig0 = extractFloat128Frac0( a );
4507         aExp = extractFloat128Exp( a );
4508         aSign = extractFloat128Sign( a );
4509         bSig1 = extractFloat128Frac1( b );
4510         bSig0 = extractFloat128Frac0( b );
4511         bExp = extractFloat128Exp( b );
4512         bSign = extractFloat128Sign( b );
4513         if ( aExp == 0x7FFF ) {
4514             if (    ( aSig0 | aSig1 )
4515                  || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4516                 return propagateFloat128NaN( a, b );
4517             }
4518             goto invalid;
4519         }
4520         if ( bExp == 0x7FFF ) {
4521             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4522             return a;
4523         }
4524         if ( bExp == 0 ) {
4525             if ( ( bSig0 | bSig1 ) == 0 ) {
4526      invalid:
4527                 float_raise( float_flag_invalid );
4528                 z.low = float128_default_nan_low;
4529                 z.high = float128_default_nan_high;
4530                 return z;
4531             }
4532             normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4533         }
4534         if ( aExp == 0 ) {
4535             if ( ( aSig0 | aSig1 ) == 0 ) return a;
4536             normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4537         }
4538         expDiff = aExp - bExp;
4539         if ( expDiff < -1 ) return a;
4540         shortShift128Left(
4541             aSig0 | LIT64( 0x0001000000000000 ),
4542             aSig1,
4543             15 - ( expDiff < 0 ),
4544             &aSig0,
4545             &aSig1
4546         );
4547         shortShift128Left(
4548             bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4549         q = le128( bSig0, bSig1, aSig0, aSig1 );
4550         if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4551         expDiff -= 64;
4552         while ( 0 < expDiff ) {
4553             q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4554             q = ( 4 < q ) ? q - 4 : 0;
4555             mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4556             shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4557             shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4558             sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4559             expDiff -= 61;
4560         }
4561         if ( -64 < expDiff ) {
4562             q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4563             q = ( 4 < q ) ? q - 4 : 0;
4564             q >>= - expDiff;
4565             shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4566             expDiff += 52;
4567             if ( expDiff < 0 ) {
4568                 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4569             }
4570             else {
4571                 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4572             }
4573             mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4574             sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4575         }
4576         else {
4577             shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4578             shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4579         }
4580         do {
4581             alternateASig0 = aSig0;
4582             alternateASig1 = aSig1;
4583             ++q;
4584             sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4585         } while ( 0 <= (sbits64) aSig0 );
4586         add128(
4587             aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
4588         if (    ( sigMean0 < 0 )
4589              || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
4590             aSig0 = alternateASig0;
4591             aSig1 = alternateASig1;
4592         }
4593         zSign = ( (sbits64) aSig0 < 0 );
4594         if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
4595         return
4596             normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
4597     
4598     }
4599     
4600     /*
4601     -------------------------------------------------------------------------------
4602     Returns the square root of the quadruple-precision floating-point value `a'.
4603     The operation is performed according to the IEC/IEEE Standard for Binary
4604     Floating-point Arithmetic.
4605     -------------------------------------------------------------------------------
4606     */
4607     float128 float128_sqrt( float128 a )
4608     {
4609         flag aSign;
4610         int32 aExp, zExp;
4611         bits64 aSig0, aSig1, zSig0, zSig1, zSig2;
4612         bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4613         bits64 shiftedRem0, shiftedRem1;
4614         float128 z;
4615     
4616         aSig1 = extractFloat128Frac1( a );
4617         aSig0 = extractFloat128Frac0( a );
4618         aExp = extractFloat128Exp( a );
4619         aSign = extractFloat128Sign( a );
4620         if ( aExp == 0x7FFF ) {
4621             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
4622             if ( ! aSign ) return a;
4623             goto invalid;
4624         }
4625         if ( aSign ) {
4626             if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
4627      invalid:
4628             float_raise( float_flag_invalid );
4629             z.low = float128_default_nan_low;
4630             z.high = float128_default_nan_high;
4631             return z;
4632         }
4633         if ( aExp == 0 ) {
4634             if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
4635             normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4636         }
4637         zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
4638         aSig0 |= LIT64( 0x0001000000000000 );
4639         zSig0 = estimateSqrt32( aExp, aSig0>>17 );
4640         zSig0 <<= 31;
4641         shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
4642         zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
4643         if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
4644         shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
4645         mul64To128( zSig0, zSig0, &term0, &term1 );
4646         sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4647         while ( (sbits64) rem0 < 0 ) {
4648             --zSig0;
4649             shortShift128Left( 0, zSig0, 1, &term0, &term1 );
4650             term1 |= 1;
4651             add128( rem0, rem1, term0, term1, &rem0, &rem1 );
4652         }
4653         shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
4654         zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
4655         if ( ( zSig1 & 0x3FFF ) <= 5 ) {
4656             if ( zSig1 == 0 ) zSig1 = 1;
4657             mul64To128( zSig0, zSig1, &term1, &term2 );
4658             shortShift128Left( term1, term2, 1, &term1, &term2 );
4659             sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4660             mul64To128( zSig1, zSig1, &term2, &term3 );
4661             sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4662             while ( (sbits64) rem1 < 0 ) {
4663                 --zSig1;
4664                 shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
4665                 term3 |= 1;
4666                 add192(
4667                     rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
4668             }
4669             zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4670         }
4671         shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4672         return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
4673     
4674     }
4675     
4676     /*
4677     -------------------------------------------------------------------------------
4678     Returns 1 if the quadruple-precision floating-point value `a' is equal to
4679     the corresponding value `b', and 0 otherwise.  The comparison is performed
4680     according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
4681     -------------------------------------------------------------------------------
4682     */
4683     flag float128_eq( float128 a, float128 b )
4684     {
4685     
4686         if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
4687                   && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4688              || (    ( extractFloat128Exp( b ) == 0x7FFF )
4689                   && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4690            ) {
4691             if (    float128_is_signaling_nan( a )
4692                  || float128_is_signaling_nan( b ) ) {
4693                 float_raise( float_flag_invalid );
4694             }
4695             return 0;
4696         }
4697         return
4698                ( a.low == b.low )
4699             && (    ( a.high == b.high )
4700                  || (    ( a.low == 0 )
4701                       && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
4702                );
4703     
4704     }
4705     
4706     /*
4707     -------------------------------------------------------------------------------
4708     Returns 1 if the quadruple-precision floating-point value `a' is less than
4709     or equal to the corresponding value `b', and 0 otherwise.  The comparison
4710     is performed according to the IEC/IEEE Standard for Binary Floating-point
4711     Arithmetic.
4712     -------------------------------------------------------------------------------
4713     */
4714     flag float128_le( float128 a, float128 b )
4715     {
4716         flag aSign, bSign;
4717     
4718         if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
4719                   && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4720              || (    ( extractFloat128Exp( b ) == 0x7FFF )
4721                   && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4722            ) {
4723             float_raise( float_flag_invalid );
4724             return 0;
4725         }
4726         aSign = extractFloat128Sign( a );
4727         bSign = extractFloat128Sign( b );
4728         if ( aSign != bSign ) {
4729             return
4730                    aSign
4731                 || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4732                      == 0 );
4733         }
4734         return
4735               aSign ? le128( b.high, b.low, a.high, a.low )
4736             : le128( a.high, a.low, b.high, b.low );
4737     
4738     }
4739     
4740     /*
4741     -------------------------------------------------------------------------------
4742     Returns 1 if the quadruple-precision floating-point value `a' is less than
4743     the corresponding value `b', and 0 otherwise.  The comparison is performed
4744     according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
4745     -------------------------------------------------------------------------------
4746     */
4747     flag float128_lt( float128 a, float128 b )
4748     {
4749         flag aSign, bSign;
4750     
4751         if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
4752                   && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4753              || (    ( extractFloat128Exp( b ) == 0x7FFF )
4754                   && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4755            ) {
4756             float_raise( float_flag_invalid );
4757             return 0;
4758         }
4759         aSign = extractFloat128Sign( a );
4760         bSign = extractFloat128Sign( b );
4761         if ( aSign != bSign ) {
4762             return
4763                    aSign
4764                 && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4765                      != 0 );
4766         }
4767         return
4768               aSign ? lt128( b.high, b.low, a.high, a.low )
4769             : lt128( a.high, a.low, b.high, b.low );
4770     
4771     }
4772     
4773     /*
4774     -------------------------------------------------------------------------------
4775     Returns 1 if the quadruple-precision floating-point value `a' is equal to
4776     the corresponding value `b', and 0 otherwise.  The invalid exception is
4777     raised if either operand is a NaN.  Otherwise, the comparison is performed
4778     according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
4779     -------------------------------------------------------------------------------
4780     */
4781     flag float128_eq_signaling( float128 a, float128 b )
4782     {
4783     
4784         if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
4785                   && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4786              || (    ( extractFloat128Exp( b ) == 0x7FFF )
4787                   && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4788            ) {
4789             float_raise( float_flag_invalid );
4790             return 0;
4791         }
4792         return
4793                ( a.low == b.low )
4794             && (    ( a.high == b.high )
4795                  || (    ( a.low == 0 )
4796                       && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
4797                );
4798     
4799     }
4800     
4801     /*
4802     -------------------------------------------------------------------------------
4803     Returns 1 if the quadruple-precision floating-point value `a' is less than
4804     or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4805     cause an exception.  Otherwise, the comparison is performed according to the
4806     IEC/IEEE Standard for Binary Floating-point Arithmetic.
4807     -------------------------------------------------------------------------------
4808     */
4809     flag float128_le_quiet( float128 a, float128 b )
4810     {
4811         flag aSign, bSign;
4812     
4813         if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
4814                   && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4815              || (    ( extractFloat128Exp( b ) == 0x7FFF )
4816                   && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4817            ) {
4818             if (    float128_is_signaling_nan( a )
4819                  || float128_is_signaling_nan( b ) ) {
4820                 float_raise( float_flag_invalid );
4821             }
4822             return 0;
4823         }
4824         aSign = extractFloat128Sign( a );
4825         bSign = extractFloat128Sign( b );
4826         if ( aSign != bSign ) {
4827             return
4828                    aSign
4829                 || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4830                      == 0 );
4831         }
4832         return
4833               aSign ? le128( b.high, b.low, a.high, a.low )
4834             : le128( a.high, a.low, b.high, b.low );
4835     
4836     }
4837     
4838     /*
4839     -------------------------------------------------------------------------------
4840     Returns 1 if the quadruple-precision floating-point value `a' is less than
4841     the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4842     exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4843     Standard for Binary Floating-point Arithmetic.
4844     -------------------------------------------------------------------------------
4845     */
4846     flag float128_lt_quiet( float128 a, float128 b )
4847     {
4848         flag aSign, bSign;
4849     
4850         if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
4851                   && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
4852              || (    ( extractFloat128Exp( b ) == 0x7FFF )
4853                   && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
4854            ) {
4855             if (    float128_is_signaling_nan( a )
4856                  || float128_is_signaling_nan( b ) ) {
4857                 float_raise( float_flag_invalid );
4858             }
4859             return 0;
4860         }
4861         aSign = extractFloat128Sign( a );
4862         bSign = extractFloat128Sign( b );
4863         if ( aSign != bSign ) {
4864             return
4865                    aSign
4866                 && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4867                      != 0 );
4868         }
4869         return
4870               aSign ? lt128( b.high, b.low, a.high, a.low )
4871             : lt128( a.high, a.low, b.high, b.low );
4872     
4873     }
4874     
4875     #endif
4876     
4877