| @@ -1,1663 +1,1663 @@ | | | @@ -1,1663 +1,1663 @@ |
1 | /* $NetBSD: softfloat.c,v 1.13 2013/11/22 17:04:24 martin Exp $ */ | | 1 | /* $NetBSD: softfloat.c,v 1.14 2016/03/29 18:42:29 martin Exp $ */ |
2 | | | 2 | |
3 | /* | | 3 | /* |
4 | * This version hacked for use with gcc -msoft-float by bjh21. | | 4 | * This version hacked for use with gcc -msoft-float by bjh21. |
5 | * (Mostly a case of #ifdefing out things GCC doesn't need or provides | | 5 | * (Mostly a case of #ifdefing out things GCC doesn't need or provides |
6 | * itself). | | 6 | * itself). |
7 | */ | | 7 | */ |
8 | | | 8 | |
9 | /* | | 9 | /* |
10 | * Things you may want to define: | | 10 | * Things you may want to define: |
11 | * | | 11 | * |
12 | * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with | | 12 | * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with |
13 | * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them | | 13 | * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them |
14 | * properly renamed. | | 14 | * properly renamed. |
15 | */ | | 15 | */ |
16 | | | 16 | |
17 | /* | | 17 | /* |
18 | =============================================================================== | | 18 | =============================================================================== |
19 | | | 19 | |
20 | This C source file is part of the SoftFloat IEC/IEEE Floating-point | | 20 | This C source file is part of the SoftFloat IEC/IEEE Floating-point |
21 | Arithmetic Package, Release 2a. | | 21 | Arithmetic Package, Release 2a. |
22 | | | 22 | |
23 | Written by John R. Hauser. This work was made possible in part by the | | 23 | Written by John R. Hauser. This work was made possible in part by the |
24 | International Computer Science Institute, located at Suite 600, 1947 Center | | 24 | International Computer Science Institute, located at Suite 600, 1947 Center |
25 | Street, Berkeley, California 94704. Funding was partially provided by the | | 25 | Street, Berkeley, California 94704. Funding was partially provided by the |
26 | National Science Foundation under grant MIP-9311980. The original version | | 26 | National Science Foundation under grant MIP-9311980. The original version |
27 | of this code was written as part of a project to build a fixed-point vector | | 27 | of this code was written as part of a project to build a fixed-point vector |
28 | processor in collaboration with the University of California at Berkeley, | | 28 | processor in collaboration with the University of California at Berkeley, |
29 | overseen by Profs. Nelson Morgan and John Wawrzynek. More information | | 29 | overseen by Profs. Nelson Morgan and John Wawrzynek. More information |
30 | is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ | | 30 | is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/ |
31 | arithmetic/SoftFloat.html'. | | 31 | arithmetic/SoftFloat.html'. |
32 | | | 32 | |
33 | THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort | | 33 | THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort |
34 | has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT | | 34 | has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT |
35 | TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO | | 35 | TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO |
36 | PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY | | 36 | PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY |
37 | AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. | | 37 | AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE. |
38 | | | 38 | |
39 | Derivative works are acceptable, even for commercial purposes, so long as | | 39 | Derivative works are acceptable, even for commercial purposes, so long as |
40 | (1) they include prominent notice that the work is derivative, and (2) they | | 40 | (1) they include prominent notice that the work is derivative, and (2) they |
41 | include prominent notice akin to these four paragraphs for those parts of | | 41 | include prominent notice akin to these four paragraphs for those parts of |
42 | this code that are retained. | | 42 | this code that are retained. |
43 | | | 43 | |
44 | =============================================================================== | | 44 | =============================================================================== |
45 | */ | | 45 | */ |
46 | | | 46 | |
47 | #include <sys/cdefs.h> | | 47 | #include <sys/cdefs.h> |
48 | #if defined(LIBC_SCCS) && !defined(lint) | | 48 | #if defined(LIBC_SCCS) && !defined(lint) |
49 | __RCSID("$NetBSD: softfloat.c,v 1.13 2013/11/22 17:04:24 martin Exp $"); | | 49 | __RCSID("$NetBSD: softfloat.c,v 1.14 2016/03/29 18:42:29 martin Exp $"); |
50 | #endif /* LIBC_SCCS and not lint */ | | 50 | #endif /* LIBC_SCCS and not lint */ |
51 | | | 51 | |
52 | #ifdef SOFTFLOAT_FOR_GCC | | 52 | #ifdef SOFTFLOAT_FOR_GCC |
53 | #include "softfloat-for-gcc.h" | | 53 | #include "softfloat-for-gcc.h" |
54 | #endif | | 54 | #endif |
55 | | | 55 | |
56 | #include "milieu.h" | | 56 | #include "milieu.h" |
57 | #include "softfloat.h" | | 57 | #include "softfloat.h" |
58 | | | 58 | |
59 | /* | | 59 | /* |
60 | * Conversions between floats as stored in memory and floats as | | 60 | * Conversions between floats as stored in memory and floats as |
61 | * SoftFloat uses them | | 61 | * SoftFloat uses them |
62 | */ | | 62 | */ |
63 | #ifndef FLOAT64_DEMANGLE | | 63 | #ifndef FLOAT64_DEMANGLE |
64 | #define FLOAT64_DEMANGLE(a) (a) | | 64 | #define FLOAT64_DEMANGLE(a) (a) |
65 | #endif | | 65 | #endif |
66 | #ifndef FLOAT64_MANGLE | | 66 | #ifndef FLOAT64_MANGLE |
67 | #define FLOAT64_MANGLE(a) (a) | | 67 | #define FLOAT64_MANGLE(a) (a) |
68 | #endif | | 68 | #endif |
69 | | | 69 | |
70 | /* | | 70 | /* |
71 | ------------------------------------------------------------------------------- | | 71 | ------------------------------------------------------------------------------- |
72 | Floating-point rounding mode, extended double-precision rounding precision, | | 72 | Floating-point rounding mode, extended double-precision rounding precision, |
73 | and exception flags. | | 73 | and exception flags. |
74 | ------------------------------------------------------------------------------- | | 74 | ------------------------------------------------------------------------------- |
75 | */ | | 75 | */ |
76 | #ifndef set_float_rounding_mode | | 76 | #ifndef set_float_rounding_mode |
77 | fp_rnd float_rounding_mode = float_round_nearest_even; | | 77 | fp_rnd float_rounding_mode = float_round_nearest_even; |
78 | fp_except float_exception_flags = 0; | | 78 | fp_except float_exception_flags = 0; |
79 | #endif | | 79 | #endif |
80 | #ifndef set_float_exception_inexact_flag | | 80 | #ifndef set_float_exception_inexact_flag |
81 | #define set_float_exception_inexact_flag() \ | | 81 | #define set_float_exception_inexact_flag() \ |
82 | ((void)(float_exception_flags |= float_flag_inexact)) | | 82 | ((void)(float_exception_flags |= float_flag_inexact)) |
83 | #endif | | 83 | #endif |
84 | #ifdef FLOATX80 | | 84 | #ifdef FLOATX80 |
85 | int8 floatx80_rounding_precision = 80; | | 85 | int8 floatx80_rounding_precision = 80; |
86 | #endif | | 86 | #endif |
87 | | | 87 | |
88 | /* | | 88 | /* |
89 | ------------------------------------------------------------------------------- | | 89 | ------------------------------------------------------------------------------- |
90 | Primitive arithmetic functions, including multi-word arithmetic, and | | 90 | Primitive arithmetic functions, including multi-word arithmetic, and |
91 | division and square root approximations. (Can be specialized to target if | | 91 | division and square root approximations. (Can be specialized to target if |
92 | desired.) | | 92 | desired.) |
93 | ------------------------------------------------------------------------------- | | 93 | ------------------------------------------------------------------------------- |
94 | */ | | 94 | */ |
95 | #include "softfloat-macros" | | 95 | #include "softfloat-macros" |
96 | | | 96 | |
97 | /* | | 97 | /* |
98 | ------------------------------------------------------------------------------- | | 98 | ------------------------------------------------------------------------------- |
99 | Functions and definitions to determine: (1) whether tininess for underflow | | 99 | Functions and definitions to determine: (1) whether tininess for underflow |
100 | is detected before or after rounding by default, (2) what (if anything) | | 100 | is detected before or after rounding by default, (2) what (if anything) |
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished | | 101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished |
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs | | 102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs |
103 | are propagated from function inputs to output. These details are target- | | 103 | are propagated from function inputs to output. These details are target- |
104 | specific. | | 104 | specific. |
105 | ------------------------------------------------------------------------------- | | 105 | ------------------------------------------------------------------------------- |
106 | */ | | 106 | */ |
107 | #include "softfloat-specialize" | | 107 | #include "softfloat-specialize" |
108 | | | 108 | |
109 | #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128) | | 109 | #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128) |
110 | /* | | 110 | /* |
111 | ------------------------------------------------------------------------------- | | 111 | ------------------------------------------------------------------------------- |
112 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 | | 112 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 |
113 | and 7, and returns the properly rounded 32-bit integer corresponding to the | | 113 | and 7, and returns the properly rounded 32-bit integer corresponding to the |
114 | input. If `zSign' is 1, the input is negated before being converted to an | | 114 | input. If `zSign' is 1, the input is negated before being converted to an |
115 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input | | 115 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input |
116 | is simply rounded to an integer, with the inexact exception raised if the | | 116 | is simply rounded to an integer, with the inexact exception raised if the |
117 | input cannot be represented exactly as an integer. However, if the fixed- | | 117 | input cannot be represented exactly as an integer. However, if the fixed- |
118 | point input is too large, the invalid exception is raised and the largest | | 118 | point input is too large, the invalid exception is raised and the largest |
119 | positive or negative integer is returned. | | 119 | positive or negative integer is returned. |
120 | ------------------------------------------------------------------------------- | | 120 | ------------------------------------------------------------------------------- |
121 | */ | | 121 | */ |
122 | static int32 roundAndPackInt32( flag zSign, bits64 absZ ) | | 122 | static int32 roundAndPackInt32( flag zSign, bits64 absZ ) |
123 | { | | 123 | { |
124 | int8 roundingMode; | | 124 | int8 roundingMode; |
125 | flag roundNearestEven; | | 125 | flag roundNearestEven; |
126 | int8 roundIncrement, roundBits; | | 126 | int8 roundIncrement, roundBits; |
127 | int32 z; | | 127 | int32 z; |
128 | | | 128 | |
129 | roundingMode = float_rounding_mode; | | 129 | roundingMode = float_rounding_mode; |
130 | roundNearestEven = ( roundingMode == float_round_nearest_even ); | | 130 | roundNearestEven = ( roundingMode == float_round_nearest_even ); |
131 | roundIncrement = 0x40; | | 131 | roundIncrement = 0x40; |
132 | if ( ! roundNearestEven ) { | | 132 | if ( ! roundNearestEven ) { |
133 | if ( roundingMode == float_round_to_zero ) { | | 133 | if ( roundingMode == float_round_to_zero ) { |
134 | roundIncrement = 0; | | 134 | roundIncrement = 0; |
135 | } | | 135 | } |
136 | else { | | 136 | else { |
137 | roundIncrement = 0x7F; | | 137 | roundIncrement = 0x7F; |
138 | if ( zSign ) { | | 138 | if ( zSign ) { |
139 | if ( roundingMode == float_round_up ) roundIncrement = 0; | | 139 | if ( roundingMode == float_round_up ) roundIncrement = 0; |
140 | } | | 140 | } |
141 | else { | | 141 | else { |
142 | if ( roundingMode == float_round_down ) roundIncrement = 0; | | 142 | if ( roundingMode == float_round_down ) roundIncrement = 0; |
143 | } | | 143 | } |
144 | } | | 144 | } |
145 | } | | 145 | } |
146 | roundBits = (int8)(absZ & 0x7F); | | 146 | roundBits = (int8)(absZ & 0x7F); |
147 | absZ = ( absZ + roundIncrement )>>7; | | 147 | absZ = ( absZ + roundIncrement )>>7; |
148 | absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); | | 148 | absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); |
149 | z = (int32)absZ; | | 149 | z = (int32)absZ; |
150 | if ( zSign ) z = - z; | | 150 | if ( zSign ) z = - z; |
151 | if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { | | 151 | if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { |
152 | float_raise( float_flag_invalid ); | | 152 | float_raise( float_flag_invalid ); |
153 | return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; | | 153 | return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; |
154 | } | | 154 | } |
155 | if ( roundBits ) set_float_exception_inexact_flag(); | | 155 | if ( roundBits ) set_float_exception_inexact_flag(); |
156 | return z; | | 156 | return z; |
157 | | | 157 | |
158 | } | | 158 | } |
159 | | | 159 | |
160 | /* | | 160 | /* |
161 | ------------------------------------------------------------------------------- | | 161 | ------------------------------------------------------------------------------- |
162 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and | | 162 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and |
163 | `absZ1', with binary point between bits 63 and 64 (between the input words), | | 163 | `absZ1', with binary point between bits 63 and 64 (between the input words), |
164 | and returns the properly rounded 64-bit integer corresponding to the input. | | 164 | and returns the properly rounded 64-bit integer corresponding to the input. |
165 | If `zSign' is 1, the input is negated before being converted to an integer. | | 165 | If `zSign' is 1, the input is negated before being converted to an integer. |
166 | Ordinarily, the fixed-point input is simply rounded to an integer, with | | 166 | Ordinarily, the fixed-point input is simply rounded to an integer, with |
167 | the inexact exception raised if the input cannot be represented exactly as | | 167 | the inexact exception raised if the input cannot be represented exactly as |
168 | an integer. However, if the fixed-point input is too large, the invalid | | 168 | an integer. However, if the fixed-point input is too large, the invalid |
169 | exception is raised and the largest positive or negative integer is | | 169 | exception is raised and the largest positive or negative integer is |
170 | returned. | | 170 | returned. |
171 | ------------------------------------------------------------------------------- | | 171 | ------------------------------------------------------------------------------- |
172 | */ | | 172 | */ |
173 | static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 ) | | 173 | static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 ) |
174 | { | | 174 | { |
175 | int8 roundingMode; | | 175 | int8 roundingMode; |
176 | flag roundNearestEven, increment; | | 176 | flag roundNearestEven, increment; |
177 | int64 z; | | 177 | int64 z; |
178 | | | 178 | |
179 | roundingMode = float_rounding_mode; | | 179 | roundingMode = float_rounding_mode; |
180 | roundNearestEven = ( roundingMode == float_round_nearest_even ); | | 180 | roundNearestEven = ( roundingMode == float_round_nearest_even ); |
181 | increment = ( (sbits64) absZ1 < 0 ); | | 181 | increment = ( (sbits64) absZ1 < 0 ); |
182 | if ( ! roundNearestEven ) { | | 182 | if ( ! roundNearestEven ) { |
183 | if ( roundingMode == float_round_to_zero ) { | | 183 | if ( roundingMode == float_round_to_zero ) { |
184 | increment = 0; | | 184 | increment = 0; |
185 | } | | 185 | } |
186 | else { | | 186 | else { |
187 | if ( zSign ) { | | 187 | if ( zSign ) { |
188 | increment = ( roundingMode == float_round_down ) && absZ1; | | 188 | increment = ( roundingMode == float_round_down ) && absZ1; |
189 | } | | 189 | } |
190 | else { | | 190 | else { |
191 | increment = ( roundingMode == float_round_up ) && absZ1; | | 191 | increment = ( roundingMode == float_round_up ) && absZ1; |
192 | } | | 192 | } |
193 | } | | 193 | } |
194 | } | | 194 | } |
195 | if ( increment ) { | | 195 | if ( increment ) { |
196 | ++absZ0; | | 196 | ++absZ0; |
197 | if ( absZ0 == 0 ) goto overflow; | | 197 | if ( absZ0 == 0 ) goto overflow; |
198 | absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); | | 198 | absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); |
199 | } | | 199 | } |
200 | z = absZ0; | | 200 | z = absZ0; |
201 | if ( zSign ) z = - z; | | 201 | if ( zSign ) z = - z; |
202 | if ( z && ( ( z < 0 ) ^ zSign ) ) { | | 202 | if ( z && ( ( z < 0 ) ^ zSign ) ) { |
203 | overflow: | | 203 | overflow: |
204 | float_raise( float_flag_invalid ); | | 204 | float_raise( float_flag_invalid ); |
205 | return | | 205 | return |
206 | zSign ? (sbits64) LIT64( 0x8000000000000000 ) | | 206 | zSign ? (sbits64) LIT64( 0x8000000000000000 ) |
207 | : LIT64( 0x7FFFFFFFFFFFFFFF ); | | 207 | : LIT64( 0x7FFFFFFFFFFFFFFF ); |
208 | } | | 208 | } |
209 | if ( absZ1 ) set_float_exception_inexact_flag(); | | 209 | if ( absZ1 ) set_float_exception_inexact_flag(); |
210 | return z; | | 210 | return z; |
211 | | | 211 | |
212 | } | | 212 | } |
213 | #endif | | 213 | #endif |
214 | | | 214 | |
215 | /* | | 215 | /* |
216 | ------------------------------------------------------------------------------- | | 216 | ------------------------------------------------------------------------------- |
217 | Returns the fraction bits of the single-precision floating-point value `a'. | | 217 | Returns the fraction bits of the single-precision floating-point value `a'. |
218 | ------------------------------------------------------------------------------- | | 218 | ------------------------------------------------------------------------------- |
219 | */ | | 219 | */ |
220 | INLINE bits32 extractFloat32Frac( float32 a ) | | 220 | INLINE bits32 extractFloat32Frac( float32 a ) |
221 | { | | 221 | { |
222 | | | 222 | |
223 | return a & 0x007FFFFF; | | 223 | return a & 0x007FFFFF; |
224 | | | 224 | |
225 | } | | 225 | } |
226 | | | 226 | |
227 | /* | | 227 | /* |
228 | ------------------------------------------------------------------------------- | | 228 | ------------------------------------------------------------------------------- |
229 | Returns the exponent bits of the single-precision floating-point value `a'. | | 229 | Returns the exponent bits of the single-precision floating-point value `a'. |
230 | ------------------------------------------------------------------------------- | | 230 | ------------------------------------------------------------------------------- |
231 | */ | | 231 | */ |
232 | INLINE int16 extractFloat32Exp( float32 a ) | | 232 | INLINE int16 extractFloat32Exp( float32 a ) |
233 | { | | 233 | { |
234 | | | 234 | |
235 | return ( a>>23 ) & 0xFF; | | 235 | return ( a>>23 ) & 0xFF; |
236 | | | 236 | |
237 | } | | 237 | } |
238 | | | 238 | |
239 | /* | | 239 | /* |
240 | ------------------------------------------------------------------------------- | | 240 | ------------------------------------------------------------------------------- |
241 | Returns the sign bit of the single-precision floating-point value `a'. | | 241 | Returns the sign bit of the single-precision floating-point value `a'. |
242 | ------------------------------------------------------------------------------- | | 242 | ------------------------------------------------------------------------------- |
243 | */ | | 243 | */ |
244 | INLINE flag extractFloat32Sign( float32 a ) | | 244 | INLINE flag extractFloat32Sign( float32 a ) |
245 | { | | 245 | { |
246 | | | 246 | |
247 | return a>>31; | | 247 | return a>>31; |
248 | | | 248 | |
249 | } | | 249 | } |
250 | | | 250 | |
251 | /* | | 251 | /* |
252 | ------------------------------------------------------------------------------- | | 252 | ------------------------------------------------------------------------------- |
253 | Normalizes the subnormal single-precision floating-point value represented | | 253 | Normalizes the subnormal single-precision floating-point value represented |
254 | by the denormalized significand `aSig'. The normalized exponent and | | 254 | by the denormalized significand `aSig'. The normalized exponent and |
255 | significand are stored at the locations pointed to by `zExpPtr' and | | 255 | significand are stored at the locations pointed to by `zExpPtr' and |
256 | `zSigPtr', respectively. | | 256 | `zSigPtr', respectively. |
257 | ------------------------------------------------------------------------------- | | 257 | ------------------------------------------------------------------------------- |
258 | */ | | 258 | */ |
259 | static void | | 259 | static void |
260 | normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) | | 260 | normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) |
261 | { | | 261 | { |
262 | int8 shiftCount; | | 262 | int8 shiftCount; |
263 | | | 263 | |
264 | shiftCount = countLeadingZeros32( aSig ) - 8; | | 264 | shiftCount = countLeadingZeros32( aSig ) - 8; |
265 | *zSigPtr = aSig<<shiftCount; | | 265 | *zSigPtr = aSig<<shiftCount; |
266 | *zExpPtr = 1 - shiftCount; | | 266 | *zExpPtr = 1 - shiftCount; |
267 | | | 267 | |
268 | } | | 268 | } |
269 | | | 269 | |
270 | /* | | 270 | /* |
271 | ------------------------------------------------------------------------------- | | 271 | ------------------------------------------------------------------------------- |
272 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a | | 272 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a |
273 | single-precision floating-point value, returning the result. After being | | 273 | single-precision floating-point value, returning the result. After being |
274 | shifted into the proper positions, the three fields are simply added | | 274 | shifted into the proper positions, the three fields are simply added |
275 | together to form the result. This means that any integer portion of `zSig' | | 275 | together to form the result. This means that any integer portion of `zSig' |
276 | will be added into the exponent. Since a properly normalized significand | | 276 | will be added into the exponent. Since a properly normalized significand |
277 | will have an integer portion equal to 1, the `zExp' input should be 1 less | | 277 | will have an integer portion equal to 1, the `zExp' input should be 1 less |
278 | than the desired result exponent whenever `zSig' is a complete, normalized | | 278 | than the desired result exponent whenever `zSig' is a complete, normalized |
279 | significand. | | 279 | significand. |
280 | ------------------------------------------------------------------------------- | | 280 | ------------------------------------------------------------------------------- |
281 | */ | | 281 | */ |
282 | INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig ) | | 282 | INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig ) |
283 | { | | 283 | { |
284 | | | 284 | |
285 | return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig; | | 285 | return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig; |
286 | | | 286 | |
287 | } | | 287 | } |
288 | | | 288 | |
289 | /* | | 289 | /* |
290 | ------------------------------------------------------------------------------- | | 290 | ------------------------------------------------------------------------------- |
291 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', | | 291 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', |
292 | and significand `zSig', and returns the proper single-precision floating- | | 292 | and significand `zSig', and returns the proper single-precision floating- |
293 | point value corresponding to the abstract input. Ordinarily, the abstract | | 293 | point value corresponding to the abstract input. Ordinarily, the abstract |
294 | value is simply rounded and packed into the single-precision format, with | | 294 | value is simply rounded and packed into the single-precision format, with |
295 | the inexact exception raised if the abstract input cannot be represented | | 295 | the inexact exception raised if the abstract input cannot be represented |
296 | exactly. However, if the abstract value is too large, the overflow and | | 296 | exactly. However, if the abstract value is too large, the overflow and |
297 | inexact exceptions are raised and an infinity or maximal finite value is | | 297 | inexact exceptions are raised and an infinity or maximal finite value is |
298 | returned. If the abstract value is too small, the input value is rounded to | | 298 | returned. If the abstract value is too small, the input value is rounded to |
299 | a subnormal number, and the underflow and inexact exceptions are raised if | | 299 | a subnormal number, and the underflow and inexact exceptions are raised if |
300 | the abstract input cannot be represented exactly as a subnormal single- | | 300 | the abstract input cannot be represented exactly as a subnormal single- |
301 | precision floating-point number. | | 301 | precision floating-point number. |
302 | The input significand `zSig' has its binary point between bits 30 | | 302 | The input significand `zSig' has its binary point between bits 30 |
303 | and 29, which is 7 bits to the left of the usual location. This shifted | | 303 | and 29, which is 7 bits to the left of the usual location. This shifted |
304 | significand must be normalized or smaller. If `zSig' is not normalized, | | 304 | significand must be normalized or smaller. If `zSig' is not normalized, |
305 | `zExp' must be 0; in that case, the result returned is a subnormal number, | | 305 | `zExp' must be 0; in that case, the result returned is a subnormal number, |
306 | and it must not require rounding. In the usual case that `zSig' is | | 306 | and it must not require rounding. In the usual case that `zSig' is |
307 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. | | 307 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. |
308 | The handling of underflow and overflow follows the IEC/IEEE Standard for | | 308 | The handling of underflow and overflow follows the IEC/IEEE Standard for |
309 | Binary Floating-Point Arithmetic. | | 309 | Binary Floating-Point Arithmetic. |
310 | ------------------------------------------------------------------------------- | | 310 | ------------------------------------------------------------------------------- |
311 | */ | | 311 | */ |
312 | static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) | | 312 | static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) |
313 | { | | 313 | { |
314 | int8 roundingMode; | | 314 | int8 roundingMode; |
315 | flag roundNearestEven; | | 315 | flag roundNearestEven; |
316 | int8 roundIncrement, roundBits; | | 316 | int8 roundIncrement, roundBits; |
317 | flag isTiny; | | 317 | flag isTiny; |
318 | | | 318 | |
319 | roundingMode = float_rounding_mode; | | 319 | roundingMode = float_rounding_mode; |
320 | roundNearestEven = ( roundingMode == float_round_nearest_even ); | | 320 | roundNearestEven = ( roundingMode == float_round_nearest_even ); |
321 | roundIncrement = 0x40; | | 321 | roundIncrement = 0x40; |
322 | if ( ! roundNearestEven ) { | | 322 | if ( ! roundNearestEven ) { |
323 | if ( roundingMode == float_round_to_zero ) { | | 323 | if ( roundingMode == float_round_to_zero ) { |
324 | roundIncrement = 0; | | 324 | roundIncrement = 0; |
325 | } | | 325 | } |
326 | else { | | 326 | else { |
327 | roundIncrement = 0x7F; | | 327 | roundIncrement = 0x7F; |
328 | if ( zSign ) { | | 328 | if ( zSign ) { |
329 | if ( roundingMode == float_round_up ) roundIncrement = 0; | | 329 | if ( roundingMode == float_round_up ) roundIncrement = 0; |
330 | } | | 330 | } |
331 | else { | | 331 | else { |
332 | if ( roundingMode == float_round_down ) roundIncrement = 0; | | 332 | if ( roundingMode == float_round_down ) roundIncrement = 0; |
333 | } | | 333 | } |
334 | } | | 334 | } |
335 | } | | 335 | } |
336 | roundBits = zSig & 0x7F; | | 336 | roundBits = zSig & 0x7F; |
337 | if ( 0xFD <= (bits16) zExp ) { | | 337 | if ( 0xFD <= (bits16) zExp ) { |
338 | if ( ( 0xFD < zExp ) | | 338 | if ( ( 0xFD < zExp ) |
339 | || ( ( zExp == 0xFD ) | | 339 | || ( ( zExp == 0xFD ) |
340 | && ( (sbits32) ( zSig + roundIncrement ) < 0 ) ) | | 340 | && ( (sbits32) ( zSig + roundIncrement ) < 0 ) ) |
341 | ) { | | 341 | ) { |
342 | float_raise( float_flag_overflow | float_flag_inexact ); | | 342 | float_raise( float_flag_overflow | float_flag_inexact ); |
343 | return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 ); | | 343 | return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 ); |
344 | } | | 344 | } |
345 | if ( zExp < 0 ) { | | 345 | if ( zExp < 0 ) { |
346 | isTiny = | | 346 | isTiny = |
347 | ( float_detect_tininess == float_tininess_before_rounding ) | | 347 | ( float_detect_tininess == float_tininess_before_rounding ) |
348 | || ( zExp < -1 ) | | 348 | || ( zExp < -1 ) |
349 | || ( zSig + roundIncrement < 0x80000000U ); | | 349 | || ( zSig + roundIncrement < 0x80000000U ); |
350 | shift32RightJamming( zSig, - zExp, &zSig ); | | 350 | shift32RightJamming( zSig, - zExp, &zSig ); |
351 | zExp = 0; | | 351 | zExp = 0; |
352 | roundBits = zSig & 0x7F; | | 352 | roundBits = zSig & 0x7F; |
353 | if ( isTiny && roundBits ) float_raise( float_flag_underflow ); | | 353 | if ( isTiny && roundBits ) float_raise( float_flag_underflow ); |
354 | } | | 354 | } |
355 | } | | 355 | } |
356 | if ( roundBits ) set_float_exception_inexact_flag(); | | 356 | if ( roundBits ) set_float_exception_inexact_flag(); |
357 | zSig = ( zSig + roundIncrement )>>7; | | 357 | zSig = ( zSig + roundIncrement )>>7; |
358 | zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); | | 358 | zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); |
359 | if ( zSig == 0 ) zExp = 0; | | 359 | if ( zSig == 0 ) zExp = 0; |
360 | return packFloat32( zSign, zExp, zSig ); | | 360 | return packFloat32( zSign, zExp, zSig ); |
361 | | | 361 | |
362 | } | | 362 | } |
363 | | | 363 | |
364 | /* | | 364 | /* |
365 | ------------------------------------------------------------------------------- | | 365 | ------------------------------------------------------------------------------- |
366 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', | | 366 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', |
367 | and significand `zSig', and returns the proper single-precision floating- | | 367 | and significand `zSig', and returns the proper single-precision floating- |
368 | point value corresponding to the abstract input. This routine is just like | | 368 | point value corresponding to the abstract input. This routine is just like |
369 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. | | 369 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. |
370 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' | | 370 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' |
371 | floating-point exponent. | | 371 | floating-point exponent. |
372 | ------------------------------------------------------------------------------- | | 372 | ------------------------------------------------------------------------------- |
373 | */ | | 373 | */ |
374 | static float32 | | 374 | static float32 |
375 | normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) | | 375 | normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) |
376 | { | | 376 | { |
377 | int8 shiftCount; | | 377 | int8 shiftCount; |
378 | | | 378 | |
379 | shiftCount = countLeadingZeros32( zSig ) - 1; | | 379 | shiftCount = countLeadingZeros32( zSig ) - 1; |
380 | return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount ); | | 380 | return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount ); |
381 | | | 381 | |
382 | } | | 382 | } |
383 | | | 383 | |
384 | /* | | 384 | /* |
385 | ------------------------------------------------------------------------------- | | 385 | ------------------------------------------------------------------------------- |
386 | Returns the fraction bits of the double-precision floating-point value `a'. | | 386 | Returns the fraction bits of the double-precision floating-point value `a'. |
387 | ------------------------------------------------------------------------------- | | 387 | ------------------------------------------------------------------------------- |
388 | */ | | 388 | */ |
389 | INLINE bits64 extractFloat64Frac( float64 a ) | | 389 | INLINE bits64 extractFloat64Frac( float64 a ) |
390 | { | | 390 | { |
391 | | | 391 | |
392 | return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF ); | | 392 | return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF ); |
393 | | | 393 | |
394 | } | | 394 | } |
395 | | | 395 | |
396 | /* | | 396 | /* |
397 | ------------------------------------------------------------------------------- | | 397 | ------------------------------------------------------------------------------- |
398 | Returns the exponent bits of the double-precision floating-point value `a'. | | 398 | Returns the exponent bits of the double-precision floating-point value `a'. |
399 | ------------------------------------------------------------------------------- | | 399 | ------------------------------------------------------------------------------- |
400 | */ | | 400 | */ |
401 | INLINE int16 extractFloat64Exp( float64 a ) | | 401 | INLINE int16 extractFloat64Exp( float64 a ) |
402 | { | | 402 | { |
403 | | | 403 | |
404 | return (int16)((FLOAT64_DEMANGLE(a) >> 52) & 0x7FF); | | 404 | return (int16)((FLOAT64_DEMANGLE(a) >> 52) & 0x7FF); |
405 | | | 405 | |
406 | } | | 406 | } |
407 | | | 407 | |
408 | /* | | 408 | /* |
409 | ------------------------------------------------------------------------------- | | 409 | ------------------------------------------------------------------------------- |
410 | Returns the sign bit of the double-precision floating-point value `a'. | | 410 | Returns the sign bit of the double-precision floating-point value `a'. |
411 | ------------------------------------------------------------------------------- | | 411 | ------------------------------------------------------------------------------- |
412 | */ | | 412 | */ |
413 | INLINE flag extractFloat64Sign( float64 a ) | | 413 | INLINE flag extractFloat64Sign( float64 a ) |
414 | { | | 414 | { |
415 | | | 415 | |
416 | return (flag)(FLOAT64_DEMANGLE(a) >> 63); | | 416 | return (flag)(FLOAT64_DEMANGLE(a) >> 63); |
417 | | | 417 | |
418 | } | | 418 | } |
419 | | | 419 | |
420 | /* | | 420 | /* |
421 | ------------------------------------------------------------------------------- | | 421 | ------------------------------------------------------------------------------- |
422 | Normalizes the subnormal double-precision floating-point value represented | | 422 | Normalizes the subnormal double-precision floating-point value represented |
423 | by the denormalized significand `aSig'. The normalized exponent and | | 423 | by the denormalized significand `aSig'. The normalized exponent and |
424 | significand are stored at the locations pointed to by `zExpPtr' and | | 424 | significand are stored at the locations pointed to by `zExpPtr' and |
425 | `zSigPtr', respectively. | | 425 | `zSigPtr', respectively. |
426 | ------------------------------------------------------------------------------- | | 426 | ------------------------------------------------------------------------------- |
427 | */ | | 427 | */ |
428 | static void | | 428 | static void |
429 | normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr ) | | 429 | normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr ) |
430 | { | | 430 | { |
431 | int8 shiftCount; | | 431 | int8 shiftCount; |
432 | | | 432 | |
433 | shiftCount = countLeadingZeros64( aSig ) - 11; | | 433 | shiftCount = countLeadingZeros64( aSig ) - 11; |
434 | *zSigPtr = aSig<<shiftCount; | | 434 | *zSigPtr = aSig<<shiftCount; |
435 | *zExpPtr = 1 - shiftCount; | | 435 | *zExpPtr = 1 - shiftCount; |
436 | | | 436 | |
437 | } | | 437 | } |
438 | | | 438 | |
439 | /* | | 439 | /* |
440 | ------------------------------------------------------------------------------- | | 440 | ------------------------------------------------------------------------------- |
441 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a | | 441 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a |
442 | double-precision floating-point value, returning the result. After being | | 442 | double-precision floating-point value, returning the result. After being |
443 | shifted into the proper positions, the three fields are simply added | | 443 | shifted into the proper positions, the three fields are simply added |
444 | together to form the result. This means that any integer portion of `zSig' | | 444 | together to form the result. This means that any integer portion of `zSig' |
445 | will be added into the exponent. Since a properly normalized significand | | 445 | will be added into the exponent. Since a properly normalized significand |
446 | will have an integer portion equal to 1, the `zExp' input should be 1 less | | 446 | will have an integer portion equal to 1, the `zExp' input should be 1 less |
447 | than the desired result exponent whenever `zSig' is a complete, normalized | | 447 | than the desired result exponent whenever `zSig' is a complete, normalized |
448 | significand. | | 448 | significand. |
449 | ------------------------------------------------------------------------------- | | 449 | ------------------------------------------------------------------------------- |
450 | */ | | 450 | */ |
451 | INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig ) | | 451 | INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig ) |
452 | { | | 452 | { |
453 | | | 453 | |
454 | return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) + | | 454 | return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) + |
455 | ( ( (bits64) zExp )<<52 ) + zSig ); | | 455 | ( ( (bits64) zExp )<<52 ) + zSig ); |
456 | | | 456 | |
457 | } | | 457 | } |
458 | | | 458 | |
459 | /* | | 459 | /* |
460 | ------------------------------------------------------------------------------- | | 460 | ------------------------------------------------------------------------------- |
461 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', | | 461 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', |
462 | and significand `zSig', and returns the proper double-precision floating- | | 462 | and significand `zSig', and returns the proper double-precision floating- |
463 | point value corresponding to the abstract input. Ordinarily, the abstract | | 463 | point value corresponding to the abstract input. Ordinarily, the abstract |
464 | value is simply rounded and packed into the double-precision format, with | | 464 | value is simply rounded and packed into the double-precision format, with |
465 | the inexact exception raised if the abstract input cannot be represented | | 465 | the inexact exception raised if the abstract input cannot be represented |
466 | exactly. However, if the abstract value is too large, the overflow and | | 466 | exactly. However, if the abstract value is too large, the overflow and |
467 | inexact exceptions are raised and an infinity or maximal finite value is | | 467 | inexact exceptions are raised and an infinity or maximal finite value is |
468 | returned. If the abstract value is too small, the input value is rounded to | | 468 | returned. If the abstract value is too small, the input value is rounded to |
469 | a subnormal number, and the underflow and inexact exceptions are raised if | | 469 | a subnormal number, and the underflow and inexact exceptions are raised if |
470 | the abstract input cannot be represented exactly as a subnormal double- | | 470 | the abstract input cannot be represented exactly as a subnormal double- |
471 | precision floating-point number. | | 471 | precision floating-point number. |
472 | The input significand `zSig' has its binary point between bits 62 | | 472 | The input significand `zSig' has its binary point between bits 62 |
473 | and 61, which is 10 bits to the left of the usual location. This shifted | | 473 | and 61, which is 10 bits to the left of the usual location. This shifted |
474 | significand must be normalized or smaller. If `zSig' is not normalized, | | 474 | significand must be normalized or smaller. If `zSig' is not normalized, |
475 | `zExp' must be 0; in that case, the result returned is a subnormal number, | | 475 | `zExp' must be 0; in that case, the result returned is a subnormal number, |
476 | and it must not require rounding. In the usual case that `zSig' is | | 476 | and it must not require rounding. In the usual case that `zSig' is |
477 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. | | 477 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. |
478 | The handling of underflow and overflow follows the IEC/IEEE Standard for | | 478 | The handling of underflow and overflow follows the IEC/IEEE Standard for |
479 | Binary Floating-Point Arithmetic. | | 479 | Binary Floating-Point Arithmetic. |
480 | ------------------------------------------------------------------------------- | | 480 | ------------------------------------------------------------------------------- |
481 | */ | | 481 | */ |
482 | static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) | | 482 | static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) |
483 | { | | 483 | { |
484 | int8 roundingMode; | | 484 | int8 roundingMode; |
485 | flag roundNearestEven; | | 485 | flag roundNearestEven; |
486 | int16 roundIncrement, roundBits; | | 486 | int16 roundIncrement, roundBits; |
487 | flag isTiny; | | 487 | flag isTiny; |
488 | | | 488 | |
489 | roundingMode = float_rounding_mode; | | 489 | roundingMode = float_rounding_mode; |
490 | roundNearestEven = ( roundingMode == float_round_nearest_even ); | | 490 | roundNearestEven = ( roundingMode == float_round_nearest_even ); |
491 | roundIncrement = 0x200; | | 491 | roundIncrement = 0x200; |
492 | if ( ! roundNearestEven ) { | | 492 | if ( ! roundNearestEven ) { |
493 | if ( roundingMode == float_round_to_zero ) { | | 493 | if ( roundingMode == float_round_to_zero ) { |
494 | roundIncrement = 0; | | 494 | roundIncrement = 0; |
495 | } | | 495 | } |
496 | else { | | 496 | else { |
497 | roundIncrement = 0x3FF; | | 497 | roundIncrement = 0x3FF; |
498 | if ( zSign ) { | | 498 | if ( zSign ) { |
499 | if ( roundingMode == float_round_up ) roundIncrement = 0; | | 499 | if ( roundingMode == float_round_up ) roundIncrement = 0; |
500 | } | | 500 | } |
501 | else { | | 501 | else { |
502 | if ( roundingMode == float_round_down ) roundIncrement = 0; | | 502 | if ( roundingMode == float_round_down ) roundIncrement = 0; |
503 | } | | 503 | } |
504 | } | | 504 | } |
505 | } | | 505 | } |
506 | roundBits = (int16)(zSig & 0x3FF); | | 506 | roundBits = (int16)(zSig & 0x3FF); |
507 | if ( 0x7FD <= (bits16) zExp ) { | | 507 | if ( 0x7FD <= (bits16) zExp ) { |
508 | if ( ( 0x7FD < zExp ) | | 508 | if ( ( 0x7FD < zExp ) |
509 | || ( ( zExp == 0x7FD ) | | 509 | || ( ( zExp == 0x7FD ) |
510 | && ( (sbits64) ( zSig + roundIncrement ) < 0 ) ) | | 510 | && ( (sbits64) ( zSig + roundIncrement ) < 0 ) ) |
511 | ) { | | 511 | ) { |
512 | float_raise( float_flag_overflow | float_flag_inexact ); | | 512 | float_raise( float_flag_overflow | float_flag_inexact ); |
513 | return FLOAT64_MANGLE( | | 513 | return FLOAT64_MANGLE( |
514 | FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) - | | 514 | FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) - |
515 | ( roundIncrement == 0 )); | | 515 | ( roundIncrement == 0 )); |
516 | } | | 516 | } |
517 | if ( zExp < 0 ) { | | 517 | if ( zExp < 0 ) { |
518 | isTiny = | | 518 | isTiny = |
519 | ( float_detect_tininess == float_tininess_before_rounding ) | | 519 | ( float_detect_tininess == float_tininess_before_rounding ) |
520 | || ( zExp < -1 ) | | 520 | || ( zExp < -1 ) |
521 | || ( zSig + roundIncrement < (bits64)LIT64( 0x8000000000000000 ) ); | | 521 | || ( zSig + roundIncrement < (bits64)LIT64( 0x8000000000000000 ) ); |
522 | shift64RightJamming( zSig, - zExp, &zSig ); | | 522 | shift64RightJamming( zSig, - zExp, &zSig ); |
523 | zExp = 0; | | 523 | zExp = 0; |
524 | roundBits = (int16)(zSig & 0x3FF); | | 524 | roundBits = (int16)(zSig & 0x3FF); |
525 | if ( isTiny && roundBits ) float_raise( float_flag_underflow ); | | 525 | if ( isTiny && roundBits ) float_raise( float_flag_underflow ); |
526 | } | | 526 | } |
527 | } | | 527 | } |
528 | if ( roundBits ) set_float_exception_inexact_flag(); | | 528 | if ( roundBits ) set_float_exception_inexact_flag(); |
529 | zSig = ( zSig + roundIncrement )>>10; | | 529 | zSig = ( zSig + roundIncrement )>>10; |
530 | zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); | | 530 | zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); |
531 | if ( zSig == 0 ) zExp = 0; | | 531 | if ( zSig == 0 ) zExp = 0; |
532 | return packFloat64( zSign, zExp, zSig ); | | 532 | return packFloat64( zSign, zExp, zSig ); |
533 | | | 533 | |
534 | } | | 534 | } |
535 | | | 535 | |
536 | /* | | 536 | /* |
537 | ------------------------------------------------------------------------------- | | 537 | ------------------------------------------------------------------------------- |
538 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', | | 538 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', |
539 | and significand `zSig', and returns the proper double-precision floating- | | 539 | and significand `zSig', and returns the proper double-precision floating- |
540 | point value corresponding to the abstract input. This routine is just like | | 540 | point value corresponding to the abstract input. This routine is just like |
541 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. | | 541 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. |
542 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' | | 542 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' |
543 | floating-point exponent. | | 543 | floating-point exponent. |
544 | ------------------------------------------------------------------------------- | | 544 | ------------------------------------------------------------------------------- |
545 | */ | | 545 | */ |
546 | static float64 | | 546 | static float64 |
547 | normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) | | 547 | normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) |
548 | { | | 548 | { |
549 | int8 shiftCount; | | 549 | int8 shiftCount; |
550 | | | 550 | |
551 | shiftCount = countLeadingZeros64( zSig ) - 1; | | 551 | shiftCount = countLeadingZeros64( zSig ) - 1; |
552 | return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount ); | | 552 | return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount ); |
553 | | | 553 | |
554 | } | | 554 | } |
555 | | | 555 | |
556 | #ifdef FLOATX80 | | 556 | #ifdef FLOATX80 |
557 | | | 557 | |
558 | /* | | 558 | /* |
559 | ------------------------------------------------------------------------------- | | 559 | ------------------------------------------------------------------------------- |
560 | Returns the fraction bits of the extended double-precision floating-point | | 560 | Returns the fraction bits of the extended double-precision floating-point |
561 | value `a'. | | 561 | value `a'. |
562 | ------------------------------------------------------------------------------- | | 562 | ------------------------------------------------------------------------------- |
563 | */ | | 563 | */ |
564 | INLINE bits64 extractFloatx80Frac( floatx80 a ) | | 564 | INLINE bits64 extractFloatx80Frac( floatx80 a ) |
565 | { | | 565 | { |
566 | | | 566 | |
567 | return a.low; | | 567 | return a.low; |
568 | | | 568 | |
569 | } | | 569 | } |
570 | | | 570 | |
571 | /* | | 571 | /* |
572 | ------------------------------------------------------------------------------- | | 572 | ------------------------------------------------------------------------------- |
573 | Returns the exponent bits of the extended double-precision floating-point | | 573 | Returns the exponent bits of the extended double-precision floating-point |
574 | value `a'. | | 574 | value `a'. |
575 | ------------------------------------------------------------------------------- | | 575 | ------------------------------------------------------------------------------- |
576 | */ | | 576 | */ |
577 | INLINE int32 extractFloatx80Exp( floatx80 a ) | | 577 | INLINE int32 extractFloatx80Exp( floatx80 a ) |
578 | { | | 578 | { |
579 | | | 579 | |
580 | return a.high & 0x7FFF; | | 580 | return a.high & 0x7FFF; |
581 | | | 581 | |
582 | } | | 582 | } |
583 | | | 583 | |
584 | /* | | 584 | /* |
585 | ------------------------------------------------------------------------------- | | 585 | ------------------------------------------------------------------------------- |
586 | Returns the sign bit of the extended double-precision floating-point value | | 586 | Returns the sign bit of the extended double-precision floating-point value |
587 | `a'. | | 587 | `a'. |
588 | ------------------------------------------------------------------------------- | | 588 | ------------------------------------------------------------------------------- |
589 | */ | | 589 | */ |
590 | INLINE flag extractFloatx80Sign( floatx80 a ) | | 590 | INLINE flag extractFloatx80Sign( floatx80 a ) |
591 | { | | 591 | { |
592 | | | 592 | |
593 | return a.high>>15; | | 593 | return a.high>>15; |
594 | | | 594 | |
595 | } | | 595 | } |
596 | | | 596 | |
597 | /* | | 597 | /* |
598 | ------------------------------------------------------------------------------- | | 598 | ------------------------------------------------------------------------------- |
599 | Normalizes the subnormal extended double-precision floating-point value | | 599 | Normalizes the subnormal extended double-precision floating-point value |
600 | represented by the denormalized significand `aSig'. The normalized exponent | | 600 | represented by the denormalized significand `aSig'. The normalized exponent |
601 | and significand are stored at the locations pointed to by `zExpPtr' and | | 601 | and significand are stored at the locations pointed to by `zExpPtr' and |
602 | `zSigPtr', respectively. | | 602 | `zSigPtr', respectively. |
603 | ------------------------------------------------------------------------------- | | 603 | ------------------------------------------------------------------------------- |
604 | */ | | 604 | */ |
605 | static void | | 605 | static void |
606 | normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr ) | | 606 | normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr ) |
607 | { | | 607 | { |
608 | int8 shiftCount; | | 608 | int8 shiftCount; |
609 | | | 609 | |
610 | shiftCount = countLeadingZeros64( aSig ); | | 610 | shiftCount = countLeadingZeros64( aSig ); |
611 | *zSigPtr = aSig<<shiftCount; | | 611 | *zSigPtr = aSig<<shiftCount; |
612 | *zExpPtr = 1 - shiftCount; | | 612 | *zExpPtr = 1 - shiftCount; |
613 | | | 613 | |
614 | } | | 614 | } |
615 | | | 615 | |
616 | /* | | 616 | /* |
617 | ------------------------------------------------------------------------------- | | 617 | ------------------------------------------------------------------------------- |
618 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an | | 618 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an |
619 | extended double-precision floating-point value, returning the result. | | 619 | extended double-precision floating-point value, returning the result. |
620 | ------------------------------------------------------------------------------- | | 620 | ------------------------------------------------------------------------------- |
621 | */ | | 621 | */ |
622 | INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig ) | | 622 | INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig ) |
623 | { | | 623 | { |
624 | floatx80 z; | | 624 | floatx80 z; |
625 | | | 625 | |
626 | z.low = zSig; | | 626 | z.low = zSig; |
627 | z.high = ( ( (bits16) zSign )<<15 ) + zExp; | | 627 | z.high = ( ( (bits16) zSign )<<15 ) + zExp; |
628 | return z; | | 628 | return z; |
629 | | | 629 | |
630 | } | | 630 | } |
631 | | | 631 | |
632 | /* | | 632 | /* |
633 | ------------------------------------------------------------------------------- | | 633 | ------------------------------------------------------------------------------- |
634 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', | | 634 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', |
635 | and extended significand formed by the concatenation of `zSig0' and `zSig1', | | 635 | and extended significand formed by the concatenation of `zSig0' and `zSig1', |
636 | and returns the proper extended double-precision floating-point value | | 636 | and returns the proper extended double-precision floating-point value |
637 | corresponding to the abstract input. Ordinarily, the abstract value is | | 637 | corresponding to the abstract input. Ordinarily, the abstract value is |
638 | rounded and packed into the extended double-precision format, with the | | 638 | rounded and packed into the extended double-precision format, with the |
639 | inexact exception raised if the abstract input cannot be represented | | 639 | inexact exception raised if the abstract input cannot be represented |
640 | exactly. However, if the abstract value is too large, the overflow and | | 640 | exactly. However, if the abstract value is too large, the overflow and |
641 | inexact exceptions are raised and an infinity or maximal finite value is | | 641 | inexact exceptions are raised and an infinity or maximal finite value is |
642 | returned. If the abstract value is too small, the input value is rounded to | | 642 | returned. If the abstract value is too small, the input value is rounded to |
643 | a subnormal number, and the underflow and inexact exceptions are raised if | | 643 | a subnormal number, and the underflow and inexact exceptions are raised if |
644 | the abstract input cannot be represented exactly as a subnormal extended | | 644 | the abstract input cannot be represented exactly as a subnormal extended |
645 | double-precision floating-point number. | | 645 | double-precision floating-point number. |
646 | If `roundingPrecision' is 32 or 64, the result is rounded to the same | | 646 | If `roundingPrecision' is 32 or 64, the result is rounded to the same |
647 | number of bits as single or double precision, respectively. Otherwise, the | | 647 | number of bits as single or double precision, respectively. Otherwise, the |
648 | result is rounded to the full precision of the extended double-precision | | 648 | result is rounded to the full precision of the extended double-precision |
649 | format. | | 649 | format. |
650 | The input significand must be normalized or smaller. If the input | | 650 | The input significand must be normalized or smaller. If the input |
651 | significand is not normalized, `zExp' must be 0; in that case, the result | | 651 | significand is not normalized, `zExp' must be 0; in that case, the result |
652 | returned is a subnormal number, and it must not require rounding. The | | 652 | returned is a subnormal number, and it must not require rounding. The |
653 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary | | 653 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary |
654 | Floating-Point Arithmetic. | | 654 | Floating-Point Arithmetic. |
655 | ------------------------------------------------------------------------------- | | 655 | ------------------------------------------------------------------------------- |
656 | */ | | 656 | */ |
657 | static floatx80 | | 657 | static floatx80 |
658 | roundAndPackFloatx80( | | 658 | roundAndPackFloatx80( |
659 | int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 | | 659 | int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 |
660 | ) | | 660 | ) |
661 | { | | 661 | { |
662 | int8 roundingMode; | | 662 | int8 roundingMode; |
663 | flag roundNearestEven, increment, isTiny; | | 663 | flag roundNearestEven, increment, isTiny; |
664 | int64 roundIncrement, roundMask, roundBits; | | 664 | uint64 roundIncrement, roundMask, roundBits; |
665 | | | 665 | |
666 | roundingMode = float_rounding_mode; | | 666 | roundingMode = float_rounding_mode; |
667 | roundNearestEven = ( roundingMode == float_round_nearest_even ); | | 667 | roundNearestEven = ( roundingMode == float_round_nearest_even ); |
668 | if ( roundingPrecision == 80 ) goto precision80; | | 668 | if ( roundingPrecision == 80 ) goto precision80; |
669 | if ( roundingPrecision == 64 ) { | | 669 | if ( roundingPrecision == 64 ) { |
670 | roundIncrement = LIT64( 0x0000000000000400 ); | | 670 | roundIncrement = LIT64( 0x0000000000000400 ); |
671 | roundMask = LIT64( 0x00000000000007FF ); | | 671 | roundMask = LIT64( 0x00000000000007FF ); |
672 | } | | 672 | } |
673 | else if ( roundingPrecision == 32 ) { | | 673 | else if ( roundingPrecision == 32 ) { |
674 | roundIncrement = LIT64( 0x0000008000000000 ); | | 674 | roundIncrement = LIT64( 0x0000008000000000 ); |
675 | roundMask = LIT64( 0x000000FFFFFFFFFF ); | | 675 | roundMask = LIT64( 0x000000FFFFFFFFFF ); |
676 | } | | 676 | } |
677 | else { | | 677 | else { |
678 | goto precision80; | | 678 | goto precision80; |
679 | } | | 679 | } |
680 | zSig0 |= ( zSig1 != 0 ); | | 680 | zSig0 |= ( zSig1 != 0 ); |
681 | if ( ! roundNearestEven ) { | | 681 | if ( ! roundNearestEven ) { |
682 | if ( roundingMode == float_round_to_zero ) { | | 682 | if ( roundingMode == float_round_to_zero ) { |
683 | roundIncrement = 0; | | 683 | roundIncrement = 0; |
684 | } | | 684 | } |
685 | else { | | 685 | else { |
686 | roundIncrement = roundMask; | | 686 | roundIncrement = roundMask; |
687 | if ( zSign ) { | | 687 | if ( zSign ) { |
688 | if ( roundingMode == float_round_up ) roundIncrement = 0; | | 688 | if ( roundingMode == float_round_up ) roundIncrement = 0; |
689 | } | | 689 | } |
690 | else { | | 690 | else { |
691 | if ( roundingMode == float_round_down ) roundIncrement = 0; | | 691 | if ( roundingMode == float_round_down ) roundIncrement = 0; |
692 | } | | 692 | } |
693 | } | | 693 | } |
694 | } | | 694 | } |
695 | roundBits = zSig0 & roundMask; | | 695 | roundBits = zSig0 & roundMask; |
696 | if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { | | 696 | if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { |
697 | if ( ( 0x7FFE < zExp ) | | 697 | if ( ( 0x7FFE < zExp ) |
698 | || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) | | 698 | || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) |
699 | ) { | | 699 | ) { |
700 | goto overflow; | | 700 | goto overflow; |
701 | } | | 701 | } |
702 | if ( zExp <= 0 ) { | | 702 | if ( zExp <= 0 ) { |
703 | isTiny = | | 703 | isTiny = |
704 | ( float_detect_tininess == float_tininess_before_rounding ) | | 704 | ( float_detect_tininess == float_tininess_before_rounding ) |
705 | || ( zExp < 0 ) | | 705 | || ( zExp < 0 ) |
706 | || ( zSig0 <= zSig0 + roundIncrement ); | | 706 | || ( zSig0 <= zSig0 + roundIncrement ); |
707 | shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); | | 707 | shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); |
708 | zExp = 0; | | 708 | zExp = 0; |
709 | roundBits = zSig0 & roundMask; | | 709 | roundBits = zSig0 & roundMask; |
710 | if ( isTiny && roundBits ) float_raise( float_flag_underflow ); | | 710 | if ( isTiny && roundBits ) float_raise( float_flag_underflow ); |
711 | if ( roundBits ) set_float_exception_inexact_flag(); | | 711 | if ( roundBits ) set_float_exception_inexact_flag(); |
712 | zSig0 += roundIncrement; | | 712 | zSig0 += roundIncrement; |
713 | if ( (sbits64) zSig0 < 0 ) zExp = 1; | | 713 | if ( (sbits64) zSig0 < 0 ) zExp = 1; |
714 | roundIncrement = roundMask + 1; | | 714 | roundIncrement = roundMask + 1; |
715 | if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { | | 715 | if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { |
716 | roundMask |= roundIncrement; | | 716 | roundMask |= roundIncrement; |
717 | } | | 717 | } |
718 | zSig0 &= ~ roundMask; | | 718 | zSig0 &= ~ roundMask; |
719 | return packFloatx80( zSign, zExp, zSig0 ); | | 719 | return packFloatx80( zSign, zExp, zSig0 ); |
720 | } | | 720 | } |
721 | } | | 721 | } |
722 | if ( roundBits ) set_float_exception_inexact_flag(); | | 722 | if ( roundBits ) set_float_exception_inexact_flag(); |
723 | zSig0 += roundIncrement; | | 723 | zSig0 += roundIncrement; |
724 | if ( zSig0 < roundIncrement ) { | | 724 | if ( zSig0 < roundIncrement ) { |
725 | ++zExp; | | 725 | ++zExp; |
726 | zSig0 = LIT64( 0x8000000000000000 ); | | 726 | zSig0 = LIT64( 0x8000000000000000 ); |
727 | } | | 727 | } |
728 | roundIncrement = roundMask + 1; | | 728 | roundIncrement = roundMask + 1; |
729 | if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { | | 729 | if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { |
730 | roundMask |= roundIncrement; | | 730 | roundMask |= roundIncrement; |
731 | } | | 731 | } |
732 | zSig0 &= ~ roundMask; | | 732 | zSig0 &= ~ roundMask; |
733 | if ( zSig0 == 0 ) zExp = 0; | | 733 | if ( zSig0 == 0 ) zExp = 0; |
734 | return packFloatx80( zSign, zExp, zSig0 ); | | 734 | return packFloatx80( zSign, zExp, zSig0 ); |
735 | precision80: | | 735 | precision80: |
736 | increment = ( (sbits64) zSig1 < 0 ); | | 736 | increment = ( (sbits64) zSig1 < 0 ); |
737 | if ( ! roundNearestEven ) { | | 737 | if ( ! roundNearestEven ) { |
738 | if ( roundingMode == float_round_to_zero ) { | | 738 | if ( roundingMode == float_round_to_zero ) { |
739 | increment = 0; | | 739 | increment = 0; |
740 | } | | 740 | } |
741 | else { | | 741 | else { |
742 | if ( zSign ) { | | 742 | if ( zSign ) { |
743 | increment = ( roundingMode == float_round_down ) && zSig1; | | 743 | increment = ( roundingMode == float_round_down ) && zSig1; |
744 | } | | 744 | } |
745 | else { | | 745 | else { |
746 | increment = ( roundingMode == float_round_up ) && zSig1; | | 746 | increment = ( roundingMode == float_round_up ) && zSig1; |
747 | } | | 747 | } |
748 | } | | 748 | } |
749 | } | | 749 | } |
750 | if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { | | 750 | if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { |
751 | if ( ( 0x7FFE < zExp ) | | 751 | if ( ( 0x7FFE < zExp ) |
752 | || ( ( zExp == 0x7FFE ) | | 752 | || ( ( zExp == 0x7FFE ) |
753 | && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) | | 753 | && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) |
754 | && increment | | 754 | && increment |
755 | ) | | 755 | ) |
756 | ) { | | 756 | ) { |
757 | roundMask = 0; | | 757 | roundMask = 0; |
758 | overflow: | | 758 | overflow: |
759 | float_raise( float_flag_overflow | float_flag_inexact ); | | 759 | float_raise( float_flag_overflow | float_flag_inexact ); |
760 | if ( ( roundingMode == float_round_to_zero ) | | 760 | if ( ( roundingMode == float_round_to_zero ) |
761 | || ( zSign && ( roundingMode == float_round_up ) ) | | 761 | || ( zSign && ( roundingMode == float_round_up ) ) |
762 | || ( ! zSign && ( roundingMode == float_round_down ) ) | | 762 | || ( ! zSign && ( roundingMode == float_round_down ) ) |
763 | ) { | | 763 | ) { |
764 | return packFloatx80( zSign, 0x7FFE, ~ roundMask ); | | 764 | return packFloatx80( zSign, 0x7FFE, ~ roundMask ); |
765 | } | | 765 | } |
766 | return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); | | 766 | return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); |
767 | } | | 767 | } |
768 | if ( zExp <= 0 ) { | | 768 | if ( zExp <= 0 ) { |
769 | isTiny = | | 769 | isTiny = |
770 | ( float_detect_tininess == float_tininess_before_rounding ) | | 770 | ( float_detect_tininess == float_tininess_before_rounding ) |
771 | || ( zExp < 0 ) | | 771 | || ( zExp < 0 ) |
772 | || ! increment | | 772 | || ! increment |
773 | || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); | | 773 | || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); |
774 | shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); | | 774 | shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); |
775 | zExp = 0; | | 775 | zExp = 0; |
776 | if ( isTiny && zSig1 ) float_raise( float_flag_underflow ); | | 776 | if ( isTiny && zSig1 ) float_raise( float_flag_underflow ); |
777 | if ( zSig1 ) set_float_exception_inexact_flag(); | | 777 | if ( zSig1 ) set_float_exception_inexact_flag(); |
778 | if ( roundNearestEven ) { | | 778 | if ( roundNearestEven ) { |
779 | increment = ( (sbits64) zSig1 < 0 ); | | 779 | increment = ( (sbits64) zSig1 < 0 ); |
780 | } | | 780 | } |
781 | else { | | 781 | else { |
782 | if ( zSign ) { | | 782 | if ( zSign ) { |
783 | increment = ( roundingMode == float_round_down ) && zSig1; | | 783 | increment = ( roundingMode == float_round_down ) && zSig1; |
784 | } | | 784 | } |
785 | else { | | 785 | else { |
786 | increment = ( roundingMode == float_round_up ) && zSig1; | | 786 | increment = ( roundingMode == float_round_up ) && zSig1; |
787 | } | | 787 | } |
788 | } | | 788 | } |
789 | if ( increment ) { | | 789 | if ( increment ) { |
790 | ++zSig0; | | 790 | ++zSig0; |
791 | zSig0 &= | | 791 | zSig0 &= |
792 | ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); | | 792 | ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); |
793 | if ( (sbits64) zSig0 < 0 ) zExp = 1; | | 793 | if ( (sbits64) zSig0 < 0 ) zExp = 1; |
794 | } | | 794 | } |
795 | return packFloatx80( zSign, zExp, zSig0 ); | | 795 | return packFloatx80( zSign, zExp, zSig0 ); |
796 | } | | 796 | } |
797 | } | | 797 | } |
798 | if ( zSig1 ) set_float_exception_inexact_flag(); | | 798 | if ( zSig1 ) set_float_exception_inexact_flag(); |
799 | if ( increment ) { | | 799 | if ( increment ) { |
800 | ++zSig0; | | 800 | ++zSig0; |
801 | if ( zSig0 == 0 ) { | | 801 | if ( zSig0 == 0 ) { |
802 | ++zExp; | | 802 | ++zExp; |
803 | zSig0 = LIT64( 0x8000000000000000 ); | | 803 | zSig0 = LIT64( 0x8000000000000000 ); |
804 | } | | 804 | } |
805 | else { | | 805 | else { |
806 | zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); | | 806 | zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); |
807 | } | | 807 | } |
808 | } | | 808 | } |
809 | else { | | 809 | else { |
810 | if ( zSig0 == 0 ) zExp = 0; | | 810 | if ( zSig0 == 0 ) zExp = 0; |
811 | } | | 811 | } |
812 | return packFloatx80( zSign, zExp, zSig0 ); | | 812 | return packFloatx80( zSign, zExp, zSig0 ); |
813 | | | 813 | |
814 | } | | 814 | } |
815 | | | 815 | |
816 | /* | | 816 | /* |
817 | ------------------------------------------------------------------------------- | | 817 | ------------------------------------------------------------------------------- |
818 | Takes an abstract floating-point value having sign `zSign', exponent | | 818 | Takes an abstract floating-point value having sign `zSign', exponent |
819 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', | | 819 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', |
820 | and returns the proper extended double-precision floating-point value | | 820 | and returns the proper extended double-precision floating-point value |
821 | corresponding to the abstract input. This routine is just like | | 821 | corresponding to the abstract input. This routine is just like |
822 | `roundAndPackFloatx80' except that the input significand does not have to be | | 822 | `roundAndPackFloatx80' except that the input significand does not have to be |
823 | normalized. | | 823 | normalized. |
824 | ------------------------------------------------------------------------------- | | 824 | ------------------------------------------------------------------------------- |
825 | */ | | 825 | */ |
826 | static floatx80 | | 826 | static floatx80 |
827 | normalizeRoundAndPackFloatx80( | | 827 | normalizeRoundAndPackFloatx80( |
828 | int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 | | 828 | int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 |
829 | ) | | 829 | ) |
830 | { | | 830 | { |
831 | int8 shiftCount; | | 831 | int8 shiftCount; |
832 | | | 832 | |
833 | if ( zSig0 == 0 ) { | | 833 | if ( zSig0 == 0 ) { |
834 | zSig0 = zSig1; | | 834 | zSig0 = zSig1; |
835 | zSig1 = 0; | | 835 | zSig1 = 0; |
836 | zExp -= 64; | | 836 | zExp -= 64; |
837 | } | | 837 | } |
838 | shiftCount = countLeadingZeros64( zSig0 ); | | 838 | shiftCount = countLeadingZeros64( zSig0 ); |
839 | shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); | | 839 | shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); |
840 | zExp -= shiftCount; | | 840 | zExp -= shiftCount; |
841 | return | | 841 | return |
842 | roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 ); | | 842 | roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 ); |
843 | | | 843 | |
844 | } | | 844 | } |
845 | | | 845 | |
846 | #endif | | 846 | #endif |
847 | | | 847 | |
848 | #ifdef FLOAT128 | | 848 | #ifdef FLOAT128 |
849 | | | 849 | |
850 | /* | | 850 | /* |
851 | ------------------------------------------------------------------------------- | | 851 | ------------------------------------------------------------------------------- |
852 | Returns the least-significant 64 fraction bits of the quadruple-precision | | 852 | Returns the least-significant 64 fraction bits of the quadruple-precision |
853 | floating-point value `a'. | | 853 | floating-point value `a'. |
854 | ------------------------------------------------------------------------------- | | 854 | ------------------------------------------------------------------------------- |
855 | */ | | 855 | */ |
856 | INLINE bits64 extractFloat128Frac1( float128 a ) | | 856 | INLINE bits64 extractFloat128Frac1( float128 a ) |
857 | { | | 857 | { |
858 | | | 858 | |
859 | return a.low; | | 859 | return a.low; |
860 | | | 860 | |
861 | } | | 861 | } |
862 | | | 862 | |
863 | /* | | 863 | /* |
864 | ------------------------------------------------------------------------------- | | 864 | ------------------------------------------------------------------------------- |
865 | Returns the most-significant 48 fraction bits of the quadruple-precision | | 865 | Returns the most-significant 48 fraction bits of the quadruple-precision |
866 | floating-point value `a'. | | 866 | floating-point value `a'. |
867 | ------------------------------------------------------------------------------- | | 867 | ------------------------------------------------------------------------------- |
868 | */ | | 868 | */ |
869 | INLINE bits64 extractFloat128Frac0( float128 a ) | | 869 | INLINE bits64 extractFloat128Frac0( float128 a ) |
870 | { | | 870 | { |
871 | | | 871 | |
872 | return a.high & LIT64( 0x0000FFFFFFFFFFFF ); | | 872 | return a.high & LIT64( 0x0000FFFFFFFFFFFF ); |
873 | | | 873 | |
874 | } | | 874 | } |
875 | | | 875 | |
876 | /* | | 876 | /* |
877 | ------------------------------------------------------------------------------- | | 877 | ------------------------------------------------------------------------------- |
878 | Returns the exponent bits of the quadruple-precision floating-point value | | 878 | Returns the exponent bits of the quadruple-precision floating-point value |
879 | `a'. | | 879 | `a'. |
880 | ------------------------------------------------------------------------------- | | 880 | ------------------------------------------------------------------------------- |
881 | */ | | 881 | */ |
882 | INLINE int32 extractFloat128Exp( float128 a ) | | 882 | INLINE int32 extractFloat128Exp( float128 a ) |
883 | { | | 883 | { |
884 | | | 884 | |
885 | return (int32)((a.high >> 48) & 0x7FFF); | | 885 | return (int32)((a.high >> 48) & 0x7FFF); |
886 | | | 886 | |
887 | } | | 887 | } |
888 | | | 888 | |
889 | /* | | 889 | /* |
890 | ------------------------------------------------------------------------------- | | 890 | ------------------------------------------------------------------------------- |
891 | Returns the sign bit of the quadruple-precision floating-point value `a'. | | 891 | Returns the sign bit of the quadruple-precision floating-point value `a'. |
892 | ------------------------------------------------------------------------------- | | 892 | ------------------------------------------------------------------------------- |
893 | */ | | 893 | */ |
894 | INLINE flag extractFloat128Sign( float128 a ) | | 894 | INLINE flag extractFloat128Sign( float128 a ) |
895 | { | | 895 | { |
896 | | | 896 | |
897 | return (flag)(a.high >> 63); | | 897 | return (flag)(a.high >> 63); |
898 | | | 898 | |
899 | } | | 899 | } |
900 | | | 900 | |
901 | /* | | 901 | /* |
902 | ------------------------------------------------------------------------------- | | 902 | ------------------------------------------------------------------------------- |
903 | Normalizes the subnormal quadruple-precision floating-point value | | 903 | Normalizes the subnormal quadruple-precision floating-point value |
904 | represented by the denormalized significand formed by the concatenation of | | 904 | represented by the denormalized significand formed by the concatenation of |
905 | `aSig0' and `aSig1'. The normalized exponent is stored at the location | | 905 | `aSig0' and `aSig1'. The normalized exponent is stored at the location |
906 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized | | 906 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized |
907 | significand are stored at the location pointed to by `zSig0Ptr', and the | | 907 | significand are stored at the location pointed to by `zSig0Ptr', and the |
908 | least significant 64 bits of the normalized significand are stored at the | | 908 | least significant 64 bits of the normalized significand are stored at the |
909 | location pointed to by `zSig1Ptr'. | | 909 | location pointed to by `zSig1Ptr'. |
910 | ------------------------------------------------------------------------------- | | 910 | ------------------------------------------------------------------------------- |
911 | */ | | 911 | */ |
912 | static void | | 912 | static void |
913 | normalizeFloat128Subnormal( | | 913 | normalizeFloat128Subnormal( |
914 | bits64 aSig0, | | 914 | bits64 aSig0, |
915 | bits64 aSig1, | | 915 | bits64 aSig1, |
916 | int32 *zExpPtr, | | 916 | int32 *zExpPtr, |
917 | bits64 *zSig0Ptr, | | 917 | bits64 *zSig0Ptr, |
918 | bits64 *zSig1Ptr | | 918 | bits64 *zSig1Ptr |
919 | ) | | 919 | ) |
920 | { | | 920 | { |
921 | int8 shiftCount; | | 921 | int8 shiftCount; |
922 | | | 922 | |
923 | if ( aSig0 == 0 ) { | | 923 | if ( aSig0 == 0 ) { |
924 | shiftCount = countLeadingZeros64( aSig1 ) - 15; | | 924 | shiftCount = countLeadingZeros64( aSig1 ) - 15; |
925 | if ( shiftCount < 0 ) { | | 925 | if ( shiftCount < 0 ) { |
926 | *zSig0Ptr = aSig1>>( - shiftCount ); | | 926 | *zSig0Ptr = aSig1>>( - shiftCount ); |
927 | *zSig1Ptr = aSig1<<( shiftCount & 63 ); | | 927 | *zSig1Ptr = aSig1<<( shiftCount & 63 ); |
928 | } | | 928 | } |
929 | else { | | 929 | else { |
930 | *zSig0Ptr = aSig1<<shiftCount; | | 930 | *zSig0Ptr = aSig1<<shiftCount; |
931 | *zSig1Ptr = 0; | | 931 | *zSig1Ptr = 0; |
932 | } | | 932 | } |
933 | *zExpPtr = - shiftCount - 63; | | 933 | *zExpPtr = - shiftCount - 63; |
934 | } | | 934 | } |
935 | else { | | 935 | else { |
936 | shiftCount = countLeadingZeros64( aSig0 ) - 15; | | 936 | shiftCount = countLeadingZeros64( aSig0 ) - 15; |
937 | shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); | | 937 | shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); |
938 | *zExpPtr = 1 - shiftCount; | | 938 | *zExpPtr = 1 - shiftCount; |
939 | } | | 939 | } |
940 | | | 940 | |
941 | } | | 941 | } |
942 | | | 942 | |
943 | /* | | 943 | /* |
944 | ------------------------------------------------------------------------------- | | 944 | ------------------------------------------------------------------------------- |
945 | Packs the sign `zSign', the exponent `zExp', and the significand formed | | 945 | Packs the sign `zSign', the exponent `zExp', and the significand formed |
946 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision | | 946 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision |
947 | floating-point value, returning the result. After being shifted into the | | 947 | floating-point value, returning the result. After being shifted into the |
948 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply | | 948 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply |
949 | added together to form the most significant 32 bits of the result. This | | 949 | added together to form the most significant 32 bits of the result. This |
950 | means that any integer portion of `zSig0' will be added into the exponent. | | 950 | means that any integer portion of `zSig0' will be added into the exponent. |
951 | Since a properly normalized significand will have an integer portion equal | | 951 | Since a properly normalized significand will have an integer portion equal |
952 | to 1, the `zExp' input should be 1 less than the desired result exponent | | 952 | to 1, the `zExp' input should be 1 less than the desired result exponent |
953 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized | | 953 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized |
954 | significand. | | 954 | significand. |
955 | ------------------------------------------------------------------------------- | | 955 | ------------------------------------------------------------------------------- |
956 | */ | | 956 | */ |
957 | INLINE float128 | | 957 | INLINE float128 |
958 | packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) | | 958 | packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) |
959 | { | | 959 | { |
960 | float128 z; | | 960 | float128 z; |
961 | | | 961 | |
962 | z.low = zSig1; | | 962 | z.low = zSig1; |
963 | z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0; | | 963 | z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0; |
964 | return z; | | 964 | return z; |
965 | | | 965 | |
966 | } | | 966 | } |
967 | | | 967 | |
968 | /* | | 968 | /* |
969 | ------------------------------------------------------------------------------- | | 969 | ------------------------------------------------------------------------------- |
970 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', | | 970 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', |
971 | and extended significand formed by the concatenation of `zSig0', `zSig1', | | 971 | and extended significand formed by the concatenation of `zSig0', `zSig1', |
972 | and `zSig2', and returns the proper quadruple-precision floating-point value | | 972 | and `zSig2', and returns the proper quadruple-precision floating-point value |
973 | corresponding to the abstract input. Ordinarily, the abstract value is | | 973 | corresponding to the abstract input. Ordinarily, the abstract value is |
974 | simply rounded and packed into the quadruple-precision format, with the | | 974 | simply rounded and packed into the quadruple-precision format, with the |
975 | inexact exception raised if the abstract input cannot be represented | | 975 | inexact exception raised if the abstract input cannot be represented |
976 | exactly. However, if the abstract value is too large, the overflow and | | 976 | exactly. However, if the abstract value is too large, the overflow and |
977 | inexact exceptions are raised and an infinity or maximal finite value is | | 977 | inexact exceptions are raised and an infinity or maximal finite value is |
978 | returned. If the abstract value is too small, the input value is rounded to | | 978 | returned. If the abstract value is too small, the input value is rounded to |
979 | a subnormal number, and the underflow and inexact exceptions are raised if | | 979 | a subnormal number, and the underflow and inexact exceptions are raised if |
980 | the abstract input cannot be represented exactly as a subnormal quadruple- | | 980 | the abstract input cannot be represented exactly as a subnormal quadruple- |
981 | precision floating-point number. | | 981 | precision floating-point number. |
982 | The input significand must be normalized or smaller. If the input | | 982 | The input significand must be normalized or smaller. If the input |
983 | significand is not normalized, `zExp' must be 0; in that case, the result | | 983 | significand is not normalized, `zExp' must be 0; in that case, the result |
984 | returned is a subnormal number, and it must not require rounding. In the | | 984 | returned is a subnormal number, and it must not require rounding. In the |
985 | usual case that the input significand is normalized, `zExp' must be 1 less | | 985 | usual case that the input significand is normalized, `zExp' must be 1 less |
986 | than the ``true'' floating-point exponent. The handling of underflow and | | 986 | than the ``true'' floating-point exponent. The handling of underflow and |
987 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. | | 987 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
988 | ------------------------------------------------------------------------------- | | 988 | ------------------------------------------------------------------------------- |
989 | */ | | 989 | */ |
990 | static float128 | | 990 | static float128 |
991 | roundAndPackFloat128( | | 991 | roundAndPackFloat128( |
992 | flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 ) | | 992 | flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 ) |
993 | { | | 993 | { |
994 | int8 roundingMode; | | 994 | int8 roundingMode; |
995 | flag roundNearestEven, increment, isTiny; | | 995 | flag roundNearestEven, increment, isTiny; |
996 | | | 996 | |
997 | roundingMode = float_rounding_mode; | | 997 | roundingMode = float_rounding_mode; |
998 | roundNearestEven = ( roundingMode == float_round_nearest_even ); | | 998 | roundNearestEven = ( roundingMode == float_round_nearest_even ); |
999 | increment = ( (sbits64) zSig2 < 0 ); | | 999 | increment = ( (sbits64) zSig2 < 0 ); |
1000 | if ( ! roundNearestEven ) { | | 1000 | if ( ! roundNearestEven ) { |
1001 | if ( roundingMode == float_round_to_zero ) { | | 1001 | if ( roundingMode == float_round_to_zero ) { |
1002 | increment = 0; | | 1002 | increment = 0; |
1003 | } | | 1003 | } |
1004 | else { | | 1004 | else { |
1005 | if ( zSign ) { | | 1005 | if ( zSign ) { |
1006 | increment = ( roundingMode == float_round_down ) && zSig2; | | 1006 | increment = ( roundingMode == float_round_down ) && zSig2; |
1007 | } | | 1007 | } |
1008 | else { | | 1008 | else { |
1009 | increment = ( roundingMode == float_round_up ) && zSig2; | | 1009 | increment = ( roundingMode == float_round_up ) && zSig2; |
1010 | } | | 1010 | } |
1011 | } | | 1011 | } |
1012 | } | | 1012 | } |
1013 | if ( 0x7FFD <= (bits32) zExp ) { | | 1013 | if ( 0x7FFD <= (bits32) zExp ) { |
1014 | if ( ( 0x7FFD < zExp ) | | 1014 | if ( ( 0x7FFD < zExp ) |
1015 | || ( ( zExp == 0x7FFD ) | | 1015 | || ( ( zExp == 0x7FFD ) |
1016 | && eq128( | | 1016 | && eq128( |
1017 | LIT64( 0x0001FFFFFFFFFFFF ), | | 1017 | LIT64( 0x0001FFFFFFFFFFFF ), |
1018 | LIT64( 0xFFFFFFFFFFFFFFFF ), | | 1018 | LIT64( 0xFFFFFFFFFFFFFFFF ), |
1019 | zSig0, | | 1019 | zSig0, |
1020 | zSig1 | | 1020 | zSig1 |
1021 | ) | | 1021 | ) |
1022 | && increment | | 1022 | && increment |
1023 | ) | | 1023 | ) |
1024 | ) { | | 1024 | ) { |
1025 | float_raise( float_flag_overflow | float_flag_inexact ); | | 1025 | float_raise( float_flag_overflow | float_flag_inexact ); |
1026 | if ( ( roundingMode == float_round_to_zero ) | | 1026 | if ( ( roundingMode == float_round_to_zero ) |
1027 | || ( zSign && ( roundingMode == float_round_up ) ) | | 1027 | || ( zSign && ( roundingMode == float_round_up ) ) |
1028 | || ( ! zSign && ( roundingMode == float_round_down ) ) | | 1028 | || ( ! zSign && ( roundingMode == float_round_down ) ) |
1029 | ) { | | 1029 | ) { |
1030 | return | | 1030 | return |
1031 | packFloat128( | | 1031 | packFloat128( |
1032 | zSign, | | 1032 | zSign, |
1033 | 0x7FFE, | | 1033 | 0x7FFE, |
1034 | LIT64( 0x0000FFFFFFFFFFFF ), | | 1034 | LIT64( 0x0000FFFFFFFFFFFF ), |
1035 | LIT64( 0xFFFFFFFFFFFFFFFF ) | | 1035 | LIT64( 0xFFFFFFFFFFFFFFFF ) |
1036 | ); | | 1036 | ); |
1037 | } | | 1037 | } |
1038 | return packFloat128( zSign, 0x7FFF, 0, 0 ); | | 1038 | return packFloat128( zSign, 0x7FFF, 0, 0 ); |
1039 | } | | 1039 | } |
1040 | if ( zExp < 0 ) { | | 1040 | if ( zExp < 0 ) { |
1041 | isTiny = | | 1041 | isTiny = |
1042 | ( float_detect_tininess == float_tininess_before_rounding ) | | 1042 | ( float_detect_tininess == float_tininess_before_rounding ) |
1043 | || ( zExp < -1 ) | | 1043 | || ( zExp < -1 ) |
1044 | || ! increment | | 1044 | || ! increment |
1045 | || lt128( | | 1045 | || lt128( |
1046 | zSig0, | | 1046 | zSig0, |
1047 | zSig1, | | 1047 | zSig1, |
1048 | LIT64( 0x0001FFFFFFFFFFFF ), | | 1048 | LIT64( 0x0001FFFFFFFFFFFF ), |
1049 | LIT64( 0xFFFFFFFFFFFFFFFF ) | | 1049 | LIT64( 0xFFFFFFFFFFFFFFFF ) |
1050 | ); | | 1050 | ); |
1051 | shift128ExtraRightJamming( | | 1051 | shift128ExtraRightJamming( |
1052 | zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); | | 1052 | zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); |
1053 | zExp = 0; | | 1053 | zExp = 0; |
1054 | if ( isTiny && zSig2 ) float_raise( float_flag_underflow ); | | 1054 | if ( isTiny && zSig2 ) float_raise( float_flag_underflow ); |
1055 | if ( roundNearestEven ) { | | 1055 | if ( roundNearestEven ) { |
1056 | increment = ( (sbits64) zSig2 < 0 ); | | 1056 | increment = ( (sbits64) zSig2 < 0 ); |
1057 | } | | 1057 | } |
1058 | else { | | 1058 | else { |
1059 | if ( zSign ) { | | 1059 | if ( zSign ) { |
1060 | increment = ( roundingMode == float_round_down ) && zSig2; | | 1060 | increment = ( roundingMode == float_round_down ) && zSig2; |
1061 | } | | 1061 | } |
1062 | else { | | 1062 | else { |
1063 | increment = ( roundingMode == float_round_up ) && zSig2; | | 1063 | increment = ( roundingMode == float_round_up ) && zSig2; |
1064 | } | | 1064 | } |
1065 | } | | 1065 | } |
1066 | } | | 1066 | } |
1067 | } | | 1067 | } |
1068 | if ( zSig2 ) set_float_exception_inexact_flag(); | | 1068 | if ( zSig2 ) set_float_exception_inexact_flag(); |
1069 | if ( increment ) { | | 1069 | if ( increment ) { |
1070 | add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); | | 1070 | add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); |
1071 | zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); | | 1071 | zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); |
1072 | } | | 1072 | } |
1073 | else { | | 1073 | else { |
1074 | if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; | | 1074 | if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; |
1075 | } | | 1075 | } |
1076 | return packFloat128( zSign, zExp, zSig0, zSig1 ); | | 1076 | return packFloat128( zSign, zExp, zSig0, zSig1 ); |
1077 | | | 1077 | |
1078 | } | | 1078 | } |
1079 | | | 1079 | |
1080 | /* | | 1080 | /* |
1081 | ------------------------------------------------------------------------------- | | 1081 | ------------------------------------------------------------------------------- |
1082 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', | | 1082 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', |
1083 | and significand formed by the concatenation of `zSig0' and `zSig1', and | | 1083 | and significand formed by the concatenation of `zSig0' and `zSig1', and |
1084 | returns the proper quadruple-precision floating-point value corresponding | | 1084 | returns the proper quadruple-precision floating-point value corresponding |
1085 | to the abstract input. This routine is just like `roundAndPackFloat128' | | 1085 | to the abstract input. This routine is just like `roundAndPackFloat128' |
1086 | except that the input significand has fewer bits and does not have to be | | 1086 | except that the input significand has fewer bits and does not have to be |
1087 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- | | 1087 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- |
1088 | point exponent. | | 1088 | point exponent. |
1089 | ------------------------------------------------------------------------------- | | 1089 | ------------------------------------------------------------------------------- |
1090 | */ | | 1090 | */ |
1091 | static float128 | | 1091 | static float128 |
1092 | normalizeRoundAndPackFloat128( | | 1092 | normalizeRoundAndPackFloat128( |
1093 | flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) | | 1093 | flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) |
1094 | { | | 1094 | { |
1095 | int8 shiftCount; | | 1095 | int8 shiftCount; |
1096 | bits64 zSig2; | | 1096 | bits64 zSig2; |
1097 | | | 1097 | |
1098 | if ( zSig0 == 0 ) { | | 1098 | if ( zSig0 == 0 ) { |
1099 | zSig0 = zSig1; | | 1099 | zSig0 = zSig1; |
1100 | zSig1 = 0; | | 1100 | zSig1 = 0; |
1101 | zExp -= 64; | | 1101 | zExp -= 64; |
1102 | } | | 1102 | } |
1103 | shiftCount = countLeadingZeros64( zSig0 ) - 15; | | 1103 | shiftCount = countLeadingZeros64( zSig0 ) - 15; |
1104 | if ( 0 <= shiftCount ) { | | 1104 | if ( 0 <= shiftCount ) { |
1105 | zSig2 = 0; | | 1105 | zSig2 = 0; |
1106 | shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); | | 1106 | shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); |
1107 | } | | 1107 | } |
1108 | else { | | 1108 | else { |
1109 | shift128ExtraRightJamming( | | 1109 | shift128ExtraRightJamming( |
1110 | zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); | | 1110 | zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); |
1111 | } | | 1111 | } |
1112 | zExp -= shiftCount; | | 1112 | zExp -= shiftCount; |
1113 | return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); | | 1113 | return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); |
1114 | | | 1114 | |
1115 | } | | 1115 | } |
1116 | | | 1116 | |
1117 | #endif | | 1117 | #endif |
1118 | | | 1118 | |
1119 | /* | | 1119 | /* |
1120 | ------------------------------------------------------------------------------- | | 1120 | ------------------------------------------------------------------------------- |
1121 | Returns the result of converting the 32-bit two's complement integer `a' | | 1121 | Returns the result of converting the 32-bit two's complement integer `a' |
1122 | to the single-precision floating-point format. The conversion is performed | | 1122 | to the single-precision floating-point format. The conversion is performed |
1123 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. | | 1123 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
1124 | ------------------------------------------------------------------------------- | | 1124 | ------------------------------------------------------------------------------- |
1125 | */ | | 1125 | */ |
1126 | float32 int32_to_float32( int32 a ) | | 1126 | float32 int32_to_float32( int32 a ) |
1127 | { | | 1127 | { |
1128 | flag zSign; | | 1128 | flag zSign; |
1129 | | | 1129 | |
1130 | if ( a == 0 ) return 0; | | 1130 | if ( a == 0 ) return 0; |
1131 | if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); | | 1131 | if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); |
1132 | zSign = ( a < 0 ); | | 1132 | zSign = ( a < 0 ); |
1133 | return normalizeRoundAndPackFloat32(zSign, 0x9C, (uint32)(zSign ? - a : a)); | | 1133 | return normalizeRoundAndPackFloat32(zSign, 0x9C, (uint32)(zSign ? - a : a)); |
1134 | | | 1134 | |
1135 | } | | 1135 | } |
1136 | | | 1136 | |
1137 | float32 uint32_to_float32( uint32 a ) | | 1137 | float32 uint32_to_float32( uint32 a ) |
1138 | { | | 1138 | { |
1139 | if ( a == 0 ) return 0; | | 1139 | if ( a == 0 ) return 0; |
1140 | if ( a & (bits32) 0x80000000 ) | | 1140 | if ( a & (bits32) 0x80000000 ) |
1141 | return normalizeRoundAndPackFloat32( 0, 0x9D, a >> 1 ); | | 1141 | return normalizeRoundAndPackFloat32( 0, 0x9D, a >> 1 ); |
1142 | return normalizeRoundAndPackFloat32( 0, 0x9C, a ); | | 1142 | return normalizeRoundAndPackFloat32( 0, 0x9C, a ); |
1143 | } | | 1143 | } |
1144 | | | 1144 | |
1145 | | | 1145 | |
1146 | /* | | 1146 | /* |
1147 | ------------------------------------------------------------------------------- | | 1147 | ------------------------------------------------------------------------------- |
1148 | Returns the result of converting the 32-bit two's complement integer `a' | | 1148 | Returns the result of converting the 32-bit two's complement integer `a' |
1149 | to the double-precision floating-point format. The conversion is performed | | 1149 | to the double-precision floating-point format. The conversion is performed |
1150 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. | | 1150 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
1151 | ------------------------------------------------------------------------------- | | 1151 | ------------------------------------------------------------------------------- |
1152 | */ | | 1152 | */ |
1153 | float64 int32_to_float64( int32 a ) | | 1153 | float64 int32_to_float64( int32 a ) |
1154 | { | | 1154 | { |
1155 | flag zSign; | | 1155 | flag zSign; |
1156 | uint32 absA; | | 1156 | uint32 absA; |
1157 | int8 shiftCount; | | 1157 | int8 shiftCount; |
1158 | bits64 zSig; | | 1158 | bits64 zSig; |
1159 | | | 1159 | |
1160 | if ( a == 0 ) return 0; | | 1160 | if ( a == 0 ) return 0; |
1161 | zSign = ( a < 0 ); | | 1161 | zSign = ( a < 0 ); |
1162 | absA = zSign ? - a : a; | | 1162 | absA = zSign ? - a : a; |
1163 | shiftCount = countLeadingZeros32( absA ) + 21; | | 1163 | shiftCount = countLeadingZeros32( absA ) + 21; |
1164 | zSig = absA; | | 1164 | zSig = absA; |
1165 | return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); | | 1165 | return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); |
1166 | | | 1166 | |
1167 | } | | 1167 | } |
1168 | | | 1168 | |
1169 | float64 uint32_to_float64( uint32 a ) | | 1169 | float64 uint32_to_float64( uint32 a ) |
1170 | { | | 1170 | { |
1171 | int8 shiftCount; | | 1171 | int8 shiftCount; |
1172 | bits64 zSig = a; | | 1172 | bits64 zSig = a; |
1173 | | | 1173 | |
1174 | if ( a == 0 ) return 0; | | 1174 | if ( a == 0 ) return 0; |
1175 | shiftCount = countLeadingZeros32( a ) + 21; | | 1175 | shiftCount = countLeadingZeros32( a ) + 21; |
1176 | return packFloat64( 0, 0x432 - shiftCount, zSig<<shiftCount ); | | 1176 | return packFloat64( 0, 0x432 - shiftCount, zSig<<shiftCount ); |
1177 | | | 1177 | |
1178 | } | | 1178 | } |
1179 | | | 1179 | |
1180 | #ifdef FLOATX80 | | 1180 | #ifdef FLOATX80 |
1181 | | | 1181 | |
1182 | /* | | 1182 | /* |
1183 | ------------------------------------------------------------------------------- | | 1183 | ------------------------------------------------------------------------------- |
1184 | Returns the result of converting the 32-bit two's complement integer `a' | | 1184 | Returns the result of converting the 32-bit two's complement integer `a' |
1185 | to the extended double-precision floating-point format. The conversion | | 1185 | to the extended double-precision floating-point format. The conversion |
1186 | is performed according to the IEC/IEEE Standard for Binary Floating-Point | | 1186 | is performed according to the IEC/IEEE Standard for Binary Floating-Point |
1187 | Arithmetic. | | 1187 | Arithmetic. |
1188 | ------------------------------------------------------------------------------- | | 1188 | ------------------------------------------------------------------------------- |
1189 | */ | | 1189 | */ |
1190 | floatx80 int32_to_floatx80( int32 a ) | | 1190 | floatx80 int32_to_floatx80( int32 a ) |
1191 | { | | 1191 | { |
1192 | flag zSign; | | 1192 | flag zSign; |
1193 | uint32 absA; | | 1193 | uint32 absA; |
1194 | int8 shiftCount; | | 1194 | int8 shiftCount; |
1195 | bits64 zSig; | | 1195 | bits64 zSig; |
1196 | | | 1196 | |
1197 | if ( a == 0 ) return packFloatx80( 0, 0, 0 ); | | 1197 | if ( a == 0 ) return packFloatx80( 0, 0, 0 ); |
1198 | zSign = ( a < 0 ); | | 1198 | zSign = ( a < 0 ); |
1199 | absA = zSign ? - a : a; | | 1199 | absA = zSign ? - a : a; |
1200 | shiftCount = countLeadingZeros32( absA ) + 32; | | 1200 | shiftCount = countLeadingZeros32( absA ) + 32; |
1201 | zSig = absA; | | 1201 | zSig = absA; |
1202 | return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); | | 1202 | return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); |
1203 | | | 1203 | |
1204 | } | | 1204 | } |
1205 | | | 1205 | |
1206 | floatx80 uint32_to_floatx80( uint32 a ) | | 1206 | floatx80 uint32_to_floatx80( uint32 a ) |
1207 | { | | 1207 | { |
1208 | int8 shiftCount; | | 1208 | int8 shiftCount; |
1209 | bits64 zSig = a; | | 1209 | bits64 zSig = a; |
1210 | | | 1210 | |
1211 | if ( a == 0 ) return packFloatx80( 0, 0, 0 ); | | 1211 | if ( a == 0 ) return packFloatx80( 0, 0, 0 ); |
1212 | shiftCount = countLeadingZeros32( a ) + 32; | | 1212 | shiftCount = countLeadingZeros32( a ) + 32; |
1213 | return packFloatx80( 0, 0x403E - shiftCount, zSig<<shiftCount ); | | 1213 | return packFloatx80( 0, 0x403E - shiftCount, zSig<<shiftCount ); |
1214 | | | 1214 | |
1215 | } | | 1215 | } |
1216 | | | 1216 | |
1217 | #endif | | 1217 | #endif |
1218 | | | 1218 | |
1219 | #ifdef FLOAT128 | | 1219 | #ifdef FLOAT128 |
1220 | | | 1220 | |
1221 | /* | | 1221 | /* |
1222 | ------------------------------------------------------------------------------- | | 1222 | ------------------------------------------------------------------------------- |
1223 | Returns the result of converting the 32-bit two's complement integer `a' to | | 1223 | Returns the result of converting the 32-bit two's complement integer `a' to |
1224 | the quadruple-precision floating-point format. The conversion is performed | | 1224 | the quadruple-precision floating-point format. The conversion is performed |
1225 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. | | 1225 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
1226 | ------------------------------------------------------------------------------- | | 1226 | ------------------------------------------------------------------------------- |
1227 | */ | | 1227 | */ |
1228 | float128 int32_to_float128( int32 a ) | | 1228 | float128 int32_to_float128( int32 a ) |
1229 | { | | 1229 | { |
1230 | flag zSign; | | 1230 | flag zSign; |
1231 | uint32 absA; | | 1231 | uint32 absA; |
1232 | int8 shiftCount; | | 1232 | int8 shiftCount; |
1233 | bits64 zSig0; | | 1233 | bits64 zSig0; |
1234 | | | 1234 | |
1235 | if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); | | 1235 | if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); |
1236 | zSign = ( a < 0 ); | | 1236 | zSign = ( a < 0 ); |
1237 | absA = zSign ? - a : a; | | 1237 | absA = zSign ? - a : a; |
1238 | shiftCount = countLeadingZeros32( absA ) + 17; | | 1238 | shiftCount = countLeadingZeros32( absA ) + 17; |
1239 | zSig0 = absA; | | 1239 | zSig0 = absA; |
1240 | return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); | | 1240 | return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); |
1241 | | | 1241 | |
1242 | } | | 1242 | } |
1243 | | | 1243 | |
1244 | float128 uint32_to_float128( uint32 a ) | | 1244 | float128 uint32_to_float128( uint32 a ) |
1245 | { | | 1245 | { |
1246 | int8 shiftCount; | | 1246 | int8 shiftCount; |
1247 | bits64 zSig0 = a; | | 1247 | bits64 zSig0 = a; |
1248 | | | 1248 | |
1249 | if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); | | 1249 | if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); |
1250 | shiftCount = countLeadingZeros32( a ) + 17; | | 1250 | shiftCount = countLeadingZeros32( a ) + 17; |
1251 | return packFloat128( 0, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); | | 1251 | return packFloat128( 0, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); |
1252 | | | 1252 | |
1253 | } | | 1253 | } |
1254 | | | 1254 | |
1255 | #endif | | 1255 | #endif |
1256 | | | 1256 | |
1257 | #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */ | | 1257 | #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */ |
1258 | /* | | 1258 | /* |
1259 | ------------------------------------------------------------------------------- | | 1259 | ------------------------------------------------------------------------------- |
1260 | Returns the result of converting the 64-bit two's complement integer `a' | | 1260 | Returns the result of converting the 64-bit two's complement integer `a' |
1261 | to the single-precision floating-point format. The conversion is performed | | 1261 | to the single-precision floating-point format. The conversion is performed |
1262 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. | | 1262 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
1263 | ------------------------------------------------------------------------------- | | 1263 | ------------------------------------------------------------------------------- |
1264 | */ | | 1264 | */ |
1265 | float32 int64_to_float32( int64 a ) | | 1265 | float32 int64_to_float32( int64 a ) |
1266 | { | | 1266 | { |
1267 | flag zSign; | | 1267 | flag zSign; |
1268 | uint64 absA; | | 1268 | uint64 absA; |
1269 | int8 shiftCount; | | 1269 | int8 shiftCount; |
1270 | | | 1270 | |
1271 | if ( a == 0 ) return 0; | | 1271 | if ( a == 0 ) return 0; |
1272 | zSign = ( a < 0 ); | | 1272 | zSign = ( a < 0 ); |
1273 | absA = zSign ? - a : a; | | 1273 | absA = zSign ? - a : a; |
1274 | shiftCount = countLeadingZeros64( absA ) - 40; | | 1274 | shiftCount = countLeadingZeros64( absA ) - 40; |
1275 | if ( 0 <= shiftCount ) { | | 1275 | if ( 0 <= shiftCount ) { |
1276 | return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); | | 1276 | return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); |
1277 | } | | 1277 | } |
1278 | else { | | 1278 | else { |
1279 | shiftCount += 7; | | 1279 | shiftCount += 7; |
1280 | if ( shiftCount < 0 ) { | | 1280 | if ( shiftCount < 0 ) { |
1281 | shift64RightJamming( absA, - shiftCount, &absA ); | | 1281 | shift64RightJamming( absA, - shiftCount, &absA ); |
1282 | } | | 1282 | } |
1283 | else { | | 1283 | else { |
1284 | absA <<= shiftCount; | | 1284 | absA <<= shiftCount; |
1285 | } | | 1285 | } |
1286 | return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA ); | | 1286 | return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA ); |
1287 | } | | 1287 | } |
1288 | | | 1288 | |
1289 | } | | 1289 | } |
1290 | | | 1290 | |
1291 | /* | | 1291 | /* |
1292 | ------------------------------------------------------------------------------- | | 1292 | ------------------------------------------------------------------------------- |
1293 | Returns the result of converting the 64-bit two's complement integer `a' | | 1293 | Returns the result of converting the 64-bit two's complement integer `a' |
1294 | to the double-precision floating-point format. The conversion is performed | | 1294 | to the double-precision floating-point format. The conversion is performed |
1295 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. | | 1295 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
1296 | ------------------------------------------------------------------------------- | | 1296 | ------------------------------------------------------------------------------- |
1297 | */ | | 1297 | */ |
1298 | float64 int64_to_float64( int64 a ) | | 1298 | float64 int64_to_float64( int64 a ) |
1299 | { | | 1299 | { |
1300 | flag zSign; | | 1300 | flag zSign; |
1301 | | | 1301 | |
1302 | if ( a == 0 ) return 0; | | 1302 | if ( a == 0 ) return 0; |
1303 | if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) { | | 1303 | if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) { |
1304 | return packFloat64( 1, 0x43E, 0 ); | | 1304 | return packFloat64( 1, 0x43E, 0 ); |
1305 | } | | 1305 | } |
1306 | zSign = ( a < 0 ); | | 1306 | zSign = ( a < 0 ); |
1307 | return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a ); | | 1307 | return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a ); |
1308 | | | 1308 | |
1309 | } | | 1309 | } |
1310 | | | 1310 | |
1311 | #ifdef FLOATX80 | | 1311 | #ifdef FLOATX80 |
1312 | | | 1312 | |
1313 | /* | | 1313 | /* |
1314 | ------------------------------------------------------------------------------- | | 1314 | ------------------------------------------------------------------------------- |
1315 | Returns the result of converting the 64-bit two's complement integer `a' | | 1315 | Returns the result of converting the 64-bit two's complement integer `a' |
1316 | to the extended double-precision floating-point format. The conversion | | 1316 | to the extended double-precision floating-point format. The conversion |
1317 | is performed according to the IEC/IEEE Standard for Binary Floating-Point | | 1317 | is performed according to the IEC/IEEE Standard for Binary Floating-Point |
1318 | Arithmetic. | | 1318 | Arithmetic. |
1319 | ------------------------------------------------------------------------------- | | 1319 | ------------------------------------------------------------------------------- |
1320 | */ | | 1320 | */ |
1321 | floatx80 int64_to_floatx80( int64 a ) | | 1321 | floatx80 int64_to_floatx80( int64 a ) |
1322 | { | | 1322 | { |
1323 | flag zSign; | | 1323 | flag zSign; |
1324 | uint64 absA; | | 1324 | uint64 absA; |
1325 | int8 shiftCount; | | 1325 | int8 shiftCount; |
1326 | | | 1326 | |
1327 | if ( a == 0 ) return packFloatx80( 0, 0, 0 ); | | 1327 | if ( a == 0 ) return packFloatx80( 0, 0, 0 ); |
1328 | zSign = ( a < 0 ); | | 1328 | zSign = ( a < 0 ); |
1329 | absA = zSign ? - a : a; | | 1329 | absA = zSign ? - a : a; |
1330 | shiftCount = countLeadingZeros64( absA ); | | 1330 | shiftCount = countLeadingZeros64( absA ); |
1331 | return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); | | 1331 | return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); |
1332 | | | 1332 | |
1333 | } | | 1333 | } |
1334 | | | 1334 | |
1335 | #endif | | 1335 | #endif |
1336 | | | 1336 | |
1337 | #endif /* !SOFTFLOAT_FOR_GCC */ | | 1337 | #endif /* !SOFTFLOAT_FOR_GCC */ |
1338 | | | 1338 | |
1339 | #ifdef FLOAT128 | | 1339 | #ifdef FLOAT128 |
1340 | | | 1340 | |
1341 | /* | | 1341 | /* |
1342 | ------------------------------------------------------------------------------- | | 1342 | ------------------------------------------------------------------------------- |
1343 | Returns the result of converting the 64-bit two's complement integer `a' to | | 1343 | Returns the result of converting the 64-bit two's complement integer `a' to |
1344 | the quadruple-precision floating-point format. The conversion is performed | | 1344 | the quadruple-precision floating-point format. The conversion is performed |
1345 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. | | 1345 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
1346 | ------------------------------------------------------------------------------- | | 1346 | ------------------------------------------------------------------------------- |
1347 | */ | | 1347 | */ |
1348 | float128 int64_to_float128( int64 a ) | | 1348 | float128 int64_to_float128( int64 a ) |
1349 | { | | 1349 | { |
1350 | flag zSign; | | 1350 | flag zSign; |
1351 | uint64 absA; | | 1351 | uint64 absA; |
1352 | int8 shiftCount; | | 1352 | int8 shiftCount; |
1353 | int32 zExp; | | 1353 | int32 zExp; |
1354 | bits64 zSig0, zSig1; | | 1354 | bits64 zSig0, zSig1; |
1355 | | | 1355 | |
1356 | if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); | | 1356 | if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); |
1357 | zSign = ( a < 0 ); | | 1357 | zSign = ( a < 0 ); |
1358 | absA = zSign ? - a : a; | | 1358 | absA = zSign ? - a : a; |
1359 | shiftCount = countLeadingZeros64( absA ) + 49; | | 1359 | shiftCount = countLeadingZeros64( absA ) + 49; |
1360 | zExp = 0x406E - shiftCount; | | 1360 | zExp = 0x406E - shiftCount; |
1361 | if ( 64 <= shiftCount ) { | | 1361 | if ( 64 <= shiftCount ) { |
1362 | zSig1 = 0; | | 1362 | zSig1 = 0; |
1363 | zSig0 = absA; | | 1363 | zSig0 = absA; |
1364 | shiftCount -= 64; | | 1364 | shiftCount -= 64; |
1365 | } | | 1365 | } |
1366 | else { | | 1366 | else { |
1367 | zSig1 = absA; | | 1367 | zSig1 = absA; |
1368 | zSig0 = 0; | | 1368 | zSig0 = 0; |
1369 | } | | 1369 | } |
1370 | shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); | | 1370 | shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); |
1371 | return packFloat128( zSign, zExp, zSig0, zSig1 ); | | 1371 | return packFloat128( zSign, zExp, zSig0, zSig1 ); |
1372 | | | 1372 | |
1373 | } | | 1373 | } |
1374 | | | 1374 | |
1375 | #endif | | 1375 | #endif |
1376 | | | 1376 | |
1377 | #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ | | 1377 | #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ |
1378 | /* | | 1378 | /* |
1379 | ------------------------------------------------------------------------------- | | 1379 | ------------------------------------------------------------------------------- |
1380 | Returns the result of converting the single-precision floating-point value | | 1380 | Returns the result of converting the single-precision floating-point value |
1381 | `a' to the 32-bit two's complement integer format. The conversion is | | 1381 | `a' to the 32-bit two's complement integer format. The conversion is |
1382 | performed according to the IEC/IEEE Standard for Binary Floating-Point | | 1382 | performed according to the IEC/IEEE Standard for Binary Floating-Point |
1383 | Arithmetic---which means in particular that the conversion is rounded | | 1383 | Arithmetic---which means in particular that the conversion is rounded |
1384 | according to the current rounding mode. If `a' is a NaN, the largest | | 1384 | according to the current rounding mode. If `a' is a NaN, the largest |
1385 | positive integer is returned. Otherwise, if the conversion overflows, the | | 1385 | positive integer is returned. Otherwise, if the conversion overflows, the |
1386 | largest integer with the same sign as `a' is returned. | | 1386 | largest integer with the same sign as `a' is returned. |
1387 | ------------------------------------------------------------------------------- | | 1387 | ------------------------------------------------------------------------------- |
1388 | */ | | 1388 | */ |
1389 | int32 float32_to_int32( float32 a ) | | 1389 | int32 float32_to_int32( float32 a ) |
1390 | { | | 1390 | { |
1391 | flag aSign; | | 1391 | flag aSign; |
1392 | int16 aExp, shiftCount; | | 1392 | int16 aExp, shiftCount; |
1393 | bits32 aSig; | | 1393 | bits32 aSig; |
1394 | bits64 aSig64; | | 1394 | bits64 aSig64; |
1395 | | | 1395 | |
1396 | aSig = extractFloat32Frac( a ); | | 1396 | aSig = extractFloat32Frac( a ); |
1397 | aExp = extractFloat32Exp( a ); | | 1397 | aExp = extractFloat32Exp( a ); |
1398 | aSign = extractFloat32Sign( a ); | | 1398 | aSign = extractFloat32Sign( a ); |
1399 | if ( ( aExp == 0xFF ) && aSig ) aSign = 0; | | 1399 | if ( ( aExp == 0xFF ) && aSig ) aSign = 0; |
1400 | if ( aExp ) aSig |= 0x00800000; | | 1400 | if ( aExp ) aSig |= 0x00800000; |
1401 | shiftCount = 0xAF - aExp; | | 1401 | shiftCount = 0xAF - aExp; |
1402 | aSig64 = aSig; | | 1402 | aSig64 = aSig; |
1403 | aSig64 <<= 32; | | 1403 | aSig64 <<= 32; |
1404 | if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); | | 1404 | if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); |
1405 | return roundAndPackInt32( aSign, aSig64 ); | | 1405 | return roundAndPackInt32( aSign, aSig64 ); |
1406 | | | 1406 | |
1407 | } | | 1407 | } |
1408 | #endif /* !SOFTFLOAT_FOR_GCC */ | | 1408 | #endif /* !SOFTFLOAT_FOR_GCC */ |
1409 | | | 1409 | |
1410 | /* | | 1410 | /* |
1411 | ------------------------------------------------------------------------------- | | 1411 | ------------------------------------------------------------------------------- |
1412 | Returns the result of converting the single-precision floating-point value | | 1412 | Returns the result of converting the single-precision floating-point value |
1413 | `a' to the 32-bit two's complement integer format. The conversion is | | 1413 | `a' to the 32-bit two's complement integer format. The conversion is |
1414 | performed according to the IEC/IEEE Standard for Binary Floating-Point | | 1414 | performed according to the IEC/IEEE Standard for Binary Floating-Point |
1415 | Arithmetic, except that the conversion is always rounded toward zero. | | 1415 | Arithmetic, except that the conversion is always rounded toward zero. |
1416 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if | | 1416 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if |
1417 | the conversion overflows, the largest integer with the same sign as `a' is | | 1417 | the conversion overflows, the largest integer with the same sign as `a' is |
1418 | returned. | | 1418 | returned. |
1419 | ------------------------------------------------------------------------------- | | 1419 | ------------------------------------------------------------------------------- |
1420 | */ | | 1420 | */ |
1421 | int32 float32_to_int32_round_to_zero( float32 a ) | | 1421 | int32 float32_to_int32_round_to_zero( float32 a ) |
1422 | { | | 1422 | { |
1423 | flag aSign; | | 1423 | flag aSign; |
1424 | int16 aExp, shiftCount; | | 1424 | int16 aExp, shiftCount; |
1425 | bits32 aSig; | | 1425 | bits32 aSig; |
1426 | int32 z; | | 1426 | int32 z; |
1427 | | | 1427 | |
1428 | aSig = extractFloat32Frac( a ); | | 1428 | aSig = extractFloat32Frac( a ); |
1429 | aExp = extractFloat32Exp( a ); | | 1429 | aExp = extractFloat32Exp( a ); |
1430 | aSign = extractFloat32Sign( a ); | | 1430 | aSign = extractFloat32Sign( a ); |
1431 | shiftCount = aExp - 0x9E; | | 1431 | shiftCount = aExp - 0x9E; |
1432 | if ( 0 <= shiftCount ) { | | 1432 | if ( 0 <= shiftCount ) { |
1433 | if ( a != 0xCF000000 ) { | | 1433 | if ( a != 0xCF000000 ) { |
1434 | float_raise( float_flag_invalid ); | | 1434 | float_raise( float_flag_invalid ); |
1435 | if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; | | 1435 | if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; |
1436 | } | | 1436 | } |
1437 | return (sbits32) 0x80000000; | | 1437 | return (sbits32) 0x80000000; |
1438 | } | | 1438 | } |
1439 | else if ( aExp <= 0x7E ) { | | 1439 | else if ( aExp <= 0x7E ) { |
1440 | if ( aExp | aSig ) set_float_exception_inexact_flag(); | | 1440 | if ( aExp | aSig ) set_float_exception_inexact_flag(); |
1441 | return 0; | | 1441 | return 0; |
1442 | } | | 1442 | } |
1443 | aSig = ( aSig | 0x00800000 )<<8; | | 1443 | aSig = ( aSig | 0x00800000 )<<8; |
1444 | z = aSig>>( - shiftCount ); | | 1444 | z = aSig>>( - shiftCount ); |
1445 | if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { | | 1445 | if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { |
1446 | set_float_exception_inexact_flag(); | | 1446 | set_float_exception_inexact_flag(); |
1447 | } | | 1447 | } |
1448 | if ( aSign ) z = - z; | | 1448 | if ( aSign ) z = - z; |
1449 | return z; | | 1449 | return z; |
1450 | | | 1450 | |
1451 | } | | 1451 | } |
1452 | | | 1452 | |
1453 | #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */ | | 1453 | #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */ |
1454 | /* | | 1454 | /* |
1455 | ------------------------------------------------------------------------------- | | 1455 | ------------------------------------------------------------------------------- |
1456 | Returns the result of converting the single-precision floating-point value | | 1456 | Returns the result of converting the single-precision floating-point value |
1457 | `a' to the 64-bit two's complement integer format. The conversion is | | 1457 | `a' to the 64-bit two's complement integer format. The conversion is |
1458 | performed according to the IEC/IEEE Standard for Binary Floating-Point | | 1458 | performed according to the IEC/IEEE Standard for Binary Floating-Point |
1459 | Arithmetic---which means in particular that the conversion is rounded | | 1459 | Arithmetic---which means in particular that the conversion is rounded |
1460 | according to the current rounding mode. If `a' is a NaN, the largest | | 1460 | according to the current rounding mode. If `a' is a NaN, the largest |
1461 | positive integer is returned. Otherwise, if the conversion overflows, the | | 1461 | positive integer is returned. Otherwise, if the conversion overflows, the |
1462 | largest integer with the same sign as `a' is returned. | | 1462 | largest integer with the same sign as `a' is returned. |
1463 | ------------------------------------------------------------------------------- | | 1463 | ------------------------------------------------------------------------------- |
1464 | */ | | 1464 | */ |
1465 | int64 float32_to_int64( float32 a ) | | 1465 | int64 float32_to_int64( float32 a ) |
1466 | { | | 1466 | { |
1467 | flag aSign; | | 1467 | flag aSign; |
1468 | int16 aExp, shiftCount; | | 1468 | int16 aExp, shiftCount; |
1469 | bits32 aSig; | | 1469 | bits32 aSig; |
1470 | bits64 aSig64, aSigExtra; | | 1470 | bits64 aSig64, aSigExtra; |
1471 | | | 1471 | |
1472 | aSig = extractFloat32Frac( a ); | | 1472 | aSig = extractFloat32Frac( a ); |
1473 | aExp = extractFloat32Exp( a ); | | 1473 | aExp = extractFloat32Exp( a ); |
1474 | aSign = extractFloat32Sign( a ); | | 1474 | aSign = extractFloat32Sign( a ); |
1475 | shiftCount = 0xBE - aExp; | | 1475 | shiftCount = 0xBE - aExp; |
1476 | if ( shiftCount < 0 ) { | | 1476 | if ( shiftCount < 0 ) { |
1477 | float_raise( float_flag_invalid ); | | 1477 | float_raise( float_flag_invalid ); |
1478 | if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { | | 1478 | if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { |
1479 | return LIT64( 0x7FFFFFFFFFFFFFFF ); | | 1479 | return LIT64( 0x7FFFFFFFFFFFFFFF ); |
1480 | } | | 1480 | } |
1481 | return (sbits64) LIT64( 0x8000000000000000 ); | | 1481 | return (sbits64) LIT64( 0x8000000000000000 ); |
1482 | } | | 1482 | } |
1483 | if ( aExp ) aSig |= 0x00800000; | | 1483 | if ( aExp ) aSig |= 0x00800000; |
1484 | aSig64 = aSig; | | 1484 | aSig64 = aSig; |
1485 | aSig64 <<= 40; | | 1485 | aSig64 <<= 40; |
1486 | shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); | | 1486 | shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); |
1487 | return roundAndPackInt64( aSign, aSig64, aSigExtra ); | | 1487 | return roundAndPackInt64( aSign, aSig64, aSigExtra ); |
1488 | | | 1488 | |
1489 | } | | 1489 | } |
1490 | | | 1490 | |
1491 | /* | | 1491 | /* |
1492 | ------------------------------------------------------------------------------- | | 1492 | ------------------------------------------------------------------------------- |
1493 | Returns the result of converting the single-precision floating-point value | | 1493 | Returns the result of converting the single-precision floating-point value |
1494 | `a' to the 64-bit two's complement integer format. The conversion is | | 1494 | `a' to the 64-bit two's complement integer format. The conversion is |
1495 | performed according to the IEC/IEEE Standard for Binary Floating-Point | | 1495 | performed according to the IEC/IEEE Standard for Binary Floating-Point |
1496 | Arithmetic, except that the conversion is always rounded toward zero. If | | 1496 | Arithmetic, except that the conversion is always rounded toward zero. If |
1497 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the | | 1497 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the |
1498 | conversion overflows, the largest integer with the same sign as `a' is | | 1498 | conversion overflows, the largest integer with the same sign as `a' is |
1499 | returned. | | 1499 | returned. |
1500 | ------------------------------------------------------------------------------- | | 1500 | ------------------------------------------------------------------------------- |
1501 | */ | | 1501 | */ |
1502 | int64 float32_to_int64_round_to_zero( float32 a ) | | 1502 | int64 float32_to_int64_round_to_zero( float32 a ) |
1503 | { | | 1503 | { |
1504 | flag aSign; | | 1504 | flag aSign; |
1505 | int16 aExp, shiftCount; | | 1505 | int16 aExp, shiftCount; |
1506 | bits32 aSig; | | 1506 | bits32 aSig; |
1507 | bits64 aSig64; | | 1507 | bits64 aSig64; |
1508 | int64 z; | | 1508 | int64 z; |
1509 | | | 1509 | |
1510 | aSig = extractFloat32Frac( a ); | | 1510 | aSig = extractFloat32Frac( a ); |
1511 | aExp = extractFloat32Exp( a ); | | 1511 | aExp = extractFloat32Exp( a ); |
1512 | aSign = extractFloat32Sign( a ); | | 1512 | aSign = extractFloat32Sign( a ); |
1513 | shiftCount = aExp - 0xBE; | | 1513 | shiftCount = aExp - 0xBE; |
1514 | if ( 0 <= shiftCount ) { | | 1514 | if ( 0 <= shiftCount ) { |
1515 | if ( a != 0xDF000000 ) { | | 1515 | if ( a != 0xDF000000 ) { |
1516 | float_raise( float_flag_invalid ); | | 1516 | float_raise( float_flag_invalid ); |
1517 | if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { | | 1517 | if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { |
1518 | return LIT64( 0x7FFFFFFFFFFFFFFF ); | | 1518 | return LIT64( 0x7FFFFFFFFFFFFFFF ); |
1519 | } | | 1519 | } |
1520 | } | | 1520 | } |
1521 | return (sbits64) LIT64( 0x8000000000000000 ); | | 1521 | return (sbits64) LIT64( 0x8000000000000000 ); |
1522 | } | | 1522 | } |
1523 | else if ( aExp <= 0x7E ) { | | 1523 | else if ( aExp <= 0x7E ) { |
1524 | if ( aExp | aSig ) set_float_exception_inexact_flag(); | | 1524 | if ( aExp | aSig ) set_float_exception_inexact_flag(); |
1525 | return 0; | | 1525 | return 0; |
1526 | } | | 1526 | } |
1527 | aSig64 = aSig | 0x00800000; | | 1527 | aSig64 = aSig | 0x00800000; |
1528 | aSig64 <<= 40; | | 1528 | aSig64 <<= 40; |
1529 | z = aSig64>>( - shiftCount ); | | 1529 | z = aSig64>>( - shiftCount ); |
1530 | if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) { | | 1530 | if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) { |
1531 | set_float_exception_inexact_flag(); | | 1531 | set_float_exception_inexact_flag(); |
1532 | } | | 1532 | } |
1533 | if ( aSign ) z = - z; | | 1533 | if ( aSign ) z = - z; |
1534 | return z; | | 1534 | return z; |
1535 | | | 1535 | |
1536 | } | | 1536 | } |
1537 | #endif /* !SOFTFLOAT_FOR_GCC */ | | 1537 | #endif /* !SOFTFLOAT_FOR_GCC */ |
1538 | | | 1538 | |
1539 | /* | | 1539 | /* |
1540 | ------------------------------------------------------------------------------- | | 1540 | ------------------------------------------------------------------------------- |
1541 | Returns the result of converting the single-precision floating-point value | | 1541 | Returns the result of converting the single-precision floating-point value |
1542 | `a' to the double-precision floating-point format. The conversion is | | 1542 | `a' to the double-precision floating-point format. The conversion is |
1543 | performed according to the IEC/IEEE Standard for Binary Floating-Point | | 1543 | performed according to the IEC/IEEE Standard for Binary Floating-Point |
1544 | Arithmetic. | | 1544 | Arithmetic. |
1545 | ------------------------------------------------------------------------------- | | 1545 | ------------------------------------------------------------------------------- |
1546 | */ | | 1546 | */ |
1547 | float64 float32_to_float64( float32 a ) | | 1547 | float64 float32_to_float64( float32 a ) |
1548 | { | | 1548 | { |
1549 | flag aSign; | | 1549 | flag aSign; |
1550 | int16 aExp; | | 1550 | int16 aExp; |
1551 | bits32 aSig; | | 1551 | bits32 aSig; |
1552 | | | 1552 | |
1553 | aSig = extractFloat32Frac( a ); | | 1553 | aSig = extractFloat32Frac( a ); |
1554 | aExp = extractFloat32Exp( a ); | | 1554 | aExp = extractFloat32Exp( a ); |
1555 | aSign = extractFloat32Sign( a ); | | 1555 | aSign = extractFloat32Sign( a ); |
1556 | if ( aExp == 0xFF ) { | | 1556 | if ( aExp == 0xFF ) { |
1557 | if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) ); | | 1557 | if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) ); |
1558 | return packFloat64( aSign, 0x7FF, 0 ); | | 1558 | return packFloat64( aSign, 0x7FF, 0 ); |
1559 | } | | 1559 | } |
1560 | if ( aExp == 0 ) { | | 1560 | if ( aExp == 0 ) { |
1561 | if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); | | 1561 | if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); |
1562 | normalizeFloat32Subnormal( aSig, &aExp, &aSig ); | | 1562 | normalizeFloat32Subnormal( aSig, &aExp, &aSig ); |
1563 | --aExp; | | 1563 | --aExp; |
1564 | } | | 1564 | } |
1565 | return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 ); | | 1565 | return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 ); |
1566 | | | 1566 | |
1567 | } | | 1567 | } |
1568 | | | 1568 | |
1569 | #ifdef FLOATX80 | | 1569 | #ifdef FLOATX80 |
1570 | | | 1570 | |
1571 | /* | | 1571 | /* |
1572 | ------------------------------------------------------------------------------- | | 1572 | ------------------------------------------------------------------------------- |
1573 | Returns the result of converting the single-precision floating-point value | | 1573 | Returns the result of converting the single-precision floating-point value |
1574 | `a' to the extended double-precision floating-point format. The conversion | | 1574 | `a' to the extended double-precision floating-point format. The conversion |
1575 | is performed according to the IEC/IEEE Standard for Binary Floating-Point | | 1575 | is performed according to the IEC/IEEE Standard for Binary Floating-Point |
1576 | Arithmetic. | | 1576 | Arithmetic. |
1577 | ------------------------------------------------------------------------------- | | 1577 | ------------------------------------------------------------------------------- |
1578 | */ | | 1578 | */ |
1579 | floatx80 float32_to_floatx80( float32 a ) | | 1579 | floatx80 float32_to_floatx80( float32 a ) |
1580 | { | | 1580 | { |
1581 | flag aSign; | | 1581 | flag aSign; |
1582 | int16 aExp; | | 1582 | int16 aExp; |
1583 | bits32 aSig; | | 1583 | bits32 aSig; |
1584 | | | 1584 | |
1585 | aSig = extractFloat32Frac( a ); | | 1585 | aSig = extractFloat32Frac( a ); |
1586 | aExp = extractFloat32Exp( a ); | | 1586 | aExp = extractFloat32Exp( a ); |
1587 | aSign = extractFloat32Sign( a ); | | 1587 | aSign = extractFloat32Sign( a ); |
1588 | if ( aExp == 0xFF ) { | | 1588 | if ( aExp == 0xFF ) { |
1589 | if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) ); | | 1589 | if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) ); |
1590 | return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); | | 1590 | return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); |
1591 | } | | 1591 | } |
1592 | if ( aExp == 0 ) { | | 1592 | if ( aExp == 0 ) { |
1593 | if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); | | 1593 | if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); |
1594 | normalizeFloat32Subnormal( aSig, &aExp, &aSig ); | | 1594 | normalizeFloat32Subnormal( aSig, &aExp, &aSig ); |
1595 | } | | 1595 | } |
1596 | aSig |= 0x00800000; | | 1596 | aSig |= 0x00800000; |
1597 | return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 ); | | 1597 | return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 ); |
1598 | | | 1598 | |
1599 | } | | 1599 | } |
1600 | | | 1600 | |
1601 | #endif | | 1601 | #endif |
1602 | | | 1602 | |
1603 | #ifdef FLOAT128 | | 1603 | #ifdef FLOAT128 |
1604 | | | 1604 | |
1605 | /* | | 1605 | /* |
1606 | ------------------------------------------------------------------------------- | | 1606 | ------------------------------------------------------------------------------- |
1607 | Returns the result of converting the single-precision floating-point value | | 1607 | Returns the result of converting the single-precision floating-point value |
1608 | `a' to the double-precision floating-point format. The conversion is | | 1608 | `a' to the double-precision floating-point format. The conversion is |
1609 | performed according to the IEC/IEEE Standard for Binary Floating-Point | | 1609 | performed according to the IEC/IEEE Standard for Binary Floating-Point |
1610 | Arithmetic. | | 1610 | Arithmetic. |
1611 | ------------------------------------------------------------------------------- | | 1611 | ------------------------------------------------------------------------------- |
1612 | */ | | 1612 | */ |
1613 | float128 float32_to_float128( float32 a ) | | 1613 | float128 float32_to_float128( float32 a ) |
1614 | { | | 1614 | { |
1615 | flag aSign; | | 1615 | flag aSign; |
1616 | int16 aExp; | | 1616 | int16 aExp; |
1617 | bits32 aSig; | | 1617 | bits32 aSig; |
1618 | | | 1618 | |
1619 | aSig = extractFloat32Frac( a ); | | 1619 | aSig = extractFloat32Frac( a ); |
1620 | aExp = extractFloat32Exp( a ); | | 1620 | aExp = extractFloat32Exp( a ); |
1621 | aSign = extractFloat32Sign( a ); | | 1621 | aSign = extractFloat32Sign( a ); |
1622 | if ( aExp == 0xFF ) { | | 1622 | if ( aExp == 0xFF ) { |
1623 | if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) ); | | 1623 | if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) ); |
1624 | return packFloat128( aSign, 0x7FFF, 0, 0 ); | | 1624 | return packFloat128( aSign, 0x7FFF, 0, 0 ); |
1625 | } | | 1625 | } |
1626 | if ( aExp == 0 ) { | | 1626 | if ( aExp == 0 ) { |
1627 | if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); | | 1627 | if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); |
1628 | normalizeFloat32Subnormal( aSig, &aExp, &aSig ); | | 1628 | normalizeFloat32Subnormal( aSig, &aExp, &aSig ); |
1629 | --aExp; | | 1629 | --aExp; |
1630 | } | | 1630 | } |
1631 | return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 ); | | 1631 | return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 ); |
1632 | | | 1632 | |
1633 | } | | 1633 | } |
1634 | | | 1634 | |
1635 | #endif | | 1635 | #endif |
1636 | | | 1636 | |
1637 | #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ | | 1637 | #ifndef SOFTFLOAT_FOR_GCC /* Not needed */ |
1638 | /* | | 1638 | /* |
1639 | ------------------------------------------------------------------------------- | | 1639 | ------------------------------------------------------------------------------- |
1640 | Rounds the single-precision floating-point value `a' to an integer, and | | 1640 | Rounds the single-precision floating-point value `a' to an integer, and |
1641 | returns the result as a single-precision floating-point value. The | | 1641 | returns the result as a single-precision floating-point value. The |
1642 | operation is performed according to the IEC/IEEE Standard for Binary | | 1642 | operation is performed according to the IEC/IEEE Standard for Binary |
1643 | Floating-Point Arithmetic. | | 1643 | Floating-Point Arithmetic. |
1644 | ------------------------------------------------------------------------------- | | 1644 | ------------------------------------------------------------------------------- |
1645 | */ | | 1645 | */ |
1646 | float32 float32_round_to_int( float32 a ) | | 1646 | float32 float32_round_to_int( float32 a ) |
1647 | { | | 1647 | { |
1648 | flag aSign; | | 1648 | flag aSign; |
1649 | int16 aExp; | | 1649 | int16 aExp; |
1650 | bits32 lastBitMask, roundBitsMask; | | 1650 | bits32 lastBitMask, roundBitsMask; |
1651 | int8 roundingMode; | | 1651 | int8 roundingMode; |
1652 | float32 z; | | 1652 | float32 z; |
1653 | | | 1653 | |
1654 | aExp = extractFloat32Exp( a ); | | 1654 | aExp = extractFloat32Exp( a ); |
1655 | if ( 0x96 <= aExp ) { | | 1655 | if ( 0x96 <= aExp ) { |
1656 | if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { | | 1656 | if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { |
1657 | return propagateFloat32NaN( a, a ); | | 1657 | return propagateFloat32NaN( a, a ); |
1658 | } | | 1658 | } |
1659 | return a; | | 1659 | return a; |
1660 | } | | 1660 | } |
1661 | if ( aExp <= 0x7E ) { | | 1661 | if ( aExp <= 0x7E ) { |
1662 | if ( (bits32) ( a<<1 ) == 0 ) return a; | | 1662 | if ( (bits32) ( a<<1 ) == 0 ) return a; |
1663 | set_float_exception_inexact_flag(); | | 1663 | set_float_exception_inexact_flag(); |
| @@ -2942,2010 +2942,2010 @@ float64 float64_div( float64 a, float64 | | | @@ -2942,2010 +2942,2010 @@ float64 float64_div( float64 a, float64 |
2942 | if ( bExp == 0x7FF ) { | | 2942 | if ( bExp == 0x7FF ) { |
2943 | if ( bSig ) return propagateFloat64NaN( a, b ); | | 2943 | if ( bSig ) return propagateFloat64NaN( a, b ); |
2944 | float_raise( float_flag_invalid ); | | 2944 | float_raise( float_flag_invalid ); |
2945 | return float64_default_nan; | | 2945 | return float64_default_nan; |
2946 | } | | 2946 | } |
2947 | return packFloat64( zSign, 0x7FF, 0 ); | | 2947 | return packFloat64( zSign, 0x7FF, 0 ); |
2948 | } | | 2948 | } |
2949 | if ( bExp == 0x7FF ) { | | 2949 | if ( bExp == 0x7FF ) { |
2950 | if ( bSig ) return propagateFloat64NaN( a, b ); | | 2950 | if ( bSig ) return propagateFloat64NaN( a, b ); |
2951 | return packFloat64( zSign, 0, 0 ); | | 2951 | return packFloat64( zSign, 0, 0 ); |
2952 | } | | 2952 | } |
2953 | if ( bExp == 0 ) { | | 2953 | if ( bExp == 0 ) { |
2954 | if ( bSig == 0 ) { | | 2954 | if ( bSig == 0 ) { |
2955 | if ( ( aExp | aSig ) == 0 ) { | | 2955 | if ( ( aExp | aSig ) == 0 ) { |
2956 | float_raise( float_flag_invalid ); | | 2956 | float_raise( float_flag_invalid ); |
2957 | return float64_default_nan; | | 2957 | return float64_default_nan; |
2958 | } | | 2958 | } |
2959 | float_raise( float_flag_divbyzero ); | | 2959 | float_raise( float_flag_divbyzero ); |
2960 | return packFloat64( zSign, 0x7FF, 0 ); | | 2960 | return packFloat64( zSign, 0x7FF, 0 ); |
2961 | } | | 2961 | } |
2962 | normalizeFloat64Subnormal( bSig, &bExp, &bSig ); | | 2962 | normalizeFloat64Subnormal( bSig, &bExp, &bSig ); |
2963 | } | | 2963 | } |
2964 | if ( aExp == 0 ) { | | 2964 | if ( aExp == 0 ) { |
2965 | if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); | | 2965 | if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); |
2966 | normalizeFloat64Subnormal( aSig, &aExp, &aSig ); | | 2966 | normalizeFloat64Subnormal( aSig, &aExp, &aSig ); |
2967 | } | | 2967 | } |
2968 | zExp = aExp - bExp + 0x3FD; | | 2968 | zExp = aExp - bExp + 0x3FD; |
2969 | aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; | | 2969 | aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; |
2970 | bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; | | 2970 | bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; |
2971 | if ( bSig <= ( aSig + aSig ) ) { | | 2971 | if ( bSig <= ( aSig + aSig ) ) { |
2972 | aSig >>= 1; | | 2972 | aSig >>= 1; |
2973 | ++zExp; | | 2973 | ++zExp; |
2974 | } | | 2974 | } |
2975 | zSig = estimateDiv128To64( aSig, 0, bSig ); | | 2975 | zSig = estimateDiv128To64( aSig, 0, bSig ); |
2976 | if ( ( zSig & 0x1FF ) <= 2 ) { | | 2976 | if ( ( zSig & 0x1FF ) <= 2 ) { |
2977 | mul64To128( bSig, zSig, &term0, &term1 ); | | 2977 | mul64To128( bSig, zSig, &term0, &term1 ); |
2978 | sub128( aSig, 0, term0, term1, &rem0, &rem1 ); | | 2978 | sub128( aSig, 0, term0, term1, &rem0, &rem1 ); |
2979 | while ( (sbits64) rem0 < 0 ) { | | 2979 | while ( (sbits64) rem0 < 0 ) { |
2980 | --zSig; | | 2980 | --zSig; |
2981 | add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); | | 2981 | add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); |
2982 | } | | 2982 | } |
2983 | zSig |= ( rem1 != 0 ); | | 2983 | zSig |= ( rem1 != 0 ); |
2984 | } | | 2984 | } |
2985 | return roundAndPackFloat64( zSign, zExp, zSig ); | | 2985 | return roundAndPackFloat64( zSign, zExp, zSig ); |
2986 | | | 2986 | |
2987 | } | | 2987 | } |
2988 | | | 2988 | |
2989 | #ifndef SOFTFLOAT_FOR_GCC | | 2989 | #ifndef SOFTFLOAT_FOR_GCC |
2990 | /* | | 2990 | /* |
2991 | ------------------------------------------------------------------------------- | | 2991 | ------------------------------------------------------------------------------- |
2992 | Returns the remainder of the double-precision floating-point value `a' | | 2992 | Returns the remainder of the double-precision floating-point value `a' |
2993 | with respect to the corresponding value `b'. The operation is performed | | 2993 | with respect to the corresponding value `b'. The operation is performed |
2994 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. | | 2994 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
2995 | ------------------------------------------------------------------------------- | | 2995 | ------------------------------------------------------------------------------- |
2996 | */ | | 2996 | */ |
2997 | float64 float64_rem( float64 a, float64 b ) | | 2997 | float64 float64_rem( float64 a, float64 b ) |
2998 | { | | 2998 | { |
2999 | flag aSign, bSign, zSign; | | 2999 | flag aSign, bSign, zSign; |
3000 | int16 aExp, bExp, expDiff; | | 3000 | int16 aExp, bExp, expDiff; |
3001 | bits64 aSig, bSig; | | 3001 | bits64 aSig, bSig; |
3002 | bits64 q, alternateASig; | | 3002 | bits64 q, alternateASig; |
3003 | sbits64 sigMean; | | 3003 | sbits64 sigMean; |
3004 | | | 3004 | |
3005 | aSig = extractFloat64Frac( a ); | | 3005 | aSig = extractFloat64Frac( a ); |
3006 | aExp = extractFloat64Exp( a ); | | 3006 | aExp = extractFloat64Exp( a ); |
3007 | aSign = extractFloat64Sign( a ); | | 3007 | aSign = extractFloat64Sign( a ); |
3008 | bSig = extractFloat64Frac( b ); | | 3008 | bSig = extractFloat64Frac( b ); |
3009 | bExp = extractFloat64Exp( b ); | | 3009 | bExp = extractFloat64Exp( b ); |
3010 | bSign = extractFloat64Sign( b ); | | 3010 | bSign = extractFloat64Sign( b ); |
3011 | if ( aExp == 0x7FF ) { | | 3011 | if ( aExp == 0x7FF ) { |
3012 | if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { | | 3012 | if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { |
3013 | return propagateFloat64NaN( a, b ); | | 3013 | return propagateFloat64NaN( a, b ); |
3014 | } | | 3014 | } |
3015 | float_raise( float_flag_invalid ); | | 3015 | float_raise( float_flag_invalid ); |
3016 | return float64_default_nan; | | 3016 | return float64_default_nan; |
3017 | } | | 3017 | } |
3018 | if ( bExp == 0x7FF ) { | | 3018 | if ( bExp == 0x7FF ) { |
3019 | if ( bSig ) return propagateFloat64NaN( a, b ); | | 3019 | if ( bSig ) return propagateFloat64NaN( a, b ); |
3020 | return a; | | 3020 | return a; |
3021 | } | | 3021 | } |
3022 | if ( bExp == 0 ) { | | 3022 | if ( bExp == 0 ) { |
3023 | if ( bSig == 0 ) { | | 3023 | if ( bSig == 0 ) { |
3024 | float_raise( float_flag_invalid ); | | 3024 | float_raise( float_flag_invalid ); |
3025 | return float64_default_nan; | | 3025 | return float64_default_nan; |
3026 | } | | 3026 | } |
3027 | normalizeFloat64Subnormal( bSig, &bExp, &bSig ); | | 3027 | normalizeFloat64Subnormal( bSig, &bExp, &bSig ); |
3028 | } | | 3028 | } |
3029 | if ( aExp == 0 ) { | | 3029 | if ( aExp == 0 ) { |
3030 | if ( aSig == 0 ) return a; | | 3030 | if ( aSig == 0 ) return a; |
3031 | normalizeFloat64Subnormal( aSig, &aExp, &aSig ); | | 3031 | normalizeFloat64Subnormal( aSig, &aExp, &aSig ); |
3032 | } | | 3032 | } |
3033 | expDiff = aExp - bExp; | | 3033 | expDiff = aExp - bExp; |
3034 | aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; | | 3034 | aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; |
3035 | bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; | | 3035 | bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; |
3036 | if ( expDiff < 0 ) { | | 3036 | if ( expDiff < 0 ) { |
3037 | if ( expDiff < -1 ) return a; | | 3037 | if ( expDiff < -1 ) return a; |
3038 | aSig >>= 1; | | 3038 | aSig >>= 1; |
3039 | } | | 3039 | } |
3040 | q = ( bSig <= aSig ); | | 3040 | q = ( bSig <= aSig ); |
3041 | if ( q ) aSig -= bSig; | | 3041 | if ( q ) aSig -= bSig; |
3042 | expDiff -= 64; | | 3042 | expDiff -= 64; |
3043 | while ( 0 < expDiff ) { | | 3043 | while ( 0 < expDiff ) { |
3044 | q = estimateDiv128To64( aSig, 0, bSig ); | | 3044 | q = estimateDiv128To64( aSig, 0, bSig ); |
3045 | q = ( 2 < q ) ? q - 2 : 0; | | 3045 | q = ( 2 < q ) ? q - 2 : 0; |
3046 | aSig = - ( ( bSig>>2 ) * q ); | | 3046 | aSig = - ( ( bSig>>2 ) * q ); |
3047 | expDiff -= 62; | | 3047 | expDiff -= 62; |
3048 | } | | 3048 | } |
3049 | expDiff += 64; | | 3049 | expDiff += 64; |
3050 | if ( 0 < expDiff ) { | | 3050 | if ( 0 < expDiff ) { |
3051 | q = estimateDiv128To64( aSig, 0, bSig ); | | 3051 | q = estimateDiv128To64( aSig, 0, bSig ); |
3052 | q = ( 2 < q ) ? q - 2 : 0; | | 3052 | q = ( 2 < q ) ? q - 2 : 0; |
3053 | q >>= 64 - expDiff; | | 3053 | q >>= 64 - expDiff; |
3054 | bSig >>= 2; | | 3054 | bSig >>= 2; |
3055 | aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; | | 3055 | aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; |
3056 | } | | 3056 | } |
3057 | else { | | 3057 | else { |
3058 | aSig >>= 2; | | 3058 | aSig >>= 2; |
3059 | bSig >>= 2; | | 3059 | bSig >>= 2; |
3060 | } | | 3060 | } |
3061 | do { | | 3061 | do { |
3062 | alternateASig = aSig; | | 3062 | alternateASig = aSig; |
3063 | ++q; | | 3063 | ++q; |
3064 | aSig -= bSig; | | 3064 | aSig -= bSig; |
3065 | } while ( 0 <= (sbits64) aSig ); | | 3065 | } while ( 0 <= (sbits64) aSig ); |
3066 | sigMean = aSig + alternateASig; | | 3066 | sigMean = aSig + alternateASig; |
3067 | if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { | | 3067 | if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { |
3068 | aSig = alternateASig; | | 3068 | aSig = alternateASig; |
3069 | } | | 3069 | } |
3070 | zSign = ( (sbits64) aSig < 0 ); | | 3070 | zSign = ( (sbits64) aSig < 0 ); |
3071 | if ( zSign ) aSig = - aSig; | | 3071 | if ( zSign ) aSig = - aSig; |
3072 | return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig ); | | 3072 | return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig ); |
3073 | | | 3073 | |
3074 | } | | 3074 | } |
3075 | | | 3075 | |
3076 | /* | | 3076 | /* |
3077 | ------------------------------------------------------------------------------- | | 3077 | ------------------------------------------------------------------------------- |
3078 | Returns the square root of the double-precision floating-point value `a'. | | 3078 | Returns the square root of the double-precision floating-point value `a'. |
3079 | The operation is performed according to the IEC/IEEE Standard for Binary | | 3079 | The operation is performed according to the IEC/IEEE Standard for Binary |
3080 | Floating-Point Arithmetic. | | 3080 | Floating-Point Arithmetic. |
3081 | ------------------------------------------------------------------------------- | | 3081 | ------------------------------------------------------------------------------- |
3082 | */ | | 3082 | */ |
3083 | float64 float64_sqrt( float64 a ) | | 3083 | float64 float64_sqrt( float64 a ) |
3084 | { | | 3084 | { |
3085 | flag aSign; | | 3085 | flag aSign; |
3086 | int16 aExp, zExp; | | 3086 | int16 aExp, zExp; |
3087 | bits64 aSig, zSig, doubleZSig; | | 3087 | bits64 aSig, zSig, doubleZSig; |
3088 | bits64 rem0, rem1, term0, term1; | | 3088 | bits64 rem0, rem1, term0, term1; |
3089 | | | 3089 | |
3090 | aSig = extractFloat64Frac( a ); | | 3090 | aSig = extractFloat64Frac( a ); |
3091 | aExp = extractFloat64Exp( a ); | | 3091 | aExp = extractFloat64Exp( a ); |
3092 | aSign = extractFloat64Sign( a ); | | 3092 | aSign = extractFloat64Sign( a ); |
3093 | if ( aExp == 0x7FF ) { | | 3093 | if ( aExp == 0x7FF ) { |
3094 | if ( aSig ) return propagateFloat64NaN( a, a ); | | 3094 | if ( aSig ) return propagateFloat64NaN( a, a ); |
3095 | if ( ! aSign ) return a; | | 3095 | if ( ! aSign ) return a; |
3096 | float_raise( float_flag_invalid ); | | 3096 | float_raise( float_flag_invalid ); |
3097 | return float64_default_nan; | | 3097 | return float64_default_nan; |
3098 | } | | 3098 | } |
3099 | if ( aSign ) { | | 3099 | if ( aSign ) { |
3100 | if ( ( aExp | aSig ) == 0 ) return a; | | 3100 | if ( ( aExp | aSig ) == 0 ) return a; |
3101 | float_raise( float_flag_invalid ); | | 3101 | float_raise( float_flag_invalid ); |
3102 | return float64_default_nan; | | 3102 | return float64_default_nan; |
3103 | } | | 3103 | } |
3104 | if ( aExp == 0 ) { | | 3104 | if ( aExp == 0 ) { |
3105 | if ( aSig == 0 ) return 0; | | 3105 | if ( aSig == 0 ) return 0; |
3106 | normalizeFloat64Subnormal( aSig, &aExp, &aSig ); | | 3106 | normalizeFloat64Subnormal( aSig, &aExp, &aSig ); |
3107 | } | | 3107 | } |
3108 | zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; | | 3108 | zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; |
3109 | aSig |= LIT64( 0x0010000000000000 ); | | 3109 | aSig |= LIT64( 0x0010000000000000 ); |
3110 | zSig = estimateSqrt32( aExp, aSig>>21 ); | | 3110 | zSig = estimateSqrt32( aExp, aSig>>21 ); |
3111 | aSig <<= 9 - ( aExp & 1 ); | | 3111 | aSig <<= 9 - ( aExp & 1 ); |
3112 | zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); | | 3112 | zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); |
3113 | if ( ( zSig & 0x1FF ) <= 5 ) { | | 3113 | if ( ( zSig & 0x1FF ) <= 5 ) { |
3114 | doubleZSig = zSig<<1; | | 3114 | doubleZSig = zSig<<1; |
3115 | mul64To128( zSig, zSig, &term0, &term1 ); | | 3115 | mul64To128( zSig, zSig, &term0, &term1 ); |
3116 | sub128( aSig, 0, term0, term1, &rem0, &rem1 ); | | 3116 | sub128( aSig, 0, term0, term1, &rem0, &rem1 ); |
3117 | while ( (sbits64) rem0 < 0 ) { | | 3117 | while ( (sbits64) rem0 < 0 ) { |
3118 | --zSig; | | 3118 | --zSig; |
3119 | doubleZSig -= 2; | | 3119 | doubleZSig -= 2; |
3120 | add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); | | 3120 | add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); |
3121 | } | | 3121 | } |
3122 | zSig |= ( ( rem0 | rem1 ) != 0 ); | | 3122 | zSig |= ( ( rem0 | rem1 ) != 0 ); |
3123 | } | | 3123 | } |
3124 | return roundAndPackFloat64( 0, zExp, zSig ); | | 3124 | return roundAndPackFloat64( 0, zExp, zSig ); |
3125 | | | 3125 | |
3126 | } | | 3126 | } |
3127 | #endif | | 3127 | #endif |
3128 | | | 3128 | |
3129 | /* | | 3129 | /* |
3130 | ------------------------------------------------------------------------------- | | 3130 | ------------------------------------------------------------------------------- |
3131 | Returns 1 if the double-precision floating-point value `a' is equal to the | | 3131 | Returns 1 if the double-precision floating-point value `a' is equal to the |
3132 | corresponding value `b', and 0 otherwise. The comparison is performed | | 3132 | corresponding value `b', and 0 otherwise. The comparison is performed |
3133 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. | | 3133 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
3134 | ------------------------------------------------------------------------------- | | 3134 | ------------------------------------------------------------------------------- |
3135 | */ | | 3135 | */ |
3136 | flag float64_eq( float64 a, float64 b ) | | 3136 | flag float64_eq( float64 a, float64 b ) |
3137 | { | | 3137 | { |
3138 | | | 3138 | |
3139 | if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) | | 3139 | if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) |
3140 | || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) | | 3140 | || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) |
3141 | ) { | | 3141 | ) { |
3142 | if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { | | 3142 | if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { |
3143 | float_raise( float_flag_invalid ); | | 3143 | float_raise( float_flag_invalid ); |
3144 | } | | 3144 | } |
3145 | return 0; | | 3145 | return 0; |
3146 | } | | 3146 | } |
3147 | return ( a == b ) || | | 3147 | return ( a == b ) || |
3148 | ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 ); | | 3148 | ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 ); |
3149 | | | 3149 | |
3150 | } | | 3150 | } |
3151 | | | 3151 | |
3152 | /* | | 3152 | /* |
3153 | ------------------------------------------------------------------------------- | | 3153 | ------------------------------------------------------------------------------- |
3154 | Returns 1 if the double-precision floating-point value `a' is less than or | | 3154 | Returns 1 if the double-precision floating-point value `a' is less than or |
3155 | equal to the corresponding value `b', and 0 otherwise. The comparison is | | 3155 | equal to the corresponding value `b', and 0 otherwise. The comparison is |
3156 | performed according to the IEC/IEEE Standard for Binary Floating-Point | | 3156 | performed according to the IEC/IEEE Standard for Binary Floating-Point |
3157 | Arithmetic. | | 3157 | Arithmetic. |
3158 | ------------------------------------------------------------------------------- | | 3158 | ------------------------------------------------------------------------------- |
3159 | */ | | 3159 | */ |
3160 | flag float64_le( float64 a, float64 b ) | | 3160 | flag float64_le( float64 a, float64 b ) |
3161 | { | | 3161 | { |
3162 | flag aSign, bSign; | | 3162 | flag aSign, bSign; |
3163 | | | 3163 | |
3164 | if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) | | 3164 | if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) |
3165 | || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) | | 3165 | || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) |
3166 | ) { | | 3166 | ) { |
3167 | float_raise( float_flag_invalid ); | | 3167 | float_raise( float_flag_invalid ); |
3168 | return 0; | | 3168 | return 0; |
3169 | } | | 3169 | } |
3170 | aSign = extractFloat64Sign( a ); | | 3170 | aSign = extractFloat64Sign( a ); |
3171 | bSign = extractFloat64Sign( b ); | | 3171 | bSign = extractFloat64Sign( b ); |
3172 | if ( aSign != bSign ) | | 3172 | if ( aSign != bSign ) |
3173 | return aSign || | | 3173 | return aSign || |
3174 | ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == | | 3174 | ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == |
3175 | 0 ); | | 3175 | 0 ); |
3176 | return ( a == b ) || | | 3176 | return ( a == b ) || |
3177 | ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); | | 3177 | ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); |
3178 | | | 3178 | |
3179 | } | | 3179 | } |
3180 | | | 3180 | |
3181 | /* | | 3181 | /* |
3182 | ------------------------------------------------------------------------------- | | 3182 | ------------------------------------------------------------------------------- |
3183 | Returns 1 if the double-precision floating-point value `a' is less than | | 3183 | Returns 1 if the double-precision floating-point value `a' is less than |
3184 | the corresponding value `b', and 0 otherwise. The comparison is performed | | 3184 | the corresponding value `b', and 0 otherwise. The comparison is performed |
3185 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. | | 3185 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
3186 | ------------------------------------------------------------------------------- | | 3186 | ------------------------------------------------------------------------------- |
3187 | */ | | 3187 | */ |
3188 | flag float64_lt( float64 a, float64 b ) | | 3188 | flag float64_lt( float64 a, float64 b ) |
3189 | { | | 3189 | { |
3190 | flag aSign, bSign; | | 3190 | flag aSign, bSign; |
3191 | | | 3191 | |
3192 | if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) | | 3192 | if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) |
3193 | || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) | | 3193 | || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) |
3194 | ) { | | 3194 | ) { |
3195 | float_raise( float_flag_invalid ); | | 3195 | float_raise( float_flag_invalid ); |
3196 | return 0; | | 3196 | return 0; |
3197 | } | | 3197 | } |
3198 | aSign = extractFloat64Sign( a ); | | 3198 | aSign = extractFloat64Sign( a ); |
3199 | bSign = extractFloat64Sign( b ); | | 3199 | bSign = extractFloat64Sign( b ); |
3200 | if ( aSign != bSign ) | | 3200 | if ( aSign != bSign ) |
3201 | return aSign && | | 3201 | return aSign && |
3202 | ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) != | | 3202 | ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) != |
3203 | 0 ); | | 3203 | 0 ); |
3204 | return ( a != b ) && | | 3204 | return ( a != b ) && |
3205 | ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); | | 3205 | ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) ); |
3206 | | | 3206 | |
3207 | } | | 3207 | } |
3208 | | | 3208 | |
3209 | #ifndef SOFTFLOAT_FOR_GCC | | 3209 | #ifndef SOFTFLOAT_FOR_GCC |
3210 | /* | | 3210 | /* |
3211 | ------------------------------------------------------------------------------- | | 3211 | ------------------------------------------------------------------------------- |
3212 | Returns 1 if the double-precision floating-point value `a' is equal to the | | 3212 | Returns 1 if the double-precision floating-point value `a' is equal to the |
3213 | corresponding value `b', and 0 otherwise. The invalid exception is raised | | 3213 | corresponding value `b', and 0 otherwise. The invalid exception is raised |
3214 | if either operand is a NaN. Otherwise, the comparison is performed | | 3214 | if either operand is a NaN. Otherwise, the comparison is performed |
3215 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. | | 3215 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
3216 | ------------------------------------------------------------------------------- | | 3216 | ------------------------------------------------------------------------------- |
3217 | */ | | 3217 | */ |
3218 | flag float64_eq_signaling( float64 a, float64 b ) | | 3218 | flag float64_eq_signaling( float64 a, float64 b ) |
3219 | { | | 3219 | { |
3220 | | | 3220 | |
3221 | if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) | | 3221 | if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) |
3222 | || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) | | 3222 | || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) |
3223 | ) { | | 3223 | ) { |
3224 | float_raise( float_flag_invalid ); | | 3224 | float_raise( float_flag_invalid ); |
3225 | return 0; | | 3225 | return 0; |
3226 | } | | 3226 | } |
3227 | return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); | | 3227 | return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); |
3228 | | | 3228 | |
3229 | } | | 3229 | } |
3230 | | | 3230 | |
3231 | /* | | 3231 | /* |
3232 | ------------------------------------------------------------------------------- | | 3232 | ------------------------------------------------------------------------------- |
3233 | Returns 1 if the double-precision floating-point value `a' is less than or | | 3233 | Returns 1 if the double-precision floating-point value `a' is less than or |
3234 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not | | 3234 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not |
3235 | cause an exception. Otherwise, the comparison is performed according to the | | 3235 | cause an exception. Otherwise, the comparison is performed according to the |
3236 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. | | 3236 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
3237 | ------------------------------------------------------------------------------- | | 3237 | ------------------------------------------------------------------------------- |
3238 | */ | | 3238 | */ |
3239 | flag float64_le_quiet( float64 a, float64 b ) | | 3239 | flag float64_le_quiet( float64 a, float64 b ) |
3240 | { | | 3240 | { |
3241 | flag aSign, bSign; | | 3241 | flag aSign, bSign; |
3242 | | | 3242 | |
3243 | if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) | | 3243 | if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) |
3244 | || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) | | 3244 | || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) |
3245 | ) { | | 3245 | ) { |
3246 | if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { | | 3246 | if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { |
3247 | float_raise( float_flag_invalid ); | | 3247 | float_raise( float_flag_invalid ); |
3248 | } | | 3248 | } |
3249 | return 0; | | 3249 | return 0; |
3250 | } | | 3250 | } |
3251 | aSign = extractFloat64Sign( a ); | | 3251 | aSign = extractFloat64Sign( a ); |
3252 | bSign = extractFloat64Sign( b ); | | 3252 | bSign = extractFloat64Sign( b ); |
3253 | if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); | | 3253 | if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); |
3254 | return ( a == b ) || ( aSign ^ ( a < b ) ); | | 3254 | return ( a == b ) || ( aSign ^ ( a < b ) ); |
3255 | | | 3255 | |
3256 | } | | 3256 | } |
3257 | | | 3257 | |
3258 | /* | | 3258 | /* |
3259 | ------------------------------------------------------------------------------- | | 3259 | ------------------------------------------------------------------------------- |
3260 | Returns 1 if the double-precision floating-point value `a' is less than | | 3260 | Returns 1 if the double-precision floating-point value `a' is less than |
3261 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an | | 3261 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an |
3262 | exception. Otherwise, the comparison is performed according to the IEC/IEEE | | 3262 | exception. Otherwise, the comparison is performed according to the IEC/IEEE |
3263 | Standard for Binary Floating-Point Arithmetic. | | 3263 | Standard for Binary Floating-Point Arithmetic. |
3264 | ------------------------------------------------------------------------------- | | 3264 | ------------------------------------------------------------------------------- |
3265 | */ | | 3265 | */ |
3266 | flag float64_lt_quiet( float64 a, float64 b ) | | 3266 | flag float64_lt_quiet( float64 a, float64 b ) |
3267 | { | | 3267 | { |
3268 | flag aSign, bSign; | | 3268 | flag aSign, bSign; |
3269 | | | 3269 | |
3270 | if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) | | 3270 | if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) |
3271 | || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) | | 3271 | || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) |
3272 | ) { | | 3272 | ) { |
3273 | if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { | | 3273 | if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { |
3274 | float_raise( float_flag_invalid ); | | 3274 | float_raise( float_flag_invalid ); |
3275 | } | | 3275 | } |
3276 | return 0; | | 3276 | return 0; |
3277 | } | | 3277 | } |
3278 | aSign = extractFloat64Sign( a ); | | 3278 | aSign = extractFloat64Sign( a ); |
3279 | bSign = extractFloat64Sign( b ); | | 3279 | bSign = extractFloat64Sign( b ); |
3280 | if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); | | 3280 | if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); |
3281 | return ( a != b ) && ( aSign ^ ( a < b ) ); | | 3281 | return ( a != b ) && ( aSign ^ ( a < b ) ); |
3282 | | | 3282 | |
3283 | } | | 3283 | } |
3284 | #endif | | 3284 | #endif |
3285 | | | 3285 | |
3286 | #ifdef FLOATX80 | | 3286 | #ifdef FLOATX80 |
3287 | | | 3287 | |
3288 | /* | | 3288 | /* |
3289 | ------------------------------------------------------------------------------- | | 3289 | ------------------------------------------------------------------------------- |
3290 | Returns the result of converting the extended double-precision floating- | | 3290 | Returns the result of converting the extended double-precision floating- |
3291 | point value `a' to the 32-bit two's complement integer format. The | | 3291 | point value `a' to the 32-bit two's complement integer format. The |
3292 | conversion is performed according to the IEC/IEEE Standard for Binary | | 3292 | conversion is performed according to the IEC/IEEE Standard for Binary |
3293 | Floating-Point Arithmetic---which means in particular that the conversion | | 3293 | Floating-Point Arithmetic---which means in particular that the conversion |
3294 | is rounded according to the current rounding mode. If `a' is a NaN, the | | 3294 | is rounded according to the current rounding mode. If `a' is a NaN, the |
3295 | largest positive integer is returned. Otherwise, if the conversion | | 3295 | largest positive integer is returned. Otherwise, if the conversion |
3296 | overflows, the largest integer with the same sign as `a' is returned. | | 3296 | overflows, the largest integer with the same sign as `a' is returned. |
3297 | ------------------------------------------------------------------------------- | | 3297 | ------------------------------------------------------------------------------- |
3298 | */ | | 3298 | */ |
3299 | int32 floatx80_to_int32( floatx80 a ) | | 3299 | int32 floatx80_to_int32( floatx80 a ) |
3300 | { | | 3300 | { |
3301 | flag aSign; | | 3301 | flag aSign; |
3302 | int32 aExp, shiftCount; | | 3302 | int32 aExp, shiftCount; |
3303 | bits64 aSig; | | 3303 | bits64 aSig; |
3304 | | | 3304 | |
3305 | aSig = extractFloatx80Frac( a ); | | 3305 | aSig = extractFloatx80Frac( a ); |
3306 | aExp = extractFloatx80Exp( a ); | | 3306 | aExp = extractFloatx80Exp( a ); |
3307 | aSign = extractFloatx80Sign( a ); | | 3307 | aSign = extractFloatx80Sign( a ); |
3308 | if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; | | 3308 | if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; |
3309 | shiftCount = 0x4037 - aExp; | | 3309 | shiftCount = 0x4037 - aExp; |
3310 | if ( shiftCount <= 0 ) shiftCount = 1; | | 3310 | if ( shiftCount <= 0 ) shiftCount = 1; |
3311 | shift64RightJamming( aSig, shiftCount, &aSig ); | | 3311 | shift64RightJamming( aSig, shiftCount, &aSig ); |
3312 | return roundAndPackInt32( aSign, aSig ); | | 3312 | return roundAndPackInt32( aSign, aSig ); |
3313 | | | 3313 | |
3314 | } | | 3314 | } |
3315 | | | 3315 | |
3316 | /* | | 3316 | /* |
3317 | ------------------------------------------------------------------------------- | | 3317 | ------------------------------------------------------------------------------- |
3318 | Returns the result of converting the extended double-precision floating- | | 3318 | Returns the result of converting the extended double-precision floating- |
3319 | point value `a' to the 32-bit two's complement integer format. The | | 3319 | point value `a' to the 32-bit two's complement integer format. The |
3320 | conversion is performed according to the IEC/IEEE Standard for Binary | | 3320 | conversion is performed according to the IEC/IEEE Standard for Binary |
3321 | Floating-Point Arithmetic, except that the conversion is always rounded | | 3321 | Floating-Point Arithmetic, except that the conversion is always rounded |
3322 | toward zero. If `a' is a NaN, the largest positive integer is returned. | | 3322 | toward zero. If `a' is a NaN, the largest positive integer is returned. |
3323 | Otherwise, if the conversion overflows, the largest integer with the same | | 3323 | Otherwise, if the conversion overflows, the largest integer with the same |
3324 | sign as `a' is returned. | | 3324 | sign as `a' is returned. |
3325 | ------------------------------------------------------------------------------- | | 3325 | ------------------------------------------------------------------------------- |
3326 | */ | | 3326 | */ |
3327 | int32 floatx80_to_int32_round_to_zero( floatx80 a ) | | 3327 | int32 floatx80_to_int32_round_to_zero( floatx80 a ) |
3328 | { | | 3328 | { |
3329 | flag aSign; | | 3329 | flag aSign; |
3330 | int32 aExp, shiftCount; | | 3330 | int32 aExp, shiftCount; |
3331 | bits64 aSig, savedASig; | | 3331 | bits64 aSig, savedASig; |
3332 | int32 z; | | 3332 | int32 z; |
3333 | | | 3333 | |
3334 | aSig = extractFloatx80Frac( a ); | | 3334 | aSig = extractFloatx80Frac( a ); |
3335 | aExp = extractFloatx80Exp( a ); | | 3335 | aExp = extractFloatx80Exp( a ); |
3336 | aSign = extractFloatx80Sign( a ); | | 3336 | aSign = extractFloatx80Sign( a ); |
3337 | if ( 0x401E < aExp ) { | | 3337 | if ( 0x401E < aExp ) { |
3338 | if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; | | 3338 | if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; |
3339 | goto invalid; | | 3339 | goto invalid; |
3340 | } | | 3340 | } |
3341 | else if ( aExp < 0x3FFF ) { | | 3341 | else if ( aExp < 0x3FFF ) { |
3342 | if ( aExp || aSig ) set_float_exception_inexact_flag(); | | 3342 | if ( aExp || aSig ) set_float_exception_inexact_flag(); |
3343 | return 0; | | 3343 | return 0; |
3344 | } | | 3344 | } |
3345 | shiftCount = 0x403E - aExp; | | 3345 | shiftCount = 0x403E - aExp; |
3346 | savedASig = aSig; | | 3346 | savedASig = aSig; |
3347 | aSig >>= shiftCount; | | 3347 | aSig >>= shiftCount; |
3348 | z = aSig; | | 3348 | z = aSig; |
3349 | if ( aSign ) z = - z; | | 3349 | if ( aSign ) z = - z; |
3350 | if ( ( z < 0 ) ^ aSign ) { | | 3350 | if ( ( z < 0 ) ^ aSign ) { |
3351 | invalid: | | 3351 | invalid: |
3352 | float_raise( float_flag_invalid ); | | 3352 | float_raise( float_flag_invalid ); |
3353 | return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; | | 3353 | return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; |
3354 | } | | 3354 | } |
3355 | if ( ( aSig<<shiftCount ) != savedASig ) { | | 3355 | if ( ( aSig<<shiftCount ) != savedASig ) { |
3356 | set_float_exception_inexact_flag(); | | 3356 | set_float_exception_inexact_flag(); |
3357 | } | | 3357 | } |
3358 | return z; | | 3358 | return z; |
3359 | | | 3359 | |
3360 | } | | 3360 | } |
3361 | | | 3361 | |
3362 | /* | | 3362 | /* |
3363 | ------------------------------------------------------------------------------- | | 3363 | ------------------------------------------------------------------------------- |
3364 | Returns the result of converting the extended double-precision floating- | | 3364 | Returns the result of converting the extended double-precision floating- |
3365 | point value `a' to the 64-bit two's complement integer format. The | | 3365 | point value `a' to the 64-bit two's complement integer format. The |
3366 | conversion is performed according to the IEC/IEEE Standard for Binary | | 3366 | conversion is performed according to the IEC/IEEE Standard for Binary |
3367 | Floating-Point Arithmetic---which means in particular that the conversion | | 3367 | Floating-Point Arithmetic---which means in particular that the conversion |
3368 | is rounded according to the current rounding mode. If `a' is a NaN, | | 3368 | is rounded according to the current rounding mode. If `a' is a NaN, |
3369 | the largest positive integer is returned. Otherwise, if the conversion | | 3369 | the largest positive integer is returned. Otherwise, if the conversion |
3370 | overflows, the largest integer with the same sign as `a' is returned. | | 3370 | overflows, the largest integer with the same sign as `a' is returned. |
3371 | ------------------------------------------------------------------------------- | | 3371 | ------------------------------------------------------------------------------- |
3372 | */ | | 3372 | */ |
3373 | int64 floatx80_to_int64( floatx80 a ) | | 3373 | int64 floatx80_to_int64( floatx80 a ) |
3374 | { | | 3374 | { |
3375 | flag aSign; | | 3375 | flag aSign; |
3376 | int32 aExp, shiftCount; | | 3376 | int32 aExp, shiftCount; |
3377 | bits64 aSig, aSigExtra; | | 3377 | bits64 aSig, aSigExtra; |
3378 | | | 3378 | |
3379 | aSig = extractFloatx80Frac( a ); | | 3379 | aSig = extractFloatx80Frac( a ); |
3380 | aExp = extractFloatx80Exp( a ); | | 3380 | aExp = extractFloatx80Exp( a ); |
3381 | aSign = extractFloatx80Sign( a ); | | 3381 | aSign = extractFloatx80Sign( a ); |
3382 | shiftCount = 0x403E - aExp; | | 3382 | shiftCount = 0x403E - aExp; |
3383 | if ( shiftCount <= 0 ) { | | 3383 | if ( shiftCount <= 0 ) { |
3384 | if ( shiftCount ) { | | 3384 | if ( shiftCount ) { |
3385 | float_raise( float_flag_invalid ); | | 3385 | float_raise( float_flag_invalid ); |
3386 | if ( ! aSign | | 3386 | if ( ! aSign |
3387 | || ( ( aExp == 0x7FFF ) | | 3387 | || ( ( aExp == 0x7FFF ) |
3388 | && ( aSig != LIT64( 0x8000000000000000 ) ) ) | | 3388 | && ( aSig != LIT64( 0x8000000000000000 ) ) ) |
3389 | ) { | | 3389 | ) { |
3390 | return LIT64( 0x7FFFFFFFFFFFFFFF ); | | 3390 | return LIT64( 0x7FFFFFFFFFFFFFFF ); |
3391 | } | | 3391 | } |
3392 | return (sbits64) LIT64( 0x8000000000000000 ); | | 3392 | return (sbits64) LIT64( 0x8000000000000000 ); |
3393 | } | | 3393 | } |
3394 | aSigExtra = 0; | | 3394 | aSigExtra = 0; |
3395 | } | | 3395 | } |
3396 | else { | | 3396 | else { |
3397 | shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); | | 3397 | shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); |
3398 | } | | 3398 | } |
3399 | return roundAndPackInt64( aSign, aSig, aSigExtra ); | | 3399 | return roundAndPackInt64( aSign, aSig, aSigExtra ); |
3400 | | | 3400 | |
3401 | } | | 3401 | } |
3402 | | | 3402 | |
3403 | /* | | 3403 | /* |
3404 | ------------------------------------------------------------------------------- | | 3404 | ------------------------------------------------------------------------------- |
3405 | Returns the result of converting the extended double-precision floating- | | 3405 | Returns the result of converting the extended double-precision floating- |
3406 | point value `a' to the 64-bit two's complement integer format. The | | 3406 | point value `a' to the 64-bit two's complement integer format. The |
3407 | conversion is performed according to the IEC/IEEE Standard for Binary | | 3407 | conversion is performed according to the IEC/IEEE Standard for Binary |
3408 | Floating-Point Arithmetic, except that the conversion is always rounded | | 3408 | Floating-Point Arithmetic, except that the conversion is always rounded |
3409 | toward zero. If `a' is a NaN, the largest positive integer is returned. | | 3409 | toward zero. If `a' is a NaN, the largest positive integer is returned. |
3410 | Otherwise, if the conversion overflows, the largest integer with the same | | 3410 | Otherwise, if the conversion overflows, the largest integer with the same |
3411 | sign as `a' is returned. | | 3411 | sign as `a' is returned. |
3412 | ------------------------------------------------------------------------------- | | 3412 | ------------------------------------------------------------------------------- |
3413 | */ | | 3413 | */ |
3414 | int64 floatx80_to_int64_round_to_zero( floatx80 a ) | | 3414 | int64 floatx80_to_int64_round_to_zero( floatx80 a ) |
3415 | { | | 3415 | { |
3416 | flag aSign; | | 3416 | flag aSign; |
3417 | int32 aExp, shiftCount; | | 3417 | int32 aExp, shiftCount; |
3418 | bits64 aSig; | | 3418 | bits64 aSig; |
3419 | int64 z; | | 3419 | int64 z; |
3420 | | | 3420 | |
3421 | aSig = extractFloatx80Frac( a ); | | 3421 | aSig = extractFloatx80Frac( a ); |
3422 | aExp = extractFloatx80Exp( a ); | | 3422 | aExp = extractFloatx80Exp( a ); |
3423 | aSign = extractFloatx80Sign( a ); | | 3423 | aSign = extractFloatx80Sign( a ); |
3424 | shiftCount = aExp - 0x403E; | | 3424 | shiftCount = aExp - 0x403E; |
3425 | if ( 0 <= shiftCount ) { | | 3425 | if ( 0 <= shiftCount ) { |
3426 | aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); | | 3426 | aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); |
3427 | if ( ( a.high != 0xC03E ) || aSig ) { | | 3427 | if ( ( a.high != 0xC03E ) || aSig ) { |
3428 | float_raise( float_flag_invalid ); | | 3428 | float_raise( float_flag_invalid ); |
3429 | if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { | | 3429 | if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { |
3430 | return LIT64( 0x7FFFFFFFFFFFFFFF ); | | 3430 | return LIT64( 0x7FFFFFFFFFFFFFFF ); |
3431 | } | | 3431 | } |
3432 | } | | 3432 | } |
3433 | return (sbits64) LIT64( 0x8000000000000000 ); | | 3433 | return (sbits64) LIT64( 0x8000000000000000 ); |
3434 | } | | 3434 | } |
3435 | else if ( aExp < 0x3FFF ) { | | 3435 | else if ( aExp < 0x3FFF ) { |
3436 | if ( aExp | aSig ) set_float_exception_inexact_flag(); | | 3436 | if ( aExp | aSig ) set_float_exception_inexact_flag(); |
3437 | return 0; | | 3437 | return 0; |
3438 | } | | 3438 | } |
3439 | z = aSig>>( - shiftCount ); | | 3439 | z = aSig>>( - shiftCount ); |
3440 | if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { | | 3440 | if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { |
3441 | set_float_exception_inexact_flag(); | | 3441 | set_float_exception_inexact_flag(); |
3442 | } | | 3442 | } |
3443 | if ( aSign ) z = - z; | | 3443 | if ( aSign ) z = - z; |
3444 | return z; | | 3444 | return z; |
3445 | | | 3445 | |
3446 | } | | 3446 | } |
3447 | | | 3447 | |
3448 | /* | | 3448 | /* |
3449 | ------------------------------------------------------------------------------- | | 3449 | ------------------------------------------------------------------------------- |
3450 | Returns the result of converting the extended double-precision floating- | | 3450 | Returns the result of converting the extended double-precision floating- |
3451 | point value `a' to the single-precision floating-point format. The | | 3451 | point value `a' to the single-precision floating-point format. The |
3452 | conversion is performed according to the IEC/IEEE Standard for Binary | | 3452 | conversion is performed according to the IEC/IEEE Standard for Binary |
3453 | Floating-Point Arithmetic. | | 3453 | Floating-Point Arithmetic. |
3454 | ------------------------------------------------------------------------------- | | 3454 | ------------------------------------------------------------------------------- |
3455 | */ | | 3455 | */ |
3456 | float32 floatx80_to_float32( floatx80 a ) | | 3456 | float32 floatx80_to_float32( floatx80 a ) |
3457 | { | | 3457 | { |
3458 | flag aSign; | | 3458 | flag aSign; |
3459 | int32 aExp; | | 3459 | int32 aExp; |
3460 | bits64 aSig; | | 3460 | bits64 aSig; |
3461 | | | 3461 | |
3462 | aSig = extractFloatx80Frac( a ); | | 3462 | aSig = extractFloatx80Frac( a ); |
3463 | aExp = extractFloatx80Exp( a ); | | 3463 | aExp = extractFloatx80Exp( a ); |
3464 | aSign = extractFloatx80Sign( a ); | | 3464 | aSign = extractFloatx80Sign( a ); |
3465 | if ( aExp == 0x7FFF ) { | | 3465 | if ( aExp == 0x7FFF ) { |
3466 | if ( (bits64) ( aSig<<1 ) ) { | | 3466 | if ( (bits64) ( aSig<<1 ) ) { |
3467 | return commonNaNToFloat32( floatx80ToCommonNaN( a ) ); | | 3467 | return commonNaNToFloat32( floatx80ToCommonNaN( a ) ); |
3468 | } | | 3468 | } |
3469 | return packFloat32( aSign, 0xFF, 0 ); | | 3469 | return packFloat32( aSign, 0xFF, 0 ); |
3470 | } | | 3470 | } |
3471 | shift64RightJamming( aSig, 33, &aSig ); | | 3471 | shift64RightJamming( aSig, 33, &aSig ); |
3472 | if ( aExp || aSig ) aExp -= 0x3F81; | | 3472 | if ( aExp || aSig ) aExp -= 0x3F81; |
3473 | return roundAndPackFloat32( aSign, aExp, aSig ); | | 3473 | return roundAndPackFloat32( aSign, aExp, aSig ); |
3474 | | | 3474 | |
3475 | } | | 3475 | } |
3476 | | | 3476 | |
3477 | /* | | 3477 | /* |
3478 | ------------------------------------------------------------------------------- | | 3478 | ------------------------------------------------------------------------------- |
3479 | Returns the result of converting the extended double-precision floating- | | 3479 | Returns the result of converting the extended double-precision floating- |
3480 | point value `a' to the double-precision floating-point format. The | | 3480 | point value `a' to the double-precision floating-point format. The |
3481 | conversion is performed according to the IEC/IEEE Standard for Binary | | 3481 | conversion is performed according to the IEC/IEEE Standard for Binary |
3482 | Floating-Point Arithmetic. | | 3482 | Floating-Point Arithmetic. |
3483 | ------------------------------------------------------------------------------- | | 3483 | ------------------------------------------------------------------------------- |
3484 | */ | | 3484 | */ |
3485 | float64 floatx80_to_float64( floatx80 a ) | | 3485 | float64 floatx80_to_float64( floatx80 a ) |
3486 | { | | 3486 | { |
3487 | flag aSign; | | 3487 | flag aSign; |
3488 | int32 aExp; | | 3488 | int32 aExp; |
3489 | bits64 aSig, zSig; | | 3489 | bits64 aSig, zSig; |
3490 | | | 3490 | |
3491 | aSig = extractFloatx80Frac( a ); | | 3491 | aSig = extractFloatx80Frac( a ); |
3492 | aExp = extractFloatx80Exp( a ); | | 3492 | aExp = extractFloatx80Exp( a ); |
3493 | aSign = extractFloatx80Sign( a ); | | 3493 | aSign = extractFloatx80Sign( a ); |
3494 | if ( aExp == 0x7FFF ) { | | 3494 | if ( aExp == 0x7FFF ) { |
3495 | if ( (bits64) ( aSig<<1 ) ) { | | 3495 | if ( (bits64) ( aSig<<1 ) ) { |
3496 | return commonNaNToFloat64( floatx80ToCommonNaN( a ) ); | | 3496 | return commonNaNToFloat64( floatx80ToCommonNaN( a ) ); |
3497 | } | | 3497 | } |
3498 | return packFloat64( aSign, 0x7FF, 0 ); | | 3498 | return packFloat64( aSign, 0x7FF, 0 ); |
3499 | } | | 3499 | } |
3500 | shift64RightJamming( aSig, 1, &zSig ); | | 3500 | shift64RightJamming( aSig, 1, &zSig ); |
3501 | if ( aExp || aSig ) aExp -= 0x3C01; | | 3501 | if ( aExp || aSig ) aExp -= 0x3C01; |
3502 | return roundAndPackFloat64( aSign, aExp, zSig ); | | 3502 | return roundAndPackFloat64( aSign, aExp, zSig ); |
3503 | | | 3503 | |
3504 | } | | 3504 | } |
3505 | | | 3505 | |
3506 | #ifdef FLOAT128 | | 3506 | #ifdef FLOAT128 |
3507 | | | 3507 | |
3508 | /* | | 3508 | /* |
3509 | ------------------------------------------------------------------------------- | | 3509 | ------------------------------------------------------------------------------- |
3510 | Returns the result of converting the extended double-precision floating- | | 3510 | Returns the result of converting the extended double-precision floating- |
3511 | point value `a' to the quadruple-precision floating-point format. The | | 3511 | point value `a' to the quadruple-precision floating-point format. The |
3512 | conversion is performed according to the IEC/IEEE Standard for Binary | | 3512 | conversion is performed according to the IEC/IEEE Standard for Binary |
3513 | Floating-Point Arithmetic. | | 3513 | Floating-Point Arithmetic. |
3514 | ------------------------------------------------------------------------------- | | 3514 | ------------------------------------------------------------------------------- |
3515 | */ | | 3515 | */ |
3516 | float128 floatx80_to_float128( floatx80 a ) | | 3516 | float128 floatx80_to_float128( floatx80 a ) |
3517 | { | | 3517 | { |
3518 | flag aSign; | | 3518 | flag aSign; |
3519 | int16 aExp; | | 3519 | int16 aExp; |
3520 | bits64 aSig, zSig0, zSig1; | | 3520 | bits64 aSig, zSig0, zSig1; |
3521 | | | 3521 | |
3522 | aSig = extractFloatx80Frac( a ); | | 3522 | aSig = extractFloatx80Frac( a ); |
3523 | aExp = extractFloatx80Exp( a ); | | 3523 | aExp = extractFloatx80Exp( a ); |
3524 | aSign = extractFloatx80Sign( a ); | | 3524 | aSign = extractFloatx80Sign( a ); |
3525 | if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) { | | 3525 | if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) { |
3526 | return commonNaNToFloat128( floatx80ToCommonNaN( a ) ); | | 3526 | return commonNaNToFloat128( floatx80ToCommonNaN( a ) ); |
3527 | } | | 3527 | } |
3528 | shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); | | 3528 | shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); |
3529 | return packFloat128( aSign, aExp, zSig0, zSig1 ); | | 3529 | return packFloat128( aSign, aExp, zSig0, zSig1 ); |
3530 | | | 3530 | |
3531 | } | | 3531 | } |
3532 | | | 3532 | |
3533 | #endif | | 3533 | #endif |
3534 | | | 3534 | |
3535 | /* | | 3535 | /* |
3536 | ------------------------------------------------------------------------------- | | 3536 | ------------------------------------------------------------------------------- |
3537 | Rounds the extended double-precision floating-point value `a' to an integer, | | 3537 | Rounds the extended double-precision floating-point value `a' to an integer, |
3538 | and returns the result as an extended quadruple-precision floating-point | | 3538 | and returns the result as an extended quadruple-precision floating-point |
3539 | value. The operation is performed according to the IEC/IEEE Standard for | | 3539 | value. The operation is performed according to the IEC/IEEE Standard for |
3540 | Binary Floating-Point Arithmetic. | | 3540 | Binary Floating-Point Arithmetic. |
3541 | ------------------------------------------------------------------------------- | | 3541 | ------------------------------------------------------------------------------- |
3542 | */ | | 3542 | */ |
3543 | floatx80 floatx80_round_to_int( floatx80 a ) | | 3543 | floatx80 floatx80_round_to_int( floatx80 a ) |
3544 | { | | 3544 | { |
3545 | flag aSign; | | 3545 | flag aSign; |
3546 | int32 aExp; | | 3546 | int32 aExp; |
3547 | bits64 lastBitMask, roundBitsMask; | | 3547 | bits64 lastBitMask, roundBitsMask; |
3548 | int8 roundingMode; | | 3548 | int8 roundingMode; |
3549 | floatx80 z; | | 3549 | floatx80 z; |
3550 | | | 3550 | |
3551 | aExp = extractFloatx80Exp( a ); | | 3551 | aExp = extractFloatx80Exp( a ); |
3552 | if ( 0x403E <= aExp ) { | | 3552 | if ( 0x403E <= aExp ) { |
3553 | if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) { | | 3553 | if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) { |
3554 | return propagateFloatx80NaN( a, a ); | | 3554 | return propagateFloatx80NaN( a, a ); |
3555 | } | | 3555 | } |
3556 | return a; | | 3556 | return a; |
3557 | } | | 3557 | } |
3558 | if ( aExp < 0x3FFF ) { | | 3558 | if ( aExp < 0x3FFF ) { |
3559 | if ( ( aExp == 0 ) | | 3559 | if ( ( aExp == 0 ) |
3560 | && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { | | 3560 | && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { |
3561 | return a; | | 3561 | return a; |
3562 | } | | 3562 | } |
3563 | set_float_exception_inexact_flag(); | | 3563 | set_float_exception_inexact_flag(); |
3564 | aSign = extractFloatx80Sign( a ); | | 3564 | aSign = extractFloatx80Sign( a ); |
3565 | switch ( float_rounding_mode ) { | | 3565 | switch ( float_rounding_mode ) { |
3566 | case float_round_nearest_even: | | 3566 | case float_round_nearest_even: |
3567 | if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 ) | | 3567 | if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 ) |
3568 | ) { | | 3568 | ) { |
3569 | return | | 3569 | return |
3570 | packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); | | 3570 | packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); |
3571 | } | | 3571 | } |
3572 | break; | | 3572 | break; |
3573 | case float_round_to_zero: | | 3573 | case float_round_to_zero: |
3574 | break; | | 3574 | break; |
3575 | case float_round_down: | | 3575 | case float_round_down: |
3576 | return | | 3576 | return |
3577 | aSign ? | | 3577 | aSign ? |
3578 | packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) | | 3578 | packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) |
3579 | : packFloatx80( 0, 0, 0 ); | | 3579 | : packFloatx80( 0, 0, 0 ); |
3580 | case float_round_up: | | 3580 | case float_round_up: |
3581 | return | | 3581 | return |
3582 | aSign ? packFloatx80( 1, 0, 0 ) | | 3582 | aSign ? packFloatx80( 1, 0, 0 ) |
3583 | : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); | | 3583 | : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); |
3584 | } | | 3584 | } |
3585 | return packFloatx80( aSign, 0, 0 ); | | 3585 | return packFloatx80( aSign, 0, 0 ); |
3586 | } | | 3586 | } |
3587 | lastBitMask = 1; | | 3587 | lastBitMask = 1; |
3588 | lastBitMask <<= 0x403E - aExp; | | 3588 | lastBitMask <<= 0x403E - aExp; |
3589 | roundBitsMask = lastBitMask - 1; | | 3589 | roundBitsMask = lastBitMask - 1; |
3590 | z = a; | | 3590 | z = a; |
3591 | roundingMode = float_rounding_mode; | | 3591 | roundingMode = float_rounding_mode; |
3592 | if ( roundingMode == float_round_nearest_even ) { | | 3592 | if ( roundingMode == float_round_nearest_even ) { |
3593 | z.low += lastBitMask>>1; | | 3593 | z.low += lastBitMask>>1; |
3594 | if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; | | 3594 | if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; |
3595 | } | | 3595 | } |
3596 | else if ( roundingMode != float_round_to_zero ) { | | 3596 | else if ( roundingMode != float_round_to_zero ) { |
3597 | if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { | | 3597 | if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { |
3598 | z.low += roundBitsMask; | | 3598 | z.low += roundBitsMask; |
3599 | } | | 3599 | } |
3600 | } | | 3600 | } |
3601 | z.low &= ~ roundBitsMask; | | 3601 | z.low &= ~ roundBitsMask; |
3602 | if ( z.low == 0 ) { | | 3602 | if ( z.low == 0 ) { |
3603 | ++z.high; | | 3603 | ++z.high; |
3604 | z.low = LIT64( 0x8000000000000000 ); | | 3604 | z.low = LIT64( 0x8000000000000000 ); |
3605 | } | | 3605 | } |
3606 | if ( z.low != a.low ) set_float_exception_inexact_flag(); | | 3606 | if ( z.low != a.low ) set_float_exception_inexact_flag(); |
3607 | return z; | | 3607 | return z; |
3608 | | | 3608 | |
3609 | } | | 3609 | } |
3610 | | | 3610 | |
3611 | /* | | 3611 | /* |
3612 | ------------------------------------------------------------------------------- | | 3612 | ------------------------------------------------------------------------------- |
3613 | Returns the result of adding the absolute values of the extended double- | | 3613 | Returns the result of adding the absolute values of the extended double- |
3614 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is | | 3614 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is |
3615 | negated before being returned. `zSign' is ignored if the result is a NaN. | | 3615 | negated before being returned. `zSign' is ignored if the result is a NaN. |
3616 | The addition is performed according to the IEC/IEEE Standard for Binary | | 3616 | The addition is performed according to the IEC/IEEE Standard for Binary |
3617 | Floating-Point Arithmetic. | | 3617 | Floating-Point Arithmetic. |
3618 | ------------------------------------------------------------------------------- | | 3618 | ------------------------------------------------------------------------------- |
3619 | */ | | 3619 | */ |
3620 | static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) | | 3620 | static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) |
3621 | { | | 3621 | { |
3622 | int32 aExp, bExp, zExp; | | 3622 | int32 aExp, bExp, zExp; |
3623 | bits64 aSig, bSig, zSig0, zSig1; | | 3623 | bits64 aSig, bSig, zSig0, zSig1; |
3624 | int32 expDiff; | | 3624 | int32 expDiff; |
3625 | | | 3625 | |
3626 | aSig = extractFloatx80Frac( a ); | | 3626 | aSig = extractFloatx80Frac( a ); |
3627 | aExp = extractFloatx80Exp( a ); | | 3627 | aExp = extractFloatx80Exp( a ); |
3628 | bSig = extractFloatx80Frac( b ); | | 3628 | bSig = extractFloatx80Frac( b ); |
3629 | bExp = extractFloatx80Exp( b ); | | 3629 | bExp = extractFloatx80Exp( b ); |
3630 | expDiff = aExp - bExp; | | 3630 | expDiff = aExp - bExp; |
3631 | if ( 0 < expDiff ) { | | 3631 | if ( 0 < expDiff ) { |
3632 | if ( aExp == 0x7FFF ) { | | 3632 | if ( aExp == 0x7FFF ) { |
3633 | if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); | | 3633 | if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); |
3634 | return a; | | 3634 | return a; |
3635 | } | | 3635 | } |
3636 | if ( bExp == 0 ) --expDiff; | | 3636 | if ( bExp == 0 ) --expDiff; |
3637 | shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); | | 3637 | shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); |
3638 | zExp = aExp; | | 3638 | zExp = aExp; |
3639 | } | | 3639 | } |
3640 | else if ( expDiff < 0 ) { | | 3640 | else if ( expDiff < 0 ) { |
3641 | if ( bExp == 0x7FFF ) { | | 3641 | if ( bExp == 0x7FFF ) { |
3642 | if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); | | 3642 | if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); |
3643 | return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); | | 3643 | return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); |
3644 | } | | 3644 | } |
3645 | if ( aExp == 0 ) ++expDiff; | | 3645 | if ( aExp == 0 ) ++expDiff; |
3646 | shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); | | 3646 | shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); |
3647 | zExp = bExp; | | 3647 | zExp = bExp; |
3648 | } | | 3648 | } |
3649 | else { | | 3649 | else { |
3650 | if ( aExp == 0x7FFF ) { | | 3650 | if ( aExp == 0x7FFF ) { |
3651 | if ( (bits64) ( ( aSig | bSig )<<1 ) ) { | | 3651 | if ( (bits64) ( ( aSig | bSig )<<1 ) ) { |
3652 | return propagateFloatx80NaN( a, b ); | | 3652 | return propagateFloatx80NaN( a, b ); |
3653 | } | | 3653 | } |
3654 | return a; | | 3654 | return a; |
3655 | } | | 3655 | } |
3656 | zSig1 = 0; | | 3656 | zSig1 = 0; |
3657 | zSig0 = aSig + bSig; | | 3657 | zSig0 = aSig + bSig; |
3658 | if ( aExp == 0 ) { | | 3658 | if ( aExp == 0 ) { |
3659 | normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); | | 3659 | normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); |
3660 | goto roundAndPack; | | 3660 | goto roundAndPack; |
3661 | } | | 3661 | } |
3662 | zExp = aExp; | | 3662 | zExp = aExp; |
3663 | goto shiftRight1; | | 3663 | goto shiftRight1; |
3664 | } | | 3664 | } |
3665 | zSig0 = aSig + bSig; | | 3665 | zSig0 = aSig + bSig; |
3666 | if ( (sbits64) zSig0 < 0 ) goto roundAndPack; | | 3666 | if ( (sbits64) zSig0 < 0 ) goto roundAndPack; |
3667 | shiftRight1: | | 3667 | shiftRight1: |
3668 | shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); | | 3668 | shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); |
3669 | zSig0 |= LIT64( 0x8000000000000000 ); | | 3669 | zSig0 |= LIT64( 0x8000000000000000 ); |
3670 | ++zExp; | | 3670 | ++zExp; |
3671 | roundAndPack: | | 3671 | roundAndPack: |
3672 | return | | 3672 | return |
3673 | roundAndPackFloatx80( | | 3673 | roundAndPackFloatx80( |
3674 | floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); | | 3674 | floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); |
3675 | | | 3675 | |
3676 | } | | 3676 | } |
3677 | | | 3677 | |
3678 | /* | | 3678 | /* |
3679 | ------------------------------------------------------------------------------- | | 3679 | ------------------------------------------------------------------------------- |
3680 | Returns the result of subtracting the absolute values of the extended | | 3680 | Returns the result of subtracting the absolute values of the extended |
3681 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the | | 3681 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the |
3682 | difference is negated before being returned. `zSign' is ignored if the | | 3682 | difference is negated before being returned. `zSign' is ignored if the |
3683 | result is a NaN. The subtraction is performed according to the IEC/IEEE | | 3683 | result is a NaN. The subtraction is performed according to the IEC/IEEE |
3684 | Standard for Binary Floating-Point Arithmetic. | | 3684 | Standard for Binary Floating-Point Arithmetic. |
3685 | ------------------------------------------------------------------------------- | | 3685 | ------------------------------------------------------------------------------- |
3686 | */ | | 3686 | */ |
3687 | static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) | | 3687 | static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) |
3688 | { | | 3688 | { |
3689 | int32 aExp, bExp, zExp; | | 3689 | int32 aExp, bExp, zExp; |
3690 | bits64 aSig, bSig, zSig0, zSig1; | | 3690 | bits64 aSig, bSig, zSig0, zSig1; |
3691 | int32 expDiff; | | 3691 | int32 expDiff; |
3692 | floatx80 z; | | 3692 | floatx80 z; |
3693 | | | 3693 | |
3694 | aSig = extractFloatx80Frac( a ); | | 3694 | aSig = extractFloatx80Frac( a ); |
3695 | aExp = extractFloatx80Exp( a ); | | 3695 | aExp = extractFloatx80Exp( a ); |
3696 | bSig = extractFloatx80Frac( b ); | | 3696 | bSig = extractFloatx80Frac( b ); |
3697 | bExp = extractFloatx80Exp( b ); | | 3697 | bExp = extractFloatx80Exp( b ); |
3698 | expDiff = aExp - bExp; | | 3698 | expDiff = aExp - bExp; |
3699 | if ( 0 < expDiff ) goto aExpBigger; | | 3699 | if ( 0 < expDiff ) goto aExpBigger; |
3700 | if ( expDiff < 0 ) goto bExpBigger; | | 3700 | if ( expDiff < 0 ) goto bExpBigger; |
3701 | if ( aExp == 0x7FFF ) { | | 3701 | if ( aExp == 0x7FFF ) { |
3702 | if ( (bits64) ( ( aSig | bSig )<<1 ) ) { | | 3702 | if ( (bits64) ( ( aSig | bSig )<<1 ) ) { |
3703 | return propagateFloatx80NaN( a, b ); | | 3703 | return propagateFloatx80NaN( a, b ); |
3704 | } | | 3704 | } |
3705 | float_raise( float_flag_invalid ); | | 3705 | float_raise( float_flag_invalid ); |
3706 | z.low = floatx80_default_nan_low; | | 3706 | z.low = floatx80_default_nan_low; |
3707 | z.high = floatx80_default_nan_high; | | 3707 | z.high = floatx80_default_nan_high; |
3708 | return z; | | 3708 | return z; |
3709 | } | | 3709 | } |
3710 | if ( aExp == 0 ) { | | 3710 | if ( aExp == 0 ) { |
3711 | aExp = 1; | | 3711 | aExp = 1; |
3712 | bExp = 1; | | 3712 | bExp = 1; |
3713 | } | | 3713 | } |
3714 | zSig1 = 0; | | 3714 | zSig1 = 0; |
3715 | if ( bSig < aSig ) goto aBigger; | | 3715 | if ( bSig < aSig ) goto aBigger; |
3716 | if ( aSig < bSig ) goto bBigger; | | 3716 | if ( aSig < bSig ) goto bBigger; |
3717 | return packFloatx80( float_rounding_mode == float_round_down, 0, 0 ); | | 3717 | return packFloatx80( float_rounding_mode == float_round_down, 0, 0 ); |
3718 | bExpBigger: | | 3718 | bExpBigger: |
3719 | if ( bExp == 0x7FFF ) { | | 3719 | if ( bExp == 0x7FFF ) { |
3720 | if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); | | 3720 | if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); |
3721 | return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); | | 3721 | return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); |
3722 | } | | 3722 | } |
3723 | if ( aExp == 0 ) ++expDiff; | | 3723 | if ( aExp == 0 ) ++expDiff; |
3724 | shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); | | 3724 | shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); |
3725 | bBigger: | | 3725 | bBigger: |
3726 | sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); | | 3726 | sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); |
3727 | zExp = bExp; | | 3727 | zExp = bExp; |
3728 | zSign ^= 1; | | 3728 | zSign ^= 1; |
3729 | goto normalizeRoundAndPack; | | 3729 | goto normalizeRoundAndPack; |
3730 | aExpBigger: | | 3730 | aExpBigger: |
3731 | if ( aExp == 0x7FFF ) { | | 3731 | if ( aExp == 0x7FFF ) { |
3732 | if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); | | 3732 | if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); |
3733 | return a; | | 3733 | return a; |
3734 | } | | 3734 | } |
3735 | if ( bExp == 0 ) --expDiff; | | 3735 | if ( bExp == 0 ) --expDiff; |
3736 | shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); | | 3736 | shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); |
3737 | aBigger: | | 3737 | aBigger: |
3738 | sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); | | 3738 | sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); |
3739 | zExp = aExp; | | 3739 | zExp = aExp; |
3740 | normalizeRoundAndPack: | | 3740 | normalizeRoundAndPack: |
3741 | return | | 3741 | return |
3742 | normalizeRoundAndPackFloatx80( | | 3742 | normalizeRoundAndPackFloatx80( |
3743 | floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); | | 3743 | floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); |
3744 | | | 3744 | |
3745 | } | | 3745 | } |
3746 | | | 3746 | |
3747 | /* | | 3747 | /* |
3748 | ------------------------------------------------------------------------------- | | 3748 | ------------------------------------------------------------------------------- |
3749 | Returns the result of adding the extended double-precision floating-point | | 3749 | Returns the result of adding the extended double-precision floating-point |
3750 | values `a' and `b'. The operation is performed according to the IEC/IEEE | | 3750 | values `a' and `b'. The operation is performed according to the IEC/IEEE |
3751 | Standard for Binary Floating-Point Arithmetic. | | 3751 | Standard for Binary Floating-Point Arithmetic. |
3752 | ------------------------------------------------------------------------------- | | 3752 | ------------------------------------------------------------------------------- |
3753 | */ | | 3753 | */ |
3754 | floatx80 floatx80_add( floatx80 a, floatx80 b ) | | 3754 | floatx80 floatx80_add( floatx80 a, floatx80 b ) |
3755 | { | | 3755 | { |
3756 | flag aSign, bSign; | | 3756 | flag aSign, bSign; |
3757 | | | 3757 | |
3758 | aSign = extractFloatx80Sign( a ); | | 3758 | aSign = extractFloatx80Sign( a ); |
3759 | bSign = extractFloatx80Sign( b ); | | 3759 | bSign = extractFloatx80Sign( b ); |
3760 | if ( aSign == bSign ) { | | 3760 | if ( aSign == bSign ) { |
3761 | return addFloatx80Sigs( a, b, aSign ); | | 3761 | return addFloatx80Sigs( a, b, aSign ); |
3762 | } | | 3762 | } |
3763 | else { | | 3763 | else { |
3764 | return subFloatx80Sigs( a, b, aSign ); | | 3764 | return subFloatx80Sigs( a, b, aSign ); |
3765 | } | | 3765 | } |
3766 | | | 3766 | |
3767 | } | | 3767 | } |
3768 | | | 3768 | |
3769 | /* | | 3769 | /* |
3770 | ------------------------------------------------------------------------------- | | 3770 | ------------------------------------------------------------------------------- |
3771 | Returns the result of subtracting the extended double-precision floating- | | 3771 | Returns the result of subtracting the extended double-precision floating- |
3772 | point values `a' and `b'. The operation is performed according to the | | 3772 | point values `a' and `b'. The operation is performed according to the |
3773 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. | | 3773 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
3774 | ------------------------------------------------------------------------------- | | 3774 | ------------------------------------------------------------------------------- |
3775 | */ | | 3775 | */ |
3776 | floatx80 floatx80_sub( floatx80 a, floatx80 b ) | | 3776 | floatx80 floatx80_sub( floatx80 a, floatx80 b ) |
3777 | { | | 3777 | { |
3778 | flag aSign, bSign; | | 3778 | flag aSign, bSign; |
3779 | | | 3779 | |
3780 | aSign = extractFloatx80Sign( a ); | | 3780 | aSign = extractFloatx80Sign( a ); |
3781 | bSign = extractFloatx80Sign( b ); | | 3781 | bSign = extractFloatx80Sign( b ); |
3782 | if ( aSign == bSign ) { | | 3782 | if ( aSign == bSign ) { |
3783 | return subFloatx80Sigs( a, b, aSign ); | | 3783 | return subFloatx80Sigs( a, b, aSign ); |
3784 | } | | 3784 | } |
3785 | else { | | 3785 | else { |
3786 | return addFloatx80Sigs( a, b, aSign ); | | 3786 | return addFloatx80Sigs( a, b, aSign ); |
3787 | } | | 3787 | } |
3788 | | | 3788 | |
3789 | } | | 3789 | } |
3790 | | | 3790 | |
3791 | /* | | 3791 | /* |
3792 | ------------------------------------------------------------------------------- | | 3792 | ------------------------------------------------------------------------------- |
3793 | Returns the result of multiplying the extended double-precision floating- | | 3793 | Returns the result of multiplying the extended double-precision floating- |
3794 | point values `a' and `b'. The operation is performed according to the | | 3794 | point values `a' and `b'. The operation is performed according to the |
3795 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. | | 3795 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
3796 | ------------------------------------------------------------------------------- | | 3796 | ------------------------------------------------------------------------------- |
3797 | */ | | 3797 | */ |
3798 | floatx80 floatx80_mul( floatx80 a, floatx80 b ) | | 3798 | floatx80 floatx80_mul( floatx80 a, floatx80 b ) |
3799 | { | | 3799 | { |
3800 | flag aSign, bSign, zSign; | | 3800 | flag aSign, bSign, zSign; |
3801 | int32 aExp, bExp, zExp; | | 3801 | int32 aExp, bExp, zExp; |
3802 | bits64 aSig, bSig, zSig0, zSig1; | | 3802 | bits64 aSig, bSig, zSig0, zSig1; |
3803 | floatx80 z; | | 3803 | floatx80 z; |
3804 | | | 3804 | |
3805 | aSig = extractFloatx80Frac( a ); | | 3805 | aSig = extractFloatx80Frac( a ); |
3806 | aExp = extractFloatx80Exp( a ); | | 3806 | aExp = extractFloatx80Exp( a ); |
3807 | aSign = extractFloatx80Sign( a ); | | 3807 | aSign = extractFloatx80Sign( a ); |
3808 | bSig = extractFloatx80Frac( b ); | | 3808 | bSig = extractFloatx80Frac( b ); |
3809 | bExp = extractFloatx80Exp( b ); | | 3809 | bExp = extractFloatx80Exp( b ); |
3810 | bSign = extractFloatx80Sign( b ); | | 3810 | bSign = extractFloatx80Sign( b ); |
3811 | zSign = aSign ^ bSign; | | 3811 | zSign = aSign ^ bSign; |
3812 | if ( aExp == 0x7FFF ) { | | 3812 | if ( aExp == 0x7FFF ) { |
3813 | if ( (bits64) ( aSig<<1 ) | | 3813 | if ( (bits64) ( aSig<<1 ) |
3814 | || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { | | 3814 | || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { |
3815 | return propagateFloatx80NaN( a, b ); | | 3815 | return propagateFloatx80NaN( a, b ); |
3816 | } | | 3816 | } |
3817 | if ( ( bExp | bSig ) == 0 ) goto invalid; | | 3817 | if ( ( bExp | bSig ) == 0 ) goto invalid; |
3818 | return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); | | 3818 | return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); |
3819 | } | | 3819 | } |
3820 | if ( bExp == 0x7FFF ) { | | 3820 | if ( bExp == 0x7FFF ) { |
3821 | if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); | | 3821 | if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); |
3822 | if ( ( aExp | aSig ) == 0 ) { | | 3822 | if ( ( aExp | aSig ) == 0 ) { |
3823 | invalid: | | 3823 | invalid: |
3824 | float_raise( float_flag_invalid ); | | 3824 | float_raise( float_flag_invalid ); |
3825 | z.low = floatx80_default_nan_low; | | 3825 | z.low = floatx80_default_nan_low; |
3826 | z.high = floatx80_default_nan_high; | | 3826 | z.high = floatx80_default_nan_high; |
3827 | return z; | | 3827 | return z; |
3828 | } | | 3828 | } |
3829 | return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); | | 3829 | return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); |
3830 | } | | 3830 | } |
3831 | if ( aExp == 0 ) { | | 3831 | if ( aExp == 0 ) { |
3832 | if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); | | 3832 | if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); |
3833 | normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); | | 3833 | normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); |
3834 | } | | 3834 | } |
3835 | if ( bExp == 0 ) { | | 3835 | if ( bExp == 0 ) { |
3836 | if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); | | 3836 | if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); |
3837 | normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); | | 3837 | normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); |
3838 | } | | 3838 | } |
3839 | zExp = aExp + bExp - 0x3FFE; | | 3839 | zExp = aExp + bExp - 0x3FFE; |
3840 | mul64To128( aSig, bSig, &zSig0, &zSig1 ); | | 3840 | mul64To128( aSig, bSig, &zSig0, &zSig1 ); |
3841 | if ( 0 < (sbits64) zSig0 ) { | | 3841 | if ( 0 < (sbits64) zSig0 ) { |
3842 | shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); | | 3842 | shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); |
3843 | --zExp; | | 3843 | --zExp; |
3844 | } | | 3844 | } |
3845 | return | | 3845 | return |
3846 | roundAndPackFloatx80( | | 3846 | roundAndPackFloatx80( |
3847 | floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); | | 3847 | floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); |
3848 | | | 3848 | |
3849 | } | | 3849 | } |
3850 | | | 3850 | |
3851 | /* | | 3851 | /* |
3852 | ------------------------------------------------------------------------------- | | 3852 | ------------------------------------------------------------------------------- |
3853 | Returns the result of dividing the extended double-precision floating-point | | 3853 | Returns the result of dividing the extended double-precision floating-point |
3854 | value `a' by the corresponding value `b'. The operation is performed | | 3854 | value `a' by the corresponding value `b'. The operation is performed |
3855 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. | | 3855 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
3856 | ------------------------------------------------------------------------------- | | 3856 | ------------------------------------------------------------------------------- |
3857 | */ | | 3857 | */ |
3858 | floatx80 floatx80_div( floatx80 a, floatx80 b ) | | 3858 | floatx80 floatx80_div( floatx80 a, floatx80 b ) |
3859 | { | | 3859 | { |
3860 | flag aSign, bSign, zSign; | | 3860 | flag aSign, bSign, zSign; |
3861 | int32 aExp, bExp, zExp; | | 3861 | int32 aExp, bExp, zExp; |
3862 | bits64 aSig, bSig, zSig0, zSig1; | | 3862 | bits64 aSig, bSig, zSig0, zSig1; |
3863 | bits64 rem0, rem1, rem2, term0, term1, term2; | | 3863 | bits64 rem0, rem1, rem2, term0, term1, term2; |
3864 | floatx80 z; | | 3864 | floatx80 z; |
3865 | | | 3865 | |
3866 | aSig = extractFloatx80Frac( a ); | | 3866 | aSig = extractFloatx80Frac( a ); |
3867 | aExp = extractFloatx80Exp( a ); | | 3867 | aExp = extractFloatx80Exp( a ); |
3868 | aSign = extractFloatx80Sign( a ); | | 3868 | aSign = extractFloatx80Sign( a ); |
3869 | bSig = extractFloatx80Frac( b ); | | 3869 | bSig = extractFloatx80Frac( b ); |
3870 | bExp = extractFloatx80Exp( b ); | | 3870 | bExp = extractFloatx80Exp( b ); |
3871 | bSign = extractFloatx80Sign( b ); | | 3871 | bSign = extractFloatx80Sign( b ); |
3872 | zSign = aSign ^ bSign; | | 3872 | zSign = aSign ^ bSign; |
3873 | if ( aExp == 0x7FFF ) { | | 3873 | if ( aExp == 0x7FFF ) { |
3874 | if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); | | 3874 | if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); |
3875 | if ( bExp == 0x7FFF ) { | | 3875 | if ( bExp == 0x7FFF ) { |
3876 | if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); | | 3876 | if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); |
3877 | goto invalid; | | 3877 | goto invalid; |
3878 | } | | 3878 | } |
3879 | return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); | | 3879 | return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); |
3880 | } | | 3880 | } |
3881 | if ( bExp == 0x7FFF ) { | | 3881 | if ( bExp == 0x7FFF ) { |
3882 | if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); | | 3882 | if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); |
3883 | return packFloatx80( zSign, 0, 0 ); | | 3883 | return packFloatx80( zSign, 0, 0 ); |
3884 | } | | 3884 | } |
3885 | if ( bExp == 0 ) { | | 3885 | if ( bExp == 0 ) { |
3886 | if ( bSig == 0 ) { | | 3886 | if ( bSig == 0 ) { |
3887 | if ( ( aExp | aSig ) == 0 ) { | | 3887 | if ( ( aExp | aSig ) == 0 ) { |
3888 | invalid: | | 3888 | invalid: |
3889 | float_raise( float_flag_invalid ); | | 3889 | float_raise( float_flag_invalid ); |
3890 | z.low = floatx80_default_nan_low; | | 3890 | z.low = floatx80_default_nan_low; |
3891 | z.high = floatx80_default_nan_high; | | 3891 | z.high = floatx80_default_nan_high; |
3892 | return z; | | 3892 | return z; |
3893 | } | | 3893 | } |
3894 | float_raise( float_flag_divbyzero ); | | 3894 | float_raise( float_flag_divbyzero ); |
3895 | return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); | | 3895 | return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); |
3896 | } | | 3896 | } |
3897 | normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); | | 3897 | normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); |
3898 | } | | 3898 | } |
3899 | if ( aExp == 0 ) { | | 3899 | if ( aExp == 0 ) { |
3900 | if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); | | 3900 | if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); |
3901 | normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); | | 3901 | normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); |
3902 | } | | 3902 | } |
3903 | zExp = aExp - bExp + 0x3FFE; | | 3903 | zExp = aExp - bExp + 0x3FFE; |
3904 | rem1 = 0; | | 3904 | rem1 = 0; |
3905 | if ( bSig <= aSig ) { | | 3905 | if ( bSig <= aSig ) { |
3906 | shift128Right( aSig, 0, 1, &aSig, &rem1 ); | | 3906 | shift128Right( aSig, 0, 1, &aSig, &rem1 ); |
3907 | ++zExp; | | 3907 | ++zExp; |
3908 | } | | 3908 | } |
3909 | zSig0 = estimateDiv128To64( aSig, rem1, bSig ); | | 3909 | zSig0 = estimateDiv128To64( aSig, rem1, bSig ); |
3910 | mul64To128( bSig, zSig0, &term0, &term1 ); | | 3910 | mul64To128( bSig, zSig0, &term0, &term1 ); |
3911 | sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); | | 3911 | sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); |
3912 | while ( (sbits64) rem0 < 0 ) { | | 3912 | while ( (sbits64) rem0 < 0 ) { |
3913 | --zSig0; | | 3913 | --zSig0; |
3914 | add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); | | 3914 | add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); |
3915 | } | | 3915 | } |
3916 | zSig1 = estimateDiv128To64( rem1, 0, bSig ); | | 3916 | zSig1 = estimateDiv128To64( rem1, 0, bSig ); |
3917 | if ( (bits64) ( zSig1<<1 ) <= 8 ) { | | 3917 | if ( (bits64) ( zSig1<<1 ) <= 8 ) { |
3918 | mul64To128( bSig, zSig1, &term1, &term2 ); | | 3918 | mul64To128( bSig, zSig1, &term1, &term2 ); |
3919 | sub128( rem1, 0, term1, term2, &rem1, &rem2 ); | | 3919 | sub128( rem1, 0, term1, term2, &rem1, &rem2 ); |
3920 | while ( (sbits64) rem1 < 0 ) { | | 3920 | while ( (sbits64) rem1 < 0 ) { |
3921 | --zSig1; | | 3921 | --zSig1; |
3922 | add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); | | 3922 | add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); |
3923 | } | | 3923 | } |
3924 | zSig1 |= ( ( rem1 | rem2 ) != 0 ); | | 3924 | zSig1 |= ( ( rem1 | rem2 ) != 0 ); |
3925 | } | | 3925 | } |
3926 | return | | 3926 | return |
3927 | roundAndPackFloatx80( | | 3927 | roundAndPackFloatx80( |
3928 | floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); | | 3928 | floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); |
3929 | | | 3929 | |
3930 | } | | 3930 | } |
3931 | | | 3931 | |
3932 | /* | | 3932 | /* |
3933 | ------------------------------------------------------------------------------- | | 3933 | ------------------------------------------------------------------------------- |
3934 | Returns the remainder of the extended double-precision floating-point value | | 3934 | Returns the remainder of the extended double-precision floating-point value |
3935 | `a' with respect to the corresponding value `b'. The operation is performed | | 3935 | `a' with respect to the corresponding value `b'. The operation is performed |
3936 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. | | 3936 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
3937 | ------------------------------------------------------------------------------- | | 3937 | ------------------------------------------------------------------------------- |
3938 | */ | | 3938 | */ |
3939 | floatx80 floatx80_rem( floatx80 a, floatx80 b ) | | 3939 | floatx80 floatx80_rem( floatx80 a, floatx80 b ) |
3940 | { | | 3940 | { |
3941 | flag aSign, bSign, zSign; | | 3941 | flag aSign, zSign; |
3942 | int32 aExp, bExp, expDiff; | | 3942 | int32 aExp, bExp, expDiff; |
3943 | bits64 aSig0, aSig1, bSig; | | 3943 | bits64 aSig0, aSig1, bSig; |
3944 | bits64 q, term0, term1, alternateASig0, alternateASig1; | | 3944 | bits64 q, term0, term1, alternateASig0, alternateASig1; |
3945 | floatx80 z; | | 3945 | floatx80 z; |
3946 | | | 3946 | |
3947 | aSig0 = extractFloatx80Frac( a ); | | 3947 | aSig0 = extractFloatx80Frac( a ); |
3948 | aExp = extractFloatx80Exp( a ); | | 3948 | aExp = extractFloatx80Exp( a ); |
3949 | aSign = extractFloatx80Sign( a ); | | 3949 | aSign = extractFloatx80Sign( a ); |
3950 | bSig = extractFloatx80Frac( b ); | | 3950 | bSig = extractFloatx80Frac( b ); |
3951 | bExp = extractFloatx80Exp( b ); | | 3951 | bExp = extractFloatx80Exp( b ); |
3952 | bSign = extractFloatx80Sign( b ); | | 3952 | |
3953 | if ( aExp == 0x7FFF ) { | | 3953 | if ( aExp == 0x7FFF ) { |
3954 | if ( (bits64) ( aSig0<<1 ) | | 3954 | if ( (bits64) ( aSig0<<1 ) |
3955 | || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { | | 3955 | || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { |
3956 | return propagateFloatx80NaN( a, b ); | | 3956 | return propagateFloatx80NaN( a, b ); |
3957 | } | | 3957 | } |
3958 | goto invalid; | | 3958 | goto invalid; |
3959 | } | | 3959 | } |
3960 | if ( bExp == 0x7FFF ) { | | 3960 | if ( bExp == 0x7FFF ) { |
3961 | if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); | | 3961 | if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); |
3962 | return a; | | 3962 | return a; |
3963 | } | | 3963 | } |
3964 | if ( bExp == 0 ) { | | 3964 | if ( bExp == 0 ) { |
3965 | if ( bSig == 0 ) { | | 3965 | if ( bSig == 0 ) { |
3966 | invalid: | | 3966 | invalid: |
3967 | float_raise( float_flag_invalid ); | | 3967 | float_raise( float_flag_invalid ); |
3968 | z.low = floatx80_default_nan_low; | | 3968 | z.low = floatx80_default_nan_low; |
3969 | z.high = floatx80_default_nan_high; | | 3969 | z.high = floatx80_default_nan_high; |
3970 | return z; | | 3970 | return z; |
3971 | } | | 3971 | } |
3972 | normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); | | 3972 | normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); |
3973 | } | | 3973 | } |
3974 | if ( aExp == 0 ) { | | 3974 | if ( aExp == 0 ) { |
3975 | if ( (bits64) ( aSig0<<1 ) == 0 ) return a; | | 3975 | if ( (bits64) ( aSig0<<1 ) == 0 ) return a; |
3976 | normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); | | 3976 | normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); |
3977 | } | | 3977 | } |
3978 | bSig |= LIT64( 0x8000000000000000 ); | | 3978 | bSig |= LIT64( 0x8000000000000000 ); |
3979 | zSign = aSign; | | 3979 | zSign = aSign; |
3980 | expDiff = aExp - bExp; | | 3980 | expDiff = aExp - bExp; |
3981 | aSig1 = 0; | | 3981 | aSig1 = 0; |
3982 | if ( expDiff < 0 ) { | | 3982 | if ( expDiff < 0 ) { |
3983 | if ( expDiff < -1 ) return a; | | 3983 | if ( expDiff < -1 ) return a; |
3984 | shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); | | 3984 | shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); |
3985 | expDiff = 0; | | 3985 | expDiff = 0; |
3986 | } | | 3986 | } |
3987 | q = ( bSig <= aSig0 ); | | 3987 | q = ( bSig <= aSig0 ); |
3988 | if ( q ) aSig0 -= bSig; | | 3988 | if ( q ) aSig0 -= bSig; |
3989 | expDiff -= 64; | | 3989 | expDiff -= 64; |
3990 | while ( 0 < expDiff ) { | | 3990 | while ( 0 < expDiff ) { |
3991 | q = estimateDiv128To64( aSig0, aSig1, bSig ); | | 3991 | q = estimateDiv128To64( aSig0, aSig1, bSig ); |
3992 | q = ( 2 < q ) ? q - 2 : 0; | | 3992 | q = ( 2 < q ) ? q - 2 : 0; |
3993 | mul64To128( bSig, q, &term0, &term1 ); | | 3993 | mul64To128( bSig, q, &term0, &term1 ); |
3994 | sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); | | 3994 | sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); |
3995 | shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); | | 3995 | shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); |
3996 | expDiff -= 62; | | 3996 | expDiff -= 62; |
3997 | } | | 3997 | } |
3998 | expDiff += 64; | | 3998 | expDiff += 64; |
3999 | if ( 0 < expDiff ) { | | 3999 | if ( 0 < expDiff ) { |
4000 | q = estimateDiv128To64( aSig0, aSig1, bSig ); | | 4000 | q = estimateDiv128To64( aSig0, aSig1, bSig ); |
4001 | q = ( 2 < q ) ? q - 2 : 0; | | 4001 | q = ( 2 < q ) ? q - 2 : 0; |
4002 | q >>= 64 - expDiff; | | 4002 | q >>= 64 - expDiff; |
4003 | mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); | | 4003 | mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); |
4004 | sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); | | 4004 | sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); |
4005 | shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); | | 4005 | shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); |
4006 | while ( le128( term0, term1, aSig0, aSig1 ) ) { | | 4006 | while ( le128( term0, term1, aSig0, aSig1 ) ) { |
4007 | ++q; | | 4007 | ++q; |
4008 | sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); | | 4008 | sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); |
4009 | } | | 4009 | } |
4010 | } | | 4010 | } |
4011 | else { | | 4011 | else { |
4012 | term1 = 0; | | 4012 | term1 = 0; |
4013 | term0 = bSig; | | 4013 | term0 = bSig; |
4014 | } | | 4014 | } |
4015 | sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); | | 4015 | sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); |
4016 | if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) | | 4016 | if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) |
4017 | || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) | | 4017 | || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) |
4018 | && ( q & 1 ) ) | | 4018 | && ( q & 1 ) ) |
4019 | ) { | | 4019 | ) { |
4020 | aSig0 = alternateASig0; | | 4020 | aSig0 = alternateASig0; |
4021 | aSig1 = alternateASig1; | | 4021 | aSig1 = alternateASig1; |
4022 | zSign = ! zSign; | | 4022 | zSign = ! zSign; |
4023 | } | | 4023 | } |
4024 | return | | 4024 | return |
4025 | normalizeRoundAndPackFloatx80( | | 4025 | normalizeRoundAndPackFloatx80( |
4026 | 80, zSign, bExp + expDiff, aSig0, aSig1 ); | | 4026 | 80, zSign, bExp + expDiff, aSig0, aSig1 ); |
4027 | | | 4027 | |
4028 | } | | 4028 | } |
4029 | | | 4029 | |
4030 | /* | | 4030 | /* |
4031 | ------------------------------------------------------------------------------- | | 4031 | ------------------------------------------------------------------------------- |
4032 | Returns the square root of the extended double-precision floating-point | | 4032 | Returns the square root of the extended double-precision floating-point |
4033 | value `a'. The operation is performed according to the IEC/IEEE Standard | | 4033 | value `a'. The operation is performed according to the IEC/IEEE Standard |
4034 | for Binary Floating-Point Arithmetic. | | 4034 | for Binary Floating-Point Arithmetic. |
4035 | ------------------------------------------------------------------------------- | | 4035 | ------------------------------------------------------------------------------- |
4036 | */ | | 4036 | */ |
4037 | floatx80 floatx80_sqrt( floatx80 a ) | | 4037 | floatx80 floatx80_sqrt( floatx80 a ) |
4038 | { | | 4038 | { |
4039 | flag aSign; | | 4039 | flag aSign; |
4040 | int32 aExp, zExp; | | 4040 | int32 aExp, zExp; |
4041 | bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0; | | 4041 | bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0; |
4042 | bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; | | 4042 | bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; |
4043 | floatx80 z; | | 4043 | floatx80 z; |
4044 | | | 4044 | |
4045 | aSig0 = extractFloatx80Frac( a ); | | 4045 | aSig0 = extractFloatx80Frac( a ); |
4046 | aExp = extractFloatx80Exp( a ); | | 4046 | aExp = extractFloatx80Exp( a ); |
4047 | aSign = extractFloatx80Sign( a ); | | 4047 | aSign = extractFloatx80Sign( a ); |
4048 | if ( aExp == 0x7FFF ) { | | 4048 | if ( aExp == 0x7FFF ) { |
4049 | if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a ); | | 4049 | if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a ); |
4050 | if ( ! aSign ) return a; | | 4050 | if ( ! aSign ) return a; |
4051 | goto invalid; | | 4051 | goto invalid; |
4052 | } | | 4052 | } |
4053 | if ( aSign ) { | | 4053 | if ( aSign ) { |
4054 | if ( ( aExp | aSig0 ) == 0 ) return a; | | 4054 | if ( ( aExp | aSig0 ) == 0 ) return a; |
4055 | invalid: | | 4055 | invalid: |
4056 | float_raise( float_flag_invalid ); | | 4056 | float_raise( float_flag_invalid ); |
4057 | z.low = floatx80_default_nan_low; | | 4057 | z.low = floatx80_default_nan_low; |
4058 | z.high = floatx80_default_nan_high; | | 4058 | z.high = floatx80_default_nan_high; |
4059 | return z; | | 4059 | return z; |
4060 | } | | 4060 | } |
4061 | if ( aExp == 0 ) { | | 4061 | if ( aExp == 0 ) { |
4062 | if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); | | 4062 | if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); |
4063 | normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); | | 4063 | normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); |
4064 | } | | 4064 | } |
4065 | zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; | | 4065 | zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; |
4066 | zSig0 = estimateSqrt32( aExp, aSig0>>32 ); | | 4066 | zSig0 = estimateSqrt32( aExp, aSig0>>32 ); |
4067 | shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); | | 4067 | shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); |
4068 | zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); | | 4068 | zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); |
4069 | doubleZSig0 = zSig0<<1; | | 4069 | doubleZSig0 = zSig0<<1; |
4070 | mul64To128( zSig0, zSig0, &term0, &term1 ); | | 4070 | mul64To128( zSig0, zSig0, &term0, &term1 ); |
4071 | sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); | | 4071 | sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); |
4072 | while ( (sbits64) rem0 < 0 ) { | | 4072 | while ( (sbits64) rem0 < 0 ) { |
4073 | --zSig0; | | 4073 | --zSig0; |
4074 | doubleZSig0 -= 2; | | 4074 | doubleZSig0 -= 2; |
4075 | add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); | | 4075 | add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); |
4076 | } | | 4076 | } |
4077 | zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); | | 4077 | zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); |
4078 | if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { | | 4078 | if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { |
4079 | if ( zSig1 == 0 ) zSig1 = 1; | | 4079 | if ( zSig1 == 0 ) zSig1 = 1; |
4080 | mul64To128( doubleZSig0, zSig1, &term1, &term2 ); | | 4080 | mul64To128( doubleZSig0, zSig1, &term1, &term2 ); |
4081 | sub128( rem1, 0, term1, term2, &rem1, &rem2 ); | | 4081 | sub128( rem1, 0, term1, term2, &rem1, &rem2 ); |
4082 | mul64To128( zSig1, zSig1, &term2, &term3 ); | | 4082 | mul64To128( zSig1, zSig1, &term2, &term3 ); |
4083 | sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); | | 4083 | sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); |
4084 | while ( (sbits64) rem1 < 0 ) { | | 4084 | while ( (sbits64) rem1 < 0 ) { |
4085 | --zSig1; | | 4085 | --zSig1; |
4086 | shortShift128Left( 0, zSig1, 1, &term2, &term3 ); | | 4086 | shortShift128Left( 0, zSig1, 1, &term2, &term3 ); |
4087 | term3 |= 1; | | 4087 | term3 |= 1; |
4088 | term2 |= doubleZSig0; | | 4088 | term2 |= doubleZSig0; |
4089 | add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); | | 4089 | add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); |
4090 | } | | 4090 | } |
4091 | zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); | | 4091 | zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); |
4092 | } | | 4092 | } |
4093 | shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); | | 4093 | shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); |
4094 | zSig0 |= doubleZSig0; | | 4094 | zSig0 |= doubleZSig0; |
4095 | return | | 4095 | return |
4096 | roundAndPackFloatx80( | | 4096 | roundAndPackFloatx80( |
4097 | floatx80_rounding_precision, 0, zExp, zSig0, zSig1 ); | | 4097 | floatx80_rounding_precision, 0, zExp, zSig0, zSig1 ); |
4098 | | | 4098 | |
4099 | } | | 4099 | } |
4100 | | | 4100 | |
4101 | /* | | 4101 | /* |
4102 | ------------------------------------------------------------------------------- | | 4102 | ------------------------------------------------------------------------------- |
4103 | Returns 1 if the extended double-precision floating-point value `a' is | | 4103 | Returns 1 if the extended double-precision floating-point value `a' is |
4104 | equal to the corresponding value `b', and 0 otherwise. The comparison is | | 4104 | equal to the corresponding value `b', and 0 otherwise. The comparison is |
4105 | performed according to the IEC/IEEE Standard for Binary Floating-Point | | 4105 | performed according to the IEC/IEEE Standard for Binary Floating-Point |
4106 | Arithmetic. | | 4106 | Arithmetic. |
4107 | ------------------------------------------------------------------------------- | | 4107 | ------------------------------------------------------------------------------- |
4108 | */ | | 4108 | */ |
4109 | flag floatx80_eq( floatx80 a, floatx80 b ) | | 4109 | flag floatx80_eq( floatx80 a, floatx80 b ) |
4110 | { | | 4110 | { |
4111 | | | 4111 | |
4112 | if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) | | 4112 | if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) |
4113 | && (bits64) ( extractFloatx80Frac( a )<<1 ) ) | | 4113 | && (bits64) ( extractFloatx80Frac( a )<<1 ) ) |
4114 | || ( ( extractFloatx80Exp( b ) == 0x7FFF ) | | 4114 | || ( ( extractFloatx80Exp( b ) == 0x7FFF ) |
4115 | && (bits64) ( extractFloatx80Frac( b )<<1 ) ) | | 4115 | && (bits64) ( extractFloatx80Frac( b )<<1 ) ) |
4116 | ) { | | 4116 | ) { |
4117 | if ( floatx80_is_signaling_nan( a ) | | 4117 | if ( floatx80_is_signaling_nan( a ) |
4118 | || floatx80_is_signaling_nan( b ) ) { | | 4118 | || floatx80_is_signaling_nan( b ) ) { |
4119 | float_raise( float_flag_invalid ); | | 4119 | float_raise( float_flag_invalid ); |
4120 | } | | 4120 | } |
4121 | return 0; | | 4121 | return 0; |
4122 | } | | 4122 | } |
4123 | return | | 4123 | return |
4124 | ( a.low == b.low ) | | 4124 | ( a.low == b.low ) |
4125 | && ( ( a.high == b.high ) | | 4125 | && ( ( a.high == b.high ) |
4126 | || ( ( a.low == 0 ) | | 4126 | || ( ( a.low == 0 ) |
4127 | && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) | | 4127 | && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) |
4128 | ); | | 4128 | ); |
4129 | | | 4129 | |
4130 | } | | 4130 | } |
4131 | | | 4131 | |
4132 | /* | | 4132 | /* |
4133 | ------------------------------------------------------------------------------- | | 4133 | ------------------------------------------------------------------------------- |
4134 | Returns 1 if the extended double-precision floating-point value `a' is | | 4134 | Returns 1 if the extended double-precision floating-point value `a' is |
4135 | less than or equal to the corresponding value `b', and 0 otherwise. The | | 4135 | less than or equal to the corresponding value `b', and 0 otherwise. The |
4136 | comparison is performed according to the IEC/IEEE Standard for Binary | | 4136 | comparison is performed according to the IEC/IEEE Standard for Binary |
4137 | Floating-Point Arithmetic. | | 4137 | Floating-Point Arithmetic. |
4138 | ------------------------------------------------------------------------------- | | 4138 | ------------------------------------------------------------------------------- |
4139 | */ | | 4139 | */ |
4140 | flag floatx80_le( floatx80 a, floatx80 b ) | | 4140 | flag floatx80_le( floatx80 a, floatx80 b ) |
4141 | { | | 4141 | { |
4142 | flag aSign, bSign; | | 4142 | flag aSign, bSign; |
4143 | | | 4143 | |
4144 | if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) | | 4144 | if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) |
4145 | && (bits64) ( extractFloatx80Frac( a )<<1 ) ) | | 4145 | && (bits64) ( extractFloatx80Frac( a )<<1 ) ) |
4146 | || ( ( extractFloatx80Exp( b ) == 0x7FFF ) | | 4146 | || ( ( extractFloatx80Exp( b ) == 0x7FFF ) |
4147 | && (bits64) ( extractFloatx80Frac( b )<<1 ) ) | | 4147 | && (bits64) ( extractFloatx80Frac( b )<<1 ) ) |
4148 | ) { | | 4148 | ) { |
4149 | float_raise( float_flag_invalid ); | | 4149 | float_raise( float_flag_invalid ); |
4150 | return 0; | | 4150 | return 0; |
4151 | } | | 4151 | } |
4152 | aSign = extractFloatx80Sign( a ); | | 4152 | aSign = extractFloatx80Sign( a ); |
4153 | bSign = extractFloatx80Sign( b ); | | 4153 | bSign = extractFloatx80Sign( b ); |
4154 | if ( aSign != bSign ) { | | 4154 | if ( aSign != bSign ) { |
4155 | return | | 4155 | return |
4156 | aSign | | 4156 | aSign |
4157 | || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) | | 4157 | || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) |
4158 | == 0 ); | | 4158 | == 0 ); |
4159 | } | | 4159 | } |
4160 | return | | 4160 | return |
4161 | aSign ? le128( b.high, b.low, a.high, a.low ) | | 4161 | aSign ? le128( b.high, b.low, a.high, a.low ) |
4162 | : le128( a.high, a.low, b.high, b.low ); | | 4162 | : le128( a.high, a.low, b.high, b.low ); |
4163 | | | 4163 | |
4164 | } | | 4164 | } |
4165 | | | 4165 | |
4166 | /* | | 4166 | /* |
4167 | ------------------------------------------------------------------------------- | | 4167 | ------------------------------------------------------------------------------- |
4168 | Returns 1 if the extended double-precision floating-point value `a' is | | 4168 | Returns 1 if the extended double-precision floating-point value `a' is |
4169 | less than the corresponding value `b', and 0 otherwise. The comparison | | 4169 | less than the corresponding value `b', and 0 otherwise. The comparison |
4170 | is performed according to the IEC/IEEE Standard for Binary Floating-Point | | 4170 | is performed according to the IEC/IEEE Standard for Binary Floating-Point |
4171 | Arithmetic. | | 4171 | Arithmetic. |
4172 | ------------------------------------------------------------------------------- | | 4172 | ------------------------------------------------------------------------------- |
4173 | */ | | 4173 | */ |
4174 | flag floatx80_lt( floatx80 a, floatx80 b ) | | 4174 | flag floatx80_lt( floatx80 a, floatx80 b ) |
4175 | { | | 4175 | { |
4176 | flag aSign, bSign; | | 4176 | flag aSign, bSign; |
4177 | | | 4177 | |
4178 | if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) | | 4178 | if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) |
4179 | && (bits64) ( extractFloatx80Frac( a )<<1 ) ) | | 4179 | && (bits64) ( extractFloatx80Frac( a )<<1 ) ) |
4180 | || ( ( extractFloatx80Exp( b ) == 0x7FFF ) | | 4180 | || ( ( extractFloatx80Exp( b ) == 0x7FFF ) |
4181 | && (bits64) ( extractFloatx80Frac( b )<<1 ) ) | | 4181 | && (bits64) ( extractFloatx80Frac( b )<<1 ) ) |
4182 | ) { | | 4182 | ) { |
4183 | float_raise( float_flag_invalid ); | | 4183 | float_raise( float_flag_invalid ); |
4184 | return 0; | | 4184 | return 0; |
4185 | } | | 4185 | } |
4186 | aSign = extractFloatx80Sign( a ); | | 4186 | aSign = extractFloatx80Sign( a ); |
4187 | bSign = extractFloatx80Sign( b ); | | 4187 | bSign = extractFloatx80Sign( b ); |
4188 | if ( aSign != bSign ) { | | 4188 | if ( aSign != bSign ) { |
4189 | return | | 4189 | return |
4190 | aSign | | 4190 | aSign |
4191 | && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) | | 4191 | && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) |
4192 | != 0 ); | | 4192 | != 0 ); |
4193 | } | | 4193 | } |
4194 | return | | 4194 | return |
4195 | aSign ? lt128( b.high, b.low, a.high, a.low ) | | 4195 | aSign ? lt128( b.high, b.low, a.high, a.low ) |
4196 | : lt128( a.high, a.low, b.high, b.low ); | | 4196 | : lt128( a.high, a.low, b.high, b.low ); |
4197 | | | 4197 | |
4198 | } | | 4198 | } |
4199 | | | 4199 | |
4200 | /* | | 4200 | /* |
4201 | ------------------------------------------------------------------------------- | | 4201 | ------------------------------------------------------------------------------- |
4202 | Returns 1 if the extended double-precision floating-point value `a' is equal | | 4202 | Returns 1 if the extended double-precision floating-point value `a' is equal |
4203 | to the corresponding value `b', and 0 otherwise. The invalid exception is | | 4203 | to the corresponding value `b', and 0 otherwise. The invalid exception is |
4204 | raised if either operand is a NaN. Otherwise, the comparison is performed | | 4204 | raised if either operand is a NaN. Otherwise, the comparison is performed |
4205 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. | | 4205 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
4206 | ------------------------------------------------------------------------------- | | 4206 | ------------------------------------------------------------------------------- |
4207 | */ | | 4207 | */ |
4208 | flag floatx80_eq_signaling( floatx80 a, floatx80 b ) | | 4208 | flag floatx80_eq_signaling( floatx80 a, floatx80 b ) |
4209 | { | | 4209 | { |
4210 | | | 4210 | |
4211 | if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) | | 4211 | if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) |
4212 | && (bits64) ( extractFloatx80Frac( a )<<1 ) ) | | 4212 | && (bits64) ( extractFloatx80Frac( a )<<1 ) ) |
4213 | || ( ( extractFloatx80Exp( b ) == 0x7FFF ) | | 4213 | || ( ( extractFloatx80Exp( b ) == 0x7FFF ) |
4214 | && (bits64) ( extractFloatx80Frac( b )<<1 ) ) | | 4214 | && (bits64) ( extractFloatx80Frac( b )<<1 ) ) |
4215 | ) { | | 4215 | ) { |
4216 | float_raise( float_flag_invalid ); | | 4216 | float_raise( float_flag_invalid ); |
4217 | return 0; | | 4217 | return 0; |
4218 | } | | 4218 | } |
4219 | return | | 4219 | return |
4220 | ( a.low == b.low ) | | 4220 | ( a.low == b.low ) |
4221 | && ( ( a.high == b.high ) | | 4221 | && ( ( a.high == b.high ) |
4222 | || ( ( a.low == 0 ) | | 4222 | || ( ( a.low == 0 ) |
4223 | && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) | | 4223 | && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) |
4224 | ); | | 4224 | ); |
4225 | | | 4225 | |
4226 | } | | 4226 | } |
4227 | | | 4227 | |
4228 | /* | | 4228 | /* |
4229 | ------------------------------------------------------------------------------- | | 4229 | ------------------------------------------------------------------------------- |
4230 | Returns 1 if the extended double-precision floating-point value `a' is less | | 4230 | Returns 1 if the extended double-precision floating-point value `a' is less |
4231 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs | | 4231 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs |
4232 | do not cause an exception. Otherwise, the comparison is performed according | | 4232 | do not cause an exception. Otherwise, the comparison is performed according |
4233 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. | | 4233 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
4234 | ------------------------------------------------------------------------------- | | 4234 | ------------------------------------------------------------------------------- |
4235 | */ | | 4235 | */ |
4236 | flag floatx80_le_quiet( floatx80 a, floatx80 b ) | | 4236 | flag floatx80_le_quiet( floatx80 a, floatx80 b ) |
4237 | { | | 4237 | { |
4238 | flag aSign, bSign; | | 4238 | flag aSign, bSign; |
4239 | | | 4239 | |
4240 | if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) | | 4240 | if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) |
4241 | && (bits64) ( extractFloatx80Frac( a )<<1 ) ) | | 4241 | && (bits64) ( extractFloatx80Frac( a )<<1 ) ) |
4242 | || ( ( extractFloatx80Exp( b ) == 0x7FFF ) | | 4242 | || ( ( extractFloatx80Exp( b ) == 0x7FFF ) |
4243 | && (bits64) ( extractFloatx80Frac( b )<<1 ) ) | | 4243 | && (bits64) ( extractFloatx80Frac( b )<<1 ) ) |
4244 | ) { | | 4244 | ) { |
4245 | if ( floatx80_is_signaling_nan( a ) | | 4245 | if ( floatx80_is_signaling_nan( a ) |
4246 | || floatx80_is_signaling_nan( b ) ) { | | 4246 | || floatx80_is_signaling_nan( b ) ) { |
4247 | float_raise( float_flag_invalid ); | | 4247 | float_raise( float_flag_invalid ); |
4248 | } | | 4248 | } |
4249 | return 0; | | 4249 | return 0; |
4250 | } | | 4250 | } |
4251 | aSign = extractFloatx80Sign( a ); | | 4251 | aSign = extractFloatx80Sign( a ); |
4252 | bSign = extractFloatx80Sign( b ); | | 4252 | bSign = extractFloatx80Sign( b ); |
4253 | if ( aSign != bSign ) { | | 4253 | if ( aSign != bSign ) { |
4254 | return | | 4254 | return |
4255 | aSign | | 4255 | aSign |
4256 | || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) | | 4256 | || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) |
4257 | == 0 ); | | 4257 | == 0 ); |
4258 | } | | 4258 | } |
4259 | return | | 4259 | return |
4260 | aSign ? le128( b.high, b.low, a.high, a.low ) | | 4260 | aSign ? le128( b.high, b.low, a.high, a.low ) |
4261 | : le128( a.high, a.low, b.high, b.low ); | | 4261 | : le128( a.high, a.low, b.high, b.low ); |
4262 | | | 4262 | |
4263 | } | | 4263 | } |
4264 | | | 4264 | |
4265 | /* | | 4265 | /* |
4266 | ------------------------------------------------------------------------------- | | 4266 | ------------------------------------------------------------------------------- |
4267 | Returns 1 if the extended double-precision floating-point value `a' is less | | 4267 | Returns 1 if the extended double-precision floating-point value `a' is less |
4268 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause | | 4268 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause |
4269 | an exception. Otherwise, the comparison is performed according to the | | 4269 | an exception. Otherwise, the comparison is performed according to the |
4270 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. | | 4270 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
4271 | ------------------------------------------------------------------------------- | | 4271 | ------------------------------------------------------------------------------- |
4272 | */ | | 4272 | */ |
4273 | flag floatx80_lt_quiet( floatx80 a, floatx80 b ) | | 4273 | flag floatx80_lt_quiet( floatx80 a, floatx80 b ) |
4274 | { | | 4274 | { |
4275 | flag aSign, bSign; | | 4275 | flag aSign, bSign; |
4276 | | | 4276 | |
4277 | if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) | | 4277 | if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) |
4278 | && (bits64) ( extractFloatx80Frac( a )<<1 ) ) | | 4278 | && (bits64) ( extractFloatx80Frac( a )<<1 ) ) |
4279 | || ( ( extractFloatx80Exp( b ) == 0x7FFF ) | | 4279 | || ( ( extractFloatx80Exp( b ) == 0x7FFF ) |
4280 | && (bits64) ( extractFloatx80Frac( b )<<1 ) ) | | 4280 | && (bits64) ( extractFloatx80Frac( b )<<1 ) ) |
4281 | ) { | | 4281 | ) { |
4282 | if ( floatx80_is_signaling_nan( a ) | | 4282 | if ( floatx80_is_signaling_nan( a ) |
4283 | || floatx80_is_signaling_nan( b ) ) { | | 4283 | || floatx80_is_signaling_nan( b ) ) { |
4284 | float_raise( float_flag_invalid ); | | 4284 | float_raise( float_flag_invalid ); |
4285 | } | | 4285 | } |
4286 | return 0; | | 4286 | return 0; |
4287 | } | | 4287 | } |
4288 | aSign = extractFloatx80Sign( a ); | | 4288 | aSign = extractFloatx80Sign( a ); |
4289 | bSign = extractFloatx80Sign( b ); | | 4289 | bSign = extractFloatx80Sign( b ); |
4290 | if ( aSign != bSign ) { | | 4290 | if ( aSign != bSign ) { |
4291 | return | | 4291 | return |
4292 | aSign | | 4292 | aSign |
4293 | && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) | | 4293 | && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) |
4294 | != 0 ); | | 4294 | != 0 ); |
4295 | } | | 4295 | } |
4296 | return | | 4296 | return |
4297 | aSign ? lt128( b.high, b.low, a.high, a.low ) | | 4297 | aSign ? lt128( b.high, b.low, a.high, a.low ) |
4298 | : lt128( a.high, a.low, b.high, b.low ); | | 4298 | : lt128( a.high, a.low, b.high, b.low ); |
4299 | | | 4299 | |
4300 | } | | 4300 | } |
4301 | | | 4301 | |
4302 | #endif | | 4302 | #endif |
4303 | | | 4303 | |
4304 | #ifdef FLOAT128 | | 4304 | #ifdef FLOAT128 |
4305 | | | 4305 | |
4306 | /* | | 4306 | /* |
4307 | ------------------------------------------------------------------------------- | | 4307 | ------------------------------------------------------------------------------- |
4308 | Returns the result of converting the quadruple-precision floating-point | | 4308 | Returns the result of converting the quadruple-precision floating-point |
4309 | value `a' to the 32-bit two's complement integer format. The conversion | | 4309 | value `a' to the 32-bit two's complement integer format. The conversion |
4310 | is performed according to the IEC/IEEE Standard for Binary Floating-Point | | 4310 | is performed according to the IEC/IEEE Standard for Binary Floating-Point |
4311 | Arithmetic---which means in particular that the conversion is rounded | | 4311 | Arithmetic---which means in particular that the conversion is rounded |
4312 | according to the current rounding mode. If `a' is a NaN, the largest | | 4312 | according to the current rounding mode. If `a' is a NaN, the largest |
4313 | positive integer is returned. Otherwise, if the conversion overflows, the | | 4313 | positive integer is returned. Otherwise, if the conversion overflows, the |
4314 | largest integer with the same sign as `a' is returned. | | 4314 | largest integer with the same sign as `a' is returned. |
4315 | ------------------------------------------------------------------------------- | | 4315 | ------------------------------------------------------------------------------- |
4316 | */ | | 4316 | */ |
4317 | int32 float128_to_int32( float128 a ) | | 4317 | int32 float128_to_int32( float128 a ) |
4318 | { | | 4318 | { |
4319 | flag aSign; | | 4319 | flag aSign; |
4320 | int32 aExp, shiftCount; | | 4320 | int32 aExp, shiftCount; |
4321 | bits64 aSig0, aSig1; | | 4321 | bits64 aSig0, aSig1; |
4322 | | | 4322 | |
4323 | aSig1 = extractFloat128Frac1( a ); | | 4323 | aSig1 = extractFloat128Frac1( a ); |
4324 | aSig0 = extractFloat128Frac0( a ); | | 4324 | aSig0 = extractFloat128Frac0( a ); |
4325 | aExp = extractFloat128Exp( a ); | | 4325 | aExp = extractFloat128Exp( a ); |
4326 | aSign = extractFloat128Sign( a ); | | 4326 | aSign = extractFloat128Sign( a ); |
4327 | if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; | | 4327 | if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; |
4328 | if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); | | 4328 | if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); |
4329 | aSig0 |= ( aSig1 != 0 ); | | 4329 | aSig0 |= ( aSig1 != 0 ); |
4330 | shiftCount = 0x4028 - aExp; | | 4330 | shiftCount = 0x4028 - aExp; |
4331 | if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); | | 4331 | if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); |
4332 | return roundAndPackInt32( aSign, aSig0 ); | | 4332 | return roundAndPackInt32( aSign, aSig0 ); |
4333 | | | 4333 | |
4334 | } | | 4334 | } |
4335 | | | 4335 | |
4336 | /* | | 4336 | /* |
4337 | ------------------------------------------------------------------------------- | | 4337 | ------------------------------------------------------------------------------- |
4338 | Returns the result of converting the quadruple-precision floating-point | | 4338 | Returns the result of converting the quadruple-precision floating-point |
4339 | value `a' to the 32-bit two's complement integer format. The conversion | | 4339 | value `a' to the 32-bit two's complement integer format. The conversion |
4340 | is performed according to the IEC/IEEE Standard for Binary Floating-Point | | 4340 | is performed according to the IEC/IEEE Standard for Binary Floating-Point |
4341 | Arithmetic, except that the conversion is always rounded toward zero. If | | 4341 | Arithmetic, except that the conversion is always rounded toward zero. If |
4342 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the | | 4342 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the |
4343 | conversion overflows, the largest integer with the same sign as `a' is | | 4343 | conversion overflows, the largest integer with the same sign as `a' is |
4344 | returned. | | 4344 | returned. |
4345 | ------------------------------------------------------------------------------- | | 4345 | ------------------------------------------------------------------------------- |
4346 | */ | | 4346 | */ |
4347 | int32 float128_to_int32_round_to_zero( float128 a ) | | 4347 | int32 float128_to_int32_round_to_zero( float128 a ) |
4348 | { | | 4348 | { |
4349 | flag aSign; | | 4349 | flag aSign; |
4350 | int32 aExp, shiftCount; | | 4350 | int32 aExp, shiftCount; |
4351 | bits64 aSig0, aSig1, savedASig; | | 4351 | bits64 aSig0, aSig1, savedASig; |
4352 | int32 z; | | 4352 | int32 z; |
4353 | | | 4353 | |
4354 | aSig1 = extractFloat128Frac1( a ); | | 4354 | aSig1 = extractFloat128Frac1( a ); |
4355 | aSig0 = extractFloat128Frac0( a ); | | 4355 | aSig0 = extractFloat128Frac0( a ); |
4356 | aExp = extractFloat128Exp( a ); | | 4356 | aExp = extractFloat128Exp( a ); |
4357 | aSign = extractFloat128Sign( a ); | | 4357 | aSign = extractFloat128Sign( a ); |
4358 | aSig0 |= ( aSig1 != 0 ); | | 4358 | aSig0 |= ( aSig1 != 0 ); |
4359 | if ( 0x401E < aExp ) { | | 4359 | if ( 0x401E < aExp ) { |
4360 | if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; | | 4360 | if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; |
4361 | goto invalid; | | 4361 | goto invalid; |
4362 | } | | 4362 | } |
4363 | else if ( aExp < 0x3FFF ) { | | 4363 | else if ( aExp < 0x3FFF ) { |
4364 | if ( aExp || aSig0 ) set_float_exception_inexact_flag(); | | 4364 | if ( aExp || aSig0 ) set_float_exception_inexact_flag(); |
4365 | return 0; | | 4365 | return 0; |
4366 | } | | 4366 | } |
4367 | aSig0 |= LIT64( 0x0001000000000000 ); | | 4367 | aSig0 |= LIT64( 0x0001000000000000 ); |
4368 | shiftCount = 0x402F - aExp; | | 4368 | shiftCount = 0x402F - aExp; |
4369 | savedASig = aSig0; | | 4369 | savedASig = aSig0; |
4370 | aSig0 >>= shiftCount; | | 4370 | aSig0 >>= shiftCount; |
4371 | z = (int32)aSig0; | | 4371 | z = (int32)aSig0; |
4372 | if ( aSign ) z = - z; | | 4372 | if ( aSign ) z = - z; |
4373 | if ( ( z < 0 ) ^ aSign ) { | | 4373 | if ( ( z < 0 ) ^ aSign ) { |
4374 | invalid: | | 4374 | invalid: |
4375 | float_raise( float_flag_invalid ); | | 4375 | float_raise( float_flag_invalid ); |
4376 | return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; | | 4376 | return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; |
4377 | } | | 4377 | } |
4378 | if ( ( aSig0<<shiftCount ) != savedASig ) { | | 4378 | if ( ( aSig0<<shiftCount ) != savedASig ) { |
4379 | set_float_exception_inexact_flag(); | | 4379 | set_float_exception_inexact_flag(); |
4380 | } | | 4380 | } |
4381 | return z; | | 4381 | return z; |
4382 | | | 4382 | |
4383 | } | | 4383 | } |
4384 | | | 4384 | |
4385 | /* | | 4385 | /* |
4386 | ------------------------------------------------------------------------------- | | 4386 | ------------------------------------------------------------------------------- |
4387 | Returns the result of converting the quadruple-precision floating-point | | 4387 | Returns the result of converting the quadruple-precision floating-point |
4388 | value `a' to the 64-bit two's complement integer format. The conversion | | 4388 | value `a' to the 64-bit two's complement integer format. The conversion |
4389 | is performed according to the IEC/IEEE Standard for Binary Floating-Point | | 4389 | is performed according to the IEC/IEEE Standard for Binary Floating-Point |
4390 | Arithmetic---which means in particular that the conversion is rounded | | 4390 | Arithmetic---which means in particular that the conversion is rounded |
4391 | according to the current rounding mode. If `a' is a NaN, the largest | | 4391 | according to the current rounding mode. If `a' is a NaN, the largest |
4392 | positive integer is returned. Otherwise, if the conversion overflows, the | | 4392 | positive integer is returned. Otherwise, if the conversion overflows, the |
4393 | largest integer with the same sign as `a' is returned. | | 4393 | largest integer with the same sign as `a' is returned. |
4394 | ------------------------------------------------------------------------------- | | 4394 | ------------------------------------------------------------------------------- |
4395 | */ | | 4395 | */ |
4396 | int64 float128_to_int64( float128 a ) | | 4396 | int64 float128_to_int64( float128 a ) |
4397 | { | | 4397 | { |
4398 | flag aSign; | | 4398 | flag aSign; |
4399 | int32 aExp, shiftCount; | | 4399 | int32 aExp, shiftCount; |
4400 | bits64 aSig0, aSig1; | | 4400 | bits64 aSig0, aSig1; |
4401 | | | 4401 | |
4402 | aSig1 = extractFloat128Frac1( a ); | | 4402 | aSig1 = extractFloat128Frac1( a ); |
4403 | aSig0 = extractFloat128Frac0( a ); | | 4403 | aSig0 = extractFloat128Frac0( a ); |
4404 | aExp = extractFloat128Exp( a ); | | 4404 | aExp = extractFloat128Exp( a ); |
4405 | aSign = extractFloat128Sign( a ); | | 4405 | aSign = extractFloat128Sign( a ); |
4406 | if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); | | 4406 | if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); |
4407 | shiftCount = 0x402F - aExp; | | 4407 | shiftCount = 0x402F - aExp; |
4408 | if ( shiftCount <= 0 ) { | | 4408 | if ( shiftCount <= 0 ) { |
4409 | if ( 0x403E < aExp ) { | | 4409 | if ( 0x403E < aExp ) { |
4410 | float_raise( float_flag_invalid ); | | 4410 | float_raise( float_flag_invalid ); |
4411 | if ( ! aSign | | 4411 | if ( ! aSign |
4412 | || ( ( aExp == 0x7FFF ) | | 4412 | || ( ( aExp == 0x7FFF ) |
4413 | && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) | | 4413 | && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) |
4414 | ) | | 4414 | ) |
4415 | ) { | | 4415 | ) { |
4416 | return LIT64( 0x7FFFFFFFFFFFFFFF ); | | 4416 | return LIT64( 0x7FFFFFFFFFFFFFFF ); |
4417 | } | | 4417 | } |
4418 | return (sbits64) LIT64( 0x8000000000000000 ); | | 4418 | return (sbits64) LIT64( 0x8000000000000000 ); |
4419 | } | | 4419 | } |
4420 | shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); | | 4420 | shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); |
4421 | } | | 4421 | } |
4422 | else { | | 4422 | else { |
4423 | shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); | | 4423 | shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); |
4424 | } | | 4424 | } |
4425 | return roundAndPackInt64( aSign, aSig0, aSig1 ); | | 4425 | return roundAndPackInt64( aSign, aSig0, aSig1 ); |
4426 | | | 4426 | |
4427 | } | | 4427 | } |
4428 | | | 4428 | |
4429 | /* | | 4429 | /* |
4430 | ------------------------------------------------------------------------------- | | 4430 | ------------------------------------------------------------------------------- |
4431 | Returns the result of converting the quadruple-precision floating-point | | 4431 | Returns the result of converting the quadruple-precision floating-point |
4432 | value `a' to the 64-bit two's complement integer format. The conversion | | 4432 | value `a' to the 64-bit two's complement integer format. The conversion |
4433 | is performed according to the IEC/IEEE Standard for Binary Floating-Point | | 4433 | is performed according to the IEC/IEEE Standard for Binary Floating-Point |
4434 | Arithmetic, except that the conversion is always rounded toward zero. | | 4434 | Arithmetic, except that the conversion is always rounded toward zero. |
4435 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if | | 4435 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if |
4436 | the conversion overflows, the largest integer with the same sign as `a' is | | 4436 | the conversion overflows, the largest integer with the same sign as `a' is |
4437 | returned. | | 4437 | returned. |
4438 | ------------------------------------------------------------------------------- | | 4438 | ------------------------------------------------------------------------------- |
4439 | */ | | 4439 | */ |
4440 | int64 float128_to_int64_round_to_zero( float128 a ) | | 4440 | int64 float128_to_int64_round_to_zero( float128 a ) |
4441 | { | | 4441 | { |
4442 | flag aSign; | | 4442 | flag aSign; |
4443 | int32 aExp, shiftCount; | | 4443 | int32 aExp, shiftCount; |
4444 | bits64 aSig0, aSig1; | | 4444 | bits64 aSig0, aSig1; |
4445 | int64 z; | | 4445 | int64 z; |
4446 | | | 4446 | |
4447 | aSig1 = extractFloat128Frac1( a ); | | 4447 | aSig1 = extractFloat128Frac1( a ); |
4448 | aSig0 = extractFloat128Frac0( a ); | | 4448 | aSig0 = extractFloat128Frac0( a ); |
4449 | aExp = extractFloat128Exp( a ); | | 4449 | aExp = extractFloat128Exp( a ); |
4450 | aSign = extractFloat128Sign( a ); | | 4450 | aSign = extractFloat128Sign( a ); |
4451 | if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); | | 4451 | if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); |
4452 | shiftCount = aExp - 0x402F; | | 4452 | shiftCount = aExp - 0x402F; |
4453 | if ( 0 < shiftCount ) { | | 4453 | if ( 0 < shiftCount ) { |
4454 | if ( 0x403E <= aExp ) { | | 4454 | if ( 0x403E <= aExp ) { |
4455 | aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); | | 4455 | aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); |
4456 | if ( ( a.high == LIT64( 0xC03E000000000000 ) ) | | 4456 | if ( ( a.high == LIT64( 0xC03E000000000000 ) ) |
4457 | && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { | | 4457 | && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { |
4458 | if ( aSig1 ) set_float_exception_inexact_flag(); | | 4458 | if ( aSig1 ) set_float_exception_inexact_flag(); |
4459 | } | | 4459 | } |
4460 | else { | | 4460 | else { |
4461 | float_raise( float_flag_invalid ); | | 4461 | float_raise( float_flag_invalid ); |
4462 | if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { | | 4462 | if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { |
4463 | return LIT64( 0x7FFFFFFFFFFFFFFF ); | | 4463 | return LIT64( 0x7FFFFFFFFFFFFFFF ); |
4464 | } | | 4464 | } |
4465 | } | | 4465 | } |
4466 | return (sbits64) LIT64( 0x8000000000000000 ); | | 4466 | return (sbits64) LIT64( 0x8000000000000000 ); |
4467 | } | | 4467 | } |
4468 | z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); | | 4468 | z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); |
4469 | if ( (bits64) ( aSig1<<shiftCount ) ) { | | 4469 | if ( (bits64) ( aSig1<<shiftCount ) ) { |
4470 | set_float_exception_inexact_flag(); | | 4470 | set_float_exception_inexact_flag(); |
4471 | } | | 4471 | } |
4472 | } | | 4472 | } |
4473 | else { | | 4473 | else { |
4474 | if ( aExp < 0x3FFF ) { | | 4474 | if ( aExp < 0x3FFF ) { |
4475 | if ( aExp | aSig0 | aSig1 ) { | | 4475 | if ( aExp | aSig0 | aSig1 ) { |
4476 | set_float_exception_inexact_flag(); | | 4476 | set_float_exception_inexact_flag(); |
4477 | } | | 4477 | } |
4478 | return 0; | | 4478 | return 0; |
4479 | } | | 4479 | } |
4480 | z = aSig0>>( - shiftCount ); | | 4480 | z = aSig0>>( - shiftCount ); |
4481 | if ( aSig1 | | 4481 | if ( aSig1 |
4482 | || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { | | 4482 | || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { |
4483 | set_float_exception_inexact_flag(); | | 4483 | set_float_exception_inexact_flag(); |
4484 | } | | 4484 | } |
4485 | } | | 4485 | } |
4486 | if ( aSign ) z = - z; | | 4486 | if ( aSign ) z = - z; |
4487 | return z; | | 4487 | return z; |
4488 | | | 4488 | |
4489 | } | | 4489 | } |
4490 | | | 4490 | |
4491 | #if (defined(SOFTFLOATSPARC64_FOR_GCC) || defined(SOFTFLOAT_FOR_GCC)) \ | | 4491 | #if (defined(SOFTFLOATSPARC64_FOR_GCC) || defined(SOFTFLOAT_FOR_GCC)) \ |
4492 | && defined(SOFTFLOAT_NEED_FIXUNS) | | 4492 | && defined(SOFTFLOAT_NEED_FIXUNS) |
4493 | /* | | 4493 | /* |
4494 | * just like above - but do not care for overflow of signed results | | 4494 | * just like above - but do not care for overflow of signed results |
4495 | */ | | 4495 | */ |
4496 | uint64 float128_to_uint64_round_to_zero( float128 a ) | | 4496 | uint64 float128_to_uint64_round_to_zero( float128 a ) |
4497 | { | | 4497 | { |
4498 | flag aSign; | | 4498 | flag aSign; |
4499 | int32 aExp, shiftCount; | | 4499 | int32 aExp, shiftCount; |
4500 | bits64 aSig0, aSig1; | | 4500 | bits64 aSig0, aSig1; |
4501 | uint64 z; | | 4501 | uint64 z; |
4502 | | | 4502 | |
4503 | aSig1 = extractFloat128Frac1( a ); | | 4503 | aSig1 = extractFloat128Frac1( a ); |
4504 | aSig0 = extractFloat128Frac0( a ); | | 4504 | aSig0 = extractFloat128Frac0( a ); |
4505 | aExp = extractFloat128Exp( a ); | | 4505 | aExp = extractFloat128Exp( a ); |
4506 | aSign = extractFloat128Sign( a ); | | 4506 | aSign = extractFloat128Sign( a ); |
4507 | if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); | | 4507 | if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); |
4508 | shiftCount = aExp - 0x402F; | | 4508 | shiftCount = aExp - 0x402F; |
4509 | if ( 0 < shiftCount ) { | | 4509 | if ( 0 < shiftCount ) { |
4510 | if ( 0x403F <= aExp ) { | | 4510 | if ( 0x403F <= aExp ) { |
4511 | aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); | | 4511 | aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); |
4512 | if ( ( a.high == LIT64( 0xC03E000000000000 ) ) | | 4512 | if ( ( a.high == LIT64( 0xC03E000000000000 ) ) |
4513 | && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { | | 4513 | && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { |
4514 | if ( aSig1 ) set_float_exception_inexact_flag(); | | 4514 | if ( aSig1 ) set_float_exception_inexact_flag(); |
4515 | } | | 4515 | } |
4516 | else { | | 4516 | else { |
4517 | float_raise( float_flag_invalid ); | | 4517 | float_raise( float_flag_invalid ); |
4518 | } | | 4518 | } |
4519 | return LIT64( 0xFFFFFFFFFFFFFFFF ); | | 4519 | return LIT64( 0xFFFFFFFFFFFFFFFF ); |
4520 | } | | 4520 | } |
4521 | z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); | | 4521 | z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); |
4522 | if ( (bits64) ( aSig1<<shiftCount ) ) { | | 4522 | if ( (bits64) ( aSig1<<shiftCount ) ) { |
4523 | set_float_exception_inexact_flag(); | | 4523 | set_float_exception_inexact_flag(); |
4524 | } | | 4524 | } |
4525 | } | | 4525 | } |
4526 | else { | | 4526 | else { |
4527 | if ( aExp < 0x3FFF ) { | | 4527 | if ( aExp < 0x3FFF ) { |
4528 | if ( aExp | aSig0 | aSig1 ) { | | 4528 | if ( aExp | aSig0 | aSig1 ) { |
4529 | set_float_exception_inexact_flag(); | | 4529 | set_float_exception_inexact_flag(); |
4530 | } | | 4530 | } |
4531 | return 0; | | 4531 | return 0; |
4532 | } | | 4532 | } |
4533 | z = aSig0>>( - shiftCount ); | | 4533 | z = aSig0>>( - shiftCount ); |
4534 | if (aSig1 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { | | 4534 | if (aSig1 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { |
4535 | set_float_exception_inexact_flag(); | | 4535 | set_float_exception_inexact_flag(); |
4536 | } | | 4536 | } |
4537 | } | | 4537 | } |
4538 | if ( aSign ) z = - z; | | 4538 | if ( aSign ) z = - z; |
4539 | return z; | | 4539 | return z; |
4540 | | | 4540 | |
4541 | } | | 4541 | } |
4542 | #endif /* (SOFTFLOATSPARC64_FOR_GCC || SOFTFLOAT_FOR_GCC) && SOFTFLOAT_NEED_FIXUNS */ | | 4542 | #endif /* (SOFTFLOATSPARC64_FOR_GCC || SOFTFLOAT_FOR_GCC) && SOFTFLOAT_NEED_FIXUNS */ |
4543 | | | 4543 | |
4544 | /* | | 4544 | /* |
4545 | ------------------------------------------------------------------------------- | | 4545 | ------------------------------------------------------------------------------- |
4546 | Returns the result of converting the quadruple-precision floating-point | | 4546 | Returns the result of converting the quadruple-precision floating-point |
4547 | value `a' to the single-precision floating-point format. The conversion | | 4547 | value `a' to the single-precision floating-point format. The conversion |
4548 | is performed according to the IEC/IEEE Standard for Binary Floating-Point | | 4548 | is performed according to the IEC/IEEE Standard for Binary Floating-Point |
4549 | Arithmetic. | | 4549 | Arithmetic. |
4550 | ------------------------------------------------------------------------------- | | 4550 | ------------------------------------------------------------------------------- |
4551 | */ | | 4551 | */ |
4552 | float32 float128_to_float32( float128 a ) | | 4552 | float32 float128_to_float32( float128 a ) |
4553 | { | | 4553 | { |
4554 | flag aSign; | | 4554 | flag aSign; |
4555 | int32 aExp; | | 4555 | int32 aExp; |
4556 | bits64 aSig0, aSig1; | | 4556 | bits64 aSig0, aSig1; |
4557 | bits32 zSig; | | 4557 | bits32 zSig; |
4558 | | | 4558 | |
4559 | aSig1 = extractFloat128Frac1( a ); | | 4559 | aSig1 = extractFloat128Frac1( a ); |
4560 | aSig0 = extractFloat128Frac0( a ); | | 4560 | aSig0 = extractFloat128Frac0( a ); |
4561 | aExp = extractFloat128Exp( a ); | | 4561 | aExp = extractFloat128Exp( a ); |
4562 | aSign = extractFloat128Sign( a ); | | 4562 | aSign = extractFloat128Sign( a ); |
4563 | if ( aExp == 0x7FFF ) { | | 4563 | if ( aExp == 0x7FFF ) { |
4564 | if ( aSig0 | aSig1 ) { | | 4564 | if ( aSig0 | aSig1 ) { |
4565 | return commonNaNToFloat32( float128ToCommonNaN( a ) ); | | 4565 | return commonNaNToFloat32( float128ToCommonNaN( a ) ); |
4566 | } | | 4566 | } |
4567 | return packFloat32( aSign, 0xFF, 0 ); | | 4567 | return packFloat32( aSign, 0xFF, 0 ); |
4568 | } | | 4568 | } |
4569 | aSig0 |= ( aSig1 != 0 ); | | 4569 | aSig0 |= ( aSig1 != 0 ); |
4570 | shift64RightJamming( aSig0, 18, &aSig0 ); | | 4570 | shift64RightJamming( aSig0, 18, &aSig0 ); |
4571 | zSig = (bits32)aSig0; | | 4571 | zSig = (bits32)aSig0; |
4572 | if ( aExp || zSig ) { | | 4572 | if ( aExp || zSig ) { |
4573 | zSig |= 0x40000000; | | 4573 | zSig |= 0x40000000; |
4574 | aExp -= 0x3F81; | | 4574 | aExp -= 0x3F81; |
4575 | } | | 4575 | } |
4576 | return roundAndPackFloat32( aSign, aExp, zSig ); | | 4576 | return roundAndPackFloat32( aSign, aExp, zSig ); |
4577 | | | 4577 | |
4578 | } | | 4578 | } |
4579 | | | 4579 | |
4580 | /* | | 4580 | /* |
4581 | ------------------------------------------------------------------------------- | | 4581 | ------------------------------------------------------------------------------- |
4582 | Returns the result of converting the quadruple-precision floating-point | | 4582 | Returns the result of converting the quadruple-precision floating-point |
4583 | value `a' to the double-precision floating-point format. The conversion | | 4583 | value `a' to the double-precision floating-point format. The conversion |
4584 | is performed according to the IEC/IEEE Standard for Binary Floating-Point | | 4584 | is performed according to the IEC/IEEE Standard for Binary Floating-Point |
4585 | Arithmetic. | | 4585 | Arithmetic. |
4586 | ------------------------------------------------------------------------------- | | 4586 | ------------------------------------------------------------------------------- |
4587 | */ | | 4587 | */ |
4588 | float64 float128_to_float64( float128 a ) | | 4588 | float64 float128_to_float64( float128 a ) |
4589 | { | | 4589 | { |
4590 | flag aSign; | | 4590 | flag aSign; |
4591 | int32 aExp; | | 4591 | int32 aExp; |
4592 | bits64 aSig0, aSig1; | | 4592 | bits64 aSig0, aSig1; |
4593 | | | 4593 | |
4594 | aSig1 = extractFloat128Frac1( a ); | | 4594 | aSig1 = extractFloat128Frac1( a ); |
4595 | aSig0 = extractFloat128Frac0( a ); | | 4595 | aSig0 = extractFloat128Frac0( a ); |
4596 | aExp = extractFloat128Exp( a ); | | 4596 | aExp = extractFloat128Exp( a ); |
4597 | aSign = extractFloat128Sign( a ); | | 4597 | aSign = extractFloat128Sign( a ); |
4598 | if ( aExp == 0x7FFF ) { | | 4598 | if ( aExp == 0x7FFF ) { |
4599 | if ( aSig0 | aSig1 ) { | | 4599 | if ( aSig0 | aSig1 ) { |
4600 | return commonNaNToFloat64( float128ToCommonNaN( a ) ); | | 4600 | return commonNaNToFloat64( float128ToCommonNaN( a ) ); |
4601 | } | | 4601 | } |
4602 | return packFloat64( aSign, 0x7FF, 0 ); | | 4602 | return packFloat64( aSign, 0x7FF, 0 ); |
4603 | } | | 4603 | } |
4604 | shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); | | 4604 | shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); |
4605 | aSig0 |= ( aSig1 != 0 ); | | 4605 | aSig0 |= ( aSig1 != 0 ); |
4606 | if ( aExp || aSig0 ) { | | 4606 | if ( aExp || aSig0 ) { |
4607 | aSig0 |= LIT64( 0x4000000000000000 ); | | 4607 | aSig0 |= LIT64( 0x4000000000000000 ); |
4608 | aExp -= 0x3C01; | | 4608 | aExp -= 0x3C01; |
4609 | } | | 4609 | } |
4610 | return roundAndPackFloat64( aSign, aExp, aSig0 ); | | 4610 | return roundAndPackFloat64( aSign, aExp, aSig0 ); |
4611 | | | 4611 | |
4612 | } | | 4612 | } |
4613 | | | 4613 | |
4614 | #ifdef FLOATX80 | | 4614 | #ifdef FLOATX80 |
4615 | | | 4615 | |
4616 | /* | | 4616 | /* |
4617 | ------------------------------------------------------------------------------- | | 4617 | ------------------------------------------------------------------------------- |
4618 | Returns the result of converting the quadruple-precision floating-point | | 4618 | Returns the result of converting the quadruple-precision floating-point |
4619 | value `a' to the extended double-precision floating-point format. The | | 4619 | value `a' to the extended double-precision floating-point format. The |
4620 | conversion is performed according to the IEC/IEEE Standard for Binary | | 4620 | conversion is performed according to the IEC/IEEE Standard for Binary |
4621 | Floating-Point Arithmetic. | | 4621 | Floating-Point Arithmetic. |
4622 | ------------------------------------------------------------------------------- | | 4622 | ------------------------------------------------------------------------------- |
4623 | */ | | 4623 | */ |
4624 | floatx80 float128_to_floatx80( float128 a ) | | 4624 | floatx80 float128_to_floatx80( float128 a ) |
4625 | { | | 4625 | { |
4626 | flag aSign; | | 4626 | flag aSign; |
4627 | int32 aExp; | | 4627 | int32 aExp; |
4628 | bits64 aSig0, aSig1; | | 4628 | bits64 aSig0, aSig1; |
4629 | | | 4629 | |
4630 | aSig1 = extractFloat128Frac1( a ); | | 4630 | aSig1 = extractFloat128Frac1( a ); |
4631 | aSig0 = extractFloat128Frac0( a ); | | 4631 | aSig0 = extractFloat128Frac0( a ); |
4632 | aExp = extractFloat128Exp( a ); | | 4632 | aExp = extractFloat128Exp( a ); |
4633 | aSign = extractFloat128Sign( a ); | | 4633 | aSign = extractFloat128Sign( a ); |
4634 | if ( aExp == 0x7FFF ) { | | 4634 | if ( aExp == 0x7FFF ) { |
4635 | if ( aSig0 | aSig1 ) { | | 4635 | if ( aSig0 | aSig1 ) { |
4636 | return commonNaNToFloatx80( float128ToCommonNaN( a ) ); | | 4636 | return commonNaNToFloatx80( float128ToCommonNaN( a ) ); |
4637 | } | | 4637 | } |
4638 | return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); | | 4638 | return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); |
4639 | } | | 4639 | } |
4640 | if ( aExp == 0 ) { | | 4640 | if ( aExp == 0 ) { |
4641 | if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); | | 4641 | if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); |
4642 | normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); | | 4642 | normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); |
4643 | } | | 4643 | } |
4644 | else { | | 4644 | else { |
4645 | aSig0 |= LIT64( 0x0001000000000000 ); | | 4645 | aSig0 |= LIT64( 0x0001000000000000 ); |
4646 | } | | 4646 | } |
4647 | shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); | | 4647 | shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); |
4648 | return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 ); | | 4648 | return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 ); |
4649 | | | 4649 | |
4650 | } | | 4650 | } |
4651 | | | 4651 | |
4652 | #endif | | 4652 | #endif |
4653 | | | 4653 | |
4654 | /* | | 4654 | /* |
4655 | ------------------------------------------------------------------------------- | | 4655 | ------------------------------------------------------------------------------- |
4656 | Rounds the quadruple-precision floating-point value `a' to an integer, and | | 4656 | Rounds the quadruple-precision floating-point value `a' to an integer, and |
4657 | returns the result as a quadruple-precision floating-point value. The | | 4657 | returns the result as a quadruple-precision floating-point value. The |
4658 | operation is performed according to the IEC/IEEE Standard for Binary | | 4658 | operation is performed according to the IEC/IEEE Standard for Binary |
4659 | Floating-Point Arithmetic. | | 4659 | Floating-Point Arithmetic. |
4660 | ------------------------------------------------------------------------------- | | 4660 | ------------------------------------------------------------------------------- |
4661 | */ | | 4661 | */ |
4662 | float128 float128_round_to_int( float128 a ) | | 4662 | float128 float128_round_to_int( float128 a ) |
4663 | { | | 4663 | { |
4664 | flag aSign; | | 4664 | flag aSign; |
4665 | int32 aExp; | | 4665 | int32 aExp; |
4666 | bits64 lastBitMask, roundBitsMask; | | 4666 | bits64 lastBitMask, roundBitsMask; |
4667 | int8 roundingMode; | | 4667 | int8 roundingMode; |
4668 | float128 z; | | 4668 | float128 z; |
4669 | | | 4669 | |
4670 | aExp = extractFloat128Exp( a ); | | 4670 | aExp = extractFloat128Exp( a ); |
4671 | if ( 0x402F <= aExp ) { | | 4671 | if ( 0x402F <= aExp ) { |
4672 | if ( 0x406F <= aExp ) { | | 4672 | if ( 0x406F <= aExp ) { |
4673 | if ( ( aExp == 0x7FFF ) | | 4673 | if ( ( aExp == 0x7FFF ) |
4674 | && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) | | 4674 | && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) |
4675 | ) { | | 4675 | ) { |
4676 | return propagateFloat128NaN( a, a ); | | 4676 | return propagateFloat128NaN( a, a ); |
4677 | } | | 4677 | } |
4678 | return a; | | 4678 | return a; |
4679 | } | | 4679 | } |
4680 | lastBitMask = 1; | | 4680 | lastBitMask = 1; |
4681 | lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; | | 4681 | lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; |
4682 | roundBitsMask = lastBitMask - 1; | | 4682 | roundBitsMask = lastBitMask - 1; |
4683 | z = a; | | 4683 | z = a; |
4684 | roundingMode = float_rounding_mode; | | 4684 | roundingMode = float_rounding_mode; |
4685 | if ( roundingMode == float_round_nearest_even ) { | | 4685 | if ( roundingMode == float_round_nearest_even ) { |
4686 | if ( lastBitMask ) { | | 4686 | if ( lastBitMask ) { |
4687 | add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); | | 4687 | add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); |
4688 | if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; | | 4688 | if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; |
4689 | } | | 4689 | } |
4690 | else { | | 4690 | else { |
4691 | if ( (sbits64) z.low < 0 ) { | | 4691 | if ( (sbits64) z.low < 0 ) { |
4692 | ++z.high; | | 4692 | ++z.high; |
4693 | if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1; | | 4693 | if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1; |
4694 | } | | 4694 | } |
4695 | } | | 4695 | } |
4696 | } | | 4696 | } |
4697 | else if ( roundingMode != float_round_to_zero ) { | | 4697 | else if ( roundingMode != float_round_to_zero ) { |
4698 | if ( extractFloat128Sign( z ) | | 4698 | if ( extractFloat128Sign( z ) |
4699 | ^ ( roundingMode == float_round_up ) ) { | | 4699 | ^ ( roundingMode == float_round_up ) ) { |
4700 | add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); | | 4700 | add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); |
4701 | } | | 4701 | } |
4702 | } | | 4702 | } |
4703 | z.low &= ~ roundBitsMask; | | 4703 | z.low &= ~ roundBitsMask; |
4704 | } | | 4704 | } |
4705 | else { | | 4705 | else { |
4706 | if ( aExp < 0x3FFF ) { | | 4706 | if ( aExp < 0x3FFF ) { |
4707 | if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a; | | 4707 | if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a; |
4708 | set_float_exception_inexact_flag(); | | 4708 | set_float_exception_inexact_flag(); |
4709 | aSign = extractFloat128Sign( a ); | | 4709 | aSign = extractFloat128Sign( a ); |
4710 | switch ( float_rounding_mode ) { | | 4710 | switch ( float_rounding_mode ) { |
4711 | case float_round_nearest_even: | | 4711 | case float_round_nearest_even: |
4712 | if ( ( aExp == 0x3FFE ) | | 4712 | if ( ( aExp == 0x3FFE ) |
4713 | && ( extractFloat128Frac0( a ) | | 4713 | && ( extractFloat128Frac0( a ) |
4714 | | extractFloat128Frac1( a ) ) | | 4714 | | extractFloat128Frac1( a ) ) |
4715 | ) { | | 4715 | ) { |
4716 | return packFloat128( aSign, 0x3FFF, 0, 0 ); | | 4716 | return packFloat128( aSign, 0x3FFF, 0, 0 ); |
4717 | } | | 4717 | } |
4718 | break; | | 4718 | break; |
4719 | case float_round_to_zero: | | 4719 | case float_round_to_zero: |
4720 | break; | | 4720 | break; |
4721 | case float_round_down: | | 4721 | case float_round_down: |
4722 | return | | 4722 | return |
4723 | aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) | | 4723 | aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) |
4724 | : packFloat128( 0, 0, 0, 0 ); | | 4724 | : packFloat128( 0, 0, 0, 0 ); |
4725 | case float_round_up: | | 4725 | case float_round_up: |
4726 | return | | 4726 | return |
4727 | aSign ? packFloat128( 1, 0, 0, 0 ) | | 4727 | aSign ? packFloat128( 1, 0, 0, 0 ) |
4728 | : packFloat128( 0, 0x3FFF, 0, 0 ); | | 4728 | : packFloat128( 0, 0x3FFF, 0, 0 ); |
4729 | } | | 4729 | } |
4730 | return packFloat128( aSign, 0, 0, 0 ); | | 4730 | return packFloat128( aSign, 0, 0, 0 ); |
4731 | } | | 4731 | } |
4732 | lastBitMask = 1; | | 4732 | lastBitMask = 1; |
4733 | lastBitMask <<= 0x402F - aExp; | | 4733 | lastBitMask <<= 0x402F - aExp; |
4734 | roundBitsMask = lastBitMask - 1; | | 4734 | roundBitsMask = lastBitMask - 1; |
4735 | z.low = 0; | | 4735 | z.low = 0; |
4736 | z.high = a.high; | | 4736 | z.high = a.high; |
4737 | roundingMode = float_rounding_mode; | | 4737 | roundingMode = float_rounding_mode; |
4738 | if ( roundingMode == float_round_nearest_even ) { | | 4738 | if ( roundingMode == float_round_nearest_even ) { |
4739 | z.high += lastBitMask>>1; | | 4739 | z.high += lastBitMask>>1; |
4740 | if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { | | 4740 | if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { |
4741 | z.high &= ~ lastBitMask; | | 4741 | z.high &= ~ lastBitMask; |
4742 | } | | 4742 | } |
4743 | } | | 4743 | } |
4744 | else if ( roundingMode != float_round_to_zero ) { | | 4744 | else if ( roundingMode != float_round_to_zero ) { |
4745 | if ( extractFloat128Sign( z ) | | 4745 | if ( extractFloat128Sign( z ) |
4746 | ^ ( roundingMode == float_round_up ) ) { | | 4746 | ^ ( roundingMode == float_round_up ) ) { |
4747 | z.high |= ( a.low != 0 ); | | 4747 | z.high |= ( a.low != 0 ); |
4748 | z.high += roundBitsMask; | | 4748 | z.high += roundBitsMask; |
4749 | } | | 4749 | } |
4750 | } | | 4750 | } |
4751 | z.high &= ~ roundBitsMask; | | 4751 | z.high &= ~ roundBitsMask; |
4752 | } | | 4752 | } |
4753 | if ( ( z.low != a.low ) || ( z.high != a.high ) ) { | | 4753 | if ( ( z.low != a.low ) || ( z.high != a.high ) ) { |
4754 | set_float_exception_inexact_flag(); | | 4754 | set_float_exception_inexact_flag(); |
4755 | } | | 4755 | } |
4756 | return z; | | 4756 | return z; |
4757 | | | 4757 | |
4758 | } | | 4758 | } |
4759 | | | 4759 | |
4760 | /* | | 4760 | /* |
4761 | ------------------------------------------------------------------------------- | | 4761 | ------------------------------------------------------------------------------- |
4762 | Returns the result of adding the absolute values of the quadruple-precision | | 4762 | Returns the result of adding the absolute values of the quadruple-precision |
4763 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated | | 4763 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated |
4764 | before being returned. `zSign' is ignored if the result is a NaN. | | 4764 | before being returned. `zSign' is ignored if the result is a NaN. |
4765 | The addition is performed according to the IEC/IEEE Standard for Binary | | 4765 | The addition is performed according to the IEC/IEEE Standard for Binary |
4766 | Floating-Point Arithmetic. | | 4766 | Floating-Point Arithmetic. |
4767 | ------------------------------------------------------------------------------- | | 4767 | ------------------------------------------------------------------------------- |
4768 | */ | | 4768 | */ |
4769 | static float128 addFloat128Sigs( float128 a, float128 b, flag zSign ) | | 4769 | static float128 addFloat128Sigs( float128 a, float128 b, flag zSign ) |
4770 | { | | 4770 | { |
4771 | int32 aExp, bExp, zExp; | | 4771 | int32 aExp, bExp, zExp; |
4772 | bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; | | 4772 | bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; |
4773 | int32 expDiff; | | 4773 | int32 expDiff; |
4774 | | | 4774 | |
4775 | aSig1 = extractFloat128Frac1( a ); | | 4775 | aSig1 = extractFloat128Frac1( a ); |
4776 | aSig0 = extractFloat128Frac0( a ); | | 4776 | aSig0 = extractFloat128Frac0( a ); |
4777 | aExp = extractFloat128Exp( a ); | | 4777 | aExp = extractFloat128Exp( a ); |
4778 | bSig1 = extractFloat128Frac1( b ); | | 4778 | bSig1 = extractFloat128Frac1( b ); |
4779 | bSig0 = extractFloat128Frac0( b ); | | 4779 | bSig0 = extractFloat128Frac0( b ); |
4780 | bExp = extractFloat128Exp( b ); | | 4780 | bExp = extractFloat128Exp( b ); |
4781 | expDiff = aExp - bExp; | | 4781 | expDiff = aExp - bExp; |
4782 | if ( 0 < expDiff ) { | | 4782 | if ( 0 < expDiff ) { |
4783 | if ( aExp == 0x7FFF ) { | | 4783 | if ( aExp == 0x7FFF ) { |
4784 | if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); | | 4784 | if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); |
4785 | return a; | | 4785 | return a; |
4786 | } | | 4786 | } |
4787 | if ( bExp == 0 ) { | | 4787 | if ( bExp == 0 ) { |
4788 | --expDiff; | | 4788 | --expDiff; |
4789 | } | | 4789 | } |
4790 | else { | | 4790 | else { |
4791 | bSig0 |= LIT64( 0x0001000000000000 ); | | 4791 | bSig0 |= LIT64( 0x0001000000000000 ); |
4792 | } | | 4792 | } |
4793 | shift128ExtraRightJamming( | | 4793 | shift128ExtraRightJamming( |
4794 | bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); | | 4794 | bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); |
4795 | zExp = aExp; | | 4795 | zExp = aExp; |
4796 | } | | 4796 | } |
4797 | else if ( expDiff < 0 ) { | | 4797 | else if ( expDiff < 0 ) { |
4798 | if ( bExp == 0x7FFF ) { | | 4798 | if ( bExp == 0x7FFF ) { |
4799 | if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); | | 4799 | if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); |
4800 | return packFloat128( zSign, 0x7FFF, 0, 0 ); | | 4800 | return packFloat128( zSign, 0x7FFF, 0, 0 ); |
4801 | } | | 4801 | } |
4802 | if ( aExp == 0 ) { | | 4802 | if ( aExp == 0 ) { |
4803 | ++expDiff; | | 4803 | ++expDiff; |
4804 | } | | 4804 | } |
4805 | else { | | 4805 | else { |
4806 | aSig0 |= LIT64( 0x0001000000000000 ); | | 4806 | aSig0 |= LIT64( 0x0001000000000000 ); |
4807 | } | | 4807 | } |
4808 | shift128ExtraRightJamming( | | 4808 | shift128ExtraRightJamming( |
4809 | aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); | | 4809 | aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); |
4810 | zExp = bExp; | | 4810 | zExp = bExp; |
4811 | } | | 4811 | } |
4812 | else { | | 4812 | else { |
4813 | if ( aExp == 0x7FFF ) { | | 4813 | if ( aExp == 0x7FFF ) { |
4814 | if ( aSig0 | aSig1 | bSig0 | bSig1 ) { | | 4814 | if ( aSig0 | aSig1 | bSig0 | bSig1 ) { |
4815 | return propagateFloat128NaN( a, b ); | | 4815 | return propagateFloat128NaN( a, b ); |
4816 | } | | 4816 | } |
4817 | return a; | | 4817 | return a; |
4818 | } | | 4818 | } |
4819 | add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); | | 4819 | add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); |
4820 | if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 ); | | 4820 | if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 ); |
4821 | zSig2 = 0; | | 4821 | zSig2 = 0; |
4822 | zSig0 |= LIT64( 0x0002000000000000 ); | | 4822 | zSig0 |= LIT64( 0x0002000000000000 ); |
4823 | zExp = aExp; | | 4823 | zExp = aExp; |
4824 | goto shiftRight1; | | 4824 | goto shiftRight1; |
4825 | } | | 4825 | } |
4826 | aSig0 |= LIT64( 0x0001000000000000 ); | | 4826 | aSig0 |= LIT64( 0x0001000000000000 ); |
4827 | add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); | | 4827 | add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); |
4828 | --zExp; | | 4828 | --zExp; |
4829 | if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; | | 4829 | if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; |
4830 | ++zExp; | | 4830 | ++zExp; |
4831 | shiftRight1: | | 4831 | shiftRight1: |
4832 | shift128ExtraRightJamming( | | 4832 | shift128ExtraRightJamming( |
4833 | zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); | | 4833 | zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); |
4834 | roundAndPack: | | 4834 | roundAndPack: |
4835 | return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); | | 4835 | return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); |
4836 | | | 4836 | |
4837 | } | | 4837 | } |
4838 | | | 4838 | |
4839 | /* | | 4839 | /* |
4840 | ------------------------------------------------------------------------------- | | 4840 | ------------------------------------------------------------------------------- |
4841 | Returns the result of subtracting the absolute values of the quadruple- | | 4841 | Returns the result of subtracting the absolute values of the quadruple- |
4842 | precision floating-point values `a' and `b'. If `zSign' is 1, the | | 4842 | precision floating-point values `a' and `b'. If `zSign' is 1, the |
4843 | difference is negated before being returned. `zSign' is ignored if the | | 4843 | difference is negated before being returned. `zSign' is ignored if the |
4844 | result is a NaN. The subtraction is performed according to the IEC/IEEE | | 4844 | result is a NaN. The subtraction is performed according to the IEC/IEEE |
4845 | Standard for Binary Floating-Point Arithmetic. | | 4845 | Standard for Binary Floating-Point Arithmetic. |
4846 | ------------------------------------------------------------------------------- | | 4846 | ------------------------------------------------------------------------------- |
4847 | */ | | 4847 | */ |
4848 | static float128 subFloat128Sigs( float128 a, float128 b, flag zSign ) | | 4848 | static float128 subFloat128Sigs( float128 a, float128 b, flag zSign ) |
4849 | { | | 4849 | { |
4850 | int32 aExp, bExp, zExp; | | 4850 | int32 aExp, bExp, zExp; |
4851 | bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; | | 4851 | bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; |
4852 | int32 expDiff; | | 4852 | int32 expDiff; |
4853 | float128 z; | | 4853 | float128 z; |
4854 | | | 4854 | |
4855 | aSig1 = extractFloat128Frac1( a ); | | 4855 | aSig1 = extractFloat128Frac1( a ); |
4856 | aSig0 = extractFloat128Frac0( a ); | | 4856 | aSig0 = extractFloat128Frac0( a ); |
4857 | aExp = extractFloat128Exp( a ); | | 4857 | aExp = extractFloat128Exp( a ); |
4858 | bSig1 = extractFloat128Frac1( b ); | | 4858 | bSig1 = extractFloat128Frac1( b ); |
4859 | bSig0 = extractFloat128Frac0( b ); | | 4859 | bSig0 = extractFloat128Frac0( b ); |
4860 | bExp = extractFloat128Exp( b ); | | 4860 | bExp = extractFloat128Exp( b ); |
4861 | expDiff = aExp - bExp; | | 4861 | expDiff = aExp - bExp; |
4862 | shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); | | 4862 | shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); |
4863 | shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); | | 4863 | shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); |
4864 | if ( 0 < expDiff ) goto aExpBigger; | | 4864 | if ( 0 < expDiff ) goto aExpBigger; |
4865 | if ( expDiff < 0 ) goto bExpBigger; | | 4865 | if ( expDiff < 0 ) goto bExpBigger; |
4866 | if ( aExp == 0x7FFF ) { | | 4866 | if ( aExp == 0x7FFF ) { |
4867 | if ( aSig0 | aSig1 | bSig0 | bSig1 ) { | | 4867 | if ( aSig0 | aSig1 | bSig0 | bSig1 ) { |
4868 | return propagateFloat128NaN( a, b ); | | 4868 | return propagateFloat128NaN( a, b ); |
4869 | } | | 4869 | } |
4870 | float_raise( float_flag_invalid ); | | 4870 | float_raise( float_flag_invalid ); |
4871 | z.low = float128_default_nan_low; | | 4871 | z.low = float128_default_nan_low; |
4872 | z.high = float128_default_nan_high; | | 4872 | z.high = float128_default_nan_high; |
4873 | return z; | | 4873 | return z; |
4874 | } | | 4874 | } |
4875 | if ( aExp == 0 ) { | | 4875 | if ( aExp == 0 ) { |
4876 | aExp = 1; | | 4876 | aExp = 1; |
4877 | bExp = 1; | | 4877 | bExp = 1; |
4878 | } | | 4878 | } |
4879 | if ( bSig0 < aSig0 ) goto aBigger; | | 4879 | if ( bSig0 < aSig0 ) goto aBigger; |
4880 | if ( aSig0 < bSig0 ) goto bBigger; | | 4880 | if ( aSig0 < bSig0 ) goto bBigger; |
4881 | if ( bSig1 < aSig1 ) goto aBigger; | | 4881 | if ( bSig1 < aSig1 ) goto aBigger; |
4882 | if ( aSig1 < bSig1 ) goto bBigger; | | 4882 | if ( aSig1 < bSig1 ) goto bBigger; |
4883 | return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 ); | | 4883 | return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 ); |
4884 | bExpBigger: | | 4884 | bExpBigger: |
4885 | if ( bExp == 0x7FFF ) { | | 4885 | if ( bExp == 0x7FFF ) { |
4886 | if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); | | 4886 | if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); |
4887 | return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); | | 4887 | return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); |
4888 | } | | 4888 | } |
4889 | if ( aExp == 0 ) { | | 4889 | if ( aExp == 0 ) { |
4890 | ++expDiff; | | 4890 | ++expDiff; |
4891 | } | | 4891 | } |
4892 | else { | | 4892 | else { |
4893 | aSig0 |= LIT64( 0x4000000000000000 ); | | 4893 | aSig0 |= LIT64( 0x4000000000000000 ); |
4894 | } | | 4894 | } |
4895 | shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); | | 4895 | shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); |
4896 | bSig0 |= LIT64( 0x4000000000000000 ); | | 4896 | bSig0 |= LIT64( 0x4000000000000000 ); |
4897 | bBigger: | | 4897 | bBigger: |
4898 | sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); | | 4898 | sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); |
4899 | zExp = bExp; | | 4899 | zExp = bExp; |
4900 | zSign ^= 1; | | 4900 | zSign ^= 1; |
4901 | goto normalizeRoundAndPack; | | 4901 | goto normalizeRoundAndPack; |
4902 | aExpBigger: | | 4902 | aExpBigger: |
4903 | if ( aExp == 0x7FFF ) { | | 4903 | if ( aExp == 0x7FFF ) { |
4904 | if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); | | 4904 | if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); |
4905 | return a; | | 4905 | return a; |
4906 | } | | 4906 | } |
4907 | if ( bExp == 0 ) { | | 4907 | if ( bExp == 0 ) { |
4908 | --expDiff; | | 4908 | --expDiff; |
4909 | } | | 4909 | } |
4910 | else { | | 4910 | else { |
4911 | bSig0 |= LIT64( 0x4000000000000000 ); | | 4911 | bSig0 |= LIT64( 0x4000000000000000 ); |
4912 | } | | 4912 | } |
4913 | shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); | | 4913 | shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); |
4914 | aSig0 |= LIT64( 0x4000000000000000 ); | | 4914 | aSig0 |= LIT64( 0x4000000000000000 ); |
4915 | aBigger: | | 4915 | aBigger: |
4916 | sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); | | 4916 | sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); |
4917 | zExp = aExp; | | 4917 | zExp = aExp; |
4918 | normalizeRoundAndPack: | | 4918 | normalizeRoundAndPack: |
4919 | --zExp; | | 4919 | --zExp; |
4920 | return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 ); | | 4920 | return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 ); |
4921 | | | 4921 | |
4922 | } | | 4922 | } |
4923 | | | 4923 | |
4924 | /* | | 4924 | /* |
4925 | ------------------------------------------------------------------------------- | | 4925 | ------------------------------------------------------------------------------- |
4926 | Returns the result of adding the quadruple-precision floating-point values | | 4926 | Returns the result of adding the quadruple-precision floating-point values |
4927 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard | | 4927 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard |
4928 | for Binary Floating-Point Arithmetic. | | 4928 | for Binary Floating-Point Arithmetic. |
4929 | ------------------------------------------------------------------------------- | | 4929 | ------------------------------------------------------------------------------- |
4930 | */ | | 4930 | */ |
4931 | float128 float128_add( float128 a, float128 b ) | | 4931 | float128 float128_add( float128 a, float128 b ) |
4932 | { | | 4932 | { |
4933 | flag aSign, bSign; | | 4933 | flag aSign, bSign; |
4934 | | | 4934 | |
4935 | aSign = extractFloat128Sign( a ); | | 4935 | aSign = extractFloat128Sign( a ); |
4936 | bSign = extractFloat128Sign( b ); | | 4936 | bSign = extractFloat128Sign( b ); |
4937 | if ( aSign == bSign ) { | | 4937 | if ( aSign == bSign ) { |
4938 | return addFloat128Sigs( a, b, aSign ); | | 4938 | return addFloat128Sigs( a, b, aSign ); |
4939 | } | | 4939 | } |
4940 | else { | | 4940 | else { |
4941 | return subFloat128Sigs( a, b, aSign ); | | 4941 | return subFloat128Sigs( a, b, aSign ); |
4942 | } | | 4942 | } |
4943 | | | 4943 | |
4944 | } | | 4944 | } |
4945 | | | 4945 | |
4946 | /* | | 4946 | /* |
4947 | ------------------------------------------------------------------------------- | | 4947 | ------------------------------------------------------------------------------- |
4948 | Returns the result of subtracting the quadruple-precision floating-point | | 4948 | Returns the result of subtracting the quadruple-precision floating-point |
4949 | values `a' and `b'. The operation is performed according to the IEC/IEEE | | 4949 | values `a' and `b'. The operation is performed according to the IEC/IEEE |
4950 | Standard for Binary Floating-Point Arithmetic. | | 4950 | Standard for Binary Floating-Point Arithmetic. |
4951 | ------------------------------------------------------------------------------- | | 4951 | ------------------------------------------------------------------------------- |