| @@ -1,418 +1,417 @@ | | | @@ -1,418 +1,417 @@ |
1 | /* $NetBSD: fpu_sqrt.c,v 1.8 2020/06/27 04:17:51 rin Exp $ */ | | 1 | /* $NetBSD: fpu_sqrt.c,v 1.9 2020/06/27 04:29:27 rin Exp $ */ |
2 | | | 2 | |
3 | /* | | 3 | /* |
4 | * Copyright (c) 1992, 1993 | | 4 | * Copyright (c) 1992, 1993 |
5 | * The Regents of the University of California. All rights reserved. | | 5 | * The Regents of the University of California. All rights reserved. |
6 | * | | 6 | * |
7 | * This software was developed by the Computer Systems Engineering group | | 7 | * This software was developed by the Computer Systems Engineering group |
8 | * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and | | 8 | * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and |
9 | * contributed to Berkeley. | | 9 | * contributed to Berkeley. |
10 | * | | 10 | * |
11 | * All advertising materials mentioning features or use of this software | | 11 | * All advertising materials mentioning features or use of this software |
12 | * must display the following acknowledgement: | | 12 | * must display the following acknowledgement: |
13 | * This product includes software developed by the University of | | 13 | * This product includes software developed by the University of |
14 | * California, Lawrence Berkeley Laboratory. | | 14 | * California, Lawrence Berkeley Laboratory. |
15 | * | | 15 | * |
16 | * Redistribution and use in source and binary forms, with or without | | 16 | * Redistribution and use in source and binary forms, with or without |
17 | * modification, are permitted provided that the following conditions | | 17 | * modification, are permitted provided that the following conditions |
18 | * are met: | | 18 | * are met: |
19 | * 1. Redistributions of source code must retain the above copyright | | 19 | * 1. Redistributions of source code must retain the above copyright |
20 | * notice, this list of conditions and the following disclaimer. | | 20 | * notice, this list of conditions and the following disclaimer. |
21 | * 2. Redistributions in binary form must reproduce the above copyright | | 21 | * 2. Redistributions in binary form must reproduce the above copyright |
22 | * notice, this list of conditions and the following disclaimer in the | | 22 | * notice, this list of conditions and the following disclaimer in the |
23 | * documentation and/or other materials provided with the distribution. | | 23 | * documentation and/or other materials provided with the distribution. |
24 | * 3. Neither the name of the University nor the names of its contributors | | 24 | * 3. Neither the name of the University nor the names of its contributors |
25 | * may be used to endorse or promote products derived from this software | | 25 | * may be used to endorse or promote products derived from this software |
26 | * without specific prior written permission. | | 26 | * without specific prior written permission. |
27 | * | | 27 | * |
28 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | | 28 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
29 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | | 29 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
30 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | | 30 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
31 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | | 31 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
32 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | | 32 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
33 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | | 33 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
34 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | | 34 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
35 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | | 35 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
36 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | | 36 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
37 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | | 37 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
38 | * SUCH DAMAGE. | | 38 | * SUCH DAMAGE. |
39 | * | | 39 | * |
40 | * @(#)fpu_sqrt.c 8.1 (Berkeley) 6/11/93 | | 40 | * @(#)fpu_sqrt.c 8.1 (Berkeley) 6/11/93 |
41 | */ | | 41 | */ |
42 | | | 42 | |
43 | /* | | 43 | /* |
44 | * Perform an FPU square root (return sqrt(x)). | | 44 | * Perform an FPU square root (return sqrt(x)). |
45 | */ | | 45 | */ |
46 | | | 46 | |
47 | #include <sys/cdefs.h> | | 47 | #include <sys/cdefs.h> |
48 | __KERNEL_RCSID(0, "$NetBSD: fpu_sqrt.c,v 1.8 2020/06/27 04:17:51 rin Exp $"); | | 48 | __KERNEL_RCSID(0, "$NetBSD: fpu_sqrt.c,v 1.9 2020/06/27 04:29:27 rin Exp $"); |
49 | | | 49 | |
50 | #include <sys/types.h> | | 50 | #include <sys/types.h> |
51 | #if defined(DIAGNOSTIC)||defined(DEBUG) | | 51 | #if defined(DIAGNOSTIC)||defined(DEBUG) |
52 | #include <sys/systm.h> | | 52 | #include <sys/systm.h> |
53 | #endif | | 53 | #endif |
54 | | | 54 | |
55 | #include <machine/fpu.h> | | 55 | #include <machine/fpu.h> |
56 | #include <machine/reg.h> | | 56 | #include <machine/reg.h> |
57 | | | 57 | |
58 | #include <powerpc/fpu/fpu_arith.h> | | 58 | #include <powerpc/fpu/fpu_arith.h> |
59 | #include <powerpc/fpu/fpu_emu.h> | | 59 | #include <powerpc/fpu/fpu_emu.h> |
60 | | | 60 | |
61 | /* | | 61 | /* |
62 | * Our task is to calculate the square root of a floating point number x0. | | 62 | * Our task is to calculate the square root of a floating point number x0. |
63 | * This number x normally has the form: | | 63 | * This number x normally has the form: |
64 | * | | 64 | * |
65 | * exp | | 65 | * exp |
66 | * x = mant * 2 (where 1 <= mant < 2 and exp is an integer) | | 66 | * x = mant * 2 (where 1 <= mant < 2 and exp is an integer) |
67 | * | | 67 | * |
68 | * This can be left as it stands, or the mantissa can be doubled and the | | 68 | * This can be left as it stands, or the mantissa can be doubled and the |
69 | * exponent decremented: | | 69 | * exponent decremented: |
70 | * | | 70 | * |
71 | * exp-1 | | 71 | * exp-1 |
72 | * x = (2 * mant) * 2 (where 2 <= 2 * mant < 4) | | 72 | * x = (2 * mant) * 2 (where 2 <= 2 * mant < 4) |
73 | * | | 73 | * |
74 | * If the exponent `exp' is even, the square root of the number is best | | 74 | * If the exponent `exp' is even, the square root of the number is best |
75 | * handled using the first form, and is by definition equal to: | | 75 | * handled using the first form, and is by definition equal to: |
76 | * | | 76 | * |
77 | * exp/2 | | 77 | * exp/2 |
78 | * sqrt(x) = sqrt(mant) * 2 | | 78 | * sqrt(x) = sqrt(mant) * 2 |
79 | * | | 79 | * |
80 | * If exp is odd, on the other hand, it is convenient to use the second | | 80 | * If exp is odd, on the other hand, it is convenient to use the second |
81 | * form, giving: | | 81 | * form, giving: |
82 | * | | 82 | * |
83 | * (exp-1)/2 | | 83 | * (exp-1)/2 |
84 | * sqrt(x) = sqrt(2 * mant) * 2 | | 84 | * sqrt(x) = sqrt(2 * mant) * 2 |
85 | * | | 85 | * |
86 | * In the first case, we have | | 86 | * In the first case, we have |
87 | * | | 87 | * |
88 | * 1 <= mant < 2 | | 88 | * 1 <= mant < 2 |
89 | * | | 89 | * |
90 | * and therefore | | 90 | * and therefore |
91 | * | | 91 | * |
92 | * sqrt(1) <= sqrt(mant) < sqrt(2) | | 92 | * sqrt(1) <= sqrt(mant) < sqrt(2) |
93 | * | | 93 | * |
94 | * while in the second case we have | | 94 | * while in the second case we have |
95 | * | | 95 | * |
96 | * 2 <= 2*mant < 4 | | 96 | * 2 <= 2*mant < 4 |
97 | * | | 97 | * |
98 | * and therefore | | 98 | * and therefore |
99 | * | | 99 | * |
100 | * sqrt(2) <= sqrt(2*mant) < sqrt(4) | | 100 | * sqrt(2) <= sqrt(2*mant) < sqrt(4) |
101 | * | | 101 | * |
102 | * so that in any case, we are sure that | | 102 | * so that in any case, we are sure that |
103 | * | | 103 | * |
104 | * sqrt(1) <= sqrt(n * mant) < sqrt(4), n = 1 or 2 | | 104 | * sqrt(1) <= sqrt(n * mant) < sqrt(4), n = 1 or 2 |
105 | * | | 105 | * |
106 | * or | | 106 | * or |
107 | * | | 107 | * |
108 | * 1 <= sqrt(n * mant) < 2, n = 1 or 2. | | 108 | * 1 <= sqrt(n * mant) < 2, n = 1 or 2. |
109 | * | | 109 | * |
110 | * This root is therefore a properly formed mantissa for a floating | | 110 | * This root is therefore a properly formed mantissa for a floating |
111 | * point number. The exponent of sqrt(x) is either exp/2 or (exp-1)/2 | | 111 | * point number. The exponent of sqrt(x) is either exp/2 or (exp-1)/2 |
112 | * as above. This leaves us with the problem of finding the square root | | 112 | * as above. This leaves us with the problem of finding the square root |
113 | * of a fixed-point number in the range [1..4). | | 113 | * of a fixed-point number in the range [1..4). |
114 | * | | 114 | * |
115 | * Though it may not be instantly obvious, the following square root | | 115 | * Though it may not be instantly obvious, the following square root |
116 | * algorithm works for any integer x of an even number of bits, provided | | 116 | * algorithm works for any integer x of an even number of bits, provided |
117 | * that no overflows occur: | | 117 | * that no overflows occur: |
118 | * | | 118 | * |
119 | * let q = 0 | | 119 | * let q = 0 |
120 | * for k = NBITS-1 to 0 step -1 do -- for each digit in the answer... | | 120 | * for k = NBITS-1 to 0 step -1 do -- for each digit in the answer... |
121 | * x *= 2 -- multiply by radix, for next digit | | 121 | * x *= 2 -- multiply by radix, for next digit |
122 | * if x >= 2q + 2^k then -- if adding 2^k does not | | 122 | * if x >= 2q + 2^k then -- if adding 2^k does not |
123 | * x -= 2q + 2^k -- exceed the correct root, | | 123 | * x -= 2q + 2^k -- exceed the correct root, |
124 | * q += 2^k -- add 2^k and adjust x | | 124 | * q += 2^k -- add 2^k and adjust x |
125 | * fi | | 125 | * fi |
126 | * done | | 126 | * done |
127 | * sqrt = q / 2^(NBITS/2) -- (and any remainder is in x) | | 127 | * sqrt = q / 2^(NBITS/2) -- (and any remainder is in x) |
128 | * | | 128 | * |
129 | * If NBITS is odd (so that k is initially even), we can just add another | | 129 | * If NBITS is odd (so that k is initially even), we can just add another |
130 | * zero bit at the top of x. Doing so means that q is not going to acquire | | 130 | * zero bit at the top of x. Doing so means that q is not going to acquire |
131 | * a 1 bit in the first trip around the loop (since x0 < 2^NBITS). If the | | 131 | * a 1 bit in the first trip around the loop (since x0 < 2^NBITS). If the |
132 | * final value in x is not needed, or can be off by a factor of 2, this is | | 132 | * final value in x is not needed, or can be off by a factor of 2, this is |
133 | * equivalant to moving the `x *= 2' step to the bottom of the loop: | | 133 | * equivalant to moving the `x *= 2' step to the bottom of the loop: |
134 | * | | 134 | * |
135 | * for k = NBITS-1 to 0 step -1 do if ... fi; x *= 2; done | | 135 | * for k = NBITS-1 to 0 step -1 do if ... fi; x *= 2; done |
136 | * | | 136 | * |
137 | * and the result q will then be sqrt(x0) * 2^floor(NBITS / 2). | | 137 | * and the result q will then be sqrt(x0) * 2^floor(NBITS / 2). |
138 | * (Since the algorithm is destructive on x, we will call x's initial | | 138 | * (Since the algorithm is destructive on x, we will call x's initial |
139 | * value, for which q is some power of two times its square root, x0.) | | 139 | * value, for which q is some power of two times its square root, x0.) |
140 | * | | 140 | * |
141 | * If we insert a loop invariant y = 2q, we can then rewrite this using | | 141 | * If we insert a loop invariant y = 2q, we can then rewrite this using |
142 | * C notation as: | | 142 | * C notation as: |
143 | * | | 143 | * |
144 | * q = y = 0; x = x0; | | 144 | * q = y = 0; x = x0; |
145 | * for (k = NBITS; --k >= 0;) { | | 145 | * for (k = NBITS; --k >= 0;) { |
146 | * #if (NBITS is even) | | 146 | * #if (NBITS is even) |
147 | * x *= 2; | | 147 | * x *= 2; |
148 | * #endif | | 148 | * #endif |
149 | * t = y + (1 << k); | | 149 | * t = y + (1 << k); |
150 | * if (x >= t) { | | 150 | * if (x >= t) { |
151 | * x -= t; | | 151 | * x -= t; |
152 | * q += 1 << k; | | 152 | * q += 1 << k; |
153 | * y += 1 << (k + 1); | | 153 | * y += 1 << (k + 1); |
154 | * } | | 154 | * } |
155 | * #if (NBITS is odd) | | 155 | * #if (NBITS is odd) |
156 | * x *= 2; | | 156 | * x *= 2; |
157 | * #endif | | 157 | * #endif |
158 | * } | | 158 | * } |
159 | * | | 159 | * |
160 | * If x0 is fixed point, rather than an integer, we can simply alter the | | 160 | * If x0 is fixed point, rather than an integer, we can simply alter the |
161 | * scale factor between q and sqrt(x0). As it happens, we can easily arrange | | 161 | * scale factor between q and sqrt(x0). As it happens, we can easily arrange |
162 | * for the scale factor to be 2**0 or 1, so that sqrt(x0) == q. | | 162 | * for the scale factor to be 2**0 or 1, so that sqrt(x0) == q. |
163 | * | | 163 | * |
164 | * In our case, however, x0 (and therefore x, y, q, and t) are multiword | | 164 | * In our case, however, x0 (and therefore x, y, q, and t) are multiword |
165 | * integers, which adds some complication. But note that q is built one | | 165 | * integers, which adds some complication. But note that q is built one |
166 | * bit at a time, from the top down, and is not used itself in the loop | | 166 | * bit at a time, from the top down, and is not used itself in the loop |
167 | * (we use 2q as held in y instead). This means we can build our answer | | 167 | * (we use 2q as held in y instead). This means we can build our answer |
168 | * in an integer, one word at a time, which saves a bit of work. Also, | | 168 | * in an integer, one word at a time, which saves a bit of work. Also, |
169 | * since 1 << k is always a `new' bit in q, 1 << k and 1 << (k+1) are | | 169 | * since 1 << k is always a `new' bit in q, 1 << k and 1 << (k+1) are |
170 | * `new' bits in y and we can set them with an `or' operation rather than | | 170 | * `new' bits in y and we can set them with an `or' operation rather than |
171 | * a full-blown multiword add. | | 171 | * a full-blown multiword add. |
172 | * | | 172 | * |
173 | * We are almost done, except for one snag. We must prove that none of our | | 173 | * We are almost done, except for one snag. We must prove that none of our |
174 | * intermediate calculations can overflow. We know that x0 is in [1..4) | | 174 | * intermediate calculations can overflow. We know that x0 is in [1..4) |
175 | * and therefore the square root in q will be in [1..2), but what about x, | | 175 | * and therefore the square root in q will be in [1..2), but what about x, |
176 | * y, and t? | | 176 | * y, and t? |
177 | * | | 177 | * |
178 | * We know that y = 2q at the beginning of each loop. (The relation only | | 178 | * We know that y = 2q at the beginning of each loop. (The relation only |
179 | * fails temporarily while y and q are being updated.) Since q < 2, y < 4. | | 179 | * fails temporarily while y and q are being updated.) Since q < 2, y < 4. |
180 | * The sum in t can, in our case, be as much as y+(1<<1) = y+2 < 6, and. | | 180 | * The sum in t can, in our case, be as much as y+(1<<1) = y+2 < 6, and. |
181 | * Furthermore, we can prove with a bit of work that x never exceeds y by | | 181 | * Furthermore, we can prove with a bit of work that x never exceeds y by |
182 | * more than 2, so that even after doubling, 0 <= x < 8. (This is left as | | 182 | * more than 2, so that even after doubling, 0 <= x < 8. (This is left as |
183 | * an exercise to the reader, mostly because I have become tired of working | | 183 | * an exercise to the reader, mostly because I have become tired of working |
184 | * on this comment.) | | 184 | * on this comment.) |
185 | * | | 185 | * |
186 | * If our floating point mantissas (which are of the form 1.frac) occupy | | 186 | * If our floating point mantissas (which are of the form 1.frac) occupy |
187 | * B+1 bits, our largest intermediary needs at most B+3 bits, or two extra. | | 187 | * B+1 bits, our largest intermediary needs at most B+3 bits, or two extra. |
188 | * In fact, we want even one more bit (for a carry, to avoid compares), or | | 188 | * In fact, we want even one more bit (for a carry, to avoid compares), or |
189 | * three extra. There is a comment in fpu_emu.h reminding maintainers of | | 189 | * three extra. There is a comment in fpu_emu.h reminding maintainers of |
190 | * this, so we have some justification in assuming it. | | 190 | * this, so we have some justification in assuming it. |
191 | */ | | 191 | */ |
192 | struct fpn * | | 192 | struct fpn * |
193 | fpu_sqrt(struct fpemu *fe) | | 193 | fpu_sqrt(struct fpemu *fe) |
194 | { | | 194 | { |
195 | struct fpn *x = &fe->fe_f1; | | 195 | struct fpn *x = &fe->fe_f1; |
196 | u_int bit, q, tt; | | 196 | u_int bit, q, tt; |
197 | u_int x0, x1, x2, x3; | | 197 | u_int x0, x1, x2, x3; |
198 | u_int y0, y1, y2, y3; | | 198 | u_int y0, y1, y2, y3; |
199 | u_int d0, d1, d2, d3; | | 199 | u_int d0, d1, d2, d3; |
200 | int e; | | 200 | int e; |
201 | FPU_DECL_CARRY; | | 201 | FPU_DECL_CARRY; |
202 | | | 202 | |
203 | /* | | 203 | /* |
204 | * Take care of special cases first. In order: | | 204 | * Take care of special cases first. In order: |
205 | * | | 205 | * |
206 | * sqrt(NaN) = NaN | | 206 | * sqrt(NaN) = NaN |
207 | * sqrt(+0) = +0 | | 207 | * sqrt(+0) = +0 |
208 | * sqrt(-0) = -0 | | 208 | * sqrt(-0) = -0 |
209 | * sqrt(x < 0) = NaN (including sqrt(-Inf)) | | 209 | * sqrt(x < 0) = NaN (including sqrt(-Inf)) |
210 | * sqrt(+Inf) = +Inf | | 210 | * sqrt(+Inf) = +Inf |
211 | * | | 211 | * |
212 | * Then all that remains are numbers with mantissas in [1..2). | | 212 | * Then all that remains are numbers with mantissas in [1..2). |
213 | */ | | 213 | */ |
214 | DPRINTF(FPE_REG, ("fpu_sqrt:\n")); | | 214 | DPRINTF(FPE_REG, ("fpu_sqrt:\n")); |
215 | DUMPFPN(FPE_REG, x); | | 215 | DUMPFPN(FPE_REG, x); |
216 | DPRINTF(FPE_REG, ("=>\n")); | | 216 | DPRINTF(FPE_REG, ("=>\n")); |
217 | if (ISNAN(x)) { | | 217 | if (ISNAN(x)) { |
218 | fe->fe_cx |= FPSCR_VXSNAN; | | 218 | fe->fe_cx |= FPSCR_VXSNAN; |
219 | DUMPFPN(FPE_REG, x); | | 219 | DUMPFPN(FPE_REG, x); |
220 | return (x); | | 220 | return (x); |
221 | } | | 221 | } |
222 | if (ISZERO(x)) { | | 222 | if (ISZERO(x)) { |
223 | fe->fe_cx |= FPSCR_ZX; | | 223 | fe->fe_cx |= FPSCR_ZX; |
224 | x->fp_class = FPC_INF; | | 224 | x->fp_class = FPC_INF; |
225 | DUMPFPN(FPE_REG, x); | | 225 | DUMPFPN(FPE_REG, x); |
226 | return (x); | | 226 | return (x); |
227 | } | | 227 | } |
228 | if (x->fp_sign) { | | 228 | if (x->fp_sign) { |
229 | fe->fe_cx |= FPSCR_VXSQRT; | | 229 | fe->fe_cx |= FPSCR_VXSQRT; |
230 | return (fpu_newnan(fe)); | | 230 | return (fpu_newnan(fe)); |
231 | } | | 231 | } |
232 | if (ISINF(x)) { | | 232 | if (ISINF(x)) { |
233 | fe->fe_cx |= FPSCR_VXSQRT; | | 233 | DUMPFPN(FPE_REG, x); |
234 | DUMPFPN(FPE_REG, 0); | | 234 | return (x); |
235 | return (0); | | | |
236 | } | | 235 | } |
237 | | | 236 | |
238 | /* | | 237 | /* |
239 | * Calculate result exponent. As noted above, this may involve | | 238 | * Calculate result exponent. As noted above, this may involve |
240 | * doubling the mantissa. We will also need to double x each | | 239 | * doubling the mantissa. We will also need to double x each |
241 | * time around the loop, so we define a macro for this here, and | | 240 | * time around the loop, so we define a macro for this here, and |
242 | * we break out the multiword mantissa. | | 241 | * we break out the multiword mantissa. |
243 | */ | | 242 | */ |
244 | #ifdef FPU_SHL1_BY_ADD | | 243 | #ifdef FPU_SHL1_BY_ADD |
245 | #define DOUBLE_X { \ | | 244 | #define DOUBLE_X { \ |
246 | FPU_ADDS(x3, x3, x3); FPU_ADDCS(x2, x2, x2); \ | | 245 | FPU_ADDS(x3, x3, x3); FPU_ADDCS(x2, x2, x2); \ |
247 | FPU_ADDCS(x1, x1, x1); FPU_ADDC(x0, x0, x0); \ | | 246 | FPU_ADDCS(x1, x1, x1); FPU_ADDC(x0, x0, x0); \ |
248 | } | | 247 | } |
249 | #else | | 248 | #else |
250 | #define DOUBLE_X { \ | | 249 | #define DOUBLE_X { \ |
251 | x0 = (x0 << 1) | (x1 >> 31); x1 = (x1 << 1) | (x2 >> 31); \ | | 250 | x0 = (x0 << 1) | (x1 >> 31); x1 = (x1 << 1) | (x2 >> 31); \ |
252 | x2 = (x2 << 1) | (x3 >> 31); x3 <<= 1; \ | | 251 | x2 = (x2 << 1) | (x3 >> 31); x3 <<= 1; \ |
253 | } | | 252 | } |
254 | #endif | | 253 | #endif |
255 | #if (FP_NMANT & 1) != 0 | | 254 | #if (FP_NMANT & 1) != 0 |
256 | # define ODD_DOUBLE DOUBLE_X | | 255 | # define ODD_DOUBLE DOUBLE_X |
257 | # define EVEN_DOUBLE /* nothing */ | | 256 | # define EVEN_DOUBLE /* nothing */ |
258 | #else | | 257 | #else |
259 | # define ODD_DOUBLE /* nothing */ | | 258 | # define ODD_DOUBLE /* nothing */ |
260 | # define EVEN_DOUBLE DOUBLE_X | | 259 | # define EVEN_DOUBLE DOUBLE_X |
261 | #endif | | 260 | #endif |
262 | x0 = x->fp_mant[0]; | | 261 | x0 = x->fp_mant[0]; |
263 | x1 = x->fp_mant[1]; | | 262 | x1 = x->fp_mant[1]; |
264 | x2 = x->fp_mant[2]; | | 263 | x2 = x->fp_mant[2]; |
265 | x3 = x->fp_mant[3]; | | 264 | x3 = x->fp_mant[3]; |
266 | e = x->fp_exp; | | 265 | e = x->fp_exp; |
267 | if (e & 1) /* exponent is odd; use sqrt(2mant) */ | | 266 | if (e & 1) /* exponent is odd; use sqrt(2mant) */ |
268 | DOUBLE_X; | | 267 | DOUBLE_X; |
269 | /* THE FOLLOWING ASSUMES THAT RIGHT SHIFT DOES SIGN EXTENSION */ | | 268 | /* THE FOLLOWING ASSUMES THAT RIGHT SHIFT DOES SIGN EXTENSION */ |
270 | x->fp_exp = e >> 1; /* calculates (e&1 ? (e-1)/2 : e/2 */ | | 269 | x->fp_exp = e >> 1; /* calculates (e&1 ? (e-1)/2 : e/2 */ |
271 | | | 270 | |
272 | /* | | 271 | /* |
273 | * Now calculate the mantissa root. Since x is now in [1..4), | | 272 | * Now calculate the mantissa root. Since x is now in [1..4), |
274 | * we know that the first trip around the loop will definitely | | 273 | * we know that the first trip around the loop will definitely |
275 | * set the top bit in q, so we can do that manually and start | | 274 | * set the top bit in q, so we can do that manually and start |
276 | * the loop at the next bit down instead. We must be sure to | | 275 | * the loop at the next bit down instead. We must be sure to |
277 | * double x correctly while doing the `known q=1.0'. | | 276 | * double x correctly while doing the `known q=1.0'. |
278 | * | | 277 | * |
279 | * We do this one mantissa-word at a time, as noted above, to | | 278 | * We do this one mantissa-word at a time, as noted above, to |
280 | * save work. To avoid `(1 << 31) << 1', we also do the top bit | | 279 | * save work. To avoid `(1 << 31) << 1', we also do the top bit |
281 | * outside of each per-word loop. | | 280 | * outside of each per-word loop. |
282 | * | | 281 | * |
283 | * The calculation `t = y + bit' breaks down into `t0 = y0, ..., | | 282 | * The calculation `t = y + bit' breaks down into `t0 = y0, ..., |
284 | * t3 = y3, t? |= bit' for the appropriate word. Since the bit | | 283 | * t3 = y3, t? |= bit' for the appropriate word. Since the bit |
285 | * is always a `new' one, this means that three of the `t?'s are | | 284 | * is always a `new' one, this means that three of the `t?'s are |
286 | * just the corresponding `y?'; we use `#define's here for this. | | 285 | * just the corresponding `y?'; we use `#define's here for this. |
287 | * The variable `tt' holds the actual `t?' variable. | | 286 | * The variable `tt' holds the actual `t?' variable. |
288 | */ | | 287 | */ |
289 | | | 288 | |
290 | /* calculate q0 */ | | 289 | /* calculate q0 */ |
291 | #define t0 tt | | 290 | #define t0 tt |
292 | bit = FP_1; | | 291 | bit = FP_1; |
293 | EVEN_DOUBLE; | | 292 | EVEN_DOUBLE; |
294 | /* if (x >= (t0 = y0 | bit)) { */ /* always true */ | | 293 | /* if (x >= (t0 = y0 | bit)) { */ /* always true */ |
295 | q = bit; | | 294 | q = bit; |
296 | x0 -= bit; | | 295 | x0 -= bit; |
297 | y0 = bit << 1; | | 296 | y0 = bit << 1; |
298 | /* } */ | | 297 | /* } */ |
299 | ODD_DOUBLE; | | 298 | ODD_DOUBLE; |
300 | while ((bit >>= 1) != 0) { /* for remaining bits in q0 */ | | 299 | while ((bit >>= 1) != 0) { /* for remaining bits in q0 */ |
301 | EVEN_DOUBLE; | | 300 | EVEN_DOUBLE; |
302 | t0 = y0 | bit; /* t = y + bit */ | | 301 | t0 = y0 | bit; /* t = y + bit */ |
303 | if (x0 >= t0) { /* if x >= t then */ | | 302 | if (x0 >= t0) { /* if x >= t then */ |
304 | x0 -= t0; /* x -= t */ | | 303 | x0 -= t0; /* x -= t */ |
305 | q |= bit; /* q += bit */ | | 304 | q |= bit; /* q += bit */ |
306 | y0 |= bit << 1; /* y += bit << 1 */ | | 305 | y0 |= bit << 1; /* y += bit << 1 */ |
307 | } | | 306 | } |
308 | ODD_DOUBLE; | | 307 | ODD_DOUBLE; |
309 | } | | 308 | } |
310 | x->fp_mant[0] = q; | | 309 | x->fp_mant[0] = q; |
311 | #undef t0 | | 310 | #undef t0 |
312 | | | 311 | |
313 | /* calculate q1. note (y0&1)==0. */ | | 312 | /* calculate q1. note (y0&1)==0. */ |
314 | #define t0 y0 | | 313 | #define t0 y0 |
315 | #define t1 tt | | 314 | #define t1 tt |
316 | q = 0; | | 315 | q = 0; |
317 | y1 = 0; | | 316 | y1 = 0; |
318 | bit = 1 << 31; | | 317 | bit = 1 << 31; |
319 | EVEN_DOUBLE; | | 318 | EVEN_DOUBLE; |
320 | t1 = bit; | | 319 | t1 = bit; |
321 | FPU_SUBS(d1, x1, t1); | | 320 | FPU_SUBS(d1, x1, t1); |
322 | FPU_SUBC(d0, x0, t0); /* d = x - t */ | | 321 | FPU_SUBC(d0, x0, t0); /* d = x - t */ |
323 | if ((int)d0 >= 0) { /* if d >= 0 (i.e., x >= t) then */ | | 322 | if ((int)d0 >= 0) { /* if d >= 0 (i.e., x >= t) then */ |
324 | x0 = d0, x1 = d1; /* x -= t */ | | 323 | x0 = d0, x1 = d1; /* x -= t */ |
325 | q = bit; /* q += bit */ | | 324 | q = bit; /* q += bit */ |
326 | y0 |= 1; /* y += bit << 1 */ | | 325 | y0 |= 1; /* y += bit << 1 */ |
327 | } | | 326 | } |
328 | ODD_DOUBLE; | | 327 | ODD_DOUBLE; |
329 | while ((bit >>= 1) != 0) { /* for remaining bits in q1 */ | | 328 | while ((bit >>= 1) != 0) { /* for remaining bits in q1 */ |
330 | EVEN_DOUBLE; /* as before */ | | 329 | EVEN_DOUBLE; /* as before */ |
331 | t1 = y1 | bit; | | 330 | t1 = y1 | bit; |
332 | FPU_SUBS(d1, x1, t1); | | 331 | FPU_SUBS(d1, x1, t1); |
333 | FPU_SUBC(d0, x0, t0); | | 332 | FPU_SUBC(d0, x0, t0); |
334 | if ((int)d0 >= 0) { | | 333 | if ((int)d0 >= 0) { |
335 | x0 = d0, x1 = d1; | | 334 | x0 = d0, x1 = d1; |
336 | q |= bit; | | 335 | q |= bit; |
337 | y1 |= bit << 1; | | 336 | y1 |= bit << 1; |
338 | } | | 337 | } |
339 | ODD_DOUBLE; | | 338 | ODD_DOUBLE; |
340 | } | | 339 | } |
341 | x->fp_mant[1] = q; | | 340 | x->fp_mant[1] = q; |
342 | #undef t1 | | 341 | #undef t1 |
343 | | | 342 | |
344 | /* calculate q2. note (y1&1)==0; y0 (aka t0) is fixed. */ | | 343 | /* calculate q2. note (y1&1)==0; y0 (aka t0) is fixed. */ |
345 | #define t1 y1 | | 344 | #define t1 y1 |
346 | #define t2 tt | | 345 | #define t2 tt |
347 | q = 0; | | 346 | q = 0; |
348 | y2 = 0; | | 347 | y2 = 0; |
349 | bit = 1 << 31; | | 348 | bit = 1 << 31; |
350 | EVEN_DOUBLE; | | 349 | EVEN_DOUBLE; |
351 | t2 = bit; | | 350 | t2 = bit; |
352 | FPU_SUBS(d2, x2, t2); | | 351 | FPU_SUBS(d2, x2, t2); |
353 | FPU_SUBCS(d1, x1, t1); | | 352 | FPU_SUBCS(d1, x1, t1); |
354 | FPU_SUBC(d0, x0, t0); | | 353 | FPU_SUBC(d0, x0, t0); |
355 | if ((int)d0 >= 0) { | | 354 | if ((int)d0 >= 0) { |
356 | x0 = d0, x1 = d1, x2 = d2; | | 355 | x0 = d0, x1 = d1, x2 = d2; |
357 | q |= bit; | | 356 | q |= bit; |
358 | y1 |= 1; /* now t1, y1 are set in concrete */ | | 357 | y1 |= 1; /* now t1, y1 are set in concrete */ |
359 | } | | 358 | } |
360 | ODD_DOUBLE; | | 359 | ODD_DOUBLE; |
361 | while ((bit >>= 1) != 0) { | | 360 | while ((bit >>= 1) != 0) { |
362 | EVEN_DOUBLE; | | 361 | EVEN_DOUBLE; |
363 | t2 = y2 | bit; | | 362 | t2 = y2 | bit; |
364 | FPU_SUBS(d2, x2, t2); | | 363 | FPU_SUBS(d2, x2, t2); |
365 | FPU_SUBCS(d1, x1, t1); | | 364 | FPU_SUBCS(d1, x1, t1); |
366 | FPU_SUBC(d0, x0, t0); | | 365 | FPU_SUBC(d0, x0, t0); |
367 | if ((int)d0 >= 0) { | | 366 | if ((int)d0 >= 0) { |
368 | x0 = d0, x1 = d1, x2 = d2; | | 367 | x0 = d0, x1 = d1, x2 = d2; |
369 | q |= bit; | | 368 | q |= bit; |
370 | y2 |= bit << 1; | | 369 | y2 |= bit << 1; |
371 | } | | 370 | } |
372 | ODD_DOUBLE; | | 371 | ODD_DOUBLE; |
373 | } | | 372 | } |
374 | x->fp_mant[2] = q; | | 373 | x->fp_mant[2] = q; |
375 | #undef t2 | | 374 | #undef t2 |
376 | | | 375 | |
377 | /* calculate q3. y0, t0, y1, t1 all fixed; y2, t2, almost done. */ | | 376 | /* calculate q3. y0, t0, y1, t1 all fixed; y2, t2, almost done. */ |
378 | #define t2 y2 | | 377 | #define t2 y2 |
379 | #define t3 tt | | 378 | #define t3 tt |
380 | q = 0; | | 379 | q = 0; |
381 | y3 = 0; | | 380 | y3 = 0; |
382 | bit = 1 << 31; | | 381 | bit = 1 << 31; |
383 | EVEN_DOUBLE; | | 382 | EVEN_DOUBLE; |
384 | t3 = bit; | | 383 | t3 = bit; |
385 | FPU_SUBS(d3, x3, t3); __USE(d3); | | 384 | FPU_SUBS(d3, x3, t3); __USE(d3); |
386 | FPU_SUBCS(d2, x2, t2); | | 385 | FPU_SUBCS(d2, x2, t2); |
387 | FPU_SUBCS(d1, x1, t1); | | 386 | FPU_SUBCS(d1, x1, t1); |
388 | FPU_SUBC(d0, x0, t0); | | 387 | FPU_SUBC(d0, x0, t0); |
389 | ODD_DOUBLE; | | 388 | ODD_DOUBLE; |
390 | if ((int)d0 >= 0) { | | 389 | if ((int)d0 >= 0) { |
391 | x0 = d0, x1 = d1, x2 = d2; | | 390 | x0 = d0, x1 = d1, x2 = d2; |
392 | q |= bit; | | 391 | q |= bit; |
393 | y2 |= 1; | | 392 | y2 |= 1; |
394 | } | | 393 | } |
395 | while ((bit >>= 1) != 0) { | | 394 | while ((bit >>= 1) != 0) { |
396 | EVEN_DOUBLE; | | 395 | EVEN_DOUBLE; |
397 | t3 = y3 | bit; | | 396 | t3 = y3 | bit; |
398 | FPU_SUBS(d3, x3, t3); | | 397 | FPU_SUBS(d3, x3, t3); |
399 | FPU_SUBCS(d2, x2, t2); | | 398 | FPU_SUBCS(d2, x2, t2); |
400 | FPU_SUBCS(d1, x1, t1); | | 399 | FPU_SUBCS(d1, x1, t1); |
401 | FPU_SUBC(d0, x0, t0); | | 400 | FPU_SUBC(d0, x0, t0); |
402 | if ((int)d0 >= 0) { | | 401 | if ((int)d0 >= 0) { |
403 | x0 = d0, x1 = d1, x2 = d2; | | 402 | x0 = d0, x1 = d1, x2 = d2; |
404 | q |= bit; | | 403 | q |= bit; |
405 | y3 |= bit << 1; | | 404 | y3 |= bit << 1; |
406 | } | | 405 | } |
407 | ODD_DOUBLE; | | 406 | ODD_DOUBLE; |
408 | } | | 407 | } |
409 | x->fp_mant[3] = q; | | 408 | x->fp_mant[3] = q; |
410 | | | 409 | |
411 | /* | | 410 | /* |
412 | * The result, which includes guard and round bits, is exact iff | | 411 | * The result, which includes guard and round bits, is exact iff |
413 | * x is now zero; any nonzero bits in x represent sticky bits. | | 412 | * x is now zero; any nonzero bits in x represent sticky bits. |
414 | */ | | 413 | */ |
415 | x->fp_sticky = x0 | x1 | x2 | x3; | | 414 | x->fp_sticky = x0 | x1 | x2 | x3; |
416 | DUMPFPN(FPE_REG, x); | | 415 | DUMPFPN(FPE_REG, x); |
417 | return (x); | | 416 | return (x); |
418 | } | | 417 | } |