1 /* Software floating-point emulation. 2 Basic two-word fraction declaration and manipulation. 3 Copyright (C) 1997,1998,1999,2006,2007 Free Software Foundation, Inc. 4 This file is part of the GNU C Library. 5 Contributed by Richard Henderson (rth@cygnus.com), 6 Jakub Jelinek (jj@ultra.linux.cz), 7 David S. Miller (davem@redhat.com) and 8 Peter Maydell (pmaydell@chiark.greenend.org.uk). 9 10 The GNU C Library is free software; you can redistribute it and/or 11 modify it under the terms of the GNU Lesser General Public 12 License as published by the Free Software Foundation; either 13 version 2.1 of the License, or (at your option) any later version. 14 15 In addition to the permissions in the GNU Lesser General Public 16 License, the Free Software Foundation gives you unlimited 17 permission to link the compiled version of this file into 18 combinations with other programs, and to distribute those 19 combinations without any restriction coming from the use of this 20 file. (The Lesser General Public License restrictions do apply in 21 other respects; for example, they cover modification of the file, 22 and distribution when not linked into a combine executable.) 23 24 The GNU C Library is distributed in the hope that it will be useful, 25 but WITHOUT ANY WARRANTY; without even the implied warranty of 26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 27 Lesser General Public License for more details. 28 29 You should have received a copy of the GNU Lesser General Public 30 License along with the GNU C Library; if not, see 31 <http://www.gnu.org/licenses/>. */ 32 33 #define _FP_FRAC_DECL_2(X) _FP_W_TYPE X##_f0, X##_f1 34 #define _FP_FRAC_COPY_2(D,S) (D##_f0 = S##_f0, D##_f1 = S##_f1) 35 #define _FP_FRAC_SET_2(X,I) __FP_FRAC_SET_2(X, I) 36 #define _FP_FRAC_HIGH_2(X) (X##_f1) 37 #define _FP_FRAC_LOW_2(X) (X##_f0) 38 #define _FP_FRAC_WORD_2(X,w) (X##_f##w) 39 40 #define _FP_FRAC_SLL_2(X,N) \ 41 (void)(((N) < _FP_W_TYPE_SIZE) \ 42 ? ({ \ 43 if (__builtin_constant_p(N) && (N) == 1) \ 44 { \ 45 X##_f1 = X##_f1 + X##_f1 + (((_FP_WS_TYPE)(X##_f0)) < 0); \ 46 X##_f0 += X##_f0; \ 47 } \ 48 else \ 49 { \ 50 X##_f1 = X##_f1 << (N) | X##_f0 >> (_FP_W_TYPE_SIZE - (N)); \ 51 X##_f0 <<= (N); \ 52 } \ 53 0; \ 54 }) \ 55 : ({ \ 56 X##_f1 = X##_f0 << ((N) - _FP_W_TYPE_SIZE); \ 57 X##_f0 = 0; \ 58 })) 59 60 61 #define _FP_FRAC_SRL_2(X,N) \ 62 (void)(((N) < _FP_W_TYPE_SIZE) \ 63 ? ({ \ 64 X##_f0 = X##_f0 >> (N) | X##_f1 << (_FP_W_TYPE_SIZE - (N)); \ 65 X##_f1 >>= (N); \ 66 }) \ 67 : ({ \ 68 X##_f0 = X##_f1 >> ((N) - _FP_W_TYPE_SIZE); \ 69 X##_f1 = 0; \ 70 })) 71 72 /* Right shift with sticky-lsb. */ 73 #define _FP_FRAC_SRST_2(X,S, N,sz) \ 74 (void)(((N) < _FP_W_TYPE_SIZE) \ 75 ? ({ \ 76 S = (__builtin_constant_p(N) && (N) == 1 \ 77 ? X##_f0 & 1 \ 78 : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0); \ 79 X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N)); \ 80 X##_f1 >>= (N); \ 81 }) \ 82 : ({ \ 83 S = ((((N) == _FP_W_TYPE_SIZE \ 84 ? 0 \ 85 : (X##_f1 << (2*_FP_W_TYPE_SIZE - (N)))) \ 86 | X##_f0) != 0); \ 87 X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE)); \ 88 X##_f1 = 0; \ 89 })) 90 91 #define _FP_FRAC_SRS_2(X,N,sz) \ 92 (void)(((N) < _FP_W_TYPE_SIZE) \ 93 ? ({ \ 94 X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N) | \ 95 (__builtin_constant_p(N) && (N) == 1 \ 96 ? X##_f0 & 1 \ 97 : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0)); \ 98 X##_f1 >>= (N); \ 99 }) \ 100 : ({ \ 101 X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE) | \ 102 ((((N) == _FP_W_TYPE_SIZE \ 103 ? 0 \ 104 : (X##_f1 << (2*_FP_W_TYPE_SIZE - (N)))) \ 105 | X##_f0) != 0)); \ 106 X##_f1 = 0; \ 107 })) 108 109 #define _FP_FRAC_ADDI_2(X,I) \ 110 __FP_FRAC_ADDI_2(X##_f1, X##_f0, I) 111 112 #define _FP_FRAC_ADD_2(R,X,Y) \ 113 __FP_FRAC_ADD_2(R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0) 114 115 #define _FP_FRAC_SUB_2(R,X,Y) \ 116 __FP_FRAC_SUB_2(R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0) 117 118 #define _FP_FRAC_DEC_2(X,Y) \ 119 __FP_FRAC_DEC_2(X##_f1, X##_f0, Y##_f1, Y##_f0) 120 121 #define _FP_FRAC_CLZ_2(R,X) \ 122 do { \ 123 if (X##_f1) \ 124 __FP_CLZ(R,X##_f1); \ 125 else \ 126 { \ 127 __FP_CLZ(R,X##_f0); \ 128 R += _FP_W_TYPE_SIZE; \ 129 } \ 130 } while(0) 131 132 /* Predicates */ 133 #define _FP_FRAC_NEGP_2(X) ((_FP_WS_TYPE)X##_f1 < 0) 134 #define _FP_FRAC_ZEROP_2(X) ((X##_f1 | X##_f0) == 0) 135 #define _FP_FRAC_OVERP_2(fs,X) (_FP_FRAC_HIGH_##fs(X) & _FP_OVERFLOW_##fs) 136 #define _FP_FRAC_CLEAR_OVERP_2(fs,X) (_FP_FRAC_HIGH_##fs(X) &= ~_FP_OVERFLOW_##fs) 137 #define _FP_FRAC_EQ_2(X, Y) (X##_f1 == Y##_f1 && X##_f0 == Y##_f0) 138 #define _FP_FRAC_GT_2(X, Y) \ 139 (X##_f1 > Y##_f1 || (X##_f1 == Y##_f1 && X##_f0 > Y##_f0)) 140 #define _FP_FRAC_GE_2(X, Y) \ 141 (X##_f1 > Y##_f1 || (X##_f1 == Y##_f1 && X##_f0 >= Y##_f0)) 142 143 #define _FP_ZEROFRAC_2 0, 0 144 #define _FP_MINFRAC_2 0, 1 145 #define _FP_MAXFRAC_2 (~(_FP_WS_TYPE)0), (~(_FP_WS_TYPE)0) 146 147 /* 148 * Internals 149 */ 150 151 #define __FP_FRAC_SET_2(X,I1,I0) (X##_f0 = I0, X##_f1 = I1) 152 153 #define __FP_CLZ_2(R, xh, xl) \ 154 do { \ 155 if (xh) \ 156 __FP_CLZ(R,xh); \ 157 else \ 158 { \ 159 __FP_CLZ(R,xl); \ 160 R += _FP_W_TYPE_SIZE; \ 161 } \ 162 } while(0) 163 164 #if 0 165 166 #ifndef __FP_FRAC_ADDI_2 167 #define __FP_FRAC_ADDI_2(xh, xl, i) \ 168 (xh += ((xl += i) < i)) 169 #endif 170 #ifndef __FP_FRAC_ADD_2 171 #define __FP_FRAC_ADD_2(rh, rl, xh, xl, yh, yl) \ 172 (rh = xh + yh + ((rl = xl + yl) < xl)) 173 #endif 174 #ifndef __FP_FRAC_SUB_2 175 #define __FP_FRAC_SUB_2(rh, rl, xh, xl, yh, yl) \ 176 (rh = xh - yh - ((rl = xl - yl) > xl)) 177 #endif 178 #ifndef __FP_FRAC_DEC_2 179 #define __FP_FRAC_DEC_2(xh, xl, yh, yl) \ 180 do { \ 181 UWtype _t = xl; \ 182 xh -= yh + ((xl -= yl) > _t); \ 183 } while (0) 184 #endif 185 186 #else 187 188 #undef __FP_FRAC_ADDI_2 189 #define __FP_FRAC_ADDI_2(xh, xl, i) add_ssaaaa(xh, xl, xh, xl, 0, i) 190 #undef __FP_FRAC_ADD_2 191 #define __FP_FRAC_ADD_2 add_ssaaaa 192 #undef __FP_FRAC_SUB_2 193 #define __FP_FRAC_SUB_2 sub_ddmmss 194 #undef __FP_FRAC_DEC_2 195 #define __FP_FRAC_DEC_2(xh, xl, yh, yl) sub_ddmmss(xh, xl, xh, xl, yh, yl) 196 197 #endif 198 199 /* 200 * Unpack the raw bits of a native fp value. Do not classify or 201 * normalize the data. 202 */ 203 204 #define _FP_UNPACK_RAW_2(fs, X, val) \ 205 do { \ 206 union _FP_UNION_##fs _flo; _flo.flt = (val); \ 207 \ 208 X##_f0 = _flo.bits.frac0; \ 209 X##_f1 = _flo.bits.frac1; \ 210 X##_e = _flo.bits.exp; \ 211 X##_s = _flo.bits.sign; \ 212 } while (0) 213 214 #define _FP_UNPACK_RAW_2_P(fs, X, val) \ 215 do { \ 216 union _FP_UNION_##fs *_flo = \ 217 (union _FP_UNION_##fs *)(val); \ 218 \ 219 X##_f0 = _flo->bits.frac0; \ 220 X##_f1 = _flo->bits.frac1; \ 221 X##_e = _flo->bits.exp; \ 222 X##_s = _flo->bits.sign; \ 223 } while (0) 224 225 226 /* 227 * Repack the raw bits of a native fp value. 228 */ 229 230 #define _FP_PACK_RAW_2(fs, val, X) \ 231 do { \ 232 union _FP_UNION_##fs _flo; \ 233 \ 234 _flo.bits.frac0 = X##_f0; \ 235 _flo.bits.frac1 = X##_f1; \ 236 _flo.bits.exp = X##_e; \ 237 _flo.bits.sign = X##_s; \ 238 \ 239 (val) = _flo.flt; \ 240 } while (0) 241 242 #define _FP_PACK_RAW_2_P(fs, val, X) \ 243 do { \ 244 union _FP_UNION_##fs *_flo = \ 245 (union _FP_UNION_##fs *)(val); \ 246 \ 247 _flo->bits.frac0 = X##_f0; \ 248 _flo->bits.frac1 = X##_f1; \ 249 _flo->bits.exp = X##_e; \ 250 _flo->bits.sign = X##_s; \ 251 } while (0) 252 253 254 /* 255 * Multiplication algorithms: 256 */ 257 258 /* Given a 1W * 1W => 2W primitive, do the extended multiplication. */ 259 260 #define _FP_MUL_MEAT_2_wide(wfracbits, R, X, Y, doit) \ 261 do { \ 262 _FP_FRAC_DECL_4(_z); _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c); \ 263 \ 264 doit(_FP_FRAC_WORD_4(_z,1), _FP_FRAC_WORD_4(_z,0), X##_f0, Y##_f0); \ 265 doit(_b_f1, _b_f0, X##_f0, Y##_f1); \ 266 doit(_c_f1, _c_f0, X##_f1, Y##_f0); \ 267 doit(_FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2), X##_f1, Y##_f1); \ 268 \ 269 __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \ 270 _FP_FRAC_WORD_4(_z,1), 0, _b_f1, _b_f0, \ 271 _FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \ 272 _FP_FRAC_WORD_4(_z,1)); \ 273 __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \ 274 _FP_FRAC_WORD_4(_z,1), 0, _c_f1, _c_f0, \ 275 _FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \ 276 _FP_FRAC_WORD_4(_z,1)); \ 277 \ 278 /* Normalize since we know where the msb of the multiplicands \ 279 were (bit B), we know that the msb of the of the product is \ 280 at either 2B or 2B-1. */ \ 281 _FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits); \ 282 R##_f0 = _FP_FRAC_WORD_4(_z,0); \ 283 R##_f1 = _FP_FRAC_WORD_4(_z,1); \ 284 } while (0) 285 286 /* Given a 1W * 1W => 2W primitive, do the extended multiplication. 287 Do only 3 multiplications instead of four. This one is for machines 288 where multiplication is much more expensive than subtraction. */ 289 290 #define _FP_MUL_MEAT_2_wide_3mul(wfracbits, R, X, Y, doit) \ 291 do { \ 292 _FP_FRAC_DECL_4(_z); _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c); \ 293 _FP_W_TYPE _d; \ 294 int _c1, _c2; \ 295 \ 296 _b_f0 = X##_f0 + X##_f1; \ 297 _c1 = _b_f0 < X##_f0; \ 298 _b_f1 = Y##_f0 + Y##_f1; \ 299 _c2 = _b_f1 < Y##_f0; \ 300 doit(_d, _FP_FRAC_WORD_4(_z,0), X##_f0, Y##_f0); \ 301 doit(_FP_FRAC_WORD_4(_z,2), _FP_FRAC_WORD_4(_z,1), _b_f0, _b_f1); \ 302 doit(_c_f1, _c_f0, X##_f1, Y##_f1); \ 303 \ 304 _b_f0 &= -_c2; \ 305 _b_f1 &= -_c1; \ 306 __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \ 307 _FP_FRAC_WORD_4(_z,1), (_c1 & _c2), 0, _d, \ 308 0, _FP_FRAC_WORD_4(_z,2), _FP_FRAC_WORD_4(_z,1)); \ 309 __FP_FRAC_ADDI_2(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \ 310 _b_f0); \ 311 __FP_FRAC_ADDI_2(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \ 312 _b_f1); \ 313 __FP_FRAC_DEC_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \ 314 _FP_FRAC_WORD_4(_z,1), \ 315 0, _d, _FP_FRAC_WORD_4(_z,0)); \ 316 __FP_FRAC_DEC_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2), \ 317 _FP_FRAC_WORD_4(_z,1), 0, _c_f1, _c_f0); \ 318 __FP_FRAC_ADD_2(_FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2), \ 319 _c_f1, _c_f0, \ 320 _FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2)); \ 321 \ 322 /* Normalize since we know where the msb of the multiplicands \ 323 were (bit B), we know that the msb of the of the product is \ 324 at either 2B or 2B-1. */ \ 325 _FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits); \ 326 R##_f0 = _FP_FRAC_WORD_4(_z,0); \ 327 R##_f1 = _FP_FRAC_WORD_4(_z,1); \ 328 } while (0) 329 330 #define _FP_MUL_MEAT_2_gmp(wfracbits, R, X, Y) \ 331 do { \ 332 _FP_FRAC_DECL_4(_z); \ 333 _FP_W_TYPE _x[2], _y[2]; \ 334 _x[0] = X##_f0; _x[1] = X##_f1; \ 335 _y[0] = Y##_f0; _y[1] = Y##_f1; \ 336 \ 337 mpn_mul_n(_z_f, _x, _y, 2); \ 338 \ 339 /* Normalize since we know where the msb of the multiplicands \ 340 were (bit B), we know that the msb of the of the product is \ 341 at either 2B or 2B-1. */ \ 342 _FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits); \ 343 R##_f0 = _z_f[0]; \ 344 R##_f1 = _z_f[1]; \ 345 } while (0) 346 347 /* Do at most 120x120=240 bits multiplication using double floating 348 point multiplication. This is useful if floating point 349 multiplication has much bigger throughput than integer multiply. 350 It is supposed to work for _FP_W_TYPE_SIZE 64 and wfracbits 351 between 106 and 120 only. 352 Caller guarantees that X and Y has (1LLL << (wfracbits - 1)) set. 353 SETFETZ is a macro which will disable all FPU exceptions and set rounding 354 towards zero, RESETFE should optionally reset it back. */ 355 356 #define _FP_MUL_MEAT_2_120_240_double(wfracbits, R, X, Y, setfetz, resetfe) \ 357 do { \ 358 static const double _const[] = { \ 359 /* 2^-24 */ 5.9604644775390625e-08, \ 360 /* 2^-48 */ 3.5527136788005009e-15, \ 361 /* 2^-72 */ 2.1175823681357508e-22, \ 362 /* 2^-96 */ 1.2621774483536189e-29, \ 363 /* 2^28 */ 2.68435456e+08, \ 364 /* 2^4 */ 1.600000e+01, \ 365 /* 2^-20 */ 9.5367431640625e-07, \ 366 /* 2^-44 */ 5.6843418860808015e-14, \ 367 /* 2^-68 */ 3.3881317890172014e-21, \ 368 /* 2^-92 */ 2.0194839173657902e-28, \ 369 /* 2^-116 */ 1.2037062152420224e-35}; \ 370 double _a240, _b240, _c240, _d240, _e240, _f240, \ 371 _g240, _h240, _i240, _j240, _k240; \ 372 union { double d; UDItype i; } _l240, _m240, _n240, _o240, \ 373 _p240, _q240, _r240, _s240; \ 374 UDItype _t240, _u240, _v240, _w240, _x240, _y240 = 0; \ 375 \ 376 if (wfracbits < 106 || wfracbits > 120) \ 377 abort(); \ 378 \ 379 setfetz; \ 380 \ 381 _e240 = (double)(long)(X##_f0 & 0xffffff); \ 382 _j240 = (double)(long)(Y##_f0 & 0xffffff); \ 383 _d240 = (double)(long)((X##_f0 >> 24) & 0xffffff); \ 384 _i240 = (double)(long)((Y##_f0 >> 24) & 0xffffff); \ 385 _c240 = (double)(long)(((X##_f1 << 16) & 0xffffff) | (X##_f0 >> 48)); \ 386 _h240 = (double)(long)(((Y##_f1 << 16) & 0xffffff) | (Y##_f0 >> 48)); \ 387 _b240 = (double)(long)((X##_f1 >> 8) & 0xffffff); \ 388 _g240 = (double)(long)((Y##_f1 >> 8) & 0xffffff); \ 389 _a240 = (double)(long)(X##_f1 >> 32); \ 390 _f240 = (double)(long)(Y##_f1 >> 32); \ 391 _e240 *= _const[3]; \ 392 _j240 *= _const[3]; \ 393 _d240 *= _const[2]; \ 394 _i240 *= _const[2]; \ 395 _c240 *= _const[1]; \ 396 _h240 *= _const[1]; \ 397 _b240 *= _const[0]; \ 398 _g240 *= _const[0]; \ 399 _s240.d = _e240*_j240;\ 400 _r240.d = _d240*_j240 + _e240*_i240;\ 401 _q240.d = _c240*_j240 + _d240*_i240 + _e240*_h240;\ 402 _p240.d = _b240*_j240 + _c240*_i240 + _d240*_h240 + _e240*_g240;\ 403 _o240.d = _a240*_j240 + _b240*_i240 + _c240*_h240 + _d240*_g240 + _e240*_f240;\ 404 _n240.d = _a240*_i240 + _b240*_h240 + _c240*_g240 + _d240*_f240; \ 405 _m240.d = _a240*_h240 + _b240*_g240 + _c240*_f240; \ 406 _l240.d = _a240*_g240 + _b240*_f240; \ 407 _k240 = _a240*_f240; \ 408 _r240.d += _s240.d; \ 409 _q240.d += _r240.d; \ 410 _p240.d += _q240.d; \ 411 _o240.d += _p240.d; \ 412 _n240.d += _o240.d; \ 413 _m240.d += _n240.d; \ 414 _l240.d += _m240.d; \ 415 _k240 += _l240.d; \ 416 _s240.d -= ((_const[10]+_s240.d)-_const[10]); \ 417 _r240.d -= ((_const[9]+_r240.d)-_const[9]); \ 418 _q240.d -= ((_const[8]+_q240.d)-_const[8]); \ 419 _p240.d -= ((_const[7]+_p240.d)-_const[7]); \ 420 _o240.d += _const[7]; \ 421 _n240.d += _const[6]; \ 422 _m240.d += _const[5]; \ 423 _l240.d += _const[4]; \ 424 if (_s240.d != 0.0) _y240 = 1; \ 425 if (_r240.d != 0.0) _y240 = 1; \ 426 if (_q240.d != 0.0) _y240 = 1; \ 427 if (_p240.d != 0.0) _y240 = 1; \ 428 _t240 = (DItype)_k240; \ 429 _u240 = _l240.i; \ 430 _v240 = _m240.i; \ 431 _w240 = _n240.i; \ 432 _x240 = _o240.i; \ 433 R##_f1 = (_t240 << (128 - (wfracbits - 1))) \ 434 | ((_u240 & 0xffffff) >> ((wfracbits - 1) - 104)); \ 435 R##_f0 = ((_u240 & 0xffffff) << (168 - (wfracbits - 1))) \ 436 | ((_v240 & 0xffffff) << (144 - (wfracbits - 1))) \ 437 | ((_w240 & 0xffffff) << (120 - (wfracbits - 1))) \ 438 | ((_x240 & 0xffffff) >> ((wfracbits - 1) - 96)) \ 439 | _y240; \ 440 resetfe; \ 441 } while (0) 442 443 /* 444 * Division algorithms: 445 */ 446 447 #define _FP_DIV_MEAT_2_udiv(fs, R, X, Y) \ 448 do { \ 449 _FP_W_TYPE _n_f2, _n_f1, _n_f0, _r_f1, _r_f0, _m_f1, _m_f0; \ 450 if (_FP_FRAC_GT_2(X, Y)) \ 451 { \ 452 _n_f2 = X##_f1 >> 1; \ 453 _n_f1 = X##_f1 << (_FP_W_TYPE_SIZE - 1) | X##_f0 >> 1; \ 454 _n_f0 = X##_f0 << (_FP_W_TYPE_SIZE - 1); \ 455 } \ 456 else \ 457 { \ 458 R##_e--; \ 459 _n_f2 = X##_f1; \ 460 _n_f1 = X##_f0; \ 461 _n_f0 = 0; \ 462 } \ 463 \ 464 /* Normalize, i.e. make the most significant bit of the \ 465 denominator set. */ \ 466 _FP_FRAC_SLL_2(Y, _FP_WFRACXBITS_##fs); \ 467 \ 468 udiv_qrnnd(R##_f1, _r_f1, _n_f2, _n_f1, Y##_f1); \ 469 umul_ppmm(_m_f1, _m_f0, R##_f1, Y##_f0); \ 470 _r_f0 = _n_f0; \ 471 if (_FP_FRAC_GT_2(_m, _r)) \ 472 { \ 473 R##_f1--; \ 474 _FP_FRAC_ADD_2(_r, Y, _r); \ 475 if (_FP_FRAC_GE_2(_r, Y) && _FP_FRAC_GT_2(_m, _r)) \ 476 { \ 477 R##_f1--; \ 478 _FP_FRAC_ADD_2(_r, Y, _r); \ 479 } \ 480 } \ 481 _FP_FRAC_DEC_2(_r, _m); \ 482 \ 483 if (_r_f1 == Y##_f1) \ 484 { \ 485 /* This is a special case, not an optimization \ 486 (_r/Y##_f1 would not fit into UWtype). \ 487 As _r is guaranteed to be < Y, R##_f0 can be either \ 488 (UWtype)-1 or (UWtype)-2. But as we know what kind \ 489 of bits it is (sticky, guard, round), we don't care. \ 490 We also don't care what the reminder is, because the \ 491 guard bit will be set anyway. -jj */ \ 492 R##_f0 = -1; \ 493 } \ 494 else \ 495 { \ 496 udiv_qrnnd(R##_f0, _r_f1, _r_f1, _r_f0, Y##_f1); \ 497 umul_ppmm(_m_f1, _m_f0, R##_f0, Y##_f0); \ 498 _r_f0 = 0; \ 499 if (_FP_FRAC_GT_2(_m, _r)) \ 500 { \ 501 R##_f0--; \ 502 _FP_FRAC_ADD_2(_r, Y, _r); \ 503 if (_FP_FRAC_GE_2(_r, Y) && _FP_FRAC_GT_2(_m, _r)) \ 504 { \ 505 R##_f0--; \ 506 _FP_FRAC_ADD_2(_r, Y, _r); \ 507 } \ 508 } \ 509 if (!_FP_FRAC_EQ_2(_r, _m)) \ 510 R##_f0 |= _FP_WORK_STICKY; \ 511 } \ 512 } while (0) 513 514 515 #define _FP_DIV_MEAT_2_gmp(fs, R, X, Y) \ 516 do { \ 517 _FP_W_TYPE _x[4], _y[2], _z[4]; \ 518 _y[0] = Y##_f0; _y[1] = Y##_f1; \ 519 _x[0] = _x[3] = 0; \ 520 if (_FP_FRAC_GT_2(X, Y)) \ 521 { \ 522 R##_e++; \ 523 _x[1] = (X##_f0 << (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE) | \ 524 X##_f1 >> (_FP_W_TYPE_SIZE - \ 525 (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE))); \ 526 _x[2] = X##_f1 << (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE); \ 527 } \ 528 else \ 529 { \ 530 _x[1] = (X##_f0 << (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE) | \ 531 X##_f1 >> (_FP_W_TYPE_SIZE - \ 532 (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE))); \ 533 _x[2] = X##_f1 << (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE); \ 534 } \ 535 \ 536 (void) mpn_divrem (_z, 0, _x, 4, _y, 2); \ 537 R##_f1 = _z[1]; \ 538 R##_f0 = _z[0] | ((_x[0] | _x[1]) != 0); \ 539 } while (0) 540 541 542 /* 543 * Square root algorithms: 544 * We have just one right now, maybe Newton approximation 545 * should be added for those machines where division is fast. 546 */ 547 548 #define _FP_SQRT_MEAT_2(R, S, T, X, q) \ 549 do { \ 550 while (q) \ 551 { \ 552 T##_f1 = S##_f1 + q; \ 553 if (T##_f1 <= X##_f1) \ 554 { \ 555 S##_f1 = T##_f1 + q; \ 556 X##_f1 -= T##_f1; \ 557 R##_f1 += q; \ 558 } \ 559 _FP_FRAC_SLL_2(X, 1); \ 560 q >>= 1; \ 561 } \ 562 q = (_FP_W_TYPE)1 << (_FP_W_TYPE_SIZE - 1); \ 563 while (q != _FP_WORK_ROUND) \ 564 { \ 565 T##_f0 = S##_f0 + q; \ 566 T##_f1 = S##_f1; \ 567 if (T##_f1 < X##_f1 || \ 568 (T##_f1 == X##_f1 && T##_f0 <= X##_f0)) \ 569 { \ 570 S##_f0 = T##_f0 + q; \ 571 S##_f1 += (T##_f0 > S##_f0); \ 572 _FP_FRAC_DEC_2(X, T); \ 573 R##_f0 += q; \ 574 } \ 575 _FP_FRAC_SLL_2(X, 1); \ 576 q >>= 1; \ 577 } \ 578 if (X##_f0 | X##_f1) \ 579 { \ 580 if (S##_f1 < X##_f1 || \ 581 (S##_f1 == X##_f1 && S##_f0 < X##_f0)) \ 582 R##_f0 |= _FP_WORK_ROUND; \ 583 R##_f0 |= _FP_WORK_STICKY; \ 584 } \ 585 } while (0) 586 587 588 /* 589 * Assembly/disassembly for converting to/from integral types. 590 * No shifting or overflow handled here. 591 */ 592 593 #define _FP_FRAC_ASSEMBLE_2(r, X, rsize) \ 594 (void)((rsize <= _FP_W_TYPE_SIZE) \ 595 ? ({ r = X##_f0; }) \ 596 : ({ \ 597 r = X##_f1; \ 598 r <<= _FP_W_TYPE_SIZE; \ 599 r += X##_f0; \ 600 })) 601 602 #define _FP_FRAC_DISASSEMBLE_2(X, r, rsize) \ 603 do { \ 604 X##_f0 = r; \ 605 X##_f1 = (rsize <= _FP_W_TYPE_SIZE ? 0 : r >> _FP_W_TYPE_SIZE); \ 606 } while (0) 607 608 /* 609 * Convert FP values between word sizes 610 */ 611 612 #define _FP_FRAC_COPY_1_2(D, S) (D##_f = S##_f0) 613 614 #define _FP_FRAC_COPY_2_1(D, S) ((D##_f0 = S##_f), (D##_f1 = 0)) 615 616 #define _FP_FRAC_COPY_2_2(D,S) _FP_FRAC_COPY_2(D,S) 617