1 // SPDX-License-Identifier: BSD-3-Clause
2 /* ==========================================================================
3 * ieee754.c -- floating-point conversion between half, double & single-precision
4 *
5 * Copyright (c) 2018-2024, Laurence Lundblade. All rights reserved.
6 * Copyright (c) 2021, Arm Limited. All rights reserved.
7 *
8 * SPDX-License-Identifier: BSD-3-Clause
9 *
10 * See BSD-3-Clause license in README.md
11 *
12 * Created on 7/23/18
13 * ========================================================================== */
14
15 /*
16 * Include before QCBOR_DISABLE_PREFERRED_FLOAT is checked as
17 * QCBOR_DISABLE_PREFERRED_FLOAT might be defined in qcbor/qcbor_common.h
18 */
19 #include "qcbor/qcbor_common.h"
20
21 #ifndef QCBOR_DISABLE_PREFERRED_FLOAT
22
23 #include "ieee754.h"
24 #include <string.h> /* For memcpy() */
25
26
27 /*
28 * This code has long lines and is easier to read because of
29 * them. Some coding guidelines prefer 80 column lines (can they not
30 * afford big displays?).
31 *
32 * This code works solely using shifts and masks and thus has no
33 * dependency on any math libraries. It can even work if the CPU
34 * doesn't have any floating-point support, though that isn't the most
35 * useful thing to do.
36 *
37 * The memcpy() dependency is only for CopyFloatToUint32() and friends
38 * which only is needed to avoid type punning when converting the
39 * actual float bits to an unsigned value so the bit shifts and masks
40 * can work.
41 *
42 * The references used to write this code:
43 *
44 * IEEE 754-2008, particularly section 3.6 and 6.2.1
45 *
46 * https://en.wikipedia.org/wiki/IEEE_754 and subordinate pages
47 *
48 * https://stackoverflow.com/questions/19800415/why-does-ieee-754-reserve-so-many-nan-values
49 *
50 * https://stackoverflow.com/questions/46073295/implicit-type-promotion-rules
51 *
52 * https://stackoverflow.com/questions/589575/what-does-the-c-standard-state-the-size-of-int-long-type-to-be
53 *
54 * IEEE754_FloatToDouble(uint32_t uFloat) was created but is not
55 * needed. It can be retrieved from github history if needed.
56 */
57
58
59
60
61 /* ----- Half Precsion ----------- */
62 #define HALF_NUM_SIGNIFICAND_BITS (10)
63 #define HALF_NUM_EXPONENT_BITS (5)
64 #define HALF_NUM_SIGN_BITS (1)
65
66 #define HALF_SIGNIFICAND_SHIFT (0)
67 #define HALF_EXPONENT_SHIFT (HALF_NUM_SIGNIFICAND_BITS)
68 #define HALF_SIGN_SHIFT (HALF_NUM_SIGNIFICAND_BITS + HALF_NUM_EXPONENT_BITS)
69
70 #define HALF_SIGNIFICAND_MASK (0x3ffU) // The lower 10 bits
71 #define HALF_EXPONENT_MASK (0x1fU << HALF_EXPONENT_SHIFT) // 0x7c00 5 bits of exponent
72 #define HALF_SIGN_MASK (0x01U << HALF_SIGN_SHIFT) // 0x8000 1 bit of sign
73 #define HALF_QUIET_NAN_BIT (0x01U << (HALF_NUM_SIGNIFICAND_BITS-1)) // 0x0200
74
75 /* Biased Biased Unbiased Use
76 * 0x00 0 -15 0 and subnormal
77 * 0x01 1 -14 Smallest normal exponent
78 * 0x1e 30 15 Largest normal exponent
79 * 0x1F 31 16 NaN and Infinity */
80 #define HALF_EXPONENT_BIAS (15)
81 #define HALF_EXPONENT_MAX (HALF_EXPONENT_BIAS) // 15 Unbiased
82 #define HALF_EXPONENT_MIN (-HALF_EXPONENT_BIAS+1) // -14 Unbiased
83 #define HALF_EXPONENT_ZERO (-HALF_EXPONENT_BIAS) // -15 Unbiased
84 #define HALF_EXPONENT_INF_OR_NAN (HALF_EXPONENT_BIAS+1) // 16 Unbiased
85
86
87 /* ------ Single-Precision -------- */
88 #define SINGLE_NUM_SIGNIFICAND_BITS (23)
89 #define SINGLE_NUM_EXPONENT_BITS (8)
90 #define SINGLE_NUM_SIGN_BITS (1)
91
92 #define SINGLE_SIGNIFICAND_SHIFT (0)
93 #define SINGLE_EXPONENT_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS)
94 #define SINGLE_SIGN_SHIFT (SINGLE_NUM_SIGNIFICAND_BITS + SINGLE_NUM_EXPONENT_BITS)
95
96 #define SINGLE_SIGNIFICAND_MASK (0x7fffffU) // The lower 23 bits
97 #define SINGLE_EXPONENT_MASK (0xffU << SINGLE_EXPONENT_SHIFT) // 8 bits of exponent
98 #define SINGLE_SIGN_MASK (0x01U << SINGLE_SIGN_SHIFT) // 1 bit of sign
99 #define SINGLE_QUIET_NAN_BIT (0x01U << (SINGLE_NUM_SIGNIFICAND_BITS-1))
100
101 /* Biased Biased Unbiased Use
102 * 0x0000 0 -127 0 and subnormal
103 * 0x0001 1 -126 Smallest normal exponent
104 * 0x7f 127 0 1
105 * 0xfe 254 127 Largest normal exponent
106 * 0xff 255 128 NaN and Infinity */
107 #define SINGLE_EXPONENT_BIAS (127)
108 #define SINGLE_EXPONENT_MAX (SINGLE_EXPONENT_BIAS)
109 #define SINGLE_EXPONENT_MIN (-SINGLE_EXPONENT_BIAS+1)
110 #define SINGLE_EXPONENT_ZERO (-SINGLE_EXPONENT_BIAS)
111 #define SINGLE_EXPONENT_INF_OR_NAN (SINGLE_EXPONENT_BIAS+1)
112
113
114 /* --------- Double-Precision ---------- */
115 #define DOUBLE_NUM_SIGNIFICAND_BITS (52)
116 #define DOUBLE_NUM_EXPONENT_BITS (11)
117 #define DOUBLE_NUM_SIGN_BITS (1)
118
119 #define DOUBLE_SIGNIFICAND_SHIFT (0)
120 #define DOUBLE_EXPONENT_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS)
121 #define DOUBLE_SIGN_SHIFT (DOUBLE_NUM_SIGNIFICAND_BITS + DOUBLE_NUM_EXPONENT_BITS)
122
123 #define DOUBLE_SIGNIFICAND_MASK (0xfffffffffffffULL) // The lower 52 bits
124 #define DOUBLE_EXPONENT_MASK (0x7ffULL << DOUBLE_EXPONENT_SHIFT) // 11 bits of exponent
125 #define DOUBLE_SIGN_MASK (0x01ULL << DOUBLE_SIGN_SHIFT) // 1 bit of sign
126 #define DOUBLE_QUIET_NAN_BIT (0x01ULL << (DOUBLE_NUM_SIGNIFICAND_BITS-1))
127
128
129 /* Biased Biased Unbiased Use
130 * 0x00000000 0 -1023 0 and subnormal
131 * 0x00000001 1 -1022 Smallest normal exponent
132 * 0x000007fe 2046 1023 Largest normal exponent
133 * 0x000007ff 2047 1024 NaN and Infinity */
134 #define DOUBLE_EXPONENT_BIAS (1023)
135 #define DOUBLE_EXPONENT_MAX (DOUBLE_EXPONENT_BIAS)
136 #define DOUBLE_EXPONENT_MIN (-DOUBLE_EXPONENT_BIAS+1)
137 #define DOUBLE_EXPONENT_ZERO (-DOUBLE_EXPONENT_BIAS)
138 #define DOUBLE_EXPONENT_INF_OR_NAN (DOUBLE_EXPONENT_BIAS+1)
139
140
141
142
143 /*
144 * Convenient functions to avoid type punning, compiler warnings and
145 * such. The optimizer reduces them to a simple assignment. This is a
146 * crusty corner of C. It shouldn't be this hard.
147 *
148 * These are also in UsefulBuf.h under a different name. They are copied
149 * here to avoid a dependency on UsefulBuf.h. There is no object code
150 * size impact because these always optimze down to a simple assignment.
151 */
152 static inline uint32_t
CopyFloatToUint32(float f)153 CopyFloatToUint32(float f)
154 {
155 uint32_t u32;
156 memcpy(&u32, &f, sizeof(uint32_t));
157 return u32;
158 }
159
160 static inline uint64_t
CopyDoubleToUint64(double d)161 CopyDoubleToUint64(double d)
162 {
163 uint64_t u64;
164 memcpy(&u64, &d, sizeof(uint64_t));
165 return u64;
166 }
167
168 static inline double
CopyUint64ToDouble(uint64_t u64)169 CopyUint64ToDouble(uint64_t u64)
170 {
171 double d;
172 memcpy(&d, &u64, sizeof(uint64_t));
173 return d;
174 }
175
176 static inline float
CopyUint32ToSingle(uint32_t u32)177 CopyUint32ToSingle(uint32_t u32)
178 {
179 float f;
180 memcpy(&f, &u32, sizeof(uint32_t));
181 return f;
182 }
183
184
185
186
187 /**
188 * @brief Assemble sign, significand and exponent into single precision float.
189 *
190 * @param[in] uDoubleSign 0 if positive, 1 if negative
191 * @pararm[in] uDoubleSignificand Bits of the significand
192 * @param[in] nDoubleUnBiasedExponent Exponent
193 *
194 * This returns the bits for a single-precision float, a binary64
195 * as specified in IEEE754.
196 */
197 static double
IEEE754_AssembleDouble(uint64_t uDoubleSign,uint64_t uDoubleSignificand,int64_t nDoubleUnBiasedExponent)198 IEEE754_AssembleDouble(uint64_t uDoubleSign,
199 uint64_t uDoubleSignificand,
200 int64_t nDoubleUnBiasedExponent)
201 {
202 uint64_t uDoubleBiasedExponent;
203
204 uDoubleBiasedExponent = (uint64_t)(nDoubleUnBiasedExponent + DOUBLE_EXPONENT_BIAS);
205
206 return CopyUint64ToDouble(uDoubleSignificand |
207 (uDoubleBiasedExponent << DOUBLE_EXPONENT_SHIFT) |
208 (uDoubleSign << DOUBLE_SIGN_SHIFT));
209 }
210
211
212 double
IEEE754_HalfToDouble(uint16_t uHalfPrecision)213 IEEE754_HalfToDouble(uint16_t uHalfPrecision)
214 {
215 uint64_t uDoubleSignificand;
216 int64_t nDoubleUnBiasedExponent;
217 double dResult;
218
219 /* Pull out the three parts of the half-precision float. Do all
220 * the work in 64 bits because that is what the end result is. It
221 * may give smaller code size and will keep static analyzers
222 * happier.
223 */
224 const uint64_t uHalfSignificand = uHalfPrecision & HALF_SIGNIFICAND_MASK;
225 const uint64_t uHalfBiasedExponent = (uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT;
226 const int64_t nHalfUnBiasedExponent = (int64_t)uHalfBiasedExponent - HALF_EXPONENT_BIAS;
227 const uint64_t uHalfSign = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT;
228
229 if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) {
230 /* 0 or subnormal */
231 if(uHalfSignificand) {
232 /* --- SUBNORMAL --- */
233 /* A half-precision subnormal can always be converted to a
234 * normal double-precision float because the ranges line up.
235 * The exponent of a subnormal starts out at the min exponent
236 * for a normal. As the sub normal significand bits are
237 * shifted, left to normalize, the exponent is
238 * decremented. Shifting continues until fully normalized.
239 */
240 nDoubleUnBiasedExponent = HALF_EXPONENT_MIN;
241 uDoubleSignificand = uHalfSignificand;
242 do {
243 uDoubleSignificand <<= 1;
244 nDoubleUnBiasedExponent--;
245 } while ((uDoubleSignificand & (1ULL << HALF_NUM_SIGNIFICAND_BITS)) == 0);
246 /* A normal has an implied 1 in the most significant
247 * position that a subnormal doesn't. */
248 uDoubleSignificand -= 1ULL << HALF_NUM_SIGNIFICAND_BITS;
249 /* Must shift into place for a double significand */
250 uDoubleSignificand <<= DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS;
251
252 dResult = IEEE754_AssembleDouble(uHalfSign,
253 uDoubleSignificand,
254 nDoubleUnBiasedExponent);
255 } else {
256 /* --- ZERO --- */
257 dResult = IEEE754_AssembleDouble(uHalfSign,
258 0,
259 DOUBLE_EXPONENT_ZERO);
260 }
261 } else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) {
262 /* NaN or Inifinity */
263 if(uHalfSignificand) {
264 /* --- NaN --- */
265 /* Half-precision payloads always fit into double precision
266 * payloads. They are shifted left the same as a normal
267 * number significand.
268 */
269 uDoubleSignificand = uHalfSignificand << (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
270 dResult = IEEE754_AssembleDouble(uHalfSign,
271 uDoubleSignificand,
272 DOUBLE_EXPONENT_INF_OR_NAN);
273 } else {
274 /* --- INFINITY --- */
275 dResult = IEEE754_AssembleDouble(uHalfSign,
276 0,
277 DOUBLE_EXPONENT_INF_OR_NAN);
278 }
279 } else {
280 /* --- NORMAL NUMBER --- */
281 uDoubleSignificand = uHalfSignificand << (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
282 dResult = IEEE754_AssembleDouble(uHalfSign,
283 uDoubleSignificand,
284 nHalfUnBiasedExponent);
285 }
286
287 return dResult;
288 }
289
290
291 /**
292 * @brief Assemble sign, significand and exponent into single precision float.
293 *
294 * @param[in] uHalfSign 0 if positive, 1 if negative
295 * @pararm[in] uHalfSignificand Bits of the significand
296 * @param[in] nHalfUnBiasedExponent Exponent
297 *
298 * This returns the bits for a single-precision float, a binary32 as
299 * specified in IEEE754. It is returned as a uint64_t rather than a
300 * uint32_t or a float for convenience of usage.
301 */
302 static uint32_t
IEEE754_AssembleHalf(uint32_t uHalfSign,uint32_t uHalfSignificand,int32_t nHalfUnBiasedExponent)303 IEEE754_AssembleHalf(uint32_t uHalfSign,
304 uint32_t uHalfSignificand,
305 int32_t nHalfUnBiasedExponent)
306 {
307 uint32_t uHalfUnbiasedExponent;
308
309 uHalfUnbiasedExponent = (uint32_t)(nHalfUnBiasedExponent + HALF_EXPONENT_BIAS);
310
311 return uHalfSignificand |
312 (uHalfUnbiasedExponent << HALF_EXPONENT_SHIFT) |
313 (uHalfSign << HALF_SIGN_SHIFT);
314 }
315
316
317 /* Public function; see ieee754.h */
318 IEEE754_union
IEEE754_SingleToHalf(float f)319 IEEE754_SingleToHalf(float f)
320 {
321 IEEE754_union result;
322 uint32_t uDroppedBits;
323 int32_t nExponentDifference;
324 int32_t nShiftAmount;
325 uint32_t uHalfSignificand;
326
327 /* Pull the three parts out of the double-precision float Most work
328 * is done with uint32_t which helps avoid integer promotions and
329 * static analyzer complaints.
330 */
331 const uint32_t uSingle = CopyFloatToUint32(f);
332 const uint32_t uSingleBiasedExponent = (uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT;
333 const int32_t nSingleUnbiasedExponent = (int32_t)uSingleBiasedExponent - SINGLE_EXPONENT_BIAS;
334 const uint32_t uSingleSignificand = uSingle & SINGLE_SIGNIFICAND_MASK;
335 const uint32_t uSingleSign = (uSingle & SINGLE_SIGN_MASK) >> SINGLE_SIGN_SHIFT;
336
337 if(nSingleUnbiasedExponent == SINGLE_EXPONENT_ZERO) {
338 if(uSingleSignificand == 0) {
339 /* --- IS ZERO --- */
340 result.uSize = IEEE754_UNION_IS_HALF;
341 result.uValue = IEEE754_AssembleHalf(uSingleSign,
342 0,
343 HALF_EXPONENT_ZERO);
344 } else {
345 /* --- IS SINGLE SUBNORMAL --- */
346 /* The largest single subnormal is slightly less than the
347 * largest single normal which is 2^-149 or
348 * 2.2040517676619426e-38. The smallest half subnormal is
349 * 2^-14 or 5.9604644775390625E-8. There is no overlap so
350 * single subnormals can't be converted to halfs of any sort.
351 */
352 result.uSize = IEEE754_UNION_IS_SINGLE;
353 result.uValue = uSingle;
354 }
355 } else if(nSingleUnbiasedExponent == SINGLE_EXPONENT_INF_OR_NAN) {
356 if(uSingleSignificand == 0) {
357 /* ---- IS INFINITY ---- */
358 result.uSize = IEEE754_UNION_IS_HALF;
359 result.uValue = IEEE754_AssembleHalf(uSingleSign, 0, HALF_EXPONENT_INF_OR_NAN);
360 } else {
361 /* The NaN can only be converted if no payload bits are lost
362 * per RFC 8949 section 4.1 that defines Preferred
363 * Serializaton. Note that Deterministically Encode CBOR in
364 * section 4.2 allows for some variation of this rule, but at
365 * the moment this implementation is of Preferred
366 * Serialization, not CDE. As of December 2023, we are also
367 * expecting an update to CDE. This code may need to be
368 * updated for CDE.
369 */
370 uDroppedBits = uSingleSignificand & (SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS);
371 if(uDroppedBits == 0) {
372 /* --- IS CONVERTABLE NAN --- */
373 uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
374 result.uSize = IEEE754_UNION_IS_HALF;
375 result.uValue = IEEE754_AssembleHalf(uSingleSign,
376 uHalfSignificand,
377 HALF_EXPONENT_INF_OR_NAN);
378
379 } else {
380 /* --- IS UNCONVERTABLE NAN --- */
381 result.uSize = IEEE754_UNION_IS_SINGLE;
382 result.uValue = uSingle;
383 }
384 }
385 } else {
386 /* ---- REGULAR NUMBER ---- */
387 /* A regular single can be converted to a regular half if the
388 * single's exponent is in the smaller range of a half and if no
389 * precision is lost in the significand.
390 */
391 if(nSingleUnbiasedExponent >= HALF_EXPONENT_MIN &&
392 nSingleUnbiasedExponent <= HALF_EXPONENT_MAX &&
393 (uSingleSignificand & (SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS)) == 0) {
394 uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
395
396 /* --- CONVERT TO HALF NORMAL --- */
397 result.uSize = IEEE754_UNION_IS_HALF;
398 result.uValue = IEEE754_AssembleHalf(uSingleSign,
399 uHalfSignificand,
400 nSingleUnbiasedExponent);
401 } else {
402 /* Unable to convert to a half normal. See if it can be
403 * converted to a half subnormal. To do that, the exponent
404 * must be in range and no precision can be lost in the
405 * signficand.
406 *
407 * This is more complicated because the number is not
408 * normalized. The signficand must be shifted proprotionally
409 * to the exponent and 1 must be added in. See
410 * https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Exponent_encoding
411 *
412 * Exponents -14 to -24 map to a shift of 0 to 10 of the
413 * significand. The largest value of a half subnormal has an
414 * exponent of -14. Subnormals are not normalized like
415 * normals meaning they lose precision as the numbers get
416 * smaller. Normals don't lose precision because the exponent
417 * allows all the bits of the significand to be significant.
418 */
419 /* The exponent of the largest possible half-precision
420 * subnormal is HALF_EXPONENT_MIN (-14). Exponents larger
421 * than this are normal and handled above. We're going to
422 * shift the significand right by at least this amount.
423 */
424 nExponentDifference = -(nSingleUnbiasedExponent - HALF_EXPONENT_MIN);
425
426 /* In addition to the shift based on the exponent's value,
427 * the single significand has to be shifted right to fit into
428 * a half-precision significand */
429 nShiftAmount = nExponentDifference + (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
430
431 /* Must add 1 in to the possible significand because there is
432 * an implied 1 for normal values and not for subnormal
433 * values. See equations here:
434 * https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Exponent_encoding
435 */
436 uHalfSignificand = (uSingleSignificand + (1 << SINGLE_NUM_SIGNIFICAND_BITS)) >> nShiftAmount;
437
438 /* If only zero bits get shifted out, this can be converted
439 * to subnormal */
440 if(nSingleUnbiasedExponent < HALF_EXPONENT_MIN &&
441 nSingleUnbiasedExponent >= HALF_EXPONENT_MIN - HALF_NUM_SIGNIFICAND_BITS &&
442 uHalfSignificand << nShiftAmount == uSingleSignificand + (1 << SINGLE_NUM_SIGNIFICAND_BITS)) {
443 /* --- CONVERTABLE TO HALF SUBNORMAL --- */
444 result.uSize = IEEE754_UNION_IS_HALF;
445 result.uValue = IEEE754_AssembleHalf(uSingleSign,
446 uHalfSignificand,
447 HALF_EXPONENT_ZERO);
448 } else {
449 /* --- DO NOT CONVERT --- */
450 result.uSize = IEEE754_UNION_IS_SINGLE;
451 result.uValue = uSingle;
452 }
453 }
454 }
455
456 return result;
457 }
458
459
460 /**
461 * @brief Assemble sign, significand and exponent into single precision float.
462 *
463 * @param[in] uSingleSign 0 if positive, 1 if negative
464 * @pararm[in] uSingleSignificand Bits of the significand
465 * @param[in] nSingleUnBiasedExponent Exponent
466 *
467 * This returns the bits for a single-precision float, a binary32 as
468 * specified in IEEE754. It is returned as a uint64_t rather than a
469 * uint32_t or a float for convenience of usage.
470 */
471 static uint64_t
IEEE754_AssembleSingle(uint64_t uSingleSign,uint64_t uSingleSignificand,int64_t nSingleUnBiasedExponent)472 IEEE754_AssembleSingle(uint64_t uSingleSign,
473 uint64_t uSingleSignificand,
474 int64_t nSingleUnBiasedExponent)
475 {
476 uint64_t uSingleBiasedExponent;
477
478 uSingleBiasedExponent = (uint64_t)(nSingleUnBiasedExponent + SINGLE_EXPONENT_BIAS);
479
480 return uSingleSignificand |
481 (uSingleBiasedExponent << SINGLE_EXPONENT_SHIFT) |
482 (uSingleSign << SINGLE_SIGN_SHIFT);
483 }
484
485
486 /**
487 * @brief Convert a double-precision float to single-precision.
488 *
489 * @param[in] d The value to convert.
490 *
491 * @returns Either unconverted value or value converted to single-precision.
492 *
493 * This always succeeds. If the value cannot be converted without the
494 * loss of precision, it is not converted.
495 *
496 * This handles all subnormals and NaN payloads.
497 */
498 static IEEE754_union
IEEE754_DoubleToSingle(double d)499 IEEE754_DoubleToSingle(double d)
500 {
501 IEEE754_union Result;
502 int64_t nExponentDifference;
503 int64_t nShiftAmount;
504 uint64_t uSingleSignificand;
505 uint64_t uDroppedBits;
506
507
508 /* Pull the three parts out of the double-precision float. Most
509 * work is done with uint64_t which helps avoid integer promotions
510 * and static analyzer complaints.
511 */
512 const uint64_t uDouble = CopyDoubleToUint64(d);
513 const uint64_t uDoubleBiasedExponent = (uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT;
514 const int64_t nDoubleUnbiasedExponent = (int64_t)uDoubleBiasedExponent - DOUBLE_EXPONENT_BIAS;
515 const uint64_t uDoubleSign = (uDouble & DOUBLE_SIGN_MASK) >> DOUBLE_SIGN_SHIFT;
516 const uint64_t uDoubleSignificand = uDouble & DOUBLE_SIGNIFICAND_MASK;
517
518
519 if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_ZERO) {
520 if(uDoubleSignificand == 0) {
521 /* --- IS ZERO --- */
522 Result.uSize = IEEE754_UNION_IS_SINGLE;
523 Result.uValue = IEEE754_AssembleSingle(uDoubleSign,
524 0,
525 SINGLE_EXPONENT_ZERO);
526 } else {
527 /* --- IS DOUBLE SUBNORMAL --- */
528 /* The largest double subnormal is slightly less than the
529 * largest double normal which is 2^-1022 or
530 * 2.2250738585072014e-308. The smallest single subnormal
531 * is 2^-149 or 1.401298464324817e-45. There is no
532 * overlap so double subnormals can't be converted to
533 * singles of any sort.
534 */
535 Result.uSize = IEEE754_UNION_IS_DOUBLE;
536 Result.uValue = uDouble;
537 }
538 } else if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
539 if(uDoubleSignificand == 0) {
540 /* ---- IS INFINITY ---- */
541 Result.uSize = IEEE754_UNION_IS_SINGLE;
542 Result.uValue = IEEE754_AssembleSingle(uDoubleSign,
543 0,
544 SINGLE_EXPONENT_INF_OR_NAN);
545 } else {
546 /* The NaN can only be converted if no payload bits are
547 * lost per RFC 8949 section 4.1 that defines Preferred
548 * Serializaton. Note that Deterministically Encode CBOR
549 * in section 4.2 allows for some variation of this rule,
550 * but at the moment this implementation is of Preferred
551 * Serialization, not CDE. As of December 2023, we are
552 * also expecting an update to CDE. This code may need to
553 * be updated for CDE.
554 */
555 uDroppedBits = uDoubleSignificand & (DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS);
556 if(uDroppedBits == 0) {
557 /* --- IS CONVERTABLE NAN --- */
558 uSingleSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS);
559 Result.uSize = IEEE754_UNION_IS_SINGLE;
560 Result.uValue = IEEE754_AssembleSingle(uDoubleSign,
561 uSingleSignificand,
562 SINGLE_EXPONENT_INF_OR_NAN);
563 } else {
564 /* --- IS UNCONVERTABLE NAN --- */
565 Result.uSize = IEEE754_UNION_IS_DOUBLE;
566 Result.uValue = uDouble;
567 }
568 }
569 } else {
570 /* ---- REGULAR NUMBER ---- */
571 /* A regular double can be converted to a regular single if
572 * the double's exponent is in the smaller range of a single
573 * and if no precision is lost in the significand.
574 */
575 uDroppedBits = uDoubleSignificand & (DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS);
576 if(nDoubleUnbiasedExponent >= SINGLE_EXPONENT_MIN &&
577 nDoubleUnbiasedExponent <= SINGLE_EXPONENT_MAX &&
578 uDroppedBits == 0) {
579 /* --- IS CONVERTABLE TO SINGLE --- */
580 uSingleSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS);
581 Result.uSize = IEEE754_UNION_IS_SINGLE;
582 Result.uValue = IEEE754_AssembleSingle(uDoubleSign,
583 uSingleSignificand,
584 nDoubleUnbiasedExponent);
585 } else {
586 /* Unable to convert to a single normal. See if it can be
587 * converted to a single subnormal. To do that, the
588 * exponent must be in range and no precision can be lost
589 * in the signficand.
590 *
591 * This is more complicated because the number is not
592 * normalized. The signficand must be shifted
593 * proprotionally to the exponent and 1 must be added
594 * in. See
595 * https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Exponent_encoding
596 */
597 nExponentDifference = -(nDoubleUnbiasedExponent - SINGLE_EXPONENT_MIN);
598 nShiftAmount = nExponentDifference + (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS);
599 uSingleSignificand = (uDoubleSignificand + (1ULL << DOUBLE_NUM_SIGNIFICAND_BITS)) >> nShiftAmount;
600
601 if(nDoubleUnbiasedExponent < SINGLE_EXPONENT_MIN &&
602 nDoubleUnbiasedExponent >= SINGLE_EXPONENT_MIN - SINGLE_NUM_SIGNIFICAND_BITS &&
603 uSingleSignificand << nShiftAmount == uDoubleSignificand + (1ULL << DOUBLE_NUM_SIGNIFICAND_BITS)) {
604 /* --- IS CONVERTABLE TO SINGLE SUBNORMAL --- */
605 Result.uSize = IEEE754_UNION_IS_SINGLE;
606 Result.uValue = IEEE754_AssembleSingle(uDoubleSign,
607 uSingleSignificand,
608 SINGLE_EXPONENT_ZERO);
609 } else {
610 /* --- CAN NOT BE CONVERTED --- */
611 Result.uSize = IEEE754_UNION_IS_DOUBLE;
612 Result.uValue = uDouble;
613 }
614 }
615 }
616
617 return Result;
618 }
619
620
621 /* Public function; see ieee754.h */
622 IEEE754_union
IEEE754_DoubleToSmaller(double d,int bAllowHalfPrecision)623 IEEE754_DoubleToSmaller(double d, int bAllowHalfPrecision)
624 {
625 IEEE754_union result;
626
627 result = IEEE754_DoubleToSingle(d);
628
629 if(result.uSize == IEEE754_UNION_IS_SINGLE && bAllowHalfPrecision) {
630 /* Cast to uint32_t is OK, because value was just successfully
631 * converted to single. */
632 float uSingle = CopyUint32ToSingle((uint32_t)result.uValue);
633 result = IEEE754_SingleToHalf(uSingle);
634 }
635
636 return result;
637 }
638
639
640 #else /* QCBOR_DISABLE_PREFERRED_FLOAT */
641
642 int ieee754_dummy_place_holder;
643
644 #endif /* QCBOR_DISABLE_PREFERRED_FLOAT */
645