1 // SPDX-License-Identifier: BSD-3-Clause
2 /* ==========================================================================
3  * ieee754.c -- floating-point conversion between half, double & single-precision
4  *
5  * Copyright (c) 2018-2024, Laurence Lundblade. All rights reserved.
6  * Copyright (c) 2021, Arm Limited. All rights reserved.
7  *
8  * SPDX-License-Identifier: BSD-3-Clause
9  *
10  * See BSD-3-Clause license in README.md
11  *
12  * Created on 7/23/18
13  * ========================================================================== */
14 
15 /*
16  * Include before QCBOR_DISABLE_PREFERRED_FLOAT is checked as
17  * QCBOR_DISABLE_PREFERRED_FLOAT might be defined in qcbor/qcbor_common.h
18  */
19 #include "qcbor/qcbor_common.h"
20 
21 #ifndef QCBOR_DISABLE_PREFERRED_FLOAT
22 
23 #include "ieee754.h"
24 #include <string.h> /* For memcpy() */
25 
26 
27 /*
28  * This code has long lines and is easier to read because of
29  * them. Some coding guidelines prefer 80 column lines (can they not
30  * afford big displays?).
31  *
32  * This code works solely using shifts and masks and thus has no
33  * dependency on any math libraries. It can even work if the CPU
34  * doesn't have any floating-point support, though that isn't the most
35  * useful thing to do.
36  *
37  * The memcpy() dependency is only for CopyFloatToUint32() and friends
38  * which only is needed to avoid type punning when converting the
39  * actual float bits to an unsigned value so the bit shifts and masks
40  * can work.
41  *
42  * The references used to write this code:
43  *
44  *  IEEE 754-2008, particularly section 3.6 and 6.2.1
45  *
46  *  https://en.wikipedia.org/wiki/IEEE_754 and subordinate pages
47  *
48  *  https://stackoverflow.com/questions/19800415/why-does-ieee-754-reserve-so-many-nan-values
49  *
50  *  https://stackoverflow.com/questions/46073295/implicit-type-promotion-rules
51  *
52  *  https://stackoverflow.com/questions/589575/what-does-the-c-standard-state-the-size-of-int-long-type-to-be
53  *
54  * IEEE754_FloatToDouble(uint32_t uFloat) was created but is not
55  * needed. It can be retrieved from github history if needed.
56  */
57 
58 
59 
60 
61 /* ----- Half Precsion ----------- */
62 #define HALF_NUM_SIGNIFICAND_BITS (10)
63 #define HALF_NUM_EXPONENT_BITS    (5)
64 #define HALF_NUM_SIGN_BITS        (1)
65 
66 #define HALF_SIGNIFICAND_SHIFT    (0)
67 #define HALF_EXPONENT_SHIFT       (HALF_NUM_SIGNIFICAND_BITS)
68 #define HALF_SIGN_SHIFT           (HALF_NUM_SIGNIFICAND_BITS + HALF_NUM_EXPONENT_BITS)
69 
70 #define HALF_SIGNIFICAND_MASK     (0x3ffU) // The lower 10 bits
71 #define HALF_EXPONENT_MASK        (0x1fU << HALF_EXPONENT_SHIFT) // 0x7c00 5 bits of exponent
72 #define HALF_SIGN_MASK            (0x01U << HALF_SIGN_SHIFT) // 0x8000 1 bit of sign
73 #define HALF_QUIET_NAN_BIT        (0x01U << (HALF_NUM_SIGNIFICAND_BITS-1)) // 0x0200
74 
75 /* Biased    Biased    Unbiased   Use
76  *  0x00       0        -15       0 and subnormal
77  *  0x01       1        -14       Smallest normal exponent
78  *  0x1e      30         15       Largest normal exponent
79  *  0x1F      31         16       NaN and Infinity  */
80 #define HALF_EXPONENT_BIAS        (15)
81 #define HALF_EXPONENT_MAX         (HALF_EXPONENT_BIAS)    //  15 Unbiased
82 #define HALF_EXPONENT_MIN         (-HALF_EXPONENT_BIAS+1) // -14 Unbiased
83 #define HALF_EXPONENT_ZERO        (-HALF_EXPONENT_BIAS)   // -15 Unbiased
84 #define HALF_EXPONENT_INF_OR_NAN  (HALF_EXPONENT_BIAS+1)  //  16 Unbiased
85 
86 
87 /* ------ Single-Precision -------- */
88 #define SINGLE_NUM_SIGNIFICAND_BITS (23)
89 #define SINGLE_NUM_EXPONENT_BITS    (8)
90 #define SINGLE_NUM_SIGN_BITS        (1)
91 
92 #define SINGLE_SIGNIFICAND_SHIFT    (0)
93 #define SINGLE_EXPONENT_SHIFT       (SINGLE_NUM_SIGNIFICAND_BITS)
94 #define SINGLE_SIGN_SHIFT           (SINGLE_NUM_SIGNIFICAND_BITS + SINGLE_NUM_EXPONENT_BITS)
95 
96 #define SINGLE_SIGNIFICAND_MASK     (0x7fffffU) // The lower 23 bits
97 #define SINGLE_EXPONENT_MASK        (0xffU << SINGLE_EXPONENT_SHIFT) // 8 bits of exponent
98 #define SINGLE_SIGN_MASK            (0x01U << SINGLE_SIGN_SHIFT) // 1 bit of sign
99 #define SINGLE_QUIET_NAN_BIT        (0x01U << (SINGLE_NUM_SIGNIFICAND_BITS-1))
100 
101 /* Biased  Biased   Unbiased  Use
102  *  0x0000     0     -127      0 and subnormal
103  *  0x0001     1     -126      Smallest normal exponent
104  *  0x7f     127        0      1
105  *  0xfe     254      127      Largest normal exponent
106  *  0xff     255      128      NaN and Infinity  */
107 #define SINGLE_EXPONENT_BIAS        (127)
108 #define SINGLE_EXPONENT_MAX         (SINGLE_EXPONENT_BIAS)
109 #define SINGLE_EXPONENT_MIN         (-SINGLE_EXPONENT_BIAS+1)
110 #define SINGLE_EXPONENT_ZERO        (-SINGLE_EXPONENT_BIAS)
111 #define SINGLE_EXPONENT_INF_OR_NAN  (SINGLE_EXPONENT_BIAS+1)
112 
113 
114 /* --------- Double-Precision ---------- */
115 #define DOUBLE_NUM_SIGNIFICAND_BITS (52)
116 #define DOUBLE_NUM_EXPONENT_BITS    (11)
117 #define DOUBLE_NUM_SIGN_BITS        (1)
118 
119 #define DOUBLE_SIGNIFICAND_SHIFT    (0)
120 #define DOUBLE_EXPONENT_SHIFT       (DOUBLE_NUM_SIGNIFICAND_BITS)
121 #define DOUBLE_SIGN_SHIFT           (DOUBLE_NUM_SIGNIFICAND_BITS + DOUBLE_NUM_EXPONENT_BITS)
122 
123 #define DOUBLE_SIGNIFICAND_MASK     (0xfffffffffffffULL) // The lower 52 bits
124 #define DOUBLE_EXPONENT_MASK        (0x7ffULL << DOUBLE_EXPONENT_SHIFT) // 11 bits of exponent
125 #define DOUBLE_SIGN_MASK            (0x01ULL << DOUBLE_SIGN_SHIFT) // 1 bit of sign
126 #define DOUBLE_QUIET_NAN_BIT        (0x01ULL << (DOUBLE_NUM_SIGNIFICAND_BITS-1))
127 
128 
129 /* Biased      Biased   Unbiased  Use
130  * 0x00000000     0     -1023     0 and subnormal
131  * 0x00000001     1     -1022     Smallest normal exponent
132  * 0x000007fe  2046      1023     Largest normal exponent
133  * 0x000007ff  2047      1024     NaN and Infinity  */
134 #define DOUBLE_EXPONENT_BIAS        (1023)
135 #define DOUBLE_EXPONENT_MAX         (DOUBLE_EXPONENT_BIAS)
136 #define DOUBLE_EXPONENT_MIN         (-DOUBLE_EXPONENT_BIAS+1)
137 #define DOUBLE_EXPONENT_ZERO        (-DOUBLE_EXPONENT_BIAS)
138 #define DOUBLE_EXPONENT_INF_OR_NAN  (DOUBLE_EXPONENT_BIAS+1)
139 
140 
141 
142 
143 /*
144  * Convenient functions to avoid type punning, compiler warnings and
145  * such. The optimizer reduces them to a simple assignment. This is a
146  * crusty corner of C. It shouldn't be this hard.
147  *
148  * These are also in UsefulBuf.h under a different name. They are copied
149  * here to avoid a dependency on UsefulBuf.h. There is no object code
150  * size impact because these always optimze down to a simple assignment.
151  */
152 static inline uint32_t
CopyFloatToUint32(float f)153 CopyFloatToUint32(float f)
154 {
155    uint32_t u32;
156    memcpy(&u32, &f, sizeof(uint32_t));
157    return u32;
158 }
159 
160 static inline uint64_t
CopyDoubleToUint64(double d)161 CopyDoubleToUint64(double d)
162 {
163    uint64_t u64;
164    memcpy(&u64, &d, sizeof(uint64_t));
165    return u64;
166 }
167 
168 static inline double
CopyUint64ToDouble(uint64_t u64)169 CopyUint64ToDouble(uint64_t u64)
170 {
171    double d;
172    memcpy(&d, &u64, sizeof(uint64_t));
173    return d;
174 }
175 
176 static inline float
CopyUint32ToSingle(uint32_t u32)177 CopyUint32ToSingle(uint32_t u32)
178 {
179    float f;
180    memcpy(&f, &u32, sizeof(uint32_t));
181    return f;
182 }
183 
184 
185 
186 
187 /**
188  * @brief Assemble sign, significand and exponent into single precision float.
189  *
190  * @param[in] uDoubleSign              0 if positive, 1 if negative
191  * @pararm[in] uDoubleSignificand      Bits of the significand
192  * @param[in] nDoubleUnBiasedExponent  Exponent
193  *
194  * This returns the bits for a single-precision float, a binary64
195  * as specified in IEEE754.
196  */
197 static double
IEEE754_AssembleDouble(uint64_t uDoubleSign,uint64_t uDoubleSignificand,int64_t nDoubleUnBiasedExponent)198 IEEE754_AssembleDouble(uint64_t uDoubleSign,
199                        uint64_t uDoubleSignificand,
200                        int64_t  nDoubleUnBiasedExponent)
201 {
202    uint64_t uDoubleBiasedExponent;
203 
204    uDoubleBiasedExponent = (uint64_t)(nDoubleUnBiasedExponent + DOUBLE_EXPONENT_BIAS);
205 
206    return CopyUint64ToDouble(uDoubleSignificand |
207                              (uDoubleBiasedExponent << DOUBLE_EXPONENT_SHIFT) |
208                              (uDoubleSign << DOUBLE_SIGN_SHIFT));
209 }
210 
211 
212 double
IEEE754_HalfToDouble(uint16_t uHalfPrecision)213 IEEE754_HalfToDouble(uint16_t uHalfPrecision)
214 {
215    uint64_t uDoubleSignificand;
216    int64_t  nDoubleUnBiasedExponent;
217    double   dResult;
218 
219    /* Pull out the three parts of the half-precision float.  Do all
220     * the work in 64 bits because that is what the end result is.  It
221     * may give smaller code size and will keep static analyzers
222     * happier.
223     */
224    const uint64_t uHalfSignificand      = uHalfPrecision & HALF_SIGNIFICAND_MASK;
225    const uint64_t uHalfBiasedExponent   = (uHalfPrecision & HALF_EXPONENT_MASK) >> HALF_EXPONENT_SHIFT;
226    const int64_t  nHalfUnBiasedExponent = (int64_t)uHalfBiasedExponent - HALF_EXPONENT_BIAS;
227    const uint64_t uHalfSign             = (uHalfPrecision & HALF_SIGN_MASK) >> HALF_SIGN_SHIFT;
228 
229    if(nHalfUnBiasedExponent == HALF_EXPONENT_ZERO) {
230       /* 0 or subnormal */
231       if(uHalfSignificand) {
232          /* --- SUBNORMAL --- */
233          /* A half-precision subnormal can always be converted to a
234           * normal double-precision float because the ranges line up.
235           * The exponent of a subnormal starts out at the min exponent
236           * for a normal. As the sub normal significand bits are
237           * shifted, left to normalize, the exponent is
238           * decremented. Shifting continues until fully normalized.
239           */
240           nDoubleUnBiasedExponent = HALF_EXPONENT_MIN;
241           uDoubleSignificand      = uHalfSignificand;
242           do {
243              uDoubleSignificand <<= 1;
244              nDoubleUnBiasedExponent--;
245           } while ((uDoubleSignificand & (1ULL << HALF_NUM_SIGNIFICAND_BITS)) == 0);
246           /* A normal has an implied 1 in the most significant
247            * position that a subnormal doesn't. */
248           uDoubleSignificand -= 1ULL << HALF_NUM_SIGNIFICAND_BITS;
249           /* Must shift into place for a double significand */
250           uDoubleSignificand <<= DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS;
251 
252           dResult = IEEE754_AssembleDouble(uHalfSign,
253                                            uDoubleSignificand,
254                                            nDoubleUnBiasedExponent);
255       } else {
256          /* --- ZERO --- */
257          dResult = IEEE754_AssembleDouble(uHalfSign,
258                                           0,
259                                           DOUBLE_EXPONENT_ZERO);
260       }
261    } else if(nHalfUnBiasedExponent == HALF_EXPONENT_INF_OR_NAN) {
262       /* NaN or Inifinity */
263       if(uHalfSignificand) {
264          /* --- NaN --- */
265          /* Half-precision payloads always fit into double precision
266           * payloads. They are shifted left the same as a normal
267           * number significand.
268           */
269          uDoubleSignificand = uHalfSignificand << (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
270          dResult = IEEE754_AssembleDouble(uHalfSign,
271                                           uDoubleSignificand,
272                                           DOUBLE_EXPONENT_INF_OR_NAN);
273       } else {
274          /* --- INFINITY --- */
275          dResult = IEEE754_AssembleDouble(uHalfSign,
276                                           0,
277                                           DOUBLE_EXPONENT_INF_OR_NAN);
278       }
279    } else {
280       /* --- NORMAL NUMBER --- */
281       uDoubleSignificand = uHalfSignificand << (DOUBLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
282       dResult = IEEE754_AssembleDouble(uHalfSign,
283                                        uDoubleSignificand,
284                                        nHalfUnBiasedExponent);
285    }
286 
287    return dResult;
288 }
289 
290 
291 /**
292  * @brief Assemble sign, significand and exponent into single precision float.
293  *
294  * @param[in] uHalfSign              0 if positive, 1 if negative
295  * @pararm[in] uHalfSignificand      Bits of the significand
296  * @param[in] nHalfUnBiasedExponent  Exponent
297  *
298  * This returns the bits for a single-precision float, a binary32 as
299  * specified in IEEE754. It is returned as a uint64_t rather than a
300  * uint32_t or a float for convenience of usage.
301  */
302 static uint32_t
IEEE754_AssembleHalf(uint32_t uHalfSign,uint32_t uHalfSignificand,int32_t nHalfUnBiasedExponent)303 IEEE754_AssembleHalf(uint32_t uHalfSign,
304                      uint32_t uHalfSignificand,
305                      int32_t nHalfUnBiasedExponent)
306 {
307    uint32_t uHalfUnbiasedExponent;
308 
309    uHalfUnbiasedExponent = (uint32_t)(nHalfUnBiasedExponent + HALF_EXPONENT_BIAS);
310 
311    return uHalfSignificand |
312           (uHalfUnbiasedExponent << HALF_EXPONENT_SHIFT) |
313           (uHalfSign << HALF_SIGN_SHIFT);
314 }
315 
316 
317 /*  Public function; see ieee754.h */
318 IEEE754_union
IEEE754_SingleToHalf(float f)319 IEEE754_SingleToHalf(float f)
320 {
321    IEEE754_union result;
322    uint32_t      uDroppedBits;
323    int32_t       nExponentDifference;
324    int32_t       nShiftAmount;
325    uint32_t      uHalfSignificand;
326 
327    /* Pull the three parts out of the double-precision float Most work
328     * is done with uint32_t which helps avoid integer promotions and
329     * static analyzer complaints.
330     */
331    const uint32_t uSingle                 = CopyFloatToUint32(f);
332    const uint32_t uSingleBiasedExponent   = (uSingle & SINGLE_EXPONENT_MASK) >> SINGLE_EXPONENT_SHIFT;
333    const int32_t  nSingleUnbiasedExponent = (int32_t)uSingleBiasedExponent - SINGLE_EXPONENT_BIAS;
334    const uint32_t uSingleSignificand      = uSingle & SINGLE_SIGNIFICAND_MASK;
335    const uint32_t uSingleSign             = (uSingle & SINGLE_SIGN_MASK) >> SINGLE_SIGN_SHIFT;
336 
337    if(nSingleUnbiasedExponent == SINGLE_EXPONENT_ZERO) {
338       if(uSingleSignificand == 0) {
339          /* --- IS ZERO --- */
340          result.uSize  = IEEE754_UNION_IS_HALF;
341          result.uValue = IEEE754_AssembleHalf(uSingleSign,
342                                               0,
343                                               HALF_EXPONENT_ZERO);
344       } else {
345          /* --- IS SINGLE SUBNORMAL --- */
346          /* The largest single subnormal is slightly less than the
347           * largest single normal which is 2^-149 or
348           * 2.2040517676619426e-38.  The smallest half subnormal is
349           * 2^-14 or 5.9604644775390625E-8.  There is no overlap so
350           * single subnormals can't be converted to halfs of any sort.
351           */
352          result.uSize   = IEEE754_UNION_IS_SINGLE;
353          result.uValue  = uSingle;
354       }
355    } else if(nSingleUnbiasedExponent == SINGLE_EXPONENT_INF_OR_NAN) {
356       if(uSingleSignificand == 0) {
357          /* ---- IS INFINITY ---- */
358          result.uSize  = IEEE754_UNION_IS_HALF;
359          result.uValue = IEEE754_AssembleHalf(uSingleSign, 0, HALF_EXPONENT_INF_OR_NAN);
360       } else {
361          /* The NaN can only be converted if no payload bits are lost
362           * per RFC 8949 section 4.1 that defines Preferred
363           * Serializaton. Note that Deterministically Encode CBOR in
364           * section 4.2 allows for some variation of this rule, but at
365           * the moment this implementation is of Preferred
366           * Serialization, not CDE. As of December 2023, we are also
367           * expecting an update to CDE. This code may need to be
368           * updated for CDE.
369           */
370          uDroppedBits = uSingleSignificand & (SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS);
371          if(uDroppedBits == 0) {
372             /* --- IS CONVERTABLE NAN --- */
373             uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
374             result.uSize  = IEEE754_UNION_IS_HALF;
375             result.uValue = IEEE754_AssembleHalf(uSingleSign,
376                                                  uHalfSignificand,
377                                                  HALF_EXPONENT_INF_OR_NAN);
378 
379          } else {
380             /* --- IS UNCONVERTABLE NAN --- */
381             result.uSize   = IEEE754_UNION_IS_SINGLE;
382             result.uValue  = uSingle;
383          }
384       }
385    } else {
386       /* ---- REGULAR NUMBER ---- */
387       /* A regular single can be converted to a regular half if the
388        * single's exponent is in the smaller range of a half and if no
389        * precision is lost in the significand.
390        */
391       if(nSingleUnbiasedExponent >= HALF_EXPONENT_MIN &&
392          nSingleUnbiasedExponent <= HALF_EXPONENT_MAX &&
393         (uSingleSignificand & (SINGLE_SIGNIFICAND_MASK >> HALF_NUM_SIGNIFICAND_BITS)) == 0) {
394          uHalfSignificand = uSingleSignificand >> (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
395 
396          /* --- CONVERT TO HALF NORMAL --- */
397          result.uSize  = IEEE754_UNION_IS_HALF;
398          result.uValue = IEEE754_AssembleHalf(uSingleSign,
399                                               uHalfSignificand,
400                                               nSingleUnbiasedExponent);
401       } else {
402          /* Unable to convert to a half normal. See if it can be
403           * converted to a half subnormal. To do that, the exponent
404           * must be in range and no precision can be lost in the
405           * signficand.
406           *
407           * This is more complicated because the number is not
408           * normalized.  The signficand must be shifted proprotionally
409           * to the exponent and 1 must be added in.  See
410           * https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Exponent_encoding
411           *
412           * Exponents -14 to -24 map to a shift of 0 to 10 of the
413           * significand.  The largest value of a half subnormal has an
414           * exponent of -14. Subnormals are not normalized like
415           * normals meaning they lose precision as the numbers get
416           * smaller. Normals don't lose precision because the exponent
417           * allows all the bits of the significand to be significant.
418           */
419          /* The exponent of the largest possible half-precision
420           * subnormal is HALF_EXPONENT_MIN (-14).  Exponents larger
421           * than this are normal and handled above. We're going to
422           * shift the significand right by at least this amount.
423           */
424          nExponentDifference = -(nSingleUnbiasedExponent - HALF_EXPONENT_MIN);
425 
426          /* In addition to the shift based on the exponent's value,
427           * the single significand has to be shifted right to fit into
428           * a half-precision significand */
429          nShiftAmount = nExponentDifference + (SINGLE_NUM_SIGNIFICAND_BITS - HALF_NUM_SIGNIFICAND_BITS);
430 
431          /* Must add 1 in to the possible significand because there is
432           * an implied 1 for normal values and not for subnormal
433           * values. See equations here:
434           * https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Exponent_encoding
435           */
436          uHalfSignificand = (uSingleSignificand + (1 << SINGLE_NUM_SIGNIFICAND_BITS)) >> nShiftAmount;
437 
438          /* If only zero bits get shifted out, this can be converted
439           * to subnormal */
440          if(nSingleUnbiasedExponent < HALF_EXPONENT_MIN &&
441             nSingleUnbiasedExponent >= HALF_EXPONENT_MIN - HALF_NUM_SIGNIFICAND_BITS &&
442             uHalfSignificand << nShiftAmount == uSingleSignificand + (1 << SINGLE_NUM_SIGNIFICAND_BITS)) {
443             /* --- CONVERTABLE TO HALF SUBNORMAL --- */
444             result.uSize  = IEEE754_UNION_IS_HALF;
445             result.uValue = IEEE754_AssembleHalf(uSingleSign,
446                                                  uHalfSignificand,
447                                                  HALF_EXPONENT_ZERO);
448          } else {
449             /* --- DO NOT CONVERT --- */
450             result.uSize   = IEEE754_UNION_IS_SINGLE;
451             result.uValue  = uSingle;
452          }
453       }
454    }
455 
456    return result;
457 }
458 
459 
460 /**
461  * @brief Assemble sign, significand and exponent into single precision float.
462  *
463  * @param[in] uSingleSign              0 if positive, 1 if negative
464  * @pararm[in] uSingleSignificand      Bits of the significand
465  * @param[in] nSingleUnBiasedExponent  Exponent
466  *
467  * This returns the bits for a single-precision float, a binary32 as
468  * specified in IEEE754. It is returned as a uint64_t rather than a
469  * uint32_t or a float for convenience of usage.
470  */
471 static uint64_t
IEEE754_AssembleSingle(uint64_t uSingleSign,uint64_t uSingleSignificand,int64_t nSingleUnBiasedExponent)472 IEEE754_AssembleSingle(uint64_t uSingleSign,
473                        uint64_t uSingleSignificand,
474                        int64_t  nSingleUnBiasedExponent)
475 {
476    uint64_t uSingleBiasedExponent;
477 
478    uSingleBiasedExponent = (uint64_t)(nSingleUnBiasedExponent + SINGLE_EXPONENT_BIAS);
479 
480    return uSingleSignificand |
481           (uSingleBiasedExponent << SINGLE_EXPONENT_SHIFT) |
482           (uSingleSign << SINGLE_SIGN_SHIFT);
483 }
484 
485 
486 /**
487  * @brief Convert a double-precision float to single-precision.
488  *
489  * @param[in] d  The value to convert.
490  *
491  * @returns Either unconverted value or value converted to single-precision.
492  *
493  * This always succeeds. If the value cannot be converted without the
494  * loss of precision, it is not converted.
495  *
496  * This handles all subnormals and NaN payloads.
497  */
498 static IEEE754_union
IEEE754_DoubleToSingle(double d)499 IEEE754_DoubleToSingle(double d)
500 {
501    IEEE754_union Result;
502    int64_t       nExponentDifference;
503    int64_t       nShiftAmount;
504    uint64_t      uSingleSignificand;
505    uint64_t      uDroppedBits;
506 
507 
508    /* Pull the three parts out of the double-precision float. Most
509     * work is done with uint64_t which helps avoid integer promotions
510     * and static analyzer complaints.
511     */
512    const uint64_t uDouble                 = CopyDoubleToUint64(d);
513    const uint64_t uDoubleBiasedExponent   = (uDouble & DOUBLE_EXPONENT_MASK) >> DOUBLE_EXPONENT_SHIFT;
514    const int64_t  nDoubleUnbiasedExponent = (int64_t)uDoubleBiasedExponent - DOUBLE_EXPONENT_BIAS;
515    const uint64_t uDoubleSign             = (uDouble & DOUBLE_SIGN_MASK) >> DOUBLE_SIGN_SHIFT;
516    const uint64_t uDoubleSignificand      = uDouble & DOUBLE_SIGNIFICAND_MASK;
517 
518 
519     if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_ZERO) {
520         if(uDoubleSignificand == 0) {
521             /* --- IS ZERO --- */
522             Result.uSize  = IEEE754_UNION_IS_SINGLE;
523             Result.uValue = IEEE754_AssembleSingle(uDoubleSign,
524                                                    0,
525                                                    SINGLE_EXPONENT_ZERO);
526         } else {
527             /* --- IS DOUBLE SUBNORMAL --- */
528             /* The largest double subnormal is slightly less than the
529              * largest double normal which is 2^-1022 or
530              * 2.2250738585072014e-308.  The smallest single subnormal
531              * is 2^-149 or 1.401298464324817e-45.  There is no
532              * overlap so double subnormals can't be converted to
533              * singles of any sort.
534              */
535             Result.uSize   = IEEE754_UNION_IS_DOUBLE;
536             Result.uValue  = uDouble;
537          }
538     } else if(nDoubleUnbiasedExponent == DOUBLE_EXPONENT_INF_OR_NAN) {
539          if(uDoubleSignificand == 0) {
540              /* ---- IS INFINITY ---- */
541              Result.uSize  = IEEE754_UNION_IS_SINGLE;
542              Result.uValue = IEEE754_AssembleSingle(uDoubleSign,
543                                                     0,
544                                                     SINGLE_EXPONENT_INF_OR_NAN);
545          } else {
546              /* The NaN can only be converted if no payload bits are
547               * lost per RFC 8949 section 4.1 that defines Preferred
548               * Serializaton. Note that Deterministically Encode CBOR
549               * in section 4.2 allows for some variation of this rule,
550               * but at the moment this implementation is of Preferred
551               * Serialization, not CDE. As of December 2023, we are
552               * also expecting an update to CDE. This code may need to
553               * be updated for CDE.
554               */
555              uDroppedBits = uDoubleSignificand & (DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS);
556              if(uDroppedBits == 0) {
557                 /* --- IS CONVERTABLE NAN --- */
558                 uSingleSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS);
559                 Result.uSize  = IEEE754_UNION_IS_SINGLE;
560                 Result.uValue = IEEE754_AssembleSingle(uDoubleSign,
561                                                        uSingleSignificand,
562                                                        SINGLE_EXPONENT_INF_OR_NAN);
563             } else {
564                /* --- IS UNCONVERTABLE NAN --- */
565                Result.uSize   = IEEE754_UNION_IS_DOUBLE;
566                Result.uValue  = uDouble;
567             }
568          }
569     } else {
570         /* ---- REGULAR NUMBER ---- */
571         /* A regular double can be converted to a regular single if
572          * the double's exponent is in the smaller range of a single
573          * and if no precision is lost in the significand.
574          */
575         uDroppedBits = uDoubleSignificand & (DOUBLE_SIGNIFICAND_MASK >> SINGLE_NUM_SIGNIFICAND_BITS);
576         if(nDoubleUnbiasedExponent >= SINGLE_EXPONENT_MIN &&
577            nDoubleUnbiasedExponent <= SINGLE_EXPONENT_MAX &&
578            uDroppedBits == 0) {
579             /* --- IS CONVERTABLE TO SINGLE --- */
580             uSingleSignificand = uDoubleSignificand >> (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS);
581             Result.uSize  = IEEE754_UNION_IS_SINGLE;
582             Result.uValue = IEEE754_AssembleSingle(uDoubleSign,
583                                                    uSingleSignificand,
584                                                    nDoubleUnbiasedExponent);
585         } else {
586             /* Unable to convert to a single normal. See if it can be
587              * converted to a single subnormal. To do that, the
588              * exponent must be in range and no precision can be lost
589              * in the signficand.
590              *
591              * This is more complicated because the number is not
592              * normalized.  The signficand must be shifted
593              * proprotionally to the exponent and 1 must be added
594              * in. See
595              * https://en.wikipedia.org/wiki/Single-precision_floating-point_format#Exponent_encoding
596              */
597             nExponentDifference = -(nDoubleUnbiasedExponent - SINGLE_EXPONENT_MIN);
598             nShiftAmount        = nExponentDifference + (DOUBLE_NUM_SIGNIFICAND_BITS - SINGLE_NUM_SIGNIFICAND_BITS);
599             uSingleSignificand  = (uDoubleSignificand + (1ULL << DOUBLE_NUM_SIGNIFICAND_BITS)) >> nShiftAmount;
600 
601             if(nDoubleUnbiasedExponent < SINGLE_EXPONENT_MIN &&
602                nDoubleUnbiasedExponent >= SINGLE_EXPONENT_MIN - SINGLE_NUM_SIGNIFICAND_BITS &&
603                uSingleSignificand << nShiftAmount == uDoubleSignificand + (1ULL << DOUBLE_NUM_SIGNIFICAND_BITS)) {
604                /* --- IS CONVERTABLE TO SINGLE SUBNORMAL --- */
605                Result.uSize  = IEEE754_UNION_IS_SINGLE;
606                Result.uValue = IEEE754_AssembleSingle(uDoubleSign,
607                                                       uSingleSignificand,
608                                                       SINGLE_EXPONENT_ZERO);
609             } else {
610                /* --- CAN NOT BE CONVERTED --- */
611                Result.uSize   = IEEE754_UNION_IS_DOUBLE;
612                Result.uValue  = uDouble;
613             }
614         }
615     }
616 
617     return Result;
618 }
619 
620 
621 /* Public function; see ieee754.h */
622 IEEE754_union
IEEE754_DoubleToSmaller(double d,int bAllowHalfPrecision)623 IEEE754_DoubleToSmaller(double d, int bAllowHalfPrecision)
624 {
625    IEEE754_union result;
626 
627    result = IEEE754_DoubleToSingle(d);
628 
629    if(result.uSize == IEEE754_UNION_IS_SINGLE && bAllowHalfPrecision) {
630       /* Cast to uint32_t is OK, because value was just successfully
631        * converted to single. */
632       float uSingle = CopyUint32ToSingle((uint32_t)result.uValue);
633       result = IEEE754_SingleToHalf(uSingle);
634    }
635 
636    return result;
637 }
638 
639 
640 #else /* QCBOR_DISABLE_PREFERRED_FLOAT */
641 
642 int ieee754_dummy_place_holder;
643 
644 #endif /* QCBOR_DISABLE_PREFERRED_FLOAT */
645