1 // Copyright 2018 The Fuchsia Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include <zircon/compiler.h>
6 #include <zircon/types.h>
7 
8 #pragma once
9 
10 __BEGIN_CDECLS
11 
12 // Flags which control UTF conversion behavior.
13 //
14 // ++ PRESERVE_UNPAIRED_SURROGATES
// By default, when unpaired surrogates are encountered in a UTF16 stream, they
// will be replaced with the codepoint for the Unicode replacement character
// (U+FFFD).  When the PRESERVE_UNPAIRED_SURROGATES flag is passed, however, the
// value of the unpaired surrogate will be encoded directly as a codepoint.
19 //
// Note that while the presence of unpaired surrogates is technically a
// violation of the Unicode UTF16 encoding specification, apparently there are
// many UTF16 encoded strings in the world today which contain them anyway.
23 //
24 // This implementation considers the following to be unpaired surrogates.
25 // ++ A "high" surrogate [0xD800, 0xDBFF] which is not followed by a "low"
26 //    surrogate.
27 // ++ A "low" surrogate [0xDC00, 0xDFFF] which is not preceded by a "high"
28 //    surrogate.
29 //
30 // ++ FORCE_LITTLE_ENDIAN
31 // ++ FORCE_BIG_ENDIAN
32 // By default, the conversion process will look for a byte-order-marker (code
33 // unit 0xFEFF) in order to determine the endianness of the UTF16 source
34 // string.  If no byte-order-marker is detected, host endianness will be
35 // assumed.
36 //
// Users may override this behavior by passing one of the force endian flags.
// The indicated endianness will be assumed, regardless of whether or not a byte
// order marker is found.  It is illegal to attempt to force both big and little
// endian encoding at the same time.  Attempts to do so will result in
// byte-order-marker detection being applied.
42 //
43 // ++ DISCARD_BOM
44 // By default, a byte order marker detected in a UTF16 encoded string will be
45 // encoded in the UTF8 output.  Users may change this behavior and cause the BOM
46 // to be discarded instead of encoded by passing the DISCARD_BOM flag.
// Conversion flag bits.  Each flag occupies its own bit of a uint32_t, so
// several flags can be combined with bitwise OR.
#define UTF_CONVERT_FLAG_PRESERVE_UNPAIRED_SURROGATES   ((uint32_t)(1u << 0))
#define UTF_CONVERT_FLAG_FORCE_LITTLE_ENDIAN            ((uint32_t)(1u << 1))
#define UTF_CONVERT_FLAG_FORCE_BIG_ENDIAN               ((uint32_t)(1u << 2))
#define UTF_CONVERT_FLAG_DISCARD_BOM                    ((uint32_t)(1u << 3))
51 
52 // Attempt to convert a UTF16 string to UTF8 using either an explicitly
53 // specified (utf16le_*, utf16be_*) or an unspecified endianness (utf16_*)
54 //
55 // src     : a pointer to the source string, encoded using UTF16
56 // src_len : The number of code units (uint16_t) in the source to process.
57 // dst     : a pointer to the buffer which will hold the null terminated result
58 //           of the conversion.
59 // dst_len : A pointer to the length of of the destination buffer (in bytes).
60 //           Afterwards, this parameter will be updated to indicate the total
61 //           number of bytes it would take to hold a null terminated
62 //           representation of the UTF8 string, even if there was not enough
63 //           room in the destination buffer to perform a full conversion.
64 // flags   : Flags which control the conversion process.  See above.
65 //
66 // Note:  Embedded nulls within the source will be processed and encoded.  *No*
67 // null termination of the destination buffer will be performed by default.
68 #if __cplusplus
69 zx_status_t utf16_to_utf8(const uint16_t* src, size_t src_len,
70                           uint8_t* dst, size_t* dst_len,
71                           uint32_t flags = 0);
72 #else
73 zx_status_t utf16_to_utf8(const uint16_t* src, size_t src_len,
74                           uint8_t* dst, size_t* dst_len,
75                           uint32_t flags);
76 #endif
77 
78 __END_CDECLS
79