// Copyright 2018 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include <zircon/compiler.h>
#include <zircon/types.h>

#pragma once

__BEGIN_CDECLS

// Flags which control UTF conversion behavior.
//
// ++ PRESERVE_UNPAIRED_SURROGATES
// By default, when unpaired surrogates are encountered in a UTF16 stream,
// they will be replaced with the codepoint for the Unicode replacement character
// (U+FFFD).  When the PRESERVE_UNPAIRED_SURROGATES flag is passed, however, the
// value of the unpaired surrogate will be encoded directly as a codepoint.
//
// Note that while the presence of unpaired surrogates is technically a
// violation of the Unicode UTF16 encoding specification, apparently many
// producers of UTF16 encoded strings in the world today have chosen to allow
// this.
//
// This implementation considers the following to be unpaired surrogates.
// ++ A "high" surrogate [0xD800, 0xDBFF] which is not followed by a "low"
//    surrogate.
// ++ A "low" surrogate [0xDC00, 0xDFFF] which is not preceded by a "high"
//    surrogate.
//
// ++ FORCE_LITTLE_ENDIAN
// ++ FORCE_BIG_ENDIAN
// By default, the conversion process will look for a byte-order-marker (code
// unit 0xFEFF) in order to determine the endianness of the UTF16 source
// string.  If no byte-order-marker is detected, host endianness will be
// assumed.
//
// Users may override this behavior by passing one of the force endian flags.
// The indicated endianness will be assumed, regardless of whether or not a byte
// order marker is found.  It is illegal to attempt to force both big and little
// endian encoding at the same time.  Attempts to do so will result in
// byte-order-marker detection being applied.
42 // 43 // ++ DISCARD_BOM 44 // By default, a byte order marker detected in a UTF16 encoded string will be 45 // encoded in the UTF8 output. Users may change this behavior and cause the BOM 46 // to be discarded instead of encoded by passing the DISCARD_BOM flag. 47 #define UTF_CONVERT_FLAG_PRESERVE_UNPAIRED_SURROGATES ((uint32_t)0x01) 48 #define UTF_CONVERT_FLAG_FORCE_LITTLE_ENDIAN ((uint32_t)0x02) 49 #define UTF_CONVERT_FLAG_FORCE_BIG_ENDIAN ((uint32_t)0x04) 50 #define UTF_CONVERT_FLAG_DISCARD_BOM ((uint32_t)0x08) 51 52 // Attempt to convert a UTF16 string to UTF8 using either an explicitly 53 // specified (utf16le_*, utf16be_*) or an unspecified endianness (utf16_*) 54 // 55 // src : a pointer to the source string, encoded using UTF16 56 // src_len : The number of code units (uint16_t) in the source to process. 57 // dst : a pointer to the buffer which will hold the null terminated result 58 // of the conversion. 59 // dst_len : A pointer to the length of of the destination buffer (in bytes). 60 // Afterwards, this parameter will be updated to indicate the total 61 // number of bytes it would take to hold a null terminated 62 // representation of the UTF8 string, even if there was not enough 63 // room in the destination buffer to perform a full conversion. 64 // flags : Flags which control the conversion process. See above. 65 // 66 // Note: Embedded nulls within the source will be processed and encoded. *No* 67 // null termination of the destination buffer will be performed by default. 68 #if __cplusplus 69 zx_status_t utf16_to_utf8(const uint16_t* src, size_t src_len, 70 uint8_t* dst, size_t* dst_len, 71 uint32_t flags = 0); 72 #else 73 zx_status_t utf16_to_utf8(const uint16_t* src, size_t src_len, 74 uint8_t* dst, size_t* dst_len, 75 uint32_t flags); 76 #endif 77 78 __END_CDECLS 79