Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 1 | // Copyright (c) 2018 The Chromium Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
| 5 | #include "base/strings/utf_string_conversions.h" |
| 6 | |
| 7 | #include <stdint.h> |
| 8 | |
Brett Wilson | ad9e442 | 2019-09-07 13:33:06 -0700 | [diff] [blame] | 9 | #include <string_view> |
| 10 | |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 11 | #include "base/strings/string_util.h" |
| 12 | #include "base/strings/utf_string_conversion_utils.h" |
| 13 | #include "base/third_party/icu/icu_utf.h" |
Scott Graham | 76a8dc7 | 2018-06-18 13:37:29 -0700 | [diff] [blame] | 14 | #include "util/build_config.h" |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 15 | |
| 16 | namespace base { |
| 17 | |
| 18 | namespace { |
| 19 | |
// Code point written to the output in place of any input that fails
// validation (U+FFFD). Every conversion routine in this file substitutes it
// and reports failure through its boolean result.
constexpr int32_t kErrorCodePoint{0xFFFD};
| 21 | |
| 22 | // Size coefficient ---------------------------------------------------------- |
| 23 | // The maximum number of codeunits in the destination encoding corresponding to |
| 24 | // one codeunit in the source encoding. |
| 25 | |
// Primary template: converting from a narrower code unit to a wider one.
// ASCII symbols are encoded by one codeunit in all encodings, and a wider
// destination never needs more than one code unit per source code unit, so
// the coefficient is 1. Any other direction must provide a specialization;
// the static_assert rejects it otherwise.
template <typename SrcChar, typename DestChar>
struct SizeCoefficient : std::integral_constant<int, 1> {
  static_assert(sizeof(SrcChar) < sizeof(DestChar),
                "Default case: from a smaller encoding to the bigger one");
};

// UTF-16 -> UTF-8: one UTF-16 codeunit corresponds to at most 3 codeunits in
// UTF-8.
template <>
struct SizeCoefficient<char16_t, char> : std::integral_constant<int, 3> {};

// Convenience accessor. The character types are decayed first so that
// cv-qualified or reference `value_type`s select the same coefficient.
template <typename SrcChar, typename DestChar>
constexpr int size_coefficient_v =
    SizeCoefficient<std::decay_t<SrcChar>, std::decay_t<DestChar>>::value;
| 44 | |
| 45 | // UnicodeAppendUnsafe -------------------------------------------------------- |
| 46 | // Function overloads that write code_point to the output string. Output string |
| 47 | // has to have enough space for the codepoint. |
| 48 | |
| 49 | void UnicodeAppendUnsafe(char* out, int32_t* size, uint32_t code_point) { |
| 50 | CBU8_APPEND_UNSAFE(out, *size, code_point); |
| 51 | } |
| 52 | |
Brett Wilson | ad9e442 | 2019-09-07 13:33:06 -0700 | [diff] [blame] | 53 | void UnicodeAppendUnsafe(char16_t* out, int32_t* size, uint32_t code_point) { |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 54 | CBU16_APPEND_UNSAFE(out, *size, code_point); |
| 55 | } |
| 56 | |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 57 | // DoUTFConversion ------------------------------------------------------------ |
| 58 | // Main driver of UTFConversion specialized for different Src encodings. |
| 59 | // dest has to have enough room for the converted text. |
| 60 | |
| 61 | template <typename DestChar> |
| 62 | bool DoUTFConversion(const char* src, |
| 63 | int32_t src_len, |
| 64 | DestChar* dest, |
| 65 | int32_t* dest_len) { |
| 66 | bool success = true; |
| 67 | |
| 68 | for (int32_t i = 0; i < src_len;) { |
| 69 | int32_t code_point; |
| 70 | CBU8_NEXT(src, i, src_len, code_point); |
| 71 | |
| 72 | if (!IsValidCodepoint(code_point)) { |
| 73 | success = false; |
| 74 | code_point = kErrorCodePoint; |
| 75 | } |
| 76 | |
| 77 | UnicodeAppendUnsafe(dest, dest_len, code_point); |
| 78 | } |
| 79 | |
| 80 | return success; |
| 81 | } |
| 82 | |
| 83 | template <typename DestChar> |
Brett Wilson | ad9e442 | 2019-09-07 13:33:06 -0700 | [diff] [blame] | 84 | bool DoUTFConversion(const char16_t* src, |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 85 | int32_t src_len, |
| 86 | DestChar* dest, |
| 87 | int32_t* dest_len) { |
| 88 | bool success = true; |
| 89 | |
Brett Wilson | ad9e442 | 2019-09-07 13:33:06 -0700 | [diff] [blame] | 90 | auto ConvertSingleChar = [&success](char16_t in) -> int32_t { |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 91 | if (!CBU16_IS_SINGLE(in) || !IsValidCodepoint(in)) { |
| 92 | success = false; |
| 93 | return kErrorCodePoint; |
| 94 | } |
| 95 | return in; |
| 96 | }; |
| 97 | |
| 98 | int32_t i = 0; |
| 99 | |
| 100 | // Always have another symbol in order to avoid checking boundaries in the |
| 101 | // middle of the surrogate pair. |
| 102 | while (i < src_len - 1) { |
| 103 | int32_t code_point; |
| 104 | |
| 105 | if (CBU16_IS_LEAD(src[i]) && CBU16_IS_TRAIL(src[i + 1])) { |
| 106 | code_point = CBU16_GET_SUPPLEMENTARY(src[i], src[i + 1]); |
| 107 | if (!IsValidCodepoint(code_point)) { |
| 108 | code_point = kErrorCodePoint; |
| 109 | success = false; |
| 110 | } |
| 111 | i += 2; |
| 112 | } else { |
| 113 | code_point = ConvertSingleChar(src[i]); |
| 114 | ++i; |
| 115 | } |
| 116 | |
| 117 | UnicodeAppendUnsafe(dest, dest_len, code_point); |
| 118 | } |
| 119 | |
| 120 | if (i < src_len) |
| 121 | UnicodeAppendUnsafe(dest, dest_len, ConvertSingleChar(src[i])); |
| 122 | |
| 123 | return success; |
| 124 | } |
| 125 | |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 126 | // UTFConversion -------------------------------------------------------------- |
| 127 | // Function template for generating all UTF conversions. |
| 128 | |
| 129 | template <typename InputString, typename DestString> |
| 130 | bool UTFConversion(const InputString& src_str, DestString* dest_str) { |
| 131 | if (IsStringASCII(src_str)) { |
| 132 | dest_str->assign(src_str.begin(), src_str.end()); |
| 133 | return true; |
| 134 | } |
| 135 | |
| 136 | dest_str->resize(src_str.length() * |
| 137 | size_coefficient_v<typename InputString::value_type, |
| 138 | typename DestString::value_type>); |
| 139 | |
| 140 | // Empty string is ASCII => it OK to call operator[]. |
| 141 | auto* dest = &(*dest_str)[0]; |
| 142 | |
| 143 | // ICU requires 32 bit numbers. |
| 144 | int32_t src_len32 = static_cast<int32_t>(src_str.length()); |
| 145 | int32_t dest_len32 = 0; |
| 146 | |
| 147 | bool res = DoUTFConversion(src_str.data(), src_len32, dest, &dest_len32); |
| 148 | |
| 149 | dest_str->resize(dest_len32); |
| 150 | dest_str->shrink_to_fit(); |
| 151 | |
| 152 | return res; |
| 153 | } |
| 154 | |
| 155 | } // namespace |
| 156 | |
| 157 | // UTF16 <-> UTF8 -------------------------------------------------------------- |
| 158 | |
Brett Wilson | ad9e442 | 2019-09-07 13:33:06 -0700 | [diff] [blame] | 159 | bool UTF8ToUTF16(const char* src, size_t src_len, std::u16string* output) { |
| 160 | return UTFConversion(std::string_view(src, src_len), output); |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 161 | } |
| 162 | |
Brett Wilson | ad9e442 | 2019-09-07 13:33:06 -0700 | [diff] [blame] | 163 | std::u16string UTF8ToUTF16(std::string_view utf8) { |
| 164 | std::u16string ret; |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 165 | // Ignore the success flag of this call, it will do the best it can for |
| 166 | // invalid input, which is what we want here. |
| 167 | UTF8ToUTF16(utf8.data(), utf8.size(), &ret); |
| 168 | return ret; |
| 169 | } |
| 170 | |
Brett Wilson | ad9e442 | 2019-09-07 13:33:06 -0700 | [diff] [blame] | 171 | bool UTF16ToUTF8(const char16_t* src, size_t src_len, std::string* output) { |
| 172 | return UTFConversion(std::u16string_view(src, src_len), output); |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 173 | } |
| 174 | |
Brett Wilson | ad9e442 | 2019-09-07 13:33:06 -0700 | [diff] [blame] | 175 | std::string UTF16ToUTF8(std::u16string_view utf16) { |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 176 | std::string ret; |
| 177 | // Ignore the success flag of this call, it will do the best it can for |
| 178 | // invalid input, which is what we want here. |
| 179 | UTF16ToUTF8(utf16.data(), utf16.length(), &ret); |
| 180 | return ret; |
| 181 | } |
| 182 | |
Brett Wilson | 102cdd4 | 2019-09-06 09:41:18 -0700 | [diff] [blame] | 183 | // ASCII <-> UTF-16 ----------------------------------------------------------- |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 184 | |
Brett Wilson | ad9e442 | 2019-09-07 13:33:06 -0700 | [diff] [blame] | 185 | std::u16string ASCIIToUTF16(std::string_view ascii) { |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 186 | DCHECK(IsStringASCII(ascii)) << ascii; |
Brett Wilson | ad9e442 | 2019-09-07 13:33:06 -0700 | [diff] [blame] | 187 | return std::u16string(ascii.begin(), ascii.end()); |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 188 | } |
| 189 | |
Brett Wilson | ad9e442 | 2019-09-07 13:33:06 -0700 | [diff] [blame] | 190 | std::string UTF16ToASCII(std::u16string_view utf16) { |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 191 | DCHECK(IsStringASCII(utf16)) << UTF16ToUTF8(utf16); |
| 192 | return std::string(utf16.begin(), utf16.end()); |
| 193 | } |
| 194 | |
| 195 | } // namespace base |