blob: 4b4e4208f87a89d19b4c9895aedca411025f1fb0 [file] [log] [blame]
// Copyright (c) 2018 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4
5#include "base/strings/utf_string_conversions.h"
6
7#include <stdint.h>
8
Brett Wilsonad9e4422019-09-07 13:33:06 -07009#include <string_view>
10
Scott Graham66962112018-06-08 12:42:08 -070011#include "base/strings/string_util.h"
12#include "base/strings/utf_string_conversion_utils.h"
13#include "base/third_party/icu/icu_utf.h"
Scott Graham76a8dc72018-06-18 13:37:29 -070014#include "util/build_config.h"
Scott Graham66962112018-06-08 12:42:08 -070015
16namespace base {
17
18namespace {
19
// Code point written to the output in place of any input that fails
// validation during conversion.
constexpr int32_t kErrorCodePoint{0xFFFD};
21
// Size coefficient ----------------------------------------------------------
// The maximum number of code units in the destination encoding corresponding
// to one code unit in the source encoding.
25
// Primary template: covers conversions from a narrower code unit type to a
// wider one. Any other direction must provide an explicit specialization,
// which the static_assert below enforces at instantiation time.
template <typename SrcChar, typename DestChar>
struct SizeCoefficient {
  // Worst-case count of destination code units produced per source code unit.
  // ASCII symbols take exactly one code unit in every encoding.
  static constexpr int value{1};

  static_assert(sizeof(DestChar) > sizeof(SrcChar),
                "Default case: from a smaller encoding to the bigger one");
};
34
35template <>
Brett Wilsonad9e4422019-09-07 13:33:06 -070036struct SizeCoefficient<char16_t, char> {
Scott Graham66962112018-06-08 12:42:08 -070037 // One UTF-16 codeunit corresponds to at most 3 codeunits in UTF-8.
38 static constexpr int value = 3;
39};
40
Scott Graham66962112018-06-08 12:42:08 -070041template <typename SrcChar, typename DestChar>
42constexpr int size_coefficient_v =
43 SizeCoefficient<std::decay_t<SrcChar>, std::decay_t<DestChar>>::value;
44
// UnicodeAppendUnsafe --------------------------------------------------------
// Function overloads that write code_point to the output string. The output
// string has to have enough space for the code point.
48
49void UnicodeAppendUnsafe(char* out, int32_t* size, uint32_t code_point) {
50 CBU8_APPEND_UNSAFE(out, *size, code_point);
51}
52
Brett Wilsonad9e4422019-09-07 13:33:06 -070053void UnicodeAppendUnsafe(char16_t* out, int32_t* size, uint32_t code_point) {
Scott Graham66962112018-06-08 12:42:08 -070054 CBU16_APPEND_UNSAFE(out, *size, code_point);
55}
56
// DoUTFConversion ------------------------------------------------------------
// Main driver of UTFConversion, overloaded for the different source encodings.
// dest has to have enough room for the converted text.
60
61template <typename DestChar>
62bool DoUTFConversion(const char* src,
63 int32_t src_len,
64 DestChar* dest,
65 int32_t* dest_len) {
66 bool success = true;
67
68 for (int32_t i = 0; i < src_len;) {
69 int32_t code_point;
70 CBU8_NEXT(src, i, src_len, code_point);
71
72 if (!IsValidCodepoint(code_point)) {
73 success = false;
74 code_point = kErrorCodePoint;
75 }
76
77 UnicodeAppendUnsafe(dest, dest_len, code_point);
78 }
79
80 return success;
81}
82
83template <typename DestChar>
Brett Wilsonad9e4422019-09-07 13:33:06 -070084bool DoUTFConversion(const char16_t* src,
Scott Graham66962112018-06-08 12:42:08 -070085 int32_t src_len,
86 DestChar* dest,
87 int32_t* dest_len) {
88 bool success = true;
89
Brett Wilsonad9e4422019-09-07 13:33:06 -070090 auto ConvertSingleChar = [&success](char16_t in) -> int32_t {
Scott Graham66962112018-06-08 12:42:08 -070091 if (!CBU16_IS_SINGLE(in) || !IsValidCodepoint(in)) {
92 success = false;
93 return kErrorCodePoint;
94 }
95 return in;
96 };
97
98 int32_t i = 0;
99
100 // Always have another symbol in order to avoid checking boundaries in the
101 // middle of the surrogate pair.
102 while (i < src_len - 1) {
103 int32_t code_point;
104
105 if (CBU16_IS_LEAD(src[i]) && CBU16_IS_TRAIL(src[i + 1])) {
106 code_point = CBU16_GET_SUPPLEMENTARY(src[i], src[i + 1]);
107 if (!IsValidCodepoint(code_point)) {
108 code_point = kErrorCodePoint;
109 success = false;
110 }
111 i += 2;
112 } else {
113 code_point = ConvertSingleChar(src[i]);
114 ++i;
115 }
116
117 UnicodeAppendUnsafe(dest, dest_len, code_point);
118 }
119
120 if (i < src_len)
121 UnicodeAppendUnsafe(dest, dest_len, ConvertSingleChar(src[i]));
122
123 return success;
124}
125
// UTFConversion --------------------------------------------------------------
// Function template for generating all UTF conversions.
128
129template <typename InputString, typename DestString>
130bool UTFConversion(const InputString& src_str, DestString* dest_str) {
131 if (IsStringASCII(src_str)) {
132 dest_str->assign(src_str.begin(), src_str.end());
133 return true;
134 }
135
136 dest_str->resize(src_str.length() *
137 size_coefficient_v<typename InputString::value_type,
138 typename DestString::value_type>);
139
140 // Empty string is ASCII => it OK to call operator[].
141 auto* dest = &(*dest_str)[0];
142
143 // ICU requires 32 bit numbers.
144 int32_t src_len32 = static_cast<int32_t>(src_str.length());
145 int32_t dest_len32 = 0;
146
147 bool res = DoUTFConversion(src_str.data(), src_len32, dest, &dest_len32);
148
149 dest_str->resize(dest_len32);
150 dest_str->shrink_to_fit();
151
152 return res;
153}
154
155} // namespace
156
// UTF-16 <-> UTF-8 -----------------------------------------------------------
158
Brett Wilsonad9e4422019-09-07 13:33:06 -0700159bool UTF8ToUTF16(const char* src, size_t src_len, std::u16string* output) {
160 return UTFConversion(std::string_view(src, src_len), output);
Scott Graham66962112018-06-08 12:42:08 -0700161}
162
Brett Wilsonad9e4422019-09-07 13:33:06 -0700163std::u16string UTF8ToUTF16(std::string_view utf8) {
164 std::u16string ret;
Scott Graham66962112018-06-08 12:42:08 -0700165 // Ignore the success flag of this call, it will do the best it can for
166 // invalid input, which is what we want here.
167 UTF8ToUTF16(utf8.data(), utf8.size(), &ret);
168 return ret;
169}
170
Brett Wilsonad9e4422019-09-07 13:33:06 -0700171bool UTF16ToUTF8(const char16_t* src, size_t src_len, std::string* output) {
172 return UTFConversion(std::u16string_view(src, src_len), output);
Scott Graham66962112018-06-08 12:42:08 -0700173}
174
Brett Wilsonad9e4422019-09-07 13:33:06 -0700175std::string UTF16ToUTF8(std::u16string_view utf16) {
Scott Graham66962112018-06-08 12:42:08 -0700176 std::string ret;
177 // Ignore the success flag of this call, it will do the best it can for
178 // invalid input, which is what we want here.
179 UTF16ToUTF8(utf16.data(), utf16.length(), &ret);
180 return ret;
181}
182
// ASCII <-> UTF-16 -----------------------------------------------------------
Scott Graham66962112018-06-08 12:42:08 -0700184
Brett Wilsonad9e4422019-09-07 13:33:06 -0700185std::u16string ASCIIToUTF16(std::string_view ascii) {
Scott Graham66962112018-06-08 12:42:08 -0700186 DCHECK(IsStringASCII(ascii)) << ascii;
Brett Wilsonad9e4422019-09-07 13:33:06 -0700187 return std::u16string(ascii.begin(), ascii.end());
Scott Graham66962112018-06-08 12:42:08 -0700188}
189
Brett Wilsonad9e4422019-09-07 13:33:06 -0700190std::string UTF16ToASCII(std::u16string_view utf16) {
Scott Graham66962112018-06-08 12:42:08 -0700191 DCHECK(IsStringASCII(utf16)) << UTF16ToUTF8(utf16);
192 return std::string(utf16.begin(), utf16.end());
193}
194
195} // namespace base