// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
| 4 | |
| 5 | #include "base/json/json_parser.h" |
| 6 | |
| 7 | #include <cmath> |
Brett Wilson | ad9e442 | 2019-09-07 13:33:06 -0700 | [diff] [blame] | 8 | #include <string_view> |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 9 | #include <utility> |
| 10 | #include <vector> |
| 11 | |
| 12 | #include "base/logging.h" |
| 13 | #include "base/macros.h" |
| 14 | #include "base/numerics/safe_conversions.h" |
| 15 | #include "base/strings/string_number_conversions.h" |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 16 | #include "base/strings/string_util.h" |
| 17 | #include "base/strings/stringprintf.h" |
| 18 | #include "base/strings/utf_string_conversion_utils.h" |
| 19 | #include "base/strings/utf_string_conversions.h" |
| 20 | #include "base/third_party/icu/icu_utf.h" |
| 21 | #include "base/values.h" |
| 22 | |
| 23 | namespace base { |
| 24 | namespace internal { |
| 25 | |
| 26 | namespace { |
| 27 | |
// First code point past 7-bit ASCII. StringBuilder::Append() keeps a string
// in place only while every appended code point is below this value.
constexpr int32_t kExtendedASCIIStart = 0x80;
| 29 | |
| 30 | // Simple class that checks for maximum recursion/"stack overflow." |
| 31 | class StackMarker { |
| 32 | public: |
| 33 | StackMarker(int max_depth, int* depth) |
| 34 | : max_depth_(max_depth), depth_(depth) { |
| 35 | ++(*depth_); |
| 36 | DCHECK_LE(*depth_, max_depth_); |
| 37 | } |
Scott Graham | 98cd3ca | 2018-06-14 22:26:55 -0700 | [diff] [blame] | 38 | ~StackMarker() { --(*depth_); } |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 39 | |
| 40 | bool IsTooDeep() const { return *depth_ >= max_depth_; } |
| 41 | |
| 42 | private: |
| 43 | const int max_depth_; |
| 44 | int* const depth_; |
| 45 | |
| 46 | DISALLOW_COPY_AND_ASSIGN(StackMarker); |
| 47 | }; |
| 48 | |
// Code point of the Unicode replacement character (U+FFFD), appended in place
// of input that cannot be represented.
constexpr uint32_t kUnicodeReplacementPoint{0xFFFD};
| 50 | |
| 51 | } // namespace |
| 52 | |
// UTF-8 encoding (three bytes plus the terminating NUL) of U+FFFD, the
// Unicode replacement character.
const char kUnicodeReplacementString[] = "\xEF\xBF\xBD";
| 55 | |
| 56 | JSONParser::JSONParser(int options, int max_depth) |
| 57 | : options_(options), |
| 58 | max_depth_(max_depth), |
| 59 | index_(0), |
| 60 | stack_depth_(0), |
| 61 | line_number_(0), |
| 62 | index_last_line_(0), |
| 63 | error_code_(JSONReader::JSON_NO_ERROR), |
| 64 | error_line_(0), |
| 65 | error_column_(0) { |
| 66 | CHECK_LE(max_depth, JSONReader::kStackMaxDepth); |
| 67 | } |
| 68 | |
| 69 | JSONParser::~JSONParser() = default; |
| 70 | |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 71 | std::optional<Value> JSONParser::Parse(std::string_view input) { |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 72 | input_ = input; |
| 73 | index_ = 0; |
| 74 | line_number_ = 1; |
| 75 | index_last_line_ = 0; |
| 76 | |
| 77 | error_code_ = JSONReader::JSON_NO_ERROR; |
| 78 | error_line_ = 0; |
| 79 | error_column_ = 0; |
| 80 | |
| 81 | // ICU and ReadUnicodeCharacter() use int32_t for lengths, so ensure |
| 82 | // that the index_ will not overflow when parsing. |
| 83 | if (!base::IsValueInRangeForNumericType<int32_t>(input.length())) { |
| 84 | ReportError(JSONReader::JSON_TOO_LARGE, 0); |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 85 | return std::nullopt; |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 86 | } |
| 87 | |
| 88 | // When the input JSON string starts with a UTF-8 Byte-Order-Mark, |
| 89 | // advance the start position to avoid the ParseNextToken function mis- |
| 90 | // treating a Unicode BOM as an invalid character and returning NULL. |
| 91 | ConsumeIfMatch("\xEF\xBB\xBF"); |
| 92 | |
| 93 | // Parse the first and any nested tokens. |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 94 | std::optional<Value> root(ParseNextToken()); |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 95 | if (!root) |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 96 | return std::nullopt; |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 97 | |
| 98 | // Make sure the input stream is at an end. |
| 99 | if (GetNextToken() != T_END_OF_INPUT) { |
| 100 | ReportError(JSONReader::JSON_UNEXPECTED_DATA_AFTER_ROOT, 1); |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 101 | return std::nullopt; |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 102 | } |
| 103 | |
| 104 | return root; |
| 105 | } |
| 106 | |
| 107 | JSONReader::JsonParseError JSONParser::error_code() const { |
| 108 | return error_code_; |
| 109 | } |
| 110 | |
| 111 | std::string JSONParser::GetErrorMessage() const { |
| 112 | return FormatErrorMessage(error_line_, error_column_, |
Scott Graham | 98cd3ca | 2018-06-14 22:26:55 -0700 | [diff] [blame] | 113 | JSONReader::ErrorCodeToString(error_code_)); |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 114 | } |
| 115 | |
| 116 | int JSONParser::error_line() const { |
| 117 | return error_line_; |
| 118 | } |
| 119 | |
| 120 | int JSONParser::error_column() const { |
| 121 | return error_column_; |
| 122 | } |
| 123 | |
// StringBuilder ///////////////////////////////////////////////////////////////
| 125 | |
| 126 | JSONParser::StringBuilder::StringBuilder() : StringBuilder(nullptr) {} |
| 127 | |
| 128 | JSONParser::StringBuilder::StringBuilder(const char* pos) |
| 129 | : pos_(pos), length_(0) {} |
| 130 | |
| 131 | JSONParser::StringBuilder::~StringBuilder() = default; |
| 132 | |
| 133 | JSONParser::StringBuilder& JSONParser::StringBuilder::operator=( |
| 134 | StringBuilder&& other) = default; |
| 135 | |
| 136 | void JSONParser::StringBuilder::Append(uint32_t point) { |
| 137 | DCHECK(IsValidCharacter(point)); |
| 138 | |
| 139 | if (point < kExtendedASCIIStart && !string_) { |
| 140 | DCHECK_EQ(static_cast<char>(point), pos_[length_]); |
| 141 | ++length_; |
| 142 | } else { |
| 143 | Convert(); |
| 144 | if (UNLIKELY(point == kUnicodeReplacementPoint)) { |
| 145 | string_->append(kUnicodeReplacementString); |
| 146 | } else { |
| 147 | WriteUnicodeCharacter(point, &*string_); |
| 148 | } |
| 149 | } |
| 150 | } |
| 151 | |
| 152 | void JSONParser::StringBuilder::Convert() { |
| 153 | if (string_) |
| 154 | return; |
| 155 | string_.emplace(pos_, length_); |
| 156 | } |
| 157 | |
| 158 | std::string JSONParser::StringBuilder::DestructiveAsString() { |
| 159 | if (string_) |
| 160 | return std::move(*string_); |
| 161 | return std::string(pos_, length_); |
| 162 | } |
| 163 | |
// JSONParser private //////////////////////////////////////////////////////////
| 165 | |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 166 | std::optional<std::string_view> JSONParser::PeekChars(int count) { |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 167 | if (static_cast<size_t>(index_) + count > input_.length()) |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 168 | return std::nullopt; |
Brett Wilson | ad9e442 | 2019-09-07 13:33:06 -0700 | [diff] [blame] | 169 | // Using std::string_view::substr() is significantly slower (according to |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 170 | // base_perftests) than constructing a substring manually. |
Brett Wilson | ad9e442 | 2019-09-07 13:33:06 -0700 | [diff] [blame] | 171 | return std::string_view(input_.data() + index_, count); |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 172 | } |
| 173 | |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 174 | std::optional<char> JSONParser::PeekChar() { |
| 175 | std::optional<std::string_view> chars = PeekChars(1); |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 176 | if (chars) |
| 177 | return (*chars)[0]; |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 178 | return std::nullopt; |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 179 | } |
| 180 | |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 181 | std::optional<std::string_view> JSONParser::ConsumeChars(int count) { |
| 182 | std::optional<std::string_view> chars = PeekChars(count); |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 183 | if (chars) |
| 184 | index_ += count; |
| 185 | return chars; |
| 186 | } |
| 187 | |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 188 | std::optional<char> JSONParser::ConsumeChar() { |
| 189 | std::optional<std::string_view> chars = ConsumeChars(1); |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 190 | if (chars) |
| 191 | return (*chars)[0]; |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 192 | return std::nullopt; |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 193 | } |
| 194 | |
| 195 | const char* JSONParser::pos() { |
| 196 | CHECK_LE(static_cast<size_t>(index_), input_.length()); |
| 197 | return input_.data() + index_; |
| 198 | } |
| 199 | |
| 200 | JSONParser::Token JSONParser::GetNextToken() { |
| 201 | EatWhitespaceAndComments(); |
| 202 | |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 203 | std::optional<char> c = PeekChar(); |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 204 | if (!c) |
| 205 | return T_END_OF_INPUT; |
| 206 | |
| 207 | switch (*c) { |
| 208 | case '{': |
| 209 | return T_OBJECT_BEGIN; |
| 210 | case '}': |
| 211 | return T_OBJECT_END; |
| 212 | case '[': |
| 213 | return T_ARRAY_BEGIN; |
| 214 | case ']': |
| 215 | return T_ARRAY_END; |
| 216 | case '"': |
| 217 | return T_STRING; |
| 218 | case '0': |
| 219 | case '1': |
| 220 | case '2': |
| 221 | case '3': |
| 222 | case '4': |
| 223 | case '5': |
| 224 | case '6': |
| 225 | case '7': |
| 226 | case '8': |
| 227 | case '9': |
| 228 | case '-': |
| 229 | return T_NUMBER; |
| 230 | case 't': |
| 231 | return T_BOOL_TRUE; |
| 232 | case 'f': |
| 233 | return T_BOOL_FALSE; |
| 234 | case 'n': |
| 235 | return T_NULL; |
| 236 | case ',': |
| 237 | return T_LIST_SEPARATOR; |
| 238 | case ':': |
| 239 | return T_OBJECT_PAIR_SEPARATOR; |
| 240 | default: |
| 241 | return T_INVALID_TOKEN; |
| 242 | } |
| 243 | } |
| 244 | |
| 245 | void JSONParser::EatWhitespaceAndComments() { |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 246 | while (std::optional<char> c = PeekChar()) { |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 247 | switch (*c) { |
| 248 | case '\r': |
| 249 | case '\n': |
| 250 | index_last_line_ = index_; |
| 251 | // Don't increment line_number_ twice for "\r\n". |
| 252 | if (!(c == '\n' && index_ > 0 && input_[index_ - 1] == '\r')) { |
| 253 | ++line_number_; |
| 254 | } |
| 255 | FALLTHROUGH; |
| 256 | case ' ': |
| 257 | case '\t': |
| 258 | ConsumeChar(); |
| 259 | break; |
| 260 | case '/': |
| 261 | if (!EatComment()) |
| 262 | return; |
| 263 | break; |
| 264 | default: |
| 265 | return; |
| 266 | } |
| 267 | } |
| 268 | } |
| 269 | |
| 270 | bool JSONParser::EatComment() { |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 271 | std::optional<std::string_view> comment_start = ConsumeChars(2); |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 272 | if (!comment_start) |
| 273 | return false; |
| 274 | |
| 275 | if (comment_start == "//") { |
| 276 | // Single line comment, read to newline. |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 277 | while (std::optional<char> c = PeekChar()) { |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 278 | if (c == '\n' || c == '\r') |
| 279 | return true; |
| 280 | ConsumeChar(); |
| 281 | } |
| 282 | } else if (comment_start == "/*") { |
| 283 | char previous_char = '\0'; |
| 284 | // Block comment, read until end marker. |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 285 | while (std::optional<char> c = PeekChar()) { |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 286 | if (previous_char == '*' && c == '/') { |
| 287 | // EatWhitespaceAndComments will inspect pos(), which will still be on |
| 288 | // the last / of the comment, so advance once more (which may also be |
| 289 | // end of input). |
| 290 | ConsumeChar(); |
| 291 | return true; |
| 292 | } |
| 293 | previous_char = *ConsumeChar(); |
| 294 | } |
| 295 | |
| 296 | // If the comment is unterminated, GetNextToken will report T_END_OF_INPUT. |
| 297 | } |
| 298 | |
| 299 | return false; |
| 300 | } |
| 301 | |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 302 | std::optional<Value> JSONParser::ParseNextToken() { |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 303 | return ParseToken(GetNextToken()); |
| 304 | } |
| 305 | |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 306 | std::optional<Value> JSONParser::ParseToken(Token token) { |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 307 | switch (token) { |
| 308 | case T_OBJECT_BEGIN: |
| 309 | return ConsumeDictionary(); |
| 310 | case T_ARRAY_BEGIN: |
| 311 | return ConsumeList(); |
| 312 | case T_STRING: |
| 313 | return ConsumeString(); |
| 314 | case T_NUMBER: |
| 315 | return ConsumeNumber(); |
| 316 | case T_BOOL_TRUE: |
| 317 | case T_BOOL_FALSE: |
| 318 | case T_NULL: |
| 319 | return ConsumeLiteral(); |
| 320 | default: |
| 321 | ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1); |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 322 | return std::nullopt; |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 323 | } |
| 324 | } |
| 325 | |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 326 | std::optional<Value> JSONParser::ConsumeDictionary() { |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 327 | if (ConsumeChar() != '{') { |
| 328 | ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1); |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 329 | return std::nullopt; |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 330 | } |
| 331 | |
| 332 | StackMarker depth_check(max_depth_, &stack_depth_); |
| 333 | if (depth_check.IsTooDeep()) { |
| 334 | ReportError(JSONReader::JSON_TOO_MUCH_NESTING, 0); |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 335 | return std::nullopt; |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 336 | } |
| 337 | |
| 338 | std::vector<Value::DictStorage::value_type> dict_storage; |
| 339 | |
| 340 | Token token = GetNextToken(); |
| 341 | while (token != T_OBJECT_END) { |
| 342 | if (token != T_STRING) { |
| 343 | ReportError(JSONReader::JSON_UNQUOTED_DICTIONARY_KEY, 1); |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 344 | return std::nullopt; |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 345 | } |
| 346 | |
| 347 | // First consume the key. |
| 348 | StringBuilder key; |
| 349 | if (!ConsumeStringRaw(&key)) { |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 350 | return std::nullopt; |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 351 | } |
| 352 | |
| 353 | // Read the separator. |
| 354 | token = GetNextToken(); |
| 355 | if (token != T_OBJECT_PAIR_SEPARATOR) { |
| 356 | ReportError(JSONReader::JSON_SYNTAX_ERROR, 1); |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 357 | return std::nullopt; |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 358 | } |
| 359 | |
| 360 | // The next token is the value. Ownership transfers to |dict|. |
| 361 | ConsumeChar(); |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 362 | std::optional<Value> value = ParseNextToken(); |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 363 | if (!value) { |
| 364 | // ReportError from deeper level. |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 365 | return std::nullopt; |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 366 | } |
| 367 | |
| 368 | dict_storage.emplace_back(key.DestructiveAsString(), |
| 369 | std::make_unique<Value>(std::move(*value))); |
| 370 | |
| 371 | token = GetNextToken(); |
| 372 | if (token == T_LIST_SEPARATOR) { |
| 373 | ConsumeChar(); |
| 374 | token = GetNextToken(); |
| 375 | if (token == T_OBJECT_END && !(options_ & JSON_ALLOW_TRAILING_COMMAS)) { |
| 376 | ReportError(JSONReader::JSON_TRAILING_COMMA, 1); |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 377 | return std::nullopt; |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 378 | } |
| 379 | } else if (token != T_OBJECT_END) { |
| 380 | ReportError(JSONReader::JSON_SYNTAX_ERROR, 0); |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 381 | return std::nullopt; |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 382 | } |
| 383 | } |
| 384 | |
| 385 | ConsumeChar(); // Closing '}'. |
| 386 | |
| 387 | return Value(Value::DictStorage(std::move(dict_storage), KEEP_LAST_OF_DUPES)); |
| 388 | } |
| 389 | |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 390 | std::optional<Value> JSONParser::ConsumeList() { |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 391 | if (ConsumeChar() != '[') { |
| 392 | ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1); |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 393 | return std::nullopt; |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 394 | } |
| 395 | |
| 396 | StackMarker depth_check(max_depth_, &stack_depth_); |
| 397 | if (depth_check.IsTooDeep()) { |
| 398 | ReportError(JSONReader::JSON_TOO_MUCH_NESTING, 0); |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 399 | return std::nullopt; |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 400 | } |
| 401 | |
| 402 | Value::ListStorage list_storage; |
| 403 | |
| 404 | Token token = GetNextToken(); |
| 405 | while (token != T_ARRAY_END) { |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 406 | std::optional<Value> item = ParseToken(token); |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 407 | if (!item) { |
| 408 | // ReportError from deeper level. |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 409 | return std::nullopt; |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 410 | } |
| 411 | |
| 412 | list_storage.push_back(std::move(*item)); |
| 413 | |
| 414 | token = GetNextToken(); |
| 415 | if (token == T_LIST_SEPARATOR) { |
| 416 | ConsumeChar(); |
| 417 | token = GetNextToken(); |
| 418 | if (token == T_ARRAY_END && !(options_ & JSON_ALLOW_TRAILING_COMMAS)) { |
| 419 | ReportError(JSONReader::JSON_TRAILING_COMMA, 1); |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 420 | return std::nullopt; |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 421 | } |
| 422 | } else if (token != T_ARRAY_END) { |
| 423 | ReportError(JSONReader::JSON_SYNTAX_ERROR, 1); |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 424 | return std::nullopt; |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 425 | } |
| 426 | } |
| 427 | |
| 428 | ConsumeChar(); // Closing ']'. |
| 429 | |
| 430 | return Value(std::move(list_storage)); |
| 431 | } |
| 432 | |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 433 | std::optional<Value> JSONParser::ConsumeString() { |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 434 | StringBuilder string; |
| 435 | if (!ConsumeStringRaw(&string)) |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 436 | return std::nullopt; |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 437 | |
| 438 | return Value(string.DestructiveAsString()); |
| 439 | } |
| 440 | |
| 441 | bool JSONParser::ConsumeStringRaw(StringBuilder* out) { |
| 442 | if (ConsumeChar() != '"') { |
| 443 | ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1); |
| 444 | return false; |
| 445 | } |
| 446 | |
Brett Wilson | ad9e442 | 2019-09-07 13:33:06 -0700 | [diff] [blame] | 447 | // StringBuilder will internally build a std::string_view unless a UTF-16 |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 448 | // conversion occurs, at which point it will perform a copy into a |
| 449 | // std::string. |
| 450 | StringBuilder string(pos()); |
| 451 | |
| 452 | while (PeekChar()) { |
| 453 | uint32_t next_char = 0; |
| 454 | if (!ReadUnicodeCharacter(input_.data(), |
Scott Graham | 98cd3ca | 2018-06-14 22:26:55 -0700 | [diff] [blame] | 455 | static_cast<int32_t>(input_.length()), &index_, |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 456 | &next_char) || |
| 457 | !IsValidCharacter(next_char)) { |
| 458 | if ((options_ & JSON_REPLACE_INVALID_CHARACTERS) == 0) { |
| 459 | ReportError(JSONReader::JSON_UNSUPPORTED_ENCODING, 1); |
| 460 | return false; |
| 461 | } |
| 462 | ConsumeChar(); |
| 463 | string.Append(kUnicodeReplacementPoint); |
| 464 | continue; |
| 465 | } |
| 466 | |
| 467 | if (next_char == '"') { |
| 468 | ConsumeChar(); |
| 469 | *out = std::move(string); |
| 470 | return true; |
| 471 | } else if (next_char != '\\') { |
| 472 | // If this character is not an escape sequence... |
| 473 | ConsumeChar(); |
| 474 | string.Append(next_char); |
| 475 | } else { |
| 476 | // And if it is an escape sequence, the input string will be adjusted |
| 477 | // (either by combining the two characters of an encoded escape sequence, |
Brett Wilson | ad9e442 | 2019-09-07 13:33:06 -0700 | [diff] [blame] | 478 | // or with a UTF conversion), so using std::string_view isn't possible -- |
| 479 | // force a conversion. |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 480 | string.Convert(); |
| 481 | |
| 482 | // Read past the escape '\' and ensure there's a character following. |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 483 | std::optional<std::string_view> escape_sequence = ConsumeChars(2); |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 484 | if (!escape_sequence) { |
| 485 | ReportError(JSONReader::JSON_INVALID_ESCAPE, 0); |
| 486 | return false; |
| 487 | } |
| 488 | |
| 489 | switch ((*escape_sequence)[1]) { |
| 490 | // Allowed esape sequences: |
| 491 | case 'x': { // UTF-8 sequence. |
| 492 | // UTF-8 \x escape sequences are not allowed in the spec, but they |
| 493 | // are supported here for backwards-compatiblity with the old parser. |
| 494 | escape_sequence = ConsumeChars(2); |
| 495 | if (!escape_sequence) { |
| 496 | ReportError(JSONReader::JSON_INVALID_ESCAPE, -2); |
| 497 | return false; |
| 498 | } |
| 499 | |
| 500 | int hex_digit = 0; |
| 501 | if (!HexStringToInt(*escape_sequence, &hex_digit) || |
| 502 | !IsValidCharacter(hex_digit)) { |
| 503 | ReportError(JSONReader::JSON_INVALID_ESCAPE, -2); |
| 504 | return false; |
| 505 | } |
| 506 | |
| 507 | string.Append(hex_digit); |
| 508 | break; |
| 509 | } |
| 510 | case 'u': { // UTF-16 sequence. |
| 511 | // UTF units are of the form \uXXXX. |
| 512 | uint32_t code_point; |
| 513 | if (!DecodeUTF16(&code_point)) { |
| 514 | ReportError(JSONReader::JSON_INVALID_ESCAPE, 0); |
| 515 | return false; |
| 516 | } |
| 517 | string.Append(code_point); |
| 518 | break; |
| 519 | } |
| 520 | case '"': |
| 521 | string.Append('"'); |
| 522 | break; |
| 523 | case '\\': |
| 524 | string.Append('\\'); |
| 525 | break; |
| 526 | case '/': |
| 527 | string.Append('/'); |
| 528 | break; |
| 529 | case 'b': |
| 530 | string.Append('\b'); |
| 531 | break; |
| 532 | case 'f': |
| 533 | string.Append('\f'); |
| 534 | break; |
| 535 | case 'n': |
| 536 | string.Append('\n'); |
| 537 | break; |
| 538 | case 'r': |
| 539 | string.Append('\r'); |
| 540 | break; |
| 541 | case 't': |
| 542 | string.Append('\t'); |
| 543 | break; |
| 544 | case 'v': // Not listed as valid escape sequence in the RFC. |
| 545 | string.Append('\v'); |
| 546 | break; |
| 547 | // All other escape squences are illegal. |
| 548 | default: |
| 549 | ReportError(JSONReader::JSON_INVALID_ESCAPE, 0); |
| 550 | return false; |
| 551 | } |
| 552 | } |
| 553 | } |
| 554 | |
| 555 | ReportError(JSONReader::JSON_SYNTAX_ERROR, 0); |
| 556 | return false; |
| 557 | } |
| 558 | |
| 559 | // Entry is at the first X in \uXXXX. |
| 560 | bool JSONParser::DecodeUTF16(uint32_t* out_code_point) { |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 561 | std::optional<std::string_view> escape_sequence = ConsumeChars(4); |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 562 | if (!escape_sequence) |
| 563 | return false; |
| 564 | |
| 565 | // Consume the UTF-16 code unit, which may be a high surrogate. |
| 566 | int code_unit16_high = 0; |
| 567 | if (!HexStringToInt(*escape_sequence, &code_unit16_high)) |
| 568 | return false; |
| 569 | |
| 570 | // If this is a high surrogate, consume the next code unit to get the |
| 571 | // low surrogate. |
| 572 | if (CBU16_IS_SURROGATE(code_unit16_high)) { |
| 573 | // Make sure this is the high surrogate. If not, it's an encoding |
| 574 | // error. |
| 575 | if (!CBU16_IS_SURROGATE_LEAD(code_unit16_high)) |
| 576 | return false; |
| 577 | |
| 578 | // Make sure that the token has more characters to consume the |
| 579 | // lower surrogate. |
| 580 | if (!ConsumeIfMatch("\\u")) |
| 581 | return false; |
| 582 | |
| 583 | escape_sequence = ConsumeChars(4); |
| 584 | if (!escape_sequence) |
| 585 | return false; |
| 586 | |
| 587 | int code_unit16_low = 0; |
| 588 | if (!HexStringToInt(*escape_sequence, &code_unit16_low)) |
| 589 | return false; |
| 590 | |
| 591 | if (!CBU16_IS_TRAIL(code_unit16_low)) |
| 592 | return false; |
| 593 | |
| 594 | uint32_t code_point = |
| 595 | CBU16_GET_SUPPLEMENTARY(code_unit16_high, code_unit16_low); |
| 596 | if (!IsValidCharacter(code_point)) |
| 597 | return false; |
| 598 | |
| 599 | *out_code_point = code_point; |
| 600 | } else { |
| 601 | // Not a surrogate. |
| 602 | DCHECK(CBU16_IS_SINGLE(code_unit16_high)); |
| 603 | if (!IsValidCharacter(code_unit16_high)) { |
| 604 | if ((options_ & JSON_REPLACE_INVALID_CHARACTERS) == 0) { |
| 605 | return false; |
| 606 | } |
| 607 | *out_code_point = kUnicodeReplacementPoint; |
| 608 | return true; |
| 609 | } |
| 610 | |
| 611 | *out_code_point = code_unit16_high; |
| 612 | } |
| 613 | |
| 614 | return true; |
| 615 | } |
| 616 | |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 617 | std::optional<Value> JSONParser::ConsumeNumber() { |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 618 | const char* num_start = pos(); |
| 619 | const int start_index = index_; |
| 620 | int end_index = start_index; |
| 621 | |
| 622 | if (PeekChar() == '-') |
| 623 | ConsumeChar(); |
| 624 | |
| 625 | if (!ReadInt(false)) { |
| 626 | ReportError(JSONReader::JSON_SYNTAX_ERROR, 1); |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 627 | return std::nullopt; |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 628 | } |
| 629 | end_index = index_; |
| 630 | |
| 631 | // The optional fraction part. |
| 632 | if (PeekChar() == '.') { |
| 633 | ConsumeChar(); |
| 634 | if (!ReadInt(true)) { |
| 635 | ReportError(JSONReader::JSON_SYNTAX_ERROR, 1); |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 636 | return std::nullopt; |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 637 | } |
| 638 | end_index = index_; |
| 639 | } |
| 640 | |
| 641 | // Optional exponent part. |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 642 | std::optional<char> c = PeekChar(); |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 643 | if (c == 'e' || c == 'E') { |
| 644 | ConsumeChar(); |
| 645 | if (PeekChar() == '-' || PeekChar() == '+') { |
| 646 | ConsumeChar(); |
| 647 | } |
| 648 | if (!ReadInt(true)) { |
| 649 | ReportError(JSONReader::JSON_SYNTAX_ERROR, 1); |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 650 | return std::nullopt; |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 651 | } |
| 652 | end_index = index_; |
| 653 | } |
| 654 | |
| 655 | // ReadInt is greedy because numbers have no easily detectable sentinel, |
| 656 | // so save off where the parser should be on exit (see Consume invariant at |
| 657 | // the top of the header), then make sure the next token is one which is |
| 658 | // valid. |
| 659 | int exit_index = index_; |
| 660 | |
| 661 | switch (GetNextToken()) { |
| 662 | case T_OBJECT_END: |
| 663 | case T_ARRAY_END: |
| 664 | case T_LIST_SEPARATOR: |
| 665 | case T_END_OF_INPUT: |
| 666 | break; |
| 667 | default: |
| 668 | ReportError(JSONReader::JSON_SYNTAX_ERROR, 1); |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 669 | return std::nullopt; |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 670 | } |
| 671 | |
| 672 | index_ = exit_index; |
| 673 | |
Brett Wilson | ad9e442 | 2019-09-07 13:33:06 -0700 | [diff] [blame] | 674 | std::string_view num_string(num_start, end_index - start_index); |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 675 | |
| 676 | int num_int; |
| 677 | if (StringToInt(num_string, &num_int)) |
| 678 | return Value(num_int); |
| 679 | |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 680 | return std::nullopt; |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 681 | } |
| 682 | |
| 683 | bool JSONParser::ReadInt(bool allow_leading_zeros) { |
| 684 | size_t len = 0; |
| 685 | char first = 0; |
| 686 | |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 687 | while (std::optional<char> c = PeekChar()) { |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 688 | if (!IsAsciiDigit(c)) |
| 689 | break; |
| 690 | |
| 691 | if (len == 0) |
| 692 | first = *c; |
| 693 | |
| 694 | ++len; |
| 695 | ConsumeChar(); |
| 696 | } |
| 697 | |
| 698 | if (len == 0) |
| 699 | return false; |
| 700 | |
| 701 | if (!allow_leading_zeros && len > 1 && first == '0') |
| 702 | return false; |
| 703 | |
| 704 | return true; |
| 705 | } |
| 706 | |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 707 | std::optional<Value> JSONParser::ConsumeLiteral() { |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 708 | if (ConsumeIfMatch("true")) { |
| 709 | return Value(true); |
| 710 | } else if (ConsumeIfMatch("false")) { |
| 711 | return Value(false); |
| 712 | } else if (ConsumeIfMatch("null")) { |
| 713 | return Value(Value::Type::NONE); |
| 714 | } else { |
| 715 | ReportError(JSONReader::JSON_SYNTAX_ERROR, 1); |
Brett Wilson | 572ba24 | 2019-09-09 16:32:59 -0700 | [diff] [blame] | 716 | return std::nullopt; |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 717 | } |
| 718 | } |
| 719 | |
Brett Wilson | ad9e442 | 2019-09-07 13:33:06 -0700 | [diff] [blame] | 720 | bool JSONParser::ConsumeIfMatch(std::string_view match) { |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 721 | if (match == PeekChars(match.size())) { |
| 722 | ConsumeChars(match.size()); |
| 723 | return true; |
| 724 | } |
| 725 | return false; |
| 726 | } |
| 727 | |
| 728 | void JSONParser::ReportError(JSONReader::JsonParseError code, |
| 729 | int column_adjust) { |
| 730 | error_code_ = code; |
| 731 | error_line_ = line_number_; |
| 732 | error_column_ = index_ - index_last_line_ + column_adjust; |
| 733 | } |
| 734 | |
| 735 | // static |
Scott Graham | 98cd3ca | 2018-06-14 22:26:55 -0700 | [diff] [blame] | 736 | std::string JSONParser::FormatErrorMessage(int line, |
| 737 | int column, |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 738 | const std::string& description) { |
| 739 | if (line || column) { |
Scott Graham | 98cd3ca | 2018-06-14 22:26:55 -0700 | [diff] [blame] | 740 | return StringPrintf("Line: %i, column: %i, %s", line, column, |
| 741 | description.c_str()); |
Scott Graham | 6696211 | 2018-06-08 12:42:08 -0700 | [diff] [blame] | 742 | } |
| 743 | return description; |
| 744 | } |
| 745 | |
| 746 | } // namespace internal |
| 747 | } // namespace base |