| // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "base/json/json_parser.h" |
| |
| #include <cmath> |
| #include <string_view> |
| #include <utility> |
| #include <vector> |
| |
| #include "base/logging.h" |
| #include "base/macros.h" |
| #include "base/numerics/safe_conversions.h" |
| #include "base/strings/string_number_conversions.h" |
| #include "base/strings/string_util.h" |
| #include "base/strings/stringprintf.h" |
| #include "base/strings/utf_string_conversion_utils.h" |
| #include "base/strings/utf_string_conversions.h" |
| #include "base/third_party/icu/icu_utf.h" |
| #include "base/values.h" |
| |
| namespace base { |
| namespace internal { |
| |
| namespace { |
| |
// First code point outside the 7-bit ASCII range. StringBuilder::Append only
// extends its in-place view for code points below this value.
constexpr int32_t kExtendedASCIIStart = 0x80;
| |
| // Simple class that checks for maximum recursion/"stack overflow." |
| class StackMarker { |
| public: |
| StackMarker(int max_depth, int* depth) |
| : max_depth_(max_depth), depth_(depth) { |
| ++(*depth_); |
| DCHECK_LE(*depth_, max_depth_); |
| } |
| ~StackMarker() { --(*depth_); } |
| |
| bool IsTooDeep() const { return *depth_ >= max_depth_; } |
| |
| private: |
| const int max_depth_; |
| int* const depth_; |
| |
| DISALLOW_COPY_AND_ASSIGN(StackMarker); |
| }; |
| |
// Code point U+FFFD. The string parser appends it in place of input that
// cannot be decoded when JSON_REPLACE_INVALID_CHARACTERS is set.
constexpr uint32_t kUnicodeReplacementPoint = 0xFFFD;
| |
| } // namespace |
| |
// UTF-8 encoding of U+FFFD, the Unicode replacement character.
const char kUnicodeReplacementString[] = "\xEF\xBF\xBD";
| |
| JSONParser::JSONParser(int options, int max_depth) |
| : options_(options), |
| max_depth_(max_depth), |
| index_(0), |
| stack_depth_(0), |
| line_number_(0), |
| index_last_line_(0), |
| error_code_(JSONReader::JSON_NO_ERROR), |
| error_line_(0), |
| error_column_(0) { |
| CHECK_LE(max_depth, JSONReader::kStackMaxDepth); |
| } |
| |
// Nothing to release explicitly; the compiler-generated destructor is used.
JSONParser::~JSONParser() = default;
| |
| std::optional<Value> JSONParser::Parse(std::string_view input) { |
| input_ = input; |
| index_ = 0; |
| line_number_ = 1; |
| index_last_line_ = 0; |
| |
| error_code_ = JSONReader::JSON_NO_ERROR; |
| error_line_ = 0; |
| error_column_ = 0; |
| |
| // ICU and ReadUnicodeCharacter() use int32_t for lengths, so ensure |
| // that the index_ will not overflow when parsing. |
| if (!base::IsValueInRangeForNumericType<int32_t>(input.length())) { |
| ReportError(JSONReader::JSON_TOO_LARGE, 0); |
| return std::nullopt; |
| } |
| |
| // When the input JSON string starts with a UTF-8 Byte-Order-Mark, |
| // advance the start position to avoid the ParseNextToken function mis- |
| // treating a Unicode BOM as an invalid character and returning NULL. |
| ConsumeIfMatch("\xEF\xBB\xBF"); |
| |
| // Parse the first and any nested tokens. |
| std::optional<Value> root(ParseNextToken()); |
| if (!root) |
| return std::nullopt; |
| |
| // Make sure the input stream is at an end. |
| if (GetNextToken() != T_END_OF_INPUT) { |
| ReportError(JSONReader::JSON_UNEXPECTED_DATA_AFTER_ROOT, 1); |
| return std::nullopt; |
| } |
| |
| return root; |
| } |
| |
// Returns the error code stored by the most recent ReportError() call, or
// JSONReader::JSON_NO_ERROR if none has been recorded since Parse() reset it.
JSONReader::JsonParseError JSONParser::error_code() const {
  return error_code_;
}
| |
| std::string JSONParser::GetErrorMessage() const { |
| return FormatErrorMessage(error_line_, error_column_, |
| JSONReader::ErrorCodeToString(error_code_)); |
| } |
| |
// Returns the line number captured by the most recent ReportError() call
// (0 after Parse() resets the error state).
int JSONParser::error_line() const {
  return error_line_;
}
| |
// Returns the column captured by the most recent ReportError() call
// (0 after Parse() resets the error state).
int JSONParser::error_column() const {
  return error_column_;
}
| |
| // StringBuilder /////////////////////////////////////////////////////////////// |
| |
// Creates an empty builder by delegating to the pointer constructor with a
// null start position.
JSONParser::StringBuilder::StringBuilder() : StringBuilder(nullptr) {}
| |
// Creates a builder whose in-place view starts at |pos| and currently spans
// zero characters.
JSONParser::StringBuilder::StringBuilder(const char* pos)
    : pos_(pos), length_(0) {}
| |
// Compiler-generated destruction of the members is sufficient.
JSONParser::StringBuilder::~StringBuilder() = default;
| |
// Member-wise move assignment. ConsumeStringRaw() uses it to hand its locally
// built string to the caller-provided builder.
JSONParser::StringBuilder& JSONParser::StringBuilder::operator=(
    StringBuilder&& other) = default;
| |
| void JSONParser::StringBuilder::Append(uint32_t point) { |
| DCHECK(IsValidCharacter(point)); |
| |
| if (point < kExtendedASCIIStart && !string_) { |
| DCHECK_EQ(static_cast<char>(point), pos_[length_]); |
| ++length_; |
| } else { |
| Convert(); |
| if (UNLIKELY(point == kUnicodeReplacementPoint)) { |
| string_->append(kUnicodeReplacementString); |
| } else { |
| WriteUnicodeCharacter(point, &*string_); |
| } |
| } |
| } |
| |
| void JSONParser::StringBuilder::Convert() { |
| if (string_) |
| return; |
| string_.emplace(pos_, length_); |
| } |
| |
| std::string JSONParser::StringBuilder::DestructiveAsString() { |
| if (string_) |
| return std::move(*string_); |
| return std::string(pos_, length_); |
| } |
| |
| // JSONParser private ////////////////////////////////////////////////////////// |
| |
| std::optional<std::string_view> JSONParser::PeekChars(int count) { |
| if (static_cast<size_t>(index_) + count > input_.length()) |
| return std::nullopt; |
| // Using std::string_view::substr() is significantly slower (according to |
| // base_perftests) than constructing a substring manually. |
| return std::string_view(input_.data() + index_, count); |
| } |
| |
| std::optional<char> JSONParser::PeekChar() { |
| std::optional<std::string_view> chars = PeekChars(1); |
| if (chars) |
| return (*chars)[0]; |
| return std::nullopt; |
| } |
| |
| std::optional<std::string_view> JSONParser::ConsumeChars(int count) { |
| std::optional<std::string_view> chars = PeekChars(count); |
| if (chars) |
| index_ += count; |
| return chars; |
| } |
| |
| std::optional<char> JSONParser::ConsumeChar() { |
| std::optional<std::string_view> chars = ConsumeChars(1); |
| if (chars) |
| return (*chars)[0]; |
| return std::nullopt; |
| } |
| |
| const char* JSONParser::pos() { |
| CHECK_LE(static_cast<size_t>(index_), input_.length()); |
| return input_.data() + index_; |
| } |
| |
| JSONParser::Token JSONParser::GetNextToken() { |
| EatWhitespaceAndComments(); |
| |
| std::optional<char> c = PeekChar(); |
| if (!c) |
| return T_END_OF_INPUT; |
| |
| switch (*c) { |
| case '{': |
| return T_OBJECT_BEGIN; |
| case '}': |
| return T_OBJECT_END; |
| case '[': |
| return T_ARRAY_BEGIN; |
| case ']': |
| return T_ARRAY_END; |
| case '"': |
| return T_STRING; |
| case '0': |
| case '1': |
| case '2': |
| case '3': |
| case '4': |
| case '5': |
| case '6': |
| case '7': |
| case '8': |
| case '9': |
| case '-': |
| return T_NUMBER; |
| case 't': |
| return T_BOOL_TRUE; |
| case 'f': |
| return T_BOOL_FALSE; |
| case 'n': |
| return T_NULL; |
| case ',': |
| return T_LIST_SEPARATOR; |
| case ':': |
| return T_OBJECT_PAIR_SEPARATOR; |
| default: |
| return T_INVALID_TOKEN; |
| } |
| } |
| |
| void JSONParser::EatWhitespaceAndComments() { |
| while (std::optional<char> c = PeekChar()) { |
| switch (*c) { |
| case '\r': |
| case '\n': |
| index_last_line_ = index_; |
| // Don't increment line_number_ twice for "\r\n". |
| if (!(c == '\n' && index_ > 0 && input_[index_ - 1] == '\r')) { |
| ++line_number_; |
| } |
| FALLTHROUGH; |
| case ' ': |
| case '\t': |
| ConsumeChar(); |
| break; |
| case '/': |
| if (!EatComment()) |
| return; |
| break; |
| default: |
| return; |
| } |
| } |
| } |
| |
| bool JSONParser::EatComment() { |
| std::optional<std::string_view> comment_start = ConsumeChars(2); |
| if (!comment_start) |
| return false; |
| |
| if (comment_start == "//") { |
| // Single line comment, read to newline. |
| while (std::optional<char> c = PeekChar()) { |
| if (c == '\n' || c == '\r') |
| return true; |
| ConsumeChar(); |
| } |
| } else if (comment_start == "/*") { |
| char previous_char = '\0'; |
| // Block comment, read until end marker. |
| while (std::optional<char> c = PeekChar()) { |
| if (previous_char == '*' && c == '/') { |
| // EatWhitespaceAndComments will inspect pos(), which will still be on |
| // the last / of the comment, so advance once more (which may also be |
| // end of input). |
| ConsumeChar(); |
| return true; |
| } |
| previous_char = *ConsumeChar(); |
| } |
| |
| // If the comment is unterminated, GetNextToken will report T_END_OF_INPUT. |
| } |
| |
| return false; |
| } |
| |
| std::optional<Value> JSONParser::ParseNextToken() { |
| return ParseToken(GetNextToken()); |
| } |
| |
| std::optional<Value> JSONParser::ParseToken(Token token) { |
| switch (token) { |
| case T_OBJECT_BEGIN: |
| return ConsumeDictionary(); |
| case T_ARRAY_BEGIN: |
| return ConsumeList(); |
| case T_STRING: |
| return ConsumeString(); |
| case T_NUMBER: |
| return ConsumeNumber(); |
| case T_BOOL_TRUE: |
| case T_BOOL_FALSE: |
| case T_NULL: |
| return ConsumeLiteral(); |
| default: |
| ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1); |
| return std::nullopt; |
| } |
| } |
| |
| std::optional<Value> JSONParser::ConsumeDictionary() { |
| if (ConsumeChar() != '{') { |
| ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1); |
| return std::nullopt; |
| } |
| |
| StackMarker depth_check(max_depth_, &stack_depth_); |
| if (depth_check.IsTooDeep()) { |
| ReportError(JSONReader::JSON_TOO_MUCH_NESTING, 0); |
| return std::nullopt; |
| } |
| |
| std::vector<Value::DictStorage::value_type> dict_storage; |
| |
| Token token = GetNextToken(); |
| while (token != T_OBJECT_END) { |
| if (token != T_STRING) { |
| ReportError(JSONReader::JSON_UNQUOTED_DICTIONARY_KEY, 1); |
| return std::nullopt; |
| } |
| |
| // First consume the key. |
| StringBuilder key; |
| if (!ConsumeStringRaw(&key)) { |
| return std::nullopt; |
| } |
| |
| // Read the separator. |
| token = GetNextToken(); |
| if (token != T_OBJECT_PAIR_SEPARATOR) { |
| ReportError(JSONReader::JSON_SYNTAX_ERROR, 1); |
| return std::nullopt; |
| } |
| |
| // The next token is the value. Ownership transfers to |dict|. |
| ConsumeChar(); |
| std::optional<Value> value = ParseNextToken(); |
| if (!value) { |
| // ReportError from deeper level. |
| return std::nullopt; |
| } |
| |
| dict_storage.emplace_back(key.DestructiveAsString(), |
| std::make_unique<Value>(std::move(*value))); |
| |
| token = GetNextToken(); |
| if (token == T_LIST_SEPARATOR) { |
| ConsumeChar(); |
| token = GetNextToken(); |
| if (token == T_OBJECT_END && !(options_ & JSON_ALLOW_TRAILING_COMMAS)) { |
| ReportError(JSONReader::JSON_TRAILING_COMMA, 1); |
| return std::nullopt; |
| } |
| } else if (token != T_OBJECT_END) { |
| ReportError(JSONReader::JSON_SYNTAX_ERROR, 0); |
| return std::nullopt; |
| } |
| } |
| |
| ConsumeChar(); // Closing '}'. |
| |
| return Value(Value::DictStorage(std::move(dict_storage), KEEP_LAST_OF_DUPES)); |
| } |
| |
| std::optional<Value> JSONParser::ConsumeList() { |
| if (ConsumeChar() != '[') { |
| ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1); |
| return std::nullopt; |
| } |
| |
| StackMarker depth_check(max_depth_, &stack_depth_); |
| if (depth_check.IsTooDeep()) { |
| ReportError(JSONReader::JSON_TOO_MUCH_NESTING, 0); |
| return std::nullopt; |
| } |
| |
| Value::ListStorage list_storage; |
| |
| Token token = GetNextToken(); |
| while (token != T_ARRAY_END) { |
| std::optional<Value> item = ParseToken(token); |
| if (!item) { |
| // ReportError from deeper level. |
| return std::nullopt; |
| } |
| |
| list_storage.push_back(std::move(*item)); |
| |
| token = GetNextToken(); |
| if (token == T_LIST_SEPARATOR) { |
| ConsumeChar(); |
| token = GetNextToken(); |
| if (token == T_ARRAY_END && !(options_ & JSON_ALLOW_TRAILING_COMMAS)) { |
| ReportError(JSONReader::JSON_TRAILING_COMMA, 1); |
| return std::nullopt; |
| } |
| } else if (token != T_ARRAY_END) { |
| ReportError(JSONReader::JSON_SYNTAX_ERROR, 1); |
| return std::nullopt; |
| } |
| } |
| |
| ConsumeChar(); // Closing ']'. |
| |
| return Value(std::move(list_storage)); |
| } |
| |
| std::optional<Value> JSONParser::ConsumeString() { |
| StringBuilder string; |
| if (!ConsumeStringRaw(&string)) |
| return std::nullopt; |
| |
| return Value(string.DestructiveAsString()); |
| } |
| |
| bool JSONParser::ConsumeStringRaw(StringBuilder* out) { |
| if (ConsumeChar() != '"') { |
| ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1); |
| return false; |
| } |
| |
| // StringBuilder will internally build a std::string_view unless a UTF-16 |
| // conversion occurs, at which point it will perform a copy into a |
| // std::string. |
| StringBuilder string(pos()); |
| |
| while (PeekChar()) { |
| uint32_t next_char = 0; |
| if (!ReadUnicodeCharacter(input_.data(), |
| static_cast<int32_t>(input_.length()), &index_, |
| &next_char) || |
| !IsValidCharacter(next_char)) { |
| if ((options_ & JSON_REPLACE_INVALID_CHARACTERS) == 0) { |
| ReportError(JSONReader::JSON_UNSUPPORTED_ENCODING, 1); |
| return false; |
| } |
| ConsumeChar(); |
| string.Append(kUnicodeReplacementPoint); |
| continue; |
| } |
| |
| if (next_char == '"') { |
| ConsumeChar(); |
| *out = std::move(string); |
| return true; |
| } else if (next_char != '\\') { |
| // If this character is not an escape sequence... |
| ConsumeChar(); |
| string.Append(next_char); |
| } else { |
| // And if it is an escape sequence, the input string will be adjusted |
| // (either by combining the two characters of an encoded escape sequence, |
| // or with a UTF conversion), so using std::string_view isn't possible -- |
| // force a conversion. |
| string.Convert(); |
| |
| // Read past the escape '\' and ensure there's a character following. |
| std::optional<std::string_view> escape_sequence = ConsumeChars(2); |
| if (!escape_sequence) { |
| ReportError(JSONReader::JSON_INVALID_ESCAPE, 0); |
| return false; |
| } |
| |
| switch ((*escape_sequence)[1]) { |
| // Allowed esape sequences: |
| case 'x': { // UTF-8 sequence. |
| // UTF-8 \x escape sequences are not allowed in the spec, but they |
| // are supported here for backwards-compatiblity with the old parser. |
| escape_sequence = ConsumeChars(2); |
| if (!escape_sequence) { |
| ReportError(JSONReader::JSON_INVALID_ESCAPE, -2); |
| return false; |
| } |
| |
| int hex_digit = 0; |
| if (!HexStringToInt(*escape_sequence, &hex_digit) || |
| !IsValidCharacter(hex_digit)) { |
| ReportError(JSONReader::JSON_INVALID_ESCAPE, -2); |
| return false; |
| } |
| |
| string.Append(hex_digit); |
| break; |
| } |
| case 'u': { // UTF-16 sequence. |
| // UTF units are of the form \uXXXX. |
| uint32_t code_point; |
| if (!DecodeUTF16(&code_point)) { |
| ReportError(JSONReader::JSON_INVALID_ESCAPE, 0); |
| return false; |
| } |
| string.Append(code_point); |
| break; |
| } |
| case '"': |
| string.Append('"'); |
| break; |
| case '\\': |
| string.Append('\\'); |
| break; |
| case '/': |
| string.Append('/'); |
| break; |
| case 'b': |
| string.Append('\b'); |
| break; |
| case 'f': |
| string.Append('\f'); |
| break; |
| case 'n': |
| string.Append('\n'); |
| break; |
| case 'r': |
| string.Append('\r'); |
| break; |
| case 't': |
| string.Append('\t'); |
| break; |
| case 'v': // Not listed as valid escape sequence in the RFC. |
| string.Append('\v'); |
| break; |
| // All other escape squences are illegal. |
| default: |
| ReportError(JSONReader::JSON_INVALID_ESCAPE, 0); |
| return false; |
| } |
| } |
| } |
| |
| ReportError(JSONReader::JSON_SYNTAX_ERROR, 0); |
| return false; |
| } |
| |
| // Entry is at the first X in \uXXXX. |
| bool JSONParser::DecodeUTF16(uint32_t* out_code_point) { |
| std::optional<std::string_view> escape_sequence = ConsumeChars(4); |
| if (!escape_sequence) |
| return false; |
| |
| // Consume the UTF-16 code unit, which may be a high surrogate. |
| int code_unit16_high = 0; |
| if (!HexStringToInt(*escape_sequence, &code_unit16_high)) |
| return false; |
| |
| // If this is a high surrogate, consume the next code unit to get the |
| // low surrogate. |
| if (CBU16_IS_SURROGATE(code_unit16_high)) { |
| // Make sure this is the high surrogate. If not, it's an encoding |
| // error. |
| if (!CBU16_IS_SURROGATE_LEAD(code_unit16_high)) |
| return false; |
| |
| // Make sure that the token has more characters to consume the |
| // lower surrogate. |
| if (!ConsumeIfMatch("\\u")) |
| return false; |
| |
| escape_sequence = ConsumeChars(4); |
| if (!escape_sequence) |
| return false; |
| |
| int code_unit16_low = 0; |
| if (!HexStringToInt(*escape_sequence, &code_unit16_low)) |
| return false; |
| |
| if (!CBU16_IS_TRAIL(code_unit16_low)) |
| return false; |
| |
| uint32_t code_point = |
| CBU16_GET_SUPPLEMENTARY(code_unit16_high, code_unit16_low); |
| if (!IsValidCharacter(code_point)) |
| return false; |
| |
| *out_code_point = code_point; |
| } else { |
| // Not a surrogate. |
| DCHECK(CBU16_IS_SINGLE(code_unit16_high)); |
| if (!IsValidCharacter(code_unit16_high)) { |
| if ((options_ & JSON_REPLACE_INVALID_CHARACTERS) == 0) { |
| return false; |
| } |
| *out_code_point = kUnicodeReplacementPoint; |
| return true; |
| } |
| |
| *out_code_point = code_unit16_high; |
| } |
| |
| return true; |
| } |
| |
| std::optional<Value> JSONParser::ConsumeNumber() { |
| const char* num_start = pos(); |
| const int start_index = index_; |
| int end_index = start_index; |
| |
| if (PeekChar() == '-') |
| ConsumeChar(); |
| |
| if (!ReadInt(false)) { |
| ReportError(JSONReader::JSON_SYNTAX_ERROR, 1); |
| return std::nullopt; |
| } |
| end_index = index_; |
| |
| // The optional fraction part. |
| if (PeekChar() == '.') { |
| ConsumeChar(); |
| if (!ReadInt(true)) { |
| ReportError(JSONReader::JSON_SYNTAX_ERROR, 1); |
| return std::nullopt; |
| } |
| end_index = index_; |
| } |
| |
| // Optional exponent part. |
| std::optional<char> c = PeekChar(); |
| if (c == 'e' || c == 'E') { |
| ConsumeChar(); |
| if (PeekChar() == '-' || PeekChar() == '+') { |
| ConsumeChar(); |
| } |
| if (!ReadInt(true)) { |
| ReportError(JSONReader::JSON_SYNTAX_ERROR, 1); |
| return std::nullopt; |
| } |
| end_index = index_; |
| } |
| |
| // ReadInt is greedy because numbers have no easily detectable sentinel, |
| // so save off where the parser should be on exit (see Consume invariant at |
| // the top of the header), then make sure the next token is one which is |
| // valid. |
| int exit_index = index_; |
| |
| switch (GetNextToken()) { |
| case T_OBJECT_END: |
| case T_ARRAY_END: |
| case T_LIST_SEPARATOR: |
| case T_END_OF_INPUT: |
| break; |
| default: |
| ReportError(JSONReader::JSON_SYNTAX_ERROR, 1); |
| return std::nullopt; |
| } |
| |
| index_ = exit_index; |
| |
| std::string_view num_string(num_start, end_index - start_index); |
| |
| int num_int; |
| if (StringToInt(num_string, &num_int)) |
| return Value(num_int); |
| |
| return std::nullopt; |
| } |
| |
| bool JSONParser::ReadInt(bool allow_leading_zeros) { |
| size_t len = 0; |
| char first = 0; |
| |
| while (std::optional<char> c = PeekChar()) { |
| if (!IsAsciiDigit(c)) |
| break; |
| |
| if (len == 0) |
| first = *c; |
| |
| ++len; |
| ConsumeChar(); |
| } |
| |
| if (len == 0) |
| return false; |
| |
| if (!allow_leading_zeros && len > 1 && first == '0') |
| return false; |
| |
| return true; |
| } |
| |
| std::optional<Value> JSONParser::ConsumeLiteral() { |
| if (ConsumeIfMatch("true")) { |
| return Value(true); |
| } else if (ConsumeIfMatch("false")) { |
| return Value(false); |
| } else if (ConsumeIfMatch("null")) { |
| return Value(Value::Type::NONE); |
| } else { |
| ReportError(JSONReader::JSON_SYNTAX_ERROR, 1); |
| return std::nullopt; |
| } |
| } |
| |
| bool JSONParser::ConsumeIfMatch(std::string_view match) { |
| if (match == PeekChars(match.size())) { |
| ConsumeChars(match.size()); |
| return true; |
| } |
| return false; |
| } |
| |
| void JSONParser::ReportError(JSONReader::JsonParseError code, |
| int column_adjust) { |
| error_code_ = code; |
| error_line_ = line_number_; |
| error_column_ = index_ - index_last_line_ + column_adjust; |
| } |
| |
| // static |
| std::string JSONParser::FormatErrorMessage(int line, |
| int column, |
| const std::string& description) { |
| if (line || column) { |
| return StringPrintf("Line: %i, column: %i, %s", line, column, |
| description.c_str()); |
| } |
| return description; |
| } |
| |
| } // namespace internal |
| } // namespace base |