blob: 5a245ec6312a5f3e857055336ad0ec51a86a0509 [file] [log] [blame]
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "base/json/json_parser.h"
#include <cmath>
#include <string_view>
#include <utility>
#include <vector>
#include "base/logging.h"
#include "base/numerics/safe_conversions.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_util.h"
#include "base/strings/stringprintf.h"
#include "base/strings/utf_string_conversion_utils.h"
#include "base/strings/utf_string_conversions.h"
#include "base/third_party/icu/icu_utf.h"
#include "base/values.h"
namespace base {
namespace internal {
namespace {
const int32_t kExtendedASCIIStart = 0x80;
// Simple class that checks for maximum recursion/"stack overflow."
class StackMarker {
public:
StackMarker(int max_depth, int* depth)
: max_depth_(max_depth), depth_(depth) {
++(*depth_);
DCHECK_LE(*depth_, max_depth_);
}
~StackMarker() { --(*depth_); }
bool IsTooDeep() const { return *depth_ >= max_depth_; }
private:
const int max_depth_;
int* const depth_;
StackMarker(const StackMarker&) = delete;
StackMarker& operator=(const StackMarker&) = delete;
};
constexpr uint32_t kUnicodeReplacementPoint = 0xFFFD;
} // namespace
// This is U+FFFD.
const char kUnicodeReplacementString[] = "\xEF\xBF\xBD";
JSONParser::JSONParser(int options, int max_depth)
: options_(options),
max_depth_(max_depth),
index_(0),
stack_depth_(0),
line_number_(0),
index_last_line_(0),
error_code_(JSONReader::JSON_NO_ERROR),
error_line_(0),
error_column_(0) {
CHECK_LE(max_depth, JSONReader::kStackMaxDepth);
}
JSONParser::~JSONParser() = default;
std::optional<Value> JSONParser::Parse(std::string_view input) {
input_ = input;
index_ = 0;
line_number_ = 1;
index_last_line_ = 0;
error_code_ = JSONReader::JSON_NO_ERROR;
error_line_ = 0;
error_column_ = 0;
// ICU and ReadUnicodeCharacter() use int32_t for lengths, so ensure
// that the index_ will not overflow when parsing.
if (!base::IsValueInRangeForNumericType<int32_t>(input.length())) {
ReportError(JSONReader::JSON_TOO_LARGE, 0);
return std::nullopt;
}
// When the input JSON string starts with a UTF-8 Byte-Order-Mark,
// advance the start position to avoid the ParseNextToken function mis-
// treating a Unicode BOM as an invalid character and returning NULL.
ConsumeIfMatch("\xEF\xBB\xBF");
// Parse the first and any nested tokens.
std::optional<Value> root(ParseNextToken());
if (!root)
return std::nullopt;
// Make sure the input stream is at an end.
if (GetNextToken() != T_END_OF_INPUT) {
ReportError(JSONReader::JSON_UNEXPECTED_DATA_AFTER_ROOT, 1);
return std::nullopt;
}
return root;
}
JSONReader::JsonParseError JSONParser::error_code() const {
return error_code_;
}
std::string JSONParser::GetErrorMessage() const {
return FormatErrorMessage(error_line_, error_column_,
JSONReader::ErrorCodeToString(error_code_));
}
int JSONParser::error_line() const {
return error_line_;
}
int JSONParser::error_column() const {
return error_column_;
}
// StringBuilder ///////////////////////////////////////////////////////////////
JSONParser::StringBuilder::StringBuilder() : StringBuilder(nullptr) {}
JSONParser::StringBuilder::StringBuilder(const char* pos)
: pos_(pos), length_(0) {}
JSONParser::StringBuilder::~StringBuilder() = default;
JSONParser::StringBuilder& JSONParser::StringBuilder::operator=(
StringBuilder&& other) = default;
void JSONParser::StringBuilder::Append(uint32_t point) {
DCHECK(IsValidCharacter(point));
if (point < kExtendedASCIIStart && !string_) {
DCHECK_EQ(static_cast<char>(point), pos_[length_]);
++length_;
} else {
Convert();
if (UNLIKELY(point == kUnicodeReplacementPoint)) {
string_->append(kUnicodeReplacementString);
} else {
WriteUnicodeCharacter(point, &*string_);
}
}
}
void JSONParser::StringBuilder::Convert() {
if (string_)
return;
string_.emplace(pos_, length_);
}
std::string JSONParser::StringBuilder::DestructiveAsString() {
if (string_)
return std::move(*string_);
return std::string(pos_, length_);
}
// JSONParser private //////////////////////////////////////////////////////////
std::optional<std::string_view> JSONParser::PeekChars(int count) {
if (static_cast<size_t>(index_) + count > input_.length())
return std::nullopt;
// Using std::string_view::substr() is significantly slower (according to
// base_perftests) than constructing a substring manually.
return std::string_view(input_.data() + index_, count);
}
std::optional<char> JSONParser::PeekChar() {
std::optional<std::string_view> chars = PeekChars(1);
if (chars)
return (*chars)[0];
return std::nullopt;
}
std::optional<std::string_view> JSONParser::ConsumeChars(int count) {
std::optional<std::string_view> chars = PeekChars(count);
if (chars)
index_ += count;
return chars;
}
std::optional<char> JSONParser::ConsumeChar() {
std::optional<std::string_view> chars = ConsumeChars(1);
if (chars)
return (*chars)[0];
return std::nullopt;
}
const char* JSONParser::pos() {
CHECK_LE(static_cast<size_t>(index_), input_.length());
return input_.data() + index_;
}
JSONParser::Token JSONParser::GetNextToken() {
EatWhitespaceAndComments();
std::optional<char> c = PeekChar();
if (!c)
return T_END_OF_INPUT;
switch (*c) {
case '{':
return T_OBJECT_BEGIN;
case '}':
return T_OBJECT_END;
case '[':
return T_ARRAY_BEGIN;
case ']':
return T_ARRAY_END;
case '"':
return T_STRING;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
case '-':
return T_NUMBER;
case 't':
return T_BOOL_TRUE;
case 'f':
return T_BOOL_FALSE;
case 'n':
return T_NULL;
case ',':
return T_LIST_SEPARATOR;
case ':':
return T_OBJECT_PAIR_SEPARATOR;
default:
return T_INVALID_TOKEN;
}
}
void JSONParser::EatWhitespaceAndComments() {
while (std::optional<char> c = PeekChar()) {
switch (*c) {
case '\r':
case '\n':
index_last_line_ = index_;
// Don't increment line_number_ twice for "\r\n".
if (!(c == '\n' && index_ > 0 && input_[index_ - 1] == '\r')) {
++line_number_;
}
FALLTHROUGH;
case ' ':
case '\t':
ConsumeChar();
break;
case '/':
if (!EatComment())
return;
break;
default:
return;
}
}
}
bool JSONParser::EatComment() {
std::optional<std::string_view> comment_start = ConsumeChars(2);
if (!comment_start)
return false;
if (comment_start == "//") {
// Single line comment, read to newline.
while (std::optional<char> c = PeekChar()) {
if (c == '\n' || c == '\r')
return true;
ConsumeChar();
}
} else if (comment_start == "/*") {
char previous_char = '\0';
// Block comment, read until end marker.
while (std::optional<char> c = PeekChar()) {
if (previous_char == '*' && c == '/') {
// EatWhitespaceAndComments will inspect pos(), which will still be on
// the last / of the comment, so advance once more (which may also be
// end of input).
ConsumeChar();
return true;
}
previous_char = *ConsumeChar();
}
// If the comment is unterminated, GetNextToken will report T_END_OF_INPUT.
}
return false;
}
std::optional<Value> JSONParser::ParseNextToken() {
return ParseToken(GetNextToken());
}
std::optional<Value> JSONParser::ParseToken(Token token) {
switch (token) {
case T_OBJECT_BEGIN:
return ConsumeDictionary();
case T_ARRAY_BEGIN:
return ConsumeList();
case T_STRING:
return ConsumeString();
case T_NUMBER:
return ConsumeNumber();
case T_BOOL_TRUE:
case T_BOOL_FALSE:
case T_NULL:
return ConsumeLiteral();
default:
ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1);
return std::nullopt;
}
}
std::optional<Value> JSONParser::ConsumeDictionary() {
if (ConsumeChar() != '{') {
ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1);
return std::nullopt;
}
StackMarker depth_check(max_depth_, &stack_depth_);
if (depth_check.IsTooDeep()) {
ReportError(JSONReader::JSON_TOO_MUCH_NESTING, 0);
return std::nullopt;
}
std::vector<Value::DictStorage::value_type> dict_storage;
Token token = GetNextToken();
while (token != T_OBJECT_END) {
if (token != T_STRING) {
ReportError(JSONReader::JSON_UNQUOTED_DICTIONARY_KEY, 1);
return std::nullopt;
}
// First consume the key.
StringBuilder key;
if (!ConsumeStringRaw(&key)) {
return std::nullopt;
}
// Read the separator.
token = GetNextToken();
if (token != T_OBJECT_PAIR_SEPARATOR) {
ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
return std::nullopt;
}
// The next token is the value. Ownership transfers to |dict|.
ConsumeChar();
std::optional<Value> value = ParseNextToken();
if (!value) {
// ReportError from deeper level.
return std::nullopt;
}
dict_storage.emplace_back(key.DestructiveAsString(),
std::make_unique<Value>(std::move(*value)));
token = GetNextToken();
if (token == T_LIST_SEPARATOR) {
ConsumeChar();
token = GetNextToken();
if (token == T_OBJECT_END && !(options_ & JSON_ALLOW_TRAILING_COMMAS)) {
ReportError(JSONReader::JSON_TRAILING_COMMA, 1);
return std::nullopt;
}
} else if (token != T_OBJECT_END) {
ReportError(JSONReader::JSON_SYNTAX_ERROR, 0);
return std::nullopt;
}
}
ConsumeChar(); // Closing '}'.
return Value(Value::DictStorage(std::move(dict_storage), KEEP_LAST_OF_DUPES));
}
std::optional<Value> JSONParser::ConsumeList() {
if (ConsumeChar() != '[') {
ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1);
return std::nullopt;
}
StackMarker depth_check(max_depth_, &stack_depth_);
if (depth_check.IsTooDeep()) {
ReportError(JSONReader::JSON_TOO_MUCH_NESTING, 0);
return std::nullopt;
}
Value::ListStorage list_storage;
Token token = GetNextToken();
while (token != T_ARRAY_END) {
std::optional<Value> item = ParseToken(token);
if (!item) {
// ReportError from deeper level.
return std::nullopt;
}
list_storage.push_back(std::move(*item));
token = GetNextToken();
if (token == T_LIST_SEPARATOR) {
ConsumeChar();
token = GetNextToken();
if (token == T_ARRAY_END && !(options_ & JSON_ALLOW_TRAILING_COMMAS)) {
ReportError(JSONReader::JSON_TRAILING_COMMA, 1);
return std::nullopt;
}
} else if (token != T_ARRAY_END) {
ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
return std::nullopt;
}
}
ConsumeChar(); // Closing ']'.
return Value(std::move(list_storage));
}
std::optional<Value> JSONParser::ConsumeString() {
StringBuilder string;
if (!ConsumeStringRaw(&string))
return std::nullopt;
return Value(string.DestructiveAsString());
}
bool JSONParser::ConsumeStringRaw(StringBuilder* out) {
if (ConsumeChar() != '"') {
ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1);
return false;
}
// StringBuilder will internally build a std::string_view unless a UTF-16
// conversion occurs, at which point it will perform a copy into a
// std::string.
StringBuilder string(pos());
while (PeekChar()) {
uint32_t next_char = 0;
if (!ReadUnicodeCharacter(input_.data(),
static_cast<int32_t>(input_.length()), &index_,
&next_char) ||
!IsValidCharacter(next_char)) {
if ((options_ & JSON_REPLACE_INVALID_CHARACTERS) == 0) {
ReportError(JSONReader::JSON_UNSUPPORTED_ENCODING, 1);
return false;
}
ConsumeChar();
string.Append(kUnicodeReplacementPoint);
continue;
}
if (next_char == '"') {
ConsumeChar();
*out = std::move(string);
return true;
} else if (next_char != '\\') {
// If this character is not an escape sequence...
ConsumeChar();
string.Append(next_char);
} else {
// And if it is an escape sequence, the input string will be adjusted
// (either by combining the two characters of an encoded escape sequence,
// or with a UTF conversion), so using std::string_view isn't possible --
// force a conversion.
string.Convert();
// Read past the escape '\' and ensure there's a character following.
std::optional<std::string_view> escape_sequence = ConsumeChars(2);
if (!escape_sequence) {
ReportError(JSONReader::JSON_INVALID_ESCAPE, 0);
return false;
}
switch ((*escape_sequence)[1]) {
// Allowed esape sequences:
case 'x': { // UTF-8 sequence.
// UTF-8 \x escape sequences are not allowed in the spec, but they
// are supported here for backwards-compatiblity with the old parser.
escape_sequence = ConsumeChars(2);
if (!escape_sequence) {
ReportError(JSONReader::JSON_INVALID_ESCAPE, -2);
return false;
}
int hex_digit = 0;
if (!HexStringToInt(*escape_sequence, &hex_digit) ||
!IsValidCharacter(hex_digit)) {
ReportError(JSONReader::JSON_INVALID_ESCAPE, -2);
return false;
}
string.Append(hex_digit);
break;
}
case 'u': { // UTF-16 sequence.
// UTF units are of the form \uXXXX.
uint32_t code_point;
if (!DecodeUTF16(&code_point)) {
ReportError(JSONReader::JSON_INVALID_ESCAPE, 0);
return false;
}
string.Append(code_point);
break;
}
case '"':
string.Append('"');
break;
case '\\':
string.Append('\\');
break;
case '/':
string.Append('/');
break;
case 'b':
string.Append('\b');
break;
case 'f':
string.Append('\f');
break;
case 'n':
string.Append('\n');
break;
case 'r':
string.Append('\r');
break;
case 't':
string.Append('\t');
break;
case 'v': // Not listed as valid escape sequence in the RFC.
string.Append('\v');
break;
// All other escape squences are illegal.
default:
ReportError(JSONReader::JSON_INVALID_ESCAPE, 0);
return false;
}
}
}
ReportError(JSONReader::JSON_SYNTAX_ERROR, 0);
return false;
}
// Entry is at the first X in \uXXXX.
bool JSONParser::DecodeUTF16(uint32_t* out_code_point) {
std::optional<std::string_view> escape_sequence = ConsumeChars(4);
if (!escape_sequence)
return false;
// Consume the UTF-16 code unit, which may be a high surrogate.
int code_unit16_high = 0;
if (!HexStringToInt(*escape_sequence, &code_unit16_high))
return false;
// If this is a high surrogate, consume the next code unit to get the
// low surrogate.
if (CBU16_IS_SURROGATE(code_unit16_high)) {
// Make sure this is the high surrogate. If not, it's an encoding
// error.
if (!CBU16_IS_SURROGATE_LEAD(code_unit16_high))
return false;
// Make sure that the token has more characters to consume the
// lower surrogate.
if (!ConsumeIfMatch("\\u"))
return false;
escape_sequence = ConsumeChars(4);
if (!escape_sequence)
return false;
int code_unit16_low = 0;
if (!HexStringToInt(*escape_sequence, &code_unit16_low))
return false;
if (!CBU16_IS_TRAIL(code_unit16_low))
return false;
uint32_t code_point =
CBU16_GET_SUPPLEMENTARY(code_unit16_high, code_unit16_low);
if (!IsValidCharacter(code_point))
return false;
*out_code_point = code_point;
} else {
// Not a surrogate.
DCHECK(CBU16_IS_SINGLE(code_unit16_high));
if (!IsValidCharacter(code_unit16_high)) {
if ((options_ & JSON_REPLACE_INVALID_CHARACTERS) == 0) {
return false;
}
*out_code_point = kUnicodeReplacementPoint;
return true;
}
*out_code_point = code_unit16_high;
}
return true;
}
std::optional<Value> JSONParser::ConsumeNumber() {
const char* num_start = pos();
const int start_index = index_;
int end_index = start_index;
if (PeekChar() == '-')
ConsumeChar();
if (!ReadInt(false)) {
ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
return std::nullopt;
}
end_index = index_;
// The optional fraction part.
if (PeekChar() == '.') {
ConsumeChar();
if (!ReadInt(true)) {
ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
return std::nullopt;
}
end_index = index_;
}
// Optional exponent part.
std::optional<char> c = PeekChar();
if (c == 'e' || c == 'E') {
ConsumeChar();
if (PeekChar() == '-' || PeekChar() == '+') {
ConsumeChar();
}
if (!ReadInt(true)) {
ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
return std::nullopt;
}
end_index = index_;
}
// ReadInt is greedy because numbers have no easily detectable sentinel,
// so save off where the parser should be on exit (see Consume invariant at
// the top of the header), then make sure the next token is one which is
// valid.
int exit_index = index_;
switch (GetNextToken()) {
case T_OBJECT_END:
case T_ARRAY_END:
case T_LIST_SEPARATOR:
case T_END_OF_INPUT:
break;
default:
ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
return std::nullopt;
}
index_ = exit_index;
std::string_view num_string(num_start, end_index - start_index);
int num_int;
if (StringToInt(num_string, &num_int))
return Value(num_int);
return std::nullopt;
}
bool JSONParser::ReadInt(bool allow_leading_zeros) {
size_t len = 0;
char first = 0;
while (std::optional<char> c = PeekChar()) {
if (!IsAsciiDigit(c))
break;
if (len == 0)
first = *c;
++len;
ConsumeChar();
}
if (len == 0)
return false;
if (!allow_leading_zeros && len > 1 && first == '0')
return false;
return true;
}
std::optional<Value> JSONParser::ConsumeLiteral() {
if (ConsumeIfMatch("true")) {
return Value(true);
} else if (ConsumeIfMatch("false")) {
return Value(false);
} else if (ConsumeIfMatch("null")) {
return Value(Value::Type::NONE);
} else {
ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
return std::nullopt;
}
}
bool JSONParser::ConsumeIfMatch(std::string_view match) {
if (match == PeekChars(match.size())) {
ConsumeChars(match.size());
return true;
}
return false;
}
void JSONParser::ReportError(JSONReader::JsonParseError code,
int column_adjust) {
error_code_ = code;
error_line_ = line_number_;
error_column_ = index_ - index_last_line_ + column_adjust;
}
// static
std::string JSONParser::FormatErrorMessage(int line,
int column,
const std::string& description) {
if (line || column) {
return StringPrintf("Line: %i, column: %i, %s", line, column,
description.c_str());
}
return description;
}
} // namespace internal
} // namespace base