| // Copyright (c) 2013 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "tools/gn/string_utils.h" |
| |
| #include <stddef.h> |
| #include <cctype> |
| |
| #include "base/strings/string_number_conversions.h" |
| #include "tools/gn/err.h" |
| #include "tools/gn/input_file.h" |
| #include "tools/gn/parser.h" |
| #include "tools/gn/scope.h" |
| #include "tools/gn/token.h" |
| #include "tools/gn/tokenizer.h" |
| #include "tools/gn/value.h" |
| |
| namespace { |
| |
| // Constructs an Err indicating a range inside a string. We assume that the |
| // token has quotes around it that are not counted by the offset. |
| Err ErrInsideStringToken(const Token& token, |
| size_t offset, |
| size_t size, |
| const std::string& msg, |
| const std::string& help = std::string()) { |
| // The "+1" is skipping over the " at the beginning of the token. |
| int int_offset = static_cast<int>(offset); |
| Location begin_loc(token.location().file(), token.location().line_number(), |
| token.location().column_number() + int_offset + 1, |
| token.location().byte() + int_offset + 1); |
| Location end_loc( |
| token.location().file(), token.location().line_number(), |
| token.location().column_number() + int_offset + 1 + |
| static_cast<int>(size), |
| token.location().byte() + int_offset + 1 + static_cast<int>(size)); |
| return Err(LocationRange(begin_loc, end_loc), msg, help); |
| } |
| |
| // Notes about expression interpolation. This is based loosly on Dart but is |
| // slightly less flexible. In Dart, seeing the ${ in a string is something |
| // the toplevel parser knows about, and it will recurse into the block |
| // treating it as a first-class {...} block. So even things like this work: |
| // "hello ${"foo}"*2+"bar"}" => "hello foo}foo}bar" |
| // (you can see it did not get confused by the nested strings or the nested "}" |
| // inside the block). |
| // |
| // This is cool but complicates the parser for almost no benefit for this |
| // non-general-purpose programming language. The main reason expressions are |
| // supported here at all are to support "${scope.variable}" and "${list[0]}", |
| // neither of which have any of these edge-cases. |
| // |
| // In this simplified approach, we search for the terminating '}' and execute |
| // the result. This means we can't support any expressions with embedded '}' |
| // or '"'. To keep people from getting confusing about what's supported and |
| // what's not, only identifier and accessor expressions are allowed (neither |
| // of these run into any of these edge-cases). |
| bool AppendInterpolatedExpression(Scope* scope, |
| const Token& token, |
| const char* input, |
| size_t begin_offset, |
| size_t end_offset, |
| std::string* output, |
| Err* err) { |
| SourceFile empty_source_file; // Prevent most vexing parse. |
| InputFile input_file(empty_source_file); |
| input_file.SetContents( |
| std::string(&input[begin_offset], end_offset - begin_offset)); |
| |
| // Tokenize. |
| std::vector<Token> tokens = Tokenizer::Tokenize(&input_file, err); |
| if (err->has_error()) { |
| // The error will point into our temporary buffer, rewrite it to refer |
| // to the original token. This will make the location information less |
| // precise, but generally there won't be complicated things in string |
| // interpolations. |
| *err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset, |
| err->message(), err->help_text()); |
| return false; |
| } |
| |
| // Parse. |
| std::unique_ptr<ParseNode> node = Parser::ParseExpression(tokens, err); |
| if (err->has_error()) { |
| // Rewrite error as above. |
| *err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset, |
| err->message(), err->help_text()); |
| return false; |
| } |
| if (!(node->AsIdentifier() || node->AsAccessor())) { |
| *err = ErrInsideStringToken( |
| token, begin_offset, end_offset - begin_offset, |
| "Invalid string interpolation.", |
| "The thing inside the ${} must be an identifier ${foo},\n" |
| "a scope access ${foo.bar}, or a list access ${foo[0]}."); |
| return false; |
| } |
| |
| // Evaluate. |
| Value result = node->Execute(scope, err); |
| if (err->has_error()) { |
| // Rewrite error as above. |
| *err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset, |
| err->message(), err->help_text()); |
| return false; |
| } |
| |
| output->append(result.ToString(false)); |
| return true; |
| } |
| |
| bool AppendInterpolatedIdentifier(Scope* scope, |
| const Token& token, |
| const char* input, |
| size_t begin_offset, |
| size_t end_offset, |
| std::string* output, |
| Err* err) { |
| base::StringPiece identifier(&input[begin_offset], end_offset - begin_offset); |
| const Value* value = scope->GetValue(identifier, true); |
| if (!value) { |
| // We assume the input points inside the token. |
| *err = ErrInsideStringToken( |
| token, identifier.data() - token.value().data() - 1, identifier.size(), |
| "Undefined identifier in string expansion.", |
| std::string("\"") + identifier + "\" is not currently in scope."); |
| return false; |
| } |
| |
| output->append(value->ToString(false)); |
| return true; |
| } |
| |
| // Handles string interpolations: $identifier and ${expression} |
| // |
| // |*i| is the index into |input| after the $. This will be updated to point to |
| // the last character consumed on success. The token is the original string |
| // to blame on failure. |
| // |
| // On failure, returns false and sets the error. On success, appends the |
| // result of the interpolation to |*output|. |
| bool AppendStringInterpolation(Scope* scope, |
| const Token& token, |
| const char* input, |
| size_t size, |
| size_t* i, |
| std::string* output, |
| Err* err) { |
| size_t dollars_index = *i - 1; |
| |
| if (input[*i] == '{') { |
| // Bracketed expression. |
| (*i)++; |
| size_t begin_offset = *i; |
| |
| // Find the closing } and check for non-identifier chars. Don't need to |
| // bother checking for the more-restricted first character of an identifier |
| // since the {} unambiguously denotes the range, and identifiers with |
| // invalid names just won't be found later. |
| bool has_non_ident_chars = false; |
| while (*i < size && input[*i] != '}') { |
| has_non_ident_chars |= Tokenizer::IsIdentifierContinuingChar(input[*i]); |
| (*i)++; |
| } |
| if (*i == size) { |
| *err = ErrInsideStringToken(token, dollars_index, *i - dollars_index, |
| "Unterminated ${..."); |
| return false; |
| } |
| |
| // In the common case, the thing inside the {} will actually be a |
| // simple identifier. Avoid all the complicated parsing of accessors |
| // in this case. |
| if (!has_non_ident_chars) { |
| return AppendInterpolatedIdentifier(scope, token, input, begin_offset, *i, |
| output, err); |
| } |
| return AppendInterpolatedExpression(scope, token, input, begin_offset, *i, |
| output, err); |
| } |
| |
| // Simple identifier. |
| // The first char of an identifier is more restricted. |
| if (!Tokenizer::IsIdentifierFirstChar(input[*i])) { |
| *err = ErrInsideStringToken(token, dollars_index, *i - dollars_index + 1, |
| "$ not followed by an identifier char.", |
| "It you want a literal $ use \"\\$\"."); |
| return false; |
| } |
| size_t begin_offset = *i; |
| (*i)++; |
| |
| // Find the first non-identifier char following the string. |
| while (*i < size && Tokenizer::IsIdentifierContinuingChar(input[*i])) |
| (*i)++; |
| size_t end_offset = *i; |
| (*i)--; // Back up to mark the last character consumed. |
| return AppendInterpolatedIdentifier(scope, token, input, begin_offset, |
| end_offset, output, err); |
| } |
| |
| // Handles a hex literal: $0xFF |
| // |
| // |*i| is the index into |input| after the $. This will be updated to point to |
| // the last character consumed on success. The token is the original string |
| // to blame on failure. |
| // |
| // On failure, returns false and sets the error. On success, appends the |
| // char with the given hex value to |*output|. |
| bool AppendHexByte(Scope* scope, |
| const Token& token, |
| const char* input, |
| size_t size, |
| size_t* i, |
| std::string* output, |
| Err* err) { |
| size_t dollars_index = *i - 1; |
| // "$0" is already known to exist. |
| if (*i + 3 >= size || input[*i + 1] != 'x' || !std::isxdigit(input[*i + 2]) || |
| !std::isxdigit(input[*i + 3])) { |
| *err = ErrInsideStringToken( |
| token, dollars_index, *i - dollars_index + 1, |
| "Invalid hex character. Hex values must look like 0xFF."); |
| return false; |
| } |
| int value = 0; |
| if (!base::HexStringToInt(base::StringPiece(&input[*i + 2], 2), &value)) { |
| *err = ErrInsideStringToken(token, dollars_index, *i - dollars_index + 1, |
| "Could not convert hex value."); |
| return false; |
| } |
| *i += 3; |
| output->push_back(value); |
| return true; |
| } |
| |
| } // namespace |
| |
| bool ExpandStringLiteral(Scope* scope, |
| const Token& literal, |
| Value* result, |
| Err* err) { |
| DCHECK(literal.type() == Token::STRING); |
| DCHECK(literal.value().size() > 1); // Should include quotes. |
| DCHECK(result->type() == Value::STRING); // Should be already set. |
| |
| // The token includes the surrounding quotes, so strip those off. |
| const char* input = &literal.value().data()[1]; |
| size_t size = literal.value().size() - 2; |
| |
| std::string& output = result->string_value(); |
| output.reserve(size); |
| for (size_t i = 0; i < size; i++) { |
| if (input[i] == '\\') { |
| if (i < size - 1) { |
| switch (input[i + 1]) { |
| case '\\': |
| case '"': |
| case '$': |
| output.push_back(input[i + 1]); |
| i++; |
| continue; |
| default: // Everything else has no meaning: pass the literal. |
| break; |
| } |
| } |
| output.push_back(input[i]); |
| } else if (input[i] == '$') { |
| i++; |
| if (i == size) { |
| *err = ErrInsideStringToken( |
| literal, i - 1, 1, "$ at end of string.", |
| "I was expecting an identifier, 0xFF, or {...} after the $."); |
| return false; |
| } |
| if (input[i] == '0') { |
| if (!AppendHexByte(scope, literal, input, size, &i, &output, err)) |
| return false; |
| } else if (!AppendStringInterpolation(scope, literal, input, size, &i, |
| &output, err)) |
| return false; |
| } else { |
| output.push_back(input[i]); |
| } |
| } |
| return true; |
| } |
| |
| size_t EditDistance(const base::StringPiece& s1, |
| const base::StringPiece& s2, |
| size_t max_edit_distance) { |
| // The algorithm implemented below is the "classic" |
| // dynamic-programming algorithm for computing the Levenshtein |
| // distance, which is described here: |
| // |
| // http://en.wikipedia.org/wiki/Levenshtein_distance |
| // |
| // Although the algorithm is typically described using an m x n |
| // array, only one row plus one element are used at a time, so this |
| // implementation just keeps one vector for the row. To update one entry, |
| // only the entries to the left, top, and top-left are needed. The left |
| // entry is in row[x-1], the top entry is what's in row[x] from the last |
| // iteration, and the top-left entry is stored in previous. |
| size_t m = s1.size(); |
| size_t n = s2.size(); |
| |
| std::vector<size_t> row(n + 1); |
| for (size_t i = 1; i <= n; ++i) |
| row[i] = i; |
| |
| for (size_t y = 1; y <= m; ++y) { |
| row[0] = y; |
| size_t best_this_row = row[0]; |
| |
| size_t previous = y - 1; |
| for (size_t x = 1; x <= n; ++x) { |
| size_t old_row = row[x]; |
| row[x] = std::min(previous + (s1[y - 1] == s2[x - 1] ? 0u : 1u), |
| std::min(row[x - 1], row[x]) + 1u); |
| previous = old_row; |
| best_this_row = std::min(best_this_row, row[x]); |
| } |
| |
| if (max_edit_distance && best_this_row > max_edit_distance) |
| return max_edit_distance + 1; |
| } |
| |
| return row[n]; |
| } |
| |
| base::StringPiece SpellcheckString( |
| const base::StringPiece& text, |
| const std::vector<base::StringPiece>& words) { |
| const size_t kMaxValidEditDistance = 3u; |
| |
| size_t min_distance = kMaxValidEditDistance + 1u; |
| base::StringPiece result; |
| for (base::StringPiece word : words) { |
| size_t distance = EditDistance(word, text, kMaxValidEditDistance); |
| if (distance < min_distance) { |
| min_distance = distance; |
| result = word; |
| } |
| } |
| return result; |
| } |