|  | // Copyright (c) 2013 The Chromium Authors. All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style license that can be | 
|  | // found in the LICENSE file. | 
|  |  | 
|  | #include "tools/gn/string_utils.h" | 
|  |  | 
|  | #include <stddef.h> | 
|  | #include <cctype> | 
|  |  | 
|  | #include "base/strings/string_number_conversions.h" | 
|  | #include "tools/gn/err.h" | 
|  | #include "tools/gn/input_file.h" | 
|  | #include "tools/gn/parser.h" | 
|  | #include "tools/gn/scope.h" | 
|  | #include "tools/gn/token.h" | 
|  | #include "tools/gn/tokenizer.h" | 
|  | #include "tools/gn/value.h" | 
|  |  | 
|  | namespace { | 
|  |  | 
|  | // Constructs an Err indicating a range inside a string. We assume that the | 
|  | // token has quotes around it that are not counted by the offset. | 
|  | Err ErrInsideStringToken(const Token& token, | 
|  | size_t offset, | 
|  | size_t size, | 
|  | const std::string& msg, | 
|  | const std::string& help = std::string()) { | 
|  | // The "+1" is skipping over the " at the beginning of the token. | 
|  | int int_offset = static_cast<int>(offset); | 
|  | Location begin_loc(token.location().file(), token.location().line_number(), | 
|  | token.location().column_number() + int_offset + 1, | 
|  | token.location().byte() + int_offset + 1); | 
|  | Location end_loc( | 
|  | token.location().file(), token.location().line_number(), | 
|  | token.location().column_number() + int_offset + 1 + | 
|  | static_cast<int>(size), | 
|  | token.location().byte() + int_offset + 1 + static_cast<int>(size)); | 
|  | return Err(LocationRange(begin_loc, end_loc), msg, help); | 
|  | } | 
|  |  | 
|  | // Notes about expression interpolation. This is based loosly on Dart but is | 
|  | // slightly less flexible. In Dart, seeing the ${ in a string is something | 
|  | // the toplevel parser knows about, and it will recurse into the block | 
|  | // treating it as a first-class {...} block. So even things like this work: | 
|  | //   "hello ${"foo}"*2+"bar"}"  =>  "hello foo}foo}bar" | 
|  | // (you can see it did not get confused by the nested strings or the nested "}" | 
|  | // inside the block). | 
|  | // | 
|  | // This is cool but complicates the parser for almost no benefit for this | 
|  | // non-general-purpose programming language. The main reason expressions are | 
|  | // supported here at all are to support "${scope.variable}" and "${list[0]}", | 
|  | // neither of which have any of these edge-cases. | 
|  | // | 
|  | // In this simplified approach, we search for the terminating '}' and execute | 
|  | // the result. This means we can't support any expressions with embedded '}' | 
|  | // or '"'. To keep people from getting confusing about what's supported and | 
|  | // what's not, only identifier and accessor expressions are allowed (neither | 
|  | // of these run into any of these edge-cases). | 
|  | bool AppendInterpolatedExpression(Scope* scope, | 
|  | const Token& token, | 
|  | const char* input, | 
|  | size_t begin_offset, | 
|  | size_t end_offset, | 
|  | std::string* output, | 
|  | Err* err) { | 
|  | SourceFile empty_source_file;  // Prevent most vexing parse. | 
|  | InputFile input_file(empty_source_file); | 
|  | input_file.SetContents( | 
|  | std::string(&input[begin_offset], end_offset - begin_offset)); | 
|  |  | 
|  | // Tokenize. | 
|  | std::vector<Token> tokens = Tokenizer::Tokenize(&input_file, err); | 
|  | if (err->has_error()) { | 
|  | // The error will point into our temporary buffer, rewrite it to refer | 
|  | // to the original token. This will make the location information less | 
|  | // precise, but generally there won't be complicated things in string | 
|  | // interpolations. | 
|  | *err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset, | 
|  | err->message(), err->help_text()); | 
|  | return false; | 
|  | } | 
|  |  | 
|  | // Parse. | 
|  | std::unique_ptr<ParseNode> node = Parser::ParseExpression(tokens, err); | 
|  | if (err->has_error()) { | 
|  | // Rewrite error as above. | 
|  | *err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset, | 
|  | err->message(), err->help_text()); | 
|  | return false; | 
|  | } | 
|  | if (!(node->AsIdentifier() || node->AsAccessor())) { | 
|  | *err = ErrInsideStringToken( | 
|  | token, begin_offset, end_offset - begin_offset, | 
|  | "Invalid string interpolation.", | 
|  | "The thing inside the ${} must be an identifier ${foo},\n" | 
|  | "a scope access ${foo.bar}, or a list access ${foo[0]}."); | 
|  | return false; | 
|  | } | 
|  |  | 
|  | // Evaluate. | 
|  | Value result = node->Execute(scope, err); | 
|  | if (err->has_error()) { | 
|  | // Rewrite error as above. | 
|  | *err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset, | 
|  | err->message(), err->help_text()); | 
|  | return false; | 
|  | } | 
|  |  | 
|  | output->append(result.ToString(false)); | 
|  | return true; | 
|  | } | 
|  |  | 
|  | bool AppendInterpolatedIdentifier(Scope* scope, | 
|  | const Token& token, | 
|  | const char* input, | 
|  | size_t begin_offset, | 
|  | size_t end_offset, | 
|  | std::string* output, | 
|  | Err* err) { | 
|  | base::StringPiece identifier(&input[begin_offset], end_offset - begin_offset); | 
|  | const Value* value = scope->GetValue(identifier, true); | 
|  | if (!value) { | 
|  | // We assume the input points inside the token. | 
|  | *err = ErrInsideStringToken( | 
|  | token, identifier.data() - token.value().data() - 1, identifier.size(), | 
|  | "Undefined identifier in string expansion.", | 
|  | std::string("\"") + identifier + "\" is not currently in scope."); | 
|  | return false; | 
|  | } | 
|  |  | 
|  | output->append(value->ToString(false)); | 
|  | return true; | 
|  | } | 
|  |  | 
|  | // Handles string interpolations: $identifier and ${expression} | 
|  | // | 
|  | // |*i| is the index into |input| after the $. This will be updated to point to | 
|  | // the last character consumed on success. The token is the original string | 
|  | // to blame on failure. | 
|  | // | 
|  | // On failure, returns false and sets the error. On success, appends the | 
|  | // result of the interpolation to |*output|. | 
|  | bool AppendStringInterpolation(Scope* scope, | 
|  | const Token& token, | 
|  | const char* input, | 
|  | size_t size, | 
|  | size_t* i, | 
|  | std::string* output, | 
|  | Err* err) { | 
|  | size_t dollars_index = *i - 1; | 
|  |  | 
|  | if (input[*i] == '{') { | 
|  | // Bracketed expression. | 
|  | (*i)++; | 
|  | size_t begin_offset = *i; | 
|  |  | 
|  | // Find the closing } and check for non-identifier chars. Don't need to | 
|  | // bother checking for the more-restricted first character of an identifier | 
|  | // since the {} unambiguously denotes the range, and identifiers with | 
|  | // invalid names just won't be found later. | 
|  | bool has_non_ident_chars = false; | 
|  | while (*i < size && input[*i] != '}') { | 
|  | has_non_ident_chars |= Tokenizer::IsIdentifierContinuingChar(input[*i]); | 
|  | (*i)++; | 
|  | } | 
|  | if (*i == size) { | 
|  | *err = ErrInsideStringToken(token, dollars_index, *i - dollars_index, | 
|  | "Unterminated ${..."); | 
|  | return false; | 
|  | } | 
|  |  | 
|  | // In the common case, the thing inside the {} will actually be a | 
|  | // simple identifier. Avoid all the complicated parsing of accessors | 
|  | // in this case. | 
|  | if (!has_non_ident_chars) { | 
|  | return AppendInterpolatedIdentifier(scope, token, input, begin_offset, *i, | 
|  | output, err); | 
|  | } | 
|  | return AppendInterpolatedExpression(scope, token, input, begin_offset, *i, | 
|  | output, err); | 
|  | } | 
|  |  | 
|  | // Simple identifier. | 
|  | // The first char of an identifier is more restricted. | 
|  | if (!Tokenizer::IsIdentifierFirstChar(input[*i])) { | 
|  | *err = ErrInsideStringToken(token, dollars_index, *i - dollars_index + 1, | 
|  | "$ not followed by an identifier char.", | 
|  | "It you want a literal $ use \"\\$\"."); | 
|  | return false; | 
|  | } | 
|  | size_t begin_offset = *i; | 
|  | (*i)++; | 
|  |  | 
|  | // Find the first non-identifier char following the string. | 
|  | while (*i < size && Tokenizer::IsIdentifierContinuingChar(input[*i])) | 
|  | (*i)++; | 
|  | size_t end_offset = *i; | 
|  | (*i)--;  // Back up to mark the last character consumed. | 
|  | return AppendInterpolatedIdentifier(scope, token, input, begin_offset, | 
|  | end_offset, output, err); | 
|  | } | 
|  |  | 
|  | // Handles a hex literal: $0xFF | 
|  | // | 
|  | // |*i| is the index into |input| after the $. This will be updated to point to | 
|  | // the last character consumed on success. The token is the original string | 
|  | // to blame on failure. | 
|  | // | 
|  | // On failure, returns false and sets the error. On success, appends the | 
|  | // char with the given hex value to |*output|. | 
|  | bool AppendHexByte(Scope* scope, | 
|  | const Token& token, | 
|  | const char* input, | 
|  | size_t size, | 
|  | size_t* i, | 
|  | std::string* output, | 
|  | Err* err) { | 
|  | size_t dollars_index = *i - 1; | 
|  | // "$0" is already known to exist. | 
|  | if (*i + 3 >= size || input[*i + 1] != 'x' || !std::isxdigit(input[*i + 2]) || | 
|  | !std::isxdigit(input[*i + 3])) { | 
|  | *err = ErrInsideStringToken( | 
|  | token, dollars_index, *i - dollars_index + 1, | 
|  | "Invalid hex character. Hex values must look like 0xFF."); | 
|  | return false; | 
|  | } | 
|  | int value = 0; | 
|  | if (!base::HexStringToInt(base::StringPiece(&input[*i + 2], 2), &value)) { | 
|  | *err = ErrInsideStringToken(token, dollars_index, *i - dollars_index + 1, | 
|  | "Could not convert hex value."); | 
|  | return false; | 
|  | } | 
|  | *i += 3; | 
|  | output->push_back(value); | 
|  | return true; | 
|  | } | 
|  |  | 
|  | }  // namespace | 
|  |  | 
|  | bool ExpandStringLiteral(Scope* scope, | 
|  | const Token& literal, | 
|  | Value* result, | 
|  | Err* err) { | 
|  | DCHECK(literal.type() == Token::STRING); | 
|  | DCHECK(literal.value().size() > 1);       // Should include quotes. | 
|  | DCHECK(result->type() == Value::STRING);  // Should be already set. | 
|  |  | 
|  | // The token includes the surrounding quotes, so strip those off. | 
|  | const char* input = &literal.value().data()[1]; | 
|  | size_t size = literal.value().size() - 2; | 
|  |  | 
|  | std::string& output = result->string_value(); | 
|  | output.reserve(size); | 
|  | for (size_t i = 0; i < size; i++) { | 
|  | if (input[i] == '\\') { | 
|  | if (i < size - 1) { | 
|  | switch (input[i + 1]) { | 
|  | case '\\': | 
|  | case '"': | 
|  | case '$': | 
|  | output.push_back(input[i + 1]); | 
|  | i++; | 
|  | continue; | 
|  | default:  // Everything else has no meaning: pass the literal. | 
|  | break; | 
|  | } | 
|  | } | 
|  | output.push_back(input[i]); | 
|  | } else if (input[i] == '$') { | 
|  | i++; | 
|  | if (i == size) { | 
|  | *err = ErrInsideStringToken( | 
|  | literal, i - 1, 1, "$ at end of string.", | 
|  | "I was expecting an identifier, 0xFF, or {...} after the $."); | 
|  | return false; | 
|  | } | 
|  | if (input[i] == '0') { | 
|  | if (!AppendHexByte(scope, literal, input, size, &i, &output, err)) | 
|  | return false; | 
|  | } else if (!AppendStringInterpolation(scope, literal, input, size, &i, | 
|  | &output, err)) | 
|  | return false; | 
|  | } else { | 
|  | output.push_back(input[i]); | 
|  | } | 
|  | } | 
|  | return true; | 
|  | } | 
|  |  | 
|  | size_t EditDistance(const base::StringPiece& s1, | 
|  | const base::StringPiece& s2, | 
|  | size_t max_edit_distance) { | 
|  | // The algorithm implemented below is the "classic" | 
|  | // dynamic-programming algorithm for computing the Levenshtein | 
|  | // distance, which is described here: | 
|  | // | 
|  | //   http://en.wikipedia.org/wiki/Levenshtein_distance | 
|  | // | 
|  | // Although the algorithm is typically described using an m x n | 
|  | // array, only one row plus one element are used at a time, so this | 
|  | // implementation just keeps one vector for the row.  To update one entry, | 
|  | // only the entries to the left, top, and top-left are needed.  The left | 
|  | // entry is in row[x-1], the top entry is what's in row[x] from the last | 
|  | // iteration, and the top-left entry is stored in previous. | 
|  | size_t m = s1.size(); | 
|  | size_t n = s2.size(); | 
|  |  | 
|  | std::vector<size_t> row(n + 1); | 
|  | for (size_t i = 1; i <= n; ++i) | 
|  | row[i] = i; | 
|  |  | 
|  | for (size_t y = 1; y <= m; ++y) { | 
|  | row[0] = y; | 
|  | size_t best_this_row = row[0]; | 
|  |  | 
|  | size_t previous = y - 1; | 
|  | for (size_t x = 1; x <= n; ++x) { | 
|  | size_t old_row = row[x]; | 
|  | row[x] = std::min(previous + (s1[y - 1] == s2[x - 1] ? 0u : 1u), | 
|  | std::min(row[x - 1], row[x]) + 1u); | 
|  | previous = old_row; | 
|  | best_this_row = std::min(best_this_row, row[x]); | 
|  | } | 
|  |  | 
|  | if (max_edit_distance && best_this_row > max_edit_distance) | 
|  | return max_edit_distance + 1; | 
|  | } | 
|  |  | 
|  | return row[n]; | 
|  | } | 
|  |  | 
|  | base::StringPiece SpellcheckString( | 
|  | const base::StringPiece& text, | 
|  | const std::vector<base::StringPiece>& words) { | 
|  | const size_t kMaxValidEditDistance = 3u; | 
|  |  | 
|  | size_t min_distance = kMaxValidEditDistance + 1u; | 
|  | base::StringPiece result; | 
|  | for (base::StringPiece word : words) { | 
|  | size_t distance = EditDistance(word, text, kMaxValidEditDistance); | 
|  | if (distance < min_distance) { | 
|  | min_distance = distance; | 
|  | result = word; | 
|  | } | 
|  | } | 
|  | return result; | 
|  | } |