src/gn/string_utils.cc - gn.git - Git at Google

 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "gn/string_utils.h"

 #include <stddef.h>
 #include <cctype>

 #include "base/strings/string_number_conversions.h"
 #include "gn/err.h"
 #include "gn/input_file.h"
 #include "gn/parser.h"
 #include "gn/scope.h"
 #include "gn/token.h"
 #include "gn/tokenizer.h"
 #include "gn/value.h"

 namespace {

 // Constructs an Err indicating a range inside a string. We assume that the
 // token has quotes around it that are not counted by the offset.
 Err ErrInsideStringToken(const Token& token,
                          size_t offset,
                          size_t size,
                          const std::string& msg,
                          const std::string& help = std::string()) {
   // The "+1" is skipping over the " at the beginning of the token.
   int int_offset = static_cast<int>(offset);
   Location begin_loc(token.location().file(), token.location().line_number(),
                      token.location().column_number() + int_offset + 1);
   Location end_loc(token.location().file(), token.location().line_number(),
                    token.location().column_number() + int_offset + 1 +
                        static_cast<int>(size));
   return Err(LocationRange(begin_loc, end_loc), msg, help);
 }

 // Notes about expression interpolation. This is based loosly on Dart but is
 // slightly less flexible. In Dart, seeing the ${ in a string is something
 // the toplevel parser knows about, and it will recurse into the block
 // treating it as a first-class {...} block. So even things like this work:
 //   "hello ${"foo}"*2+"bar"}"  =>  "hello foo}foo}bar"
 // (you can see it did not get confused by the nested strings or the nested "}"
 // inside the block).
 //
 // This is cool but complicates the parser for almost no benefit for this
 // non-general-purpose programming language. The main reason expressions are
 // supported here at all are to support "${scope.variable}" and "${list[0]}",
 // neither of which have any of these edge-cases.
 //
 // In this simplified approach, we search for the terminating '}' and execute
 // the result. This means we can't support any expressions with embedded '}'
 // or '"'. To keep people from getting confusing about what's supported and
 // what's not, only identifier and accessor expressions are allowed (neither
 // of these run into any of these edge-cases).
 bool AppendInterpolatedExpression(Scope* scope,
                                   const Token& token,
                                   const char* input,
                                   size_t begin_offset,
                                   size_t end_offset,
                                   std::string* output,
                                   Err* err) {
   SourceFile empty_source_file;  // Prevent most vexing parse.
   InputFile input_file(empty_source_file);
   input_file.SetContents(
       std::string(&input[begin_offset], end_offset - begin_offset));

   // Tokenize.
   std::vector<Token> tokens = Tokenizer::Tokenize(&input_file, err);
   if (err->has_error()) {
     // The error will point into our temporary buffer, rewrite it to refer
     // to the original token. This will make the location information less
     // precise, but generally there won't be complicated things in string
     // interpolations.
     *err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset,
                                 err->message(), err->help_text());
     return false;
   }

   // Parse.
   std::unique_ptr<ParseNode> node = Parser::ParseExpression(tokens, err);
   if (err->has_error()) {
     // Rewrite error as above.
     *err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset,
                                 err->message(), err->help_text());
     return false;
   }
   if (!(node->AsIdentifier() || node->AsAccessor())) {
     *err = ErrInsideStringToken(
         token, begin_offset, end_offset - begin_offset,
         "Invalid string interpolation.",
         "The thing inside the ${} must be an identifier ${foo},\n"
         "a scope access ${foo.bar}, or a list access ${foo[0]}.");
     return false;
   }

   // Evaluate.
   Value result = node->Execute(scope, err);
   if (err->has_error()) {
     // Rewrite error as above.
     *err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset,
                                 err->message(), err->help_text());
     return false;
   }

   output->append(result.ToString(false));
   return true;
 }

 bool AppendInterpolatedIdentifier(Scope* scope,
                                   const Token& token,
                                   const char* input,
                                   size_t begin_offset,
                                   size_t end_offset,
                                   std::string* output,
                                   Err* err) {
   std::string_view identifier(&input[begin_offset], end_offset - begin_offset);
   const Value* value = scope->GetValue(identifier, true);
   if (!value) {
     // We assume the input points inside the token.
     *err = ErrInsideStringToken(
         token, identifier.data() - token.value().data() - 1, identifier.size(),
         "Undefined identifier in string expansion.",
         std::string("\"") + identifier + "\" is not currently in scope.");
     return false;
   }

   output->append(value->ToString(false));
   return true;
 }

 // Handles string interpolations: $identifier and ${expression}
 //
 // |*i| is the index into |input| after the $. This will be updated to point to
 // the last character consumed on success. The token is the original string
 // to blame on failure.
 //
 // On failure, returns false and sets the error. On success, appends the
 // result of the interpolation to |*output|.
 bool AppendStringInterpolation(Scope* scope,
                                const Token& token,
                                const char* input,
                                size_t size,
                                size_t* i,
                                std::string* output,
                                Err* err) {
   size_t dollars_index = *i - 1;

   if (input[*i] == '{') {
     // Bracketed expression.
     (*i)++;
     size_t begin_offset = *i;

     // Find the closing } and check for non-identifier chars. Don't need to
     // bother checking for the more-restricted first character of an identifier
     // since the {} unambiguously denotes the range, and identifiers with
     // invalid names just won't be found later.
     bool has_non_ident_chars = false;
     while (*i < size && input[*i] != '}') {
       has_non_ident_chars |= Tokenizer::IsIdentifierContinuingChar(input[*i]);
       (*i)++;
     }
     if (*i == size) {
       *err = ErrInsideStringToken(token, dollars_index, *i - dollars_index,
                                   "Unterminated ${...");
       return false;
     }

     // In the common case, the thing inside the {} will actually be a
     // simple identifier. Avoid all the complicated parsing of accessors
     // in this case.
     if (!has_non_ident_chars) {
       return AppendInterpolatedIdentifier(scope, token, input, begin_offset, *i,
                                           output, err);
     }
     return AppendInterpolatedExpression(scope, token, input, begin_offset, *i,
                                         output, err);
   }

   // Simple identifier.
   // The first char of an identifier is more restricted.
   if (!Tokenizer::IsIdentifierFirstChar(input[*i])) {
     *err = ErrInsideStringToken(token, dollars_index, *i - dollars_index + 1,
                                 "$ not followed by an identifier char.",
                                 "It you want a literal $ use \"\\$\".");
     return false;
   }
   size_t begin_offset = *i;
   (*i)++;

   // Find the first non-identifier char following the string.
   while (*i < size && Tokenizer::IsIdentifierContinuingChar(input[*i]))
     (*i)++;
   size_t end_offset = *i;
   (*i)--;  // Back up to mark the last character consumed.
   return AppendInterpolatedIdentifier(scope, token, input, begin_offset,
                                       end_offset, output, err);
 }

 // Handles a hex literal: $0xFF
 //
 // |*i| is the index into |input| after the $. This will be updated to point to
 // the last character consumed on success. The token is the original string
 // to blame on failure.
 //
 // On failure, returns false and sets the error. On success, appends the
 // char with the given hex value to |*output|.
 bool AppendHexByte(Scope* scope,
                    const Token& token,
                    const char* input,
                    size_t size,
                    size_t* i,
                    std::string* output,
                    Err* err) {
   size_t dollars_index = *i - 1;
   // "$0" is already known to exist.
   if (*i + 3 >= size || input[*i + 1] != 'x' || !std::isxdigit(input[*i + 2]) ||
       !std::isxdigit(input[*i + 3])) {
     *err = ErrInsideStringToken(
         token, dollars_index, *i - dollars_index + 1,
         "Invalid hex character. Hex values must look like 0xFF.");
     return false;
   }
   int value = 0;
   if (!base::HexStringToInt(std::string_view(&input[*i + 2], 2), &value)) {
     *err = ErrInsideStringToken(token, dollars_index, *i - dollars_index + 1,
                                 "Could not convert hex value.");
     return false;
   }
   *i += 3;
   output->push_back(value);
   return true;
 }

 }  // namespace

 bool ExpandStringLiteral(Scope* scope,
                          const Token& literal,
                          Value* result,
                          Err* err) {
   DCHECK(literal.type() == Token::STRING);
   DCHECK(literal.value().size() > 1);       // Should include quotes.
   DCHECK(result->type() == Value::STRING);  // Should be already set.

   // The token includes the surrounding quotes, so strip those off.
   const char* input = &literal.value().data()[1];
   size_t size = literal.value().size() - 2;

   std::string& output = result->string_value();
   output.reserve(size);
   for (size_t i = 0; i < size; i++) {
     if (input[i] == '\\') {
       if (i < size - 1) {
         switch (input[i + 1]) {
           case '\\':
           case '"':
           case '$':
             output.push_back(input[i + 1]);
             i++;
             continue;
           default:  // Everything else has no meaning: pass the literal.
             break;
         }
       }
       output.push_back(input[i]);
     } else if (input[i] == '$') {
       i++;
       if (i == size) {
         *err = ErrInsideStringToken(
             literal, i - 1, 1, "$ at end of string.",
             "I was expecting an identifier, 0xFF, or {...} after the $.");
         return false;
       }
       if (input[i] == '0') {
         if (!AppendHexByte(scope, literal, input, size, &i, &output, err))
           return false;
       } else if (!AppendStringInterpolation(scope, literal, input, size, &i,
                                             &output, err))
         return false;
     } else {
       output.push_back(input[i]);
     }
   }
   return true;
 }

 size_t EditDistance(std::string_view s1,
                     std::string_view s2,
                     size_t max_edit_distance) {
   // The algorithm implemented below is the "classic"
   // dynamic-programming algorithm for computing the Levenshtein
   // distance, which is described here:
   //
   //   http://en.wikipedia.org/wiki/Levenshtein_distance
   //
   // Although the algorithm is typically described using an m x n
   // array, only one row plus one element are used at a time, so this
   // implementation just keeps one vector for the row.  To update one entry,
   // only the entries to the left, top, and top-left are needed.  The left
   // entry is in row[x-1], the top entry is what's in row[x] from the last
   // iteration, and the top-left entry is stored in previous.
   size_t m = s1.size();
   size_t n = s2.size();

   std::vector<size_t> row(n + 1);
   for (size_t i = 1; i <= n; ++i)
     row[i] = i;

   for (size_t y = 1; y <= m; ++y) {
     row[0] = y;
     size_t best_this_row = row[0];

     size_t previous = y - 1;
     for (size_t x = 1; x <= n; ++x) {
       size_t old_row = row[x];
       row[x] = std::min(previous + (s1[y - 1] == s2[x - 1] ? 0u : 1u),
                         std::min(row[x - 1], row[x]) + 1u);
       previous = old_row;
       best_this_row = std::min(best_this_row, row[x]);
     }

     if (max_edit_distance && best_this_row > max_edit_distance)
       return max_edit_distance + 1;
   }

   return row[n];
 }

 std::string_view SpellcheckString(std::string_view text,
                                   const std::vector<std::string_view>& words) {
   const size_t kMaxValidEditDistance = 3u;

   size_t min_distance = kMaxValidEditDistance + 1u;
   std::string_view result;
   for (std::string_view word : words) {
     size_t distance = EditDistance(word, text, kMaxValidEditDistance);
     if (distance < min_distance) {
       min_distance = distance;
       result = word;
     }
   }
   return result;
 }

 std::string ReadStdin() {
   char buffer[4 << 10];
   std::string result;
   size_t len;
   while ((len = fread(buffer, 1, sizeof(buffer), stdin)) > 0)
     result.append(buffer, len);
   // TODO(thakis): Check ferror(stdin)?
   return result;
 }
	// Copyright (c) 2013 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "gn/string_utils.h"

	#include <stddef.h>
	#include <cctype>

	#include "base/strings/string_number_conversions.h"
	#include "gn/err.h"
	#include "gn/input_file.h"
	#include "gn/parser.h"
	#include "gn/scope.h"
	#include "gn/token.h"
	#include "gn/tokenizer.h"
	#include "gn/value.h"

	namespace {

	// Constructs an Err indicating a range inside a string. We assume that the
	// token has quotes around it that are not counted by the offset.
	Err ErrInsideStringToken(const Token& token,
	size_t offset,
	size_t size,
	const std::string& msg,
	const std::string& help = std::string()) {
	// The "+1" is skipping over the " at the beginning of the token.
	int int_offset = static_cast<int>(offset);
	Location begin_loc(token.location().file(), token.location().line_number(),
	token.location().column_number() + int_offset + 1);
	Location end_loc(token.location().file(), token.location().line_number(),
	token.location().column_number() + int_offset + 1 +
	static_cast<int>(size));
	return Err(LocationRange(begin_loc, end_loc), msg, help);
	}

	// Notes about expression interpolation. This is based loosly on Dart but is
	// slightly less flexible. In Dart, seeing the ${ in a string is something
	// the toplevel parser knows about, and it will recurse into the block
	// treating it as a first-class {...} block. So even things like this work:
	// "hello ${"foo}"*2+"bar"}" => "hello foo}foo}bar"
	// (you can see it did not get confused by the nested strings or the nested "}"
	// inside the block).
	//
	// This is cool but complicates the parser for almost no benefit for this
	// non-general-purpose programming language. The main reason expressions are
	// supported here at all are to support "${scope.variable}" and "${list[0]}",
	// neither of which have any of these edge-cases.
	//
	// In this simplified approach, we search for the terminating '}' and execute
	// the result. This means we can't support any expressions with embedded '}'
	// or '"'. To keep people from getting confusing about what's supported and
	// what's not, only identifier and accessor expressions are allowed (neither
	// of these run into any of these edge-cases).
	bool AppendInterpolatedExpression(Scope* scope,
	const Token& token,
	const char* input,
	size_t begin_offset,
	size_t end_offset,
	std::string* output,
	Err* err) {
	SourceFile empty_source_file; // Prevent most vexing parse.
	InputFile input_file(empty_source_file);
	input_file.SetContents(
	std::string(&input[begin_offset], end_offset - begin_offset));

	// Tokenize.
	std::vector<Token> tokens = Tokenizer::Tokenize(&input_file, err);
	if (err->has_error()) {
	// The error will point into our temporary buffer, rewrite it to refer
	// to the original token. This will make the location information less
	// precise, but generally there won't be complicated things in string
	// interpolations.
	*err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset,
	err->message(), err->help_text());
	return false;
	}

	// Parse.
	std::unique_ptr<ParseNode> node = Parser::ParseExpression(tokens, err);
	if (err->has_error()) {
	// Rewrite error as above.
	*err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset,
	err->message(), err->help_text());
	return false;
	}
	if (!(node->AsIdentifier() \|\| node->AsAccessor())) {
	*err = ErrInsideStringToken(
	token, begin_offset, end_offset - begin_offset,
	"Invalid string interpolation.",
	"The thing inside the ${} must be an identifier ${foo},\n"
	"a scope access ${foo.bar}, or a list access ${foo[0]}.");
	return false;
	}

	// Evaluate.
	Value result = node->Execute(scope, err);
	if (err->has_error()) {
	// Rewrite error as above.
	*err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset,
	err->message(), err->help_text());
	return false;
	}

	output->append(result.ToString(false));
	return true;
	}

	bool AppendInterpolatedIdentifier(Scope* scope,
	const Token& token,
	const char* input,
	size_t begin_offset,
	size_t end_offset,
	std::string* output,
	Err* err) {
	std::string_view identifier(&input[begin_offset], end_offset - begin_offset);
	const Value* value = scope->GetValue(identifier, true);
	if (!value) {
	// We assume the input points inside the token.
	*err = ErrInsideStringToken(
	token, identifier.data() - token.value().data() - 1, identifier.size(),
	"Undefined identifier in string expansion.",
	std::string("\"") + identifier + "\" is not currently in scope.");
	return false;
	}

	output->append(value->ToString(false));
	return true;
	}

	// Handles string interpolations: $identifier and ${expression}
	//
	// \|*i\| is the index into \|input\| after the $. This will be updated to point to
	// the last character consumed on success. The token is the original string
	// to blame on failure.
	//
	// On failure, returns false and sets the error. On success, appends the
	// result of the interpolation to \|*output\|.
	bool AppendStringInterpolation(Scope* scope,
	const Token& token,
	const char* input,
	size_t size,
	size_t* i,
	std::string* output,
	Err* err) {
	size_t dollars_index = *i - 1;

	if (input[*i] == '{') {
	// Bracketed expression.
	(*i)++;
	size_t begin_offset = *i;

	// Find the closing } and check for non-identifier chars. Don't need to
	// bother checking for the more-restricted first character of an identifier
	// since the {} unambiguously denotes the range, and identifiers with
	// invalid names just won't be found later.
	bool has_non_ident_chars = false;
	while (i < size && input[i] != '}') {
	has_non_ident_chars \|= Tokenizer::IsIdentifierContinuingChar(input[*i]);
	(*i)++;
	}
	if (*i == size) {
	err = ErrInsideStringToken(token, dollars_index, i - dollars_index,
	"Unterminated ${...");
	return false;
	}

	// In the common case, the thing inside the {} will actually be a
	// simple identifier. Avoid all the complicated parsing of accessors
	// in this case.
	if (!has_non_ident_chars) {
	return AppendInterpolatedIdentifier(scope, token, input, begin_offset, *i,
	output, err);
	}
	return AppendInterpolatedExpression(scope, token, input, begin_offset, *i,
	output, err);
	}

	// Simple identifier.
	// The first char of an identifier is more restricted.
	if (!Tokenizer::IsIdentifierFirstChar(input[*i])) {
	err = ErrInsideStringToken(token, dollars_index, i - dollars_index + 1,
	"$ not followed by an identifier char.",
	"It you want a literal $ use \"\\$\".");
	return false;
	}
	size_t begin_offset = *i;
	(*i)++;

	// Find the first non-identifier char following the string.
	while (i < size && Tokenizer::IsIdentifierContinuingChar(input[i]))
	(*i)++;
	size_t end_offset = *i;
	(*i)--; // Back up to mark the last character consumed.
	return AppendInterpolatedIdentifier(scope, token, input, begin_offset,
	end_offset, output, err);
	}

	// Handles a hex literal: $0xFF
	//
	// \|*i\| is the index into \|input\| after the $. This will be updated to point to
	// the last character consumed on success. The token is the original string
	// to blame on failure.
	//
	// On failure, returns false and sets the error. On success, appends the
	// char with the given hex value to \|*output\|.
	bool AppendHexByte(Scope* scope,
	const Token& token,
	const char* input,
	size_t size,
	size_t* i,
	std::string* output,
	Err* err) {
	size_t dollars_index = *i - 1;
	// "$0" is already known to exist.
	if (i + 3 >= size \|\| input[i + 1] != 'x' \|\| !std::isxdigit(input[*i + 2]) \|\|
	!std::isxdigit(input[*i + 3])) {
	*err = ErrInsideStringToken(
	token, dollars_index, *i - dollars_index + 1,
	"Invalid hex character. Hex values must look like 0xFF.");
	return false;
	}
	int value = 0;
	if (!base::HexStringToInt(std::string_view(&input[*i + 2], 2), &value)) {
	err = ErrInsideStringToken(token, dollars_index, i - dollars_index + 1,
	"Could not convert hex value.");
	return false;
	}
	*i += 3;
	output->push_back(value);
	return true;
	}

	} // namespace

	bool ExpandStringLiteral(Scope* scope,
	const Token& literal,
	Value* result,
	Err* err) {
	DCHECK(literal.type() == Token::STRING);
	DCHECK(literal.value().size() > 1); // Should include quotes.
	DCHECK(result->type() == Value::STRING); // Should be already set.

	// The token includes the surrounding quotes, so strip those off.
	const char* input = &literal.value().data()[1];
	size_t size = literal.value().size() - 2;

	std::string& output = result->string_value();
	output.reserve(size);
	for (size_t i = 0; i < size; i++) {
	if (input[i] == '\\') {
	if (i < size - 1) {
	switch (input[i + 1]) {
	case '\\':
	case '"':
	case '$':
	output.push_back(input[i + 1]);
	i++;
	continue;
	default: // Everything else has no meaning: pass the literal.
	break;
	}
	}
	output.push_back(input[i]);
	} else if (input[i] == '$') {
	i++;
	if (i == size) {
	*err = ErrInsideStringToken(
	literal, i - 1, 1, "$ at end of string.",
	"I was expecting an identifier, 0xFF, or {...} after the $.");
	return false;
	}
	if (input[i] == '0') {
	if (!AppendHexByte(scope, literal, input, size, &i, &output, err))
	return false;
	} else if (!AppendStringInterpolation(scope, literal, input, size, &i,
	&output, err))
	return false;
	} else {
	output.push_back(input[i]);
	}
	}
	return true;
	}

	size_t EditDistance(std::string_view s1,
	std::string_view s2,
	size_t max_edit_distance) {
	// The algorithm implemented below is the "classic"
	// dynamic-programming algorithm for computing the Levenshtein
	// distance, which is described here:
	//
	// http://en.wikipedia.org/wiki/Levenshtein_distance
	//
	// Although the algorithm is typically described using an m x n
	// array, only one row plus one element are used at a time, so this
	// implementation just keeps one vector for the row. To update one entry,
	// only the entries to the left, top, and top-left are needed. The left
	// entry is in row[x-1], the top entry is what's in row[x] from the last
	// iteration, and the top-left entry is stored in previous.
	size_t m = s1.size();
	size_t n = s2.size();

	std::vector<size_t> row(n + 1);
	for (size_t i = 1; i <= n; ++i)
	row[i] = i;

	for (size_t y = 1; y <= m; ++y) {
	row[0] = y;
	size_t best_this_row = row[0];

	size_t previous = y - 1;
	for (size_t x = 1; x <= n; ++x) {
	size_t old_row = row[x];
	row[x] = std::min(previous + (s1[y - 1] == s2[x - 1] ? 0u : 1u),
	std::min(row[x - 1], row[x]) + 1u);
	previous = old_row;
	best_this_row = std::min(best_this_row, row[x]);
	}

	if (max_edit_distance && best_this_row > max_edit_distance)
	return max_edit_distance + 1;
	}

	return row[n];
	}

	std::string_view SpellcheckString(std::string_view text,
	const std::vector<std::string_view>& words) {
	const size_t kMaxValidEditDistance = 3u;

	size_t min_distance = kMaxValidEditDistance + 1u;
	std::string_view result;
	for (std::string_view word : words) {
	size_t distance = EditDistance(word, text, kMaxValidEditDistance);
	if (distance < min_distance) {
	min_distance = distance;
	result = word;
	}
	}
	return result;
	}

	std::string ReadStdin() {
	char buffer[4 << 10];
	std::string result;
	size_t len;
	while ((len = fread(buffer, 1, sizeof(buffer), stdin)) > 0)
	result.append(buffer, len);
	// TODO(thakis): Check ferror(stdin)?
	return result;
	}