// tools/gn/tokenizer.cc - gn - Git at Google

 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "tools/gn/tokenizer.h"

 #include "base/logging.h"
 #include "base/strings/string_util.h"
 #include "tools/gn/input_file.h"

 namespace {

 // True when |c| may start a two-character operator such as "<=" or "&&".
 bool CouldBeTwoCharOperatorBegin(char c) {
   switch (c) {
     case '<':
     case '>':
     case '!':
     case '=':
     case '-':
     case '+':
     case '|':
     case '&':
       return true;
     default:
       return false;
   }
 }

 // True when |c| may be the second character of a two-character operator.
 bool CouldBeTwoCharOperatorEnd(char c) {
   switch (c) {
     case '=':
     case '|':
     case '&':
       return true;
     default:
       return false;
   }
 }

 // True when |c| is accepted as a one-character operator by the scanner. Note
 // that ':' is accepted here even though GetSpecificOperatorType() has no
 // entry for it.
 bool CouldBeOneCharOperator(char c) {
   switch (c) {
     case '=':
     case '<':
     case '>':
     case '+':
     case '!':
     case ':':
     case '|':
     case '&':
     case '-':
       return true;
     default:
       return false;
   }
 }

 // True when |c| can begin any operator, one or two characters long.
 bool CouldBeOperator(char c) {
   if (CouldBeOneCharOperator(c))
     return true;
   return CouldBeTwoCharOperatorBegin(c);
 }

 // True for the bracket, paren and brace characters.
 bool IsScoperChar(char c) {
   switch (c) {
     case '(':
     case ')':
     case '[':
     case ']':
     case '{':
     case '}':
       return true;
     default:
       return false;
   }
 }

 // Maps the exact text of an operator to its token type. Text that is not one
 // of the known spellings maps to Token::INVALID.
 Token::Type GetSpecificOperatorType(std::string_view value) {
   struct OperatorSpelling {
     const char* text;
     Token::Type type;
   };
   static const OperatorSpelling kSpellings[] = {
       {"=", Token::EQUAL},
       {"+", Token::PLUS},
       {"-", Token::MINUS},
       {"+=", Token::PLUS_EQUALS},
       {"-=", Token::MINUS_EQUALS},
       {"==", Token::EQUAL_EQUAL},
       {"!=", Token::NOT_EQUAL},
       {"<=", Token::LESS_EQUAL},
       {">=", Token::GREATER_EQUAL},
       {"<", Token::LESS_THAN},
       {">", Token::GREATER_THAN},
       {"&&", Token::BOOLEAN_AND},
       {"||", Token::BOOLEAN_OR},
       {"!", Token::BANG},
       {".", Token::DOT},
   };
   for (const OperatorSpelling& spelling : kSpellings) {
     if (value == spelling.text)
       return spelling.type;
   }
   return Token::INVALID;
 }

 }  // namespace

 // Sets up a tokenizer over |input_file|. The file pointer and |err| are
 // stored as given, and |input_| is initialized from input_file->contents().
 // |input_file| is dereferenced here, so it must not be null.
 Tokenizer::Tokenizer(const InputFile* input_file, Err* err)
     : input_file_(input_file), input_(input_file->contents()), err_(err) {}

 // Uses the compiler-generated destructor.
 Tokenizer::~Tokenizer() = default;

 // static
 std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) {
   Tokenizer t(input_file, err);
   return t.Run();
 }

 // Scans the input from the current position and returns the tokens found.
 // If an error has been recorded in |*err_| by the time the loop exits, the
 // accumulated tokens are discarded and an empty vector is returned.
 std::vector<Token> Tokenizer::Run() {
   DCHECK(tokens_.empty());
   while (!done()) {
     AdvanceToNextToken();
     if (done())
       break;
     // Capture the token's start before any of it is consumed.
     Location location = GetCurrentLocation();

     Token::Type type = ClassifyCurrent();
     if (type == Token::INVALID) {
       *err_ = GetErrorForInvalidToken(location);
       break;
     }
     size_t token_begin = cur_;
     AdvanceToEndOfToken(location, type);
     if (has_error())
       break;
     size_t token_end = cur_;

     // The token text is a view into the input buffer, not a copy.
     std::string_view token_value(&input_.data()[token_begin],
                                  token_end - token_begin);

     if (type == Token::UNCLASSIFIED_OPERATOR) {
       // NOTE(review): this can yield Token::INVALID (e.g. for a lone ':'),
       // and the token is still appended below without setting an error.
       type = GetSpecificOperatorType(token_value);
     } else if (type == Token::IDENTIFIER) {
       // Promote the reserved words to their dedicated token types.
       if (token_value == "if")
         type = Token::IF;
       else if (token_value == "else")
         type = Token::ELSE;
       else if (token_value == "true")
         type = Token::TRUE_TOKEN;
       else if (token_value == "false")
         type = Token::FALSE_TOKEN;
     } else if (type == Token::UNCLASSIFIED_COMMENT) {
       if (AtStartOfLine(token_begin) &&
           // If it's a standalone comment, but is a continuation of a comment on
           // a previous line, then instead make it a continued suffix comment.
           // "Continuation" means the previous token is a suffix comment on
           // the immediately preceding line starting at the same column.
           (tokens_.empty() || tokens_.back().type() != Token::SUFFIX_COMMENT ||
            tokens_.back().location().line_number() + 1 !=
                location.line_number() ||
            tokens_.back().location().column_number() !=
                location.column_number())) {
         type = Token::LINE_COMMENT;
         if (!at_end())  // Could be EOF.
           Advance();    // The current \n.
         // If this comment is separated from the next syntax element, then we
         // want to tag it as a block comment. This will become a standalone
         // statement at the parser level to keep this comment separate, rather
         // than attached to the subsequent statement.
         // |token_value| was captured above, so the whitespace consumed here
         // is not part of the token text.
         while (!at_end() && IsCurrentWhitespace()) {
           if (IsCurrentNewline()) {
             type = Token::BLOCK_COMMENT;
             break;
           }
           Advance();
         }
       } else {
         type = Token::SUFFIX_COMMENT;
       }
     }

     tokens_.push_back(Token(location, type, token_value));
   }
   // A failed scan never exposes partial results.
   if (err_->has_error())
     tokens_.clear();
   return tokens_;
 }

 // static
 // Returns the byte offset in |buf| at which the 1-based line |n| begins.
 // Line 1 always starts at offset 0. If |buf| has fewer than |n| lines the
 // result is static_cast<size_t>(-1).
 size_t Tokenizer::ByteOffsetOfNthLine(const std::string_view& buf, int n) {
   DCHECK_GT(n, 0);

   if (n == 1)
     return 0;

   int lines_seen = 1;
   for (size_t offset = 0; offset < buf.size(); ++offset) {
     if (!IsNewline(buf, offset))
       continue;
     // Each newline ends a line; the next line starts right after it.
     if (++lines_seen == n)
       return offset + 1;
   }
   return static_cast<size_t>(-1);
 }

 // static
 // Returns true if the byte at |offset| in |buffer| is a line feed ('\n').
 // |offset| must lie inside |buffer|; this is enforced only by a DCHECK.
 bool Tokenizer::IsNewline(const std::string_view& buffer, size_t offset) {
   DCHECK(offset < buffer.size());
   // We may need more logic here to handle different line ending styles.
   // For now only '\n' counts; a '\r' is not reported as a newline.
   return buffer[offset] == '\n';
 }

 // static
 // Returns true if |c| may start an identifier: an ASCII letter or '_'.
 bool Tokenizer::IsIdentifierFirstChar(char c) {
   return base::IsAsciiAlpha(c) || c == '_';
 }

 // static
 // Returns true if |c| may appear after the first character of an identifier:
 // anything IsIdentifierFirstChar() accepts, or an ASCII digit.
 bool Tokenizer::IsIdentifierContinuingChar(char c) {
   // Also allow digits after the first char.
   return IsIdentifierFirstChar(c) || base::IsAsciiDigit(c);
 }

 // Skips over whitespace, leaving the cursor on the first non-whitespace
 // character or at the end of the input.
 void Tokenizer::AdvanceToNextToken() {
   for (;;) {
     if (at_end() || !IsCurrentWhitespace())
       return;
     Advance();
   }
 }

 // Decides what kind of token begins at the cursor by looking at the current
 // character (and, for '-', the one after it). Nothing is consumed. Returns
 // Token::INVALID when the character cannot begin any token.
 Token::Type Tokenizer::ClassifyCurrent() const {
   DCHECK(!at_end());
   const char c = cur_char();

   if (base::IsAsciiDigit(c))
     return Token::INTEGER;

   switch (c) {
     case '"':
       return Token::STRING;

     case '-':
       // A minus directly followed by a digit starts a negative number; any
       // other minus (including one at end of file) is an operator.
       if (CanIncrement() && base::IsAsciiDigit(input_[cur_ + 1]))
         return Token::INTEGER;
       return Token::UNCLASSIFIED_OPERATOR;

     case '[':
       return Token::LEFT_BRACKET;
     case ']':
       return Token::RIGHT_BRACKET;
     case '(':
       return Token::LEFT_PAREN;
     case ')':
       return Token::RIGHT_PAREN;
     case '{':
       return Token::LEFT_BRACE;
     case '}':
       return Token::RIGHT_BRACE;

     case '.':
       return Token::DOT;
     case ',':
       return Token::COMMA;

     case '#':
       return Token::UNCLASSIFIED_COMMENT;

     default:
       break;
   }

   // None of the characters handled above is an operator or identifier
   // character, so checking these last does not change the outcome.
   if (CouldBeOperator(c))
     return Token::UNCLASSIFIED_OPERATOR;
   if (IsIdentifierFirstChar(c))
     return Token::IDENTIFIER;

   return Token::INVALID;
 }

 // Moves the cursor from the first character of a token of the given |type|
 // to just past its last character. |location| is where the token started and
 // is used only for error reporting. Problems are written to |*err_|.
 void Tokenizer::AdvanceToEndOfToken(const Location& location,
                                     Token::Type type) {
   switch (type) {
     case Token::INTEGER: {
       // The first character (a digit or a leading '-') is always consumed,
       // followed by any further digits.
       Advance();
       while (!at_end() && base::IsAsciiDigit(cur_char()))
         Advance();
       if (at_end())
         break;

       // Require the char after a number to be some kind of space, scope,
       // operator, or comma.
       const char after = cur_char();
       const bool valid_follower = IsCurrentWhitespace() ||
                                   CouldBeOperator(after) ||
                                   IsScoperChar(after) || after == ',';
       if (!valid_follower) {
         *err_ = Err(GetCurrentLocation(), "This is not a valid number.");
         // Highlight the number.
         err_->AppendRange(LocationRange(location, GetCurrentLocation()));
       }
       break;
     }

     case Token::STRING: {
       const char quote = cur_char();
       Advance();  // Step over the opening quote.
       while (true) {
         if (at_end()) {
           *err_ = Err(LocationRange(location, GetCurrentLocation()),
                       "Unterminated string literal.",
                       "Don't leave me hanging like this!");
           break;
         }
         if (IsCurrentStringTerminator(quote)) {
           Advance();  // Step over the closing quote.
           break;
         }
         // A newline is recorded as an error, but scanning continues until
         // the terminator or the end of the input.
         if (IsCurrentNewline()) {
           *err_ = Err(LocationRange(location, GetCurrentLocation()),
                       "Newline in string constant.");
         }
         Advance();
       }
       break;
     }

     case Token::UNCLASSIFIED_OPERATOR: {
       // Some operators are two characters, some are one.
       const bool is_two_chars = CouldBeTwoCharOperatorBegin(cur_char()) &&
                                 CanIncrement() &&
                                 CouldBeTwoCharOperatorEnd(input_[cur_ + 1]);
       if (is_two_chars)
         Advance();
       Advance();
       break;
     }

     case Token::IDENTIFIER:
       while (!at_end() && IsIdentifierContinuingChar(cur_char()))
         Advance();
       break;

     // Every one of these is exactly one character long.
     case Token::LEFT_BRACKET:
     case Token::RIGHT_BRACKET:
     case Token::LEFT_BRACE:
     case Token::RIGHT_BRACE:
     case Token::LEFT_PAREN:
     case Token::RIGHT_PAREN:
     case Token::DOT:
     case Token::COMMA:
       Advance();
       break;

     case Token::UNCLASSIFIED_COMMENT:
       // A comment runs to the end of the line; the newline is not consumed.
       while (!at_end() && !IsCurrentNewline())
         Advance();
       break;

     case Token::INVALID:
     default:
       *err_ = Err(location, "Everything is all messed up",
                   "Please insert system disk in drive A: and press any key.");
       NOTREACHED();
       return;
   }
 }

 // Returns true when only spaces (or nothing) separate byte offset |location|
 // from the preceding '\n' or from the beginning of the input.
 bool Tokenizer::AtStartOfLine(size_t location) const {
   for (size_t i = location; i > 0; --i) {
     const char prev = input_[i - 1];
     if (prev == '\n')
       return true;
     if (prev != ' ')
       return false;
   }
   // Reached the beginning of the input without seeing anything but spaces.
   return true;
 }

 // Returns true if the character at the cursor is line feed, carriage return
 // or space. The cursor must not be at the end of the input.
 bool Tokenizer::IsCurrentWhitespace() const {
   DCHECK(!at_end());
   switch (input_[cur_]) {
     case 0x0A:  // Line feed.
     case 0x0D:  // Carriage return.
     case 0x20:  // Space.
       return true;
     default:
       // Note that tab (0x09), vertical tab (0x0B), and formfeed (0x0C) are
       // illegal.
       return false;
   }
 }

 // Returns true if the character at the cursor is |quote_char| and is not
 // escaped by a preceding backslash. The cursor must not be at the end of the
 // input.
 bool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
   DCHECK(!at_end());
   if (cur_char() != quote_char)
     return false;

   // Check for escaping. \" is not a string terminator, but \\" is. Count
   // the number of preceding backslashes. The walk stays in size_t so that
   // offsets beyond the range of int are not truncated.
   size_t num_backslashes = 0;
   for (size_t i = cur_; i > 0 && input_[i - 1] == '\\'; --i)
     ++num_backslashes;

   // Even backslashes mean that they were escaping each other and don't count
   // as escaping this quote.
   return (num_backslashes % 2) == 0;
 }

 // Returns whether the character at the cursor is a newline, as decided by
 // IsNewline(). IsNewline() DCHECKs that the offset is inside the buffer, so
 // this must not be called at the end of the input.
 bool Tokenizer::IsCurrentNewline() const {
   return IsNewline(input_, cur_);
 }

 // Consumes one character and keeps the line/column bookkeeping in step:
 // passing a newline starts a new line at column 1, anything else moves one
 // column to the right.
 void Tokenizer::Advance() {
   DCHECK(cur_ < input_.size());
   const bool crossing_newline = IsCurrentNewline();
   ++cur_;
   if (!crossing_newline) {
     ++column_number_;
     return;
   }
   ++line_number_;
   column_number_ = 1;
 }

 // Builds a Location for the cursor from the input file, the tracked line and
 // column numbers, and the current byte offset. The byte offset is converted
 // from size_t to int.
 Location Tokenizer::GetCurrentLocation() const {
   return Location(input_file_, line_number_, column_number_,
                   static_cast<int>(cur_));
 }

 // Builds the "Invalid token." error for the character at the cursor, with a
 // help message tailored to a few common mistakes (semicolons, tabs, C-style
 // comments, apostrophe-quoted strings). |location| is where the error is
 // reported.
 Err Tokenizer::GetErrorForInvalidToken(const Location& location) const {
   // Fallback text, used unless one of the recognized mistakes matches.
   std::string help = "I have no idea what this is.";

   switch (cur_char()) {
     case ';':
       help = "Semicolons are not needed, delete this one.";
       break;

     case '\t':
       help =
           "You got a tab character in here. Tabs are evil. "
           "Convert to spaces.";
       break;

     case '/':
       // Only "//" and "/*" look like comments from other languages.
       if (cur_ + 1 < input_.size() &&
           (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) {
         help = "Comments should start with # instead";
       }
       break;

     case '\'':
       help = "Strings are delimited by \" characters, not apostrophes.";
       break;

     default:
       break;
   }

   return Err(location, "Invalid token.", help);
 }
	// Copyright (c) 2013 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "tools/gn/tokenizer.h"

	#include "base/logging.h"
	#include "base/strings/string_util.h"
	#include "tools/gn/input_file.h"

	namespace {

	bool CouldBeTwoCharOperatorBegin(char c) {
	return c == '<' \|\| c == '>' \|\| c == '!' \|\| c == '=' \|\| c == '-' \|\| c == '+' \|\|
	c == '\|' \|\| c == '&';
	}

	bool CouldBeTwoCharOperatorEnd(char c) {
	return c == '=' \|\| c == '\|' \|\| c == '&';
	}

	bool CouldBeOneCharOperator(char c) {
	return c == '=' \|\| c == '<' \|\| c == '>' \|\| c == '+' \|\| c == '!' \|\| c == ':' \|\|
	c == '\|' \|\| c == '&' \|\| c == '-';
	}

	bool CouldBeOperator(char c) {
	return CouldBeOneCharOperator(c) \|\| CouldBeTwoCharOperatorBegin(c);
	}

	bool IsScoperChar(char c) {
	return c == '(' \|\| c == ')' \|\| c == '[' \|\| c == ']' \|\| c == '{' \|\| c == '}';
	}

	Token::Type GetSpecificOperatorType(std::string_view value) {
	if (value == "=")
	return Token::EQUAL;
	if (value == "+")
	return Token::PLUS;
	if (value == "-")
	return Token::MINUS;
	if (value == "+=")
	return Token::PLUS_EQUALS;
	if (value == "-=")
	return Token::MINUS_EQUALS;
	if (value == "==")
	return Token::EQUAL_EQUAL;
	if (value == "!=")
	return Token::NOT_EQUAL;
	if (value == "<=")
	return Token::LESS_EQUAL;
	if (value == ">=")
	return Token::GREATER_EQUAL;
	if (value == "<")
	return Token::LESS_THAN;
	if (value == ">")
	return Token::GREATER_THAN;
	if (value == "&&")
	return Token::BOOLEAN_AND;
	if (value == "\|\|")
	return Token::BOOLEAN_OR;
	if (value == "!")
	return Token::BANG;
	if (value == ".")
	return Token::DOT;
	return Token::INVALID;
	}

	} // namespace

	// Sets up a tokenizer over |input_file|. The file pointer and |err| are
	// stored as given, and |input_| is initialized from input_file->contents().
	// |input_file| is dereferenced here, so it must not be null.
	Tokenizer::Tokenizer(const InputFile* input_file, Err* err)
	: input_file_(input_file), input_(input_file->contents()), err_(err) {}

	// Uses the compiler-generated destructor.
	Tokenizer::~Tokenizer() = default;

	// static
	std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) {
	Tokenizer t(input_file, err);
	return t.Run();
	}

	// Scans the input from the current position and returns the tokens found.
	// If an error has been recorded in |*err_| by the time the loop exits, the
	// accumulated tokens are discarded and an empty vector is returned.
	std::vector<Token> Tokenizer::Run() {
	  DCHECK(tokens_.empty());
	  while (!done()) {
	    AdvanceToNextToken();
	    if (done())
	      break;
	    // Capture the token's start before any of it is consumed.
	    Location location = GetCurrentLocation();

	    Token::Type type = ClassifyCurrent();
	    if (type == Token::INVALID) {
	      *err_ = GetErrorForInvalidToken(location);
	      break;
	    }
	    size_t token_begin = cur_;
	    AdvanceToEndOfToken(location, type);
	    if (has_error())
	      break;
	    size_t token_end = cur_;

	    // The token text is a view into the input buffer, not a copy.
	    std::string_view token_value(&input_.data()[token_begin],
	                                 token_end - token_begin);

	    if (type == Token::UNCLASSIFIED_OPERATOR) {
	      // NOTE(review): this can yield Token::INVALID (e.g. for a lone ':'),
	      // and the token is still appended below without setting an error.
	      type = GetSpecificOperatorType(token_value);
	    } else if (type == Token::IDENTIFIER) {
	      // Promote the reserved words to their dedicated token types.
	      if (token_value == "if")
	        type = Token::IF;
	      else if (token_value == "else")
	        type = Token::ELSE;
	      else if (token_value == "true")
	        type = Token::TRUE_TOKEN;
	      else if (token_value == "false")
	        type = Token::FALSE_TOKEN;
	    } else if (type == Token::UNCLASSIFIED_COMMENT) {
	      if (AtStartOfLine(token_begin) &&
	          // If it's a standalone comment, but is a continuation of a comment
	          // on a previous line, then instead make it a continued suffix
	          // comment. "Continuation" means the previous token is a suffix
	          // comment on the immediately preceding line starting at the same
	          // column.
	          (tokens_.empty() || tokens_.back().type() != Token::SUFFIX_COMMENT ||
	           tokens_.back().location().line_number() + 1 !=
	               location.line_number() ||
	           tokens_.back().location().column_number() !=
	               location.column_number())) {
	        type = Token::LINE_COMMENT;
	        if (!at_end())  // Could be EOF.
	          Advance();    // The current \n.
	        // If this comment is separated from the next syntax element, then we
	        // want to tag it as a block comment. This will become a standalone
	        // statement at the parser level to keep this comment separate, rather
	        // than attached to the subsequent statement.
	        // |token_value| was captured above, so the whitespace consumed here
	        // is not part of the token text.
	        while (!at_end() && IsCurrentWhitespace()) {
	          if (IsCurrentNewline()) {
	            type = Token::BLOCK_COMMENT;
	            break;
	          }
	          Advance();
	        }
	      } else {
	        type = Token::SUFFIX_COMMENT;
	      }
	    }

	    tokens_.push_back(Token(location, type, token_value));
	  }
	  // A failed scan never exposes partial results.
	  if (err_->has_error())
	    tokens_.clear();
	  return tokens_;
	}

	// static
	// Returns the byte offset in |buf| at which the 1-based line |n| begins.
	// Line 1 always starts at offset 0. If |buf| has fewer than |n| lines the
	// result is static_cast<size_t>(-1).
	size_t Tokenizer::ByteOffsetOfNthLine(const std::string_view& buf, int n) {
	  DCHECK_GT(n, 0);

	  if (n == 1)
	    return 0;

	  int lines_seen = 1;
	  for (size_t offset = 0; offset < buf.size(); ++offset) {
	    if (!IsNewline(buf, offset))
	      continue;
	    // Each newline ends a line; the next line starts right after it.
	    if (++lines_seen == n)
	      return offset + 1;
	  }
	  return static_cast<size_t>(-1);
	}

	// static
	// Returns true if the byte at |offset| in |buffer| is a line feed ('\n').
	// |offset| must lie inside |buffer|; this is enforced only by a DCHECK.
	bool Tokenizer::IsNewline(const std::string_view& buffer, size_t offset) {
	DCHECK(offset < buffer.size());
	// We may need more logic here to handle different line ending styles.
	// For now only '\n' counts; a '\r' is not reported as a newline.
	return buffer[offset] == '\n';
	}

	// static
	// Returns true if |c| may start an identifier: an ASCII letter or '_'.
	bool Tokenizer::IsIdentifierFirstChar(char c) {
	  return base::IsAsciiAlpha(c) || c == '_';
	}

	// static
	// Returns true if |c| may appear after the first character of an
	// identifier: anything IsIdentifierFirstChar() accepts, or an ASCII digit.
	bool Tokenizer::IsIdentifierContinuingChar(char c) {
	  // Also allow digits after the first char.
	  return IsIdentifierFirstChar(c) || base::IsAsciiDigit(c);
	}

	// Skips over whitespace, leaving the cursor on the first non-whitespace
	// character or at the end of the input.
	void Tokenizer::AdvanceToNextToken() {
	  for (;;) {
	    if (at_end() || !IsCurrentWhitespace())
	      return;
	    Advance();
	  }
	}

	// Decides what kind of token begins at the cursor by looking at the current
	// character (and, for '-', the one after it). Nothing is consumed. Returns
	// Token::INVALID when the character cannot begin any token.
	Token::Type Tokenizer::ClassifyCurrent() const {
	  DCHECK(!at_end());
	  const char c = cur_char();

	  if (base::IsAsciiDigit(c))
	    return Token::INTEGER;

	  switch (c) {
	    case '"':
	      return Token::STRING;

	    case '-':
	      // A minus directly followed by a digit starts a negative number; any
	      // other minus (including one at end of file) is an operator.
	      if (CanIncrement() && base::IsAsciiDigit(input_[cur_ + 1]))
	        return Token::INTEGER;
	      return Token::UNCLASSIFIED_OPERATOR;

	    case '[':
	      return Token::LEFT_BRACKET;
	    case ']':
	      return Token::RIGHT_BRACKET;
	    case '(':
	      return Token::LEFT_PAREN;
	    case ')':
	      return Token::RIGHT_PAREN;
	    case '{':
	      return Token::LEFT_BRACE;
	    case '}':
	      return Token::RIGHT_BRACE;

	    case '.':
	      return Token::DOT;
	    case ',':
	      return Token::COMMA;

	    case '#':
	      return Token::UNCLASSIFIED_COMMENT;

	    default:
	      break;
	  }

	  // None of the characters handled above is an operator or identifier
	  // character, so checking these last does not change the outcome.
	  if (CouldBeOperator(c))
	    return Token::UNCLASSIFIED_OPERATOR;
	  if (IsIdentifierFirstChar(c))
	    return Token::IDENTIFIER;

	  return Token::INVALID;
	}

	// Moves the cursor from the first character of a token of the given |type|
	// to just past its last character. |location| is where the token started
	// and is used only for error reporting. Problems are written to |*err_|.
	void Tokenizer::AdvanceToEndOfToken(const Location& location,
	                                    Token::Type type) {
	  switch (type) {
	    case Token::INTEGER: {
	      // The first character (a digit or a leading '-') is always consumed,
	      // followed by any further digits.
	      Advance();
	      while (!at_end() && base::IsAsciiDigit(cur_char()))
	        Advance();
	      if (at_end())
	        break;

	      // Require the char after a number to be some kind of space, scope,
	      // operator, or comma.
	      const char after = cur_char();
	      const bool valid_follower = IsCurrentWhitespace() ||
	                                  CouldBeOperator(after) ||
	                                  IsScoperChar(after) || after == ',';
	      if (!valid_follower) {
	        *err_ = Err(GetCurrentLocation(), "This is not a valid number.");
	        // Highlight the number.
	        err_->AppendRange(LocationRange(location, GetCurrentLocation()));
	      }
	      break;
	    }

	    case Token::STRING: {
	      const char quote = cur_char();
	      Advance();  // Step over the opening quote.
	      while (true) {
	        if (at_end()) {
	          *err_ = Err(LocationRange(location, GetCurrentLocation()),
	                      "Unterminated string literal.",
	                      "Don't leave me hanging like this!");
	          break;
	        }
	        if (IsCurrentStringTerminator(quote)) {
	          Advance();  // Step over the closing quote.
	          break;
	        }
	        // A newline is recorded as an error, but scanning continues until
	        // the terminator or the end of the input.
	        if (IsCurrentNewline()) {
	          *err_ = Err(LocationRange(location, GetCurrentLocation()),
	                      "Newline in string constant.");
	        }
	        Advance();
	      }
	      break;
	    }

	    case Token::UNCLASSIFIED_OPERATOR: {
	      // Some operators are two characters, some are one.
	      const bool is_two_chars = CouldBeTwoCharOperatorBegin(cur_char()) &&
	                                CanIncrement() &&
	                                CouldBeTwoCharOperatorEnd(input_[cur_ + 1]);
	      if (is_two_chars)
	        Advance();
	      Advance();
	      break;
	    }

	    case Token::IDENTIFIER:
	      while (!at_end() && IsIdentifierContinuingChar(cur_char()))
	        Advance();
	      break;

	    // Every one of these is exactly one character long.
	    case Token::LEFT_BRACKET:
	    case Token::RIGHT_BRACKET:
	    case Token::LEFT_BRACE:
	    case Token::RIGHT_BRACE:
	    case Token::LEFT_PAREN:
	    case Token::RIGHT_PAREN:
	    case Token::DOT:
	    case Token::COMMA:
	      Advance();
	      break;

	    case Token::UNCLASSIFIED_COMMENT:
	      // A comment runs to the end of the line; the newline is not consumed.
	      while (!at_end() && !IsCurrentNewline())
	        Advance();
	      break;

	    case Token::INVALID:
	    default:
	      *err_ = Err(location, "Everything is all messed up",
	                  "Please insert system disk in drive A: and press any key.");
	      NOTREACHED();
	      return;
	  }
	}

	// Returns true when only spaces (or nothing) separate byte offset
	// |location| from the preceding '\n' or from the beginning of the input.
	bool Tokenizer::AtStartOfLine(size_t location) const {
	  for (size_t i = location; i > 0; --i) {
	    const char prev = input_[i - 1];
	    if (prev == '\n')
	      return true;
	    if (prev != ' ')
	      return false;
	  }
	  // Reached the beginning of the input without seeing anything but spaces.
	  return true;
	}

	// Returns true if the character at the cursor is line feed (0x0A), carriage
	// return (0x0D) or space (0x20). The cursor must not be at the end of the
	// input.
	bool Tokenizer::IsCurrentWhitespace() const {
	  DCHECK(!at_end());
	  char c = input_[cur_];
	  // Note that tab (0x09), vertical tab (0x0B), and formfeed (0x0C) are
	  // illegal.
	  return c == 0x0A || c == 0x0D || c == 0x20;
	}

	// Returns true if the character at the cursor is |quote_char| and is not
	// escaped by a preceding backslash. The cursor must not be at the end of
	// the input.
	bool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
	  DCHECK(!at_end());
	  if (cur_char() != quote_char)
	    return false;

	  // Check for escaping. \" is not a string terminator, but \\" is. Count
	  // the number of preceding backslashes. The walk stays in size_t so that
	  // offsets beyond the range of int are not truncated.
	  size_t num_backslashes = 0;
	  for (size_t i = cur_; i > 0 && input_[i - 1] == '\\'; --i)
	    ++num_backslashes;

	  // Even backslashes mean that they were escaping each other and don't
	  // count as escaping this quote.
	  return (num_backslashes % 2) == 0;
	}

	// Returns whether the character at the cursor is a newline, as decided by
	// IsNewline(). IsNewline() DCHECKs that the offset is inside the buffer, so
	// this must not be called at the end of the input.
	bool Tokenizer::IsCurrentNewline() const {
	return IsNewline(input_, cur_);
	}

	// Consumes one character and keeps the line/column bookkeeping in step:
	// passing a newline starts a new line at column 1, anything else moves one
	// column to the right.
	void Tokenizer::Advance() {
	  DCHECK(cur_ < input_.size());
	  const bool crossing_newline = IsCurrentNewline();
	  ++cur_;
	  if (!crossing_newline) {
	    ++column_number_;
	    return;
	  }
	  ++line_number_;
	  column_number_ = 1;
	}

	// Builds a Location for the cursor from the input file, the tracked line and
	// column numbers, and the current byte offset. The byte offset is converted
	// from size_t to int.
	Location Tokenizer::GetCurrentLocation() const {
	return Location(input_file_, line_number_, column_number_,
	static_cast<int>(cur_));
	}

	// Builds the "Invalid token." error for the character at the cursor, with a
	// help message tailored to a few common mistakes (semicolons, tabs, C-style
	// comments, apostrophe-quoted strings). |location| is where the error is
	// reported.
	Err Tokenizer::GetErrorForInvalidToken(const Location& location) const {
	  std::string help;
	  if (cur_char() == ';') {
	    // Semicolon.
	    help = "Semicolons are not needed, delete this one.";
	  } else if (cur_char() == '\t') {
	    // Tab.
	    help =
	        "You got a tab character in here. Tabs are evil. "
	        "Convert to spaces.";
	  } else if (cur_char() == '/' && cur_ + 1 < input_.size() &&
	             (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) {
	    // Different types of comments.
	    help = "Comments should start with # instead";
	  } else if (cur_char() == '\'') {
	    help = "Strings are delimited by \" characters, not apostrophes.";
	  } else {
	    help = "I have no idea what this is.";
	  }

	  return Err(location, "Invalid token.", help);
	}