// tools/gn/tokenizer.cc - gn - Git at Google

 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "tools/gn/tokenizer.h"

 #include "base/logging.h"
 #include "tools/gn/input_file.h"

 namespace {

 // Returns true when |c| is an ASCII decimal digit ('0' through '9').
 bool IsNumberChar(char c) {
   return '0' <= c && c <= '9';
 }

 // Returns true when |c| can be the first character of a two-character
 // operator such as "<=", "+=", "&&" or "||".
 bool CouldBeTwoCharOperatorBegin(char c) {
   switch (c) {
     case '<':
     case '>':
     case '!':
     case '=':
     case '-':
     case '+':
     case '|':
     case '&':
       return true;
     default:
       return false;
   }
 }

 // Returns true when |c| can be the second character of a two-character
 // operator, i.e. it is '=', '|' or '&'.
 bool CouldBeTwoCharOperatorEnd(char c) {
   switch (c) {
     case '=':
     case '|':
     case '&':
       return true;
     default:
       return false;
   }
 }

 // Returns true when |c| on its own is treated as a candidate single-character
 // operator by the tokenizer.
 bool CouldBeOneCharOperator(char c) {
   switch (c) {
     case '=':
     case '<':
     case '>':
     case '+':
     case '!':
     case ':':
     case '|':
     case '&':
     case '-':
       return true;
     default:
       return false;
   }
 }

 // Returns true when |c| can start an operator token, whether that operator
 // turns out to be one or two characters long.
 bool CouldBeOperator(char c) {
   if (CouldBeOneCharOperator(c))
     return true;
   return CouldBeTwoCharOperatorBegin(c);
 }

 // Returns true when |c| is an opening or closing parenthesis, square bracket
 // or curly brace.
 bool IsScoperChar(char c) {
   switch (c) {
     case '(':
     case ')':
     case '[':
     case ']':
     case '{':
     case '}':
       return true;
     default:
       return false;
   }
 }

 // Maps the text of an operator token onto its specific Token::Type. Text that
 // is not one of the operators in the table yields Token::INVALID.
 Token::Type GetSpecificOperatorType(base::StringPiece value) {
   static const struct {
     const char* text;
     Token::Type type;
   } kOperators[] = {
     { "=", Token::EQUAL },
     { "+", Token::PLUS },
     { "-", Token::MINUS },
     { "+=", Token::PLUS_EQUALS },
     { "-=", Token::MINUS_EQUALS },
     { "==", Token::EQUAL_EQUAL },
     { "!=", Token::NOT_EQUAL },
     { "<=", Token::LESS_EQUAL },
     { ">=", Token::GREATER_EQUAL },
     { "<", Token::LESS_THAN },
     { ">", Token::GREATER_THAN },
     { "&&", Token::BOOLEAN_AND },
     { "||", Token::BOOLEAN_OR },
     { "!", Token::BANG },
   };

   // Entries are mutually exclusive, so the first exact match is the answer.
   for (size_t i = 0; i < sizeof(kOperators) / sizeof(kOperators[0]); ++i) {
     if (value == base::StringPiece(kOperators[i].text))
       return kOperators[i].type;
   }
   return Token::INVALID;
 }

 }  // namespace

 // Sets up a tokenizer over the contents of |input_file|. The cursor starts at
 // byte offset 0, which is reported as line 1, character 1. |err| is kept so
 // that errors found while tokenizing can be written to it.
 Tokenizer::Tokenizer(const InputFile* input_file, Err* err)
     : input_file_(input_file),
       input_(input_file->contents()),
       err_(err),
       cur_(0),
       line_number_(1),
       char_in_line_(1) {
 }

 // The destructor body is intentionally empty.
 Tokenizer::~Tokenizer() {
 }

 // static
 // Convenience entry point: builds a temporary Tokenizer for |input_file|,
 // runs it and returns the tokens from Run(). Errors are written to |err|.
 std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) {
   return Tokenizer(input_file, err).Run();
 }

 std::vector<Token> Tokenizer::Run() {
   DCHECK(tokens_.empty());
   while (!done()) {
     AdvanceToNextToken();
     if (done())
       break;
     Location location = GetCurrentLocation();

     Token::Type type = ClassifyCurrent();
     if (type == Token::INVALID) {
       *err_ = GetErrorForInvalidToken(location);
       break;
     }
     size_t token_begin = cur_;
     AdvanceToEndOfToken(location, type);
     if (has_error())
       break;
     size_t token_end = cur_;

     base::StringPiece token_value(&input_.data()[token_begin],
                                   token_end - token_begin);

     if (type == Token::UNCLASSIFIED_OPERATOR)
       type = GetSpecificOperatorType(token_value);
     if (type == Token::IDENTIFIER) {
       if (token_value == "if")
         type = Token::IF;
       else if (token_value == "else")
         type = Token::ELSE;
       else if (token_value == "true")
         type = Token::TRUE_TOKEN;
       else if (token_value == "false")
         type = Token::FALSE_TOKEN;
     }

     // TODO(brettw) This just strips comments from the token stream. This
     // is probably wrong, they should be removed at a later stage so we can
     // do things like rewrite the file. But this makes the parser simpler and
     // is OK for now.
     if (type != Token::COMMENT)
       tokens_.push_back(Token(location, type, token_value));
   }
   if (err_->has_error())
     tokens_.clear();
   return tokens_;
 }

 // static
 // Returns the byte offset in |buf| at which the 1-based line |n| begins.
 // When |buf| has fewer than |n| lines the result is -1 converted to size_t.
 size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) {
   DCHECK(n > 0);

   // The first line always starts at the beginning of the buffer.
   if (n == 1)
     return 0;

   int line = 1;
   for (size_t offset = 0; offset < buf.size(); ++offset) {
     if (!IsNewline(buf, offset))
       continue;
     if (++line == n)
       return offset + 1;  // The line starts right after the newline.
   }
   return static_cast<size_t>(-1);
 }

 // static
 // Returns true when the byte at |offset| in |buffer| is a line feed ('\n').
 // |offset| must be smaller than the buffer size.
 bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) {
   DCHECK(offset < buffer.size());
   // We may need more logic here to handle different line ending styles.
   const char ch = buffer[offset];
   return ch == '\n';
 }


 // Skips whitespace so that the cursor ends up on the first non-whitespace
 // character, or at the end of the input.
 void Tokenizer::AdvanceToNextToken() {
   for (;;) {
     if (at_end() || !IsCurrentWhitespace())
       return;
     Advance();
   }
 }

 // Decides what kind of token starts at the cursor by looking at the current
 // character (and, for '-', the one after it). Does not move the cursor.
 // Returns Token::INVALID when the character cannot start any token.
 Token::Type Tokenizer::ClassifyCurrent() const {
   DCHECK(!at_end());
   const char first = cur_char();

   if ('0' <= first && first <= '9')
     return Token::INTEGER;
   if (first == '"')
     return Token::STRING;

   // '-' is ambiguous (operator or sign of a number) and is resolved in the
   // switch below, so it is excluded from the generic operator check.
   if (first != '-' && CouldBeOperator(first))
     return Token::UNCLASSIFIED_OPERATOR;

   if (IsIdentifierFirstChar(first))
     return Token::IDENTIFIER;

   switch (first) {
     case '[':
       return Token::LEFT_BRACKET;
     case ']':
       return Token::RIGHT_BRACKET;
     case '(':
       return Token::LEFT_PAREN;
     case ')':
       return Token::RIGHT_PAREN;
     case '{':
       return Token::LEFT_BRACE;
     case '}':
       return Token::RIGHT_BRACE;
     case ',':
       return Token::COMMA;
     case '#':
       return Token::COMMENT;
     case '-': {
       // A minus directly followed by a digit starts a negative number. In
       // every other case, including a minus that is the last character of
       // the input, it is an operator.
       if (CanIncrement()) {
         const char after = input_[cur_ + 1];
         if ('0' <= after && after <= '9')
           return Token::INTEGER;
       }
       return Token::UNCLASSIFIED_OPERATOR;
     }
     default:
       return Token::INVALID;
   }
 }

 void Tokenizer::AdvanceToEndOfToken(const Location& location,
                                     Token::Type type) {
   switch (type) {
     case Token::INTEGER:
       do {
         Advance();
       } while (!at_end() && IsNumberChar(cur_char()));
       if (!at_end()) {
         // Require the char after a number to be some kind of space, scope,
         // or operator.
         char c = cur_char();
         if (!IsCurrentWhitespace() && !CouldBeOperator(c) &&
             !IsScoperChar(c) && c != ',') {
           *err_ = Err(GetCurrentLocation(),
               "This is not a valid number.",
               "Learn to count.");
           // Highlight the number.
           err_->AppendRange(LocationRange(location, GetCurrentLocation()));
         }
       }
       break;

     case Token::STRING: {
       char initial = cur_char();
       Advance();  // Advance past initial "
       for (;;) {
         if (at_end()) {
           *err_ = Err(LocationRange(location,
                           Location(input_file_, line_number_, char_in_line_)),
                      "Unterminated string literal.",
                      "Don't leave me hanging like this!");
           break;
         }
         if (IsCurrentStringTerminator(initial)) {
           Advance();  // Skip past last "
           break;
         } else if (cur_char() == '\n') {
           *err_ = Err(LocationRange(location,
                                    GetCurrentLocation()),
                      "Newline in string constant.");
         }
         Advance();
       }
       break;
     }

     case Token::UNCLASSIFIED_OPERATOR:
       // Some operators are two characters, some are one.
       if (CouldBeTwoCharOperatorBegin(cur_char())) {
         if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1]))
           Advance();
       }
       Advance();
       break;

     case Token::IDENTIFIER:
       while (!at_end() && IsIdentifierContinuingChar(cur_char()))
         Advance();
       break;

     case Token::LEFT_BRACKET:
     case Token::RIGHT_BRACKET:
     case Token::LEFT_BRACE:
     case Token::RIGHT_BRACE:
     case Token::LEFT_PAREN:
     case Token::RIGHT_PAREN:
     case Token::COMMA:
       Advance();  // All are one char.
       break;

     case Token::COMMENT:
       // Eat to EOL.
       while (!at_end() && !IsCurrentNewline())
         Advance();
       break;

     case Token::INVALID:
     default:
       *err_ = Err(location, "Everything is all messed up",
                   "Please insert system disk in drive A: and press any key.");
       NOTREACHED();
       return;
   }
 }

 // Returns true when the character under the cursor is line feed, vertical
 // tab, form feed, carriage return or space. Must not be called at the end of
 // the input.
 bool Tokenizer::IsCurrentWhitespace() const {
   DCHECK(!at_end());
   // Note that tab (0x09) is illegal.
   switch (input_[cur_]) {
     case 0x0A:
     case 0x0B:
     case 0x0C:
     case 0x0D:
     case 0x20:
       return true;
     default:
       return false;
   }
 }

 // Returns true when the character under the cursor is |quote_char| and is
 // not escaped by a preceding backslash. Must not be called at the end of the
 // input.
 bool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
   DCHECK(!at_end());
   if (cur_char() != quote_char)
     return false;

   // Check for escaping. \" is not a string terminator, but \\" is. Count
   // the number of preceding backslashes. The scan walks backwards using
   // size_t so the offset is never narrowed to int.
   size_t num_backslashes = 0;
   for (size_t i = cur_; i > 0 && input_[i - 1] == '\\'; --i)
     ++num_backslashes;

   // Even backslashes mean that they were escaping each other and don't count
   // as escaping this quote.
   return (num_backslashes % 2) == 0;
 }

 // Returns whether the character under the cursor is a newline, as decided by
 // IsNewline() for the input buffer at offset |cur_|.
 bool Tokenizer::IsCurrentNewline() const {
   return IsNewline(input_, cur_);
 }

 // Moves the cursor forward by one character and keeps the line and column
 // counters in sync: stepping over a newline starts a new line at column 1,
 // any other character just bumps the column.
 void Tokenizer::Advance() {
   DCHECK(cur_ < input_.size());
   const bool leaving_line = IsCurrentNewline();
   ++cur_;
   if (!leaving_line) {
     ++char_in_line_;
     return;
   }
   ++line_number_;
   char_in_line_ = 1;
 }

 // Returns the cursor position as a Location built from the input file and
 // the current line and column counters.
 Location Tokenizer::GetCurrentLocation() const {
   Location here(input_file_, line_number_, char_in_line_);
   return here;
 }

 // Builds the "Invalid token." error for the character under the cursor,
 // attaching a help message tailored to a few common mistakes (semicolons,
 // tabs, C/C++-style comment openers).
 Err Tokenizer::GetErrorForInvalidToken(const Location& location) const {
   const char bad = cur_char();

   // A '/' followed by '/' or '*' looks like a C or C++ comment opener.
   bool looks_like_c_comment = false;
   if (bad == '/' && cur_ + 1 < input_.size()) {
     const char after = input_[cur_ + 1];
     looks_like_c_comment = after == '/' || after == '*';
   }

   std::string help("I have no idea what this is.");
   if (bad == ';') {
     help = "Semicolons are not needed, delete this one.";
   } else if (bad == '\t') {
     help = "You got a tab character in here. Tabs are evil. "
            "Convert to spaces.";
   } else if (looks_like_c_comment) {
     help = "Comments should start with # instead";
   }

   return Err(location, "Invalid token.", help);
 }
	// Copyright (c) 2013 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "tools/gn/tokenizer.h"

	#include "base/logging.h"
	#include "tools/gn/input_file.h"

	namespace {

	bool IsNumberChar(char c) {
	return c >= '0' && c <= '9';
	}

	// Returns true when |c| can be the first character of a two-character
	// operator such as "<=", "+=", "&&" or "||".
	bool CouldBeTwoCharOperatorBegin(char c) {
	  return c == '<' || c == '>' || c == '!' || c == '=' || c == '-' ||
	         c == '+' || c == '|' || c == '&';
	}

	// Returns true when |c| can be the second character of a two-character
	// operator, i.e. it is '=', '|' or '&'.
	bool CouldBeTwoCharOperatorEnd(char c) {
	  return c == '=' || c == '|' || c == '&';
	}

	// Returns true when |c| on its own is treated as a candidate single-character
	// operator by the tokenizer.
	bool CouldBeOneCharOperator(char c) {
	  return c == '=' || c == '<' || c == '>' || c == '+' || c == '!' ||
	         c == ':' || c == '|' || c == '&' || c == '-';
	}

	// Returns true when |c| can start an operator token, whether that operator
	// turns out to be one or two characters long.
	bool CouldBeOperator(char c) {
	  return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c);
	}

	// Returns true when |c| is an opening or closing parenthesis, square bracket
	// or curly brace.
	bool IsScoperChar(char c) {
	  return c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}';
	}

	// Maps the text of an operator token onto its specific Token::Type. Text that
	// is not one of the operators checked below yields Token::INVALID.
	Token::Type GetSpecificOperatorType(base::StringPiece value) {
	  if (value == "=")
	    return Token::EQUAL;
	  if (value == "+")
	    return Token::PLUS;
	  if (value == "-")
	    return Token::MINUS;
	  if (value == "+=")
	    return Token::PLUS_EQUALS;
	  if (value == "-=")
	    return Token::MINUS_EQUALS;
	  if (value == "==")
	    return Token::EQUAL_EQUAL;
	  if (value == "!=")
	    return Token::NOT_EQUAL;
	  if (value == "<=")
	    return Token::LESS_EQUAL;
	  if (value == ">=")
	    return Token::GREATER_EQUAL;
	  if (value == "<")
	    return Token::LESS_THAN;
	  if (value == ">")
	    return Token::GREATER_THAN;
	  if (value == "&&")
	    return Token::BOOLEAN_AND;
	  if (value == "||")
	    return Token::BOOLEAN_OR;
	  if (value == "!")
	    return Token::BANG;
	  return Token::INVALID;
	}

	} // namespace

	Tokenizer::Tokenizer(const InputFile* input_file, Err* err)
	: input_file_(input_file),
	input_(input_file->contents()),
	err_(err),
	cur_(0),
	line_number_(1),
	char_in_line_(1) {
	}

	Tokenizer::~Tokenizer() {
	}

	// static
	std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) {
	Tokenizer t(input_file, err);
	return t.Run();
	}

	std::vector<Token> Tokenizer::Run() {
	DCHECK(tokens_.empty());
	while (!done()) {
	AdvanceToNextToken();
	if (done())
	break;
	Location location = GetCurrentLocation();

	Token::Type type = ClassifyCurrent();
	if (type == Token::INVALID) {
	*err_ = GetErrorForInvalidToken(location);
	break;
	}
	size_t token_begin = cur_;
	AdvanceToEndOfToken(location, type);
	if (has_error())
	break;
	size_t token_end = cur_;

	base::StringPiece token_value(&input_.data()[token_begin],
	token_end - token_begin);

	if (type == Token::UNCLASSIFIED_OPERATOR)
	type = GetSpecificOperatorType(token_value);
	if (type == Token::IDENTIFIER) {
	if (token_value == "if")
	type = Token::IF;
	else if (token_value == "else")
	type = Token::ELSE;
	else if (token_value == "true")
	type = Token::TRUE_TOKEN;
	else if (token_value == "false")
	type = Token::FALSE_TOKEN;
	}

	// TODO(brettw) This just strips comments from the token stream. This
	// is probably wrong, they should be removed at a later stage so we can
	// do things like rewrite the file. But this makes the parser simpler and
	// is OK for now.
	if (type != Token::COMMENT)
	tokens_.push_back(Token(location, type, token_value));
	}
	if (err_->has_error())
	tokens_.clear();
	return tokens_;
	}

	// static
	size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) {
	int cur_line = 1;
	size_t cur_byte = 0;

	DCHECK(n > 0);

	if (n == 1)
	return 0;

	while (cur_byte < buf.size()) {
	if (IsNewline(buf, cur_byte)) {
	cur_line++;
	if (cur_line == n)
	return cur_byte + 1;
	}
	cur_byte++;
	}
	return -1;
	}

	// static
	bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) {
	DCHECK(offset < buffer.size());
	// We may need more logic here to handle different line ending styles.
	return buffer[offset] == '\n';
	}


	void Tokenizer::AdvanceToNextToken() {
	while (!at_end() && IsCurrentWhitespace())
	Advance();
	}

	Token::Type Tokenizer::ClassifyCurrent() const {
	DCHECK(!at_end());
	char next_char = cur_char();
	if (next_char >= '0' && next_char <= '9')
	return Token::INTEGER;
	if (next_char == '"')
	return Token::STRING;

	// Note: '-' handled specially below.
	if (next_char != '-' && CouldBeOperator(next_char))
	return Token::UNCLASSIFIED_OPERATOR;

	if (IsIdentifierFirstChar(next_char))
	return Token::IDENTIFIER;

	if (next_char == '[')
	return Token::LEFT_BRACKET;
	if (next_char == ']')
	return Token::RIGHT_BRACKET;
	if (next_char == '(')
	return Token::LEFT_PAREN;
	if (next_char == ')')
	return Token::RIGHT_PAREN;
	if (next_char == '{')
	return Token::LEFT_BRACE;
	if (next_char == '}')
	return Token::RIGHT_BRACE;

	if (next_char == ',')
	return Token::COMMA;

	if (next_char == '#')
	return Token::COMMENT;

	// For the case of '-' differentiate between a negative number and anything
	// else.
	if (next_char == '-') {
	if (!CanIncrement())
	return Token::UNCLASSIFIED_OPERATOR; // Just the minus before end of
	// file.
	char following_char = input_[cur_ + 1];
	if (following_char >= '0' && following_char <= '9')
	return Token::INTEGER;
	return Token::UNCLASSIFIED_OPERATOR;
	}

	return Token::INVALID;
	}

	void Tokenizer::AdvanceToEndOfToken(const Location& location,
	Token::Type type) {
	switch (type) {
	case Token::INTEGER:
	do {
	Advance();
	} while (!at_end() && IsNumberChar(cur_char()));
	if (!at_end()) {
	// Require the char after a number to be some kind of space, scope,
	// or operator.
	char c = cur_char();
	if (!IsCurrentWhitespace() && !CouldBeOperator(c) &&
	!IsScoperChar(c) && c != ',') {
	*err_ = Err(GetCurrentLocation(),
	"This is not a valid number.",
	"Learn to count.");
	// Highlight the number.
	err_->AppendRange(LocationRange(location, GetCurrentLocation()));
	}
	}
	break;

	case Token::STRING: {
	char initial = cur_char();
	Advance(); // Advance past initial "
	for (;;) {
	if (at_end()) {
	*err_ = Err(LocationRange(location,
	Location(input_file_, line_number_, char_in_line_)),
	"Unterminated string literal.",
	"Don't leave me hanging like this!");
	break;
	}
	if (IsCurrentStringTerminator(initial)) {
	Advance(); // Skip past last "
	break;
	} else if (cur_char() == '\n') {
	*err_ = Err(LocationRange(location,
	GetCurrentLocation()),
	"Newline in string constant.");
	}
	Advance();
	}
	break;
	}

	case Token::UNCLASSIFIED_OPERATOR:
	// Some operators are two characters, some are one.
	if (CouldBeTwoCharOperatorBegin(cur_char())) {
	if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1]))
	Advance();
	}
	Advance();
	break;

	case Token::IDENTIFIER:
	while (!at_end() && IsIdentifierContinuingChar(cur_char()))
	Advance();
	break;

	case Token::LEFT_BRACKET:
	case Token::RIGHT_BRACKET:
	case Token::LEFT_BRACE:
	case Token::RIGHT_BRACE:
	case Token::LEFT_PAREN:
	case Token::RIGHT_PAREN:
	case Token::COMMA:
	Advance(); // All are one char.
	break;

	case Token::COMMENT:
	// Eat to EOL.
	while (!at_end() && !IsCurrentNewline())
	Advance();
	break;

	case Token::INVALID:
	default:
	*err_ = Err(location, "Everything is all messed up",
	"Please insert system disk in drive A: and press any key.");
	NOTREACHED();
	return;
	}
	}

	// Returns true when the character under the cursor is line feed, vertical
	// tab, form feed, carriage return or space. Must not be called at the end of
	// the input.
	bool Tokenizer::IsCurrentWhitespace() const {
	  DCHECK(!at_end());
	  char c = input_[cur_];
	  // Note that tab (0x09) is illegal.
	  return c == 0x0A || c == 0x0B || c == 0x0C || c == 0x0D || c == 0x20;
	}

	bool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
	DCHECK(!at_end());
	if (cur_char() != quote_char)
	return false;

	// Check for escaping. \" is not a string terminator, but \\" is. Count
	// the number of preceeding backslashes.
	int num_backslashes = 0;
	for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--)
	num_backslashes++;

	// Even backslashes mean that they were escaping each other and don't count
	// as escaping this quote.
	return (num_backslashes % 2) == 0;
	}

	bool Tokenizer::IsCurrentNewline() const {
	return IsNewline(input_, cur_);
	}

	void Tokenizer::Advance() {
	DCHECK(cur_ < input_.size());
	if (IsCurrentNewline()) {
	line_number_++;
	char_in_line_ = 1;
	} else {
	char_in_line_++;
	}
	cur_++;
	}

	Location Tokenizer::GetCurrentLocation() const {
	return Location(input_file_, line_number_, char_in_line_);
	}

	// Builds the "Invalid token." error for the character under the cursor,
	// attaching a help message tailored to a few common mistakes (semicolons,
	// tabs, C/C++-style comment openers).
	Err Tokenizer::GetErrorForInvalidToken(const Location& location) const {
	  std::string help;
	  if (cur_char() == ';') {
	    // Semicolon.
	    help = "Semicolons are not needed, delete this one.";
	  } else if (cur_char() == '\t') {
	    // Tab.
	    help = "You got a tab character in here. Tabs are evil. "
	           "Convert to spaces.";
	  } else if (cur_char() == '/' && cur_ + 1 < input_.size() &&
	             (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) {
	    // Different types of comments.
	    help = "Comments should start with # instead";
	  } else {
	    help = "I have no idea what this is.";
	  }

	  return Err(location, "Invalid token.", help);
	}