|  | // Copyright (c) 2013 The Chromium Authors. All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style license that can be | 
|  | // found in the LICENSE file. | 
|  |  | 
|  | #include "tools/gn/tokenizer.h" | 
|  |  | 
|  | #include "base/logging.h" | 
|  | #include "tools/gn/input_file.h" | 
|  |  | 
|  | namespace { | 
|  |  | 
// Returns true when |c| is one of the ASCII decimal digits '0' through '9'.
bool IsNumberChar(char c) {
  const bool below_zero = c < '0';
  const bool above_nine = c > '9';
  return !below_zero && !above_nine;
}
|  |  | 
// Returns true when |c| may be the first character of a two-character
// operator. Whether a two-character operator is actually present also depends
// on the character that follows.
bool CouldBeTwoCharOperatorBegin(char c) {
  switch (c) {
    case '<':
    case '>':
    case '!':
    case '=':
    case '-':
    case '+':
    case '|':
    case '&':
      return true;
    default:
      return false;
  }
}
|  |  | 
// Returns true when |c| may be the second character of a two-character
// operator: '=', '|' or '&'.
bool CouldBeTwoCharOperatorEnd(char c) {
  if (c == '=')
    return true;
  if (c == '|')
    return true;
  if (c == '&')
    return true;
  return false;
}
|  |  | 
// Returns true when |c| on its own is treated as an operator character by the
// tokenizer.
bool CouldBeOneCharOperator(char c) {
  switch (c) {
    case '=':
    case '<':
    case '>':
    case '+':
    case '!':
    case ':':
    case '|':
    case '&':
    case '-':
      return true;
    default:
      return false;
  }
}
|  |  | 
|  | bool CouldBeOperator(char c) { | 
|  | return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c); | 
|  | } | 
|  |  | 
// Returns true when |c| is an opening or closing parenthesis, square bracket
// or curly brace.
bool IsScoperChar(char c) {
  static const char kScopers[] = {'(', ')', '[', ']', '{', '}'};
  for (size_t i = 0; i < sizeof(kScopers) / sizeof(kScopers[0]); ++i) {
    if (kScopers[i] == c)
      return true;
  }
  return false;
}
|  |  | 
|  | Token::Type GetSpecificOperatorType(base::StringPiece value) { | 
|  | if (value == "=") | 
|  | return Token::EQUAL; | 
|  | if (value == "+") | 
|  | return Token::PLUS; | 
|  | if (value == "-") | 
|  | return Token::MINUS; | 
|  | if (value == "+=") | 
|  | return Token::PLUS_EQUALS; | 
|  | if (value == "-=") | 
|  | return Token::MINUS_EQUALS; | 
|  | if (value == "==") | 
|  | return Token::EQUAL_EQUAL; | 
|  | if (value == "!=") | 
|  | return Token::NOT_EQUAL; | 
|  | if (value == "<=") | 
|  | return Token::LESS_EQUAL; | 
|  | if (value == ">=") | 
|  | return Token::GREATER_EQUAL; | 
|  | if (value == "<") | 
|  | return Token::LESS_THAN; | 
|  | if (value == ">") | 
|  | return Token::GREATER_THAN; | 
|  | if (value == "&&") | 
|  | return Token::BOOLEAN_AND; | 
|  | if (value == "||") | 
|  | return Token::BOOLEAN_OR; | 
|  | if (value == "!") | 
|  | return Token::BANG; | 
|  | return Token::INVALID; | 
|  | } | 
|  |  | 
|  | }  // namespace | 
|  |  | 
|  | Tokenizer::Tokenizer(const InputFile* input_file, Err* err) | 
|  | : input_file_(input_file), | 
|  | input_(input_file->contents()), | 
|  | err_(err), | 
|  | cur_(0), | 
|  | line_number_(1), | 
|  | char_in_line_(1) { | 
|  | } | 
|  |  | 
|  | Tokenizer::~Tokenizer() { | 
|  | } | 
|  |  | 
|  | // static | 
|  | std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) { | 
|  | Tokenizer t(input_file, err); | 
|  | return t.Run(); | 
|  | } | 
|  |  | 
|  | std::vector<Token> Tokenizer::Run() { | 
|  | DCHECK(tokens_.empty()); | 
|  | while (!done()) { | 
|  | AdvanceToNextToken(); | 
|  | if (done()) | 
|  | break; | 
|  | Location location = GetCurrentLocation(); | 
|  |  | 
|  | Token::Type type = ClassifyCurrent(); | 
|  | if (type == Token::INVALID) { | 
|  | *err_ = GetErrorForInvalidToken(location); | 
|  | break; | 
|  | } | 
|  | size_t token_begin = cur_; | 
|  | AdvanceToEndOfToken(location, type); | 
|  | if (has_error()) | 
|  | break; | 
|  | size_t token_end = cur_; | 
|  |  | 
|  | base::StringPiece token_value(&input_.data()[token_begin], | 
|  | token_end - token_begin); | 
|  |  | 
|  | if (type == Token::UNCLASSIFIED_OPERATOR) | 
|  | type = GetSpecificOperatorType(token_value); | 
|  | if (type == Token::IDENTIFIER) { | 
|  | if (token_value == "if") | 
|  | type = Token::IF; | 
|  | else if (token_value == "else") | 
|  | type = Token::ELSE; | 
|  | else if (token_value == "true") | 
|  | type = Token::TRUE_TOKEN; | 
|  | else if (token_value == "false") | 
|  | type = Token::FALSE_TOKEN; | 
|  | } | 
|  |  | 
|  | // TODO(brettw) This just strips comments from the token stream. This | 
|  | // is probably wrong, they should be removed at a later stage so we can | 
|  | // do things like rewrite the file. But this makes the parser simpler and | 
|  | // is OK for now. | 
|  | if (type != Token::COMMENT) | 
|  | tokens_.push_back(Token(location, type, token_value)); | 
|  | } | 
|  | if (err_->has_error()) | 
|  | tokens_.clear(); | 
|  | return tokens_; | 
|  | } | 
|  |  | 
|  | // static | 
|  | size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) { | 
|  | int cur_line = 1; | 
|  | size_t cur_byte = 0; | 
|  |  | 
|  | DCHECK(n > 0); | 
|  |  | 
|  | if (n == 1) | 
|  | return 0; | 
|  |  | 
|  | while (cur_byte < buf.size()) { | 
|  | if (IsNewline(buf, cur_byte)) { | 
|  | cur_line++; | 
|  | if (cur_line == n) | 
|  | return cur_byte + 1; | 
|  | } | 
|  | cur_byte++; | 
|  | } | 
|  | return -1; | 
|  | } | 
|  |  | 
|  | // static | 
|  | bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) { | 
|  | DCHECK(offset < buffer.size()); | 
|  | // We may need more logic here to handle different line ending styles. | 
|  | return buffer[offset] == '\n'; | 
|  | } | 
|  |  | 
|  |  | 
|  | void Tokenizer::AdvanceToNextToken() { | 
|  | while (!at_end() && IsCurrentWhitespace()) | 
|  | Advance(); | 
|  | } | 
|  |  | 
|  | Token::Type Tokenizer::ClassifyCurrent() const { | 
|  | DCHECK(!at_end()); | 
|  | char next_char = cur_char(); | 
|  | if (next_char >= '0' && next_char <= '9') | 
|  | return Token::INTEGER; | 
|  | if (next_char == '"') | 
|  | return Token::STRING; | 
|  |  | 
|  | // Note: '-' handled specially below. | 
|  | if (next_char != '-' && CouldBeOperator(next_char)) | 
|  | return Token::UNCLASSIFIED_OPERATOR; | 
|  |  | 
|  | if (IsIdentifierFirstChar(next_char)) | 
|  | return Token::IDENTIFIER; | 
|  |  | 
|  | if (next_char == '[') | 
|  | return Token::LEFT_BRACKET; | 
|  | if (next_char == ']') | 
|  | return Token::RIGHT_BRACKET; | 
|  | if (next_char == '(') | 
|  | return Token::LEFT_PAREN; | 
|  | if (next_char == ')') | 
|  | return Token::RIGHT_PAREN; | 
|  | if (next_char == '{') | 
|  | return Token::LEFT_BRACE; | 
|  | if (next_char == '}') | 
|  | return Token::RIGHT_BRACE; | 
|  |  | 
|  | if (next_char == ',') | 
|  | return Token::COMMA; | 
|  |  | 
|  | if (next_char == '#') | 
|  | return Token::COMMENT; | 
|  |  | 
|  | // For the case of '-' differentiate between a negative number and anything | 
|  | // else. | 
|  | if (next_char == '-') { | 
|  | if (!CanIncrement()) | 
|  | return Token::UNCLASSIFIED_OPERATOR;  // Just the minus before end of | 
|  | // file. | 
|  | char following_char = input_[cur_ + 1]; | 
|  | if (following_char >= '0' && following_char <= '9') | 
|  | return Token::INTEGER; | 
|  | return Token::UNCLASSIFIED_OPERATOR; | 
|  | } | 
|  |  | 
|  | return Token::INVALID; | 
|  | } | 
|  |  | 
|  | void Tokenizer::AdvanceToEndOfToken(const Location& location, | 
|  | Token::Type type) { | 
|  | switch (type) { | 
|  | case Token::INTEGER: | 
|  | do { | 
|  | Advance(); | 
|  | } while (!at_end() && IsNumberChar(cur_char())); | 
|  | if (!at_end()) { | 
|  | // Require the char after a number to be some kind of space, scope, | 
|  | // or operator. | 
|  | char c = cur_char(); | 
|  | if (!IsCurrentWhitespace() && !CouldBeOperator(c) && | 
|  | !IsScoperChar(c) && c != ',') { | 
|  | *err_ = Err(GetCurrentLocation(), | 
|  | "This is not a valid number.", | 
|  | "Learn to count."); | 
|  | // Highlight the number. | 
|  | err_->AppendRange(LocationRange(location, GetCurrentLocation())); | 
|  | } | 
|  | } | 
|  | break; | 
|  |  | 
|  | case Token::STRING: { | 
|  | char initial = cur_char(); | 
|  | Advance();  // Advance past initial " | 
|  | for (;;) { | 
|  | if (at_end()) { | 
|  | *err_ = Err(LocationRange(location, | 
|  | Location(input_file_, line_number_, char_in_line_)), | 
|  | "Unterminated string literal.", | 
|  | "Don't leave me hanging like this!"); | 
|  | break; | 
|  | } | 
|  | if (IsCurrentStringTerminator(initial)) { | 
|  | Advance();  // Skip past last " | 
|  | break; | 
|  | } else if (cur_char() == '\n') { | 
|  | *err_ = Err(LocationRange(location, | 
|  | GetCurrentLocation()), | 
|  | "Newline in string constant."); | 
|  | } | 
|  | Advance(); | 
|  | } | 
|  | break; | 
|  | } | 
|  |  | 
|  | case Token::UNCLASSIFIED_OPERATOR: | 
|  | // Some operators are two characters, some are one. | 
|  | if (CouldBeTwoCharOperatorBegin(cur_char())) { | 
|  | if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1])) | 
|  | Advance(); | 
|  | } | 
|  | Advance(); | 
|  | break; | 
|  |  | 
|  | case Token::IDENTIFIER: | 
|  | while (!at_end() && IsIdentifierContinuingChar(cur_char())) | 
|  | Advance(); | 
|  | break; | 
|  |  | 
|  | case Token::LEFT_BRACKET: | 
|  | case Token::RIGHT_BRACKET: | 
|  | case Token::LEFT_BRACE: | 
|  | case Token::RIGHT_BRACE: | 
|  | case Token::LEFT_PAREN: | 
|  | case Token::RIGHT_PAREN: | 
|  | case Token::COMMA: | 
|  | Advance();  // All are one char. | 
|  | break; | 
|  |  | 
|  | case Token::COMMENT: | 
|  | // Eat to EOL. | 
|  | while (!at_end() && !IsCurrentNewline()) | 
|  | Advance(); | 
|  | break; | 
|  |  | 
|  | case Token::INVALID: | 
|  | default: | 
|  | *err_ = Err(location, "Everything is all messed up", | 
|  | "Please insert system disk in drive A: and press any key."); | 
|  | NOTREACHED(); | 
|  | return; | 
|  | } | 
|  | } | 
|  |  | 
|  | bool Tokenizer::IsCurrentWhitespace() const { | 
|  | DCHECK(!at_end()); | 
|  | char c = input_[cur_]; | 
|  | // Note that tab (0x09) is illegal. | 
|  | return c == 0x0A || c == 0x0B || c == 0x0C || c == 0x0D || c == 0x20; | 
|  | } | 
|  |  | 
|  | bool Tokenizer::IsCurrentStringTerminator(char quote_char) const { | 
|  | DCHECK(!at_end()); | 
|  | if (cur_char() != quote_char) | 
|  | return false; | 
|  |  | 
|  | // Check for escaping. \" is not a string terminator, but \\" is. Count | 
|  | // the number of preceeding backslashes. | 
|  | int num_backslashes = 0; | 
|  | for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--) | 
|  | num_backslashes++; | 
|  |  | 
|  | // Even backslashes mean that they were escaping each other and don't count | 
|  | // as escaping this quote. | 
|  | return (num_backslashes % 2) == 0; | 
|  | } | 
|  |  | 
|  | bool Tokenizer::IsCurrentNewline() const { | 
|  | return IsNewline(input_, cur_); | 
|  | } | 
|  |  | 
|  | void Tokenizer::Advance() { | 
|  | DCHECK(cur_ < input_.size()); | 
|  | if (IsCurrentNewline()) { | 
|  | line_number_++; | 
|  | char_in_line_ = 1; | 
|  | } else { | 
|  | char_in_line_++; | 
|  | } | 
|  | cur_++; | 
|  | } | 
|  |  | 
|  | Location Tokenizer::GetCurrentLocation() const { | 
|  | return Location(input_file_, line_number_, char_in_line_); | 
|  | } | 
|  |  | 
|  | Err Tokenizer::GetErrorForInvalidToken(const Location& location) const { | 
|  | std::string help; | 
|  | if (cur_char() == ';') { | 
|  | // Semicolon. | 
|  | help = "Semicolons are not needed, delete this one."; | 
|  | } else if (cur_char() == '\t') { | 
|  | // Tab. | 
|  | help = "You got a tab character in here. Tabs are evil. " | 
|  | "Convert to spaces."; | 
|  | } else if (cur_char() == '/' && cur_ + 1 < input_.size() && | 
|  | (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) { | 
|  | // Different types of comments. | 
|  | help = "Comments should start with # instead"; | 
|  | } else { | 
|  | help = "I have no idea what this is."; | 
|  | } | 
|  |  | 
|  | return Err(location, "Invalid token.", help); | 
|  | } |