| // Copyright (c) 2013 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "gn/tokenizer.h" |
| |
| #include "base/logging.h" |
| #include "base/strings/string_util.h" |
| #include "gn/input_file.h" |
| |
| namespace { |
| |
// Returns true when c is a character that may open a two-character operator.
// Whether it actually does depends on the character that follows it; see
// CouldBeTwoCharOperatorEnd().
bool CouldBeTwoCharOperatorBegin(char c) {
  switch (c) {
    case '<':
    case '>':
    case '!':
    case '=':
    case '-':
    case '+':
    case '|':
    case '&':
      return true;
    default:
      return false;
  }
}
| |
// Returns true when c is a character that may close a two-character operator
// (the second character of the pair).
bool CouldBeTwoCharOperatorEnd(char c) {
  switch (c) {
    case '=':
    case '|':
    case '&':
      return true;
    default:
      return false;
  }
}
| |
// Returns true when c on its own is treated as an operator character by the
// tokenizer.
bool CouldBeOneCharOperator(char c) {
  switch (c) {
    case '=':
    case '<':
    case '>':
    case '+':
    case '!':
    case ':':
    case '|':
    case '&':
    case '-':
      return true;
    default:
      return false;
  }
}
| |
| bool CouldBeOperator(char c) { |
| return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c); |
| } |
| |
// Returns true when c is one of the bracketing characters: parentheses,
// square brackets or curly braces.
bool IsScoperChar(char c) {
  static constexpr char kScopers[] = {'(', ')', '[', ']', '{', '}'};
  for (char scoper : kScopers) {
    if (c == scoper)
      return true;
  }
  return false;
}
| |
| Token::Type GetSpecificOperatorType(std::string_view value) { |
| if (value == "=") |
| return Token::EQUAL; |
| if (value == "+") |
| return Token::PLUS; |
| if (value == "-") |
| return Token::MINUS; |
| if (value == "+=") |
| return Token::PLUS_EQUALS; |
| if (value == "-=") |
| return Token::MINUS_EQUALS; |
| if (value == "==") |
| return Token::EQUAL_EQUAL; |
| if (value == "!=") |
| return Token::NOT_EQUAL; |
| if (value == "<=") |
| return Token::LESS_EQUAL; |
| if (value == ">=") |
| return Token::GREATER_EQUAL; |
| if (value == "<") |
| return Token::LESS_THAN; |
| if (value == ">") |
| return Token::GREATER_THAN; |
| if (value == "&&") |
| return Token::BOOLEAN_AND; |
| if (value == "||") |
| return Token::BOOLEAN_OR; |
| if (value == "!") |
| return Token::BANG; |
| if (value == ".") |
| return Token::DOT; |
| return Token::INVALID; |
| } |
| |
| } // namespace |
| |
| Tokenizer::Tokenizer(const InputFile* input_file, |
| Err* err, |
| WhitespaceTransform whitespace_transform) |
| : input_file_(input_file), |
| input_(input_file->contents()), |
| err_(err), |
| whitespace_transform_(whitespace_transform) {} |
| |
| Tokenizer::~Tokenizer() = default; |
| |
| // static |
| std::vector<Token> Tokenizer::Tokenize( |
| const InputFile* input_file, |
| Err* err, |
| WhitespaceTransform whitespace_transform) { |
| Tokenizer t(input_file, err, whitespace_transform); |
| return t.Run(); |
| } |
| |
| std::vector<Token> Tokenizer::Run() { |
| DCHECK(tokens_.empty()); |
| while (!done()) { |
| AdvanceToNextToken(); |
| if (done()) |
| break; |
| Location location = GetCurrentLocation(); |
| |
| Token::Type type = ClassifyCurrent(); |
| if (type == Token::INVALID) { |
| *err_ = GetErrorForInvalidToken(location); |
| break; |
| } |
| size_t token_begin = cur_; |
| AdvanceToEndOfToken(location, type); |
| if (has_error()) |
| break; |
| size_t token_end = cur_; |
| |
| std::string_view token_value(&input_.data()[token_begin], |
| token_end - token_begin); |
| |
| if (type == Token::UNCLASSIFIED_OPERATOR) { |
| type = GetSpecificOperatorType(token_value); |
| } else if (type == Token::IDENTIFIER) { |
| if (token_value == "if") |
| type = Token::IF; |
| else if (token_value == "else") |
| type = Token::ELSE; |
| else if (token_value == "true") |
| type = Token::TRUE_TOKEN; |
| else if (token_value == "false") |
| type = Token::FALSE_TOKEN; |
| } else if (type == Token::UNCLASSIFIED_COMMENT) { |
| if (AtStartOfLine(token_begin) && |
| // If it's a standalone comment, but is a continuation of a comment on |
| // a previous line, then instead make it a continued suffix comment. |
| (tokens_.empty() || tokens_.back().type() != Token::SUFFIX_COMMENT || |
| tokens_.back().location().line_number() + 1 != |
| location.line_number() || |
| tokens_.back().location().column_number() != |
| location.column_number())) { |
| type = Token::LINE_COMMENT; |
| if (!at_end()) // Could be EOF. |
| Advance(); // The current \n. |
| // If this comment is separated from the next syntax element, then we |
| // want to tag it as a block comment. This will become a standalone |
| // statement at the parser level to keep this comment separate, rather |
| // than attached to the subsequent statement. |
| while (!at_end() && IsCurrentWhitespace()) { |
| if (IsCurrentNewline()) { |
| type = Token::BLOCK_COMMENT; |
| break; |
| } |
| Advance(); |
| } |
| } else { |
| type = Token::SUFFIX_COMMENT; |
| } |
| } |
| |
| tokens_.push_back(Token(location, type, token_value)); |
| } |
| if (err_->has_error()) |
| tokens_.clear(); |
| return tokens_; |
| } |
| |
| // static |
| size_t Tokenizer::ByteOffsetOfNthLine(std::string_view buf, int n) { |
| DCHECK_GT(n, 0); |
| |
| if (n == 1) |
| return 0; |
| |
| int cur_line = 1; |
| size_t cur_byte = 0; |
| while (cur_byte < buf.size()) { |
| if (IsNewline(buf, cur_byte)) { |
| cur_line++; |
| if (cur_line == n) |
| return cur_byte + 1; |
| } |
| cur_byte++; |
| } |
| return static_cast<size_t>(-1); |
| } |
| |
| // static |
| bool Tokenizer::IsNewline(std::string_view buffer, size_t offset) { |
| DCHECK(offset < buffer.size()); |
| // We may need more logic here to handle different line ending styles. |
| return buffer[offset] == '\n'; |
| } |
| |
| // static |
| bool Tokenizer::IsIdentifierFirstChar(char c) { |
| return base::IsAsciiAlpha(c) || c == '_'; |
| } |
| |
| // static |
| bool Tokenizer::IsIdentifierContinuingChar(char c) { |
| // Also allow digits after the first char. |
| return IsIdentifierFirstChar(c) || base::IsAsciiDigit(c); |
| } |
| |
| void Tokenizer::AdvanceToNextToken() { |
| while (!at_end() && IsCurrentWhitespace()) |
| Advance(); |
| } |
| |
| // static |
| Token::Type Tokenizer::ClassifyToken(char next_char, char following_char) { |
| if (base::IsAsciiDigit(next_char)) |
| return Token::INTEGER; |
| if (next_char == '"') |
| return Token::STRING; |
| |
| // Note: '-' handled specially below. |
| if (next_char != '-' && CouldBeOperator(next_char)) |
| return Token::UNCLASSIFIED_OPERATOR; |
| |
| if (IsIdentifierFirstChar(next_char)) |
| return Token::IDENTIFIER; |
| |
| if (next_char == '[') |
| return Token::LEFT_BRACKET; |
| if (next_char == ']') |
| return Token::RIGHT_BRACKET; |
| if (next_char == '(') |
| return Token::LEFT_PAREN; |
| if (next_char == ')') |
| return Token::RIGHT_PAREN; |
| if (next_char == '{') |
| return Token::LEFT_BRACE; |
| if (next_char == '}') |
| return Token::RIGHT_BRACE; |
| |
| if (next_char == '.') |
| return Token::DOT; |
| if (next_char == ',') |
| return Token::COMMA; |
| |
| if (next_char == '#') |
| return Token::UNCLASSIFIED_COMMENT; |
| |
| // For the case of '-' differentiate between a negative number and anything |
| // else. |
| if (next_char == '-') { |
| if (following_char == '\0') |
| return Token::UNCLASSIFIED_OPERATOR; // Just the minus before end of |
| // file. |
| if (base::IsAsciiDigit(following_char)) |
| return Token::INTEGER; |
| return Token::UNCLASSIFIED_OPERATOR; |
| } |
| |
| return Token::INVALID; |
| } |
| |
| Token::Type Tokenizer::ClassifyCurrent() const { |
| DCHECK(!at_end()); |
| char next_char = cur_char(); |
| char following_char = CanIncrement() ? input_[cur_ + 1] : '\0'; |
| return ClassifyToken(next_char, following_char); |
| } |
| |
| void Tokenizer::AdvanceToEndOfToken(const Location& location, |
| Token::Type type) { |
| switch (type) { |
| case Token::INTEGER: |
| do { |
| Advance(); |
| } while (!at_end() && base::IsAsciiDigit(cur_char())); |
| if (!at_end()) { |
| // Require the char after a number to be some kind of space, scope, |
| // or operator. |
| char c = cur_char(); |
| if (!IsCurrentWhitespace() && !CouldBeOperator(c) && !IsScoperChar(c) && |
| c != ',') { |
| *err_ = Err(GetCurrentLocation(), "This is not a valid number."); |
| // Highlight the number. |
| err_->AppendRange(LocationRange(location, GetCurrentLocation())); |
| } |
| } |
| break; |
| |
| case Token::STRING: { |
| char initial = cur_char(); |
| Advance(); // Advance past initial " |
| for (;;) { |
| if (at_end()) { |
| *err_ = Err(LocationRange(location, GetCurrentLocation()), |
| "Unterminated string literal.", |
| "Don't leave me hanging like this!"); |
| break; |
| } |
| if (IsCurrentStringTerminator(initial)) { |
| Advance(); // Skip past last " |
| break; |
| } else if (IsCurrentNewline()) { |
| *err_ = Err(LocationRange(location, GetCurrentLocation()), |
| "Newline in string constant."); |
| } |
| Advance(); |
| } |
| break; |
| } |
| |
| case Token::UNCLASSIFIED_OPERATOR: |
| // Some operators are two characters, some are one. |
| if (CouldBeTwoCharOperatorBegin(cur_char())) { |
| if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1])) |
| Advance(); |
| } |
| Advance(); |
| break; |
| |
| case Token::IDENTIFIER: |
| while (!at_end() && IsIdentifierContinuingChar(cur_char())) |
| Advance(); |
| break; |
| |
| case Token::LEFT_BRACKET: |
| case Token::RIGHT_BRACKET: |
| case Token::LEFT_BRACE: |
| case Token::RIGHT_BRACE: |
| case Token::LEFT_PAREN: |
| case Token::RIGHT_PAREN: |
| case Token::DOT: |
| case Token::COMMA: |
| Advance(); // All are one char. |
| break; |
| |
| case Token::UNCLASSIFIED_COMMENT: |
| // Eat to EOL. |
| while (!at_end() && !IsCurrentNewline()) |
| Advance(); |
| break; |
| |
| case Token::INVALID: |
| default: |
| *err_ = Err(location, "Everything is all messed up", |
| "Please insert system disk in drive A: and press any key."); |
| NOTREACHED(); |
| return; |
| } |
| } |
| |
| bool Tokenizer::AtStartOfLine(size_t location) const { |
| while (location > 0) { |
| --location; |
| char c = input_[location]; |
| if (c == '\n') |
| return true; |
| if (c != ' ') |
| return false; |
| } |
| return true; |
| } |
| |
| bool Tokenizer::IsCurrentWhitespace() const { |
| DCHECK(!at_end()); |
| char c = input_[cur_]; |
| // Note that tab (0x09), vertical tab (0x0B), and formfeed (0x0C) are illegal. |
| return c == 0x0A || c == 0x0D || c == 0x20 || |
| (whitespace_transform_ == WhitespaceTransform::kInvalidToSpace && |
| (c == 0x09 || c == 0x0B || c == 0x0C)); |
| } |
| |
| bool Tokenizer::IsCurrentStringTerminator(char quote_char) const { |
| DCHECK(!at_end()); |
| if (cur_char() != quote_char) |
| return false; |
| |
| // Check for escaping. \" is not a string terminator, but \\" is. Count |
| // the number of preceding backslashes. |
| int num_backslashes = 0; |
| for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--) |
| num_backslashes++; |
| |
| // Even backslashes mean that they were escaping each other and don't count |
| // as escaping this quote. |
| return (num_backslashes % 2) == 0; |
| } |
| |
| bool Tokenizer::IsCurrentNewline() const { |
| return IsNewline(input_, cur_); |
| } |
| |
| void Tokenizer::Advance() { |
| DCHECK(cur_ < input_.size()); |
| if (IsCurrentNewline()) { |
| line_number_++; |
| column_number_ = 1; |
| } else { |
| column_number_++; |
| } |
| cur_++; |
| } |
| |
| Location Tokenizer::GetCurrentLocation() const { |
| return Location(input_file_, line_number_, column_number_); |
| } |
| |
| Err Tokenizer::GetErrorForInvalidToken(const Location& location) const { |
| std::string help; |
| if (cur_char() == ';') { |
| // Semicolon. |
| help = "Semicolons are not needed, delete this one."; |
| } else if (cur_char() == '\t') { |
| // Tab. |
| help = |
| "You got a tab character in here. Tabs are evil. " |
| "Convert to spaces."; |
| } else if (cur_char() == '/' && cur_ + 1 < input_.size() && |
| (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) { |
| // Different types of comments. |
| help = "Comments should start with # instead"; |
| } else if (cur_char() == '\'') { |
| help = "Strings are delimited by \" characters, not apostrophes."; |
| } else { |
| help = "I have no idea what this is."; |
| } |
| |
| return Err(location, "Invalid token.", help); |
| } |