|  | // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style license that can be | 
|  | // found in the LICENSE file. | 
|  |  | 
|  | #include "base/i18n/break_iterator.h" | 
|  |  | 
|  | #include <stdint.h> | 
|  |  | 
|  | #include "base/logging.h" | 
|  | #include "third_party/icu/source/common/unicode/ubrk.h" | 
|  | #include "third_party/icu/source/common/unicode/uchar.h" | 
|  | #include "third_party/icu/source/common/unicode/ustring.h" | 
|  |  | 
|  | namespace base { | 
|  | namespace i18n { | 
|  |  | 
// Sentinel offset (all bits set, i.e. the largest size_t value). It is stored
// in prev_ before the first Advance() and in pos_ once iteration has run off
// the end of the text.
const size_t npos = ~static_cast<size_t>(0);
|  |  | 
|  | BreakIterator::BreakIterator(const StringPiece16& str, BreakType break_type) | 
|  | : iter_(nullptr), | 
|  | string_(str), | 
|  | break_type_(break_type), | 
|  | prev_(npos), | 
|  | pos_(0) {} | 
|  |  | 
|  | BreakIterator::BreakIterator(const StringPiece16& str, const string16& rules) | 
|  | : iter_(nullptr), | 
|  | string_(str), | 
|  | rules_(rules), | 
|  | break_type_(RULE_BASED), | 
|  | prev_(npos), | 
|  | pos_(0) {} | 
|  |  | 
|  | BreakIterator::~BreakIterator() { | 
|  | if (iter_) | 
|  | ubrk_close(static_cast<UBreakIterator*>(iter_)); | 
|  | } | 
|  |  | 
|  | bool BreakIterator::Init() { | 
|  | UErrorCode status = U_ZERO_ERROR; | 
|  | UParseError parse_error; | 
|  | UBreakIteratorType break_type; | 
|  | switch (break_type_) { | 
|  | case BREAK_CHARACTER: | 
|  | break_type = UBRK_CHARACTER; | 
|  | break; | 
|  | case BREAK_WORD: | 
|  | break_type = UBRK_WORD; | 
|  | break; | 
|  | case BREAK_LINE: | 
|  | case BREAK_NEWLINE: | 
|  | case RULE_BASED: // (Keep compiler happy, break_type not used in this case) | 
|  | break_type = UBRK_LINE; | 
|  | break; | 
|  | default: | 
|  | NOTREACHED() << "invalid break_type_"; | 
|  | return false; | 
|  | } | 
|  | if (break_type_ == RULE_BASED) { | 
|  | iter_ = ubrk_openRules(rules_.c_str(), | 
|  | static_cast<int32_t>(rules_.length()), | 
|  | string_.data(), | 
|  | static_cast<int32_t>(string_.size()), | 
|  | &parse_error, | 
|  | &status); | 
|  | if (U_FAILURE(status)) { | 
|  | NOTREACHED() << "ubrk_openRules failed to parse rule string at line " | 
|  | << parse_error.line << ", offset " << parse_error.offset; | 
|  | } | 
|  | } else { | 
|  | iter_ = ubrk_open(break_type, nullptr, string_.data(), | 
|  | static_cast<int32_t>(string_.size()), &status); | 
|  | if (U_FAILURE(status)) { | 
|  | NOTREACHED() << "ubrk_open failed for type " << break_type | 
|  | << " with error " << status; | 
|  | } | 
|  | } | 
|  |  | 
|  | if (U_FAILURE(status)) { | 
|  | return false; | 
|  | } | 
|  |  | 
|  | // Move the iterator to the beginning of the string. | 
|  | ubrk_first(static_cast<UBreakIterator*>(iter_)); | 
|  | return true; | 
|  | } | 
|  |  | 
|  | bool BreakIterator::Advance() { | 
|  | int32_t pos; | 
|  | int32_t status; | 
|  | prev_ = pos_; | 
|  | switch (break_type_) { | 
|  | case BREAK_CHARACTER: | 
|  | case BREAK_WORD: | 
|  | case BREAK_LINE: | 
|  | case RULE_BASED: | 
|  | pos = ubrk_next(static_cast<UBreakIterator*>(iter_)); | 
|  | if (pos == UBRK_DONE) { | 
|  | pos_ = npos; | 
|  | return false; | 
|  | } | 
|  | pos_ = static_cast<size_t>(pos); | 
|  | return true; | 
|  | case BREAK_NEWLINE: | 
|  | do { | 
|  | pos = ubrk_next(static_cast<UBreakIterator*>(iter_)); | 
|  | if (pos == UBRK_DONE) | 
|  | break; | 
|  | pos_ = static_cast<size_t>(pos); | 
|  | status = ubrk_getRuleStatus(static_cast<UBreakIterator*>(iter_)); | 
|  | } while (status >= UBRK_LINE_SOFT && status < UBRK_LINE_SOFT_LIMIT); | 
|  | if (pos == UBRK_DONE && prev_ == pos_) { | 
|  | pos_ = npos; | 
|  | return false; | 
|  | } | 
|  | return true; | 
|  | default: | 
|  | NOTREACHED() << "invalid break_type_"; | 
|  | return false; | 
|  | } | 
|  | } | 
|  |  | 
|  | bool BreakIterator::SetText(const base::char16* text, const size_t length) { | 
|  | UErrorCode status = U_ZERO_ERROR; | 
|  | ubrk_setText(static_cast<UBreakIterator*>(iter_), | 
|  | text, length, &status); | 
|  | pos_ = 0;  // implicit when ubrk_setText is done | 
|  | prev_ = npos; | 
|  | if (U_FAILURE(status)) { | 
|  | NOTREACHED() << "ubrk_setText failed"; | 
|  | return false; | 
|  | } | 
|  | string_ = StringPiece16(text, length); | 
|  | return true; | 
|  | } | 
|  |  | 
|  | bool BreakIterator::IsWord() const { | 
|  | return GetWordBreakStatus() == IS_WORD_BREAK; | 
|  | } | 
|  |  | 
|  | BreakIterator::WordBreakStatus BreakIterator::GetWordBreakStatus() const { | 
|  | int32_t status = ubrk_getRuleStatus(static_cast<UBreakIterator*>(iter_)); | 
|  | if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED) | 
|  | return IS_LINE_OR_CHAR_BREAK; | 
|  | // In ICU 60, trying to advance past the end of the text does not change | 
|  | // |status| so that |pos_| has to be checked as well as |status|. | 
|  | // See http://bugs.icu-project.org/trac/ticket/13447 . | 
|  | return (status == UBRK_WORD_NONE || pos_ == npos) ? IS_SKIPPABLE_WORD | 
|  | : IS_WORD_BREAK; | 
|  | } | 
|  |  | 
|  | bool BreakIterator::IsEndOfWord(size_t position) const { | 
|  | if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED) | 
|  | return false; | 
|  |  | 
|  | UBreakIterator* iter = static_cast<UBreakIterator*>(iter_); | 
|  | UBool boundary = ubrk_isBoundary(iter, static_cast<int32_t>(position)); | 
|  | int32_t status = ubrk_getRuleStatus(iter); | 
|  | return (!!boundary && status != UBRK_WORD_NONE); | 
|  | } | 
|  |  | 
|  | bool BreakIterator::IsStartOfWord(size_t position) const { | 
|  | if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED) | 
|  | return false; | 
|  |  | 
|  | UBreakIterator* iter = static_cast<UBreakIterator*>(iter_); | 
|  | UBool boundary = ubrk_isBoundary(iter, static_cast<int32_t>(position)); | 
|  | ubrk_next(iter); | 
|  | int32_t next_status = ubrk_getRuleStatus(iter); | 
|  | return (!!boundary && next_status != UBRK_WORD_NONE); | 
|  | } | 
|  |  | 
|  | bool BreakIterator::IsGraphemeBoundary(size_t position) const { | 
|  | if (break_type_ != BREAK_CHARACTER) | 
|  | return false; | 
|  |  | 
|  | UBreakIterator* iter = static_cast<UBreakIterator*>(iter_); | 
|  | return !!ubrk_isBoundary(iter, static_cast<int32_t>(position)); | 
|  | } | 
|  |  | 
|  | string16 BreakIterator::GetString() const { | 
|  | return GetStringPiece().as_string(); | 
|  | } | 
|  |  | 
|  | StringPiece16 BreakIterator::GetStringPiece() const { | 
|  | DCHECK(prev_ != npos && pos_ != npos); | 
|  | return string_.substr(prev_, pos_ - prev_); | 
|  | } | 
|  |  | 
|  | }  // namespace i18n | 
|  | }  // namespace base |