|  | // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style license that can be | 
|  | // found in the LICENSE file. | 
|  |  | 
|  | #include "base/strings/utf_offset_string_conversions.h" | 
|  |  | 
|  | #include <stdint.h> | 
|  |  | 
|  | #include <algorithm> | 
|  | #include <memory> | 
|  |  | 
|  | #include "base/logging.h" | 
|  | #include "base/strings/string_piece.h" | 
|  | #include "base/strings/utf_string_conversion_utils.h" | 
|  |  | 
|  | namespace base { | 
|  |  | 
|  | OffsetAdjuster::Adjustment::Adjustment(size_t original_offset, | 
|  | size_t original_length, | 
|  | size_t output_length) | 
|  | : original_offset(original_offset), | 
|  | original_length(original_length), | 
|  | output_length(output_length) { | 
|  | } | 
|  |  | 
|  | // static | 
|  | void OffsetAdjuster::AdjustOffsets(const Adjustments& adjustments, | 
|  | std::vector<size_t>* offsets_for_adjustment, | 
|  | size_t limit) { | 
|  | DCHECK(offsets_for_adjustment); | 
|  | for (std::vector<size_t>::iterator i(offsets_for_adjustment->begin()); | 
|  | i != offsets_for_adjustment->end(); ++i) | 
|  | AdjustOffset(adjustments, &(*i), limit); | 
|  | } | 
|  |  | 
|  | // static | 
|  | void OffsetAdjuster::AdjustOffset(const Adjustments& adjustments, | 
|  | size_t* offset, | 
|  | size_t limit) { | 
|  | DCHECK(offset); | 
|  | if (*offset == string16::npos) | 
|  | return; | 
|  | int adjustment = 0; | 
|  | for (Adjustments::const_iterator i = adjustments.begin(); | 
|  | i != adjustments.end(); ++i) { | 
|  | if (*offset <= i->original_offset) | 
|  | break; | 
|  | if (*offset < (i->original_offset + i->original_length)) { | 
|  | *offset = string16::npos; | 
|  | return; | 
|  | } | 
|  | adjustment += static_cast<int>(i->original_length - i->output_length); | 
|  | } | 
|  | *offset -= adjustment; | 
|  |  | 
|  | if (*offset > limit) | 
|  | *offset = string16::npos; | 
|  | } | 
|  |  | 
|  | // static | 
|  | void OffsetAdjuster::UnadjustOffsets( | 
|  | const Adjustments& adjustments, | 
|  | std::vector<size_t>* offsets_for_unadjustment) { | 
|  | if (!offsets_for_unadjustment || adjustments.empty()) | 
|  | return; | 
|  | for (std::vector<size_t>::iterator i(offsets_for_unadjustment->begin()); | 
|  | i != offsets_for_unadjustment->end(); ++i) | 
|  | UnadjustOffset(adjustments, &(*i)); | 
|  | } | 
|  |  | 
|  | // static | 
|  | void OffsetAdjuster::UnadjustOffset(const Adjustments& adjustments, | 
|  | size_t* offset) { | 
|  | if (*offset == string16::npos) | 
|  | return; | 
|  | int adjustment = 0; | 
|  | for (Adjustments::const_iterator i = adjustments.begin(); | 
|  | i != adjustments.end(); ++i) { | 
|  | if (*offset + adjustment <= i->original_offset) | 
|  | break; | 
|  | adjustment += static_cast<int>(i->original_length - i->output_length); | 
|  | if ((*offset + adjustment) < | 
|  | (i->original_offset + i->original_length)) { | 
|  | *offset = string16::npos; | 
|  | return; | 
|  | } | 
|  | } | 
|  | *offset += adjustment; | 
|  | } | 
|  |  | 
|  | // static | 
|  | void OffsetAdjuster::MergeSequentialAdjustments( | 
|  | const Adjustments& first_adjustments, | 
|  | Adjustments* adjustments_on_adjusted_string) { | 
|  | Adjustments::iterator adjusted_iter = adjustments_on_adjusted_string->begin(); | 
|  | Adjustments::const_iterator first_iter = first_adjustments.begin(); | 
|  | // Simultaneously iterate over all |adjustments_on_adjusted_string| and | 
|  | // |first_adjustments|, adding adjustments to or correcting the adjustments | 
|  | // in |adjustments_on_adjusted_string| as we go.  |shift| keeps track of the | 
|  | // current number of characters collapsed by |first_adjustments| up to this | 
|  | // point.  |currently_collapsing| keeps track of the number of characters | 
|  | // collapsed by |first_adjustments| into the current |adjusted_iter|'s | 
|  | // length.  These are characters that will change |shift| as soon as we're | 
|  | // done processing the current |adjusted_iter|; they are not yet reflected in | 
|  | // |shift|. | 
|  | size_t shift = 0; | 
|  | size_t currently_collapsing = 0; | 
|  | while (adjusted_iter != adjustments_on_adjusted_string->end()) { | 
|  | if ((first_iter == first_adjustments.end()) || | 
|  | ((adjusted_iter->original_offset + shift + | 
|  | adjusted_iter->original_length) <= first_iter->original_offset)) { | 
|  | // Entire |adjusted_iter| (accounting for its shift and including its | 
|  | // whole original length) comes before |first_iter|. | 
|  | // | 
|  | // Correct the offset at |adjusted_iter| and move onto the next | 
|  | // adjustment that needs revising. | 
|  | adjusted_iter->original_offset += shift; | 
|  | shift += currently_collapsing; | 
|  | currently_collapsing = 0; | 
|  | ++adjusted_iter; | 
|  | } else if ((adjusted_iter->original_offset + shift) > | 
|  | first_iter->original_offset) { | 
|  | // |first_iter| comes before the |adjusted_iter| (as adjusted by |shift|). | 
|  |  | 
|  | // It's not possible for the adjustments to overlap.  (It shouldn't | 
|  | // be possible that we have an |adjusted_iter->original_offset| that, | 
|  | // when adjusted by the computed |shift|, is in the middle of | 
|  | // |first_iter|'s output's length.  After all, that would mean the | 
|  | // current adjustment_on_adjusted_string somehow points to an offset | 
|  | // that was supposed to have been eliminated by the first set of | 
|  | // adjustments.) | 
|  | DCHECK_LE(first_iter->original_offset + first_iter->output_length, | 
|  | adjusted_iter->original_offset + shift); | 
|  |  | 
|  | // Add the |first_adjustment_iter| to the full set of adjustments while | 
|  | // making sure |adjusted_iter| continues pointing to the same element. | 
|  | // We do this by inserting the |first_adjustment_iter| right before | 
|  | // |adjusted_iter|, then incrementing |adjusted_iter| so it points to | 
|  | // the following element. | 
|  | shift += first_iter->original_length - first_iter->output_length; | 
|  | adjusted_iter = adjustments_on_adjusted_string->insert( | 
|  | adjusted_iter, *first_iter); | 
|  | ++adjusted_iter; | 
|  | ++first_iter; | 
|  | } else { | 
|  | // The first adjustment adjusted something that then got further adjusted | 
|  | // by the second set of adjustments.  In other words, |first_iter| points | 
|  | // to something in the range covered by |adjusted_iter|'s length (after | 
|  | // accounting for |shift|).  Precisely, | 
|  | //   adjusted_iter->original_offset + shift | 
|  | //   <= | 
|  | //   first_iter->original_offset | 
|  | //   <= | 
|  | //   adjusted_iter->original_offset + shift + | 
|  | //       adjusted_iter->original_length | 
|  |  | 
|  | // Modify the current |adjusted_iter| to include whatever collapsing | 
|  | // happened in |first_iter|, then advance to the next |first_adjustments| | 
|  | // because we dealt with the current one. | 
|  | const int collapse = static_cast<int>(first_iter->original_length) - | 
|  | static_cast<int>(first_iter->output_length); | 
|  | // This function does not know how to deal with a string that expands and | 
|  | // then gets modified, only strings that collapse and then get modified. | 
|  | DCHECK_GT(collapse, 0); | 
|  | adjusted_iter->original_length += collapse; | 
|  | currently_collapsing += collapse; | 
|  | ++first_iter; | 
|  | } | 
|  | } | 
|  | DCHECK_EQ(0u, currently_collapsing); | 
|  | if (first_iter != first_adjustments.end()) { | 
|  | // Only first adjustments are left.  These do not need to be modified. | 
|  | // (Their offsets are already correct with respect to the original string.) | 
|  | // Append them all. | 
|  | DCHECK(adjusted_iter == adjustments_on_adjusted_string->end()); | 
|  | adjustments_on_adjusted_string->insert( | 
|  | adjustments_on_adjusted_string->end(), first_iter, | 
|  | first_adjustments.end()); | 
|  | } | 
|  | } | 
|  |  | 
|  | // Converts the given source Unicode character type to the given destination | 
|  | // Unicode character type as a STL string. The given input buffer and size | 
|  | // determine the source, and the given output STL string will be replaced by | 
|  | // the result.  If non-NULL, |adjustments| is set to reflect the all the | 
|  | // alterations to the string that are not one-character-to-one-character. | 
|  | // It will always be sorted by increasing offset. | 
|  | template<typename SrcChar, typename DestStdString> | 
|  | bool ConvertUnicode(const SrcChar* src, | 
|  | size_t src_len, | 
|  | DestStdString* output, | 
|  | OffsetAdjuster::Adjustments* adjustments) { | 
|  | if (adjustments) | 
|  | adjustments->clear(); | 
|  | // ICU requires 32-bit numbers. | 
|  | bool success = true; | 
|  | int32_t src_len32 = static_cast<int32_t>(src_len); | 
|  | for (int32_t i = 0; i < src_len32; i++) { | 
|  | uint32_t code_point; | 
|  | size_t original_i = i; | 
|  | size_t chars_written = 0; | 
|  | if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) { | 
|  | chars_written = WriteUnicodeCharacter(code_point, output); | 
|  | } else { | 
|  | chars_written = WriteUnicodeCharacter(0xFFFD, output); | 
|  | success = false; | 
|  | } | 
|  |  | 
|  | // Only bother writing an adjustment if this modification changed the | 
|  | // length of this character. | 
|  | // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last | 
|  | // character read, not after it (so that incrementing it in the loop | 
|  | // increment will place it at the right location), so we need to account | 
|  | // for that in determining the amount that was read. | 
|  | if (adjustments && ((i - original_i + 1) != chars_written)) { | 
|  | adjustments->push_back(OffsetAdjuster::Adjustment( | 
|  | original_i, i - original_i + 1, chars_written)); | 
|  | } | 
|  | } | 
|  | return success; | 
|  | } | 
|  |  | 
|  | bool UTF8ToUTF16WithAdjustments( | 
|  | const char* src, | 
|  | size_t src_len, | 
|  | string16* output, | 
|  | base::OffsetAdjuster::Adjustments* adjustments) { | 
|  | PrepareForUTF16Or32Output(src, src_len, output); | 
|  | return ConvertUnicode(src, src_len, output, adjustments); | 
|  | } | 
|  |  | 
|  | string16 UTF8ToUTF16WithAdjustments( | 
|  | const base::StringPiece& utf8, | 
|  | base::OffsetAdjuster::Adjustments* adjustments) { | 
|  | string16 result; | 
|  | UTF8ToUTF16WithAdjustments(utf8.data(), utf8.length(), &result, adjustments); | 
|  | return result; | 
|  | } | 
|  |  | 
|  | string16 UTF8ToUTF16AndAdjustOffsets( | 
|  | const base::StringPiece& utf8, | 
|  | std::vector<size_t>* offsets_for_adjustment) { | 
|  | for (size_t& offset : *offsets_for_adjustment) { | 
|  | if (offset > utf8.length()) | 
|  | offset = string16::npos; | 
|  | } | 
|  | OffsetAdjuster::Adjustments adjustments; | 
|  | string16 result = UTF8ToUTF16WithAdjustments(utf8, &adjustments); | 
|  | OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment); | 
|  | return result; | 
|  | } | 
|  |  | 
|  | std::string UTF16ToUTF8AndAdjustOffsets( | 
|  | const base::StringPiece16& utf16, | 
|  | std::vector<size_t>* offsets_for_adjustment) { | 
|  | for (size_t& offset : *offsets_for_adjustment) { | 
|  | if (offset > utf16.length()) | 
|  | offset = string16::npos; | 
|  | } | 
|  | std::string result; | 
|  | PrepareForUTF8Output(utf16.data(), utf16.length(), &result); | 
|  | OffsetAdjuster::Adjustments adjustments; | 
|  | ConvertUnicode(utf16.data(), utf16.length(), &result, &adjustments); | 
|  | OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment); | 
|  | return result; | 
|  | } | 
|  |  | 
|  | }  // namespace base |