| // Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "base/strings/utf_offset_string_conversions.h" |
| |
| #include <stdint.h> |
| |
| #include <algorithm> |
| #include <memory> |
| #include <string_view> |
| |
| #include "base/logging.h" |
| #include "base/strings/utf_string_conversion_utils.h" |
| |
| namespace base { |
| |
| OffsetAdjuster::Adjustment::Adjustment(size_t original_offset, |
| size_t original_length, |
| size_t output_length) |
| : original_offset(original_offset), |
| original_length(original_length), |
| output_length(output_length) {} |
| |
| // static |
| void OffsetAdjuster::AdjustOffsets(const Adjustments& adjustments, |
| std::vector<size_t>* offsets_for_adjustment, |
| size_t limit) { |
| DCHECK(offsets_for_adjustment); |
| for (std::vector<size_t>::iterator i(offsets_for_adjustment->begin()); |
| i != offsets_for_adjustment->end(); ++i) |
| AdjustOffset(adjustments, &(*i), limit); |
| } |
| |
| // static |
| void OffsetAdjuster::AdjustOffset(const Adjustments& adjustments, |
| size_t* offset, |
| size_t limit) { |
| DCHECK(offset); |
| if (*offset == std::u16string::npos) |
| return; |
| int adjustment = 0; |
| for (Adjustments::const_iterator i = adjustments.begin(); |
| i != adjustments.end(); ++i) { |
| if (*offset <= i->original_offset) |
| break; |
| if (*offset < (i->original_offset + i->original_length)) { |
| *offset = std::u16string::npos; |
| return; |
| } |
| adjustment += static_cast<int>(i->original_length - i->output_length); |
| } |
| *offset -= adjustment; |
| |
| if (*offset > limit) |
| *offset = std::u16string::npos; |
| } |
| |
| // static |
| void OffsetAdjuster::UnadjustOffsets( |
| const Adjustments& adjustments, |
| std::vector<size_t>* offsets_for_unadjustment) { |
| if (!offsets_for_unadjustment || adjustments.empty()) |
| return; |
| for (std::vector<size_t>::iterator i(offsets_for_unadjustment->begin()); |
| i != offsets_for_unadjustment->end(); ++i) |
| UnadjustOffset(adjustments, &(*i)); |
| } |
| |
| // static |
| void OffsetAdjuster::UnadjustOffset(const Adjustments& adjustments, |
| size_t* offset) { |
| if (*offset == std::u16string::npos) |
| return; |
| int adjustment = 0; |
| for (Adjustments::const_iterator i = adjustments.begin(); |
| i != adjustments.end(); ++i) { |
| if (*offset + adjustment <= i->original_offset) |
| break; |
| adjustment += static_cast<int>(i->original_length - i->output_length); |
| if ((*offset + adjustment) < (i->original_offset + i->original_length)) { |
| *offset = std::u16string::npos; |
| return; |
| } |
| } |
| *offset += adjustment; |
| } |
| |
| // static |
| void OffsetAdjuster::MergeSequentialAdjustments( |
| const Adjustments& first_adjustments, |
| Adjustments* adjustments_on_adjusted_string) { |
| Adjustments::iterator adjusted_iter = adjustments_on_adjusted_string->begin(); |
| Adjustments::const_iterator first_iter = first_adjustments.begin(); |
| // Simultaneously iterate over all |adjustments_on_adjusted_string| and |
| // |first_adjustments|, adding adjustments to or correcting the adjustments |
| // in |adjustments_on_adjusted_string| as we go. |shift| keeps track of the |
| // current number of characters collapsed by |first_adjustments| up to this |
| // point. |currently_collapsing| keeps track of the number of characters |
| // collapsed by |first_adjustments| into the current |adjusted_iter|'s |
| // length. These are characters that will change |shift| as soon as we're |
| // done processing the current |adjusted_iter|; they are not yet reflected in |
| // |shift|. |
| size_t shift = 0; |
| size_t currently_collapsing = 0; |
| while (adjusted_iter != adjustments_on_adjusted_string->end()) { |
| if ((first_iter == first_adjustments.end()) || |
| ((adjusted_iter->original_offset + shift + |
| adjusted_iter->original_length) <= first_iter->original_offset)) { |
| // Entire |adjusted_iter| (accounting for its shift and including its |
| // whole original length) comes before |first_iter|. |
| // |
| // Correct the offset at |adjusted_iter| and move onto the next |
| // adjustment that needs revising. |
| adjusted_iter->original_offset += shift; |
| shift += currently_collapsing; |
| currently_collapsing = 0; |
| ++adjusted_iter; |
| } else if ((adjusted_iter->original_offset + shift) > |
| first_iter->original_offset) { |
| // |first_iter| comes before the |adjusted_iter| (as adjusted by |shift|). |
| |
| // It's not possible for the adjustments to overlap. (It shouldn't |
| // be possible that we have an |adjusted_iter->original_offset| that, |
| // when adjusted by the computed |shift|, is in the middle of |
| // |first_iter|'s output's length. After all, that would mean the |
| // current adjustment_on_adjusted_string somehow points to an offset |
| // that was supposed to have been eliminated by the first set of |
| // adjustments.) |
| DCHECK_LE(first_iter->original_offset + first_iter->output_length, |
| adjusted_iter->original_offset + shift); |
| |
| // Add the |first_adjustment_iter| to the full set of adjustments while |
| // making sure |adjusted_iter| continues pointing to the same element. |
| // We do this by inserting the |first_adjustment_iter| right before |
| // |adjusted_iter|, then incrementing |adjusted_iter| so it points to |
| // the following element. |
| shift += first_iter->original_length - first_iter->output_length; |
| adjusted_iter = |
| adjustments_on_adjusted_string->insert(adjusted_iter, *first_iter); |
| ++adjusted_iter; |
| ++first_iter; |
| } else { |
| // The first adjustment adjusted something that then got further adjusted |
| // by the second set of adjustments. In other words, |first_iter| points |
| // to something in the range covered by |adjusted_iter|'s length (after |
| // accounting for |shift|). Precisely, |
| // adjusted_iter->original_offset + shift |
| // <= |
| // first_iter->original_offset |
| // <= |
| // adjusted_iter->original_offset + shift + |
| // adjusted_iter->original_length |
| |
| // Modify the current |adjusted_iter| to include whatever collapsing |
| // happened in |first_iter|, then advance to the next |first_adjustments| |
| // because we dealt with the current one. |
| const int collapse = static_cast<int>(first_iter->original_length) - |
| static_cast<int>(first_iter->output_length); |
| // This function does not know how to deal with a string that expands and |
| // then gets modified, only strings that collapse and then get modified. |
| DCHECK_GT(collapse, 0); |
| adjusted_iter->original_length += collapse; |
| currently_collapsing += collapse; |
| ++first_iter; |
| } |
| } |
| DCHECK_EQ(0u, currently_collapsing); |
| if (first_iter != first_adjustments.end()) { |
| // Only first adjustments are left. These do not need to be modified. |
| // (Their offsets are already correct with respect to the original string.) |
| // Append them all. |
| DCHECK(adjusted_iter == adjustments_on_adjusted_string->end()); |
| adjustments_on_adjusted_string->insert( |
| adjustments_on_adjusted_string->end(), first_iter, |
| first_adjustments.end()); |
| } |
| } |
| |
| // Converts the given source Unicode character type to the given destination |
| // Unicode character type as a STL string. The given input buffer and size |
| // determine the source, and the given output STL string will be replaced by |
| // the result. If non-NULL, |adjustments| is set to reflect the all the |
| // alterations to the string that are not one-character-to-one-character. |
| // It will always be sorted by increasing offset. |
| template <typename SrcChar, typename DestStdString> |
| bool ConvertUnicode(const SrcChar* src, |
| size_t src_len, |
| DestStdString* output, |
| OffsetAdjuster::Adjustments* adjustments) { |
| if (adjustments) |
| adjustments->clear(); |
| // ICU requires 32-bit numbers. |
| bool success = true; |
| int32_t src_len32 = static_cast<int32_t>(src_len); |
| for (int32_t i = 0; i < src_len32; i++) { |
| uint32_t code_point; |
| size_t original_i = i; |
| size_t chars_written = 0; |
| if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) { |
| chars_written = WriteUnicodeCharacter(code_point, output); |
| } else { |
| chars_written = WriteUnicodeCharacter(0xFFFD, output); |
| success = false; |
| } |
| |
| // Only bother writing an adjustment if this modification changed the |
| // length of this character. |
| // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last |
| // character read, not after it (so that incrementing it in the loop |
| // increment will place it at the right location), so we need to account |
| // for that in determining the amount that was read. |
| if (adjustments && ((i - original_i + 1) != chars_written)) { |
| adjustments->push_back(OffsetAdjuster::Adjustment( |
| original_i, i - original_i + 1, chars_written)); |
| } |
| } |
| return success; |
| } |
| |
| bool UTF8ToUTF16WithAdjustments( |
| const char* src, |
| size_t src_len, |
| std::u16string* output, |
| base::OffsetAdjuster::Adjustments* adjustments) { |
| PrepareForUTF16Or32Output(src, src_len, output); |
| return ConvertUnicode(src, src_len, output, adjustments); |
| } |
| |
| std::u16string UTF8ToUTF16WithAdjustments( |
| const std::string_view& utf8, |
| base::OffsetAdjuster::Adjustments* adjustments) { |
| std::u16string result; |
| UTF8ToUTF16WithAdjustments(utf8.data(), utf8.length(), &result, adjustments); |
| return result; |
| } |
| |
| std::u16string UTF8ToUTF16AndAdjustOffsets( |
| const std::string_view& utf8, |
| std::vector<size_t>* offsets_for_adjustment) { |
| for (size_t& offset : *offsets_for_adjustment) { |
| if (offset > utf8.length()) |
| offset = std::u16string::npos; |
| } |
| OffsetAdjuster::Adjustments adjustments; |
| std::u16string result = UTF8ToUTF16WithAdjustments(utf8, &adjustments); |
| OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment); |
| return result; |
| } |
| |
| std::string UTF16ToUTF8AndAdjustOffsets( |
| const std::u16string_view& utf16, |
| std::vector<size_t>* offsets_for_adjustment) { |
| for (size_t& offset : *offsets_for_adjustment) { |
| if (offset > utf16.length()) |
| offset = std::u16string::npos; |
| } |
| std::string result; |
| PrepareForUTF8Output(utf16.data(), utf16.length(), &result); |
| OffsetAdjuster::Adjustments adjustments; |
| ConvertUnicode(utf16.data(), utf16.length(), &result, &adjustments); |
| OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment); |
| return result; |
| } |
| |
| } // namespace base |