| // Copyright (c) 2011 The Chromium Authors. All rights reserved. | 
 | // Use of this source code is governed by a BSD-style license that can be | 
 | // found in the LICENSE file. | 
 |  | 
 | #include "base/strings/utf_offset_string_conversions.h" | 
 |  | 
 | #include <stdint.h> | 
 |  | 
 | #include <algorithm> | 
 | #include <memory> | 
 |  | 
 | #include "base/logging.h" | 
 | #include "base/strings/string_piece.h" | 
 | #include "base/strings/utf_string_conversion_utils.h" | 
 |  | 
 | namespace base { | 
 |  | 
 | // Stores the three values verbatim.  As used by ConvertUnicode() in this | 
 | // file they are: the index in the source where an altered run starts, the | 
 | // number of source characters in that run, and the number of characters | 
 | // written in its place. | 
 | OffsetAdjuster::Adjustment::Adjustment(size_t original_offset, | 
 |                                        size_t original_length, | 
 |                                        size_t output_length) | 
 |     : original_offset(original_offset), | 
 |       original_length(original_length), | 
 |       output_length(output_length) {} | 
 |  | 
 | // static | 
 | // Runs AdjustOffset() in place on every element of | 
 | // |offsets_for_adjustment|, forwarding |adjustments| and |limit| unchanged. | 
 | // |offsets_for_adjustment| must not be null (checked with DCHECK). | 
 | void OffsetAdjuster::AdjustOffsets(const Adjustments& adjustments, | 
 |                                    std::vector<size_t>* offsets_for_adjustment, | 
 |                                    size_t limit) { | 
 |   DCHECK(offsets_for_adjustment); | 
 |   for (size_t& offset : *offsets_for_adjustment) | 
 |     AdjustOffset(adjustments, &offset, limit); | 
 | } | 
 |  | 
 | // static | 
 | // Rewrites |*offset| in place after applying |adjustments|. | 
 | // | 
 | //  - An offset that is already string16::npos is left untouched. | 
 | //  - An offset strictly inside an adjusted run (after its first character | 
 | //    and before its end) becomes string16::npos. | 
 | //  - Otherwise the net length change of every adjustment that starts before | 
 | //    the offset is subtracted from it. | 
 | //  - A result greater than |limit| becomes string16::npos. | 
 | // | 
 | // The early |break| relies on |adjustments| being ordered by increasing | 
 | // |original_offset|. | 
 | void OffsetAdjuster::AdjustOffset(const Adjustments& adjustments, | 
 |                                   size_t* offset, | 
 |                                   size_t limit) { | 
 |   DCHECK(offset); | 
 |   if (*offset == string16::npos) | 
 |     return; | 
 |   // Accumulated (original_length - output_length).  Kept as size_t and | 
 |   // combined with modular arithmetic instead of being narrowed to int: an | 
 |   // expanding adjustment contributes a wrapped value that still yields the | 
 |   // right result in the subtraction below, and nothing is truncated on | 
 |   // platforms where size_t is wider than int. | 
 |   size_t adjustment = 0; | 
 |   for (const auto& entry : adjustments) { | 
 |     // This and all later adjustments start at or after the offset, so they | 
 |     // cannot move it. | 
 |     if (*offset <= entry.original_offset) | 
 |       break; | 
 |     // The offset points into the middle of a run that was rewritten. | 
 |     if (*offset < (entry.original_offset + entry.original_length)) { | 
 |       *offset = string16::npos; | 
 |       return; | 
 |     } | 
 |     adjustment += entry.original_length - entry.output_length; | 
 |   } | 
 |   *offset -= adjustment; | 
 |  | 
 |   if (*offset > limit) | 
 |     *offset = string16::npos; | 
 | } | 
 |  | 
 | // static | 
 | // Runs UnadjustOffset() in place on every element of | 
 | // |offsets_for_unadjustment|.  Does nothing when the vector pointer is null | 
 | // or when |adjustments| is empty. | 
 | void OffsetAdjuster::UnadjustOffsets( | 
 |     const Adjustments& adjustments, | 
 |     std::vector<size_t>* offsets_for_unadjustment) { | 
 |   if (offsets_for_unadjustment == nullptr || adjustments.empty()) | 
 |     return; | 
 |   for (size_t& offset : *offsets_for_unadjustment) | 
 |     UnadjustOffset(adjustments, &offset); | 
 | } | 
 |  | 
 | // static | 
 | // Rewrites |*offset|, an index into the adjusted string, back into an index | 
 | // into the original string. | 
 | // | 
 | //  - An offset that is already string16::npos is left untouched. | 
 | //  - If, once an adjustment's length change has been added back, the position | 
 | //    falls before the end of that adjustment's original run, the offset | 
 | //    becomes string16::npos. | 
 | //  - Otherwise the net length change of every adjustment that starts before | 
 | //    the (progressively un-adjusted) position is added to the offset. | 
 | // | 
 | // The early |break| relies on |adjustments| being ordered by increasing | 
 | // |original_offset|.  |offset| must not be null. | 
 | void OffsetAdjuster::UnadjustOffset(const Adjustments& adjustments, | 
 |                                     size_t* offset) { | 
 |   DCHECK(offset); | 
 |   if (*offset == string16::npos) | 
 |     return; | 
 |   // Accumulated (original_length - output_length).  Kept as size_t and | 
 |   // combined with modular arithmetic instead of being narrowed to int, so no | 
 |   // bits are lost where size_t is wider than int; an expanding adjustment | 
 |   // contributes a wrapped value that behaves like a negative delta in the | 
 |   // additions below. | 
 |   size_t adjustment = 0; | 
 |   for (const auto& entry : adjustments) { | 
 |     // |*offset + adjustment| is the position expressed in original-string | 
 |     // coordinates as far as the adjustments seen so far are concerned. | 
 |     if (*offset + adjustment <= entry.original_offset) | 
 |       break; | 
 |     adjustment += entry.original_length - entry.output_length; | 
 |     if ((*offset + adjustment) < | 
 |         (entry.original_offset + entry.original_length)) { | 
 |       *offset = string16::npos; | 
 |       return; | 
 |     } | 
 |   } | 
 |   *offset += adjustment; | 
 | } | 
 |  | 
 | // static | 
 | // Combines two successive sets of adjustments into one.  On entry the | 
 | // offsets in |adjustments_on_adjusted_string| refer to the string produced | 
 | // by |first_adjustments|; on return the vector has been rewritten in place so | 
 | // that it holds both sets with offsets relative to the original string. | 
 | void OffsetAdjuster::MergeSequentialAdjustments( | 
 |     const Adjustments& first_adjustments, | 
 |     Adjustments* adjustments_on_adjusted_string) { | 
 |   auto adjusted_iter = adjustments_on_adjusted_string->begin(); | 
 |   auto first_iter = first_adjustments.begin(); | 
 |   // Both sequences are walked in lockstep. | 
 |   // | 
 |   // |shift| is the number of characters collapsed so far by the entries of | 
 |   // |first_adjustments| that have been fully accounted for. | 
 |   // | 
 |   // |currently_collapsing| is the number of characters collapsed by entries | 
 |   // of |first_adjustments| that were absorbed into the current | 
 |   // |adjusted_iter|'s length.  They are folded into |shift| only once that | 
 |   // |adjusted_iter| has been finished. | 
 |   size_t shift = 0; | 
 |   size_t currently_collapsing = 0; | 
 |   while (adjusted_iter != adjustments_on_adjusted_string->end()) { | 
 |     // Extent of the current second-stage adjustment, shifted by |shift|. | 
 |     const size_t adjusted_begin = adjusted_iter->original_offset + shift; | 
 |     const size_t adjusted_end = adjusted_begin + adjusted_iter->original_length; | 
 |  | 
 |     // Case 1: no first-stage adjustments remain, or the whole of | 
 |     // |adjusted_iter| lies before |first_iter|.  Finalize its offset, commit | 
 |     // any pending collapse to |shift|, and move to the next entry. | 
 |     if ((first_iter == first_adjustments.end()) || | 
 |         (adjusted_end <= first_iter->original_offset)) { | 
 |       adjusted_iter->original_offset = adjusted_begin; | 
 |       shift += currently_collapsing; | 
 |       currently_collapsing = 0; | 
 |       ++adjusted_iter; | 
 |       continue; | 
 |     } | 
 |  | 
 |     // Case 2: |first_iter| starts before |adjusted_iter|. | 
 |     if (adjusted_begin > first_iter->original_offset) { | 
 |       // The two must not overlap: a second-stage offset cannot point into | 
 |       // the middle of the output of a first-stage adjustment. | 
 |       DCHECK_LE(first_iter->original_offset + first_iter->output_length, | 
 |                 adjusted_iter->original_offset + shift); | 
 |  | 
 |       // Copy |*first_iter| into the merged list just ahead of | 
 |       // |adjusted_iter|.  insert() returns an iterator to the new element, | 
 |       // so stepping once puts |adjusted_iter| back on the element it was | 
 |       // referring to before the insertion. | 
 |       shift += first_iter->original_length - first_iter->output_length; | 
 |       adjusted_iter = adjustments_on_adjusted_string->insert( | 
 |           adjusted_iter, *first_iter); | 
 |       ++adjusted_iter; | 
 |       ++first_iter; | 
 |       continue; | 
 |     } | 
 |  | 
 |     // Case 3: |first_iter| starts inside the range covered by | 
 |     // |adjusted_iter|, i.e. | 
 |     //   adjusted_begin <= first_iter->original_offset < adjusted_end | 
 |     // so text altered by the first stage was altered again by the second. | 
 |     // Grow |adjusted_iter|'s original length by what the first stage | 
 |     // collapsed and consume |first_iter|. | 
 |     const int collapse = static_cast<int>(first_iter->original_length) - | 
 |         static_cast<int>(first_iter->output_length); | 
 |     // Only collapse-then-modify is supported here, not expand-then-modify. | 
 |     DCHECK_GT(collapse, 0); | 
 |     adjusted_iter->original_length += collapse; | 
 |     currently_collapsing += collapse; | 
 |     ++first_iter; | 
 |   } | 
 |   DCHECK_EQ(0u, currently_collapsing); | 
 |   if (first_iter != first_adjustments.end()) { | 
 |     // Only first-stage adjustments remain.  Their offsets already refer to | 
 |     // the original string, so they are appended unchanged. | 
 |     DCHECK(adjusted_iter == adjustments_on_adjusted_string->end()); | 
 |     adjustments_on_adjusted_string->insert( | 
 |         adjustments_on_adjusted_string->end(), first_iter, | 
 |         first_adjustments.end()); | 
 |   } | 
 | } | 
 |  | 
 | // Converts |src_len| characters at |src| from the source Unicode character | 
 | // type to the destination one, emitting each converted character into | 
 | // |output| through WriteUnicodeCharacter().  This function does not reset | 
 | // |output| itself; the callers in this file run a PrepareFor*Output() helper | 
 | // on it first. | 
 | // | 
 | // If non-NULL, |adjustments| is cleared and then filled with one entry for | 
 | // every alteration that is not one-character-to-one-character, in increasing | 
 | // offset order. | 
 | // | 
 | // Returns false if any character could not be read; each such character is | 
 | // written as U+FFFD. | 
 | template<typename SrcChar, typename DestStdString> | 
 | bool ConvertUnicode(const SrcChar* src, | 
 |                     size_t src_len, | 
 |                     DestStdString* output, | 
 |                     OffsetAdjuster::Adjustments* adjustments) { | 
 |   if (adjustments) | 
 |     adjustments->clear(); | 
 |   // ICU requires 32-bit numbers. | 
 |   const int32_t src_len32 = static_cast<int32_t>(src_len); | 
 |   bool success = true; | 
 |   for (int32_t i = 0; i < src_len32; ++i) { | 
 |     const size_t original_i = i; | 
 |     uint32_t code_point; | 
 |     const bool decoded = | 
 |         ReadUnicodeCharacter(src, src_len32, &i, &code_point); | 
 |     const size_t chars_written = | 
 |         decoded ? WriteUnicodeCharacter(code_point, output) | 
 |                 : WriteUnicodeCharacter(0xFFFD, output); | 
 |     if (!decoded) | 
 |       success = false; | 
 |  | 
 |     // ReadUnicodeCharacter() leaves |i| _at_ the last character it consumed | 
 |     // rather than after it (the loop increment then moves past it), hence | 
 |     // the +1 when working out how many characters were read. | 
 |     const size_t chars_read = i - original_i + 1; | 
 |  | 
 |     // An adjustment is only recorded when the character changed length. | 
 |     if (adjustments && (chars_read != chars_written)) { | 
 |       adjustments->push_back( | 
 |           OffsetAdjuster::Adjustment(original_i, chars_read, chars_written)); | 
 |     } | 
 |   } | 
 |   return success; | 
 | } | 
 |  | 
 | // Converts the |src_len| bytes of UTF-8 at |src| into |output|, which is | 
 | // first passed through PrepareForUTF16Or32Output().  |adjustments| is handed | 
 | // on to ConvertUnicode(), whose result is returned. | 
 | bool UTF8ToUTF16WithAdjustments( | 
 |     const char* src, | 
 |     size_t src_len, | 
 |     string16* output, | 
 |     base::OffsetAdjuster::Adjustments* adjustments) { | 
 |   PrepareForUTF16Or32Output(src, src_len, output); | 
 |   const bool converted = ConvertUnicode(src, src_len, output, adjustments); | 
 |   return converted; | 
 | } | 
 |  | 
 | // Convenience overload taking a StringPiece and returning the converted | 
 | // string by value.  The bool result of the pointer-based overload is not | 
 | // propagated to the caller. | 
 | string16 UTF8ToUTF16WithAdjustments( | 
 |     const base::StringPiece& utf8, | 
 |     base::OffsetAdjuster::Adjustments* adjustments) { | 
 |   string16 utf16; | 
 |   UTF8ToUTF16WithAdjustments(utf8.data(), utf8.length(), &utf16, adjustments); | 
 |   return utf16; | 
 | } | 
 |  | 
 | // Converts |utf8| to UTF-16 and rewrites each entry of | 
 | // |offsets_for_adjustment| in place so that it indexes the returned string. | 
 | // Entries greater than |utf8.length()| are set to string16::npos before the | 
 | // conversion; the rest are mapped by OffsetAdjuster::AdjustOffsets(). | 
 | string16 UTF8ToUTF16AndAdjustOffsets( | 
 |     const base::StringPiece& utf8, | 
 |     std::vector<size_t>* offsets_for_adjustment) { | 
 |   const size_t input_length = utf8.length(); | 
 |   const size_t invalid_offset = string16::npos; | 
 |   std::replace_if( | 
 |       offsets_for_adjustment->begin(), offsets_for_adjustment->end(), | 
 |       [input_length](size_t offset) { return offset > input_length; }, | 
 |       invalid_offset); | 
 |  | 
 |   OffsetAdjuster::Adjustments adjustments; | 
 |   string16 utf16; | 
 |   UTF8ToUTF16WithAdjustments(utf8.data(), utf8.length(), &utf16, &adjustments); | 
 |   OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment); | 
 |   return utf16; | 
 | } | 
 |  | 
 | // Converts |utf16| to UTF-8 and rewrites each entry of | 
 | // |offsets_for_adjustment| in place so that it indexes the returned string. | 
 | // Entries greater than |utf16.length()| are set to string16::npos before the | 
 | // conversion; the rest are mapped by OffsetAdjuster::AdjustOffsets(). | 
 | std::string UTF16ToUTF8AndAdjustOffsets( | 
 |     const base::StringPiece16& utf16, | 
 |     std::vector<size_t>* offsets_for_adjustment) { | 
 |   const size_t input_length = utf16.length(); | 
 |   const size_t invalid_offset = string16::npos; | 
 |   std::replace_if( | 
 |       offsets_for_adjustment->begin(), offsets_for_adjustment->end(), | 
 |       [input_length](size_t offset) { return offset > input_length; }, | 
 |       invalid_offset); | 
 |  | 
 |   std::string utf8; | 
 |   PrepareForUTF8Output(utf16.data(), utf16.length(), &utf8); | 
 |   OffsetAdjuster::Adjustments adjustments; | 
 |   // The success flag from ConvertUnicode() is not used here. | 
 |   ConvertUnicode(utf16.data(), utf16.length(), &utf8, &adjustments); | 
 |   OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment); | 
 |   return utf8; | 
 | } | 
 |  | 
 | }  // namespace base |