update icu Updated icu from current https://source.chromium.org/chromium/chromium/src/+/main:base/third_party/icu/ This is preparation for https://gn-review.googlesource.com/c/gn/+/13782 Change-Id: Ic6da06e92e801cfb3851a5d5087661f20e199a04 Reviewed-on: https://gn-review.googlesource.com/c/gn/+/13801 Reviewed-by: Petr Hosek <phosek@google.com> Commit-Queue: Takuto Ikuta <tikuta@google.com>
diff --git a/build/gen.py b/build/gen.py index e814b58..9fd68e9 100755 --- a/build/gen.py +++ b/build/gen.py
@@ -562,7 +562,6 @@ 'src/base/strings/stringprintf.cc', 'src/base/strings/utf_string_conversion_utils.cc', 'src/base/strings/utf_string_conversions.cc', - 'src/base/third_party/icu/icu_utf.cc', 'src/base/timer/elapsed_timer.cc', 'src/base/value_iterators.cc', 'src/base/values.cc',
diff --git a/src/base/third_party/icu/README.chromium b/src/base/third_party/icu/README.chromium index 297e89a..4f398d9 100644 --- a/src/base/third_party/icu/README.chromium +++ b/src/base/third_party/icu/README.chromium
@@ -6,12 +6,12 @@ This file has the relevant components from ICU copied to handle basic UTF8/16/32 conversions. Components are copied from umachine.h, utf.h, utf8.h, and utf16.h -into icu_utf.h, and from utf_impl.cpp into icu_utf.cc. +into icu_utf.h. -The main change is that U_/U8_/U16_ prefixes have been replaced with -CBU_/CBU8_/CBU16_ (for "Chrome Base") to avoid confusion with the "real" ICU -macros should ICU be in use on the system. For the same reason, the functions -and types have been put in the "base_icu" namespace. +The main change is that U_/U8_/U16_/UPRV_ prefixes have been replaced with +CBU_/CBU8_/CBU16_/CBUPRV_ (for "Chrome Base") to avoid confusion with the "real" +ICU macros should ICU be in use on the system. For the same reason, the +functions and types have been put in the "base_icu" namespace. Note that this license file is marked as NOT_SHIPPED, since a more complete ICU license is included from //third_party/icu/README.chromium
diff --git a/src/base/third_party/icu/icu_utf.cc b/src/base/third_party/icu/icu_utf.cc deleted file mode 100644 index 8dcb401..0000000 --- a/src/base/third_party/icu/icu_utf.cc +++ /dev/null
@@ -1,129 +0,0 @@ -// © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* -****************************************************************************** -* -* Copyright (C) 1999-2012, International Business Machines -* Corporation and others. All Rights Reserved. -* -****************************************************************************** -* file name: utf_impl.cpp -* encoding: UTF-8 -* tab size: 8 (not used) -* indentation:4 -* -* created on: 1999sep13 -* created by: Markus W. Scherer -* -* This file provides implementation functions for macros in the utfXX.h -* that would otherwise be too long as macros. -*/ - -#include "base/third_party/icu/icu_utf.h" - -namespace base_icu { - -// source/common/utf_impl.cpp - -static const UChar32 utf8_errorValue[6] = { - // Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE, - // but without relying on the obsolete unicode/utf_old.h. - 0x15, 0x9f, 0xffff, 0x10ffff}; - -static UChar32 errorValue(int32_t count, int8_t strict) { - if (strict >= 0) { - return utf8_errorValue[count]; - } else if (strict == -3) { - return 0xfffd; - } else { - return CBU_SENTINEL; - } -} - -/* - * Handle the non-inline part of the U8_NEXT() and U8_NEXT_FFFD() macros - * and their obsolete sibling UTF8_NEXT_CHAR_SAFE(). - * - * U8_NEXT() supports NUL-terminated strings indicated via length<0. - * - * The "strict" parameter controls the error behavior: - * <0 "Safe" behavior of U8_NEXT(): - * -1: All illegal byte sequences yield U_SENTINEL=-1. - * -2: Same as -1, except for lenient treatment of surrogate code points as - * legal. Some implementations use this for roundtripping of Unicode 16-bit - * strings that are not well-formed UTF-16, that is, they contain - * unpaired surrogates. -3: All illegal byte sequences yield U+FFFD. 
0 - * Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE): All illegal - * byte sequences yield a positive code point such that this result code - * point would be encoded with the same number of bytes as the illegal - * sequence. >0 Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., - * TRUE): Same as the obsolete "safe" behavior, but non-characters are also - * treated like illegal sequences. - * - * Note that a UBool is the same as an int8_t. - */ -UChar32 utf8_nextCharSafeBody(const uint8_t* s, - int32_t* pi, - int32_t length, - UChar32 c, - UBool strict) { - // *pi is one after byte c. - int32_t i = *pi; - // length can be negative for NUL-terminated strings: Read and validate one - // byte at a time. - if (i == length || c > 0xf4) { - // end of string, or not a lead byte - } else if (c >= 0xf0) { - // Test for 4-byte sequences first because - // U8_NEXT() handles shorter valid sequences inline. - uint8_t t1 = s[i], t2, t3; - c &= 7; - if (CBU8_IS_VALID_LEAD4_AND_T1(c, t1) && ++i != length && - (t2 = s[i] - 0x80) <= 0x3f && ++i != length && - (t3 = s[i] - 0x80) <= 0x3f) { - ++i; - c = (c << 18) | ((t1 & 0x3f) << 12) | (t2 << 6) | t3; - // strict: forbid non-characters like U+fffe - if (strict <= 0 || !CBU_IS_UNICODE_NONCHAR(c)) { - *pi = i; - return c; - } - } - } else if (c >= 0xe0) { - c &= 0xf; - if (strict != -2) { - uint8_t t1 = s[i], t2; - if (CBU8_IS_VALID_LEAD3_AND_T1(c, t1) && ++i != length && - (t2 = s[i] - 0x80) <= 0x3f) { - ++i; - c = (c << 12) | ((t1 & 0x3f) << 6) | t2; - // strict: forbid non-characters like U+fffe - if (strict <= 0 || !CBU_IS_UNICODE_NONCHAR(c)) { - *pi = i; - return c; - } - } - } else { - // strict=-2 -> lenient: allow surrogates - uint8_t t1 = s[i] - 0x80, t2; - if (t1 <= 0x3f && (c > 0 || t1 >= 0x20) && ++i != length && - (t2 = s[i] - 0x80) <= 0x3f) { - *pi = i + 1; - return (c << 12) | (t1 << 6) | t2; - } - } - } else if (c >= 0xc2) { - uint8_t t1 = s[i] - 0x80; - if (t1 <= 0x3f) { - *pi = i + 1; - return ((c - 
0xc0) << 6) | t1; - } - } // else 0x80<=c<0xc2 is not a lead byte - - /* error handling */ - c = errorValue(i - *pi, strict); - *pi = i; - return c; -} - -} // namespace base_icu
diff --git a/src/base/third_party/icu/icu_utf.h b/src/base/third_party/icu/icu_utf.h index b626b39..16792c4 100644 --- a/src/base/third_party/icu/icu_utf.h +++ b/src/base/third_party/icu/icu_utf.h
@@ -60,6 +60,25 @@ */ #define CBU_SENTINEL (-1) +/** + * \def UPRV_BLOCK_MACRO_BEGIN + * Defined as the "do" keyword by default. + * @internal + */ +#ifndef CBUPRV_BLOCK_MACRO_BEGIN +#define CBUPRV_BLOCK_MACRO_BEGIN do +#endif + +/** + * \def UPRV_BLOCK_MACRO_END + * Defined as "while (FALSE)" by default. + * @internal + */ +#ifndef CBUPRV_BLOCK_MACRO_END +#define CBUPRV_BLOCK_MACRO_END while (0) +#endif + + // source/common/unicode/utf.h /** @@ -68,9 +87,9 @@ * @return TRUE or FALSE * @stable ICU 2.4 */ -#define CBU_IS_UNICODE_NONCHAR(c) \ - ((c) >= 0xfdd0 && ((c) <= 0xfdef || ((c)&0xfffe) == 0xfffe) && \ - (c) <= 0x10ffff) +#define CBU_IS_UNICODE_NONCHAR(c) \ + ((c)>=0xfdd0 && \ + ((c)<=0xfdef || ((c)&0xfffe)==0xfffe) && (c)<=0x10ffff) /** * Is c a Unicode code point value (0..U+10ffff) @@ -78,9 +97,9 @@ * * Code points that are not characters include: * - single surrogate code points (U+d800..U+dfff, 2048 code points) - * - the last two code points on each plane (U+__fffe and U+__ffff, 34 code - * points) - U+fdd0..U+fdef (new with Unicode 3.1, 32 code points) - the highest - * Unicode code point value is U+10ffff + * - the last two code points on each plane (U+__fffe and U+__ffff, 34 code points) + * - U+fdd0..U+fdef (new with Unicode 3.1, 32 code points) + * - the highest Unicode code point value is U+10ffff * * This means that all code points below U+d800 are character code points, * and that boundary is tested first for performance. @@ -90,8 +109,8 @@ * @stable ICU 2.4 */ #define CBU_IS_UNICODE_CHAR(c) \ - ((uint32_t)(c) < 0xd800 || \ - (0xdfff < (c) && (c) <= 0x10ffff && !CBU_IS_UNICODE_NONCHAR(c))) + ((uint32_t)(c)<0xd800 || \ + (0xdfff<(c) && (c)<=0x10ffff && !CBU_IS_UNICODE_NONCHAR(c))) /** * Is this code point a surrogate (U+d800..U+dfff)? 
@@ -99,7 +118,7 @@ * @return TRUE or FALSE * @stable ICU 2.4 */ -#define CBU_IS_SURROGATE(c) (((c)&0xfffff800) == 0xd800) +#define CBU_IS_SURROGATE(c) (((c)&0xfffff800)==0xd800) /** * Assuming c is a surrogate code point (U_IS_SURROGATE(c)), @@ -108,69 +127,43 @@ * @return TRUE or FALSE * @stable ICU 2.4 */ -#define CBU_IS_SURROGATE_LEAD(c) (((c)&0x400) == 0) +#define CBU_IS_SURROGATE_LEAD(c) (((c)&0x400)==0) // source/common/unicode/utf8.h /** - * Internal bit vector for 3-byte UTF-8 validity check, for use in - * U8_IS_VALID_LEAD3_AND_T1. Each bit indicates whether one lead byte + first - * trail byte pair starts a valid sequence. Lead byte E0..EF bits 3..0 are used - * as byte index, first trail byte bits 7..5 are used as bit index into that - * byte. + * Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1. + * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence. + * Lead byte E0..EF bits 3..0 are used as byte index, + * first trail byte bits 7..5 are used as bit index into that byte. * @see U8_IS_VALID_LEAD3_AND_T1 * @internal */ -#define CBU8_LEAD3_T1_BITS \ - "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30" +#define CBU8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30" /** * Internal 3-byte UTF-8 validity check. - * Non-zero if lead byte E0..EF and first trail byte 00..FF start a valid - * sequence. + * Non-zero if lead byte E0..EF and first trail byte 00..FF start a valid sequence. * @internal */ -#define CBU8_IS_VALID_LEAD3_AND_T1(lead, t1) \ - (CBU8_LEAD3_T1_BITS[(lead)&0xf] & (1 << ((uint8_t)(t1) >> 5))) +#define CBU8_IS_VALID_LEAD3_AND_T1(lead, t1) (CBU8_LEAD3_T1_BITS[(lead)&0xf]&(1<<((uint8_t)(t1)>>5))) /** - * Internal bit vector for 4-byte UTF-8 validity check, for use in - * U8_IS_VALID_LEAD4_AND_T1. Each bit indicates whether one lead byte + first - * trail byte pair starts a valid sequence. 
First trail byte bits 7..4 are used - * as byte index, lead byte F0..F4 bits 2..0 are used as bit index into that - * byte. + * Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1. + * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence. + * First trail byte bits 7..4 are used as byte index, + * lead byte F0..F4 bits 2..0 are used as bit index into that byte. * @see U8_IS_VALID_LEAD4_AND_T1 * @internal */ -#define CBU8_LEAD4_T1_BITS \ - "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00" +#define CBU8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00" /** * Internal 4-byte UTF-8 validity check. - * Non-zero if lead byte F0..F4 and first trail byte 00..FF start a valid - * sequence. + * Non-zero if lead byte F0..F4 and first trail byte 00..FF start a valid sequence. * @internal */ -#define CBU8_IS_VALID_LEAD4_AND_T1(lead, t1) \ - (CBU8_LEAD4_T1_BITS[(uint8_t)(t1) >> 4] & (1 << ((lead)&7))) - -/** - * Function for handling "next code point" with error-checking. - * - * This is internal since it is not meant to be called directly by external clie -nts; - * however it is U_STABLE (not U_INTERNAL) since it is called by public macros i -n this - * file and thus must remain stable, and should not be hidden when other interna -l - * functions are hidden (otherwise public macros would fail to compile). - * @internal - */ -UChar32 utf8_nextCharSafeBody(const uint8_t* s, - int32_t* pi, - int32_t length, - ::base_icu::UChar32 c, - ::base_icu::UBool strict); +#define CBU8_IS_VALID_LEAD4_AND_T1(lead, t1) (CBU8_LEAD4_T1_BITS[(uint8_t)(t1)>>4]&(1<<((lead)&7))) /** * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)? @@ -178,7 +171,7 @@ * @return TRUE or FALSE * @stable ICU 2.4 */ -#define CBU8_IS_SINGLE(c) (((c)&0x80) == 0) +#define CBU8_IS_SINGLE(c) (((c)&0x80)==0) /** * Is this code unit (byte) a UTF-8 lead byte? 
(0xC2..0xF4) @@ -186,7 +179,7 @@ * @return TRUE or FALSE * @stable ICU 2.4 */ -#define CBU8_IS_LEAD(c) ((uint8_t)((c)-0xc2) <= 0x32) +#define CBU8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<=0x32) /** * Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF) @@ -194,7 +187,7 @@ * @return TRUE or FALSE * @stable ICU 2.4 */ -#define CBU8_IS_TRAIL(c) ((int8_t)(c) < -0x40) +#define CBU8_IS_TRAIL(c) ((int8_t)(c)<-0x40) /** * How many code units (bytes) are used for the UTF-8 encoding @@ -203,20 +196,19 @@ * @return 1..4, or 0 if c is a surrogate or not a Unicode code point * @stable ICU 2.4 */ -#define CBU8_LENGTH(c) \ - ((uint32_t)(c) <= 0x7f \ - ? 1 \ - : ((uint32_t)(c) <= 0x7ff \ - ? 2 \ - : ((uint32_t)(c) <= 0xd7ff \ - ? 3 \ - : ((uint32_t)(c) <= 0xdfff || (uint32_t)(c) > 0x10ffff \ - ? 0 \ - : ((uint32_t)(c) <= 0xffff ? 3 : 4))))) +#define CBU8_LENGTH(c) \ + ((uint32_t)(c)<=0x7f ? 1 : \ + ((uint32_t)(c)<=0x7ff ? 2 : \ + ((uint32_t)(c)<=0xd7ff ? 3 : \ + ((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff ? 0 : \ + ((uint32_t)(c)<=0xffff ? 3 : 4)\ + ) \ + ) \ + ) \ + ) /** - * The maximum number of UTF-8 code units (bytes) per Unicode code point - * (U+0000..U+10ffff). + * The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff). 
* @return 4 * @stable ICU 2.4 */ @@ -242,37 +234,43 @@ * @see U8_NEXT_UNSAFE * @stable ICU 2.4 */ -#define CBU8_NEXT(s, i, length, c) \ - { \ - (c) = (uint8_t)(s)[(i)++]; \ - if (!CBU8_IS_SINGLE(c)) { \ - uint8_t __t1, __t2; \ - if (/* handle U+0800..U+FFFF inline */ \ - (0xe0 <= (c) && (c) < 0xf0) && \ - (((i) + 1) < (length) || (length) < 0) && \ - CBU8_IS_VALID_LEAD3_AND_T1((c), __t1 = (s)[i]) && \ - (__t2 = (s)[(i) + 1] - 0x80) <= 0x3f) { \ - (c) = (((c)&0xf) << 12) | ((__t1 & 0x3f) << 6) | __t2; \ - (i) += 2; \ - } else if (/* handle U+0080..U+07FF inline */ \ - ((c) < 0xe0 && (c) >= 0xc2) && ((i) != (length)) && \ - (__t1 = (s)[i] - 0x80) <= 0x3f) { \ - (c) = (((c)&0x1f) << 6) | __t1; \ - ++(i); \ - } else { \ - /* function call for "complicated" and error cases */ \ - (c) = ::base_icu::utf8_nextCharSafeBody((const uint8_t*)s, &(i), \ - (length), c, -1); \ - } \ - } \ - } +#define CBU8_NEXT(s, i, length, c) CBU8_INTERNAL_NEXT_OR_SUB(s, i, length, c, CBU_SENTINEL) + +/** @internal */ +#define CBU8_INTERNAL_NEXT_OR_SUB(s, i, length, c, sub) CBUPRV_BLOCK_MACRO_BEGIN { \ + (c)=(uint8_t)(s)[(i)++]; \ + if(!CBU8_IS_SINGLE(c)) { \ + uint8_t __t = 0; \ + if((i)!=(length) && \ + /* fetch/validate/assemble all but last trail byte */ \ + ((c)>=0xe0 ? \ + ((c)<0xf0 ? /* U+0800..U+FFFF except surrogates */ \ + CBU8_LEAD3_T1_BITS[(c)&=0xf]&(1<<((__t=(s)[i])>>5)) && \ + (__t&=0x3f, 1) \ + : /* U+10000..U+10FFFF */ \ + ((c)-=0xf0)<=4 && \ + CBU8_LEAD4_T1_BITS[(__t=(s)[i])>>4]&(1<<(c)) && \ + ((c)=((c)<<6)|(__t&0x3f), ++(i)!=(length)) && \ + (__t=(s)[i]-0x80)<=0x3f) && \ + /* valid second-to-last trail byte */ \ + ((c)=((c)<<6)|__t, ++(i)!=(length)) \ + : /* U+0080..U+07FF */ \ + (c)>=0xc2 && ((c)&=0x1f, 1)) && \ + /* last trail byte */ \ + (__t=(s)[i]-0x80)<=0x3f && \ + ((c)=((c)<<6)|__t, ++(i), 1)) { \ + } else { \ + (c)=(sub); /* ill-formed*/ \ + } \ + } \ +} CBUPRV_BLOCK_MACRO_END /** * Append a code point to a string, overwriting 1 to 4 bytes. 
* The offset points to the current end of the string contents * and is advanced (post-increment). - * "Unsafe" macro, assumes a valid code point and sufficient space in the - * string. Otherwise, the result is undefined. + * "Unsafe" macro, assumes a valid code point and sufficient space in the string. + * Otherwise, the result is undefined. * * @param s const uint8_t * string buffer * @param i string offset @@ -280,25 +278,25 @@ * @see U8_APPEND * @stable ICU 2.4 */ -#define CBU8_APPEND_UNSAFE(s, i, c) \ - { \ - if ((uint32_t)(c) <= 0x7f) { \ - (s)[(i)++] = (uint8_t)(c); \ - } else { \ - if ((uint32_t)(c) <= 0x7ff) { \ - (s)[(i)++] = (uint8_t)(((c) >> 6) | 0xc0); \ - } else { \ - if ((uint32_t)(c) <= 0xffff) { \ - (s)[(i)++] = (uint8_t)(((c) >> 12) | 0xe0); \ - } else { \ - (s)[(i)++] = (uint8_t)(((c) >> 18) | 0xf0); \ - (s)[(i)++] = (uint8_t)((((c) >> 12) & 0x3f) | 0x80); \ - } \ - (s)[(i)++] = (uint8_t)((((c) >> 6) & 0x3f) | 0x80); \ - } \ - (s)[(i)++] = (uint8_t)(((c)&0x3f) | 0x80); \ - } \ - } +#define CBU8_APPEND_UNSAFE(s, i, c) CBUPRV_BLOCK_MACRO_BEGIN { \ + uint32_t __uc=(c); \ + if(__uc<=0x7f) { \ + (s)[(i)++]=(uint8_t)__uc; \ + } else { \ + if(__uc<=0x7ff) { \ + (s)[(i)++]=(uint8_t)((__uc>>6)|0xc0); \ + } else { \ + if(__uc<=0xffff) { \ + (s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); \ + } else { \ + (s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); \ + (s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \ + } \ + (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \ + } \ + (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \ + } \ +} CBUPRV_BLOCK_MACRO_END // source/common/unicode/utf16.h @@ -316,7 +314,7 @@ * @return TRUE or FALSE * @stable ICU 2.4 */ -#define CBU16_IS_LEAD(c) (((c)&0xfffffc00) == 0xd800) +#define CBU16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800) /** * Is this code unit a trail surrogate (U+dc00..U+dfff)? 
@@ -324,7 +322,7 @@ * @return TRUE or FALSE * @stable ICU 2.4 */ -#define CBU16_IS_TRAIL(c) (((c)&0xfffffc00) == 0xdc00) +#define CBU16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00) /** * Is this code unit a surrogate (U+d800..U+dfff)? @@ -341,13 +339,13 @@ * @return TRUE or FALSE * @stable ICU 2.4 */ -#define CBU16_IS_SURROGATE_LEAD(c) (((c)&0x400) == 0) +#define CBU16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0) /** * Helper constant for U16_GET_SUPPLEMENTARY. * @internal */ -#define CBU16_SURROGATE_OFFSET ((0xd800 << 10UL) + 0xdc00 - 0x10000) +#define CBU16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000) /** * Get a supplementary code point value (U+10000..U+10ffff) @@ -361,8 +359,7 @@ * @stable ICU 2.4 */ #define CBU16_GET_SUPPLEMENTARY(lead, trail) \ - (((::base_icu::UChar32)(lead) << 10UL) + \ - (::base_icu::UChar32)(trail)-CBU16_SURROGATE_OFFSET) + (((::base_icu::UChar32)(lead)<<10UL)+(::base_icu::UChar32)(trail)-CBU16_SURROGATE_OFFSET) /** * Get the lead surrogate (0xd800..0xdbff) for a @@ -371,8 +368,7 @@ * @return lead surrogate (U+d800..U+dbff) for supplementary * @stable ICU 2.4 */ -#define CBU16_LEAD(supplementary) \ - (::base_icu::UChar)(((supplementary) >> 10) + 0xd7c0) +#define CBU16_LEAD(supplementary) (::base_icu::UChar)(((supplementary)>>10)+0xd7c0) /** * Get the trail surrogate (0xdc00..0xdfff) for a @@ -381,28 +377,64 @@ * @return trail surrogate (U+dc00..U+dfff) for supplementary * @stable ICU 2.4 */ -#define CBU16_TRAIL(supplementary) \ - (::base_icu::UChar)(((supplementary)&0x3ff) | 0xdc00) +#define CBU16_TRAIL(supplementary) (::base_icu::UChar)(((supplementary)&0x3ff)|0xdc00) /** - * How many 16-bit code units are used to encode this Unicode code point? (1 or - * 2) The result is not defined if c is not a Unicode code point - * (U+0000..U+10ffff). + * How many 16-bit code units are used to encode this Unicode code point? (1 or 2) + * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff). 
* @param c 32-bit code point * @return 1 or 2 * @stable ICU 2.4 */ -#define CBU16_LENGTH(c) ((uint32_t)(c) <= 0xffff ? 1 : 2) +#define CBU16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2) /** - * The maximum number of 16-bit code units per Unicode code point - * (U+0000..U+10ffff). + * The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff). * @return 2 * @stable ICU 2.4 */ #define CBU16_MAX_LENGTH 2 /** + * Get a code point from a string at a random-access offset, + * without changing the offset. + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * The offset may point to either the lead or trail surrogate unit + * for a supplementary code point, in which case the macro will read + * the adjacent matching surrogate as well. + * + * The length can be negative for a NUL-terminated string. + * + * If the offset points to a single, unpaired surrogate, then + * c is set to that unpaired surrogate. + * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT. + * + * @param s const UChar * string + * @param start starting string offset (usually 0) + * @param i string offset, must be start<=i<length + * @param length string length + * @param c output UChar32 variable + * @see U16_GET_UNSAFE + * @stable ICU 2.4 + */ +#define CBU16_GET(s, start, i, length, c) CBUPRV_BLOCK_MACRO_BEGIN { \ + (c)=(s)[i]; \ + if(CBU16_IS_SURROGATE(c)) { \ + uint16_t __c2; \ + if(CBU16_IS_SURROGATE_LEAD(c)) { \ + if((i)+1!=(length) && CBU16_IS_TRAIL(__c2=(s)[(i)+1])) { \ + (c)=CBU16_GET_SUPPLEMENTARY((c), __c2); \ + } \ + } else { \ + if((i)>(start) && CBU16_IS_LEAD(__c2=(s)[(i)-1])) { \ + (c)=CBU16_GET_SUPPLEMENTARY(__c2, (c)); \ + } \ + } \ + } \ +} CBUPRV_BLOCK_MACRO_END + +/** * Get a code point from a string at a code point boundary offset, * and advance the offset to the next code point boundary. * (Post-incrementing forward iteration.) 
@@ -414,8 +446,7 @@ * for a supplementary code point, in which case the macro will read * the following trail surrogate as well. * If the offset points to a trail surrogate or - * to a single, unpaired lead surrogate, then c is set to that unpaired - * surrogate. + * to a single, unpaired lead surrogate, then c is set to that unpaired surrogate. * * @param s const UChar * string * @param i string offset, must be i<length @@ -424,24 +455,23 @@ * @see U16_NEXT_UNSAFE * @stable ICU 2.4 */ -#define CBU16_NEXT(s, i, length, c) \ - { \ - (c) = (s)[(i)++]; \ - if (CBU16_IS_LEAD(c)) { \ - uint16_t __c2; \ - if ((i) != (length) && CBU16_IS_TRAIL(__c2 = (s)[(i)])) { \ - ++(i); \ - (c) = CBU16_GET_SUPPLEMENTARY((c), __c2); \ - } \ - } \ - } +#define CBU16_NEXT(s, i, length, c) CBUPRV_BLOCK_MACRO_BEGIN { \ + (c)=(s)[(i)++]; \ + if(CBU16_IS_LEAD(c)) { \ + uint16_t __c2; \ + if((i)!=(length) && CBU16_IS_TRAIL(__c2=(s)[(i)])) { \ + ++(i); \ + (c)=CBU16_GET_SUPPLEMENTARY((c), __c2); \ + } \ + } \ +} CBUPRV_BLOCK_MACRO_END /** * Append a code point to a string, overwriting 1 or 2 code units. * The offset points to the current end of the string contents * and is advanced (post-increment). - * "Unsafe" macro, assumes a valid code point and sufficient space in the - * string. Otherwise, the result is undefined. + * "Unsafe" macro, assumes a valid code point and sufficient space in the string. + * Otherwise, the result is undefined. 
* * @param s const UChar * string buffer * @param i string offset @@ -449,16 +479,89 @@ * @see U16_APPEND * @stable ICU 2.4 */ -#define CBU16_APPEND_UNSAFE(s, i, c) \ - { \ - if ((uint32_t)(c) <= 0xffff) { \ - (s)[(i)++] = (uint16_t)(c); \ - } else { \ - (s)[(i)++] = (uint16_t)(((c) >> 10) + 0xd7c0); \ - (s)[(i)++] = (uint16_t)(((c)&0x3ff) | 0xdc00); \ - } \ - } +#define CBU16_APPEND_UNSAFE(s, i, c) CBUPRV_BLOCK_MACRO_BEGIN { \ + if((uint32_t)(c)<=0xffff) { \ + (s)[(i)++]=(uint16_t)(c); \ + } else { \ + (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \ + (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \ + } \ +} CBUPRV_BLOCK_MACRO_END -} // namespace base_icu +/** + * Adjust a random-access offset to a code point boundary + * at the start of a code point. + * If the offset points to the trail surrogate of a surrogate pair, + * then the offset is decremented. + * Otherwise, it is not modified. + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * @param s const UChar * string + * @param start starting string offset (usually 0) + * @param i string offset, must be start<=i + * @see U16_SET_CP_START_UNSAFE + * @stable ICU 2.4 + */ +#define CBU16_SET_CP_START(s, start, i) CBUPRV_BLOCK_MACRO_BEGIN { \ + if(CBU16_IS_TRAIL((s)[i]) && (i)>(start) && CBU16_IS_LEAD((s)[(i)-1])) { \ + --(i); \ + } \ +} CBUPRV_BLOCK_MACRO_END + +/** + * Move the string offset from one code point boundary to the previous one + * and get the code point between them. + * (Pre-decrementing backward iteration.) + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * The input offset may be the same as the string length. + * If the offset is behind a trail surrogate unit + * for a supplementary code point, then the macro will read + * the preceding lead surrogate as well. + * If the offset is behind a lead surrogate or behind a single, unpaired + * trail surrogate, then c is set to that unpaired surrogate. 
+ * + * @param s const UChar * string + * @param start starting string offset (usually 0) + * @param i string offset, must be start<i + * @param c output UChar32 variable + * @see U16_PREV_UNSAFE + * @stable ICU 2.4 + */ +#define CBU16_PREV(s, start, i, c) CBUPRV_BLOCK_MACRO_BEGIN { \ + (c)=(s)[--(i)]; \ + if(CBU16_IS_TRAIL(c)) { \ + uint16_t __c2; \ + if((i)>(start) && CBU16_IS_LEAD(__c2=(s)[(i)-1])) { \ + --(i); \ + (c)=CBU16_GET_SUPPLEMENTARY(__c2, (c)); \ + } \ + } \ +} CBUPRV_BLOCK_MACRO_END + +/** + * Adjust a random-access offset to a code point boundary after a code point. + * If the offset is behind the lead surrogate of a surrogate pair, + * then the offset is incremented. + * Otherwise, it is not modified. + * The input offset may be the same as the string length. + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * The length can be negative for a NUL-terminated string. + * + * @param s const UChar * string + * @param start int32_t starting string offset (usually 0) + * @param i int32_t string offset, start<=i<=length + * @param length int32_t string length + * @see U16_SET_CP_LIMIT_UNSAFE + * @stable ICU 2.4 + */ +#define CBU16_SET_CP_LIMIT(s, start, i, length) CBUPRV_BLOCK_MACRO_BEGIN { \ + if((start)<(i) && ((i)<(length) || (length)<0) && CBU16_IS_LEAD((s)[(i)-1]) && CBU16_IS_TRAIL((s)[i])) { \ + ++(i); \ + } \ +} CBUPRV_BLOCK_MACRO_END + +} // namespace base_icu #endif // BASE_THIRD_PARTY_ICU_ICU_UTF_H_