// base/third_party/icu/icu_utf.cc - gn - Git at Google

 // © 2016 and later: Unicode, Inc. and others.
 // License & terms of use: http://www.unicode.org/copyright.html
 /*
 ******************************************************************************
 *
 *   Copyright (C) 1999-2012, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 ******************************************************************************
 *   file name:  utf_impl.cpp
 *   encoding:   UTF-8
 *   tab size:   8 (not used)
 *   indentation:4
 *
 *   created on: 1999sep13
 *   created by: Markus W. Scherer
 *
 *   This file provides implementation functions for macros in the utfXX.h
 *   that would otherwise be too long as macros.
 */

 #include "base/third_party/icu/icu_utf.h"

 namespace base_icu {

 // source/common/utf_impl.cpp

 static const UChar32 utf8_errorValue[6] = {
     // Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE,
     // but without relying on the obsolete unicode/utf_old.h.
     0x15, 0x9f, 0xffff, 0x10ffff};

 static UChar32 errorValue(int32_t count, int8_t strict) {
   if (strict >= 0) {
     return utf8_errorValue[count];
   } else if (strict == -3) {
     return 0xfffd;
   } else {
     return CBU_SENTINEL;
   }
 }

 /*
  * Handle the non-inline part of the U8_NEXT() and U8_NEXT_FFFD() macros
  * and their obsolete sibling UTF8_NEXT_CHAR_SAFE().
  *
  * U8_NEXT() supports NUL-terminated strings indicated via length<0.
  *
  * Parameters:
  *   s       the byte array being decoded.
  *   pi      in: index one after the lead byte c;
  *           out: index after the bytes that were accepted.
  *   length  end index of s, or negative for a NUL-terminated string.
  *   c       the lead byte that the caller has already read.
  *   strict  error behavior selector, see below.
  *
  * Returns the decoded code point for a well-formed sequence. Otherwise
  * returns errorValue(n, strict), where n is the number of bytes by which
  * the index was advanced before the sequence was rejected.
  *
  * The "strict" parameter controls the error behavior:
  * <0  "Safe" behavior of U8_NEXT():
  *     -1: All illegal byte sequences yield U_SENTINEL=-1.
  *     -2: Same as -1, except for lenient treatment of surrogate code points
  *         as legal. Some implementations use this for roundtripping of
  *         Unicode 16-bit strings that are not well-formed UTF-16, that is,
  *         they contain unpaired surrogates.
  *     -3: All illegal byte sequences yield U+FFFD.
  *  0  Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE):
  *     All illegal byte sequences yield a positive code point such that this
  *     result code point would be encoded with the same number of bytes as
  *     the illegal sequence.
  * >0  Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., TRUE):
  *     Same as the obsolete "safe" behavior, but non-characters are also
  *     treated like illegal sequences.
  *
  * Note that a UBool is the same as an int8_t.
  */
 UChar32 utf8_nextCharSafeBody(const uint8_t* s,
                               int32_t* pi,
                               int32_t length,
                               UChar32 c,
                               UBool strict) {
   // *pi is one after byte c.
   int32_t i = *pi;
   // length can be negative for NUL-terminated strings: Read and validate one
   // byte at a time.
   // Throughout, a trail byte is tested as (uint8_t)(byte - 0x80) <= 0x3f:
   // the subtraction wraps in uint8_t, so only bytes 0x80..0xbf pass.
   if (i == length || c > 0xf4) {
     // end of string, or not a lead byte
   } else if (c >= 0xf0) {
     // Test for 4-byte sequences first because
     // U8_NEXT() handles shorter valid sequences inline.
     uint8_t t1 = s[i], t2, t3;
     c &= 7;  // keep the payload bits of the lead byte
     // NOTE(review): CBU8_IS_VALID_LEAD4_AND_T1 is defined outside this file;
     // presumably it checks the lead/first-trail pair together - confirm in
     // icu_utf.h.
     // Each "++i != length" both consumes the previous byte and guards the
     // following read; i therefore counts the accepted trail bytes on failure.
     if (CBU8_IS_VALID_LEAD4_AND_T1(c, t1) && ++i != length &&
         (t2 = s[i] - 0x80) <= 0x3f && ++i != length &&
         (t3 = s[i] - 0x80) <= 0x3f) {
       ++i;
       c = (c << 18) | ((t1 & 0x3f) << 12) | (t2 << 6) | t3;
       // strict: forbid non-characters like U+fffe
       if (strict <= 0 || !CBU_IS_UNICODE_NONCHAR(c)) {
         *pi = i;
         return c;
       }
       // Rejected non-character: falls through to the error path with all
       // three trail bytes already consumed.
     }
   } else if (c >= 0xe0) {
     c &= 0xf;  // keep the payload bits of the lead byte
     if (strict != -2) {
       uint8_t t1 = s[i], t2;
       // NOTE(review): CBU8_IS_VALID_LEAD3_AND_T1 is defined outside this
       // file; presumably it checks the lead/first-trail pair together -
       // confirm in icu_utf.h.
       if (CBU8_IS_VALID_LEAD3_AND_T1(c, t1) && ++i != length &&
           (t2 = s[i] - 0x80) <= 0x3f) {
         ++i;
         c = (c << 12) | ((t1 & 0x3f) << 6) | t2;
         // strict: forbid non-characters like U+fffe
         if (strict <= 0 || !CBU_IS_UNICODE_NONCHAR(c)) {
           *pi = i;
           return c;
         }
       }
     } else {
       // strict=-2 -> lenient: allow surrogates
       uint8_t t1 = s[i] - 0x80, t2;
       // With a zero lead payload (c == 0) the first trail payload must be at
       // least 0x20; anything smaller would decode to a value below 0x800.
       if (t1 <= 0x3f && (c > 0 || t1 >= 0x20) && ++i != length &&
           (t2 = s[i] - 0x80) <= 0x3f) {
         *pi = i + 1;
         return (c << 12) | (t1 << 6) | t2;
       }
     }
   } else if (c >= 0xc2) {
     // 2-byte sequence: a single trail byte follows the lead byte.
     uint8_t t1 = s[i] - 0x80;
     if (t1 <= 0x3f) {
       *pi = i + 1;
       return ((c - 0xc0) << 6) | t1;
     }
   }  // else 0x80<=c<0xc2 is not a lead byte

   /* error handling */
   // i - *pi is how far the index advanced before the sequence was rejected.
   c = errorValue(i - *pi, strict);
   *pi = i;
   return c;
 }

 }  // namespace base_icu
	// © 2016 and later: Unicode, Inc. and others.
	// License & terms of use: http://www.unicode.org/copyright.html
	/*
	******************************************************************************
	*
	* Copyright (C) 1999-2012, International Business Machines
	* Corporation and others. All Rights Reserved.
	*
	******************************************************************************
	* file name: utf_impl.cpp
	* encoding: UTF-8
	* tab size: 8 (not used)
	* indentation:4
	*
	* created on: 1999sep13
	* created by: Markus W. Scherer
	*
	* This file provides implementation functions for macros in the utfXX.h
	* that would otherwise be too long as macros.
	*/

	#include "base/third_party/icu/icu_utf.h"

	namespace base_icu {

	// source/common/utf_impl.cpp

	static const UChar32 utf8_errorValue[6] = {
	// Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE,
	// but without relying on the obsolete unicode/utf_old.h.
	0x15, 0x9f, 0xffff, 0x10ffff};

	// Maps a failed decode to the value the caller should see.
	// Non-negative |strict| selects the obsolete behavior: a table lookup in
	// utf8_errorValue keyed by |count|. A |strict| of -3 yields U+FFFD, and
	// every other negative value yields CBU_SENTINEL.
	static UChar32 errorValue(int32_t count, int8_t strict) {
	  if (strict >= 0) {
	    return utf8_errorValue[count];
	  }
	  if (strict == -3) {
	    return 0xfffd;
	  }
	  return CBU_SENTINEL;
	}

	/*
	 * Handle the non-inline part of the U8_NEXT() and U8_NEXT_FFFD() macros
	 * and their obsolete sibling UTF8_NEXT_CHAR_SAFE().
	 *
	 * U8_NEXT() supports NUL-terminated strings indicated via length<0.
	 *
	 * Parameters:
	 *   s       the byte array being decoded.
	 *   pi      in: index one after the lead byte c;
	 *           out: index after the bytes that were accepted.
	 *   length  end index of s, or negative for a NUL-terminated string.
	 *   c       the lead byte that the caller has already read.
	 *   strict  error behavior selector, see below.
	 *
	 * Returns the decoded code point for a well-formed sequence. Otherwise
	 * returns errorValue(n, strict), where n is the number of bytes by which
	 * the index was advanced before the sequence was rejected.
	 *
	 * The "strict" parameter controls the error behavior:
	 * <0  "Safe" behavior of U8_NEXT():
	 *     -1: All illegal byte sequences yield U_SENTINEL=-1.
	 *     -2: Same as -1, except for lenient treatment of surrogate code points
	 *         as legal. Some implementations use this for roundtripping of
	 *         Unicode 16-bit strings that are not well-formed UTF-16, that is,
	 *         they contain unpaired surrogates.
	 *     -3: All illegal byte sequences yield U+FFFD.
	 *  0  Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE):
	 *     All illegal byte sequences yield a positive code point such that this
	 *     result code point would be encoded with the same number of bytes as
	 *     the illegal sequence.
	 * >0  Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., TRUE):
	 *     Same as the obsolete "safe" behavior, but non-characters are also
	 *     treated like illegal sequences.
	 *
	 * Note that a UBool is the same as an int8_t.
	 */
	UChar32 utf8_nextCharSafeBody(const uint8_t* s,
	                              int32_t* pi,
	                              int32_t length,
	                              UChar32 c,
	                              UBool strict) {
	  // *pi is one after byte c.
	  int32_t i = *pi;
	  // length can be negative for NUL-terminated strings: Read and validate one
	  // byte at a time.
	  // Throughout, a trail byte is tested as (uint8_t)(byte - 0x80) <= 0x3f:
	  // the subtraction wraps in uint8_t, so only bytes 0x80..0xbf pass.
	  if (i == length || c > 0xf4) {
	    // end of string, or not a lead byte
	  } else if (c >= 0xf0) {
	    // Test for 4-byte sequences first because
	    // U8_NEXT() handles shorter valid sequences inline.
	    uint8_t t1 = s[i], t2, t3;
	    c &= 7;  // keep the payload bits of the lead byte
	    // NOTE(review): CBU8_IS_VALID_LEAD4_AND_T1 is defined outside this file;
	    // presumably it checks the lead/first-trail pair together - confirm in
	    // icu_utf.h.
	    // Each "++i != length" both consumes the previous byte and guards the
	    // following read; i therefore counts the accepted trail bytes on failure.
	    if (CBU8_IS_VALID_LEAD4_AND_T1(c, t1) && ++i != length &&
	        (t2 = s[i] - 0x80) <= 0x3f && ++i != length &&
	        (t3 = s[i] - 0x80) <= 0x3f) {
	      ++i;
	      c = (c << 18) | ((t1 & 0x3f) << 12) | (t2 << 6) | t3;
	      // strict: forbid non-characters like U+fffe
	      if (strict <= 0 || !CBU_IS_UNICODE_NONCHAR(c)) {
	        *pi = i;
	        return c;
	      }
	      // Rejected non-character: falls through to the error path with all
	      // three trail bytes already consumed.
	    }
	  } else if (c >= 0xe0) {
	    c &= 0xf;  // keep the payload bits of the lead byte
	    if (strict != -2) {
	      uint8_t t1 = s[i], t2;
	      // NOTE(review): CBU8_IS_VALID_LEAD3_AND_T1 is defined outside this
	      // file; presumably it checks the lead/first-trail pair together -
	      // confirm in icu_utf.h.
	      if (CBU8_IS_VALID_LEAD3_AND_T1(c, t1) && ++i != length &&
	          (t2 = s[i] - 0x80) <= 0x3f) {
	        ++i;
	        c = (c << 12) | ((t1 & 0x3f) << 6) | t2;
	        // strict: forbid non-characters like U+fffe
	        if (strict <= 0 || !CBU_IS_UNICODE_NONCHAR(c)) {
	          *pi = i;
	          return c;
	        }
	      }
	    } else {
	      // strict=-2 -> lenient: allow surrogates
	      uint8_t t1 = s[i] - 0x80, t2;
	      // With a zero lead payload (c == 0) the first trail payload must be at
	      // least 0x20; anything smaller would decode to a value below 0x800.
	      if (t1 <= 0x3f && (c > 0 || t1 >= 0x20) && ++i != length &&
	          (t2 = s[i] - 0x80) <= 0x3f) {
	        *pi = i + 1;
	        return (c << 12) | (t1 << 6) | t2;
	      }
	    }
	  } else if (c >= 0xc2) {
	    // 2-byte sequence: a single trail byte follows the lead byte.
	    uint8_t t1 = s[i] - 0x80;
	    if (t1 <= 0x3f) {
	      *pi = i + 1;
	      return ((c - 0xc0) << 6) | t1;
	    }
	  }  // else 0x80<=c<0xc2 is not a lead byte

	  /* error handling */
	  // i - *pi is how far the index advanced before the sequence was rejected.
	  c = errorValue(i - *pi, strict);
	  *pi = i;
	  return c;
	}

	} // namespace base_icu